Add gemini-3-flash-preview fallback and Cloud Run service config

gemini_service.py: if a call to the primary model (gemini-3.1-pro-preview) raises any exception — for example because the model is unavailable or returns a permission error — all three call sites now automatically retry once with gemini-3-flash-preview before propagating failure. cloudrun.yaml: new Cloud Run service definition intended to keep WebSocket connections stable — 10-minute request timeout (vs Cloud Run's 300s default), 2 vCPU / 4Gi RAM for PDF rasterisation, min 1 warm instance to prevent cold-start disconnects, and GEMINI_API_KEY sourced from Secret Manager so the service can actually reach the Gemini API. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-02 11:18:57 +00:00 · 2026-03-02 11:18:57 +00:00 · 82e38e8853
commit 82e38e8853
parent 138fa0fcdf
2 changed files with 89 additions and 9 deletions
--- a/backend/app/services/gemini_service.py
+++ b/backend/app/services/gemini_service.py
@ -23,6 +23,26 @@ class GeminiService:
        """
        self.client = genai.Client(api_key=api_key)
        self.model = "gemini-3.1-pro-preview"
+        self.fallback_model = "gemini-3-flash-preview"
+
+    async def _generate_content(
+        self, contents, config: "types.GenerateContentConfig"
+    ) -> "types.GenerateContentResponse":
+        """Call generate_content, falling back to fallback_model if the primary fails.
+
+        Args:
+            contents: Prompt payload forwarded unchanged to the SDK; callers
+                pass either a single prompt or a list of parts.
+            config: Generation config, reused unchanged for both attempts.
+
+        Returns:
+            The response from ``self.model``, or from ``self.fallback_model``
+            when the primary call raised.
+
+        Raises:
+            Exception: Whatever the fallback call raises. The primary error is
+                only logged; Python attaches it as the ``__context__`` of the
+                fallback error.
+        """
+        try:
+            return await self.client.aio.models.generate_content(
+                model=self.model,
+                contents=contents,
+                config=config,
+            )
+        except Exception as e:
+            # Deliberately broad: any failure of the primary call (not only
+            # availability or permission errors) triggers exactly one retry
+            # against the fallback model.
+            logger.warning(
+                f"[GEMINI API] Primary model {self.model} failed: {e}. "
+                f"Retrying with fallback {self.fallback_model}"
+            )
+            return await self.client.aio.models.generate_content(
+                model=self.fallback_model,
+                contents=contents,
+                config=config,
+            )

    async def analyze_with_image(
        self,
@ -102,13 +122,12 @@ class GeminiService:

            # Make the API call
            logger.info(f"[GEMINI API] Calling Gemini model: {self.model}")
-            response = await self.client.aio.models.generate_content(
-                model=self.model,
+            response = await self._generate_content(
                contents=[file_part, prompt],
                config=types.GenerateContentConfig(
                    response_mime_type="application/json",
                    response_schema=response_schema
-                )
+                ),
            )
            logger.info(f"[GEMINI API] Response received from Gemini")

@ -236,13 +255,12 @@ class GeminiService:

            # Make the API call
            logger.info(f"[GEMINI API] Calling Gemini model: {self.model} with {len(images)} images")
-            response = await self.client.aio.models.generate_content(
-                model=self.model,
+            response = await self._generate_content(
                contents=contents,
                config=types.GenerateContentConfig(
                    response_mime_type="application/json",
                    response_schema=response_schema
-                )
+                ),
            )
            logger.info(f"[GEMINI API] Response received from Gemini (multi-image)")

@ -318,13 +336,12 @@ class GeminiService:
                "required": ["overallStatus", "summary"]
            }

-            response = await self.client.aio.models.generate_content(
-                model=self.model,
+            response = await self._generate_content(
                contents=prompt,
                config=types.GenerateContentConfig(
                    response_mime_type="application/json",
                    response_schema=response_schema
-                )
+                ),
            )

            result = json.loads(response.text.strip())
--- a/cloudrun.yaml
+++ b/cloudrun.yaml
@ -0,0 +1,63 @@
+apiVersion: serving.knative.dev/v1
+kind: Service
+metadata:
+  name: modcomms-backend
+  annotations:
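+    # Accept requests from all network sources, including the public internet
+    # (frontend connects directly). Note: ingress only controls network
+    # reachability; unauthenticated access additionally requires granting
+    # roles/run.invoker to allUsers via IAM.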
+    run.googleapis.com/ingress: all
+spec:
+  template:
+    metadata:
+      annotations:
+        # Keep 1 instance warm to prevent cold-start WebSocket failures
+        autoscaling.knative.dev/minScale: "1"
+        autoscaling.knative.dev/maxScale: "10"
+        # Each instance handles up to 4 concurrent analyses (one per WebSocket)
+        autoscaling.knative.dev/target: "4"
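+        # Run on the second-generation execution environment. Note: this
+        # annotation does not change HTTP/2 behaviour; end-to-end HTTP/2 is
+        # only enabled by naming the container port "h2c", which this service
+        # does not do, so WebSocket traffic is served over HTTP/1.1.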
+        run.googleapis.com/execution-environment: gen2
+    spec:
+      # 10-minute timeout — analysis (4 agents + lead agent) can take 2-3 minutes
+      # for large multi-page PDFs; 600s gives headroom without being excessive
+      timeoutSeconds: 600
+      containerConcurrency: 4
+      containers:
+        - image: gcr.io/YOUR_PROJECT_ID/modcomms-backend:latest
+          ports:
+            - containerPort: 8000
+          resources:
+            limits:
+              # 2 vCPU + 4Gi RAM: handles PDF rasterisation and parallel agent calls
+              cpu: "2"
+              memory: 4Gi
+          env:
+            # ── Gemini ────────────────────────────────────────────────────────
+            - name: GEMINI_API_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: gemini-api-key
+                  key: latest
+            # ── Database ─────────────────────────────────────────────────────
+            - name: DATABASE_URL
+              valueFrom:
+                secretKeyRef:
+                  name: database-url
+                  key: latest
+            # ── Azure AD auth ─────────────────────────────────────────────────
+            - name: AZURE_TENANT_ID
+              valueFrom:
+                secretKeyRef:
+                  name: azure-tenant-id
+                  key: latest
+            - name: AZURE_CLIENT_ID
+              valueFrom:
+                secretKeyRef:
+                  name: azure-client-id
+                  key: latest
+            # ── App settings ──────────────────────────────────────────────────
+            - name: CORS_ORIGINS
+              value: "https://YOUR_FRONTEND_DOMAIN"
+            - name: HOST
+              value: "0.0.0.0"
+            - name: PORT
+              value: "8000"