diff --git a/backend/app/services/gemini_service.py b/backend/app/services/gemini_service.py
index 13939e2..14b7752 100755
--- a/backend/app/services/gemini_service.py
+++ b/backend/app/services/gemini_service.py
@@ -23,6 +23,26 @@ class GeminiService:
         """
         self.client = genai.Client(api_key=api_key)
         self.model = "gemini-3.1-pro-preview"
+        self.fallback_model = "gemini-3-flash-preview"
+
+    async def _generate_content(self, contents, config) -> "types.GenerateContentResponse":
+        """Generate content with ``self.model``; retry once on ``self.fallback_model``.
+
+        ``contents`` and ``config`` are forwarded unchanged to both attempts, and the
+        response of whichever attempt succeeds is returned. Any ``Exception`` from the
+        primary call is logged as a warning; a fallback failure propagates to the caller.
+        """
+        generate = self.client.aio.models.generate_content
+        try:
+            return await generate(model=self.model, contents=contents, config=config)
+        except Exception as e:  # deliberate catch-all: any primary failure falls back
+            logger.warning(
+                "[GEMINI API] Primary model %s failed: %s. Retrying with fallback %s",
+                self.model,
+                e,
+                self.fallback_model,
+            )
+            return await generate(model=self.fallback_model, contents=contents, config=config)
 
     async def analyze_with_image(
         self,
@@ -102,13 +122,12 @@ class GeminiService:
 
             # Make the API call
             logger.info(f"[GEMINI API] Calling Gemini model: {self.model}")
-            response = await self.client.aio.models.generate_content(
-                model=self.model,
+            response = await self._generate_content(
                 contents=[file_part, prompt],
                 config=types.GenerateContentConfig(
                     response_mime_type="application/json",
                     response_schema=response_schema
-                )
+                ),
             )
             logger.info(f"[GEMINI API] Response received from Gemini")
 
@@ -236,13 +255,12 @@ class GeminiService:
 
             # Make the API call
             logger.info(f"[GEMINI API] Calling Gemini model: {self.model} with {len(images)} images")
-            response = await self.client.aio.models.generate_content(
-                model=self.model,
+            response = await self._generate_content(
                 contents=contents,
                 config=types.GenerateContentConfig(
                     response_mime_type="application/json",
                     response_schema=response_schema
-                )
+                ),
             )
             logger.info(f"[GEMINI API] Response received from Gemini (multi-image)")
 
@@ -318,13 +336,12 @@ class GeminiService:
                 "required": ["overallStatus", "summary"]
             }
 
-            response = await self.client.aio.models.generate_content(
-                model=self.model,
+            response = await self._generate_content(
                 contents=prompt,
                 config=types.GenerateContentConfig(
                     response_mime_type="application/json",
                     response_schema=response_schema
-                )
+                ),
             )
 
             result = json.loads(response.text.strip())
diff --git a/cloudrun.yaml b/cloudrun.yaml
new file mode 100644
index 0000000..e45303f
--- /dev/null
+++ b/cloudrun.yaml
@@ -0,0 +1,63 @@
+apiVersion: serving.knative.dev/v1
+kind: Service
+metadata:
+  name: modcomms-backend
+  annotations:
+    # Accept ingress from any network (frontend connects directly); unauthenticated access is granted via IAM, not by this annotation
+    run.googleapis.com/ingress: all
+spec:
+  template:
+    metadata:
+      annotations:
+        # Keep 1 instance warm to prevent cold-start WebSocket failures
+        autoscaling.knative.dev/minScale: "1"
+        autoscaling.knative.dev/maxScale: "10"
+        # Each instance handles up to 4 concurrent analyses (one per WebSocket)
+        autoscaling.knative.dev/target: "4"
+        # Run on the second-generation execution environment (NOTE: this does not disable HTTP/2; end-to-end HTTP/2 is only enabled by naming the port h2c)
+        run.googleapis.com/execution-environment: gen2
+    spec:
+      # 10-minute timeout — analysis (4 agents + lead agent) can take 2-3 minutes
+      # for large multi-page PDFs; 600s gives headroom without being excessive
+      timeoutSeconds: 600
+      containerConcurrency: 4
+      containers:
+        - image: gcr.io/YOUR_PROJECT_ID/modcomms-backend:latest
+          ports:
+            - containerPort: 8000
+          resources:
+            limits:
+              # 2 vCPU + 4Gi RAM: handles PDF rasterisation and parallel agent calls
+              cpu: "2"
+              memory: 4Gi
+          env:
+            # ── Gemini ────────────────────────────────────────────────────────
+            - name: GEMINI_API_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: gemini-api-key
+                  key: latest
+            # ── Database ─────────────────────────────────────────────────────
+            - name: DATABASE_URL
+              valueFrom:
+                secretKeyRef:
+                  name: database-url
+                  key: latest
+            # ── Azure AD auth ─────────────────────────────────────────────────
+            - name: AZURE_TENANT_ID
+              valueFrom:
+                secretKeyRef:
+                  name: azure-tenant-id
+                  key: latest
+            - name: AZURE_CLIENT_ID
+              valueFrom:
+                secretKeyRef:
+                  name: azure-client-id
+                  key: latest
+            # ── App settings ──────────────────────────────────────────────────
+            - name: CORS_ORIGINS
+              value: "https://YOUR_FRONTEND_DOMAIN"
+            - name: HOST
+              value: "0.0.0.0"
+            - name: PORT
+              value: "8000"