Add gemini-3-flash-preview fallback and Cloud Run service config

gemini_service.py: if a call to the primary model (gemini-3.1-pro-preview)
raises any exception (e.g. the model is unavailable or returns a permission
error), all three call sites now automatically retry once with
gemini-3-flash-preview before propagating failure.

cloudrun.yaml: new Cloud Run service definition that ensures stable
WebSocket operation — 10-minute request timeout (vs the 300s default),
2 vCPU / 4Gi RAM for PDF rasterisation, min 1 warm instance to prevent
cold-start disconnects, and GEMINI_API_KEY sourced from Secret Manager
so the service can actually reach the Gemini API.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-03-02 11:18:57 +00:00
parent 138fa0fcdf
commit 82e38e8853
2 changed files with 89 additions and 9 deletions

View file

@ -23,6 +23,26 @@ class GeminiService:
"""
self.client = genai.Client(api_key=api_key)
self.model = "gemini-3.1-pro-preview"
self.fallback_model = "gemini-3-flash-preview"
async def _generate_content(self, contents, config) -> any:
"""Call generate_content, falling back to fallback_model if the primary fails."""
try:
return await self.client.aio.models.generate_content(
model=self.model,
contents=contents,
config=config,
)
except Exception as e:
logger.warning(
f"[GEMINI API] Primary model {self.model} failed: {e}. "
f"Retrying with fallback {self.fallback_model}"
)
return await self.client.aio.models.generate_content(
model=self.fallback_model,
contents=contents,
config=config,
)
async def analyze_with_image(
self,
@ -102,13 +122,12 @@ class GeminiService:
# Make the API call
logger.info(f"[GEMINI API] Calling Gemini model: {self.model}")
response = await self.client.aio.models.generate_content(
model=self.model,
response = await self._generate_content(
contents=[file_part, prompt],
config=types.GenerateContentConfig(
response_mime_type="application/json",
response_schema=response_schema
)
),
)
logger.info(f"[GEMINI API] Response received from Gemini")
@ -236,13 +255,12 @@ class GeminiService:
# Make the API call
logger.info(f"[GEMINI API] Calling Gemini model: {self.model} with {len(images)} images")
response = await self.client.aio.models.generate_content(
model=self.model,
response = await self._generate_content(
contents=contents,
config=types.GenerateContentConfig(
response_mime_type="application/json",
response_schema=response_schema
)
),
)
logger.info(f"[GEMINI API] Response received from Gemini (multi-image)")
@ -318,13 +336,12 @@ class GeminiService:
"required": ["overallStatus", "summary"]
}
response = await self.client.aio.models.generate_content(
model=self.model,
response = await self._generate_content(
contents=prompt,
config=types.GenerateContentConfig(
response_mime_type="application/json",
response_schema=response_schema
)
),
)
result = json.loads(response.text.strip())

63
cloudrun.yaml Normal file
View file

@ -0,0 +1,63 @@
apiVersion: serving.knative.dev/v1
kind: Service
metadata:
name: modcomms-backend
annotations:
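# Accept traffic from all networks, including the public internet (frontend
# connects directly). This only controls network ingress; unauthenticated
# access additionally requires granting roles/run.invoker to allUsers via IAM.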
run.googleapis.com/ingress: all
spec:
template:
metadata:
annotations:
# Keep 1 instance warm to prevent cold-start WebSocket failures
autoscaling.knative.dev/minScale: "1"
autoscaling.knative.dev/maxScale: "10"
# Each instance handles up to 4 concurrent analyses (one per WebSocket)
autoscaling.knative.dev/target: "4"
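# Use the second-generation execution environment. This annotation does not
# affect HTTP/2: Cloud Run already speaks HTTP/1.1 to the container (which is
# what WebSockets need) unless the container port is named h2c.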
run.googleapis.com/execution-environment: gen2
spec:
# 10-minute timeout — analysis (4 agents + lead agent) can take 2-3 minutes
# for large multi-page PDFs; 600s gives headroom without being excessive
timeoutSeconds: 600
containerConcurrency: 4
containers:
- image: gcr.io/YOUR_PROJECT_ID/modcomms-backend:latest
ports:
- containerPort: 8000
resources:
limits:
# 2 vCPU + 4Gi RAM: handles PDF rasterisation and parallel agent calls
cpu: "2"
memory: 4Gi
env:
# ── Gemini ────────────────────────────────────────────────────────
- name: GEMINI_API_KEY
valueFrom:
secretKeyRef:
name: gemini-api-key
key: latest
# ── Database ─────────────────────────────────────────────────────
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: database-url
key: latest
# ── Azure AD auth ─────────────────────────────────────────────────
- name: AZURE_TENANT_ID
valueFrom:
secretKeyRef:
name: azure-tenant-id
key: latest
- name: AZURE_CLIENT_ID
valueFrom:
secretKeyRef:
name: azure-client-id
key: latest
# ── App settings ──────────────────────────────────────────────────
- name: CORS_ORIGINS
value: "https://YOUR_FRONTEND_DOMAIN"
- name: HOST
value: "0.0.0.0"
- name: PORT
value: "8000"