Add gemini-3-flash-preview fallback and Cloud Run service config
gemini_service.py: if a call to the primary model (gemini-3.1-pro-preview) fails for any reason — for example because the model is unavailable or returns a permission error — all three call sites now automatically retry with gemini-3-flash-preview before propagating failure. cloudrun.yaml: new Cloud Run service definition that ensures stable WebSocket operation — 10-minute request timeout (vs the 300s Cloud Run default), 2 vCPU / 4Gi RAM for PDF rasterisation, min 1 warm instance to prevent cold-start disconnects, and GEMINI_API_KEY sourced from Secret Manager so the service can actually reach the Gemini API. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
138fa0fcdf
commit
82e38e8853
2 changed files with 89 additions and 9 deletions
|
|
@ -23,6 +23,26 @@ class GeminiService:
|
|||
"""
|
||||
self.client = genai.Client(api_key=api_key)
|
||||
self.model = "gemini-3.1-pro-preview"
|
||||
self.fallback_model = "gemini-3-flash-preview"
|
||||
|
||||
async def _generate_content(self, contents, config) -> "types.GenerateContentResponse":
    """Call generate_content, falling back to fallback_model if the primary fails.

    The request is first sent to ``self.model``. If that call raises any
    ``Exception`` (the handler is deliberately not limited to availability or
    permission errors), a warning is logged and the identical request is sent
    once to ``self.fallback_model``.

    Args:
        contents: Prompt payload forwarded unchanged to the SDK.
        config: Generation config forwarded unchanged to the SDK.

    Returns:
        The response object returned by whichever model call succeeded.

    Raises:
        Exception: Whatever the fallback call raises; the primary failure is
            kept as the implicit ``__context__`` of that exception.
    """
    # NOTE: the return annotation is a string so it is not evaluated at
    # definition time; the previous ``-> any`` referred to the builtin
    # function ``any()``, which is not a type.
    try:
        return await self.client.aio.models.generate_content(
            model=self.model,
            contents=contents,
            config=config,
        )
    except Exception as e:
        logger.warning(
            f"[GEMINI API] Primary model {self.model} failed: {e}. "
            f"Retrying with fallback {self.fallback_model}"
        )
        # Retry inside the handler so a fallback failure stays chained to
        # the primary error for debugging.
        return await self.client.aio.models.generate_content(
            model=self.fallback_model,
            contents=contents,
            config=config,
        )
|
||||
|
||||
async def analyze_with_image(
|
||||
self,
|
||||
|
|
@ -102,13 +122,12 @@ class GeminiService:
|
|||
|
||||
# Make the API call
|
||||
logger.info(f"[GEMINI API] Calling Gemini model: {self.model}")
|
||||
response = await self.client.aio.models.generate_content(
|
||||
model=self.model,
|
||||
response = await self._generate_content(
|
||||
contents=[file_part, prompt],
|
||||
config=types.GenerateContentConfig(
|
||||
response_mime_type="application/json",
|
||||
response_schema=response_schema
|
||||
)
|
||||
),
|
||||
)
|
||||
logger.info(f"[GEMINI API] Response received from Gemini")
|
||||
|
||||
|
|
@ -236,13 +255,12 @@ class GeminiService:
|
|||
|
||||
# Make the API call
|
||||
logger.info(f"[GEMINI API] Calling Gemini model: {self.model} with {len(images)} images")
|
||||
response = await self.client.aio.models.generate_content(
|
||||
model=self.model,
|
||||
response = await self._generate_content(
|
||||
contents=contents,
|
||||
config=types.GenerateContentConfig(
|
||||
response_mime_type="application/json",
|
||||
response_schema=response_schema
|
||||
)
|
||||
),
|
||||
)
|
||||
logger.info(f"[GEMINI API] Response received from Gemini (multi-image)")
|
||||
|
||||
|
|
@ -318,13 +336,12 @@ class GeminiService:
|
|||
"required": ["overallStatus", "summary"]
|
||||
}
|
||||
|
||||
response = await self.client.aio.models.generate_content(
|
||||
model=self.model,
|
||||
response = await self._generate_content(
|
||||
contents=prompt,
|
||||
config=types.GenerateContentConfig(
|
||||
response_mime_type="application/json",
|
||||
response_schema=response_schema
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
result = json.loads(response.text.strip())
|
||||
|
|
|
|||
63
cloudrun.yaml
Normal file
63
cloudrun.yaml
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
apiVersion: serving.knative.dev/v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: modcomms-backend
|
||||
annotations:
|
||||
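# Accept requests from all network sources, including the public internet (frontend connects directly); whether callers must authenticate is an IAM setting and is not controlled by this annotation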
|
||||
run.googleapis.com/ingress: all
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
annotations:
|
||||
# Keep 1 instance warm to prevent cold-start WebSocket failures
|
||||
autoscaling.knative.dev/minScale: "1"
|
||||
autoscaling.knative.dev/maxScale: "10"
|
||||
# Each instance handles up to 4 concurrent analyses (one per WebSocket)
|
||||
autoscaling.knative.dev/target: "4"
|
||||
# Run on the second-generation execution environment (note: this selects the runtime sandbox; it does not disable HTTP/2)
|
||||
run.googleapis.com/execution-environment: gen2
|
||||
spec:
|
||||
# 10-minute timeout — analysis (4 agents + lead agent) can take 2-3 minutes
|
||||
# for large multi-page PDFs; 600s gives headroom without being excessive
|
||||
timeoutSeconds: 600
|
||||
containerConcurrency: 4
|
||||
containers:
|
||||
- image: gcr.io/YOUR_PROJECT_ID/modcomms-backend:latest
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
resources:
|
||||
limits:
|
||||
# 2 vCPU + 4Gi RAM: handles PDF rasterisation and parallel agent calls
|
||||
cpu: "2"
|
||||
memory: 4Gi
|
||||
env:
|
||||
# ── Gemini ────────────────────────────────────────────────────────
|
||||
- name: GEMINI_API_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: gemini-api-key
|
||||
key: latest
|
||||
# ── Database ─────────────────────────────────────────────────────
|
||||
- name: DATABASE_URL
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: database-url
|
||||
key: latest
|
||||
# ── Azure AD auth ─────────────────────────────────────────────────
|
||||
- name: AZURE_TENANT_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: azure-tenant-id
|
||||
key: latest
|
||||
- name: AZURE_CLIENT_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: azure-client-id
|
||||
key: latest
|
||||
# ── App settings ──────────────────────────────────────────────────
|
||||
- name: CORS_ORIGINS
|
||||
value: "https://YOUR_FRONTEND_DOMAIN"
|
||||
- name: HOST
|
||||
value: "0.0.0.0"
|
||||
- name: PORT
|
||||
value: "8000"
|
||||
Loading…
Add table
Reference in a new issue