video-accessibility/infra/cloud-run/whisper-http-service.yaml
michael 95852f1357 fix: update Cloud Run service configs for compatibility
- FFmpeg: Enable CPU throttling to reduce idle costs
- Whisper: Keep CPU throttling disabled (model loading needs full CPU)
- Remove readinessProbe (requires BETA launch stage)
- Both services scale to zero when idle for cost savings

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 17:34:10 -06:00

120 lines
3.6 KiB
YAML

# =============================================================================
# Cloud Run Service: Whisper HTTP Service
# =============================================================================
# Autoscaling Whisper transcription service for Cloud Run deployment.
# This service handles CPU-intensive Whisper transcription via HTTP endpoints.
#
# Key features:
# - Scale to zero when idle (pay only for compute time used)
# - Up to 10 instances for parallel transcription
# - 8 vCPU / 32GB RAM for fast transcription
# - Startup CPU boost for faster cold starts
# =============================================================================
apiVersion: serving.knative.dev/v1
kind: Service
metadata:
  name: whisper-http-service
  annotations:
    run.googleapis.com/ingress: internal  # Only accessible from within GCP
    # NOTE(review): this annotation is repeated on the revision template below;
    # confirm Cloud Run accepts/needs the service-level copy.
    run.googleapis.com/execution-environment: gen2  # Required for 8 vCPU
spec:
  template:
    metadata:
      annotations:
        # Autoscaling configuration
        autoscaling.knative.dev/minScale: "0"  # Scale to zero when idle
        autoscaling.knative.dev/maxScale: "10"  # Max 10 concurrent instances
        # Cloud Run Gen2 features
        run.googleapis.com/execution-environment: gen2  # Required for 8 vCPU
        # Keep full CPU for model loading (takes 60-120s)
        run.googleapis.com/cpu-throttling: "false"
        run.googleapis.com/startup-cpu-boost: "true"  # Faster cold start
    spec:
      # Only 1 transcription at a time per instance (Whisper is CPU-intensive)
      containerConcurrency: 1
      # 5-minute request timeout for long transcriptions
      timeoutSeconds: 300
      # PROJECT_ID is a placeholder that must be substituted before deploying.
      serviceAccountName: accessible-video-worker@PROJECT_ID.iam.gserviceaccount.com
      containers:
        - image: gcr.io/PROJECT_ID/whisper-http-service:latest
          ports:
            - containerPort: 8080
          env:
            - name: APP_ENV
              value: "prod"
            - name: PYTHONPATH
              value: "/app"
            - name: PYTHONUNBUFFERED
              value: "1"
            - name: PYTHONDONTWRITEBYTECODE
              value: "1"
            # GCP Configuration
            - name: GCP_PROJECT_ID
              value: "PROJECT_ID"
            # Secret-backed values: `name` is the secret, `key` is the version.
            - name: GCS_BUCKET
              valueFrom:
                secretKeyRef:
                  name: gcs-bucket-name
                  key: latest
            # MongoDB for job tracking (optional, for logging)
            - name: MONGODB_URL
              valueFrom:
                secretKeyRef:
                  name: mongodb-url
                  key: latest
            # Whisper Configuration
            - name: WHISPER_MODEL
              value: "medium"
            # OpenTelemetry configuration
            - name: OTEL_SERVICE_NAME
              value: "whisper-http-service"
            - name: OTEL_SERVICE_VERSION
              value: "1.0.0"
            - name: OTEL_TRACES_EXPORTER
              value: "gcp_trace"
            # Sentry configuration (optional)
            - name: SENTRY_DSN
              valueFrom:
                secretKeyRef:
                  name: sentry-dsn
                  key: latest
            - name: SENTRY_ENVIRONMENT
              value: "production"
          resources:
            limits:
              memory: "32Gi"
              cpu: "8000m"  # 8 vCPU
            # NOTE(review): confirm Cloud Run honours `requests`; instance
            # sizing may be driven by `limits` alone.
            requests:
              memory: "8Gi"
              cpu: "4000m"  # 4 vCPU minimum
          # Health checks
          startupProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30  # Wait for Whisper model to load
            periodSeconds: 10
            timeoutSeconds: 5
            # Startup budget is roughly
            #   initialDelaySeconds + failureThreshold * periodSeconds
            #   = 30 + 15 * 10 = 180s,
            # which leaves headroom over the 60-120s model load time noted on
            # the cpu-throttling annotation above.
            # Assumes /health only succeeds once the model is loaded - confirm.
            failureThreshold: 15
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10