video-accessibility/infra/cloud-run/whisper-http-service.yaml

# =============================================================================
# Cloud Run Service: Whisper HTTP Service
# =============================================================================
# Autoscaling Whisper transcription service for Cloud Run deployment.
# This service handles CPU-intensive Whisper transcription via HTTP endpoints.
#
# Key features:
# - Scale to zero when idle (pay only for compute time used)
# - Up to 10 instances for parallel transcription
# - 8 vCPU / 32GB RAM for fast transcription
# - Startup CPU boost for faster cold starts
# =============================================================================

apiVersion: serving.knative.dev/v1
kind: Service
metadata:
  name: whisper-http-service
  annotations:
    # Restrict ingress to internal traffic; the service is not reachable from
    # the public internet.
    run.googleapis.com/ingress: internal
    # NOTE: run.googleapis.com/execution-environment is a revision-level
    # annotation. It is configured under spec.template.metadata.annotations
    # and must not be repeated here at the Service level.
spec:
  template:
    metadata:
      annotations:
        # Autoscaling: scale to zero when idle, cap at 10 instances.
        autoscaling.knative.dev/minScale: "0"
        autoscaling.knative.dev/maxScale: "10"

        # Second-generation execution environment.
        # NOTE(review): originally annotated as "required for 8 vCPU" - confirm
        # against the Cloud Run CPU limit documentation.
        run.googleapis.com/execution-environment: gen2
        # Disable CPU throttling so the full CPU stays available while the
        # model loads (documented to take 60-120s).
        run.googleapis.com/cpu-throttling: "false"
        # Extra CPU during startup for a faster cold start.
        run.googleapis.com/startup-cpu-boost: "true"

    spec:
      # One request per instance: only 1 transcription at a time, because
      # Whisper is CPU-intensive.
      containerConcurrency: 1

      # 5-minute request timeout for long transcriptions.
      timeoutSeconds: 300

      serviceAccountName: accessible-video-worker@PROJECT_ID.iam.gserviceaccount.com

      containers:
      - image: gcr.io/PROJECT_ID/whisper-http-service:latest

        ports:
        - containerPort: 8080

        env:
        - name: APP_ENV
          value: "prod"
        - name: PYTHONPATH
          value: "/app"
        - name: PYTHONUNBUFFERED
          value: "1"
        - name: PYTHONDONTWRITEBYTECODE
          value: "1"

        # GCP configuration
        - name: GCP_PROJECT_ID
          value: "PROJECT_ID"
        - name: GCS_BUCKET
          valueFrom:
            secretKeyRef:
              name: gcs-bucket-name
              key: latest

        # MongoDB for job tracking (optional, for logging)
        - name: MONGODB_URL
          valueFrom:
            secretKeyRef:
              name: mongodb-url
              key: latest

        # Whisper configuration
        - name: WHISPER_MODEL
          value: "medium"

        # OpenTelemetry configuration
        - name: OTEL_SERVICE_NAME
          value: "whisper-http-service"
        - name: OTEL_SERVICE_VERSION
          value: "1.0.0"
        - name: OTEL_TRACES_EXPORTER
          value: "gcp_trace"

        # Sentry configuration (optional)
        - name: SENTRY_DSN
          valueFrom:
            secretKeyRef:
              name: sentry-dsn
              key: latest
        - name: SENTRY_ENVIRONMENT
          value: "production"

        resources:
          limits:
            memory: "32Gi"
            cpu: "8000m"  # 8 vCPU
          requests:
            memory: "8Gi"
            cpu: "4000m"  # 4 vCPU minimum

        # Health checks
        #
        # Startup probe: gates traffic until /health succeeds. Model loading is
        # documented above as taking 60-120s, so the probe budget must exceed
        # 120s: initialDelaySeconds + failureThreshold * periodSeconds
        # = 30s + 15 * 10s = 180s.
        startupProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 30  # Give the Whisper model a head start on loading
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 15

        # Liveness probe: polls /health every 30s once the container has
        # started.
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 60
          periodSeconds: 30
          timeoutSeconds: 10