initial commit

This commit is contained in:
michael 2025-08-24 16:28:33 -05:00
commit af2562096a
212 changed files with 36035 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

182
.github/workflows/cd-backend.yml vendored Normal file
View file

@ -0,0 +1,182 @@
# Continuous deployment of the backend API and worker to Cloud Run.
name: Deploy Backend

# Runs on pushes to main that touch the backend (or this workflow file),
# and on manual dispatch.
on:
  push:
    branches:
      - main
    paths:
      - "backend/**"
      - ".github/workflows/cd-backend.yml"
  workflow_dispatch:

# Values shared by every job; exported to each step's environment.
env:
  GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
  GCP_REGION: us-central1
  SERVICE_NAME: accessible-video-api
  WORKER_SERVICE_NAME: accessible-video-worker

jobs:
# Builds the `production` Docker target, pushes it to gcr.io, and deploys it
# as the public Cloud Run API service.
deploy-api:
  name: Deploy API to Cloud Run
  runs-on: ubuntu-latest
  # A manual dispatch can be started from any ref; only main may deploy.
  if: github.ref == 'refs/heads/main'
  permissions:
    contents: read
    # Lets the auth action request an OIDC token for workload identity.
    id-token: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Authenticate to Google Cloud
      uses: google-github-actions/auth@v2
      with:
        workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
        service_account: ${{ secrets.WIF_SERVICE_ACCOUNT }}

    - name: Set up Cloud SDK
      uses: google-github-actions/setup-gcloud@v2

    - name: Configure Docker auth
      run: gcloud auth configure-docker

    - name: Build and push Docker image
      working-directory: ./backend
      env:
        # Repository path without a tag; the commit SHA and `latest` are appended below.
        IMAGE: gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.SERVICE_NAME }}
        REVISION_TAG: ${{ github.sha }}
      run: |
        # One build, two tags: an immutable per-commit tag and a moving `latest`.
        docker build \
          --target production \
          --tag "${IMAGE}:${REVISION_TAG}" \
          --tag "${IMAGE}:latest" \
          .
        docker push "${IMAGE}:${REVISION_TAG}"
        docker push "${IMAGE}:latest"

    - name: Deploy to Cloud Run
      env:
        IMAGE: gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.SERVICE_NAME }}
        REVISION_TAG: ${{ github.sha }}
      run: |
        # Deploy the per-commit tag (not `latest`) so the revision is traceable to a commit.
        gcloud run deploy "${SERVICE_NAME}" \
          --image "${IMAGE}:${REVISION_TAG}" \
          --region "${GCP_REGION}" \
          --platform managed \
          --allow-unauthenticated \
          --set-env-vars APP_ENV=prod \
          --set-secrets JWT_SECRET=jwt-secret:latest,MONGODB_URI=mongodb-uri:latest,REDIS_URL=redis-url:latest,GEMINI_API_KEY=gemini-api-key:latest,SENDGRID_API_KEY=sendgrid-api-key:latest,SENTRY_DSN=sentry-dsn:latest \
          --memory 2Gi \
          --cpu 2 \
          --max-instances 100 \
          --min-instances 1 \
          --port 8000 \
          --timeout 300 \
          --concurrency 80

    - name: Update traffic to new revision
      run: |
        gcloud run services update-traffic "${SERVICE_NAME}" \
          --region "${GCP_REGION}" \
          --to-latest
# Builds the `worker` Docker target, pushes it to gcr.io, and deploys it as a
# private (authenticated-only) Cloud Run service. Runs after the API deploy.
deploy-worker:
  name: Deploy Worker to Cloud Run
  runs-on: ubuntu-latest
  needs:
    - deploy-api
  permissions:
    contents: read
    # Lets the auth action request an OIDC token for workload identity.
    id-token: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Authenticate to Google Cloud
      uses: google-github-actions/auth@v2
      with:
        workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
        service_account: ${{ secrets.WIF_SERVICE_ACCOUNT }}

    - name: Set up Cloud SDK
      uses: google-github-actions/setup-gcloud@v2

    - name: Configure Docker auth
      run: gcloud auth configure-docker

    - name: Build and push worker image
      working-directory: ./backend
      env:
        # Repository path without a tag; the commit SHA and `latest` are appended below.
        IMAGE: gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.WORKER_SERVICE_NAME }}
        REVISION_TAG: ${{ github.sha }}
      run: |
        docker build \
          --target worker \
          --tag "${IMAGE}:${REVISION_TAG}" \
          --tag "${IMAGE}:latest" \
          .
        docker push "${IMAGE}:${REVISION_TAG}"
        docker push "${IMAGE}:latest"

    - name: Deploy worker to Cloud Run
      env:
        IMAGE: gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.WORKER_SERVICE_NAME }}
        REVISION_TAG: ${{ github.sha }}
      run: |
        # Same secrets as the API; larger memory, longer timeout, and one
        # request per instance (--concurrency 1).
        gcloud run deploy "${WORKER_SERVICE_NAME}" \
          --image "${IMAGE}:${REVISION_TAG}" \
          --region "${GCP_REGION}" \
          --platform managed \
          --no-allow-unauthenticated \
          --set-env-vars APP_ENV=prod \
          --set-secrets JWT_SECRET=jwt-secret:latest,MONGODB_URI=mongodb-uri:latest,REDIS_URL=redis-url:latest,GEMINI_API_KEY=gemini-api-key:latest,SENDGRID_API_KEY=sendgrid-api-key:latest,SENTRY_DSN=sentry-dsn:latest \
          --memory 4Gi \
          --cpu 2 \
          --max-instances 50 \
          --min-instances 0 \
          --timeout 1800 \
          --concurrency 1
# Runs the end-to-end smoke suite against the freshly deployed production API.
smoke-tests:
  name: Run Smoke Tests
  runs-on: ubuntu-latest
  needs: [deploy-api, deploy-worker]
  env:
    # This workflow's top-level `env` does not define PYTHON_VERSION, so the
    # `env.PYTHON_VERSION` lookup below previously expanded to an empty string.
    # 3.11 matches the CI workflow and the backend Dockerfile base image.
    PYTHON_VERSION: "3.11"
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Install Poetry
      uses: snok/install-poetry@v1

    - name: Install dependencies
      working-directory: ./backend
      run: poetry install

    - name: Run smoke tests against production
      working-directory: ./backend
      env:
        # Cloud Run service URL assembled from the service name and a
        # per-project hash stored as a secret.
        API_BASE_URL: https://${{ env.SERVICE_NAME }}-${{ secrets.GCP_REGION_HASH }}-uc.a.run.app
        SMOKE_TEST_EMAIL: ${{ secrets.SMOKE_TEST_EMAIL }}
        SMOKE_TEST_PASSWORD: ${{ secrets.SMOKE_TEST_PASSWORD }}
      run: |
        poetry run pytest tests/e2e/test_smoke.py -v
# Reports the overall outcome of the backend deployment. `if: always()` keeps
# this job running even when an upstream job failed or was skipped.
notify-deployment:
  name: Notify Deployment Status
  runs-on: ubuntu-latest
  needs: [smoke-tests]
  if: always()
  steps:
    - name: Notify success
      if: needs.smoke-tests.result == 'success'
      run: |
        echo "✅ Backend deployment completed successfully"
        # Add Slack/email notification here if needed

    - name: Notify failure
      # When a deploy job fails, smoke-tests never runs and its result is
      # 'skipped' (or 'cancelled'), not 'failure'. Treat every non-success
      # result as a failed deployment so those cases are reported too.
      if: needs.smoke-tests.result != 'success'
      run: |
        echo "❌ Backend deployment failed"
        echo "smoke-tests result: ${{ needs.smoke-tests.result }}"
        # Add Slack/email notification here if needed

147
.github/workflows/cd-frontend.yml vendored Normal file
View file

@ -0,0 +1,147 @@
# Continuous deployment of the frontend SPA to Cloud Storage behind a load balancer.
name: Deploy Frontend

# Runs on pushes to main that touch the frontend (or this workflow file),
# and on manual dispatch.
on:
  push:
    branches:
      - main
    paths:
      - "frontend/**"
      - ".github/workflows/cd-frontend.yml"
  workflow_dispatch:

# Values shared by every job; exported to each step's environment.
env:
  GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
  GCP_REGION: us-central1
  BUCKET_NAME: ${{ secrets.FRONTEND_BUCKET_NAME }}
  # Name of the URL map; the proxy and forwarding-rule names are derived from it.
  CDN_URL_MAP: accessible-video-frontend
  NODE_VERSION: "20"

jobs:
# Builds the SPA, syncs it to the Cloud Storage bucket, makes sure the load
# balancer resources exist, invalidates the CDN cache, and smoke-tests the site.
build-and-deploy:
  name: Build and Deploy Frontend
  runs-on: ubuntu-latest
  # A manual dispatch can be started from any ref; only main may deploy.
  if: github.ref == 'refs/heads/main'
  permissions:
    contents: read
    # Lets the auth action request an OIDC token for workload identity.
    id-token: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: ${{ env.NODE_VERSION }}
        cache: 'npm'
        cache-dependency-path: frontend/package-lock.json

    - name: Install dependencies
      working-directory: ./frontend
      run: npm ci

    - name: Build for production
      working-directory: ./frontend
      env:
        VITE_API_BASE_URL: ${{ secrets.PRODUCTION_API_URL }}
        VITE_APP_ENV: production
        VITE_SENTRY_DSN: ${{ secrets.FRONTEND_SENTRY_DSN }}
      run: npm run build

    - name: Authenticate to Google Cloud
      uses: google-github-actions/auth@v2
      with:
        workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
        service_account: ${{ secrets.WIF_SERVICE_ACCOUNT }}

    - name: Set up Cloud SDK
      uses: google-github-actions/setup-gcloud@v2

    - name: Deploy to Cloud Storage
      working-directory: ./frontend
      run: |
        # Sync build files to Cloud Storage bucket (-d deletes remote files no longer in dist/)
        gsutil -m rsync -r -d dist/ gs://${{ env.BUCKET_NAME }}/
        # Set public read permissions for web assets
        gsutil -m acl ch -r -u AllUsers:R gs://${{ env.BUCKET_NAME }}
        # Long-lived immutable caching for JS/CSS that live in sub-directories
        gsutil -m setmeta -h "Cache-Control:public, max-age=31536000, immutable" "gs://${{ env.BUCKET_NAME }}/**/*.js"
        gsutil -m setmeta -h "Cache-Control:public, max-age=31536000, immutable" "gs://${{ env.BUCKET_NAME }}/**/*.css"
        # One-day caching for every HTML object. "**.html" matches at any depth,
        # including the root-level index.html. The previous "**/*.html" pattern
        # required a "/" in the object name, so it never matched index.html and
        # made gsutil exit with "No URLs matched" (failing this step) whenever
        # the build contained no nested HTML files.
        gsutil -m setmeta -h "Cache-Control:public, max-age=86400" "gs://${{ env.BUCKET_NAME }}/**.html"

    - name: Configure Load Balancer and CDN
      run: |
        # Each resource is created only when `describe` fails, i.e. when it
        # does not exist yet; existing resources are left untouched.

        # Backend bucket (with Cloud CDN enabled, so the cache invalidation
        # in the next step has a cache to act on)
        gcloud compute backend-buckets describe ${{ env.BUCKET_NAME }}-backend || \
          gcloud compute backend-buckets create ${{ env.BUCKET_NAME }}-backend \
            --gcs-bucket-name=${{ env.BUCKET_NAME }} \
            --enable-cdn

        # URL map routing everything to the bucket
        gcloud compute url-maps describe ${{ env.CDN_URL_MAP }} || \
          gcloud compute url-maps create ${{ env.CDN_URL_MAP }} \
            --default-backend-bucket=${{ env.BUCKET_NAME }}-backend

        # HTTPS proxy
        gcloud compute target-https-proxies describe ${{ env.CDN_URL_MAP }}-https-proxy || \
          gcloud compute target-https-proxies create ${{ env.CDN_URL_MAP }}-https-proxy \
            --url-map=${{ env.CDN_URL_MAP }} \
            --ssl-certificates=${{ secrets.SSL_CERT_NAME }}

        # Global forwarding rule
        gcloud compute forwarding-rules describe ${{ env.CDN_URL_MAP }}-https-rule --global || \
          gcloud compute forwarding-rules create ${{ env.CDN_URL_MAP }}-https-rule \
            --global \
            --target-https-proxy=${{ env.CDN_URL_MAP }}-https-proxy \
            --ports=443

    - name: Invalidate CDN cache
      run: |
        # Invalidate CDN cache for immediate deployment
        gcloud compute url-maps invalidate-cdn-cache ${{ env.CDN_URL_MAP }} \
          --path="/*" \
          --async

    - name: Run smoke tests
      working-directory: ./frontend
      env:
        FRONTEND_URL: https://${{ secrets.FRONTEND_DOMAIN }}
      run: |
        # Wait a bit for CDN propagation
        sleep 30
        # Basic smoke test - check if main page loads
        curl -f -s -o /dev/null -w "%{http_code}" "$FRONTEND_URL" | grep -q "200" || {
          echo "Frontend smoke test failed - main page not accessible"
          exit 1
        }
        # Check if assets are loading
        curl -f -s -o /dev/null -w "%{http_code}" "$FRONTEND_URL/assets/" | grep -qE "(200|404)" || {
          echo "Frontend smoke test failed - assets not accessible"
          exit 1
        }
        echo "✅ Frontend smoke tests passed"
# Reports the outcome of the frontend deployment. `if: always()` keeps this
# job running even when build-and-deploy failed.
notify-deployment:
  name: Notify Deployment Status
  runs-on: ubuntu-latest
  needs:
    - build-and-deploy
  if: always()
  steps:
    # Exactly one of the two steps below runs, depending on the deploy result.
    - name: Notify success
      if: needs.build-and-deploy.result == 'success'
      run: |
        echo "✅ Frontend deployment completed successfully"
        echo "Frontend is now live at: https://${{ secrets.FRONTEND_DOMAIN }}"
        # Add Slack/email notification here if needed

    - name: Notify failure
      if: needs.build-and-deploy.result == 'failure'
      run: |
        echo "❌ Frontend deployment failed"
        # Add Slack/email notification here if needed

312
.github/workflows/ci.yml vendored Normal file
View file

@ -0,0 +1,312 @@
# Continuous integration: lint, type-check, test, build, and scan both apps.
name: CI

# Runs for pushes to, and pull requests targeting, main and develop.
on:
  push:
    branches:
      - main
      - develop
  pull_request:
    branches:
      - main
      - develop

# Toolchain versions shared by every job (quoted so they stay strings).
env:
  PYTHON_VERSION: "3.11"
  NODE_VERSION: "20"

jobs:
# Lints, type-checks, and unit-tests the backend against throwaway MongoDB and
# Redis service containers, then uploads coverage.
backend-lint-and-test:
  name: Backend Lint & Test
  runs-on: ubuntu-latest
  services:
    mongodb:
      image: mongo:7.0
      ports:
        - "27017:27017"
      # The job waits until the container reports healthy.
      options: >-
        --health-cmd "echo 'db.runCommand("ping").ok' | mongosh --quiet"
        --health-interval 10s
        --health-timeout 5s
        --health-retries 5
    redis:
      image: redis:7-alpine
      ports:
        - "6379:6379"
      options: >-
        --health-cmd "redis-cli ping"
        --health-interval 10s
        --health-timeout 5s
        --health-retries 5
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Install Poetry
      uses: snok/install-poetry@v1
      with:
        version: latest
        virtualenvs-create: true
        # Keeps the venv at backend/.venv, which is the path cached below.
        virtualenvs-in-project: true

    - name: Load cached dependencies
      id: cached-poetry-dependencies
      uses: actions/cache@v4
      with:
        path: backend/.venv
        # Invalidated by OS, Python version, or lock-file changes.
        key: poetry-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('backend/poetry.lock') }}

    # Third-party dependencies are installed only on a cache miss ...
    - name: Install dependencies
      if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
      working-directory: ./backend
      run: poetry install --no-interaction --no-root

    # ... while the project itself is installed on every run.
    - name: Install project
      working-directory: ./backend
      run: poetry install --no-interaction

    - name: Run linting (ruff)
      working-directory: ./backend
      run: poetry run ruff check .

    - name: Run type checking (mypy)
      working-directory: ./backend
      run: poetry run mypy .

    - name: Run unit tests
      working-directory: ./backend
      env:
        # Service containers are reachable on localhost via the mapped ports;
        # every credential below is a dummy value for tests.
        MONGODB_URI: mongodb://localhost:27017
        MONGODB_DB: test_accessible_video
        REDIS_URL: redis://localhost:6379
        JWT_SECRET: test_jwt_secret_for_ci
        GEMINI_API_KEY: fake_key_for_testing
        GCP_PROJECT_ID: test-project
        GCS_BUCKET: test-bucket
        SENDGRID_API_KEY: fake_sendgrid_key
        EMAIL_FROM: test@example.com
        CLIENT_BASE_URL: http://localhost:3000
      run: |
        poetry run pytest tests/unit/ -v --cov=app --cov-report=xml --cov-report=term-missing

    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v4
      with:
        file: ./backend/coverage.xml
        flags: backend
        name: backend-coverage
# Lints, type-checks, and unit-tests the frontend, then uploads coverage.
frontend-lint-and-test:
  name: Frontend Lint & Test
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: ${{ env.NODE_VERSION }}
        # npm download cache, keyed on the frontend lock file.
        cache: "npm"
        cache-dependency-path: frontend/package-lock.json

    - name: Install dependencies
      working-directory: ./frontend
      run: npm ci

    - name: Run linting (ESLint)
      working-directory: ./frontend
      run: npm run lint

    - name: Run type checking (TypeScript)
      working-directory: ./frontend
      run: npm run type-check

    - name: Run unit tests (Vitest)
      working-directory: ./frontend
      # Arguments after `--` are forwarded to the test script.
      run: npm run test -- --coverage --reporter=verbose

    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v4
      with:
        file: ./frontend/coverage/lcov.info
        flags: frontend
        name: frontend-coverage
# Runs backend integration tests against MongoDB and Redis service containers,
# once both lint-and-test jobs have passed.
integration-tests:
  name: Integration Tests
  runs-on: ubuntu-latest
  needs:
    - backend-lint-and-test
    - frontend-lint-and-test
  services:
    mongodb:
      image: mongo:7.0
      ports:
        - "27017:27017"
      # The job waits until the container reports healthy.
      options: >-
        --health-cmd "echo 'db.runCommand("ping").ok' | mongosh --quiet"
        --health-interval 10s
        --health-timeout 5s
        --health-retries 5
    redis:
      image: redis:7-alpine
      ports:
        - "6379:6379"
      options: >-
        --health-cmd "redis-cli ping"
        --health-interval 10s
        --health-timeout 5s
        --health-retries 5
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Install Poetry
      uses: snok/install-poetry@v1

    - name: Install backend dependencies
      working-directory: ./backend
      run: poetry install

    - name: Run integration tests
      working-directory: ./backend
      env:
        # Separate database name from the unit-test job; all credentials are dummies.
        MONGODB_URI: mongodb://localhost:27017
        MONGODB_DB: test_accessible_video_integration
        REDIS_URL: redis://localhost:6379
        JWT_SECRET: test_jwt_secret_for_integration
        GEMINI_API_KEY: fake_key_for_testing
        GCP_PROJECT_ID: test-project
        GCS_BUCKET: test-bucket
        SENDGRID_API_KEY: fake_sendgrid_key
        EMAIL_FROM: test@example.com
        CLIENT_BASE_URL: http://localhost:3000
      run: |
        poetry run pytest tests/integration/ -v
# Verifies that the backend Docker image builds; the image is not pushed.
build-backend:
  name: Build Backend Docker Image
  runs-on: ubuntu-latest
  needs:
    - backend-lint-and-test
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Build backend image
      uses: docker/build-push-action@v5
      with:
        context: ./backend
        file: ./backend/Dockerfile
        # Build-only check: nothing is published from CI.
        push: false
        tags: accessible-video-backend:${{ github.sha }}
        # Layer cache stored in the GitHub Actions cache backend.
        cache-from: type=gha
        cache-to: type=gha,mode=max
# Verifies that the frontend production build succeeds and keeps the output
# as a downloadable artifact.
build-frontend:
  name: Build Frontend
  runs-on: ubuntu-latest
  needs:
    - frontend-lint-and-test
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: ${{ env.NODE_VERSION }}
        cache: "npm"
        cache-dependency-path: frontend/package-lock.json

    - name: Install dependencies
      working-directory: ./frontend
      run: npm ci

    - name: Build for production
      working-directory: ./frontend
      env:
        # Placeholder API URL: this build is only a compile check, not a deploy.
        VITE_API_BASE_URL: https://api.example.com
        VITE_APP_ENV: production
      run: npm run build

    - name: Upload build artifacts
      uses: actions/upload-artifact@v4
      with:
        name: frontend-dist
        path: frontend/dist/
        retention-days: 7
# Scans the repository with Trivy (results uploaded as SARIF to GitHub code
# scanning) and with Semgrep.
security-scan:
  name: Security Scan
  runs-on: ubuntu-latest
  # upload-sarif needs `security-events: write`; without an explicit grant the
  # upload fails wherever the default GITHUB_TOKEN is read-only. Declaring
  # permissions also limits the token to what this job actually uses.
  permissions:
    contents: read
    actions: read
    security-events: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Run Trivy vulnerability scanner
      # Pinned to a release tag instead of the mutable `master` branch so an
      # upstream change cannot silently alter what runs in CI.
      uses: aquasecurity/trivy-action@0.28.0
      with:
        scan-type: 'fs'
        scan-ref: '.'
        format: 'sarif'
        output: 'trivy-results.sarif'

    - name: Upload Trivy scan results
      uses: github/codeql-action/upload-sarif@v3
      with:
        sarif_file: 'trivy-results.sarif'

    - name: Run Semgrep security scan
      uses: semgrep/semgrep-action@v1
      with:
        config: auto
        generateBaseline: false
# Audits backend (Poetry) and frontend (npm) dependencies for known
# vulnerabilities.
dependency-check:
  name: Dependency Check
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Install Poetry
      uses: snok/install-poetry@v1

    # `poetry run pip-audit` inspects the project's virtualenv. Nothing was
    # installed into it before, so the audit had no packages to inspect and
    # pip-audit itself was not available.
    # NOTE(review): assumes pip-audit is declared as a dev dependency in
    # backend/pyproject.toml — confirm, or install it explicitly here.
    - name: Install backend dependencies
      working-directory: ./backend
      run: poetry install --no-interaction --no-root

    - name: Check backend dependencies
      working-directory: ./backend
      run: |
        poetry check
        poetry run pip-audit

    - name: Set up Node.js
      uses: actions/setup-node@v4
      with:
        node-version: ${{ env.NODE_VERSION }}

    - name: Check frontend dependencies
      working-directory: ./frontend
      run: |
        npm audit --audit-level moderate
        npx better-npm-audit audit

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
examples/

# Local environment files hold credentials and must never be committed.
# The *.example templates stay tracked.
.env
.env.*
!.env.example

# Google Cloud service-account key referenced by GOOGLE_APPLICATION_CREDENTIALS
backend/optical-414516-*.json

# macOS Finder metadata
.DS_Store

# NOTE: ignore rules do not affect files that are already tracked. Remove
# those from the index with `git rm --cached <path>` and rotate any
# credential that was committed.

148
CLAUDE.md Normal file
View file

@ -0,0 +1,148 @@
# Accessible Video Processing Platform - Development Guide
## Project Overview
This is a comprehensive video accessibility platform that automatically generates closed captions and audio descriptions using AI, with quality control workflows and multi-language support.
**Core Tech Stack:**
- Frontend: React 18 + Vite SPA (TypeScript)
- Backend: FastAPI + Celery workers (Python 3.11+)
- Database: MongoDB Atlas
- Storage: Google Cloud Storage with signed URLs
- AI: Gemini 2.5 Pro, Google Cloud Translate, ElevenLabs TTS
- Queue: Redis + Celery
- Auth: JWT with HttpOnly refresh cookies
## Development Instructions
### CRITICAL: Always Read the Full Development Plan
**Before starting any development work, ALWAYS read the entire `video_accessibility_development_plan.txt` file.** This document contains:
- Complete technical specifications
- API contracts and schemas
- Database models and indexes
- Worker pipeline details
- Frontend component specifications
- Security requirements
- Testing strategies
The development plan is the authoritative source for all implementation details. Refer to it frequently during development to ensure consistency with the overall architecture.
## Key Implementation Phases
### Phase 1: Foundation & Setup
- Monorepo structure (backend/, frontend/, infra/)
- FastAPI backend initialization
- React + Vite frontend setup
- MongoDB and Redis configuration
- JWT authentication with RBAC
### Phase 2: Core Services
- Google Cloud Storage integration
- Gemini 2.5 Pro service
- Job model with state machine
- Celery worker infrastructure
### Phase 3: Ingestion & AI Pipeline
- Video upload system
- Ingestion worker task
- VTT generation
- Gemini prompt system
### Phase 4: Quality Control System
- VTT editor component
- QC dashboard for reviewers
- Approval/rejection workflow
- Video player with captions
### Phase 5: Translation & TTS Pipeline
- Google Cloud Translate integration
- Transcreation system
- Translation worker
- TTS service integration
### Phase 6: Final Review & Delivery
- Final review interface
- Job completion workflow
- Email notifications
- Client download portal
### Phase 7: Production Readiness
- Comprehensive testing
- Security hardening
- Observability setup
- CI/CD configuration
## Job Status State Machine
```
created → ingesting → ai_processing → pending_qc → approved_english → translating → tts_generating → pending_final_review → completed
pending_qc → rejected   (alternative QC outcome; only approved_english continues to translating)
```
## Key Architecture Decisions
### Security
- Access tokens stored in memory (not localStorage)
- Refresh tokens in HttpOnly cookies
- RBAC enforcement server-side
- Signed URLs for file access (24h expiry)
- Audit logs for all reviewer actions
### Data Flow
1. Client uploads MP4 → GCS + MongoDB record
2. Celery worker processes video with Gemini 2.5 Pro
3. Generates English captions.vtt and audio_description.vtt
4. Reviewer QC approval triggers translation pipeline
5. Multi-language assets generated (translate/transcreate + TTS)
6. Final review and client notification with download links
### File Structure
```
gs://accessible-video/{jobId}/
source.mp4
en/
captions.vtt
ad.vtt
ad.mp3
{lang}/
captions.vtt
ad.vtt
ad.mp3
```
## Development Guidelines
### Before Each Session
1. Read the complete `video_accessibility_development_plan.txt`
2. Review the current todo list and phase
3. Check existing code patterns and conventions
4. Understand the security and accessibility requirements
### Code Standards
- Follow existing patterns in the codebase
- Implement proper error handling and retries
- Add OpenTelemetry tracing for observability
- Ensure RBAC is enforced on all endpoints
- Validate all VTT outputs for correctness
- Write unit tests for all services and utilities
### Testing Requirements
- Unit tests ≥80% coverage for services/utils
- Integration tests with mocked AI services
- E2E tests for complete workflows
- Performance testing for video processing
### Lint/Type Check Commands
- Backend: `ruff check .` and `mypy .`
- Frontend: `npm run lint` and `npm run type-check`
## Important Files to Reference
- `video_accessibility_development_plan.txt` - Complete specification
- Backend schemas in section 17 of the plan
- API design in section 7 of the plan
- Frontend component specs in section 10 of the plan
- Security requirements in section 11 of the plan
## Risk Mitigations
- Invalid JSON from AI models: Pydantic validation + self-heal prompts
- Timestamp drift: Preserve cue timings in translations
- TTS alignment: Per-cue synthesis with crossfades
- Queue backlog: Autoscaling workers with monitoring
- Security: Secret Manager, least-privilege IAM, no client secrets

62
Makefile Normal file
View file

@ -0,0 +1,62 @@
# Every target in this file is a command, not a file, so all of them are
# declared phony. (The previous list named a `test` target that did not exist
# and omitted most of the real ones.)
.PHONY: help install dev dev-backend dev-frontend dev-worker \
        test test-backend test-frontend \
        lint lint-backend lint-frontend \
        build-backend build-frontend clean setup-env

# `help` is the first target and therefore the default goal. It lists every
# target that carries a trailing `## description` comment.
help: ## Show this help message
	@echo 'Usage: make [target]'
	@echo ''
	@echo 'Targets:'
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " %-15s %s\n", $$1, $$2}' $(MAKEFILE_LIST)

install: ## Install all dependencies
	@echo "Installing backend dependencies..."
	cd backend && poetry install
	@echo "Installing frontend dependencies..."
	cd frontend && npm install

dev-backend: ## Start backend development server
	cd backend && poetry run uvicorn app.main:app --reload --host 0.0.0.0 --port 8000

dev-frontend: ## Start frontend development server
	cd frontend && npm run dev

dev-worker: ## Start Celery worker
	cd backend && poetry run celery -A celery_worker.celery_app worker --loglevel=info

test: test-backend test-frontend ## Run all tests

test-backend: ## Run backend tests
	cd backend && poetry run pytest

test-frontend: ## Run frontend tests
	cd frontend && npm run test

lint-backend: ## Lint backend code
	cd backend && poetry run ruff check . && poetry run mypy .

lint-frontend: ## Lint frontend code
	cd frontend && npm run lint && npm run type-check

lint: lint-backend lint-frontend ## Lint all code

clean: ## Clean build artifacts
	cd backend && rm -rf __pycache__ .pytest_cache .mypy_cache
	cd frontend && rm -rf node_modules/.cache dist

build-backend: ## Build backend Docker image
	cd backend && docker build -t accessible-video-backend .

build-frontend: ## Build frontend for production
	cd frontend && npm run build

# Development helpers

# An existing .env is left untouched so re-running this target cannot wipe
# locally configured credentials.
setup-env: ## Copy environment templates (keeps existing .env files)
	test -f backend/.env || cp backend/.env.example backend/.env
	test -f frontend/.env || cp frontend/.env.example frontend/.env
	@echo "Environment files created. Please update with your actual values."

# Opens one tmux session with three tiled panes: API, frontend, worker.
dev: ## Start all development services (requires tmux)
	tmux new-session -d -s accessible-video
	tmux send-keys -t accessible-video 'make dev-backend' C-m
	tmux split-window -t accessible-video
	tmux send-keys -t accessible-video 'make dev-frontend' C-m
	tmux split-window -t accessible-video
	tmux send-keys -t accessible-video 'make dev-worker' C-m
	tmux select-layout -t accessible-video tiled
	tmux attach -t accessible-video

178
README.md Normal file
View file

@ -0,0 +1,178 @@
# Accessible Video Processing Platform
An AI-powered platform for generating accessible video content including closed captions, audio descriptions, and multi-language translations.
## Features
- **AI-Powered Processing**: Uses Gemini 2.5 Pro for intelligent caption and audio description generation
- **Multi-Language Support**: Automatic translation and cultural transcreation
- **Quality Control Workflow**: Built-in review and approval process
- **Audio Description**: Text-to-speech generation for voiceovers
- **Secure File Handling**: Google Cloud Storage with signed URLs
- **Role-Based Access**: Client, reviewer, and admin roles with appropriate permissions
## Tech Stack
### Backend
- **FastAPI** - Modern Python web framework
- **Celery** - Distributed task queue for video processing
- **MongoDB** - Document database for job and user data
- **Redis** - Task queue broker and caching
- **Google Cloud Services** - Storage, AI, and TTS
### Frontend
- **React 18** - UI framework
- **Vite** - Fast build tool and dev server
- **TypeScript** - Type safety
- **TanStack Query** - Data fetching and caching
- **Tailwind CSS** - Utility-first styling
## Getting Started
### Prerequisites
- Python 3.11+
- Node.js 18+
- Poetry (for Python dependency management)
- MongoDB (Atlas recommended)
- Redis
- Google Cloud Project with required APIs enabled
### Installation
1. **Clone and setup environment:**
```bash
git clone <repository>
cd accessible-video
make setup-env
```
2. **Install dependencies:**
```bash
make install
```
3. **Configure environment variables:**
- Update `backend/.env` with your database, API keys, and service credentials
- Update `frontend/.env` with your API base URL
### Development
**Start all services (requires tmux):**
```bash
make dev
```
**Or start services individually:**
```bash
# Terminal 1 - Backend API
make dev-backend
# Terminal 2 - Frontend SPA
make dev-frontend
# Terminal 3 - Celery Worker
make dev-worker
```
The application will be available at:
- Frontend: http://localhost:5173
- Backend API: http://localhost:8000
- API Docs: http://localhost:8000/docs
### Testing
```bash
# Run all tests
make test-backend
make test-frontend
# Lint code
make lint
```
## Architecture
### Job Processing Pipeline
1. **Upload**: Client uploads MP4 video
2. **Ingestion**: Video is processed and analyzed by Gemini 2.5 Pro
3. **QC Review**: Human reviewer approves/rejects English captions and audio descriptions
4. **Translation**: Approved content is translated to target languages
5. **TTS Generation**: Audio descriptions are converted to speech
6. **Final Review**: Reviewer approves final multi-language assets
7. **Delivery**: Client receives email with download links
### File Structure
```
backend/ # FastAPI application
├── app/
│ ├── api/ # REST API routes
│ ├── core/ # Configuration and shared utilities
│ ├── models/ # Pydantic data models
│ ├── services/ # External service integrations
│ ├── tasks/ # Celery background tasks
│ └── prompts/ # AI prompt templates
└── tests/ # Test suite
frontend/ # React SPA
├── src/
│ ├── components/ # Reusable UI components
│ ├── routes/ # Page components
│ ├── lib/ # Utilities and API client
│ ├── hooks/ # Custom React hooks
│ └── types/ # TypeScript definitions
└── public/ # Static assets
```
## Configuration
### Required Environment Variables
**Backend (.env):**
- `MONGODB_URI` - MongoDB connection string
- `REDIS_URL` - Redis connection string
- `JWT_SECRET` - Secret for JWT token signing
- `GEMINI_API_KEY` - Google Gemini API key
- `GCS_BUCKET` - Google Cloud Storage bucket name
- `SENDGRID_API_KEY` - SendGrid for email notifications
**Frontend (.env):**
- `VITE_API_BASE_URL` - Backend API URL
### Google Cloud Setup
1. Create a GCP project
2. Enable required APIs:
- Cloud Storage API
- Cloud Translation API
- Cloud Text-to-Speech API
- Vertex AI API (for Gemini)
3. Create service account with appropriate permissions
4. Download the service account key and point `GOOGLE_APPLICATION_CREDENTIALS` at it. Store the key file outside the repository and never commit it.
## Deployment
The application is designed for deployment on Google Cloud:
- **Backend**: Cloud Run with auto-scaling
- **Workers**: Cloud Run with Celery
- **Frontend**: Cloud Storage + Cloud CDN
- **Database**: MongoDB Atlas
- **Queue**: Cloud Memorystore (Redis)
See `/infra` directory for deployment configurations.
## Security
- JWT authentication with refresh token rotation
- Role-based access control (RBAC)
- Signed URLs for secure file access
- Audit logging for all reviewer actions
- HTTPS enforcement in production
## Development Guide
Always refer to the complete development plan in `video_accessibility_development_plan.txt` for detailed specifications and requirements. The CLAUDE.md file contains additional development guidelines and phase-by-phase implementation details.

92
backend/.dockerignore Normal file
View file

@ -0,0 +1,92 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# Poetry (keep poetry.lock for reproducible builds)
# poetry.lock
# Virtual environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Testing
.coverage
.pytest_cache/
.mypy_cache/
.tox/
htmlcov/
coverage.xml
*.cover
.hypothesis/
# Documentation
docs/
*.md
README*
# Logs
*.log
logs/
# Git
.git/
.gitignore
# Docker
Dockerfile*
.dockerignore
docker-compose*
# CI/CD
.github/
# Local development
.env.local
.env.development
.env.test
# Credentials: the local service-account key lives inside this build context,
# and the Dockerfile's `COPY . .` would otherwise bake it into every image.
optical-414516-*.json
# Temporary files
tmp/
temp/
*.tmp
*.bak

42
backend/.env Normal file
View file

@ -0,0 +1,42 @@
APP_ENV=dev
API_BASE_URL=http://localhost:8000
# Auth
JWT_SECRET=this_is_a_jwt_secret
JWT_ALG=HS256
JWT_ACCESS_TTL_MIN=240
JWT_REFRESH_TTL_DAYS=7
COOKIE_DOMAIN=localdomain.com
COOKIE_SECURE=true
COOKIE_SAMESITE=Lax
# MongoDB
MONGODB_URI=mongodb://admin:password123@localhost:27017/accessible_video?authSource=admin&replicaSet=rs0
MONGODB_DB=accessible_video
# Redis
REDIS_URL=redis://localhost:6379/0
# Celery (uses Redis)
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0
# GCP
GCP_PROJECT_ID=optical-414516
GCS_BUCKET=accessible-video
GOOGLE_APPLICATION_CREDENTIALS=/Users/michael.clervi/Documents/projects/video_accessibility/backend/optical-414516-80e2475f6412.json
# AI
# SECURITY: what appears to be a real Gemini API key was committed on the
# GEMINI_API_KEY line. It is replaced with the same placeholder used in
# .env.example. The committed value remains in git history, so revoke and
# rotate that key, and keep this file out of version control.
GEMINI_API_KEY=...
TRANSLATE_API_KEY=...
ELEVENLABS_API_KEY=...
GOOGLE_TTS_CREDENTIALS=/secrets/gcp_tts.json
# Email
SENDGRID_API_KEY=disabled_for_local_testing
EMAIL_FROM=test@localhost.com
CLIENT_BASE_URL=http://localhost:5173
# Observability
SENTRY_DSN=...
OTEL_EXPORTER_OTLP_ENDPOINT=

42
backend/.env.example Normal file
View file

@ -0,0 +1,42 @@
APP_ENV=dev
API_BASE_URL=https://api.yourdomain.com
# Auth
JWT_SECRET=change_me
JWT_ALG=HS256
JWT_ACCESS_TTL_MIN=240
JWT_REFRESH_TTL_DAYS=7
COOKIE_DOMAIN=yourdomain.com
COOKIE_SECURE=true
COOKIE_SAMESITE=Lax
# MongoDB
MONGODB_URI=mongodb://localhost:27017/accessible_video
MONGODB_DB=accessible_video
# Redis
REDIS_URL=redis://localhost:6379/0
# Celery (uses Redis)
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0
# GCP
GCP_PROJECT_ID=...
GCS_BUCKET=accessible-video
GOOGLE_APPLICATION_CREDENTIALS=/secrets/gcp.json
# AI
GEMINI_API_KEY=...
TRANSLATE_API_KEY=...
ELEVENLABS_API_KEY=...
GOOGLE_TTS_CREDENTIALS=/secrets/gcp_tts.json
# Email
SENDGRID_API_KEY=...
EMAIL_FROM=support@yourdomain.com
CLIENT_BASE_URL=https://app.yourdomain.com
# Observability
SENTRY_DSN=...
OTEL_EXPORTER_OTLP_ENDPOINT=...

127
backend/Dockerfile Normal file
View file

@ -0,0 +1,127 @@
# Build stage - install Poetry and resolve the main dependencies into /app/.venv
FROM python:3.11-slim AS builder

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install Poetry
RUN pip install poetry==1.8.2

# Set Poetry configuration.
# POETRY_VIRTUALENVS_IN_PROJECT is the environment-variable form of the
# `virtualenvs.in-project` setting; POETRY_VENV_IN_PROJECT is not a setting
# name Poetry reads, so it had no effect.
ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

# Copy dependency files
COPY pyproject.toml poetry.lock ./

# Install dependencies into the in-project venv (/app/.venv).
# NOTE(review): `poetry lock --no-update || true` rewrites the lock file during
# the build and ignores failures - confirm this is intentional; a committed,
# up-to-date poetry.lock would make this step unnecessary.
RUN poetry config virtualenvs.in-project true && \
    poetry lock --no-update || true && \
    poetry install --only=main --no-root && \
    rm -rf $POETRY_CACHE_DIR
# Base runtime stage - shared by the `production` and `worker` stages
FROM python:3.11-slim AS base
# Install runtime system dependencies (tini is the ENTRYPOINT of the derived
# stages; curl is used by the production HEALTHCHECK)
RUN apt-get update && apt-get install -y \
    ffmpeg \
    curl \
    tini \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
# Create non-root user (uid/gid 1000) that the container runs as
RUN groupadd --gid 1000 app \
    && useradd --uid 1000 --gid app --shell /bin/bash --create-home app
# Set working directory
WORKDIR /app
# Copy virtual environment from builder stage
COPY --from=builder --chown=app:app /app/.venv /app/.venv
# Ensure venv is in PATH so its executables are found without activation
ENV PATH="/app/.venv/bin:$PATH"
# Copy application code
# NOTE(review): this copies the whole build context; confirm a .dockerignore
# excludes local files such as .env and service-account keys.
COPY --chown=app:app . .
# Switch to non-root user
USER app
# Production API stage - runs the API under gunicorn
FROM base AS production
# Set environment variables for production
ENV APP_ENV=prod \
    PYTHONPATH=/app \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1
# Health check: the container is unhealthy when GET /health on port 8000 fails
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1
# Expose port
EXPOSE 8000
# Use tini as init system for proper signal handling
ENTRYPOINT ["tini", "--"]
# Default command for API server; server options come from gunicorn_conf.py
CMD ["gunicorn", "-c", "gunicorn_conf.py"]
# Worker stage for Celery workers
FROM base AS worker

# Set environment variables for worker
ENV APP_ENV=prod \
    PYTHONPATH=/app \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    C_FORCE_ROOT=1

# Health check for worker: ping this container's own worker node through the
# broker. The previous check only constructed a Celery object, which succeeds
# whenever the package is importable, even if the worker process is dead.
# The node name assumes Celery's default `celery@<hostname>` naming, since the
# CMD below does not pass -n.
HEALTHCHECK --interval=60s --timeout=15s --start-period=10s --retries=3 \
    CMD celery -A app.tasks inspect ping -d "celery@$(hostname)" --timeout 10 || exit 1

# Use tini as init system for proper signal handling
ENTRYPOINT ["tini", "--"]

# Default command for Celery worker
CMD ["celery", "-A", "app.tasks", "worker", "--loglevel=info", "--concurrency=1"]
# Development stage with dev dependencies
FROM builder AS development

# Install all dependencies including dev
RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR

# Install additional dev tools
RUN apt-get update && apt-get install -y \
    git \
    vim \
    && rm -rf /var/lib/apt/lists/*

# Create the non-root user. This stage derives from `builder`, not `base`, so
# the `app` user does not exist yet; without it `COPY --chown=app:app` and
# `USER app` below cannot resolve the account.
RUN groupadd --gid 1000 app \
    && useradd --uid 1000 --gid app --shell /bin/bash --create-home app

# Put the in-project venv on PATH (also only set in `base`), otherwise the
# `uvicorn` executable used by CMD is not found.
ENV PATH="/app/.venv/bin:$PATH"

# Copy application code
COPY --chown=app:app . .

# Switch to non-root user
USER app

# Set environment for development
ENV APP_ENV=dev \
    PYTHONPATH=/app \
    PYTHONUNBUFFERED=1

EXPOSE 8000

# Development command with hot reload
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,770 @@
from datetime import datetime, timedelta
from typing import Optional
from bson import ObjectId
from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
from motor.motor_asyncio import AsyncIOMotorDatabase
from ...core.database import get_database
from ...core.dependencies import get_current_user, require_roles
from ...core.logging import get_logger
from ...core.security import get_password_hash, verify_password
from ...models.user import User, UserRole
from ...models.audit_log import AuditLogQuery, AuditLogResponse
from ...schemas.auth import (
AdminStatsResponse,
ChangePasswordRequest,
CreateUserRequest,
ResetPasswordRequest,
UpdateUserRequest,
UserListResponse,
UserResponse,
)
from ...services.audit_logger import audit_logger, log_user_management, log_security_event
from ...telemetry import app_metrics
# Module-level logger used by the admin route handlers below
logger = get_logger(__name__)
# Every route registered on this router is served under the "/admin" prefix
router = APIRouter(prefix="/admin", tags=["admin"])
@router.get("/users", response_model=UserListResponse)
async def list_users(
    page: int = Query(1, ge=1),
    size: int = Query(20, ge=1, le=100),
    role: Optional[str] = Query(None),
    active_only: bool = Query(True),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Return one page of users, newest first (admin only).

    Args:
        page: 1-based page number.
        size: Page size (1-100).
        role: When given, only users whose ``role`` equals this value.
        active_only: When true, only users with ``is_active`` set.

    Returns:
        UserListResponse with the page of users, the total number of users
        matching the filter, and the echoed ``page``/``size``.
    """
    filters = {}
    if role:
        filters["role"] = role
    if active_only:
        filters["is_active"] = True

    # Total is computed over the whole filter, not just the current page
    total = await db.users.count_documents(filters)

    offset = (page - 1) * size
    # The projection keeps the password hash out of the fetched documents
    docs = await (
        db.users.find(filters, {"hashed_password": 0})
        .sort("created_at", -1)
        .skip(offset)
        .limit(size)
        .to_list(length=size)
    )

    return UserListResponse(
        users=[
            UserResponse(
                id=str(doc["_id"]),
                email=doc["email"],
                full_name=doc["full_name"],
                role=doc["role"],
                is_active=doc["is_active"],
                # Documents without created_at fall back to the current time
                created_at=doc.get("created_at", datetime.utcnow()).isoformat(),
            )
            for doc in docs
        ],
        total=total,
        page=page,
        size=size,
    )
@router.get("/users/{user_id}", response_model=UserResponse)
async def get_user(
    user_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Fetch a single user by its ``_id`` (admin only).

    The password hash is excluded by the query projection.

    Raises:
        HTTPException: 404 when no user document has the given ``_id``.
    """
    found = await db.users.find_one({"_id": user_id}, {"hashed_password": 0})
    if found:
        return UserResponse(
            id=str(found["_id"]),
            email=found["email"],
            full_name=found["full_name"],
            role=found["role"],
            is_active=found["is_active"],
            # Documents without created_at fall back to the current time
            created_at=found.get("created_at", datetime.utcnow()).isoformat(),
        )

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="User not found",
    )
@router.post("/users", response_model=UserResponse, status_code=status.HTTP_201_CREATED)
async def create_user(
    user_data: CreateUserRequest,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Create a new, active user account (admin only).

    The new document uses a stringified ObjectId as ``_id`` and stores only
    the hash of the supplied password.

    Raises:
        HTTPException: 400 when a user with the same email already exists.
    """
    # Reject duplicates up front
    if await db.users.find_one({"email": user_data.email}):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="User with this email already exists",
        )

    user_id = str(ObjectId())
    user_doc = dict(
        _id=user_id,
        email=user_data.email,
        hashed_password=get_password_hash(user_data.password),
        full_name=user_data.full_name,
        role=user_data.role.value,
        is_active=True,
        created_at=datetime.utcnow(),
        updated_at=datetime.utcnow(),
    )
    await db.users.insert_one(user_doc)

    # Metrics and log line for the creation event
    app_metrics.record_auth_attempt("user_created", user_data.role.value)
    logger.info(f"Admin {current_user.id} created user {user_id} with role {user_data.role.value}")

    created_iso = user_doc["created_at"].isoformat()
    return UserResponse(
        id=user_id,
        email=user_data.email,
        full_name=user_data.full_name,
        role=user_data.role,
        is_active=True,
        created_at=created_iso,
    )
@router.patch("/users/{user_id}", response_model=UserResponse)
async def update_user(
    user_id: str,
    user_update: UpdateUserRequest,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Update user details (admin only).

    Only the fields supplied in ``user_update`` are written; ``updated_at`` is
    always refreshed.

    Raises:
        HTTPException: 404 when the user does not exist (either at lookup or
            at update time), 400 when the new email belongs to another user.
    """
    # Check if user exists
    user_doc = await db.users.find_one({"_id": user_id})
    if not user_doc:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found"
        )

    # Check if email is being changed and doesn't conflict
    if user_update.email and user_update.email != user_doc["email"]:
        existing_user = await db.users.find_one({"email": user_update.email, "_id": {"$ne": user_id}})
        if existing_user:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Email already in use by another user"
            )

    # Build update document
    update_data = {"updated_at": datetime.utcnow()}
    if user_update.email:
        update_data["email"] = user_update.email
    if user_update.full_name:
        update_data["full_name"] = user_update.full_name
    if user_update.role:
        update_data["role"] = user_update.role.value
    if user_update.is_active is not None:
        # Explicit None check so that is_active=False is still applied
        update_data["is_active"] = user_update.is_active

    # Update user; return_document=True asks for the post-update document
    result = await db.users.find_one_and_update(
        {"_id": user_id},
        {"$set": update_data},
        return_document=True
    )
    if result is None:
        # The document vanished between the lookup above and this update;
        # report it as missing instead of failing on result["_id"] below.
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found"
        )

    logger.info(f"Admin {current_user.id} updated user {user_id}")

    return UserResponse(
        id=str(result["_id"]),
        email=result["email"],
        full_name=result["full_name"],
        role=result["role"],
        is_active=result["is_active"],
        created_at=result.get("created_at", datetime.utcnow()).isoformat()
    )
@router.delete("/users/{user_id}")
async def deactivate_user(
    user_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Soft-delete a user by clearing ``is_active`` (admin only).

    Raises:
        HTTPException: 400 when the admin targets their own account,
            404 when no user matches ``user_id``.
    """
    targeting_self = str(current_user.id) == user_id
    if targeting_self:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Cannot deactivate your own account",
        )

    changes = {"is_active": False, "updated_at": datetime.utcnow()}
    outcome = await db.users.update_one({"_id": user_id}, {"$set": changes})

    # matched_count is 0 when the filter selected no document
    if outcome.matched_count == 0:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found",
        )

    logger.info(f"Admin {current_user.id} deactivated user {user_id}")
    return {"message": "User deactivated successfully"}
@router.post("/users/{user_id}/reset-password")
async def admin_reset_password(
    user_id: str,
    reset_request: ResetPasswordRequest,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Replace a user's password with a random temporary one (admin only).

    A 12-character alphanumeric password is generated with ``secrets``, its
    hash is stored, and the plain value is returned in the response body.
    ``reset_request`` is accepted but not read by this handler.

    Raises:
        HTTPException: 404 when no user matches ``user_id``.
    """
    import secrets
    import string

    alphabet = string.ascii_letters + string.digits
    temp_password = ''.join(secrets.choice(alphabet) for _ in range(12))
    new_hash = get_password_hash(temp_password)

    changes = {"hashed_password": new_hash, "updated_at": datetime.utcnow()}
    outcome = await db.users.update_one({"_id": user_id}, {"$set": changes})
    if outcome.matched_count == 0:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found",
        )

    logger.info(f"Admin {current_user.id} reset password for user {user_id}")

    # In production the temporary password should be emailed, not returned
    return {
        "message": "Password reset successfully",
        "temporary_password": temp_password,  # Remove this in production, send via email
    }
@router.get("/stats", response_model=AdminStatsResponse)
async def get_admin_stats(
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Get system statistics (admin only).

    Returns:
        AdminStatsResponse with the number of active users, the total job
        count, job counts grouped by status, the number of jobs created since
        midnight UTC, and the average ``updated_at - created_at`` of completed
        jobs in hours (0.0 when it cannot be computed).
    """
    # Get user count (active accounts only)
    total_users = await db.users.count_documents({"is_active": True})

    # Get job counts
    total_jobs = await db.jobs.count_documents({})

    # Get jobs by status
    pipeline = [
        {"$group": {"_id": "$status", "count": {"$sum": 1}}}
    ]
    status_counts = await db.jobs.aggregate(pipeline).to_list(None)
    jobs_by_status = {item["_id"]: item["count"] for item in status_counts}

    # Get jobs created today (since 00:00 UTC)
    today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    active_jobs_today = await db.jobs.count_documents({
        "created_at": {"$gte": today_start}
    })

    # Calculate average processing time for completed jobs
    avg_processing_pipeline = [
        {"$match": {"status": "completed", "created_at": {"$exists": True}, "updated_at": {"$exists": True}}},
        {
            "$project": {
                "processing_time_hours": {
                    "$divide": [
                        {"$subtract": ["$updated_at", "$created_at"]},
                        3600000  # Convert milliseconds to hours
                    ]
                }
            }
        },
        {
            "$group": {
                "_id": None,
                "avg_processing_time": {"$avg": "$processing_time_hours"}
            }
        }
    ]
    avg_result = await db.jobs.aggregate(avg_processing_pipeline).to_list(None)
    # $avg yields null when no document contributed a numeric value (e.g. the
    # date fields exist but are null), so fall back to 0.0 instead of passing
    # None to round().
    avg_processing_time = (avg_result[0]["avg_processing_time"] if avg_result else None) or 0.0

    return AdminStatsResponse(
        total_users=total_users,
        total_jobs=total_jobs,
        jobs_by_status=jobs_by_status,
        active_jobs_today=active_jobs_today,
        avg_processing_time_hours=round(avg_processing_time, 2)
    )
@router.get("/health/detailed")
async def detailed_health_check(
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.REVIEWER)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Detailed health check with system component status (admin/reviewer only).

    Probes MongoDB, Redis, GCS and the Celery workers in turn. A failing probe
    is recorded under ``components`` with its error text and downgrades the
    overall ``status`` to ``"degraded"``; it never raises.

    Returns:
        dict with ``status``, an ISO ``timestamp`` and per-component results.
    """
    import asyncio  # local import: only this handler needs it

    health_status = {
        "status": "healthy",
        "timestamp": datetime.utcnow().isoformat(),
        "components": {}
    }

    # Check MongoDB
    try:
        await db.command("ping")
        health_status["components"]["mongodb"] = {"status": "healthy"}
    except Exception as e:
        health_status["components"]["mongodb"] = {"status": "unhealthy", "error": str(e)}
        health_status["status"] = "degraded"

    # Check Redis (via import to avoid circular dependency)
    try:
        from ...core.redis import redis_client
        if redis_client:
            await redis_client.ping()
            health_status["components"]["redis"] = {"status": "healthy"}
        else:
            health_status["components"]["redis"] = {"status": "not_configured"}
    except Exception as e:
        health_status["components"]["redis"] = {"status": "unhealthy", "error": str(e)}
        health_status["status"] = "degraded"

    # Check GCS (basic check)
    try:
        from ...services.gcs import gcs_service
        # Only whether the call raises matters; the boolean result is ignored
        await gcs_service.file_exists("health_check_dummy")
        health_status["components"]["gcs"] = {"status": "healthy"}
    except Exception as e:
        health_status["components"]["gcs"] = {"status": "unhealthy", "error": str(e)}
        health_status["status"] = "degraded"

    # Check job queue health
    try:
        from ...tasks import celery_app
        inspector = celery_app.control.inspect()
        # inspect().active() is a synchronous call that waits for worker
        # replies; run it in a worker thread so the event loop is not blocked.
        active_tasks = await asyncio.to_thread(inspector.active)
        if active_tasks:
            total_active = sum(len(tasks) for tasks in active_tasks.values())
            health_status["components"]["celery"] = {
                "status": "healthy",
                "active_tasks": total_active,
                "workers": len(active_tasks)
            }
        else:
            # No worker replied; reported but does not degrade overall status
            health_status["components"]["celery"] = {
                "status": "no_workers",
                "active_tasks": 0,
                "workers": 0
            }
    except Exception as e:
        health_status["components"]["celery"] = {"status": "unhealthy", "error": str(e)}
        health_status["status"] = "degraded"

    return health_status
@router.get("/jobs/stats")
async def get_job_statistics(
    days: int = Query(7, ge=1, le=90),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.REVIEWER)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Get job processing statistics (admin/reviewer only).

    Args:
        days: Size of the look-back window in days (1-90).

    Returns:
        dict with the number of jobs created and completed in the window, the
        completion rate in percent, avg/min/max processing time in hours
        (0 when not computable) and the current job count per status.
    """
    since_date = datetime.utcnow() - timedelta(days=days)

    # Jobs created in period
    jobs_in_period = await db.jobs.count_documents({
        "created_at": {"$gte": since_date}
    })

    # Jobs completed in period (by last update time, not creation time)
    jobs_completed = await db.jobs.count_documents({
        "status": "completed",
        "updated_at": {"$gte": since_date}
    })

    # Average processing time for completed jobs
    avg_pipeline = [
        {
            "$match": {
                "status": "completed",
                "created_at": {"$gte": since_date},
                "updated_at": {"$exists": True}
            }
        },
        {
            "$project": {
                "processing_time_hours": {
                    "$divide": [
                        {"$subtract": ["$updated_at", "$created_at"]},
                        3600000  # milliseconds per hour
                    ]
                }
            }
        },
        {
            "$group": {
                "_id": None,
                "avg_time": {"$avg": "$processing_time_hours"},
                "min_time": {"$min": "$processing_time_hours"},
                "max_time": {"$max": "$processing_time_hours"}
            }
        }
    ]
    avg_result = await db.jobs.aggregate(avg_pipeline).to_list(None)
    processing_stats = avg_result[0] if avg_result else {}
    # The accumulators yield null when no document contributed a numeric value
    # (e.g. updated_at exists but is null); coerce to 0 so round() cannot
    # receive None.
    avg_time = processing_stats.get("avg_time") or 0
    min_time = processing_stats.get("min_time") or 0
    max_time = processing_stats.get("max_time") or 0

    # Current queue status (all jobs, not limited to the window)
    current_queue_stats = {}
    pipeline = [
        {"$group": {"_id": "$status", "count": {"$sum": 1}}}
    ]
    status_counts = await db.jobs.aggregate(pipeline).to_list(None)
    for item in status_counts:
        current_queue_stats[item["_id"]] = item["count"]

    return {
        "period_days": days,
        "jobs_created": jobs_in_period,
        "jobs_completed": jobs_completed,
        # max(..., 1) avoids division by zero when nothing was created
        "completion_rate": round(jobs_completed / max(jobs_in_period, 1) * 100, 2),
        "avg_processing_time_hours": round(avg_time, 2),
        "min_processing_time_hours": round(min_time, 2),
        "max_processing_time_hours": round(max_time, 2),
        "current_queue_status": current_queue_stats
    }
@router.post("/users/{user_id}/password/reset")
async def admin_force_password_reset(
    user_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Force a password reset for another user (admin only).

    Generates a 16-character temporary password (letters, digits and
    ``!@#$%``), stores its hash and returns the plain value in the response.

    Raises:
        HTTPException: 400 when the admin targets their own account,
            404 when no user matches ``user_id``.
    """
    import secrets
    import string

    if str(current_user.id) == user_id:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Cannot reset your own password this way",
        )

    # The target must exist before anything is generated or written
    target = await db.users.find_one({"_id": user_id})
    if not target:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found",
        )

    charset = string.ascii_letters + string.digits + "!@#$%"
    temp_password = ''.join(secrets.choice(charset) for _ in range(16))

    changes = {
        "hashed_password": get_password_hash(temp_password),
        "updated_at": datetime.utcnow(),
    }
    await db.users.update_one({"_id": user_id}, {"$set": changes})

    # TODO: In production, send via secure email instead of returning password
    logger.info(f"Admin {current_user.id} reset password for user {user_id}")

    return {
        "message": "Password reset successfully",
        "temporary_password": temp_password,
        "note": "User should change this password immediately",
    }
@router.get("/audit-logs")
async def get_audit_logs(
    job_id: Optional[str] = Query(None),
    action: Optional[str] = Query(None),
    days: int = Query(7, ge=1, le=90),
    page: int = Query(1, ge=1),
    size: int = Query(50, ge=1, le=200),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Get audit logs with filtering (admin only).

    Filters on the ``when`` field of ``audit_logs`` within the last ``days``
    days, optionally narrowed by ``job_id`` and ``action``, newest first.

    NOTE(review): a second handler with the same name is registered for
    GET /audit-logs further down in this module (filtering on ``timestamp``).
    Routes are presumably matched in registration order, so this handler
    would shadow it - confirm which one is intended and drop the other.

    Returns:
        dict with the page of ``logs``, ``total`` matching documents and the
        echoed ``page``, ``size`` and ``period_days``.
    """
    query = {
        "when": {"$gte": datetime.utcnow() - timedelta(days=days)}
    }
    if job_id:
        query["job_id"] = job_id
    if action:
        query["action"] = action

    # Get total count
    total = await db.audit_logs.count_documents(query)

    # Get paginated results
    skip = (page - 1) * size
    cursor = (
        db.audit_logs.find(query)
        .sort("when", -1)
        .skip(skip)
        .limit(size)
    )
    logs = await cursor.to_list(length=size)

    # Raw documents are returned without a response model; stringify _id so a
    # non-JSON-native identifier (e.g. a BSON ObjectId) cannot break response
    # encoding. A no-op when _id is already a string.
    for entry in logs:
        if "_id" in entry:
            entry["_id"] = str(entry["_id"])

    return {
        "logs": logs,
        "total": total,
        "page": page,
        "size": size,
        "period_days": days
    }
@router.post("/maintenance/reprocess-job/{job_id}")
async def reprocess_job(
    job_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Reset a job to ``created`` and queue it for ingestion again (admin only).

    Clears the job's ``error``, appends a ``reprocessing`` entry to
    ``review.history`` and then enqueues ``ingest_and_ai_task``.

    Raises:
        HTTPException: 404 when no job matches ``job_id``.
    """
    existing = await db.jobs.find_one({"_id": job_id})
    if not existing:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Job not found",
        )

    # Field resets applied to the job document
    reset_fields = {
        "status": "created",
        "error": None,
        "updated_at": datetime.utcnow(),
    }
    # History entry recording who triggered the reprocessing
    history_entry = {
        "at": datetime.utcnow(),
        "status": "reprocessing",
        "by": str(current_user.id),
        "notes": "Admin-triggered reprocessing",
    }
    await db.jobs.update_one(
        {"_id": job_id},
        {"$set": reset_fields, "$push": {"review.history": history_entry}},
    )

    # Imported here, after the update, exactly as the handler has always done
    from ...tasks.ingest_and_ai import ingest_and_ai_task
    ingest_and_ai_task.delay(job_id)

    logger.warning(f"Admin {current_user.id} triggered reprocessing for job {job_id}")
    return {"message": f"Job {job_id} queued for reprocessing"}
@router.get("/audit-logs", response_model=AuditLogResponse)
async def get_audit_logs(
    # Time range
    start_date: Optional[datetime] = Query(None, description="Start date for audit logs"),
    end_date: Optional[datetime] = Query(None, description="End date for audit logs"),
    # Filters
    action: Optional[str] = Query(None, description="Filter by action type"),
    severity: Optional[str] = Query(None, description="Filter by severity level"),
    user_email: Optional[str] = Query(None, description="Filter by user email"),
    resource_type: Optional[str] = Query(None, description="Filter by resource type"),
    resource_id: Optional[str] = Query(None, description="Filter by resource ID"),
    success: Optional[bool] = Query(None, description="Filter by success status"),
    # Search
    search: Optional[str] = Query(None, description="Search in description and details"),
    # Pagination
    page: int = Query(1, ge=1, description="Page number"),
    size: int = Query(50, ge=1, le=500, description="Page size"),
    # Sorting
    sort_by: str = Query("timestamp", description="Field to sort by"),
    sort_order: int = Query(-1, ge=-1, le=1, description="Sort order (-1 desc, 1 asc)"),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    request: Request = None,
):
    """Get audit logs with filtering and pagination (admin only).

    The access itself is written to the audit log before the query runs.

    NOTE(review): an earlier handler in this module is also registered for
    GET /audit-logs; routes are presumably matched in registration order, so
    this handler may be unreachable - confirm and remove the duplicate.

    Raises:
        HTTPException: 400 when ``sort_order`` is 0.

    Returns:
        Whatever ``audit_logger.query_logs`` returns for the built query.
    """
    # The ge/le bounds admit 0, which is not a sort direction; reject it here
    # rather than passing it on to the query layer.
    if sort_order == 0:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="sort_order must be -1 (descending) or 1 (ascending)"
        )

    # Log audit log access
    await audit_logger.log_action(
        action="admin.audit.access",
        description=f"Admin {current_user.email} accessed audit logs",
        user=current_user,
        request=request,
        details={
            "filters": {
                "start_date": start_date.isoformat() if start_date else None,
                "end_date": end_date.isoformat() if end_date else None,
                "action": action,
                "severity": severity,
                "user_email": user_email,
                "resource_type": resource_type,
                "search": search
            }
        }
    )

    # Build query; page/size are translated to skip/limit
    query = AuditLogQuery(
        start_date=start_date,
        end_date=end_date,
        action=action,
        severity=severity,
        user_email=user_email,
        resource_type=resource_type,
        resource_id=resource_id,
        success=success,
        search=search,
        skip=(page - 1) * size,
        limit=size,
        sort_by=sort_by,
        sort_order=sort_order
    )

    return await audit_logger.query_logs(query)
@router.get("/audit-logs/user/{user_id}")
async def get_user_audit_logs(
    user_id: str,
    days: int = Query(30, ge=1, le=365, description="Number of days to look back"),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    request: Request = None,
):
    """Return the audit trail of one user over the last ``days`` days (admin only).

    The lookup is itself recorded as an ``admin.audit.access`` audit entry.

    Raises:
        HTTPException: 400 when ``user_id`` is not parseable as an ObjectId.
    """
    # Parse only to validate the format; the parsed value is discarded
    try:
        ObjectId(user_id)
    except Exception:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid user ID format",
        )

    access_record = dict(
        action="admin.audit.access",
        description=f"Admin {current_user.email} accessed user audit logs for {user_id}",
        user=current_user,
        request=request,
        resource_type="user",
        resource_id=user_id,
        details={"days_requested": days},
    )
    await audit_logger.log_action(**access_record)

    activity = await audit_logger.get_user_activity(user_id, days)
    return {"logs": activity, "user_id": user_id, "days": days}
@router.get("/audit-logs/security")
async def get_security_events(
    hours: int = Query(24, ge=1, le=168, description="Number of hours to look back"),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    request: Request = None,
):
    """Return security events from the last ``hours`` hours (admin only).

    The lookup is itself recorded as an ``admin.audit.access`` audit entry.
    """
    access_record = dict(
        action="admin.audit.access",
        description=f"Admin {current_user.email} accessed security events",
        user=current_user,
        request=request,
        details={"hours_requested": hours},
    )
    await audit_logger.log_action(**access_record)

    events = await audit_logger.get_security_events(hours)
    return {"logs": events, "hours": hours}
@router.delete("/audit-logs/cleanup")
async def cleanup_audit_logs(
    retention_days: int = Query(365, ge=30, le=2555, description="Retention period in days"),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    request: Request = None,
):
    """Delete audit logs older than ``retention_days`` (admin only).

    Two audit entries are written: one before the cleanup starts (severity
    ``warning``) and one afterwards carrying the number of deleted logs.

    Returns:
        dict with a human-readable message, ``deleted_count`` and
        ``retention_days``.
    """
    # Record the intent before anything is removed
    await audit_logger.log_action(
        action="admin.system.action",
        description=f"Admin {current_user.email} initiated audit log cleanup",
        user=current_user,
        request=request,
        details={"retention_days": retention_days},
        severity="warning",
    )

    deleted_count = await audit_logger.cleanup_old_logs(retention_days)

    # Record the outcome
    completion_details = {
        "retention_days": retention_days,
        "deleted_count": deleted_count,
    }
    await audit_logger.log_action(
        action="admin.system.action",
        description=f"Audit log cleanup completed: {deleted_count} logs deleted",
        user=current_user,
        request=request,
        details=completion_details,
    )

    summary = f"Deleted {deleted_count} audit logs older than {retention_days} days"
    return {
        "message": summary,
        "deleted_count": deleted_count,
        "retention_days": retention_days,
    }

View file

@ -0,0 +1,161 @@
from fastapi import APIRouter, Depends, HTTPException, Request, Response, status
from fastapi.security import HTTPBearer
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
from ...core.config import settings
from ...core.database import get_database
from ...core.security import (
create_access_token,
create_refresh_token,
decode_token,
verify_password,
)
from ...models.user import User
from ...schemas.auth import LoginRequest, LoginResponse, LogoutResponse, RefreshResponse
# Every route registered on this router is served under the "/auth" prefix
router = APIRouter(prefix="/auth", tags=["auth"])
# Bearer-token scheme instance; not referenced by the handlers visible in this module
security = HTTPBearer()
@router.post("/login", response_model=LoginResponse)
async def login(
    login_data: LoginRequest,
    response: Response,
):
    """Authenticate by email and password and issue tokens.

    On success the access token is returned in the body and the refresh token
    is set as an HttpOnly cookie named ``refresh_token``.

    Raises:
        HTTPException: 401 when the email is unknown, the password does not
            verify, or the account is disabled.
    """
    import logging  # local import: diagnostics for this handler only

    log = logging.getLogger(__name__)
    # Debug-level diagnostics replace the former print() calls, which wrote
    # the user's email address to stdout on every login attempt.
    log.debug("LOGIN: starting login attempt")

    # Create database connection directly (bypass dependency injection issues)
    # NOTE(review): this opens and closes a client for every login; confirm
    # whether the shared connection from get_database can be used instead.
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]

    try:
        # Find user by email
        user_doc = await db.users.find_one({"email": login_data.email})
        log.debug("LOGIN: user lookup complete, found: %s", user_doc is not None)

        if not user_doc:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Incorrect email or password",
            )

        user = User(**user_doc)

        # Verify password (same error text as the unknown-email case)
        if not verify_password(login_data.password, user.hashed_password):
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Incorrect email or password",
            )

        if not user.is_active:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="User account is disabled",
            )

        # Create tokens
        access_token = create_access_token(subject=str(user.id))
        refresh_token = create_refresh_token(subject=str(user.id))

        # Set refresh token as HttpOnly cookie; the cookie domain is only
        # applied when app_env is "prod"
        response.set_cookie(
            key="refresh_token",
            value=refresh_token,
            httponly=True,
            secure=settings.cookie_secure,
            samesite=settings.cookie_samesite,
            domain=settings.cookie_domain if settings.app_env == "prod" else None,
            max_age=settings.jwt_refresh_ttl_days * 24 * 60 * 60,
        )

        return LoginResponse(
            access_token=access_token,
            user_id=str(user.id),
            role=user.role,
        )
    finally:
        # Close database connection on every path, including the 401s above
        client.close()
@router.post("/refresh", response_model=RefreshResponse)
async def refresh_token(
    request: Request,
    response: Response,
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Exchange the ``refresh_token`` cookie for a new access token.

    A new refresh token is issued as well and replaces the cookie.

    Raises:
        HTTPException: 401 when the cookie is missing, the token is not a
            refresh token or has no subject, the user no longer exists or is
            disabled, or any other error occurs while processing the token.
    """
    refresh_token = request.cookies.get("refresh_token")
    if not refresh_token:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Refresh token not found",
        )

    try:
        payload = decode_token(refresh_token)
        if payload.get("type") != "refresh":
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid token type",
            )

        user_id = payload.get("sub")
        if not user_id:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid token",
            )

        # Verify user still exists and is active
        user_doc = await db.users.find_one({"_id": user_id})
        if not user_doc:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="User not found",
            )

        user = User(**user_doc)
        if not user.is_active:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="User account is disabled",
            )

        # Create new tokens
        new_access_token = create_access_token(subject=user_id)
        new_refresh_token = create_refresh_token(subject=user_id)

        # Update refresh token cookie
        response.set_cookie(
            key="refresh_token",
            value=new_refresh_token,
            httponly=True,
            secure=settings.cookie_secure,
            samesite=settings.cookie_samesite,
            domain=settings.cookie_domain if settings.app_env == "prod" else None,
            max_age=settings.jwt_refresh_ttl_days * 24 * 60 * 60,
        )

        return RefreshResponse(access_token=new_access_token)

    except HTTPException:
        # Let the specific 401s raised above through unchanged; without this
        # clause the generic handler below replaced every one of their detail
        # messages with "Invalid refresh token".
        raise
    except Exception:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid refresh token",
        )
@router.post("/logout", response_model=LogoutResponse)
async def logout(response: Response):
    """Remove the ``refresh_token`` cookie and acknowledge the logout."""
    # The cookie domain is only applied when app_env is "prod", mirroring how
    # the cookie is set at login
    cookie_domain = settings.cookie_domain if settings.app_env == "prod" else None
    response.delete_cookie(
        key="refresh_token",
        domain=cookie_domain,
        secure=settings.cookie_secure,
        httponly=True,
        samesite=settings.cookie_samesite,
    )
    return LogoutResponse()

View file

@ -0,0 +1,51 @@
from fastapi import APIRouter, Depends, HTTPException, status
from motor.motor_asyncio import AsyncIOMotorDatabase
from ...core.database import get_database
from ...core.dependencies import get_current_user
from ...models.user import User
from ...schemas.file import SignedUploadRequest, SignedUploadResponse
from ...services.gcs import generate_signed_upload_url
# Every route registered on this router is served under the "/files" prefix
router = APIRouter(prefix="/files", tags=["files"])
@router.post("/signed-upload", response_model=SignedUploadResponse)
async def get_signed_upload_url(
    request: SignedUploadRequest,
    current_user: User = Depends(get_current_user),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """
    Generate a signed URL for direct browser-to-GCS upload
    This optimizes large file uploads by bypassing the API server

    The object is placed under ``temp/<new ObjectId>/<file name>``. Only the
    final path component of the client-supplied filename is used.

    Raises:
        HTTPException: 400 when the content type is not ``video/*`` or the
            filename has no usable final component, 500 when URL generation
            fails.
    """
    import os  # local import: only used for filename sanitising here
    from bson import ObjectId

    if not request.content_type.startswith("video/"):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Only video files are supported"
        )

    # The filename is untrusted input: drop any directory part (both "/" and
    # "\" separators) so it cannot add or escape path segments of the object
    # name. Assumes request.filename is a str - TODO confirm against the schema.
    safe_name = os.path.basename(request.filename.replace("\\", "/"))
    if not safe_name or safe_name in {".", ".."}:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid filename"
        )

    # Generate unique blob path
    blob_path = f"temp/{ObjectId()}/{safe_name}"

    try:
        # Generate signed upload URL with form fields
        # NOTE(review): max_size comes from the client without an upper bound
        signed_data = await generate_signed_upload_url(
            blob_path=blob_path,
            content_type=request.content_type,
            max_size=request.max_size or 1024 * 1024 * 1024  # 1GB default
        )

        return SignedUploadResponse(
            upload_url=signed_data["url"],
            fields=signed_data["fields"],
            blob_path=blob_path
        )
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate signed upload URL: {str(e)}"
        )

File diff suppressed because it is too large Load diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,77 @@
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    # Application settings, populated by pydantic-settings; values are also
    # read from the ".env" file named in Config below. Fields declared without
    # a default (jwt_secret, mongodb_uri, redis_url, gcp_project_id,
    # gemini_api_key, sendgrid_api_key, email_from, client_base_url) have no
    # fallback value in this class.

    # App
    app_env: str = "dev"
    api_base_url: str = "http://localhost:8000"
    # Auth
    jwt_secret: str
    jwt_alg: str = "HS256"
    jwt_access_ttl_min: int = 15
    jwt_refresh_ttl_days: int = 7
    cookie_domain: str = "localhost"
    cookie_secure: bool = False
    cookie_samesite: str = "Lax"
    # Database
    mongodb_uri: str
    mongodb_db: str = "accessible_video"
    # Redis
    redis_url: str
    # Celery
    celery_broker_url: str = ""
    celery_result_backend: str = ""
    # GCP
    gcp_project_id: str
    gcs_bucket: str = "accessible-video"
    google_application_credentials: str = ""
    # AI Services
    gemini_api_key: str
    translate_api_key: str = ""
    elevenlabs_api_key: str = ""
    google_tts_credentials: str = ""
    # TTS Voice Configuration
    tts_provider: str = "google"  # "google" or "elevenlabs"
    # Language tag -> voice name for the Google provider
    google_tts_voices: dict[str, str] = {
        "en-US": "en-US-Neural2-D",
        "es-ES": "es-ES-Neural2-A",
        "fr-FR": "fr-FR-Neural2-A",
        "de-DE": "de-DE-Neural2-B"
    }
    # Language tag -> voice identifier for the ElevenLabs provider
    elevenlabs_voices: dict[str, str] = {
        "en-US": "21m00Tcm4TlvDq8ikWAM",
        "es-ES": "VR6AewLTigWG4xSOukaG",
        "fr-FR": "TxGEqnHWrfWFTfGW9XjX",
        "de-DE": "pNInz6obpgDQGcFmaJgB"
    }
    # Email
    sendgrid_api_key: str
    email_from: str
    client_base_url: str
    # Observability
    sentry_dsn: str = ""
    otel_exporter_otlp_endpoint: str = ""
    # CORS
    cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"]

    class Config:
        # File the settings are additionally loaded from
        env_file = ".env"
# Single module-level instance, built once at import time
settings: Settings = Settings()


def get_settings() -> Settings:
    """Return the module-level ``settings`` object (usable as a dependency)."""
    return settings

View file

@ -0,0 +1,67 @@
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
from ..core.logging import get_logger
from .config import settings
# Module-level logger for connection lifecycle messages
logger = get_logger(__name__)


class MongoDB:
    # Plain holder for the shared Motor client and database handle.
    # Both stay None until connect_to_mongo() assigns them.
    client: AsyncIOMotorClient = None
    database: AsyncIOMotorDatabase = None


# Single shared holder used by the functions in this module
mongodb = MongoDB()
async def connect_to_mongo():
    """Create the shared Motor client, select the database and verify with a ping.

    Raises:
        Exception: whatever the ``ping`` command raises, after it is logged.
    """
    logger.info("Connecting to MongoDB...")

    motor_client = AsyncIOMotorClient(settings.mongodb_uri)
    mongodb.client = motor_client
    mongodb.database = motor_client[settings.mongodb_db]

    # Round-trip to the server so a bad URI or unreachable host fails here
    try:
        await mongodb.client.admin.command('ping')
    except Exception as e:
        logger.error(f"Failed to connect to MongoDB: {e}")
        raise
    else:
        logger.info("Successfully connected to MongoDB")
async def close_mongo_connection():
    """Close the shared Motor client if one was created."""
    logger.info("Closing MongoDB connection...")
    active_client = mongodb.client
    if not active_client:
        return
    active_client.close()
async def get_database() -> AsyncIOMotorDatabase:
    """Return the shared database handle held by ``mongodb``.

    The value is whatever ``mongodb.database`` currently holds, i.e. ``None``
    until ``connect_to_mongo`` has assigned it.
    """
    shared_db = mongodb.database
    return shared_db
async def create_indexes():
    """Create the indexes for the jobs, users and audit_logs collections.

    Each index is created with its own awaited ``create_index`` call, in the
    order listed below.
    """
    db = mongodb.database

    # Jobs collection indexes
    job_index_specs = (
        [("status", 1), ("created_at", -1)],
        [("client_id", 1)],
    )
    for keys in job_index_specs:
        await db.jobs.create_index(keys)

    # Users collection indexes
    await db.users.create_index([("email", 1)], unique=True)

    # Audit logs collection indexes - comprehensive indexing for audit queries
    audit_index_specs = (
        [("timestamp", -1)],                          # Primary sort field
        [("action", 1), ("timestamp", -1)],           # Filter by action
        [("user_id", 1), ("timestamp", -1)],          # User activity
        [("severity", 1), ("timestamp", -1)],         # Security events
        [("resource_type", 1), ("resource_id", 1)],   # Resource tracking
        [("ip_address", 1), ("timestamp", -1)],       # IP-based analysis
        [("success", 1), ("timestamp", -1)],          # Failed operations
        # Text search index for description and details
        [
            ("description", "text"),
            ("details", "text"),
            ("error_message", "text"),
        ],
    )
    for keys in audit_index_specs:
        await db.audit_logs.create_index(keys)

    logger.info("Database indexes created successfully")

View file

@ -0,0 +1,88 @@
from typing import Optional
from bson import ObjectId
from fastapi import Depends, HTTPException, Request, status
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from motor.motor_asyncio import AsyncIOMotorDatabase
from ..models.user import User, UserRole
from .database import get_database
from .security import decode_token
security = HTTPBearer()


async def get_current_user(
    credentials: HTTPAuthorizationCredentials = Depends(security),
    db: AsyncIOMotorDatabase = Depends(get_database),
) -> User:
    """Resolve the bearer token of the request to a ``User``.

    Args:
        credentials: Bearer credentials extracted by ``HTTPBearer``.
        db: Database handle used to look the user up.

    Returns:
        The ``User`` built from the matching ``users`` document.

    Raises:
        HTTPException: 401 when the token cannot be decoded (raised by
            ``decode_token``), has no ``sub`` claim, carries a ``sub`` that is
            not a valid ObjectId, or references a user that does not exist.
    """
    # Local import: keeps this fix independent of the module's import header.
    from bson.errors import InvalidId

    token = credentials.credentials
    payload = decode_token(token)
    # NOTE(review): refresh tokens are issued with "type": "refresh" but nothing
    # here rejects them, so they appear to be usable as access tokens - confirm
    # whether that is intended.
    user_id: str = payload.get("sub")
    if user_id is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
        )

    # A validly signed token can still carry a subject that is not an ObjectId;
    # treat that as an authentication failure instead of an unhandled error.
    try:
        object_id = ObjectId(user_id)
    except (InvalidId, TypeError):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
        )

    user_doc = await db.users.find_one({"_id": object_id})
    if user_doc is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="User not found",
        )
    return User(**user_doc)
def require_role(required_role: UserRole):
    """Build a dependency that admits users holding ``required_role`` or ADMIN.

    The returned coroutine yields the current user, or raises a 403
    ``HTTPException`` when the role check fails.
    """
    async def role_checker(current_user: User = Depends(get_current_user)) -> User:
        role = current_user.role
        permitted = role == required_role or role == UserRole.ADMIN
        if permitted:
            return current_user
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions",
        )

    return role_checker
def require_roles(*required_roles: UserRole):
    """Build a dependency that admits users holding any of ``required_roles`` or ADMIN.

    The returned coroutine yields the current user, or raises a 403
    ``HTTPException`` when the role check fails.
    """
    async def roles_checker(current_user: User = Depends(get_current_user)) -> User:
        role = current_user.role
        permitted = role in required_roles or role == UserRole.ADMIN
        if permitted:
            return current_user
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions",
        )

    return roles_checker
async def get_current_user_optional(
    request: Request,
    db: AsyncIOMotorDatabase = Depends(get_database),
) -> Optional[User]:
    """Best-effort variant of user resolution that never raises.

    Returns the ``User`` for a valid ``Authorization: Bearer <token>`` header and
    ``None`` in every other case (missing header, other scheme, undecodable
    token, missing ``sub``, unknown user, or any exception along the way).
    """
    authorization: str = request.headers.get("Authorization")
    if not authorization:
        return None

    try:
        # Expect exactly two whitespace-separated parts; anything else raises
        # on unpacking and is handled below.
        scheme, token = authorization.split()
        if scheme.lower() == "bearer":
            user_id: str = decode_token(token).get("sub")
            if user_id is not None:
                user_doc = await db.users.find_one({"_id": ObjectId(user_id)})
                if user_doc is not None:
                    return User(**user_doc)
        return None
    except Exception:
        return None

View file

@ -0,0 +1,65 @@
import logging
import sys
from typing import Any
class StructuredFormatter(logging.Formatter):
    """Formatter that renders every record as one JSON object per line."""

    def format(self, record: logging.LogRecord) -> str:
        """Serialise ``record`` to a JSON string.

        The object always contains ``timestamp``, ``level``, ``logger`` and
        ``message``. Entries of an optional ``record.extra_fields`` mapping are
        merged on top, and ``exception`` holds the formatted traceback when the
        record carries ``exc_info``.

        Args:
            record: The log record to render.

        Returns:
            A JSON document. Values JSON cannot represent natively are
            converted with ``str``.
        """
        # Local import: the module header does not import json.
        import json

        log_entry = {
            "timestamp": self.formatTime(record),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }

        # Tolerate records whose extra_fields attribute is missing or None.
        extra_fields = getattr(record, "extra_fields", None)
        if extra_fields:
            log_entry.update(extra_fields)

        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)

        # json.dumps (instead of str(dict)) yields output that log pipelines can
        # parse; default=str keeps non-serialisable values from raising.
        return json.dumps(log_entry, default=str)
def setup_logging() -> None:
    """Route root logging to stdout through ``StructuredFormatter`` at INFO level.

    Any handlers already attached to the root logger are removed first, and the
    ``uvicorn.access`` and ``httpx`` loggers are raised to WARNING.
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    # Iterate over a copy because removeHandler mutates the list.
    for existing in list(root.handlers):
        root.removeHandler(existing)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(StructuredFormatter())
    root.addHandler(stdout_handler)

    # Set levels for third-party loggers
    for third_party in ("uvicorn.access", "httpx"):
        logging.getLogger(third_party).setLevel(logging.WARNING)
def get_logger(name: str) -> logging.Logger:
    """Return the standard-library logger registered under ``name``."""
    named_logger = logging.getLogger(name)
    return named_logger
class LogContext:
    """Wrapper that attaches a fixed set of context fields to every log call.

    The merged context is stored on the emitted record as ``extra_fields``.
    """

    def __init__(self, logger: logging.Logger, **context: Any):
        """Bind ``context`` key/value pairs to ``logger``."""
        self.logger = logger
        self.context = context

    def info(self, message: str, **extra: Any) -> None:
        """Log ``message`` at INFO with the bound context plus ``extra``."""
        self._log(logging.INFO, message, **extra)

    def warning(self, message: str, **extra: Any) -> None:
        """Log ``message`` at WARNING with the bound context plus ``extra``."""
        self._log(logging.WARNING, message, **extra)

    def error(self, message: str, **extra: Any) -> None:
        """Log ``message`` at ERROR with the bound context plus ``extra``."""
        self._log(logging.ERROR, message, **extra)

    def _log(self, level: int, message: str, **extra: Any) -> None:
        """Build a record carrying the merged fields and hand it to the logger.

        Per-call ``extra`` values override bound context values on key clashes.
        """
        # Logger.handle() does not apply the logger's level, so check it here.
        if not self.logger.isEnabledFor(level):
            return

        combined_extra = {**self.context, **extra}
        # makeRecord() has no ``extra_fields`` keyword; custom attributes must be
        # supplied through its ``extra`` mapping, which copies each key onto the
        # record (record.extra_fields is what the formatter reads).
        record = self.logger.makeRecord(
            self.logger.name,
            level,
            "",
            0,
            message,
            (),
            None,
            extra={"extra_fields": combined_extra},
        )
        self.logger.handle(record)

49
backend/app/core/redis.py Normal file
View file

@ -0,0 +1,49 @@
import redis.asyncio as redis
from .config import settings
from .logging import get_logger
logger = get_logger(__name__)


class RedisConnection:
    """Holder for the shared Redis connection pool and client."""

    # Both attributes stay None until connect_to_redis() assigns them.
    pool: redis.ConnectionPool = None
    client: redis.Redis = None


# Shared instance read and written by the helper functions in this module.
redis_conn = RedisConnection()
async def connect_to_redis():
    """Create the shared connection pool and client, then ping the server.

    Raises:
        Exception: re-raises whatever ``ping`` raised, after logging it.
    """
    logger.info("Connecting to Redis...")

    pool_options = {
        "encoding": "utf-8",
        "decode_responses": True,
        "max_connections": 20,
    }
    pool = redis.ConnectionPool.from_url(settings.redis_url, **pool_options)
    redis_conn.pool = pool
    redis_conn.client = redis.Redis(connection_pool=pool)

    # Issue a ping so that an unreachable server is reported here.
    try:
        await redis_conn.client.ping()
        logger.info("Successfully connected to Redis")
    except Exception as e:
        logger.error(f"Failed to connect to Redis: {e}")
        raise
async def close_redis_connection():
    """Close the shared client and disconnect the pool, skipping whichever is unset."""
    logger.info("Closing Redis connection...")

    client = redis_conn.client
    if client:
        await client.close()

    pool = redis_conn.pool
    if pool:
        await pool.disconnect()
async def get_redis() -> redis.Redis:
    """Return the shared Redis client (None until connect_to_redis() has run)."""
    client = redis_conn.client
    return client
def get_redis_client() -> redis.Redis:
    """Synchronous accessor for the shared Redis client (for middleware setup)."""
    client = redis_conn.client
    return client

View file

@ -0,0 +1,145 @@
"""Enhanced configuration system with Secret Manager integration."""
import os
import asyncio
from typing import Dict, Optional, Any
from functools import lru_cache
from pydantic_settings import BaseSettings
from .config import Settings as BaseConfig
from .logging import get_logger
logger = get_logger(__name__)


class SecretsConfig(BaseConfig):
    """Settings subclass that can overlay values fetched from GCP Secret Manager.

    Construction behaves like the base settings class. ``load_secrets`` then
    replaces selected fields with values returned by the secrets manager.
    """

    def __init__(self, **kwargs):
        # Initialize with base configuration first
        super().__init__(**kwargs)
        # Flag to track if secrets have been loaded
        self._secrets_loaded = False
        # Config field name -> value obtained from Secret Manager
        self._secret_values: Dict[str, str] = {}

    async def load_secrets(self) -> None:
        """Load secrets from Secret Manager and override the matching fields.

        Runs at most once until ``refresh_secrets`` resets the flag. Any failure
        is logged as a warning and the existing values are kept. The instance is
        still marked as loaded so the call is not retried.
        """
        if self._secrets_loaded:
            return

        try:
            # Only import here to avoid circular imports
            from app.services.secrets_manager import secrets_manager

            # Define which config fields should be loaded from secrets
            secret_mappings = {
                # Config field -> Secret Manager name
                "jwt_secret": "jwt-secret",
                "jwt_refresh_secret": "jwt-refresh-secret",
                # Must match the secret name the deploy workflow mounts
                # (MONGODB_URI=mongodb-uri:latest).
                "mongodb_uri": "mongodb-uri",
                "redis_url": "redis-url",
                "gemini_api_key": "gemini-api-key",
                "sendgrid_api_key": "sendgrid-api-key",
                "elevenlabs_api_key": "elevenlabs-api-key",
                "sentry_dsn": "sentry-dsn"
            }

            # Get all secrets in batch
            secret_names = list(secret_mappings.values())
            retrieved_secrets = await secrets_manager.get_secrets_batch(secret_names)

            # Map secrets back to config fields
            for config_field, secret_name in secret_mappings.items():
                if secret_name in retrieved_secrets:
                    self._secret_values[config_field] = retrieved_secrets[secret_name]
                    # Override the config value
                    setattr(self, config_field, retrieved_secrets[secret_name])
                    logger.debug(f"Loaded secret for {config_field}")
                else:
                    logger.warning(f"Secret {secret_name} not available, using environment/default")

            self._secrets_loaded = True
            logger.info(f"Successfully loaded {len(retrieved_secrets)} secrets from Secret Manager")
        except Exception as e:
            logger.warning(f"Failed to load secrets from Secret Manager: {e}")
            logger.warning("Falling back to environment variables")
            self._secrets_loaded = True  # Mark as loaded to prevent retries

    def get_secret_value(self, field_name: str) -> Optional[str]:
        """Return the value loaded from Secret Manager for ``field_name``, or None."""
        return self._secret_values.get(field_name)

    async def refresh_secrets(self) -> None:
        """Drop loaded values, clear the secrets manager cache and load again."""
        self._secrets_loaded = False
        self._secret_values.clear()
        # Clear the secrets manager cache
        from app.services.secrets_manager import secrets_manager
        secrets_manager.clear_cache()
        await self.load_secrets()

    @property
    def is_production(self) -> bool:
        """True when ``app_env`` equals "prod"."""
        return self.app_env == "prod"

    @property
    def is_development(self) -> bool:
        """True when ``app_env`` equals "dev"."""
        return self.app_env == "dev"

    @property
    def google_cloud_project(self) -> str:
        """Alias for ``gcp_project_id``."""
        return self.gcp_project_id

    @property
    def jwt_refresh_secret(self) -> str:
        """JWT refresh secret; falls back to ``jwt_secret`` when none was set."""
        return getattr(self, '_jwt_refresh_secret', self.jwt_secret)

    @jwt_refresh_secret.setter
    def jwt_refresh_secret(self, value: str) -> None:
        """Store an explicit JWT refresh secret."""
        self._jwt_refresh_secret = value
# Global configuration instance
_config_instance: Optional[SecretsConfig] = None


async def initialize_config() -> SecretsConfig:
    """Return the global ``SecretsConfig``, creating it and loading secrets on first use.

    Later calls return the existing instance without reloading secrets.
    """
    global _config_instance

    if _config_instance is not None:
        return _config_instance

    _config_instance = SecretsConfig()
    await _config_instance.load_secrets()
    return _config_instance
def get_settings() -> SecretsConfig:
    """Return the global settings instance without awaiting secret loading.

    When no instance exists yet, one is created without secrets and a warning
    is logged.
    """
    global _config_instance

    if _config_instance is not None:
        return _config_instance

    # Initialize without secrets for backwards compatibility
    _config_instance = SecretsConfig()
    logger.warning("Settings accessed before async initialization - secrets not loaded")
    return _config_instance
@lru_cache()
def get_settings_cached() -> SecretsConfig:
    """Memoised wrapper around ``get_settings``: the first result is reused."""
    instance = get_settings()
    return instance


# Backwards compatibility: module-level ``settings`` resolved at import time.
settings = get_settings()

View file

@ -0,0 +1,55 @@
from datetime import datetime, timedelta
from typing import Any, Optional, Union
from fastapi import HTTPException, status
from jose import JWTError, jwt
from passlib.context import CryptContext
from .config import settings
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")


def create_access_token(
    subject: Union[str, Any], expires_delta: Optional[timedelta] = None
) -> str:
    """Encode an access token whose ``sub`` claim is ``str(subject)``.

    Args:
        subject: Value stored, stringified, in the ``sub`` claim.
        expires_delta: Lifetime of the token; when omitted (or falsy) the
            ``jwt_access_ttl_min`` setting is used, in minutes.

    Returns:
        The token signed with ``settings.jwt_secret`` using ``settings.jwt_alg``.
    """
    lifetime = (
        expires_delta
        if expires_delta
        else timedelta(minutes=settings.jwt_access_ttl_min)
    )
    claims = {"exp": datetime.utcnow() + lifetime, "sub": str(subject)}
    return jwt.encode(claims, settings.jwt_secret, algorithm=settings.jwt_alg)
def create_refresh_token(
    subject: Union[str, Any], expires_delta: Optional[timedelta] = None
) -> str:
    """Encode a refresh token: like an access token, plus ``"type": "refresh"``.

    Args:
        subject: Value stored, stringified, in the ``sub`` claim.
        expires_delta: Lifetime of the token; when omitted (or falsy) the
            ``jwt_refresh_ttl_days`` setting is used, in days.

    Returns:
        The token signed with ``settings.jwt_secret`` using ``settings.jwt_alg``.
    """
    lifetime = (
        expires_delta
        if expires_delta
        else timedelta(days=settings.jwt_refresh_ttl_days)
    )
    claims = {
        "exp": datetime.utcnow() + lifetime,
        "sub": str(subject),
        "type": "refresh",
    }
    return jwt.encode(claims, settings.jwt_secret, algorithm=settings.jwt_alg)
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Check ``plain_password`` against ``hashed_password`` using ``pwd_context``."""
    matches = pwd_context.verify(plain_password, hashed_password)
    return matches
def get_password_hash(password: str) -> str:
    """Hash ``password`` with ``pwd_context`` and return the resulting string."""
    hashed = pwd_context.hash(password)
    return hashed
def decode_token(token: str) -> dict[str, Any]:
    """Decode ``token`` with the configured secret and algorithm.

    Returns:
        The decoded claims.

    Raises:
        HTTPException: 401 when decoding raises ``JWTError``.
    """
    accepted_algorithms = [settings.jwt_alg]
    try:
        return jwt.decode(token, settings.jwt_secret, algorithms=accepted_algorithms)
    except JWTError:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
        )

Binary file not shown.

222
backend/app/lib/vtt.py Normal file
View file

@ -0,0 +1,222 @@
import re
from dataclasses import dataclass
@dataclass
class VTTCue:
start_time: float # seconds
end_time: float # seconds
text: str
identifier: str | None = None
class VTTParser:
"""Parser and builder for WebVTT files"""
@staticmethod
def parse(vtt_content: str) -> list[VTTCue]:
"""Parse VTT content into a list of cues"""
lines = vtt_content.strip().split('\n')
cues = []
i = 0
while i < len(lines):
line = lines[i].strip()
# Skip WEBVTT header, empty lines, and NOTE lines
if line == "WEBVTT" or line == "" or line.startswith("NOTE"):
i += 1
continue
# Check if this line is a cue identifier (optional)
identifier = None
if " --> " not in line and i + 1 < len(lines) and " --> " in lines[i + 1]:
identifier = line
i += 1
line = lines[i].strip()
# Parse timing line
if " --> " in line:
timing_match = re.match(r'([\d:.,]+)\s+-->\s+([\d:.,]+)', line)
if timing_match:
start_time = VTTParser._parse_timestamp(timing_match.group(1))
end_time = VTTParser._parse_timestamp(timing_match.group(2))
# Collect text lines until empty line or next cue
i += 1
text_lines = []
while i < len(lines) and lines[i].strip() != "":
text_lines.append(lines[i].strip())
i += 1
if text_lines:
cues.append(VTTCue(
start_time=start_time,
end_time=end_time,
text="\n".join(text_lines),
identifier=identifier
))
else:
i += 1
return cues
@staticmethod
def build(cues: list[VTTCue]) -> str:
"""Build VTT content from a list of cues"""
lines = ["WEBVTT", ""]
for cue in cues:
# Add identifier if present
if cue.identifier:
lines.append(cue.identifier)
# Add timing line
start_timestamp = VTTParser._format_timestamp(cue.start_time)
end_timestamp = VTTParser._format_timestamp(cue.end_time)
lines.append(f"{start_timestamp} --> {end_timestamp}")
# Add text (can be multi-line)
lines.append(cue.text)
lines.append("") # Empty line between cues
return "\n".join(lines)
@staticmethod
def _parse_timestamp(timestamp: str) -> float:
"""Convert VTT timestamp (HH:MM:SS.mmm or MM:SS.mmm) to seconds"""
# Clean up timestamp (handle both . and , as decimal separator)
timestamp = timestamp.replace(',', '.')
# Split by colon
parts = timestamp.split(':')
if len(parts) == 3: # HH:MM:SS.mmm
hours, minutes, seconds = parts
elif len(parts) == 2: # MM:SS.mmm
hours, minutes, seconds = "0", parts[0], parts[1]
else:
raise ValueError(f"Invalid timestamp format: {timestamp}")
# Parse seconds and decimal part
sec_parts = seconds.split('.')
whole_seconds = int(sec_parts[0])
decimal_part = int(sec_parts[1]) if len(sec_parts) > 1 else 0
# Convert to total seconds
total_seconds = (
int(hours) * 3600 +
int(minutes) * 60 +
whole_seconds +
decimal_part / 1000.0
)
return total_seconds
@staticmethod
def _format_timestamp(seconds: float) -> str:
"""Convert seconds to VTT timestamp format (HH:MM:SS.mmm)"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = seconds % 60
whole_secs = int(secs)
milliseconds = int((secs - whole_secs) * 1000)
return f"{hours:02d}:{minutes:02d}:{whole_secs:02d}.{milliseconds:03d}"
class VTTEditor:
    """Helpers that edit or inspect VTT text by round-tripping through VTTParser."""

    @staticmethod
    def translate_preserving_timing(
        vtt_content: str,
        translated_texts: list[str]
    ) -> str:
        """Swap in one replacement text per cue, keeping every cue's timing.

        Raises:
            ValueError: If the number of texts differs from the number of cues.
        """
        cues = VTTParser.parse(vtt_content)

        if len(translated_texts) != len(cues):
            raise ValueError(
                f"Text count mismatch: {len(translated_texts)} texts for {len(cues)} cues"
            )

        # Lengths are equal at this point, so zip pairs every cue with its text.
        for cue, replacement in zip(cues, translated_texts):
            cue.text = replacement

        return VTTParser.build(cues)

    @staticmethod
    def update_cue_text(vtt_content: str, cue_index: int, new_text: str) -> str:
        """Replace the text of the cue at zero-based ``cue_index``.

        Raises:
            ValueError: If ``cue_index`` is outside the parsed cue list.
        """
        cues = VTTParser.parse(vtt_content)

        if not (0 <= cue_index < len(cues)):
            raise ValueError(f"Invalid cue index: {cue_index}")

        target = cues[cue_index]
        target.text = new_text
        return VTTParser.build(cues)

    @staticmethod
    def validate_vtt(vtt_content: str) -> tuple[bool, list[str]]:
        """Collect validation problems; returns ``(is_valid, messages)``.

        Checks the ``WEBVTT`` prefix and, per cue, that start precedes end, that
        the cue does not start before the previous cue ends, and that its text
        is not blank. A parsing exception is reported as a message.
        """
        errors = []

        if not vtt_content.strip().startswith("WEBVTT"):
            errors.append("VTT must start with 'WEBVTT'")

        try:
            cues = VTTParser.parse(vtt_content)

            # Check timing consistency
            for i, cue in enumerate(cues):
                if cue.start_time >= cue.end_time:
                    errors.append(f"Cue {i + 1}: Start time must be before end time")

                previous = cues[i - 1] if i > 0 else None
                if previous is not None and cue.start_time < previous.end_time:
                    errors.append(f"Cue {i + 1}: Overlapping with previous cue")

                if not cue.text.strip():
                    errors.append(f"Cue {i + 1}: Empty text content")
        except Exception as e:
            errors.append(f"Parse error: {str(e)}")

        is_valid = len(errors) == 0
        return is_valid, errors

    @staticmethod
    def get_cue_count(vtt_content: str) -> int:
        """Number of cues found in the content; 0 when parsing raises."""
        try:
            return len(VTTParser.parse(vtt_content))
        except Exception:
            return 0

    @staticmethod
    def get_total_duration(vtt_content: str) -> float:
        """Largest cue end time in seconds; 0.0 for no cues or when parsing raises."""
        try:
            cues = VTTParser.parse(vtt_content)
            if cues:
                return max(cue.end_time for cue in cues)
            return 0.0
        except Exception:
            return 0.0

    @staticmethod
    def adjust_timing_offset(vtt_content: str, offset_seconds: float) -> str:
        """
        Shift every cue by ``offset_seconds`` (positive = later, negative = earlier).

        Start times are clamped at 0.0, and each end time is kept at least
        0.5 seconds after its cue's shifted start.
        """
        cues = VTTParser.parse(vtt_content)

        for cue in cues:
            shifted_start = max(0.0, cue.start_time + offset_seconds)
            shifted_end = max(shifted_start + 0.5, cue.end_time + offset_seconds)
            cue.start_time = shifted_start
            cue.end_time = shifted_end

        return VTTParser.build(cues)

216
backend/app/main.py Normal file
View file

@ -0,0 +1,216 @@
from contextlib import asynccontextmanager
import sentry_sdk
from fastapi import FastAPI, Request, HTTPException
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.redis import RedisIntegration
from sentry_sdk.integrations.pymongo import PyMongoIntegration
from sentry_sdk.integrations.celery import CeleryIntegration
from .api.v1.routes_admin import router as admin_router
from .api.v1.routes_auth import router as auth_router
from .api.v1.routes_files import router as files_router
from .api.v1.routes_jobs import router as jobs_router
from .core.config import settings
from .core.secrets_config import initialize_config
from .core.database import close_mongo_connection, connect_to_mongo, create_indexes
from .core.logging import setup_logging
from .core.redis import close_redis_connection, connect_to_redis, get_redis_client
from .middleware import create_rate_limit_middleware, create_validation_middleware
from .telemetry import (
app_metrics,
instrument_dependencies,
instrument_fastapi_app,
setup_tracing
)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: start-up before ``yield``, shutdown after it.

    Start-up configures logging, optionally loads secrets and Sentry, starts the
    Prometheus server in production, connects to MongoDB and Redis and stores
    the rate-limit and validation middleware on ``app.state``. Shutdown closes
    both connections, even when start-up fails part-way or an exception is
    raised into the context at the yield point.
    """
    # Startup
    setup_logging()

    # Initialize configuration with secrets
    # NOTE(review): initialize_config() fills the instance kept in
    # core.secrets_config, while this module reads ``settings`` imported from
    # core.config - confirm the loaded secrets are meant to reach ``settings``.
    if settings.app_env == "prod":
        try:
            await initialize_config()
            print("✅ Configuration initialized with Secret Manager")
        except Exception as e:
            print(f"⚠️  Failed to load secrets from Secret Manager: {e}")
            print("⚠️  Falling back to environment variables")

    # Initialize Sentry error tracking
    if settings.sentry_dsn and settings.sentry_dsn.startswith(('http', 'https')):
        sentry_sdk.init(
            dsn=settings.sentry_dsn,
            integrations=[
                FastApiIntegration(),
                RedisIntegration(),
                PyMongoIntegration(),
                CeleryIntegration(monitor_beat_tasks=True),
            ],
            traces_sample_rate=0.1 if settings.app_env == "prod" else 1.0,
            environment=settings.app_env,
            release="1.0.0",
            attach_stacktrace=True,
            send_default_pii=False,  # Don't send PII for privacy
        )

    # Initialize telemetry (disabled for local development)
    # setup_tracing("accessible-video-api", "1.0.0")
    # instrument_dependencies()

    # Start Prometheus metrics server in production
    if settings.app_env == "prod":
        app_metrics.start_prometheus_server(port=8001)

    try:
        await connect_to_mongo()
        await connect_to_redis()
        # await create_indexes()  # Temporarily disabled for debugging

        # Initialize middleware with Redis client
        redis_client = get_redis_client()
        if redis_client:
            rate_limit_middleware = await create_rate_limit_middleware(redis_client)
            validation_middleware = await create_validation_middleware()

            # Store middleware in app state for access
            app.state.rate_limit_middleware = rate_limit_middleware
            app.state.validation_middleware = validation_middleware

        yield
    finally:
        # Shutdown. Both helpers skip clients that were never created, so this
        # is safe even when a connect call above failed.
        await close_mongo_connection()
        await close_redis_connection()
# Application object; start-up and shutdown are driven by ``lifespan``.
app = FastAPI(
    lifespan=lifespan,
    title="Accessible Video API",
    version="1.0.0",
    description="API for accessible video processing platform",
)

# CORS middleware: origins come from settings, credentials are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=settings.cors_origins,
    allow_headers=["*"],
    allow_methods=["GET", "POST", "PUT", "PATCH", "DELETE"],
)
# Custom CORS error handler middleware to ensure CORS headers are added to error
# responses. It is registered after CORSMiddleware; middleware registered later
# wraps middleware registered earlier, so this handler runs outside it.
@app.middleware("http")
async def cors_error_handler(request, call_next):
    """Ensure CORS headers are present on responses, including errors.

    An exception escaping ``call_next`` is logged with its traceback and turned
    into a generic 500 JSON response. CORS headers are then added when the
    request's ``Origin`` is one of ``settings.cors_origins``.
    """
    try:
        response = await call_next(request)
    except Exception:
        # Local import: this module does not import logging at the top.
        import logging

        # Record the failure before hiding it behind a generic 500; otherwise
        # the error leaves no trace at all.
        logging.getLogger(__name__).exception(
            "Unhandled exception while processing %s %s",
            request.method,
            request.url.path,
        )
        response = JSONResponse(
            status_code=500,
            content={"detail": "Internal server error"}
        )

    # Always add CORS headers for allowed origins
    origin = request.headers.get("origin")
    if origin and origin in settings.cors_origins:
        response.headers["access-control-allow-origin"] = origin
        response.headers["access-control-allow-credentials"] = "true"

        # Add other necessary CORS headers for error responses
        if response.status_code >= 400:
            response.headers["access-control-allow-methods"] = "GET, POST, PUT, PATCH, DELETE"
            response.headers["access-control-allow-headers"] = "*"

    return response
# Global exception handler to ensure CORS headers on all errors
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Render an ``HTTPException`` as JSON, adding CORS headers for allowed origins.

    Headers attached to the exception (for example ``WWW-Authenticate``) are
    forwarded on the response.
    """
    response = JSONResponse(
        status_code=exc.status_code,
        content={"detail": exc.detail},
        # Keep headers supplied by whoever raised the exception.
        headers=getattr(exc, "headers", None),
    )

    # Add CORS headers
    origin = request.headers.get("origin")
    if origin and origin in settings.cors_origins:
        response.headers["access-control-allow-origin"] = origin
        response.headers["access-control-allow-credentials"] = "true"
        response.headers["access-control-allow-methods"] = "GET, POST, PUT, PATCH, DELETE"
        response.headers["access-control-allow-headers"] = "*"

    return response
# Global exception handler for validation errors
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """Render a request validation error as a 422 JSON response with CORS headers.

    The error list and the offending body are passed through
    ``jsonable_encoder`` so that values the JSON encoder cannot serialise
    directly do not turn the 422 into a 500.
    """
    # Local import: keeps this fix independent of the module's import header.
    from fastapi.encoders import jsonable_encoder

    # NOTE(review): the request body is echoed back to the client; confirm this
    # is acceptable for endpoints that receive credentials.
    response = JSONResponse(
        status_code=422,
        content=jsonable_encoder({"detail": exc.errors(), "body": exc.body})
    )

    # Add CORS headers
    origin = request.headers.get("origin")
    if origin and origin in settings.cors_origins:
        response.headers["access-control-allow-origin"] = origin
        response.headers["access-control-allow-credentials"] = "true"
        response.headers["access-control-allow-methods"] = "GET, POST, PUT, PATCH, DELETE"
        response.headers["access-control-allow-headers"] = "*"

    return response
# Add custom middleware (order matters - applied in reverse order)
@app.middleware("http")
async def rate_limiting_middleware(request, call_next):
    """Delegate to the rate limiter stored on ``app.state``, when there is one.

    The login and refresh endpoints bypass the limiter (debugging exemption).
    """
    exempt = request.url.path in ["/api/v1/auth/login", "/api/v1/auth/refresh"]

    if not exempt and hasattr(app.state, 'rate_limit_middleware'):
        return await app.state.rate_limit_middleware(request, call_next)

    return await call_next(request)
@app.middleware("http")
async def validation_middleware(request, call_next):
    """Delegate to the request validator stored on ``app.state``, when there is one.

    The login and refresh endpoints bypass validation (debugging exemption).
    """
    exempt = request.url.path in ["/api/v1/auth/login", "/api/v1/auth/refresh"]

    if not exempt and hasattr(app.state, 'validation_middleware'):
        return await app.state.validation_middleware(request, call_next)

    return await call_next(request)
# Instrument FastAPI app for tracing (disabled for local development)
# instrument_fastapi_app(app)

# Include routers: every v1 router is mounted under the same prefix, in order.
for _router in (auth_router, files_router, jobs_router, admin_router):
    app.include_router(_router, prefix="/api/v1")
@app.get("/health")
async def health_check():
    """Liveness endpoint returning a fixed status and version payload."""
    payload = {"status": "healthy", "version": "1.0.0"}
    return payload
@app.get("/metrics")
async def metrics():
    """Prometheus metrics endpoint: returns ``generate_latest()`` output."""
    from fastapi import Response
    from prometheus_client import CONTENT_TYPE_LATEST, generate_latest

    exposition = generate_latest()
    return Response(content=exposition, media_type=CONTENT_TYPE_LATEST)

View file

@ -0,0 +1,12 @@
"""Middleware package for FastAPI application."""
from .rate_limiting import RateLimitMiddleware, IPWhitelist, create_rate_limit_middleware
from .validation import ValidationMiddleware, create_validation_middleware
__all__ = [
"RateLimitMiddleware",
"IPWhitelist",
"create_rate_limit_middleware",
"ValidationMiddleware",
"create_validation_middleware"
]

View file

@ -0,0 +1,264 @@
"""Rate limiting middleware for API endpoints."""
import time
from collections import defaultdict
from typing import Dict, Optional, Tuple
import redis.asyncio as aioredis
from fastapi import HTTPException, Request, status
from fastapi.responses import JSONResponse
import json
import asyncio
from datetime import datetime, timedelta
from app.core.config import get_settings
from app.telemetry.metrics import track_rate_limit_metrics
class RateLimiter:
    """Redis-based rate limiter using a sorted set as a sliding window."""

    def __init__(self, redis_client: aioredis.Redis):
        self.redis = redis_client

    async def is_allowed(
        self,
        key: str,
        limit: int,
        window_seconds: int,
        identifier: str = ""
    ) -> Tuple[bool, Dict[str, int]]:
        """
        Check if request is allowed under rate limit.

        Args:
            key: Redis key of the sorted set tracking this client/endpoint.
            limit: Maximum number of requests allowed inside the window.
            window_seconds: Length of the sliding window in seconds.
            identifier: Client label forwarded to the metrics hook.

        Returns:
            Tuple of (is_allowed, rate_limit_info). ``rate_limit_info`` holds
            ``limit``, ``remaining`` (requests left after this one),
            ``reset_time`` and ``retry_after`` (0 when allowed).

        Note:
            The current request is recorded before the decision is made, so
            rejected requests also occupy the window.
        """
        now = time.time()
        pipeline = self.redis.pipeline()

        # Remove expired entries
        pipeline.zremrangebyscore(key, 0, now - window_seconds)
        # Count current requests in window
        pipeline.zcard(key)
        # Add current request
        pipeline.zadd(key, {str(now): now})
        # Set expiry
        pipeline.expire(key, window_seconds)

        results = await pipeline.execute()
        # The count is taken before the current request is added, so it is the
        # number of earlier requests still inside the window.
        current_requests = results[1]

        # Allowed only while fewer than ``limit`` earlier requests exist; "<="
        # would admit limit + 1 requests per window.
        is_allowed = current_requests < limit
        used = current_requests + 1 if is_allowed else current_requests

        rate_limit_info = {
            "limit": limit,
            "remaining": max(0, limit - used),
            "reset_time": int(now + window_seconds),
            "retry_after": 0 if is_allowed else window_seconds
        }

        # Track metrics
        track_rate_limit_metrics(
            identifier=identifier,
            is_allowed=is_allowed,
            current_requests=current_requests,
            limit=limit
        )

        return is_allowed, rate_limit_info
class RateLimitMiddleware:
    """Callable applying per-client, per-endpoint rate limits to requests.

    Limits are looked up by ``"METHOD:path"`` key, then by trailing-``*``
    prefix patterns, then by the authenticated/anonymous defaults.
    """

    def __init__(self, redis_client: aioredis.Redis):
        self.limiter = RateLimiter(redis_client)
        self.settings = get_settings()

        # Rate limit configurations by endpoint pattern: (limit, window seconds)
        self.rate_limits = {
            # Authentication endpoints
            "POST:/api/v1/auth/login": (5, 300),  # 5 requests per 5 minutes
            "POST:/api/v1/auth/register": (3, 3600),  # 3 requests per hour
            "POST:/api/v1/auth/refresh": (10, 300),  # 10 requests per 5 minutes
            "POST:/api/v1/auth/forgot-password": (3, 3600),  # 3 requests per hour
            # File upload endpoints
            "POST:/api/v1/files/upload": (10, 3600),  # 10 uploads per hour
            "POST:/api/v1/jobs": (20, 3600),  # 20 job creations per hour
            # Job management endpoints
            "GET:/api/v1/jobs": (100, 300),  # 100 requests per 5 minutes
            "PATCH:/api/v1/jobs/*/approve": (50, 3600),  # 50 approvals per hour
            "PATCH:/api/v1/jobs/*/reject": (50, 3600),  # 50 rejections per hour
            # VTT editing endpoints
            "PATCH:/api/v1/jobs/*/vtt": (100, 3600),  # 100 VTT edits per hour
            # Admin endpoints (more restrictive)
            "GET:/api/v1/admin/*": (50, 300),  # 50 requests per 5 minutes
            "POST:/api/v1/admin/*": (20, 3600),  # 20 admin actions per hour
            "PATCH:/api/v1/admin/*": (20, 3600),  # 20 admin updates per hour
            "DELETE:/api/v1/admin/*": (10, 3600),  # 10 admin deletions per hour
        }

        # Default rate limits
        self.default_limits = {
            "authenticated": (1000, 3600),  # 1000 requests per hour for authenticated users
            "anonymous": (100, 3600),  # 100 requests per hour for anonymous users
        }

    def _get_client_identifier(self, request: Request) -> str:
        """Return ``user:<id>`` when ``request.state.user`` is set, else ``ip:<addr>``."""
        user = getattr(request.state, 'user', None)
        if user:
            return f"user:{user.id}"

        # Fall back to IP address
        # NOTE(review): the first X-Forwarded-For entry is taken as-is; confirm
        # the deployment strips client-supplied values of this header.
        forwarded_for = request.headers.get("X-Forwarded-For")
        if forwarded_for:
            return f"ip:{forwarded_for.split(',')[0].strip()}"

        client_ip = request.client.host if request.client else "unknown"
        return f"ip:{client_ip}"

    def _get_endpoint_key(self, request: Request) -> str:
        """Return ``"METHOD:path"`` with job and admin-user ids replaced by ``*``."""
        method = request.method
        path = request.url.path

        # Replace job IDs with wildcard for pattern matching
        import re
        path = re.sub(r'/jobs/[a-f0-9-]+/', '/jobs/*/', path)
        path = re.sub(r'/admin/users/[a-f0-9-]+', '/admin/users/*', path)

        return f"{method}:{path}"

    def _get_rate_limit(self, request: Request) -> Tuple[int, int]:
        """Return the ``(limit, window_seconds)`` pair that applies to ``request``."""
        endpoint_key = self._get_endpoint_key(request)

        # Check for specific endpoint limits
        if endpoint_key in self.rate_limits:
            return self.rate_limits[endpoint_key]

        # Check for wildcard matches
        for pattern, limits in self.rate_limits.items():
            if pattern.endswith("*") and endpoint_key.startswith(pattern[:-1]):
                return limits

        # Use default limits based on authentication
        user = getattr(request.state, 'user', None)
        if user:
            return self.default_limits["authenticated"]
        else:
            return self.default_limits["anonymous"]

    async def __call__(self, request: Request, call_next):
        """Apply the rate limit, then dispatch the request exactly once.

        A failure of the limiter itself is printed and the request is let
        through. Exceptions raised by the downstream handler propagate
        unchanged and never cause a second dispatch.
        """
        # Skip rate limiting for health checks and login (temporary for debugging)
        if request.url.path in ["/health", "/metrics", "/api/v1/auth/login"]:
            return await call_next(request)

        client_id = self._get_client_identifier(request)
        endpoint_key = self._get_endpoint_key(request)
        limit, window = self._get_rate_limit(request)

        # Create rate limit key
        rate_limit_key = f"rate_limit:{client_id}:{endpoint_key}"

        # Only the limiter call is guarded: wrapping call_next as well would
        # report handler errors as rate-limiting errors and dispatch the
        # request a second time.
        try:
            is_allowed, rate_info = await self.limiter.is_allowed(
                key=rate_limit_key,
                limit=limit,
                window_seconds=window,
                identifier=client_id
            )
        except Exception as e:
            # Log error but don't block request if rate limiting fails
            print(f"Rate limiting error: {e}")
            return await call_next(request)

        if not is_allowed:
            # Return rate limit exceeded response
            return JSONResponse(
                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
                content={
                    "detail": "Rate limit exceeded",
                    "error_code": "RATE_LIMIT_EXCEEDED",
                    "rate_limit": rate_info
                },
                headers={
                    "X-RateLimit-Limit": str(rate_info["limit"]),
                    "X-RateLimit-Remaining": str(rate_info["remaining"]),
                    "X-RateLimit-Reset": str(rate_info["reset_time"]),
                    "Retry-After": str(rate_info["retry_after"])
                }
            )

        # Process the request
        response = await call_next(request)

        # Add rate limit headers to response
        response.headers["X-RateLimit-Limit"] = str(rate_info["limit"])
        response.headers["X-RateLimit-Remaining"] = str(rate_info["remaining"])
        response.headers["X-RateLimit-Reset"] = str(rate_info["reset_time"])

        return response
class IPWhitelist:
    """IP whitelist for bypassing rate limits.

    Permanent entries live in the Redis set ``ip_whitelist``. Temporary entries
    are individual keys ``ip_whitelist:temp:<ip>`` that expire on their own.
    """

    def __init__(self, redis_client: aioredis.Redis):
        self.redis = redis_client
        self.whitelist_key = "ip_whitelist"

        # Default whitelisted IPs (health checks, monitoring)
        self.default_whitelist = {
            "127.0.0.1",
            "::1",
            "169.254.169.254",  # GCP metadata server
        }

    def _temp_key(self, ip: str) -> str:
        """Return the Redis key holding the temporary whitelist entry for ``ip``."""
        return f"{self.whitelist_key}:temp:{ip}"

    async def is_whitelisted(self, ip: str) -> bool:
        """Check if IP is whitelisted (built-in, permanent or unexpired temporary).

        Returns False when Redis cannot be queried.
        """
        if ip in self.default_whitelist:
            return True

        try:
            if await self.redis.sismember(self.whitelist_key, ip):
                return True
            # Temporary entries are separate keys, so they must be checked too.
            return bool(await self.redis.exists(self._temp_key(ip)))
        except Exception:
            return False

    async def add_ip(self, ip: str, ttl_seconds: Optional[int] = None) -> bool:
        """Add IP to whitelist.

        Args:
            ip: Address to whitelist.
            ttl_seconds: When given, the entry expires after this many seconds;
                otherwise the entry is permanent.

        Returns:
            True on success, False when Redis raised.
        """
        try:
            if ttl_seconds:
                # Temporary entry only: adding it to the permanent set as well
                # would make the TTL meaningless.
                await self.redis.setex(self._temp_key(ip), ttl_seconds, "1")
            else:
                await self.redis.sadd(self.whitelist_key, ip)
            return True
        except Exception:
            return False

    async def remove_ip(self, ip: str) -> bool:
        """Remove IP from whitelist, both permanent and temporary entries.

        Returns:
            True on success, False when Redis raised.
        """
        try:
            await self.redis.srem(self.whitelist_key, ip)
            await self.redis.delete(self._temp_key(ip))
            return True
        except Exception:
            return False
async def create_rate_limit_middleware(redis_client: aioredis.Redis) -> RateLimitMiddleware:
    """Build a ``RateLimitMiddleware`` bound to the given Redis client."""
    middleware = RateLimitMiddleware(redis_client)
    return middleware

View file

@ -0,0 +1,324 @@
"""Enhanced request validation middleware."""
import json
import re
import time
from typing import Any, Dict, List, Optional, Set
from fastapi import HTTPException, Request, status
from fastapi.responses import JSONResponse
from pydantic import BaseModel, ValidationError as PydanticValidationError
import magic
from urllib.parse import unquote
from app.telemetry.metrics import track_validation_metrics
class ValidationError(Exception):
    """Signals that a request failed a format, type, or size check."""
class SecurityValidationError(Exception):
    """Signals that a request tripped one of the security checks."""
class RequestValidator:
    """Enhanced request validation with security checks.

    Screens strings against a list of blocked regex patterns and provides
    checks for filenames, uploaded file type and size, JSON bodies, query
    parameters, and selected headers.

    NOTE(review): several blocked patterns are very broad. The bare ``script``
    alternative matches inside words such as "description", and the
    ``[;&|`$]`` class rejects any text containing ``&`` or ``$``. Confirm this
    strictness is intended before relying on it for free-text fields.
    """

    # Upper bound for a sanitized filename, and the historical cap on its stem.
    _MAX_FILENAME_LENGTH = 255
    _MAX_STEM_LENGTH = 250

    def __init__(self):
        # File type restrictions (compared against the detected MIME type)
        self.allowed_video_types = {
            "video/mp4",
            "video/quicktime",
            "video/x-msvideo"  # AVI
        }
        self.allowed_subtitle_types = {
            "text/vtt",
            "text/plain"
        }
        # Security patterns to block
        self.malicious_patterns = [
            # SQL injection patterns
            r"(union|select|insert|update|delete|drop|create|alter)\s+",
            # Script injection patterns
            r"(script|javascript|vbscript|onload|onerror|onclick)",
            r"<\s*script[^>]*>",
            r"javascript:",
            r"data:.*base64",
            # Path traversal
            r"\.\./",
            r"\.\.\\",
            r"%2e%2e%2f",
            r"%2e%2e\\",
            # Command injection
            r"[;&|`$]",
            r"(rm|wget|curl|nc|bash|sh|cmd|powershell)\s+",
            # MongoDB injection
            r"\$where|\$ne|\$gt|\$lt|\$regex",
        ]
        self.compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.malicious_patterns]
        # Max file sizes (in bytes)
        self.max_video_size = 2 * 1024 * 1024 * 1024  # 2GB
        self.max_subtitle_size = 10 * 1024 * 1024  # 10MB
        # Request size limits
        self.max_json_size = 1024 * 1024  # 1MB
        self.max_form_fields = 50

    def validate_string_content(self, content: str, field_name: str = "input") -> None:
        """Validate string content for malicious patterns.

        Non-string values are ignored.

        Raises:
            SecurityValidationError: if any blocked pattern matches ``content``.
        """
        if not isinstance(content, str):
            return
        for pattern in self.compiled_patterns:
            if pattern.search(content):
                raise SecurityValidationError(
                    f"Potentially malicious content detected in {field_name}"
                )

    def validate_filename(self, filename: str) -> str:
        """Validate and sanitize filename.

        Returns:
            The URL-decoded name with every character outside
            ``[A-Za-z0-9_.-]`` (per ``\\w``) replaced by ``_``, a leading dot
            replaced by ``file_``, and the result limited to 255 characters.

        Raises:
            ValidationError: if ``filename`` is empty.
            SecurityValidationError: if the decoded name matches a blocked pattern.
        """
        if not filename:
            raise ValidationError("Filename cannot be empty")
        # Decode URL encoding
        filename = unquote(filename)
        # Check for malicious patterns
        self.validate_string_content(filename, "filename")
        # Remove dangerous characters
        safe_filename = re.sub(r'[^\w\-_\.]', '_', filename)
        # Prevent hidden files
        if safe_filename.startswith('.'):
            safe_filename = 'file_' + safe_filename[1:]
        # Limit length
        if len(safe_filename) > self._MAX_FILENAME_LENGTH:
            name, ext = safe_filename.rsplit('.', 1) if '.' in safe_filename else (safe_filename, '')
            suffix = '.' + ext if ext else ''
            # Keep the usual 250-char stem, but shrink it further when the
            # extension is long so the total never exceeds the limit.
            keep = max(0, min(self._MAX_STEM_LENGTH, self._MAX_FILENAME_LENGTH - len(suffix)))
            safe_filename = (name[:keep] + suffix)[:self._MAX_FILENAME_LENGTH]
        return safe_filename

    def validate_file_type(self, content: bytes, expected_type: str, filename: str) -> None:
        """Validate file type using magic numbers.

        If MIME detection itself fails, falls back to checking the filename
        extension instead.

        NOTE(review): the extension fallback accepts ``mkv`` although no
        Matroska MIME type is in ``allowed_video_types`` — confirm which of
        the two lists is authoritative.

        Raises:
            ValidationError: if the detected type (or extension) is not allowed.
        """
        try:
            detected_type = magic.from_buffer(content, mime=True)
        except Exception:
            # Fallback to extension-based validation
            ext = filename.lower().split('.')[-1] if '.' in filename else ''
            video_extensions = {'mp4', 'mov', 'avi', 'mkv'}
            subtitle_extensions = {'vtt', 'srt', 'txt'}
            if expected_type == "video" and ext not in video_extensions:
                raise ValidationError(f"Invalid video file extension: {ext}")
            elif expected_type == "subtitle" and ext not in subtitle_extensions:
                raise ValidationError(f"Invalid subtitle file extension: {ext}")
            return
        if expected_type == "video" and detected_type not in self.allowed_video_types:
            # sorted() keeps the message stable; set iteration order is not.
            raise ValidationError(
                f"Invalid video file type: {detected_type}. "
                f"Allowed types: {', '.join(sorted(self.allowed_video_types))}"
            )
        elif expected_type == "subtitle" and detected_type not in self.allowed_subtitle_types:
            raise ValidationError(
                f"Invalid subtitle file type: {detected_type}. "
                f"Allowed types: {', '.join(sorted(self.allowed_subtitle_types))}"
            )

    def validate_file_size(self, size: int, file_type: str) -> None:
        """Validate file size limits.

        Raises:
            ValidationError: if ``size`` exceeds the limit for ``file_type``
                ("video" or "subtitle"). Other file types are not checked.
        """
        if file_type == "video" and size > self.max_video_size:
            raise ValidationError(
                f"Video file too large: {size} bytes. "
                f"Maximum allowed: {self.max_video_size} bytes"
            )
        elif file_type == "subtitle" and size > self.max_subtitle_size:
            raise ValidationError(
                f"Subtitle file too large: {size} bytes. "
                f"Maximum allowed: {self.max_subtitle_size} bytes"
            )

    async def validate_json_payload(self, request: "Request") -> Optional[Dict[str, Any]]:
        """Validate JSON request payload.

        Returns:
            ``None`` when the request is not JSON, ``{}`` for an empty body,
            otherwise the parsed payload.

        Raises:
            ValidationError: for an invalid or oversized Content-Length, an
                oversized body, malformed JSON, or excessive nesting.
            SecurityValidationError: if any string in the payload matches a
                blocked pattern.
        """
        if not request.headers.get("content-type", "").startswith("application/json"):
            return None
        content_length = request.headers.get("content-length")
        if content_length:
            try:
                declared_size = int(content_length)
            except ValueError:
                # A malformed header must be rejected rather than surface as an
                # unexpected error, which the middleware treats as fail-open.
                raise ValidationError("Invalid Content-Length header")
            if declared_size > self.max_json_size:
                raise ValidationError(f"JSON payload too large: {content_length} bytes")
        try:
            # Check if body has already been read
            if hasattr(request, '_cached_body'):
                body = request._cached_body
            else:
                body = await request.body()
                # Cache the body so FastAPI can read it later
                # NOTE(review): confirm downstream code actually reads
                # `_cached_body`; nothing in this class's callers is visible here.
                request._cached_body = body
            if len(body) > self.max_json_size:
                raise ValidationError(f"JSON payload too large: {len(body)} bytes")
            if not body:
                return {}
            payload = json.loads(body)
            # Recursively validate all string values
            self._validate_json_values(payload)
            return payload
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            raise ValidationError(f"Invalid JSON: {e}")
        except RecursionError:
            # Extremely deep nesting would otherwise abort validation entirely.
            raise ValidationError("JSON payload is nested too deeply")

    def _validate_json_values(self, obj: Any, path: str = "root") -> None:
        """Recursively validate JSON values.

        Checks every string key and value against the blocked patterns and
        caps objects at ``max_form_fields`` keys and arrays at 1000 items.
        """
        if isinstance(obj, dict):
            if len(obj) > self.max_form_fields:
                raise ValidationError(f"Too many fields in object at {path}")
            for key, value in obj.items():
                if isinstance(key, str):
                    self.validate_string_content(key, f"{path}.{key}")
                self._validate_json_values(value, f"{path}.{key}")
        elif isinstance(obj, list):
            if len(obj) > 1000:  # Prevent large arrays
                raise ValidationError(f"Array too large at {path}")
            for i, item in enumerate(obj):
                self._validate_json_values(item, f"{path}[{i}]")
        elif isinstance(obj, str):
            self.validate_string_content(obj, path)

    def validate_query_params(self, request: "Request") -> None:
        """Validate query parameters (both names and values)."""
        for key, value in request.query_params.items():
            self.validate_string_content(key, f"query.{key}")
            self.validate_string_content(str(value), f"query.{key}")

    def validate_headers(self, request: "Request") -> None:
        """Validate request headers.

        Only the host/URL override headers listed below are pattern-checked;
        the User-Agent is limited to 500 characters.

        Raises:
            SecurityValidationError: on a pattern match or an overlong User-Agent.
        """
        suspicious_headers = {
            "x-forwarded-host",
            "x-original-host",
            "x-rewrite-url"
        }
        for header_name, header_value in request.headers.items():
            # Check for suspicious headers
            if header_name.lower() in suspicious_headers:
                self.validate_string_content(header_value, f"header.{header_name}")
            # Validate user-agent length
            if header_name.lower() == "user-agent" and len(header_value) > 500:
                raise SecurityValidationError("User-Agent header too long")
class ValidationMiddleware:
    """FastAPI middleware for enhanced request validation.

    Validation and request handling are deliberately kept apart: only errors
    raised by the validator itself are turned into 4xx responses or tolerated.
    Exceptions raised by the downstream handler propagate unchanged and never
    cause the request to be dispatched a second time.
    """

    def __init__(self):
        self.validator = RequestValidator()

    def _track(self, request: Request, start_time: float, error_types: List[str]) -> None:
        """Report one validation outcome; an empty ``error_types`` means valid."""
        track_validation_metrics(
            endpoint=request.url.path,
            method=request.method,
            is_valid=not error_types,
            validation_time=time.time() - start_time,
            error_types=error_types
        )

    async def __call__(self, request: Request, call_next):
        """Process validation for the request.

        Returns a 400 response on a security violation, a 422 response on a
        format violation, and otherwise the downstream response. An unexpected
        validator failure is logged and the request is still processed.
        ``validation_time`` covers the validation step only.
        """
        start_time = time.time()
        # Skip validation for timing adjustment endpoint temporarily
        if "/vtt/adjust-timing" in request.url.path:
            return await call_next(request)
        try:
            # Validate headers
            self.validator.validate_headers(request)
            # Validate query parameters
            self.validator.validate_query_params(request)
            # Validate JSON payload if present
            if request.method in ["POST", "PUT", "PATCH"]:
                await self.validator.validate_json_payload(request)
        except SecurityValidationError:
            self._track(request, start_time, ["security"])
            return JSONResponse(
                status_code=status.HTTP_400_BAD_REQUEST,
                content={
                    "detail": "Security validation failed",
                    "error_code": "SECURITY_VALIDATION_ERROR"
                }
            )
        except ValidationError as e:
            self._track(request, start_time, ["format"])
            return JSONResponse(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                content={
                    "detail": str(e),
                    "error_code": "VALIDATION_ERROR"
                }
            )
        except Exception as e:
            self._track(request, start_time, ["unknown"])
            # Log unexpected error but continue processing
            print(f"Validation middleware error: {e}")
        else:
            # Track successful validation
            self._track(request, start_time, [])
        # Process the request exactly once, outside the try block, so handler
        # errors are not mistaken for validation failures.
        return await call_next(request)
async def create_validation_middleware() -> ValidationMiddleware:
    """Build a fresh ``ValidationMiddleware`` instance."""
    middleware = ValidationMiddleware()
    return middleware

View file

@ -0,0 +1,5 @@
"""Database migration framework for MongoDB."""
from .migrator import Migration, MigrationManager

# Names re-exported as the package's public interface.
__all__ = [
    "MigrationManager",
    "Migration",
]

View file

@ -0,0 +1,253 @@
"""MongoDB migration framework."""
import os
import importlib.util
from abc import ABC, abstractmethod
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from motor.motor_asyncio import AsyncIOMotorDatabase
from app.core.database import get_database
from app.core.logging import get_logger
from app.telemetry.tracing import trace_async_operation
logger = get_logger(__name__)
class Migration(ABC):
    """Abstract parent of every migration script.

    Subclasses set ``version`` and ``description`` and implement ``up`` and
    ``down``; the database handle is supplied through ``set_database``.
    """

    def __init__(self):
        # Version layout: YYYY-MM-DD-HHMMSS
        self.version: str = "0000-00-00-000000"
        self.description: str = ""
        self.db: Optional[AsyncIOMotorDatabase] = None

    @abstractmethod
    async def up(self) -> None:
        """Apply the migration."""

    @abstractmethod
    async def down(self) -> None:
        """Rollback the migration."""

    async def set_database(self, db: AsyncIOMotorDatabase) -> None:
        """Store the database handle used by ``up`` and ``down``."""
        self.db = db
class MigrationRecord:
    """Plain holder for one applied-migration entry: version, description, time applied."""

    def __init__(self, version: str, description: str, applied_at: datetime):
        self.version, self.description, self.applied_at = (
            version,
            description,
            applied_at,
        )
class MigrationManager:
    """Manages database migrations.

    Migration scripts are modules named ``migration_<version>_<description>.py``
    in the ``scripts`` directory next to this file. Applied versions are
    recorded in the ``migration_history`` collection.
    """

    def __init__(self):
        self.db: Optional[AsyncIOMotorDatabase] = None
        self.migrations_dir = Path(__file__).parent / "scripts"
        self.collection_name = "migration_history"

    @staticmethod
    def _version_from_name(migration_name: str) -> str:
        """Extract the version token from a migration module name.

        Assumes the format ``migration_YYYY-MM-DD-HHMMSS_description``.
        """
        return migration_name.replace("migration_", "").split("_")[0]

    async def initialize(self) -> None:
        """Initialize the migration manager."""
        self.db = await get_database()
        await self._ensure_migration_collection()

    async def _ensure_migration_collection(self) -> None:
        """Ensure the migration history collection exists with proper indexes."""
        collection = self.db[self.collection_name]
        # Create indexes for migration history
        await collection.create_index([("version", 1)], unique=True)
        await collection.create_index([("applied_at", -1)])
        logger.info("Migration history collection initialized")

    def discover_migrations(self) -> List[str]:
        """Discover all migration files in the migrations directory.

        Returns:
            Module names (file stems) starting with ``migration_``, sorted
            lexicographically, or an empty list if the directory is missing.
        """
        if not self.migrations_dir.exists():
            logger.warning(f"Migrations directory not found: {self.migrations_dir}")
            return []
        # Sort by version (filename should start with version)
        return sorted(
            file_path.stem
            for file_path in self.migrations_dir.glob("*.py")
            if file_path.name.startswith("migration_")
        )

    async def load_migration(self, migration_name: str) -> Migration:
        """Dynamically load a migration class.

        Raises:
            FileNotFoundError: if the script file does not exist.
            ImportError: if no import spec or loader can be built for the file.
            AttributeError: if the module defines no ``Migration`` class.
        """
        migration_path = self.migrations_dir / f"{migration_name}.py"
        if not migration_path.exists():
            raise FileNotFoundError(f"Migration file not found: {migration_path}")
        # Load the module
        spec = importlib.util.spec_from_file_location(migration_name, migration_path)
        if spec is None or spec.loader is None:
            # spec_from_file_location may return None; fail with a clear error
            # instead of an AttributeError on None.
            raise ImportError(f"Cannot load migration module: {migration_path}")
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        # Get the migration class (assume it's named Migration)
        if not hasattr(module, 'Migration'):
            raise AttributeError(f"Migration class not found in {migration_name}")
        migration_class = getattr(module, 'Migration')
        migration = migration_class()
        await migration.set_database(self.db)
        return migration

    async def get_applied_migrations(self) -> List[str]:
        """Get list of applied migration versions, sorted ascending."""
        collection = self.db[self.collection_name]
        cursor = collection.find({}, {"version": 1}).sort("version", 1)
        applied = []
        async for doc in cursor:
            applied.append(doc["version"])
        return applied

    async def record_migration(self, migration: Migration) -> None:
        """Record a successful migration in the database."""
        collection = self.db[self.collection_name]
        record = {
            "version": migration.version,
            "description": migration.description,
            "applied_at": datetime.utcnow()
        }
        await collection.insert_one(record)
        logger.info(f"Recorded migration: {migration.version} - {migration.description}")

    async def remove_migration_record(self, version: str) -> None:
        """Remove a migration record (for rollback)."""
        collection = self.db[self.collection_name]
        await collection.delete_one({"version": version})
        logger.info(f"Removed migration record: {version}")

    @trace_async_operation("migration_manager.migrate_up")
    async def migrate_up(self, target_version: Optional[str] = None) -> List[str]:
        """
        Apply migrations up to the target version.

        Args:
            target_version: Version to migrate to. If None, applies all pending migrations.

        Returns:
            List of applied migration versions.

        Raises:
            Exception: re-raises whatever a failing migration raised; earlier
                migrations in the run stay applied.
        """
        await self.initialize()
        # Discover all migrations
        all_migrations = self.discover_migrations()
        applied_migrations = set(await self.get_applied_migrations())
        # Find pending migrations
        pending_migrations = []
        for migration_name in all_migrations:
            version = self._version_from_name(migration_name)
            if version in applied_migrations:
                continue
            # Version strings compare lexicographically, which matches
            # chronological order for the YYYY-MM-DD-HHMMSS format.
            if target_version is None or version <= target_version:
                pending_migrations.append((migration_name, version))
        # Sort by version
        pending_migrations.sort(key=lambda x: x[1])
        applied = []
        for migration_name, version in pending_migrations:
            try:
                logger.info(f"Applying migration: {migration_name}")
                migration = await self.load_migration(migration_name)
                await migration.up()
                await self.record_migration(migration)
                applied.append(version)
                logger.info(f"Successfully applied migration: {version}")
            except Exception as e:
                logger.error(f"Failed to apply migration {migration_name}: {e}")
                raise
        return applied

    @trace_async_operation("migration_manager.migrate_down")
    async def migrate_down(self, target_version: str) -> List[str]:
        """
        Rollback migrations down to the target version.

        Args:
            target_version: Version to rollback to.

        Returns:
            List of rolled back migration versions, newest first.

        Raises:
            Exception: re-raises whatever a failing rollback raised.
        """
        await self.initialize()
        applied_migrations = await self.get_applied_migrations()
        # Find migrations to rollback (newer than target)
        to_rollback = [v for v in reversed(applied_migrations) if v > target_version]
        # Map each version to its script by exact version match. A substring
        # test could pick a file whose description merely contains the version.
        files_by_version = {}
        for migration_file in self.discover_migrations():
            files_by_version.setdefault(self._version_from_name(migration_file), migration_file)
        rolled_back = []
        for version in to_rollback:
            try:
                # Find migration file for this version
                migration_name = files_by_version.get(version)
                if not migration_name:
                    logger.warning(f"Migration file not found for version {version}")
                    continue
                logger.info(f"Rolling back migration: {migration_name}")
                migration = await self.load_migration(migration_name)
                await migration.down()
                await self.remove_migration_record(version)
                rolled_back.append(version)
                logger.info(f"Successfully rolled back migration: {version}")
            except Exception as e:
                logger.error(f"Failed to rollback migration {version}: {e}")
                raise
        return rolled_back

    async def get_migration_status(self) -> dict:
        """Get current migration status.

        ``pending_migrations`` counts discovered scripts whose version has not
        been recorded. It is not a subtraction of totals, which would be wrong
        when the history holds versions that no longer have a script.
        """
        await self.initialize()
        all_migrations = self.discover_migrations()
        applied_migrations = await self.get_applied_migrations()
        applied_versions = set(applied_migrations)
        pending_count = sum(
            1
            for migration_name in all_migrations
            if self._version_from_name(migration_name) not in applied_versions
        )
        return {
            "total_migrations": len(all_migrations),
            "applied_migrations": len(applied_migrations),
            "pending_migrations": pending_count,
            "latest_applied": applied_migrations[-1] if applied_migrations else None,
            "all_applied": applied_migrations
        }

View file

@ -0,0 +1,64 @@
"""Initial database schema setup migration."""
from datetime import datetime
from app.migrations.migrator import Migration
class Migration(Migration):
    """First migration: builds the indexes for users, jobs, and audit_logs."""

    def __init__(self):
        super().__init__()
        self.version = "2025-08-17-120000"
        self.description = "Initial database schema with users, jobs, and audit_logs collections"

    async def up(self) -> None:
        """Create the initial indexes, one collection at a time."""
        # --- users ---
        users = self.db.users
        await users.create_index([("email", 1)], unique=True)
        for field, direction in (("role", 1), ("is_active", 1), ("created_at", -1)):
            await users.create_index([(field, direction)])

        # --- jobs (the last entry is the compound index for job queries) ---
        jobs = self.db.jobs
        job_index_keys = (
            [("status", 1), ("created_at", -1)],
            [("client_id", 1)],
            [("updated_at", -1)],
            [("languages", 1)],
            [("status", 1), ("client_id", 1), ("created_at", -1)],
        )
        for keys in job_index_keys:
            await jobs.create_index(keys)

        # --- audit_logs ---
        audit_logs = self.db.audit_logs
        await audit_logs.create_index([("timestamp", -1)])
        for leading_field in ("action", "user_id", "severity"):
            await audit_logs.create_index([(leading_field, 1), ("timestamp", -1)])
        await audit_logs.create_index([("resource_type", 1), ("resource_id", 1)])
        for leading_field in ("ip_address", "success"):
            await audit_logs.create_index([(leading_field, 1), ("timestamp", -1)])
        # Text search index for audit logs
        text_fields = ("description", "details", "error_message")
        await audit_logs.create_index([(name, "text") for name in text_fields])

        print(f"✅ Applied migration {self.version}: {self.description}")

    async def down(self) -> None:
        """Drop the three collections entirely; this deletes their data."""
        # Destructive: in production you may want a backup before running this.
        for collection in (self.db.users, self.db.jobs, self.db.audit_logs):
            await collection.drop()
        print(f"⚠️ Rolled back migration {self.version}: {self.description}")
        print("⚠️ WARNING: All data has been deleted!")

View file

@ -0,0 +1,134 @@
"""Index optimization migration for improved query performance."""
from app.migrations.migrator import Migration
class Migration(Migration):
    """Optimize indexes for better query performance.

    All indexes are declared once in ``_INDEXES`` so that ``up`` and ``down``
    cannot drift apart and ``down`` knows which collection owns each index.
    """

    # (collection, index name, key specification, extra create_index options)
    _INDEXES = (
        # Jobs: status transitions and monitoring
        ("jobs", "jobs_status_updated_client_idx",
         [("status", 1), ("updated_at", -1), ("client_id", 1)], {}),
        # Jobs: queue management (pending jobs)
        ("jobs", "jobs_queue_processing_idx",
         [("status", 1), ("created_at", 1)], {}),
        # Jobs: client job history
        ("jobs", "jobs_client_history_idx",
         [("client_id", 1), ("created_at", -1), ("status", 1)], {}),
        # Jobs: sparse index for error tracking
        ("jobs", "jobs_error_tracking_idx",
         [("status", 1), ("error", 1)], {"sparse": True}),
        # Users: active user queries
        ("users", "users_active_role_login_idx",
         [("is_active", 1), ("role", 1), ("last_login_at", -1)], {}),
        # Users: text search over email and name fields
        ("users", "users_search_idx",
         [("email", "text"), ("first_name", "text"), ("last_name", "text")], {}),
        # Audit logs: security monitoring
        ("audit_logs", "audit_security_monitoring_idx",
         [("severity", 1), ("action", 1), ("timestamp", -1)], {}),
        # Audit logs: user activity analysis
        ("audit_logs", "audit_user_activity_idx",
         [("user_id", 1), ("action", 1), ("timestamp", -1)], {}),
        # Audit logs: resource access tracking
        ("audit_logs", "audit_resource_access_idx",
         [("resource_type", 1), ("resource_id", 1), ("action", 1), ("timestamp", -1)], {}),
        # Audit logs: sparse index for failed operations
        # NOTE(review): the initial-schema migration already indexes
        # (success, timestamp) with the same keys under an auto-generated name;
        # MongoDB may reject this one as a conflict — confirm on a real server.
        ("audit_logs", "audit_failures_idx",
         [("success", 1), ("timestamp", -1)], {"sparse": True}),
    )

    def __init__(self):
        super().__init__()
        self.version = "2025-08-17-120001"
        self.description = "Index optimization for query performance improvements"

    async def up(self) -> None:
        """Add optimized indexes for common query patterns."""
        for collection_name, index_name, keys, options in self._INDEXES:
            await self.db[collection_name].create_index(keys, name=index_name, **options)
        # Add TTL index for automatic audit log cleanup (optional)
        # Uncomment if you want automatic cleanup after 2 years
        # await self.db.audit_logs.create_index(
        #     [("timestamp", 1)],
        #     expireAfterSeconds=63072000,  # 2 years
        #     name="audit_ttl_idx"
        # )
        print(f"✅ Applied migration {self.version}: {self.description}")

    async def down(self) -> None:
        """Remove the optimized indexes.

        Each index is dropped only from the collection that owns it. A failed
        drop (for example because the index is already gone) is reported and
        does not stop the rollback.
        """
        for collection_name, index_name, _keys, _options in self._INDEXES:
            try:
                await self.db[collection_name].drop_index(index_name)
            except Exception as exc:
                # Best effort: keep going, but leave a trace instead of
                # silently discarding the error.
                print(f"Skipped dropping index {index_name} on {collection_name}: {exc}")
        print(f"⚠️ Rolled back migration {self.version}: {self.description}")

View file

@ -0,0 +1,155 @@
"""Migrate audit log schema from basic to comprehensive format."""
from datetime import datetime
from app.migrations.migrator import Migration
class Migration(Migration):
    """Update audit log schema to comprehensive format.

    ``up`` rewrites legacy documents (``when``/``job_id`` fields, old action
    names) into the new shape. ``down`` converts documents back, including the
    action names, but fields that exist only in the new schema are dropped.
    """

    # Legacy action name -> new dotted action value.
    _ACTION_MAP = {
        # Job actions
        "job_created": "job.create",
        "job_approved": "job.approve",
        "job_rejected": "job.reject",
        "job_updated": "job.update",
        "job_cancelled": "job.cancel",
        # Auth actions
        "login": "auth.login.success",
        "logout": "auth.logout",
        "login_failed": "auth.login.failure",
        # File actions
        "file_uploaded": "file.upload",
        "file_downloaded": "file.download",
        # VTT actions
        "vtt_edited": "vtt.edit",
        # Admin actions
        "user_created": "user.create",
        "user_updated": "user.update",
        "user_deleted": "user.delete",
    }
    # Inverse of _ACTION_MAP (its values are unique), used by the rollback.
    _REVERSE_ACTION_MAP = {new: old for old, new in _ACTION_MAP.items()}

    # Optional fields copied unchanged in both directions.
    _PASSTHROUGH_AFTER_JOB = ("ip_address", "user_agent", "details")

    def __init__(self):
        super().__init__()
        self.version = "2025-08-17-120002"
        self.description = "Update audit log schema from basic to comprehensive format"

    async def up(self) -> None:
        """Migrate existing audit logs to new schema format.

        A document is treated as legacy when it has ``when`` or ``job_id`` or
        lacks ``timestamp``. A failure on one document is printed and the
        migration moves on to the next document.
        """
        # Find all existing audit logs with old schema
        old_logs_cursor = self.db.audit_logs.find({
            # Look for logs that have the old schema structure
            "$or": [
                {"when": {"$exists": True}},  # Old timestamp field
                {"job_id": {"$exists": True}},  # Old job-specific logs
                {"timestamp": {"$exists": False}}  # Missing new timestamp field
            ]
        })
        migration_count = 0
        async for old_log in old_logs_cursor:
            try:
                # Map old fields to new schema
                new_log = {
                    "_id": old_log["_id"],
                    "timestamp": old_log.get("when", old_log.get("timestamp", datetime.utcnow())),
                    "action": self._map_old_action(old_log.get("action", "unknown")),
                    "severity": "info",
                    "description": old_log.get("action", "Legacy action"),
                    "success": True,
                    "environment": "prod",
                    "service_name": "accessible-video-api",
                    "api_version": "v1"
                }
                # Map optional fields if they exist
                if "user_id" in old_log:
                    new_log["user_id"] = old_log["user_id"]
                if "job_id" in old_log:
                    new_log["resource_type"] = "job"
                    new_log["resource_id"] = old_log["job_id"]
                for field in self._PASSTHROUGH_AFTER_JOB:
                    if field in old_log:
                        new_log[field] = old_log[field]
                # Replace the old document with the new schema
                await self.db.audit_logs.replace_one(
                    {"_id": old_log["_id"]},
                    new_log
                )
                migration_count += 1
            except Exception as e:
                print(f"Error migrating audit log {old_log.get('_id')}: {e}")
                continue
        print(f"✅ Applied migration {self.version}: Migrated {migration_count} audit log records")

    def _map_old_action(self, old_action: str) -> str:
        """Map old action strings to new AuditAction enum values.

        Unknown actions are returned unchanged.
        """
        return self._ACTION_MAP.get(old_action, old_action)

    def _map_new_action(self, new_action: str) -> str:
        """Map a new action value back to its legacy name (unknown values pass through)."""
        return self._REVERSE_ACTION_MAP.get(new_action, new_action)

    async def down(self) -> None:
        """Rollback to old audit log schema format (limited).

        Every document that has both ``timestamp`` and ``action`` is
        converted, including logs originally written in the new schema.
        """
        # Find all audit logs with new schema
        new_logs_cursor = self.db.audit_logs.find({
            "timestamp": {"$exists": True},
            "action": {"$exists": True}
        })
        rollback_count = 0
        async for new_log in new_logs_cursor:
            try:
                # Map new fields back to old schema (lossy conversion)
                old_log = {
                    "_id": new_log["_id"],
                    "when": new_log["timestamp"],
                    # Restore the legacy action name; previously the new dotted
                    # value was left in place, so the rollback was incomplete.
                    "action": self._map_new_action(new_log["action"])
                }
                # Map back optional fields
                if "user_id" in new_log:
                    old_log["user_id"] = new_log["user_id"]
                if "resource_type" in new_log and new_log["resource_type"] == "job":
                    old_log["job_id"] = new_log.get("resource_id")
                for field in self._PASSTHROUGH_AFTER_JOB:
                    if field in new_log:
                        old_log[field] = new_log[field]
                # Replace with old schema
                await self.db.audit_logs.replace_one(
                    {"_id": new_log["_id"]},
                    old_log
                )
                rollback_count += 1
            except Exception as e:
                print(f"Error rolling back audit log {new_log.get('_id')}: {e}")
                continue
        print(f"⚠️ Rolled back migration {self.version}: Reverted {rollback_count} audit log records")
        print("⚠️ WARNING: Some audit log data may have been lost due to schema differences")

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,175 @@
"""Audit log model for tracking sensitive operations."""
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Optional
from bson import ObjectId
from pydantic import BaseModel, Field
from .user import PyObjectId
class AuditAction(str, Enum):
    """String-valued identifiers for every auditable action.

    Values are dotted names grouped by area: ``auth.*``, ``user.*``, ``job.*``,
    ``file.*``, ``vtt.*``, ``admin.*`` and ``security.*``.
    """

    # --- auth ---
    LOGIN_SUCCESS = "auth.login.success"
    LOGIN_FAILURE = "auth.login.failure"
    LOGOUT = "auth.logout"
    TOKEN_REFRESH = "auth.token.refresh"
    PASSWORD_CHANGE = "auth.password.change"
    PASSWORD_RESET = "auth.password.reset"

    # --- user management ---
    USER_CREATE = "user.create"
    USER_UPDATE = "user.update"
    USER_DELETE = "user.delete"
    USER_ROLE_CHANGE = "user.role.change"
    USER_ACTIVATE = "user.activate"
    USER_DEACTIVATE = "user.deactivate"

    # --- job management ---
    JOB_CREATE = "job.create"
    JOB_UPDATE = "job.update"
    JOB_DELETE = "job.delete"
    JOB_APPROVE = "job.approve"
    JOB_REJECT = "job.reject"
    JOB_CANCEL = "job.cancel"
    JOB_STATUS_CHANGE = "job.status.change"

    # --- files ---
    FILE_UPLOAD = "file.upload"
    FILE_DOWNLOAD = "file.download"
    FILE_DELETE = "file.delete"
    FILE_ACCESS = "file.access"

    # --- VTT editing ---
    VTT_EDIT = "vtt.edit"
    VTT_APPROVE = "vtt.approve"
    VTT_REJECT = "vtt.reject"

    # --- admin ---
    ADMIN_CONFIG_CHANGE = "admin.config.change"
    ADMIN_SYSTEM_ACTION = "admin.system.action"
    ADMIN_DATA_EXPORT = "admin.data.export"
    ADMIN_AUDIT_ACCESS = "admin.audit.access"

    # --- security events ---
    RATE_LIMIT_EXCEEDED = "security.rate_limit.exceeded"
    VALIDATION_FAILURE = "security.validation.failure"
    UNAUTHORIZED_ACCESS = "security.unauthorized.access"
    SUSPICIOUS_ACTIVITY = "security.suspicious.activity"
class AuditLogSeverity(str, Enum):
    """How serious an audit event is, from routine to security incident."""

    # Normal operations
    INFO = "info"
    # Suspicious but not critical
    WARNING = "warning"
    # Failed operations
    ERROR = "error"
    # Security incidents
    CRITICAL = "critical"
class AuditLog(BaseModel):
    """Audit log entry model.

    ``id`` is exposed under the alias ``_id`` and defaults to a freshly
    generated ObjectId rendered as a string.
    """

    # PyObjectId is an Annotated[str, ...] alias; calling it (as the previous
    # default_factory did) just returns an empty str, so every new log shared
    # the id "". Generate a real ObjectId string instead.
    id: Optional[PyObjectId] = Field(default_factory=lambda: str(ObjectId()), alias="_id")

    # Core audit fields
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    action: AuditAction
    severity: AuditLogSeverity = AuditLogSeverity.INFO

    # Actor information
    user_id: Optional[PyObjectId] = None
    user_email: Optional[str] = None
    user_role: Optional[str] = None

    # Request context
    ip_address: Optional[str] = None
    user_agent: Optional[str] = None
    request_id: Optional[str] = None
    session_id: Optional[str] = None

    # Resource information
    resource_type: Optional[str] = None  # e.g., "job", "user", "file"
    resource_id: Optional[str] = None
    resource_name: Optional[str] = None

    # Action details
    description: str
    details: Dict[str, Any] = Field(default_factory=dict)

    # Outcome
    success: bool = True
    error_message: Optional[str] = None

    # Additional metadata
    environment: str = "prod"
    service_name: str = "accessible-video-api"
    api_version: str = "v1"

    class Config:
        populate_by_name = True
        arbitrary_types_allowed = True
        json_encoders = {ObjectId: str}
class AuditLogCreate(BaseModel):
    """Input schema for a new audit entry; only action and description are required."""

    # Required core fields (severity falls back to INFO)
    action: AuditAction
    severity: AuditLogSeverity = AuditLogSeverity.INFO
    description: str

    # Actor
    user_id: Optional[PyObjectId] = None
    user_email: Optional[str] = None
    user_role: Optional[str] = None

    # Request context
    ip_address: Optional[str] = None
    user_agent: Optional[str] = None
    request_id: Optional[str] = None

    # Affected resource
    resource_type: Optional[str] = None
    resource_id: Optional[str] = None
    resource_name: Optional[str] = None

    # Extra detail and outcome
    details: Dict[str, Any] = Field(default_factory=dict)
    success: bool = True
    error_message: Optional[str] = None
class AuditLogQuery(BaseModel):
    """Filter, search, paging, and sort options for looking up audit logs."""

    # Time window
    start_date: Optional[datetime] = None
    end_date: Optional[datetime] = None

    # Exact-match filters
    action: Optional[AuditAction] = None
    severity: Optional[AuditLogSeverity] = None
    user_id: Optional[PyObjectId] = None
    user_email: Optional[str] = None
    resource_type: Optional[str] = None
    resource_id: Optional[str] = None
    success: Optional[bool] = None

    # Full-text search in description and details
    search: Optional[str] = None

    # Paging
    skip: int = 0
    limit: int = 100

    # Ordering: sort_order is -1 for descending, 1 for ascending
    sort_by: str = "timestamp"
    sort_order: int = -1
class AuditLogResponse(BaseModel):
    """One page of audit log results together with paging information."""

    # Entries on this page
    logs: list[AuditLog]
    # Paging information
    total_count: int
    page: int
    page_size: int
    has_more: bool

95
backend/app/models/job.py Normal file
View file

@ -0,0 +1,95 @@
from datetime import datetime
from enum import Enum
from typing import Any, Literal, Optional
from pydantic import BaseModel, Field, constr
class JobStatus(str, Enum):
    """String-valued states a job can be in."""

    CREATED = "created"
    INGESTING = "ingesting"
    AI_PROCESSING = "ai_processing"

    # English QC outcomes
    PENDING_QC = "pending_qc"
    APPROVED_ENGLISH = "approved_english"
    REJECTED = "rejected"
    QC_FEEDBACK = "qc_feedback"

    # Localisation and final review
    TRANSLATING = "translating"
    TTS_GENERATING = "tts_generating"
    PENDING_FINAL_REVIEW = "pending_final_review"
    COMPLETED = "completed"
class Source(BaseModel):
    """Description of a job's source file."""

    filename: str
    original_filename: Optional[str] = None
    gcs_uri: str  # presumably a Google Cloud Storage URI — confirm with callers
    duration_s: Optional[float] = None
    # Language code, 2 to 10 characters long; defaults to "en"
    language: constr(min_length=2, max_length=10) = "en"
class RequestedOutputs(BaseModel):
    """Which deliverables a job should produce; all three output flags default to on."""

    # Output toggles
    captions_vtt: bool = True
    audio_description_vtt: bool = True
    audio_description_mp3: bool = True
    # Language lists, empty by default
    languages: list[str] = []
    transcreation: list[str] = []
class LangOutput(BaseModel):
    """Output locations and notes for one language; every field is optional."""

    # Locations of the generated artifacts
    captions_vtt_gcs: Optional[str] = None
    ad_vtt_gcs: Optional[str] = None
    ad_mp3_gcs: Optional[str] = None
    # How the output was produced: "translate" or "transcreate"
    origin: Optional[Literal["translate", "transcreate"]] = None
    qa_notes: Optional[str] = None
class ReviewHistoryItem(BaseModel):
    """One entry in a job's review history."""

    at: datetime  # when the entry was recorded
    status: str  # plain string, not typed as JobStatus
    by: Optional[str] = None  # presumably the acting user's id — TODO confirm
    notes: Optional[str] = None
class Review(BaseModel):
    """Review state of a job: notes, the reviewer and a history list."""

    notes: Optional[str] = ""  # defaults to an empty string rather than None
    reviewer_id: Optional[str] = None
    history: list[ReviewHistoryItem] = []
class AISection(BaseModel):
    """AI-related data stored on a job."""

    # NOTE(review): presumably the raw JSON returned by the ingestion model — confirm.
    ingestion_json: Optional[dict[str, Any]] = None
    confidence: Optional[float] = None
class Job(BaseModel):
    """Job model.

    ``id`` is read from the ``_id`` key via its alias; ``populate_by_name``
    additionally allows it to be supplied as ``id``.
    """

    id: Optional[str] = Field(None, alias="_id")
    client_id: str
    title: str
    source: Source
    requested_outputs: RequestedOutputs
    status: JobStatus = JobStatus.CREATED
    review: Review = Review()
    outputs: Optional[dict[str, LangOutput]] = None  # presumably keyed by language code — TODO confirm
    ai: Optional[AISection] = None
    error: Optional[dict[str, Any]] = None
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

    class Config:
        # Accept the field name as well as the "_id" alias when populating.
        populate_by_name = True
        # Store enum fields (e.g. status) as their plain values after validation.
        use_enum_values = True
class JobCreate(BaseModel):
    """Fields supplied when creating a job."""

    title: str
    language: str = "en"  # defaults to "en"
    requested_outputs: RequestedOutputs
class JobUpdate(BaseModel):
    """Partial update for a job; every field is optional and defaults to None."""

    title: Optional[str] = None
    status: Optional[JobStatus] = None
    review: Optional[Review] = None
    outputs: Optional[dict[str, LangOutput]] = None
    ai: Optional[AISection] = None
    error: Optional[dict[str, Any]] = None

View file

@ -0,0 +1,57 @@
from datetime import datetime
from enum import Enum
from typing import Optional, Annotated
from bson import ObjectId
from pydantic import BaseModel, EmailStr, Field, BeforeValidator
def validate_object_id(v) -> str:
    """Normalise an identifier to ``str``.

    Strings are returned unchanged and ``ObjectId`` instances are rendered
    with ``str()``. Any other type raises ``ValueError``.
    """
    if not isinstance(v, (ObjectId, str)):
        raise ValueError('Invalid ObjectId')
    return v if isinstance(v, str) else str(v)


# String type that runs validate_object_id before Pydantic's own validation.
PyObjectId = Annotated[str, BeforeValidator(validate_object_id)]
class UserRole(str, Enum):
    """Roles a user account can hold; members compare equal to their string values."""

    CLIENT = "client"
    REVIEWER = "reviewer"
    ADMIN = "admin"
class User(BaseModel):
    """User account model.

    ``id`` is read from the ``_id`` key via its alias and passes through
    ``PyObjectId``, so an ``ObjectId`` value is converted to ``str``.
    """

    id: Optional[PyObjectId] = Field(None, alias="_id")
    email: EmailStr
    hashed_password: str
    full_name: str
    role: UserRole = UserRole.CLIENT
    is_active: bool = True
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

    class Config:
        # Accept the field name as well as the "_id" alias when populating.
        populate_by_name = True
        # Store enum fields (e.g. role) as their plain values after validation.
        use_enum_values = True
class UserInDB(User):
    """Subclass of ``User`` that adds no fields of its own."""

    pass
class UserCreate(BaseModel):
    """Fields supplied when creating a user."""

    email: EmailStr
    password: str  # NOTE(review): presumably plaintext, hashed before storage (User has hashed_password) — confirm
    full_name: str
    role: UserRole = UserRole.CLIENT  # defaults to the client role
class UserUpdate(BaseModel):
    """Partial update for a user; every field is optional and defaults to None."""

    email: Optional[EmailStr] = None
    full_name: Optional[str] = None
    role: Optional[UserRole] = None
    is_active: Optional[bool] = None

View file

@ -0,0 +1,57 @@
SYSTEM:
You are an expert accessibility writer for film/TV and e-learning. Produce STRICT JSON only.
USER:
You are given a video. Return a JSON object with:
- language: BCP-47 code (e.g., "en")
- confidence: 0..1
- summary: 1-2 sentence synopsis
- transcript_plaintext: full spoken words, punctuated
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program
Constraints:
- Output MUST be valid JSON. Do not include markdown fences or any other text.
- All JSON strings must be properly escaped (use \" for quotes within strings)
- Use detailed, descriptive audio description phrases that paint a vivid picture. Aim for rich descriptions that are 20% longer than typical AD, providing enhanced visual context without duplicating spoken dialogue.
- WebVTT must start with "WEBVTT" and follow this exact format:
- Timestamp format: HH:MM:SS.mmm --> HH:MM:SS.mmm (ALWAYS include hours, even if 00:)
- Example: "00:01:23.456 --> 00:01:27.890"
- Each cue must be separated by blank lines
- Never use MM:SS format - always include the hour component
- Escape all newlines in VTT strings as \n
- Do not include trailing commas in JSON objects or arrays
CRITICAL TIMING REQUIREMENTS:
- Caption timing must be PRECISELY synchronized with the actual speech in the video
- Each caption cue should start exactly when the speaker begins that phrase/sentence
- Each caption cue should end exactly when the speaker finishes that phrase/sentence
- Listen carefully to detect natural speech pauses and word boundaries
- Avoid starting captions too early or ending them too late
- Ensure captions align with lip movement and speech rhythm
- For audio descriptions, time them during natural speech gaps or over non-dialogue audio
- Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends)
AUDIO DESCRIPTION GUIDELINES:
- Provide rich, detailed descriptions that include setting, characters, actions, facial expressions, body language, and visual mood
- Describe colors, lighting, camera angles, and composition when relevant to understanding
- Include environmental details like weather, time of day, architectural features, or technological elements
- Mention clothing, objects, and spatial relationships that contribute to scene understanding
- Use vivid, engaging language that creates a complete mental picture for visually impaired viewers
- Aim for descriptions that are substantive enough to fill natural pauses and reduce silence between spoken content
CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
Example output format (the ```json fences below only delimit the example; do not include them in your response):
```json
{
"language": "en",
"confidence": 0.95,
"summary": "A tutorial video showing how to use a web application dashboard.",
"transcript_plaintext": "Hello everyone, welcome to this tutorial. Today we'll be exploring the dashboard interface. First, let's log in to the system.",
"captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHello everyone, welcome to this tutorial.\n\n00:00:04.000 --> 00:00:07.200\nToday we'll be exploring the dashboard interface.\n\n00:00:08.000 --> 00:00:10.500\nFirst, let's log in to the system.",
"audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nA bright computer monitor displays a clean, modern login page with blue and white corporate branding. The interface features prominently positioned username and password fields.\n\n00:00:05.000 --> 00:00:07.000\nA cursor arrow hovers over the rectangular username input field, which highlights with a subtle blue border as the user prepares to type.\n\n00:00:10.000 --> 00:00:12.000\nThe screen transitions to reveal a comprehensive dashboard filled with colorful charts, data widgets, and navigation panels arranged in an organized grid layout."
}
```
Follow this exact structure and formatting.

View file

@ -0,0 +1,20 @@
SYSTEM:
You are a culturally-savvy accessibility writer.
USER:
Rewrite the following English captions and audio descriptions into {TARGET_LANGUAGE}, preserving:
- meaning, tone, and accessibility intent,
- timing boundaries (same cue timestamps),
- line lengths friendly for readability (~32-40 chars).
Input:
- captions_vtt_en: <VTT text>
- ad_vtt_en: <VTT text>
- brief: <brand + audience notes>
Output:
JSON:
{
"captions_vtt": "<VTT in {TARGET_LANGUAGE}>",
"audio_description_vtt": "<VTT in {TARGET_LANGUAGE}>"
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,72 @@
from typing import Optional
from pydantic import BaseModel, EmailStr
from ..models.user import UserRole
class LoginRequest(BaseModel):
    """Login credentials: an email address and a password."""

    email: EmailStr
    password: str
class LoginResponse(BaseModel):
    """Login result: an access token plus the user's id and role."""

    access_token: str
    token_type: str = "bearer"  # defaults to "bearer"
    user_id: str
    role: str  # plain string here, not the UserRole enum
class RefreshResponse(BaseModel):
    """Token refresh result carrying an access token."""

    access_token: str
    token_type: str = "bearer"  # defaults to "bearer"
class LogoutResponse(BaseModel):
    """Logout result; ``message`` has a fixed default text."""

    message: str = "Successfully logged out"
# User management schemas for admin routes
class UserResponse(BaseModel):
    """User representation returned to clients; carries no password field."""

    id: str
    email: EmailStr
    full_name: str
    role: UserRole
    is_active: bool
    created_at: Optional[str] = None  # string, not datetime — presumably pre-formatted by the route; confirm
class UserListResponse(BaseModel):
    """A page of users together with paging counters."""

    users: list[UserResponse]
    total: int
    page: int
    size: int
class CreateUserRequest(BaseModel):
    """Fields supplied to create a user."""

    email: EmailStr
    password: str
    full_name: str
    role: UserRole = UserRole.CLIENT  # defaults to the client role
class UpdateUserRequest(BaseModel):
    """Partial user update; every field is optional and defaults to None."""

    email: Optional[EmailStr] = None
    full_name: Optional[str] = None
    role: Optional[UserRole] = None
    is_active: Optional[bool] = None
class ChangePasswordRequest(BaseModel):
    """Password change payload: the current and the new password."""

    current_password: str
    new_password: str
class ResetPasswordRequest(BaseModel):
    """Password reset payload identifying the account by email."""

    email: EmailStr
class AdminStatsResponse(BaseModel):
    """Aggregate statistics about users and jobs."""

    total_users: int
    total_jobs: int
    jobs_by_status: dict[str, int]  # presumably status value -> job count — TODO confirm
    active_jobs_today: int
    avg_processing_time_hours: float

View file

@ -0,0 +1,15 @@
from typing import Optional
from pydantic import BaseModel
class SignedUploadRequest(BaseModel):
    """Describes a file for which a signed upload is requested."""

    filename: str
    content_type: str
    max_size: Optional[int] = None  # presumably bytes — TODO confirm
class SignedUploadResponse(BaseModel):
    """Signed upload details: target URL, form fields and the blob path."""

    upload_url: str
    fields: dict[str, str]  # presumably form fields to send with the upload — confirm
    blob_path: str

View file

@ -0,0 +1,89 @@
from typing import Any, Optional, Union
from pydantic import BaseModel
from ..models.job import JobStatus, LangOutput, RequestedOutputs, Review
class JobResponse(BaseModel):
    """Job representation returned to clients."""

    id: str
    title: str
    status: JobStatus
    source: dict[str, Any]  # untyped dict here, not the Source model
    requested_outputs: RequestedOutputs
    review: Review
    outputs: Optional[dict[str, LangOutput]] = None
    created_at: Optional[str] = None  # string, not datetime — presumably pre-formatted by the route; confirm
    updated_at: Optional[str] = None
class JobListResponse(BaseModel):
    """A page of jobs together with paging counters."""

    jobs: list[JobResponse]
    total: int
    page: int
    size: int
class JobCreateRequest(BaseModel):
    """Fields supplied to create a job."""

    title: str
    language: str = "en"  # defaults to "en"
    requested_outputs: RequestedOutputs
class JobUpdateRequest(BaseModel):
    """Partial job update: an optional title and optional review notes."""

    title: Optional[str] = None
    review_notes: Optional[str] = None
class ApproveEnglishRequest(BaseModel):
    """Approval payload; notes are optional."""

    notes: Optional[str] = None
class RejectJobRequest(BaseModel):
    """Rejection payload; unlike the approve/complete payloads, notes are required."""

    notes: str
class CompleteJobRequest(BaseModel):
    """Completion payload; notes are optional."""

    notes: Optional[str] = None
class VttUpdateRequest(BaseModel):
    """Replacement VTT text for a language; each track is optional."""

    captions_vtt: Optional[str] = None
    audio_description_vtt: Optional[str] = None
    language: str = "en"  # defaults to "en"
class VttTimingAdjustRequest(BaseModel):
    """Timing shift to apply to a language's VTT tracks."""

    offset_seconds: float  # NOTE(review): sign convention (positive = later?) is not defined here — confirm
    language: str = "en"  # defaults to "en"
    # Both tracks are adjusted by default.
    adjust_captions: bool = True
    adjust_audio_description: bool = True
class JobDownloadsResponse(BaseModel):
    """Download links for a job; values are either a nested dict or a single string."""

    downloads: dict[str, Union[dict[str, str], str]]  # language -> {file_type: signed_url} OR source_video -> signed_url
class VttContentResponse(BaseModel):
    """VTT text for the caption and audio-description tracks; each may be absent."""

    captions_vtt: Optional[str] = None
    audio_description_vtt: Optional[str] = None
class AssetValidationResponse(BaseModel):
    """Validation outcome with error and warning messages."""

    is_valid: bool
    errors: list[str]  # required, even when empty
    warnings: list[str] = []  # optional; defaults to an empty list
class JobDeleteResponse(BaseModel):
    """Result of deleting a job, as a message string."""

    message: str
class BulkDeleteRequest(BaseModel):
    """Ids of the jobs to delete in one request."""

    job_ids: list[str]
class BulkDeleteResponse(BaseModel):
    """Outcome of a bulk delete: counts plus error messages."""

    deleted_count: int
    total_requested: int
    errors: list[str]

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,331 @@
"""Audit logging service for tracking sensitive operations."""
import uuid
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from fastapi import Request
from motor.motor_asyncio import AsyncIOMotorCollection
from app.core.database import get_database
from app.core.config import get_settings
from app.models.audit_log import (
AuditLog,
AuditLogCreate,
AuditLogQuery,
AuditLogResponse,
AuditAction,
AuditLogSeverity
)
from app.models.user import User
from app.telemetry.tracing import trace_async_operation
class AuditLogger:
    """Service for managing audit logs.

    Writes audit entries to, and reads them back from, the ``audit_logs``
    collection of the database returned by ``get_database()``.
    """

    def __init__(self):
        self.settings = get_settings()
        # Resolved lazily on first use by _get_collection().
        self.collection: Optional[AsyncIOMotorCollection] = None

    async def _get_collection(self) -> AsyncIOMotorCollection:
        """Get the audit logs collection, resolving and caching it on first use."""
        # Compare with None explicitly: PyMongo/Motor collection objects do
        # not support truth-value testing, so ``not self.collection`` is
        # unsafe once a collection has been cached.
        if self.collection is None:
            db = await get_database()
            self.collection = db.audit_logs
        return self.collection

    @staticmethod
    def _documents_to_logs(documents: List[Dict[str, Any]]) -> List[AuditLog]:
        """Convert raw documents to AuditLog models, logging and skipping invalid ones."""
        import logging  # local import keeps this block self-contained

        log = logging.getLogger(__name__)
        logs: List[AuditLog] = []
        for doc in documents:
            try:
                logs.append(AuditLog(**doc))
            except Exception as e:
                # Best effort: one malformed document must not fail the query.
                log.warning("Error converting audit log document: %s", e)
        return logs

    @trace_async_operation("audit_logger.log_action")
    async def log_action(
        self,
        action: AuditAction,
        description: str,
        user: Optional[User] = None,
        request: Optional[Request] = None,
        resource_type: Optional[str] = None,
        resource_id: Optional[str] = None,
        resource_name: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
        severity: AuditLogSeverity = AuditLogSeverity.INFO,
        success: bool = True,
        error_message: Optional[str] = None
    ) -> str:
        """
        Log an audit event.

        Args:
            action: The audited action.
            description: Human-readable description of the event.
            user: Acting user, if any; id, email and role are copied from it.
            request: Incoming request, if any; client IP, User-Agent and
                request id are extracted from it.
            resource_type: Type of the affected resource.
            resource_id: Id of the affected resource.
            resource_name: Name of the affected resource.
            details: Extra structured data; stored as ``{}`` when omitted.
            severity: Severity of the entry.
            success: Whether the audited operation succeeded.
            error_message: Error text for failed operations.

        Returns:
            The ID of the created audit log entry.
        """
        # Extract request context
        ip_address = None
        user_agent = None
        request_id = None

        if request:
            # Get IP address (handle forwarded headers); the first entry of
            # X-Forwarded-For is used when the header is present.
            forwarded_for = request.headers.get("X-Forwarded-For")
            if forwarded_for:
                ip_address = forwarded_for.split(',')[0].strip()
            elif request.client:
                ip_address = request.client.host

            user_agent = request.headers.get("User-Agent")
            request_id = request.headers.get("X-Request-ID", str(uuid.uuid4()))

        # User is configured with use_enum_values, so ``role`` may already be
        # a plain string; only unwrap ``.value`` when it is an enum member.
        user_role = getattr(user.role, "value", user.role) if user else None

        # Create audit log entry
        audit_log = AuditLog(
            action=action,
            severity=severity,
            description=description,
            user_id=user.id if user else None,
            user_email=user.email if user else None,
            user_role=user_role,
            ip_address=ip_address,
            user_agent=user_agent,
            request_id=request_id,
            resource_type=resource_type,
            resource_id=resource_id,
            resource_name=resource_name,
            details=details or {},
            success=success,
            error_message=error_message,
            environment=self.settings.app_env,
            service_name="accessible-video-api",
            api_version="v1"
        )

        # Save to database
        collection = await self._get_collection()
        result = await collection.insert_one(audit_log.dict(by_alias=True))

        return str(result.inserted_id)

    @trace_async_operation("audit_logger.query_logs")
    async def query_logs(self, query: AuditLogQuery) -> AuditLogResponse:
        """Query audit logs with filtering and pagination.

        Args:
            query: Filters, search term, paging and sorting options.

        Returns:
            The matching page of logs plus the total match count and
            paging information.
        """
        import re  # local import: only needed to escape the search term

        collection = await self._get_collection()

        # Build MongoDB query
        mongo_query: Dict[str, Any] = {}

        # Time range filter
        if query.start_date or query.end_date:
            timestamp_filter = {}
            if query.start_date:
                timestamp_filter["$gte"] = query.start_date
            if query.end_date:
                timestamp_filter["$lte"] = query.end_date
            mongo_query["timestamp"] = timestamp_filter

        # Exact match filters
        if query.action:
            mongo_query["action"] = query.action
        if query.severity:
            mongo_query["severity"] = query.severity
        if query.user_id:
            mongo_query["user_id"] = query.user_id
        if query.user_email:
            mongo_query["user_email"] = query.user_email
        if query.resource_type:
            mongo_query["resource_type"] = query.resource_type
        if query.resource_id:
            mongo_query["resource_id"] = query.resource_id
        if query.success is not None:
            mongo_query["success"] = query.success

        # Text search
        if query.search:
            # Escape the term so caller input is matched literally and cannot
            # inject regex syntax or a pathological pattern.
            pattern = re.escape(query.search)
            # NOTE(review): ``details`` is stored as a dict by log_action, so
            # a $regex on it presumably never matches — confirm and consider
            # a text index instead.
            mongo_query["$or"] = [
                {"description": {"$regex": pattern, "$options": "i"}},
                {"details": {"$regex": pattern, "$options": "i"}},
                {"error_message": {"$regex": pattern, "$options": "i"}}
            ]

        # Get total count
        total_count = await collection.count_documents(mongo_query)

        # Execute query with sorting and pagination
        cursor = (
            collection.find(mongo_query)
            .sort(query.sort_by, query.sort_order)
            .skip(query.skip)
            .limit(query.limit)
        )
        documents = await cursor.to_list(length=query.limit)

        # Convert to Pydantic models
        logs = self._documents_to_logs(documents)

        # Calculate pagination info; guard against limit == 0, which would
        # otherwise raise ZeroDivisionError.
        page = (query.skip // query.limit) + 1 if query.limit > 0 else 1
        has_more = (query.skip + len(logs)) < total_count

        return AuditLogResponse(
            logs=logs,
            total_count=total_count,
            page=page,
            page_size=len(logs),
            has_more=has_more
        )

    async def get_user_activity(self, user_id: str, days: int = 30) -> List[AuditLog]:
        """Get recent activity for a specific user.

        Covers the last ``days`` days counted from today's UTC midnight,
        newest first, capped at 1000 entries.
        """
        from_date = datetime.utcnow().replace(
            hour=0, minute=0, second=0, microsecond=0
        ) - timedelta(days=days)

        query = AuditLogQuery(
            user_id=user_id,
            start_date=from_date,
            limit=1000,
            sort_by="timestamp",
            sort_order=-1
        )

        response = await self.query_logs(query)
        return response.logs

    async def get_security_events(self, hours: int = 24) -> List[AuditLog]:
        """Get recent security-related events.

        Returns up to 1000 entries from the last ``hours`` hours, newest
        first, restricted to the security-related actions listed below.
        """
        from_date = datetime.utcnow() - timedelta(hours=hours)

        security_actions = [
            AuditAction.LOGIN_FAILURE,
            AuditAction.RATE_LIMIT_EXCEEDED,
            AuditAction.VALIDATION_FAILURE,
            AuditAction.UNAUTHORIZED_ACCESS,
            AuditAction.SUSPICIOUS_ACTIVITY
        ]

        collection = await self._get_collection()

        query = {
            "timestamp": {"$gte": from_date},
            "action": {"$in": security_actions}
        }

        cursor = collection.find(query).sort("timestamp", -1).limit(1000)
        documents = await cursor.to_list(length=1000)

        return self._documents_to_logs(documents)

    async def cleanup_old_logs(self, retention_days: int = 365) -> int:
        """Clean up audit logs older than retention period.

        Returns:
            The number of deleted documents.
        """
        cutoff_date = datetime.utcnow().replace(
            hour=0, minute=0, second=0, microsecond=0
        ) - timedelta(days=retention_days)

        collection = await self._get_collection()
        result = await collection.delete_many({
            "timestamp": {"$lt": cutoff_date}
        })

        return result.deleted_count


# Global audit logger instance
audit_logger = AuditLogger()
# Convenience functions for common audit operations
async def log_auth_success(user: User, request: Request):
    """Record a successful login for ``user`` in the audit log."""
    message = f"User {user.email} logged in successfully"
    await audit_logger.log_action(
        AuditAction.LOGIN_SUCCESS,
        message,
        user=user,
        request=request,
        severity=AuditLogSeverity.INFO,
    )
async def log_auth_failure(email: str, request: Request, reason: str):
    """Record a failed login attempt for ``email`` with its ``reason``.

    The entry is written at WARNING severity, marked unsuccessful, and
    keeps the attempted address under ``details``.
    """
    message = f"Failed login attempt for {email}: {reason}"
    extra = {"attempted_email": email}
    await audit_logger.log_action(
        AuditAction.LOGIN_FAILURE,
        message,
        request=request,
        details=extra,
        severity=AuditLogSeverity.WARNING,
        success=False,
        error_message=reason,
    )
async def log_job_action(action: AuditAction, job_id: str, user: User, request: Request, details: Optional[Dict] = None):
    """Record a job-related action performed by ``user`` on job ``job_id``.

    Known actions get a readable label; any other action falls back to
    ``str(action)``.
    """
    labels = {
        AuditAction.JOB_CREATE: "Job created",
        AuditAction.JOB_APPROVE: "Job approved",
        AuditAction.JOB_REJECT: "Job rejected",
        AuditAction.JOB_CANCEL: "Job cancelled",
        AuditAction.JOB_UPDATE: "Job updated"
    }
    label = labels.get(action, str(action))
    message = f"{label} by {user.email}"

    await audit_logger.log_action(
        action,
        message,
        user=user,
        request=request,
        resource_type="job",
        resource_id=job_id,
        details=details,
    )
async def log_user_management(action: AuditAction, target_user_id: str, admin_user: User, request: Request, details: Optional[Dict] = None):
    """Record a user-management action by ``admin_user`` on ``target_user_id``.

    Known actions get a readable label; any other action falls back to
    ``str(action)``.
    """
    labels = {
        AuditAction.USER_CREATE: "User created",
        AuditAction.USER_UPDATE: "User updated",
        AuditAction.USER_DELETE: "User deleted",
        AuditAction.USER_ROLE_CHANGE: "User role changed",
        AuditAction.USER_ACTIVATE: "User activated",
        AuditAction.USER_DEACTIVATE: "User deactivated"
    }
    label = labels.get(action, str(action))
    message = f"{label} by admin {admin_user.email}"

    await audit_logger.log_action(
        action,
        message,
        user=admin_user,
        request=request,
        resource_type="user",
        resource_id=target_user_id,
        details=details,
        severity=AuditLogSeverity.INFO,
    )
async def log_security_event(action: AuditAction, description: str, request: Request, user: Optional[User] = None, details: Optional[Dict] = None):
    """Record a security-related event, always marked as unsuccessful.

    SUSPICIOUS_ACTIVITY is logged at CRITICAL severity; every other action
    is logged at WARNING.
    """
    if action == AuditAction.SUSPICIOUS_ACTIVITY:
        level = AuditLogSeverity.CRITICAL
    else:
        level = AuditLogSeverity.WARNING

    await audit_logger.log_action(
        action,
        description,
        user=user,
        request=request,
        details=details,
        severity=level,
        success=False,
    )

View file

@ -0,0 +1,123 @@
from jinja2 import Template
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import Content, From, Mail, Subject, To
from ..core.config import settings
from ..core.logging import get_logger
logger = get_logger(__name__)
class EmailService:
    """Sends job-completion e-mails through SendGrid.

    When no SendGrid API key is configured the service stays usable but
    every send attempt returns ``False``.
    """

    def __init__(self):
        if settings.sendgrid_api_key:
            self.client = SendGridAPIClient(api_key=settings.sendgrid_api_key)
        else:
            logger.warning("SendGrid API key not configured")
            self.client = None

    async def send_completion_email(
        self,
        recipient_email: str,
        job_title: str,
        download_links: dict[str, dict[str, str]]
    ) -> bool:
        """Send job completion email with download links.

        Args:
            recipient_email: Address the message is sent to.
            job_title: Title shown in the subject and body.
            download_links: Mapping of language -> {file_type: url}.

        Returns:
            True when SendGrid accepts the message (HTTP 202); False when
            SendGrid is not configured, answers with another status, or
            any exception occurs. Errors are logged, never raised.
        """
        import asyncio  # local import keeps this block self-contained

        if not self.client:
            logger.error("SendGrid not configured, cannot send email")
            return False

        try:
            # Render email template
            html_content = self._render_completion_template(
                job_title=job_title,
                download_links=download_links
            )

            message = Mail(
                from_email=From(settings.email_from, "Accessible Video Platform"),
                to_emails=To(recipient_email),
                subject=Subject(f"Your accessible video assets are ready: {job_title}"),
                html_content=Content("text/html", html_content)
            )

            # The SendGrid client is synchronous; run it in a worker thread
            # so the HTTP round-trip does not block the event loop.
            response = await asyncio.to_thread(self.client.send, message)

            if response.status_code == 202:
                logger.info(f"Completion email sent successfully to {recipient_email}")
                return True
            else:
                logger.error(f"Failed to send email, status code: {response.status_code}")
                return False

        except Exception as e:
            logger.error(f"Email sending failed: {e}")
            return False

    def _render_completion_template(
        self,
        job_title: str,
        download_links: dict[str, dict[str, str]]
    ) -> str:
        """Render the completion email HTML template.

        Autoescaping is enabled so the job title, language names, file
        types and URLs are HTML-escaped before being placed in the markup.
        """
        template_str = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Your Accessible Video Assets Are Ready</title>
<style>
body { font-family: Arial, sans-serif; line-height: 1.6; color: #333; }
.container { max-width: 600px; margin: 0 auto; padding: 20px; }
.header { background-color: #4f46e5; color: white; padding: 20px; text-align: center; }
.content { padding: 20px; }
.download-section { margin: 20px 0; padding: 15px; background-color: #f9fafb; border-radius: 8px; }
.download-link { display: inline-block; padding: 10px 20px; margin: 5px; background-color: #4f46e5; color: white; text-decoration: none; border-radius: 5px; }
.footer { text-align: center; padding: 20px; color: #6b7280; font-size: 12px; }
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>Your Accessible Video Assets Are Ready!</h1>
</div>
<div class="content">
<h2>{{ job_title }}</h2>
<p>Great news! Your video accessibility assets have been processed and are ready for download.</p>
{% for language, files in download_links.items() %}
<div class="download-section">
<h3>{{ language.upper() }} Assets</h3>
{% for file_type, url in files.items() %}
<a href="{{ url }}" class="download-link">
Download {{ file_type|replace('_', ' ')|title }}
</a>
{% endfor %}
</div>
{% endfor %}
<p><strong>Important:</strong> These download links will expire in 24 hours for security purposes.</p>
<p>If you need assistance or have questions about your accessible video assets, please don't hesitate to contact our support team.</p>
</div>
<div class="footer">
<p>This email was sent by the Accessible Video Platform</p>
<p>Links expire in 24 hours for security</p>
</div>
</div>
</body>
</html>
"""

        # autoescape=True: job_title is caller-supplied text rendered into
        # HTML, so it must not be able to inject markup.
        template = Template(template_str, autoescape=True)
        return template.render(
            job_title=job_title,
            download_links=download_links
        )


# Global service instance
email_service = EmailService()

168
backend/app/services/gcs.py Normal file
View file

@ -0,0 +1,168 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from typing import Optional
from fastapi import HTTPException, UploadFile
from google.cloud import storage
from google.cloud.exceptions import NotFound
from ..core.config import settings
from ..core.logging import get_logger
logger = get_logger(__name__)
class GCSService:
    """Async wrapper around the blocking google-cloud-storage client.

    Every operation runs on a small private thread pool so callers can
    ``await`` it. Failures are logged and surfaced as ``HTTPException``,
    except in ``file_exists``, which lets exceptions propagate.
    """

    def __init__(self):
        self.client = storage.Client(project=settings.gcp_project_id)
        self.bucket = self.client.bucket(settings.gcs_bucket)
        self.executor = ThreadPoolExecutor(max_workers=4)

    async def _run_blocking(self, func):
        """Run a blocking zero-argument callable on the service's thread pool."""
        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine; get_event_loop() is discouraged there.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self.executor, func)

    async def upload_file_to_gcs(
        self,
        file: UploadFile,
        destination_path: str,
        content_type: Optional[str] = None
    ) -> str:
        """Upload file to GCS and return the GCS URI.

        Args:
            file: Uploaded file; its stream is rewound before uploading.
            destination_path: Blob path inside the configured bucket.
            content_type: Overrides the upload's own content type if given.

        Raises:
            HTTPException: 500 when the upload fails.
        """

        def _upload():
            blob = self.bucket.blob(destination_path)

            # Set content type (explicit argument wins over the upload's own)
            if content_type:
                blob.content_type = content_type
            elif file.content_type:
                blob.content_type = file.content_type

            # Upload file
            file.file.seek(0)  # Reset file pointer
            blob.upload_from_file(file.file)

            return f"gs://{settings.gcs_bucket}/{destination_path}"

        try:
            return await self._run_blocking(_upload)
        except Exception as e:
            logger.error(f"Failed to upload file to GCS: {e}")
            raise HTTPException(status_code=500, detail="File upload failed") from e

    async def upload_text_to_gcs(
        self,
        content: str,
        destination_path: str,
        content_type: str = "text/plain"
    ) -> str:
        """Upload text content to GCS and return the GCS URI.

        Raises:
            HTTPException: 500 when the upload fails.
        """

        def _upload():
            blob = self.bucket.blob(destination_path)
            blob.content_type = content_type
            blob.upload_from_string(content, content_type=content_type)
            return f"gs://{settings.gcs_bucket}/{destination_path}"

        try:
            return await self._run_blocking(_upload)
        except Exception as e:
            logger.error(f"Failed to upload text to GCS: {e}")
            raise HTTPException(status_code=500, detail="Text upload failed") from e

    async def get_signed_url(
        self,
        blob_path: str,
        expiration_hours: int = 24,
        method: str = "GET"
    ) -> str:
        """Generate a V4 signed URL for an existing blob.

        Args:
            blob_path: Blob path inside the configured bucket.
            expiration_hours: Lifetime of the URL in hours.
            method: HTTP method the URL is signed for; ``"GET"`` (the
                default) yields a download link.

        Raises:
            HTTPException: 404 when the blob does not exist, 500 on any
                other failure.
        """

        def _get_signed_url():
            blob = self.bucket.blob(blob_path)

            # Check if blob exists
            if not blob.exists():
                raise NotFound(f"File not found: {blob_path}")

            expiration = datetime.utcnow() + timedelta(hours=expiration_hours)

            return blob.generate_signed_url(
                expiration=expiration,
                method=method,
                version="v4"
            )

        try:
            return await self._run_blocking(_get_signed_url)
        except NotFound as e:
            raise HTTPException(status_code=404, detail="File not found") from e
        except Exception as e:
            logger.error(f"Failed to generate signed URL: {e}")
            raise HTTPException(status_code=500, detail="Failed to generate download URL") from e

    async def delete_file(self, blob_path: str) -> bool:
        """Delete a file from GCS.

        Returns:
            True when the blob was deleted, False when it did not exist.

        Raises:
            HTTPException: 500 on any other failure.
        """

        def _delete():
            blob = self.bucket.blob(blob_path)
            blob.delete()
            return True

        try:
            return await self._run_blocking(_delete)
        except NotFound:
            return False
        except Exception as e:
            logger.error(f"Failed to delete file from GCS: {e}")
            raise HTTPException(status_code=500, detail="File deletion failed") from e

    async def file_exists(self, blob_path: str) -> bool:
        """Check if a file exists in GCS; client errors propagate unchanged."""

        def _exists():
            blob = self.bucket.blob(blob_path)
            return blob.exists()

        return await self._run_blocking(_exists)


# Global GCS service instance
gcs_service = GCSService()
# Convenience functions
async def upload_file_to_gcs(file: UploadFile, destination_path: str) -> str:
    """Upload ``file`` through the shared GCS service and return its GCS URI."""
    gcs_uri = await gcs_service.upload_file_to_gcs(
        file=file,
        destination_path=destination_path,
    )
    return gcs_uri
async def upload_vtt_to_gcs(content: str, destination_path: str) -> str:
    """Upload VTT text with the ``text/vtt`` content type and return its GCS URI."""
    gcs_uri = await gcs_service.upload_text_to_gcs(
        content=content,
        destination_path=destination_path,
        content_type="text/vtt",
    )
    return gcs_uri
async def upload_json_to_gcs(content: str, destination_path: str) -> str:
    """Upload JSON text with the ``application/json`` content type and return its GCS URI."""
    gcs_uri = await gcs_service.upload_text_to_gcs(
        content=content,
        destination_path=destination_path,
        content_type="application/json",
    )
    return gcs_uri
async def get_signed_download_url(blob_path: str, expiration_hours: int = 24) -> str:
    """Return a signed URL for ``blob_path`` valid for ``expiration_hours`` hours."""
    signed_url = await gcs_service.get_signed_url(
        blob_path=blob_path,
        expiration_hours=expiration_hours,
    )
    return signed_url
async def generate_signed_upload_url(
    blob_path: str,
    content_type: str,
    max_size: int = 1024 * 1024 * 1024  # 1GB
) -> dict:
    """Generate a signed POST policy for direct browser-to-GCS upload.

    Args:
        blob_path: Destination blob path inside the service's bucket.
        content_type: MIME type the browser will upload; sent back as the
            ``Content-Type`` form field.
        max_size: Upper bound for the upload size in bytes.

    Returns:
        ``{"url": <upload endpoint>, "fields": <form fields to post>}``.
        The policy expires after one hour.
    """

    def _generate():
        # generate_signed_post_policy_v4 lives on storage.Client (not on
        # Blob) and returns a dict with "url" and "fields" keys.
        policy = gcs_service.client.generate_signed_post_policy_v4(
            gcs_service.bucket.name,
            blob_path,
            expiration=timedelta(hours=1),
            conditions=[
                ["content-length-range", 1, max_size],
                ["starts-with", "$Content-Type", content_type.split("/")[0]]
            ],
            fields={
                "Content-Type": content_type
            }
        )

        return {"url": policy["url"], "fields": policy["fields"]}

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(gcs_service.executor, _generate)

View file

@ -0,0 +1,350 @@
import json
import asyncio
from pathlib import Path
from typing import Any, Optional
import google.genai as genai
from ..core.config import settings
from ..core.logging import get_logger
logger = get_logger(__name__)
# Configure Gemini client
# Module-level client, created at import time from settings.gemini_api_key
# and used by the GeminiService methods below.
client = genai.Client(api_key=settings.gemini_api_key)
class GeminiService:
def __init__(self):
    """Set the Gemini model name and locate the prompt-template directory."""
    self.model_name = 'gemini-2.5-pro'  # Stable production model
    # "prompts" directory next to this module's parent package directory.
    self.prompts_dir = Path(__file__).parent.parent / "prompts"
def _load_prompt(self, prompt_file: str) -> str:
"""Load prompt template from prompts directory"""
prompt_path = self.prompts_dir / prompt_file
try:
return prompt_path.read_text()
except FileNotFoundError:
logger.error(f"Prompt file not found: {prompt_file}")
raise
async def _wait_for_file_active(self, file_name: str, max_wait_seconds: int = 300) -> bool:
    """Wait for uploaded file to become ACTIVE state.

    Polls the file's status with a growing delay (x1.5 per attempt,
    capped at 30s) until it is ACTIVE, FAILED, or the time budget runs out.

    Args:
        file_name: Name of the uploaded file as returned by the upload call.
        max_wait_seconds: Total time budget for polling, in seconds.

    Returns:
        True when the file became ACTIVE; False when it FAILED or the
        budget was exhausted.
    """
    wait_time = 1  # Start with 1 second
    total_waited = 0

    while total_waited < max_wait_seconds:
        try:
            # The SDK call is synchronous; run it in a worker thread so
            # polling does not block the event loop.
            file_info = await asyncio.to_thread(client.files.get, name=file_name)
            logger.info(f"File {file_name} status: {file_info.state} (waited {total_waited}s)")

            if file_info.state == "ACTIVE":
                logger.info(f"File {file_name} is now ACTIVE!")
                return True
            elif file_info.state == "FAILED":
                logger.error(f"File {file_name} processing FAILED")
                return False

            # Wait with exponential backoff (max 30s), never sleeping past
            # the remaining time budget.
            delay = min(wait_time, max_wait_seconds - total_waited)
            logger.info(f"File not ready, waiting {delay}s...")
            await asyncio.sleep(delay)
            total_waited += delay
            wait_time = min(wait_time * 1.5, 30)  # Exponential backoff, max 30s

        except Exception as e:
            # Status checks are retried: a transient error must not abort
            # the wait, but it still consumes time budget.
            logger.error(f"Error checking file status: {e}")
            delay = min(5, max_wait_seconds - total_waited)  # Wait up to 5s on error
            await asyncio.sleep(delay)
            total_waited += delay

    logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s")
    return False
async def extract_accessibility(self, video_file_path: str) -> dict[str, Any]:
    """
    Extract captions and audio descriptions from video using Gemini.

    Uploads the video, waits for it to become ACTIVE, asks the model
    configured in ``self.model_name`` for a JSON document and validates it.

    Args:
        video_file_path: Local path of the video to process.

    Returns:
        Structured JSON with transcript, captions VTT, and audio
        description VTT. When the model's reply is not valid JSON, the
        result of ``_self_heal_response`` is returned instead.

    Raises:
        ValueError: When the reply has no text, a required field is
            missing, or a VTT field does not start with "WEBVTT".
        Exception: Any other failure is logged and re-raised.
    """
    prompt = self._load_prompt("gemini_ingestion.md")
    uploaded_file = None
    response_text = ""

    try:
        logger.info(f"Starting Gemini processing for video: {video_file_path}")

        # Upload video file to Gemini using new API. The SDK is
        # synchronous, so blocking calls are pushed to a worker thread.
        logger.info("Uploading video file to Gemini API...")
        uploaded_file = await asyncio.to_thread(
            client.files.upload,
            file=video_file_path,
            config={
                "display_name": f"video_processing_{Path(video_file_path).name}",
                "mime_type": "video/mp4"
            }
        )
        logger.info(f"Successfully uploaded file: {uploaded_file.name} (URI: {uploaded_file.uri})")

        # Wait for file to become ACTIVE before using it
        logger.info("Waiting for file to become ACTIVE...")
        file_ready = await self._wait_for_file_active(uploaded_file.name)
        if not file_ready:
            raise Exception("File failed to become ACTIVE within timeout")

        # Generate content using new API
        logger.info("Generating content with Gemini model...")
        response = await asyncio.to_thread(
            client.models.generate_content,
            model=self.model_name,
            contents=[
                genai.types.Part.from_text(text=prompt),
                genai.types.Part.from_uri(
                    file_uri=uploaded_file.uri,
                    mime_type=uploaded_file.mime_type
                )
            ]
        )

        # Parse JSON response
        if response.text is None:
            raise ValueError("Gemini returned no text content")
        response_text = response.text.strip()
        logger.info(f"Received Gemini response (first 200 chars): {response_text[:200]}...")

        # Handle potential markdown formatting
        if response_text.startswith("```"):
            response_text = self._strip_code_fence(response_text)
            logger.info("Cleaned markdown formatting from response")

        logger.info("Parsing JSON response...")
        try:
            result = json.loads(response_text)
        except json.JSONDecodeError as e:
            logger.error(f"JSON parse error at position {e.pos}: {e.msg}")
            # Log the problematic area
            start = max(0, e.pos - 100)
            end = min(len(response_text), e.pos + 100)
            problematic_text = response_text[start:end]
            logger.error(f"Problematic JSON area: ...{problematic_text}...")
            raise  # handled by the outer JSONDecodeError handler (self-heal)

        # Validate required fields
        required_fields = [
            "language", "confidence", "summary",
            "transcript_plaintext", "captions_vtt", "audio_description_vtt"
        ]

        for field in required_fields:
            if field not in result:
                raise ValueError(f"Missing required field: {field}")

        # Validate VTT format
        if not result["captions_vtt"].startswith("WEBVTT"):
            raise ValueError("Invalid captions VTT format")

        if not result["audio_description_vtt"].startswith("WEBVTT"):
            raise ValueError("Invalid audio description VTT format")

        logger.info(
            f"Successfully extracted accessibility content with confidence: {result['confidence']}"
        )

        return result

    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse Gemini JSON response: {e}")
        logger.error(f"Raw response that failed to parse: {response_text}")
        # Attempt self-healing
        return await self._self_heal_response(video_file_path, response_text)

    except Exception as e:
        logger.error(f"Gemini extraction failed with exception: {type(e).__name__}: {str(e)}")
        logger.error(f"Video file path: {video_file_path}")
        # Print to stdout for immediate visibility
        print(f"🚨 GEMINI ERROR: {type(e).__name__}: {str(e)}")
        raise

    finally:
        # Clean up uploaded file on every path (success, self-heal and
        # failure) so failed runs do not leave uploads behind.
        if uploaded_file is not None:
            try:
                await asyncio.to_thread(client.files.delete, name=uploaded_file.name)
            except Exception as e:
                logger.warning(f"Failed to cleanup uploaded file: {e}")

@staticmethod
def _strip_code_fence(text: str) -> str:
    """Remove one surrounding Markdown code fence (``` or ```json), if present."""
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    cleaned = cleaned[3:]
    # Drop the optional language tag that follows the opening fence.
    if cleaned[:4].lower() == "json":
        cleaned = cleaned[4:]
    cleaned = cleaned.strip()
    # Only the trailing fence is removed; backticks inside the payload stay.
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()
async def _self_heal_response(self, video_file_path: str, invalid_response: str) -> dict[str, Any]:
    """Attempt to repair an invalid JSON response from Gemini.

    Two strategies are tried in order:
      1. A local syntactic repair via ``self._attempt_json_fix``.
      2. A text-only Gemini request asking the model to fix the JSON.

    Args:
        video_file_path: Path of the video being processed. Not used by this
            method; the video is never re-uploaded here.
        invalid_response: The raw response text that failed to parse.

    Returns:
        The parsed response dict containing all required fields. A missing
        ``audio_description_vtt`` is replaced by a generic placeholder track.

    Raises:
        ValueError: If neither strategy yields valid JSON with the required
            fields. The underlying error is chained as ``__cause__``.
    """
    logger.info("Attempting to self-heal JSON response without re-uploading video")

    # Try to fix common JSON issues first
    try:
        fixed_response = self._attempt_json_fix(invalid_response)
        if fixed_response:
            logger.info("Successfully fixed JSON without re-processing")
            return fixed_response
    except Exception as e:
        logger.warning(f"JSON fix attempt failed: {e}")

    # If simple fixes don't work, try a text-only self-heal prompt with more context.
    # NOTE: this is a non-raw f-string, so the backslash must be doubled for the
    # model to actually see `\"`, and `}}` renders as a single `}`.
    self_heal_prompt = f"""
SYSTEM: You are a JSON repair service. Fix the malformed JSON below and return ONLY the corrected JSON.
CRITICAL REQUIREMENTS:
- The JSON MUST contain these exact fields: language, confidence, summary, transcript_plaintext, captions_vtt, audio_description_vtt
- If audio_description_vtt is truncated or missing, reconstruct it as a valid WebVTT with at least basic descriptions
- All VTT content must start with "WEBVTT" and have proper timestamp format (HH:MM:SS.mmm --> HH:MM:SS.mmm)
- Properly escape all quotes within strings using \\"
- Fix unterminated strings by adding closing quotes
- Remove trailing commas
- Ensure all JSON is properly closed with }}
Fix the JSON and return it:
{invalid_response}
"""

    try:
        response = client.models.generate_content(
            model=self.model_name,
            contents=[genai.types.Part.from_text(text=self_heal_prompt)]
        )
        # Guard against an empty response so the failure surfaces as a JSON
        # error rather than an AttributeError on None.
        response_text = (response.text or "").strip()

        # Handle potential markdown formatting
        if response_text.startswith("```json"):
            response_text = response_text.replace("```json", "").replace("```", "").strip()

        result = json.loads(response_text)

        # Validate that all required fields are present after healing
        required_fields = [
            "language", "confidence", "summary",
            "transcript_plaintext", "captions_vtt", "audio_description_vtt"
        ]
        missing_fields = [field for field in required_fields if field not in result]
        if missing_fields:
            logger.error(f"Self-heal lost required fields: {missing_fields}")

            # If audio_description_vtt is missing, create a basic one
            if "audio_description_vtt" in missing_fields:
                logger.info("Creating fallback audio_description_vtt")
                result["audio_description_vtt"] = "WEBVTT\n\n00:00:00.000 --> 00:00:05.000\nVideo content with visual elements described."

            # If other critical fields are missing, raise an error
            remaining_missing = [f for f in missing_fields if f != "audio_description_vtt"]
            if remaining_missing:
                raise ValueError(f"Self-heal failed to preserve required fields: {remaining_missing}")

        logger.info("Successfully self-healed Gemini response with all required fields")
        return result
    except Exception as e:
        logger.error(f"Self-heal attempt failed: {e}")
        # Chain the cause so the original failure stays visible in tracebacks.
        raise ValueError("Failed to get valid JSON from Gemini after self-heal attempt") from e
def _attempt_json_fix(self, json_text: str) -> dict[str, Any] | None:
    """Attempt to fix common JSON syntax issues without calling the model.

    Applies three heuristics in order: strip trailing commas, close an
    unterminated string, and append a missing closing brace.

    Args:
        json_text: The raw text that failed to parse as JSON.

    Returns:
        The parsed dict when the repaired text parses and contains every
        required field; otherwise ``None`` so the caller can fall back to
        model-based healing.
    """
    import re

    fixes_tried = []
    fixed_text = json_text

    # Fix 1: Remove trailing commas
    fixed_text = re.sub(r',(\s*[}\]])', r'\1', fixed_text)
    fixes_tried.append("removed trailing commas")

    # Fix 2: Try to fix unterminated strings by adding closing quote and brace.
    # Only *unescaped* quotes delimit JSON strings; counting escaped `\"` too
    # would flip the parity and hide (or invent) an unterminated string.
    quote_count = 0
    last_quote_pos = -1
    escaped = False
    for pos, char in enumerate(fixed_text):
        if escaped:
            escaped = False
        elif char == "\\":
            escaped = True
        elif char == '"':
            quote_count += 1
            last_quote_pos = pos
    dangling_escape = escaped  # text was cut off right after a backslash

    if quote_count % 2 != 0:  # Odd number of quotes suggests unterminated string
        remainder = fixed_text[last_quote_pos + 1:].strip()

        # If there's no closing brace after the last quote, try to fix it
        if remainder and not remainder.endswith('}'):
            if dangling_escape:
                # A trailing lone backslash would escape the quote we append.
                fixed_text = fixed_text[:-1]
            # Try to intelligently close the JSON
            if 'audio_description_vtt' in fixed_text[max(0, last_quote_pos - 100):]:
                # This appears to be in the audio_description_vtt field
                fixed_text += '"\n}'
                fixes_tried.append("closed unterminated audio_description_vtt string")
            else:
                fixed_text += '"'
                fixes_tried.append("closed unterminated string")

    # Fix 3: Ensure JSON ends with closing brace
    if not fixed_text.rstrip().endswith('}'):
        fixed_text = fixed_text.rstrip() + '\n}'
        fixes_tried.append("added closing brace")

    try:
        result = json.loads(fixed_text)
        logger.info(f"JSON fixed with: {', '.join(fixes_tried)}")

        # Validate that we have the required fields
        required_fields = [
            "language", "confidence", "summary",
            "transcript_plaintext", "captions_vtt", "audio_description_vtt"
        ]
        missing_fields = [field for field in required_fields if field not in result]
        if missing_fields:
            logger.warning(f"Fixed JSON is missing required fields: {missing_fields}")
            return None  # Let the more advanced self-healing handle this

        return result
    except json.JSONDecodeError as e:
        logger.debug(f"JSON fix attempt failed: {e}")
        return None
async def transcreate_content(
    self,
    captions_vtt: str,
    ad_vtt: str,
    target_language: str,
    brief: Optional[str] = None
) -> dict[str, str]:
    """Transcreate English VTT content to a target language with cultural adaptation.

    Args:
        captions_vtt: English captions in WebVTT format.
        ad_vtt: English audio-description track in WebVTT format.
        target_language: Language substituted for ``TARGET_LANGUAGE`` in the
            ``gemini_transcreation.md`` prompt template.
        brief: Optional brand/style guidance forwarded to the model.

    Returns:
        The parsed model response; guaranteed to contain the keys
        ``captions_vtt`` and ``audio_description_vtt``.

    Raises:
        ValueError: If the response is not valid JSON (the decode error is
            chained as ``__cause__``) or lacks a required VTT field.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    prompt_template = self._load_prompt("gemini_transcreation.md")

    # Format prompt with actual content
    prompt = prompt_template.format(
        TARGET_LANGUAGE=target_language
    )

    user_prompt = f"""
Input:
- captions_vtt_en: {captions_vtt}
- ad_vtt_en: {ad_vtt}
- brief: {brief or "No specific brand guidelines provided"}
Output:
JSON:
"""

    try:
        response = client.models.generate_content(
            model=self.model_name,
            contents=[
                genai.types.Part.from_text(text=prompt + "\n\n" + user_prompt)
            ]
        )
        # An empty response becomes a JSON decode error instead of an
        # AttributeError on None.
        response_text = (response.text or "").strip()

        # Handle potential markdown formatting
        if response_text.startswith("```json"):
            response_text = response_text.replace("```json", "").replace("```", "").strip()

        result = json.loads(response_text)

        # Validate required fields
        if "captions_vtt" not in result or "audio_description_vtt" not in result:
            raise ValueError("Missing required VTT fields in transcreation response")

        logger.info(f"Successfully transcreated content to {target_language}")
        return result
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse transcreation JSON response: {e}")
        # Preserve the decode error (position, message) for debugging.
        raise ValueError("Invalid JSON response from transcreation") from e
    except Exception as e:
        logger.error(f"Transcreation failed: {e}")
        raise
# Global service instance: constructed once when this module is imported and
# shared by importers of `gemini_service`.
gemini_service = GeminiService()

View file

@ -0,0 +1,284 @@
"""Google Cloud Secret Manager integration service."""
import asyncio
import os
import time
from functools import lru_cache
from typing import Any, Dict, List, Optional

from google.api_core import exceptions as gcp_exceptions
from google.cloud import secretmanager

from app.core.config import get_settings
from app.core.logging import get_logger
from app.telemetry.tracing import trace_async_operation
logger = get_logger(__name__)
class SecretManagerError(Exception):
    """Raised when a Secret Manager operation in this module fails."""
class SecretsManager:
    """Service for managing secrets via Google Cloud Secret Manager.

    Secret values are cached in-process for ``_cache_ttl`` seconds so that
    rotated secrets (in particular the ``latest`` version) are eventually
    re-read without restarting the application.
    """

    def __init__(self):
        self.settings = get_settings()
        # Created lazily by _get_client() so construction does no network I/O.
        self.client: Optional[secretmanager.SecretManagerServiceClient] = None
        self.project_id = self.settings.google_cloud_project
        # cache key ("<name>:<version>") -> secret value
        self._cache: Dict[str, str] = {}
        # cache key -> time.monotonic() deadline after which the entry is stale
        self._cache_expiry: Dict[str, float] = {}
        self._cache_ttl = 300  # 5 minutes cache

    def _get_client(self) -> secretmanager.SecretManagerServiceClient:
        """Get or create the Secret Manager client.

        Raises:
            SecretManagerError: If the client cannot be constructed.
        """
        if not self.client:
            try:
                self.client = secretmanager.SecretManagerServiceClient()
                logger.info("Secret Manager client initialized")
            except Exception as e:
                logger.error(f"Failed to initialize Secret Manager client: {e}")
                raise SecretManagerError(f"Failed to initialize Secret Manager: {e}") from e
        return self.client

    def _get_cached(self, cache_key: str) -> Optional[str]:
        """Return the cached value for ``cache_key`` or ``None`` if absent/expired."""
        if cache_key not in self._cache:
            return None
        # Entries without a recorded deadline are treated as non-expiring.
        deadline = self._cache_expiry.get(cache_key, float("inf"))
        if time.monotonic() >= deadline:
            self._cache.pop(cache_key, None)
            self._cache_expiry.pop(cache_key, None)
            return None
        return self._cache[cache_key]

    @trace_async_operation("secrets_manager.get_secret")
    async def get_secret(self, secret_name: str, version: str = "latest") -> str:
        """
        Retrieve a secret from Google Cloud Secret Manager.

        Args:
            secret_name: Name of the secret
            version: Version of the secret (default: "latest")

        Returns:
            The secret value as a string

        Raises:
            SecretManagerError: If secret cannot be retrieved
        """
        cache_key = f"{secret_name}:{version}"

        # Check cache first
        cached_value = self._get_cached(cache_key)
        if cached_value is not None:
            logger.debug(f"Secret {secret_name} retrieved from cache")
            return cached_value

        try:
            # Build the secret name
            name = f"projects/{self.project_id}/secrets/{secret_name}/versions/{version}"

            # Get the secret
            client = self._get_client()

            # Run in thread pool since Secret Manager client is synchronous
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                client.access_secret_version,
                {"name": name}
            )
            secret_value = response.payload.data.decode("UTF-8")

            # Cache the secret until its TTL deadline
            self._cache[cache_key] = secret_value
            self._cache_expiry[cache_key] = time.monotonic() + self._cache_ttl

            logger.info(f"Successfully retrieved secret: {secret_name}")
            return secret_value
        except gcp_exceptions.NotFound as e:
            error_msg = f"Secret not found: {secret_name}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e
        except gcp_exceptions.PermissionDenied as e:
            error_msg = f"Permission denied accessing secret: {secret_name}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e
        except Exception as e:
            error_msg = f"Failed to retrieve secret {secret_name}: {e}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e

    @trace_async_operation("secrets_manager.get_secrets_batch")
    async def get_secrets_batch(self, secret_names: List[str]) -> Dict[str, str]:
        """
        Retrieve multiple secrets efficiently.

        Secrets that fail with SecretManagerError are logged and omitted from
        the result, so the returned dict may be partial or empty.

        Args:
            secret_names: List of secret names to retrieve

        Returns:
            Dictionary mapping secret names to their values
        """
        secrets = {}

        # Start every lookup up front so they run concurrently.
        tasks = [
            (
                secret_name,
                asyncio.create_task(
                    self.get_secret(secret_name),
                    name=f"get_secret_{secret_name}"
                ),
            )
            for secret_name in secret_names
        ]

        # Wait for all tasks to complete
        for secret_name, task in tasks:
            try:
                secrets[secret_name] = await task
            except SecretManagerError as e:
                logger.warning(f"Failed to retrieve secret {secret_name}: {e}")
                # Continue with other secrets
                continue

        return secrets

    async def create_secret(self, secret_name: str, secret_value: str, labels: Optional[Dict[str, str]] = None) -> str:
        """
        Create a new secret in Secret Manager.

        Args:
            secret_name: Name of the secret
            secret_value: Value to store
            labels: Optional labels for the secret

        Returns:
            The resource name returned for the newly added secret version

        Raises:
            SecretManagerError: If the secret already exists or creation fails
        """
        try:
            client = self._get_client()
            parent = f"projects/{self.project_id}"

            # Create the secret
            secret = {
                "labels": labels or {},
                "replication": {"automatic": {}}
            }

            loop = asyncio.get_running_loop()

            # Create secret resource
            create_response = await loop.run_in_executor(
                None,
                client.create_secret,
                {
                    "parent": parent,
                    "secret_id": secret_name,
                    "secret": secret
                }
            )

            # Add secret version with the actual value
            version_response = await loop.run_in_executor(
                None,
                client.add_secret_version,
                {
                    "parent": create_response.name,
                    "payload": {"data": secret_value.encode("UTF-8")}
                }
            )

            logger.info(f"Successfully created secret: {secret_name}")
            return version_response.name
        except gcp_exceptions.AlreadyExists as e:
            error_msg = f"Secret already exists: {secret_name}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e
        except Exception as e:
            error_msg = f"Failed to create secret {secret_name}: {e}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e

    def clear_cache(self) -> None:
        """Clear the secrets cache (values and their expiry deadlines)."""
        self._cache.clear()
        self._cache_expiry.clear()
        logger.info("Secrets cache cleared")
# Global secrets manager instance: created at import time and used by the
# module-level convenience functions below.
secrets_manager = SecretsManager()
# Convenience functions for common operations
async def get_secret(secret_name: str, version: str = "latest") -> str:
    """Fetch one secret value through the module-level ``secrets_manager``.

    Args:
        secret_name: Name of the secret.
        version: Secret version to read (default: "latest").

    Returns:
        The secret value as a string.
    """
    secret_value = await secrets_manager.get_secret(secret_name, version)
    return secret_value
async def get_database_url() -> str:
    """Get the MongoDB connection URL from Secret Manager or the environment.

    Lookup order:
      1. Secret Manager secrets ``mongodb-url`` then ``mongodb-uri``.
      2. Environment variables ``MONGODB_URL`` then ``MONGODB_URI``.

    The ``-uri`` / ``_URI`` spellings are accepted because the backend deploy
    workflow provisions the value as ``MONGODB_URI=mongodb-uri:latest``.

    Returns:
        The MongoDB connection URL.

    Raises:
        SecretManagerError: If no source provides a non-empty value.
    """
    for secret_name in ("mongodb-url", "mongodb-uri"):
        try:
            return await secrets_manager.get_secret(secret_name)
        except SecretManagerError:
            # Try the next secret name, then fall back to the environment.
            continue

    # Fallback to environment variables
    for env_var in ("MONGODB_URL", "MONGODB_URI"):
        url = os.getenv(env_var)
        if url:
            return url

    raise SecretManagerError("MongoDB URL not available in secrets or environment")
async def get_redis_url() -> str:
    """Resolve the Redis connection URL.

    Reads the ``redis-url`` secret; if that fails with SecretManagerError,
    uses the ``REDIS_URL`` environment variable instead.

    Raises:
        SecretManagerError: If neither source provides a value.
    """
    try:
        redis_url = await secrets_manager.get_secret("redis-url")
    except SecretManagerError:
        # Secret unavailable: environment variable is the only other source.
        redis_url = os.getenv("REDIS_URL")
        if not redis_url:
            raise SecretManagerError("Redis URL not available in secrets or environment")
    return redis_url
async def get_jwt_secrets() -> Dict[str, str]:
    """Get JWT signing secrets from Secret Manager with environment fallback.

    ``get_secrets_batch`` omits secrets it could not read instead of raising,
    so each key is checked individually and any missing one is filled from
    the environment, then from a development default (with a warning).

    Returns:
        Dict with the keys ``jwt-secret`` and ``jwt-refresh-secret``.
    """
    # secret name -> (environment variables to try in order, development default)
    # JWT_SECRET is accepted because the deploy workflow exposes the secret
    # under that variable name.
    fallbacks = {
        "jwt-secret": (
            ("JWT_SECRET_KEY", "JWT_SECRET"),
            "dev-secret-change-in-production",
        ),
        "jwt-refresh-secret": (
            ("JWT_REFRESH_SECRET_KEY",),
            "dev-refresh-secret-change-in-production",
        ),
    }

    try:
        secrets = await secrets_manager.get_secrets_batch(list(fallbacks))
    except SecretManagerError:
        secrets = {}

    for secret_name, (env_vars, dev_default) in fallbacks.items():
        if secret_name in secrets:
            continue
        env_value = next((value for value in map(os.getenv, env_vars) if value), None)
        if env_value is None:
            logger.warning(
                f"JWT secret {secret_name} not available in secrets or environment; "
                "using insecure development default"
            )
            env_value = dev_default
        secrets[secret_name] = env_value

    return secrets
async def get_api_keys() -> Dict[str, str]:
    """Collect third-party API keys.

    Each key is read from Secret Manager first; any key not returned there is
    looked up in its matching environment variable. Keys found in neither
    place are logged and left out of the result.
    """
    # Secret name -> environment variable used as fallback.
    env_mapping = {
        "gemini-api-key": "GEMINI_API_KEY",
        "sendgrid-api-key": "SENDGRID_API_KEY",
        "elevenlabs-api-key": "ELEVENLABS_API_KEY",
        "sentry-dsn": "SENTRY_DSN"
    }

    api_keys = {}
    try:
        api_keys = await secrets_manager.get_secrets_batch(list(env_mapping))
    except SecretManagerError:
        logger.warning("Failed to retrieve some API keys from Secret Manager, using environment fallback")

    # Fallback to environment variables for missing keys
    for secret_name, env_var in env_mapping.items():
        if secret_name in api_keys:
            continue
        env_value = os.getenv(env_var)
        if not env_value:
            logger.warning(f"API key {secret_name} not available in secrets or environment")
            continue
        api_keys[secret_name] = env_value

    return api_keys

View file

@ -0,0 +1,110 @@
from google.cloud import translate_v2 as translate
from ..core.config import settings
from ..core.logging import get_logger
logger = get_logger(__name__)
class TranslateService:
    """Translates WebVTT cue text with Google Translate, keeping cue timings."""

    def __init__(self):
        # The client is only created when an API key is configured; otherwise
        # translate_vtt() raises ValueError.
        if settings.translate_api_key:
            self.client = translate.Client()
        else:
            logger.warning("Google Translate API key not configured")
            self.client = None

    async def translate_vtt(self, vtt_content: str, target_language: str) -> str:
        """
        Translate VTT content while preserving timing and structure.

        Args:
            vtt_content: Source WebVTT document (translated from English).
            target_language: Target language code passed to the client.

        Returns:
            A rebuilt WebVTT document with translated cue text, or the input
            unchanged when it contains no cues.

        Raises:
            ValueError: If the translate client is not configured.
            Exception: Any translation failure is logged and re-raised.
        """
        if not self.client:
            raise ValueError("Google Translate not configured")

        # Parse VTT to extract cues
        cues = self._parse_vtt_cues(vtt_content)

        # Extract text for translation
        texts_to_translate = [cue["text"] for cue in cues]
        if not texts_to_translate:
            return vtt_content

        try:
            # Translate all texts in batch
            results = self.client.translate(
                texts_to_translate,
                target_language=target_language,
                source_language="en"
            )

            # Rebuild VTT with translated text
            translated_cues = []
            for i, cue in enumerate(cues):
                translated_text = results[i]["translatedText"] if isinstance(results, list) else results["translatedText"]
                translated_cues.append({
                    "start": cue["start"],
                    "end": cue["end"],
                    "text": translated_text
                })

            return self._build_vtt(translated_cues)
        except Exception as e:
            logger.error(f"Translation failed: {e}")
            raise

    def _parse_vtt_cues(self, vtt_content: str) -> list[dict[str, str]]:
        """Parse VTT content and extract timing and text cues.

        A cue starts at a line containing `` --> `` and collects every
        following non-blank line (joined with single spaces) until a blank
        line or the next timing line. Lines outside a cue (the ``WEBVTT``
        header, ``NOTE`` comments, cue identifiers) are ignored.

        Returns:
            A list of ``{"start", "end", "text"}`` dicts; cues without text
            are dropped.
        """
        cues = []
        current_cue = None

        def flush():
            # Emit the cue being built, if it has any text.
            if current_cue and current_cue["text"]:
                cues.append(current_cue)

        for raw_line in vtt_content.strip().split('\n'):
            line = raw_line.strip()

            if not line:
                # A blank line terminates the current cue.
                flush()
                current_cue = None
                continue

            if " --> " in line:
                # A timing line starts a new cue (and closes any open one).
                flush()
                timing_parts = line.split(" --> ", 1)
                current_cue = {
                    "start": timing_parts[0].strip(),
                    "end": timing_parts[1].strip(),
                    "text": ""
                }
            elif current_cue is not None:
                # Cue payload; multi-line payloads are joined with spaces.
                if current_cue["text"]:
                    current_cue["text"] += " " + line
                else:
                    current_cue["text"] = line
            # Otherwise: header, NOTE block or cue identifier -> skip.

        # Add final cue if exists
        flush()
        return cues

    def _build_vtt(self, cues: list[dict[str, str]]) -> str:
        """Build VTT content from cues (header, then timing/text/blank per cue)."""
        vtt_lines = ["WEBVTT", ""]
        for cue in cues:
            vtt_lines.append(f"{cue['start']} --> {cue['end']}")
            vtt_lines.append(cue["text"])
            vtt_lines.append("")  # Empty line between cues
        return "\n".join(vtt_lines)
# Global service instance: constructed at import time, so the API-key check
# in TranslateService.__init__ runs when this module is first imported.
translate_service = TranslateService()

301
backend/app/services/tts.py Normal file
View file

@ -0,0 +1,301 @@
import io
from typing import Optional
import aiohttp
from google.cloud import texttospeech
from pydub import AudioSegment
from ..core.config import settings
from ..core.logging import get_logger
logger = get_logger(__name__)
class TTSService:
    """Renders audio-description WebVTT cues to a single MP3 track.

    Google Cloud Text-to-Speech is used when configured, with ElevenLabs as
    an alternative or fallback.
    """

    def __init__(self):
        # Initialize Google TTS
        if settings.google_tts_credentials:
            self.google_client = texttospeech.TextToSpeechClient()
        else:
            logger.warning("Google TTS credentials not configured")
            self.google_client = None

        # Check ElevenLabs availability
        self.elevenlabs_available = bool(settings.elevenlabs_api_key)

    async def synthesize_audio_description(
        self,
        ad_vtt_content: str,
        language_code: str = "en-US",
        voice_name: Optional[str] = None
    ) -> bytes:
        """
        Generate MP3 audio from audio description VTT content.

        Synthesizes each cue separately and stitches them together with timing.
        Uses Google TTS with ElevenLabs fallback.

        Raises:
            ValueError: If no TTS provider is configured or no cues are found.
        """
        # Try Google TTS first, fallback to ElevenLabs
        try:
            if self.google_client:
                return await self._synthesize_with_google(ad_vtt_content, language_code, voice_name)
            elif self.elevenlabs_available:
                return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
            else:
                raise ValueError("No TTS service configured")
        except Exception as e:
            # Only retry with ElevenLabs when the failed attempt was Google's.
            if self.elevenlabs_available and self.google_client:
                logger.warning(f"Google TTS failed, trying ElevenLabs: {e}")
                return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)
            raise

    async def _synthesize_with_google(
        self,
        ad_vtt_content: str,
        language_code: str = "en-US",
        voice_name: Optional[str] = None
    ) -> bytes:
        """Generate MP3 using Google TTS, anchoring each cue to its VTT start time."""
        # Parse VTT cues
        cues = self._parse_ad_cues(ad_vtt_content)
        if not cues:
            raise ValueError("No audio description cues found")

        async def synthesize(text: str) -> bytes:
            # Ensure proper punctuation for natural TTS flow
            if not text.endswith(('.', '!', '?')):
                text += "."
            return await self._synthesize_text_google(text, language_code, voice_name)

        return await self._stitch_cues(cues, synthesize)

    async def _synthesize_with_elevenlabs(
        self,
        ad_vtt_content: str,
        language_code: str = "en-US",
        voice_name: Optional[str] = None
    ) -> bytes:
        """Generate MP3 using ElevenLabs TTS, anchoring each cue to its VTT start time."""
        # Parse VTT cues
        cues = self._parse_ad_cues(ad_vtt_content)
        if not cues:
            raise ValueError("No audio description cues found")

        # Get voice ID for language
        voice_id = self._get_elevenlabs_voice(language_code, voice_name)

        async def synthesize(text: str) -> bytes:
            return await self._synthesize_text_elevenlabs(text, voice_id)

        return await self._stitch_cues(cues, synthesize)

    async def _stitch_cues(self, cues: list[dict], synthesize) -> bytes:
        """Synthesize every cue and join the clips into one MP3.

        Args:
            cues: Cues as produced by ``_parse_ad_cues``.
            synthesize: Async callable taking the cue text and returning MP3 bytes.

        Returns:
            MP3 bytes of the combined track (128k bitrate).
        """
        audio_segments = []
        current_audio_position = 0.0  # Track actual audio timeline position (seconds)

        for cue in cues:
            # Calculate where this cue should start (anchored to VTT timing)
            target_start_time = cue["start_time"]

            # Add silence to reach the exact VTT start time. If the previous
            # clip overran this start time, no silence is added and the cue
            # simply follows it.
            if target_start_time > current_audio_position:
                silence_duration = target_start_time - current_audio_position
                audio_segments.append(AudioSegment.silent(duration=int(silence_duration * 1000)))
                current_audio_position = target_start_time

            text = cue["text"].strip()
            if text:
                audio_data = await synthesize(text)

                # Convert to AudioSegment and get actual duration
                audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
                audio_segments.append(audio_segment)

                # Update current position based on actual audio duration (not VTT end time)
                current_audio_position += len(audio_segment) / 1000.0  # Convert ms to seconds

        # Combine all segments
        if audio_segments:
            final_audio = sum(audio_segments, AudioSegment.empty())
        else:
            # Fallback to one second of silence if no segments
            final_audio = AudioSegment.silent(duration=1000)

        # Export to MP3
        output_buffer = io.BytesIO()
        final_audio.export(output_buffer, format="mp3", bitrate="128k")
        return output_buffer.getvalue()

    async def _synthesize_text_google(
        self,
        text: str,
        language_code: str,
        voice_name: Optional[str] = None
    ) -> bytes:
        """Synthesize a single text string to MP3 bytes using Google TTS"""
        # Configure voice
        if not voice_name:
            voice_name = settings.google_tts_voices.get(language_code, "en-US-Neural2-D")

        voice = texttospeech.VoiceSelectionParams(
            language_code=language_code,
            name=voice_name
        )

        # Configure audio
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            speaking_rate=1.2,  # Faster cadence for better flow
            pitch=0.0
        )

        # Synthesize
        synthesis_input = texttospeech.SynthesisInput(text=text)
        response = self.google_client.synthesize_speech(
            input=synthesis_input,
            voice=voice,
            audio_config=audio_config
        )
        return response.audio_content

    async def _synthesize_text_elevenlabs(self, text: str, voice_id: str) -> bytes:
        """Synthesize text using the ElevenLabs HTTP API.

        Raises:
            ValueError: If the API responds with a non-200 status.
        """
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": settings.elevenlabs_api_key
        }
        data = {
            "text": text,
            "model_id": "eleven_multilingual_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.5,
                "style": 0.0,
                "use_speaker_boost": True
            }
        }

        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=data, headers=headers) as response:
                if response.status == 200:
                    return await response.read()
                else:
                    error_text = await response.text()
                    raise ValueError(f"ElevenLabs TTS failed: {response.status} - {error_text}")

    def _get_elevenlabs_voice(self, language_code: str, voice_name: Optional[str] = None) -> str:
        """Get ElevenLabs voice ID for language (an explicit voice_name wins)."""
        if voice_name:
            return voice_name
        return settings.elevenlabs_voices.get(language_code, "21m00Tcm4TlvDq8ikWAM")

    def _parse_ad_cues(self, vtt_content: str) -> list[dict]:
        """Parse audio description VTT and extract timing + text.

        Returns:
            A list of ``{"start_time", "end_time", "text"}`` dicts, with times
            in seconds and multi-line cue text joined by single spaces. Cues
            without text are dropped.

        Raises:
            ValueError: If a cue timestamp cannot be parsed.
        """
        lines = vtt_content.strip().split('\n')
        cues = []

        i = 0
        while i < len(lines):
            line = lines[i].strip()

            # Skip header and empty lines
            if line == "WEBVTT" or line == "" or line.startswith("NOTE"):
                i += 1
                continue

            # Check for timing line
            if " --> " in line:
                start_part, end_part = line.split(" --> ", 1)
                start_time = self._parse_timestamp(start_part.strip())
                # Cue settings (e.g. "align:start") may follow the end
                # timestamp; only the first token is the timestamp itself.
                end_time = self._parse_timestamp(end_part.split()[0])

                # Get text from next line(s)
                i += 1
                text_lines = []
                while i < len(lines) and lines[i].strip() != "":
                    text_lines.append(lines[i].strip())
                    i += 1

                if text_lines:
                    cues.append({
                        "start_time": start_time,
                        "end_time": end_time,
                        "text": " ".join(text_lines)
                    })
            else:
                # Cue identifier or other non-timing line
                i += 1

        return cues

    def _parse_timestamp(self, timestamp: str) -> float:
        """Convert a VTT timestamp to seconds.

        Accepts ``HH:MM:SS.mmm`` or ``MM:SS.mmm``. The fraction may have any
        number of digits, and a comma is accepted as the decimal separator.

        Raises:
            ValueError: If the timestamp does not have two or three
                colon-separated parts, or a part is not numeric.
        """
        parts = timestamp.strip().replace(",", ".").split(":")
        if len(parts) == 3:  # HH:MM:SS.mmm
            hours, minutes, seconds = parts
        elif len(parts) == 2:  # MM:SS.mmm
            hours, minutes, seconds = "0", parts[0], parts[1]
        else:
            raise ValueError(f"Invalid timestamp format: {timestamp}")

        # Split whole seconds from the fraction and scale the fraction by its
        # own digit count, so ".5" means 500 ms rather than 5 ms.
        whole_seconds, _, fraction = seconds.partition(".")
        fractional_seconds = int(fraction) / (10 ** len(fraction)) if fraction else 0.0

        total_seconds = (
            int(hours) * 3600 +
            int(minutes) * 60 +
            int(whole_seconds) +
            fractional_seconds
        )
        return total_seconds
# Global service instance: constructed at import time, which is when
# TTSService.__init__ checks the Google TTS and ElevenLabs settings.
tts_service = TTSService()

View file

@ -0,0 +1,130 @@
from typing import Dict, List, Any
from ..core.logging import get_logger
from ..lib.vtt import VTTEditor
from ..services.gcs import gcs_service
logger = get_logger(__name__)
class AssetValidationService:
"""Service for validating job assets before completion"""
@staticmethod
async def validate_job_assets(job_doc: Dict[str, Any]) -> tuple[bool, List[str]]:
"""
Validate all assets for a job before allowing completion
Returns (is_valid, list_of_errors)
"""
errors = []
outputs = job_doc.get("outputs", {})
requested_outputs = job_doc.get("requested_outputs", {})
if not outputs:
errors.append("No outputs generated for this job")
return False, errors
# Validate each language
for language in requested_outputs.get("languages", ["en"]):
lang_output = outputs.get(language)
if not lang_output:
errors.append(f"Missing outputs for language: {language}")
continue
# Validate captions VTT if requested
if requested_outputs.get("captions_vtt"):
captions_error = await AssetValidationService._validate_vtt_asset(
lang_output.get("captions_vtt_gcs"),
f"{language} captions VTT"
)
if captions_error:
errors.append(captions_error)
# Validate audio description VTT if requested
if requested_outputs.get("audio_description_vtt"):
ad_vtt_error = await AssetValidationService._validate_vtt_asset(
lang_output.get("ad_vtt_gcs"),
f"{language} audio description VTT"
)
if ad_vtt_error:
errors.append(ad_vtt_error)
# Validate MP3 if requested
if requested_outputs.get("audio_description_mp3"):
mp3_error = await AssetValidationService._validate_mp3_asset(
lang_output.get("ad_mp3_gcs"),
f"{language} audio description MP3"
)
if mp3_error:
errors.append(mp3_error)
# Check minimum quality requirements
ai_confidence = job_doc.get("ai", {}).get("confidence", 0)
if ai_confidence < 0.7:
errors.append(f"AI confidence too low: {ai_confidence:.1%} (minimum: 70%)")
return len(errors) == 0, errors
@staticmethod
async def _validate_vtt_asset(gcs_uri: str, asset_name: str) -> str | None:
"""Validate a VTT asset exists and is properly formatted"""
if not gcs_uri:
return f"Missing {asset_name}"
try:
# Download and validate VTT content
blob_path = gcs_uri.replace(f"gs://{gcs_service.bucket.name}/", "")
blob = gcs_service.bucket.blob(blob_path)
if not blob.exists():
return f"{asset_name} file not found in storage"
vtt_content = blob.download_as_text()
is_valid, vtt_errors = VTTEditor.validate_vtt(vtt_content)
if not is_valid:
return f"{asset_name} validation failed: {'; '.join(vtt_errors[:3])}"
# Check minimum content requirements
cue_count = VTTEditor.get_cue_count(vtt_content)
if cue_count == 0:
return f"{asset_name} contains no cues"
except Exception as e:
logger.error(f"Failed to validate {asset_name}: {e}")
return f"{asset_name} validation error: {str(e)}"
return None
@staticmethod
async def _validate_mp3_asset(gcs_uri: str, asset_name: str) -> str | None:
"""Validate an MP3 asset exists and has reasonable properties"""
if not gcs_uri:
return f"Missing {asset_name}"
try:
blob_path = gcs_uri.replace(f"gs://{gcs_service.bucket.name}/", "")
blob = gcs_service.bucket.blob(blob_path)
if not blob.exists():
return f"{asset_name} file not found in storage"
# Reload blob to get metadata (including size)
blob.reload()
# Check file size (should be reasonable for audio)
size_mb = blob.size / (1024 * 1024) if blob.size else 0
if size_mb < 0.01: # Less than 10KB
return f"{asset_name} file too small (likely empty)"
elif size_mb > 500: # More than 500MB
return f"{asset_name} file too large ({size_mb:.1f}MB)"
except Exception as e:
logger.error(f"Failed to validate {asset_name}: {e}")
return f"{asset_name} validation error: {str(e)}"
return None
# Global service instance. All methods of AssetValidationService are static,
# so this instance carries no state of its own.
asset_validation_service = AssetValidationService()

View file

@ -0,0 +1,158 @@
from celery import Celery
from celery.signals import task_failure, task_success, task_retry
from ..core.config import settings
from ..core.logging import get_logger
logger = get_logger(__name__)
# Celery application for background tasks. The same Redis URL
# (settings.redis_url) is used as both message broker and result backend.
celery_app = Celery(
    "accessible-video-tasks",
    broker=settings.redis_url,
    backend=settings.redis_url,
)
celery_app.conf.update(
    # Task messages and results are exchanged as JSON only.
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="UTC",
    enable_utc=True,
    task_track_started=True,
    task_time_limit=30 * 60, # 30 minutes default
    task_soft_time_limit=25 * 60, # 25 minutes default
    worker_prefetch_multiplier=1,
    task_acks_late=True,
    worker_max_tasks_per_child=1000,
    # Route tasks to queues by module path; anything unmatched goes to
    # task_default_queue below.
    task_routes={
        "app.tasks.ingest_and_ai.*": {"queue": "ingest"},
        "app.tasks.translate_and_synthesize.*": {"queue": "default"},
        "app.tasks.notify.*": {"queue": "notify"},
        "app.tasks.watchers.*": {"queue": "default"},
    },
    task_default_queue="default",
    task_create_missing_queues=True,
    # Task-specific timeout overrides
    task_annotations={
        # Both limits are set to None for the change-stream watcher task,
        # overriding the 30/25 minute defaults above.
        'app.tasks.watchers.start_change_stream_watcher': {
            'time_limit': None,
            'soft_time_limit': None,
        },
        'app.tasks.watchers.ensure_watcher_running': {
            'time_limit': 300, # 5 minutes
            'soft_time_limit': 240, # 4 minutes
        },
    },
)
# Add a simple test task for debugging
@celery_app.task
def test_task(message="test"):
    """Connectivity probe: logs and prints the message, then echoes it back."""
    banner = f"🧪 TEST TASK EXECUTED: {message}"
    logger.info(banner)
    print(banner)
    return f"Test task completed: {message}"
# Add task received handler for debugging
from celery.signals import task_received, task_prerun, worker_ready
import threading
import time
@worker_ready.connect
def worker_ready_handler(sender=None, **kwargs):
    """Announce that the worker is online and schedule the change-stream watcher.

    The watcher is only scheduled when the watchers module imported
    successfully and its ``ensure_watcher_running`` task is registered.
    """
    logger.info(f"🟢 WORKER READY: {sender}")
    print(f"🟢 WORKER READY: {sender} - Worker is online and listening!")

    # Start MongoDB change stream watcher
    # Note: The main job progression is handled by immediate triggering in approve_english endpoint
    # This watcher provides redundancy for status change detection
    watcher_registered = (
        _watchers_available
        and 'app.tasks.watchers.ensure_watcher_running' in celery_app.tasks
    )
    if not watcher_registered:
        logger.info("Watcher not available or not registered, using primary job progression via approve_english endpoint")
        return

    try:
        from .watchers import ensure_watcher_running
        ensure_watcher_running.apply_async(countdown=3)  # Start after 3 seconds
        logger.info("Scheduled MongoDB change stream watcher to start")
    except Exception as e:
        logger.error(f"Failed to schedule change stream watcher: {e}")
@task_received.connect
def task_received_handler(sender=None, task_id=None, task=None, args=None, kwargs=None, retries=None, eta=None, **kwds):
    """Log when a task is received by the worker.

    Celery's ``task_received`` signal delivers the task details in a
    ``request`` keyword argument. When it is present, the task name, id and
    args are read from it so the log line is not just "None [None]".
    """
    request = kwds.get("request")
    if request is not None:
        # Explicitly passed values win; otherwise fall back to the request.
        if task is None:
            task = getattr(request, "name", None) or getattr(request, "task_name", None)
        if task_id is None:
            task_id = getattr(request, "id", None) or getattr(request, "task_id", None)
        if args is None:
            args = getattr(request, "args", None)

    logger.info(f"🎯 TASK RECEIVED: {task} [{task_id}] with args: {args}")
    print(f"🎯 TASK RECEIVED: {task} [{task_id}] - Worker is picking up the task!")
@task_prerun.connect
def task_prerun_handler(sender=None, task_id=None, task=None, args=None, kwargs=None, **kwds):
    """Emit a log line and a stdout line just before a task runs."""
    task_label = f"{task} [{task_id}]"
    logger.info(f"🚀 TASK STARTING: {task_label}")
    print(f"🚀 TASK STARTING: {task_label} - About to execute!")
# Celery signal handlers for centralized logging
@task_failure.connect
def task_failure_handler(sender=None, task_id=None, exception=None, traceback=None, einfo=None, **kwargs):
    """Report a failed task: a detailed error log entry plus a stdout summary."""
    if exception:
        exception_type = exception.__class__.__name__
        exception_msg = str(exception)
    else:
        exception_type = "Unknown"
        exception_msg = "No details"

    # Log comprehensive error details
    logger.error(f"""
=== CELERY TASK FAILURE ===
Task: {sender}
Task ID: {task_id}
Exception Type: {exception_type}
Exception Message: {exception_msg}
Full Traceback:
{traceback}
Additional Info: {einfo}
=============================
""")

    # Also log to stdout for immediate visibility
    print(f"🚨 TASK FAILURE: {sender} [{task_id}] - {exception_type}: {exception_msg}")
    if traceback:
        print(f"Full traceback:\n{traceback}")
@task_success.connect
def task_success_handler(sender=None, result=None, **kwargs):
    """Log task success together with a short preview of the result.

    Only a missing result (``None``) is reported as "No result"; legitimate
    falsy return values such as ``0``, ``False`` or ``{}`` are logged as-is.

    Args:
        sender: The task that completed.
        result: Value returned by the task.
        **kwargs: Any further signal keywords (ignored).
    """
    # Truncate to keep log lines bounded for large results.
    result_str = str(result)[:100] if result is not None else "No result"
    logger.info(f"Celery task completed: {sender} - Result: {result_str}")
@task_retry.connect
def task_retry_handler(sender=None, task_id=None, reason=None, einfo=None, request=None, **kwargs):
    """Log task retries.

    Celery's ``task_retry`` signal supplies the task's ``request`` rather than
    a ``task_id`` keyword, so the id is read from ``request`` when it was not
    passed explicitly.

    Args:
        sender: The task being retried.
        task_id: Explicit task id, if the caller provides one.
        reason: Why the task is being retried (often the exception).
        einfo: Exception-info object for the failure (unused).
        request: Request object delivered by the signal, if any.
        **kwargs: Any further signal keywords (ignored).
    """
    if task_id is None and request is not None:
        task_id = getattr(request, "id", None)
    reason_str = str(reason) if reason else "No reason provided"
    logger.warning(f"Celery task retry: {sender} [{task_id}] - Reason: {reason_str}")
def import_task_modules():
    """Import the task modules so their tasks are registered with Celery.

    A failure while importing the core modules is logged and does not stop
    the watchers import from being attempted.

    Returns:
        bool: True when the ``watchers`` module imported cleanly, False when
        its import raised (the error is logged, not propagated).
    """
    try:
        from . import ingest_and_ai  # noqa: E402, F401
        from . import translate_and_synthesize  # noqa: E402, F401
        from . import notify  # noqa: E402, F401
    except Exception as core_error:
        logger.error(f"Error importing core task modules: {core_error}")
    else:
        logger.info("Successfully imported core task modules")

    # The watchers module is optional: report its availability instead of failing.
    watchers_loaded = False
    try:
        from . import watchers  # noqa: E402, F401
    except ImportError as missing:
        logger.warning(f"Could not import watchers module: {missing}")
    except Exception as unexpected:
        logger.error(f"Error importing watchers module: {unexpected}")
    else:
        logger.info("Successfully imported watchers module")
        watchers_loaded = True
    return watchers_loaded
# Import task modules at startup
_watchers_available = import_task_modules()

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,213 @@
import asyncio
import os
import tempfile
from datetime import datetime
import ffmpeg
from celery import Task
from motor.motor_asyncio import AsyncIOMotorClient
from ..core.config import settings
from ..core.logging import get_logger
from ..models.job import JobStatus
from ..services.gcs import gcs_service, upload_vtt_to_gcs
from ..services.gemini import gemini_service
from . import celery_app
logger = get_logger(__name__)
class AsyncTask(Task):
    """Celery task base that drives a coroutine to completion on each call."""

    def __call__(self, *args, **kwargs):
        """Run ``run_async`` on a fresh event loop and return its result.

        The loop is created per call, installed as the current loop, and
        closed afterwards whether or not the coroutine raised.
        """
        event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(event_loop)
        try:
            pending = self.run_async(*args, **kwargs)
            return event_loop.run_until_complete(pending)
        finally:
            event_loop.close()

    async def run_async(self, *args, **kwargs):
        """Coroutine executed by ``__call__``; subclasses must override it."""
        raise NotImplementedError
class IngestAndAITask(AsyncTask):
    """Async task base whose coroutine is the ingestion pipeline implementation."""

    async def run_async(self, job_id: str):
        """Await ``ingest_and_ai_task_impl`` for ``job_id`` and return its result."""
        pipeline_result = await ingest_and_ai_task_impl(job_id)
        return pipeline_result
@celery_app.task(bind=True, base=IngestAndAITask)
def ingest_and_ai_task(self, job_id: str):
    """
    Pipeline 1: Ingestion & AI Processing

    Registration stub for the Celery task. The body is intentionally empty:
    AsyncTask.__call__ runs ``run_async``, which delegates to the async
    implementation, so this function only provides the task's name and
    signature.
    """
    return None
async def ingest_and_ai_task_impl(job_id: str):
    """
    Pipeline 1: Ingestion & AI Processing

    1. Update status to 'ingesting'
    2. Download the source video to a temporary file
    3. Update status to 'ai_processing' and probe the video duration
    4. Process with Gemini
    5. Upload the generated VTT files
    6. Update status to 'pending_qc'

    Args:
        job_id: ``_id`` of the job document to process.

    Raises:
        ValueError: If the job does not exist or the AI result has no captions.
        Exception: Any other failure is recorded on the job document under
            ``error`` and then re-raised.
    """
    logger.info(f"Starting ingestion and AI processing for job {job_id}")

    def status_update(status, extra_fields=None):
        """Build the update that sets ``status`` and appends it to review history."""
        now = datetime.utcnow()
        fields = {"status": status.value}
        if extra_fields:
            fields.update(extra_fields)
        fields["updated_at"] = now
        return {
            "$set": fields,
            "$push": {
                "review.history": {
                    "at": now,
                    "status": status.value,
                    "by": "system"
                }
            }
        }

    # Connect to MongoDB
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]
    try:
        # Update status to ingesting
        await db.jobs.update_one({"_id": job_id}, status_update(JobStatus.INGESTING))

        # Get job details
        job_doc = await db.jobs.find_one({"_id": job_id})
        if not job_doc:
            raise ValueError(f"Job {job_id} not found")

        # Download video file temporarily for processing
        source_blob_path = job_doc["source"]["gcs_uri"].replace(f"gs://{settings.gcs_bucket}/", "")
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
            temp_path = temp_file.name

        # Everything that touches the temp file runs inside this try so the
        # file is removed even when the download itself fails.
        try:
            # Download from GCS
            blob = gcs_service.bucket.blob(source_blob_path)
            blob.download_to_filename(temp_path)

            # Update status to AI processing
            await db.jobs.update_one({"_id": job_id}, status_update(JobStatus.AI_PROCESSING))

            # Probe video for metadata
            duration = await _get_video_duration(temp_path)

            # Update source with duration
            await db.jobs.update_one(
                {"_id": job_id},
                {"$set": {"source.duration_s": duration}}
            )

            # Process with Gemini
            ai_result = await gemini_service.extract_accessibility(temp_path)

            # Final safety check for required fields
            required_fields = ["captions_vtt", "audio_description_vtt"]
            missing_fields = [field for field in required_fields if field not in ai_result]
            if missing_fields:
                logger.error(f"Missing required fields after AI processing: {missing_fields}")
                # Create fallback content for missing fields
                if "audio_description_vtt" in missing_fields:
                    ai_result["audio_description_vtt"] = "WEBVTT\n\n00:00:00.000 --> 00:00:05.000\nVideo content with visual elements."
                    logger.info("Created fallback audio_description_vtt")
                # Captions have no fallback; fail with a readable message
                # instead of a bare KeyError further down.
                if "captions_vtt" in missing_fields:
                    raise ValueError("AI result is missing required field 'captions_vtt'")

            # Upload VTT files to GCS
            captions_gcs_uri = await upload_vtt_to_gcs(
                ai_result["captions_vtt"],
                f"{job_id}/en/captions.vtt"
            )
            ad_gcs_uri = await upload_vtt_to_gcs(
                ai_result["audio_description_vtt"],
                f"{job_id}/en/ad.vtt"
            )

            # Update job with AI results and outputs
            await db.jobs.update_one(
                {"_id": job_id},
                status_update(
                    JobStatus.PENDING_QC,
                    {
                        "ai.ingestion_json": ai_result,
                        "ai.confidence": ai_result["confidence"],
                        "outputs.en": {
                            "captions_vtt_gcs": captions_gcs_uri,
                            "ad_vtt_gcs": ad_gcs_uri
                        }
                    }
                )
            )
            logger.info(f"Successfully completed ingestion and AI processing for job {job_id}")
        finally:
            # Clean up temp file; a cleanup problem must not hide the real error.
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temp file {temp_path}: {cleanup_error}")
    except Exception as e:
        logger.error(f"Ingestion and AI processing failed for job {job_id}: {e}")
        # Update job with error
        await db.jobs.update_one(
            {"_id": job_id},
            {
                "$set": {
                    "error": {
                        "type": "ingestion_failure",
                        "message": str(e),
                        "timestamp": datetime.utcnow().isoformat()
                    },
                    "updated_at": datetime.utcnow()
                }
            }
        )
        raise
    finally:
        client.close()
async def _get_video_duration(video_path: str) -> float:
    """Get video duration in seconds using ffprobe.

    The first stream that reports a usable ``duration`` wins, checked in
    stream order; if no stream has one, the container-level
    ``format.duration`` is used. This function never raises.

    Args:
        video_path: Path of the local video file to probe.

    Returns:
        The duration as a float, or 0.0 when probing fails or no usable
        duration is present.
    """
    try:
        probe = ffmpeg.probe(video_path)
        # Streams first (preserves the previous result when streams[0] has a
        # duration), then the container section as a fallback.
        sections = list(probe.get("streams") or [])
        sections.append(probe.get("format") or {})
        for section in sections:
            raw_duration = section.get("duration")
            if raw_duration is None:
                continue
            try:
                return float(raw_duration)
            except (TypeError, ValueError):
                # Not numeric (e.g. a placeholder string); try the next section.
                continue
        logger.warning("Could not determine video duration: no duration reported by ffprobe")
        return 0.0
    except Exception as e:
        logger.warning(f"Could not determine video duration: {e}")
        return 0.0

142
backend/app/tasks/notify.py Normal file
View file

@ -0,0 +1,142 @@
import asyncio
from datetime import datetime
from celery import Task
from motor.motor_asyncio import AsyncIOMotorClient
from ..core.config import settings
from ..core.logging import get_logger
from ..models.audit_log import AuditLogCreate
from ..services.emailer import email_service
from ..services.gcs import get_signed_download_url
from . import celery_app
logger = get_logger(__name__)
class AsyncTask(Task):
    """Base task class that supports async execution.

    Works for both usage styles:

    * a subclass overrides ``run_async``; or
    * an ``async def`` function is registered with ``base=AsyncTask``. In that
      case the function is expected to be available as ``self.run``
      (NOTE(review): relies on Celery installing the decorated function as
      ``run`` — confirm against the Celery version in use), and the default
      ``run_async`` awaits it. Without this delegation every call ended in
      ``NotImplementedError``.
    """

    def __call__(self, *args, **kwargs):
        """Run ``run_async`` on a fresh event loop and return its result."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(self.run_async(*args, **kwargs))
        finally:
            loop.close()

    async def run_async(self, *args, **kwargs):
        """Await the task's coroutine ``run``; subclasses may override.

        Raises:
            NotImplementedError: If ``run`` is not a coroutine function and
                the subclass did not override this method.
        """
        import inspect

        run = self.run
        if inspect.iscoroutinefunction(run):
            return await run(*args, **kwargs)
        raise NotImplementedError
@celery_app.task(bind=True, base=AsyncTask)
async def notify_client_task(self, job_id: str):
    """
    Pipeline 3: Client Notification

    Emails the job's client signed download links for every generated output
    and writes an audit-log entry once the email has been sent. Jobs whose
    status is not 'completed' are skipped.

    Raises:
        ValueError: If the job or its client cannot be found, or the email
            service reports failure.
        Exception: Any failure is recorded on the job document under
            ``error`` and then re-raised.
    """
    logger.info(f"Starting client notification for job {job_id}")

    # (key in the job's per-language output, key in the email links, label used in warnings)
    artifact_specs = (
        ("captions_vtt_gcs", "captions_vtt", "captions"),
        ("ad_vtt_gcs", "audio_description_vtt", "AD VTT"),
        ("ad_mp3_gcs", "audio_description_mp3", "AD MP3"),
    )
    uri_prefix = f"gs://{settings.gcs_bucket}/"

    # Connect to MongoDB
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]
    try:
        # Get job and client details
        job_doc = await db.jobs.find_one({"_id": job_id})
        if not job_doc:
            raise ValueError(f"Job {job_id} not found")
        if job_doc["status"] != "completed":
            logger.warning(f"Job {job_id} not in completed status, skipping notification")
            return

        # Get client email
        client_doc = await db.users.find_one({"_id": job_doc["client_id"]})
        if not client_doc:
            raise ValueError(f"Client {job_doc['client_id']} not found")

        # Generate signed URLs for all outputs
        download_links = {}
        for language, lang_output in job_doc.get("outputs", {}).items():
            if not isinstance(lang_output, dict):
                continue
            lang_downloads = {}
            for gcs_key, link_name, label in artifact_specs:
                if gcs_key not in lang_output:
                    continue
                blob_path = lang_output[gcs_key].replace(uri_prefix, "")
                try:
                    # Links stay valid for 24 hours.
                    lang_downloads[link_name] = await get_signed_download_url(blob_path, 24)
                except Exception as e:
                    # A single unsignable file should not block the whole email.
                    logger.warning(f"Failed to generate signed URL for {label} {language}: {e}")
            if lang_downloads:
                download_links[language] = lang_downloads

        # Send completion email
        email_sent = await email_service.send_completion_email(
            recipient_email=client_doc["email"],
            job_title=job_doc["title"],
            download_links=download_links
        )
        if not email_sent:
            raise ValueError("Failed to send completion email")

        # Log audit entry
        file_count = sum(len(files) for files in download_links.values())
        audit_log = AuditLogCreate(
            job_id=job_id,
            action="client_notified",
            details={
                "email": client_doc["email"],
                "download_count": file_count
            }
        )
        await db.audit_logs.insert_one(audit_log.dict())
        logger.info(f"Successfully notified client for job {job_id}")
    except Exception as e:
        logger.error(f"Client notification failed for job {job_id}: {e}")
        # Update job with error
        failure_record = {
            "type": "notification_failure",
            "message": str(e),
            "timestamp": datetime.utcnow().isoformat()
        }
        await db.jobs.update_one(
            {"_id": job_id},
            {"$set": {"error": failure_record, "updated_at": datetime.utcnow()}}
        )
        raise
    finally:
        client.close()

View file

@ -0,0 +1,317 @@
import asyncio
from datetime import datetime
from typing import Any
import time
import random
from celery import Task
from motor.motor_asyncio import AsyncIOMotorClient
from ..core.config import settings
from ..core.logging import get_logger
from ..models.job import JobStatus
from ..services.gcs import gcs_service, upload_vtt_to_gcs
from ..services.gemini import gemini_service
from ..services.translate import translate_service
from ..services.tts import tts_service
from . import celery_app
logger = get_logger(__name__)
async def retry_with_backoff(func, max_retries=3, base_delay=1):
    """Retry an async callable with exponential backoff and jitter.

    Args:
        func: Zero-argument callable returning an awaitable.
        max_retries: Total number of attempts to make (must be >= 1).
        base_delay: Base delay in seconds; attempt ``n`` (0-based) waits
            ``base_delay * 2**n`` plus up to one second of random jitter.

    Returns:
        Whatever ``func`` returns on its first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1 (previously this ended
            in ``raise None``, i.e. a confusing TypeError, without ever
            calling ``func``).
        Exception: The exception from the final attempt when all attempts fail.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            return await func()
        except Exception as e:
            if attempt == max_retries - 1:
                # Out of attempts: surface the last failure unchanged.
                raise
            # Exponential backoff with jitter
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            logger.warning(f"Attempt {attempt + 1} failed, retrying in {delay:.2f}s: {e}")
            await asyncio.sleep(delay)
@celery_app.task(bind=True)
def translate_and_synthesize_task(self, job_id: str):
    """
    Pipeline 2: Translation & MP3 Generation

    Synchronous Celery entry point: runs the async implementation with
    ``asyncio.run`` and returns its result. Failures are logged in detail
    (message, type, args, traceback) and re-raised.
    """
    import traceback

    logger.info(f"🚀 CELERY TASK STARTED: translate_and_synthesize_task for job {job_id}")
    try:
        logger.info(f"📝 About to call asyncio.run for job {job_id}")
        outcome = asyncio.run(_async_translate_and_synthesize(job_id))
    except Exception as exc:
        logger.error(f"❌ CELERY TASK FAILED for job {job_id}: {str(exc)}")
        logger.error(f"❌ Exception type: {type(exc).__name__}")
        logger.error(f"❌ Exception args: {exc.args}")
        logger.error(f"❌ Full traceback: {traceback.format_exc()}")
        raise
    logger.info(f"✅ CELERY TASK COMPLETED successfully for job {job_id}")
    return outcome
async def _async_translate_and_synthesize(job_id: str):
    """Async implementation of translation and synthesis.

    Steps:
        1. Load the job and skip unless it is in 'approved_english'.
        2. Atomically claim the job by moving it to 'translating'. The update
           is conditional on the status still being 'approved_english', so
           when the same task is enqueued more than once only one run does
           the work.
        3. Translate or transcreate the English VTT files for each requested
           language and upload the results. A per-language failure is
           recorded in that language's ``qa_notes`` and does not abort the job.
        4. Move to 'tts_generating' and, if requested, synthesize MP3s.
        5. Move to 'pending_final_review'.

    Args:
        job_id: ``_id`` of the job document to process.

    Raises:
        ValueError: If the job does not exist.
        Exception: Any other failure is recorded on the job document under
            ``error`` and then re-raised.
    """
    logger.info(f"🔄 ASYNC FUNCTION STARTED: _async_translate_and_synthesize for job {job_id}")

    def status_update(status, extra_fields=None):
        """Build the update that sets ``status`` and appends it to review history."""
        now = datetime.utcnow()
        fields = {"status": status.value}
        if extra_fields:
            fields.update(extra_fields)
        fields["updated_at"] = now
        return {
            "$set": fields,
            "$push": {
                "review.history": {
                    "at": now,
                    "status": status.value,
                    "by": "system"
                }
            }
        }

    # Connect to MongoDB
    logger.info(f"📡 Connecting to MongoDB for job {job_id}")
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]
    logger.info(f"📡 MongoDB connection established for job {job_id}")
    try:
        # Get job details
        logger.info(f"🔍 Looking up job document for job {job_id}")
        job_doc = await db.jobs.find_one({"_id": job_id})
        if not job_doc:
            logger.error(f"❌ Job {job_id} not found in database!")
            raise ValueError(f"Job {job_id} not found")
        logger.info(f"✅ Found job document for {job_id}, status: {job_doc.get('status', 'UNKNOWN')}")
        if job_doc["status"] != JobStatus.APPROVED_ENGLISH.value:
            logger.warning(f"⚠️ Job {job_id} not in approved_english status (current: {job_doc['status']}), skipping")
            return

        # Update status to translating. Filtering on the current status makes
        # this an atomic claim: a concurrent duplicate run matches nothing.
        claim = await db.jobs.update_one(
            {"_id": job_id, "status": JobStatus.APPROVED_ENGLISH.value},
            status_update(JobStatus.TRANSLATING)
        )
        if claim.modified_count == 0:
            logger.warning(f"⚠️ Job {job_id} was already picked up by another run, skipping")
            return
        logger.info(f"✅ Job {job_id} is in correct status, proceeding with translation")

        # Get English VTT content
        en_outputs = job_doc["outputs"]["en"]

        # Download English VTT files
        captions_blob_path = en_outputs["captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
        ad_blob_path = en_outputs["ad_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
        captions_blob = gcs_service.bucket.blob(captions_blob_path)
        ad_blob = gcs_service.bucket.blob(ad_blob_path)
        en_captions_vtt = captions_blob.download_as_text()
        en_ad_vtt = ad_blob.download_as_text()

        # Process each requested language
        requested_languages = job_doc["requested_outputs"]["languages"]
        transcreation_languages = job_doc["requested_outputs"]["transcreation"]
        updated_outputs = job_doc.get("outputs", {})
        for language in requested_languages:
            if language == "en":
                continue  # Skip English as it's already processed
            logger.info(f"Processing language: {language}")
            try:
                if language in transcreation_languages:
                    # Use transcreation for cultural adaptation with retry
                    async def transcreate():
                        return await gemini_service.transcreate_content(
                            en_captions_vtt,
                            en_ad_vtt,
                            language,
                            brief="Standard accessibility content"
                        )
                    result = await retry_with_backoff(transcreate, max_retries=3)
                    translated_captions = result["captions_vtt"]
                    translated_ad = result["audio_description_vtt"]
                    origin = "transcreate"
                else:
                    # Use standard translation with retry
                    async def translate_captions():
                        return await translate_service.translate_vtt(en_captions_vtt, language)

                    async def translate_ad():
                        return await translate_service.translate_vtt(en_ad_vtt, language)

                    translated_captions = await retry_with_backoff(translate_captions, max_retries=3)
                    translated_ad = await retry_with_backoff(translate_ad, max_retries=3)
                    origin = "translate"

                # Upload translated VTT files
                captions_gcs_uri = await upload_vtt_to_gcs(
                    translated_captions,
                    f"{job_id}/{language}/captions.vtt"
                )
                ad_gcs_uri = await upload_vtt_to_gcs(
                    translated_ad,
                    f"{job_id}/{language}/ad.vtt"
                )

                # Store language outputs
                updated_outputs[language] = {
                    "captions_vtt_gcs": captions_gcs_uri,
                    "ad_vtt_gcs": ad_gcs_uri,
                    "origin": origin
                }
                logger.info(f"Successfully processed VTT files for language: {language}")
            except Exception as e:
                # Best effort per language: note the failure and keep going.
                logger.error(f"Failed to process language {language}: {e}")
                updated_outputs[language] = {
                    "origin": "translate" if language not in transcreation_languages else "transcreate",
                    "qa_notes": f"Translation failed: {str(e)}"
                }

        # Update status to TTS generating
        await db.jobs.update_one(
            {"_id": job_id},
            status_update(JobStatus.TTS_GENERATING, {"outputs": updated_outputs})
        )

        # Generate TTS for languages that need MP3
        if job_doc["requested_outputs"]["audio_description_mp3"]:
            await _generate_tts_for_languages(job_id, updated_outputs, db)

        # Update final status
        await db.jobs.update_one(
            {"_id": job_id},
            status_update(JobStatus.PENDING_FINAL_REVIEW)
        )
        logger.info(f"Successfully completed translation and synthesis for job {job_id}")
    except Exception as e:
        logger.error(f"Translation and synthesis failed for job {job_id}: {e}")
        # Update job with error
        await db.jobs.update_one(
            {"_id": job_id},
            {
                "$set": {
                    "error": {
                        "type": "translation_failure",
                        "message": str(e),
                        "timestamp": datetime.utcnow().isoformat()
                    },
                    "updated_at": datetime.utcnow()
                }
            }
        )
        raise
    finally:
        client.close()
async def _generate_tts_for_languages(job_id: str, outputs: dict[str, Any], db):
"""Generate TTS audio for each language's audio description"""
# Always generate English MP3
if "en" in outputs:
await _generate_language_tts(job_id, "en", outputs["en"], db)
# Generate for other languages
for language, lang_output in outputs.items():
if language != "en" and "ad_vtt_gcs" in lang_output:
await _generate_language_tts(job_id, language, lang_output, db)
async def _generate_language_tts(job_id: str, language: str, lang_output: dict, db):
"""Generate TTS for a specific language"""
try:
# Download AD VTT content
ad_blob_path = lang_output["ad_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
ad_blob = gcs_service.bucket.blob(ad_blob_path)
ad_vtt_content = ad_blob.download_as_text()
# Generate MP3 with retry
language_code = f"{language}-US" if language == "en" else f"{language}-{language.upper()}"
async def synthesize():
return await tts_service.synthesize_audio_description(ad_vtt_content, language_code)
mp3_data = await retry_with_backoff(synthesize, max_retries=3)
# Upload MP3 to GCS
mp3_blob_path = f"{job_id}/{language}/ad.mp3"
mp3_blob = gcs_service.bucket.blob(mp3_blob_path)
mp3_blob.content_type = "audio/mpeg"
mp3_blob.upload_from_string(mp3_data, content_type="audio/mpeg")
mp3_gcs_uri = f"gs://{settings.gcs_bucket}/{mp3_blob_path}"
# Update job outputs
await db.jobs.update_one(
{"_id": job_id},
{
"$set": {
f"outputs.{language}.ad_mp3_gcs": mp3_gcs_uri,
"updated_at": datetime.utcnow()
}
}
)
logger.info(f"Successfully generated TTS for {language}")
except Exception as e:
logger.error(f"TTS generation failed for {language}: {e}")
# Update with error note
await db.jobs.update_one(
{"_id": job_id},
{
"$set": {
f"outputs.{language}.qa_notes": f"TTS generation failed: {str(e)}",
"updated_at": datetime.utcnow()
}
}
)

View file

@ -0,0 +1,136 @@
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient
from ..core.config import settings
from ..core.logging import get_logger
from ..models.job import JobStatus
from . import celery_app
logger = get_logger(__name__)
@celery_app.task(
    bind=True,
    acks_late=True,  # Acknowledge the message only once the task has finished
    reject_on_worker_lost=True,  # Re-deliver the task if the worker process dies
    autoretry_for=(Exception,),  # Any exception triggers an automatic retry
    # Unlimited retries. NOTE(review): with retry_backoff enabled Celery
    # appears to compute the countdown itself, so the 60s value may be
    # ignored — confirm.
    retry_kwargs={'max_retries': None, 'countdown': 60},
    retry_backoff=True,  # Exponential backoff between retries
)
def start_change_stream_watcher(self):
    """Run the MongoDB change stream watcher for job status changes.

    Blocks inside ``asyncio.run`` until the watcher coroutine returns. Any
    exception it raises is logged and re-raised so the task's auto-retry
    configuration can take effect.
    """
    try:
        asyncio.run(_watch_job_changes())
    except Exception as watcher_error:
        logger.error(f"Change stream watcher failed: {watcher_error}")
        # Re-raise so the autoretry settings on the decorator apply.
        raise
async def _watch_job_changes():
    """Watch MongoDB change streams for job status updates.

    Only updates that *change the status field itself* to 'approved_english'
    or 'completed' are matched. Matching on the looked-up document's current
    status would also fire for unrelated updates to such jobs — for example
    the error record a failed notification writes to a completed job — and
    re-enqueue the same task again.

    Behaviour on failure:
        * Change streams unavailable (no replica set): logged, then returns
          normally, since retrying cannot help.
        * Any other error: logged; re-raised only when
          ``settings.app_env == "prod"`` so the calling task's auto-retry can
          restart the watcher. Other environments keep the previous
          behaviour of not raising.
    """
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]
    logger.info("Starting MongoDB change stream watcher")
    try:
        # Verify the connection up front so connection problems fail fast.
        await client.admin.command('ping')
        logger.info("MongoDB connection verified")

        # Watch for status transitions on the jobs collection
        pipeline = [
            {
                "$match": {
                    "operationType": "update",
                    "updateDescription.updatedFields.status": {
                        "$in": [
                            JobStatus.APPROVED_ENGLISH.value,
                            JobStatus.COMPLETED.value
                        ]
                    }
                }
            }
        ]
        async with db.jobs.watch(
            pipeline,
            max_await_time_ms=30000,  # 30 second timeout for getMore operations
            batch_size=10  # Process changes in small batches
        ) as stream:
            logger.info("Change stream watcher active, waiting for job status changes...")
            async for change in stream:
                try:
                    # Both values come from the event itself, so they describe
                    # this transition rather than the document's later state.
                    job_id = str(change["documentKey"]["_id"])
                    status = change["updateDescription"]["updatedFields"]["status"]
                    logger.info(f"Job {job_id} status changed to {status}")
                    if status == JobStatus.APPROVED_ENGLISH.value:
                        # Trigger translation and synthesis
                        from .translate_and_synthesize import translate_and_synthesize_task
                        translate_and_synthesize_task.delay(job_id)
                        logger.info(f"Enqueued translation task for job {job_id}")
                    elif status == JobStatus.COMPLETED.value:
                        # Trigger client notification
                        from .notify import notify_client_task
                        notify_client_task.delay(job_id)
                        logger.info(f"Enqueued notification task for job {job_id}")
                except Exception as e:
                    logger.error(f"Error processing change stream event: {e}")
                    # Continue processing other events
                    continue
    except Exception as e:
        error_msg = str(e)
        if "replica sets" in error_msg:
            logger.warning("Change stream watcher not available - MongoDB not configured as replica set")
            logger.info("This is normal in development. Job progression works via immediate triggering in approval endpoint.")
        else:
            logger.error(f"Change stream watcher failed: {e}")
            # Outside production, do not re-raise (keeps local workers quiet);
            # in production let the task retry restart the watcher.
            if settings.app_env == "prod":
                raise
    finally:
        client.close()
# Auto-start the watcher when the worker starts
@celery_app.task(
    bind=True,
    autoretry_for=(Exception,),
    retry_kwargs={'max_retries': 3, 'countdown': 30}
)
def ensure_watcher_running(self):
    """Start the change stream watcher unless one is already active.

    Inspects the active tasks reported by the workers; if any of them is the
    watcher task, nothing is started. When inspection yields nothing the
    watcher is started anyway. Errors are logged and re-raised, which
    triggers this task's auto-retry.
    """
    watcher_task_name = "app.tasks.watchers.start_change_stream_watcher"
    try:
        # Check if watcher is already running
        snapshot = celery_app.control.inspect().active()
        if snapshot:
            for worker, running in snapshot.items():
                # A worker may report None instead of a list.
                for entry in running or ():
                    if entry.get("name") == watcher_task_name:
                        logger.info(f"Change stream watcher already running on worker {worker}")
                        return
        else:
            logger.warning("Could not inspect active tasks - starting watcher anyway")

        # Start the watcher
        launched = start_change_stream_watcher.delay()
        logger.info(f"Started change stream watcher with task ID: {launched.id}")
    except Exception as exc:
        logger.error(f"Failed to ensure watcher is running: {exc}")
        raise  # Will trigger retry

View file

@ -0,0 +1,33 @@
"""Telemetry package for OpenTelemetry tracing and metrics collection"""
from .metrics import app_metrics, time_ai_request, time_job_processing, time_storage_operation, time_celery_task
from .tracing import (
get_tracer,
instrument_dependencies,
instrument_fastapi_app,
setup_tracing,
trace_ai_operation,
trace_job_pipeline,
trace_storage_operation,
TracingContext,
trace_api_request,
trace_celery_task,
)
__all__ = [
"app_metrics",
"time_ai_request",
"time_job_processing",
"time_storage_operation",
"time_celery_task",
"get_tracer",
"instrument_dependencies",
"instrument_fastapi_app",
"setup_tracing",
"trace_ai_operation",
"trace_job_pipeline",
"trace_storage_operation",
"TracingContext",
"trace_api_request",
"trace_celery_task",
]

View file

@ -0,0 +1,359 @@
import time
from typing import Optional
from opentelemetry import metrics
# from opentelemetry.exporter.prometheus import PrometheusMetricReader # Disabled for local dev
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource
from prometheus_client import start_http_server
from ..core.config import settings
from ..core.logging import get_logger
logger = get_logger(__name__)
class ApplicationMetrics:
    """Central metrics collection for the accessible video platform.

    On construction a meter provider is installed and one OpenTelemetry
    instrument is created per metric. The ``record_*`` / ``update_*`` methods
    are thin helpers that attach the right attributes to each measurement.
    """

    def __init__(self):
        self.setup_metrics()

        # Job processing metrics
        self.job_total_counter = self.meter.create_counter(
            name="jobs_total",
            description="Total number of jobs created",
            unit="1"
        )
        self.job_status_gauge = self.meter.create_up_down_counter(
            name="jobs_by_status",
            description="Current number of jobs by status",
            unit="1"
        )
        self.job_processing_duration = self.meter.create_histogram(
            name="job_processing_duration_seconds",
            description="Time taken to process jobs through each stage",
            unit="s"
        )

        # AI service metrics
        self.ai_requests_counter = self.meter.create_counter(
            name="ai_requests_total",
            description="Total AI service requests",
            unit="1"
        )
        self.ai_request_duration = self.meter.create_histogram(
            name="ai_request_duration_seconds",
            description="Duration of AI service requests",
            unit="s"
        )
        self.ai_confidence_histogram = self.meter.create_histogram(
            name="ai_confidence_score",
            description="AI confidence scores distribution",
            unit="1"
        )

        # Storage metrics
        self.storage_operations_counter = self.meter.create_counter(
            name="storage_operations_total",
            description="Total storage operations",
            unit="1"
        )
        self.storage_operation_duration = self.meter.create_histogram(
            name="storage_operation_duration_seconds",
            description="Duration of storage operations",
            unit="s"
        )

        # Queue metrics
        self.queue_tasks_counter = self.meter.create_counter(
            name="celery_tasks_total",
            description="Total Celery tasks processed",
            unit="1"
        )
        self.queue_task_duration = self.meter.create_histogram(
            name="celery_task_duration_seconds",
            description="Duration of Celery task execution",
            unit="s"
        )

        # User activity metrics
        self.auth_attempts_counter = self.meter.create_counter(
            name="auth_attempts_total",
            description="Total authentication attempts",
            unit="1"
        )
        self.active_users_gauge = self.meter.create_up_down_counter(
            name="active_users",
            description="Number of currently active users",
            unit="1"
        )

        # Rate limiting metrics
        self.rate_limit_counter = self.meter.create_counter(
            name="rate_limit_checks_total",
            description="Total rate limit checks performed",
            unit="1"
        )

        # Request validation metrics
        self.validation_counter = self.meter.create_counter(
            name="request_validation_total",
            description="Total request validations performed",
            unit="1"
        )
        self.validation_duration = self.meter.create_histogram(
            name="request_validation_duration_seconds",
            description="Duration of request validation",
            unit="s"
        )

    def setup_metrics(self):
        """Initialize the metrics provider and the meter stored on ``self.meter``."""
        resource = Resource.create({
            "service.name": "accessible-video-api",
            "service.version": "1.0.0",
            "deployment.environment": settings.app_env,
        })

        # Set up Prometheus metrics reader (disabled for local dev)
        # prometheus_reader = PrometheusMetricReader()

        # Create metrics provider
        provider = MeterProvider(
            resource=resource,
            # metric_readers=[prometheus_reader]  # Disabled for local dev
        )
        metrics.set_meter_provider(provider)

        # Get meter for this service
        self.meter = metrics.get_meter("accessible-video-api")

        # The Prometheus reader above is commented out, so say so instead of
        # claiming an exporter is attached.
        logger.info("Metrics provider initialized (Prometheus metric reader disabled; no exporter attached)")

    def start_prometheus_server(self, port: int = 8001):
        """Start the Prometheus metrics HTTP server; failures are logged, not raised."""
        try:
            start_http_server(port)
            logger.info(f"Prometheus metrics server started on port {port}")
        except Exception as e:
            logger.error(f"Failed to start Prometheus server: {e}")

    # Job metrics methods
    def record_job_created(self, client_id: str, language: str):
        """Record a new job creation."""
        self.job_total_counter.add(
            1,
            attributes={
                "client_id": client_id,
                "source_language": language,
                "action": "created"
            }
        )

    def record_job_status_change(self, job_id: str, old_status: str, new_status: str):
        """Record a job status change (``job_id`` is accepted but not recorded)."""
        # Decrement old status count
        self.job_status_gauge.add(
            -1,
            attributes={"status": old_status}
        )
        # Increment new status count
        self.job_status_gauge.add(
            1,
            attributes={"status": new_status}
        )

    def record_job_processing_time(self, stage: str, duration_seconds: float, job_id: str):
        """Record time taken for a job processing stage."""
        self.job_processing_duration.record(
            duration_seconds,
            attributes={
                "stage": stage,
                "job_id": job_id
            }
        )

    # AI service metrics methods
    def record_ai_request(self, service: str, operation: str, language: Optional[str] = None):
        """Record an AI service request; ``language`` is attached only when given."""
        attributes = {
            "service": service,
            "operation": operation
        }
        if language:
            attributes["language"] = language
        self.ai_requests_counter.add(1, attributes=attributes)

    def record_ai_request_duration(self, service: str, operation: str, duration_seconds: float):
        """Record AI request duration."""
        self.ai_request_duration.record(
            duration_seconds,
            attributes={
                "service": service,
                "operation": operation
            }
        )

    def record_ai_confidence(self, confidence: float, service: str):
        """Record an AI confidence score."""
        self.ai_confidence_histogram.record(
            confidence,
            attributes={"service": service}
        )

    # Storage metrics methods
    def record_storage_operation(self, operation: str, file_type: str, success: bool):
        """Record a storage operation and whether it succeeded."""
        self.storage_operations_counter.add(
            1,
            attributes={
                "operation": operation,
                "file_type": file_type,
                "result": "success" if success else "error"
            }
        )

    def record_storage_duration(self, operation: str, duration_seconds: float):
        """Record storage operation duration."""
        self.storage_operation_duration.record(
            duration_seconds,
            attributes={"operation": operation}
        )

    # Queue metrics methods
    def record_celery_task(self, task_name: str, queue: str, result: str):
        """Record a Celery task execution."""
        self.queue_tasks_counter.add(
            1,
            attributes={
                "task_name": task_name,
                "queue": queue,
                "result": result
            }
        )

    def record_celery_task_duration(self, task_name: str, duration_seconds: float):
        """Record Celery task duration."""
        self.queue_task_duration.record(
            duration_seconds,
            attributes={"task_name": task_name}
        )

    # Auth metrics methods
    def record_auth_attempt(self, result: str, user_role: Optional[str] = None):
        """Record an authentication attempt; ``user_role`` is attached only when given."""
        attributes = {"result": result}
        if user_role:
            attributes["user_role"] = user_role
        self.auth_attempts_counter.add(1, attributes=attributes)

    def update_active_users(self, count_change: int, user_role: str):
        """Adjust the active users count by ``count_change`` (may be negative)."""
        self.active_users_gauge.add(
            count_change,
            attributes={"user_role": user_role}
        )
# Global metrics instance
app_metrics = ApplicationMetrics()
class MetricsTimer:
    """Context manager for timing operations.

    On exit, ``metric_recorder`` is called as
    ``metric_recorder(duration, *args, **kwargs)`` where ``duration`` is the
    elapsed time in seconds. The elapsed time is measured with a monotonic
    clock so wall-clock adjustments cannot produce negative or inflated
    durations. Exceptions raised inside the ``with`` body are not suppressed,
    and the duration is still recorded.

    Attributes set on the instance:
        start_time: Wall-clock time (``time.time()``) at entry, or None
            before the timer has been entered.
    """

    def __init__(self, metric_recorder, *args, **kwargs):
        self.metric_recorder = metric_recorder
        self.args = args
        self.kwargs = kwargs
        self.start_time = None
        # Monotonic reference used for the actual measurement.
        self._started_at = None

    def __enter__(self):
        self.start_time = time.time()
        self._started_at = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to record if the timer was never entered.
        if self._started_at is not None:
            duration = time.perf_counter() - self._started_at
            self.metric_recorder(duration, *self.args, **self.kwargs)
# Convenience functions for common metrics patterns
def time_job_processing(stage: str, job_id: str):
    """Time a job processing stage.

    Returns a ``MetricsTimer`` context manager that, on exit, records the
    elapsed seconds via ``app_metrics.record_job_processing_time``.

    Args:
        stage: Name of the processing stage being timed.
        job_id: Id of the job being processed.
    """
    # The timer hands the duration over as the first positional argument, but
    # the recorder expects (stage, duration_seconds, job_id); adapt the order.
    return MetricsTimer(
        lambda duration_seconds: app_metrics.record_job_processing_time(
            stage, duration_seconds, job_id
        )
    )
def time_ai_request(service: str, operation: str):
    """Time an AI service request.

    Returns a ``MetricsTimer`` context manager that, on exit, records the
    elapsed seconds via ``app_metrics.record_ai_request_duration``.

    Args:
        service: Name of the AI service being called.
        operation: Operation performed on that service.
    """
    # The timer hands the duration over as the first positional argument, but
    # the recorder expects (service, operation, duration_seconds); adapt the order.
    return MetricsTimer(
        lambda duration_seconds: app_metrics.record_ai_request_duration(
            service, operation, duration_seconds
        )
    )
def time_storage_operation(operation: str):
    """Time a storage operation.

    Returns a ``MetricsTimer`` context manager that, on exit, records the
    elapsed seconds via ``app_metrics.record_storage_duration``.

    Args:
        operation: Name of the storage operation being timed.
    """
    # The timer hands the duration over as the first positional argument, but
    # the recorder expects (operation, duration_seconds); adapt the order.
    return MetricsTimer(
        lambda duration_seconds: app_metrics.record_storage_duration(
            operation, duration_seconds
        )
    )
def time_celery_task(task_name: str):
    """Time a Celery task execution.

    Returns a ``MetricsTimer`` context manager that, on exit, records the
    elapsed seconds via ``app_metrics.record_celery_task_duration``.

    Args:
        task_name: Name of the Celery task being timed.
    """
    # The timer hands the duration over as the first positional argument, but
    # the recorder expects (task_name, duration_seconds); adapt the order.
    return MetricsTimer(
        lambda duration_seconds: app_metrics.record_celery_task_duration(
            task_name, duration_seconds
        )
    )
def track_rate_limit_metrics(identifier: str, is_allowed: bool, current_requests: int, limit: int):
    """Count one rate-limit check on ``app_metrics.rate_limit_counter``.

    Does nothing when ``app_metrics`` has no ``rate_limit_counter``.

    Args:
        identifier: Rate-limit key; the text before its first ``:`` is
            reported as the identifier type ("unknown" if there is no ``:``).
        is_allowed: Whether the request passed the rate limit.
        current_requests: Accepted for interface compatibility; not recorded.
        limit: Accepted for interface compatibility; not recorded.
    """
    if not hasattr(app_metrics, 'rate_limit_counter'):
        return

    prefix, separator, _rest = identifier.partition(":")
    identifier_type = prefix if separator else "unknown"
    outcome = "allowed" if is_allowed else "blocked"

    app_metrics.rate_limit_counter.add(
        1,
        attributes={
            "identifier_type": identifier_type,
            "is_allowed": str(is_allowed),
            "status": outcome
        }
    )
def track_validation_metrics(endpoint: str, method: str, is_valid: bool, validation_time: float, error_types: list):
    """Report one request-validation outcome and its duration.

    Each instrument is used only if ``app_metrics`` exposes it:
    ``validation_counter`` is incremented by one and ``validation_duration``
    records ``validation_time``.

    Args:
        endpoint: Stored under the ``endpoint`` attribute of both instruments.
        method: Stored under the ``method`` attribute of both instruments.
        is_valid: Stringified into the counter's ``is_valid`` attribute.
        validation_time: Value passed to ``validation_duration.record``.
        error_types: Joined with commas into the counter's ``error_types``
            attribute; ``"none"`` is used when the list is empty or falsy.
    """
    if hasattr(app_metrics, 'validation_counter'):
        joined_errors = ",".join(error_types) if error_types else "none"
        counter_labels = {
            "endpoint": endpoint,
            "method": method,
            "is_valid": str(is_valid),
            "error_types": joined_errors,
        }
        app_metrics.validation_counter.add(1, attributes=counter_labels)

    if hasattr(app_metrics, 'validation_duration'):
        duration_labels = {"endpoint": endpoint, "method": method}
        app_metrics.validation_duration.record(validation_time, attributes=duration_labels)

View file

@ -0,0 +1,268 @@
import functools
import logging
from typing import Optional

from opentelemetry import trace
# from opentelemetry.exporter.gcp.trace import CloudTraceSpanExporter # Disabled for local dev
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.pymongo import PymongoInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

from ..core.config import settings
logger = logging.getLogger(__name__)
def setup_tracing(app_name: str = "accessible-video-api", service_version: str = "1.0.0"):
    """Initialize OpenTelemetry tracing for the application.

    Creates a ``TracerProvider`` tagged with the service name, version,
    namespace and ``settings.app_env``, installs it as the global provider,
    and attaches a span exporter chosen from the settings:

    * ``app_env == "prod"`` with a GCP project id: no exporter is attached,
      because the Cloud Trace exporter is currently commented out.
    * otherwise, if ``settings.otel_exporter_otlp_endpoint`` is set: OTLP/gRPC.
    * otherwise: console exporter.

    Args:
        app_name: Value for the ``service.name`` resource attribute.
        service_version: Value for the ``service.version`` resource attribute.

    Returns:
        The ``TracerProvider`` that was installed.
    """
    # Create resource with service information
    resource = Resource.create({
        "service.name": app_name,
        "service.version": service_version,
        "service.namespace": "accessible-video",
        "deployment.environment": settings.app_env,
    })

    # Set up tracer provider
    tracer_provider = TracerProvider(resource=resource)
    trace.set_tracer_provider(tracer_provider)

    # Configure span processor and exporter based on environment
    if settings.app_env == "prod" and settings.gcp_project_id:
        # Google Cloud Trace is the intended production exporter, but it is
        # disabled for now, so this branch attaches no span processor.
        # NOTE(review): because this branch wins, a configured OTLP endpoint is
        # ignored in production as well — confirm that is intended.
        # cloud_trace_exporter = CloudTraceSpanExporter(
        #     project_id=settings.gcp_project_id
        # )
        # span_processor = BatchSpanProcessor(cloud_trace_exporter)
        # tracer_provider.add_span_processor(span_processor)
        logger.warning(
            "Google Cloud Trace exporter is disabled; no span exporter is "
            "configured, so spans will not be exported"
        )
    elif settings.otel_exporter_otlp_endpoint:
        # Use OTLP exporter for other observability platforms
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

        otlp_exporter = OTLPSpanExporter(
            endpoint=settings.otel_exporter_otlp_endpoint,
            headers={},
        )
        span_processor = BatchSpanProcessor(otlp_exporter)
        tracer_provider.add_span_processor(span_processor)
        logger.info(f"Configured OTLP trace exporter: {settings.otel_exporter_otlp_endpoint}")
    else:
        # Development mode - use console exporter
        from opentelemetry.sdk.trace.export import ConsoleSpanExporter

        console_exporter = ConsoleSpanExporter()
        span_processor = BatchSpanProcessor(console_exporter)
        tracer_provider.add_span_processor(span_processor)
        logger.info("Configured console trace exporter for development")

    logger.info(f"OpenTelemetry tracing initialized for {app_name}")
    return tracer_provider
def instrument_fastapi_app(app):
    """Attach OpenTelemetry's FastAPI instrumentation to ``app``.

    The currently installed global tracer provider is used, and the
    ``health`` and ``metrics`` URL patterns are excluded from tracing.
    """
    active_provider = trace.get_tracer_provider()
    FastAPIInstrumentor.instrument_app(
        app,
        tracer_provider=active_provider,
        # Don't trace health checks
        excluded_urls="health,metrics",
    )
    logger.info("FastAPI instrumentation enabled")
def instrument_dependencies():
    """Turn on automatic tracing for the MongoDB and Redis client libraries.

    Each instrumentor is bound to the global tracer provider, and a log line
    is emitted after each one is enabled (MongoDB first, then Redis).
    """
    targets = (
        (PymongoInstrumentor, "MongoDB instrumentation enabled"),
        (RedisInstrumentor, "Redis instrumentation enabled"),
    )
    for instrumentor_cls, enabled_message in targets:
        instrumentor_cls().instrument(tracer_provider=trace.get_tracer_provider())
        logger.info(enabled_message)
def get_tracer(name: str = "accessible-video"):
    """Return the tracer registered under ``name`` for hand-written spans."""
    tracer = trace.get_tracer(name)
    return tracer
def trace_async_operation(operation_name: str, **attributes):
    """Decorator factory that wraps an async callable in a span.

    The span is named ``operation_name`` and carries ``attributes``. On
    success ``operation.result`` is set to ``"success"``; on failure it is set
    to ``"error"``, the message is stored in ``operation.error_message``, the
    exception is recorded on the span, and the exception is re-raised.

    Args:
        operation_name: Span name.
        **attributes: Initial span attributes.

    Returns:
        A decorator for ``async def`` functions.
    """
    def decorator(func):
        # Preserve the wrapped function's name, docstring and signature so
        # tools that introspect the callable still see the original.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            tracer = get_tracer()
            # record_exception=False: the exception is recorded explicitly
            # below; leaving the default on would add a second, duplicate
            # exception event when the error propagates out of the block.
            with tracer.start_as_current_span(
                operation_name,
                attributes=attributes,
                record_exception=False,
            ) as span:
                try:
                    result = await func(*args, **kwargs)
                    span.set_attribute("operation.result", "success")
                    return result
                except Exception as e:
                    span.set_attribute("operation.result", "error")
                    span.set_attribute("operation.error_message", str(e))
                    span.record_exception(e)
                    raise
        return wrapper
    return decorator
def trace_job_pipeline(job_id: str, pipeline_stage: str):
    """Decorator factory that wraps an async pipeline stage in a span.

    The span is named ``job_pipeline.<pipeline_stage>`` and carries ``job.id``
    and ``job.pipeline_stage``. On success ``job.result`` is ``"success"``; on
    failure it is ``"error"``, ``job.error_message`` holds the message, the
    exception is recorded on the span, and the exception is re-raised.

    Args:
        job_id: Value for the ``job.id`` attribute (fixed at decoration time).
        pipeline_stage: Stage name used in the span name and attributes.

    Returns:
        A decorator for ``async def`` functions.
    """
    def decorator(func):
        # Preserve the wrapped function's name, docstring and signature so
        # tools that introspect the callable still see the original.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            tracer = get_tracer()
            # record_exception=False: the exception is recorded explicitly
            # below; leaving the default on would add a second, duplicate
            # exception event when the error propagates out of the block.
            with tracer.start_as_current_span(
                f"job_pipeline.{pipeline_stage}",
                attributes={
                    "job.id": job_id,
                    "job.pipeline_stage": pipeline_stage,
                },
                record_exception=False,
            ) as span:
                try:
                    result = await func(*args, **kwargs)
                    span.set_attribute("job.result", "success")
                    return result
                except Exception as e:
                    span.set_attribute("job.result", "error")
                    span.set_attribute("job.error_message", str(e))
                    span.record_exception(e)
                    raise
        return wrapper
    return decorator
def trace_ai_operation(operation_type: str, language: Optional[str] = None):
    """Decorator factory that wraps an async AI-service call in a span.

    The span is named ``ai.<operation_type>``. ``ai.provider`` is ``"gemini"``
    when ``operation_type`` contains ``"gemini"`` and ``"google_translate"``
    otherwise. When the wrapped call returns a dict, its ``confidence`` and
    ``language`` entries (if present) are copied onto the span. Failures set
    ``ai.result`` to ``"error"``, store the message, record the exception on
    the span, and re-raise.

    Args:
        operation_type: Operation label used in the span name and attributes.
        language: Optional value for the ``ai.language`` attribute.

    Returns:
        A decorator for ``async def`` functions.
    """
    def decorator(func):
        # Preserve the wrapped function's name, docstring and signature so
        # tools that introspect the callable still see the original.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            tracer = get_tracer()
            span_attributes = {
                "ai.operation_type": operation_type,
                "ai.provider": "gemini" if "gemini" in operation_type else "google_translate"
            }
            if language:
                span_attributes["ai.language"] = language

            # record_exception=False: the exception is recorded explicitly
            # below; leaving the default on would add a second, duplicate
            # exception event when the error propagates out of the block.
            with tracer.start_as_current_span(
                f"ai.{operation_type}",
                attributes=span_attributes,
                record_exception=False,
            ) as span:
                try:
                    result = await func(*args, **kwargs)
                    # Add result attributes if available
                    if isinstance(result, dict):
                        if "confidence" in result:
                            span.set_attribute("ai.confidence", result["confidence"])
                        if "language" in result:
                            span.set_attribute("ai.detected_language", result["language"])
                    span.set_attribute("ai.result", "success")
                    return result
                except Exception as e:
                    span.set_attribute("ai.result", "error")
                    span.set_attribute("ai.error_message", str(e))
                    span.record_exception(e)
                    raise
        return wrapper
    return decorator
def trace_storage_operation(operation_type: str, file_path: str):
    """Decorator factory that wraps an async storage call in a span.

    The span is named ``storage.<operation_type>`` and carries
    ``storage.operation``, ``storage.path`` and ``storage.provider="gcs"``.
    When the wrapped call returns a string starting with ``gs://`` it is
    stored as ``storage.result_uri``. Failures set ``storage.result`` to
    ``"error"``, store the message, record the exception on the span, and
    re-raise.

    Args:
        operation_type: Operation label used in the span name and attributes.
        file_path: Value for ``storage.path`` (fixed at decoration time).

    Returns:
        A decorator for ``async def`` functions.
    """
    def decorator(func):
        # Preserve the wrapped function's name, docstring and signature so
        # tools that introspect the callable still see the original.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            tracer = get_tracer()
            # record_exception=False: the exception is recorded explicitly
            # below; leaving the default on would add a second, duplicate
            # exception event when the error propagates out of the block.
            with tracer.start_as_current_span(
                f"storage.{operation_type}",
                attributes={
                    "storage.operation": operation_type,
                    "storage.path": file_path,
                    "storage.provider": "gcs"
                },
                record_exception=False,
            ) as span:
                try:
                    result = await func(*args, **kwargs)
                    span.set_attribute("storage.result", "success")
                    if isinstance(result, str) and result.startswith("gs://"):
                        span.set_attribute("storage.result_uri", result)
                    return result
                except Exception as e:
                    span.set_attribute("storage.result", "error")
                    span.set_attribute("storage.error_message", str(e))
                    span.record_exception(e)
                    raise
        return wrapper
    return decorator
class TracingContext:
    """Context manager for manual span creation with attributes.

    Entering starts a span named ``span_name`` with ``attributes``, makes it
    the current span so spans started inside the ``with`` body are parented to
    it, and yields the span. Exiting restores the previous current span and
    ends this one. If the body raised, the span gets ``error=True``, an
    ``error_message`` attribute and a recorded exception event; the exception
    itself is not suppressed.

    Args:
        span_name: Name of the span to create.
        attributes: Optional initial span attributes.
    """

    def __init__(self, span_name: str, attributes: Optional[dict] = None):
        self.span_name = span_name
        self.attributes = attributes or {}
        self.tracer = get_tracer()
        self.span = None
        # Handle for the "current span" activation; set while inside `with`.
        self._scope = None

    def __enter__(self):
        self.span = self.tracer.start_span(self.span_name, attributes=self.attributes)
        # start_span() alone does not activate the span. use_span() attaches
        # it to the current context; the extra flags keep error reporting and
        # span ending under this class's control in __exit__.
        self._scope = trace.use_span(
            self.span,
            end_on_exit=False,
            record_exception=False,
            set_status_on_exception=False,
        )
        self._scope.__enter__()
        return self.span

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            if exc_type:
                self.span.set_attribute("error", True)
                self.span.set_attribute("error_message", str(exc_val))
                self.span.record_exception(exc_val)
        finally:
            # Always deactivate and end the span, even if annotating it failed.
            if self._scope is not None:
                self._scope.__exit__(None, None, None)
                self._scope = None
            self.span.end()
# Convenience functions for common tracing patterns
def trace_api_request(endpoint: str, user_id: Optional[str] = None):
    """Build a ``TracingContext`` for one API request.

    The span is named ``api.`` followed by ``endpoint`` with every ``/``
    replaced by ``_``. ``user.id`` is attached only when ``user_id`` is truthy.
    """
    span_name = f"api.{endpoint.replace('/', '_')}"
    user_label = {"user.id": user_id} if user_id else {}
    attributes = {"http.route": endpoint, "component": "api", **user_label}
    return TracingContext(span_name, attributes)
def trace_celery_task(task_name: str, job_id: Optional[str] = None):
    """Build a ``TracingContext`` for one Celery task execution.

    The span is named ``celery.<task_name>``. ``job.id`` is attached only when
    ``job_id`` is truthy.
    """
    span_name = f"celery.{task_name}"
    job_label = {"job.id": job_id} if job_id else {}
    attributes = {"celery.task_name": task_name, "component": "worker", **job_label}
    return TracingContext(span_name, attributes)

42
backend/celery_worker.py Normal file
View file

@ -0,0 +1,42 @@
import sentry_sdk
from sentry_sdk.integrations.celery import CeleryIntegration
from app.core.config import settings
from app.core.logging import setup_logging, get_logger
from app.tasks import celery_app
# Set up logging first
setup_logging()
logger = get_logger(__name__)

# Initialize Sentry for worker.
# Require a full URL scheme: a bare "http" prefix check would also accept
# values such as "httpfoo" that are not URLs.
if settings.sentry_dsn and settings.sentry_dsn.startswith(("http://", "https://")):
    sentry_sdk.init(
        dsn=settings.sentry_dsn,
        integrations=[CeleryIntegration(monitor_beat_tasks=True)],
        environment=settings.app_env,
        release="1.0.0",
        send_default_pii=False,
    )

logger.info("Starting Celery worker with structured logging")

# Import task modules to register them. The names are not used directly; the
# imports exist for their registration side effect.
from app.tasks import ingest_and_ai  # noqa: E402,F401
from app.tasks import translate_and_synthesize  # noqa: E402,F401

# Debug: Show registered tasks
logger.info(f"Celery app: {celery_app}")
logger.info(f"Registered tasks: {list(celery_app.tasks.keys())}")
logger.info(f"Task routes: {celery_app.conf.task_routes}")
# NOTE(review): the queue list is a hard-coded string, not read from the
# Celery configuration — confirm it matches how the worker is launched.
logger.info("Worker listening to queues: default,ingest")

# Specifically check for our translation task
if 'app.tasks.translate_and_synthesize.translate_and_synthesize_task' in celery_app.tasks:
    logger.info("✅ translate_and_synthesize_task is registered")
else:
    logger.error("❌ translate_and_synthesize_task is NOT registered")
    logger.error(f"Available tasks: {[t for t in celery_app.tasks.keys() if not t.startswith('celery.')]}")

if __name__ == "__main__":
    celery_app.start()

8
backend/cors-config.json Normal file
View file

@ -0,0 +1,8 @@
[
{
"origin": ["*"],
"method": ["GET", "HEAD", "OPTIONS"],
"responseHeader": ["*"],
"maxAgeSeconds": 3600
}
]

View file

@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""Create test users for the accessible video platform."""
import asyncio
from datetime import datetime
from passlib.context import CryptContext
from motor.motor_asyncio import AsyncIOMotorClient
from app.core.config import settings
from app.models.user import UserRole
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
async def create_test_users():
    """Create the admin, reviewer and client test accounts if they are missing.

    Connects to the MongoDB instance named by ``settings.mongodb_uri`` /
    ``settings.mongodb_db``, inserts each test user whose e-mail is not already
    present in the ``users`` collection, then prints every user's e-mail, role
    and active flag. The client connection is closed even when an error occurs.

    NOTE(review): the seeded passwords are short, well-known values; this
    should only ever be run against a disposable development database.
    """
    print("Connecting to MongoDB...")
    client = AsyncIOMotorClient(settings.mongodb_uri)
    try:
        db = client[settings.mongodb_db]

        # Test connection
        await client.admin.command('ping')
        print("Connected to MongoDB successfully")

        users_collection = db.users

        # (email, plain-text password, full name, role)
        seed_accounts = [
            ("admin@example.com", "admin", "Admin User", UserRole.ADMIN),
            ("reviewer@example.com", "reviewer", "Reviewer User", UserRole.REVIEWER),
            ("client@example.com", "client123", "Client User", UserRole.CLIENT),
        ]

        for email, password, full_name, role in seed_accounts:
            existing = await users_collection.find_one({"email": email})
            if existing:
                print(f"User {email} already exists, skipping...")
                continue

            # Hash only for accounts that are actually inserted, and use one
            # timestamp so created_at and updated_at are identical.
            now = datetime.utcnow()
            user = {
                "email": email,
                "hashed_password": pwd_context.hash(password),
                "full_name": full_name,
                "role": role.value,
                "is_active": True,
                "created_at": now,
                "updated_at": now,
            }
            result = await users_collection.insert_one(user)
            print(f"Created user: {user['email']} (ID: {result.inserted_id})")

        # Show all users
        print("\nAll users in database:")
        async for user in users_collection.find({}, {"email": 1, "role": 1, "is_active": 1}):
            print(f" {user['email']} - {user['role']} - Active: {user['is_active']}")
    finally:
        # Release the connection even if the ping, a query or an insert failed.
        client.close()

    print("Done!")


if __name__ == "__main__":
    asyncio.run(create_test_users())

52
backend/debug_login.py Normal file
View file

@ -0,0 +1,52 @@
#!/usr/bin/env python3
"""Debug login issues by testing components individually."""
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient
from app.core.config import settings
from app.core.security import verify_password
from app.models.user import User
async def test_database_connection():
    """Walk through the pieces of the login path and print what works.

    Steps, in order: ping MongoDB, list collections, count documents in
    ``users``, load ``admin@example.com``, build a ``User`` model from the
    document, and verify the password ``"admin"`` against its stored hash.
    Any exception is printed with a traceback rather than raised, and the
    client connection is always closed.
    """
    print("1. Testing database connection...")
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]
    try:
        # Test connection
        await client.admin.command('ping')
        print("✅ Database connection successful")

        # Check if users collection exists
        collections = await db.list_collection_names()
        print(f"✅ Collections: {collections}")

        # Count users
        user_count = await db.users.count_documents({})
        print(f"✅ User count: {user_count}")

        # Find admin user
        user_doc = await db.users.find_one({"email": "admin@example.com"})
        if user_doc:
            print(f"✅ Found admin user: {user_doc['email']}")
            user = User(**user_doc)
            print("✅ User model validation successful")

            # Test password verification
            print("2. Testing password verification...")
            password_correct = verify_password("admin", user.hashed_password)
            # Show a failure marker when the password does not match, instead
            # of a check mark next to "False".
            status_icon = "✅" if password_correct else "❌"
            print(f"{status_icon} Password verification result: {password_correct}")
        else:
            print("❌ Admin user not found")
    except Exception as e:
        # Also reached for model-validation or password-check errors, not
        # only for database failures.
        print(f"❌ Database error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        client.close()


if __name__ == "__main__":
    asyncio.run(test_database_connection())

29
backend/gunicorn_conf.py Normal file
View file

@ -0,0 +1,29 @@
import multiprocessing
import os
# Server socket
# Listen on all interfaces; the port is read from the PORT environment
# variable and falls back to 8000 when it is unset.
bind = f"0.0.0.0:{os.getenv('PORT', '8000')}"
backlog = 2048
# Worker processes
# Worker count is 2 * (CPU count reported by multiprocessing) + 1.
workers = multiprocessing.cpu_count() * 2 + 1
worker_class = "uvicorn.workers.UvicornWorker"
# NOTE(review): Gunicorn's docs describe worker_connections as applying to the
# eventlet/gevent worker types — confirm it has any effect with UvicornWorker.
worker_connections = 1000
# NOTE(review): presumably recycles each worker after roughly 1000 requests,
# with up to 50 requests of random jitter — confirm against Gunicorn's docs.
max_requests = 1000
max_requests_jitter = 50
# Timeouts
timeout = 120
keepalive = 2
# Logging
loglevel = "info"
access_log_format = '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s" %(D)s'
# "-" for both log targets; NOTE(review): presumably means stdout/stderr — confirm.
accesslog = "-"
errorlog = "-"
# Process naming
proc_name = "accessible-video-api"
# Application
# NOTE(review): `module` does not look like a documented Gunicorn setting (the
# documented name appears to be `wsgi_app`) — confirm the app path is supplied
# elsewhere, e.g. on the command line.
module = "app.main:app"

Some files were not shown because too many files have changed in this diff Show more