initial commit
This commit is contained in:
commit
af2562096a
212 changed files with 36035 additions and 0 deletions
BIN
.DS_Store
vendored
Normal file
BIN
.DS_Store
vendored
Normal file
Binary file not shown.
182
.github/workflows/cd-backend.yml
vendored
Normal file
182
.github/workflows/cd-backend.yml
vendored
Normal file
|
|
@ -0,0 +1,182 @@
|
|||
name: Deploy Backend
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'backend/**'
|
||||
- '.github/workflows/cd-backend.yml'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
|
||||
GCP_REGION: us-central1
|
||||
SERVICE_NAME: accessible-video-api
|
||||
WORKER_SERVICE_NAME: accessible-video-worker
|
||||
|
||||
jobs:
|
||||
deploy-api:
|
||||
name: Deploy API to Cloud Run
|
||||
runs-on: ubuntu-latest
|
||||
if: github.ref == 'refs/heads/main'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Authenticate to Google Cloud
|
||||
uses: google-github-actions/auth@v2
|
||||
with:
|
||||
workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
|
||||
service_account: ${{ secrets.WIF_SERVICE_ACCOUNT }}
|
||||
|
||||
- name: Set up Cloud SDK
|
||||
uses: google-github-actions/setup-gcloud@v2
|
||||
|
||||
- name: Configure Docker auth
|
||||
run: gcloud auth configure-docker
|
||||
|
||||
- name: Build and push Docker image
|
||||
working-directory: ./backend
|
||||
run: |
|
||||
# Build image with multi-stage optimization
|
||||
docker build \
|
||||
--target production \
|
||||
--tag gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.SERVICE_NAME }}:${{ github.sha }} \
|
||||
--tag gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.SERVICE_NAME }}:latest \
|
||||
.
|
||||
|
||||
# Push images
|
||||
docker push gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.SERVICE_NAME }}:${{ github.sha }}
|
||||
docker push gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.SERVICE_NAME }}:latest
|
||||
|
||||
- name: Deploy to Cloud Run
|
||||
run: |
|
||||
gcloud run deploy ${{ env.SERVICE_NAME }} \
|
||||
--image gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.SERVICE_NAME }}:${{ github.sha }} \
|
||||
--region ${{ env.GCP_REGION }} \
|
||||
--platform managed \
|
||||
--allow-unauthenticated \
|
||||
--set-env-vars APP_ENV=prod \
|
||||
--set-secrets JWT_SECRET=jwt-secret:latest,MONGODB_URI=mongodb-uri:latest,REDIS_URL=redis-url:latest,GEMINI_API_KEY=gemini-api-key:latest,SENDGRID_API_KEY=sendgrid-api-key:latest,SENTRY_DSN=sentry-dsn:latest \
|
||||
--memory 2Gi \
|
||||
--cpu 2 \
|
||||
--max-instances 100 \
|
||||
--min-instances 1 \
|
||||
--port 8000 \
|
||||
--timeout 300 \
|
||||
--concurrency 80
|
||||
|
||||
- name: Update traffic to new revision
|
||||
run: |
|
||||
gcloud run services update-traffic ${{ env.SERVICE_NAME }} \
|
||||
--region ${{ env.GCP_REGION }} \
|
||||
--to-latest
|
||||
|
||||
deploy-worker:
|
||||
name: Deploy Worker to Cloud Run
|
||||
runs-on: ubuntu-latest
|
||||
needs: [deploy-api]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Authenticate to Google Cloud
|
||||
uses: google-github-actions/auth@v2
|
||||
with:
|
||||
workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
|
||||
service_account: ${{ secrets.WIF_SERVICE_ACCOUNT }}
|
||||
|
||||
- name: Set up Cloud SDK
|
||||
uses: google-github-actions/setup-gcloud@v2
|
||||
|
||||
- name: Configure Docker auth
|
||||
run: gcloud auth configure-docker
|
||||
|
||||
- name: Build and push worker image
|
||||
working-directory: ./backend
|
||||
run: |
|
||||
# Build worker image
|
||||
docker build \
|
||||
--target worker \
|
||||
--tag gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.WORKER_SERVICE_NAME }}:${{ github.sha }} \
|
||||
--tag gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.WORKER_SERVICE_NAME }}:latest \
|
||||
.
|
||||
|
||||
# Push images
|
||||
docker push gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.WORKER_SERVICE_NAME }}:${{ github.sha }}
|
||||
docker push gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.WORKER_SERVICE_NAME }}:latest
|
||||
|
||||
- name: Deploy worker to Cloud Run
|
||||
run: |
|
||||
gcloud run deploy ${{ env.WORKER_SERVICE_NAME }} \
|
||||
--image gcr.io/${{ env.GCP_PROJECT_ID }}/${{ env.WORKER_SERVICE_NAME }}:${{ github.sha }} \
|
||||
--region ${{ env.GCP_REGION }} \
|
||||
--platform managed \
|
||||
--no-allow-unauthenticated \
|
||||
--set-env-vars APP_ENV=prod \
|
||||
--set-secrets JWT_SECRET=jwt-secret:latest,MONGODB_URI=mongodb-uri:latest,REDIS_URL=redis-url:latest,GEMINI_API_KEY=gemini-api-key:latest,SENDGRID_API_KEY=sendgrid-api-key:latest,SENTRY_DSN=sentry-dsn:latest \
|
||||
--memory 4Gi \
|
||||
--cpu 2 \
|
||||
--max-instances 50 \
|
||||
--min-instances 0 \
|
||||
--timeout 1800 \
|
||||
--concurrency 1
|
||||
|
||||
smoke-tests:
|
||||
name: Run Smoke Tests
|
||||
runs-on: ubuntu-latest
|
||||
needs: [deploy-api, deploy-worker]
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
|
||||
- name: Install dependencies
|
||||
working-directory: ./backend
|
||||
run: poetry install
|
||||
|
||||
- name: Run smoke tests against production
|
||||
working-directory: ./backend
|
||||
env:
|
||||
API_BASE_URL: https://${{ env.SERVICE_NAME }}-${{ secrets.GCP_REGION_HASH }}-uc.a.run.app
|
||||
SMOKE_TEST_EMAIL: ${{ secrets.SMOKE_TEST_EMAIL }}
|
||||
SMOKE_TEST_PASSWORD: ${{ secrets.SMOKE_TEST_PASSWORD }}
|
||||
run: |
|
||||
poetry run pytest tests/e2e/test_smoke.py -v
|
||||
|
||||
notify-deployment:
|
||||
name: Notify Deployment Status
|
||||
runs-on: ubuntu-latest
|
||||
needs: [smoke-tests]
|
||||
if: always()
|
||||
|
||||
steps:
|
||||
- name: Notify success
|
||||
if: needs.smoke-tests.result == 'success'
|
||||
run: |
|
||||
echo "✅ Backend deployment completed successfully"
|
||||
# Add Slack/email notification here if needed
|
||||
|
||||
- name: Notify failure
|
||||
if: needs.smoke-tests.result == 'failure'
|
||||
run: |
|
||||
echo "❌ Backend deployment failed"
|
||||
# Add Slack/email notification here if needed
|
||||
147
.github/workflows/cd-frontend.yml
vendored
Normal file
147
.github/workflows/cd-frontend.yml
vendored
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
name: Deploy Frontend
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'frontend/**'
|
||||
- '.github/workflows/cd-frontend.yml'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
|
||||
GCP_REGION: us-central1
|
||||
BUCKET_NAME: ${{ secrets.FRONTEND_BUCKET_NAME }}
|
||||
CDN_URL_MAP: accessible-video-frontend
|
||||
NODE_VERSION: "20"
|
||||
|
||||
jobs:
|
||||
build-and-deploy:
|
||||
name: Build and Deploy Frontend
|
||||
runs-on: ubuntu-latest
|
||||
if: github.ref == 'refs/heads/main'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: ${{ env.NODE_VERSION }}
|
||||
cache: 'npm'
|
||||
cache-dependency-path: frontend/package-lock.json
|
||||
|
||||
- name: Install dependencies
|
||||
working-directory: ./frontend
|
||||
run: npm ci
|
||||
|
||||
- name: Build for production
|
||||
working-directory: ./frontend
|
||||
env:
|
||||
VITE_API_BASE_URL: ${{ secrets.PRODUCTION_API_URL }}
|
||||
VITE_APP_ENV: production
|
||||
VITE_SENTRY_DSN: ${{ secrets.FRONTEND_SENTRY_DSN }}
|
||||
run: npm run build
|
||||
|
||||
- name: Authenticate to Google Cloud
|
||||
uses: google-github-actions/auth@v2
|
||||
with:
|
||||
workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
|
||||
service_account: ${{ secrets.WIF_SERVICE_ACCOUNT }}
|
||||
|
||||
- name: Set up Cloud SDK
|
||||
uses: google-github-actions/setup-gcloud@v2
|
||||
|
||||
- name: Deploy to Cloud Storage
|
||||
working-directory: ./frontend
|
||||
run: |
|
||||
# Sync build files to Cloud Storage bucket
|
||||
gsutil -m rsync -r -d dist/ gs://${{ env.BUCKET_NAME }}/
|
||||
|
||||
# Set public read permissions for web assets
|
||||
gsutil -m acl ch -r -u AllUsers:R gs://${{ env.BUCKET_NAME }}
|
||||
|
||||
# Set cache headers for different file types
|
||||
gsutil -m setmeta -h "Cache-Control:public, max-age=31536000, immutable" "gs://${{ env.BUCKET_NAME }}/**/*.js"
|
||||
gsutil -m setmeta -h "Cache-Control:public, max-age=31536000, immutable" "gs://${{ env.BUCKET_NAME }}/**/*.css"
|
||||
gsutil -m setmeta -h "Cache-Control:public, max-age=86400" "gs://${{ env.BUCKET_NAME }}/**/*.html"
|
||||
gsutil -m setmeta -h "Cache-Control:public, max-age=86400" "gs://${{ env.BUCKET_NAME }}/index.html"
|
||||
|
||||
- name: Configure Load Balancer and CDN
|
||||
run: |
|
||||
# Create backend bucket if it doesn't exist
|
||||
gcloud compute backend-buckets describe ${{ env.BUCKET_NAME }}-backend || \
|
||||
gcloud compute backend-buckets create ${{ env.BUCKET_NAME }}-backend \
|
||||
--gcs-bucket-name=${{ env.BUCKET_NAME }}
|
||||
|
||||
# Create the URL map (defaulting to the backend bucket) if it doesn't exist
|
||||
gcloud compute url-maps describe ${{ env.CDN_URL_MAP }} || \
|
||||
gcloud compute url-maps create ${{ env.CDN_URL_MAP }} \
|
||||
--default-backend-bucket=${{ env.BUCKET_NAME }}-backend
|
||||
|
||||
# Create the HTTPS proxy if it doesn't exist (an existing proxy is left unchanged)
|
||||
gcloud compute target-https-proxies describe ${{ env.CDN_URL_MAP }}-https-proxy || \
|
||||
gcloud compute target-https-proxies create ${{ env.CDN_URL_MAP }}-https-proxy \
|
||||
--url-map=${{ env.CDN_URL_MAP }} \
|
||||
--ssl-certificates=${{ secrets.SSL_CERT_NAME }}
|
||||
|
||||
# Create the global forwarding rule if it doesn't exist (an existing rule is left unchanged)
|
||||
gcloud compute forwarding-rules describe ${{ env.CDN_URL_MAP }}-https-rule --global || \
|
||||
gcloud compute forwarding-rules create ${{ env.CDN_URL_MAP }}-https-rule \
|
||||
--global \
|
||||
--target-https-proxy=${{ env.CDN_URL_MAP }}-https-proxy \
|
||||
--ports=443
|
||||
|
||||
- name: Invalidate CDN cache
|
||||
run: |
|
||||
# Invalidate CDN cache for immediate deployment
|
||||
gcloud compute url-maps invalidate-cdn-cache ${{ env.CDN_URL_MAP }} \
|
||||
--path="/*" \
|
||||
--async
|
||||
|
||||
- name: Run smoke tests
|
||||
working-directory: ./frontend
|
||||
env:
|
||||
FRONTEND_URL: https://${{ secrets.FRONTEND_DOMAIN }}
|
||||
run: |
|
||||
# Wait a bit for CDN propagation
|
||||
sleep 30
|
||||
|
||||
# Basic smoke test - check if main page loads
|
||||
curl -f -s -o /dev/null -w "%{http_code}" "$FRONTEND_URL" | grep -q "200" || {
|
||||
echo "Frontend smoke test failed - main page not accessible"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Check if assets are loading
|
||||
curl -f -s -o /dev/null -w "%{http_code}" "$FRONTEND_URL/assets/" | grep -qE "(200|404)" || {
|
||||
echo "Frontend smoke test failed - assets not accessible"
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo "✅ Frontend smoke tests passed"
|
||||
|
||||
notify-deployment:
|
||||
name: Notify Deployment Status
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build-and-deploy]
|
||||
if: always()
|
||||
|
||||
steps:
|
||||
- name: Notify success
|
||||
if: needs.build-and-deploy.result == 'success'
|
||||
run: |
|
||||
echo "✅ Frontend deployment completed successfully"
|
||||
echo "Frontend is now live at: https://${{ secrets.FRONTEND_DOMAIN }}"
|
||||
# Add Slack/email notification here if needed
|
||||
|
||||
- name: Notify failure
|
||||
if: needs.build-and-deploy.result == 'failure'
|
||||
run: |
|
||||
echo "❌ Frontend deployment failed"
|
||||
# Add Slack/email notification here if needed
|
||||
312
.github/workflows/ci.yml
vendored
Normal file
312
.github/workflows/ci.yml
vendored
Normal file
|
|
@ -0,0 +1,312 @@
|
|||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main, develop ]
|
||||
pull_request:
|
||||
branches: [ main, develop ]
|
||||
|
||||
env:
|
||||
PYTHON_VERSION: "3.11"
|
||||
NODE_VERSION: "20"
|
||||
|
||||
jobs:
|
||||
backend-lint-and-test:
|
||||
name: Backend Lint & Test
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
services:
|
||||
mongodb:
|
||||
image: mongo:7.0
|
||||
ports:
|
||||
- 27017:27017
|
||||
options: >-
|
||||
--health-cmd "echo 'db.runCommand("ping").ok' | mongosh --quiet"
|
||||
--health-interval 10s
|
||||
--health-timeout 5s
|
||||
--health-retries 5
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
ports:
|
||||
- 6379:6379
|
||||
options: >-
|
||||
--health-cmd "redis-cli ping"
|
||||
--health-interval 10s
|
||||
--health-timeout 5s
|
||||
--health-retries 5
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
with:
|
||||
version: latest
|
||||
virtualenvs-create: true
|
||||
virtualenvs-in-project: true
|
||||
|
||||
- name: Load cached dependencies
|
||||
id: cached-poetry-dependencies
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: backend/.venv
|
||||
key: poetry-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('backend/poetry.lock') }}
|
||||
|
||||
- name: Install dependencies
|
||||
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
|
||||
working-directory: ./backend
|
||||
run: poetry install --no-interaction --no-root
|
||||
|
||||
- name: Install project
|
||||
working-directory: ./backend
|
||||
run: poetry install --no-interaction
|
||||
|
||||
- name: Run linting (ruff)
|
||||
working-directory: ./backend
|
||||
run: poetry run ruff check .
|
||||
|
||||
- name: Run type checking (mypy)
|
||||
working-directory: ./backend
|
||||
run: poetry run mypy .
|
||||
|
||||
- name: Run unit tests
|
||||
working-directory: ./backend
|
||||
env:
|
||||
MONGODB_URI: mongodb://localhost:27017
|
||||
MONGODB_DB: test_accessible_video
|
||||
REDIS_URL: redis://localhost:6379
|
||||
JWT_SECRET: test_jwt_secret_for_ci
|
||||
GEMINI_API_KEY: fake_key_for_testing
|
||||
GCP_PROJECT_ID: test-project
|
||||
GCS_BUCKET: test-bucket
|
||||
SENDGRID_API_KEY: fake_sendgrid_key
|
||||
EMAIL_FROM: test@example.com
|
||||
CLIENT_BASE_URL: http://localhost:3000
|
||||
run: |
|
||||
poetry run pytest tests/unit/ -v --cov=app --cov-report=xml --cov-report=term-missing
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v4
|
||||
with:
|
||||
file: ./backend/coverage.xml
|
||||
flags: backend
|
||||
name: backend-coverage
|
||||
|
||||
frontend-lint-and-test:
|
||||
name: Frontend Lint & Test
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: ${{ env.NODE_VERSION }}
|
||||
cache: 'npm'
|
||||
cache-dependency-path: frontend/package-lock.json
|
||||
|
||||
- name: Install dependencies
|
||||
working-directory: ./frontend
|
||||
run: npm ci
|
||||
|
||||
- name: Run linting (ESLint)
|
||||
working-directory: ./frontend
|
||||
run: npm run lint
|
||||
|
||||
- name: Run type checking (TypeScript)
|
||||
working-directory: ./frontend
|
||||
run: npm run type-check
|
||||
|
||||
- name: Run unit tests (Vitest)
|
||||
working-directory: ./frontend
|
||||
run: npm run test -- --coverage --reporter=verbose
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v4
|
||||
with:
|
||||
file: ./frontend/coverage/lcov.info
|
||||
flags: frontend
|
||||
name: frontend-coverage
|
||||
|
||||
integration-tests:
|
||||
name: Integration Tests
|
||||
runs-on: ubuntu-latest
|
||||
needs: [backend-lint-and-test, frontend-lint-and-test]
|
||||
|
||||
services:
|
||||
mongodb:
|
||||
image: mongo:7.0
|
||||
ports:
|
||||
- 27017:27017
|
||||
options: >-
|
||||
--health-cmd "echo 'db.runCommand("ping").ok' | mongosh --quiet"
|
||||
--health-interval 10s
|
||||
--health-timeout 5s
|
||||
--health-retries 5
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
ports:
|
||||
- 6379:6379
|
||||
options: >-
|
||||
--health-cmd "redis-cli ping"
|
||||
--health-interval 10s
|
||||
--health-timeout 5s
|
||||
--health-retries 5
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
|
||||
- name: Install backend dependencies
|
||||
working-directory: ./backend
|
||||
run: poetry install
|
||||
|
||||
- name: Run integration tests
|
||||
working-directory: ./backend
|
||||
env:
|
||||
MONGODB_URI: mongodb://localhost:27017
|
||||
MONGODB_DB: test_accessible_video_integration
|
||||
REDIS_URL: redis://localhost:6379
|
||||
JWT_SECRET: test_jwt_secret_for_integration
|
||||
GEMINI_API_KEY: fake_key_for_testing
|
||||
GCP_PROJECT_ID: test-project
|
||||
GCS_BUCKET: test-bucket
|
||||
SENDGRID_API_KEY: fake_sendgrid_key
|
||||
EMAIL_FROM: test@example.com
|
||||
CLIENT_BASE_URL: http://localhost:3000
|
||||
run: |
|
||||
poetry run pytest tests/integration/ -v
|
||||
|
||||
build-backend:
|
||||
name: Build Backend Docker Image
|
||||
runs-on: ubuntu-latest
|
||||
needs: [backend-lint-and-test]
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Build backend image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
push: false
|
||||
tags: accessible-video-backend:${{ github.sha }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
build-frontend:
|
||||
name: Build Frontend
|
||||
runs-on: ubuntu-latest
|
||||
needs: [frontend-lint-and-test]
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: ${{ env.NODE_VERSION }}
|
||||
cache: 'npm'
|
||||
cache-dependency-path: frontend/package-lock.json
|
||||
|
||||
- name: Install dependencies
|
||||
working-directory: ./frontend
|
||||
run: npm ci
|
||||
|
||||
- name: Build for production
|
||||
working-directory: ./frontend
|
||||
env:
|
||||
VITE_API_BASE_URL: https://api.example.com # Placeholder for CI builds only; cd-frontend.yml injects the real URL from secrets.PRODUCTION_API_URL
|
||||
VITE_APP_ENV: production
|
||||
run: npm run build
|
||||
|
||||
- name: Upload build artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: frontend-dist
|
||||
path: frontend/dist/
|
||||
retention-days: 7
|
||||
|
||||
security-scan:
|
||||
name: Security Scan
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Run Trivy vulnerability scanner
|
||||
uses: aquasecurity/trivy-action@master
|
||||
with:
|
||||
scan-type: 'fs'
|
||||
scan-ref: '.'
|
||||
format: 'sarif'
|
||||
output: 'trivy-results.sarif'
|
||||
|
||||
- name: Upload Trivy scan results
|
||||
uses: github/codeql-action/upload-sarif@v3
|
||||
with:
|
||||
sarif_file: 'trivy-results.sarif'
|
||||
|
||||
- name: Run Semgrep security scan
|
||||
uses: semgrep/semgrep-action@v1
|
||||
with:
|
||||
config: auto
|
||||
generateBaseline: false
|
||||
|
||||
dependency-check:
|
||||
name: Dependency Check
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
|
||||
- name: Check backend dependencies
|
||||
working-directory: ./backend
|
||||
run: |
|
||||
poetry check
|
||||
poetry run pip-audit
|
||||
|
||||
- name: Set up Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: ${{ env.NODE_VERSION }}
|
||||
|
||||
- name: Check frontend dependencies
|
||||
working-directory: ./frontend
|
||||
run: |
|
||||
npm audit --audit-level moderate
|
||||
npx better-npm-audit audit
|
||||
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
examples/
|
||||
148
CLAUDE.md
Normal file
148
CLAUDE.md
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
# Accessible Video Processing Platform - Development Guide
|
||||
|
||||
## Project Overview
|
||||
This is a comprehensive video accessibility platform that automatically generates closed captions and audio descriptions using AI, with quality control workflows and multi-language support.
|
||||
|
||||
**Core Tech Stack:**
|
||||
- Frontend: React 18 + Vite SPA (TypeScript)
|
||||
- Backend: FastAPI + Celery workers (Python 3.11+)
|
||||
- Database: MongoDB Atlas
|
||||
- Storage: Google Cloud Storage with signed URLs
|
||||
- AI: Gemini 2.5 Pro, Google Cloud Translate, ElevenLabs TTS
|
||||
- Queue: Redis + Celery
|
||||
- Auth: JWT with HttpOnly refresh cookies
|
||||
|
||||
## Development Instructions
|
||||
|
||||
### CRITICAL: Always Read the Full Development Plan
|
||||
**Before starting any development work, ALWAYS read the entire `video_accessibility_development_plan.txt` file.** This document contains:
|
||||
- Complete technical specifications
|
||||
- API contracts and schemas
|
||||
- Database models and indexes
|
||||
- Worker pipeline details
|
||||
- Frontend component specifications
|
||||
- Security requirements
|
||||
- Testing strategies
|
||||
|
||||
The development plan is the authoritative source for all implementation details. Refer to it frequently during development to ensure consistency with the overall architecture.
|
||||
|
||||
## Key Implementation Phases
|
||||
|
||||
### Phase 1: Foundation & Setup
|
||||
- Monorepo structure (backend/, frontend/, infra/)
|
||||
- FastAPI backend initialization
|
||||
- React + Vite frontend setup
|
||||
- MongoDB and Redis configuration
|
||||
- JWT authentication with RBAC
|
||||
|
||||
### Phase 2: Core Services
|
||||
- Google Cloud Storage integration
|
||||
- Gemini 2.5 Pro service
|
||||
- Job model with state machine
|
||||
- Celery worker infrastructure
|
||||
|
||||
### Phase 3: Ingestion & AI Pipeline
|
||||
- Video upload system
|
||||
- Ingestion worker task
|
||||
- VTT generation
|
||||
- Gemini prompt system
|
||||
|
||||
### Phase 4: Quality Control System
|
||||
- VTT editor component
|
||||
- QC dashboard for reviewers
|
||||
- Approval/rejection workflow
|
||||
- Video player with captions
|
||||
|
||||
### Phase 5: Translation & TTS Pipeline
|
||||
- Google Cloud Translate integration
|
||||
- Transcreation system
|
||||
- Translation worker
|
||||
- TTS service integration
|
||||
|
||||
### Phase 6: Final Review & Delivery
|
||||
- Final review interface
|
||||
- Job completion workflow
|
||||
- Email notifications
|
||||
- Client download portal
|
||||
|
||||
### Phase 7: Production Readiness
|
||||
- Comprehensive testing
|
||||
- Security hardening
|
||||
- Observability setup
|
||||
- CI/CD configuration
|
||||
|
||||
## Job Status State Machine
|
||||
```
|
||||
created → ingesting → ai_processing → pending_qc → (approved_english | rejected); approved_english → translating → tts_generating → pending_final_review → completed
|
||||
```
|
||||
|
||||
## Key Architecture Decisions
|
||||
|
||||
### Security
|
||||
- Access tokens stored in memory (not localStorage)
|
||||
- Refresh tokens in HttpOnly cookies
|
||||
- RBAC enforcement server-side
|
||||
- Signed URLs for file access (24h expiry)
|
||||
- Audit logs for all reviewer actions
|
||||
|
||||
### Data Flow
|
||||
1. Client uploads MP4 → GCS + MongoDB record
|
||||
2. Celery worker processes video with Gemini 2.5 Pro
|
||||
3. Generates English captions.vtt and audio_description.vtt
|
||||
4. Reviewer QC approval triggers translation pipeline
|
||||
5. Multi-language assets generated (translate/transcreate + TTS)
|
||||
6. Final review and client notification with download links
|
||||
|
||||
### File Structure
|
||||
```
|
||||
gs://accessible-video/{jobId}/
|
||||
source.mp4
|
||||
en/
|
||||
captions.vtt
|
||||
ad.vtt
|
||||
ad.mp3
|
||||
{lang}/
|
||||
captions.vtt
|
||||
ad.vtt
|
||||
ad.mp3
|
||||
```
|
||||
|
||||
## Development Guidelines
|
||||
|
||||
### Before Each Session
|
||||
1. Read the complete `video_accessibility_development_plan.txt`
|
||||
2. Review the current todo list and phase
|
||||
3. Check existing code patterns and conventions
|
||||
4. Understand the security and accessibility requirements
|
||||
|
||||
### Code Standards
|
||||
- Follow existing patterns in the codebase
|
||||
- Implement proper error handling and retries
|
||||
- Add OpenTelemetry tracing for observability
|
||||
- Ensure RBAC is enforced on all endpoints
|
||||
- Validate all VTT outputs for correctness
|
||||
- Write unit tests for all services and utilities
|
||||
|
||||
### Testing Requirements
|
||||
- Unit tests ≥80% coverage for services/utils
|
||||
- Integration tests with mocked AI services
|
||||
- E2E tests for complete workflows
|
||||
- Performance testing for video processing
|
||||
|
||||
### Lint/Type Check Commands
|
||||
- Backend: `ruff check .` and `mypy .`
|
||||
- Frontend: `npm run lint` and `npm run type-check`
|
||||
|
||||
## Important Files to Reference
|
||||
- `video_accessibility_development_plan.txt` - Complete specification
|
||||
- Backend schemas in section 17 of the plan
|
||||
- API design in section 7 of the plan
|
||||
- Frontend component specs in section 10 of the plan
|
||||
- Security requirements in section 11 of the plan
|
||||
|
||||
## Risk Mitigations
|
||||
- Invalid JSON from AI models: Pydantic validation + self-heal prompts
|
||||
- Timestamp drift: Preserve cue timings in translations
|
||||
- TTS alignment: Per-cue synthesis with crossfades
|
||||
- Queue backlog: Autoscaling workers with monitoring
|
||||
- Security: Secret Manager, least-privilege IAM, no client secrets
|
||||
62
Makefile
Normal file
62
Makefile
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
.PHONY: help install dev dev-backend dev-frontend dev-worker test test-backend test-frontend lint lint-backend lint-frontend clean build-backend build-frontend setup-env
|
||||
|
||||
help: ## Show this help message
|
||||
@echo 'Usage: make [target]'
|
||||
@echo ''
|
||||
@echo 'Targets:'
|
||||
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " %-15s %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
||||
|
||||
install: ## Install all dependencies
|
||||
@echo "Installing backend dependencies..."
|
||||
cd backend && poetry install
|
||||
@echo "Installing frontend dependencies..."
|
||||
cd frontend && npm install
|
||||
|
||||
dev-backend: ## Start backend development server
|
||||
cd backend && poetry run uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
|
||||
|
||||
dev-frontend: ## Start frontend development server
|
||||
cd frontend && npm run dev
|
||||
|
||||
dev-worker: ## Start Celery worker
|
||||
cd backend && poetry run celery -A celery_worker.celery_app worker --loglevel=info
|
||||
|
||||
test-backend: ## Run backend tests
|
||||
cd backend && poetry run pytest
|
||||
|
||||
test-frontend: ## Run frontend tests
|
||||
cd frontend && npm run test
|
||||
|
||||
lint-backend: ## Lint backend code
|
||||
cd backend && poetry run ruff check . && poetry run mypy .
|
||||
|
||||
lint-frontend: ## Lint frontend code
|
||||
cd frontend && npm run lint && npm run type-check
|
||||
|
||||
lint: lint-backend lint-frontend ## Lint all code
|
||||
|
||||
clean: ## Clean build artifacts
|
||||
cd backend && rm -rf __pycache__ .pytest_cache .mypy_cache
|
||||
cd frontend && rm -rf node_modules/.cache dist
|
||||
|
||||
build-backend: ## Build backend Docker image
|
||||
cd backend && docker build -t accessible-video-backend .
|
||||
|
||||
build-frontend: ## Build frontend for production
|
||||
cd frontend && npm run build
|
||||
|
||||
# Development helpers
|
||||
setup-env: ## Copy environment templates
|
||||
cp backend/.env.example backend/.env
|
||||
cp frontend/.env.example frontend/.env
|
||||
@echo "Environment files created. Please update with your actual values."
|
||||
|
||||
dev: ## Start all development services (requires tmux)
|
||||
tmux new-session -d -s accessible-video
|
||||
tmux send-keys -t accessible-video 'make dev-backend' C-m
|
||||
tmux split-window -t accessible-video
|
||||
tmux send-keys -t accessible-video 'make dev-frontend' C-m
|
||||
tmux split-window -t accessible-video
|
||||
tmux send-keys -t accessible-video 'make dev-worker' C-m
|
||||
tmux select-layout -t accessible-video tiled
|
||||
tmux attach -t accessible-video
|
||||
178
README.md
Normal file
178
README.md
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
# Accessible Video Processing Platform
|
||||
|
||||
An AI-powered platform for generating accessible video content including closed captions, audio descriptions, and multi-language translations.
|
||||
|
||||
## Features
|
||||
|
||||
- **AI-Powered Processing**: Uses Gemini 2.5 Pro for intelligent caption and audio description generation
|
||||
- **Multi-Language Support**: Automatic translation and cultural transcreation
|
||||
- **Quality Control Workflow**: Built-in review and approval process
|
||||
- **Audio Description**: Text-to-speech generation for voiceovers
|
||||
- **Secure File Handling**: Google Cloud Storage with signed URLs
|
||||
- **Role-Based Access**: Client, reviewer, and admin roles with appropriate permissions
|
||||
|
||||
## Tech Stack
|
||||
|
||||
### Backend
|
||||
- **FastAPI** - Modern Python web framework
|
||||
- **Celery** - Distributed task queue for video processing
|
||||
- **MongoDB** - Document database for job and user data
|
||||
- **Redis** - Task queue broker and caching
|
||||
- **Google Cloud Services** - Storage, AI, and TTS
|
||||
|
||||
### Frontend
|
||||
- **React 18** - UI framework
|
||||
- **Vite** - Fast build tool and dev server
|
||||
- **TypeScript** - Type safety
|
||||
- **TanStack Query** - Data fetching and caching
|
||||
- **Tailwind CSS** - Utility-first styling
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Python 3.11+
|
||||
- Node.js 18+
|
||||
- Poetry (for Python dependency management)
|
||||
- MongoDB (Atlas recommended)
|
||||
- Redis
|
||||
- Google Cloud Project with required APIs enabled
|
||||
|
||||
### Installation
|
||||
|
||||
1. **Clone and setup environment:**
|
||||
```bash
|
||||
git clone <repository>
|
||||
cd accessible-video
|
||||
make setup-env
|
||||
```
|
||||
|
||||
2. **Install dependencies:**
|
||||
```bash
|
||||
make install
|
||||
```
|
||||
|
||||
3. **Configure environment variables:**
|
||||
- Update `backend/.env` with your database, API keys, and service credentials
|
||||
- Update `frontend/.env` with your API base URL
|
||||
|
||||
### Development
|
||||
|
||||
**Start all services (requires tmux):**
|
||||
```bash
|
||||
make dev
|
||||
```
|
||||
|
||||
**Or start services individually:**
|
||||
|
||||
```bash
|
||||
# Terminal 1 - Backend API
|
||||
make dev-backend
|
||||
|
||||
# Terminal 2 - Frontend SPA
|
||||
make dev-frontend
|
||||
|
||||
# Terminal 3 - Celery Worker
|
||||
make dev-worker
|
||||
```
|
||||
|
||||
The application will be available at:
|
||||
- Frontend: http://localhost:5173
|
||||
- Backend API: http://localhost:8000
|
||||
- API Docs: http://localhost:8000/docs
|
||||
|
||||
### Testing
|
||||
|
||||
```bash
|
||||
# Run all tests
|
||||
make test-backend
|
||||
make test-frontend
|
||||
|
||||
# Lint code
|
||||
make lint
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
### Job Processing Pipeline
|
||||
|
||||
1. **Upload**: Client uploads MP4 video
|
||||
2. **Ingestion**: Video is processed and analyzed by Gemini 2.5 Pro
|
||||
3. **QC Review**: Human reviewer approves/rejects English captions and audio descriptions
|
||||
4. **Translation**: Approved content is translated to target languages
|
||||
5. **TTS Generation**: Audio descriptions are converted to speech
|
||||
6. **Final Review**: Reviewer approves final multi-language assets
|
||||
7. **Delivery**: Client receives email with download links
|
||||
|
||||
### File Structure
|
||||
|
||||
```
|
||||
backend/ # FastAPI application
|
||||
├── app/
|
||||
│ ├── api/ # REST API routes
|
||||
│ ├── core/ # Configuration and shared utilities
|
||||
│ ├── models/ # Pydantic data models
|
||||
│ ├── services/ # External service integrations
|
||||
│ ├── tasks/ # Celery background tasks
|
||||
│ └── prompts/ # AI prompt templates
|
||||
└── tests/ # Test suite
|
||||
|
||||
frontend/ # React SPA
|
||||
├── src/
|
||||
│ ├── components/ # Reusable UI components
|
||||
│ ├── routes/ # Page components
|
||||
│ ├── lib/ # Utilities and API client
|
||||
│ ├── hooks/ # Custom React hooks
|
||||
│ └── types/ # TypeScript definitions
|
||||
└── public/ # Static assets
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Required Environment Variables
|
||||
|
||||
**Backend (.env):**
|
||||
- `MONGODB_URI` - MongoDB connection string
|
||||
- `REDIS_URL` - Redis connection string
|
||||
- `JWT_SECRET` - Secret for JWT token signing
|
||||
- `GEMINI_API_KEY` - Google Gemini API key
|
||||
- `GCS_BUCKET` - Google Cloud Storage bucket name
|
||||
- `SENDGRID_API_KEY` - SendGrid API key for email notifications
|
||||
|
||||
**Frontend (.env):**
|
||||
- `VITE_API_BASE_URL` - Backend API URL
|
||||
|
||||
### Google Cloud Setup
|
||||
|
||||
1. Create a GCP project
|
||||
2. Enable required APIs:
|
||||
- Cloud Storage API
|
||||
- Cloud Translation API
|
||||
- Cloud Text-to-Speech API
|
||||
- Vertex AI API (for Gemini)
|
||||
3. Create service account with appropriate permissions
|
||||
4. Download service account key and configure `GOOGLE_APPLICATION_CREDENTIALS`
|
||||
|
||||
## Deployment
|
||||
|
||||
The application is designed for deployment on Google Cloud:
|
||||
|
||||
- **Backend**: Cloud Run with auto-scaling
|
||||
- **Workers**: Cloud Run with Celery
|
||||
- **Frontend**: Cloud Storage + Cloud CDN
|
||||
- **Database**: MongoDB Atlas
|
||||
- **Queue**: Cloud Memorystore (Redis)
|
||||
|
||||
See `/infra` directory for deployment configurations.
|
||||
|
||||
## Security
|
||||
|
||||
- JWT authentication with refresh token rotation
|
||||
- Role-based access control (RBAC)
|
||||
- Signed URLs for secure file access
|
||||
- Audit logging for all reviewer actions
|
||||
- HTTPS enforcement in production
|
||||
|
||||
## Development Guide
|
||||
|
||||
Always refer to the complete development plan in `video_accessibility_development_plan.txt` for detailed specifications and requirements. The CLAUDE.md file contains additional development guidelines and phase-by-phase implementation details.
|
||||
92
backend/.dockerignore
Normal file
92
backend/.dockerignore
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
# Patterns in this file are matched relative to the build-context root.
# Unlike .gitignore, a bare pattern such as "__pycache__/" or "*.pyc" only
# matches at the top level, so junk that can appear in any sub-directory is
# written with a leading "**/" to match at every depth.

# Python
**/__pycache__/
**/*.py[cod]
**/*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Poetry (keep poetry.lock for reproducible builds)
# poetry.lock

# Virtual environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# IDE
.vscode/
.idea/
**/*.swp
**/*.swo
**/*~

# OS
**/.DS_Store
**/.DS_Store?
**/._*
.Spotlight-V100
.Trashes
**/ehthumbs.db
**/Thumbs.db

# Testing
.coverage
.pytest_cache/
.mypy_cache/
.tox/
htmlcov/
coverage.xml
*.cover
.hypothesis/

# Documentation
docs/
*.md
README*

# Logs
*.log
logs/

# Git
.git/
.gitignore

# Docker
Dockerfile*
.dockerignore
docker-compose*

# CI/CD
.github/

# Local development
.env.local
.env.development
.env.test

# NOTE(review): no pattern here excludes credential files such as a
# service-account key (*.json) stored in this directory; confirm none live in
# the build context, because the Dockerfile copies the whole context.

# Temporary files
tmp/
temp/
**/*.tmp
**/*.bak
|
||||
42
backend/.env
Normal file
42
backend/.env
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
APP_ENV=dev
|
||||
API_BASE_URL=http://localhost:8000
|
||||
|
||||
# Auth
|
||||
JWT_SECRET=this_is_a_jwt_secret
|
||||
JWT_ALG=HS256
|
||||
JWT_ACCESS_TTL_MIN=240
|
||||
JWT_REFRESH_TTL_DAYS=7
|
||||
COOKIE_DOMAIN=localdomain.com
|
||||
COOKIE_SECURE=true
|
||||
COOKIE_SAMESITE=Lax
|
||||
|
||||
# MongoDB
|
||||
MONGODB_URI=mongodb://admin:password123@localhost:27017/accessible_video?authSource=admin&replicaSet=rs0
|
||||
MONGODB_DB=accessible_video
|
||||
|
||||
# Redis
|
||||
REDIS_URL=redis://localhost:6379/0
|
||||
|
||||
# Celery (uses Redis)
|
||||
CELERY_BROKER_URL=redis://localhost:6379/0
|
||||
CELERY_RESULT_BACKEND=redis://localhost:6379/0
|
||||
|
||||
# GCP
|
||||
GCP_PROJECT_ID=optical-414516
|
||||
GCS_BUCKET=accessible-video
|
||||
GOOGLE_APPLICATION_CREDENTIALS=/Users/michael.clervi/Documents/projects/video_accessibility/backend/optical-414516-80e2475f6412.json
|
||||
|
||||
# AI
|
||||
# SECURITY: a real-looking Gemini API key was committed on this line. Revoke and
# rotate it, and keep real keys only in an untracked .env or a secret manager.
GEMINI_API_KEY=...
|
||||
TRANSLATE_API_KEY=...
|
||||
ELEVENLABS_API_KEY=...
|
||||
GOOGLE_TTS_CREDENTIALS=/secrets/gcp_tts.json
|
||||
|
||||
# Email
|
||||
SENDGRID_API_KEY=disabled_for_local_testing
|
||||
EMAIL_FROM=test@localhost.com
|
||||
CLIENT_BASE_URL=http://localhost:5173
|
||||
|
||||
# Observability
|
||||
SENTRY_DSN=...
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=
|
||||
42
backend/.env.example
Normal file
42
backend/.env.example
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
APP_ENV=dev
|
||||
API_BASE_URL=https://api.yourdomain.com
|
||||
|
||||
# Auth
|
||||
JWT_SECRET=change_me
|
||||
JWT_ALG=HS256
|
||||
JWT_ACCESS_TTL_MIN=240
|
||||
JWT_REFRESH_TTL_DAYS=7
|
||||
COOKIE_DOMAIN=yourdomain.com
|
||||
COOKIE_SECURE=true
|
||||
COOKIE_SAMESITE=Lax
|
||||
|
||||
# MongoDB
|
||||
MONGODB_URI=mongodb://localhost:27017/accessible_video
|
||||
MONGODB_DB=accessible_video
|
||||
|
||||
# Redis
|
||||
REDIS_URL=redis://localhost:6379/0
|
||||
|
||||
# Celery (uses Redis)
|
||||
CELERY_BROKER_URL=redis://localhost:6379/0
|
||||
CELERY_RESULT_BACKEND=redis://localhost:6379/0
|
||||
|
||||
# GCP
|
||||
GCP_PROJECT_ID=...
|
||||
GCS_BUCKET=accessible-video
|
||||
GOOGLE_APPLICATION_CREDENTIALS=/secrets/gcp.json
|
||||
|
||||
# AI
|
||||
GEMINI_API_KEY=...
|
||||
TRANSLATE_API_KEY=...
|
||||
ELEVENLABS_API_KEY=...
|
||||
GOOGLE_TTS_CREDENTIALS=/secrets/gcp_tts.json
|
||||
|
||||
# Email
|
||||
SENDGRID_API_KEY=...
|
||||
EMAIL_FROM=support@yourdomain.com
|
||||
CLIENT_BASE_URL=https://app.yourdomain.com
|
||||
|
||||
# Observability
|
||||
SENTRY_DSN=...
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=...
|
||||
127
backend/Dockerfile
Normal file
127
backend/Dockerfile
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
# Build stage - install the locked main dependencies into /app/.venv
FROM python:3.11-slim AS builder

# Install build dependencies (compiler toolchain for packages without wheels)
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install Poetry (pinned); --no-cache-dir keeps pip's download cache out of the layer
RUN pip install --no-cache-dir poetry==1.8.2

# Set Poetry configuration.
# Poetry maps settings to POETRY_<SECTION>_<KEY>, so the in-project venv switch
# is POETRY_VIRTUALENVS_IN_PROJECT (POETRY_VENV_IN_PROJECT is not a setting).
ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

# Copy dependency files only, so this layer is cached until they change
COPY pyproject.toml poetry.lock ./

# Install dependencies into the in-project venv.
# NOTE(review): "poetry lock --no-update || true" may rewrite poetry.lock during
# the build, which weakens reproducibility - confirm it is still needed.
RUN poetry config virtualenvs.in-project true && \
    poetry lock --no-update || true && \
    poetry install --only=main --no-root && \
    rm -rf $POETRY_CACHE_DIR
|
||||
|
||||
# Base runtime stage - shared by the production API and worker images
FROM python:3.11-slim AS base

# Runtime system packages: ffmpeg, curl (used by the API health check) and
# tini (used as the entrypoint init process)
RUN apt-get update && apt-get install -y \
    ffmpeg \
    curl \
    tini \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Unprivileged account (uid/gid 1000) that the container runs as
RUN groupadd --gid 1000 app \
    && useradd --uid 1000 --gid app --shell /bin/bash --create-home app

WORKDIR /app

# Put the virtual environment's executables first on PATH
ENV PATH="/app/.venv/bin:$PATH"

# Bring in the virtual environment built in the builder stage
COPY --from=builder --chown=app:app /app/.venv /app/.venv

# Application source (filtered by .dockerignore)
COPY --chown=app:app . .

# Drop root privileges for everything that follows
USER app
|
||||
|
||||
# Production API stage - gunicorn behind tini
FROM base AS production

# Production runtime environment
ENV APP_ENV=prod
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

# The API listens on port 8000
EXPOSE 8000

# Container health: the /health endpoint must answer successfully
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# tini runs as PID 1 so signals reach gunicorn and zombies are reaped
ENTRYPOINT ["tini", "--"]

# Start the API server using the settings in gunicorn_conf.py
CMD ["gunicorn", "-c", "gunicorn_conf.py"]
|
||||
|
||||
# Worker stage for Celery workers
FROM base AS worker

# Set environment variables for worker.
# NOTE(review): C_FORCE_ROOT only matters when Celery runs as root; this image
# runs as the "app" user, so it is kept purely for backward compatibility.
ENV APP_ENV=prod \
    PYTHONPATH=/app \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    C_FORCE_ROOT=1

# Health check for worker: ping this container's worker through the broker.
# The previous check only imported Celery and constructed an app object, so it
# succeeded even when the worker process was dead or disconnected.
# "celery@$HOSTNAME" is the default node name of a worker started without -n.
HEALTHCHECK --interval=60s --timeout=15s --start-period=10s --retries=3 \
    CMD celery -A app.tasks inspect ping --destination "celery@$HOSTNAME" --timeout 10 || exit 1

# Use tini as init system for proper signal handling
ENTRYPOINT ["tini", "--"]

# Default command for Celery worker
CMD ["celery", "-A", "app.tasks", "worker", "--loglevel=info", "--concurrency=1"]
|
||||
|
||||
# Development stage with dev dependencies (extends the builder stage)
FROM builder AS development

# Install all dependencies including dev
RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR

# Install additional dev tools
RUN apt-get update && apt-get install -y \
    git \
    vim \
    && rm -rf /var/lib/apt/lists/*

# The "app" account is created only in the base stage, which this stage does
# not inherit from. Create it here so COPY --chown and USER below can resolve it.
RUN groupadd --gid 1000 app \
    && useradd --uid 1000 --gid app --shell /bin/bash --create-home app

# The builder stage never adds the in-project venv to PATH; without this the
# uvicorn executable used by CMD is not found.
ENV PATH="/app/.venv/bin:$PATH"

# Copy application code
COPY --chown=app:app . .

# Switch to non-root user
USER app

# Set environment for development
ENV APP_ENV=dev \
    PYTHONPATH=/app \
    PYTHONUNBUFFERED=1

EXPOSE 8000

# Development command with hot reload
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
|
||||
BIN
backend/__pycache__/celery_worker.cpython-313.pyc
Normal file
BIN
backend/__pycache__/celery_worker.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/__pycache__/main.cpython-313.pyc
Normal file
BIN
backend/app/__pycache__/main.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/api/v1/__pycache__/routes_admin.cpython-313.pyc
Normal file
BIN
backend/app/api/v1/__pycache__/routes_admin.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/api/v1/__pycache__/routes_auth.cpython-313.pyc
Normal file
BIN
backend/app/api/v1/__pycache__/routes_auth.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/api/v1/__pycache__/routes_files.cpython-313.pyc
Normal file
BIN
backend/app/api/v1/__pycache__/routes_files.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/api/v1/__pycache__/routes_jobs.cpython-313.pyc
Normal file
BIN
backend/app/api/v1/__pycache__/routes_jobs.cpython-313.pyc
Normal file
Binary file not shown.
770
backend/app/api/v1/routes_admin.py
Normal file
770
backend/app/api/v1/routes_admin.py
Normal file
|
|
@ -0,0 +1,770 @@
|
|||
from datetime import datetime, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from bson import ObjectId
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
|
||||
from motor.motor_asyncio import AsyncIOMotorDatabase
|
||||
|
||||
from ...core.database import get_database
|
||||
from ...core.dependencies import get_current_user, require_roles
|
||||
from ...core.logging import get_logger
|
||||
from ...core.security import get_password_hash, verify_password
|
||||
from ...models.user import User, UserRole
|
||||
from ...models.audit_log import AuditLogQuery, AuditLogResponse
|
||||
from ...schemas.auth import (
|
||||
AdminStatsResponse,
|
||||
ChangePasswordRequest,
|
||||
CreateUserRequest,
|
||||
ResetPasswordRequest,
|
||||
UpdateUserRequest,
|
||||
UserListResponse,
|
||||
UserResponse,
|
||||
)
|
||||
from ...services.audit_logger import audit_logger, log_user_management, log_security_event
|
||||
from ...telemetry import app_metrics
|
||||
|
||||
logger = get_logger(__name__)
|
||||
router = APIRouter(prefix="/admin", tags=["admin"])
|
||||
|
||||
|
||||
@router.get("/users", response_model=UserListResponse)
async def list_users(
    page: int = Query(1, ge=1),
    size: int = Query(20, ge=1, le=100),
    role: Optional[str] = Query(None),
    active_only: bool = Query(True),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Return one page of user accounts, newest first (admin only).

    Args:
        page: 1-based page number.
        size: Page size (1-100).
        role: When given, only users whose ``role`` field equals this value.
        active_only: When true, only users with ``is_active`` set.
        current_user: Authenticated caller; must hold the ADMIN role.
        db: Database handle.

    Returns:
        A ``UserListResponse`` holding the page of users, the total number of
        matching users, and the requested ``page`` / ``size``.
    """
    filters = {}
    if role:
        filters["role"] = role
    if active_only:
        filters["is_active"] = True

    # Total number of matches, independent of pagination
    total = await db.users.count_documents(filters)

    # The projection keeps password hashes out of the fetched documents
    offset = (page - 1) * size
    docs = await (
        db.users.find(filters, {"hashed_password": 0})
        .sort("created_at", -1)
        .skip(offset)
        .limit(size)
        .to_list(length=size)
    )

    return UserListResponse(
        users=[
            UserResponse(
                id=str(doc["_id"]),
                email=doc["email"],
                full_name=doc["full_name"],
                role=doc["role"],
                is_active=doc["is_active"],
                # Documents without created_at report the current UTC time
                created_at=doc.get("created_at", datetime.utcnow()).isoformat(),
            )
            for doc in docs
        ],
        total=total,
        page=page,
        size=size,
    )
|
||||
|
||||
|
||||
@router.get("/users/{user_id}", response_model=UserResponse)
async def get_user(
    user_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Fetch a single user by ID (admin only).

    Args:
        user_id: Value matched against the ``_id`` field of the users collection.
        current_user: Authenticated caller; must hold the ADMIN role.
        db: Database handle.

    Returns:
        The user as a ``UserResponse`` (the password hash is never loaded).

    Raises:
        HTTPException: 404 when no user has this ID.
    """
    found = await db.users.find_one({"_id": user_id}, {"hashed_password": 0})
    if not found:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found",
        )

    # Documents without created_at report the current UTC time
    created = found.get("created_at", datetime.utcnow())
    return UserResponse(
        id=str(found["_id"]),
        email=found["email"],
        full_name=found["full_name"],
        role=found["role"],
        is_active=found["is_active"],
        created_at=created.isoformat(),
    )
|
||||
|
||||
|
||||
@router.post("/users", response_model=UserResponse, status_code=status.HTTP_201_CREATED)
async def create_user(
    user_data: CreateUserRequest,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Create a new, active user account (admin only).

    Args:
        user_data: Email, password, full name and role of the new user.
        current_user: Authenticated caller; must hold the ADMIN role.
        db: Database handle.

    Returns:
        The created user as a ``UserResponse``.

    Raises:
        HTTPException: 400 when a user with the same email already exists.
    """
    # Refuse duplicate emails
    if await db.users.find_one({"email": user_data.email}):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="User with this email already exists",
        )

    # IDs are stored as the string form of a freshly generated ObjectId
    new_id = str(ObjectId())
    role_value = user_data.role.value
    new_doc = {
        "_id": new_id,
        "email": user_data.email,
        "hashed_password": get_password_hash(user_data.password),
        "full_name": user_data.full_name,
        "role": role_value,
        "is_active": True,
        "created_at": datetime.utcnow(),
        "updated_at": datetime.utcnow(),
    }
    await db.users.insert_one(new_doc)

    # Record metrics
    app_metrics.record_auth_attempt("user_created", role_value)

    logger.info(f"Admin {current_user.id} created user {new_id} with role {role_value}")

    return UserResponse(
        id=new_id,
        email=user_data.email,
        full_name=user_data.full_name,
        role=user_data.role,
        is_active=True,
        created_at=new_doc["created_at"].isoformat(),
    )
|
||||
|
||||
|
||||
@router.patch("/users/{user_id}", response_model=UserResponse)
async def update_user(
    user_id: str,
    user_update: UpdateUserRequest,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Update selected fields of a user (admin only).

    Only fields supplied in the request are written; ``updated_at`` is always
    refreshed.

    Args:
        user_id: Value matched against the ``_id`` field of the users collection.
        user_update: Optional new email, full name, role and active flag.
        current_user: Authenticated caller; must hold the ADMIN role.
        db: Database handle.

    Returns:
        The user as stored after the update.

    Raises:
        HTTPException: 404 when the user does not exist (including when it
            disappears between the lookup and the update); 400 when the new
            email already belongs to another user.
    """
    # Check if user exists
    user_doc = await db.users.find_one({"_id": user_id})
    if not user_doc:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found"
        )

    # Check if email is being changed and doesn't conflict
    if user_update.email and user_update.email != user_doc["email"]:
        existing_user = await db.users.find_one(
            {"email": user_update.email, "_id": {"$ne": user_id}}
        )
        if existing_user:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Email already in use by another user"
            )

    # Build update document
    update_data = {"updated_at": datetime.utcnow()}

    if user_update.email:
        update_data["email"] = user_update.email
    if user_update.full_name:
        update_data["full_name"] = user_update.full_name
    if user_update.role:
        update_data["role"] = user_update.role.value
    # is_active is compared with None so that an explicit False is applied
    if user_update.is_active is not None:
        update_data["is_active"] = user_update.is_active

    # Update user; return_document=True asks for the post-update document
    result = await db.users.find_one_and_update(
        {"_id": user_id},
        {"$set": update_data},
        return_document=True
    )

    # The user can be removed between the lookup above and this update; without
    # this guard the subscripting below would fail and surface as a 500.
    if result is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found"
        )

    logger.info(f"Admin {current_user.id} updated user {user_id}")

    return UserResponse(
        id=str(result["_id"]),
        email=result["email"],
        full_name=result["full_name"],
        role=result["role"],
        is_active=result["is_active"],
        created_at=result.get("created_at", datetime.utcnow()).isoformat()
    )
|
||||
|
||||
|
||||
@router.delete("/users/{user_id}")
async def deactivate_user(
    user_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Soft-delete a user by clearing ``is_active`` (admin only).

    Args:
        user_id: Value matched against the ``_id`` field of the users collection.
        current_user: Authenticated caller; must hold the ADMIN role.
        db: Database handle.

    Returns:
        A confirmation message.

    Raises:
        HTTPException: 400 when an admin targets their own account; 404 when
            no user has this ID.
    """
    # Admins may not deactivate themselves
    if user_id == str(current_user.id):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Cannot deactivate your own account",
        )

    changes = {"is_active": False, "updated_at": datetime.utcnow()}
    outcome = await db.users.update_one({"_id": user_id}, {"$set": changes})

    if outcome.matched_count == 0:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found",
        )

    logger.info(f"Admin {current_user.id} deactivated user {user_id}")

    return {"message": "User deactivated successfully"}
|
||||
|
||||
|
||||
@router.post("/users/{user_id}/reset-password")
async def admin_reset_password(
    user_id: str,
    reset_request: ResetPasswordRequest,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Replace a user's password with a random temporary one (admin only).

    Args:
        user_id: Value matched against the ``_id`` field of the users collection.
        reset_request: Request body; none of its fields are read here.
        current_user: Authenticated caller; must hold the ADMIN role.
        db: Database handle.

    Returns:
        A confirmation message together with the temporary password in
        plain text.

    Raises:
        HTTPException: 404 when no user has this ID.
    """
    import secrets
    import string

    # 12 random characters drawn from letters and digits
    alphabet = string.ascii_letters + string.digits
    temp_password = ''.join(secrets.choice(alphabet) for _ in range(12))
    new_hash = get_password_hash(temp_password)

    changes = {"hashed_password": new_hash, "updated_at": datetime.utcnow()}
    outcome = await db.users.update_one({"_id": user_id}, {"$set": changes})

    if outcome.matched_count == 0:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found",
        )

    logger.info(f"Admin {current_user.id} reset password for user {user_id}")

    # In production, send email with temp password instead of returning it
    return {
        "message": "Password reset successfully",
        "temporary_password": temp_password,  # Remove this in production, send via email
    }
|
||||
|
||||
|
||||
@router.get("/stats", response_model=AdminStatsResponse)
async def get_admin_stats(
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Collect headline system statistics (admin only).

    Returns:
        An ``AdminStatsResponse`` with the number of active users, the total
        job count, job counts per status, the number of jobs created since
        00:00 UTC today, and the mean ``updated_at - created_at`` span of
        completed jobs in hours (0.0 when there are none).
    """
    # Active users and overall job count
    total_users = await db.users.count_documents({"is_active": True})
    total_jobs = await db.jobs.count_documents({})

    # Job count per status value
    grouped = await db.jobs.aggregate(
        [{"$group": {"_id": "$status", "count": {"$sum": 1}}}]
    ).to_list(None)
    jobs_by_status = {}
    for row in grouped:
        jobs_by_status[row["_id"]] = row["count"]

    # Jobs created since midnight (UTC)
    midnight = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    active_jobs_today = await db.jobs.count_documents({"created_at": {"$gte": midnight}})

    # Mean processing time of completed jobs
    match_stage = {
        "$match": {
            "status": "completed",
            "created_at": {"$exists": True},
            "updated_at": {"$exists": True},
        }
    }
    project_stage = {
        "$project": {
            "processing_time_hours": {
                "$divide": [
                    {"$subtract": ["$updated_at", "$created_at"]},
                    3600000,  # Convert milliseconds to hours
                ]
            }
        }
    }
    group_stage = {
        "$group": {
            "_id": None,
            "avg_processing_time": {"$avg": "$processing_time_hours"},
        }
    }
    averaged = await db.jobs.aggregate(
        [match_stage, project_stage, group_stage]
    ).to_list(None)

    if averaged:
        avg_processing_time = averaged[0]["avg_processing_time"]
    else:
        avg_processing_time = 0.0

    return AdminStatsResponse(
        total_users=total_users,
        total_jobs=total_jobs,
        jobs_by_status=jobs_by_status,
        active_jobs_today=active_jobs_today,
        avg_processing_time_hours=round(avg_processing_time, 2),
    )
|
||||
|
||||
|
||||
@router.get("/health/detailed")
async def detailed_health_check(
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.REVIEWER)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Detailed health check with system component status (admin/reviewer only).

    Probes MongoDB, Redis, GCS and Celery independently. A failing probe is
    recorded under its component name with the error text and downgrades the
    overall status to "degraded"; it never aborts the remaining probes.

    Returns:
        A dict with ``status`` ("healthy" or "degraded"), an ISO ``timestamp``
        and a ``components`` mapping of per-component results.
    """
    health_status = {
        "status": "healthy",
        "timestamp": datetime.utcnow().isoformat(),
        "components": {}
    }

    # Check MongoDB
    try:
        await db.command("ping")
        health_status["components"]["mongodb"] = {"status": "healthy"}
    except Exception as e:
        health_status["components"]["mongodb"] = {"status": "unhealthy", "error": str(e)}
        health_status["status"] = "degraded"

    # Check Redis (via import to avoid circular dependency)
    try:
        from ...core.redis import redis_client
        if redis_client:
            await redis_client.ping()
            health_status["components"]["redis"] = {"status": "healthy"}
        else:
            health_status["components"]["redis"] = {"status": "not_configured"}
    except Exception as e:
        health_status["components"]["redis"] = {"status": "unhealthy", "error": str(e)}
        health_status["status"] = "degraded"

    # Check GCS (basic check)
    try:
        from ...services.gcs import gcs_service
        # Only whether the call raises matters; the probed object is a dummy
        # name, so the returned value is deliberately ignored.
        await gcs_service.file_exists("health_check_dummy")
        health_status["components"]["gcs"] = {"status": "healthy"}
    except Exception as e:
        health_status["components"]["gcs"] = {"status": "unhealthy", "error": str(e)}
        health_status["status"] = "degraded"

    # Check job queue health
    try:
        import asyncio

        from ...tasks import celery_app

        inspector = celery_app.control.inspect()
        # inspect().active() is a synchronous broker round-trip that waits for
        # worker replies; run it in a worker thread so the event loop keeps
        # serving other requests meanwhile.
        active_tasks = await asyncio.to_thread(inspector.active)

        if active_tasks:
            total_active = sum(len(tasks) for tasks in active_tasks.values())
            health_status["components"]["celery"] = {
                "status": "healthy",
                "active_tasks": total_active,
                "workers": len(active_tasks)
            }
        else:
            # No worker replied; reported as-is without degrading the overall status
            health_status["components"]["celery"] = {
                "status": "no_workers",
                "active_tasks": 0,
                "workers": 0
            }
    except Exception as e:
        health_status["components"]["celery"] = {"status": "unhealthy", "error": str(e)}
        health_status["status"] = "degraded"

    return health_status
|
||||
|
||||
|
||||
@router.get("/jobs/stats")
async def get_job_statistics(
    days: int = Query(7, ge=1, le=90),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.REVIEWER)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Summarise job throughput over the last ``days`` days (admin/reviewer only).

    Args:
        days: Size of the look-back window in days (1-90).
        current_user: Authenticated caller; must be ADMIN or REVIEWER.
        db: Database handle.

    Returns:
        A dict with the window size, jobs created and completed inside it, the
        completion rate in percent, average / minimum / maximum processing time
        in hours for completed jobs created inside it, and the current job
        count per status.
    """
    window_start = datetime.utcnow() - timedelta(days=days)

    # Jobs created in period
    jobs_in_period = await db.jobs.count_documents({"created_at": {"$gte": window_start}})

    # Jobs completed in period (judged by their last update time)
    jobs_completed = await db.jobs.count_documents(
        {"status": "completed", "updated_at": {"$gte": window_start}}
    )

    # Processing-time aggregates for completed jobs created in the period
    duration_hours = {
        "$divide": [
            {"$subtract": ["$updated_at", "$created_at"]},
            3600000,
        ]
    }
    timing_pipeline = [
        {
            "$match": {
                "status": "completed",
                "created_at": {"$gte": window_start},
                "updated_at": {"$exists": True},
            }
        },
        {"$project": {"processing_time_hours": duration_hours}},
        {
            "$group": {
                "_id": None,
                "avg_time": {"$avg": "$processing_time_hours"},
                "min_time": {"$min": "$processing_time_hours"},
                "max_time": {"$max": "$processing_time_hours"},
            }
        },
    ]
    timing_rows = await db.jobs.aggregate(timing_pipeline).to_list(None)
    if timing_rows:
        processing_stats = timing_rows[0]
    else:
        # No completed jobs in the window: report zeros
        processing_stats = {"avg_time": 0, "min_time": 0, "max_time": 0}

    # Current queue status: job count per status across all jobs
    status_rows = await db.jobs.aggregate(
        [{"$group": {"_id": "$status", "count": {"$sum": 1}}}]
    ).to_list(None)
    current_queue_stats = {row["_id"]: row["count"] for row in status_rows}

    # max(..., 1) avoids dividing by zero when nothing was created
    completion_rate = round(jobs_completed / max(jobs_in_period, 1) * 100, 2)

    return {
        "period_days": days,
        "jobs_created": jobs_in_period,
        "jobs_completed": jobs_completed,
        "completion_rate": completion_rate,
        "avg_processing_time_hours": round(processing_stats["avg_time"], 2),
        "min_processing_time_hours": round(processing_stats["min_time"], 2),
        "max_processing_time_hours": round(processing_stats["max_time"], 2),
        "current_queue_status": current_queue_stats,
    }
|
||||
|
||||
|
||||
@router.post("/users/{user_id}/password/reset")
async def admin_force_password_reset(
    user_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Force a password reset for another user (admin only).

    Args:
        user_id: Value matched against the ``_id`` field of the users collection.
        current_user: Authenticated caller; must hold the ADMIN role.
        db: Database handle.

    Returns:
        A confirmation message, the temporary password in plain text, and a
        note asking the user to change it.

    Raises:
        HTTPException: 400 when an admin targets their own account; 404 when
            no user has this ID.
    """
    # Admins must not reset their own password through this endpoint
    if user_id == str(current_user.id):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Cannot reset your own password this way",
        )

    # Check if user exists
    target = await db.users.find_one({"_id": user_id})
    if not target:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found",
        )

    import secrets
    import string

    # 16 random characters from letters, digits and a few symbols
    alphabet = string.ascii_letters + string.digits + "!@#$%"
    temp_password = ''.join(secrets.choice(alphabet) for _ in range(16))

    # Store only the hash of the temporary password
    changes = {
        "hashed_password": get_password_hash(temp_password),
        "updated_at": datetime.utcnow(),
    }
    await db.users.update_one({"_id": user_id}, {"$set": changes})

    # TODO: In production, send via secure email instead of returning password
    logger.info(f"Admin {current_user.id} reset password for user {user_id}")

    return {
        "message": "Password reset successfully",
        "temporary_password": temp_password,
        "note": "User should change this password immediately",
    }
|
||||
|
||||
|
||||
@router.get("/audit-logs")
async def get_audit_logs(
    job_id: Optional[str] = Query(None),
    action: Optional[str] = Query(None),
    days: int = Query(7, ge=1, le=90),
    page: int = Query(1, ge=1),
    size: int = Query(50, ge=1, le=200),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Return one page of recent audit log entries, newest first (admin only).

    NOTE(review): a second handler with the same name and the same
    "/audit-logs" GET path is defined later in this module. Routes appear to
    be matched in registration order, so this earlier one would serve the
    path - confirm which of the two is intended to stay.

    Args:
        job_id: When given, only entries whose ``job_id`` equals this value.
        action: When given, only entries whose ``action`` equals this value.
        days: Look-back window in days (1-90), applied to the ``when`` field.
        page: 1-based page number.
        size: Page size (1-200).
        current_user: Authenticated caller; must hold the ADMIN role.
        db: Database handle.

    Returns:
        A dict with the page of raw log documents, the total match count, and
        the requested ``page``, ``size`` and ``period_days``.
    """
    cutoff = datetime.utcnow() - timedelta(days=days)
    filters = {"when": {"$gte": cutoff}}
    if job_id:
        filters["job_id"] = job_id
    if action:
        filters["action"] = action

    # Total number of matches, independent of pagination
    total = await db.audit_logs.count_documents(filters)

    # Fetch the requested page, newest first
    offset = (page - 1) * size
    page_cursor = db.audit_logs.find(filters).sort("when", -1).skip(offset).limit(size)
    logs = await page_cursor.to_list(length=size)

    return {
        "logs": logs,
        "total": total,
        "page": page,
        "size": size,
        "period_days": days,
    }
|
||||
|
||||
|
||||
@router.post("/maintenance/reprocess-job/{job_id}")
async def reprocess_job(
    job_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Reset a job and queue it for processing again (admin emergency function).

    Args:
        job_id: Value matched against the ``_id`` field of the jobs collection.
        current_user: Authenticated caller; must hold the ADMIN role.
        db: Database handle.

    Returns:
        A confirmation message naming the queued job.

    Raises:
        HTTPException: 404 when no job has this ID.
    """
    # Check if job exists
    if not await db.jobs.find_one({"_id": job_id}):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Job not found",
        )

    # Put the job back into "created", clear its error, and append an entry to
    # the review history recording who triggered the reprocessing
    reset_fields = {
        "status": "created",
        "error": None,
        "updated_at": datetime.utcnow(),
    }
    history_entry = {
        "at": datetime.utcnow(),
        "status": "reprocessing",
        "by": str(current_user.id),
        "notes": "Admin-triggered reprocessing",
    }
    await db.jobs.update_one(
        {"_id": job_id},
        {"$set": reset_fields, "$push": {"review.history": history_entry}},
    )

    # Trigger ingestion task (imported here, as in the rest of this module)
    from ...tasks.ingest_and_ai import ingest_and_ai_task
    ingest_and_ai_task.delay(job_id)

    logger.warning(f"Admin {current_user.id} triggered reprocessing for job {job_id}")

    return {"message": f"Job {job_id} queued for reprocessing"}
|
||||
|
||||
|
||||
@router.get("/audit-logs", response_model=AuditLogResponse)
async def get_audit_logs(
    # Time range
    start_date: Optional[datetime] = Query(None, description="Start date for audit logs"),
    end_date: Optional[datetime] = Query(None, description="End date for audit logs"),

    # Filters
    action: Optional[str] = Query(None, description="Filter by action type"),
    severity: Optional[str] = Query(None, description="Filter by severity level"),
    user_email: Optional[str] = Query(None, description="Filter by user email"),
    resource_type: Optional[str] = Query(None, description="Filter by resource type"),
    resource_id: Optional[str] = Query(None, description="Filter by resource ID"),
    success: Optional[bool] = Query(None, description="Filter by success status"),

    # Search
    search: Optional[str] = Query(None, description="Search in description and details"),

    # Pagination
    page: int = Query(1, ge=1, description="Page number"),
    size: int = Query(50, ge=1, le=500, description="Page size"),

    # Sorting
    sort_by: str = Query("timestamp", description="Field to sort by"),
    sort_order: int = Query(-1, ge=-1, le=1, description="Sort order (-1 desc, 1 asc)"),

    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    request: Request = None,
):
    """Query audit logs with filters, search, paging and sorting (admin only).

    The access itself is written to the audit log, together with the filters
    used, before the query runs.

    Returns:
        Whatever ``audit_logger.query_logs`` returns for the assembled
        ``AuditLogQuery``.
    """
    # Filters recorded with the access entry (resource_id and success are not
    # part of the recorded set)
    recorded_filters = {
        "start_date": start_date.isoformat() if start_date else None,
        "end_date": end_date.isoformat() if end_date else None,
        "action": action,
        "severity": severity,
        "user_email": user_email,
        "resource_type": resource_type,
        "search": search,
    }

    # Log audit log access
    await audit_logger.log_action(
        action="admin.audit.access",
        description=f"Admin {current_user.email} accessed audit logs",
        user=current_user,
        request=request,
        details={"filters": recorded_filters},
    )

    # Translate page/size into skip/limit and hand everything to the logger
    offset = (page - 1) * size
    log_query = AuditLogQuery(
        start_date=start_date,
        end_date=end_date,
        action=action,
        severity=severity,
        user_email=user_email,
        resource_type=resource_type,
        resource_id=resource_id,
        success=success,
        search=search,
        skip=offset,
        limit=size,
        sort_by=sort_by,
        sort_order=sort_order,
    )

    return await audit_logger.query_logs(log_query)
|
||||
|
||||
|
||||
@router.get("/audit-logs/user/{user_id}")
async def get_user_audit_logs(
    user_id: str,
    days: int = Query(30, ge=1, le=365, description="Number of days to look back"),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    request: Request = None,
):
    """Return the audit trail of one user (admin only).

    The access itself is written to the audit log before the trail is read.

    Args:
        user_id: ID of the user; must parse as an ObjectId.
        days: Look-back window in days (1-365).
        current_user: Authenticated caller; must hold the ADMIN role.
        request: Incoming request, passed through to the audit logger.

    Returns:
        A dict with the log entries plus the ``user_id`` and ``days`` requested.

    Raises:
        HTTPException: 400 when ``user_id`` is not a valid ObjectId.
    """
    # Validate user_id: any failure to parse it is reported as a bad request
    try:
        ObjectId(user_id)
    except Exception:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid user ID format",
        )

    # Log access to user audit logs
    access_description = f"Admin {current_user.email} accessed user audit logs for {user_id}"
    await audit_logger.log_action(
        action="admin.audit.access",
        description=access_description,
        user=current_user,
        request=request,
        resource_type="user",
        resource_id=user_id,
        details={"days_requested": days},
    )

    activity = await audit_logger.get_user_activity(user_id, days)
    return {"logs": activity, "user_id": user_id, "days": days}
|
||||
|
||||
|
||||
@router.get("/audit-logs/security")
async def get_security_events(
    hours: int = Query(24, ge=1, le=168, description="Number of hours to look back"),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    request: Request = None,
):
    """Return recent security events (admin only).

    The access is written to the audit log first, then the events for the
    last ``hours`` hours are fetched from ``audit_logger``.

    Returns:
        A dict with the keys ``logs`` and ``hours``.
    """
    # Record who looked at the security events and for which window.
    access_details = {"hours_requested": hours}
    await audit_logger.log_action(
        action="admin.audit.access",
        description=f"Admin {current_user.email} accessed security events",
        user=current_user,
        request=request,
        details=access_details,
    )

    recent_events = await audit_logger.get_security_events(hours)
    return {"logs": recent_events, "hours": hours}
|
||||
|
||||
|
||||
@router.delete("/audit-logs/cleanup")
async def cleanup_audit_logs(
    retention_days: int = Query(365, ge=30, le=2555, description="Retention period in days"),
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
    request: Request = None,
):
    """Delete audit logs older than ``retention_days`` (admin only).

    Two audit entries are written: one before the cleanup starts (severity
    ``warning``) and one after it finishes, carrying the deleted count.

    Returns:
        A dict with the keys ``message``, ``deleted_count`` and
        ``retention_days``.
    """
    system_action = "admin.system.action"

    # Entry 1: the cleanup request itself, logged before anything is deleted.
    await audit_logger.log_action(
        action=system_action,
        description=f"Admin {current_user.email} initiated audit log cleanup",
        user=current_user,
        request=request,
        details={"retention_days": retention_days},
        severity="warning",
    )

    deleted_count = await audit_logger.cleanup_old_logs(retention_days)

    # Entry 2: the outcome of the cleanup.
    completion_details = {
        "retention_days": retention_days,
        "deleted_count": deleted_count,
    }
    await audit_logger.log_action(
        action=system_action,
        description=f"Audit log cleanup completed: {deleted_count} logs deleted",
        user=current_user,
        request=request,
        details=completion_details,
    )

    result = {
        "message": f"Deleted {deleted_count} audit logs older than {retention_days} days",
        "deleted_count": deleted_count,
        "retention_days": retention_days,
    }
    return result
|
||||
161
backend/app/api/v1/routes_auth.py
Normal file
161
backend/app/api/v1/routes_auth.py
Normal file
|
|
@ -0,0 +1,161 @@
|
|||
from fastapi import APIRouter, Depends, HTTPException, Request, Response, status
|
||||
from fastapi.security import HTTPBearer
|
||||
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
|
||||
|
||||
from ...core.config import settings
|
||||
from ...core.database import get_database
|
||||
from ...core.security import (
|
||||
create_access_token,
|
||||
create_refresh_token,
|
||||
decode_token,
|
||||
verify_password,
|
||||
)
|
||||
from ...models.user import User
|
||||
from ...schemas.auth import LoginRequest, LoginResponse, LogoutResponse, RefreshResponse
|
||||
|
||||
router = APIRouter(prefix="/auth", tags=["auth"])
|
||||
security = HTTPBearer()
|
||||
|
||||
|
||||
@router.post("/login", response_model=LoginResponse)
async def login(
    login_data: LoginRequest,
    response: Response,
):
    """Authenticate a user by email and password.

    On success an access token is returned in the body and a refresh token
    is set as an HttpOnly cookie.

    Args:
        login_data: Submitted credentials (``email`` and ``password``).
        response: Outgoing response, used to attach the refresh cookie.

    Returns:
        ``LoginResponse`` with the access token, user id and role.

    Raises:
        HTTPException: 401 for an unknown email, a wrong password, or a
            disabled account.
    """
    # Local import: this module has no logging import of its own.
    import logging

    # Debug-level logging replaces the former print() calls, which wrote the
    # submitted email address to stdout on every login attempt.
    log = logging.getLogger(__name__)
    log.debug("LOGIN: Starting login")

    # Create database connection directly (bypass dependency injection issues)
    client = AsyncIOMotorClient(settings.mongodb_uri)

    try:
        db = client[settings.mongodb_db]

        # Find user by email
        user_doc = await db.users.find_one({"email": login_data.email})
        log.debug("LOGIN: User lookup complete, found: %s", user_doc is not None)
        if not user_doc:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Incorrect email or password",
            )

        user = User(**user_doc)

        # Verify password (same message as "unknown email" on purpose)
        if not verify_password(login_data.password, user.hashed_password):
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Incorrect email or password",
            )

        if not user.is_active:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="User account is disabled",
            )

        # Create tokens
        access_token = create_access_token(subject=str(user.id))
        refresh_token = create_refresh_token(subject=str(user.id))

        # Set refresh token as HttpOnly cookie; the cookie domain is only
        # pinned when app_env is "prod".
        response.set_cookie(
            key="refresh_token",
            value=refresh_token,
            httponly=True,
            secure=settings.cookie_secure,
            samesite=settings.cookie_samesite,
            domain=settings.cookie_domain if settings.app_env == "prod" else None,
            max_age=settings.jwt_refresh_ttl_days * 24 * 60 * 60,
        )

        return LoginResponse(
            access_token=access_token,
            user_id=str(user.id),
            role=user.role,
        )

    finally:
        # Close the per-request database connection on every exit path
        client.close()
|
||||
|
||||
|
||||
@router.post("/refresh", response_model=RefreshResponse)
async def refresh_token(
    request: Request,
    response: Response,
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Exchange the refresh-token cookie for a new access token.

    The refresh token is rotated: a new one replaces the cookie on success.

    Args:
        request: Incoming request carrying the ``refresh_token`` cookie.
        response: Outgoing response, used to set the rotated cookie.
        db: Database handle used to re-check the user.

    Returns:
        ``RefreshResponse`` with the new access token.

    Raises:
        HTTPException: 401 when the cookie is missing, the token is invalid
            or of the wrong type, or the user is missing or disabled.
    """
    # Local import, as routes_files does: this module does not import bson.
    from bson import ObjectId

    token = request.cookies.get("refresh_token")
    if not token:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Refresh token not found",
        )

    try:
        payload = decode_token(token)
        if payload.get("type") != "refresh":
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid token type",
            )

        user_id = payload.get("sub")
        if not user_id or not ObjectId.is_valid(user_id):
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid token",
            )

        # Verify user still exists and is active. The subject is the string
        # form of the user's _id, so convert it back to an ObjectId exactly
        # as get_current_user does; querying with the raw string cannot match
        # a document keyed by ObjectId.
        user_doc = await db.users.find_one({"_id": ObjectId(user_id)})
        if not user_doc:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="User not found",
            )

        user = User(**user_doc)
        if not user.is_active:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="User account is disabled",
            )

        # Create new tokens
        new_access_token = create_access_token(subject=user_id)
        new_refresh_token = create_refresh_token(subject=user_id)

        # Update refresh token cookie
        response.set_cookie(
            key="refresh_token",
            value=new_refresh_token,
            httponly=True,
            secure=settings.cookie_secure,
            samesite=settings.cookie_samesite,
            domain=settings.cookie_domain if settings.app_env == "prod" else None,
            max_age=settings.jwt_refresh_ttl_days * 24 * 60 * 60,
        )

        return RefreshResponse(access_token=new_access_token)

    except HTTPException:
        # Keep the specific 401 raised above; previously the blanket handler
        # below replaced every one of them with "Invalid refresh token".
        raise
    except Exception as exc:
        # Anything unexpected is still reported as an invalid token, but the
        # original cause is chained so it stays visible in tracebacks.
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid refresh token",
        ) from exc
|
||||
|
||||
|
||||
@router.post("/logout", response_model=LogoutResponse)
async def logout(response: Response):
    """Log the client out by deleting the ``refresh_token`` cookie.

    The cookie attributes mirror the ones used when the cookie is set, and
    the domain is only supplied when ``app_env`` is ``"prod"``.
    """
    if settings.app_env == "prod":
        cookie_domain = settings.cookie_domain
    else:
        cookie_domain = None

    # Clear refresh token cookie
    response.delete_cookie(
        key="refresh_token",
        httponly=True,
        secure=settings.cookie_secure,
        samesite=settings.cookie_samesite,
        domain=cookie_domain,
    )

    return LogoutResponse()
|
||||
51
backend/app/api/v1/routes_files.py
Normal file
51
backend/app/api/v1/routes_files.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from motor.motor_asyncio import AsyncIOMotorDatabase
|
||||
|
||||
from ...core.database import get_database
|
||||
from ...core.dependencies import get_current_user
|
||||
from ...models.user import User
|
||||
from ...schemas.file import SignedUploadRequest, SignedUploadResponse
|
||||
from ...services.gcs import generate_signed_upload_url
|
||||
|
||||
router = APIRouter(prefix="/files", tags=["files"])
|
||||
|
||||
|
||||
@router.post("/signed-upload", response_model=SignedUploadResponse)
async def get_signed_upload_url(
    request: SignedUploadRequest,
    current_user: User = Depends(get_current_user),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """
    Generate a signed URL for direct browser-to-GCS upload
    This optimizes large file uploads by bypassing the API server

    Args:
        request: Upload description (``filename``, ``content_type`` and an
            optional ``max_size``).
        current_user: Authenticated caller; only required to be present.
        db: Database dependency (not used by this handler).

    Returns:
        ``SignedUploadResponse`` with the upload URL, form fields and the
        blob path the upload will be stored under.

    Raises:
        HTTPException: 400 for a non-video content type or an unusable
            filename; 500 when the signed URL cannot be generated.
    """
    from pathlib import PurePosixPath

    from bson import ObjectId

    if not request.content_type.startswith("video/"):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Only video files are supported"
        )

    # The filename comes from the client: keep only its final path component
    # so directory parts (including Windows-style separators) cannot shape
    # the blob path.
    safe_name = PurePosixPath(request.filename.replace("\\", "/")).name
    if safe_name in ("", ".", ".."):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid filename"
        )

    # Generate unique blob path
    blob_path = f"temp/{ObjectId()}/{safe_name}"

    try:
        # Generate signed upload URL with form fields
        signed_data = await generate_signed_upload_url(
            blob_path=blob_path,
            content_type=request.content_type,
            max_size=request.max_size or 1024 * 1024 * 1024  # 1GB default
        )

        return SignedUploadResponse(
            upload_url=signed_data["url"],
            fields=signed_data["fields"],
            blob_path=blob_path
        )

    except Exception as e:
        # NOTE(review): the detail exposes the internal error text to the
        # client; consider logging it and returning a generic message.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate signed upload URL: {str(e)}"
        ) from e
|
||||
1033
backend/app/api/v1/routes_jobs.py
Normal file
1033
backend/app/api/v1/routes_jobs.py
Normal file
File diff suppressed because it is too large
Load diff
BIN
backend/app/core/__pycache__/config.cpython-313.pyc
Normal file
BIN
backend/app/core/__pycache__/config.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/core/__pycache__/database.cpython-313.pyc
Normal file
BIN
backend/app/core/__pycache__/database.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/core/__pycache__/dependencies.cpython-313.pyc
Normal file
BIN
backend/app/core/__pycache__/dependencies.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/core/__pycache__/logging.cpython-313.pyc
Normal file
BIN
backend/app/core/__pycache__/logging.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/core/__pycache__/redis.cpython-313.pyc
Normal file
BIN
backend/app/core/__pycache__/redis.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/core/__pycache__/secrets_config.cpython-313.pyc
Normal file
BIN
backend/app/core/__pycache__/secrets_config.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/core/__pycache__/security.cpython-313.pyc
Normal file
BIN
backend/app/core/__pycache__/security.cpython-313.pyc
Normal file
Binary file not shown.
77
backend/app/core/config.py
Normal file
77
backend/app/core/config.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration declared as a pydantic-settings model.

    Fields declared without a default (``jwt_secret``, ``mongodb_uri``,
    ``redis_url``, ``gcp_project_id``, ``gemini_api_key``,
    ``sendgrid_api_key``, ``email_from``, ``client_base_url``) have no
    fallback value here and must be supplied externally. The inner
    ``Config`` names ``.env`` as the env file.
    """

    # App
    app_env: str = "dev"  # other modules compare this against "prod" / "dev"
    api_base_url: str = "http://localhost:8000"

    # Auth
    jwt_secret: str
    jwt_alg: str = "HS256"
    jwt_access_ttl_min: int = 15  # access-token lifetime, minutes
    jwt_refresh_ttl_days: int = 7  # refresh-token lifetime, days
    cookie_domain: str = "localhost"
    cookie_secure: bool = False
    cookie_samesite: str = "Lax"

    # Database
    mongodb_uri: str
    mongodb_db: str = "accessible_video"

    # Redis
    redis_url: str

    # Celery
    celery_broker_url: str = ""
    celery_result_backend: str = ""

    # GCP
    gcp_project_id: str
    gcs_bucket: str = "accessible-video"
    google_application_credentials: str = ""

    # AI Services
    gemini_api_key: str
    translate_api_key: str = ""
    elevenlabs_api_key: str = ""
    google_tts_credentials: str = ""

    # TTS Voice Configuration
    tts_provider: str = "google"  # "google" or "elevenlabs"
    # Locale code -> voice name for the Google provider
    google_tts_voices: dict[str, str] = {
        "en-US": "en-US-Neural2-D",
        "es-ES": "es-ES-Neural2-A",
        "fr-FR": "fr-FR-Neural2-A",
        "de-DE": "de-DE-Neural2-B"
    }
    # Locale code -> voice id for the ElevenLabs provider
    elevenlabs_voices: dict[str, str] = {
        "en-US": "21m00Tcm4TlvDq8ikWAM",
        "es-ES": "VR6AewLTigWG4xSOukaG",
        "fr-FR": "TxGEqnHWrfWFTfGW9XjX",
        "de-DE": "pNInz6obpgDQGcFmaJgB"
    }

    # Email
    sendgrid_api_key: str
    email_from: str
    client_base_url: str

    # Observability
    sentry_dsn: str = ""
    otel_exporter_otlp_endpoint: str = ""

    # CORS
    cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"]

    class Config:
        # File consulted for values in addition to the process environment
        env_file = ".env"
|
||||
|
||||
|
||||
settings = Settings()
|
||||
|
||||
|
||||
def get_settings():
    """Get settings instance - for dependency injection.

    Returns the module-level ``settings`` object; no new instance is created
    per call.
    """
    return settings
|
||||
67
backend/app/core/database.py
Normal file
67
backend/app/core/database.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
|
||||
|
||||
from ..core.logging import get_logger
|
||||
from .config import settings
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class MongoDB:
    """Holder for the Motor client and database handle.

    A single module-level instance (``mongodb``) is shared by the connection
    helpers in this module.
    """

    # Both stay None until connect_to_mongo() assigns them.
    client: AsyncIOMotorClient = None
    database: AsyncIOMotorDatabase = None
|
||||
|
||||
|
||||
mongodb = MongoDB()
|
||||
|
||||
|
||||
async def connect_to_mongo():
    """Create the shared Motor client and verify it with a ``ping``.

    The client and database handle are stored on ``mongodb`` before the
    connection is tested.

    Raises:
        Exception: Whatever the ``ping`` command raised, re-raised after
            being logged.
    """
    logger.info("Connecting to MongoDB...")
    motor_client = AsyncIOMotorClient(settings.mongodb_uri)
    mongodb.client = motor_client
    mongodb.database = motor_client[settings.mongodb_db]

    # Test connection
    try:
        await motor_client.admin.command('ping')
    except Exception as e:
        logger.error(f"Failed to connect to MongoDB: {e}")
        raise
    else:
        logger.info("Successfully connected to MongoDB")
|
||||
|
||||
|
||||
async def close_mongo_connection():
    """Close the shared Motor client if one has been created."""
    logger.info("Closing MongoDB connection...")
    active_client = mongodb.client
    if not active_client:
        # Nothing was ever connected; nothing to close.
        return
    active_client.close()
|
||||
|
||||
|
||||
async def get_database() -> AsyncIOMotorDatabase:
    """Return the shared database handle stored on ``mongodb``.

    Route handlers use this as a FastAPI dependency
    (``Depends(get_database)``). The value is whatever ``mongodb.database``
    currently holds, so it is ``None`` until ``connect_to_mongo()`` has run.
    """
    return mongodb.database
|
||||
|
||||
|
||||
async def create_indexes():
    """Create database indexes as specified in the development plan.

    The indexes are created one at a time, in the order listed below, on the
    ``jobs``, ``users`` and ``audit_logs`` collections.
    """
    db = mongodb.database

    # (collection name, index keys, extra create_index options)
    index_plan = [
        # Jobs collection indexes
        ("jobs", [("status", 1), ("created_at", -1)], {}),
        ("jobs", [("client_id", 1)], {}),
        # Users collection indexes
        ("users", [("email", 1)], {"unique": True}),
        # Audit logs collection indexes - comprehensive indexing for audit queries
        ("audit_logs", [("timestamp", -1)], {}),  # Primary sort field
        ("audit_logs", [("action", 1), ("timestamp", -1)], {}),  # Filter by action
        ("audit_logs", [("user_id", 1), ("timestamp", -1)], {}),  # User activity
        ("audit_logs", [("severity", 1), ("timestamp", -1)], {}),  # Security events
        ("audit_logs", [("resource_type", 1), ("resource_id", 1)], {}),  # Resource tracking
        ("audit_logs", [("ip_address", 1), ("timestamp", -1)], {}),  # IP-based analysis
        ("audit_logs", [("success", 1), ("timestamp", -1)], {}),  # Failed operations
        # Text search index for description and details
        (
            "audit_logs",
            [("description", "text"), ("details", "text"), ("error_message", "text")],
            {},
        ),
    ]

    for collection_name, keys, options in index_plan:
        await db[collection_name].create_index(keys, **options)

    logger.info("Database indexes created successfully")
|
||||
88
backend/app/core/dependencies.py
Normal file
88
backend/app/core/dependencies.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
from typing import Optional
|
||||
|
||||
from bson import ObjectId
|
||||
from fastapi import Depends, HTTPException, Request, status
|
||||
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
||||
from motor.motor_asyncio import AsyncIOMotorDatabase
|
||||
|
||||
from ..models.user import User, UserRole
|
||||
from .database import get_database
|
||||
from .security import decode_token
|
||||
|
||||
security = HTTPBearer()
|
||||
|
||||
|
||||
async def get_current_user(
    credentials: HTTPAuthorizationCredentials = Depends(security),
    db: AsyncIOMotorDatabase = Depends(get_database),
) -> User:
    """Resolve the authenticated user from the bearer access token.

    Args:
        credentials: Bearer credentials extracted by ``HTTPBearer``.
        db: Database handle used to load the user document.

    Returns:
        The ``User`` built from the stored document.

    Raises:
        HTTPException: 401 when the token is invalid, is a refresh token,
            carries no usable subject, or the user is missing or disabled.
    """
    payload = decode_token(credentials.credentials)

    # Refresh tokens are signed with the same secret and only differ by
    # their "type" claim, so they must be rejected here explicitly.
    if payload.get("type") == "refresh":
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
        )

    user_id: str = payload.get("sub")

    # A subject that is not a valid ObjectId would make ObjectId() raise and
    # surface as a 500; treat it as an invalid credential instead.
    if user_id is None or not ObjectId.is_valid(user_id):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
        )

    user_doc = await db.users.find_one({"_id": ObjectId(user_id)})
    if user_doc is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="User not found",
        )

    user = User(**user_doc)

    # Same rule as the login and refresh endpoints: disabled accounts are
    # rejected even while a previously issued token is still valid.
    if not user.is_active:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="User account is disabled",
        )

    return user
|
||||
|
||||
|
||||
def require_role(required_role: UserRole):
    """Build a dependency that admits ``required_role`` or an admin.

    Returns:
        An async dependency that returns the current user, or raises
        ``HTTPException`` 403 when the user's role is neither
        ``required_role`` nor ``UserRole.ADMIN``.
    """
    async def role_checker(current_user: User = Depends(get_current_user)) -> User:
        role = current_user.role
        # Admins pass every role check.
        if role == required_role or role == UserRole.ADMIN:
            return current_user
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions",
        )

    return role_checker
|
||||
|
||||
|
||||
def require_roles(*required_roles: UserRole):
    """Build a dependency that admits any of ``required_roles`` or an admin.

    Returns:
        An async dependency that returns the current user, or raises
        ``HTTPException`` 403 when the user's role is not listed and is not
        ``UserRole.ADMIN``.
    """
    async def roles_checker(current_user: User = Depends(get_current_user)) -> User:
        role = current_user.role
        # Admins pass every role check.
        if role in required_roles or role == UserRole.ADMIN:
            return current_user
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions",
        )

    return roles_checker
|
||||
|
||||
|
||||
async def get_current_user_optional(
    request: Request,
    db: AsyncIOMotorDatabase = Depends(get_database),
) -> Optional[User]:
    """Resolve the user from the Authorization header, if one can be found.

    Unlike ``get_current_user`` this never raises: any problem with the
    header, the token or the lookup results in ``None``.

    Args:
        request: Incoming request whose ``Authorization`` header is read.
        db: Database handle used to load the user document.

    Returns:
        The ``User`` for a valid bearer access token of an active account,
        otherwise ``None``.
    """
    authorization: str = request.headers.get("Authorization")
    if not authorization:
        return None

    try:
        scheme, token = authorization.split()
        if scheme.lower() != "bearer":
            return None

        payload = decode_token(token)

        # Refresh tokens share the signing key with access tokens and differ
        # only by their "type" claim; they do not identify an API caller.
        if payload.get("type") == "refresh":
            return None

        user_id: str = payload.get("sub")
        if user_id is None:
            return None

        user_doc = await db.users.find_one({"_id": ObjectId(user_id)})
        if user_doc is None:
            return None

        user = User(**user_doc)
        # Disabled accounts are treated as anonymous, matching the
        # is_active checks in the login and refresh endpoints.
        return user if user.is_active else None
    except Exception:
        # Deliberate best-effort: optional authentication must never fail
        # the request, so every error is treated as "no user".
        return None
|
||||
65
backend/app/core/logging.py
Normal file
65
backend/app/core/logging.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
import logging
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
|
||||
class StructuredFormatter(logging.Formatter):
    """Formatter that renders each log record as one JSON object per line.

    The object always has ``timestamp``, ``level``, ``logger`` and
    ``message``. Entries from the record's ``extra_fields`` attribute (when
    present) are merged in, and ``exception`` holds the formatted traceback
    when the record carries exception info.
    """

    def format(self, record: logging.LogRecord) -> str:
        """Return ``record`` serialized as a JSON string."""
        # Local import: the module's import block does not include json.
        import json

        log_entry = {
            "timestamp": self.formatTime(record),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }

        if hasattr(record, "extra_fields"):
            log_entry.update(record.extra_fields)

        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)

        # str(dict) produced a Python repr (single quotes, None/True), which
        # log collectors cannot parse. default=str is deliberate: a context
        # value that is not JSON-serializable must not make logging fail.
        return json.dumps(log_entry, default=str)
|
||||
|
||||
|
||||
def setup_logging() -> None:
    """Configure the root logger for structured output on stdout.

    Sets the root level to INFO, replaces every handler already attached to
    the root logger with a single stdout handler using
    ``StructuredFormatter``, and raises the ``uvicorn.access`` and ``httpx``
    loggers to WARNING.
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    # Remove default handlers (iterate over a copy while removing)
    for existing in list(root.handlers):
        root.removeHandler(existing)

    # Add structured handler
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(StructuredFormatter())
    root.addHandler(stdout_handler)

    # Set levels for third-party loggers
    for noisy_logger in ("uvicorn.access", "httpx"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """Return the standard-library logger registered under ``name``.

    Thin wrapper around ``logging.getLogger`` so callers import logging
    helpers from one place.
    """
    return logging.getLogger(name)
|
||||
|
||||
|
||||
class LogContext:
    """Logger wrapper that attaches a fixed set of context fields.

    The context given at construction is merged with per-call keyword
    arguments (per-call values win) and attached to the log record as its
    ``extra_fields`` attribute.
    """

    def __init__(self, logger: logging.Logger, **context: Any):
        self.logger = logger
        self.context = context

    def info(self, message: str, **extra: Any) -> None:
        """Log ``message`` at INFO level with the merged context."""
        self._log(logging.INFO, message, **extra)

    def warning(self, message: str, **extra: Any) -> None:
        """Log ``message`` at WARNING level with the merged context."""
        self._log(logging.WARNING, message, **extra)

    def error(self, message: str, **extra: Any) -> None:
        """Log ``message`` at ERROR level with the merged context."""
        self._log(logging.ERROR, message, **extra)

    def _log(self, level: int, message: str, **extra: Any) -> None:
        """Emit one record carrying ``extra_fields``."""
        combined_extra = {**self.context, **extra}
        # Logger.makeRecord() has no ``extra_fields`` keyword, so the former
        # call raised TypeError on every use. Passing the dict through
        # ``extra`` sets record.extra_fields and, unlike handle(), respects
        # the logger's level. stacklevel=3 skips _log() and the public
        # wrapper so the record points at the real call site.
        self.logger.log(
            level,
            message,
            extra={"extra_fields": combined_extra},
            stacklevel=3,
        )
|
||||
49
backend/app/core/redis.py
Normal file
49
backend/app/core/redis.py
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
import redis.asyncio as redis
|
||||
|
||||
from .config import settings
|
||||
from .logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class RedisConnection:
    """Holder for the Redis connection pool and client.

    A single module-level instance (``redis_conn``) is shared by the helpers
    in this module.
    """

    # Both stay None until connect_to_redis() assigns them.
    pool: redis.ConnectionPool = None
    client: redis.Redis = None
|
||||
|
||||
|
||||
redis_conn = RedisConnection()
|
||||
|
||||
|
||||
async def connect_to_redis():
    """Create the shared Redis pool and client and verify them with a ping.

    The pool is built from ``settings.redis_url`` with UTF-8 decoding of
    responses and at most 20 connections.

    Raises:
        Exception: Whatever the ``ping`` raised, re-raised after being
            logged.
    """
    logger.info("Connecting to Redis...")

    pool_options = {
        "encoding": "utf-8",
        "decode_responses": True,
        "max_connections": 20,
    }
    redis_conn.pool = redis.ConnectionPool.from_url(settings.redis_url, **pool_options)
    redis_conn.client = redis.Redis(connection_pool=redis_conn.pool)

    # Test connection
    try:
        await redis_conn.client.ping()
    except Exception as e:
        logger.error(f"Failed to connect to Redis: {e}")
        raise
    else:
        logger.info("Successfully connected to Redis")
|
||||
|
||||
|
||||
async def close_redis_connection():
    """Close the shared Redis client and disconnect its pool, if present."""
    logger.info("Closing Redis connection...")

    # Client first, then the pool it was created on.
    active_client = redis_conn.client
    if active_client:
        await active_client.close()

    active_pool = redis_conn.pool
    if active_pool:
        await active_pool.disconnect()
|
||||
|
||||
|
||||
async def get_redis() -> redis.Redis:
    """Return the shared Redis client stored on ``redis_conn``.

    The value is whatever ``redis_conn.client`` currently holds, so it is
    ``None`` until ``connect_to_redis()`` has run.
    """
    return redis_conn.client
|
||||
|
||||
|
||||
def get_redis_client() -> redis.Redis:
    """Get the Redis client synchronously (for middleware setup).

    Returns the same object as ``get_redis()`` without requiring an await;
    it is ``None`` until ``connect_to_redis()`` has run.
    """
    return redis_conn.client
|
||||
145
backend/app/core/secrets_config.py
Normal file
145
backend/app/core/secrets_config.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
"""Enhanced configuration system with Secret Manager integration."""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
from typing import Dict, Optional, Any
|
||||
from functools import lru_cache
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
from .config import Settings as BaseConfig
|
||||
from .logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class SecretsConfig(BaseConfig):
    """Enhanced configuration that loads secrets from GCP Secret Manager.

    Starts out as a plain ``Settings`` instance; ``load_secrets()`` then
    overrides selected fields with values fetched from the secrets manager.
    """

    def __init__(self, **kwargs):
        # Initialize with base configuration first
        super().__init__(**kwargs)

        # Flag to track if secrets have been loaded
        self._secrets_loaded = False
        # Config field name -> value that came from Secret Manager
        self._secret_values: Dict[str, str] = {}

    async def load_secrets(self) -> None:
        """Load secrets from Secret Manager asynchronously.

        Does nothing when secrets were already loaded (or a previous attempt
        failed). Failures are logged as warnings and never raised; the
        values already present on the instance are kept in that case.
        """
        if self._secrets_loaded:
            return

        try:
            # Only import here to avoid circular imports
            from app.services.secrets_manager import secrets_manager

            # Define which config fields should be loaded from secrets
            secret_mappings = {
                # Config field -> Secret Manager name
                "jwt_secret": "jwt-secret",
                "jwt_refresh_secret": "jwt-refresh-secret",
                "mongodb_uri": "mongodb-url",
                "redis_url": "redis-url",
                "gemini_api_key": "gemini-api-key",
                "sendgrid_api_key": "sendgrid-api-key",
                "elevenlabs_api_key": "elevenlabs-api-key",
                "sentry_dsn": "sentry-dsn"
            }

            # Get all secrets in batch
            secret_names = list(secret_mappings.values())
            retrieved_secrets = await secrets_manager.get_secrets_batch(secret_names)

            # Map secrets back to config fields
            for config_field, secret_name in secret_mappings.items():
                if secret_name in retrieved_secrets:
                    self._secret_values[config_field] = retrieved_secrets[secret_name]
                    # Override the config value
                    setattr(self, config_field, retrieved_secrets[secret_name])
                    logger.debug(f"Loaded secret for {config_field}")
                else:
                    logger.warning(f"Secret {secret_name} not available, using environment/default")

            self._secrets_loaded = True
            logger.info(f"Successfully loaded {len(retrieved_secrets)} secrets from Secret Manager")

        except Exception as e:
            logger.warning(f"Failed to load secrets from Secret Manager: {e}")
            logger.warning("Falling back to environment variables")
            self._secrets_loaded = True  # Mark as loaded to prevent retries

    def get_secret_value(self, field_name: str) -> Optional[str]:
        """Get a secret value if it was loaded from Secret Manager.

        Returns ``None`` for fields that were not overridden by a secret.
        """
        return self._secret_values.get(field_name)

    async def refresh_secrets(self) -> None:
        """Force refresh secrets from Secret Manager.

        Resets the loaded flag and the recorded secret values, clears the
        secrets manager cache, then runs ``load_secrets()`` again.
        """
        self._secrets_loaded = False
        self._secret_values.clear()

        # Clear the secrets manager cache
        from app.services.secrets_manager import secrets_manager
        secrets_manager.clear_cache()

        await self.load_secrets()

    @property
    def is_production(self) -> bool:
        """Check if running in production environment."""
        return self.app_env == "prod"

    @property
    def is_development(self) -> bool:
        """Check if running in development environment."""
        return self.app_env == "dev"

    @property
    def google_cloud_project(self) -> str:
        """Get Google Cloud Project ID."""
        return self.gcp_project_id

    @property
    def jwt_refresh_secret(self) -> str:
        """Get JWT refresh secret (fallback to main secret if not set)."""
        return getattr(self, '_jwt_refresh_secret', self.jwt_secret)

    @jwt_refresh_secret.setter
    def jwt_refresh_secret(self, value: str) -> None:
        """Set JWT refresh secret."""
        self._jwt_refresh_secret = value
|
||||
|
||||
|
||||
# Global configuration instance
|
||||
_config_instance: Optional[SecretsConfig] = None
|
||||
|
||||
|
||||
async def initialize_config() -> SecretsConfig:
    """Initialize configuration with secrets loading.

    Creates the global ``SecretsConfig`` instance if it does not exist yet
    and makes sure its secrets have been loaded.

    Returns:
        The global configuration instance.
    """
    global _config_instance

    if _config_instance is None:
        _config_instance = SecretsConfig()

    # Always attempt the load, not only for a freshly created instance: the
    # module-level ``settings = get_settings()`` already creates the
    # instance (without secrets) at import time, so a load tied to the
    # ``is None`` branch would never run. load_secrets() returns immediately
    # once secrets are marked as loaded, so repeated calls are cheap.
    await _config_instance.load_secrets()

    return _config_instance
|
||||
|
||||
|
||||
def get_settings() -> SecretsConfig:
    """Return the global settings instance without awaiting anything.

    When no instance exists yet, one is created on the spot without secrets
    from Secret Manager and a warning is logged.
    """
    global _config_instance

    if _config_instance is not None:
        return _config_instance

    # Initialize without secrets for backwards compatibility
    _config_instance = SecretsConfig()
    logger.warning("Settings accessed before async initialization - secrets not loaded")
    return _config_instance
|
||||
|
||||
|
||||
@lru_cache()
def get_settings_cached() -> SecretsConfig:
    """Get cached settings instance.

    ``lru_cache`` memoizes the first object returned by ``get_settings()``;
    later calls return that same object without calling it again.
    """
    return get_settings()
|
||||
|
||||
|
||||
# Backwards compatibility
|
||||
settings = get_settings()
|
||||
55
backend/app/core/security.py
Normal file
55
backend/app/core/security.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
from datetime import datetime, timedelta
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from fastapi import HTTPException, status
|
||||
from jose import JWTError, jwt
|
||||
from passlib.context import CryptContext
|
||||
|
||||
from .config import settings
|
||||
|
||||
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
|
||||
|
||||
|
||||
def create_access_token(
    subject: Union[str, Any], expires_delta: Optional[timedelta] = None
) -> str:
    """Create a signed JWT access token for ``subject``.

    Args:
        subject: Value stored (as a string) in the ``sub`` claim.
        expires_delta: Lifetime of the token; when omitted or zero,
            ``settings.jwt_access_ttl_min`` minutes are used.

    Returns:
        The encoded JWT.
    """
    # Local import: the module's import block only brings in datetime and
    # timedelta.
    from datetime import timezone

    lifetime = expires_delta or timedelta(minutes=settings.jwt_access_ttl_min)
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # value; an aware UTC timestamp encodes to the same "exp" claim.
    expire = datetime.now(timezone.utc) + lifetime

    to_encode = {"exp": expire, "sub": str(subject)}
    return jwt.encode(to_encode, settings.jwt_secret, algorithm=settings.jwt_alg)
|
||||
|
||||
|
||||
def create_refresh_token(
    subject: Union[str, Any], expires_delta: Optional[timedelta] = None
) -> str:
    """Create a signed JWT refresh token for ``subject``.

    The payload carries ``"type": "refresh"``, which is how the refresh
    endpoint tells it apart from an access token.

    Args:
        subject: Value stored (as a string) in the ``sub`` claim.
        expires_delta: Lifetime of the token; when omitted or zero,
            ``settings.jwt_refresh_ttl_days`` days are used.

    Returns:
        The encoded JWT.
    """
    # Local import: the module's import block only brings in datetime and
    # timedelta.
    from datetime import timezone

    lifetime = expires_delta or timedelta(days=settings.jwt_refresh_ttl_days)
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # value; an aware UTC timestamp encodes to the same "exp" claim.
    expire = datetime.now(timezone.utc) + lifetime

    to_encode = {"exp": expire, "sub": str(subject), "type": "refresh"}
    return jwt.encode(to_encode, settings.jwt_secret, algorithm=settings.jwt_alg)
|
||||
|
||||
|
||||
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Return whether ``plain_password`` matches ``hashed_password``.

    Delegates to the module's passlib ``pwd_context`` (bcrypt scheme).
    """
    return pwd_context.verify(plain_password, hashed_password)
|
||||
|
||||
|
||||
def get_password_hash(password: str) -> str:
    """Return the hash of ``password`` for storage.

    Delegates to the module's passlib ``pwd_context`` (bcrypt scheme).
    """
    return pwd_context.hash(password)
|
||||
|
||||
|
||||
def decode_token(token: str) -> dict[str, Any]:
    """Verify ``token`` and return its claims.

    The token is checked against ``settings.jwt_secret`` using the single
    algorithm named by ``settings.jwt_alg``.

    Raises:
        HTTPException: 401 when the library rejects the token with a
            ``JWTError``.
    """
    try:
        claims = jwt.decode(token, settings.jwt_secret, algorithms=[settings.jwt_alg])
    except JWTError:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
        )
    return claims
|
||||
BIN
backend/app/lib/__pycache__/vtt.cpython-313.pyc
Normal file
BIN
backend/app/lib/__pycache__/vtt.cpython-313.pyc
Normal file
Binary file not shown.
222
backend/app/lib/vtt.py
Normal file
222
backend/app/lib/vtt.py
Normal file
|
|
@ -0,0 +1,222 @@
|
|||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class VTTCue:
    """One WebVTT cue: a time range, its text and an optional identifier."""

    start_time: float  # seconds
    end_time: float  # seconds
    text: str  # cue payload; multiple lines are joined with "\n" by the parser
    identifier: str | None = None  # optional line preceding the timing line
|
||||
|
||||
|
||||
class VTTParser:
|
||||
"""Parser and builder for WebVTT files"""
|
||||
|
||||
@staticmethod
|
||||
def parse(vtt_content: str) -> list[VTTCue]:
|
||||
"""Parse VTT content into a list of cues"""
|
||||
lines = vtt_content.strip().split('\n')
|
||||
cues = []
|
||||
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
line = lines[i].strip()
|
||||
|
||||
# Skip WEBVTT header, empty lines, and NOTE lines
|
||||
if line == "WEBVTT" or line == "" or line.startswith("NOTE"):
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Check if this line is a cue identifier (optional)
|
||||
identifier = None
|
||||
if " --> " not in line and i + 1 < len(lines) and " --> " in lines[i + 1]:
|
||||
identifier = line
|
||||
i += 1
|
||||
line = lines[i].strip()
|
||||
|
||||
# Parse timing line
|
||||
if " --> " in line:
|
||||
timing_match = re.match(r'([\d:.,]+)\s+-->\s+([\d:.,]+)', line)
|
||||
if timing_match:
|
||||
start_time = VTTParser._parse_timestamp(timing_match.group(1))
|
||||
end_time = VTTParser._parse_timestamp(timing_match.group(2))
|
||||
|
||||
# Collect text lines until empty line or next cue
|
||||
i += 1
|
||||
text_lines = []
|
||||
while i < len(lines) and lines[i].strip() != "":
|
||||
text_lines.append(lines[i].strip())
|
||||
i += 1
|
||||
|
||||
if text_lines:
|
||||
cues.append(VTTCue(
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
text="\n".join(text_lines),
|
||||
identifier=identifier
|
||||
))
|
||||
else:
|
||||
i += 1
|
||||
|
||||
return cues
|
||||
|
||||
@staticmethod
|
||||
def build(cues: list[VTTCue]) -> str:
|
||||
"""Build VTT content from a list of cues"""
|
||||
lines = ["WEBVTT", ""]
|
||||
|
||||
for cue in cues:
|
||||
# Add identifier if present
|
||||
if cue.identifier:
|
||||
lines.append(cue.identifier)
|
||||
|
||||
# Add timing line
|
||||
start_timestamp = VTTParser._format_timestamp(cue.start_time)
|
||||
end_timestamp = VTTParser._format_timestamp(cue.end_time)
|
||||
lines.append(f"{start_timestamp} --> {end_timestamp}")
|
||||
|
||||
# Add text (can be multi-line)
|
||||
lines.append(cue.text)
|
||||
lines.append("") # Empty line between cues
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
@staticmethod
|
||||
def _parse_timestamp(timestamp: str) -> float:
|
||||
"""Convert VTT timestamp (HH:MM:SS.mmm or MM:SS.mmm) to seconds"""
|
||||
# Clean up timestamp (handle both . and , as decimal separator)
|
||||
timestamp = timestamp.replace(',', '.')
|
||||
|
||||
# Split by colon
|
||||
parts = timestamp.split(':')
|
||||
|
||||
if len(parts) == 3: # HH:MM:SS.mmm
|
||||
hours, minutes, seconds = parts
|
||||
elif len(parts) == 2: # MM:SS.mmm
|
||||
hours, minutes, seconds = "0", parts[0], parts[1]
|
||||
else:
|
||||
raise ValueError(f"Invalid timestamp format: {timestamp}")
|
||||
|
||||
# Parse seconds and decimal part
|
||||
sec_parts = seconds.split('.')
|
||||
whole_seconds = int(sec_parts[0])
|
||||
decimal_part = int(sec_parts[1]) if len(sec_parts) > 1 else 0
|
||||
|
||||
# Convert to total seconds
|
||||
total_seconds = (
|
||||
int(hours) * 3600 +
|
||||
int(minutes) * 60 +
|
||||
whole_seconds +
|
||||
decimal_part / 1000.0
|
||||
)
|
||||
|
||||
return total_seconds
|
||||
|
||||
@staticmethod
|
||||
def _format_timestamp(seconds: float) -> str:
|
||||
"""Convert seconds to VTT timestamp format (HH:MM:SS.mmm)"""
|
||||
hours = int(seconds // 3600)
|
||||
minutes = int((seconds % 3600) // 60)
|
||||
secs = seconds % 60
|
||||
|
||||
whole_secs = int(secs)
|
||||
milliseconds = int((secs - whole_secs) * 1000)
|
||||
|
||||
return f"{hours:02d}:{minutes:02d}:{whole_secs:02d}.{milliseconds:03d}"
|
||||
|
||||
|
||||
class VTTEditor:
    """Helpers that edit, validate and inspect VTT content through ``VTTParser``."""

    @staticmethod
    def translate_preserving_timing(
        vtt_content: str,
        translated_texts: list[str]
    ) -> str:
        """Swap each cue's text for the matching entry of ``translated_texts``.

        Raises:
            ValueError: If the number of texts differs from the number of cues.
        """
        cues = VTTParser.parse(vtt_content)

        if len(translated_texts) != len(cues):
            raise ValueError(
                f"Text count mismatch: {len(translated_texts)} texts for {len(cues)} cues"
            )

        # Counts are equal here, so pairing cue and text positionally is safe.
        for cue, replacement in zip(cues, translated_texts):
            cue.text = replacement

        return VTTParser.build(cues)

    @staticmethod
    def update_cue_text(vtt_content: str, cue_index: int, new_text: str) -> str:
        """Replace the text of the cue at zero-based ``cue_index``.

        Raises:
            ValueError: If ``cue_index`` is outside the parsed cue list.
        """
        cues = VTTParser.parse(vtt_content)

        if not 0 <= cue_index < len(cues):
            raise ValueError(f"Invalid cue index: {cue_index}")

        cues[cue_index].text = new_text
        return VTTParser.build(cues)

    @staticmethod
    def validate_vtt(vtt_content: str) -> tuple[bool, list[str]]:
        """Check VTT content and return ``(is_valid, error_messages)``.

        Reports a missing ``WEBVTT`` prefix, cues whose start is not before
        their end, cues starting before the previous cue ended, cues with
        blank text, and any exception raised while parsing.
        """
        errors = []

        if not vtt_content.strip().startswith("WEBVTT"):
            errors.append("VTT must start with 'WEBVTT'")

        try:
            previous = None
            for number, cue in enumerate(VTTParser.parse(vtt_content), start=1):
                if cue.start_time >= cue.end_time:
                    errors.append(f"Cue {number}: Start time must be before end time")

                if previous is not None and cue.start_time < previous.end_time:
                    errors.append(f"Cue {number}: Overlapping with previous cue")

                if not cue.text.strip():
                    errors.append(f"Cue {number}: Empty text content")

                previous = cue
        except Exception as e:
            errors.append(f"Parse error: {str(e)}")

        return not errors, errors

    @staticmethod
    def get_cue_count(vtt_content: str) -> int:
        """Return the number of parsed cues, or 0 if parsing raises."""
        try:
            return len(VTTParser.parse(vtt_content))
        except Exception:
            return 0

    @staticmethod
    def get_total_duration(vtt_content: str) -> float:
        """Return the largest cue end time in seconds.

        Returns 0.0 when there are no cues or parsing raises.
        """
        try:
            end_times = (cue.end_time for cue in VTTParser.parse(vtt_content))
            return max(end_times, default=0.0)
        except Exception:
            return 0.0

    @staticmethod
    def adjust_timing_offset(vtt_content: str, offset_seconds: float) -> str:
        """Shift every cue by ``offset_seconds`` and rebuild the VTT text.

        A positive offset moves captions later, a negative one earlier.
        Start times are clamped at 0.0, and each end time is kept at least
        0.5 seconds after its (shifted) start.
        """
        cues = VTTParser.parse(vtt_content)

        for cue in cues:
            shifted_start = max(0.0, cue.start_time + offset_seconds)
            shifted_end = max(shifted_start + 0.5, cue.end_time + offset_seconds)
            cue.start_time, cue.end_time = shifted_start, shifted_end

        return VTTParser.build(cues)
|
||||
|
||||
216
backend/app/main.py
Normal file
216
backend/app/main.py
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
from contextlib import asynccontextmanager
|
||||
|
||||
import sentry_sdk
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
from sentry_sdk.integrations.fastapi import FastApiIntegration
|
||||
from sentry_sdk.integrations.redis import RedisIntegration
|
||||
from sentry_sdk.integrations.pymongo import PyMongoIntegration
|
||||
from sentry_sdk.integrations.celery import CeleryIntegration
|
||||
|
||||
from .api.v1.routes_admin import router as admin_router
|
||||
from .api.v1.routes_auth import router as auth_router
|
||||
from .api.v1.routes_files import router as files_router
|
||||
from .api.v1.routes_jobs import router as jobs_router
|
||||
from .core.config import settings
|
||||
from .core.secrets_config import initialize_config
|
||||
from .core.database import close_mongo_connection, connect_to_mongo, create_indexes
|
||||
from .core.logging import setup_logging
|
||||
from .core.redis import close_redis_connection, connect_to_redis, get_redis_client
|
||||
from .middleware import create_rate_limit_middleware, create_validation_middleware
|
||||
from .telemetry import (
|
||||
app_metrics,
|
||||
instrument_dependencies,
|
||||
instrument_fastapi_app,
|
||||
setup_tracing
|
||||
)
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: start dependencies, yield, then shut them down.

    Startup configures logging, loads secrets and starts the Prometheus
    server when ``settings.app_env == "prod"``, initializes Sentry when a
    DSN is configured, connects to MongoDB and Redis, and stores the
    rate-limit and validation middleware instances on ``app.state`` when a
    Redis client is available.

    Shutdown closes the MongoDB and Redis connections. It runs in a
    ``finally`` block so the connections are released even if an exception
    is raised into the generator while the application is running.
    """
    # Startup
    setup_logging()

    # Initialize configuration with secrets
    if settings.app_env == "prod":
        try:
            await initialize_config()
            print("✅ Configuration initialized with Secret Manager")
        except Exception as e:
            # Deliberate best-effort: fall back to environment variables.
            print(f"⚠️ Failed to load secrets from Secret Manager: {e}")
            print("⚠️ Falling back to environment variables")

    # Initialize Sentry error tracking
    if settings.sentry_dsn and settings.sentry_dsn.startswith(('http', 'https')):
        sentry_sdk.init(
            dsn=settings.sentry_dsn,
            integrations=[
                FastApiIntegration(),
                RedisIntegration(),
                PyMongoIntegration(),
                CeleryIntegration(monitor_beat_tasks=True),
            ],
            traces_sample_rate=0.1 if settings.app_env == "prod" else 1.0,
            environment=settings.app_env,
            release="1.0.0",
            attach_stacktrace=True,
            send_default_pii=False,  # Don't send PII for privacy
        )

    # Initialize telemetry (disabled for local development)
    # setup_tracing("accessible-video-api", "1.0.0")
    # instrument_dependencies()

    # Start Prometheus metrics server in production
    if settings.app_env == "prod":
        app_metrics.start_prometheus_server(port=8001)

    await connect_to_mongo()
    await connect_to_redis()
    # await create_indexes()  # Temporarily disabled for debugging

    try:
        # Initialize middleware with Redis client
        redis_client = get_redis_client()
        if redis_client:
            rate_limit_middleware = await create_rate_limit_middleware(redis_client)
            validation_middleware = await create_validation_middleware()

            # Store middleware in app state for access
            app.state.rate_limit_middleware = rate_limit_middleware
            app.state.validation_middleware = validation_middleware

        yield
    finally:
        # Shutdown: close Redis even if closing MongoDB raises.
        try:
            await close_mongo_connection()
        finally:
            await close_redis_connection()
|
||||
|
||||
|
||||
# Application instance; startup and shutdown are driven by ``lifespan``.
app = FastAPI(
    lifespan=lifespan,
    title="Accessible Video API",
    description="API for accessible video processing platform",
    version="1.0.0",
)

# CORS: credentials are allowed for the configured origins only.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins,
    allow_methods=["GET", "POST", "PUT", "PATCH", "DELETE"],
    allow_headers=["*"],
    allow_credentials=True,
)
|
||||
|
||||
# Custom CORS error handler middleware to ensure CORS headers are added to all error responses
|
||||
# It is registered after CORSMiddleware; middleware added later wraps middleware added earlier, so this handler runs outside CORSMiddleware.
|
||||
@app.middleware("http")
async def cors_error_handler(request, call_next):
    """Ensure CORS headers are added to all responses, including errors.

    Any exception escaping ``call_next`` is logged and converted into a
    generic 500 JSON response. For requests whose ``Origin`` header is in
    ``settings.cors_origins`` the allow-origin and allow-credentials
    headers are set, plus allow-methods/allow-headers on responses with
    status >= 400.
    """
    try:
        response = await call_next(request)
    except Exception:
        # Record the failure before masking it as a generic 500; otherwise
        # the original error leaves no trace. Local import because this
        # module's imports do not include ``logging``.
        import logging

        logging.getLogger(__name__).exception(
            "Unhandled exception while processing %s %s",
            request.method,
            request.url.path,
        )
        response = JSONResponse(
            status_code=500,
            content={"detail": "Internal server error"}
        )

    # Always add CORS headers for allowed origins
    origin = request.headers.get("origin")
    if origin and origin in settings.cors_origins:
        response.headers["access-control-allow-origin"] = origin
        response.headers["access-control-allow-credentials"] = "true"
        # Add other necessary CORS headers for error responses
        if response.status_code >= 400:
            response.headers["access-control-allow-methods"] = "GET, POST, PUT, PATCH, DELETE"
            response.headers["access-control-allow-headers"] = "*"

    return response
|
||||
|
||||
# Global exception handler to ensure CORS headers on all errors
|
||||
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    """Render an ``HTTPException`` as JSON and attach CORS headers.

    Headers carried by the exception (for example ``WWW-Authenticate`` on
    a 401) are forwarded to the response instead of being dropped.
    """
    response = JSONResponse(
        status_code=exc.status_code,
        content={"detail": exc.detail},
        # HTTPException may carry response headers; keep them.
        headers=getattr(exc, "headers", None),
    )

    # Add CORS headers
    origin = request.headers.get("origin")
    if origin and origin in settings.cors_origins:
        response.headers["access-control-allow-origin"] = origin
        response.headers["access-control-allow-credentials"] = "true"
        response.headers["access-control-allow-methods"] = "GET, POST, PUT, PATCH, DELETE"
        response.headers["access-control-allow-headers"] = "*"

    return response
|
||||
|
||||
# Global exception handler for validation errors
|
||||
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """Render request validation errors as a 422 JSON response with CORS headers.

    The error list and the offending body are passed through
    ``jsonable_encoder`` first: they can contain values (bytes bodies,
    exception objects inside an error's context) that ``JSONResponse``
    cannot serialize on its own.
    """
    # Local import: ``jsonable_encoder`` is not among this module's imports.
    from fastapi.encoders import jsonable_encoder

    response = JSONResponse(
        status_code=422,
        content=jsonable_encoder({"detail": exc.errors(), "body": exc.body})
    )

    # Add CORS headers
    origin = request.headers.get("origin")
    if origin and origin in settings.cors_origins:
        response.headers["access-control-allow-origin"] = origin
        response.headers["access-control-allow-credentials"] = "true"
        response.headers["access-control-allow-methods"] = "GET, POST, PUT, PATCH, DELETE"
        response.headers["access-control-allow-headers"] = "*"

    return response
|
||||
|
||||
# Add custom middleware (order matters - applied in reverse order)
|
||||
@app.middleware("http")
async def rate_limiting_middleware(request, call_next):
    """Route the request through the rate limiter stored on ``app.state``.

    The login and refresh endpoints bypass the limiter (debugging
    leftover), as does every request while no limiter has been stored.
    """
    is_auth_bypass = request.url.path in ["/api/v1/auth/login", "/api/v1/auth/refresh"]
    if is_auth_bypass or not hasattr(app.state, 'rate_limit_middleware'):
        return await call_next(request)

    return await app.state.rate_limit_middleware(request, call_next)
|
||||
|
||||
@app.middleware("http")
async def validation_middleware(request, call_next):
    """Route the request through the validation middleware stored on ``app.state``.

    The login and refresh endpoints bypass validation (debugging
    leftover), as does every request while no validator has been stored.
    """
    is_auth_bypass = request.url.path in ["/api/v1/auth/login", "/api/v1/auth/refresh"]
    if is_auth_bypass or not hasattr(app.state, 'validation_middleware'):
        return await call_next(request)

    return await app.state.validation_middleware(request, call_next)
|
||||
|
||||
# Instrument FastAPI app for tracing (disabled for local development)
|
||||
# instrument_fastapi_app(app)
|
||||
|
||||
# Include routers
|
||||
# Mount every v1 router under the shared /api/v1 prefix.
app.include_router(router=auth_router, prefix="/api/v1")
app.include_router(router=files_router, prefix="/api/v1")
app.include_router(router=jobs_router, prefix="/api/v1")
app.include_router(router=admin_router, prefix="/api/v1")
|
||||
|
||||
|
||||
@app.get("/health")
async def health_check():
    """Liveness endpoint: report a fixed status and version."""
    body = {"status": "healthy", "version": "1.0.0"}
    return body
|
||||
|
||||
|
||||
@app.get("/metrics")
async def metrics():
    """Serve the current Prometheus metrics in the exposition format."""
    from fastapi import Response
    from prometheus_client import CONTENT_TYPE_LATEST, generate_latest

    exposition = generate_latest()
    return Response(content=exposition, media_type=CONTENT_TYPE_LATEST)
|
||||
12
backend/app/middleware/__init__.py
Normal file
12
backend/app/middleware/__init__.py
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
"""Middleware package for FastAPI application."""
|
||||
|
||||
from .rate_limiting import RateLimitMiddleware, IPWhitelist, create_rate_limit_middleware
|
||||
from .validation import ValidationMiddleware, create_validation_middleware
|
||||
|
||||
__all__ = [
|
||||
"RateLimitMiddleware",
|
||||
"IPWhitelist",
|
||||
"create_rate_limit_middleware",
|
||||
"ValidationMiddleware",
|
||||
"create_validation_middleware"
|
||||
]
|
||||
BIN
backend/app/middleware/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
backend/app/middleware/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/middleware/__pycache__/rate_limiting.cpython-313.pyc
Normal file
BIN
backend/app/middleware/__pycache__/rate_limiting.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/middleware/__pycache__/validation.cpython-313.pyc
Normal file
BIN
backend/app/middleware/__pycache__/validation.cpython-313.pyc
Normal file
Binary file not shown.
264
backend/app/middleware/rate_limiting.py
Normal file
264
backend/app/middleware/rate_limiting.py
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
"""Rate limiting middleware for API endpoints."""
|
||||
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from typing import Dict, Optional, Tuple
|
||||
import redis.asyncio as aioredis
|
||||
from fastapi import HTTPException, Request, status
|
||||
from fastapi.responses import JSONResponse
|
||||
import json
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.telemetry.metrics import track_rate_limit_metrics
|
||||
|
||||
|
||||
class RateLimiter:
    """Redis-based rate limiter with sliding window algorithm."""

    def __init__(self, redis_client: aioredis.Redis):
        self.redis = redis_client

    async def is_allowed(
        self,
        key: str,
        limit: int,
        window_seconds: int,
        identifier: str = ""
    ) -> Tuple[bool, Dict[str, int]]:
        """
        Check if request is allowed under rate limit.

        Each request is recorded in the sorted set stored at ``key`` with
        its timestamp as score; entries older than ``window_seconds`` are
        removed first. The request is allowed when fewer than ``limit``
        requests were already recorded in the window.

        Args:
            key: Redis key of the sorted set for this client/endpoint.
            limit: Maximum number of requests per window.
            window_seconds: Window length in seconds.
            identifier: Client identifier forwarded to the metrics tracker.

        Returns:
            Tuple of (is_allowed, rate_limit_info)
        """
        # Local import: this module's imports do not include ``uuid``.
        import uuid

        now = time.time()
        pipeline = self.redis.pipeline()

        # Remove expired entries
        pipeline.zremrangebyscore(key, 0, now - window_seconds)

        # Count requests already in the window (before adding this one)
        pipeline.zcard(key)

        # Add current request. The member carries a random suffix so two
        # requests with the same timestamp do not overwrite each other and
        # get counted once.
        pipeline.zadd(key, {f"{now}:{uuid.uuid4().hex}": now})

        # Set expiry
        pipeline.expire(key, window_seconds)

        results = await pipeline.execute()
        current_requests = results[1]

        # ``current_requests`` excludes the request being checked, so the
        # comparison must be strict; ``<=`` would let limit + 1 through.
        is_allowed = current_requests < limit

        # Requests consumed from the quota, counting this one if it passes.
        used = current_requests + 1 if is_allowed else current_requests

        rate_limit_info = {
            "limit": limit,
            "remaining": max(0, limit - used),
            "reset_time": int(now + window_seconds),
            "retry_after": 0 if is_allowed else window_seconds
        }

        # Track metrics
        track_rate_limit_metrics(
            identifier=identifier,
            is_allowed=is_allowed,
            current_requests=current_requests,
            limit=limit
        )

        return is_allowed, rate_limit_info
|
||||
|
||||
|
||||
class RateLimitMiddleware:
    """FastAPI middleware for rate limiting."""

    def __init__(self, redis_client: aioredis.Redis):
        self.limiter = RateLimiter(redis_client)
        self.settings = get_settings()

        # Rate limit configurations by endpoint pattern: (limit, window_seconds)
        self.rate_limits = {
            # Authentication endpoints
            "POST:/api/v1/auth/login": (5, 300),  # 5 requests per 5 minutes
            "POST:/api/v1/auth/register": (3, 3600),  # 3 requests per hour
            "POST:/api/v1/auth/refresh": (10, 300),  # 10 requests per 5 minutes
            "POST:/api/v1/auth/forgot-password": (3, 3600),  # 3 requests per hour

            # File upload endpoints
            "POST:/api/v1/files/upload": (10, 3600),  # 10 uploads per hour
            "POST:/api/v1/jobs": (20, 3600),  # 20 job creations per hour

            # Job management endpoints
            "GET:/api/v1/jobs": (100, 300),  # 100 requests per 5 minutes
            "PATCH:/api/v1/jobs/*/approve": (50, 3600),  # 50 approvals per hour
            "PATCH:/api/v1/jobs/*/reject": (50, 3600),  # 50 rejections per hour

            # VTT editing endpoints
            "PATCH:/api/v1/jobs/*/vtt": (100, 3600),  # 100 VTT edits per hour

            # Admin endpoints (more restrictive)
            "GET:/api/v1/admin/*": (50, 300),  # 50 requests per 5 minutes
            "POST:/api/v1/admin/*": (20, 3600),  # 20 admin actions per hour
            "PATCH:/api/v1/admin/*": (20, 3600),  # 20 admin updates per hour
            "DELETE:/api/v1/admin/*": (10, 3600),  # 10 admin deletions per hour
        }

        # Default rate limits
        self.default_limits = {
            "authenticated": (1000, 3600),  # 1000 requests per hour for authenticated users
            "anonymous": (100, 3600),  # 100 requests per hour for anonymous users
        }

    def _get_client_identifier(self, request: Request) -> str:
        """Get client identifier for rate limiting.

        Prefers ``request.state.user.id``; otherwise the first entry of
        ``X-Forwarded-For``; otherwise the peer address (or ``"unknown"``).
        """
        user = getattr(request.state, 'user', None)
        if user:
            return f"user:{user.id}"

        # Fall back to IP address
        # NOTE(review): X-Forwarded-For is client-controlled unless a trusted
        # proxy overwrites it - confirm the deployment guarantees that.
        forwarded_for = request.headers.get("X-Forwarded-For")
        if forwarded_for:
            return f"ip:{forwarded_for.split(',')[0].strip()}"

        client_ip = request.client.host if request.client else "unknown"
        return f"ip:{client_ip}"

    def _get_endpoint_key(self, request: Request) -> str:
        """Get endpoint pattern for rate limiting.

        Returns ``"<METHOD>:<path>"`` with job ids and admin user ids
        replaced by ``*`` so they line up with the keys of ``rate_limits``.
        """
        import re

        method = request.method
        path = request.url.path

        # Replace job IDs with wildcard for pattern matching
        path = re.sub(r'/jobs/[a-f0-9-]+/', '/jobs/*/', path)
        path = re.sub(r'/admin/users/[a-f0-9-]+', '/admin/users/*', path)

        return f"{method}:{path}"

    def _get_rate_limit(self, request: Request) -> Tuple[int, int]:
        """Get rate limit for the current request as ``(limit, window_seconds)``."""
        endpoint_key = self._get_endpoint_key(request)

        # Check for specific endpoint limits
        if endpoint_key in self.rate_limits:
            return self.rate_limits[endpoint_key]

        # Check for wildcard matches (only patterns ending in "*" act as prefixes)
        for pattern, limits in self.rate_limits.items():
            if pattern.endswith("*") and endpoint_key.startswith(pattern[:-1]):
                return limits

        # Use default limits based on authentication
        user = getattr(request.state, 'user', None)
        if user:
            return self.default_limits["authenticated"]
        return self.default_limits["anonymous"]

    async def __call__(self, request: Request, call_next):
        """Process rate limiting for the request.

        Returns a 429 JSON response when the limit is exceeded; otherwise
        forwards the request and adds ``X-RateLimit-*`` headers to the
        response. If the limiter itself fails, the request is forwarded
        unthrottled.
        """
        # Skip rate limiting for health checks and login (temporary for debugging)
        if request.url.path in ["/health", "/metrics", "/api/v1/auth/login"]:
            return await call_next(request)

        client_id = self._get_client_identifier(request)
        endpoint_key = self._get_endpoint_key(request)
        limit, window = self._get_rate_limit(request)

        # Create rate limit key
        rate_limit_key = f"rate_limit:{client_id}:{endpoint_key}"

        # Only the limiter call is guarded. Keeping ``call_next`` outside
        # the try block means an exception raised by the downstream handler
        # is not mistaken for a limiter failure, which would make the
        # request run a second time.
        try:
            is_allowed, rate_info = await self.limiter.is_allowed(
                key=rate_limit_key,
                limit=limit,
                window_seconds=window,
                identifier=client_id
            )
        except Exception as e:
            # Log error but don't block request if rate limiting fails
            print(f"Rate limiting error: {e}")
            return await call_next(request)

        if not is_allowed:
            # Return rate limit exceeded response
            return JSONResponse(
                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
                content={
                    "detail": "Rate limit exceeded",
                    "error_code": "RATE_LIMIT_EXCEEDED",
                    "rate_limit": rate_info
                },
                headers={
                    "X-RateLimit-Limit": str(rate_info["limit"]),
                    "X-RateLimit-Remaining": str(rate_info["remaining"]),
                    "X-RateLimit-Reset": str(rate_info["reset_time"]),
                    "Retry-After": str(rate_info["retry_after"])
                }
            )

        # Process the request
        response = await call_next(request)

        # Add rate limit headers to response
        response.headers["X-RateLimit-Limit"] = str(rate_info["limit"])
        response.headers["X-RateLimit-Remaining"] = str(rate_info["remaining"])
        response.headers["X-RateLimit-Reset"] = str(rate_info["reset_time"])

        return response
|
||||
|
||||
|
||||
class IPWhitelist:
    """IP whitelist for bypassing rate limits.

    Permanent entries live in the Redis set ``whitelist_key``; temporary
    entries are individual keys (``<whitelist_key>:temp:<ip>``) that Redis
    expires on its own.
    """

    def __init__(self, redis_client: aioredis.Redis):
        self.redis = redis_client
        self.whitelist_key = "ip_whitelist"

        # Default whitelisted IPs (health checks, monitoring)
        self.default_whitelist = {
            "127.0.0.1",
            "::1",
            "169.254.169.254",  # GCP metadata server
        }

    def _temp_key(self, ip: str) -> str:
        """Return the Redis key holding the temporary whitelist entry for ``ip``."""
        return f"{self.whitelist_key}:temp:{ip}"

    async def is_whitelisted(self, ip: str) -> bool:
        """Check if IP is whitelisted (default, permanent or unexpired temporary).

        Returns ``False`` when Redis cannot be queried.
        """
        if ip in self.default_whitelist:
            return True

        try:
            if await self.redis.sismember(self.whitelist_key, ip):
                return True
            # Temporary entries are plain keys with a TTL; existence means
            # the entry has not expired yet.
            return bool(await self.redis.exists(self._temp_key(ip)))
        except Exception:
            return False

    async def add_ip(self, ip: str, ttl_seconds: Optional[int] = None) -> bool:
        """Add IP to whitelist.

        With ``ttl_seconds`` the entry is temporary and expires after that
        many seconds; without it the entry is permanent. Returns ``False``
        when Redis raises.
        """
        try:
            if ttl_seconds:
                # Temporary entry only: also adding the IP to the permanent
                # set would make the TTL meaningless.
                await self.redis.setex(self._temp_key(ip), ttl_seconds, "1")
            else:
                await self.redis.sadd(self.whitelist_key, ip)
            return True
        except Exception:
            return False

    async def remove_ip(self, ip: str) -> bool:
        """Remove IP from whitelist (permanent and temporary entries).

        Returns ``False`` when Redis raises.
        """
        try:
            await self.redis.srem(self.whitelist_key, ip)
            await self.redis.delete(self._temp_key(ip))
            return True
        except Exception:
            return False
|
||||
|
||||
|
||||
async def create_rate_limit_middleware(redis_client: aioredis.Redis) -> RateLimitMiddleware:
    """Build a ``RateLimitMiddleware`` bound to ``redis_client``."""
    middleware = RateLimitMiddleware(redis_client)
    return middleware
|
||||
324
backend/app/middleware/validation.py
Normal file
324
backend/app/middleware/validation.py
Normal file
|
|
@ -0,0 +1,324 @@
|
|||
"""Enhanced request validation middleware."""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
from fastapi import HTTPException, Request, status
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel, ValidationError as PydanticValidationError
|
||||
import magic
|
||||
from urllib.parse import unquote
|
||||
|
||||
from app.telemetry.metrics import track_validation_metrics
|
||||
|
||||
|
||||
class ValidationError(Exception):
    """Raised when a request fails a validation check."""
|
||||
|
||||
|
||||
class SecurityValidationError(Exception):
    """Raised when input matches one of the blocked security patterns."""
|
||||
|
||||
|
||||
class RequestValidator:
|
||||
"""Enhanced request validation with security checks."""
|
||||
|
||||
def __init__(self):
    """Set up allowed MIME types, blocked patterns and size limits."""
    # MIME types accepted for uploads
    self.allowed_video_types = {"video/mp4", "video/quicktime", "video/x-msvideo"}  # last one is AVI
    self.allowed_subtitle_types = {"text/vtt", "text/plain"}

    # Regular expressions whose match marks a string as malicious
    sql_and_script = [
        r"(union|select|insert|update|delete|drop|create|alter)\s+",
        r"(script|javascript|vbscript|onload|onerror|onclick)",
        r"<\s*script[^>]*>",
        r"javascript:",
        r"data:.*base64",
    ]
    path_traversal = [
        r"\.\./",
        r"\.\.\\",
        r"%2e%2e%2f",
        r"%2e%2e\\",
    ]
    command_injection = [
        r"[;&|`$]",
        r"(rm|wget|curl|nc|bash|sh|cmd|powershell)\s+",
    ]
    mongo_injection = [
        r"\$where|\$ne|\$gt|\$lt|\$regex",
    ]
    self.malicious_patterns = (
        sql_and_script + path_traversal + command_injection + mongo_injection
    )

    # Compiled once, case-insensitively, for reuse on every check
    self.compiled_patterns = [
        re.compile(expression, re.IGNORECASE) for expression in self.malicious_patterns
    ]

    # Max file sizes (in bytes)
    self.max_video_size = 2 * 1024 ** 3  # 2GB
    self.max_subtitle_size = 10 * 1024 ** 2  # 10MB

    # Request size limits
    self.max_json_size = 1024 ** 2  # 1MB
    self.max_form_fields = 50
|
||||
|
||||
def validate_string_content(self, content: str, field_name: str = "input") -> None:
    """Reject ``content`` if any compiled malicious pattern matches it.

    Non-string values are ignored.

    Raises:
        SecurityValidationError: If a pattern matches; the message names
            ``field_name``.
    """
    if not isinstance(content, str):
        return

    if any(regex.search(content) for regex in self.compiled_patterns):
        raise SecurityValidationError(
            f"Potentially malicious content detected in {field_name}"
        )
|
||||
|
||||
def validate_filename(self, filename: str) -> str:
    """Validate and sanitize filename.

    The name is URL-decoded, checked against the malicious patterns, has
    every character other than word characters, ``-``, ``_`` and ``.``
    replaced by ``_``, loses a leading dot, and is limited to 255
    characters while keeping its extension.

    Raises:
        ValidationError: If ``filename`` is empty.
        SecurityValidationError: If the decoded name matches a blocked pattern.
    """
    max_length = 255

    if not filename:
        raise ValidationError("Filename cannot be empty")

    # Decode URL encoding
    filename = unquote(filename)

    # Check for malicious patterns
    self.validate_string_content(filename, "filename")

    # Remove dangerous characters
    safe_filename = re.sub(r'[^\w\-_\.]', '_', filename)

    # Prevent hidden files
    if safe_filename.startswith('.'):
        safe_filename = 'file_' + safe_filename[1:]

    # Limit length
    if len(safe_filename) > max_length:
        name, ext = safe_filename.rsplit('.', 1) if '.' in safe_filename else (safe_filename, '')
        suffix = '.' + ext if ext else ''
        if len(suffix) < max_length:
            # Budget the stem against the real suffix length: a fixed
            # 250-character stem overflows the limit whenever the
            # extension is longer than four characters.
            safe_filename = name[:min(250, max_length - len(suffix))] + suffix
        else:
            # The "extension" alone exceeds the limit; hard-truncate.
            safe_filename = safe_filename[:max_length]

    return safe_filename
|
||||
|
||||
def validate_file_type(self, content: bytes, expected_type: str, filename: str) -> None:
    """Check that ``content`` is an allowed file of kind ``expected_type``.

    The MIME type is sniffed from ``content``. If sniffing raises, the
    check falls back to the extension of ``filename``.

    Raises:
        ValidationError: If the detected MIME type (or, in the fallback,
            the extension) is not allowed for ``expected_type``.
    """
    wants_video = expected_type == "video"
    wants_subtitle = expected_type == "subtitle"

    try:
        detected_type = magic.from_buffer(content, mime=True)
    except Exception:
        # Fallback to extension-based validation
        ext = filename.lower().rpartition('.')[2] if '.' in filename else ''

        if wants_video and ext not in {'mp4', 'mov', 'avi', 'mkv'}:
            raise ValidationError(f"Invalid video file extension: {ext}")
        if wants_subtitle and ext not in {'vtt', 'srt', 'txt'}:
            raise ValidationError(f"Invalid subtitle file extension: {ext}")
        return

    if wants_video:
        if detected_type not in self.allowed_video_types:
            raise ValidationError(
                f"Invalid video file type: {detected_type}. "
                f"Allowed types: {', '.join(self.allowed_video_types)}"
            )
    elif wants_subtitle:
        if detected_type not in self.allowed_subtitle_types:
            raise ValidationError(
                f"Invalid subtitle file type: {detected_type}. "
                f"Allowed types: {', '.join(self.allowed_subtitle_types)}"
            )
|
||||
|
||||
def validate_file_size(self, size: int, file_type: str) -> None:
    """Enforce the per-kind upload size ceiling.

    Only ``"video"`` and ``"subtitle"`` are limited; any other
    ``file_type`` passes unchecked.

    Raises:
        ValidationError: If ``size`` exceeds the ceiling for ``file_type``.
    """
    if file_type == "video":
        label, ceiling = "Video", self.max_video_size
    elif file_type == "subtitle":
        label, ceiling = "Subtitle", self.max_subtitle_size
    else:
        return

    if size > ceiling:
        raise ValidationError(
            f"{label} file too large: {size} bytes. "
            f"Maximum allowed: {ceiling} bytes"
        )
|
||||
|
||||
async def validate_json_payload(self, request: Request) -> Optional[Dict[str, Any]]:
    """Read, size-check and content-validate a JSON request body.

    Args:
        request: Incoming request whose headers and body are inspected.

    Returns:
        ``None`` when the Content-Type is not ``application/json``, ``{}``
        for an empty body, otherwise the decoded payload.

    Raises:
        ValidationError: If the Content-Length header is not an integer,
            the declared or actual body size exceeds ``self.max_json_size``,
            the body is not decodable JSON, the payload is nested too deeply
            to walk, or a contained value fails string validation.
    """
    if not request.headers.get("content-type", "").startswith("application/json"):
        return None

    content_length = request.headers.get("content-length")
    if content_length:
        try:
            declared_size = int(content_length)
        except ValueError:
            # A garbage header must be a client error, not an unhandled
            # ValueError that the middleware would treat as "fail open".
            raise ValidationError(
                f"Invalid Content-Length header: {content_length!r}"
            ) from None
        if declared_size > self.max_json_size:
            raise ValidationError(f"JSON payload too large: {content_length} bytes")

    # Check if body has already been read
    if hasattr(request, '_cached_body'):
        body = request._cached_body
    else:
        body = await request.body()
        # Cache the body so FastAPI can read it later
        request._cached_body = body

    # The header can be absent or understated, so measure the real body too.
    if len(body) > self.max_json_size:
        raise ValidationError(f"JSON payload too large: {len(body)} bytes")

    if not body:
        return {}

    try:
        payload = json.loads(body)
        # Recursively validate all string values
        self._validate_json_values(payload)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # json.loads decodes bytes itself; undecodable bytes raise
        # UnicodeDecodeError rather than JSONDecodeError.
        raise ValidationError(f"Invalid JSON: {e}") from e
    except RecursionError as e:
        # Pathologically nested input exhausts the interpreter stack in
        # the parser or in the recursive value walk.
        raise ValidationError("JSON payload is nested too deeply") from e

    return payload
|
||||
|
||||
def _validate_json_values(self, obj: Any, path: str = "root") -> None:
    """Walk a decoded JSON value and validate every string found in it.

    Args:
        obj: Decoded JSON value (dict, list, str or any other scalar).
        path: Dotted/indexed location of ``obj``, used in error messages.

    Raises:
        ValidationError: If a dict has more than ``self.max_form_fields``
            entries or a list has more than 1000 items. Whatever
            ``self.validate_string_content`` raises also propagates.
    """
    if isinstance(obj, str):
        self.validate_string_content(obj, path)
        return

    if isinstance(obj, dict):
        if len(obj) > self.max_form_fields:
            raise ValidationError(f"Too many fields in object at {path}")

        for key, value in obj.items():
            child_path = f"{path}.{key}"
            # Keys are validated as well as values.
            if isinstance(key, str):
                self.validate_string_content(key, child_path)
            self._validate_json_values(value, child_path)
        return

    if isinstance(obj, list):
        if len(obj) > 1000:  # Prevent large arrays
            raise ValidationError(f"Array too large at {path}")

        for index, item in enumerate(obj):
            self._validate_json_values(item, f"{path}[{index}]")
|
||||
|
||||
def validate_query_params(self, request: Request) -> None:
    """Validate every query-string key and value.

    Args:
        request: Incoming request whose ``query_params`` are inspected.

    Raises:
        Whatever ``self.validate_string_content`` raises for a rejected
        key or value.
    """
    params = request.query_params
    # ``items()`` on a multi-dict yields a single value per key, so a
    # repeated parameter (``?q=a&q=b``) would partly escape validation.
    # Prefer ``multi_items()`` when the mapping provides it.
    iter_pairs = getattr(params, "multi_items", params.items)

    for key, value in iter_pairs():
        location = f"query.{key}"
        self.validate_string_content(key, location)
        self.validate_string_content(str(value), location)
|
||||
|
||||
def validate_headers(self, request: Request) -> None:
    """Screen request headers for suspicious content.

    The values of a few host/URL override headers are run through
    ``self.validate_string_content``, and the User-Agent is capped at
    500 characters.

    Args:
        request: Incoming request whose headers are inspected.

    Raises:
        SecurityValidationError: If the User-Agent header is longer than
            500 characters. Whatever ``validate_string_content`` raises
            also propagates.
    """
    suspicious_headers = {"x-forwarded-host", "x-original-host", "x-rewrite-url"}

    for header_name, header_value in request.headers.items():
        lowered = header_name.lower()

        # Check for suspicious headers
        if lowered in suspicious_headers:
            self.validate_string_content(header_value, f"header.{header_name}")

        # Validate user-agent length
        if lowered == "user-agent":
            if len(header_value) > 500:
                raise SecurityValidationError("User-Agent header too long")
|
||||
|
||||
|
||||
class ValidationMiddleware:
    """FastAPI middleware for enhanced request validation.

    Headers, query parameters and (for POST/PUT/PATCH) JSON bodies are run
    through a ``RequestValidator`` before the request is passed on. Failures
    are turned into JSON error responses, and every outcome is reported via
    ``track_validation_metrics``.
    """

    def __init__(self):
        self.validator = RequestValidator()

    async def __call__(self, request: Request, call_next):
        """Validate ``request`` and forward it to the next handler.

        Args:
            request: Incoming request.
            call_next: Awaitable callable that runs the rest of the stack.

        Returns:
            The downstream response, a 400 response for a
            ``SecurityValidationError``, or a 422 response for a
            ``ValidationError``. Both error types are handled whether they
            come from the validators or from downstream.
        """
        start_time = time.time()

        # Skip validation for timing adjustment endpoint temporarily
        if "/vtt/adjust-timing" in request.url.path:
            return await call_next(request)

        try:
            validator_crashed = False
            try:
                await self._validate_request(request)
            except (SecurityValidationError, ValidationError):
                raise
            except Exception as e:
                # The validator itself broke: record it and fail open.
                # Only the validation phase is covered here, so a failure
                # inside ``call_next`` can no longer run the application a
                # second time.
                validator_crashed = True
                self._track(request, start_time, is_valid=False, error_types=["unknown"])
                print(f"Validation middleware error: {e}")

            # Process the request exactly once.
            response = await call_next(request)

            if not validator_crashed:
                self._track(request, start_time, is_valid=True, error_types=[])

            return response

        except SecurityValidationError:
            self._track(request, start_time, is_valid=False, error_types=["security"])
            # Details are deliberately withheld from the client.
            return JSONResponse(
                status_code=status.HTTP_400_BAD_REQUEST,
                content={
                    "detail": "Security validation failed",
                    "error_code": "SECURITY_VALIDATION_ERROR"
                }
            )

        except ValidationError as e:
            self._track(request, start_time, is_valid=False, error_types=["format"])
            return JSONResponse(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                content={
                    "detail": str(e),
                    "error_code": "VALIDATION_ERROR"
                }
            )

    async def _validate_request(self, request: Request) -> None:
        """Run the header, query-parameter and JSON-body validators."""
        self.validator.validate_headers(request)
        self.validator.validate_query_params(request)

        # Validate JSON payload if present
        if request.method in ["POST", "PUT", "PATCH"]:
            await self.validator.validate_json_payload(request)

    @staticmethod
    def _track(request: Request, start_time: float, *, is_valid: bool, error_types: list) -> None:
        """Report one validation outcome to the metrics collector."""
        track_validation_metrics(
            endpoint=request.url.path,
            method=request.method,
            is_valid=is_valid,
            validation_time=time.time() - start_time,
            error_types=error_types
        )
|
||||
|
||||
|
||||
async def create_validation_middleware() -> ValidationMiddleware:
    """Build and return a new ``ValidationMiddleware`` instance.

    Declared ``async`` so callers obtain the middleware with ``await``.
    """
    middleware = ValidationMiddleware()
    return middleware
|
||||
5
backend/app/migrations/__init__.py
Normal file
5
backend/app/migrations/__init__.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
"""Database migration framework for MongoDB."""
|
||||
|
||||
from .migrator import MigrationManager, Migration
|
||||
|
||||
__all__ = ["MigrationManager", "Migration"]
|
||||
253
backend/app/migrations/migrator.py
Normal file
253
backend/app/migrations/migrator.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
"""MongoDB migration framework."""
|
||||
|
||||
import os
|
||||
import importlib.util
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from motor.motor_asyncio import AsyncIOMotorDatabase
|
||||
|
||||
from app.core.database import get_database
|
||||
from app.core.logging import get_logger
|
||||
from app.telemetry.tracing import trace_async_operation
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class Migration(ABC):
    """Abstract base for a single, versioned database migration.

    Subclasses override ``version`` and ``description`` in ``__init__`` and
    implement ``up`` and ``down``. The database handle is supplied through
    ``set_database``.
    """

    def __init__(self):
        # Format: YYYY-MM-DD-HHMMSS
        self.version: str = "0000-00-00-000000"
        self.description: str = ""
        # Stays None until set_database() is called.
        self.db: Optional[AsyncIOMotorDatabase] = None

    @abstractmethod
    async def up(self) -> None:
        """Apply the migration."""

    @abstractmethod
    async def down(self) -> None:
        """Rollback the migration."""

    async def set_database(self, db: AsyncIOMotorDatabase) -> None:
        """Attach the database instance that ``up``/``down`` operate on."""
        self.db = db
|
||||
|
||||
|
||||
class MigrationRecord:
    """Plain value holder describing one applied migration.

    Attributes are stored exactly as given: ``version``, ``description``
    and ``applied_at``.
    """

    def __init__(self, version: str, description: str, applied_at: datetime):
        self.version, self.description, self.applied_at = (
            version,
            description,
            applied_at,
        )
|
||||
|
||||
|
||||
class MigrationManager:
    """Discover, apply and roll back versioned MongoDB migrations.

    Migration scripts live in the ``scripts`` directory next to this module
    and are named ``migration_<version>_<description>.py``. Each script
    defines a class called ``Migration``. Applied versions are recorded in
    the ``migration_history`` collection.
    """

    # Filename prefix shared by every migration script.
    _FILE_PREFIX = "migration_"

    def __init__(self):
        self.db: Optional[AsyncIOMotorDatabase] = None
        self.migrations_dir = Path(__file__).parent / "scripts"
        self.collection_name = "migration_history"

    async def initialize(self) -> None:
        """Obtain the database handle and make sure the history collection is indexed."""
        self.db = await get_database()
        await self._ensure_migration_collection()

    async def _ensure_migration_collection(self) -> None:
        """Ensure the migration history collection exists with proper indexes."""
        collection = self.db[self.collection_name]

        # A unique version index stops the same migration being recorded twice.
        await collection.create_index([("version", 1)], unique=True)
        await collection.create_index([("applied_at", -1)])

        logger.info("Migration history collection initialized")

    @classmethod
    def _version_from_name(cls, migration_name: str) -> str:
        """Return the version part of ``migration_<version>_<description>``."""
        name = migration_name
        if name.startswith(cls._FILE_PREFIX):
            name = name[len(cls._FILE_PREFIX):]
        return name.split("_", 1)[0]

    def discover_migrations(self) -> List[str]:
        """Return the module names of all migration scripts, sorted by name.

        Returns:
            File stems (no ``.py``) of every ``migration_*.py`` script, or
            an empty list when the scripts directory does not exist.
        """
        if not self.migrations_dir.exists():
            logger.warning(f"Migrations directory not found: {self.migrations_dir}")
            return []

        # Names start with the version, so a plain sort orders by version.
        return sorted(
            file_path.stem
            for file_path in self.migrations_dir.glob("*.py")
            if file_path.name.startswith(self._FILE_PREFIX)
        )

    async def load_migration(self, migration_name: str) -> Migration:
        """Import a migration script and return an instance bound to the database.

        Args:
            migration_name: File stem of the script inside the scripts directory.

        Raises:
            FileNotFoundError: If the script file does not exist.
            ImportError: If no import spec/loader can be built for the file.
            AttributeError: If the module does not define ``Migration``.
        """
        migration_path = self.migrations_dir / f"{migration_name}.py"

        if not migration_path.exists():
            raise FileNotFoundError(f"Migration file not found: {migration_path}")

        # spec_from_file_location() may return None, and the spec's loader
        # may be None; fail with a clear error instead of an AttributeError.
        spec = importlib.util.spec_from_file_location(migration_name, migration_path)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load migration module: {migration_path}")
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # Get the migration class (assume it's named Migration)
        if not hasattr(module, 'Migration'):
            raise AttributeError(f"Migration class not found in {migration_name}")

        migration = module.Migration()
        await migration.set_database(self.db)

        return migration

    async def get_applied_migrations(self) -> List[str]:
        """Get the applied migration versions in ascending order."""
        collection = self.db[self.collection_name]
        cursor = collection.find({}, {"version": 1}).sort("version", 1)
        return [doc["version"] async for doc in cursor]

    async def record_migration(self, migration: Migration) -> None:
        """Record a successful migration in the database."""
        collection = self.db[self.collection_name]

        record = {
            "version": migration.version,
            "description": migration.description,
            "applied_at": datetime.utcnow()
        }

        await collection.insert_one(record)
        logger.info(f"Recorded migration: {migration.version} - {migration.description}")

    async def remove_migration_record(self, version: str) -> None:
        """Remove a migration record (for rollback)."""
        collection = self.db[self.collection_name]
        await collection.delete_one({"version": version})
        logger.info(f"Removed migration record: {version}")

    @trace_async_operation("migration_manager.migrate_up")
    async def migrate_up(self, target_version: Optional[str] = None) -> List[str]:
        """
        Apply migrations up to the target version.

        Args:
            target_version: Version to migrate to. If None, applies all pending migrations.

        Returns:
            List of applied migration versions.

        Raises:
            Exception: Whatever a migration raises. Migrations applied
                before the failure stay recorded.
        """
        await self.initialize()

        all_migrations = self.discover_migrations()
        applied_versions = set(await self.get_applied_migrations())

        # Find pending migrations
        pending_migrations = []
        for migration_name in all_migrations:
            # Filename format: migration_YYYY-MM-DD-HHMMSS_description.py
            version = self._version_from_name(migration_name)
            if version in applied_versions:
                continue
            if target_version is None or version <= target_version:
                pending_migrations.append((migration_name, version))

        pending_migrations.sort(key=lambda item: item[1])

        applied = []
        for migration_name, version in pending_migrations:
            try:
                logger.info(f"Applying migration: {migration_name}")

                migration = await self.load_migration(migration_name)
                if migration.version != version:
                    # History stores migration.version, pending detection
                    # uses the filename; a mismatch makes the script look
                    # pending again on the next run.
                    logger.warning(
                        f"Migration {migration_name} declares version "
                        f"{migration.version} but its filename says {version}"
                    )
                await migration.up()
                await self.record_migration(migration)

                applied.append(version)
                logger.info(f"Successfully applied migration: {version}")

            except Exception as e:
                logger.error(f"Failed to apply migration {migration_name}: {e}")
                raise

        return applied

    @trace_async_operation("migration_manager.migrate_down")
    async def migrate_down(self, target_version: str) -> List[str]:
        """
        Rollback migrations down to the target version.

        Args:
            target_version: Version to rollback to. Every applied version
                greater than this one is rolled back, newest first.

        Returns:
            List of rolled back migration versions.

        Raises:
            Exception: Whatever a migration's ``down`` raises.
        """
        await self.initialize()

        applied_migrations = await self.get_applied_migrations()

        # Newest first, so dependent changes are undone before their base.
        to_rollback = [
            version
            for version in reversed(applied_migrations)
            if version > target_version
        ]

        # Match scripts by exact version; a substring test could select the
        # wrong script. One directory scan serves the whole rollback.
        names_by_version = {}
        for migration_file in self.discover_migrations():
            names_by_version.setdefault(
                self._version_from_name(migration_file), migration_file
            )

        rolled_back = []
        for version in to_rollback:
            try:
                migration_name = names_by_version.get(version)

                if not migration_name:
                    logger.warning(f"Migration file not found for version {version}")
                    continue

                logger.info(f"Rolling back migration: {migration_name}")

                migration = await self.load_migration(migration_name)
                await migration.down()
                await self.remove_migration_record(version)

                rolled_back.append(version)
                logger.info(f"Successfully rolled back migration: {version}")

            except Exception as e:
                logger.error(f"Failed to rollback migration {version}: {e}")
                raise

        return rolled_back

    async def get_migration_status(self) -> dict:
        """Get current migration status.

        Returns:
            Dict with ``total_migrations``, ``applied_migrations``,
            ``pending_migrations`` (scripts whose version is not recorded
            as applied), ``latest_applied`` and ``all_applied``.
        """
        await self.initialize()

        all_migrations = self.discover_migrations()
        applied_migrations = await self.get_applied_migrations()

        # Compare versions, not counts: an applied version whose script was
        # removed must not hide (or turn negative) the pending count.
        applied_versions = set(applied_migrations)
        pending_count = sum(
            1
            for migration_name in all_migrations
            if self._version_from_name(migration_name) not in applied_versions
        )

        return {
            "total_migrations": len(all_migrations),
            "applied_migrations": len(applied_migrations),
            "pending_migrations": pending_count,
            "latest_applied": applied_migrations[-1] if applied_migrations else None,
            "all_applied": applied_migrations
        }
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
"""Initial database schema setup migration."""
|
||||
|
||||
from datetime import datetime
|
||||
from app.migrations.migrator import Migration
|
||||
|
||||
|
||||
class Migration(Migration):
    """Initial schema setup with all collections and indexes."""

    def __init__(self):
        super().__init__()
        self.version = "2025-08-17-120000"
        self.description = "Initial database schema with users, jobs, and audit_logs collections"

    async def up(self) -> None:
        """Create the indexes for the users, jobs and audit_logs collections."""

        # Users collection setup: email is the only unique key.
        users = self.db.users
        await users.create_index([("email", 1)], unique=True)
        for field, direction in (("role", 1), ("is_active", 1), ("created_at", -1)):
            await users.create_index([(field, direction)])

        # Jobs collection setup; the last entry is the compound index for job queries.
        jobs = self.db.jobs
        for keys in (
            [("status", 1), ("created_at", -1)],
            [("client_id", 1)],
            [("updated_at", -1)],
            [("languages", 1)],
            [("status", 1), ("client_id", 1), ("created_at", -1)],
        ):
            await jobs.create_index(keys)

        # Audit logs collection setup; the last entry is the text search index.
        audit_logs = self.db.audit_logs
        for keys in (
            [("timestamp", -1)],
            [("action", 1), ("timestamp", -1)],
            [("user_id", 1), ("timestamp", -1)],
            [("severity", 1), ("timestamp", -1)],
            [("resource_type", 1), ("resource_id", 1)],
            [("ip_address", 1), ("timestamp", -1)],
            [("success", 1), ("timestamp", -1)],
            [("description", "text"), ("details", "text"), ("error_message", "text")],
        ):
            await audit_logs.create_index(keys)

        print(f"✅ Applied migration {self.version}: {self.description}")

    async def down(self) -> None:
        """Drop all collections (destructive - use with caution)."""

        # This is a destructive operation - in production, you might want to backup first
        for collection_name in ("users", "jobs", "audit_logs"):
            await getattr(self.db, collection_name).drop()

        print(f"⚠️ Rolled back migration {self.version}: {self.description}")
        print("⚠️ WARNING: All data has been deleted!")
|
||||
|
|
@ -0,0 +1,134 @@
|
|||
"""Index optimization migration for improved query performance."""
|
||||
|
||||
from app.migrations.migrator import Migration
|
||||
|
||||
|
||||
class Migration(Migration):
    """Optimize indexes for better query performance."""

    # (collection, index name, key specification, extra create_index options).
    # ``up`` creates these in order; ``down`` drops each one from the
    # collection it was created on.
    _INDEXES = (
        # Job status transitions and monitoring
        ("jobs", "jobs_status_updated_client_idx",
         [("status", 1), ("updated_at", -1), ("client_id", 1)], {}),
        # Queue management (pending jobs)
        ("jobs", "jobs_queue_processing_idx",
         [("status", 1), ("created_at", 1)], {}),
        # Client job history
        ("jobs", "jobs_client_history_idx",
         [("client_id", 1), ("created_at", -1), ("status", 1)], {}),
        # Sparse index for error tracking
        ("jobs", "jobs_error_tracking_idx",
         [("status", 1), ("error", 1)], {"sparse": True}),
        # Active user queries
        ("users", "users_active_role_login_idx",
         [("is_active", 1), ("role", 1), ("last_login_at", -1)], {}),
        # User search by email pattern
        ("users", "users_search_idx",
         [("email", "text"), ("first_name", "text"), ("last_name", "text")], {}),
        # Security monitoring
        ("audit_logs", "audit_security_monitoring_idx",
         [("severity", 1), ("action", 1), ("timestamp", -1)], {}),
        # User activity analysis
        ("audit_logs", "audit_user_activity_idx",
         [("user_id", 1), ("action", 1), ("timestamp", -1)], {}),
        # Resource access tracking
        ("audit_logs", "audit_resource_access_idx",
         [("resource_type", 1), ("resource_id", 1), ("action", 1), ("timestamp", -1)], {}),
        # Sparse index for failed operations.
        # NOTE(review): the initial schema migration already indexes
        # (success, timestamp) on audit_logs under the default name; MongoDB
        # may reject an equivalent key pattern with a different name or
        # options - confirm against a database with that migration applied.
        ("audit_logs", "audit_failures_idx",
         [("success", 1), ("timestamp", -1)], {"sparse": True}),
    )

    def __init__(self):
        super().__init__()
        self.version = "2025-08-17-120001"
        self.description = "Index optimization for query performance improvements"

    async def up(self) -> None:
        """Add optimized indexes for common query patterns."""
        for collection_name, index_name, keys, options in self._INDEXES:
            await self.db[collection_name].create_index(keys, name=index_name, **options)

        # Optional, not enabled: a TTL index on audit_logs ("timestamp", 1)
        # with expireAfterSeconds=63072000 (2 years), name "audit_ttl_idx",
        # would give automatic audit log cleanup.

        print(f"✅ Applied migration {self.version}: {self.description}")

    async def down(self) -> None:
        """Remove the optimized indexes.

        Each index is dropped from the collection it belongs to. Dropping
        stays best-effort (one failure does not stop the rest), but a
        failure is reported instead of being silently discarded.
        """
        for collection_name, index_name, _keys, _options in self._INDEXES:
            try:
                await self.db[collection_name].drop_index(index_name)
            except Exception as e:
                # Typically the index is already gone; surface anything
                # else (e.g. connection problems) to the operator.
                print(f"⚠️ Could not drop index {index_name} on {collection_name}: {e}")

        print(f"⚠️ Rolled back migration {self.version}: {self.description}")
|
||||
|
|
@ -0,0 +1,155 @@
|
|||
"""Migrate audit log schema from basic to comprehensive format."""
|
||||
|
||||
from datetime import datetime
|
||||
from app.migrations.migrator import Migration
|
||||
|
||||
|
||||
class Migration(Migration):
    """Update audit log schema to comprehensive format."""

    # Old action strings -> new AuditAction enum values.
    _ACTION_MAPPING = {
        # Job actions
        "job_created": "job.create",
        "job_approved": "job.approve",
        "job_rejected": "job.reject",
        "job_updated": "job.update",
        "job_cancelled": "job.cancel",

        # Auth actions
        "login": "auth.login.success",
        "logout": "auth.logout",
        "login_failed": "auth.login.failure",

        # File actions
        "file_uploaded": "file.upload",
        "file_downloaded": "file.download",

        # VTT actions
        "vtt_edited": "vtt.edit",

        # Admin actions
        "user_created": "user.create",
        "user_updated": "user.update",
        "user_deleted": "user.delete",
    }

    def __init__(self):
        super().__init__()
        self.version = "2025-08-17-120002"
        self.description = "Update audit log schema from basic to comprehensive format"

    async def up(self) -> None:
        """Migrate existing audit logs to new schema format.

        Documents are rewritten in place, keeping their ``_id``. A document
        that fails to convert is reported and skipped.
        """
        # Logs that still have the old schema structure
        legacy_filter = {
            "$or": [
                {"when": {"$exists": True}},  # Old timestamp field
                {"job_id": {"$exists": True}},  # Old job-specific logs
                {"timestamp": {"$exists": False}}  # Missing new timestamp field
            ]
        }

        migration_count = 0

        async for old_log in self.db.audit_logs.find(legacy_filter):
            try:
                await self.db.audit_logs.replace_one(
                    {"_id": old_log["_id"]},
                    self._to_new_schema(old_log)
                )
                migration_count += 1

            except Exception as e:
                print(f"Error migrating audit log {old_log.get('_id')}: {e}")
                continue

        print(f"✅ Applied migration {self.version}: Migrated {migration_count} audit log records")

    def _to_new_schema(self, old_log: dict) -> dict:
        """Build the new-schema document for one old-schema log."""
        new_log = {
            "_id": old_log["_id"],
            "timestamp": old_log.get("when", old_log.get("timestamp", datetime.utcnow())),
            "action": self._map_old_action(old_log.get("action", "unknown")),
            "severity": "info",
            "description": old_log.get("action", "Legacy action"),
            "success": True,
            "environment": "prod",
            "service_name": "accessible-video-api",
            "api_version": "v1"
        }

        # Map optional fields if they exist
        if "user_id" in old_log:
            new_log["user_id"] = old_log["user_id"]

        if "job_id" in old_log:
            new_log["resource_type"] = "job"
            new_log["resource_id"] = old_log["job_id"]

        for field in ("ip_address", "user_agent", "details"):
            if field in old_log:
                new_log[field] = old_log[field]

        return new_log

    def _map_old_action(self, old_action: str) -> str:
        """Map old action strings to new AuditAction enum values.

        Unknown actions are returned unchanged.
        """
        return self._ACTION_MAPPING.get(old_action, old_action)

    def _map_new_action(self, new_action: str) -> str:
        """Map a new AuditAction value back to its old action string.

        Inverse of ``_map_old_action``; unknown actions are returned unchanged.
        """
        for old_action, mapped in self._ACTION_MAPPING.items():
            if mapped == new_action:
                return old_action
        return new_action

    async def down(self) -> None:
        """Rollback to old audit log schema format (limited).

        The conversion is lossy: fields with no old-schema counterpart are
        dropped. Action names are mapped back to their old spelling so the
        rollback mirrors ``up``.
        """
        new_schema_filter = {
            "timestamp": {"$exists": True},
            "action": {"$exists": True}
        }

        rollback_count = 0

        async for new_log in self.db.audit_logs.find(new_schema_filter):
            try:
                await self.db.audit_logs.replace_one(
                    {"_id": new_log["_id"]},
                    self._to_old_schema(new_log)
                )
                rollback_count += 1

            except Exception as e:
                print(f"Error rolling back audit log {new_log.get('_id')}: {e}")
                continue

        print(f"⚠️ Rolled back migration {self.version}: Reverted {rollback_count} audit log records")
        print("⚠️ WARNING: Some audit log data may have been lost due to schema differences")

    def _to_old_schema(self, new_log: dict) -> dict:
        """Build the old-schema document for one new-schema log (lossy)."""
        old_log = {
            "_id": new_log["_id"],
            "when": new_log["timestamp"],
            # Undo the renaming applied by up().
            "action": self._map_new_action(new_log["action"])
        }

        # Map back optional fields
        if "user_id" in new_log:
            old_log["user_id"] = new_log["user_id"]

        if "resource_type" in new_log and new_log["resource_type"] == "job":
            old_log["job_id"] = new_log.get("resource_id")

        for field in ("ip_address", "user_agent", "details"):
            if field in new_log:
                old_log[field] = new_log[field]

        return old_log
|
||||
BIN
backend/app/models/__pycache__/audit_log.cpython-313.pyc
Normal file
BIN
backend/app/models/__pycache__/audit_log.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/models/__pycache__/job.cpython-313.pyc
Normal file
BIN
backend/app/models/__pycache__/job.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/models/__pycache__/user.cpython-313.pyc
Normal file
BIN
backend/app/models/__pycache__/user.cpython-313.pyc
Normal file
Binary file not shown.
175
backend/app/models/audit_log.py
Normal file
175
backend/app/models/audit_log.py
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
"""Audit log model for tracking sensitive operations."""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Optional
|
||||
from bson import ObjectId
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from .user import PyObjectId
|
||||
|
||||
|
||||
class AuditAction(str, Enum):
    """Closed set of auditable actions.

    Values are dotted ``<area>.<verb>`` strings; because the enum also
    derives from ``str``, members compare equal to their raw value.
    """

    # --- Authentication ---
    LOGIN_SUCCESS = "auth.login.success"
    LOGIN_FAILURE = "auth.login.failure"
    LOGOUT = "auth.logout"
    TOKEN_REFRESH = "auth.token.refresh"
    PASSWORD_CHANGE = "auth.password.change"
    PASSWORD_RESET = "auth.password.reset"

    # --- User management ---
    USER_CREATE = "user.create"
    USER_UPDATE = "user.update"
    USER_DELETE = "user.delete"
    USER_ROLE_CHANGE = "user.role.change"
    USER_ACTIVATE = "user.activate"
    USER_DEACTIVATE = "user.deactivate"

    # --- Job management ---
    JOB_CREATE = "job.create"
    JOB_UPDATE = "job.update"
    JOB_DELETE = "job.delete"
    JOB_APPROVE = "job.approve"
    JOB_REJECT = "job.reject"
    JOB_CANCEL = "job.cancel"
    JOB_STATUS_CHANGE = "job.status.change"

    # --- File operations ---
    FILE_UPLOAD = "file.upload"
    FILE_DOWNLOAD = "file.download"
    FILE_DELETE = "file.delete"
    FILE_ACCESS = "file.access"

    # --- VTT editing ---
    VTT_EDIT = "vtt.edit"
    VTT_APPROVE = "vtt.approve"
    VTT_REJECT = "vtt.reject"

    # --- Admin ---
    ADMIN_CONFIG_CHANGE = "admin.config.change"
    ADMIN_SYSTEM_ACTION = "admin.system.action"
    ADMIN_DATA_EXPORT = "admin.data.export"
    ADMIN_AUDIT_ACCESS = "admin.audit.access"

    # --- Security events ---
    RATE_LIMIT_EXCEEDED = "security.rate_limit.exceeded"
    VALIDATION_FAILURE = "security.validation.failure"
    UNAUTHORIZED_ACCESS = "security.unauthorized.access"
    SUSPICIOUS_ACTIVITY = "security.suspicious.activity"
|
||||
|
||||
|
||||
class AuditLogSeverity(str, Enum):
    """Severity levels for audit events, from routine to security-relevant."""

    # Normal operations
    INFO = "info"
    # Suspicious but not critical
    WARNING = "warning"
    # Failed operations
    ERROR = "error"
    # Security incidents
    CRITICAL = "critical"
|
||||
|
||||
|
||||
class AuditLog(BaseModel):
    """One stored audit log entry.

    ``action`` and ``description`` are required; every other field has a
    default. ``id`` is populated from/serialized to the ``_id`` alias.
    """

    id: Optional[PyObjectId] = Field(default_factory=PyObjectId, alias="_id")

    # --- Core audit fields ---
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    action: AuditAction
    severity: AuditLogSeverity = AuditLogSeverity.INFO

    # --- Actor information ---
    user_id: Optional[PyObjectId] = None
    user_email: Optional[str] = None
    user_role: Optional[str] = None

    # --- Request context ---
    ip_address: Optional[str] = None
    user_agent: Optional[str] = None
    request_id: Optional[str] = None
    session_id: Optional[str] = None

    # --- Resource information ---
    # resource_type examples: "job", "user", "file"
    resource_type: Optional[str] = None
    resource_id: Optional[str] = None
    resource_name: Optional[str] = None

    # --- Action details ---
    description: str
    details: Dict[str, Any] = Field(default_factory=dict)

    # --- Outcome ---
    success: bool = True
    error_message: Optional[str] = None

    # --- Additional metadata ---
    environment: str = "prod"
    service_name: str = "accessible-video-api"
    api_version: str = "v1"

    class Config:
        # Accept both the field name ("id") and its alias ("_id") as input.
        populate_by_name = True
        arbitrary_types_allowed = True
        # ObjectId values are rendered as strings in JSON output.
        json_encoders = {ObjectId: str}
|
||||
|
||||
|
||||
class AuditLogCreate(BaseModel):
    """Input schema for creating an audit log entry.

    Only ``action`` and ``description`` are mandatory.
    """

    # --- Required (severity defaults to INFO) ---
    action: AuditAction
    severity: AuditLogSeverity = AuditLogSeverity.INFO
    description: str

    # --- Optional actor information ---
    user_id: Optional[PyObjectId] = None
    user_email: Optional[str] = None
    user_role: Optional[str] = None

    # --- Optional request context ---
    ip_address: Optional[str] = None
    user_agent: Optional[str] = None
    request_id: Optional[str] = None

    # --- Optional resource information ---
    resource_type: Optional[str] = None
    resource_id: Optional[str] = None
    resource_name: Optional[str] = None

    # --- Optional details and outcome ---
    details: Dict[str, Any] = Field(default_factory=dict)
    success: bool = True
    error_message: Optional[str] = None
|
||||
|
||||
|
||||
class AuditLogQuery(BaseModel):
    """Schema for querying audit logs.

    Every filter is optional; an unset filter is not applied. ``skip`` and
    ``limit`` are validated here so that an invalid page request is rejected
    as a validation error instead of failing later while the query runs
    (a zero ``limit`` would otherwise break page-number arithmetic).
    """

    # Time range (inclusive bounds on the log timestamp)
    start_date: Optional[datetime] = None
    end_date: Optional[datetime] = None

    # Filters (exact match)
    action: Optional[AuditAction] = None
    severity: Optional[AuditLogSeverity] = None
    user_id: Optional[PyObjectId] = None
    user_email: Optional[str] = None
    resource_type: Optional[str] = None
    resource_id: Optional[str] = None
    success: Optional[bool] = None

    # Search
    search: Optional[str] = None  # Full-text search in description and details

    # Pagination
    skip: int = Field(0, ge=0)  # number of documents to skip; never negative
    limit: int = Field(100, ge=1)  # page size; at least one document

    # Sorting
    sort_by: str = "timestamp"
    sort_order: int = -1  # -1 for descending, 1 for ascending
|
||||
|
||||
|
||||
class AuditLogResponse(BaseModel):
    """Response schema for audit log queries."""

    # The audit log entries returned for this page.
    logs: list[AuditLog]
    # Number of documents matching the query across all pages.
    total_count: int
    # Pagination bookkeeping.
    page: int
    page_size: int
    has_more: bool
|
||||
95
backend/app/models/job.py
Normal file
95
backend/app/models/job.py
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
from pydantic import BaseModel, Field, constr
|
||||
|
||||
|
||||
class JobStatus(str, Enum):
    """String-valued enumeration of the statuses a job can have."""

    # Intake and automated processing
    CREATED = "created"
    INGESTING = "ingesting"
    AI_PROCESSING = "ai_processing"

    # Quality control of the English assets
    PENDING_QC = "pending_qc"
    APPROVED_ENGLISH = "approved_english"
    REJECTED = "rejected"
    QC_FEEDBACK = "qc_feedback"

    # Localisation and audio generation
    TRANSLATING = "translating"
    TTS_GENERATING = "tts_generating"

    # Final review and completion
    PENDING_FINAL_REVIEW = "pending_final_review"
    COMPLETED = "completed"
|
||||
|
||||
|
||||
class Source(BaseModel):
    """Source media attached to a job."""

    filename: str
    original_filename: Optional[str] = None
    # Location of the source object as a GCS URI.
    gcs_uri: str
    # Duration in seconds, when known.
    duration_s: Optional[float] = None
    # Language code, constrained to 2-10 characters; defaults to English.
    language: constr(min_length=2, max_length=10) = "en"
|
||||
|
||||
|
||||
class RequestedOutputs(BaseModel):
    """Which output assets were requested for a job."""

    # Per-asset switches; every asset type is requested by default.
    captions_vtt: bool = True
    audio_description_vtt: bool = True
    audio_description_mp3: bool = True

    # Language lists; both default to empty.
    languages: list[str] = []
    transcreation: list[str] = []
|
||||
|
||||
|
||||
class LangOutput(BaseModel):
    """Generated assets for a single language; every field is optional."""

    # GCS locations of the generated files.
    captions_vtt_gcs: Optional[str] = None
    ad_vtt_gcs: Optional[str] = None
    ad_mp3_gcs: Optional[str] = None

    # How the language version was produced, when recorded.
    origin: Optional[Literal["translate", "transcreate"]] = None
    qa_notes: Optional[str] = None
|
||||
|
||||
|
||||
class ReviewHistoryItem(BaseModel):
    """One entry in a job's review history."""

    # When the entry was recorded and the status recorded with it.
    at: datetime
    status: str

    # Optional author and free-text notes.
    by: Optional[str] = None
    notes: Optional[str] = None
|
||||
|
||||
|
||||
class Review(BaseModel):
    """Review state of a job: notes, reviewer and history entries."""

    notes: Optional[str] = ""
    reviewer_id: Optional[str] = None
    # History entries; empty by default.
    history: list[ReviewHistoryItem] = []
|
||||
|
||||
|
||||
class AISection(BaseModel):
    """AI-related data stored on a job; both fields are optional."""

    # Arbitrary JSON object produced by ingestion, stored as-is.
    ingestion_json: Optional[dict[str, Any]] = None
    confidence: Optional[float] = None
|
||||
|
||||
|
||||
class Job(BaseModel):
    """A job document, combining source, requested outputs and progress."""

    # Identifier; populated from / serialised to the ``_id`` alias.
    id: Optional[str] = Field(None, alias="_id")
    client_id: str
    title: str

    # What was submitted and what should be produced.
    source: Source
    requested_outputs: RequestedOutputs

    # Progress and review state.
    status: JobStatus = JobStatus.CREATED
    review: Review = Review()

    # Results, keyed by a string (presumably the language code -- confirm).
    outputs: Optional[dict[str, LangOutput]] = None
    ai: Optional[AISection] = None
    error: Optional[dict[str, Any]] = None

    # Timestamps.
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

    class Config:
        # Accept both ``id`` and ``_id`` when populating the model.
        populate_by_name = True
        # Store enum fields as their plain values.
        use_enum_values = True
|
||||
|
||||
|
||||
class JobCreate(BaseModel):
    """Fields required to create a job."""

    title: str
    # Language code; defaults to English.
    language: str = "en"
    requested_outputs: RequestedOutputs
|
||||
|
||||
|
||||
class JobUpdate(BaseModel):
    """Partial update of a job; every field is optional and defaults to None."""

    title: Optional[str] = None
    status: Optional[JobStatus] = None
    review: Optional[Review] = None
    outputs: Optional[dict[str, LangOutput]] = None
    ai: Optional[AISection] = None
    error: Optional[dict[str, Any]] = None
|
||||
57
backend/app/models/user.py
Normal file
57
backend/app/models/user.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional, Annotated
|
||||
|
||||
from bson import ObjectId
|
||||
from pydantic import BaseModel, EmailStr, Field, BeforeValidator
|
||||
|
||||
|
||||
def validate_object_id(v) -> str:
    """Normalise an identifier to ``str``.

    An ``ObjectId`` is converted with ``str()``; a string is returned
    unchanged.

    Raises:
        ValueError: if *v* is neither an ``ObjectId`` nor a ``str``.
    """
    if not isinstance(v, (ObjectId, str)):
        raise ValueError('Invalid ObjectId')
    return str(v) if isinstance(v, ObjectId) else v
|
||||
|
||||
|
||||
PyObjectId = Annotated[str, BeforeValidator(validate_object_id)]
|
||||
|
||||
|
||||
class UserRole(str, Enum):
    """String-valued enumeration of user roles."""

    CLIENT = "client"
    REVIEWER = "reviewer"
    ADMIN = "admin"
|
||||
|
||||
|
||||
class User(BaseModel):
    """A user document, including the hashed password."""

    # Identifier; populated from / serialised to the ``_id`` alias.
    id: Optional[PyObjectId] = Field(None, alias="_id")

    # Identity and credentials.
    email: EmailStr
    hashed_password: str
    full_name: str

    # Authorisation and account state.
    role: UserRole = UserRole.CLIENT
    is_active: bool = True

    # Timestamps.
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

    class Config:
        # Accept both ``id`` and ``_id`` when populating the model.
        populate_by_name = True
        # Store enum fields as their plain values.
        use_enum_values = True
|
||||
|
||||
|
||||
class UserInDB(User):
    """Subclass of ``User`` that adds no fields of its own."""
|
||||
|
||||
|
||||
class UserCreate(BaseModel):
    """Fields required to create a user; takes a plain ``password``."""

    email: EmailStr
    password: str
    full_name: str
    # New users are clients unless a role is given.
    role: UserRole = UserRole.CLIENT
|
||||
|
||||
|
||||
class UserUpdate(BaseModel):
    """Partial update of a user; every field is optional and defaults to None."""

    email: Optional[EmailStr] = None
    full_name: Optional[str] = None
    role: Optional[UserRole] = None
    is_active: Optional[bool] = None
|
||||
57
backend/app/prompts/gemini_ingestion.md
Normal file
57
backend/app/prompts/gemini_ingestion.md
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
SYSTEM:
|
||||
You are an expert accessibility writer for film/TV and e-learning. Produce STRICT JSON only.
|
||||
|
||||
USER:
|
||||
You are given a video. Return a JSON object with:
|
||||
- language: BCP-47 code (e.g., "en")
|
||||
- confidence: 0..1
|
||||
- summary: 1–2 sentence synopsis
|
||||
- transcript_plaintext: full spoken words, punctuated
|
||||
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling
|
||||
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program
|
||||
|
||||
Constraints:
|
||||
- Output MUST be valid JSON. Do not include markdown fences or any other text.
|
||||
- All JSON strings must be properly escaped (use \" for quotes within strings)
|
||||
- Use detailed, descriptive audio description phrases that paint a vivid picture. Aim for rich descriptions that are 20% longer than typical AD, providing enhanced visual context without duplicating spoken dialogue.
|
||||
- WebVTT must start with "WEBVTT" and follow this exact format:
|
||||
- Timestamp format: HH:MM:SS.mmm --> HH:MM:SS.mmm (ALWAYS include hours, even if 00:)
|
||||
- Example: "00:01:23.456 --> 00:01:27.890"
|
||||
- Each cue must be separated by blank lines
|
||||
- Never use MM:SS format - always include the hour component
|
||||
- Escape all newlines in VTT strings as \n
|
||||
- Do not include trailing commas in JSON objects or arrays
|
||||
|
||||
CRITICAL TIMING REQUIREMENTS:
|
||||
- Caption timing must be PRECISELY synchronized with the actual speech in the video
|
||||
- Each caption cue should start exactly when the speaker begins that phrase/sentence
|
||||
- Each caption cue should end exactly when the speaker finishes that phrase/sentence
|
||||
- Listen carefully to detect natural speech pauses and word boundaries
|
||||
- Avoid starting captions too early or ending them too late
|
||||
- Ensure captions align with lip movement and speech rhythm
|
||||
- For audio descriptions, time them during natural speech gaps or over non-dialogue audio
|
||||
- Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends)
|
||||
|
||||
AUDIO DESCRIPTION GUIDELINES:
|
||||
- Provide rich, detailed descriptions that include setting, characters, actions, facial expressions, body language, and visual mood
|
||||
- Describe colors, lighting, camera angles, and composition when relevant to understanding
|
||||
- Include environmental details like weather, time of day, architectural features, or technological elements
|
||||
- Mention clothing, objects, and spatial relationships that contribute to scene understanding
|
||||
- Use vivid, engaging language that creates a complete mental picture for visually impaired viewers
|
||||
- Aim for descriptions that are substantive enough to fill natural pauses and reduce silence between spoken content
|
||||
|
||||
CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
|
||||
|
||||
Example output format (the ```json fence below is for illustration only; your actual response must NOT include it):
|
||||
```json
|
||||
{
|
||||
"language": "en",
|
||||
"confidence": 0.95,
|
||||
"summary": "A tutorial video showing how to use a web application dashboard.",
|
||||
"transcript_plaintext": "Hello everyone, welcome to this tutorial. Today we'll be exploring the dashboard interface. First, let's log in to the system.",
|
||||
"captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHello everyone, welcome to this tutorial.\n\n00:00:04.000 --> 00:00:07.200\nToday we'll be exploring the dashboard interface.\n\n00:00:08.000 --> 00:00:10.500\nFirst, let's log in to the system.",
|
||||
"audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nA bright computer monitor displays a clean, modern login page with blue and white corporate branding. The interface features prominently positioned username and password fields.\n\n00:00:05.000 --> 00:00:07.000\nA cursor arrow hovers over the rectangular username input field, which highlights with a subtle blue border as the user prepares to type.\n\n00:00:10.000 --> 00:00:12.000\nThe screen transitions to reveal a comprehensive dashboard filled with colorful charts, data widgets, and navigation panels arranged in an organized grid layout."
|
||||
}
|
||||
```
|
||||
|
||||
Follow this exact structure and formatting.
|
||||
20
backend/app/prompts/gemini_transcreation.md
Normal file
20
backend/app/prompts/gemini_transcreation.md
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
SYSTEM:
|
||||
You are a culturally-savvy accessibility writer.
|
||||
|
||||
USER:
|
||||
Rewrite the following English captions and audio descriptions into {TARGET_LANGUAGE}, preserving:
|
||||
- meaning, tone, and accessibility intent,
|
||||
- timing boundaries (same cue timestamps),
|
||||
- line lengths friendly for readability (~32–40 chars).
|
||||
|
||||
Input:
|
||||
- captions_vtt_en: <VTT text>
|
||||
- ad_vtt_en: <VTT text>
|
||||
- brief: <brand + audience notes>
|
||||
|
||||
Output:
|
||||
JSON:
|
||||
{
|
||||
"captions_vtt": "<VTT in {TARGET_LANGUAGE}>",
|
||||
"audio_description_vtt": "<VTT in {TARGET_LANGUAGE}>"
|
||||
}
|
||||
BIN
backend/app/schemas/__pycache__/auth.cpython-313.pyc
Normal file
BIN
backend/app/schemas/__pycache__/auth.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/schemas/__pycache__/file.cpython-313.pyc
Normal file
BIN
backend/app/schemas/__pycache__/file.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/schemas/__pycache__/job.cpython-313.pyc
Normal file
BIN
backend/app/schemas/__pycache__/job.cpython-313.pyc
Normal file
Binary file not shown.
72
backend/app/schemas/auth.py
Normal file
72
backend/app/schemas/auth.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
from typing import Optional
|
||||
from pydantic import BaseModel, EmailStr
|
||||
from ..models.user import UserRole
|
||||
|
||||
|
||||
class LoginRequest(BaseModel):
    """Login request body: e-mail address and password."""

    email: EmailStr
    password: str
|
||||
|
||||
|
||||
class LoginResponse(BaseModel):
    """Login response: access token plus the user's id and role."""

    access_token: str
    token_type: str = "bearer"
    user_id: str
    role: str
|
||||
|
||||
|
||||
class RefreshResponse(BaseModel):
    """Token refresh response carrying an access token."""

    access_token: str
    token_type: str = "bearer"
|
||||
|
||||
|
||||
class LogoutResponse(BaseModel):
    """Logout response with a default confirmation message."""

    message: str = "Successfully logged out"
|
||||
|
||||
|
||||
# User management schemas for admin routes
|
||||
class UserResponse(BaseModel):
    """User representation returned by the API (no password fields)."""

    id: str
    email: EmailStr
    full_name: str
    role: UserRole
    is_active: bool
    # Creation time as a string, when available.
    created_at: Optional[str] = None
|
||||
|
||||
|
||||
class UserListResponse(BaseModel):
    """Paginated list of users."""

    users: list[UserResponse]
    # Pagination bookkeeping.
    total: int
    page: int
    size: int
|
||||
|
||||
|
||||
class CreateUserRequest(BaseModel):
    """Request body for creating a user."""

    email: EmailStr
    password: str
    full_name: str
    # Defaults to the client role when omitted.
    role: UserRole = UserRole.CLIENT
|
||||
|
||||
|
||||
class UpdateUserRequest(BaseModel):
    """Request body for updating a user; every field is optional."""

    email: Optional[EmailStr] = None
    full_name: Optional[str] = None
    role: Optional[UserRole] = None
    is_active: Optional[bool] = None
|
||||
|
||||
|
||||
class ChangePasswordRequest(BaseModel):
    """Request body for changing a password: current and new password."""

    current_password: str
    new_password: str
|
||||
|
||||
|
||||
class ResetPasswordRequest(BaseModel):
    """Request body for a password reset, identified by e-mail address."""

    email: EmailStr
|
||||
|
||||
|
||||
class AdminStatsResponse(BaseModel):
    """Aggregate statistics returned to administrators."""

    total_users: int
    total_jobs: int
    # Job counts keyed by a string (presumably the status value -- confirm).
    jobs_by_status: dict[str, int]
    active_jobs_today: int
    avg_processing_time_hours: float
|
||||
15
backend/app/schemas/file.py
Normal file
15
backend/app/schemas/file.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class SignedUploadRequest(BaseModel):
    """Request body for obtaining a signed upload."""

    filename: str
    content_type: str
    # Optional size limit (units not shown here -- presumably bytes; confirm).
    max_size: Optional[int] = None
|
||||
|
||||
|
||||
class SignedUploadResponse(BaseModel):
    """Signed upload details: URL, form fields and destination blob path."""

    upload_url: str
    fields: dict[str, str]
    blob_path: str
|
||||
89
backend/app/schemas/job.py
Normal file
89
backend/app/schemas/job.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
from typing import Any, Optional, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ..models.job import JobStatus, LangOutput, RequestedOutputs, Review
|
||||
|
||||
|
||||
class JobResponse(BaseModel):
    """Job representation returned by the API."""

    id: str
    title: str
    status: JobStatus
    # Source is exposed as a plain dictionary rather than a typed model.
    source: dict[str, Any]
    requested_outputs: RequestedOutputs
    review: Review
    outputs: Optional[dict[str, LangOutput]] = None
    # Timestamps are exposed as strings.
    created_at: Optional[str] = None
    updated_at: Optional[str] = None
|
||||
|
||||
|
||||
class JobListResponse(BaseModel):
    """Paginated list of jobs."""

    jobs: list[JobResponse]
    # Pagination bookkeeping.
    total: int
    page: int
    size: int
|
||||
|
||||
|
||||
class JobCreateRequest(BaseModel):
    """Request body for creating a job."""

    title: str
    # Language code; defaults to English.
    language: str = "en"
    requested_outputs: RequestedOutputs
|
||||
|
||||
|
||||
class JobUpdateRequest(BaseModel):
    """Request body for updating a job's title and/or review notes."""

    title: Optional[str] = None
    review_notes: Optional[str] = None
|
||||
|
||||
|
||||
class ApproveEnglishRequest(BaseModel):
    """Request body for approving the English assets; notes are optional."""

    notes: Optional[str] = None
|
||||
|
||||
|
||||
class RejectJobRequest(BaseModel):
    """Request body for rejecting a job; notes are mandatory."""

    notes: str
|
||||
|
||||
|
||||
class CompleteJobRequest(BaseModel):
    """Request body for completing a job; notes are optional."""

    notes: Optional[str] = None
|
||||
|
||||
|
||||
class VttUpdateRequest(BaseModel):
    """Request body carrying replacement VTT content for one language."""

    # Either or both documents may be supplied.
    captions_vtt: Optional[str] = None
    audio_description_vtt: Optional[str] = None
    # Language the content applies to; defaults to English.
    language: str = "en"
|
||||
|
||||
|
||||
class VttTimingAdjustRequest(BaseModel):
    """Request body for adjusting VTT timing by an offset in seconds."""

    # NOTE(review): sign convention of the offset is not shown here -- confirm.
    offset_seconds: float
    language: str = "en"
    # Which documents the adjustment applies to; both by default.
    adjust_captions: bool = True
    adjust_audio_description: bool = True
|
||||
|
||||
|
||||
class JobDownloadsResponse(BaseModel):
    """Download links for a job.

    Each key maps either a language to ``{file_type: signed_url}`` or
    ``source_video`` directly to a signed URL.
    """

    downloads: dict[str, Union[dict[str, str], str]]  # language -> {file_type: signed_url} OR source_video -> signed_url
|
||||
|
||||
|
||||
class VttContentResponse(BaseModel):
    """VTT content for captions and audio description; both are optional."""

    captions_vtt: Optional[str] = None
    audio_description_vtt: Optional[str] = None
|
||||
|
||||
|
||||
class AssetValidationResponse(BaseModel):
    """Outcome of an asset validation: verdict, errors and warnings."""

    is_valid: bool
    errors: list[str]
    # Warnings default to an empty list.
    warnings: list[str] = []
|
||||
|
||||
|
||||
class JobDeleteResponse(BaseModel):
    """Response to a job deletion, carrying a message."""

    message: str
|
||||
|
||||
|
||||
class BulkDeleteRequest(BaseModel):
    """Request body listing the ids of the jobs to delete."""

    job_ids: list[str]
|
||||
|
||||
|
||||
class BulkDeleteResponse(BaseModel):
    """Result of a bulk deletion: counts and error messages."""

    deleted_count: int
    total_requested: int
    errors: list[str]
|
||||
BIN
backend/app/services/__pycache__/audit_logger.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/audit_logger.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/emailer.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/emailer.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/gcs.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/gcs.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/gemini.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/gemini.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/translate.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/translate.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/tts.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/tts.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/validation.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/validation.cpython-313.pyc
Normal file
Binary file not shown.
331
backend/app/services/audit_logger.py
Normal file
331
backend/app/services/audit_logger.py
Normal file
|
|
@ -0,0 +1,331 @@
|
|||
"""Audit logging service for tracking sensitive operations."""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
from fastapi import Request
|
||||
from motor.motor_asyncio import AsyncIOMotorCollection
|
||||
|
||||
from app.core.database import get_database
|
||||
from app.core.config import get_settings
|
||||
from app.models.audit_log import (
|
||||
AuditLog,
|
||||
AuditLogCreate,
|
||||
AuditLogQuery,
|
||||
AuditLogResponse,
|
||||
AuditAction,
|
||||
AuditLogSeverity
|
||||
)
|
||||
from app.models.user import User
|
||||
from app.telemetry.tracing import trace_async_operation
|
||||
|
||||
|
||||
class AuditLogger:
    """Service for managing audit logs.

    Writes audit events to the ``audit_logs`` collection of the application
    database and provides query, reporting and retention helpers on top of it.
    """

    def __init__(self):
        self.settings = get_settings()
        # Resolved lazily by _get_collection() on first use.
        self.collection: Optional[AsyncIOMotorCollection] = None

    async def _get_collection(self) -> AsyncIOMotorCollection:
        """Get the audit logs collection, resolving it on first use."""
        # Compare with None explicitly: driver collection objects do not
        # support truth-value testing, so ``if not self.collection`` is unsafe.
        if self.collection is None:
            db = await get_database()
            self.collection = db.audit_logs
        return self.collection

    @staticmethod
    def _to_audit_logs(documents: List[Dict[str, Any]]) -> List[AuditLog]:
        """Convert raw documents to ``AuditLog`` models, skipping bad ones."""
        import logging

        logger = logging.getLogger(__name__)
        logs: List[AuditLog] = []
        for doc in documents:
            try:
                logs.append(AuditLog(**doc))
            except Exception as exc:
                # Best effort: one malformed document must not fail the whole
                # read, but it should be visible in the logs.
                logger.warning("Error converting audit log document: %s", exc)
        return logs

    @trace_async_operation("audit_logger.log_action")
    async def log_action(
        self,
        action: AuditAction,
        description: str,
        user: Optional[User] = None,
        request: Optional[Request] = None,
        resource_type: Optional[str] = None,
        resource_id: Optional[str] = None,
        resource_name: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
        severity: AuditLogSeverity = AuditLogSeverity.INFO,
        success: bool = True,
        error_message: Optional[str] = None
    ) -> str:
        """
        Log an audit event.

        Args:
            action: The audited action.
            description: Human-readable summary of the event.
            user: Acting user, if any; id, email and role are recorded.
            request: Incoming request, if any; IP address, user agent and
                request id are taken from it.
            resource_type: Kind of resource affected.
            resource_id: Identifier of the resource affected.
            resource_name: Display name of the resource affected.
            details: Extra structured data; stored as ``{}`` when omitted.
            severity: Severity of the event.
            success: Whether the audited operation succeeded.
            error_message: Error text for failed operations.

        Returns:
            The ID of the created audit log entry.
        """

        # Extract request context
        ip_address = None
        user_agent = None
        request_id = None

        if request:
            # Get IP address (handle forwarded headers). The first entry of
            # X-Forwarded-For is used; the header is client-supplied.
            forwarded_for = request.headers.get("X-Forwarded-For")
            if forwarded_for:
                ip_address = forwarded_for.split(',')[0].strip()
            elif request.client:
                ip_address = request.client.host

            user_agent = request.headers.get("User-Agent")
            # Fall back to a fresh UUID when the caller sent no request id.
            request_id = request.headers.get("X-Request-ID", str(uuid.uuid4()))

        # The role may be an enum member or already a plain string (models
        # configured to store enum values); accept both.
        user_role = None
        if user:
            user_role = getattr(user.role, "value", user.role)

        # Create audit log entry
        audit_log = AuditLog(
            action=action,
            severity=severity,
            description=description,
            user_id=user.id if user else None,
            user_email=user.email if user else None,
            user_role=user_role,
            ip_address=ip_address,
            user_agent=user_agent,
            request_id=request_id,
            resource_type=resource_type,
            resource_id=resource_id,
            resource_name=resource_name,
            details=details or {},
            success=success,
            error_message=error_message,
            environment=self.settings.app_env,
            service_name="accessible-video-api",
            api_version="v1"
        )

        # Save to database
        document = audit_log.dict(by_alias=True)
        # Defensive: if the model serialises an unset ``_id`` as None, drop it
        # so the database assigns a fresh id instead of storing a null key.
        if document.get("_id") is None:
            document.pop("_id", None)

        collection = await self._get_collection()
        result = await collection.insert_one(document)

        return str(result.inserted_id)

    @trace_async_operation("audit_logger.query_logs")
    async def query_logs(self, query: AuditLogQuery) -> AuditLogResponse:
        """Query audit logs with filtering and pagination.

        Args:
            query: Filters, search text, pagination and sort options.

        Returns:
            The matching page of logs together with the total match count.
        """
        import re

        collection = await self._get_collection()

        # Build MongoDB query
        mongo_query: Dict[str, Any] = {}

        # Time range filter
        if query.start_date or query.end_date:
            timestamp_filter = {}
            if query.start_date:
                timestamp_filter["$gte"] = query.start_date
            if query.end_date:
                timestamp_filter["$lte"] = query.end_date
            mongo_query["timestamp"] = timestamp_filter

        # Exact match filters
        if query.action:
            mongo_query["action"] = query.action
        if query.severity:
            mongo_query["severity"] = query.severity
        if query.user_id:
            mongo_query["user_id"] = query.user_id
        if query.user_email:
            mongo_query["user_email"] = query.user_email
        if query.resource_type:
            mongo_query["resource_type"] = query.resource_type
        if query.resource_id:
            mongo_query["resource_id"] = query.resource_id
        if query.success is not None:
            mongo_query["success"] = query.success

        # Text search. The term is user input, so escape it and match it
        # literally instead of letting it be interpreted as a regex.
        # NOTE(review): ``details`` is stored as a dict, so a $regex on that
        # field may never match -- confirm whether it should be searched.
        if query.search:
            escaped = re.escape(query.search)
            mongo_query["$or"] = [
                {field: {"$regex": escaped, "$options": "i"}}
                for field in ("description", "details", "error_message")
            ]

        # Get total count
        total_count = await collection.count_documents(mongo_query)

        # Execute query with sorting and pagination
        cursor = (
            collection.find(mongo_query)
            .sort(query.sort_by, query.sort_order)
            .skip(query.skip)
            .limit(query.limit)
        )
        documents = await cursor.to_list(length=query.limit)

        # Convert to Pydantic models (malformed documents are skipped)
        logs = self._to_audit_logs(documents)

        # Calculate pagination info. Guard the division so a zero limit
        # cannot raise ZeroDivisionError, and base ``has_more`` on the number
        # of documents fetched rather than the number successfully converted.
        page = (query.skip // max(query.limit, 1)) + 1
        has_more = (query.skip + len(documents)) < total_count

        return AuditLogResponse(
            logs=logs,
            total_count=total_count,
            page=page,
            page_size=len(logs),
            has_more=has_more
        )

    async def get_user_activity(self, user_id: str, days: int = 30) -> List[AuditLog]:
        """Get recent activity for a specific user.

        Returns up to 1000 entries, newest first, starting at midnight (UTC,
        naive) ``days`` days ago.
        """

        from_date = datetime.utcnow().replace(
            hour=0, minute=0, second=0, microsecond=0
        ) - timedelta(days=days)

        query = AuditLogQuery(
            user_id=user_id,
            start_date=from_date,
            limit=1000,
            sort_by="timestamp",
            sort_order=-1
        )

        response = await self.query_logs(query)
        return response.logs

    async def get_security_events(self, hours: int = 24) -> List[AuditLog]:
        """Get recent security-related events.

        Returns up to 1000 entries from the last ``hours`` hours, newest
        first, restricted to the security-related actions listed below.
        """

        from_date = datetime.utcnow() - timedelta(hours=hours)

        security_actions = [
            AuditAction.LOGIN_FAILURE,
            AuditAction.RATE_LIMIT_EXCEEDED,
            AuditAction.VALIDATION_FAILURE,
            AuditAction.UNAUTHORIZED_ACCESS,
            AuditAction.SUSPICIOUS_ACTIVITY
        ]

        collection = await self._get_collection()

        query = {
            "timestamp": {"$gte": from_date},
            "action": {"$in": security_actions}
        }

        cursor = collection.find(query).sort("timestamp", -1).limit(1000)
        documents = await cursor.to_list(length=1000)

        return self._to_audit_logs(documents)

    async def cleanup_old_logs(self, retention_days: int = 365) -> int:
        """Clean up audit logs older than retention period.

        Deletes every entry whose timestamp is before midnight (UTC, naive)
        ``retention_days`` days ago and returns the number of deleted entries.
        """

        cutoff_date = datetime.utcnow().replace(
            hour=0, minute=0, second=0, microsecond=0
        ) - timedelta(days=retention_days)

        collection = await self._get_collection()
        result = await collection.delete_many({
            "timestamp": {"$lt": cutoff_date}
        })

        return result.deleted_count
|
||||
|
||||
|
||||
# Global audit logger instance
|
||||
audit_logger = AuditLogger()
|
||||
|
||||
|
||||
# Convenience functions for common audit operations
|
||||
async def log_auth_success(user: User, request: Request):
    """Record a successful login of *user* in the audit log."""
    message = f"User {user.email} logged in successfully"
    await audit_logger.log_action(
        action=AuditAction.LOGIN_SUCCESS,
        description=message,
        user=user,
        request=request,
        severity=AuditLogSeverity.INFO,
    )
|
||||
|
||||
|
||||
async def log_auth_failure(email: str, request: Request, reason: str):
    """Record a failed login attempt for *email*, with the failure *reason*."""
    message = f"Failed login attempt for {email}: {reason}"
    # No user object exists for a failed login, so the attempted address is
    # kept in the details payload.
    extra = {"attempted_email": email}
    await audit_logger.log_action(
        action=AuditAction.LOGIN_FAILURE,
        description=message,
        request=request,
        severity=AuditLogSeverity.WARNING,
        success=False,
        error_message=reason,
        details=extra,
    )
|
||||
|
||||
|
||||
async def log_job_action(action: AuditAction, job_id: str, user: User, request: Request, details: Optional[Dict] = None):
    """Record a job-related action performed by *user* on job *job_id*."""
    labels = {
        AuditAction.JOB_CREATE: "Job created",
        AuditAction.JOB_APPROVE: "Job approved",
        AuditAction.JOB_REJECT: "Job rejected",
        AuditAction.JOB_CANCEL: "Job cancelled",
        AuditAction.JOB_UPDATE: "Job updated",
    }
    # Actions without a friendly label fall back to their string form.
    label = labels.get(action, str(action))
    summary = f"{label} by {user.email}"

    await audit_logger.log_action(
        action=action,
        description=summary,
        user=user,
        request=request,
        resource_type="job",
        resource_id=job_id,
        details=details,
    )
|
||||
|
||||
|
||||
async def log_user_management(action: AuditAction, target_user_id: str, admin_user: User, request: Request, details: Optional[Dict] = None):
    """Record a user-management action by *admin_user* on *target_user_id*."""
    labels = {
        AuditAction.USER_CREATE: "User created",
        AuditAction.USER_UPDATE: "User updated",
        AuditAction.USER_DELETE: "User deleted",
        AuditAction.USER_ROLE_CHANGE: "User role changed",
        AuditAction.USER_ACTIVATE: "User activated",
        AuditAction.USER_DEACTIVATE: "User deactivated",
    }
    # Actions without a friendly label fall back to their string form.
    label = labels.get(action, str(action))
    summary = f"{label} by admin {admin_user.email}"

    await audit_logger.log_action(
        action=action,
        description=summary,
        user=admin_user,
        request=request,
        resource_type="user",
        resource_id=target_user_id,
        details=details,
        severity=AuditLogSeverity.INFO,
    )
|
||||
|
||||
|
||||
async def log_security_event(action: AuditAction, description: str, request: Request, user: Optional[User] = None, details: Optional[Dict] = None):
    """Record a security-related event; always logged as unsuccessful."""
    # Suspicious activity is critical; every other security event is a warning.
    level = AuditLogSeverity.CRITICAL
    if action != AuditAction.SUSPICIOUS_ACTIVITY:
        level = AuditLogSeverity.WARNING

    await audit_logger.log_action(
        action=action,
        description=description,
        user=user,
        request=request,
        severity=level,
        success=False,
        details=details,
    )
|
||||
123
backend/app/services/emailer.py
Normal file
123
backend/app/services/emailer.py
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
|
||||
from jinja2 import Template
|
||||
from sendgrid import SendGridAPIClient
|
||||
from sendgrid.helpers.mail import Content, From, Mail, Subject, To
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class EmailService:
    """Sends job completion emails through SendGrid.

    When no SendGrid API key is configured the service is created without a
    client and every send attempt returns ``False``.
    """

    def __init__(self):
        if settings.sendgrid_api_key:
            self.client = SendGridAPIClient(api_key=settings.sendgrid_api_key)
        else:
            logger.warning("SendGrid API key not configured")
            self.client = None

    async def send_completion_email(
        self,
        recipient_email: str,
        job_title: str,
        download_links: dict[str, dict[str, str]]
    ) -> bool:
        """Send job completion email with download links.

        Args:
            recipient_email: Address the email is sent to.
            job_title: Title of the job, used in the subject and body.
            download_links: Mapping of language -> {file_type: url}.

        Returns:
            True when SendGrid answers with status 202, otherwise False.
            Failures are logged and never raised.
        """
        import asyncio

        if not self.client:
            logger.error("SendGrid not configured, cannot send email")
            return False

        try:
            # Render email template
            html_content = self._render_completion_template(
                job_title=job_title,
                download_links=download_links
            )

            message = Mail(
                from_email=From(settings.email_from, "Accessible Video Platform"),
                to_emails=To(recipient_email),
                subject=Subject(f"Your accessible video assets are ready: {job_title}"),
                html_content=Content("text/html", html_content)
            )

            # The client call is synchronous; run it in a worker thread so it
            # does not block the event loop while the HTTP request is pending.
            response = await asyncio.to_thread(self.client.send, message)

            if response.status_code == 202:
                logger.info(f"Completion email sent successfully to {recipient_email}")
                return True
            else:
                logger.error(f"Failed to send email, status code: {response.status_code}")
                return False

        except Exception as e:
            logger.error(f"Email sending failed: {e}")
            return False

    def _render_completion_template(
        self,
        job_title: str,
        download_links: dict[str, dict[str, str]]
    ) -> str:
        """Render the completion email HTML template.

        Args:
            job_title: Title shown as the email heading.
            download_links: Mapping of language -> {file_type: url}; one
                section is rendered per language and one link per file.

        Returns:
            The rendered HTML document.
        """
        template_str = """
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <title>Your Accessible Video Assets Are Ready</title>
            <style>
                body { font-family: Arial, sans-serif; line-height: 1.6; color: #333; }
                .container { max-width: 600px; margin: 0 auto; padding: 20px; }
                .header { background-color: #4f46e5; color: white; padding: 20px; text-align: center; }
                .content { padding: 20px; }
                .download-section { margin: 20px 0; padding: 15px; background-color: #f9fafb; border-radius: 8px; }
                .download-link { display: inline-block; padding: 10px 20px; margin: 5px; background-color: #4f46e5; color: white; text-decoration: none; border-radius: 5px; }
                .footer { text-align: center; padding: 20px; color: #6b7280; font-size: 12px; }
            </style>
        </head>
        <body>
            <div class="container">
                <div class="header">
                    <h1>Your Accessible Video Assets Are Ready!</h1>
                </div>

                <div class="content">
                    <h2>{{ job_title }}</h2>

                    <p>Great news! Your video accessibility assets have been processed and are ready for download.</p>

                    {% for language, files in download_links.items() %}
                    <div class="download-section">
                        <h3>{{ language.upper() }} Assets</h3>
                        {% for file_type, url in files.items() %}
                        <a href="{{ url }}" class="download-link">
                            Download {{ file_type|replace('_', ' ')|title }}
                        </a>
                        {% endfor %}
                    </div>
                    {% endfor %}

                    <p><strong>Important:</strong> These download links will expire in 24 hours for security purposes.</p>

                    <p>If you need assistance or have questions about your accessible video assets, please don't hesitate to contact our support team.</p>
                </div>

                <div class="footer">
                    <p>This email was sent by the Accessible Video Platform</p>
                    <p>Links expire in 24 hours for security</p>
                </div>
            </div>
        </body>
        </html>
        """

        # Autoescape all interpolated values: the job title is user-supplied
        # and must not be able to inject markup into the email, and URLs
        # containing ``&`` must be entity-encoded inside the href attribute.
        template = Template(template_str, autoescape=True)
        return template.render(
            job_title=job_title,
            download_links=download_links
        )
|
||||
|
||||
|
||||
# Global service instance
|
||||
email_service = EmailService()
|
||||
168
backend/app/services/gcs.py
Normal file
168
backend/app/services/gcs.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import HTTPException, UploadFile
|
||||
from google.cloud import storage
|
||||
from google.cloud.exceptions import NotFound
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class GCSService:
    """Async facade over a single Google Cloud Storage bucket.

    The storage client calls used here are blocking, so every operation is
    executed on a small private thread pool instead of the event loop.
    """

    def __init__(self):
        # Project id and bucket name are read from application settings.
        self.client = storage.Client(project=settings.gcp_project_id)
        self.bucket = self.client.bucket(settings.gcs_bucket)
        self.executor = ThreadPoolExecutor(max_workers=4)

    async def _run_blocking(self, func):
        """Run the zero-argument callable *func* on the service thread pool."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self.executor, func)

    async def upload_file_to_gcs(
        self,
        file: UploadFile,
        destination_path: str,
        content_type: Optional[str] = None
    ) -> str:
        """Upload *file* to the bucket and return its ``gs://`` URI.

        Args:
            file: Uploaded file whose underlying stream is sent to GCS.
            destination_path: Object name inside the bucket.
            content_type: Explicit content type; when omitted, the content
                type reported by *file* is used if it has one.

        Raises:
            HTTPException: 500 when the upload fails for any reason.
        """
        def _upload():
            blob = self.bucket.blob(destination_path)

            # An explicit content type wins over the one carried by the upload.
            resolved_type = content_type or file.content_type
            if resolved_type:
                blob.content_type = resolved_type

            file.file.seek(0)  # Reset file pointer
            blob.upload_from_file(file.file)

            return f"gs://{settings.gcs_bucket}/{destination_path}"

        try:
            return await self._run_blocking(_upload)
        except Exception as e:
            logger.error(f"Failed to upload file to GCS: {e}")
            raise HTTPException(status_code=500, detail="File upload failed") from e

    async def upload_text_to_gcs(
        self,
        content: str,
        destination_path: str,
        content_type: str = "text/plain"
    ) -> str:
        """Upload text *content* to the bucket and return its ``gs://`` URI.

        Raises:
            HTTPException: 500 when the upload fails for any reason.
        """
        def _upload():
            blob = self.bucket.blob(destination_path)
            blob.content_type = content_type
            blob.upload_from_string(content, content_type=content_type)

            return f"gs://{settings.gcs_bucket}/{destination_path}"

        try:
            return await self._run_blocking(_upload)
        except Exception as e:
            logger.error(f"Failed to upload text to GCS: {e}")
            raise HTTPException(status_code=500, detail="Text upload failed") from e

    async def get_signed_url(
        self,
        blob_path: str,
        expiration_hours: int = 24,
        method: str = "GET"
    ) -> str:
        """Generate a V4 signed URL for an existing object.

        Args:
            blob_path: Object name inside the bucket.
            expiration_hours: Lifetime of the URL in hours.
            method: HTTP method the URL is signed for.

        Raises:
            HTTPException: 404 when the object does not exist, 500 when
                signing fails for any other reason.
        """
        def _get_signed_url():
            blob = self.bucket.blob(blob_path)

            # Refuse to sign URLs for objects that are not there.
            if not blob.exists():
                raise NotFound(f"File not found: {blob_path}")

            # A relative timedelta avoids building a naive UTC timestamp
            # (datetime.utcnow() is deprecated).
            return blob.generate_signed_url(
                expiration=timedelta(hours=expiration_hours),
                method=method,
                version="v4"
            )

        try:
            return await self._run_blocking(_get_signed_url)
        except NotFound:
            raise HTTPException(status_code=404, detail="File not found")
        except Exception as e:
            logger.error(f"Failed to generate signed URL: {e}")
            raise HTTPException(status_code=500, detail="Failed to generate download URL") from e

    async def delete_file(self, blob_path: str) -> bool:
        """Delete an object from the bucket.

        Returns:
            True when the object was deleted, False when it did not exist.

        Raises:
            HTTPException: 500 when deletion fails for any other reason.
        """
        def _delete():
            blob = self.bucket.blob(blob_path)
            blob.delete()
            return True

        try:
            return await self._run_blocking(_delete)
        except NotFound:
            return False
        except Exception as e:
            logger.error(f"Failed to delete file from GCS: {e}")
            raise HTTPException(status_code=500, detail="File deletion failed") from e

    async def file_exists(self, blob_path: str) -> bool:
        """Return whether an object exists in the bucket.

        Errors from the storage client propagate unchanged.
        """
        def _exists():
            blob = self.bucket.blob(blob_path)
            return blob.exists()

        return await self._run_blocking(_exists)
|
||||
|
||||
|
||||
# Global GCS service instance
|
||||
gcs_service = GCSService()
|
||||
|
||||
# Convenience functions
|
||||
async def upload_file_to_gcs(file: UploadFile, destination_path: str) -> str:
    """Upload *file* via the shared ``gcs_service`` and return its result."""
    gcs_uri = await gcs_service.upload_file_to_gcs(file, destination_path)
    return gcs_uri
|
||||
|
||||
async def upload_vtt_to_gcs(content: str, destination_path: str) -> str:
    """Upload WebVTT text via the shared ``gcs_service`` as ``text/vtt``."""
    vtt_content_type = "text/vtt"
    gcs_uri = await gcs_service.upload_text_to_gcs(content, destination_path, vtt_content_type)
    return gcs_uri
|
||||
|
||||
async def upload_json_to_gcs(content: str, destination_path: str) -> str:
    """Upload JSON text via the shared ``gcs_service`` as ``application/json``."""
    json_content_type = "application/json"
    gcs_uri = await gcs_service.upload_text_to_gcs(content, destination_path, json_content_type)
    return gcs_uri
|
||||
|
||||
async def get_signed_download_url(blob_path: str, expiration_hours: int = 24) -> str:
    """Return a signed URL for *blob_path* from the shared ``gcs_service``."""
    signed_url = await gcs_service.get_signed_url(blob_path, expiration_hours)
    return signed_url
|
||||
|
||||
async def generate_signed_upload_url(
    blob_path: str,
    content_type: str,
    max_size: int = 1024 * 1024 * 1024  # 1GB
) -> dict:
    """Generate a signed POST policy for direct browser-to-GCS upload.

    Args:
        blob_path: Object name the browser is allowed to upload to.
        content_type: Content type the upload must declare; its major type
            (the part before ``/``) is also enforced as a prefix condition.
        max_size: Largest accepted upload in bytes.

    Returns:
        A dict with ``url`` (the POST target) and ``fields`` (form fields the
        browser must send along with the file).
    """
    def _generate():
        # The V4 POST policy generator lives on the storage client (Blob has
        # no such method) and returns a dict with "url" and "fields".
        policy = gcs_service.client.generate_signed_post_policy_v4(
            gcs_service.bucket.name,
            blob_path,
            expiration=timedelta(hours=1),
            conditions=[
                ["content-length-range", 1, max_size],
                ["starts-with", "$Content-Type", content_type.split("/")[0]]
            ],
            fields={
                "Content-Type": content_type
            }
        )

        return {"url": policy["url"], "fields": policy["fields"]}

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(gcs_service.executor, _generate)
|
||||
350
backend/app/services/gemini.py
Normal file
350
backend/app/services/gemini.py
Normal file
|
|
@ -0,0 +1,350 @@
|
|||
import json
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import google.genai as genai
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Configure Gemini client
|
||||
client = genai.Client(api_key=settings.gemini_api_key)
|
||||
|
||||
class GeminiService:
|
||||
def __init__(self):
|
||||
self.model_name = 'gemini-2.5-pro' # Stable production model
|
||||
self.prompts_dir = Path(__file__).parent.parent / "prompts"
|
||||
|
||||
def _load_prompt(self, prompt_file: str) -> str:
|
||||
"""Load prompt template from prompts directory"""
|
||||
prompt_path = self.prompts_dir / prompt_file
|
||||
try:
|
||||
return prompt_path.read_text()
|
||||
except FileNotFoundError:
|
||||
logger.error(f"Prompt file not found: {prompt_file}")
|
||||
raise
|
||||
|
||||
async def _wait_for_file_active(self, file_name: str, max_wait_seconds: int = 300) -> bool:
|
||||
"""Wait for uploaded file to become ACTIVE state"""
|
||||
wait_time = 1 # Start with 1 second
|
||||
total_waited = 0
|
||||
|
||||
while total_waited < max_wait_seconds:
|
||||
try:
|
||||
# Get file status
|
||||
file_info = client.files.get(name=file_name)
|
||||
logger.info(f"File {file_name} status: {file_info.state} (waited {total_waited}s)")
|
||||
|
||||
if file_info.state == "ACTIVE":
|
||||
logger.info(f"File {file_name} is now ACTIVE!")
|
||||
return True
|
||||
elif file_info.state == "FAILED":
|
||||
logger.error(f"File {file_name} processing FAILED")
|
||||
return False
|
||||
|
||||
# Wait with exponential backoff (max 30s)
|
||||
logger.info(f"File not ready, waiting {wait_time}s...")
|
||||
await asyncio.sleep(wait_time)
|
||||
total_waited += wait_time
|
||||
wait_time = min(wait_time * 1.5, 30) # Exponential backoff, max 30s
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking file status: {e}")
|
||||
await asyncio.sleep(5) # Wait 5s on error
|
||||
total_waited += 5
|
||||
|
||||
logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s")
|
||||
return False
|
||||
|
||||
async def extract_accessibility(self, video_file_path: str) -> dict[str, Any]:
|
||||
"""
|
||||
Extract captions and audio descriptions from video using Gemini 2.0
|
||||
Returns structured JSON with transcript, captions VTT, and audio description VTT
|
||||
"""
|
||||
prompt = self._load_prompt("gemini_ingestion.md")
|
||||
|
||||
try:
|
||||
logger.info(f"Starting Gemini processing for video: {video_file_path}")
|
||||
|
||||
# Upload video file to Gemini using new API
|
||||
logger.info("Uploading video file to Gemini API...")
|
||||
uploaded_file = client.files.upload(
|
||||
file=video_file_path,
|
||||
config={
|
||||
"display_name": f"video_processing_{Path(video_file_path).name}",
|
||||
"mime_type": "video/mp4"
|
||||
}
|
||||
)
|
||||
logger.info(f"Successfully uploaded file: {uploaded_file.name} (URI: {uploaded_file.uri})")
|
||||
|
||||
# Wait for file to become ACTIVE before using it
|
||||
logger.info("Waiting for file to become ACTIVE...")
|
||||
file_ready = await self._wait_for_file_active(uploaded_file.name)
|
||||
if not file_ready:
|
||||
raise Exception("File failed to become ACTIVE within timeout")
|
||||
|
||||
# Generate content using new API
|
||||
logger.info("Generating content with Gemini model...")
|
||||
response = client.models.generate_content(
|
||||
model=self.model_name,
|
||||
contents=[
|
||||
genai.types.Part.from_text(text=prompt),
|
||||
genai.types.Part.from_uri(
|
||||
file_uri=uploaded_file.uri,
|
||||
mime_type=uploaded_file.mime_type
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
# Parse JSON response
|
||||
response_text = response.text.strip()
|
||||
logger.info(f"Received Gemini response (first 200 chars): {response_text[:200]}...")
|
||||
|
||||
# Handle potential markdown formatting
|
||||
if response_text.startswith("```json"):
|
||||
response_text = response_text.replace("```json", "").replace("```", "").strip()
|
||||
logger.info("Cleaned markdown formatting from response")
|
||||
|
||||
# Additional cleanup for common JSON issues
|
||||
response_text = response_text.strip()
|
||||
|
||||
logger.info("Parsing JSON response...")
|
||||
try:
|
||||
result = json.loads(response_text)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"JSON parse error at position {e.pos}: {e.msg}")
|
||||
# Log the problematic area
|
||||
start = max(0, e.pos - 100)
|
||||
end = min(len(response_text), e.pos + 100)
|
||||
problematic_text = response_text[start:end]
|
||||
logger.error(f"Problematic JSON area: ...{problematic_text}...")
|
||||
raise
|
||||
|
||||
# Validate required fields
|
||||
required_fields = [
|
||||
"language", "confidence", "summary",
|
||||
"transcript_plaintext", "captions_vtt", "audio_description_vtt"
|
||||
]
|
||||
|
||||
for field in required_fields:
|
||||
if field not in result:
|
||||
raise ValueError(f"Missing required field: {field}")
|
||||
|
||||
# Validate VTT format
|
||||
if not result["captions_vtt"].startswith("WEBVTT"):
|
||||
raise ValueError("Invalid captions VTT format")
|
||||
|
||||
if not result["audio_description_vtt"].startswith("WEBVTT"):
|
||||
raise ValueError("Invalid audio description VTT format")
|
||||
|
||||
logger.info(
|
||||
f"Successfully extracted accessibility content with confidence: {result['confidence']}"
|
||||
)
|
||||
|
||||
# Clean up uploaded file
|
||||
try:
|
||||
client.files.delete(name=uploaded_file.name)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to cleanup uploaded file: {e}")
|
||||
|
||||
return result
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse Gemini JSON response: {e}")
|
||||
logger.error(f"Raw response that failed to parse: {response_text}")
|
||||
# Attempt self-healing
|
||||
return await self._self_heal_response(video_file_path, response_text)
|
||||
except Exception as e:
|
||||
logger.error(f"Gemini extraction failed with exception: {type(e).__name__}: {str(e)}")
|
||||
logger.error(f"Video file path: {video_file_path}")
|
||||
# Print to stdout for immediate visibility
|
||||
print(f"🚨 GEMINI ERROR: {type(e).__name__}: {str(e)}")
|
||||
raise
|
||||
|
||||
async def _self_heal_response(self, video_file_path: str, invalid_response: str) -> dict[str, Any]:
|
||||
"""Attempt to self-heal invalid JSON response from Gemini"""
|
||||
logger.info("Attempting to self-heal JSON response without re-uploading video")
|
||||
|
||||
# Try to fix common JSON issues first
|
||||
try:
|
||||
fixed_response = self._attempt_json_fix(invalid_response)
|
||||
if fixed_response:
|
||||
logger.info("Successfully fixed JSON without re-processing")
|
||||
return fixed_response
|
||||
except Exception as e:
|
||||
logger.warning(f"JSON fix attempt failed: {e}")
|
||||
|
||||
# If simple fixes don't work, try a text-only self-heal prompt with more context
|
||||
self_heal_prompt = f"""
|
||||
SYSTEM: You are a JSON repair service. Fix the malformed JSON below and return ONLY the corrected JSON.
|
||||
|
||||
CRITICAL REQUIREMENTS:
|
||||
- The JSON MUST contain these exact fields: language, confidence, summary, transcript_plaintext, captions_vtt, audio_description_vtt
|
||||
- If audio_description_vtt is truncated or missing, reconstruct it as a valid WebVTT with at least basic descriptions
|
||||
- All VTT content must start with "WEBVTT" and have proper timestamp format (HH:MM:SS.mmm --> HH:MM:SS.mmm)
|
||||
- Properly escape all quotes within strings using \"
|
||||
- Fix unterminated strings by adding closing quotes
|
||||
- Remove trailing commas
|
||||
- Ensure all JSON is properly closed with }}
|
||||
|
||||
Fix the JSON and return it:
|
||||
|
||||
{invalid_response}
|
||||
"""
|
||||
|
||||
try:
|
||||
response = client.models.generate_content(
|
||||
model=self.model_name,
|
||||
contents=[genai.types.Part.from_text(text=self_heal_prompt)]
|
||||
)
|
||||
|
||||
response_text = response.text.strip()
|
||||
|
||||
# Handle potential markdown formatting
|
||||
if response_text.startswith("```json"):
|
||||
response_text = response_text.replace("```json", "").replace("```", "").strip()
|
||||
|
||||
result = json.loads(response_text)
|
||||
|
||||
# Validate that all required fields are present after healing
|
||||
required_fields = [
|
||||
"language", "confidence", "summary",
|
||||
"transcript_plaintext", "captions_vtt", "audio_description_vtt"
|
||||
]
|
||||
|
||||
missing_fields = [field for field in required_fields if field not in result]
|
||||
if missing_fields:
|
||||
logger.error(f"Self-heal lost required fields: {missing_fields}")
|
||||
# If audio_description_vtt is missing, create a basic one
|
||||
if "audio_description_vtt" in missing_fields:
|
||||
logger.info("Creating fallback audio_description_vtt")
|
||||
result["audio_description_vtt"] = "WEBVTT\n\n00:00:00.000 --> 00:00:05.000\nVideo content with visual elements described."
|
||||
|
||||
# If other critical fields are missing, raise an error
|
||||
remaining_missing = [f for f in missing_fields if f != "audio_description_vtt"]
|
||||
if remaining_missing:
|
||||
raise ValueError(f"Self-heal failed to preserve required fields: {remaining_missing}")
|
||||
|
||||
logger.info("Successfully self-healed Gemini response with all required fields")
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Self-heal attempt failed: {e}")
|
||||
raise ValueError("Failed to get valid JSON from Gemini after self-heal attempt")
|
||||
|
||||
def _attempt_json_fix(self, json_text: str) -> dict[str, Any] | None:
|
||||
"""Attempt to fix common JSON syntax issues"""
|
||||
# Try to identify and fix common issues
|
||||
fixes_tried = []
|
||||
fixed_text = json_text
|
||||
import re
|
||||
|
||||
# Fix 1: Remove trailing commas
|
||||
fixed_text = re.sub(r',(\s*[}\]])', r'\1', fixed_text)
|
||||
fixes_tried.append("removed trailing commas")
|
||||
|
||||
# Fix 2: Try to fix unterminated strings by adding closing quote and brace
|
||||
if fixed_text.count('"') % 2 != 0: # Odd number of quotes suggests unterminated string
|
||||
# Find the last quote and see if we need to close the JSON
|
||||
last_quote_pos = fixed_text.rfind('"')
|
||||
remainder = fixed_text[last_quote_pos + 1:].strip()
|
||||
|
||||
# If there's no closing brace after the last quote, try to fix it
|
||||
if remainder and not remainder.endswith('}'):
|
||||
# Try to intelligently close the JSON
|
||||
if 'audio_description_vtt' in fixed_text[max(0, last_quote_pos - 100):]:
|
||||
# This appears to be in the audio_description_vtt field
|
||||
fixed_text += '"\n}'
|
||||
fixes_tried.append("closed unterminated audio_description_vtt string")
|
||||
else:
|
||||
fixed_text += '"'
|
||||
fixes_tried.append("closed unterminated string")
|
||||
|
||||
# Fix 3: Ensure JSON ends with closing brace
|
||||
if not fixed_text.rstrip().endswith('}'):
|
||||
fixed_text = fixed_text.rstrip() + '\n}'
|
||||
fixes_tried.append("added closing brace")
|
||||
|
||||
try:
|
||||
result = json.loads(fixed_text)
|
||||
logger.info(f"JSON fixed with: {', '.join(fixes_tried)}")
|
||||
|
||||
# Validate that we have the required fields
|
||||
required_fields = [
|
||||
"language", "confidence", "summary",
|
||||
"transcript_plaintext", "captions_vtt", "audio_description_vtt"
|
||||
]
|
||||
|
||||
missing_fields = [field for field in required_fields if field not in result]
|
||||
if missing_fields:
|
||||
logger.warning(f"Fixed JSON is missing required fields: {missing_fields}")
|
||||
return None # Let the more advanced self-healing handle this
|
||||
|
||||
return result
|
||||
except json.JSONDecodeError as e:
|
||||
logger.debug(f"JSON fix attempt failed: {e}")
|
||||
return None
|
||||
|
||||
async def transcreate_content(
|
||||
self,
|
||||
captions_vtt: str,
|
||||
ad_vtt: str,
|
||||
target_language: str,
|
||||
brief: Optional[str] = None
|
||||
) -> dict[str, str]:
|
||||
"""
|
||||
Transcreate English VTT content to target language with cultural adaptation
|
||||
"""
|
||||
prompt_template = self._load_prompt("gemini_transcreation.md")
|
||||
|
||||
# Format prompt with actual content
|
||||
prompt = prompt_template.format(
|
||||
TARGET_LANGUAGE=target_language
|
||||
)
|
||||
|
||||
user_prompt = f"""
|
||||
Input:
|
||||
- captions_vtt_en: {captions_vtt}
|
||||
- ad_vtt_en: {ad_vtt}
|
||||
- brief: {brief or "No specific brand guidelines provided"}
|
||||
|
||||
Output:
|
||||
JSON:
|
||||
"""
|
||||
|
||||
try:
|
||||
response = client.models.generate_content(
|
||||
model=self.model_name,
|
||||
contents=[
|
||||
genai.types.Part.from_text(text=prompt + "\n\n" + user_prompt)
|
||||
]
|
||||
)
|
||||
|
||||
response_text = response.text.strip()
|
||||
|
||||
# Handle potential markdown formatting
|
||||
if response_text.startswith("```json"):
|
||||
response_text = response_text.replace("```json", "").replace("```", "").strip()
|
||||
|
||||
result = json.loads(response_text)
|
||||
|
||||
# Validate required fields
|
||||
if "captions_vtt" not in result or "audio_description_vtt" not in result:
|
||||
raise ValueError("Missing required VTT fields in transcreation response")
|
||||
|
||||
logger.info(f"Successfully transcreated content to {target_language}")
|
||||
return result
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse transcreation JSON response: {e}")
|
||||
raise ValueError("Invalid JSON response from transcreation")
|
||||
except Exception as e:
|
||||
logger.error(f"Transcreation failed: {e}")
|
||||
raise
|
||||
|
||||
|
||||
# Global service instance
|
||||
gemini_service = GeminiService()
|
||||
284
backend/app/services/secrets_manager.py
Normal file
284
backend/app/services/secrets_manager.py
Normal file
|
|
@ -0,0 +1,284 @@
|
|||
"""Google Cloud Secret Manager integration service."""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
from typing import Dict, List, Optional, Any
|
||||
from functools import lru_cache
|
||||
from google.cloud import secretmanager
|
||||
from google.api_core import exceptions as gcp_exceptions
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging import get_logger
|
||||
from app.telemetry.tracing import trace_async_operation
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class SecretManagerError(Exception):
    """Raised when a Secret Manager operation cannot be completed."""
|
||||
|
||||
|
||||
class SecretsManager:
    """Service for managing secrets via Google Cloud Secret Manager.

    Retrieved values are kept in an in-process cache for ``_cache_ttl``
    seconds so repeated lookups do not hit the API each time.
    """

    def __init__(self):
        self.settings = get_settings()
        # Created lazily by _get_client() on first use.
        self.client: Optional[secretmanager.SecretManagerServiceClient] = None
        self.project_id = self.settings.google_cloud_project
        self._cache: Dict[str, str] = {}
        # Monotonic-clock deadline per cache key; past it the entry is stale.
        self._cache_expiry: Dict[str, float] = {}
        self._cache_ttl = 300  # 5 minutes cache

    def _get_client(self) -> secretmanager.SecretManagerServiceClient:
        """Get or create the Secret Manager client.

        Raises:
            SecretManagerError: When the client cannot be constructed.
        """
        if not self.client:
            try:
                self.client = secretmanager.SecretManagerServiceClient()
                logger.info("Secret Manager client initialized")
            except Exception as e:
                logger.error(f"Failed to initialize Secret Manager client: {e}")
                raise SecretManagerError(f"Failed to initialize Secret Manager: {e}") from e

        return self.client

    @trace_async_operation("secrets_manager.get_secret")
    async def get_secret(self, secret_name: str, version: str = "latest") -> str:
        """
        Retrieve a secret from Google Cloud Secret Manager.

        Args:
            secret_name: Name of the secret
            version: Version of the secret (default: "latest")

        Returns:
            The secret value as a string

        Raises:
            SecretManagerError: If secret cannot be retrieved
        """
        import time  # local import keeps this edit independent of the file header

        cache_key = f"{secret_name}:{version}"

        # Serve from cache while the entry is fresh. Entries without a
        # recorded deadline are treated as never expiring.
        if cache_key in self._cache:
            deadline = self._cache_expiry.get(cache_key, float("inf"))
            if time.monotonic() < deadline:
                logger.debug(f"Secret {secret_name} retrieved from cache")
                return self._cache[cache_key]
            self._cache.pop(cache_key, None)
            self._cache_expiry.pop(cache_key, None)

        try:
            # Build the secret name
            name = f"projects/{self.project_id}/secrets/{secret_name}/versions/{version}"

            client = self._get_client()

            # Run in thread pool since Secret Manager client is synchronous
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                client.access_secret_version,
                {"name": name}
            )

            secret_value = response.payload.data.decode("UTF-8")

            # Cache the value together with its expiry deadline.
            self._cache[cache_key] = secret_value
            self._cache_expiry[cache_key] = time.monotonic() + self._cache_ttl

            logger.info(f"Successfully retrieved secret: {secret_name}")
            return secret_value

        except gcp_exceptions.NotFound as e:
            error_msg = f"Secret not found: {secret_name}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e

        except gcp_exceptions.PermissionDenied as e:
            error_msg = f"Permission denied accessing secret: {secret_name}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e

        except Exception as e:
            error_msg = f"Failed to retrieve secret {secret_name}: {e}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e

    @trace_async_operation("secrets_manager.get_secrets_batch")
    async def get_secrets_batch(self, secret_names: List[str]) -> Dict[str, str]:
        """
        Retrieve multiple secrets concurrently.

        Secrets that cannot be retrieved are logged and left out of the
        result; this method does not raise SecretManagerError itself.

        Args:
            secret_names: List of secret names to retrieve

        Returns:
            Dictionary mapping the names that were retrieved to their values
        """
        secrets = {}

        # Start every lookup before awaiting any, so they overlap.
        tasks = [
            (
                secret_name,
                asyncio.create_task(
                    self.get_secret(secret_name),
                    name=f"get_secret_{secret_name}"
                ),
            )
            for secret_name in secret_names
        ]

        for secret_name, task in tasks:
            try:
                secrets[secret_name] = await task
            except SecretManagerError as e:
                # Continue with other secrets
                logger.warning(f"Failed to retrieve secret {secret_name}: {e}")

        return secrets

    async def create_secret(self, secret_name: str, secret_value: str, labels: Optional[Dict[str, str]] = None) -> str:
        """
        Create a new secret in Secret Manager and store its first version.

        Args:
            secret_name: Name of the secret
            secret_value: Value to store
            labels: Optional labels for the secret

        Returns:
            The resource name of the created secret version

        Raises:
            SecretManagerError: If the secret already exists or creation fails
        """
        try:
            client = self._get_client()
            parent = f"projects/{self.project_id}"

            secret = {
                "labels": labels or {},
                "replication": {"automatic": {}}
            }

            loop = asyncio.get_running_loop()

            # Create secret resource
            create_response = await loop.run_in_executor(
                None,
                client.create_secret,
                {
                    "parent": parent,
                    "secret_id": secret_name,
                    "secret": secret
                }
            )

            # Add secret version with the actual value
            version_response = await loop.run_in_executor(
                None,
                client.add_secret_version,
                {
                    "parent": create_response.name,
                    "payload": {"data": secret_value.encode("UTF-8")}
                }
            )

            logger.info(f"Successfully created secret: {secret_name}")
            return version_response.name

        except gcp_exceptions.AlreadyExists as e:
            error_msg = f"Secret already exists: {secret_name}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e

        except Exception as e:
            error_msg = f"Failed to create secret {secret_name}: {e}"
            logger.error(error_msg)
            raise SecretManagerError(error_msg) from e

    def clear_cache(self) -> None:
        """Clear the secrets cache."""
        self._cache.clear()
        self._cache_expiry.clear()
        logger.info("Secrets cache cleared")
|
||||
|
||||
|
||||
# Global secrets manager instance
|
||||
secrets_manager = SecretsManager()
|
||||
|
||||
|
||||
# Convenience functions for common operations
|
||||
async def get_secret(secret_name: str, version: str = "latest") -> str:
    """Return a secret value from the shared ``secrets_manager``."""
    secret_value = await secrets_manager.get_secret(secret_name, version)
    return secret_value
|
||||
|
||||
|
||||
async def get_database_url() -> str:
    """Return the MongoDB connection URL.

    Reads the ``mongodb-url`` secret; when that fails, uses the
    ``MONGODB_URL`` environment variable instead.

    Raises:
        SecretManagerError: When neither source provides a URL.
    """
    try:
        mongo_url = await secrets_manager.get_secret("mongodb-url")
    except SecretManagerError:
        # Fallback to environment variable
        mongo_url = os.getenv("MONGODB_URL")
        if not mongo_url:
            raise SecretManagerError("MongoDB URL not available in secrets or environment")
    return mongo_url
|
||||
|
||||
|
||||
async def get_redis_url() -> str:
    """Return the Redis connection URL.

    Reads the ``redis-url`` secret; when that fails, uses the ``REDIS_URL``
    environment variable instead.

    Raises:
        SecretManagerError: When neither source provides a URL.
    """
    try:
        redis_url = await secrets_manager.get_secret("redis-url")
    except SecretManagerError:
        # Fallback to environment variable
        redis_url = os.getenv("REDIS_URL")
        if not redis_url:
            raise SecretManagerError("Redis URL not available in secrets or environment")
    return redis_url
|
||||
|
||||
|
||||
async def get_jwt_secrets() -> Dict[str, str]:
    """Get JWT secrets from Secret Manager, falling back to the environment.

    The batch lookup omits secrets it cannot retrieve instead of raising, so
    each missing key is filled individually from its environment variable
    (or the development default).

    Returns:
        Dict with both ``jwt-secret`` and ``jwt-refresh-secret`` present.
    """
    # secret name -> (environment variable, development default)
    fallbacks = {
        "jwt-secret": ("JWT_SECRET_KEY", "dev-secret-change-in-production"),
        "jwt-refresh-secret": ("JWT_REFRESH_SECRET_KEY", "dev-refresh-secret-change-in-production"),
    }

    try:
        jwt_secrets = await secrets_manager.get_secrets_batch(list(fallbacks))
    except SecretManagerError:
        jwt_secrets = {}

    for secret_name, (env_var, dev_default) in fallbacks.items():
        if secret_name not in jwt_secrets:
            logger.warning(f"JWT secret {secret_name} not available in Secret Manager, using environment fallback")
            jwt_secrets[secret_name] = os.getenv(env_var, dev_default)

    return jwt_secrets
|
||||
|
||||
|
||||
async def get_api_keys() -> Dict[str, str]:
    """Collect API keys from Secret Manager, then the environment.

    Keys that Secret Manager does not return are looked up in their matching
    environment variable; keys found in neither place are logged and omitted.
    """
    # Secret name -> environment variable used when the secret is missing.
    env_mapping = {
        "gemini-api-key": "GEMINI_API_KEY",
        "sendgrid-api-key": "SENDGRID_API_KEY",
        "elevenlabs-api-key": "ELEVENLABS_API_KEY",
        "sentry-dsn": "SENTRY_DSN"
    }

    found_keys = {}
    try:
        found_keys = await secrets_manager.get_secrets_batch(list(env_mapping))
    except SecretManagerError:
        logger.warning("Failed to retrieve some API keys from Secret Manager, using environment fallback")

    for secret_name, env_var in env_mapping.items():
        if secret_name in found_keys:
            continue

        env_value = os.getenv(env_var)
        if not env_value:
            logger.warning(f"API key {secret_name} not available in secrets or environment")
            continue

        found_keys[secret_name] = env_value

    return found_keys
|
||||
110
backend/app/services/translate.py
Normal file
110
backend/app/services/translate.py
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
|
||||
from google.cloud import translate_v2 as translate
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class TranslateService:
    """Translate WebVTT cue text with Google Translate, keeping cue timing."""

    def __init__(self):
        # NOTE(review): the API key only gates client creation here; the
        # client itself is built without it - confirm how it authenticates.
        if settings.translate_api_key:
            self.client = translate.Client()
        else:
            logger.warning("Google Translate API key not configured")
            self.client = None

    async def translate_vtt(self, vtt_content: str, target_language: str) -> str:
        """
        Translate VTT content while preserving timing and structure.

        Args:
            vtt_content: English WebVTT document.
            target_language: Language code to translate into.

        Returns:
            A rebuilt WebVTT document with translated cue text, or the input
            unchanged when it contains no cue text.

        Raises:
            ValueError: When the translate client is not configured, or the
                number of translations does not match the number of cues.
        """
        import asyncio  # local import keeps this edit independent of the file header

        if not self.client:
            raise ValueError("Google Translate not configured")

        cues = self._parse_vtt_cues(vtt_content)
        texts_to_translate = [cue["text"] for cue in cues]

        if not texts_to_translate:
            return vtt_content

        try:
            # The client call is blocking; run it off the event loop.
            results = await asyncio.to_thread(
                self.client.translate,
                texts_to_translate,
                target_language=target_language,
                source_language="en"
            )

            # NOTE(review): translatedText may come back HTML-escaped
            # depending on the request format - confirm before unescaping.
            if isinstance(results, list):
                translated_texts = [item["translatedText"] for item in results]
            else:
                translated_texts = [results["translatedText"]] * len(cues)

            if len(translated_texts) != len(cues):
                raise ValueError(
                    f"Expected {len(cues)} translations, got {len(translated_texts)}"
                )

            translated_cues = [
                {"start": cue["start"], "end": cue["end"], "text": text}
                for cue, text in zip(cues, translated_texts)
            ]

            return self._build_vtt(translated_cues)

        except Exception as e:
            logger.error(f"Translation failed: {e}")
            raise

    def _parse_vtt_cues(self, vtt_content: str) -> list[dict[str, str]]:
        """Parse VTT content into cues with ``start``, ``end`` and ``text``.

        A cue starts at a timing line and runs until the next blank line or
        timing line; its text lines are joined with single spaces. Lines
        outside a cue (the WEBVTT header, NOTE blocks, cue identifiers) are
        ignored, and cues without any text are dropped.
        """
        cues: list[dict[str, str]] = []
        current_cue = None

        for raw_line in vtt_content.strip().split('\n'):
            line = raw_line.strip()

            starts_new_cue = " --> " in line
            if not line or starts_new_cue:
                # A blank line or a new timing line ends the cue in progress.
                if current_cue and current_cue["text"]:
                    cues.append(current_cue)
                current_cue = None

            if starts_new_cue:
                timing_parts = line.split(" --> ")
                current_cue = {
                    "start": timing_parts[0].strip(),
                    "end": timing_parts[1].strip(),
                    "text": ""
                }
            elif line and current_cue is not None:
                if current_cue["text"]:
                    current_cue["text"] += " " + line
                else:
                    current_cue["text"] = line

        # Add final cue if exists
        if current_cue and current_cue["text"]:
            cues.append(current_cue)

        return cues

    def _build_vtt(self, cues: list[dict[str, str]]) -> str:
        """Build a WebVTT document from cues, one blank line after each cue."""
        vtt_lines = ["WEBVTT", ""]

        for cue in cues:
            vtt_lines.extend((f"{cue['start']} --> {cue['end']}", cue["text"], ""))

        return "\n".join(vtt_lines)
|
||||
|
||||
|
||||
# Module-level TranslateService instance, constructed when this module is imported
|
||||
translate_service = TranslateService()
|
||||
301
backend/app/services/tts.py
Normal file
301
backend/app/services/tts.py
Normal file
|
|
@ -0,0 +1,301 @@
|
|||
import io
|
||||
from typing import Optional
|
||||
|
||||
import aiohttp
|
||||
from google.cloud import texttospeech
|
||||
from pydub import AudioSegment
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class TTSService:
    """Render audio-description WebVTT cues into a single MP3 track.

    Google Cloud Text-to-Speech is used when ``settings.google_tts_credentials``
    is set. ElevenLabs is used when ``settings.elevenlabs_api_key`` is set,
    either as the only engine or as the fallback after a Google failure.
    """

    def __init__(self):
        # Initialize Google TTS
        if settings.google_tts_credentials:
            self.google_client = texttospeech.TextToSpeechClient()
        else:
            logger.warning("Google TTS credentials not configured")
            self.google_client = None

        # Check ElevenLabs availability
        self.elevenlabs_available = bool(settings.elevenlabs_api_key)

    async def synthesize_audio_description(
        self,
        ad_vtt_content: str,
        language_code: str = "en-US",
        voice_name: Optional[str] = None
    ) -> bytes:
        """Generate MP3 audio from audio description VTT content.

        Each cue is synthesized separately and placed on a timeline anchored
        to the cue's VTT start time. Google TTS is tried first when it is
        configured; ElevenLabs is used when Google is not configured or when
        the Google attempt raises.

        Raises:
            ValueError: if no TTS engine is configured or the VTT has no cues.
        """
        if not self.google_client:
            if not self.elevenlabs_available:
                raise ValueError("No TTS service configured")
            return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)

        try:
            return await self._synthesize_with_google(ad_vtt_content, language_code, voice_name)
        except Exception as e:
            if not self.elevenlabs_available:
                raise
            logger.warning(f"Google TTS failed, trying ElevenLabs: {e}")
            # NOTE(review): voice_name is forwarded unchanged, so a Google
            # voice name would be used as the ElevenLabs voice ID - confirm
            # this is what callers intend.
            return await self._synthesize_with_elevenlabs(ad_vtt_content, language_code, voice_name)

    async def _synthesize_with_google(
        self,
        ad_vtt_content: str,
        language_code: str = "en-US",
        voice_name: Optional[str] = None
    ) -> bytes:
        """Generate MP3 using Google TTS, one cue at a time, anchored to VTT start times.

        Raises:
            ValueError: if the VTT content contains no cues.
        """
        cues = self._parse_ad_cues(ad_vtt_content)

        if not cues:
            raise ValueError("No audio description cues found")

        async def synthesize_cue(text: str) -> bytes:
            # Ensure proper punctuation for natural TTS flow
            if not text.endswith(('.', '!', '?')):
                text += "."
            return await self._synthesize_text_google(text, language_code, voice_name)

        return await self._stitch_cues(cues, synthesize_cue)

    async def _synthesize_with_elevenlabs(
        self,
        ad_vtt_content: str,
        language_code: str = "en-US",
        voice_name: Optional[str] = None
    ) -> bytes:
        """Generate MP3 using ElevenLabs TTS, one cue at a time, anchored to VTT start times.

        Raises:
            ValueError: if the VTT content contains no cues.
        """
        cues = self._parse_ad_cues(ad_vtt_content)

        if not cues:
            raise ValueError("No audio description cues found")

        # Get voice ID for language
        voice_id = self._get_elevenlabs_voice(language_code, voice_name)

        async def synthesize_cue(text: str) -> bytes:
            return await self._synthesize_text_elevenlabs(text, voice_id)

        return await self._stitch_cues(cues, synthesize_cue)

    async def _stitch_cues(self, cues: list[dict], synthesize_cue) -> bytes:
        """Synthesize every cue and concatenate the audio into one MP3.

        ``synthesize_cue`` is an async callable taking the cue text and
        returning MP3 bytes. Silence is inserted before a cue whenever its
        VTT start time lies ahead of the audio produced so far; a cue whose
        start time has already been passed is appended immediately.
        """
        audio_segments = []
        current_audio_position = 0.0  # Seconds of audio produced so far

        for cue in cues:
            target_start_time = cue["start_time"]

            # Add silence to reach the VTT start time
            if target_start_time > current_audio_position:
                silence_duration = target_start_time - current_audio_position
                audio_segments.append(AudioSegment.silent(duration=int(silence_duration * 1000)))
                current_audio_position = target_start_time

            text = cue["text"].strip()
            if not text:
                continue

            audio_data = await synthesize_cue(text)
            audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
            audio_segments.append(audio_segment)

            # Advance by the actual audio duration (not the VTT end time);
            # len() of an AudioSegment is in milliseconds.
            current_audio_position += len(audio_segment) / 1000.0

        if audio_segments:
            final_audio = sum(audio_segments, AudioSegment.empty())
        else:
            # Fallback to one second of silence if nothing was synthesized
            final_audio = AudioSegment.silent(duration=1000)

        # Export to MP3
        output_buffer = io.BytesIO()
        final_audio.export(output_buffer, format="mp3", bitrate="128k")

        return output_buffer.getvalue()

    async def _synthesize_text_google(
        self,
        text: str,
        language_code: str,
        voice_name: Optional[str] = None
    ) -> bytes:
        """Synthesize a single text string to MP3 bytes using Google TTS."""
        import asyncio  # local import: this module does not import asyncio at file level

        # Configure voice
        if not voice_name:
            voice_name = settings.google_tts_voices.get(language_code, "en-US-Neural2-D")

        voice = texttospeech.VoiceSelectionParams(
            language_code=language_code,
            name=voice_name
        )

        # Configure audio
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            speaking_rate=1.2,  # Faster cadence for better flow
            pitch=0.0
        )

        synthesis_input = texttospeech.SynthesisInput(text=text)

        # synthesize_speech is a blocking call; run it in a worker thread so
        # the event loop is not stalled while waiting for the response.
        response = await asyncio.to_thread(
            self.google_client.synthesize_speech,
            input=synthesis_input,
            voice=voice,
            audio_config=audio_config
        )

        return response.audio_content

    async def _synthesize_text_elevenlabs(self, text: str, voice_id: str) -> bytes:
        """Synthesize text to MP3 bytes using the ElevenLabs HTTP API.

        Raises:
            ValueError: if the API responds with a status other than 200.
        """
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"

        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": settings.elevenlabs_api_key
        }

        data = {
            "text": text,
            "model_id": "eleven_multilingual_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.5,
                "style": 0.0,
                "use_speaker_boost": True
            }
        }

        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=data, headers=headers) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise ValueError(f"ElevenLabs TTS failed: {response.status} - {error_text}")
                return await response.read()

    def _get_elevenlabs_voice(self, language_code: str, voice_name: Optional[str] = None) -> str:
        """Return the ElevenLabs voice ID: ``voice_name`` if given, else the configured voice for the language."""
        if voice_name:
            return voice_name

        return settings.elevenlabs_voices.get(language_code, "21m00Tcm4TlvDq8ikWAM")

    def _parse_ad_cues(self, vtt_content: str) -> list[dict]:
        """Parse audio description VTT into cues with numeric timing.

        Returns a list of dicts with ``start_time`` and ``end_time`` (seconds,
        float) and ``text`` (the cue's text lines joined with a space). Cue
        settings after the end timestamp (e.g. ``align:start``) are ignored.
        Cues without text are dropped.

        Raises:
            ValueError: if a timing line contains a malformed timestamp.
        """
        lines = vtt_content.strip().split('\n')
        cues = []

        i = 0
        while i < len(lines):
            line = lines[i].strip()

            # Skip header, NOTE lines, cue identifiers and blank lines
            if line.startswith("NOTE") or " --> " not in line:
                i += 1
                continue

            timing_parts = line.split(" --> ")
            start_time = self._parse_timestamp(timing_parts[0].strip())
            # Only the first token after the arrow is the timestamp; anything
            # after it is cue settings.
            end_tokens = timing_parts[1].split()
            end_time = self._parse_timestamp(end_tokens[0] if end_tokens else "")

            # Collect text up to the next blank line or the next timing line
            i += 1
            text_lines = []
            while i < len(lines):
                text_line = lines[i].strip()
                if not text_line or " --> " in text_line:
                    break
                text_lines.append(text_line)
                i += 1

            if text_lines:
                cues.append({
                    "start_time": start_time,
                    "end_time": end_time,
                    "text": " ".join(text_lines)
                })

        return cues

    def _parse_timestamp(self, timestamp: str) -> float:
        """Convert a VTT timestamp (``HH:MM:SS.mmm`` or ``MM:SS.mmm``) to seconds.

        The fractional part is interpreted as a decimal fraction of a second,
        so ``"01.5"`` is 1.5 seconds; digits beyond milliseconds are
        truncated. A comma is accepted as the decimal separator.

        Raises:
            ValueError: if the timestamp does not have two or three
                colon-separated fields or a field is not numeric.
        """
        parts = timestamp.strip().split(":")

        if len(parts) == 3:  # HH:MM:SS.mmm
            hours, minutes, seconds = parts
        elif len(parts) == 2:  # MM:SS.mmm
            hours, minutes, seconds = "0", parts[0], parts[1]
        else:
            raise ValueError(f"Invalid timestamp format: {timestamp}")

        whole_seconds, _, fraction = seconds.replace(",", ".").partition(".")
        # Normalize the fraction to exactly three digits: "5" -> 500 ms
        milliseconds = int(fraction.ljust(3, "0")[:3]) if fraction else 0

        total_seconds = (
            int(hours) * 3600 +
            int(minutes) * 60 +
            int(whole_seconds) +
            milliseconds / 1000.0
        )

        return total_seconds
|
||||
|
||||
|
||||
# Module-level TTSService instance, constructed when this module is imported
|
||||
tts_service = TTSService()
|
||||
130
backend/app/services/validation.py
Normal file
130
backend/app/services/validation.py
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
from typing import Dict, List, Any
|
||||
|
||||
from ..core.logging import get_logger
|
||||
from ..lib.vtt import VTTEditor
|
||||
from ..services.gcs import gcs_service
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class AssetValidationService:
|
||||
"""Service for validating job assets before completion"""
|
||||
|
||||
@staticmethod
|
||||
async def validate_job_assets(job_doc: Dict[str, Any]) -> tuple[bool, List[str]]:
|
||||
"""
|
||||
Validate all assets for a job before allowing completion
|
||||
Returns (is_valid, list_of_errors)
|
||||
"""
|
||||
errors = []
|
||||
outputs = job_doc.get("outputs", {})
|
||||
requested_outputs = job_doc.get("requested_outputs", {})
|
||||
|
||||
if not outputs:
|
||||
errors.append("No outputs generated for this job")
|
||||
return False, errors
|
||||
|
||||
# Validate each language
|
||||
for language in requested_outputs.get("languages", ["en"]):
|
||||
lang_output = outputs.get(language)
|
||||
if not lang_output:
|
||||
errors.append(f"Missing outputs for language: {language}")
|
||||
continue
|
||||
|
||||
# Validate captions VTT if requested
|
||||
if requested_outputs.get("captions_vtt"):
|
||||
captions_error = await AssetValidationService._validate_vtt_asset(
|
||||
lang_output.get("captions_vtt_gcs"),
|
||||
f"{language} captions VTT"
|
||||
)
|
||||
if captions_error:
|
||||
errors.append(captions_error)
|
||||
|
||||
# Validate audio description VTT if requested
|
||||
if requested_outputs.get("audio_description_vtt"):
|
||||
ad_vtt_error = await AssetValidationService._validate_vtt_asset(
|
||||
lang_output.get("ad_vtt_gcs"),
|
||||
f"{language} audio description VTT"
|
||||
)
|
||||
if ad_vtt_error:
|
||||
errors.append(ad_vtt_error)
|
||||
|
||||
# Validate MP3 if requested
|
||||
if requested_outputs.get("audio_description_mp3"):
|
||||
mp3_error = await AssetValidationService._validate_mp3_asset(
|
||||
lang_output.get("ad_mp3_gcs"),
|
||||
f"{language} audio description MP3"
|
||||
)
|
||||
if mp3_error:
|
||||
errors.append(mp3_error)
|
||||
|
||||
# Check minimum quality requirements
|
||||
ai_confidence = job_doc.get("ai", {}).get("confidence", 0)
|
||||
if ai_confidence < 0.7:
|
||||
errors.append(f"AI confidence too low: {ai_confidence:.1%} (minimum: 70%)")
|
||||
|
||||
return len(errors) == 0, errors
|
||||
|
||||
@staticmethod
|
||||
async def _validate_vtt_asset(gcs_uri: str, asset_name: str) -> str | None:
|
||||
"""Validate a VTT asset exists and is properly formatted"""
|
||||
if not gcs_uri:
|
||||
return f"Missing {asset_name}"
|
||||
|
||||
try:
|
||||
# Download and validate VTT content
|
||||
blob_path = gcs_uri.replace(f"gs://{gcs_service.bucket.name}/", "")
|
||||
blob = gcs_service.bucket.blob(blob_path)
|
||||
|
||||
if not blob.exists():
|
||||
return f"{asset_name} file not found in storage"
|
||||
|
||||
vtt_content = blob.download_as_text()
|
||||
is_valid, vtt_errors = VTTEditor.validate_vtt(vtt_content)
|
||||
|
||||
if not is_valid:
|
||||
return f"{asset_name} validation failed: {'; '.join(vtt_errors[:3])}"
|
||||
|
||||
# Check minimum content requirements
|
||||
cue_count = VTTEditor.get_cue_count(vtt_content)
|
||||
if cue_count == 0:
|
||||
return f"{asset_name} contains no cues"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to validate {asset_name}: {e}")
|
||||
return f"{asset_name} validation error: {str(e)}"
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
async def _validate_mp3_asset(gcs_uri: str, asset_name: str) -> str | None:
|
||||
"""Validate an MP3 asset exists and has reasonable properties"""
|
||||
if not gcs_uri:
|
||||
return f"Missing {asset_name}"
|
||||
|
||||
try:
|
||||
blob_path = gcs_uri.replace(f"gs://{gcs_service.bucket.name}/", "")
|
||||
blob = gcs_service.bucket.blob(blob_path)
|
||||
|
||||
if not blob.exists():
|
||||
return f"{asset_name} file not found in storage"
|
||||
|
||||
# Reload blob to get metadata (including size)
|
||||
blob.reload()
|
||||
|
||||
# Check file size (should be reasonable for audio)
|
||||
size_mb = blob.size / (1024 * 1024) if blob.size else 0
|
||||
if size_mb < 0.01: # Less than 10KB
|
||||
return f"{asset_name} file too small (likely empty)"
|
||||
elif size_mb > 500: # More than 500MB
|
||||
return f"{asset_name} file too large ({size_mb:.1f}MB)"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to validate {asset_name}: {e}")
|
||||
return f"{asset_name} validation error: {str(e)}"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# Module-level AssetValidationService instance, constructed when this module is imported
|
||||
asset_validation_service = AssetValidationService()
|
||||
158
backend/app/tasks/__init__.py
Normal file
158
backend/app/tasks/__init__.py
Normal file
|
|
@ -0,0 +1,158 @@
|
|||
from celery import Celery
|
||||
from celery.signals import task_failure, task_success, task_retry
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Celery application used by the task modules in this package.
# The same Redis URL is used as message broker and as result backend.
celery_app = Celery(
    "accessible-video-tasks",
    backend=settings.redis_url,
    broker=settings.redis_url,
)

celery_app.conf.update(
    # --- Serialization: JSON only ---
    accept_content=["json"],
    task_serializer="json",
    result_serializer="json",

    # --- Time handling ---
    enable_utc=True,
    timezone="UTC",

    # --- Execution limits and delivery behavior ---
    task_track_started=True,
    task_soft_time_limit=25 * 60,  # 25 minutes default
    task_time_limit=30 * 60,  # 30 minutes default
    task_acks_late=True,
    worker_prefetch_multiplier=1,
    worker_max_tasks_per_child=1000,

    # --- Queues and routing ---
    task_default_queue="default",
    task_create_missing_queues=True,
    task_routes={
        "app.tasks.ingest_and_ai.*": {"queue": "ingest"},
        "app.tasks.translate_and_synthesize.*": {"queue": "default"},
        "app.tasks.notify.*": {"queue": "notify"},
        "app.tasks.watchers.*": {"queue": "default"},
    },

    # --- Task-specific timeout overrides ---
    task_annotations={
        # The change-stream watcher has no time limits configured
        "app.tasks.watchers.start_change_stream_watcher": {
            "time_limit": None,
            "soft_time_limit": None,
        },
        "app.tasks.watchers.ensure_watcher_running": {
            "time_limit": 300,  # 5 minutes
            "soft_time_limit": 240,  # 4 minutes
        },
    },
)
|
||||
|
||||
# Add a simple test task for debugging
|
||||
@celery_app.task
def test_task(message="test"):
    """Debug task: log and print ``message``, then return a confirmation string."""
    banner = f"🧪 TEST TASK EXECUTED: {message}"
    logger.info(banner)
    print(banner)
    return f"Test task completed: {message}"
|
||||
|
||||
|
||||
# Add task received handler for debugging
|
||||
from celery.signals import task_received, task_prerun, worker_ready
|
||||
import threading
|
||||
import time
|
||||
|
||||
@worker_ready.connect
def worker_ready_handler(sender=None, **kwargs):
    """Log that the worker is ready and schedule the change-stream watcher.

    The watcher task is scheduled (with a 3 second countdown) only when the
    watchers module was imported and its task is registered with the app.
    Scheduling failures are logged, not raised.
    """
    logger.info(f"🟢 WORKER READY: {sender}")
    print(f"🟢 WORKER READY: {sender} - Worker is online and listening!")

    # Note: the main job progression is handled by immediate triggering in the
    # approve_english endpoint; the watcher only adds redundancy for status
    # change detection.
    watcher_task_name = 'app.tasks.watchers.ensure_watcher_running'
    watcher_registered = _watchers_available and watcher_task_name in celery_app.tasks

    if not watcher_registered:
        logger.info("Watcher not available or not registered, using primary job progression via approve_english endpoint")
        return

    try:
        from .watchers import ensure_watcher_running
        ensure_watcher_running.apply_async(countdown=3)  # Start after 3 seconds
        logger.info("Scheduled MongoDB change stream watcher to start")
    except Exception as e:
        logger.error(f"Failed to schedule change stream watcher: {e}")
|
||||
|
||||
|
||||
@task_received.connect
def task_received_handler(sender=None, task_id=None, task=None, args=None, kwargs=None, retries=None, eta=None, request=None, **kwds):
    """Log when a task is received by the worker.

    Celery dispatches ``task_received`` with a ``request`` keyword argument
    rather than ``task_id``/``task``/``args``, so the task name, id and
    positional args are read from ``request`` whenever they were not passed
    explicitly. The explicit parameters are kept for backward compatibility.
    """
    if request is not None:
        if task is None:
            task = getattr(request, "name", None)
        if task_id is None:
            task_id = getattr(request, "id", None)
        if args is None:
            args = getattr(request, "args", None)

    logger.info(f"🎯 TASK RECEIVED: {task} [{task_id}] with args: {args}")
    print(f"🎯 TASK RECEIVED: {task} [{task_id}] - Worker is picking up the task!")
|
||||
|
||||
@task_prerun.connect
def task_prerun_handler(sender=None, task_id=None, task=None, args=None, kwargs=None, **kwds):
    """Log and print a marker right before a task starts executing."""
    task_label = f"{task} [{task_id}]"
    logger.info(f"🚀 TASK STARTING: {task_label}")
    print(f"🚀 TASK STARTING: {task_label} - About to execute!")
|
||||
|
||||
# Celery signal handlers for centralized logging
|
||||
@task_failure.connect
def task_failure_handler(sender=None, task_id=None, exception=None, traceback=None, einfo=None, **kwargs):
    """Log task failures with exception details and a formatted traceback.

    ``traceback`` may be a traceback object; it is rendered with
    ``traceback.format_tb`` so the log contains the stack frames instead of
    the object's repr. A string is used as-is; anything that cannot be
    formatted falls back to ``str()``.
    """
    # The `traceback` parameter shadows the stdlib module name, hence the alias.
    import traceback as traceback_module

    exception_type = exception.__class__.__name__ if exception else "Unknown"
    exception_msg = str(exception) if exception else "No details"

    if traceback is None or isinstance(traceback, str):
        formatted_traceback = traceback
    else:
        try:
            formatted_traceback = "".join(traceback_module.format_tb(traceback))
        except Exception:
            formatted_traceback = str(traceback)

    # Log comprehensive error details
    error_details = f"""
=== CELERY TASK FAILURE ===
Task: {sender}
Task ID: {task_id}
Exception Type: {exception_type}
Exception Message: {exception_msg}
Full Traceback:
{formatted_traceback}
Additional Info: {einfo}
=============================
"""
    logger.error(error_details)

    # Also log to stdout for immediate visibility
    print(f"🚨 TASK FAILURE: {sender} [{task_id}] - {exception_type}: {exception_msg}")
    if traceback:
        print(f"Full traceback:\n{formatted_traceback}")
|
||||
|
||||
|
||||
@task_success.connect
def task_success_handler(sender=None, result=None, **kwargs):
    """Log a completed task together with at most 100 characters of its result."""
    if result:
        result_str = str(result)[:100]
    else:
        # Any falsy result is reported as "No result"
        result_str = "No result"

    logger.info(f"Celery task completed: {sender} - Result: {result_str}")
|
||||
|
||||
|
||||
@task_retry.connect
def task_retry_handler(sender=None, task_id=None, reason=None, einfo=None, request=None, **kwargs):
    """Log task retries.

    Celery dispatches ``task_retry`` with ``request``, ``reason`` and
    ``einfo``; there is no ``task_id`` argument, so the id is read from
    ``request`` when it was not passed explicitly.
    """
    if task_id is None and request is not None:
        task_id = getattr(request, "id", None)

    reason_str = str(reason) if reason else "No reason provided"
    logger.warning(f"Celery task retry: {sender} [{task_id}] - Reason: {reason_str}")
|
||||
|
||||
|
||||
def import_task_modules():
    """Import the task modules so their tasks get registered with Celery.

    Failures while importing the core modules are logged and otherwise
    ignored. The return value reports only whether the optional ``watchers``
    module was imported successfully.
    """
    try:
        from . import ingest_and_ai, translate_and_synthesize, notify  # noqa: E402, F401
        logger.info("Successfully imported core task modules")
    except Exception as e:
        logger.error(f"Error importing core task modules: {e}")

    # The watchers module is optional: import problems are tolerated
    watchers_loaded = False
    try:
        from . import watchers  # noqa: E402, F401
        logger.info("Successfully imported watchers module")
        watchers_loaded = True
    except ImportError as e:
        logger.warning(f"Could not import watchers module: {e}")
    except Exception as e:
        logger.error(f"Error importing watchers module: {e}")

    return watchers_loaded
|
||||
|
||||
# Import task modules when this package is imported; the result records
# whether the watchers module loaded and is read by worker_ready_handler.
|
||||
_watchers_available = import_task_modules()
|
||||
BIN
backend/app/tasks/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
backend/app/tasks/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/tasks/__pycache__/ingest_and_ai.cpython-313.pyc
Normal file
BIN
backend/app/tasks/__pycache__/ingest_and_ai.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/tasks/__pycache__/notify.cpython-313.pyc
Normal file
BIN
backend/app/tasks/__pycache__/notify.cpython-313.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
backend/app/tasks/__pycache__/watchers.cpython-313.pyc
Normal file
BIN
backend/app/tasks/__pycache__/watchers.cpython-313.pyc
Normal file
Binary file not shown.
213
backend/app/tasks/ingest_and_ai.py
Normal file
213
backend/app/tasks/ingest_and_ai.py
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
import asyncio
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
|
||||
import ffmpeg
|
||||
from celery import Task
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
from ..models.job import JobStatus
|
||||
from ..services.gcs import gcs_service, upload_vtt_to_gcs
|
||||
from ..services.gemini import gemini_service
|
||||
from . import celery_app
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class AsyncTask(Task):
    """Celery task base that runs the coroutine ``run_async`` on a new event loop.

    Each call creates a fresh event loop, installs it as the current loop,
    runs ``run_async`` to completion and closes the loop afterwards.
    Subclasses must override ``run_async``.
    """

    def __call__(self, *args, **kwargs):
        event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(event_loop)
        try:
            pending = self.run_async(*args, **kwargs)
            return event_loop.run_until_complete(pending)
        finally:
            event_loop.close()

    async def run_async(self, *args, **kwargs):
        """Coroutine holding the task's work; must be provided by subclasses."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class IngestAndAITask(AsyncTask):
    """AsyncTask whose coroutine delegates to ``ingest_and_ai_task_impl``."""

    async def run_async(self, job_id: str):
        outcome = await ingest_and_ai_task_impl(job_id)
        return outcome
|
||||
|
||||
|
||||
@celery_app.task(bind=True, base=IngestAndAITask)
def ingest_and_ai_task(self, job_id: str):
    """
    Pipeline 1: Ingestion & AI Processing

    Registers the task with Celery; the work itself lives in
    IngestAndAITask.run_async, which delegates to ingest_and_ai_task_impl.
    """
    # Intentionally empty. AsyncTask.__call__ in this module invokes
    # run_async, so this body is presumably never executed by the worker.
    return None
|
||||
|
||||
|
||||
async def _record_job_status(db, job_id: str, status, extra_fields: dict | None = None) -> None:
    """Set the job's status and append a matching entry to ``review.history``.

    ``status`` is a JobStatus member; ``extra_fields`` are additional
    ``$set`` fields written in the same update.
    """
    now = datetime.utcnow()
    fields = {"status": status.value, "updated_at": now}
    if extra_fields:
        fields.update(extra_fields)

    await db.jobs.update_one(
        {"_id": job_id},
        {
            "$set": fields,
            "$push": {
                "review.history": {
                    "at": now,
                    "status": status.value,
                    "by": "system"
                }
            }
        }
    )


async def ingest_and_ai_task_impl(job_id: str):
    """
    Pipeline 1: Ingestion & AI Processing
    1. Update status to 'ingesting'
    2. Download the source video from GCS to a temporary file
    3. Update status to AI processing and probe the video duration
    4. Process with Gemini and upload the resulting VTT files
    5. Update status to 'pending_qc'

    On any failure the error is recorded on the job document and the
    exception is re-raised. The temporary file is removed and the MongoDB
    client closed in every case.

    Raises:
        ValueError: if the job does not exist or the AI result has no
            ``captions_vtt``.
    """
    logger.info(f"Starting ingestion and AI processing for job {job_id}")

    # Connect to MongoDB
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]

    try:
        await _record_job_status(db, job_id, JobStatus.INGESTING)

        # Get job details
        job_doc = await db.jobs.find_one({"_id": job_id})
        if not job_doc:
            raise ValueError(f"Job {job_id} not found")

        source_blob_path = job_doc["source"]["gcs_uri"].replace(f"gs://{settings.gcs_bucket}/", "")

        # Reserve a temp path; delete=False keeps the file after the handle
        # is closed so it can be filled and read by path.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
            temp_path = temp_file.name

        try:
            # Download inside the try block so the temp file is removed even
            # when the download itself fails.
            blob = gcs_service.bucket.blob(source_blob_path)
            blob.download_to_filename(temp_path)

            await _record_job_status(db, job_id, JobStatus.AI_PROCESSING)

            # Probe video for metadata and store the duration on the source
            duration = await _get_video_duration(temp_path)
            await db.jobs.update_one(
                {"_id": job_id},
                {"$set": {"source.duration_s": duration}}
            )

            # Process with Gemini
            ai_result = await gemini_service.extract_accessibility(temp_path)

            # Final safety check for required fields
            required_fields = ["captions_vtt", "audio_description_vtt"]
            missing_fields = [field for field in required_fields if field not in ai_result]

            if missing_fields:
                logger.error(f"Missing required fields after AI processing: {missing_fields}")
                # Create fallback content for a missing audio description
                if "audio_description_vtt" in missing_fields:
                    ai_result["audio_description_vtt"] = "WEBVTT\n\n00:00:00.000 --> 00:00:05.000\nVideo content with visual elements."
                    logger.info("Created fallback audio_description_vtt")
                # There is no fallback for captions: fail with a clear message
                # instead of a bare KeyError at upload time.
                if "captions_vtt" in missing_fields:
                    raise ValueError("AI result is missing required field: captions_vtt")

            # Upload VTT files to GCS
            captions_gcs_uri = await upload_vtt_to_gcs(
                ai_result["captions_vtt"],
                f"{job_id}/en/captions.vtt"
            )
            ad_gcs_uri = await upload_vtt_to_gcs(
                ai_result["audio_description_vtt"],
                f"{job_id}/en/ad.vtt"
            )

            # Update job with AI results and outputs
            await _record_job_status(
                db,
                job_id,
                JobStatus.PENDING_QC,
                {
                    "ai.ingestion_json": ai_result,
                    "ai.confidence": ai_result["confidence"],
                    "outputs.en": {
                        "captions_vtt_gcs": captions_gcs_uri,
                        "ad_vtt_gcs": ad_gcs_uri
                    },
                },
            )

            logger.info(f"Successfully completed ingestion and AI processing for job {job_id}")

        finally:
            # Clean up temp file
            os.unlink(temp_path)

    except Exception as e:
        logger.error(f"Ingestion and AI processing failed for job {job_id}: {e}")

        # Update job with error
        await db.jobs.update_one(
            {"_id": job_id},
            {
                "$set": {
                    "error": {
                        "type": "ingestion_failure",
                        "message": str(e),
                        "timestamp": datetime.utcnow().isoformat()
                    },
                    "updated_at": datetime.utcnow()
                }
            }
        )

        raise

    finally:
        client.close()
|
||||
|
||||
|
||||
async def _get_video_duration(video_path: str) -> float:
    """Get the video duration in seconds using ffprobe.

    The first stream that reports a usable ``duration`` wins (stream 0 is
    checked first); if no stream reports one, the container-level
    ``format.duration`` is used.

    Returns:
        The duration in seconds, or 0.0 when probing fails or no duration is
        reported (a warning is logged in that case).
    """
    try:
        probe = ffmpeg.probe(video_path)

        candidates = [stream.get('duration') for stream in probe.get('streams', [])]
        candidates.append(probe.get('format', {}).get('duration'))

        for raw_duration in candidates:
            if raw_duration is None:
                continue
            try:
                return float(raw_duration)
            except (TypeError, ValueError):
                # Non-numeric value reported for this entry; try the next one
                continue

        raise ValueError("ffprobe reported no duration")
    except Exception as e:
        logger.warning(f"Could not determine video duration: {e}")
        return 0.0
|
||||
142
backend/app/tasks/notify.py
Normal file
142
backend/app/tasks/notify.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
import asyncio
|
||||
from datetime import datetime
|
||||
|
||||
from celery import Task
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
from ..models.audit_log import AuditLogCreate
|
||||
from ..services.emailer import email_service
|
||||
from ..services.gcs import get_signed_download_url
|
||||
from . import celery_app
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class AsyncTask(Task):
    """Celery task base that runs the coroutine ``run_async`` on a new event loop.

    Each call creates a fresh event loop, installs it as the current loop,
    runs ``run_async`` to completion and closes the loop afterwards.
    Subclasses must override ``run_async``.
    """

    def __call__(self, *args, **kwargs):
        event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(event_loop)
        try:
            pending = self.run_async(*args, **kwargs)
            return event_loop.run_until_complete(pending)
        finally:
            event_loop.close()

    async def run_async(self, *args, **kwargs):
        """Coroutine holding the task's work; must be provided by subclasses."""
        raise NotImplementedError
|
||||
|
||||
|
||||
@celery_app.task(bind=True, base=AsyncTask)
async def notify_client_task(self, job_id: str):
    """Pipeline 3: e-mail the client download links for a completed job.

    Looks up the job and its client, builds 24-hour signed URLs for every
    per-language output that is present, sends the completion e-mail and
    writes an audit-log entry. Jobs that are not in ``completed`` status are
    skipped. On any failure the error is stored on the job document and the
    exception is re-raised.
    """
    logger.info(f"Starting client notification for job {job_id}")

    # (key on the job's language output, key in the e-mail payload, log label)
    artifacts = (
        ("captions_vtt_gcs", "captions_vtt", "captions"),
        ("ad_vtt_gcs", "audio_description_vtt", "AD VTT"),
        ("ad_mp3_gcs", "audio_description_mp3", "AD MP3"),
    )

    mongo = AsyncIOMotorClient(settings.mongodb_uri)
    db = mongo[settings.mongodb_db]

    try:
        job_doc = await db.jobs.find_one({"_id": job_id})
        if not job_doc:
            raise ValueError(f"Job {job_id} not found")

        if job_doc["status"] != "completed":
            logger.warning(f"Job {job_id} not in completed status, skipping notification")
            return

        client_doc = await db.users.find_one({"_id": job_doc["client_id"]})
        if not client_doc:
            raise ValueError(f"Client {job_doc['client_id']} not found")

        # language -> {payload key -> signed URL}; languages with no links are omitted
        download_links = {}
        for language, lang_output in job_doc.get("outputs", {}).items():
            if not isinstance(lang_output, dict):
                continue

            lang_downloads = {}
            for gcs_key, link_key, label in artifacts:
                if gcs_key not in lang_output:
                    continue
                # Stored values are gs:// URIs; signing works on the object path.
                blob_path = lang_output[gcs_key].replace(f"gs://{settings.gcs_bucket}/", "")
                try:
                    lang_downloads[link_key] = await get_signed_download_url(blob_path, 24)
                except Exception as e:
                    # A single unsignable artifact should not block the e-mail.
                    logger.warning(f"Failed to generate signed URL for {label} {language}: {e}")

            if lang_downloads:
                download_links[language] = lang_downloads

        sent = await email_service.send_completion_email(
            recipient_email=client_doc["email"],
            job_title=job_doc["title"],
            download_links=download_links
        )
        if not sent:
            raise ValueError("Failed to send completion email")

        link_total = sum(len(files) for files in download_links.values())
        audit_log = AuditLogCreate(
            job_id=job_id,
            action="client_notified",
            details={
                "email": client_doc["email"],
                "download_count": link_total
            }
        )
        await db.audit_logs.insert_one(audit_log.dict())

        logger.info(f"Successfully notified client for job {job_id}")

    except Exception as e:
        logger.error(f"Client notification failed for job {job_id}: {e}")

        # Persist the failure on the job before propagating it.
        failure = {
            "type": "notification_failure",
            "message": str(e),
            "timestamp": datetime.utcnow().isoformat()
        }
        await db.jobs.update_one(
            {"_id": job_id},
            {"$set": {"error": failure, "updated_at": datetime.utcnow()}}
        )

        raise

    finally:
        mongo.close()
|
||||
317
backend/app/tasks/translate_and_synthesize.py
Normal file
317
backend/app/tasks/translate_and_synthesize.py
Normal file
|
|
@ -0,0 +1,317 @@
|
|||
import asyncio
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
import time
|
||||
import random
|
||||
|
||||
from celery import Task
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
from ..models.job import JobStatus
|
||||
from ..services.gcs import gcs_service, upload_vtt_to_gcs
|
||||
from ..services.gemini import gemini_service
|
||||
from ..services.translate import translate_service
|
||||
from ..services.tts import tts_service
|
||||
from . import celery_app
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
async def retry_with_backoff(func, max_retries=3, base_delay=1):
    """Await ``func()`` with exponential backoff between failed attempts.

    Args:
        func: Zero-argument callable returning an awaitable.
        max_retries: Total number of attempts (not additional retries);
            must be at least 1.
        base_delay: Base delay in seconds. The wait before attempt ``k + 1``
            is ``base_delay * 2**k`` plus up to one second of random jitter.

    Returns:
        Whatever the first successful ``func()`` call returns.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: The exception of the final attempt, once all attempts fail.
    """
    if max_retries < 1:
        # Previously this fell through to ``raise None`` (an opaque TypeError).
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            return await func()
        except Exception as e:
            if attempt == max_retries - 1:
                # Out of attempts: propagate with the original traceback.
                raise

            # Exponential backoff with jitter
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            logger.warning(f"Attempt {attempt + 1} failed, retrying in {delay:.2f}s: {e}")
            await asyncio.sleep(delay)
|
||||
|
||||
|
||||
@celery_app.task(bind=True)
def translate_and_synthesize_task(self, job_id: str):
    """Pipeline 2 entry point: translation and MP3 generation for one job.

    Synchronous Celery wrapper that runs ``_async_translate_and_synthesize``
    via ``asyncio.run`` and returns its result. Any exception is logged in
    detail (message, type, args, traceback) and re-raised.
    """
    logger.info(f"🚀 CELERY TASK STARTED: translate_and_synthesize_task for job {job_id}")

    try:
        logger.info(f"📝 About to call asyncio.run for job {job_id}")
        outcome = asyncio.run(_async_translate_and_synthesize(job_id))
        logger.info(f"✅ CELERY TASK COMPLETED successfully for job {job_id}")
        return outcome
    except Exception as exc:
        import traceback

        logger.error(f"❌ CELERY TASK FAILED for job {job_id}: {exc!s}")
        logger.error(f"❌ Exception type: {type(exc).__name__}")
        logger.error(f"❌ Exception args: {exc.args}")
        logger.error(f"❌ Full traceback: {traceback.format_exc()}")
        raise
|
||||
|
||||
|
||||
async def _async_translate_and_synthesize(job_id: str):
    """Translate a job's English VTT outputs and optionally synthesize audio.

    Steps, in order:
      1. Load the job; return early unless its status is ``approved_english``.
      2. Set the status to ``translating`` and append to ``review.history``.
      3. Download the English captions and audio-description VTT from GCS.
      4. For every requested non-English language, transcreate (if listed
         under ``requested_outputs.transcreation``) or translate, upload the
         results, and record them; a per-language failure is recorded as
         ``qa_notes`` and does not abort the job.
      5. Set the status to ``tts_generating`` and store the outputs.
      6. If ``requested_outputs.audio_description_mp3`` is truthy, generate TTS.
      7. Set the status to ``pending_final_review``.

    Raises:
        ValueError: If the job document does not exist.
        Exception: Any other failure, after the error has been written to the
            job document.
    """
    logger.info(f"🔄 ASYNC FUNCTION STARTED: _async_translate_and_synthesize for job {job_id}")

    # Connect to MongoDB
    logger.info(f"📡 Connecting to MongoDB for job {job_id}")
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]
    logger.info(f"📡 MongoDB connection established for job {job_id}")

    try:
        # Get job details
        logger.info(f"🔍 Looking up job document for job {job_id}")
        job_doc = await db.jobs.find_one({"_id": job_id})
        if not job_doc:
            logger.error(f"❌ Job {job_id} not found in database!")
            raise ValueError(f"Job {job_id} not found")

        logger.info(f"✅ Found job document for {job_id}, status: {job_doc.get('status', 'UNKNOWN')}")

        # NOTE(review): this check and the status update below are two separate
        # operations, so two concurrent runs for the same job could both pass it.
        if job_doc["status"] != JobStatus.APPROVED_ENGLISH.value:
            logger.warning(f"⚠️ Job {job_id} not in approved_english status (current: {job_doc['status']}), skipping")
            return

        logger.info(f"✅ Job {job_id} is in correct status, proceeding with translation")

        # Update status to translating
        await db.jobs.update_one(
            {"_id": job_id},
            {
                "$set": {
                    "status": JobStatus.TRANSLATING.value,
                    "updated_at": datetime.utcnow()
                },
                "$push": {
                    "review.history": {
                        "at": datetime.utcnow(),
                        "status": JobStatus.TRANSLATING.value,
                        "by": "system"
                    }
                }
            }
        )

        # Get English VTT content
        en_outputs = job_doc["outputs"]["en"]

        # Download English VTT files (stored values are gs:// URIs; strip the
        # bucket prefix to get the object path)
        captions_blob_path = en_outputs["captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
        ad_blob_path = en_outputs["ad_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")

        captions_blob = gcs_service.bucket.blob(captions_blob_path)
        ad_blob = gcs_service.bucket.blob(ad_blob_path)

        # NOTE(review): these downloads are not awaited, so they presumably
        # block the event loop while running — confirm this is acceptable.
        en_captions_vtt = captions_blob.download_as_text()
        en_ad_vtt = ad_blob.download_as_text()

        # Process each requested language
        requested_languages = job_doc["requested_outputs"]["languages"]
        transcreation_languages = job_doc["requested_outputs"]["transcreation"]

        # Same dict object as job_doc["outputs"] when that key exists, so the
        # existing English entry is carried along and new languages are added.
        updated_outputs = job_doc.get("outputs", {})

        for language in requested_languages:
            if language == "en":
                continue  # Skip English as it's already processed

            logger.info(f"Processing language: {language}")

            try:
                if language in transcreation_languages:
                    # Use transcreation for cultural adaptation with retry
                    async def transcreate():
                        return await gemini_service.transcreate_content(
                            en_captions_vtt,
                            en_ad_vtt,
                            language,
                            brief="Standard accessibility content"
                        )

                    result = await retry_with_backoff(transcreate, max_retries=3)
                    translated_captions = result["captions_vtt"]
                    translated_ad = result["audio_description_vtt"]
                    origin = "transcreate"

                else:
                    # Use standard translation with retry
                    async def translate_captions():
                        return await translate_service.translate_vtt(en_captions_vtt, language)

                    async def translate_ad():
                        return await translate_service.translate_vtt(en_ad_vtt, language)

                    translated_captions = await retry_with_backoff(translate_captions, max_retries=3)
                    translated_ad = await retry_with_backoff(translate_ad, max_retries=3)
                    origin = "translate"

                # Upload translated VTT files
                captions_gcs_uri = await upload_vtt_to_gcs(
                    translated_captions,
                    f"{job_id}/{language}/captions.vtt"
                )

                ad_gcs_uri = await upload_vtt_to_gcs(
                    translated_ad,
                    f"{job_id}/{language}/ad.vtt"
                )

                # Store language outputs
                updated_outputs[language] = {
                    "captions_vtt_gcs": captions_gcs_uri,
                    "ad_vtt_gcs": ad_gcs_uri,
                    "origin": origin
                }

                logger.info(f"Successfully processed VTT files for language: {language}")

            except Exception as e:
                # Best effort per language: record the failure (no *_gcs keys,
                # so the TTS step skips this language) and keep going.
                logger.error(f"Failed to process language {language}: {e}")
                updated_outputs[language] = {
                    "origin": "translate" if language not in transcreation_languages else "transcreate",
                    "qa_notes": f"Translation failed: {str(e)}"
                }

        # Update status to TTS generating (this also replaces the whole
        # "outputs" field with updated_outputs)
        await db.jobs.update_one(
            {"_id": job_id},
            {
                "$set": {
                    "status": JobStatus.TTS_GENERATING.value,
                    "outputs": updated_outputs,
                    "updated_at": datetime.utcnow()
                },
                "$push": {
                    "review.history": {
                        "at": datetime.utcnow(),
                        "status": JobStatus.TTS_GENERATING.value,
                        "by": "system"
                    }
                }
            }
        )

        # Generate TTS for languages that need MP3
        if job_doc["requested_outputs"]["audio_description_mp3"]:
            await _generate_tts_for_languages(job_id, updated_outputs, db)

        # Update final status
        await db.jobs.update_one(
            {"_id": job_id},
            {
                "$set": {
                    "status": JobStatus.PENDING_FINAL_REVIEW.value,
                    "updated_at": datetime.utcnow()
                },
                "$push": {
                    "review.history": {
                        "at": datetime.utcnow(),
                        "status": JobStatus.PENDING_FINAL_REVIEW.value,
                        "by": "system"
                    }
                }
            }
        )

        logger.info(f"Successfully completed translation and synthesis for job {job_id}")

    except Exception as e:
        logger.error(f"Translation and synthesis failed for job {job_id}: {e}")

        # Update job with error. Only "error" and "updated_at" are written;
        # the job's status is left at whatever it was when the failure occurred.
        await db.jobs.update_one(
            {"_id": job_id},
            {
                "$set": {
                    "error": {
                        "type": "translation_failure",
                        "message": str(e),
                        "timestamp": datetime.utcnow().isoformat()
                    },
                    "updated_at": datetime.utcnow()
                }
            }
        )

        raise

    finally:
        client.close()
|
||||
|
||||
|
||||
async def _generate_tts_for_languages(job_id: str, outputs: dict[str, Any], db):
    """Run TTS generation for English first, then every other eligible language.

    English is processed whenever an ``"en"`` entry exists; any other language
    is processed only if its entry contains an ``"ad_vtt_gcs"`` key. Languages
    are handled one after another, not concurrently.
    """
    # Always generate English MP3
    if "en" in outputs:
        await _generate_language_tts(job_id, "en", outputs["en"], db)

    # Remaining languages, skipping those without an audio-description VTT
    for language, lang_output in outputs.items():
        if language == "en":
            continue
        if "ad_vtt_gcs" not in lang_output:
            continue
        await _generate_language_tts(job_id, language, lang_output, db)
|
||||
|
||||
|
||||
async def _generate_language_tts(job_id: str, language: str, lang_output: dict, db):
    """Synthesize one language's audio description and store it as MP3.

    Downloads the language's AD VTT from GCS, synthesizes it (with retries),
    uploads the MP3 to ``{job_id}/{language}/ad.mp3`` and records its gs://
    URI under ``outputs.<language>.ad_mp3_gcs``. Any failure is logged and
    written to ``outputs.<language>.qa_notes``; it is not re-raised.
    """
    try:
        # Fetch the audio-description VTT text
        vtt_path = lang_output["ad_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
        ad_vtt_content = gcs_service.bucket.blob(vtt_path).download_as_text()

        # English maps to en-US; every other language reuses its own code,
        # upper-cased, as the region.
        # NOTE(review): that heuristic yields e.g. "ja-JA" for "ja" — confirm
        # tts_service accepts or normalizes such codes.
        region = "US" if language == "en" else language.upper()
        language_code = f"{language}-{region}"

        mp3_data = await retry_with_backoff(
            lambda: tts_service.synthesize_audio_description(ad_vtt_content, language_code),
            max_retries=3,
        )

        # Upload MP3 to GCS
        mp3_blob_path = f"{job_id}/{language}/ad.mp3"
        mp3_blob = gcs_service.bucket.blob(mp3_blob_path)
        mp3_blob.content_type = "audio/mpeg"
        mp3_blob.upload_from_string(mp3_data, content_type="audio/mpeg")

        # Record the new artifact on the job
        changes = {
            f"outputs.{language}.ad_mp3_gcs": f"gs://{settings.gcs_bucket}/{mp3_blob_path}",
            "updated_at": datetime.utcnow()
        }
        await db.jobs.update_one({"_id": job_id}, {"$set": changes})

        logger.info(f"Successfully generated TTS for {language}")

    except Exception as e:
        logger.error(f"TTS generation failed for {language}: {e}")

        # Leave a QA note instead of failing the whole job
        failure_note = {
            f"outputs.{language}.qa_notes": f"TTS generation failed: {str(e)}",
            "updated_at": datetime.utcnow()
        }
        await db.jobs.update_one({"_id": job_id}, {"$set": failure_note})
|
||||
136
backend/app/tasks/watchers.py
Normal file
136
backend/app/tasks/watchers.py
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
import asyncio
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
from ..models.job import JobStatus
|
||||
from . import celery_app
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@celery_app.task(
    bind=True,
    autoretry_for=(Exception,),  # Auto-retry on any exception
    retry_kwargs={'max_retries': None, 'countdown': 60},  # Infinite retries with 60s delay
    retry_backoff=True,  # Exponential backoff
    acks_late=True,  # Acknowledge task only after completion
    reject_on_worker_lost=True,  # Retry if worker crashes
)
def start_change_stream_watcher(self):
    """Run the MongoDB change-stream watcher for job status changes.

    Blocks inside ``asyncio.run(_watch_job_changes())``. Any exception that
    escapes is logged and re-raised so the task's auto-retry settings apply.
    """
    try:
        asyncio.run(_watch_job_changes())
    except Exception as exc:
        logger.error(f"Change stream watcher failed: {exc}")
        # Propagate so Celery's autoretry configuration takes over
        raise
|
||||
|
||||
|
||||
async def _watch_job_changes():
    """Watch the ``jobs`` collection and enqueue follow-up tasks on status changes.

    Opens a change stream filtered to update events that *set* ``status`` to
    ``approved_english`` or ``completed`` and dispatches:

    * ``approved_english`` -> ``translate_and_synthesize_task``
    * ``completed``        -> ``notify_client_task``

    The filter requires ``status`` to be among the fields modified by the
    update. Without that, any unrelated write to a job that is already in a
    watched status (for example the error record written when a notification
    fails) would enqueue the same task again.

    Errors while handling a single event are logged and skipped. Errors from
    the stream itself are logged and suppressed (the function then returns),
    so they never reach the calling task.
    """
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]

    logger.info("Starting MongoDB change stream watcher")

    try:
        # Verify connectivity up front so a bad connection fails fast
        await client.admin.command('ping')
        logger.info("MongoDB connection verified")

        watched_statuses = [
            JobStatus.APPROVED_ENGLISH.value,
            JobStatus.COMPLETED.value
        ]

        # Watch for changes to the jobs collection
        pipeline = [
            {
                "$match": {
                    "operationType": "update",
                    # Only updates that actually wrote the status field ...
                    "updateDescription.updatedFields.status": {"$in": watched_statuses},
                    # ... and whose looked-up document is still in that state.
                    "fullDocument.status": {"$in": watched_statuses}
                }
            }
        ]

        async with db.jobs.watch(
            pipeline,
            full_document="updateLookup",
            max_await_time_ms=30000,  # 30 second timeout for getMore operations
            batch_size=10  # Process changes in small batches
        ) as stream:
            logger.info("Change stream watcher active, waiting for job status changes...")

            async for change in stream:
                try:
                    job_doc = change["fullDocument"]
                    if not job_doc:
                        logger.warning("Received change event without fullDocument")
                        continue

                    job_id = str(job_doc["_id"])
                    status = job_doc["status"]

                    logger.info(f"Job {job_id} status changed to {status}")

                    if status == JobStatus.APPROVED_ENGLISH.value:
                        # Trigger translation and synthesis (imported lazily)
                        from .translate_and_synthesize import translate_and_synthesize_task
                        translate_and_synthesize_task.delay(job_id)
                        logger.info(f"Enqueued translation task for job {job_id}")

                    elif status == JobStatus.COMPLETED.value:
                        # Trigger client notification (imported lazily)
                        from .notify import notify_client_task
                        notify_client_task.delay(job_id)
                        logger.info(f"Enqueued notification task for job {job_id}")

                except Exception as e:
                    logger.error(f"Error processing change stream event: {e}")
                    # Continue processing other events
                    continue

    except Exception as e:
        error_msg = str(e)
        if "replica sets" in error_msg:
            logger.warning("Change stream watcher not available - MongoDB not configured as replica set")
            logger.info("This is normal in development. Job progression works via immediate triggering in approval endpoint.")
        else:
            logger.error(f"Change stream watcher failed: {e}")
        # Don't re-raise in development to prevent worker crashes
        # NOTE(review): because nothing is re-raised here, the calling task's
        # auto-retry never fires for stream failures — confirm this is also
        # the intended behavior outside development.

    finally:
        client.close()
|
||||
|
||||
|
||||
# Auto-start the watcher when the worker starts
|
||||
@celery_app.task(
    bind=True,
    autoretry_for=(Exception,),
    retry_kwargs={'max_retries': 3, 'countdown': 30}
)
def ensure_watcher_running(self):
    """Start the change-stream watcher unless one is already executing.

    Inspects the active tasks reported by the workers. If a
    ``start_change_stream_watcher`` task is found, nothing is started. If
    inspection returns nothing, the watcher is started anyway. Any exception
    is logged and re-raised so the task's retry settings apply.
    """
    try:
        watcher_name = "app.tasks.watchers.start_change_stream_watcher"
        active_by_worker = celery_app.control.inspect().active()

        if active_by_worker:
            for worker, tasks in active_by_worker.items():
                # ``tasks`` may be None or empty for a worker
                if any(task.get("name") == watcher_name for task in tasks or ()):
                    logger.info(f"Change stream watcher already running on worker {worker}")
                    return
        else:
            logger.warning("Could not inspect active tasks - starting watcher anyway")

        # No running watcher found: enqueue one
        result = start_change_stream_watcher.delay()
        logger.info(f"Started change stream watcher with task ID: {result.id}")

    except Exception as exc:
        logger.error(f"Failed to ensure watcher is running: {exc}")
        raise  # Will trigger retry
|
||||
33
backend/app/telemetry/__init__.py
Normal file
33
backend/app/telemetry/__init__.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
"""Telemetry package for OpenTelemetry tracing and metrics collection"""
|
||||
|
||||
from .metrics import app_metrics, time_ai_request, time_job_processing, time_storage_operation, time_celery_task
|
||||
from .tracing import (
|
||||
get_tracer,
|
||||
instrument_dependencies,
|
||||
instrument_fastapi_app,
|
||||
setup_tracing,
|
||||
trace_ai_operation,
|
||||
trace_job_pipeline,
|
||||
trace_storage_operation,
|
||||
TracingContext,
|
||||
trace_api_request,
|
||||
trace_celery_task,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"app_metrics",
|
||||
"time_ai_request",
|
||||
"time_job_processing",
|
||||
"time_storage_operation",
|
||||
"time_celery_task",
|
||||
"get_tracer",
|
||||
"instrument_dependencies",
|
||||
"instrument_fastapi_app",
|
||||
"setup_tracing",
|
||||
"trace_ai_operation",
|
||||
"trace_job_pipeline",
|
||||
"trace_storage_operation",
|
||||
"TracingContext",
|
||||
"trace_api_request",
|
||||
"trace_celery_task",
|
||||
]
|
||||
BIN
backend/app/telemetry/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
backend/app/telemetry/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/telemetry/__pycache__/metrics.cpython-313.pyc
Normal file
BIN
backend/app/telemetry/__pycache__/metrics.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/telemetry/__pycache__/tracing.cpython-313.pyc
Normal file
BIN
backend/app/telemetry/__pycache__/tracing.cpython-313.pyc
Normal file
Binary file not shown.
359
backend/app/telemetry/metrics.py
Normal file
359
backend/app/telemetry/metrics.py
Normal file
|
|
@ -0,0 +1,359 @@
|
|||
import time
|
||||
from typing import Optional
|
||||
|
||||
from opentelemetry import metrics
|
||||
# from opentelemetry.exporter.prometheus import PrometheusMetricReader # Disabled for local dev
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
from prometheus_client import start_http_server
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ApplicationMetrics:
    """Central metrics collection for the accessible video platform.

    Creates one OpenTelemetry meter and a fixed set of instruments (counters,
    up-down counters and histograms), and exposes small ``record_*`` helpers
    that attach a consistent attribute set to each measurement.
    """

    def __init__(self):
        """Set up the meter provider, then create every instrument."""
        self.setup_metrics()

        # Job processing metrics
        self.job_total_counter = self.meter.create_counter(
            name="jobs_total",
            description="Total number of jobs created",
            unit="1"
        )

        # Despite the attribute name this is an up-down counter, adjusted by
        # +1/-1 on every status transition.
        self.job_status_gauge = self.meter.create_up_down_counter(
            name="jobs_by_status",
            description="Current number of jobs by status",
            unit="1"
        )

        self.job_processing_duration = self.meter.create_histogram(
            name="job_processing_duration_seconds",
            description="Time taken to process jobs through each stage",
            unit="s"
        )

        # AI service metrics
        self.ai_requests_counter = self.meter.create_counter(
            name="ai_requests_total",
            description="Total AI service requests",
            unit="1"
        )

        self.ai_request_duration = self.meter.create_histogram(
            name="ai_request_duration_seconds",
            description="Duration of AI service requests",
            unit="s"
        )

        self.ai_confidence_histogram = self.meter.create_histogram(
            name="ai_confidence_score",
            description="AI confidence scores distribution",
            unit="1"
        )

        # Storage metrics
        self.storage_operations_counter = self.meter.create_counter(
            name="storage_operations_total",
            description="Total storage operations",
            unit="1"
        )

        self.storage_operation_duration = self.meter.create_histogram(
            name="storage_operation_duration_seconds",
            description="Duration of storage operations",
            unit="s"
        )

        # Queue metrics
        self.queue_tasks_counter = self.meter.create_counter(
            name="celery_tasks_total",
            description="Total Celery tasks processed",
            unit="1"
        )

        self.queue_task_duration = self.meter.create_histogram(
            name="celery_task_duration_seconds",
            description="Duration of Celery task execution",
            unit="s"
        )

        # User activity metrics
        self.auth_attempts_counter = self.meter.create_counter(
            name="auth_attempts_total",
            description="Total authentication attempts",
            unit="1"
        )

        self.active_users_gauge = self.meter.create_up_down_counter(
            name="active_users",
            description="Number of currently active users",
            unit="1"
        )

        # Rate limiting metrics
        self.rate_limit_counter = self.meter.create_counter(
            name="rate_limit_checks_total",
            description="Total rate limit checks performed",
            unit="1"
        )

        # Request validation metrics
        self.validation_counter = self.meter.create_counter(
            name="request_validation_total",
            description="Total request validations performed",
            unit="1"
        )

        self.validation_duration = self.meter.create_histogram(
            name="request_validation_duration_seconds",
            description="Duration of request validation",
            unit="s"
        )

    def setup_metrics(self):
        """Install a global ``MeterProvider`` and store the service meter on ``self.meter``."""
        resource = Resource.create({
            "service.name": "accessible-video-api",
            "service.version": "1.0.0",
            "deployment.environment": settings.app_env,
        })

        # Set up Prometheus metrics reader (disabled for local dev)
        # prometheus_reader = PrometheusMetricReader()

        # Create metrics provider
        provider = MeterProvider(
            resource=resource,
            # metric_readers=[prometheus_reader]  # Disabled for local dev
        )
        metrics.set_meter_provider(provider)

        # Get meter for this service
        self.meter = metrics.get_meter("accessible-video-api")

        # No metric reader is attached above, so do not claim an exporter.
        logger.info("Metrics provider initialized (Prometheus metric reader disabled)")

    def start_prometheus_server(self, port: int = 8001):
        """Start the ``prometheus_client`` HTTP server on ``port``.

        Failures are logged and swallowed so that metrics can never prevent
        the application from starting.
        NOTE(review): with the Prometheus reader commented out in
        ``setup_metrics``, the instruments of this class are presumably not
        exposed through this endpoint — confirm.
        """
        try:
            start_http_server(port)
            logger.info(f"Prometheus metrics server started on port {port}")
        except Exception as e:
            logger.error(f"Failed to start Prometheus server: {e}")

    # Job metrics methods
    def record_job_created(self, client_id: str, language: str):
        """Count one newly created job, tagged with client and source language."""
        self.job_total_counter.add(
            1,
            attributes={
                "client_id": client_id,
                "source_language": language,
                "action": "created"
            }
        )

    def record_job_status_change(self, job_id: str, old_status: str, new_status: str):
        """Move one job from ``old_status`` to ``new_status`` in the per-status counts.

        When ``old_status`` is empty or ``None`` (a job with no previous
        status) only the increment for ``new_status`` is recorded.
        ``job_id`` is accepted for interface symmetry and is not recorded.
        """
        # Decrement old status count, if there was one
        if old_status:
            self.job_status_gauge.add(
                -1,
                attributes={"status": old_status}
            )

        # Increment new status count
        self.job_status_gauge.add(
            1,
            attributes={"status": new_status}
        )

    def record_job_processing_time(self, stage: str, duration_seconds: float, job_id: str):
        """Record how long ``stage`` took for ``job_id``, in seconds."""
        self.job_processing_duration.record(
            duration_seconds,
            attributes={
                "stage": stage,
                "job_id": job_id
            }
        )

    # AI service metrics methods
    def record_ai_request(self, service: str, operation: str, language: Optional[str] = None):
        """Count one AI request; ``language`` is attached only when provided."""
        attributes = {
            "service": service,
            "operation": operation
        }
        if language:
            attributes["language"] = language

        self.ai_requests_counter.add(1, attributes=attributes)

    def record_ai_request_duration(self, service: str, operation: str, duration_seconds: float):
        """Record the duration of one AI request, in seconds."""
        self.ai_request_duration.record(
            duration_seconds,
            attributes={
                "service": service,
                "operation": operation
            }
        )

    def record_ai_confidence(self, confidence: float, service: str):
        """Record one AI confidence score for ``service``."""
        self.ai_confidence_histogram.record(
            confidence,
            attributes={"service": service}
        )

    # Storage metrics methods
    def record_storage_operation(self, operation: str, file_type: str, success: bool):
        """Count one storage operation, tagged ``success`` or ``error``."""
        self.storage_operations_counter.add(
            1,
            attributes={
                "operation": operation,
                "file_type": file_type,
                "result": "success" if success else "error"
            }
        )

    def record_storage_duration(self, operation: str, duration_seconds: float):
        """Record the duration of one storage operation, in seconds."""
        self.storage_operation_duration.record(
            duration_seconds,
            attributes={"operation": operation}
        )

    # Queue metrics methods
    def record_celery_task(self, task_name: str, queue: str, result: str):
        """Count one Celery task execution with its queue and result."""
        self.queue_tasks_counter.add(
            1,
            attributes={
                "task_name": task_name,
                "queue": queue,
                "result": result
            }
        )

    def record_celery_task_duration(self, task_name: str, duration_seconds: float):
        """Record the duration of one Celery task execution, in seconds."""
        self.queue_task_duration.record(
            duration_seconds,
            attributes={"task_name": task_name}
        )

    # Auth metrics methods
    def record_auth_attempt(self, result: str, user_role: Optional[str] = None):
        """Count one authentication attempt; ``user_role`` is attached only when provided."""
        attributes = {"result": result}
        if user_role:
            attributes["user_role"] = user_role

        self.auth_attempts_counter.add(1, attributes=attributes)

    def update_active_users(self, count_change: int, user_role: str):
        """Adjust the active-user count for ``user_role`` by ``count_change``."""
        self.active_users_gauge.add(
            count_change,
            attributes={"user_role": user_role}
        )
|
||||
|
||||
|
||||
# Global metrics instance
|
||||
app_metrics = ApplicationMetrics()
|
||||
|
||||
|
||||
class MetricsTimer:
    """Context manager that times its body and reports the elapsed seconds.

    On exit the recorder is called as
    ``metric_recorder(duration, *args, **kwargs)``, whether or not the body
    raised. Exceptions from the body are never suppressed.
    """

    def __init__(self, metric_recorder, *args, **kwargs):
        """Store the recorder and the extra arguments to pass to it on exit."""
        self.metric_recorder = metric_recorder
        self.args = args
        self.kwargs = kwargs
        self.start_time = None  # set by __enter__

    def __enter__(self):
        # perf_counter is monotonic, so clock adjustments cannot produce
        # negative or inflated durations (unlike wall-clock time.time()).
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Compare against None explicitly: a start reading of 0.0 is valid.
        if self.start_time is not None:
            duration = time.perf_counter() - self.start_time
            self.metric_recorder(duration, *self.args, **self.kwargs)
|
||||
|
||||
|
||||
# Convenience functions for common metrics patterns
|
||||
def time_job_processing(stage: str, job_id: str):
    """Return a context manager that times one job-processing stage.

    ``MetricsTimer`` invokes its recorder with the duration as the first
    positional argument, whereas ``record_job_processing_time`` expects
    ``(stage, duration_seconds, job_id)``. The adapter below puts the
    duration in the right slot; passing the method directly would record the
    stage name as the duration.
    """
    return MetricsTimer(
        lambda duration: app_metrics.record_job_processing_time(stage, duration, job_id)
    )
|
||||
|
||||
|
||||
def time_ai_request(service: str, operation: str):
    """Return a context manager that times one AI service request.

    ``MetricsTimer`` invokes its recorder with the duration as the first
    positional argument, whereas ``record_ai_request_duration`` expects
    ``(service, operation, duration_seconds)``. The adapter below puts the
    duration in the right slot instead of shifting every argument by one.
    """
    return MetricsTimer(
        lambda duration: app_metrics.record_ai_request_duration(service, operation, duration)
    )
|
||||
|
||||
|
||||
def time_storage_operation(operation: str):
    """Return a context manager that times one storage operation.

    ``MetricsTimer`` invokes its recorder with the duration as the first
    positional argument, whereas ``record_storage_duration`` expects
    ``(operation, duration_seconds)``. The adapter below keeps the two from
    being swapped.
    """
    return MetricsTimer(
        lambda duration: app_metrics.record_storage_duration(operation, duration)
    )
|
||||
|
||||
|
||||
def time_celery_task(task_name: str):
    """Return a context manager that times one Celery task execution.

    ``MetricsTimer`` invokes its recorder with the duration as the first
    positional argument, whereas ``record_celery_task_duration`` expects
    ``(task_name, duration_seconds)``. The adapter below keeps the two from
    being swapped.
    """
    return MetricsTimer(
        lambda duration: app_metrics.record_celery_task_duration(task_name, duration)
    )
|
||||
|
||||
|
||||
def track_rate_limit_metrics(identifier: str, is_allowed: bool, current_requests: int, limit: int):
    """Count one rate-limit check.

    The part of ``identifier`` before the first ``":"`` is recorded as the
    identifier type (``"unknown"`` when there is no colon).
    ``current_requests`` and ``limit`` are accepted but not recorded.
    Nothing happens if ``app_metrics`` has no ``rate_limit_counter``.
    """
    if not hasattr(app_metrics, 'rate_limit_counter'):
        return

    prefix, separator, _ = identifier.partition(":")
    identifier_type = prefix if separator else "unknown"

    if is_allowed:
        outcome = "allowed"
    else:
        outcome = "blocked"

    app_metrics.rate_limit_counter.add(
        1,
        attributes={
            "identifier_type": identifier_type,
            "is_allowed": str(is_allowed),
            "status": outcome
        }
    )
|
||||
|
||||
|
||||
def track_validation_metrics(endpoint: str, method: str, is_valid: bool, validation_time: float, error_types: list):
    """Record the outcome and duration of one request validation.

    Each instrument is optional and is only touched when ``app_metrics``
    exposes it:

    * ``validation_counter`` is incremented by one, tagged with the endpoint,
      method, validity flag and the comma-joined ``error_types`` (``"none"``
      when the list is empty or falsy).
    * ``validation_duration`` records ``validation_time``, tagged with the
      endpoint and method.
    """
    if hasattr(app_metrics, 'validation_counter'):
        joined_errors = ",".join(error_types) if error_types else "none"
        counter_attributes = {
            "endpoint": endpoint,
            "method": method,
            "is_valid": str(is_valid),
            "error_types": joined_errors,
        }
        app_metrics.validation_counter.add(1, attributes=counter_attributes)

    if hasattr(app_metrics, 'validation_duration'):
        duration_attributes = {"endpoint": endpoint, "method": method}
        app_metrics.validation_duration.record(
            validation_time,
            attributes=duration_attributes,
        )
|
||||
268
backend/app/telemetry/tracing.py
Normal file
268
backend/app/telemetry/tracing.py
Normal file
|
|
@ -0,0 +1,268 @@
|
|||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from opentelemetry import trace
|
||||
# from opentelemetry.exporter.gcp.trace import CloudTraceSpanExporter # Disabled for local dev
|
||||
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
||||
from opentelemetry.instrumentation.pymongo import PymongoInstrumentor
|
||||
from opentelemetry.instrumentation.redis import RedisInstrumentor
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
||||
|
||||
from ..core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def setup_tracing(app_name: str = "accessible-video-api", service_version: str = "1.0.0"):
    """Initialize OpenTelemetry tracing for the application.

    Builds a ``TracerProvider`` tagged with service metadata, installs it as
    the global tracer provider and attaches a span processor chosen from the
    settings:

    * ``app_env == "prod"`` with a GCP project id: Cloud Trace is the intended
      target, but its exporter is currently commented out, so no span
      processor is registered and a warning is logged.
    * otherwise, when ``otel_exporter_otlp_endpoint`` is set: a batching
      OTLP/gRPC exporter pointed at that endpoint.
    * otherwise: a batching console exporter for development.

    Args:
        app_name: Value reported as ``service.name``.
        service_version: Value reported as ``service.version``.

    Returns:
        The configured ``TracerProvider``.
    """

    # Create resource with service information
    resource = Resource.create({
        "service.name": app_name,
        "service.version": service_version,
        "service.namespace": "accessible-video",
        "deployment.environment": settings.app_env,
    })

    # Set up tracer provider
    tracer_provider = TracerProvider(resource=resource)
    trace.set_tracer_provider(tracer_provider)

    # Configure span processor and exporter based on environment
    if settings.app_env == "prod" and settings.gcp_project_id:
        # TODO: re-enable the Google Cloud Trace exporter for production:
        # cloud_trace_exporter = CloudTraceSpanExporter(
        #     project_id=settings.gcp_project_id
        # )
        # span_processor = BatchSpanProcessor(cloud_trace_exporter)
        # tracer_provider.add_span_processor(span_processor)
        #
        # Until then this branch attaches nothing, so say so loudly instead
        # of logging a "local dev" message from a production process.
        logger.warning(
            "Google Cloud Trace exporter is disabled; no span processor "
            "registered, traces will not be exported"
        )

    elif settings.otel_exporter_otlp_endpoint:
        # Use OTLP exporter for other observability platforms.
        # Imported lazily so the OTLP package is only needed when configured.
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

        otlp_exporter = OTLPSpanExporter(
            endpoint=settings.otel_exporter_otlp_endpoint,
            headers={},
        )
        span_processor = BatchSpanProcessor(otlp_exporter)
        tracer_provider.add_span_processor(span_processor)
        logger.info(f"Configured OTLP trace exporter: {settings.otel_exporter_otlp_endpoint}")

    else:
        # Development mode - use console exporter
        from opentelemetry.sdk.trace.export import ConsoleSpanExporter

        console_exporter = ConsoleSpanExporter()
        span_processor = BatchSpanProcessor(console_exporter)
        tracer_provider.add_span_processor(span_processor)
        logger.info("Configured console trace exporter for development")

    logger.info(f"OpenTelemetry tracing initialized for {app_name}")

    return tracer_provider
|
||||
|
||||
|
||||
def instrument_fastapi_app(app):
    """Enable automatic request tracing on a FastAPI application.

    Uses the globally registered tracer provider and excludes URLs matching
    ``health`` or ``metrics`` from tracing.
    """
    provider = trace.get_tracer_provider()
    # Don't trace health checks
    untraced_urls = "health,metrics"
    FastAPIInstrumentor.instrument_app(
        app,
        tracer_provider=provider,
        excluded_urls=untraced_urls,
    )
    logger.info("FastAPI instrumentation enabled")
|
||||
|
||||
|
||||
def instrument_dependencies():
    """Enable automatic tracing for the MongoDB and Redis clients.

    Each instrumentor is attached to the globally registered tracer provider,
    MongoDB first and Redis second, with a log line after each.
    """
    instrumentors = (
        (PymongoInstrumentor, "MongoDB instrumentation enabled"),
        (RedisInstrumentor, "Redis instrumentation enabled"),
    )
    for instrumentor_cls, enabled_message in instrumentors:
        instrumentor_cls().instrument(
            tracer_provider=trace.get_tracer_provider()
        )
        logger.info(enabled_message)
|
||||
|
||||
|
||||
def get_tracer(name: str = "accessible-video"):
    """Return a tracer named ``name`` for manual instrumentation."""
    tracer = trace.get_tracer(name)
    return tracer
|
||||
|
||||
|
||||
def trace_async_operation(operation_name: str, **attributes):
    """Decorator factory that runs an async callable inside a span.

    The span is named ``operation_name`` and starts with ``attributes``.
    On success ``operation.result`` is set to ``"success"``; on failure it is
    set to ``"error"``, ``operation.error_message`` carries ``str(exc)`` and
    the exception is re-raised unchanged.

    Args:
        operation_name: Name given to the span.
        **attributes: Initial span attributes.

    Returns:
        A decorator for ``async def`` callables.
    """
    # Local import keeps this block self-contained.
    import functools

    def decorator(func):
        # wraps() preserves the name, docstring and signature of ``func`` so
        # introspection-based frameworks still see the real callable.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            tracer = get_tracer()

            with tracer.start_as_current_span(
                operation_name,
                attributes=attributes
            ) as span:
                try:
                    result = await func(*args, **kwargs)
                    span.set_attribute("operation.result", "success")
                    return result
                except Exception as e:
                    span.set_attribute("operation.result", "error")
                    span.set_attribute("operation.error_message", str(e))
                    # No manual record_exception(): start_as_current_span
                    # records the exception and sets the error status by
                    # default as it propagates, so recording it here would
                    # duplicate the exception event.
                    raise

        return wrapper
    return decorator
|
||||
|
||||
|
||||
def trace_job_pipeline(job_id: str, pipeline_stage: str):
    """Decorator factory that traces one job pipeline stage.

    The span is named ``job_pipeline.<pipeline_stage>`` and carries
    ``job.id`` and ``job.pipeline_stage``. Both values are fixed when the
    decorator is applied, not per call. On success ``job.result`` is
    ``"success"``; on failure it is ``"error"``, ``job.error_message``
    carries ``str(exc)`` and the exception is re-raised unchanged.

    Args:
        job_id: Job identifier stored on the span.
        pipeline_stage: Stage name, used in the span name and attributes.

    Returns:
        A decorator for ``async def`` callables.
    """
    # Local import keeps this block self-contained.
    import functools

    def decorator(func):
        # wraps() preserves the name, docstring and signature of ``func``.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            tracer = get_tracer()

            with tracer.start_as_current_span(
                f"job_pipeline.{pipeline_stage}",
                attributes={
                    "job.id": job_id,
                    "job.pipeline_stage": pipeline_stage,
                }
            ) as span:
                try:
                    result = await func(*args, **kwargs)
                    span.set_attribute("job.result", "success")
                    return result
                except Exception as e:
                    span.set_attribute("job.result", "error")
                    span.set_attribute("job.error_message", str(e))
                    # start_as_current_span already records the exception
                    # and sets the error status by default; a manual
                    # record_exception() would duplicate the event.
                    raise

        return wrapper
    return decorator
|
||||
|
||||
|
||||
def trace_ai_operation(operation_type: str, language: Optional[str] = None):
    """Decorator factory that traces an AI service call.

    The span is named ``ai.<operation_type>``. ``ai.provider`` is ``"gemini"``
    when ``operation_type`` contains ``"gemini"`` and ``"google_translate"``
    otherwise; ``ai.language`` is added when ``language`` is truthy. When the
    wrapped coroutine returns a dict, its ``confidence`` and ``language``
    entries (if present) are copied to ``ai.confidence`` and
    ``ai.detected_language``. ``ai.result`` is ``"success"`` or ``"error"``;
    on error ``ai.error_message`` carries ``str(exc)`` and the exception is
    re-raised unchanged.

    Args:
        operation_type: Operation label, used in the span name and attributes.
        language: Optional language tag stored on the span.

    Returns:
        A decorator for ``async def`` callables.
    """
    # Local import keeps this block self-contained.
    import functools

    def decorator(func):
        # wraps() preserves the name, docstring and signature of ``func``.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            tracer = get_tracer()

            span_attributes = {
                "ai.operation_type": operation_type,
                "ai.provider": "gemini" if "gemini" in operation_type else "google_translate"
            }

            if language:
                span_attributes["ai.language"] = language

            with tracer.start_as_current_span(
                f"ai.{operation_type}",
                attributes=span_attributes
            ) as span:
                try:
                    result = await func(*args, **kwargs)

                    # Add result attributes if available
                    if isinstance(result, dict):
                        if "confidence" in result:
                            span.set_attribute("ai.confidence", result["confidence"])
                        if "language" in result:
                            span.set_attribute("ai.detected_language", result["language"])

                    span.set_attribute("ai.result", "success")
                    return result
                except Exception as e:
                    span.set_attribute("ai.result", "error")
                    span.set_attribute("ai.error_message", str(e))
                    # start_as_current_span already records the exception
                    # and sets the error status by default; a manual
                    # record_exception() would duplicate the event.
                    raise

        return wrapper
    return decorator
|
||||
|
||||
|
||||
def trace_storage_operation(operation_type: str, file_path: str):
    """Decorator factory that traces a storage operation.

    The span is named ``storage.<operation_type>`` and carries
    ``storage.operation``, ``storage.path`` and ``storage.provider`` (always
    ``"gcs"``). On success ``storage.result`` is ``"success"`` and, when the
    wrapped coroutine returns a string starting with ``gs://``, that string
    is stored as ``storage.result_uri``. On failure ``storage.result`` is
    ``"error"``, ``storage.error_message`` carries ``str(exc)`` and the
    exception is re-raised unchanged.

    Args:
        operation_type: Operation label, used in the span name and attributes.
        file_path: Path stored on the span; fixed when the decorator is applied.

    Returns:
        A decorator for ``async def`` callables.
    """
    # Local import keeps this block self-contained.
    import functools

    def decorator(func):
        # wraps() preserves the name, docstring and signature of ``func``.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            tracer = get_tracer()

            with tracer.start_as_current_span(
                f"storage.{operation_type}",
                attributes={
                    "storage.operation": operation_type,
                    "storage.path": file_path,
                    "storage.provider": "gcs"
                }
            ) as span:
                try:
                    result = await func(*args, **kwargs)
                    span.set_attribute("storage.result", "success")

                    if isinstance(result, str) and result.startswith("gs://"):
                        span.set_attribute("storage.result_uri", result)

                    return result
                except Exception as e:
                    span.set_attribute("storage.result", "error")
                    span.set_attribute("storage.error_message", str(e))
                    # start_as_current_span already records the exception
                    # and sets the error status by default; a manual
                    # record_exception() would duplicate the event.
                    raise

        return wrapper
    return decorator
|
||||
|
||||
|
||||
class TracingContext:
    """Context manager for manual span creation with attributes.

    Entering starts a span, makes it the current span and returns it, so
    spans created inside the ``with`` block are parented under it. Exiting
    restores the previous current span and ends this one. When the block
    raises, the span is tagged with ``error`` / ``error_message``, the
    exception is recorded and the span status is set to ERROR; the exception
    itself is not suppressed.

    Args:
        span_name: Name of the span to create.
        attributes: Optional initial span attributes.
    """

    def __init__(self, span_name: str, attributes: Optional[dict] = None):
        self.span_name = span_name
        self.attributes = attributes or {}
        self.tracer = get_tracer()
        self.span = None
        # Handle for the "span is current" scope; set while the context is open.
        self._activation = None

    def __enter__(self):
        self.span = self.tracer.start_span(self.span_name, attributes=self.attributes)
        # start_span() alone does not touch the active context, so activate
        # the span explicitly. Exception recording and ending are handled in
        # __exit__, hence every automatic behaviour is switched off here.
        self._activation = trace.use_span(
            self.span,
            end_on_exit=False,
            record_exception=False,
            set_status_on_exception=False,
        )
        self._activation.__enter__()
        return self.span

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            if exc_type:
                self.span.set_attribute("error", True)
                self.span.set_attribute("error_message", str(exc_val))
                self.span.record_exception(exc_val)
                self.span.set_status(
                    trace.Status(trace.StatusCode.ERROR, str(exc_val))
                )
        finally:
            # Always deactivate and end the span, even if annotating the
            # error failed, so the context never leaks a dangling span.
            if self._activation is not None:
                self._activation.__exit__(None, None, None)
                self._activation = None
            self.span.end()
|
||||
|
||||
|
||||
# Convenience functions for common tracing patterns
|
||||
def trace_api_request(endpoint: str, user_id: Optional[str] = None):
    """Build a ``TracingContext`` for an API request.

    The span is named ``api.<endpoint>`` with every ``/`` replaced by ``_``
    and carries ``http.route`` and ``component="api"``; ``user.id`` is added
    only when ``user_id`` is truthy.
    """
    span_attrs = {
        "http.route": endpoint,
        "component": "api",
        **({"user.id": user_id} if user_id else {}),
    }
    span_name = f"api.{endpoint.replace('/', '_')}"
    return TracingContext(span_name, span_attrs)
|
||||
|
||||
|
||||
def trace_celery_task(task_name: str, job_id: Optional[str] = None):
    """Build a ``TracingContext`` for a Celery task execution.

    The span is named ``celery.<task_name>`` and carries ``celery.task_name``
    and ``component="worker"``; ``job.id`` is added only when ``job_id`` is
    truthy.
    """
    span_attrs = {
        "celery.task_name": task_name,
        "component": "worker",
        **({"job.id": job_id} if job_id else {}),
    }
    span_name = f"celery.{task_name}"
    return TracingContext(span_name, span_attrs)
|
||||
42
backend/celery_worker.py
Normal file
42
backend/celery_worker.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import sentry_sdk
|
||||
from sentry_sdk.integrations.celery import CeleryIntegration
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.logging import setup_logging, get_logger
|
||||
from app.tasks import celery_app
|
||||
|
||||
# Logging must be configured before anything else emits a record.
setup_logging()
logger = get_logger(__name__)

# Sentry is optional: only initialise it when the DSN looks like a URL.
# ("https..." also starts with "http", so one prefix check covers both.)
if settings.sentry_dsn and settings.sentry_dsn.startswith('http'):
    sentry_sdk.init(
        dsn=settings.sentry_dsn,
        integrations=[CeleryIntegration(monitor_beat_tasks=True)],
        environment=settings.app_env,
        release="1.0.0",
        send_default_pii=False,
    )

logger.info("Starting Celery worker with structured logging")

# Importing the task modules is what registers their tasks with celery_app;
# the imported names themselves are intentionally unused.
from app.tasks import ingest_and_ai  # noqa: E402,F401
from app.tasks import translate_and_synthesize  # noqa: E402,F401

# Debug: dump what the worker ended up with.
logger.info(f"Celery app: {celery_app}")
logger.info(f"Registered tasks: {list(celery_app.tasks.keys())}")
logger.info(f"Task routes: {celery_app.conf.task_routes}")
logger.info("Worker listening to queues: default,ingest")

# Sanity check: the translation task must be present in the registry.
if 'app.tasks.translate_and_synthesize.translate_and_synthesize_task' not in celery_app.tasks:
    logger.error("❌ translate_and_synthesize_task is NOT registered")
    logger.error(f"Available tasks: {[t for t in celery_app.tasks.keys() if not t.startswith('celery.')]}")
else:
    logger.info("✅ translate_and_synthesize_task is registered")

if __name__ == "__main__":
    celery_app.start()
|
||||
8
backend/cors-config.json
Normal file
8
backend/cors-config.json
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
[
|
||||
{
|
||||
"origin": ["*"],
|
||||
"method": ["GET", "HEAD", "OPTIONS"],
|
||||
"responseHeader": ["*"],
|
||||
"maxAgeSeconds": 3600
|
||||
}
|
||||
]
|
||||
78
backend/create_test_users.py
Normal file
78
backend/create_test_users.py
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Create test users for the accessible video platform."""
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from passlib.context import CryptContext
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
from app.core.config import settings
|
||||
from app.models.user import UserRole
|
||||
|
||||
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
|
||||
|
||||
async def create_test_users():
    """Create the admin, reviewer and client test users if they are missing.

    Connects to the MongoDB instance named in ``settings``, inserts each test
    account whose email is not already present, then prints every user in the
    collection. The client is closed even when a step fails.

    NOTE: the passwords are deliberately trivial test credentials; do not run
    this script against a production database.
    """
    print("Connecting to MongoDB...")
    client = AsyncIOMotorClient(settings.mongodb_uri)

    try:
        db = client[settings.mongodb_db]

        # Test connection
        await client.admin.command('ping')
        print("Connected to MongoDB successfully")

        users_collection = db.users

        # (email, plaintext password, full name, role)
        seed_accounts = [
            ("admin@example.com", "admin", "Admin User", UserRole.ADMIN),
            ("reviewer@example.com", "reviewer", "Reviewer User", UserRole.REVIEWER),
            ("client@example.com", "client123", "Client User", UserRole.CLIENT),
        ]

        for email, password, full_name, role in seed_accounts:
            existing = await users_collection.find_one({"email": email})
            if existing:
                print(f"User {email} already exists, skipping...")
                continue

            # Hash only for users that are actually inserted; bcrypt is
            # deliberately slow, so hashing skipped users is wasted work.
            user = {
                "email": email,
                "hashed_password": pwd_context.hash(password),
                "full_name": full_name,
                "role": role.value,
                "is_active": True,
                "created_at": datetime.utcnow(),
                "updated_at": datetime.utcnow(),
            }

            result = await users_collection.insert_one(user)
            print(f"Created user: {email} (ID: {result.inserted_id})")

        # Show all users
        print("\nAll users in database:")
        async for user in users_collection.find({}, {"email": 1, "role": 1, "is_active": 1}):
            print(f"  {user['email']} - {user['role']} - Active: {user['is_active']}")
    finally:
        # Release the connection pool on failure as well as on success.
        client.close()

    print("Done!")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the seeding coroutine to completion.
    asyncio.run(create_test_users())
|
||||
52
backend/debug_login.py
Normal file
52
backend/debug_login.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Debug login issues by testing components individually."""
|
||||
|
||||
import asyncio
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
from app.core.config import settings
|
||||
from app.core.security import verify_password
|
||||
from app.models.user import User
|
||||
|
||||
async def test_database_connection():
    """Walk through the login prerequisites and print the result of each.

    Pings MongoDB, lists collections, counts users, loads the
    ``admin@example.com`` document into the ``User`` model and checks the
    password ``"admin"`` against its stored hash. Any exception is printed
    with its traceback, and the client is always closed.
    """
    print("1. Testing database connection...")
    mongo_client = AsyncIOMotorClient(settings.mongodb_uri)
    database = mongo_client[settings.mongodb_db]

    try:
        # Test connection
        await mongo_client.admin.command('ping')
        print("✅ Database connection successful")

        # Check if users collection exists
        collection_names = await database.list_collection_names()
        print(f"✅ Collections: {collection_names}")

        # Count users
        total_users = await database.users.count_documents({})
        print(f"✅ User count: {total_users}")

        # Find admin user
        admin_doc = await database.users.find_one({"email": "admin@example.com"})
        if not admin_doc:
            print("❌ Admin user not found")
            return

        print(f"✅ Found admin user: {admin_doc['email']}")
        admin_user = User(**admin_doc)
        print("✅ User model validation successful")

        # Test password verification
        print("2. Testing password verification...")
        matches = verify_password("admin", admin_user.hashed_password)
        print(f"✅ Password verification result: {matches}")

    except Exception as error:
        print(f"❌ Database error: {error}")
        import traceback
        traceback.print_exc()
    finally:
        mongo_client.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the diagnostic coroutine to completion.
    asyncio.run(test_database_connection())
|
||||
29
backend/gunicorn_conf.py
Normal file
29
backend/gunicorn_conf.py
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
import multiprocessing
|
||||
import os
|
||||
|
||||
# Server socket
# Listen on all interfaces; the port comes from $PORT and defaults to 8000.
bind = f"0.0.0.0:{os.getenv('PORT', '8000')}"
backlog = 2048

# Worker processes
workers = multiprocessing.cpu_count() * 2 + 1
worker_class = "uvicorn.workers.UvicornWorker"
worker_connections = 1000
# Recycle each worker after roughly max_requests requests; the jitter
# staggers the restarts so workers do not all recycle at once.
max_requests = 1000
max_requests_jitter = 50

# Timeouts
timeout = 120
keepalive = 2

# Logging
loglevel = "info"
access_log_format = '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s" %(D)s'
# "-" sends the access log to stdout and the error log to stderr.
accesslog = "-"
errorlog = "-"

# Process naming
proc_name = "accessible-video-api"

# Application
# `module` is not a Gunicorn setting (unknown names in a config file are
# ignored); it is kept for anything that reads it directly. `wsgi_app` is the
# setting Gunicorn actually honours when no app is given on the command line.
module = "app.main:app"
wsgi_app = module
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue