Compare commits

..

No commits in common. "main" and "feat/workflow-blockers" have entirely different histories.

310 changed files with 7306 additions and 22323 deletions

View file

@ -1,94 +0,0 @@
{
"permissions": {
"allow": [
"WebSearch",
"Bash(cd /Volumes/SSD/Projects/Oliver/video-accessibility/backend && ruff check app/services/elevenlabs_voices.py app/services/tts.py app/api/v1/routes_tts.py app/models/job.py app/tasks/tts_synthesis.py app/core/config.py 2>&1)",
"Bash(cd /Volumes/SSD/Projects/Oliver/video-accessibility/backend && python -m ruff check app/services/elevenlabs_voices.py app/services/tts.py app/api/v1/routes_tts.py app/models/job.py app/tasks/tts_synthesis.py app/core/config.py 2>&1)",
"Bash(cd /Volumes/SSD/Projects/Oliver/video-accessibility/backend && pip3 show ruff 2>&1 | head -5; which pip3 2>&1)",
"Bash(cd /Volumes/SSD/Projects/Oliver/video-accessibility/frontend && npm run type-check 2>&1 | tail -20)",
"Bash(node_modules/.bin/tsc --noEmit 2>&1 | tail -20)",
"Bash(./node_modules/.bin/tsc --noEmit 2>&1 | tail -30)",
"Bash(npm run type-check 2>&1)",
"Bash(cd /Volumes/SSD/Projects/Oliver/video-accessibility/frontend && npm run type-check 2>&1)",
"Bash(npm run lint 2>&1)",
"WebFetch(domain:dcmp.org)",
"WebFetch(domain:www.w3.org)",
"WebFetch(domain:partnerhelp.netflixstudios.com)",
"WebFetch(domain:m.media-amazon.com)",
"WebFetch(domain:www.acb.org)",
"Bash(./node_modules/.bin/tsc --noEmit)",
"Bash(node_modules/.bin/tsc --noEmit)",
"Bash(pandoc --version)",
"WebFetch(domain:ai-sandbox.oliver.solutions)",
"Bash(gcloud run:*)",
"Bash(gcloud logging:*)",
"Bash(ssh optical:*)",
"Bash(/Volumes/SSD/Projects/Oliver/video-accessibility/backend/.venv/bin/python3.11 -c \"import sys; sys.path.insert\\(0, '.'\\); from app.models.user import UserRole; print\\([r.value for r in UserRole]\\)\")",
"Bash(npm list *)",
"Bash(brew list *)",
"Bash(npx --yes puppeteer --version)",
"Bash(node md_to_pdf.js)",
"Bash(npm root *)",
"Bash(node *)",
"Bash(ssh optical-web-1 *)",
"Bash(git *)",
"WebFetch(domain:docs.anthropic.com)",
"Bash(poetry lock *)",
"Bash(pip show *)",
"Read(//Users/ai_leed/.local/bin/**)",
"Read(//opt/homebrew/bin/**)",
"Bash(pip3 install *)",
"Bash(poetry --version)",
"Bash(docker run *)",
"Read(//Users/ai_leed/.docker/run/**)",
"Bash(docker context *)",
"Bash(DOCKER_HOST=unix:///var/run/docker.sock docker run --rm -v \"$\\(pwd\\):/app\" -w /app python:3.11-slim bash -c \"pip install poetry==1.8.2 -q && poetry lock --no-update\")",
"Bash(brew install *)",
"Bash(npm run *)",
"Bash(scp /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/backend/app/models/audit_log.py optical:/tmp/audit_log.py)",
"Bash(scp *)",
"Bash(kill %1)",
"Bash(ssh optical-dev *)",
"Skill(fullstack-dev-skills:security-reviewer)",
"Bash(chmod +x *)",
"Bash(gcloud auth *)",
"Bash(gcloud config *)",
"Bash(gcloud artifacts *)",
"Bash(sed -n '190,200p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/backend/app/api/v1/routes_jobs.py)",
"Bash(sed -n '1914,1922p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/backend/app/api/v1/routes_jobs.py)",
"Bash(sed -n '2048,2062p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/backend/app/api/v1/routes_jobs.py)",
"Bash(sed -n '2490,2502p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/backend/app/api/v1/routes_jobs.py)",
"Bash(sed -n '2628,2638p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/backend/app/api/v1/routes_jobs.py)",
"Bash(gcloud builds submit *)",
"Bash(gcloud builds describe 79802b34-e17b-4446-b01d-68d99d569262 *)",
"Bash(gcloud compute instances list *)",
"Bash(gcloud compute networks vpc-access connectors list *)",
"Bash(gcloud builds *)",
"Bash(gcloud projects get-iam-policy optical-414516 *)",
"Bash(gcloud projects *)",
"Bash(npm audit *)",
"Skill(codebase-audit-suite:ln-622-build-auditor)",
"Skill(codebase-audit-suite:ln-624-code-quality-auditor)",
"Skill(codebase-audit-suite:ln-625-dependencies-auditor)",
"Skill(codebase-audit-suite:ln-626-dead-code-auditor)",
"Bash(/opt/homebrew/bin/ruff check *)",
"Bash(npm test *)",
"Bash(sed -n '35,42p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/frontend/src/test/utils.tsx)",
"Bash(sed -n '55,90p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/frontend/tests/helpers/auth.ts)",
"Bash(sed -n '48,60p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/frontend/src/components/Layout/Sidebar.tsx)",
"Bash(sed -n '152,170p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/frontend/src/components/Layout/Sidebar.tsx)",
"Bash(poetry env *)",
"Bash(poetry install *)",
"Bash(poetry run *)",
"Bash(docker info *)",
"Bash(sed -n '1,30p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/backend/app/services/gcs.py)",
"Bash(sed -n '155,165p' /Users/ai_leed/Documents/Projects/Oliver/video-accessibility/backend/app/services/gcs.py)",
"Bash(gcloud secrets *)",
"Bash(openssl rand *)",
"Bash(ssh *)",
"Skill(commit-commands:commit-push-pr)",
"Bash(obsidian read *)",
"Bash(obsidian search *)"
]
}
}

View file

@ -9,18 +9,18 @@
# App Configuration
# -----------------------------------------------------------------------------
APP_ENV=prod
API_BASE_URL=https://optical-dev.oliver.solutions/video-accessibility
API_BASE_URL=https://ai-sandbox.oliver.solutions/video-accessibility-back
# -----------------------------------------------------------------------------
# Authentication & Security
# -----------------------------------------------------------------------------
# IMPORTANT: Generate a secure random secret for JWT_SECRET
# Example: openssl rand -hex 32
JWT_SECRET=d81fd31798510f53b374951908b6bedd75f7ddaabe9b4e4c4ca5bf81393f48b7
JWT_SECRET=CHANGE_ME_TO_SECURE_RANDOM_64_CHAR_STRING
JWT_ALG=HS256
JWT_ACCESS_TTL_MIN=240
JWT_REFRESH_TTL_DAYS=7
COOKIE_DOMAIN=optical-dev.oliver.solutions
COOKIE_DOMAIN=ai-sandbox.oliver.solutions
COOKIE_SECURE=true
COOKIE_SAMESITE=Lax
@ -63,31 +63,29 @@ TRANSLATE_API_KEY=
ELEVENLABS_API_KEY=sk_c17be2768ca784f1807018420b84c7f1ee969946e698f986
# -----------------------------------------------------------------------------
# Email Configuration (Mailgun)
# Email Configuration (SendGrid)
# -----------------------------------------------------------------------------
# IMPORTANT: Get SendGrid API key from https://app.sendgrid.com/settings/api_keys
SENDGRID_API_KEY=
MAILGUN_API_KEY=1d8c6f38c53f237305353cc2e55f39f2-c6620443-4b9961f5
MAILGUN_DOMAIN=mg.oliver.solutions
MAILGUN_FROM=noreply@mg.oliver.solutions
# Email sender address
EMAIL_FROM=noreply@mg.oliver.solutions
# Email sender address (must be verified in SendGrid)
EMAIL_FROM=noreply@ai-sandbox.oliver.solutions
# Client-facing URL (used in emails)
CLIENT_BASE_URL=https://optical-dev.oliver.solutions/video-accessibility
CLIENT_BASE_URL=https://ai-sandbox.oliver.solutions/video-accessibility
# -----------------------------------------------------------------------------
# Microsoft Authentication (Azure AD)
# -----------------------------------------------------------------------------
AZURE_CLIENT_ID=9079054c-9620-4757-a256-23413042f1ef
AZURE_AUTHORITY=https://login.microsoftonline.com/e519c2e6-bc6d-4fdf-8d9c-923c2f002385
AZURE_REDIRECT_URI=https://optical-dev.oliver.solutions/video-accessibility/
AZURE_REDIRECT_URI=https://ai-sandbox.oliver.solutions/video-accessibility/
# -----------------------------------------------------------------------------
# CORS Configuration
# -----------------------------------------------------------------------------
# Comma-separated list of allowed origins
CORS_ORIGINS=https://optical-dev.oliver.solutions
CORS_ORIGINS=https://ai-sandbox.oliver.solutions
# -----------------------------------------------------------------------------
# Observability & Monitoring (Optional)
@ -118,9 +116,6 @@ OTEL_EXPORTER_OTLP_ENDPOINT=
WHISPER_SERVICE_URL=https://whisper-http-service-bcb6ipdqka-uc.a.run.app
FFMPEG_SERVICE_URL=https://ffmpeg-http-service-bcb6ipdqka-uc.a.run.app
# optical-dev uses Celery workers (not Cloud Run Jobs) for pipeline dispatch
USE_CELERY_FALLBACK=true
# Worker Concurrency (higher values for Cloud Run mode since workers just make HTTP calls)
WHISPER_WORKER_CONCURRENCY=10
FFMPEG_WORKER_CONCURRENCY=20

View file

@ -1,23 +0,0 @@
# Screenshot capture credentials — copy to .env.screenshots and fill in values
# NEVER commit .env.screenshots (it is gitignored)
BASE_URL=https://optical-dev.oliver.solutions/video-accessibility
# Local-password admin seeded by backend/scripts/seed_test_users.py
TEST_ADMIN_EMAIL=test-admin@oliver.agency
TEST_ADMIN_PASSWORD=TestAdmin2026!
TEST_CLIENT_EMAIL=test-client@oliver.agency
TEST_CLIENT_PASSWORD=TestClient2026!
TEST_LINGUIST_EMAIL=test-linguist@oliver.agency
TEST_LINGUIST_PASSWORD=TestLinguist2026!
TEST_REVIEWER_EMAIL=test-reviewer@oliver.agency
TEST_REVIEWER_PASSWORD=TestReviewer2026!
TEST_PRODUCTION_EMAIL=test-production@oliver.agency
TEST_PRODUCTION_PASSWORD=TestProduction2026!
TEST_PM_EMAIL=test-pm@oliver.agency
TEST_PM_PASSWORD=TestPM2026!

13
.gitignore vendored
View file

@ -12,7 +12,6 @@ examples/
.env.local
.env.production
.env.*.local
.env.screenshots
secrets/
*.pem
*.key
@ -99,15 +98,3 @@ docs/*.pdf
/var/www/html/video-accessibility.backup.*
backend/.env
# Node / npm artifacts at repo root (Playwright MCP installs these)
node_modules/
package.json
package-lock.json
# Playwright MCP session snapshots
.playwright-mcp/
# Test videos
test-video.mp4
.worktrees/

View file

@ -1,118 +0,0 @@
# Build Health Audit — ln-622
**Score: 5.5/10** | Issues: 28 (C:0 H:5 M:18 L:5)
**Date:** 2026-04-30 | **Stack:** Python 3.11 / FastAPI / Celery + React 19 / Vite / TypeScript 5.8
---
## 1. Compiler / Linter Errors
### Backend — ruff: 1314 errors (HIGH)
`ruff check app/` exits non-zero with 1314 violations. The ruff config in `pyproject.toml` uses **deprecated top-level `select`/`ignore`/`per-file-ignores`** instead of `[tool.ruff.lint]` — ruff emits a warning on every run.
Top violation codes:
| Code | Meaning | Volume |
|------|---------|--------|
| I001 | Import block unsorted | ~400 |
| UP | pyupgrade (f-strings, typing aliases) | ~500 |
| B | flake8-bugbear | ~200 |
| F401 | Unused import | 58 |
Most violations are **auto-fixable** (`ruff check --fix`). The unsorted imports and UP rules are cosmetic but make CI noisy and block future enforcement.
**Severity: HIGH** — CI cannot gate on ruff without fixing this first.
### Frontend — ESLint: 36 problems (30 errors, 6 warnings) (MEDIUM)
Key errors:
| File | Rule | Count |
|------|------|-------|
| `contexts/GlobalWebSocketContext.tsx:56` | `react-refresh/only-export-components` | 1 |
| `contexts/NotificationContext.tsx:91` | `react-refresh/only-export-components` | 1 |
| `contexts/ToastContext.tsx:83` | `react-refresh/only-export-components` | 1 |
| `lib/api.ts:539` | `@typescript-eslint/no-explicit-any` | 1 |
| `routes/admin/QCDetail.tsx` | `@typescript-eslint/no-explicit-any` | 6 |
| `routes/AcceptInvite.tsx` | `@typescript-eslint/no-explicit-any` | 1 |
| `routes/jobs/JobDetail.tsx` | `no-unused-vars` (err catch) | 2 |
| `hooks/__tests__/useJob.test.tsx` | `no-unused-vars` | 1 |
| `tests/helpers/auth.ts` | `no-explicit-any` | 3 |
**Severity: MEDIUM** — build succeeds, but `any` types and react-refresh errors degrade DX and HMR.
---
## 2. Type Errors
### Frontend — tsc: CLEAN ✓
`tsc --noEmit` exits 0. No TypeScript compilation errors. The `any` issues above are ESLint-level, not tsc errors.
### Backend — mypy: NOT RUN
Cannot run mypy outside the poetry venv. Needs `poetry run mypy .` inside Docker or an activated venv.
**Severity: LOW** (mypy not blocking, but should be run in CI)
---
## 3. Tests
### Frontend — vitest: 13 failed / 75 total (HIGH)
8 test files affected:
| Test | Failures | Root cause |
|------|----------|-----------|
| `auth.test.ts` | 1 | Mock shape mismatch — response has extra field `organizationId` |
| `StatusBadge.test.tsx` | 1 | Unknown status no longer renders text (component changed) |
| `VttEditor.test.tsx` | 1 | Multiple elements found for `Insert cue before` title — DOM duplication |
| `useJob.test.tsx` | 3 | `useApproveEnglish` — pending state never resolves in test (timeout 1s); `useCreateJob` arg mismatch |
| `UploadDropzone.test.tsx` | 6 | Text broken across elements — test uses exact string match, component renders in `<span>` nodes |
| `useJobStatusWebSocket.test.tsx` | 1 | (see output) |
**Severity: HIGH** — 17% test failure rate. Several are stale tests from component refactors (UploadDropzone, StatusBadge).
### Backend — pytest: CANNOT RUN (CRITICAL)
Running `pytest` outside poetry venv fails with `ModuleNotFoundError` for `fastapi`, `aiohttp`, etc. Tests must be run with `poetry run pytest` inside Docker or an activated poetry environment.
The `backend/.venv` exists but appears to be a plain venv, not the poetry-managed one. **Tests are effectively unrunnable in local dev without explicit poetry activation.**
**Severity: CRITICAL** — Developers with system Python cannot run tests without explicit setup steps.
---
## 4. Build Configuration Issues
### ruff config deprecated (MEDIUM)
`pyproject.toml` uses `[tool.ruff]` top-level `select`, `ignore`, `per-file-ignores`. Current ruff ≥ 0.2 expects `[tool.ruff.lint]`. Fix:
```toml
# Before
[tool.ruff]
select = ["E", "W", ...]
ignore = ["E501", ...]
# After
[tool.ruff]
target-version = "py311"
line-length = 88
[tool.ruff.lint]
select = ["E", "W", ...]
ignore = ["E501", ...]
```
### Backend venv mismatch (MEDIUM)
`backend/.venv` cannot run `ruff`, `pytest`, or `mypy` — they are installed in the poetry-managed venv, not this one. Confusing to new devs.
### AGENTS.md commands incorrect (LOW)
`AGENTS.md` documents `cd backend && poetry run pytest` but the backend has `.venv` and `pyproject.toml` with no Makefile wrapper. The actual working path is `cd backend && .venv/bin/python -m pytest` or requires `poetry shell`.
---
## Summary
| Check | Result | Severity |
|-------|--------|---------|
| ruff backend | 1314 violations (auto-fixable) | HIGH |
| ESLint frontend | 36 problems | MEDIUM |
| tsc frontend | ✓ Clean | OK |
| mypy backend | Not runnable locally | LOW |
| vitest frontend | 13/75 failing | HIGH |
| pytest backend | Not runnable locally | CRITICAL |
| ruff config | Deprecated syntax | MEDIUM |
| venv setup | Confusing / broken | MEDIUM |

View file

@ -1,116 +0,0 @@
# Code Quality Audit — ln-624
**Score: 5.0/10** | Issues: 22 (C:2 H:8 M:9 L:3)
**Date:** 2026-04-30
---
## 1. God Classes / Files (> 500 lines)
| File | Lines | Severity |
|------|-------|---------|
| `backend/app/api/v1/routes_jobs.py` | 2882 | **CRITICAL** |
| `frontend/src/routes/admin/QCDetail.tsx` | 2079 | **CRITICAL** |
| `backend/app/services/video_renderer.py` | 1695 | **HIGH** |
| `frontend/src/routes/jobs/JobsList.tsx` | 1246 | **HIGH** |
| `frontend/src/lib/api.ts` | 1056 | **HIGH** |
| `backend/app/tasks/translate_and_synthesize.py` | 1019 | **HIGH** |
| `frontend/src/routes/jobs/NewJob.tsx` | 1038 | **HIGH** |
| `frontend/src/types/api.ts` | 891 | **MEDIUM** |
| `frontend/src/routes/jobs/JobDetail.tsx` | 732 | **MEDIUM** |
| `frontend/src/routes/admin/UserDetail.tsx` | 523 | **MEDIUM** |
| `frontend/src/hooks/useJobStatusWebSocket.ts` | 443 | **MEDIUM** |
**routes_jobs.py at 2882 lines** is the worst offender — it mixes upload, approval, translation, TTS, VTT editing, download, admin, and websocket concerns in a single router. Splitting by domain (e.g., `routes_upload.py`, `routes_vtt.py`, `routes_review.py`, `routes_tts.py`) would bring each under 500 lines.
**QCDetail.tsx at 2079 lines** handles the entire QC workflow, VTT display, audio preview, language selection, and approval modals in one component. Needs extraction of at minimum: `LanguageQCPanel`, `VttReviewView`, `ApprovalModal`.
---
## 2. Long Methods (> 100 lines)
| File:line | Function | Length | Severity |
|-----------|---------|--------|---------|
| `tasks/translate_and_synthesize.py:109` | `_async_translate_and_synthesize()` | 485 lines | **CRITICAL** |
| `services/video_renderer.py:487` | `_render_pause_insert_method()` | 419 lines | **CRITICAL** |
| `tasks/ingest_and_ai.py:53` | `ingest_and_ai_task_impl()` | 276 lines | **HIGH** |
| `tasks/rerender_accessible_video.py:110` | `_async_rerender_accessible_video()` | 280 lines | **HIGH** |
| `tasks/render_accessible_video.py:56` | `_async_render_accessible_video()` | 287 lines | **HIGH** |
| `api/v1/routes_jobs.py:1552` | `update_job_vtt_content()` | 215 lines | **HIGH** |
| `tasks/notify.py:29` | `run_async()` | 169 lines | **HIGH** |
| `api/v1/routes_jobs.py:2738` | `update_tts_preferences()` | 144 lines | **MEDIUM** |
| `services/whisper_service.py:241` | `_find_sentence_boundaries()` | 120 lines | **MEDIUM** |
| `services/gemini.py:591` | `analyze_accessible_video_placement()` | 132 lines | **MEDIUM** |
The two most critical ones (`_async_translate_and_synthesize` at 485 lines and `_render_pause_insert_method` at 419 lines) are orchestrator-style functions with sequential pipeline steps. They could be split into named pipeline stages, each ~50 lines.
---
## 3. Deep Nesting
Not systematically scanned with a tool (radon/lizard not installed). The long functions above likely contain 4–5+ nesting levels given their complexity.
---
## 4. Too Many Parameters
| Location | Function | Params | Severity |
|----------|---------|--------|---------|
| `services/gemini.py` | `extract_accessibility_targeted()` | 7+ | **MEDIUM** |
| `tasks/translate_and_synthesize.py` | `_generate_language_tts()` | 8+ | **MEDIUM** |
Pattern: many functions pass `db`, `job`, `language`, `settings`, `gcs_client`, etc. individually instead of grouping into a context dataclass.
---
## 5. Magic Numbers
### Backend (MEDIUM)
Scattered timing constants without named definitions:
- TTS retry delays (hardcoded seconds)
- chunk sizes in upload
- Audio padding values in video_renderer.py
### Frontend (LOW)
Mostly clean. Some inline pixel values in Tailwind (acceptable). No concerning business-logic magic numbers found.
---
## 6. N+1 Query Patterns (MEDIUM)
Potential N+1 patterns found:
- `app/main.py:102` — `async for job_doc in db.jobs.find(...)` — check if this iterates and makes additional queries per document
- `app/core/dependencies.py:185` — `async for m in db.memberships.find(...)` — membership lookup per request in auth middleware (acceptable if cached, but no caching observed)
- `app/core/authz.py:54` — `async for doc in db.memberships.find(...)` — similar pattern in auth check
These are all async iterators over `find()` — not necessarily N+1 if no nested DB calls, but should be reviewed for `.find()` calls inside the loop body.
---
## 7. Method Signature Quality
### Boolean flag parameters (MEDIUM)
Several async functions in tasks accept `bool` flags controlling behavior variants (e.g., `skip_tts`, `force_regenerate`). These should be enums or separate functions.
### Unclear return types (MEDIUM)
Some routes return `dict` or untyped responses instead of Pydantic response models. `routes_admin_production.py` has a few endpoints returning bare dicts.
---
## 8. Side-Effect Cascade Depth
`_async_translate_and_synthesize()` at 485 lines is the worst case: it writes to GCS, updates MongoDB, dispatches TTS tasks, sends notifications, and updates job status — 5+ distinct side-effect categories from a single function call. This warrants extraction into an orchestrator that delegates to named sink functions.
---
## Summary
| Check | Status | Severity |
|-------|--------|---------|
| God files (>500L) | 11 files | CRITICAL×2, HIGH×5 |
| Long methods (>100L) | 10 functions | CRITICAL×2, HIGH×5 |
| N+1 patterns | 3 potential | MEDIUM |
| Magic numbers | Some in tasks | MEDIUM |
| Method signatures | Boolean flags, unclear returns | MEDIUM |
| Side-effect cascade | translate_and_synthesize | HIGH |
**Primary recommendation:** Split `routes_jobs.py` and `QCDetail.tsx` — these two files account for the majority of the quality debt.

View file

@ -1,94 +0,0 @@
# Dependencies & Reuse Audit — ln-625
**Score: 7.5/10** | Issues: 9 (C:0 H:2 M:5 L:2)
**Date:** 2026-04-30
---
## 1. Vulnerability Scan (CVE/CVSS)
### Frontend — npm audit: ✓ CLEAN
```
Total packages: 479
Vulnerabilities: info:0 low:0 moderate:0 high:0 critical:0 total:0
```
Zero CVEs. Excellent.
### Backend — pip-audit: NOT RUN
`pip-audit` not installed in local env. Recommended to add to CI:
```bash
pip install pip-audit && pip-audit -r requirements.txt
```
Given many heavy deps (Celery 5.3, google-cloud-*, faster-whisper, aiohttp), a CI scan is strongly advised.
---
## 2. Outdated Packages
### Frontend — npm outdated (many minor/major updates pending)
**MAJOR version gaps (HIGH):**
| Package | Installed | Latest | Notes |
|---------|-----------|--------|-------|
| `@azure/msal-browser` | 4.25.0 | **5.9.0** | MSAL v5 has breaking API changes |
| `@azure/msal-react` | 3.0.20 | **5.3.2** | Paired with msal-browser, coordinated upgrade needed |
| `@sentry/react` | 8.55.0 | **10.51.0** | Sentry v10 has breaking changes |
| `typescript` | 5.8.3 | **6.0.3** | TS 6 has strictness changes |
| `vite` | 7.3.2 | **8.0.10** | Vite 8 breaking changes |
| `eslint` | 9.33.0 | **10.2.1** | ESLint 10 config format may change |
| `jsdom` | 26.1.0 | **29.1.1** | Test environment |
**Minor updates (LOW-MEDIUM):** Most other packages have minor/patch updates pending (react 19.1→19.2, tailwindcss 4.1→4.2, etc.)
**Recommendation:** Keep MSAL and Sentry on current major until dedicated upgrade sprint. React, TailwindCSS, react-query minor updates are safe to apply immediately.
### Backend — pip outdated: pip-audit not available
Based on pyproject.toml dates vs ecosystem:
- `ruff ^0.1.6` → installed ruff is `0.15.12` (already updated, good)
- `google-genai ^1.56.0` → recently updated per git log
- `faster-whisper ^1.2.0` → check for 1.x updates
---
## 3. Unused Dependencies
### Backend — `sendgrid` (MEDIUM)
`pyproject.toml` lists `sendgrid = "^6.11.0"`. However:
- The actual emailer (`app/services/emailer.py`) uses **Mailgun** REST API via `httpx`
- `sendgrid` is referenced **only** in `app/core/config.py` as a dead config field `sendgrid_api_key: str = ""` with comment `# Email (Mailgun — primary; sendgrid_api_key kept for backward compat)`
- No `import sendgrid` anywhere in app code
**Action:** Remove `sendgrid` from `pyproject.toml` dependencies and remove the `sendgrid_api_key` config field.
### Frontend — no unused dependencies found
- `axios` → used in `lib/api.ts`
- `@azure/msal-*` → used in `main.tsx`, `routes/Login.tsx`
- `date-fns` → used in 5+ components
- `zustand`, `@tanstack/react-query`, `react-hook-form`, `zod` → all actively used
- `react-dropzone` → used in upload components
---
## 4. Available Native Alternatives
### Frontend — axios vs fetch (LOW)
`axios` is used for all API calls in `lib/api.ts`. The project targets modern browsers and uses Vite. Native `fetch` + `AbortController` could replace axios, reducing bundle by ~14kb gzipped. However, axios provides request/response interceptors that are actively used for auth token refresh — migration effort is medium. **Not urgent.**
---
## 5. Custom Implementations
No custom crypto or hand-rolled validation libraries found. All auth uses `python-jose` + `libpass` (bcrypt). VTT parsing is domain-specific and not replaceable by a library. No concerns.
---
## Summary
| Check | Result | Severity |
|-------|--------|---------|
| Frontend CVEs | ✓ 0 vulnerabilities | OK |
| Backend CVEs | ⚠ Not scanned | MEDIUM |
| Frontend major updates | MSAL×2, Sentry, TS, Vite, ESLint | HIGH |
| Frontend minor updates | Many | LOW |
| Backend unused dep | `sendgrid` in pyproject.toml | MEDIUM |
| Native alternatives | axios → fetch possible | LOW |
| Custom implementations | None found | OK |

View file

@ -1,143 +0,0 @@
# Dead Code Audit — ln-626
**Score: 7.0/10** | Issues: 14 (C:0 H:0 M:6 L:8)
**Date:** 2026-04-30
---
## 1. Unused Imports (Python — F401)
ruff detected **58 unused import violations** across backend. Sample:
| File | Unused import |
|------|--------------|
| `routes_admin.py:9` | `get_current_user` |
| `routes_admin.py:11` | `verify_password` |
| `routes_admin.py:16` | `ChangePasswordRequest` |
| `routes_admin.py:23` | `log_security_event` |
| (+ 54 more across all files) | |
All are auto-fixable with `ruff check --fix --select F401`. The `__init__.py` files are correctly excluded via `per-file-ignores`.
**Severity: MEDIUM** — clutters imports, increases cognitive load when reading files.
---
## 2. Deprecated / Legacy Types (Frontend)
`frontend/src/types/api.ts` contains 3 deprecated exported types with JSDoc markers:
| Line | Type | Marker |
|------|------|--------|
| 96 | `TtsVoicesResponse` | `@deprecated Use ProviderVoicesResponse instead` |
| 137 | `TtsOptionsResponse` | `@deprecated Use ProviderOptionsResponse instead` |
| 555-566 | `Client` / `OrganizationLegacy` | `@deprecated Use Organization instead` + `export { Client as OrganizationLegacy }` |
These types are still exported, meaning consumers could use them by mistake. If no external consumers exist (library not published), they should be deleted.
**Severity: MEDIUM** — active deprecation markers indicate intent to remove. Leaving them causes confusion.
---
## 3. Legacy Status Values (Frontend)
`frontend/src/types/api.ts:12,14`:
```ts
| "tts_failed" // legacy: keep for back-compat
| "render_failed" // legacy: keep for back-compat
```
These job statuses are marked as legacy. If the backend no longer emits them, they are dead type branches. If it still does (for old jobs in MongoDB), they're valid — but should be clearly documented with a removal condition.
**Severity: LOW** — no runtime impact, but requires clarification.
---
## 4. Backward Compatibility Code (Frontend)
### lib/api.ts:239 — Legacy approval method (MEDIUM)
```ts
// Legacy method - calls approve_source for backwards compatibility
```
A backward-compat shim in the API client. If all callers have been updated to the new method, this should be removed.
### VideoWithCaptions.tsx:1643 — Legacy single-language props (MEDIUM)
```ts
// Legacy single-language props (still supported)
sourceLanguage?: string; // Language code for legacy props
// Legacy props
// Combine legacy props with tracks (use useMemo to prevent recreation)
```
The component maintains backward-compat with old single-language prop API. If no callers use these legacy props, they can be removed.
### JobDetail.tsx:41 — Legacy status mapping (LOW)
```ts
// Handle legacy approved_english/approved_source statuses (map to pending_final_review)
```
Status mapping shim for old job records. Should be removed after all existing jobs are migrated.
---
## 5. Commented-Out Code (Backend)
| File | Line | Content |
|------|------|---------|
| `telemetry/tracing.py:5` | `# from opentelemetry.exporter.gcp.trace import CloudTraceSpanExporter # Disabled for local dev` | GCP trace exporter disabled |
| `telemetry/metrics.py:5` | `# from opentelemetry.exporter.prometheus import PrometheusMetricReader # Disabled for local dev` | Prometheus reader disabled |
| `pyproject.toml` | `# opentelemetry-exporter-prometheus = ... # Temporarily disabled - version conflicts` | Dep commented out |
These are intentional (local dev vs prod config), not dead code. However, the conditional should be expressed via environment config, not source comments. **Low priority.**
**Severity: LOW**
---
## 6. Leftover .old Files (MEDIUM)
| File | Age | Action |
|------|-----|--------|
| `docker-compose.yml.old` | Created 2026-03-03 (~2 months) | Delete |
| `backend/Dockerfile.old` | Created 2026-03-03 (~2 months) | Delete |
| `backend/.dockerignore.old` | — | Delete |
These files have no build references. Git history preserves them.
---
## 7. Unused Dockerfiles
| File | Referenced in compose? |
|------|----------------------|
| `backend/Dockerfile.ffmpeg-service` | No — ffmpeg is embedded in main worker |
| `backend/Dockerfile.cloudrun` | Yes — referenced for Cloud Run deploys |
| `backend/Dockerfile.whisper-service` | Yes — whisper-worker service in compose |
`Dockerfile.ffmpeg-service` appears to be dead — the main Dockerfile handles ffmpeg. Should be confirmed and deleted if unused.
**Severity: LOW**
---
## 8. Dead Config Field
`backend/app/core/config.py:272`:
```python
sendgrid_api_key: str = "" # Email (Mailgun — primary; sendgrid_api_key kept for backward compat)
```
`sendgrid` package not used. Config field and `secrets_config.py` secret reference both dead.
**Severity: MEDIUM** — misleads ops into configuring a sendgrid secret that has no effect.
---
## Summary
| Check | Issues | Severity |
|-------|--------|---------|
| Unused Python imports | 58 (auto-fixable) | MEDIUM |
| Deprecated TS types | 3 types | MEDIUM |
| Backward-compat shims | 3 in frontend | MEDIUM |
| Commented-out code | 3 telemetry lines | LOW |
| .old files | 3 files | MEDIUM |
| Unused Dockerfile | Dockerfile.ffmpeg-service | LOW |
| Dead config field | sendgrid_api_key | MEDIUM |
| Legacy status values | 2 status strings | LOW |

View file

@ -1,96 +1,172 @@
# =============================================================================
# Apache config fragment — Accessible Video Platform
# Inject into: /etc/apache2/sites-available/optical-dev.oliver.solutions-ssl.conf
#
# Required modules:
# sudo a2enmod proxy proxy_http proxy_wstunnel rewrite headers
#
# Container port map:
# accessible-video-api → 0.0.0.0:8012->8000/tcp
# Apache Configuration for Accessible Video Platform
# =============================================================================
# Add this configuration to your existing VirtualHost for ai-sandbox.oliver.solutions
# Location: /etc/apache2/sites-available/ai-sandbox.oliver.solutions-ssl.conf
# =============================================================================
# ── Timeouts for large video uploads (up to 2 GB, ~10 min) ──────────────────
<IfModule mod_proxy.c>
ProxyTimeout 600
</IfModule>
# -----------------------------------------------------------------------------
# Frontend - Static React SPA served from subdirectory
# -----------------------------------------------------------------------------
# ── WebSocket proxy (MUST be before /api/ HTTP proxy) ───────────────────────
# disablereuse=on prevents long-lived WS connections from exhausting the pool
ProxyPassMatch ^/video-accessibility/api/v1/ws/(.*)$ ws://127.0.0.1:8012/api/v1/ws/$1 disablereuse=on
ProxyPassReverse /video-accessibility/api/v1/ws/ ws://127.0.0.1:8012/api/v1/ws/
# ── API proxy ────────────────────────────────────────────────────────────────
# Strips /video-accessibility prefix — FastAPI sees /api/v1/...
ProxyPassMatch ^/video-accessibility/api/(.*)$ http://127.0.0.1:8012/api/$1
ProxyPassReverse /video-accessibility/api/ http://127.0.0.1:8012/api/
# Swagger / OpenAPI
ProxyPassMatch ^/video-accessibility/docs(/.*)?$ http://127.0.0.1:8012/docs$1
ProxyPassReverse /video-accessibility/docs http://127.0.0.1:8012/docs
ProxyPassMatch ^/video-accessibility/openapi\.json$ http://127.0.0.1:8012/openapi.json
ProxyPassReverse /video-accessibility/openapi.json http://127.0.0.1:8012/openapi.json
# ── SPA static files ─────────────────────────────────────────────────────────
# Serve frontend from /video-accessibility subdirectory
Alias /video-accessibility /var/www/html/video-accessibility
<Directory /var/www/html/video-accessibility>
# Basic options
Options -Indexes +FollowSymLinks
AllowOverride None
AllowOverride All
Require all granted
# Allow video uploads up to 2 GB
LimitRequestBody 2147483648
# React SPA routing - rewrite all requests to index.html
RewriteEngine On
RewriteBase /video-accessibility/
RewriteBase /video-accessibility
# Serve real files/directories directly (JS, CSS, assets, fonts)
RewriteCond %{REQUEST_FILENAME} -f [OR]
RewriteCond %{REQUEST_FILENAME} -d
RewriteRule ^ - [L]
# Don't rewrite files or directories that exist
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_FILENAME} !-d
# Everything else → index.html (React Router handles client-side nav)
RewriteRule ^ index.html [L]
# Cache hashed static assets long-term (1 year, immutable); never cache HTML
<FilesMatch "\.(js|css|woff2?|ttf|eot|png|jpg|jpeg|gif|ico|svg)$">
Header set Cache-Control "public, max-age=31536000, immutable"
</FilesMatch>
<FilesMatch "\.html$">
Header set Cache-Control "no-cache, no-store, must-revalidate"
</FilesMatch>
# Rewrite everything else to index.html
RewriteRule ^ /video-accessibility/index.html [L]
# Security headers
Header always set X-Frame-Options "SAMEORIGIN"
Header always set X-Content-Type-Options "nosniff"
Header always set X-XSS-Protection "1; mode=block"
Header always set Referrer-Policy "strict-origin-when-cross-origin"
# Cache control for static assets
<FilesMatch "\.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$">
Header set Cache-Control "public, max-age=31536000, immutable"
</FilesMatch>
# No cache for HTML files
<FilesMatch "\.(html)$">
Header set Cache-Control "no-cache, no-store, must-revalidate"
Header set Pragma "no-cache"
Header set Expires "0"
</FilesMatch>
</Directory>
# -----------------------------------------------------------------------------
# Backend API - Reverse proxy to Docker container
# -----------------------------------------------------------------------------
# Proxy backend API to Docker container on port 8000
<Location /video-accessibility-back>
# Preserve original host header
ProxyPreserveHost On
# Proxy HTTP requests
ProxyPass http://localhost:8000
ProxyPassReverse http://localhost:8000
# Proxy timeout settings (important for long-running video processing)
ProxyTimeout 300
# WebSocket support (CRITICAL for real-time job updates)
RewriteEngine On
RewriteCond %{HTTP:Upgrade} =websocket [NC]
RewriteRule /video-accessibility-back/(.*) ws://localhost:8000/$1 [P,L]
RewriteCond %{HTTP:Upgrade} !=websocket [NC]
RewriteRule /video-accessibility-back/(.*) http://localhost:8000/$1 [P,L]
# Security headers
Header always set X-Frame-Options "SAMEORIGIN"
Header always set X-Content-Type-Options "nosniff"
# CORS is handled by the backend, don't add headers here
</Location>
# -----------------------------------------------------------------------------
# Required Apache Modules
# -----------------------------------------------------------------------------
# Enable these modules with:
# sudo a2enmod rewrite
# sudo a2enmod proxy
# sudo a2enmod proxy_http
# sudo a2enmod proxy_wstunnel
# sudo a2enmod headers
# sudo systemctl restart apache2
# Verify modules are enabled:
# apache2ctl -M | grep -E '(rewrite|proxy|headers)'
# =============================================================================
# Full VirtualHost skeleton (reference — values match optical-web-1)
# Full VirtualHost Example
# =============================================================================
# Example of complete VirtualHost configuration:
#
# <VirtualHost *:443>
# ServerName optical-dev.oliver.solutions
# ServerName ai-sandbox.oliver.solutions
# ServerAdmin admin@oliver.solutions
#
# DocumentRoot /var/www/html
#
# # SSL Configuration (with wildcard cert)
# SSLEngine on
# SSLCertificateFile /path/to/wildcard.crt
# SSLCertificateKeyFile /path/to/wildcard.key
# SSLCertificateFile /path/to/wildcard-ai-sandbox.oliver.solutions.crt
# SSLCertificateKeyFile /path/to/wildcard-ai-sandbox.oliver.solutions.key
# SSLCertificateChainFile /path/to/chain.crt # If needed
#
# SSLProtocol all -SSLv2 -SSLv3 -TLSv1 -TLSv1.1
# # SSL Protocol and Cipher settings
# SSLProtocol all -SSLv2 -SSLv3 -TLSv1 -TLSv1.1
# SSLCipherSuite HIGH:!aNULL:!MD5
#
# # — paste the block above here —
# # Frontend configuration (from above)
# Alias /video-accessibility /var/www/html/video-accessibility
# <Directory /var/www/html/video-accessibility>
# ...
# </Directory>
#
# ErrorLog ${APACHE_LOG_DIR}/optical-dev-error.log
# CustomLog ${APACHE_LOG_DIR}/optical-dev-access.log combined
# # Backend API configuration (from above)
# <Location /video-accessibility-back>
# ...
# </Location>
#
# # Logging
# ErrorLog ${APACHE_LOG_DIR}/ai-sandbox-error.log
# CustomLog ${APACHE_LOG_DIR}/ai-sandbox-access.log combined
# </VirtualHost>
# =============================================================================
# Verify
# Testing & Verification
# =============================================================================
# sudo apache2ctl configtest
# sudo systemctl reload apache2
# curl -I https://optical-dev.oliver.solutions/video-accessibility/
# curl https://optical-dev.oliver.solutions/video-accessibility/api/v1/health
# wscat -c wss://optical-dev.oliver.solutions/video-accessibility/api/v1/ws/job-list
# Test Apache configuration:
# sudo apache2ctl configtest
#
# Restart Apache:
# sudo systemctl restart apache2
#
# Test frontend:
# curl -I https://ai-sandbox.oliver.solutions/video-accessibility
#
# Test backend:
# curl https://ai-sandbox.oliver.solutions/video-accessibility-back/health
#
# Test WebSocket (requires wscat):
# wscat -c wss://ai-sandbox.oliver.solutions/video-accessibility-back/api/v1/ws/job-list
# =============================================================================
# Troubleshooting
# =============================================================================
# Check Apache logs:
# sudo tail -f /var/log/apache2/ai-sandbox-error.log
# sudo tail -f /var/log/apache2/ai-sandbox-access.log
#
# Check if backend is running:
# curl http://localhost:8000/health
#
# Check Docker containers:
# cd /opt/accessible-video
# docker-compose ps
#
# Common issues:
# - 502 Bad Gateway: Backend container not running
# - 404 Not Found: Frontend not deployed or Apache alias incorrect
# - WebSocket fails: mod_proxy_wstunnel not enabled
# - CORS errors: Check backend CORS configuration, not Apache

92
backend/.dockerignore.old Normal file
View file

@ -0,0 +1,92 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# Poetry (keep poetry.lock for reproducible builds)
# poetry.lock
# Virtual environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Testing
.coverage
.pytest_cache/
.mypy_cache/
.tox/
htmlcov/
coverage.xml
*.cover
.hypothesis/
# Documentation
docs/
*.md
README*
# Logs
*.log
logs/
# Git
.git/
.gitignore
# Docker
Dockerfile*
.dockerignore
docker-compose*
# CI/CD
.github/
# Local development
.env.local
.env.development
.env.test
# Temporary files
tmp/
temp/
*.tmp
*.bak

View file

@ -3,8 +3,8 @@
# =============================================================================
# Stage 1: Builder - Install dependencies
# Stage 2: Base - Common runtime for API and Worker
# Stage 3: API - FastAPI + Gunicorn (no ffmpeg — heavy tasks run on Cloud Run Jobs)
# Stage 4: Worker - Celery worker, lightweight queues only (notify, embed)
# Stage 3: API - FastAPI + Gunicorn (with ffmpeg for TTS audio conversion)
# Stage 4: Worker - Celery worker (with ffmpeg for video processing)
# =============================================================================
# -----------------------------------------------------------------------------
@ -46,7 +46,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libmagic1 \
curl \
tini \
ffmpeg \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
@ -73,10 +72,21 @@ USER app
# -----------------------------------------------------------------------------
# Stage 3: API - FastAPI + Gunicorn (Production API Server)
# Heavy pipeline tasks (ingest/translate/render) run on Cloud Run Jobs
# -----------------------------------------------------------------------------
FROM base AS api
# Switch to root to install ffmpeg
USER root
# Install ffmpeg for TTS audio conversion
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# Switch back to non-root user
USER app
# Set production environment variables
ENV APP_ENV=prod
@ -94,10 +104,22 @@ ENTRYPOINT ["tini", "--"]
CMD ["gunicorn", "-c", "gunicorn_conf.py", "app.main:app"]
# -----------------------------------------------------------------------------
# Stage 4: Worker - Celery Worker (lightweight queues: notify, embed)
# Stage 4: Worker - Celery Worker (with ffmpeg for video processing)
# -----------------------------------------------------------------------------
FROM base AS worker
# Switch back to root to install ffmpeg
USER root
# Install ffmpeg for video processing
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# Switch back to non-root user
USER app
# Set production environment variables
# WORKER_CONCURRENCY can be overridden at runtime (default: 8)
ENV APP_ENV=prod \
@ -126,6 +148,18 @@ CMD celery -A celery_worker worker \
# -----------------------------------------------------------------------------
FROM base AS whisper-worker
# Switch back to root to install ffmpeg
USER root
# Install ffmpeg for audio extraction
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# Switch back to non-root user
USER app
# Pre-download Whisper medium model during build to avoid cold start delays
# Model is cached in ~/.cache/huggingface/hub (~1.5GB)
RUN python -c "from faster_whisper import WhisperModel; WhisperModel('medium', device='cpu', compute_type='int8')"

View file

@ -1,55 +0,0 @@
# =============================================================================
# Cloud Run Job image — va-worker
#
# Reuses the multi-stage base from Dockerfile.
# Entrypoint: python -m app.tasks.runner --task <name> --job-id <id>
#
# Build:
# docker build -f backend/Dockerfile.cloudrun -t va-worker backend/
# =============================================================================
# ── Stage 1: Builder ─────────────────────────────────────────────────────────
FROM python:3.11-slim AS builder
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential curl \
&& rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir poetry==1.8.3
WORKDIR /app
COPY pyproject.toml poetry.lock ./
RUN poetry config virtualenvs.create false \
&& poetry install --no-interaction --no-ansi --only main
# ── Stage 2: Runtime ─────────────────────────────────────────────────────────
FROM python:3.11-slim AS runtime
# ffmpeg required for video rendering tasks
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
tini \
curl \
&& rm -rf /var/lib/apt/lists/*
# Copy installed packages from builder
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
WORKDIR /app
COPY . .
# Non-root user for security
RUN groupadd -r worker && useradd -r -g worker worker \
&& chown -R worker:worker /app
USER worker
# Cloud Run Jobs: no persistent HTTP port needed.
# Cloud Run passes CLOUD_RUN_TASK_INDEX and CLOUD_RUN_TASK_COUNT env vars.
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONPATH=/app
ENTRYPOINT ["tini", "--", "python", "-m", "app.tasks.runner"]
# Args are injected per-execution via Cloud Run Job overrides:
# --task ingest|translate|render|rerender --job-id <id> [--language <lang>] ...

127
backend/Dockerfile.old Normal file
View file

@ -0,0 +1,127 @@
# Build stage - Install dependencies and build wheels
FROM python:3.11-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Install Poetry
RUN pip install poetry==1.8.2
# Set Poetry configuration
ENV POETRY_NO_INTERACTION=1 \
POETRY_VENV_IN_PROJECT=1 \
POETRY_CACHE_DIR=/tmp/poetry_cache
WORKDIR /app
# Copy dependency files
COPY pyproject.toml poetry.lock ./
# Install dependencies into venv
RUN poetry config virtualenvs.in-project true && \
poetry lock --no-update || true && \
poetry install --only=main --no-root && \
rm -rf $POETRY_CACHE_DIR
# Base runtime stage
FROM python:3.11-slim AS base
# Install runtime system dependencies
RUN apt-get update && apt-get install -y \
ffmpeg \
curl \
tini \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# Create non-root user
RUN groupadd --gid 1000 app \
&& useradd --uid 1000 --gid app --shell /bin/bash --create-home app
# Set working directory
WORKDIR /app
# Copy virtual environment from builder stage
COPY --from=builder --chown=app:app /app/.venv /app/.venv
# Ensure venv is in PATH
ENV PATH="/app/.venv/bin:$PATH"
# Copy application code
COPY --chown=app:app . .
# Switch to non-root user
USER app
# Production API stage
FROM base AS production
# Set environment variables for production
ENV APP_ENV=prod \
PYTHONPATH=/app \
PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
# Expose port
EXPOSE 8000
# Use tini as init system for proper signal handling
ENTRYPOINT ["tini", "--"]
# Default command for API server
CMD ["gunicorn", "-c", "gunicorn_conf.py"]
# Worker stage for Celery workers
FROM base AS worker
# Set environment variables for worker
ENV APP_ENV=prod \
PYTHONPATH=/app \
PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
C_FORCE_ROOT=1
# Health check for worker (check if Celery is responding)
HEALTHCHECK --interval=60s --timeout=15s --start-period=10s --retries=3 \
CMD python -c "from celery import Celery; app=Celery('app'); print('Worker healthy')" || exit 1
# Use tini as init system for proper signal handling
ENTRYPOINT ["tini", "--"]
# Default command for Celery worker
CMD ["celery", "-A", "app.tasks", "worker", "--loglevel=info", "--concurrency=1"]
# Development stage with dev dependencies
FROM builder AS development
# Install all dependencies including dev
RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR
# Install additional dev tools
RUN apt-get update && apt-get install -y \
git \
vim \
&& rm -rf /var/lib/apt/lists/*
# Copy application code
COPY --chown=app:app . .
# Switch to non-root user
USER app
# Set environment for development
ENV APP_ENV=dev \
PYTHONPATH=/app \
PYTHONUNBUFFERED=1
EXPOSE 8000
# Development command with hot reload
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]

View file

@ -1,28 +1,26 @@
from datetime import datetime, timedelta
from typing import Optional
from bson import ObjectId
from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
from motor.motor_asyncio import AsyncIOMotorDatabase
from ...core.authz import MembershipContext, get_membership_context
from ...core.database import get_database
from ...core.dependencies import get_current_user, require_roles
from ...core.logging import get_logger
from ...core.security import get_password_hash
from ...models.audit_log import AuditAction, AuditLogQuery, AuditLogResponse
from ...core.security import get_password_hash, verify_password
from ...models.user import User, UserRole
from ...models.audit_log import AuditAction, AuditLogQuery, AuditLogResponse
from ...schemas.auth import (
AdminStatsResponse,
ChangePasswordRequest,
CreateUserRequest,
ResetPasswordRequest,
UpdateUserRequest,
UserListResponse,
UserResponse,
)
from ...services.audit_logger import (
audit_logger,
log_user_management,
)
from ...services.audit_logger import audit_logger, log_user_management, log_security_event
from ...telemetry import app_metrics
logger = get_logger(__name__)
@ -33,48 +31,28 @@ router = APIRouter(prefix="/admin", tags=["admin"])
async def list_users(
page: int = Query(1, ge=1),
size: int = Query(20, ge=1, le=500),
role: str | None = Query(None, description="Single role or comma-separated list, e.g. 'linguist,admin'"),
role: Optional[str] = Query(None),
active_only: bool = Query(True),
org_id: str | None = Query(None, description="Filter by org (platform admin only)"),
current_user: User = Depends(require_roles(UserRole.ADMIN)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""List users with filtering and pagination (admin only)"""
query: dict = {}
query = {}
if role:
roles = [r.strip() for r in role.split(",") if r.strip()]
query["role"] = {"$in": roles} if len(roles) > 1 else roles[0]
query["role"] = role
if active_only:
query["is_active"] = True
if not ctx.is_platform_admin:
# Org-scoped admin: show only users in their org(s) via membership collection
accessible_org_ids = ctx.accessible_org_ids()
if not accessible_org_ids:
return UserListResponse(users=[], total=0, page=page, size=size)
member_ids_cursor = db.memberships.find(
{"organization_id": {"$in": accessible_org_ids}},
{"user_id": 1},
)
member_ids = [doc["user_id"] async for doc in member_ids_cursor]
query["_id"] = {"$in": member_ids}
elif org_id:
# Platform admin filtered to a specific org
member_ids_cursor = db.memberships.find({"organization_id": org_id}, {"user_id": 1})
member_ids = [doc["user_id"] async for doc in member_ids_cursor]
query["_id"] = {"$in": member_ids}
# Get total count
total = await db.users.count_documents(query)
# Get paginated results
skip = (page - 1) * size
cursor = db.users.find(query, {"hashed_password": 0}).sort("created_at", -1).skip(skip).limit(size)
users = await cursor.to_list(length=size)
user_responses = []
for user_doc in users:
user_responses.append(UserResponse(
@ -86,9 +64,8 @@ async def list_users(
is_active=user_doc["is_active"],
created_at=user_doc.get("created_at", datetime.utcnow()).isoformat(),
pm_client_ids=user_doc.get("pm_client_ids", []),
languages=user_doc.get("languages", []),
))
return UserListResponse(
users=user_responses,
total=total,
@ -97,32 +74,6 @@ async def list_users(
)
@router.get("/brief-assignees", response_model=list[UserResponse])
async def list_brief_assignees(
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""Return users who can be assigned a brief (PM, production, admin). Accessible to all brief-creating roles."""
docs = await db.users.find(
{
"role": {"$in": [UserRole.ADMIN.value, UserRole.PROJECT_MANAGER.value, UserRole.PRODUCTION.value]},
"is_active": True,
},
{"hashed_password": 0},
).sort("full_name", 1).to_list(None)
return [UserResponse(
id=str(d["_id"]),
email=d["email"],
full_name=d["full_name"],
role=d["role"],
auth_provider=d.get("auth_provider", "local"),
is_active=d["is_active"],
created_at=d.get("created_at", datetime.utcnow()).isoformat() if d.get("created_at") else None,
pm_client_ids=d.get("pm_client_ids", []),
languages=d.get("languages", []),
) for d in docs]
@router.get("/users/{user_id}", response_model=UserResponse)
async def get_user(
user_id: str,
@ -136,7 +87,7 @@ async def get_user(
status_code=status.HTTP_404_NOT_FOUND,
detail="User not found"
)
return UserResponse(
id=str(user_doc["_id"]),
email=user_doc["email"],
@ -146,7 +97,6 @@ async def get_user(
is_active=user_doc["is_active"],
created_at=user_doc.get("created_at", datetime.utcnow()).isoformat(),
pm_client_ids=user_doc.get("pm_client_ids", []),
languages=user_doc.get("languages", []),
)
@ -165,7 +115,7 @@ async def create_user(
status_code=status.HTTP_400_BAD_REQUEST,
detail="User with this email already exists"
)
# Create user document
user_id = str(ObjectId())
user_doc = {
@ -179,12 +129,12 @@ async def create_user(
"created_at": datetime.utcnow(),
"updated_at": datetime.utcnow()
}
await db.users.insert_one(user_doc)
# Record metrics
app_metrics.record_auth_attempt("user_created", user_data.role.value)
logger.info(f"Admin {current_user.id} created user {user_id} with role {user_data.role.value}")
await log_user_management(
AuditAction.USER_CREATE, user_id, current_user, request,
@ -200,7 +150,6 @@ async def create_user(
is_active=True,
created_at=user_doc["created_at"].isoformat(),
pm_client_ids=[],
languages=[],
)
@ -220,7 +169,7 @@ async def update_user(
status_code=status.HTTP_404_NOT_FOUND,
detail="User not found"
)
# Check if email is being changed and doesn't conflict
if user_update.email and user_update.email != user_doc["email"]:
existing_user = await db.users.find_one({"email": user_update.email, "_id": {"$ne": user_id}})
@ -229,10 +178,10 @@ async def update_user(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Email already in use by another user"
)
# Build update document
update_data = {"updated_at": datetime.utcnow()}
if user_update.email:
update_data["email"] = user_update.email
if user_update.full_name:
@ -241,19 +190,19 @@ async def update_user(
update_data["role"] = user_update.role.value
if user_update.is_active is not None:
update_data["is_active"] = user_update.is_active
# Update user
result = await db.users.find_one_and_update(
{"_id": user_id},
{"$set": update_data},
return_document=True
)
logger.info(f"Admin {current_user.id} updated user {user_id}")
action = AuditAction.USER_ROLE_CHANGE if user_update.role else AuditAction.USER_UPDATE
await log_user_management(
action, user_id, current_user, request,
details=dict(user_update.dict(exclude_none=True).items()),
details={k: v for k, v in user_update.dict(exclude_none=True).items()},
)
return UserResponse(
@ -265,7 +214,6 @@ async def update_user(
is_active=result["is_active"],
created_at=result.get("created_at", datetime.utcnow()).isoformat(),
pm_client_ids=result.get("pm_client_ids", []),
languages=result.get("languages", []),
)
@ -282,7 +230,7 @@ async def deactivate_user(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Cannot deactivate your own account"
)
result = await db.users.update_one(
{"_id": user_id},
{
@ -292,13 +240,13 @@ async def deactivate_user(
}
}
)
if result.matched_count == 0:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="User not found"
)
logger.info(f"Admin {current_user.id} deactivated user {user_id}")
await log_user_management(AuditAction.USER_DEACTIVATE, user_id, current_user, request)
@ -316,10 +264,10 @@ async def admin_reset_password(
# Generate temporary password
import secrets
import string
temp_password = ''.join(secrets.choice(string.ascii_letters + string.digits) for _ in range(12))
hashed_password = get_password_hash(temp_password)
result = await db.users.update_one(
{"_id": user_id},
{
@ -329,15 +277,15 @@ async def admin_reset_password(
}
}
)
if result.matched_count == 0:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="User not found"
)
logger.info(f"Admin {current_user.id} reset password for user {user_id}")
# In production, send email with temp password instead of returning it
return {
"message": "Password reset successfully",
@ -353,23 +301,23 @@ async def get_admin_stats(
"""Get system statistics (production/admin only)"""
# Get user count
total_users = await db.users.count_documents({"is_active": True})
# Get job counts
total_jobs = await db.jobs.count_documents({})
# Get jobs by status
pipeline = [
{"$group": {"_id": "$status", "count": {"$sum": 1}}}
]
status_counts = await db.jobs.aggregate(pipeline).to_list(None)
jobs_by_status = {item["_id"]: item["count"] for item in status_counts}
# Get jobs created today
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
active_jobs_today = await db.jobs.count_documents({
"created_at": {"$gte": today_start}
})
# Calculate average processing time for completed jobs
avg_processing_pipeline = [
{"$match": {"status": "completed", "created_at": {"$exists": True}, "updated_at": {"$exists": True}}},
@ -390,10 +338,10 @@ async def get_admin_stats(
}
}
]
avg_result = await db.jobs.aggregate(avg_processing_pipeline).to_list(None)
avg_processing_time = avg_result[0]["avg_processing_time"] if avg_result else 0.0
return AdminStatsResponse(
total_users=total_users,
total_jobs=total_jobs,
@ -414,7 +362,7 @@ async def detailed_health_check(
"timestamp": datetime.utcnow().isoformat(),
"components": {}
}
# Check MongoDB
try:
await db.command("ping")
@ -422,7 +370,7 @@ async def detailed_health_check(
except Exception as e:
health_status["components"]["mongodb"] = {"status": "unhealthy", "error": str(e)}
health_status["status"] = "degraded"
# Check Redis (via import to avoid circular dependency)
try:
from ...core.redis import redis_client
@ -434,23 +382,23 @@ async def detailed_health_check(
except Exception as e:
health_status["components"]["redis"] = {"status": "unhealthy", "error": str(e)}
health_status["status"] = "degraded"
# Check GCS (basic check)
try:
from ...services.gcs import gcs_service
# Simple check to see if bucket is accessible
await gcs_service.file_exists("health_check_dummy") # This will return False but won't error if bucket accessible
bucket_exists = await gcs_service.file_exists("health_check_dummy") # This will return False but won't error if bucket accessible
health_status["components"]["gcs"] = {"status": "healthy"}
except Exception as e:
health_status["components"]["gcs"] = {"status": "unhealthy", "error": str(e)}
health_status["status"] = "degraded"
# Check job queue health
try:
from ...tasks import celery_app
inspect = celery_app.control.inspect()
active_tasks = inspect.active()
if active_tasks:
total_active = sum(len(tasks) for tasks in active_tasks.values())
health_status["components"]["celery"] = {
@ -467,7 +415,7 @@ async def detailed_health_check(
except Exception as e:
health_status["components"]["celery"] = {"status": "unhealthy", "error": str(e)}
health_status["status"] = "degraded"
return health_status
@ -479,18 +427,18 @@ async def get_job_statistics(
):
"""Get job processing statistics (reviewer/production/admin only)"""
since_date = datetime.utcnow() - timedelta(days=days)
# Jobs created in period
jobs_in_period = await db.jobs.count_documents({
"created_at": {"$gte": since_date}
})
# Jobs completed in period
jobs_completed = await db.jobs.count_documents({
"status": "completed",
"updated_at": {"$gte": since_date}
})
# Average processing time for completed jobs
avg_pipeline = [
{
@ -519,12 +467,12 @@ async def get_job_statistics(
}
}
]
avg_result = await db.jobs.aggregate(avg_pipeline).to_list(None)
processing_stats = avg_result[0] if avg_result else {
"avg_time": 0, "min_time": 0, "max_time": 0
}
# Current queue status
current_queue_stats = {}
pipeline = [
@ -533,7 +481,7 @@ async def get_job_statistics(
status_counts = await db.jobs.aggregate(pipeline).to_list(None)
for item in status_counts:
current_queue_stats[item["_id"]] = item["count"]
return {
"period_days": days,
"jobs_created": jobs_in_period,
@ -558,7 +506,7 @@ async def admin_force_password_reset(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Cannot reset your own password this way"
)
# Check if user exists
user_doc = await db.users.find_one({"_id": user_id})
if not user_doc:
@ -566,15 +514,15 @@ async def admin_force_password_reset(
status_code=status.HTTP_404_NOT_FOUND,
detail="User not found"
)
# Generate secure temporary password
import secrets
import string
temp_password = ''.join(secrets.choice(
string.ascii_letters + string.digits + "!@#$%"
) for _ in range(16))
# Update password
await db.users.update_one(
{"_id": user_id},
@ -585,10 +533,10 @@ async def admin_force_password_reset(
}
}
)
# TODO: In production, send via secure email instead of returning password
logger.info(f"Admin {current_user.id} reset password for user {user_id}")
return {
"message": "Password reset successfully",
"temporary_password": temp_password,
@ -611,7 +559,7 @@ async def reprocess_job(
status_code=status.HTTP_404_NOT_FOUND,
detail="Job not found"
)
# Reset job to created status for reprocessing
await db.jobs.update_one(
{"_id": job_id},
@ -631,7 +579,7 @@ async def reprocess_job(
}
}
)
# Broadcast status update
try:
from ...services.websocket import connection_manager
@ -643,32 +591,32 @@ async def reprocess_job(
)
except Exception as e:
logger.warning(f"Failed to broadcast status update for job reset {job_id}: {e}")
# Trigger ingestion task
from ...tasks.ingest_and_ai import ingest_and_ai_task
ingest_and_ai_task.delay(job_id)
logger.warning(f"Admin {current_user.id} triggered reprocessing for job {job_id}")
return {"message": f"Job {job_id} queued for reprocessing"}
@router.get("/audit-logs", response_model=AuditLogResponse)
async def get_audit_logs_detailed(
# Time range
start_date: datetime | None = Query(None, description="Start date for audit logs"),
end_date: datetime | None = Query(None, description="End date for audit logs"),
start_date: Optional[datetime] = Query(None, description="Start date for audit logs"),
end_date: Optional[datetime] = Query(None, description="End date for audit logs"),
# Filters
action: str | None = Query(None, description="Filter by action type"),
severity: str | None = Query(None, description="Filter by severity level"),
user_email: str | None = Query(None, description="Filter by user email"),
resource_type: str | None = Query(None, description="Filter by resource type"),
resource_id: str | None = Query(None, description="Filter by resource ID"),
success: bool | None = Query(None, description="Filter by success status"),
action: Optional[str] = Query(None, description="Filter by action type"),
severity: Optional[str] = Query(None, description="Filter by severity level"),
user_email: Optional[str] = Query(None, description="Filter by user email"),
resource_type: Optional[str] = Query(None, description="Filter by resource type"),
resource_id: Optional[str] = Query(None, description="Filter by resource ID"),
success: Optional[bool] = Query(None, description="Filter by success status"),
# Search
search: str | None = Query(None, description="Search in description and details"),
search: Optional[str] = Query(None, description="Search in description and details"),
# Pagination (skip/limit to match frontend AuditLogQuery)
skip: int = Query(0, ge=0, description="Number of records to skip"),
@ -699,7 +647,7 @@ async def get_audit_logs_detailed(
sort_by=sort_by,
sort_order=sort_order
)
return await audit_logger.query_logs(query)
@ -745,7 +693,7 @@ async def get_security_events(
request: Request = None,
):
"""Get recent security events (production/admin only)"""
# Log access to security events
await audit_logger.log_action(
action="admin.audit.access",
@ -754,7 +702,7 @@ async def get_security_events(
request=request,
details={"hours_requested": hours}
)
logs = await audit_logger.get_security_events(hours)
return logs
@ -766,7 +714,7 @@ async def cleanup_audit_logs(
request: Request = None,
):
"""Clean up old audit logs (admin only)"""
# Log audit cleanup action
await audit_logger.log_action(
action="admin.system.action",
@ -776,9 +724,9 @@ async def cleanup_audit_logs(
details={"retention_days": retention_days},
severity="warning"
)
deleted_count = await audit_logger.cleanup_old_logs(retention_days)
# Log cleanup completion
await audit_logger.log_action(
action="admin.system.action",
@ -790,9 +738,9 @@ async def cleanup_audit_logs(
"deleted_count": deleted_count
}
)
return {
"message": f"Deleted {deleted_count} audit logs older than {retention_days} days",
"deleted_count": deleted_count,
"retention_days": retention_days
}
}

View file

@ -1,295 +0,0 @@
"""Admin production endpoints: failure dashboard, bulk retry, queue stats, VTT override."""
from datetime import datetime
import redis.asyncio as aioredis
from fastapi import (
APIRouter,
Depends,
File,
Form,
HTTPException,
Query,
UploadFile,
status,
)
from motor.motor_asyncio import AsyncIOMotorDatabase
from pydantic import BaseModel
from ...core.database import get_database
from ...core.dependencies import require_roles
from ...core.logging import get_logger
from ...core.redis import get_redis
from ...models.audit_log import AuditAction
from ...models.job import JobStatus, RequestedOutputs
from ...models.user import User, UserRole
from ...schemas.job import JobResponse
from ...services.audit_logger import audit_logger
from ...services.cloud_run_dispatch import dispatch as _cr_dispatch
from ...services.gcs import upload_vtt_to_gcs
logger = get_logger(__name__)
router = APIRouter(prefix="/admin/production", tags=["admin-production"])
# Job statuses that count as "failed": list_failures queries on them and
# bulk_retry skips any job whose status is not one of them.
_FAILURE_STATUSES = [
    JobStatus.PROCESSING_FAILED.value,
    JobStatus.TTS_FAILED.value,
    JobStatus.RENDER_FAILED.value,
]
# Maximum number of job ids accepted by a single bulk-retry request.
_RETRY_CAP = 50
class BulkRetryRequest(BaseModel):
    """Request body for the bulk-retry endpoint."""

    # Ids of the jobs to retry; the endpoint rejects more than _RETRY_CAP ids.
    job_ids: list[str]
    # "from_scratch" forces a restart from ingestion; any other value resumes
    # from the job's recorded failure step.
    strategy: str = "auto"  # "auto" | "from_scratch"
class BulkRetryResponse(BaseModel):
    """Per-job outcome of a bulk-retry call."""

    # Job ids that were reset and re-dispatched.
    retried: list[str]
    # Job ids that were not found, not in a failed status, or had an unknown step.
    skipped: list[str]
    # One {"job_id": ..., "error": ...} entry per job that raised while retrying.
    errors: list[dict]
@router.get("/failures", response_model=list[JobResponse])
async def list_failures(
    step: str | None = Query(None, description="Filter by failure.step"),
    org_id: str | None = Query(None, description="Filter by organization_id"),
    limit: int = Query(50, ge=1, le=200),
    skip: int = Query(0, ge=0),
    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """List all jobs in a failed status, optionally filtered by step and org.

    Results are sorted by ``updated_at`` descending and paginated with
    ``skip`` / ``limit``.
    """
    filters: dict = {"status": {"$in": _FAILURE_STATUSES}}
    if step:
        filters["failure.step"] = step
    if org_id:
        filters["organization_id"] = org_id

    failed_docs = await (
        db.jobs.find(filters).sort("updated_at", -1).skip(skip).limit(limit)
    ).to_list(length=limit)

    responses = []
    for job in failed_docs:
        responses.append(
            JobResponse(
                id=str(job["_id"]),
                title=job["title"],
                status=job["status"],
                source=job["source"],
                requested_outputs=RequestedOutputs(**job["requested_outputs"]),
                # Jobs without review data get an empty review structure.
                review=job.get("review", {"notes": "", "history": []}),
                outputs=job.get("outputs"),
                created_at=job["created_at"].isoformat(),
                updated_at=job["updated_at"].isoformat(),
            )
        )
    return responses
@router.post("/bulk-retry", response_model=BulkRetryResponse)
async def bulk_retry(
    payload: BulkRetryRequest,
    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Retry up to 50 failed jobs in one call.

    For every job id: load the job, pick the step to retry from, reset the
    job's status to the entry status of that step, then dispatch the matching
    task through ``_cr_dispatch``. A failure on one job is recorded in
    ``errors`` and does not stop the remaining jobs.

    Step selection:
      * ``strategy == "from_scratch"`` always restarts at ``"ingestion"``.
      * otherwise the recorded ``failure.step`` is used; when it is missing
        the step is inferred from the job's failed status.

    Returns:
        BulkRetryResponse listing retried ids, skipped ids (not found, not in
        a failed status, or unknown step) and per-job errors.

    Raises:
        HTTPException: 400 when more than ``_RETRY_CAP`` ids are supplied.
    """
    if len(payload.job_ids) > _RETRY_CAP:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Cannot retry more than {_RETRY_CAP} jobs at once",
        )

    retried: list[str] = []
    skipped: list[str] = []
    errors: list[dict] = []
    now = datetime.utcnow()

    for job_id in payload.job_ids:
        try:
            job_doc = await db.jobs.find_one({"_id": job_id})
            if not job_doc:
                skipped.append(job_id)
                continue
            if job_doc["status"] not in _FAILURE_STATUSES:
                skipped.append(job_id)
                continue

            failure = job_doc.get("failure") or {}
            if payload.strategy == "from_scratch":
                step = "ingestion"
            else:
                step = failure.get("step")
                if not step:
                    # No recorded step: infer it from the failed status. A
                    # PROCESSING_FAILED job has to restart at ingestion; sending
                    # it down the "render" path would move it to PENDING_QC and
                    # rerender a job whose processing never completed.
                    if job_doc["status"] == JobStatus.PROCESSING_FAILED.value:
                        step = "ingestion"
                    elif job_doc["status"] == JobStatus.TTS_FAILED.value:
                        step = "tts"
                    else:
                        step = "render"

            # Status the job is put back into before the step is re-dispatched.
            if step in ("ingestion", "ai_processing"):
                reset_status = JobStatus.CREATED.value
            elif step == "translation":
                reset_status = JobStatus.AI_PROCESSING.value
            elif step == "tts":
                src = job_doc["source"].get("language", "en")
                reset_status = (
                    JobStatus.APPROVED_ENGLISH.value if src == "en" else JobStatus.APPROVED_SOURCE.value
                )
            elif step == "render":
                reset_status = JobStatus.PENDING_QC.value
            else:
                # Unknown step name: leave the job untouched.
                skipped.append(job_id)
                continue

            await db.jobs.update_one(
                {"_id": job_id},
                {
                    "$set": {"status": reset_status, "error": None, "updated_at": now},
                    "$inc": {"retry_count": 1},
                    "$push": {
                        "review.history": {
                            "at": now,
                            "status": f"bulk_retry_{step}",
                            "by": str(current_user.id),
                        }
                    },
                },
            )

            if step in ("ingestion", "ai_processing"):
                await _cr_dispatch("ingest", job_id)
            elif step in ("translation", "tts"):
                await _cr_dispatch("translate", job_id)
            elif step == "render":
                lang = job_doc.get("last_render_language", "en")
                await _cr_dispatch("rerender", job_id, language=lang)

            retried.append(job_id)
        except Exception as e:
            # Per-job isolation: record the error and continue with the next job.
            logger.error(f"bulk-retry failed for job {job_id}: {e}")
            errors.append({"job_id": job_id, "error": str(e)})

    # Audit logging is best-effort and must not fail the request.
    try:
        await audit_logger.log(
            action=AuditAction.JOB_BULK_RETRY,
            user_id=str(current_user.id),
            user_email=current_user.email,
            user_role=current_user.role.value if current_user.role else None,
            resource_type="job",
            description=f"Bulk retry {len(retried)} jobs (strategy={payload.strategy})",
            details={"retried": retried, "skipped": skipped, "error_count": len(errors)},
        )
    except Exception as e:
        logger.warning(f"Failed to write bulk-retry audit log: {e}")

    return BulkRetryResponse(retried=retried, skipped=skipped, errors=errors)
# ---------------------------------------------------------------------------
# PR-7: Queue depth stats
# ---------------------------------------------------------------------------
# Queue names whose pending-task counts get_queue_stats reads (one Redis LLEN per name).
_CELERY_QUEUES = ["default", "ingest", "tts", "render", "ffmpeg", "whisper", "notify", "embed"]
class QueueStats(BaseModel):
    """Response model for the queue-stats endpoint."""

    queues: dict[str, int]  # queue_name → pending task count
    # Sum of all values in ``queues``.
    total_pending: int
@router.get("/queue-stats", response_model=QueueStats)
async def get_queue_stats(
    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
    redis: aioredis.Redis = Depends(get_redis),
):
    """Return pending task counts per Celery queue (via Redis LLEN).

    A queue whose length cannot be read is reported as 0 so that one failing
    lookup does not break the whole response; the failure is logged so that a
    reported 0 can be told apart from a genuinely empty queue.
    """
    counts: dict[str, int] = {}
    for q in _CELERY_QUEUES:
        try:
            counts[q] = await redis.llen(q)
        except Exception as e:
            # Best-effort: keep the 0 fallback, but leave a trace of the failure.
            logger.warning(f"Failed to read pending count for queue '{q}': {e}")
            counts[q] = 0
    return QueueStats(queues=counts, total_pending=sum(counts.values()))
# ---------------------------------------------------------------------------
# PR-8: Upload final VTT override — bypass AI, jump to PENDING_QC
# ---------------------------------------------------------------------------
# Job statuses in which upload_final_vtt accepts a manual VTT and moves the job
# to PENDING_QC; any other status is rejected with 409.
_BYPASSABLE_STATUSES = {
    JobStatus.CREATED.value,
    JobStatus.INGESTING.value,
    JobStatus.AI_PROCESSING.value,
    JobStatus.PROCESSING_FAILED.value,
    JobStatus.TTS_FAILED.value,
    JobStatus.RENDER_FAILED.value,
}
@router.post("/jobs/{job_id}/upload-final-vtt")
async def upload_final_vtt(
    job_id: str,
    language: str = Form(..., description="BCP-47 language code, e.g. 'en' or 'fr'"),
    vtt_file: UploadFile = File(..., description="WebVTT (.vtt) file"),
    vtt_type: str = Form("captions", description="'captions' or 'ad'"),
    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Upload a hand-crafted VTT to override AI output and advance job to PENDING_QC.

    The file is uploaded with ``upload_vtt_to_gcs``, its URI is stored under
    ``outputs.<lang>.<captions_vtt_gcs|ad_vtt_gcs>``, the job status is set to
    PENDING_QC and an entry is appended to ``review.history``.

    Returns:
        dict with ``status``, the uploaded ``gcs_uri`` and the new ``job_status``.

    Raises:
        HTTPException: 404 if the job does not exist; 409 if the job status is
            not in ``_BYPASSABLE_STATUSES``; 400 for a non-``.vtt`` filename, an
            invalid ``vtt_type`` or ``language``, a file that is not valid
            UTF-8, or a file that does not start with ``WEBVTT``.
    """
    job_doc = await db.jobs.find_one({"_id": job_id})
    if not job_doc:
        raise HTTPException(status_code=404, detail="Job not found")

    if job_doc["status"] not in _BYPASSABLE_STATUSES:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Cannot override VTT when job is in status '{job_doc['status']}'. "
            f"Only allowed in: {sorted(_BYPASSABLE_STATUSES)}",
        )

    # Extension check is case-insensitive so "FILE.VTT" is accepted too.
    if not vtt_file.filename or not vtt_file.filename.lower().endswith(".vtt"):
        raise HTTPException(status_code=400, detail="File must be a .vtt file")

    if vtt_type not in ("captions", "ad"):
        raise HTTPException(status_code=400, detail="vtt_type must be 'captions' or 'ad'")

    # `language` is client-supplied and is interpolated into both a MongoDB
    # dotted field path and a storage object path below, so only plain
    # alphanumeric subtags separated by "-" (or "_") are accepted. This rejects
    # ".", "$", "/" and empty subtags.
    lang_key = language.replace("-", "_")
    if not all(tag.isascii() and tag.isalnum() for tag in lang_key.split("_")):
        raise HTTPException(
            status_code=400,
            detail="language must be a BCP-47 language code, e.g. 'en' or 'pt-BR'",
        )

    raw_bytes = await vtt_file.read()
    try:
        # "utf-8-sig" drops an optional leading BOM, which WebVTT permits and
        # which would otherwise make the WEBVTT header check fail.
        vtt_content = raw_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # A badly encoded upload is a client error, not a server error.
        raise HTTPException(status_code=400, detail="File must be UTF-8 encoded") from None

    if not vtt_content.strip().startswith("WEBVTT"):
        raise HTTPException(status_code=400, detail="File does not start with WEBVTT header")

    field = "captions_vtt_gcs" if vtt_type == "captions" else "ad_vtt_gcs"
    gcs_path = f"{job_id}/{lang_key}/{vtt_type}.vtt"

    gcs_uri = await upload_vtt_to_gcs(vtt_content, gcs_path)

    now = datetime.utcnow()
    await db.jobs.update_one(
        {"_id": job_id},
        {
            "$set": {
                f"outputs.{lang_key}.{field}": gcs_uri,
                "status": JobStatus.PENDING_QC.value,
                "updated_at": now,
            },
            "$push": {
                "review.history": {
                    "at": now,
                    "status": "manual_vtt_upload",
                    "by": str(current_user.id),
                    "note": f"Manual {vtt_type} VTT upload for {language} by {current_user.email}",
                }
            },
        },
    )

    # Audit logging is best-effort and must not fail the request.
    try:
        await audit_logger.log(
            action=AuditAction.VTT_EDIT,
            user_id=str(current_user.id),
            user_email=current_user.email,
            user_role=current_user.role.value if current_user.role else None,
            resource_type="job",
            resource_id=job_id,
            description=f"Manual {vtt_type} VTT upload for {language} — job advanced to PENDING_QC",
        )
    except Exception as e:
        logger.warning(f"Failed to write upload-final-vtt audit log: {e}")

    return {"status": "ok", "gcs_uri": gcs_uri, "job_status": JobStatus.PENDING_QC.value}

View file

@ -1,7 +1,5 @@
import re
import secrets
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, Request, Response, status
from fastapi.security import HTTPBearer
from motor.motor_asyncio import AsyncIOMotorDatabase
@ -16,7 +14,7 @@ from ...core.security import (
verify_password,
)
from ...models.audit_log import AuditAction, AuditLogSeverity
from ...models.user import AuthProvider, User, UserRole
from ...models.user import User, AuthProvider, UserRole
from ...schemas.auth import (
LoginRequest,
LoginResponse,
@ -25,11 +23,11 @@ from ...schemas.auth import (
MicrosoftLoginResponse,
RefreshResponse,
)
from ...services.audit_logger import audit_logger, log_auth_failure, log_auth_success
from ...services.audit_logger import log_auth_success, log_auth_failure, audit_logger
from ...services.microsoft_auth import (
get_microsoft_auth_service,
MicrosoftAuthError,
MicrosoftTokenValidationError,
get_microsoft_auth_service,
)
logger = get_logger(__name__)
@ -37,40 +35,6 @@ router = APIRouter(prefix="/auth", tags=["auth"])
security = HTTPBearer()
async def _get_user_org_ids(user_id: str, db: AsyncIOMotorDatabase) -> list[str]:
    """Return list of org IDs the user belongs to — used as a JWT hint only.

    Reads at most 200 membership documents for ``user_id`` and skips any
    membership without a truthy ``organization_id``.
    """
    membership_docs = await db.memberships.find(
        {"user_id": user_id},
        {"organization_id": 1},
    ).to_list(length=200)

    org_ids: list[str] = []
    for membership in membership_docs:
        if membership.get("organization_id"):
            org_ids.append(str(membership["organization_id"]))
    return org_ids
def _set_auth_cookies(response: Response, refresh_token: str) -> str:
    """Set httponly refresh_token cookie and readable csrf_token cookie. Returns the csrf token.

    Both cookies share the same secure / samesite / domain / max-age settings;
    the domain is only set when ``settings.app_env`` is ``"prod"``.
    """
    csrf_token = secrets.token_hex(32)

    # Attributes common to both cookies.
    shared_attrs = {
        "secure": settings.cookie_secure,
        "samesite": settings.cookie_samesite,
        "domain": settings.cookie_domain if settings.app_env == "prod" else None,
        "max_age": settings.jwt_refresh_ttl_days * 24 * 60 * 60,
    }

    response.set_cookie(
        key="refresh_token",
        value=refresh_token,
        httponly=True,
        **shared_attrs,
    )
    response.set_cookie(
        key="csrf_token",
        value=csrf_token,
        httponly=False,  # JS-readable for Double Submit Cookie pattern
        **shared_attrs,
    )
    return csrf_token
@router.post("/login", response_model=LoginResponse)
async def login(
login_data: LoginRequest,
@ -109,11 +73,18 @@ async def login(
detail="User account is disabled",
)
org_ids = await _get_user_org_ids(str(user.id), db)
access_token = create_access_token(subject=str(user.id), org_ids=org_ids)
access_token = create_access_token(subject=str(user.id))
refresh_token = create_refresh_token(subject=str(user.id))
_set_auth_cookies(response, refresh_token)
response.set_cookie(
key="refresh_token",
value=refresh_token,
httponly=True,
secure=settings.cookie_secure,
samesite=settings.cookie_samesite,
domain=settings.cookie_domain if settings.app_env == "prod" else None,
max_age=settings.jwt_refresh_ttl_days * 24 * 60 * 60,
)
await log_auth_success(user, request)
return LoginResponse(
@ -143,13 +114,13 @@ async def microsoft_login(
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail=f"Microsoft authentication failed: {str(e)}",
) from None
)
except MicrosoftAuthError as e:
await log_auth_failure("microsoft-sso", request, f"MS auth service error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Microsoft authentication service error",
) from None
)
# Look up by Microsoft-derived ID first — handles email casing changes across logins
ms_user_id = f"ms-{user_info.sub[:20]}"
@ -191,11 +162,18 @@ async def microsoft_login(
detail="User account is disabled",
)
org_ids = await _get_user_org_ids(str(user.id), db)
access_token = create_access_token(subject=str(user.id), org_ids=org_ids)
access_token = create_access_token(subject=str(user.id))
refresh_token = create_refresh_token(subject=str(user.id))
_set_auth_cookies(response, refresh_token)
response.set_cookie(
key="refresh_token",
value=refresh_token,
httponly=True,
secure=settings.cookie_secure,
samesite=settings.cookie_samesite,
domain=settings.cookie_domain if settings.app_env == "prod" else None,
max_age=settings.jwt_refresh_ttl_days * 24 * 60 * 60,
)
await log_auth_success(user, request)
return MicrosoftLoginResponse(
@ -222,15 +200,6 @@ async def refresh_token(
detail="Refresh token not found",
)
# CSRF protection: Double Submit Cookie pattern
csrf_cookie = request.cookies.get("csrf_token")
csrf_header = request.headers.get("X-CSRF-Token")
if csrf_cookie and (not csrf_header or csrf_header != csrf_cookie):
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="CSRF token mismatch",
)
try:
payload = decode_token(refresh_token)
@ -262,13 +231,20 @@ async def refresh_token(
detail="User account is disabled",
)
# Create new tokens (include org_ids claim for prefilter hint)
_org_ids = await _get_user_org_ids(user_id, db)
new_access_token = create_access_token(subject=user_id, org_ids=_org_ids)
# Create new tokens
new_access_token = create_access_token(subject=user_id)
new_refresh_token = create_refresh_token(subject=user_id)
# Rotate both refresh and CSRF cookies
_set_auth_cookies(response, new_refresh_token)
# Update refresh token cookie
response.set_cookie(
key="refresh_token",
value=new_refresh_token,
httponly=True,
secure=settings.cookie_secure,
samesite=settings.cookie_samesite,
domain=settings.cookie_domain if settings.app_env == "prod" else None,
max_age=settings.jwt_refresh_ttl_days * 24 * 60 * 60,
)
logger.info("Token refresh successful for user %s", user_id)
return RefreshResponse(
@ -287,7 +263,7 @@ async def refresh_token(
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid refresh token",
) from None
)
@router.post("/logout", response_model=LogoutResponse)

View file

@ -1,245 +0,0 @@
"""Job Brief CRUD endpoints."""
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, Request, status
from motor.motor_asyncio import AsyncIOMotorDatabase
from ...core.authz import MembershipContext, assert_user_in_org, get_membership_context
from ...core.database import get_database
from ...core.logging import get_logger
from ...models.audit_log import AuditAction
from ...models.job_brief import (
BriefStatus,
JobBriefCreate,
JobBriefResponse,
JobBriefUpdate,
)
from ...models.organization import OrgRole
from ...services.audit_logger import audit_logger
logger = get_logger(__name__)
router = APIRouter(prefix="/briefs", tags=["briefs"])
def _doc_to_response(doc: dict) -> JobBriefResponse:
    """Build a ``JobBriefResponse`` from a ``job_briefs`` document.

    Timestamps are emitted as ISO-8601 strings; ``submitted_at`` is ``None``
    when the document has no truthy value for it.
    """
    submitted_at = doc.get("submitted_at")
    return JobBriefResponse(
        id=str(doc["_id"]),
        organization_id=doc["organization_id"],
        project_id=doc.get("project_id"),
        title=doc["title"],
        description=doc.get("description"),
        requested_outputs=doc["requested_outputs"],
        languages=doc.get("languages", []),
        deadline=doc.get("deadline"),
        status=doc["status"],
        created_by=doc["created_by"],
        assignee_id=doc.get("assignee_id"),
        job_id=doc.get("job_id"),
        created_at=doc["created_at"].isoformat(),
        updated_at=doc["updated_at"].isoformat(),
        submitted_at=submitted_at.isoformat() if submitted_at else None,
        approved_by=doc.get("approved_by"),
    )
@router.get("", response_model=list[JobBriefResponse])
async def list_briefs(
    ctx: MembershipContext = Depends(get_membership_context),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """List the 100 most recently created briefs visible to the caller.

    Platform admins see every brief; other users see briefs of the
    organizations they hold a membership in. A non-admin without any
    membership gets a 403.
    """
    memberships = ctx.memberships if hasattr(ctx, "memberships") else []
    org_ids = [membership.organization_id for membership in memberships]

    if ctx.is_platform_admin:
        query: dict = {}
    else:
        if not org_ids:
            raise HTTPException(status_code=403, detail="No org memberships")
        query = {"organization_id": {"$in": org_ids}}

    briefs = await db.job_briefs.find(query).sort("created_at", -1).limit(100).to_list(length=100)
    return [_doc_to_response(brief) for brief in briefs]
@router.post("", response_model=JobBriefResponse, status_code=status.HTTP_201_CREATED)
async def create_brief(
    payload: JobBriefCreate,
    http_request: Request,
    ctx: MembershipContext = Depends(get_membership_context),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Create a brief in DRAFT status and write a BRIEF_CREATE audit entry.

    The owning organization is taken from the project's ``client_id`` when
    ``project_id`` resolves to a project that has one. Otherwise it is inferred
    from the caller's memberships, which only succeeds when exactly one
    membership grants MANAGER access; platform admins get a 400 in that case.
    Access to the resolved organization is then checked with
    ``assert_user_in_org`` at ``OrgRole.MANAGER``.
    """
    # First choice: the organization that owns the referenced project.
    org_id: str | None = None
    if payload.project_id:
        project = await db.projects.find_one({"_id": payload.project_id}, {"client_id": 1})
        org_id = project.get("client_id") if project else None

    if not org_id:
        if ctx.is_platform_admin:
            raise HTTPException(status_code=400, detail="Admin must supply project_id or org_id cannot be inferred")
        # Fall back to the caller's single manageable organization, if unambiguous.
        all_memberships = ctx.memberships if hasattr(ctx, "memberships") else []
        manageable = [
            membership
            for membership in all_memberships
            if ctx.can_access_org(membership.organization_id, OrgRole.MANAGER)
        ]
        if len(manageable) != 1:
            raise HTTPException(status_code=400, detail="Cannot infer organization; supply project_id")
        org_id = manageable[0].organization_id

    assert_user_in_org(ctx, org_id, OrgRole.MANAGER)

    now = datetime.utcnow()
    creator_id = str(ctx.user.id)
    # Id = creation timestamp (microsecond resolution) + last 6 chars of the creator's id.
    brief_id = f"brief_{now.strftime('%Y%m%d%H%M%S%f')}_{creator_id[-6:]}"
    doc = {
        "_id": brief_id,
        "organization_id": org_id,
        "project_id": payload.project_id,
        "title": payload.title,
        "description": payload.description,
        "requested_outputs": payload.requested_outputs.model_dump(),
        "languages": payload.languages,
        "deadline": payload.deadline,
        "assignee_id": payload.assignee_id,
        "status": BriefStatus.DRAFT.value,
        "created_by": creator_id,
        "job_id": None,
        "created_at": now,
        "updated_at": now,
        "submitted_at": None,
        "approved_by": None,
    }
    await db.job_briefs.insert_one(doc)

    await audit_logger.log_action(
        action=AuditAction.BRIEF_CREATE,
        description=f"Brief '{payload.title}' created",
        user=ctx.user,
        request=http_request,
        resource_type="brief",
        resource_id=brief_id,
        details={"title": payload.title, "organization_id": org_id},
    )
    return _doc_to_response(doc)
@router.get("/{brief_id}", response_model=JobBriefResponse)
async def get_brief(
    brief_id: str,
    ctx: MembershipContext = Depends(get_membership_context),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Return one brief by id.

    Responds 404 when the brief does not exist; access to the brief's
    organization is checked with ``assert_user_in_org`` at ``OrgRole.VIEWER``.
    """
    brief = await db.job_briefs.find_one({"_id": brief_id})
    if brief is None or not brief:
        raise HTTPException(status_code=404, detail="Brief not found")

    assert_user_in_org(ctx, brief["organization_id"], OrgRole.VIEWER)
    return _doc_to_response(brief)
@router.patch("/{brief_id}", response_model=JobBriefResponse)
async def update_brief(
    brief_id: str,
    payload: JobBriefUpdate,
    http_request: Request,
    ctx: MembershipContext = Depends(get_membership_context),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Apply a partial update to a DRAFT brief and write a BRIEF_UPDATE audit entry.

    Only fields whose payload value is not ``None`` are written, and
    ``updated_at`` is always refreshed. Responds 404 for an unknown brief and
    400 when the brief is not in DRAFT status; access is checked with
    ``assert_user_in_org`` at ``OrgRole.MANAGER``.
    """
    existing = await db.job_briefs.find_one({"_id": brief_id})
    if not existing:
        raise HTTPException(status_code=404, detail="Brief not found")
    assert_user_in_org(ctx, existing["organization_id"], OrgRole.MANAGER)

    if existing["status"] != BriefStatus.DRAFT.value:
        raise HTTPException(status_code=400, detail="Only DRAFT briefs can be updated")

    changes: dict = {"updated_at": datetime.utcnow()}
    for field_name in ("title", "description", "requested_outputs", "languages", "deadline"):
        new_value = getattr(payload, field_name)
        if new_value is None:
            continue
        # requested_outputs is a model and is stored as a plain dict.
        changes[field_name] = (
            new_value.model_dump() if field_name == "requested_outputs" else new_value
        )

    updated = await db.job_briefs.find_one_and_update(
        {"_id": brief_id},
        {"$set": changes},
        return_document=True,
    )

    await audit_logger.log_action(
        action=AuditAction.BRIEF_UPDATE,
        description=f"Brief '{brief_id}' updated",
        user=ctx.user,
        request=http_request,
        resource_type="brief",
        resource_id=brief_id,
        details={"fields_updated": list(changes.keys())},
    )
    return _doc_to_response(updated)
@router.post("/{brief_id}/submit", response_model=JobBriefResponse)
async def submit_brief(
    brief_id: str,
    http_request: Request,
    ctx: MembershipContext = Depends(get_membership_context),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Move a DRAFT brief to SUBMITTED and write a BRIEF_SUBMIT audit entry.

    Sets ``submitted_at`` and ``updated_at`` to the current time. Responds 404
    for an unknown brief and 400 when the brief is not in DRAFT status; access
    is checked with ``assert_user_in_org`` at ``OrgRole.MANAGER``.
    """
    existing = await db.job_briefs.find_one({"_id": brief_id})
    if not existing:
        raise HTTPException(status_code=404, detail="Brief not found")
    assert_user_in_org(ctx, existing["organization_id"], OrgRole.MANAGER)

    if existing["status"] != BriefStatus.DRAFT.value:
        raise HTTPException(status_code=400, detail="Only DRAFT briefs can be submitted")

    now = datetime.utcnow()
    submitted_fields = {
        "status": BriefStatus.SUBMITTED.value,
        "submitted_at": now,
        "updated_at": now,
    }
    updated = await db.job_briefs.find_one_and_update(
        {"_id": brief_id},
        {"$set": submitted_fields},
        return_document=True,
    )

    await audit_logger.log_action(
        action=AuditAction.BRIEF_SUBMIT,
        description=f"Brief '{brief_id}' submitted for review",
        user=ctx.user,
        request=http_request,
        resource_type="brief",
        resource_id=brief_id,
        details={"organization_id": updated.get("organization_id")},
    )
    return _doc_to_response(updated)
@router.post("/{brief_id}/approve", response_model=JobBriefResponse)
async def approve_brief(
    brief_id: str,
    http_request: Request,
    ctx: MembershipContext = Depends(get_membership_context),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Move a SUBMITTED brief to APPROVED and write a BRIEF_APPROVE audit entry.

    Records the approving user's id in ``approved_by``. Responds 404 for an
    unknown brief and 400 when the brief is not in SUBMITTED status; access is
    checked with ``assert_user_in_org`` at ``OrgRole.ADMIN``.
    """
    existing = await db.job_briefs.find_one({"_id": brief_id})
    if not existing:
        raise HTTPException(status_code=404, detail="Brief not found")
    assert_user_in_org(ctx, existing["organization_id"], OrgRole.ADMIN)

    if existing["status"] != BriefStatus.SUBMITTED.value:
        raise HTTPException(status_code=400, detail="Only SUBMITTED briefs can be approved")

    now = datetime.utcnow()
    approval_fields = {
        "status": BriefStatus.APPROVED.value,
        "approved_by": str(ctx.user.id),
        "updated_at": now,
    }
    updated = await db.job_briefs.find_one_and_update(
        {"_id": brief_id},
        {"$set": approval_fields},
        return_document=True,
    )

    await audit_logger.log_action(
        action=AuditAction.BRIEF_APPROVE,
        description=f"Brief '{brief_id}' approved",
        user=ctx.user,
        request=http_request,
        resource_type="brief",
        resource_id=brief_id,
        details={"organization_id": updated.get("organization_id")},
    )
    return _doc_to_response(updated)

View file

@ -9,16 +9,15 @@ Access rules:
- List projects (read) Admin, PM, or any team member of the client
"""
from datetime import UTC, datetime
from datetime import datetime, timezone
from bson import ObjectId
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi import APIRouter, Depends, HTTPException
from motor.motor_asyncio import AsyncIOMotorDatabase
from pydantic import BaseModel
from ...core.database import get_database
from ...core.dependencies import get_current_user, require_roles
from ...models.audit_log import AuditAction
from ...models.client import (
Client,
ClientCreate,
@ -31,7 +30,6 @@ from ...models.client import (
TeamUpdate,
)
from ...models.user import User, UserRole
from ...services.audit_logger import audit_logger
router = APIRouter(prefix="/clients", tags=["clients"])
@ -41,7 +39,7 @@ router = APIRouter(prefix="/clients", tags=["clients"])
# ---------------------------------------------------------------------------
def _now() -> datetime:
return datetime.now(UTC)
return datetime.now(timezone.utc)
async def _get_client_or_404(client_id: str, db: AsyncIOMotorDatabase) -> dict:
@ -123,7 +121,6 @@ async def list_clients(
@router.post("", response_model=Client)
async def create_client(
body: ClientCreate,
request: Request,
current_user: User = Depends(require_roles(UserRole.ADMIN)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -140,18 +137,7 @@ async def create_client(
"updated_at": now,
})
doc = await db.clients.find_one({"_id": client_id})
client = _client_from_doc(doc)
await audit_logger.log_action(
action=AuditAction.CLIENT_CREATE,
description=f"Client '{client.name}' created",
user=current_user,
request=request,
resource_type="client",
resource_id=str(client.id),
resource_name=client.name,
details={"slug": client.slug},
)
return client
return _client_from_doc(doc)
@router.get("/{client_id}", response_model=Client)
@ -172,12 +158,11 @@ async def get_client(
async def update_client(
client_id: str,
body: ClientUpdate,
request: Request,
current_user: User = Depends(require_roles(UserRole.ADMIN)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
await _get_client_or_404(client_id, db)
update: dict = dict(body.model_dump(exclude_none=True).items())
update: dict = {k: v for k, v in body.model_dump(exclude_none=True).items()}
if not update:
raise HTTPException(status_code=422, detail="No fields to update")
if "slug" in update and await db.clients.find_one({"slug": update["slug"], "_id": {"$ne": client_id}}):
@ -185,39 +170,17 @@ async def update_client(
update["updated_at"] = _now()
await db.clients.update_one({"_id": client_id}, {"$set": update})
doc = await db.clients.find_one({"_id": client_id})
client = _client_from_doc(doc)
await audit_logger.log_action(
action=AuditAction.CLIENT_UPDATE,
description=f"Client '{client.name}' updated",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client.name,
details={"fields_updated": list(body.model_dump(exclude_none=True).keys())},
)
return client
return _client_from_doc(doc)
@router.delete("/{client_id}", status_code=204)
async def deactivate_client(
client_id: str,
request: Request,
current_user: User = Depends(require_roles(UserRole.ADMIN)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
await db.clients.update_one({"_id": client_id}, {"$set": {"is_active": False, "updated_at": _now()}})
await audit_logger.log_action(
action=AuditAction.CLIENT_DEACTIVATE,
description=f"Client '{doc['name']}' deactivated",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=doc["name"],
details={"was_active": doc.get("is_active", True)},
)
# ---------------------------------------------------------------------------
@ -232,11 +195,10 @@ class AssignPMRequest(BaseModel):
async def assign_pm(
client_id: str,
body: AssignPMRequest,
request: Request,
current_user: User = Depends(require_roles(UserRole.ADMIN)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
user_doc = await db.users.find_one({"_id": body.user_id})
if not user_doc:
raise HTTPException(status_code=404, detail="User not found")
@ -247,28 +209,16 @@ async def assign_pm(
"$set": {"role": UserRole.PROJECT_MANAGER.value, "updated_at": _now()},
},
)
await audit_logger.log_action(
action=AuditAction.CLIENT_PM_ASSIGN,
description=f"PM '{user_doc.get('email', body.user_id)}' assigned to client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"pm_user_id": body.user_id, "pm_email": user_doc.get("email")},
)
@router.delete("/{client_id}/pm/{user_id}", status_code=204)
async def remove_pm(
client_id: str,
user_id: str,
request: Request,
current_user: User = Depends(require_roles(UserRole.ADMIN)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
pm_doc = await db.users.find_one({"_id": user_id})
await _get_client_or_404(client_id, db)
await db.users.update_one(
{"_id": user_id},
{"$pull": {"pm_client_ids": client_id}, "$set": {"updated_at": _now()}},
@ -280,16 +230,6 @@ async def remove_pm(
{"_id": user_id},
{"$set": {"role": UserRole.CLIENT.value, "updated_at": _now()}},
)
await audit_logger.log_action(
action=AuditAction.CLIENT_PM_REMOVE,
description=f"PM '{pm_doc.get('email', user_id) if pm_doc else user_id}' removed from client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"pm_user_id": user_id, "pm_email": pm_doc.get("email") if pm_doc else None},
)
@router.get("/{client_id}/pm", response_model=list[dict])
@ -326,11 +266,10 @@ async def list_teams(
async def create_team(
client_id: str,
body: TeamCreate,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
await _assert_pm_or_admin(current_user, client_id, db)
now = _now()
team_id = str(ObjectId())
@ -343,18 +282,7 @@ async def create_team(
"updated_at": now,
})
doc = await db.teams.find_one({"_id": team_id})
team = _team_from_doc(doc)
await audit_logger.log_action(
action=AuditAction.CLIENT_TEAM_CREATE,
description=f"Team '{team.name}' created for client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"team_id": team_id, "team_name": team.name},
)
return team
return _team_from_doc(doc)
@router.patch("/{client_id}/teams/{team_id}", response_model=Team)
@ -362,55 +290,32 @@ async def update_team(
client_id: str,
team_id: str,
body: TeamUpdate,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
await _assert_pm_or_admin(current_user, client_id, db)
await _get_team_or_404(team_id, client_id, db)
update = dict(body.model_dump(exclude_none=True).items())
update = {k: v for k, v in body.model_dump(exclude_none=True).items()}
if not update:
raise HTTPException(status_code=422, detail="No fields to update")
update["updated_at"] = _now()
await db.teams.update_one({"_id": team_id}, {"$set": update})
doc = await db.teams.find_one({"_id": team_id})
team = _team_from_doc(doc)
await audit_logger.log_action(
action=AuditAction.CLIENT_TEAM_UPDATE,
description=f"Team '{team.name}' updated for client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"team_id": team_id, "team_name": team.name, "fields_updated": list(body.model_dump(exclude_none=True).keys())},
)
return team
return _team_from_doc(doc)
@router.delete("/{client_id}/teams/{team_id}", status_code=204)
async def delete_team(
client_id: str,
team_id: str,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
await _assert_pm_or_admin(current_user, client_id, db)
team_doc = await _get_team_or_404(team_id, client_id, db)
await _get_team_or_404(team_id, client_id, db)
await db.teams.delete_one({"_id": team_id})
await audit_logger.log_action(
action=AuditAction.CLIENT_TEAM_DELETE,
description=f"Team '{team_doc['name']}' deleted from client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"team_id": team_id, "team_name": team_doc["name"]},
)
# Team membership
@ -424,35 +329,18 @@ async def add_team_member(
client_id: str,
team_id: str,
body: AddMemberRequest,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
await _assert_pm_or_admin(current_user, client_id, db)
team_doc = await _get_team_or_404(team_id, client_id, db)
member_doc = await db.users.find_one({"_id": body.user_id})
if not member_doc:
await _get_team_or_404(team_id, client_id, db)
if not await db.users.find_one({"_id": body.user_id}):
raise HTTPException(status_code=404, detail="User not found")
# Write to both Team.member_user_ids (legacy) and Membership.team_ids (MT-17)
await db.teams.update_one(
{"_id": team_id},
{"$addToSet": {"member_user_ids": body.user_id}, "$set": {"updated_at": _now()}},
)
await db.memberships.update_one(
{"user_id": body.user_id, "organization_id": client_id},
{"$addToSet": {"team_ids": team_id}},
)
await audit_logger.log_action(
action=AuditAction.CLIENT_TEAM_MEMBER_ADD,
description=f"User '{member_doc.get('email', body.user_id)}' added to team '{team_doc['name']}' of client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"team_id": team_id, "team_name": team_doc["name"], "member_user_id": body.user_id, "member_email": member_doc.get("email")},
)
@router.delete("/{client_id}/teams/{team_id}/members/{user_id}", status_code=204)
@ -460,56 +348,22 @@ async def remove_team_member(
client_id: str,
team_id: str,
user_id: str,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
await _assert_pm_or_admin(current_user, client_id, db)
team_doc = await _get_team_or_404(team_id, client_id, db)
member_doc = await db.users.find_one({"_id": user_id})
await _get_team_or_404(team_id, client_id, db)
await db.teams.update_one(
{"_id": team_id},
{"$pull": {"member_user_ids": user_id}, "$set": {"updated_at": _now()}},
)
await db.memberships.update_one(
{"user_id": user_id, "organization_id": client_id},
{"$pull": {"team_ids": team_id}},
)
await audit_logger.log_action(
action=AuditAction.CLIENT_TEAM_MEMBER_REMOVE,
description=f"User '{member_doc.get('email', user_id) if member_doc else user_id}' removed from team '{team_doc['name']}' of client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"team_id": team_id, "team_name": team_doc["name"], "member_user_id": user_id, "member_email": member_doc.get("email") if member_doc else None},
)
# ---------------------------------------------------------------------------
# Project endpoints
# ---------------------------------------------------------------------------
@router.get("/all-projects", response_model=list[Project])
async def list_all_projects(
    current_user: User = Depends(get_current_user),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Return all active projects accessible to the current user (across all clients)."""
    # Roles listed here are not filtered by client: they get every active project.
    unrestricted_roles = (UserRole.ADMIN, UserRole.PRODUCTION, UserRole.PROJECT_MANAGER)

    if current_user.role in unrestricted_roles:
        query = {"is_active": True}
    else:
        client_ids = await _get_accessible_client_ids(current_user, db)
        # Nothing accessible: answer immediately instead of querying with an empty $in.
        if not client_ids:
            return []
        query = {"client_id": {"$in": client_ids}, "is_active": True}

    project_docs = await db.projects.find(query).to_list(None)
    return [_project_from_doc(doc) for doc in project_docs]
@router.get("/{client_id}/projects", response_model=list[Project])
async def list_projects(
client_id: str,
@ -526,11 +380,10 @@ async def list_projects(
async def create_project(
client_id: str,
body: ProjectCreate,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
await _assert_pm_or_client_member(current_user, client_id, db)
now = _now()
project_id = str(ObjectId())
@ -546,18 +399,7 @@ async def create_project(
"updated_at": now,
})
doc = await db.projects.find_one({"_id": project_id})
project = _project_from_doc(doc)
await audit_logger.log_action(
action=AuditAction.CLIENT_PROJECT_CREATE,
description=f"Project '{project.name}' created for client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"project_id": project_id, "project_name": project.name, "default_languages": body.default_languages},
)
return project
return _project_from_doc(doc)
@router.patch("/{client_id}/projects/{project_id}", response_model=Project)
@ -565,58 +407,35 @@ async def update_project(
client_id: str,
project_id: str,
body: ProjectUpdate,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
await _assert_pm_or_admin(current_user, client_id, db)
await _get_project_or_404(project_id, client_id, db)
update = dict(body.model_dump(exclude_none=True).items())
update = {k: v for k, v in body.model_dump(exclude_none=True).items()}
if not update:
raise HTTPException(status_code=422, detail="No fields to update")
update["updated_at"] = _now()
await db.projects.update_one({"_id": project_id}, {"$set": update})
doc = await db.projects.find_one({"_id": project_id})
project = _project_from_doc(doc)
await audit_logger.log_action(
action=AuditAction.CLIENT_PROJECT_UPDATE,
description=f"Project '{project.name}' updated for client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"project_id": project_id, "project_name": project.name, "fields_updated": list(body.model_dump(exclude_none=True).keys())},
)
return project
return _project_from_doc(doc)
@router.delete("/{client_id}/projects/{project_id}", status_code=204)
async def archive_project(
client_id: str,
project_id: str,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
client_doc = await _get_client_or_404(client_id, db)
await _get_client_or_404(client_id, db)
await _assert_pm_or_admin(current_user, client_id, db)
project_doc = await _get_project_or_404(project_id, client_id, db)
await _get_project_or_404(project_id, client_id, db)
await db.projects.update_one(
{"_id": project_id},
{"$set": {"is_active": False, "updated_at": _now()}},
)
await audit_logger.log_action(
action=AuditAction.CLIENT_PROJECT_ARCHIVE,
description=f"Project '{project_doc['name']}' archived for client '{client_doc['name']}'",
user=current_user,
request=request,
resource_type="client",
resource_id=client_id,
resource_name=client_doc["name"],
details={"project_id": project_id, "project_name": project_doc["name"]},
)
# ---------------------------------------------------------------------------
@ -654,19 +473,6 @@ async def _assert_pm_or_client_member(user: User, client_id: str, db: AsyncIOMot
raise HTTPException(status_code=403, detail="Not authorized to create projects for this client")
async def _get_accessible_client_ids(user: User, db: AsyncIOMotorDatabase) -> list[str]:
    """Return the client_ids the user can access.

    Combines the legacy ``user.pm_client_ids`` assignments with the
    ``organization_id`` of every membership document stored for the user.

    Args:
        user: The user whose access is being resolved.
        db: Database handle; only the ``memberships`` collection is read.

    Returns:
        De-duplicated client ids, in no particular order.
    """
    # PM assignments (legacy); the attribute may be empty or None.
    client_ids: set[str] = set(user.pm_client_ids or ())

    # Org memberships: only organization_id is needed, so project just that
    # field instead of loading whole membership documents.
    memberships = await db.memberships.find(
        {"user_id": str(user.id)},
        {"organization_id": 1},
    ).to_list(None)

    # Skip records without an organization_id rather than failing the whole
    # request with a KeyError because of one malformed document.
    client_ids.update(
        m["organization_id"] for m in memberships if m.get("organization_id")
    )
    return list(client_ids)
async def _assert_client_access(user: User, client_id: str, db: AsyncIOMotorDatabase) -> None:
"""Allow platform staff, org members (any role), or PM of the client."""
if user.role in (UserRole.ADMIN, UserRole.REVIEWER, UserRole.PRODUCTION, UserRole.LINGUIST):
@ -678,4 +484,6 @@ async def _assert_client_access(user: User, client_id: str, db: AsyncIOMotorData
# Legacy fallback for pre-migration users
if user.role == UserRole.PROJECT_MANAGER and client_id in (user.pm_client_ids or []):
return
if user.role in (UserRole.CLIENT, UserRole.PROJECT_MANAGER):
return
raise HTTPException(status_code=403, detail="Insufficient permissions")

View file

@ -3,11 +3,11 @@ from motor.motor_asyncio import AsyncIOMotorDatabase
from ...core.database import get_database
from ...core.dependencies import get_current_user
from ...models.audit_log import AuditAction
from ...models.user import User
from ...schemas.file import SignedUploadRequest, SignedUploadResponse
from ...services.audit_logger import audit_logger
from ...services.gcs import generate_signed_upload_url
from ...services.audit_logger import audit_logger
from ...models.audit_log import AuditAction
router = APIRouter(prefix="/files", tags=["files"])
@ -28,11 +28,11 @@ async def get_signed_upload_url(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Only video files are supported"
)
# Generate unique blob path
from bson import ObjectId
blob_path = f"temp/{ObjectId()}/{request.filename}"
try:
# Generate signed upload URL with form fields
signed_data = await generate_signed_upload_url(
@ -40,7 +40,7 @@ async def get_signed_upload_url(
content_type=request.content_type,
max_size=request.max_size or 1024 * 1024 * 1024 # 1GB default
)
await audit_logger.log_action(
action=AuditAction.FILE_UPLOAD,
description=f"Signed upload URL generated for {request.filename}",
@ -62,4 +62,4 @@ async def get_signed_upload_url(
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to generate signed upload URL: {str(e)}"
) from None
)

View file

@ -11,7 +11,7 @@ from __future__ import annotations
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
from ...core.authz import MembershipContext, assert_user_in_org, get_membership_context
from ...core.dependencies import get_current_user, require_pm_for_client, require_roles
from ...core.logging import get_logger
from ...models.audit_log import AuditAction
from ...models.glossary import (
@ -19,7 +19,7 @@ from ...models.glossary import (
GlossaryResponse,
GlossaryVersionResponse,
)
from ...models.organization import OrgRole
from ...models.user import User, UserRole
from ...services import audit_logger as audit_svc
from ...services import glossary_service as svc
@ -37,18 +37,22 @@ _ALLOWED_CONTENT_TYPES = {
_MAX_FILE_SIZE_MB = 50
def _require_client_staff(client_id: str):
    """Build the staff-only dependency: admin or PM of this client.

    The ``client_id`` argument is not read by this body; the guard is
    configured with the parameter *name* ``"client_id"``. Presumably
    ``require_pm_for_client`` looks the real id up from the request by that
    name (confirm in ``core.dependencies``).
    """
    staff_guard = require_pm_for_client(client_id_param="client_id")
    return staff_guard
# ── List glossaries ───────────────────────────────────────────────────────────
@router.get("", response_model=list[GlossaryResponse])
async def list_glossaries(
client_id: str,
ctx: MembershipContext = Depends(get_membership_context),
current_user: User = Depends(get_current_user),
):
"""List all active glossaries for a client."""
assert_user_in_org(ctx, client_id, OrgRole.VIEWER)
_assert_can_read(current_user)
glossaries = await svc.get_glossaries_for_client(client_id)
version_map = await svc.get_versions_by_ids([g.current_version_id for g in glossaries if g.current_version_id])
return [_to_response(g, version_map.get(g.current_version_id)) for g in glossaries]
return [_to_response(g) for g in glossaries]
# ── Upload new glossary ───────────────────────────────────────────────────────
@ -62,10 +66,9 @@ async def upload_glossary(
source_locale_col: str = Form(..., description="xlsx column header for the source language, e.g. en_gb"),
description: str | None = Form(None),
change_note: str | None = Form(None),
ctx: MembershipContext = Depends(get_membership_context),
current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
"""Upload a new glossary xlsx file and associate it with a client."""
assert_user_in_org(ctx, client_id, OrgRole.MANAGER)
_validate_xlsx(file)
try:
@ -75,7 +78,7 @@ async def upload_glossary(
source_locale=source_locale,
source_locale_col=source_locale_col,
file=file,
user_id=str(ctx.user.id),
user_id=str(current_user.id),
description=description,
change_note=change_note,
)
@ -85,7 +88,7 @@ async def upload_glossary(
await audit_svc.audit_logger.log_action(
action=AuditAction.GLOSSARY_UPLOAD,
description=f"Glossary '{name}' uploaded for client {client_id}",
user=ctx.user,
user=current_user,
resource_type="glossary",
resource_id=glossary.id,
details={"term_count": version.term_count, "source_locale": source_locale},
@ -101,9 +104,9 @@ async def upload_glossary(
async def get_glossary(
client_id: str,
glossary_id: str,
ctx: MembershipContext = Depends(get_membership_context),
current_user: User = Depends(get_current_user),
):
assert_user_in_org(ctx, client_id, OrgRole.VIEWER)
_assert_can_read(current_user)
glossary = await svc.get_glossary(glossary_id)
if not glossary or glossary.client_id != client_id:
raise HTTPException(status_code=404, detail="Glossary not found")
@ -121,9 +124,9 @@ async def list_terms(
search: str | None = Query(None),
page: int = Query(1, ge=1),
page_size: int = Query(50, ge=1, le=200),
ctx: MembershipContext = Depends(get_membership_context),
current_user: User = Depends(get_current_user),
):
assert_user_in_org(ctx, client_id, OrgRole.VIEWER)
_assert_can_read(current_user)
glossary = await svc.get_glossary(glossary_id)
if not glossary or glossary.client_id != client_id:
raise HTTPException(status_code=404, detail="Glossary not found")
@ -150,10 +153,9 @@ async def upload_version(
file: UploadFile = File(...),
source_locale_col: str = Form(...),
change_note: str | None = Form(None),
ctx: MembershipContext = Depends(get_membership_context),
current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
"""Upload a new xlsx file as a new version of an existing glossary."""
assert_user_in_org(ctx, client_id, OrgRole.MANAGER)
_validate_xlsx(file)
glossary = await svc.get_glossary(glossary_id)
if not glossary or glossary.client_id != client_id:
@ -164,7 +166,7 @@ async def upload_version(
glossary_id=glossary_id,
source_locale_col=source_locale_col,
file=file,
user_id=str(ctx.user.id),
user_id=str(current_user.id),
change_note=change_note,
)
except ValueError as exc:
@ -173,7 +175,7 @@ async def upload_version(
await audit_svc.audit_logger.log_action(
action=AuditAction.GLOSSARY_VERSION_UPLOAD,
description=f"New glossary version uploaded for glossary {glossary_id}",
user=ctx.user,
user=current_user,
resource_type="glossary_version",
resource_id=version.id,
details={"term_count": version.term_count, "version_number": version.version_number},
@ -188,9 +190,8 @@ async def activate_version(
client_id: str,
glossary_id: str,
version_id: str = Form(...),
ctx: MembershipContext = Depends(get_membership_context),
current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
assert_user_in_org(ctx, client_id, OrgRole.MANAGER)
glossary = await svc.get_glossary(glossary_id)
if not glossary or glossary.client_id != client_id:
raise HTTPException(status_code=404, detail="Glossary not found")
@ -203,7 +204,7 @@ async def activate_version(
await audit_svc.audit_logger.log_action(
action=AuditAction.GLOSSARY_ACTIVATE,
description=f"Glossary version {version_id} activated",
user=ctx.user,
user=current_user,
resource_type="glossary",
resource_id=glossary_id,
details={"version_id": version_id},
@ -218,10 +219,9 @@ async def reembed_version(
client_id: str,
glossary_id: str,
version_id: str,
ctx: MembershipContext = Depends(get_membership_context),
current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
"""Re-queue the embedding task for a glossary version (resets failed/pending/stuck embeds)."""
assert_user_in_org(ctx, client_id, OrgRole.MANAGER)
glossary = await svc.get_glossary(glossary_id)
if not glossary or glossary.client_id != client_id:
raise HTTPException(status_code=404, detail="Glossary not found")
@ -232,11 +232,10 @@ async def reembed_version(
raise HTTPException(status_code=404, detail="Version not found")
try:
import motor.motor_asyncio
from bson import ObjectId
from ...core.config import settings
from ...tasks.embed_glossary import embed_glossary_version_task
from bson import ObjectId
import motor.motor_asyncio
from ...core.config import settings
client_db = motor.motor_asyncio.AsyncIOMotorClient(settings.mongodb_uri)
db = client_db[settings.mongodb_db]
@ -253,15 +252,14 @@ async def reembed_version(
return {"status": "queued", "version_id": version_id}
# ── Delete ───────────────────────────────────────────────────────────────────
# ── Archive (soft-delete) ─────────────────────────────────────────────────────
@router.delete("/{glossary_id}", status_code=204)
async def archive_glossary(
client_id: str,
glossary_id: str,
ctx: MembershipContext = Depends(get_membership_context),
current_user: User = Depends(require_roles(UserRole.ADMIN)),
):
assert_user_in_org(ctx, client_id, OrgRole.ADMIN)
glossary = await svc.get_glossary(glossary_id)
if not glossary or glossary.client_id != client_id:
raise HTTPException(status_code=404, detail="Glossary not found")
@ -269,7 +267,7 @@ async def archive_glossary(
await audit_svc.audit_logger.log_action(
action=AuditAction.GLOSSARY_ARCHIVE,
description=f"Glossary {glossary_id} archived",
user=ctx.user,
user=current_user,
resource_type="glossary",
resource_id=glossary_id,
)
@ -277,6 +275,13 @@ async def archive_glossary(
# ── Helpers ───────────────────────────────────────────────────────────────────
def _assert_can_read(user: User) -> None:
    """Raise HTTP 403 unless the user's role is one of the read-enabled roles.

    Only ``user.role`` is inspected; nothing about the client is checked here.
    """
    read_roles = {
        UserRole.ADMIN,
        UserRole.PROJECT_MANAGER,
        UserRole.REVIEWER,
        UserRole.LINGUIST,
        UserRole.PRODUCTION,
    }
    if user.role in read_roles:
        return
    raise HTTPException(status_code=403, detail="Insufficient permissions")
def _validate_xlsx(file: UploadFile) -> None:
if file.content_type not in _ALLOWED_CONTENT_TYPES and not (
file.filename and file.filename.endswith(".xlsx")
@ -287,7 +292,7 @@ def _validate_xlsx(file: UploadFile) -> None:
)
def _to_response(g, current_version=None) -> GlossaryResponse:
def _to_response(g) -> GlossaryResponse:
return GlossaryResponse(
id=str(g.id),
client_id=g.client_id,
@ -297,9 +302,6 @@ def _to_response(g, current_version=None) -> GlossaryResponse:
source=g.source,
status=g.status,
current_version_id=g.current_version_id,
current_version_embedding_status=current_version.embedding_status if current_version else None,
current_version_embedded_count=current_version.embedded_count if current_version else None,
current_version_term_count=current_version.term_count if current_version else None,
created_at=g.created_at,
created_by=g.created_by,
)

View file

@ -14,21 +14,16 @@ Protected endpoints:
import hashlib
import re
import secrets
from datetime import UTC, datetime, timedelta
from datetime import datetime, timedelta, timezone
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi import APIRouter, Depends, HTTPException, status
from motor.motor_asyncio import AsyncIOMotorDatabase
from ...core.authz import bump_user_membership_cache
from ...core.database import get_database
from ...core.dependencies import get_current_user
from ...core.security import (
create_access_token,
create_refresh_token,
get_password_hash,
)
from ...models.audit_log import AuditAction
from ...core.security import create_access_token, create_refresh_token, get_password_hash
from ...models.invitation import (
Invitation,
InvitationAcceptRequest,
InvitationCreate,
InvitationPreviewResponse,
@ -36,7 +31,7 @@ from ...models.invitation import (
)
from ...models.organization import OrgRole
from ...models.user import AuthProvider, User, UserRole
from ...services.audit_logger import audit_logger
from ...core.authz import bump_user_membership_cache
from ...services.emailer import email_service
from ...services.membership_service import get_membership, upsert_membership
@ -44,7 +39,7 @@ router = APIRouter(tags=["invitations"])
def _now() -> datetime:
return datetime.now(UTC)
return datetime.now(timezone.utc)
def _hash_token(plaintext: str) -> str:
@ -59,7 +54,7 @@ def _make_token() -> tuple[str, str]:
def _inv_from_doc(doc: dict) -> InvitationResponse:
now = _now()
expires_at = doc["expires_at"].replace(tzinfo=UTC) if doc["expires_at"].tzinfo is None else doc["expires_at"]
expires_at = doc["expires_at"].replace(tzinfo=timezone.utc) if doc["expires_at"].tzinfo is None else doc["expires_at"]
return InvitationResponse(
id=str(doc["_id"]),
email=doc["email"],
@ -105,7 +100,6 @@ org_router = APIRouter(prefix="/organizations", tags=["invitations"])
async def create_invitation(
org_id: str,
body: InvitationCreate,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -127,18 +121,6 @@ async def create_invitation(
detail="A pending invitation already exists for this email. Revoke it first to re-invite.",
)
# MT-19: ensure all target_team_ids belong to this org (client_id == org_id)
if body.target_team_ids:
valid_teams = await db.teams.count_documents({
"_id": {"$in": body.target_team_ids},
"client_id": org_id,
})
if valid_teams != len(body.target_team_ids):
raise HTTPException(
status_code=400,
detail="One or more target_team_ids do not belong to this organization.",
)
plaintext, token_hash = _make_token()
now = _now()
expires_at = now + timedelta(days=body.expires_in_days)
@ -172,17 +154,7 @@ async def create_invitation(
expires_at=expires_at,
)
inv = _inv_from_doc(doc)
await audit_logger.log_action(
action=AuditAction.INVITATION_CREATE,
description=f"Invitation created for '{email_lower}' to organization '{org_id}'",
user=current_user,
request=request,
resource_type="invitation",
resource_id=inv.id,
details={"invited_email": email_lower, "org_id": org_id, "role": body.role_in_org},
)
return inv
return _inv_from_doc(doc)
@org_router.get("/{org_id}/invitations", response_model=list[InvitationResponse])
@ -202,30 +174,16 @@ async def list_invitations(
async def revoke_invitation(
org_id: str,
invitation_id: str,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
await _assert_org_admin(org_id, current_user, db)
inv_doc = await db.invitations.find_one({"_id": invitation_id, "organization_id": org_id})
result = await db.invitations.update_one(
{"_id": invitation_id, "organization_id": org_id, "accepted_at": None, "revoked_at": None},
{"$set": {"revoked_at": _now()}},
)
if result.matched_count == 0:
raise HTTPException(status_code=404, detail="Invitation not found or already accepted/revoked")
await audit_logger.log_action(
action=AuditAction.INVITATION_REVOKE,
description=f"Invitation '{invitation_id}' revoked in organization '{org_id}'",
user=current_user,
request=request,
resource_type="invitation",
resource_id=invitation_id,
details={
"invited_email": inv_doc["email"] if inv_doc else None,
"org_id": org_id,
},
)
# ---------------------------------------------------------------------------
@ -248,7 +206,7 @@ async def preview_invitation(
raise HTTPException(status_code=410, detail="Invitation not found or has expired")
now = _now()
expires_at = doc["expires_at"].replace(tzinfo=UTC) if doc["expires_at"].tzinfo is None else doc["expires_at"]
expires_at = doc["expires_at"].replace(tzinfo=timezone.utc) if doc["expires_at"].tzinfo is None else doc["expires_at"]
if doc.get("revoked_at"):
raise HTTPException(status_code=410, detail="This invitation has been revoked")
@ -297,7 +255,6 @@ async def preview_invitation(
@router.post("/invitations/accept")
async def accept_invitation(
body: InvitationAcceptRequest,
request: Request,
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""Accept an invitation. Creates user if needed, creates membership, returns tokens."""
@ -360,16 +317,12 @@ async def accept_invitation(
await upsert_membership(user_id, org_id, role_in_org, doc["invited_by_user_id"], db)
await bump_user_membership_cache(user_id)
# Auto-add to target teams — write to both Team.member_user_ids (legacy) and Membership.team_ids (MT-17)
# Auto-add to target teams
for team_id in doc.get("target_team_ids", []):
await db.teams.update_one(
{"_id": team_id, "client_id": org_id},
{"$addToSet": {"member_user_ids": user_id}},
)
await db.memberships.update_one(
{"user_id": user_id, "organization_id": org_id},
{"$addToSet": {"team_ids": team_id}},
)
# Send welcome email
if not existing_user.get("_welcomed"):
@ -380,23 +333,12 @@ async def accept_invitation(
org_name=org_name,
)
# Issue JWT tokens with org_ids claim
_inv_org_ids = [m["organization_id"] async for m in db.memberships.find({"user_id": user_id}, {"organization_id": 1})]
access_token = create_access_token(subject=user_id, org_ids=[str(o) for o in _inv_org_ids if o])
# Issue JWT tokens
access_token = create_access_token(subject=user_id)
refresh_token = create_refresh_token(subject=user_id)
org_name, org_slug = await _get_org_name(org_id, db)
await audit_logger.log_action(
action=AuditAction.INVITATION_ACCEPT,
description=f"Invitation accepted by '{email_lower}' for organization '{org_id}'",
user=None,
request=request,
resource_type="invitation",
resource_id=str(doc["_id"]),
details={"invited_email": email_lower, "org_id": org_id},
)
return {
"access_token": access_token,
"refresh_token": refresh_token,

File diff suppressed because it is too large Load diff

View file

@ -1,18 +1,17 @@
"""Per-language QC endpoints — two-stage (linguist + reviewer) assignment, workflow, comments."""
from datetime import datetime
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi import APIRouter, Depends, Query, Request
from motor.motor_asyncio import AsyncIOMotorDatabase
from pydantic import BaseModel, Field
from ...core.database import get_database
from ...core.dependencies import require_roles
from ...models.audit_log import AuditAction
from ...models.job import LanguageQCComment, LanguageQCState
from ...models.user import User, UserRole
from ...services import language_qc as lqc
from ...services.audit_logger import audit_logger
router = APIRouter(tags=["language-qc"])
@ -21,39 +20,38 @@ router = APIRouter(tags=["language-qc"])
class AssignRequest(BaseModel):
linguist_user_id: str
notes: str | None = None
deadline: datetime | None = None
notes: Optional[str] = None
deadline: Optional[datetime] = None
class ReassignRequest(BaseModel):
linguist_user_id: str
notes: str | None = None
deadline: datetime | None = None
notes: Optional[str] = None
deadline: Optional[datetime] = None
class AssignReviewerRequest(BaseModel):
reviewer_user_id: str
notes: str | None = None
deadline: datetime | None = None
notes: Optional[str] = None
deadline: Optional[datetime] = None
class ReassignReviewerRequest(BaseModel):
reviewer_user_id: str
notes: str | None = None
deadline: datetime | None = None
notes: Optional[str] = None
deadline: Optional[datetime] = None
class ApproveLanguageRequest(BaseModel):
notes: str | None = None
notes: Optional[str] = None
class RejectLanguageRequest(BaseModel):
    # `notes` has no default, so a rejection must always carry an explanation.
    notes: str
    # Optional free-form label; expected values:
    # timing | mistranslation | terminology | profanity | length | other
    category: str | None = None
class ReopenLanguageRequest(BaseModel):
notes: str | None = None
notes: Optional[str] = None
class AddCommentRequest(BaseModel):
@ -76,8 +74,8 @@ class QueueItem(BaseModel):
job_status: str
lang: str
lang_qc_status: str
assigned_at: str | None = None
reviewed_at: str | None = None
assigned_at: Optional[str] = None
reviewed_at: Optional[str] = None
class QueueResponse(BaseModel):
@ -85,20 +83,6 @@ class QueueResponse(BaseModel):
total: int
class BulkAssignRequest(BaseModel):
    # Linguist to assign to every targeted language (required).
    linguist_user_id: str
    # Reviewer to assign as well; omitted means no reviewer assignment.
    reviewer_user_id: str | None = None
    # None = all available languages
    languages: list[str] | None = None
    # skip languages that already have an assignment
    only_unassigned: bool = False
    # Deadline forwarded with each assignment, if given.
    deadline: datetime | None = None
class BulkAssignResponse(BaseModel):
    # Languages for which the assignment call succeeded.
    assigned: list[str]
    # Languages passed over (not available on the job, or already assigned
    # when only_unassigned was requested).
    skipped: list[str]
    # Language code -> message of the exception raised while assigning it.
    errors: dict[str, str]
# ── Routes ────────────────────────────────────────────────────────────────────
@router.get("/jobs/{job_id}/language-qc", response_model=LanguageQCMapResponse)
@ -110,8 +94,6 @@ async def get_language_qc(
)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
# Lazy auto-assignment: apply project/job defaults on first open in PENDING_QC
await lqc.auto_assign_defaults(db, job_id)
states = await lqc.get_all_states(db, job_id)
return LanguageQCMapResponse(job_id=job_id, language_qc=states)
@ -125,7 +107,7 @@ async def assign_language(
request: AssignRequest,
http_request: Request,
current_user: User = Depends(require_roles(
UserRole.REVIEWER, UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -133,15 +115,6 @@ async def assign_language(
db, job_id, lang, request.linguist_user_id, current_user,
http_request=http_request, notes=request.notes, deadline=request.deadline,
)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_ASSIGN,
description=f"Language '{lang}' assigned to linguist '{request.linguist_user_id}' for job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang, "linguist_user_id": request.linguist_user_id},
)
return LanguageQCStateResponse(lang=lang, state=state)
@ -160,15 +133,6 @@ async def reassign_language(
db, job_id, lang, request.linguist_user_id, current_user,
http_request=http_request, notes=request.notes, deadline=request.deadline,
)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_REASSIGN,
description=f"Language '{lang}' reassigned to linguist '{request.linguist_user_id}' for job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang, "linguist_user_id": request.linguist_user_id},
)
return LanguageQCStateResponse(lang=lang, state=state)
@ -181,7 +145,7 @@ async def assign_reviewer(
request: AssignReviewerRequest,
http_request: Request,
current_user: User = Depends(require_roles(
UserRole.REVIEWER, UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -189,15 +153,6 @@ async def assign_reviewer(
db, job_id, lang, request.reviewer_user_id, current_user,
http_request=http_request, notes=request.notes, deadline=request.deadline,
)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_REVIEWER_ASSIGN,
description=f"Reviewer '{request.reviewer_user_id}' assigned to language '{lang}' for job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang, "reviewer_user_id": request.reviewer_user_id},
)
return LanguageQCStateResponse(lang=lang, state=state)
@ -208,7 +163,7 @@ async def reassign_reviewer(
request: ReassignReviewerRequest,
http_request: Request,
current_user: User = Depends(require_roles(
UserRole.REVIEWER, UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -216,94 +171,9 @@ async def reassign_reviewer(
db, job_id, lang, request.reviewer_user_id, current_user,
http_request=http_request, notes=request.notes, deadline=request.deadline,
)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_REVIEWER_REASSIGN,
description=f"Reviewer reassigned to '{request.reviewer_user_id}' for language '{lang}', job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang, "reviewer_user_id": request.reviewer_user_id},
)
return LanguageQCStateResponse(lang=lang, state=state)
# ── Bulk assignment ───────────────────────────────────────────────────────────
@router.post("/jobs/{job_id}/languages/bulk-assign", response_model=BulkAssignResponse)
async def bulk_assign_languages(
    job_id: str,
    request: BulkAssignRequest,
    http_request: Request,
    current_user: User = Depends(require_roles(
        UserRole.REVIEWER, UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
    )),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Assign one linguist (and optionally one reviewer) to multiple languages in one call.

    Targets are ``request.languages`` or, when that is empty, every language
    key under the job's ``outputs``. A language is skipped when it has no
    output on the job, or when ``request.only_unassigned`` is set and it
    already has an ``assigned_linguist_id``. Failures are collected per
    language instead of aborting the batch; a reviewer failure is recorded
    under ``"<lang>:reviewer"`` and the language still counts as assigned.

    Raises:
        HTTPException: 404 when the job does not exist.
    """
    job_doc = await db["jobs"].find_one({"_id": job_id})
    if not job_doc:
        raise HTTPException(status_code=404, detail="Job not found")

    available = list((job_doc.get("outputs") or {}).keys())
    qc_by_lang = job_doc.get("language_qc") or {}
    target_langs = request.languages or available

    assigned: list[str] = []
    skipped: list[str] = []
    errors: dict[str, str] = {}

    for lang in target_langs:
        # Languages without outputs on this job cannot be assigned.
        if lang not in available:
            skipped.append(lang)
            continue

        current_state = qc_by_lang.get(lang) or {}
        has_linguist = bool(current_state.get("assigned_linguist_id"))
        if request.only_unassigned and has_linguist:
            skipped.append(lang)
            continue

        try:
            await lqc.assign_linguist(
                db, job_id, lang, request.linguist_user_id, current_user,
                http_request=http_request, deadline=request.deadline,
            )
        except Exception as exc:
            # Best-effort batch: record the failure and move to the next language.
            errors[lang] = str(exc)
        else:
            if request.reviewer_user_id:
                try:
                    await lqc.assign_reviewer(
                        db, job_id, lang, request.reviewer_user_id, current_user,
                        http_request=http_request, deadline=request.deadline,
                    )
                except Exception as exc:
                    errors[f"{lang}:reviewer"] = str(exc)
            # The linguist assignment succeeded even if the reviewer one did not.
            assigned.append(lang)

    audit_details = {
        "languages": assigned,
        "linguist_user_id": request.linguist_user_id,
        "reviewer_user_id": request.reviewer_user_id,
        "skipped": skipped,
        "errors": errors,
    }
    await audit_logger.log_action(
        action=AuditAction.LANGUAGE_QC_BULK_ASSIGN,
        description=f"Bulk assignment for job {job_id}: {len(assigned)} language(s) assigned to linguist '{request.linguist_user_id}'",
        user=current_user,
        request=http_request,
        resource_type="job",
        resource_id=job_id,
        details=audit_details,
    )

    return BulkAssignResponse(assigned=assigned, skipped=skipped, errors=errors)
# ── Workflow transitions ──────────────────────────────────────────────────────
@router.post("/jobs/{job_id}/languages/{lang}/start-work", response_model=LanguageQCStateResponse)
@ -318,15 +188,6 @@ async def start_linguist_work(
):
"""Linguist opens the language — pending → in_progress."""
state = await lqc.start_linguist_work(db, job_id, lang, current_user)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_START_WORK,
description=f"Linguist started work on language '{lang}' for job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang},
)
return LanguageQCStateResponse(lang=lang, state=state)
@ -342,15 +203,6 @@ async def submit_for_review(
):
"""Linguist submits — in_progress → pending_review. Notifies reviewer by email."""
state = await lqc.submit_for_review(db, job_id, lang, current_user, http_request=http_request)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_SUBMIT,
description=f"Language '{lang}' submitted for review for job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang},
)
return LanguageQCStateResponse(lang=lang, state=state)
@ -366,15 +218,6 @@ async def open_review(
):
"""Reviewer opens the review — pending_review → in_review."""
state = await lqc.open_review(db, job_id, lang, current_user, http_request=http_request)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_OPEN_REVIEW,
description=f"Reviewer opened review for language '{lang}', job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang},
)
return LanguageQCStateResponse(lang=lang, state=state)
@ -394,15 +237,6 @@ async def approve_language(
state = await lqc.approve_language(
db, job_id, lang, current_user, http_request=http_request, notes=request.notes,
)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_APPROVE,
description=f"Language '{lang}' approved for job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang, "notes": request.notes},
)
return LanguageQCStateResponse(lang=lang, state=state)
@ -418,50 +252,8 @@ async def reject_language(
db: AsyncIOMotorDatabase = Depends(get_database),
):
state = await lqc.reject_language(
db, job_id, lang, current_user, request.notes, category=request.category, http_request=http_request,
db, job_id, lang, current_user, request.notes, http_request=http_request,
)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_REJECT,
description=f"Language '{lang}' rejected for job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang, "notes": request.notes, "category": request.category},
)
return LanguageQCStateResponse(lang=lang, state=state)
# Request body for the mark-cue-reviewed endpoint.
class MarkCueReviewedRequest(BaseModel):
    # Optional cue total; when provided it is written to the language's
    # ``total_cues`` field alongside the counter increment.
    total_cues: int | None = None  # client sends on first call to set total
@router.post("/jobs/{job_id}/languages/{lang}/mark-cue-reviewed", response_model=LanguageQCStateResponse)
async def mark_cue_reviewed(
    job_id: str,
    lang: str,
    request: MarkCueReviewedRequest,
    http_request: Request,
    current_user: User = Depends(require_roles(UserRole.REVIEWER, UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Increment reviewed_cues counter; optionally set total_cues on first call.

    Adds 1 to ``language_qc.<lang>.reviewed_cues`` on the job, refreshes
    ``updated_at``, and stores ``total_cues`` when the request carries it.
    Returns the language's QC state as re-read from the database.

    Raises:
        HTTPException: 404 when the job does not exist (or disappears before
            the state can be read back).
    """
    # Kept function-local, as in the original (NOTE(review): presumably to
    # avoid an import cycle — confirm before hoisting to module level).
    from ...models.job import LanguageQCState

    inc_op: dict = {f"language_qc.{lang}.reviewed_cues": 1}
    set_op: dict = {"updated_at": datetime.utcnow()}
    if request.total_cues is not None:
        set_op[f"language_qc.{lang}.total_cues"] = request.total_cues

    # Let the update itself prove the job exists: one round trip fewer than a
    # separate lookup, and no window between "check" and "write".
    result = await db.jobs.update_one({"_id": job_id}, {"$inc": inc_op, "$set": set_op})
    if result.matched_count == 0:
        raise HTTPException(status_code=404, detail="Job not found")

    updated_doc = await db.jobs.find_one({"_id": job_id})
    if not updated_doc:
        # The job was removed between the update and the read-back; report it
        # as missing rather than failing on ``None.get``.
        raise HTTPException(status_code=404, detail="Job not found")

    state_dict = (updated_doc.get("language_qc") or {}).get(lang, {})
    # Fall back to a default state if the stored value is not a mapping.
    state = LanguageQCState(**state_dict) if isinstance(state_dict, dict) else LanguageQCState()
    return LanguageQCStateResponse(lang=lang, state=state)
@ -477,15 +269,6 @@ async def reopen_language(
state = await lqc.reopen_language(
db, job_id, lang, current_user, http_request=http_request, notes=request.notes,
)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_REOPEN,
description=f"Language '{lang}' reopened for job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang, "notes": request.notes},
)
return LanguageQCStateResponse(lang=lang, state=state)
@ -506,15 +289,6 @@ async def add_comment(
comment = await lqc.add_comment(
db, job_id, lang, current_user, request.body, http_request=http_request,
)
await audit_logger.log_action(
action=AuditAction.LANGUAGE_QC_COMMENT,
description=f"Comment added to language '{lang}' for job {job_id}",
user=current_user,
request=http_request,
resource_type="job",
resource_id=job_id,
details={"lang": lang, "comment_id": str(comment.id) if hasattr(comment, "id") else None},
)
return comment
@ -539,7 +313,7 @@ async def list_comments(
@router.get("/me/language-qc-queue", response_model=QueueResponse)
async def my_language_qc_queue(
role: str = Query("linguist", description="'linguist' or 'reviewer'"),
qc_status: str | None = Query(None, description="Filter by status"),
qc_status: Optional[str] = Query(None, description="Filter by status"),
skip: int = Query(0, ge=0),
limit: int = Query(50, ge=1, le=200),
current_user: User = Depends(require_roles(
@ -548,17 +322,13 @@ async def my_language_qc_queue(
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""List jobs and languages assigned to the current user as linguist or reviewer."""
# ADMIN sees all orgs; staff scoped to their orgs from JWT claim (MT-18)
org_ids: list[str] | None = None if current_user.role == UserRole.ADMIN else getattr(current_user, "org_ids", None)
if role == "reviewer":
jobs = await lqc.list_for_reviewer(
db, str(current_user.id), accessible_org_ids=org_ids,
status_filter=qc_status, skip=skip, limit=limit,
db, str(current_user.id), status_filter=qc_status, skip=skip, limit=limit,
)
else:
jobs = await lqc.list_for_linguist(
db, str(current_user.id), accessible_org_ids=org_ids,
status_filter=qc_status, skip=skip, limit=limit,
db, str(current_user.id), status_filter=qc_status, skip=skip, limit=limit,
)
items: list[QueueItem] = []

View file

@ -12,25 +12,19 @@ underlying MongoDB collections used by routes_clients.py so both
endpoints coexist without data duplication.
"""
from datetime import UTC, datetime
from datetime import datetime, timezone
from fastapi import APIRouter, Depends, HTTPException, Request
from bson import ObjectId
from fastapi import APIRouter, Depends, HTTPException
from motor.motor_asyncio import AsyncIOMotorDatabase
from pydantic import BaseModel
from ...core.authz import bump_user_membership_cache
from ...core.database import get_database
from ...core.dependencies import get_current_user, require_roles
from ...models.audit_log import AuditAction
from ...models.membership import MemberDetail, MembershipCreate, MembershipUpdate
from ...models.organization import (
Organization,
OrganizationCreate,
OrganizationUpdate,
OrgRole,
)
from ...models.organization import OrgRole, Organization, OrganizationCreate, OrganizationUpdate
from ...models.user import User, UserRole
from ...services.audit_logger import audit_logger
from ...core.authz import bump_user_membership_cache
from ...services.membership_service import (
get_membership,
get_memberships_for_user,
@ -45,7 +39,7 @@ ADMIN_ROLES = [UserRole.ADMIN]
def _now() -> datetime:
return datetime.now(UTC)
return datetime.now(timezone.utc)
# ---------------------------------------------------------------------------
@ -121,7 +115,6 @@ class _OrgCreate(BaseModel):
@router.post("", response_model=Organization, status_code=201)
async def create_organization(
body: OrganizationCreate,
request: Request,
current_user: User = Depends(require_roles(UserRole.ADMIN)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -140,25 +133,13 @@ async def create_organization(
"updated_at": now,
}
await db.clients.insert_one(doc)
org = _org_from_doc(doc)
await audit_logger.log_action(
action=AuditAction.ORG_CREATE,
description=f"Organization '{org.name}' created",
user=current_user,
request=request,
resource_type="organization",
resource_id=str(org.id),
resource_name=org.name,
details={"slug": org.slug},
)
return org
return _org_from_doc(doc)
@router.patch("/{org_id}", response_model=Organization)
async def update_organization(
org_id: str,
body: OrganizationUpdate,
request: Request,
current_user: User = Depends(require_roles(UserRole.ADMIN)),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -175,18 +156,7 @@ async def update_organization(
await db.clients.update_one({"_id": org_id}, {"$set": updates})
updated = {**doc, **updates}
org = _org_from_doc(updated)
await audit_logger.log_action(
action=AuditAction.ORG_UPDATE,
description=f"Organization '{org.name}' updated",
user=current_user,
request=request,
resource_type="organization",
resource_id=str(org.id),
resource_name=org.name,
details={k: v for k, v in updates.items() if k != "updated_at"},
)
return org
return _org_from_doc(updated)
# ---------------------------------------------------------------------------
@ -208,7 +178,6 @@ async def list_members(
async def add_member(
org_id: str,
body: MembershipCreate,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -224,15 +193,6 @@ async def add_member(
members = await list_org_members(org_id, db)
for m in members:
if m.user_id == body.user_id:
await audit_logger.log_action(
action=AuditAction.ORG_MEMBER_ADD,
description=f"Member '{body.user_id}' added to organization '{org_id}' with role '{body.role_in_org}'",
user=current_user,
request=request,
resource_type="organization",
resource_id=org_id,
details={"user_id": body.user_id, "role": body.role_in_org},
)
return m
raise HTTPException(status_code=500, detail="Membership created but could not be retrieved")
@ -242,7 +202,6 @@ async def update_member(
org_id: str,
user_id: str,
body: MembershipUpdate,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -259,15 +218,6 @@ async def update_member(
members = await list_org_members(org_id, db)
for m in members:
if m.user_id == user_id:
await audit_logger.log_action(
action=AuditAction.ORG_MEMBER_UPDATE,
description=f"Member '{user_id}' role updated in organization '{org_id}' to '{body.role_in_org}'",
user=current_user,
request=request,
resource_type="organization",
resource_id=org_id,
details={"user_id": user_id, "role": body.role_in_org},
)
return m
raise HTTPException(status_code=500, detail="Could not retrieve updated membership")
@ -276,7 +226,6 @@ async def update_member(
async def remove_member(
org_id: str,
user_id: str,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncIOMotorDatabase = Depends(get_database),
):
@ -290,15 +239,6 @@ async def remove_member(
await remove_membership(user_id, org_id, db)
await bump_user_membership_cache(user_id)
await audit_logger.log_action(
action=AuditAction.ORG_MEMBER_REMOVE,
description=f"Member '{user_id}' removed from organization '{org_id}'",
user=current_user,
request=request,
resource_type="organization",
resource_id=org_id,
details={"user_id": user_id, "role": existing.role_in_org},
)
# ---------------------------------------------------------------------------

View file

@ -1,14 +1,14 @@
"""API routes for review notes - timestamped notes on video assets during review."""
from datetime import datetime
from typing import Optional
from bson import ObjectId
from fastapi import APIRouter, Depends, HTTPException, Query, status
from motor.motor_asyncio import AsyncIOMotorDatabase
from ...core.authz import MembershipContext, get_job_or_403, get_membership_context
from ...core.database import get_database
from ...core.dependencies import require_roles
from ...core.dependencies import get_current_user, require_roles
from ...core.logging import get_logger
from ...models.user import User, UserRole
from ...schemas.review_note import (
@ -25,13 +25,18 @@ router = APIRouter(prefix="/jobs/{job_id}/review-notes", tags=["review-notes"])
@router.get("", response_model=ReviewNotesListResponse)
async def list_review_notes(
job_id: str,
asset_key: str | None = Query(None, description="Filter notes by asset key"),
asset_key: Optional[str] = Query(None, description="Filter notes by asset key"),
current_user: User = Depends(require_roles(UserRole.REVIEWER, UserRole.LINGUIST, UserRole.PRODUCTION, UserRole.ADMIN)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""List all review notes for a job, optionally filtered by asset key."""
await get_job_or_403(job_id, ctx, db) # org check + existence check
# Verify job exists
job = await db.jobs.find_one({"_id": job_id})
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Job not found"
)
# Build query
query = {"job_id": job_id}
@ -53,11 +58,16 @@ async def create_review_note(
job_id: str,
request: ReviewNoteCreateRequest,
current_user: User = Depends(require_roles(UserRole.REVIEWER, UserRole.LINGUIST, UserRole.PRODUCTION, UserRole.ADMIN)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""Create a new review note for a video asset."""
await get_job_or_403(job_id, ctx, db) # org check + existence check
# Verify job exists
job = await db.jobs.find_one({"_id": job_id})
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Job not found"
)
# Create note document
note_id = str(ObjectId())
@ -86,11 +96,9 @@ async def get_review_note(
job_id: str,
note_id: str,
current_user: User = Depends(require_roles(UserRole.REVIEWER, UserRole.LINGUIST, UserRole.PRODUCTION, UserRole.ADMIN)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""Get a single review note by ID."""
await get_job_or_403(job_id, ctx, db) # org check
note = await db.review_notes.find_one({"_id": note_id, "job_id": job_id})
if not note:
raise HTTPException(
@ -107,11 +115,9 @@ async def update_review_note(
note_id: str,
request: ReviewNoteUpdateRequest,
current_user: User = Depends(require_roles(UserRole.REVIEWER, UserRole.LINGUIST, UserRole.PRODUCTION, UserRole.ADMIN)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""Update a review note. Only the note owner can update."""
await get_job_or_403(job_id, ctx, db) # org check
note = await db.review_notes.find_one({"_id": note_id, "job_id": job_id})
if not note:
raise HTTPException(
@ -145,11 +151,9 @@ async def delete_review_note(
job_id: str,
note_id: str,
current_user: User = Depends(require_roles(UserRole.REVIEWER, UserRole.LINGUIST, UserRole.PRODUCTION, UserRole.ADMIN)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""Delete a review note. Only the note owner can delete."""
await get_job_or_403(job_id, ctx, db) # org check
note = await db.review_notes.find_one({"_id": note_id, "job_id": job_id})
if not note:
raise HTTPException(

View file

@ -1,354 +0,0 @@
"""Share-token endpoints — create/revoke/list tokens + public read-only view + client decision."""
import secrets
from datetime import datetime, timedelta
from typing import Literal
from fastapi import APIRouter, Depends, HTTPException, Request
from motor.motor_asyncio import AsyncIOMotorDatabase
from pydantic import BaseModel
from ...core.config import settings
from ...core.database import get_database
from ...core.dependencies import require_roles
from ...models.audit_log import AuditAction
from ...models.share_token import ShareTokenResponse
from ...models.user import User, UserRole
from ...services.audit_logger import audit_logger
from ...services.gcs import get_signed_download_url
router = APIRouter(tags=["share"])
_TOKENS = "share_tokens"
_JOBS = "jobs"
def _share_url(token: str) -> str:
    """Return the share-page URL for ``token``, rooted at ``settings.app_url``."""
    base_url = settings.app_url
    return f"{base_url}/share/{token}"
# ── Request schemas ───────────────────────────────────────────────────────────
# Request body for creating a share link on a job.
class CreateShareTokenRequest(BaseModel):
    # Lifetime of the link in days, counted from creation time.
    expires_in_days: int | None = 30  # None = no expiry
    # Optional free-text label stored with the token and echoed back in responses.
    label: str | None = None
# Response envelope for the share-token listing endpoint.
class ShareTokenListResponse(BaseModel):
    # One entry per token returned by the listing query.
    tokens: list[ShareTokenResponse]
# Per-language download links exposed through a public share link.
# Every field defaults to None and is only filled when a URL could be produced.
class PublicJobPreviewLanguage(BaseModel):
    # URL for the captions WebVTT file.
    captions_vtt_url: str | None = None
    # URL for the audio-description WebVTT file.
    audio_description_vtt_url: str | None = None
    # URL for the accessible video (MP4).
    accessible_video_mp4_url: str | None = None
    # URL for the audio-description audio track (MP3).
    audio_description_mp3_url: str | None = None
# Read-only job summary returned to anonymous holders of a share link.
class PublicJobPreviewResponse(BaseModel):
    job_id: str
    job_title: str
    job_status: str
    source_language: str
    # Language codes that have outputs on the job.
    languages: list[str]
    # Download links keyed by language code.
    language_outputs: dict[str, PublicJobPreviewLanguage]
# Request body for a client's approve/reject decision submitted via a share link.
class ClientDecisionRequest(BaseModel):
    # The decision; only these two literal values are accepted.
    action: Literal["approve", "reject"]
    # Optional free-text notes accompanying the decision.
    notes: str | None = None
    # Optional self-reported name of the person deciding (the route is unauthenticated).
    client_name: str | None = None
# Response returned after a client decision has been recorded.
class ClientDecisionResponse(BaseModel):
    # Outcome marker for the request.
    status: str
    # Job status after the decision was applied.
    new_job_status: str
# ── Authenticated routes ──────────────────────────────────────────────────────
@router.post("/jobs/{job_id}/share", response_model=ShareTokenResponse, status_code=201)
async def create_share_token(
    job_id: str,
    request: CreateShareTokenRequest,
    http_request: Request,
    current_user: User = Depends(require_roles(
        UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
    )),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Generate a read-only share link for a job.

    Stores a new random token document for the job, writes an audit entry,
    and returns the token together with its public share URL.

    Raises:
        HTTPException: 404 when the job does not exist; 422 when
            ``expires_in_days`` is given but is not a positive number of days.
    """
    job_doc = await db[_JOBS].find_one({"_id": job_id})
    if not job_doc:
        raise HTTPException(status_code=404, detail="Job not found")

    # Only ``None`` means "no expiry". A truthiness test would also treat 0 as
    # "no expiry" and hand out a link that never expires, and a negative value
    # would create a link that is already expired — reject both explicitly.
    lifetime_days = request.expires_in_days
    if lifetime_days is not None and lifetime_days < 1:
        raise HTTPException(
            status_code=422,
            detail="expires_in_days must be a positive integer, or null for no expiry",
        )

    # 32 random bytes, hex-encoded; the token doubles as the document _id.
    token_id = secrets.token_hex(32)
    now = datetime.utcnow()
    expires_at = now + timedelta(days=lifetime_days) if lifetime_days is not None else None

    token_doc = {
        "_id": token_id,
        "job_id": job_id,
        "organization_id": job_doc.get("organization_id", ""),
        "created_by_user_id": str(current_user.id),
        "created_by_email": current_user.email,
        "created_at": now,
        "expires_at": expires_at,
        "is_active": True,
        "label": request.label,
    }
    await db[_TOKENS].insert_one(token_doc)

    await audit_logger.log_action(
        action=AuditAction.SHARE_TOKEN_CREATE,
        description=f"Share token created for job '{job_id}'",
        user=current_user,
        request=http_request,
        resource_type="job",
        resource_id=job_id,
        details={"token_id": token_id, "label": request.label, "expires_in_days": request.expires_in_days},
    )

    return ShareTokenResponse(
        id=token_id,
        job_id=job_id,
        created_by_email=current_user.email,
        created_at=now,
        expires_at=expires_at,
        is_active=True,
        label=request.label,
        share_url=_share_url(token_id),
    )
@router.get("/jobs/{job_id}/share", response_model=ShareTokenListResponse)
async def list_share_tokens(
    job_id: str,
    current_user: User = Depends(require_roles(
        UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
    )),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """List all active share tokens for a job.

    "Active" means ``is_active`` is true; the query does not filter on
    ``expires_at``.

    Raises:
        HTTPException: 404 when the job does not exist.
    """
    if not await db[_JOBS].find_one({"_id": job_id}):
        raise HTTPException(status_code=404, detail="Job not found")

    active_tokens = db[_TOKENS].find({"job_id": job_id, "is_active": True})
    listing = [
        ShareTokenResponse(
            id=token_doc["_id"],
            job_id=token_doc["job_id"],
            created_by_email=token_doc["created_by_email"],
            created_at=token_doc["created_at"],
            expires_at=token_doc.get("expires_at"),
            is_active=token_doc["is_active"],
            label=token_doc.get("label"),
            share_url=_share_url(token_doc["_id"]),
        )
        async for token_doc in active_tokens
    ]
    return ShareTokenListResponse(tokens=listing)
@router.delete("/jobs/{job_id}/share/{token_id}", status_code=204)
async def revoke_share_token(
    job_id: str,
    token_id: str,
    http_request: Request,
    current_user: User = Depends(require_roles(
        UserRole.PROJECT_MANAGER, UserRole.PRODUCTION, UserRole.ADMIN,
    )),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Revoke (deactivate) a share token.

    The token document is kept and flagged ``is_active = False``; the
    revocation is then written to the audit log.

    Raises:
        HTTPException: 404 when no token with this id belongs to the job.
    """
    token_filter = {"_id": token_id, "job_id": job_id}
    deactivate = {"$set": {"is_active": False}}
    outcome = await db[_TOKENS].update_one(token_filter, deactivate)
    if outcome.matched_count == 0:
        raise HTTPException(status_code=404, detail="Token not found")

    await audit_logger.log_action(
        action=AuditAction.SHARE_TOKEN_REVOKE,
        description=f"Share token '{token_id}' revoked for job '{job_id}'",
        user=current_user,
        request=http_request,
        resource_type="job",
        resource_id=job_id,
        details={"token_id": token_id},
    )
# ── Public route (no auth) ────────────────────────────────────────────────────
@router.get("/public/share/{token}", response_model=PublicJobPreviewResponse)
async def get_public_job_preview(
    token: str,
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Return read-only job preview for a valid share token. No authentication required.

    Looks up the active token, checks its expiry, loads the job, and builds a
    download URL for each known asset of every language output. URL
    generation is best-effort: an asset whose URL cannot be produced is left
    as ``None`` instead of failing the whole preview.

    Raises:
        HTTPException: 404 when the token is unknown/revoked or the job is
            gone; 410 when the token has expired.
    """
    token_doc = await db[_TOKENS].find_one({"_id": token, "is_active": True})
    if not token_doc:
        raise HTTPException(status_code=404, detail="Share link not found or has been revoked")

    expires_at = token_doc.get("expires_at")
    if expires_at and expires_at < datetime.utcnow():
        raise HTTPException(status_code=410, detail="Share link has expired")

    job_doc = await db[_JOBS].find_one({"_id": token_doc["job_id"]})
    if not job_doc:
        raise HTTPException(status_code=404, detail="Job not found")

    # (key in the stored language output, attribute on the response model)
    asset_fields = (
        ("captions_vtt_gcs", "captions_vtt_url"),
        ("ad_vtt_gcs", "audio_description_vtt_url"),
        ("ad_mp3_gcs", "audio_description_mp3_url"),
        ("accessible_video_gcs", "accessible_video_mp4_url"),
    )
    bucket_prefix = f"gs://{settings.gcs_bucket}/"

    outputs = job_doc.get("outputs") or {}
    language_outputs: dict[str, PublicJobPreviewLanguage] = {}

    for lang, lang_output in outputs.items():
        if not isinstance(lang_output, dict):
            continue

        lang_data = PublicJobPreviewLanguage()
        for gcs_key, url_attr in asset_fields:
            gcs_uri = lang_output.get(gcs_key)
            # A key that is present but empty/None has nothing to sign; skip it
            # rather than crash on ``None.replace`` and fail the whole preview.
            if not gcs_uri or not isinstance(gcs_uri, str):
                continue
            blob_path = gcs_uri.replace(bucket_prefix, "")
            try:
                # NOTE(review): the literal 6 is presumably the URL lifetime in
                # hours — confirm against get_signed_download_url.
                signed_url = await get_signed_download_url(blob_path, 6)
            except Exception:
                # Deliberate best-effort: one unavailable asset must not hide
                # the rest of the preview from the client.
                continue
            setattr(lang_data, url_attr, signed_url)

        language_outputs[lang] = lang_data

    # ``source`` may be stored as null; treat that like a missing section.
    source = job_doc.get("source") or {}
    return PublicJobPreviewResponse(
        job_id=str(job_doc["_id"]),
        job_title=job_doc.get("title", "Untitled"),
        job_status=job_doc.get("status", ""),
        source_language=source.get("language", "en"),
        languages=list(outputs.keys()),
        language_outputs=language_outputs,
    )
@router.post("/public/share/{token}/decision", response_model=ClientDecisionResponse)
async def client_decision(
    token: str,
    request: ClientDecisionRequest,
    http_request: Request,
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Submit client approval or rejection via a share link. No authentication required.

    Approval validates the job's assets and moves the job to ``completed``;
    rejection moves it to ``qc_feedback`` and records the client label as the
    reviewer. Both append an entry to ``review.history``. The status change
    only applies while the job is still in ``pending_final_review``.

    Raises:
        HTTPException: 404 for an unknown/revoked token or missing job; 410
            for an expired token; 409 when the job is not (or is no longer)
            awaiting client review; 400 when asset validation fails on approve.
    """
    from ...services.validation import asset_validation_service

    token_doc = await db[_TOKENS].find_one({"_id": token, "is_active": True})
    if not token_doc:
        raise HTTPException(status_code=404, detail="Share link not found or has been revoked")

    expiry = token_doc.get("expires_at")
    if expiry and expiry < datetime.utcnow():
        raise HTTPException(status_code=410, detail="Share link has expired")

    job_id = token_doc["job_id"]
    job_doc = await db[_JOBS].find_one({"_id": job_id})
    if not job_doc:
        raise HTTPException(status_code=404, detail="Job not found")

    if job_doc.get("status") != "pending_final_review":
        raise HTTPException(
            status_code=409,
            detail="This job is not currently awaiting client review"
        )

    now = datetime.utcnow()
    # Attribution recorded in the job history: self-reported name plus a short token prefix.
    by_label = f"client:{request.client_name or 'anonymous'} (share/{token[:8]})"
    approving = request.action == "approve"

    if approving:
        is_valid, validation_errors = await asset_validation_service.validate_job_assets(job_doc)
        if not is_valid:
            raise HTTPException(
                status_code=400,
                detail=f"Asset validation failed: {'; '.join(validation_errors)}"
            )

    new_status = "completed" if approving else "qc_feedback"
    decision_notes = request.notes or ""

    set_fields = {"status": new_status, "review.notes": decision_notes}
    if not approving:
        # Only a rejection records who sent the job back.
        set_fields["review.reviewer_id"] = by_label
    set_fields["updated_at"] = now

    history_entry = {
        "at": now,
        "status": new_status,
        "by": by_label,
        "notes": decision_notes,
    }
    update = {"$set": set_fields, "$push": {"review.history": history_entry}}

    # Re-assert the expected status in the filter so a concurrent status
    # change makes this update match nothing instead of overwriting it.
    result = await db[_JOBS].find_one_and_update(
        {"_id": job_id, "status": "pending_final_review"},
        update,
        return_document=True,
    )
    if not result:
        raise HTTPException(
            status_code=409,
            detail="Decision could not be submitted — the job status may have changed"
        )

    audit_details = {
        "action": request.action,
        "token": token,
        "client_name": request.client_name,
        "new_status": new_status,
        "notes": request.notes,
    }
    await audit_logger.log_action(
        action=AuditAction.SHARE_CLIENT_DECISION,
        description=f"Client '{request.client_name or 'anonymous'}' submitted decision '{request.action}' for job '{job_id}' via share token",
        user=None,
        request=http_request,
        resource_type="job",
        resource_id=job_id,
        details=audit_details,
    )

    if approving:
        # Notification is best-effort; a failure to enqueue it must not undo
        # or fail the decision that was already persisted.
        try:
            from ...tasks.notify import notify_client_task
            notify_client_task.delay(job_id)
        except Exception:
            pass

    return ClientDecisionResponse(status="ok", new_job_status=new_status)

View file

@ -1,18 +1,18 @@
import asyncio
import time
from typing import Literal
from typing import Literal, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import Response
from pydantic import BaseModel, Field
from ...core.config import settings
from ...core.dependencies import get_current_user
from ...core.logging import get_logger
from ...services import cost_tracker
from ...services.elevenlabs_voices import elevenlabs_voice_service
from ...services.gemini_tts import gemini_tts_service
from ...services.elevenlabs_voices import elevenlabs_voice_service
from ...services.tts import tts_service
from ...services import cost_tracker
from ...core.dependencies import get_current_user
logger = get_logger(__name__)
@ -30,20 +30,20 @@ class VoicePreviewRequest(BaseModel):
style_preset: Literal[
"neutral", "calm", "energetic", "professional", "warm", "documentary", "custom"
] = "neutral"
custom_style_prompt: str | None = None
custom_style_prompt: Optional[str] = None
# ElevenLabs-specific
stability: float | None = Field(default=None, ge=0.0, le=1.0)
similarity_boost: float | None = Field(default=None, ge=0.0, le=1.0)
stability: Optional[float] = Field(default=None, ge=0.0, le=1.0)
similarity_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0)
class VoiceInfo(BaseModel):
"""Structured voice information for any provider."""
id: str
name: str
description: str | None = None
preview_url: str | None = None
labels: dict[str, str] | None = None
category: str | None = None
description: Optional[str] = None
preview_url: Optional[str] = None
labels: Optional[dict[str, str]] = None
category: Optional[str] = None
class ProviderVoicesResponse(BaseModel):
@ -52,7 +52,7 @@ class ProviderVoicesResponse(BaseModel):
voices: list[VoiceInfo]
default: str
available: bool = True
error: str | None = None
error: Optional[str] = None
class LanguagesResponse(BaseModel):
@ -87,12 +87,12 @@ class ProviderOptionsResponse(BaseModel):
"""Available TTS configuration options for a provider."""
provider: str
# Gemini-specific
models: list[TTSOptionItem] | None = None
style_presets: list[TTSOptionItem] | None = None
speed_range: SpeedRange | None = None
models: Optional[list[TTSOptionItem]] = None
style_presets: Optional[list[TTSOptionItem]] = None
speed_range: Optional[SpeedRange] = None
# ElevenLabs-specific
stability_range: FloatRange | None = None
similarity_boost_range: FloatRange | None = None
stability_range: Optional[FloatRange] = None
similarity_boost_range: Optional[FloatRange] = None
@router.get("/voices", response_model=ProviderVoicesResponse)

View file

@ -3,21 +3,15 @@
from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
from motor.motor_asyncio import AsyncIOMotorDatabase
from ...core.authz import MembershipContext, get_job_or_403, get_membership_context
from ...core.config import settings
from ...core.database import get_database
from ...core.dependencies import require_roles
from ...models.audit_log import AuditAction
from ...models.user import User, UserRole
from ...models.vtt_version import (
VttDiffResponse,
VttKind,
VttVersionListResponse,
VttVersionSummary,
)
from ...models.vtt_version import VttDiffResponse, VttKind, VttVersionListResponse, VttVersionSummary
from ...services import vtt_versioning
from ...services.audit_logger import audit_logger
from ...models.audit_log import AuditAction
from ...services.gcs import gcs_service
from ...core.config import settings
router = APIRouter(prefix="/jobs", tags=["vtt-versions"])
@ -32,11 +26,9 @@ async def list_vtt_versions(
skip: int = Query(0, ge=0),
limit: int = Query(50, ge=1, le=200),
current_user: User = Depends(require_roles(*_EDITABLE_ROLES)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""List all VTT versions for a job/lang/kind, newest first."""
await get_job_or_403(job_id, ctx, db) # org check
return await vtt_versioning.list_versions(db, job_id, lang, kind, skip, limit)
@ -47,11 +39,9 @@ async def get_vtt_version(
lang: str = Query(...),
kind: VttKind = Query(...),
current_user: User = Depends(require_roles(*_EDITABLE_ROLES)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""Get full VTT content for a specific version."""
await get_job_or_403(job_id, ctx, db) # org check
v = await vtt_versioning.get_version(db, job_id, lang, kind, version)
if not v:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Version not found")
@ -79,11 +69,9 @@ async def diff_vtt_versions(
from_version: int = Query(..., alias="from"),
to_version: int = Query(..., alias="to"),
current_user: User = Depends(require_roles(*_EDITABLE_ROLES)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""Line-level diff between two versions of a VTT file."""
await get_job_or_403(job_id, ctx, db) # org check
v_from = await vtt_versioning.get_version(db, job_id, lang, kind, from_version)
v_to = await vtt_versioning.get_version(db, job_id, lang, kind, to_version)
if not v_from:
@ -105,7 +93,6 @@ async def restore_vtt_version(
kind: VttKind = Query(...),
http_request: Request = None,
current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
ctx: MembershipContext = Depends(get_membership_context),
db: AsyncIOMotorDatabase = Depends(get_database),
):
"""
@ -113,7 +100,6 @@ async def restore_vtt_version(
Non-destructive: creates a new version entry whose content mirrors the old one,
then overwrites the live GCS file.
"""
await get_job_or_403(job_id, ctx, db) # org check
src = await vtt_versioning.get_version(db, job_id, lang, kind, version)
if not src:
raise HTTPException(status_code=404, detail="Version not found")
@ -129,7 +115,7 @@ async def restore_vtt_version(
raise HTTPException(
status_code=500,
detail=f"Version snapshot created (v{new_ver.version}) but live file update failed: {exc}",
) from None
)
# Update the GCS URI pointer in the job document
gcs_uri_key = "captions_vtt_gcs" if kind == "captions" else "ad_vtt_gcs"

View file

@ -5,146 +5,107 @@ Provides WebSocket endpoints for:
1. Individual job status updates: /ws/jobs/{job_id}
2. Job list updates: /ws/jobs (all jobs for authenticated user)
"""
import asyncio
import logging
from typing import Optional
from fastapi import (
APIRouter,
Depends,
Query,
WebSocket,
WebSocketDisconnect,
)
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, HTTPException, Depends, Query
from fastapi.security import HTTPBearer
from ...core.authz import PLATFORM_ADMIN_ROLES, _cached_memberships
from ...core.database import get_database
from ...models.user import UserRole
from ...services.websocket import (
ConnectionManager,
authenticate_websocket,
connection_manager,
authenticate_websocket,
get_connection_manager,
ConnectionManager
)
from ...models.job import Job
from ...core.database import get_database
from ...core.dependencies import get_current_user
logger = logging.getLogger(__name__)
router = APIRouter(tags=["WebSocket"])
security = HTTPBearer()
# Close codes that indicate a permanent auth/permission failure — frontend must NOT retry
_TERMINAL_CLOSE_CODES = {4001, 4003, 4004, 4403}
# Seconds between server-side keepalive frames.
# Must be < Apache mod_proxy_wstunnel idle timeout.
# Mod Comms incident 2026-03-18: 25s was insufficient; 20s is safe.
_KEEPALIVE_INTERVAL_S = 20
async def _resolve_user_and_org(websocket: WebSocket, user_id: str, db):
    """
    Fetch the user document and resolve the user's org memberships.

    The user is looked up by the raw string id first; if that finds nothing,
    the id is converted to a BSON ObjectId and the lookup is retried.

    Returns:
        (user_doc, memberships). memberships is None for platform admins,
        which callers treat as unrestricted; otherwise it is the value
        returned by _cached_memberships.
        (None, None) when no user matches; the socket has then already been
        closed with code 4001.

    Raises:
        ValueError: if the stored role is not a valid UserRole value
            (raised by the UserRole(...) conversion).
    """
    users = db["users"]
    user = await users.find_one({"_id": user_id})
    if not user:
        from bson import ObjectId
        from bson.errors import InvalidId

        try:
            object_id = ObjectId(user_id)
        except (InvalidId, TypeError):
            # Not a valid ObjectId, so there is no alternative key to try.
            object_id = None
        if object_id is not None:
            # Deliberately outside the try: a database failure must not be
            # reported to the client as "User not found".
            user = await users.find_one({"_id": object_id})
    if not user:
        await websocket.close(code=4001, reason="User not found")
        return None, None

    is_platform_admin = UserRole(user.get("role", "")) in PLATFORM_ADMIN_ROLES
    if is_platform_admin:
        return user, None  # None memberships = unrestricted

    memberships = await _cached_memberships(user_id, db)
    return user, memberships
def _can_access_org(org_id: str | None, memberships: dict | None) -> bool:
"""Return True if user (with these memberships) may access the given org_id."""
if memberships is None:
return True # platform admin
if not org_id:
return True # legacy job without org: allow (further checks done below if needed)
return org_id in memberships
@router.websocket("/ws/jobs/{job_id}")
async def websocket_job_status(
websocket: WebSocket,
job_id: str,
token: str | None = Query(None),
token: Optional[str] = Query(None),
manager: ConnectionManager = Depends(get_connection_manager)
):
"""
WebSocket endpoint for real-time job status updates.
WebSocket endpoint for real-time job status updates
Usage:
- Connect: ws://localhost:8000/api/v1/ws/jobs/{job_id}?token={jwt_token}
- Receives: Real-time status updates for the specific job
Close codes:
4001 user not found
4003 role-based access denied
4004 job not found
4403 org membership access denied (do not retry)
Message format:
{
"type": "job_status_update",
"data": {
"job_id": "...",
"status": "processing",
"updated_at": "2023-...",
"message": "Processing video...",
"progress": 45
}
}
"""
# Authenticate the WebSocket connection
user_id = await authenticate_websocket(websocket, token)
if not user_id:
return
try:
# Verify user has access to this job
db = await get_database()
job = await db["jobs"].find_one({"_id": job_id})
jobs_collection = db["jobs"]
job = await jobs_collection.find_one({"_id": job_id})
if not job:
await websocket.close(code=4004, reason="Job not found")
return
user, memberships = await _resolve_user_and_org(websocket, user_id, db)
if user is None:
return # socket already closed inside helper
# Role-based client restriction
# Check permissions - users can only access their own jobs unless they're admin/reviewer
user = await db["users"].find_one({"_id": user_id})
if not user:
try:
from bson import ObjectId
user = await db["users"].find_one({"_id": ObjectId(user_id)})
except Exception:
pass # Invalid ObjectId format
if not user:
await websocket.close(code=4001, reason="User not found")
return
# Check access permissions
if user["role"] == "client" and job.get("created_by") != user_id:
await websocket.close(code=4003, reason="Access denied")
return
# Org membership check
job_org = job.get("organization_id")
if not _can_access_org(job_org, memberships):
await websocket.close(code=4403, reason="Org access denied")
return
# Connect to job status updates
await manager.connect_job_status(websocket, user_id, job_id)
# Keep connection alive and handle incoming messages
while True:
try:
# Wait up to _KEEPALIVE_INTERVAL_S for a client message.
# On timeout send a keepalive frame so the proxy idle timer resets.
message = await asyncio.wait_for(
websocket.receive_text(),
timeout=_KEEPALIVE_INTERVAL_S,
)
# Wait for incoming WebSocket messages (for heartbeat, etc.)
message = await websocket.receive_text()
logger.debug(f"Received WebSocket message from user {user_id}: {message}")
# Handle heartbeat or other client messages if needed
if message == "ping":
await websocket.send_text("pong")
except TimeoutError:
await websocket.send_text("keepalive")
except WebSocketDisconnect:
break
except Exception as e:
logger.error(f"Error in WebSocket message handling: {e}")
break
except WebSocketDisconnect:
pass
except Exception as e:
@ -156,54 +117,75 @@ async def websocket_job_status(
@router.websocket("/ws/jobs")
async def websocket_job_list(
websocket: WebSocket,
token: str | None = Query(None),
token: Optional[str] = Query(None),
manager: ConnectionManager = Depends(get_connection_manager)
):
"""
WebSocket endpoint for real-time job list updates.
WebSocket endpoint for real-time job list updates
Usage:
- Connect: ws://localhost:8000/api/v1/ws/jobs?token={jwt_token}
- Receives: Real-time status updates for all jobs the user can access
Only events for jobs in the user's accessible orgs are delivered.
Message format:
{
"type": "job_list_update",
"data": {
"job_id": "...",
"status": "processing",
"updated_at": "2023-...",
"message": "Processing video...",
"progress": 45
}
}
"""
# Authenticate the WebSocket connection
user_id = await authenticate_websocket(websocket, token)
if not user_id:
return
try:
# Verify user exists
logger.info(f"WebSocket: Looking up user {user_id} in database")
db = await get_database()
user, memberships = await _resolve_user_and_org(websocket, user_id, db)
if user is None:
return # socket already closed inside helper
# Try looking up user by string ID first, then by ObjectId
user = await db["users"].find_one({"_id": user_id})
if not user:
try:
from bson import ObjectId
user = await db["users"].find_one({"_id": ObjectId(user_id)})
except Exception:
pass # Invalid ObjectId format
if not user:
logger.warning(f"WebSocket: User {user_id} not found in database (tried both string and ObjectId)")
await websocket.close(code=4001, reason="User not found")
return
logger.info(f"WebSocket: User {user_id} found, role: {user.get('role', 'unknown')}")
accessible_org_ids = None if memberships is None else list(memberships.keys())
await manager.connect_job_list(websocket, user_id, accessible_org_ids=accessible_org_ids)
logger.info(f"WebSocket: User {user_id} found, connecting to job list updates")
# Connect to job list updates
await manager.connect_job_list(websocket, user_id)
# Keep connection alive and handle incoming messages
while True:
try:
message = await asyncio.wait_for(
websocket.receive_text(),
timeout=_KEEPALIVE_INTERVAL_S,
)
# Wait for incoming WebSocket messages
message = await websocket.receive_text()
logger.debug(f"Received WebSocket message from user {user_id}: {message}")
# Handle heartbeat or other client messages if needed
if message == "ping":
await websocket.send_text("pong")
except TimeoutError:
await websocket.send_text("keepalive")
except WebSocketDisconnect:
break
except Exception as e:
logger.error(f"Error in WebSocket message handling: {e}")
break
except WebSocketDisconnect:
pass
except Exception as e:
@ -214,15 +196,19 @@ async def websocket_job_list(
@router.get("/ws/status")
async def websocket_status():
"""Get WebSocket connection status and statistics (debug/monitoring)."""
"""
Get WebSocket connection status and statistics
Useful for debugging and monitoring
"""
stats = {
"active_connections": len(connection_manager.active_connections),
"job_subscriptions": len(connection_manager.job_subscriptions),
"global_subscriptions": len(connection_manager.global_subscriptions),
"redis_connected": connection_manager.redis_client is not None,
"subscriber_running": (
connection_manager.subscriber_task is not None and
connection_manager.subscriber_task is not None and
not connection_manager.subscriber_task.done()
)
}
return stats
return stats

View file

@ -11,6 +11,7 @@ Provides:
import json
from dataclasses import dataclass
from typing import Optional
from fastapi import Depends, HTTPException, status
from motor.motor_asyncio import AsyncIOMotorDatabase
@ -65,7 +66,7 @@ async def _cached_memberships(
"""Load memberships, with Redis cache (60s TTL)."""
cache_key = f"mem:user:{user_id}"
try:
redis = await get_redis()
redis = get_redis()
if redis:
cached = await redis.get(cache_key)
if cached:
@ -77,7 +78,7 @@ async def _cached_memberships(
memberships = await _load_memberships(user_id, db)
try:
redis = await get_redis()
redis = get_redis()
if redis:
await redis.setex(
cache_key,
@ -158,7 +159,7 @@ class OrgScopedQuery:
def filter(
self,
base_query: dict,
org_id: str | None = None,
org_id: Optional[str] = None,
org_field: str = "organization_id",
) -> dict:
if self.ctx.is_platform_admin:
@ -182,50 +183,6 @@ class OrgScopedQuery:
return {**base_query, org_field: {"$in": accessible}}
def assert_user_in_org(
    ctx: "MembershipContext",
    org_id: str,
    min_role: OrgRole = OrgRole.VIEWER,
) -> None:
    """Guard: require that the ctx user holds at least min_role in org_id.

    The decision is delegated to ctx.can_access_org(org_id, min_role)
    (platform admins are expected to pass there — confirm in
    MembershipContext).

    Raises:
        HTTPException: 403 when access is not permitted.
    """
    if ctx.can_access_org(org_id, min_role):
        return
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Access to this organization is not permitted",
    )
async def get_job_or_403(
    job_id: str,
    ctx: "MembershipContext",
    db: AsyncIOMotorDatabase,
) -> dict:
    """Fetch a job document and enforce org-level access for the ctx user.

    The owning organization is taken from the job's organization_id, or,
    for legacy jobs without one, from the client_id of the job's project.
    Jobs with neither are accessible only to platform admins and to the
    user whose id equals the job's client_id.

    Returns:
        The job document.

    Raises:
        HTTPException: 404 when the job does not exist, and also when access
            is denied (404 rather than 403, so the existence of cross-org
            jobs is not leaked).
    """
    job_doc = await db.jobs.find_one({"_id": job_id})
    if not job_doc:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")

    org_id = job_doc.get("organization_id")
    if not org_id and (project_id := job_doc.get("project_id")):
        # Legacy job without org: resolve it through the project.
        project = await db.projects.find_one({"_id": project_id}, {"client_id": 1})
        if project:
            org_id = project.get("client_id")

    if org_id:
        allowed = ctx.can_access_org(org_id)
    else:
        # Truly legacy job (no org resolvable): original uploader or admin only.
        allowed = ctx.is_platform_admin or job_doc.get("client_id") == str(ctx.user.id)

    if not allowed:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
    return job_doc
async def bump_user_membership_cache(user_id: str) -> None:
"""Invalidate the Redis membership cache for a user (call on any membership write)."""
try:

View file

@ -6,7 +6,6 @@ class Settings(BaseSettings):
# App
app_env: str = "dev"
api_base_url: str = "http://localhost:8000"
app_url: str = "https://optical-dev.oliver.solutions/video-accessibility"
# Auth
jwt_secret: str
@ -23,14 +22,13 @@ class Settings(BaseSettings):
# Redis
redis_url: str
# Celery
celery_broker_url: str = ""
celery_result_backend: str = ""
# GCP
gcp_project_id: str
gcp_location: str = "us-central1"
gcs_bucket: str = "accessible-video"
google_application_credentials: str = ""
@ -38,7 +36,7 @@ class Settings(BaseSettings):
gemini_api_key: str
elevenlabs_api_key: str = ""
google_tts_credentials: str = ""
# TTS Voice Configuration
tts_provider: str = "gemini" # "gemini", "google", or "elevenlabs"
google_tts_voices: dict[str, str] = {
@ -52,7 +50,7 @@ class Settings(BaseSettings):
elevenlabs_voices: dict[str, str] = {}
# Gemini TTS Configuration
gemini_tts_model: str = "gemini-3.1-flash-tts-preview"
gemini_tts_model: str = "gemini-2.5-flash-preview-tts"
gemini_tts_default_voice: str = "Kore"
gemini_tts_voices: list[str] = [
"Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
@ -223,8 +221,8 @@ class Settings(BaseSettings):
# Gemini TTS Model Options
gemini_tts_models: dict[str, str] = {
"flash": "gemini-3.1-flash-tts-preview", # Fast, cost-efficient (Preview)
"pro": "gemini-2.5-pro-tts", # Higher quality (GA)
"flash": "gemini-2.5-flash-preview-tts", # Fast, cost-efficient
"pro": "gemini-2.5-pro-preview-tts", # Higher quality
}
# Gemini TTS Style Presets - prompts prepended to text for style control
@ -249,14 +247,6 @@ class Settings(BaseSettings):
whisper_sentence_gap_threshold: float = 0.5 # Gap duration to classify as sentence boundary
whisper_phrase_gap_threshold: float = 0.3 # Gap duration to classify as phrase boundary
whisper_min_gap_threshold: float = 0.15 # Minimum gap duration to consider
# Forward-preferred snap windows (A2)
whisper_snap_forward_window: float = 4.0 # Prefer boundary up to N seconds ahead of Gemini point
whisper_snap_backward_window: float = 1.5 # Fall back to boundary up to N seconds behind
# Adaptive silence buffer (A1)
ad_silence_buffer_default: float = 0.5 # Base silence duration (s) before/after AD audio
ad_silence_buffer_min_after: float = 0.1 # Minimum silence after AD audio
# Minimum gap required at the chosen pause point (A3)
ad_min_acceptable_gap: float = 0.2 # Seconds; points with shorter gaps trigger forward search
# Cloud Run Service URLs (empty = use local processing)
# When set, CPU-intensive work is offloaded to Cloud Run with autoscaling
@ -275,10 +265,11 @@ class Settings(BaseSettings):
ffmpeg_worker_concurrency: int = 4 # FFmpeg tasks on main worker
tts_worker_concurrency: int = 8 # TTS worker
# Email (Mailgun)
# Email (Mailgun — primary; sendgrid_api_key kept for backward compat)
mailgun_api_key: str = ""
mailgun_domain: str = "mg.oliver.solutions"
mailgun_from: str = "noreply@mg.oliver.solutions"
sendgrid_api_key: str = ""
email_from: str = "noreply@mg.oliver.solutions"
client_base_url: str
@ -297,10 +288,6 @@ class Settings(BaseSettings):
cost_tracker_source_app: str = "video-accessibility"
cost_tracker_enabled: bool = True
# Upload limits (T-14 — single source of truth)
upload_max_video_bytes: int = 2 * 1024 * 1024 * 1024 # 2GB
upload_signed_url_ttl_hours: int = 24 # signed URL lifetime
# CORS - comma-separated list of allowed origins
cors_origins: str = "http://localhost:5173,http://localhost:5174,http://localhost:3000,http://localhost:6001"

View file

@ -56,7 +56,7 @@ async def create_indexes():
await db.audit_logs.create_index([("resource_type", 1), ("resource_id", 1)]) # Resource tracking
await db.audit_logs.create_index([("ip_address", 1), ("timestamp", -1)]) # IP-based analysis
await db.audit_logs.create_index([("success", 1), ("timestamp", -1)]) # Failed operations
# Text search index for description and details
await db.audit_logs.create_index([
("description", "text"),

View file

@ -1,9 +1,11 @@
from typing import Optional
from fastapi import Depends, HTTPException, Request, status
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from motor.motor_asyncio import AsyncIOMotorDatabase
from ..models.user import User, UserRole
from .config import settings
from .database import get_database
from .security import decode_token
@ -41,12 +43,7 @@ async def get_current_user(
detail="User not found",
)
user = User(**user_doc)
# Attach org_ids hint from token as transient attribute (never used for authz)
token_org_ids = payload.get("org_ids", [])
if token_org_ids:
user.__dict__["org_ids"] = token_org_ids
return user
return User(**user_doc)
def require_role(required_role: UserRole):
@ -76,7 +73,7 @@ def require_roles(*required_roles: UserRole):
async def get_current_user_optional(
request: Request,
db: AsyncIOMotorDatabase = Depends(get_database),
) -> User | None:
) -> Optional[User]:
authorization: str = request.headers.get("Authorization")
if not authorization:
return None
@ -107,7 +104,7 @@ async def get_current_user_optional(
async def get_accessible_project_ids(
user: User,
db: AsyncIOMotorDatabase,
) -> list[str] | None:
) -> Optional[list[str]]:
"""
Returns project IDs the user may access, or None meaning "see everything".
@ -123,12 +120,9 @@ async def get_accessible_project_ids(
user_id = str(user.id)
# Primary path: use Redis-cached memberships (60s TTL, same cache as authz.py)
from .authz import (
_cached_memberships, # local import to avoid circular dep at module level
)
memberships_map = await _cached_memberships(user_id, db)
org_ids = list(memberships_map.keys())
# Primary path: use memberships collection (Phase 3 SaaS)
membership_cursor = db.memberships.find({"user_id": user_id}, {"organization_id": 1})
org_ids = [doc["organization_id"] async for doc in membership_cursor]
if org_ids:
projects = await db.projects.find(
@ -170,67 +164,6 @@ async def get_accessible_project_ids(
return []
async def get_user_org_ids(user: User, db: AsyncIOMotorDatabase) -> list[str] | None:
    """Resolve the organization IDs a user belongs to.

    Returns None for ADMIN users, meaning unrestricted. Otherwise the first
    source that applies is used, in this order:
      1. documents in the memberships collection,
      2. user.pm_client_ids, for project managers (legacy),
      3. client_id of teams listing the user in member_user_ids (legacy).
    An empty list means the user belongs to no organization.
    """
    if user.role == UserRole.ADMIN:
        return None

    user_id = str(user.id)

    # Primary source: membership documents that carry an organization_id.
    membership_orgs = [
        str(doc["organization_id"])
        async for doc in db.memberships.find({"user_id": user_id}, {"organization_id": 1})
        if doc.get("organization_id")
    ]
    if membership_orgs:
        return membership_orgs

    # Legacy project-manager path.
    if user.role == UserRole.PROJECT_MANAGER:
        return list(user.pm_client_ids or [])

    # Legacy staff path: teams that list this user as a member.
    member_teams = await db.teams.find({"member_user_ids": user_id}, {"client_id": 1}).to_list(None)
    return [str(team["client_id"]) for team in member_teams if team.get("client_id")]
async def assert_job_in_user_org(job: dict, user: User, db: AsyncIOMotorDatabase) -> None:
    """Ensure the user may access this job; otherwise raise 404.

    404 is used instead of 403 so that the existence of jobs in other
    organizations is not disclosed. ADMIN users, and users for whom
    get_user_org_ids returns None, are always allowed.

    The job's organization is determined from the first field present:
    organization_id, then the client_id of its project, then (legacy) the
    job's own client_id compared with the user's id.

    Raises:
        HTTPException: 404 when access is not permitted.
    """
    if user.role == UserRole.ADMIN:
        return

    org_ids = await get_user_org_ids(user, db)
    if org_ids is None:
        return  # unrestricted

    # NOTE(review): org_ids holds strings; this assumes the job/project ids
    # compared below are stored as strings too — confirm.
    if job_org := job.get("organization_id"):
        allowed = job_org in org_ids
    elif project_id := job.get("project_id"):
        # No organization_id on the job: fall back to its project.
        project = await db.projects.find_one({"_id": project_id}, {"client_id": 1})
        allowed = bool(project) and project.get("client_id") in org_ids
    else:
        # Legacy: client_id holds the creator's user id.
        creator_id = job.get("client_id")
        allowed = bool(creator_id) and creator_id == str(user.id)

    if not allowed:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
def require_pm_for_client(client_id_param: str = "client_id"):
"""Dependency: ensures the current user is an Admin or PM for the given client."""
async def checker(

View file

@ -1,6 +1,10 @@
"""Enhanced configuration system with Secret Manager integration."""
import os
import asyncio
from typing import Dict, Optional, Any
from functools import lru_cache
from pydantic_settings import BaseSettings
from .config import Settings as BaseConfig
from .logging import get_logger
@ -10,40 +14,41 @@ logger = get_logger(__name__)
class SecretsConfig(BaseConfig):
"""Enhanced configuration that loads secrets from GCP Secret Manager."""
def __init__(self, **kwargs):
# Initialize with base configuration first
super().__init__(**kwargs)
# Flag to track if secrets have been loaded
self._secrets_loaded = False
self._secret_values: dict[str, str] = {}
self._secret_values: Dict[str, str] = {}
async def load_secrets(self) -> None:
"""Load secrets from Secret Manager asynchronously."""
if self._secrets_loaded:
return
try:
# Only import here to avoid circular imports
from app.services.secrets_manager import secrets_manager
# Define which config fields should be loaded from secrets
secret_mappings = {
# Config field -> Secret Manager name
"jwt_secret": "jwt-secret",
"jwt_refresh_secret": "jwt-refresh-secret",
"jwt_refresh_secret": "jwt-refresh-secret",
"mongodb_uri": "mongodb-url",
"redis_url": "redis-url",
"gemini_api_key": "gemini-api-key",
"sendgrid_api_key": "sendgrid-api-key",
"elevenlabs_api_key": "elevenlabs-api-key",
"sentry_dsn": "sentry-dsn"
}
# Get all secrets in batch
secret_names = list(secret_mappings.values())
retrieved_secrets = await secrets_manager.get_secrets_batch(secret_names)
# Map secrets back to config fields
for config_field, secret_name in secret_mappings.items():
if secret_name in retrieved_secrets:
@ -53,50 +58,50 @@ class SecretsConfig(BaseConfig):
logger.debug(f"Loaded secret for {config_field}")
else:
logger.warning(f"Secret {secret_name} not available, using environment/default")
self._secrets_loaded = True
logger.info(f"Successfully loaded {len(retrieved_secrets)} secrets from Secret Manager")
except Exception as e:
logger.warning(f"Failed to load secrets from Secret Manager: {e}")
logger.warning("Falling back to environment variables")
self._secrets_loaded = True # Mark as loaded to prevent retries
def get_secret_value(self, field_name: str) -> str | None:
def get_secret_value(self, field_name: str) -> Optional[str]:
"""Get a secret value if it was loaded from Secret Manager."""
return self._secret_values.get(field_name)
async def refresh_secrets(self) -> None:
"""Force refresh secrets from Secret Manager."""
self._secrets_loaded = False
self._secret_values.clear()
# Clear the secrets manager cache
from app.services.secrets_manager import secrets_manager
secrets_manager.clear_cache()
await self.load_secrets()
@property
def is_production(self) -> bool:
"""Check if running in production environment."""
return self.app_env == "prod"
@property
def is_development(self) -> bool:
"""Check if running in development environment."""
return self.app_env == "dev"
@property
def google_cloud_project(self) -> str:
"""Get Google Cloud Project ID."""
return self.gcp_project_id
@property
def jwt_refresh_secret(self) -> str:
"""Get JWT refresh secret (fallback to main secret if not set)."""
return getattr(self, '_jwt_refresh_secret', self.jwt_secret)
@jwt_refresh_secret.setter
def jwt_refresh_secret(self, value: str) -> None:
"""Set JWT refresh secret."""
@ -104,37 +109,37 @@ class SecretsConfig(BaseConfig):
# Global configuration instance
_config_instance: SecretsConfig | None = None
_config_instance: Optional[SecretsConfig] = None
async def initialize_config() -> SecretsConfig:
    """Initialize the global configuration and load secrets into it.

    The instance may already exist when this runs, because get_settings()
    creates it without secrets (and is called at module import for the
    backwards-compatible `settings` name). Secrets are therefore loaded
    whether or not the instance was created here; load_secrets() returns
    immediately once secrets have been loaded, so repeated calls are cheap.

    Returns:
        The shared SecretsConfig instance.
    """
    global _config_instance

    if _config_instance is None:
        _config_instance = SecretsConfig()

    # Not nested under the creation branch: an instance created earlier by
    # get_settings() still needs its secrets loaded.
    await _config_instance.load_secrets()

    return _config_instance
def get_settings() -> SecretsConfig:
    """Return the shared settings instance without awaiting anything.

    If no instance exists yet, one is created without loading secrets and a
    warning is logged.
    """
    global _config_instance

    if _config_instance is not None:
        return _config_instance

    # Created without secrets, for callers that run before async initialization.
    _config_instance = SecretsConfig()
    logger.warning("Settings accessed before async initialization - secrets not loaded")
    return _config_instance
@lru_cache
@lru_cache()
def get_settings_cached() -> SecretsConfig:
"""Get cached settings instance."""
return get_settings()
# Backwards compatibility
settings = get_settings()
settings = get_settings()

View file

@ -1,5 +1,5 @@
from datetime import datetime, timedelta
from typing import Any
from typing import Any, Optional, Union
from fastapi import HTTPException, status
from jose import JWTError, jwt
@ -11,24 +11,20 @@ pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
def create_access_token(
subject: str | Any,
expires_delta: timedelta | None = None,
org_ids: list[str] | None = None,
subject: Union[str, Any], expires_delta: Optional[timedelta] = None
) -> str:
if expires_delta:
expire = datetime.utcnow() + expires_delta
else:
expire = datetime.utcnow() + timedelta(minutes=settings.jwt_access_ttl_min)
to_encode: dict[str, Any] = {"exp": expire, "sub": str(subject), "v": 2}
if org_ids:
to_encode["org_ids"] = org_ids
to_encode = {"exp": expire, "sub": str(subject)}
encoded_jwt = jwt.encode(to_encode, settings.jwt_secret, algorithm=settings.jwt_alg)
return encoded_jwt
def create_refresh_token(
subject: str | Any, expires_delta: timedelta | None = None
subject: Union[str, Any], expires_delta: Optional[timedelta] = None
) -> str:
if expires_delta:
expire = datetime.utcnow() + expires_delta
@ -41,8 +37,6 @@ def create_refresh_token(
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Check a plaintext password against a stored hash.

    A missing or empty hash never verifies; otherwise the result of
    pwd_context.verify is returned.
    """
    if hashed_password:
        return pwd_context.verify(plain_password, hashed_password)
    return False
@ -58,4 +52,4 @@ def decode_token(token: str) -> dict[str, Any]:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Could not validate credentials",
) from None
)

View file

@ -8,7 +8,6 @@ class VTTCue:
end_time: float # seconds
text: str
identifier: str | None = None
settings: str = ""
class VTTParser:
@ -38,11 +37,10 @@ class VTTParser:
# Parse timing line
if " --> " in line:
timing_match = re.match(r'([\d:.,]+)\s+-->\s+([\d:.,]+)\s*(.*)', line)
timing_match = re.match(r'([\d:.,]+)\s+-->\s+([\d:.,]+)', line)
if timing_match:
start_time = VTTParser._parse_timestamp(timing_match.group(1))
end_time = VTTParser._parse_timestamp(timing_match.group(2))
settings = timing_match.group(3).strip()
# Collect text lines until empty line or next cue
i += 1
@ -51,13 +49,13 @@ class VTTParser:
text_lines.append(lines[i].strip())
i += 1
cues.append(VTTCue(
start_time=start_time,
end_time=end_time,
text="\n".join(text_lines),
identifier=identifier,
settings=settings,
))
if text_lines:
cues.append(VTTCue(
start_time=start_time,
end_time=end_time,
text="\n".join(text_lines),
identifier=identifier
))
else:
i += 1
@ -73,19 +71,16 @@ class VTTParser:
if cue.identifier:
lines.append(cue.identifier)
# Add timing line (preserve cue settings like line:0%)
# Add timing line
start_timestamp = VTTParser._format_timestamp(cue.start_time)
end_timestamp = VTTParser._format_timestamp(cue.end_time)
timing_line = f"{start_timestamp} --> {end_timestamp}"
if cue.settings:
timing_line += f" {cue.settings}"
lines.append(timing_line)
lines.append(f"{start_timestamp} --> {end_timestamp}")
# Add text (can be multi-line)
lines.append(cue.text)
lines.append("") # Empty line between cues
return "\n".join(lines) + "\n"
return "\n".join(lines)
@staticmethod
def _parse_timestamp(timestamp: str) -> float:
@ -126,7 +121,7 @@ class VTTParser:
secs = seconds % 60
whole_secs = int(secs)
milliseconds = round((secs - whole_secs) * 1000)
milliseconds = int((secs - whole_secs) * 1000)
return f"{hours:02d}:{minutes:02d}:{whole_secs:02d}.{milliseconds:03d}"
@ -153,22 +148,6 @@ class VTTEditor:
return VTTParser.build(cues)
@staticmethod
def assert_cue_alignment(en_vtt: str, target_vtt: str, lang: str) -> None:
    """Verify that a target-language VTT is cue-aligned with the EN master.

    Args:
        en_vtt: VTT content of the English master.
        target_vtt: VTT content to check.
        lang: Language code of the target; used only in error messages.

    Raises:
        ValueError: if the cue counts differ, or if any cue's start or end
            time differs from the master by more than 1 ms.
    """
    en_cues = VTTParser.parse(en_vtt)
    tgt_cues = VTTParser.parse(target_vtt)
    if len(tgt_cues) != len(en_cues):
        raise ValueError(
            f"Cue count mismatch for {lang}: EN has {len(en_cues)}, target has {len(tgt_cues)}"
        )
    for i, (en, tgt) in enumerate(zip(en_cues, tgt_cues, strict=True)):
        # Timestamps are floats: compare with the same 1 ms tolerance the
        # translation-timing validator in this class uses, not with ==.
        start_differs = abs(en.start_time - tgt.start_time) > 0.001
        end_differs = abs(en.end_time - tgt.end_time) > 0.001
        if start_differs or end_differs:
            raise ValueError(
                f"Timestamp mismatch for {lang} cue {i}: "
                f"EN {en.start_time}-->{en.end_time}, target {tgt.start_time}-->{tgt.end_time}"
            )
@staticmethod
def update_cue_text(vtt_content: str, cue_index: int, new_text: str) -> str:
"""Update text for a specific cue by index"""
@ -207,20 +186,6 @@ class VTTEditor:
return len(errors) == 0, errors
@staticmethod
def fix_overlapping_cues(vtt_content: str) -> str:
    """Shorten cues that run into the cue that follows them.

    For each consecutive pair, if the later cue starts before the earlier
    one ends, the earlier cue's end_time is moved to 1 ms before the later
    cue's start. If that would not leave the end after the earlier cue's
    own start, the end is set to 1 ms after that start instead.

    Returns:
        The rebuilt VTT content.
    """
    cues = VTTParser.parse(vtt_content)
    for earlier, later in zip(cues, cues[1:], strict=False):
        if later.start_time >= earlier.end_time:
            continue
        # Clamp the earlier cue to end 1 ms before the later one begins.
        clamped_end = later.start_time - 0.001
        # Never let end_time go at or below start_time.
        if clamped_end <= earlier.start_time:
            clamped_end = earlier.start_time + 0.001
        earlier.end_time = clamped_end
    return VTTParser.build(cues)
@staticmethod
def get_cue_count(vtt_content: str) -> int:
"""Get the number of cues in VTT content"""
@ -256,7 +221,7 @@ class VTTEditor:
)
return False, errors
for i, (src, tgt) in enumerate(zip(source_cues, translated_cues, strict=False)):
for i, (src, tgt) in enumerate(zip(source_cues, translated_cues)):
if abs(src.start_time - tgt.start_time) > 0.001:
errors.append(
f"Cue {i + 1}: start time changed "
@ -286,33 +251,3 @@ class VTTEditor:
return VTTParser.build(cues)
# DCMP §6.01 filler patterns per language (whole-word, case-insensitive).
# Keyed by the lowercase primary language subtag; each value is a regex source
# string that the caller compiles with re.IGNORECASE.
# Within an alternation, a phrase must come before any shorter phrase that is
# its prefix, otherwise the shorter one matches first and the longer is dead.
_FILLER_PATTERNS: dict[str, str] = {
    # "right?" ends in a non-word character, so it cannot sit inside the
    # \b(...)\b group: the closing \b would require a word character right
    # after the "?". It is matched by its own branch instead.
    "en": r'\b(um+|uh+|ah+|er+|hmm+|you know|i mean|sort of|kind of|basically|literally|honestly|actually|so yeah)\b|\bright\?',
    # "o sea que" is listed before "o sea" so the longer phrase can match.
    "es": r'\b(eh+|este|o sea que|o sea|pues|bueno|mmm+)\b',
    "fr": r'\b(euh+|beh|ben|donc|quoi|enfin|voilà|genre)\b',
    "de": r'\b(äh+|ähm+|halt|ne|also|naja|sozusagen|quasi)\b',
    "it": r'\b(ehm+|allora|cioè|tipo|praticamente|insomma|ecco)\b',
    "nl": r'\b(eh+|nou|zeg|eigenlijk|gewoon|toch|zo van|hè)\b',
    "pt": r'\b(ahn+|hã+|né|sabe|tipo|então|assim)\b',
    "pl": r'\b(no|że|bo|znaczy|właśnie|jakby|wiesz)\b',
    "uk": r'\b(ну+|ем+|типу|знаєш|значить|власне|от)\b',
    "ru": r'\b(ну+|эм+|типа|знаешь|значит|вот|собственно)\b',
}
@staticmethod
def clean_disfluencies(vtt_content: str, lang: str) -> str:
    """Strip filler words and hesitations (DCMP §6.01) from every cue.

    The primary subtag of ``lang`` (the part before the first ``-``,
    lower-cased) selects a pattern from ``VTTEditor._FILLER_PATTERNS``.
    If no pattern exists for it, the content is returned untouched.

    Args:
        vtt_content: VTT document text.
        lang: Language tag such as ``"en"`` or ``"pt-BR"``.

    Returns:
        The rebuilt VTT text. A cue whose text would become empty after
        cleaning keeps its original text.
    """
    primary_subtag = lang.split("-")[0].lower()
    filler_source = VTTEditor._FILLER_PATTERNS.get(primary_subtag)
    if not filler_source:
        return vtt_content

    parsed = VTTParser.parse(vtt_content)
    filler_regex = re.compile(filler_source, re.IGNORECASE)
    for cue in parsed:
        remainder = filler_regex.sub("", cue.text)
        # Removing a filler leaves doubled spaces behind; squeeze them.
        remainder = re.sub(r'[ \t]{2,}', ' ', remainder)
        # Drop whitespace and a dangling comma at either end.
        remainder = remainder.strip().strip(',').strip()
        if remainder:
            cue.text = remainder
    return VTTParser.build(parsed)

View file

@ -11,9 +11,7 @@ from sentry_sdk.integrations.pymongo import PyMongoIntegration
from sentry_sdk.integrations.redis import RedisIntegration
from .api.v1.routes_admin import router as admin_router
from .api.v1.routes_admin_production import router as admin_production_router
from .api.v1.routes_auth import router as auth_router
from .api.v1.routes_briefs import router as briefs_router
from .api.v1.routes_clients import router as clients_router
from .api.v1.routes_files import router as files_router
from .api.v1.routes_glossaries import router as glossaries_router
@ -23,7 +21,6 @@ from .api.v1.routes_jobs import router as jobs_router
from .api.v1.routes_language_qc import router as language_qc_router
from .api.v1.routes_organizations import router as organizations_router
from .api.v1.routes_review_notes import router as review_notes_router
from .api.v1.routes_share import router as share_router
from .api.v1.routes_tts import router as tts_router
from .api.v1.routes_vtt_versions import router as vtt_versions_router
from .api.v1.routes_websockets import router as websockets_router
@ -94,17 +91,12 @@ async def lifespan(app: FastAPI):
print(f"⚠️ Could not seed default admin: {e}")
# await create_indexes() # Temporarily disabled for debugging
# T-16: Seed language_qc only for jobs that still lack it (idempotent, skips on subsequent starts)
# Seed language_qc for existing jobs that don't have it yet
try:
db = await get_database()
pending_count = await db.jobs.count_documents({"language_qc": {"$exists": False}})
if pending_count > 0:
async for job_doc in db.jobs.find(
{"language_qc": {"$exists": False}},
{"_id": 1, "status": 1, "outputs": 1, "source": 1, "review": 1, "updated_at": 1, "requested_outputs": 1},
):
await seed_language_qc_for_job(db, job_doc)
print(f"✅ language_qc migration complete ({pending_count} jobs seeded)")
async for job_doc in db.jobs.find({"language_qc": {"$exists": False}}, {"_id": 1, "status": 1, "outputs": 1, "source": 1, "review": 1, "updated_at": 1, "requested_outputs": 1}):
await seed_language_qc_for_job(db, job_doc)
print("✅ language_qc migration complete")
except Exception as e:
print(f"⚠️ language_qc migration failed: {e}")
@ -120,9 +112,6 @@ async def lifespan(app: FastAPI):
# Store middleware in app state for access
app.state.rate_limit_middleware = rate_limit_middleware
app.state.validation_middleware = validation_middleware
elif settings.redis_url:
# T-13: REDIS_URL is configured but client unavailable — rate limiting is disabled
print(f"⚠️ Redis configured at {settings.redis_url!r} but connection failed — rate limiting disabled")
yield
# Shutdown
@ -156,7 +145,6 @@ async def cors_error_handler(request, call_next):
response = await call_next(request)
except Exception as e:
import traceback
from .core.logging import get_logger as _get_logger
_get_logger(__name__).exception("🚨 CORS middleware caught: %s\n%s", e, traceback.format_exc())
@ -277,9 +265,6 @@ app.include_router(language_qc_router, prefix="/api/v1")
app.include_router(glossaries_router, prefix="/api/v1")
app.include_router(tts_router, prefix="/api/v1")
app.include_router(admin_router, prefix="/api/v1")
app.include_router(admin_production_router, prefix="/api/v1")
app.include_router(briefs_router, prefix="/api/v1")
app.include_router(share_router, prefix="/api/v1")
app.include_router(websockets_router, prefix="/api/v1")

View file

@ -1,16 +1,12 @@
"""Middleware package for FastAPI application."""
from .rate_limiting import (
IPWhitelist,
RateLimitMiddleware,
create_rate_limit_middleware,
)
from .rate_limiting import RateLimitMiddleware, IPWhitelist, create_rate_limit_middleware
from .validation import ValidationMiddleware, create_validation_middleware
__all__ = [
"RateLimitMiddleware",
"IPWhitelist",
"IPWhitelist",
"create_rate_limit_middleware",
"ValidationMiddleware",
"create_validation_middleware"
]
]

View file

@ -1,10 +1,14 @@
"""Rate limiting middleware for API endpoints."""
import time
from collections import defaultdict
from typing import Dict, Optional, Tuple
import redis.asyncio as aioredis
from fastapi import Request, status
from fastapi import HTTPException, Request, status
from fastapi.responses import JSONResponse
import json
import asyncio
from datetime import datetime, timedelta
from app.core.config import get_settings
from app.telemetry.metrics import track_rate_limit_metrics
@ -12,50 +16,50 @@ from app.telemetry.metrics import track_rate_limit_metrics
class RateLimiter:
"""Redis-based rate limiter with sliding window algorithm."""
def __init__(self, redis_client: aioredis.Redis):
    """Keep a reference to the async Redis client used for all limiter state."""
    self.redis = redis_client
async def is_allowed(
self,
key: str,
limit: int,
self,
key: str,
limit: int,
window_seconds: int,
identifier: str = ""
) -> tuple[bool, dict[str, int]]:
) -> Tuple[bool, Dict[str, int]]:
"""
Check if request is allowed under rate limit.
Returns:
Tuple of (is_allowed, rate_limit_info)
"""
now = time.time()
pipeline = self.redis.pipeline()
# Remove expired entries
pipeline.zremrangebyscore(key, 0, now - window_seconds)
# Count current requests in window
pipeline.zcard(key)
# Add current request
pipeline.zadd(key, {str(now): now})
# Set expiry
pipeline.expire(key, window_seconds)
results = await pipeline.execute()
current_requests = results[1]
rate_limit_info = {
"limit": limit,
"remaining": max(0, limit - current_requests),
"reset_time": int(now + window_seconds),
"retry_after": window_seconds if current_requests >= limit else 0
}
is_allowed = current_requests <= limit
# Track metrics
track_rate_limit_metrics(
identifier=identifier,
@ -63,17 +67,17 @@ class RateLimiter:
current_requests=current_requests,
limit=limit
)
return is_allowed, rate_limit_info
class RateLimitMiddleware:
"""FastAPI middleware for rate limiting."""
def __init__(self, redis_client: aioredis.Redis):
self.limiter = RateLimiter(redis_client)
self.settings = get_settings()
# Rate limit configurations by endpoint pattern
self.rate_limits = {
# Authentication endpoints
@ -81,32 +85,32 @@ class RateLimitMiddleware:
"POST:/api/v1/auth/register": (3, 3600), # 3 requests per hour
"POST:/api/v1/auth/refresh": (10, 300), # 10 requests per 5 minutes
"POST:/api/v1/auth/forgot-password": (3, 3600), # 3 requests per hour
# File upload endpoints
"POST:/api/v1/files/upload": (10, 3600), # 10 uploads per hour
"POST:/api/v1/jobs": (20, 3600), # 20 job creations per hour
# Job management endpoints
"GET:/api/v1/jobs": (100, 300), # 100 requests per 5 minutes
"PATCH:/api/v1/jobs/*/approve": (50, 3600), # 50 approvals per hour
"PATCH:/api/v1/jobs/*/reject": (50, 3600), # 50 rejections per hour
# VTT editing endpoints
"PATCH:/api/v1/jobs/*/vtt": (100, 3600), # 100 VTT edits per hour
# Admin endpoints (more restrictive)
"GET:/api/v1/admin/*": (50, 300), # 50 requests per 5 minutes
"POST:/api/v1/admin/*": (20, 3600), # 20 admin actions per hour
"PATCH:/api/v1/admin/*": (20, 3600), # 20 admin updates per hour
"DELETE:/api/v1/admin/*": (10, 3600), # 10 admin deletions per hour
}
# Default rate limits
self.default_limits = {
"authenticated": (1000, 3600), # 1000 requests per hour for authenticated users
"anonymous": (100, 3600), # 100 requests per hour for anonymous users
}
def _get_client_identifier(self, request: Request) -> str:
"""Get client identifier for rate limiting."""
user = getattr(request.state, 'user', None)
@ -124,53 +128,53 @@ class RateLimitMiddleware:
client_ip = request.client.host if request.client else "unknown"
return f"ip:{client_ip}"
def _get_endpoint_key(self, request: Request) -> str:
    """Return the ``METHOD:path`` key used to look up a rate-limit rule.

    Identifier segments are replaced with ``*`` so that every job and
    every admin user maps onto the same key:

    * ``/jobs/<id>/`` becomes ``/jobs/*/`` (only when a trailing slash
      follows the identifier)
    * ``/admin/users/<id>`` becomes ``/admin/users/*``

    An identifier here is any run of lowercase hex digits and hyphens.
    """
    import re

    wildcard_rules = (
        (r'/jobs/[a-f0-9-]+/', '/jobs/*/'),
        (r'/admin/users/[a-f0-9-]+', '/admin/users/*'),
    )

    http_method = request.method
    normalized_path = request.url.path
    for id_pattern, replacement in wildcard_rules:
        normalized_path = re.sub(id_pattern, replacement, normalized_path)

    return f"{http_method}:{normalized_path}"
def _get_rate_limit(self, request: Request) -> tuple[int, int]:
def _get_rate_limit(self, request: Request) -> Tuple[int, int]:
"""Get rate limit for the current request."""
endpoint_key = self._get_endpoint_key(request)
# Check for specific endpoint limits
if endpoint_key in self.rate_limits:
return self.rate_limits[endpoint_key]
# Check for wildcard matches
for pattern, limits in self.rate_limits.items():
if pattern.endswith("*") and endpoint_key.startswith(pattern[:-1]):
return limits
# Use default limits based on authentication
user = getattr(request.state, 'user', None)
if user:
return self.default_limits["authenticated"]
else:
return self.default_limits["anonymous"]
async def __call__(self, request: Request, call_next):
"""Process rate limiting for the request."""
# Skip rate limiting for health checks and metrics only
if request.url.path in ["/health", "/metrics"]:
return await call_next(request)
client_id = self._get_client_identifier(request)
endpoint_key = self._get_endpoint_key(request)
limit, window = self._get_rate_limit(request)
# Create rate limit key
rate_limit_key = f"rate_limit:{client_id}:{endpoint_key}"
try:
is_allowed, rate_info = await self.limiter.is_allowed(
key=rate_limit_key,
@ -178,7 +182,7 @@ class RateLimitMiddleware:
window_seconds=window,
identifier=client_id
)
if not is_allowed:
# Return rate limit exceeded response
return JSONResponse(
@ -195,17 +199,17 @@ class RateLimitMiddleware:
"Retry-After": str(rate_info["retry_after"])
}
)
# Process the request
response = await call_next(request)
# Add rate limit headers to response
response.headers["X-RateLimit-Limit"] = str(rate_info["limit"])
response.headers["X-RateLimit-Remaining"] = str(rate_info["remaining"])
response.headers["X-RateLimit-Reset"] = str(rate_info["reset_time"])
return response
except Exception as e:
# Log error but don't block request if rate limiting fails
print(f"Rate limiting error: {e}")
@ -214,30 +218,30 @@ class RateLimitMiddleware:
class IPWhitelist:
"""IP whitelist for bypassing rate limits."""
def __init__(self, redis_client: aioredis.Redis):
    """Set up the whitelist on top of ``redis_client``.

    Dynamic entries are stored in the Redis set named by
    ``whitelist_key``. ``default_whitelist`` holds addresses that are
    always accepted without consulting Redis.
    """
    self.redis = redis_client
    self.whitelist_key = "ip_whitelist"

    # Default whitelisted IPs (health checks, monitoring)
    loopback_v4 = "127.0.0.1"
    loopback_v6 = "::1"
    gcp_metadata_server = "169.254.169.254"
    self.default_whitelist = {loopback_v4, loopback_v6, gcp_metadata_server}
async def is_whitelisted(self, ip: str) -> bool:
    """Report whether ``ip`` may bypass rate limiting.

    Built-in default addresses are accepted without touching Redis.
    Anything else is looked up in the Redis whitelist set. If that lookup
    raises, the address is treated as not whitelisted.
    """
    if ip in self.default_whitelist:
        return True

    try:
        return bool(await self.redis.sismember(self.whitelist_key, ip))
    except Exception:
        # Fail closed: a Redis problem must not grant a bypass.
        return False
async def add_ip(self, ip: str, ttl_seconds: int | None = None) -> bool:
async def add_ip(self, ip: str, ttl_seconds: Optional[int] = None) -> bool:
"""Add IP to whitelist."""
try:
await self.redis.sadd(self.whitelist_key, ip)
@ -248,7 +252,7 @@ class IPWhitelist:
return True
except Exception:
return False
async def remove_ip(self, ip: str) -> bool:
"""Remove IP from whitelist."""
try:
@ -260,4 +264,4 @@ class IPWhitelist:
async def create_rate_limit_middleware(redis_client: aioredis.Redis) -> RateLimitMiddleware:
"""Factory function to create rate limit middleware."""
return RateLimitMiddleware(redis_client)
return RateLimitMiddleware(redis_client)

View file

@ -3,17 +3,15 @@
import json
import re
import time
from typing import Any
from typing import Any, Dict, List, Optional, Set
from fastapi import HTTPException, Request, status
from fastapi.responses import JSONResponse
from pydantic import BaseModel, ValidationError as PydanticValidationError
import magic
from urllib.parse import unquote
import magic
from fastapi import Request, status
from fastapi.responses import JSONResponse
from app.telemetry.metrics import track_validation_metrics
from ..core.config import settings
class ValidationError(Exception):
"""Custom validation error."""
@ -27,93 +25,92 @@ class SecurityValidationError(Exception):
class RequestValidator:
"""Enhanced request validation with security checks."""
def __init__(self):
# File type restrictions
self.allowed_video_types = {
"video/mp4",
"video/quicktime",
"video/quicktime",
"video/x-msvideo" # AVI
}
self.allowed_subtitle_types = {
"text/vtt",
"text/plain"
}
# Security patterns to block
self.malicious_patterns = [
# SQL injection patterns
r"\b(union|select|insert|update|delete|drop|create|alter)\b\s+",
r"vbscript:", # vbscript protocol injection
r"\b(onload|onerror|onclick)\s*=", # HTML event handler attribute injection
r"(union|select|insert|update|delete|drop|create|alter)\s+",
r"(script|javascript|vbscript|onload|onerror|onclick)",
r"<\s*script[^>]*>",
r"javascript:",
r"data:.*base64",
# Path traversal
r"\.\./",
r"\.\.\\",
r"%2e%2e%2f",
r"%2e%2e\\",
# Command injection (removed $ and ; — semicolons are common in natural language)
r"[&|`](?!\s*$)",
r"\b(rm|wget|curl|nc|bash|sh|cmd|powershell)\b\s+",
# Command injection (removed $ to allow MongoDB operators in controlled contexts)
r"[;&|`](?!\s*$)", # Allow $ but not as command separator
r"(rm|wget|curl|nc|bash|sh|cmd|powershell)\s+",
# MongoDB injection — NoSQL operator abuse
r"\$where|\$expr|\$function|\$accumulator"
r"|\$ne|\$nin|\$not"
r"|\$gt|\$gte|\$lt|\$lte"
r"|\$regex|\$jsonSchema|\$mod",
]
self.compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in self.malicious_patterns]
# Max file sizes (in bytes) — driven by central config (T-14)
self.max_video_size = settings.upload_max_video_bytes
# Max file sizes (in bytes)
self.max_video_size = 2 * 1024 * 1024 * 1024 # 2GB
self.max_subtitle_size = 10 * 1024 * 1024 # 10MB
# Request size limits
self.max_json_size = 1024 * 1024 # 1MB
self.max_form_fields = 50
def validate_string_content(self, content: str, field_name: str = "input") -> None:
    """Reject ``content`` if any compiled malicious pattern is found in it.

    Non-string values are ignored.

    Args:
        content: Value to scan.
        field_name: Label used in the error message.

    Raises:
        SecurityValidationError: If a pattern in ``self.compiled_patterns``
            matches anywhere in ``content``.
    """
    is_text = isinstance(content, str)
    if is_text and any(regex.search(content) for regex in self.compiled_patterns):
        raise SecurityValidationError(
            f"Potentially malicious content detected in {field_name}"
        )
def validate_filename(self, filename: str) -> str:
    """Validate and sanitize filename.

    Steps, in order:

    1. URL-decode the name.
    2. Scan the decoded name for malicious patterns.
    3. Replace every character other than word characters, ``-``, ``_``
       and ``.`` with ``_``.
    4. Rewrite a leading ``.`` to ``file_`` so the result is never a
       hidden file.
    5. Clamp the result to at most 255 characters, keeping the extension
       whenever it fits.

    Args:
        filename: Client-supplied file name, possibly URL-encoded.

    Returns:
        The sanitized name, never longer than 255 characters.

    Raises:
        ValidationError: If ``filename`` is empty.
        SecurityValidationError: Propagated from
            ``validate_string_content`` when the decoded name matches a
            malicious pattern.
    """
    if not filename:
        raise ValidationError("Filename cannot be empty")

    # Decode first so encoded traversal sequences are visible to the scan.
    filename = unquote(filename)

    # Check for malicious patterns
    self.validate_string_content(filename, "filename")

    # Remove dangerous characters
    safe_filename = re.sub(r'[^\w\-_\.]', '_', filename)

    # Prevent hidden files
    if safe_filename.startswith('.'):
        safe_filename = 'file_' + safe_filename[1:]

    # Limit length
    max_length = 255
    if len(safe_filename) > max_length:
        name, ext = safe_filename.rsplit('.', 1) if '.' in safe_filename else (safe_filename, '')
        suffix = '.' + ext if ext else ''
        # Keep the historical 250-character stem for short extensions, but
        # shrink it when the extension is long so the total stays within
        # the limit.
        stem_budget = min(250, max_length - len(suffix))
        if stem_budget > 0:
            safe_filename = name[:stem_budget] + suffix
        else:
            # The "extension" alone exceeds the limit; hard-truncate.
            safe_filename = safe_filename[:max_length]

    return safe_filename
def validate_file_type(self, content: bytes, expected_type: str, filename: str) -> None:
"""Validate file type using magic numbers."""
try:
@ -123,13 +120,13 @@ class RequestValidator:
ext = filename.lower().split('.')[-1] if '.' in filename else ''
video_extensions = {'mp4', 'mov', 'avi', 'mkv'}
subtitle_extensions = {'vtt', 'srt', 'txt'}
if expected_type == "video" and ext not in video_extensions:
raise ValidationError(f"Invalid video file extension: {ext}") from None
raise ValidationError(f"Invalid video file extension: {ext}")
elif expected_type == "subtitle" and ext not in subtitle_extensions:
raise ValidationError(f"Invalid subtitle file extension: {ext}") from None
raise ValidationError(f"Invalid subtitle file extension: {ext}")
return
if expected_type == "video" and detected_type not in self.allowed_video_types:
raise ValidationError(
f"Invalid video file type: {detected_type}. "
@ -140,7 +137,7 @@ class RequestValidator:
f"Invalid subtitle file type: {detected_type}. "
f"Allowed types: {', '.join(self.allowed_subtitle_types)}"
)
def validate_file_size(self, size: int, file_type: str) -> None:
"""Validate file size limits."""
if file_type == "video" and size > self.max_video_size:
@ -153,16 +150,16 @@ class RequestValidator:
f"Subtitle file too large: {size} bytes. "
f"Maximum allowed: {self.max_subtitle_size} bytes"
)
async def validate_json_payload(self, request: Request) -> dict[str, Any] | None:
async def validate_json_payload(self, request: Request) -> Optional[Dict[str, Any]]:
"""Validate JSON request payload."""
if not request.headers.get("content-type", "").startswith("application/json"):
return None
content_length = request.headers.get("content-length")
if content_length and int(content_length) > self.max_json_size:
raise ValidationError(f"JSON payload too large: {content_length} bytes")
try:
# Check if body has already been read
if hasattr(request, '_cached_body'):
@ -171,67 +168,63 @@ class RequestValidator:
body = await request.body()
# Cache the body so FastAPI can read it later
request._cached_body = body
if len(body) > self.max_json_size:
raise ValidationError(f"JSON payload too large: {len(body)} bytes")
if not body:
return {}
payload = json.loads(body)
# Recursively validate all string values
self._validate_json_values(payload)
return payload
except json.JSONDecodeError as e:
raise ValidationError(f"Invalid JSON: {e}") from e
# Fields that contain free-form natural language — skip injection pattern checks
_FREETEXT_FIELDS = {"captions_vtt", "audio_description_vtt", "text", "notes", "change_note", "description"}
raise ValidationError(f"Invalid JSON: {e}")
def _validate_json_values(self, obj: Any, path: str = "root") -> None:
"""Recursively validate JSON values."""
if isinstance(obj, dict):
if len(obj) > self.max_form_fields:
raise ValidationError(f"Too many fields in object at {path}")
for key, value in obj.items():
self.validate_string_content(key, f"{path}.key")
# Skip pattern scanning for free-text fields (VTT content, notes, etc.)
if key not in self._FREETEXT_FIELDS:
self._validate_json_values(value, f"{path}.{key}")
if isinstance(key, str):
self.validate_string_content(key, f"{path}.{key}")
self._validate_json_values(value, f"{path}.{key}")
elif isinstance(obj, list):
if len(obj) > 1000: # Prevent large arrays
raise ValidationError(f"Array too large at {path}")
for i, item in enumerate(obj):
self._validate_json_values(item, f"{path}[{i}]")
elif isinstance(obj, str):
self.validate_string_content(obj, path)
def validate_query_params(self, request: Request) -> None:
    """Scan every query-string key and value for malicious patterns.

    Both the parameter name and its stringified value are passed to
    ``validate_string_content``, labelled ``query.<name>``. Any exception
    that check raises propagates to the caller.
    """
    scan = self.validate_string_content
    for param_name, param_value in request.query_params.items():
        label = f"query.{param_name}"
        scan(param_name, label)
        scan(str(param_value), label)
def validate_headers(self, request: Request) -> None:
"""Validate request headers."""
suspicious_headers = {
"x-forwarded-host",
"x-original-host",
"x-original-host",
"x-rewrite-url"
}
for header_name, header_value in request.headers.items():
# Check for suspicious headers
if header_name.lower() in suspicious_headers:
self.validate_string_content(header_value, f"header.{header_name}")
# Validate user-agent length
if header_name.lower() == "user-agent" and len(header_value) > 500:
raise SecurityValidationError("User-Agent header too long")
@ -239,34 +232,34 @@ class RequestValidator:
class ValidationMiddleware:
"""FastAPI middleware for enhanced request validation."""
def __init__(self):
    """Create the middleware with its own ``RequestValidator`` instance."""
    self.validator = RequestValidator()
async def __call__(self, request: Request, call_next):
"""Process validation for the request."""
start_time = time.time()
validation_errors = []
# Skip validation for timing adjustment endpoint temporarily
if "/vtt/adjust-timing" in request.url.path:
return await call_next(request)
try:
# Validate headers
self.validator.validate_headers(request)
# Validate query parameters
self.validator.validate_query_params(request)
# Validate JSON payload if present
if request.method in ["POST", "PUT", "PATCH"]:
await self.validator.validate_json_payload(request)
# Process the request
response = await call_next(request)
# Track successful validation
track_validation_metrics(
endpoint=request.url.path,
@ -275,10 +268,10 @@ class ValidationMiddleware:
validation_time=time.time() - start_time,
error_types=[]
)
return response
except SecurityValidationError:
except SecurityValidationError as e:
validation_errors.append("security")
track_validation_metrics(
endpoint=request.url.path,
@ -287,7 +280,7 @@ class ValidationMiddleware:
validation_time=time.time() - start_time,
error_types=validation_errors
)
return JSONResponse(
status_code=status.HTTP_400_BAD_REQUEST,
content={
@ -295,7 +288,7 @@ class ValidationMiddleware:
"error_code": "SECURITY_VALIDATION_ERROR"
}
)
except ValidationError as e:
validation_errors.append("format")
track_validation_metrics(
@ -305,7 +298,7 @@ class ValidationMiddleware:
validation_time=time.time() - start_time,
error_types=validation_errors
)
return JSONResponse(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
content={
@ -313,7 +306,7 @@ class ValidationMiddleware:
"error_code": "VALIDATION_ERROR"
}
)
except Exception as e:
validation_errors.append("unknown")
track_validation_metrics(
@ -323,7 +316,7 @@ class ValidationMiddleware:
validation_time=time.time() - start_time,
error_types=validation_errors
)
# Log unexpected error but continue processing
print(f"Validation middleware error: {e}")
return await call_next(request)
@ -331,4 +324,4 @@ class ValidationMiddleware:
async def create_validation_middleware() -> ValidationMiddleware:
"""Factory function to create validation middleware."""
return ValidationMiddleware()
return ValidationMiddleware()

View file

@ -1,5 +1,5 @@
"""Database migration framework for MongoDB."""
from .migrator import Migration, MigrationManager
from .migrator import MigrationManager, Migration
__all__ = ["MigrationManager", "Migration"]
__all__ = ["MigrationManager", "Migration"]

View file

@ -1,10 +1,11 @@
"""MongoDB migration framework."""
import os
import importlib.util
from abc import ABC, abstractmethod
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from motor.motor_asyncio import AsyncIOMotorDatabase
from app.core.database import get_database
@ -16,23 +17,22 @@ logger = get_logger(__name__)
class Migration(ABC):
"""Base class for database migrations."""
version: str = "0000-00-00-000000" # overridden by subclass as class variable
description: str = ""
def __init__(self):
self.db: AsyncIOMotorDatabase | None = None
self.version: str = "0000-00-00-000000" # Format: YYYY-MM-DD-HHMMSS
self.description: str = ""
self.db: Optional[AsyncIOMotorDatabase] = None
@abstractmethod
async def up(self) -> None:
"""Apply the migration."""
pass
@abstractmethod
async def down(self) -> None:
"""Rollback the migration."""
pass
async def set_database(self, db: AsyncIOMotorDatabase) -> None:
"""Set the database instance."""
self.db = db
@ -40,7 +40,7 @@ class Migration(ABC):
class MigrationRecord:
"""Represents a migration record in the database."""
def __init__(self, version: str, description: str, applied_at: datetime):
self.version = version
self.description = description
@ -49,163 +49,163 @@ class MigrationRecord:
class MigrationManager:
"""Manages database migrations."""
def __init__(self):
self.db: AsyncIOMotorDatabase | None = None
self.db: Optional[AsyncIOMotorDatabase] = None
self.migrations_dir = Path(__file__).parent / "scripts"
self.collection_name = "migration_history"
async def initialize(self) -> None:
    """Acquire the database handle and prepare the migration history collection.

    Stores the result of ``get_database()`` on ``self.db`` and then makes
    sure the history collection's indexes exist.
    """
    database = await get_database()
    self.db = database
    await self._ensure_migration_collection()
async def _ensure_migration_collection(self) -> None:
    """Create the indexes the migration history collection relies on.

    * a unique ascending index on ``version``
    * a descending index on ``applied_at``
    """
    history = self.db[self.collection_name]

    # One record per version; the unique index enforces it.
    await history.create_index([("version", 1)], unique=True)
    # Descending by application time.
    await history.create_index([("applied_at", -1)])

    logger.info("Migration history collection initialized")
def discover_migrations(self) -> list[str]:
def discover_migrations(self) -> List[str]:
"""Discover all migration files in the migrations directory."""
if not self.migrations_dir.exists():
logger.warning(f"Migrations directory not found: {self.migrations_dir}")
return []
migration_files = []
for file_path in self.migrations_dir.glob("*.py"):
if file_path.name.startswith("migration_") and not file_path.name.startswith("__"):
migration_files.append(file_path.stem)
# Sort by version (filename should start with version)
migration_files.sort()
return migration_files
async def load_migration(self, migration_name: str) -> Migration:
"""Dynamically load a migration class."""
migration_path = self.migrations_dir / f"{migration_name}.py"
if not migration_path.exists():
raise FileNotFoundError(f"Migration file not found: {migration_path}")
# Load the module
spec = importlib.util.spec_from_file_location(migration_name, migration_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
# Get the migration class (assume it's named Migration)
if not hasattr(module, 'Migration'):
raise AttributeError(f"Migration class not found in {migration_name}")
migration_class = module.Migration
migration_class = getattr(module, 'Migration')
migration = migration_class()
await migration.set_database(self.db)
return migration
async def get_applied_migrations(self) -> list[str]:
async def get_applied_migrations(self) -> List[str]:
"""Get list of applied migration versions."""
collection = self.db[self.collection_name]
cursor = collection.find({}, {"version": 1}).sort("version", 1)
applied = []
async for doc in cursor:
applied.append(doc["version"])
return applied
async def record_migration(self, migration: Migration) -> None:
    """Insert a history document marking ``migration`` as applied.

    The document stores the migration's ``version`` and ``description``
    together with the current UTC time under ``applied_at``.
    """
    history = self.db[self.collection_name]

    # NOTE: datetime.utcnow() yields a naive datetime (no tzinfo attached).
    applied_entry = dict(
        version=migration.version,
        description=migration.description,
        applied_at=datetime.utcnow(),
    )

    await history.insert_one(applied_entry)
    logger.info(f"Recorded migration: {migration.version} - {migration.description}")
async def remove_migration_record(self, version: str) -> None:
    """Delete the history document for ``version`` (used when rolling back)."""
    history = self.db[self.collection_name]
    version_filter = {"version": version}
    await history.delete_one(version_filter)
    logger.info(f"Removed migration record: {version}")
@trace_async_operation("migration_manager.migrate_up")
async def migrate_up(self, target_version: str | None = None) -> list[str]:
async def migrate_up(self, target_version: Optional[str] = None) -> List[str]:
"""
Apply migrations up to the target version.
Args:
target_version: Version to migrate to. If None, applies all pending migrations.
Returns:
List of applied migration versions.
"""
await self.initialize()
# Discover all migrations
all_migrations = self.discover_migrations()
applied_migrations = await self.get_applied_migrations()
# Find pending migrations
pending_migrations = []
for migration_name in all_migrations:
# Extract version from filename (assumes format: migration_YYYY-MM-DD-HHMMSS_description.py)
version = migration_name.replace("migration_", "").split("_")[0]
if version not in applied_migrations:
if target_version is None or version <= target_version:
pending_migrations.append((migration_name, version))
# Sort by version
pending_migrations.sort(key=lambda x: x[1])
applied = []
for migration_name, version in pending_migrations:
try:
logger.info(f"Applying migration: {migration_name}")
migration = await self.load_migration(migration_name)
await migration.up()
await self.record_migration(migration)
applied.append(version)
logger.info(f"Successfully applied migration: {version}")
except Exception as e:
logger.error(f"Failed to apply migration {migration_name}: {e}")
raise
return applied
@trace_async_operation("migration_manager.migrate_down")
async def migrate_down(self, target_version: str) -> list[str]:
async def migrate_down(self, target_version: str) -> List[str]:
"""
Rollback migrations down to the target version.
Args:
target_version: Version to rollback to.
Returns:
List of rolled back migration versions.
"""
await self.initialize()
applied_migrations = await self.get_applied_migrations()
# Find migrations to rollback (newer than target)
to_rollback = []
for version in reversed(applied_migrations):
if version > target_version:
to_rollback.append(version)
rolled_back = []
for version in to_rollback:
try:
@ -215,39 +215,39 @@ class MigrationManager:
if version in migration_file:
migration_name = migration_file
break
if not migration_name:
logger.warning(f"Migration file not found for version {version}")
continue
logger.info(f"Rolling back migration: {migration_name}")
migration = await self.load_migration(migration_name)
await migration.down()
await self.remove_migration_record(version)
rolled_back.append(version)
logger.info(f"Successfully rolled back migration: {version}")
except Exception as e:
logger.error(f"Failed to rollback migration {version}: {e}")
raise
return rolled_back
async def get_migration_status(self) -> dict:
"""Get current migration status."""
await self.initialize()
all_migrations = self.discover_migrations()
applied_migrations = await self.get_applied_migrations()
pending_count = len(all_migrations) - len(applied_migrations)
return {
"total_migrations": len(all_migrations),
"applied_migrations": len(applied_migrations),
"pending_migrations": pending_count,
"latest_applied": applied_migrations[-1] if applied_migrations else None,
"all_applied": applied_migrations
}
}

View file

@ -1,22 +0,0 @@
"""Entry point for running migrations: python -m app.migrations.run"""
import asyncio
from app.core.database import close_mongo_connection, connect_to_mongo
from app.migrations.migrator import MigrationManager
async def main() -> None:
    """Connect to MongoDB, apply all pending migrations, and report the result.

    The Mongo connection is closed even when a migration raises.
    """
    await connect_to_mongo()
    try:
        applied = await MigrationManager().migrate_up()
        summary = (
            f"Applied {len(applied)} migration(s): {applied}"
            if applied
            else "Already up to date — no pending migrations."
        )
        print(summary)
    finally:
        await close_mongo_connection()
if __name__ == "__main__":
asyncio.run(main())

View file

@ -1,38 +1,39 @@
"""Initial database schema setup migration."""
from datetime import datetime
from app.migrations.migrator import Migration
class Migration(Migration):
"""Initial schema setup with all collections and indexes."""
def __init__(self):
super().__init__()
self.version = "2025-08-17-120000"
self.description = "Initial database schema with users, jobs, and audit_logs collections"
async def up(self) -> None:
"""Create initial collections and indexes."""
# Users collection setup
await self.db.users.create_index([("email", 1)], unique=True)
await self.db.users.create_index([("role", 1)])
await self.db.users.create_index([("is_active", 1)])
await self.db.users.create_index([("created_at", -1)])
# Jobs collection setup
await self.db.jobs.create_index([("status", 1), ("created_at", -1)])
await self.db.jobs.create_index([("client_id", 1)])
await self.db.jobs.create_index([("updated_at", -1)])
await self.db.jobs.create_index([("languages", 1)])
# Create compound index for job queries
await self.db.jobs.create_index([
("status", 1),
("client_id", 1),
("created_at", -1)
])
# Audit logs collection setup
await self.db.audit_logs.create_index([("timestamp", -1)])
await self.db.audit_logs.create_index([("action", 1), ("timestamp", -1)])
@ -41,23 +42,23 @@ class Migration(Migration):
await self.db.audit_logs.create_index([("resource_type", 1), ("resource_id", 1)])
await self.db.audit_logs.create_index([("ip_address", 1), ("timestamp", -1)])
await self.db.audit_logs.create_index([("success", 1), ("timestamp", -1)])
# Text search index for audit logs
await self.db.audit_logs.create_index([
("description", "text"),
("details", "text"),
("error_message", "text")
])
print(f"✅ Applied migration {self.version}: {self.description}")
async def down(self) -> None:
"""Drop all collections (destructive - use with caution)."""
# This is a destructive operation - in production, you might want to backup first
await self.db.users.drop()
await self.db.jobs.drop()
await self.db.audit_logs.drop()
print(f"⚠️ Rolled back migration {self.version}: {self.description}")
print("⚠️ WARNING: All data has been deleted!")
print("⚠️ WARNING: All data has been deleted!")

View file

@ -5,75 +5,75 @@ from app.migrations.migrator import Migration
class Migration(Migration):
"""Optimize indexes for better query performance."""
def __init__(self):
super().__init__()
self.version = "2025-08-17-120001"
self.description = "Index optimization for query performance improvements"
async def up(self) -> None:
"""Add optimized indexes for common query patterns."""
# Jobs collection optimizations
# Index for job status transitions and monitoring
await self.db.jobs.create_index([
("status", 1),
("updated_at", -1),
("client_id", 1)
], name="jobs_status_updated_client_idx")
# Index for queue management (pending jobs)
await self.db.jobs.create_index([
("status", 1),
("created_at", 1)
], name="jobs_queue_processing_idx")
# Index for client job history
await self.db.jobs.create_index([
("client_id", 1),
("created_at", -1),
("status", 1)
], name="jobs_client_history_idx")
# Sparse index for error tracking
await self.db.jobs.create_index([
("status", 1),
("error", 1)
], sparse=True, name="jobs_error_tracking_idx")
# Users collection optimizations
# Index for active user queries
await self.db.users.create_index([
("is_active", 1),
("role", 1),
("last_login_at", -1)
], name="users_active_role_login_idx")
# Index for user search by email pattern
await self.db.users.create_index([
("email", "text"),
("first_name", "text"),
("last_name", "text")
], name="users_search_idx")
# Audit logs collection optimizations
# Compound index for security monitoring
await self.db.audit_logs.create_index([
("severity", 1),
("action", 1),
("timestamp", -1)
], name="audit_security_monitoring_idx")
# Index for user activity analysis
await self.db.audit_logs.create_index([
("user_id", 1),
("action", 1),
("timestamp", -1)
], name="audit_user_activity_idx")
# Index for resource access tracking
await self.db.audit_logs.create_index([
("resource_type", 1),
@ -81,30 +81,30 @@ class Migration(Migration):
("action", 1),
("timestamp", -1)
], name="audit_resource_access_idx")
# Sparse index for failed operations
await self.db.audit_logs.create_index([
("success", 1),
("timestamp", -1)
], sparse=True, name="audit_failures_idx")
# Add TTL index for automatic audit log cleanup (optional)
# Uncomment if you want automatic cleanup after 2 years
# await self.db.audit_logs.create_index(
# [("timestamp", 1)],
# [("timestamp", 1)],
# expireAfterSeconds=63072000, # 2 years
# name="audit_ttl_idx"
# )
print(f"✅ Applied migration {self.version}: {self.description}")
async def down(self) -> None:
"""Remove the optimized indexes."""
# Drop the indexes we created
indexes_to_drop = [
"jobs_status_updated_client_idx",
"jobs_queue_processing_idx",
"jobs_queue_processing_idx",
"jobs_client_history_idx",
"jobs_error_tracking_idx",
"users_active_role_login_idx",
@ -114,21 +114,21 @@ class Migration(Migration):
"audit_resource_access_idx",
"audit_failures_idx"
]
for index_name in indexes_to_drop:
try:
await self.db.jobs.drop_index(index_name)
except Exception:
pass # Index might not exist on this collection
try:
await self.db.users.drop_index(index_name)
except Exception:
pass
try:
await self.db.audit_logs.drop_index(index_name)
except Exception:
pass
print(f"⚠️ Rolled back migration {self.version}: {self.description}")
print(f"⚠️ Rolled back migration {self.version}: {self.description}")

View file

@ -1,21 +1,20 @@
"""Migrate audit log schema from basic to comprehensive format."""
from datetime import datetime
from app.migrations.migrator import Migration
class Migration(Migration):
"""Update audit log schema to comprehensive format."""
def __init__(self):
super().__init__()
self.version = "2025-08-17-120002"
self.description = "Update audit log schema from basic to comprehensive format"
async def up(self) -> None:
"""Migrate existing audit logs to new schema format."""
# Find all existing audit logs with old schema
old_logs_cursor = self.db.audit_logs.find({
# Look for logs that have the old schema structure
@ -25,9 +24,9 @@ class Migration(Migration):
{"timestamp": {"$exists": False}} # Missing new timestamp field
]
})
migration_count = 0
async for old_log in old_logs_cursor:
try:
# Map old fields to new schema
@ -39,82 +38,82 @@ class Migration(Migration):
"description": old_log.get("action", "Legacy action"),
"success": True,
"environment": "prod",
"service_name": "accessible-video-api",
"service_name": "accessible-video-api",
"api_version": "v1"
}
# Map optional fields if they exist
if "user_id" in old_log:
new_log["user_id"] = old_log["user_id"]
if "job_id" in old_log:
new_log["resource_type"] = "job"
new_log["resource_id"] = old_log["job_id"]
if "ip_address" in old_log:
new_log["ip_address"] = old_log["ip_address"]
if "user_agent" in old_log:
new_log["user_agent"] = old_log["user_agent"]
if "details" in old_log:
new_log["details"] = old_log["details"]
# Replace the old document with the new schema
await self.db.audit_logs.replace_one(
{"_id": old_log["_id"]},
new_log
)
migration_count += 1
except Exception as e:
print(f"Error migrating audit log {old_log.get('_id')}: {e}")
continue
print(f"✅ Applied migration {self.version}: Migrated {migration_count} audit log records")
def _map_old_action(self, old_action: str) -> str:
"""Map old action strings to new AuditAction enum values."""
action_mapping = {
# Job actions
"job_created": "job.create",
"job_approved": "job.approve",
"job_approved": "job.approve",
"job_rejected": "job.reject",
"job_updated": "job.update",
"job_cancelled": "job.cancel",
# Auth actions
"login": "auth.login.success",
"logout": "auth.logout",
"login_failed": "auth.login.failure",
# File actions
"file_uploaded": "file.upload",
"file_downloaded": "file.download",
# VTT actions
"vtt_edited": "vtt.edit",
# Admin actions
"user_created": "user.create",
"user_updated": "user.update",
"user_deleted": "user.delete",
}
return action_mapping.get(old_action, old_action)
async def down(self) -> None:
"""Rollback to old audit log schema format (limited)."""
# Find all audit logs with new schema
new_logs_cursor = self.db.audit_logs.find({
"timestamp": {"$exists": True},
"action": {"$exists": True}
})
rollback_count = 0
async for new_log in new_logs_cursor:
try:
# Map new fields back to old schema (lossy conversion)
@ -123,34 +122,34 @@ class Migration(Migration):
"when": new_log["timestamp"],
"action": new_log["action"]
}
# Map back optional fields
if "user_id" in new_log:
old_log["user_id"] = new_log["user_id"]
if "resource_type" in new_log and new_log["resource_type"] == "job":
old_log["job_id"] = new_log.get("resource_id")
if "ip_address" in new_log:
old_log["ip_address"] = new_log["ip_address"]
if "user_agent" in new_log:
old_log["user_agent"] = new_log["user_agent"]
if "details" in new_log:
old_log["details"] = new_log["details"]
# Replace with old schema
await self.db.audit_logs.replace_one(
{"_id": new_log["_id"]},
old_log
)
rollback_count += 1
except Exception as e:
print(f"Error rolling back audit log {new_log.get('_id')}: {e}")
continue
print(f"⚠️ Rolled back migration {self.version}: Reverted {rollback_count} audit log records")
print("⚠️ WARNING: Some audit log data may have been lost due to schema differences")
print("⚠️ WARNING: Some audit log data may have been lost due to schema differences")

View file

@ -24,7 +24,7 @@ class Migration(Migration):
# Create index on auth_provider for faster queries
await self.db.users.create_index([("auth_provider", 1)])
print("✅ Created index on auth_provider field")
print(f"✅ Created index on auth_provider field")
print(f"✅ Applied migration {self.version}: {self.description}")
@ -34,7 +34,7 @@ class Migration(Migration):
# Drop the index
try:
await self.db.users.drop_index("auth_provider_1")
print("✅ Dropped index on auth_provider field")
print(f"✅ Dropped index on auth_provider field")
except Exception as e:
print(f"⚠️ Could not drop index: {e}")

View file

@ -75,7 +75,7 @@ class Migration(Migration):
"validationLevel": "moderate", # moderate = only validate on insert/update, not existing docs
"validationAction": "error" # error = reject invalid documents
})
print("✅ Updated users collection validator")
print(f"✅ Updated users collection validator")
except Exception as e:
print(f"⚠️ Could not update validator: {e}")
# Try creating the collection if it doesn't exist
@ -86,7 +86,7 @@ class Migration(Migration):
validationLevel="moderate",
validationAction="error"
)
print("✅ Created users collection with validator")
print(f"✅ Created users collection with validator")
except Exception as e2:
print(f"⚠️ Could not create collection: {e2}")
@ -136,4 +136,4 @@ class Migration(Migration):
})
print(f"⚠️ Rolled back migration {self.version}: {self.description}")
print("⚠️ WARNING: Production role users will fail validation!")
print(f"⚠️ WARNING: Production role users will fail validation!")

View file

@ -53,7 +53,7 @@ class Migration(Migration):
"validationLevel": "moderate",
"validationAction": "error"
})
print(" Updated jobs collection validator")
print(f" Updated jobs collection validator")
except Exception as e:
print(f" Could not update validator: {e}")
raise
@ -101,4 +101,4 @@ class Migration(Migration):
})
print(f" Rolled back migration {self.version}: {self.description}")
print(" WARNING: Jobs with approved_source or qc_feedback status will fail validation!")
print(f" WARNING: Jobs with approved_source or qc_feedback status will fail validation!")

View file

@ -54,7 +54,7 @@ class Migration(Migration):
"validationLevel": "moderate",
"validationAction": "error"
})
print(" Updated jobs collection validator")
print(f" Updated jobs collection validator")
except Exception as e:
print(f" Could not update validator: {e}")
raise
@ -104,4 +104,4 @@ class Migration(Migration):
})
print(f" Rolled back migration {self.version}: {self.description}")
print(" WARNING: Jobs with rendering_video status will fail validation!")
print(f" WARNING: Jobs with rendering_video status will fail validation!")

View file

@ -60,7 +60,7 @@ class Migration(Migration):
"validationLevel": "moderate",
"validationAction": "error"
})
print(" Updated jobs collection validator")
print(f" Updated jobs collection validator")
except Exception as e:
print(f" Could not update validator: {e}")
raise
@ -111,4 +111,4 @@ class Migration(Migration):
})
print(f" Rolled back migration {self.version}: {self.description}")
print(" WARNING: Jobs with tts_failed or render_failed status will fail validation!")
print(f" WARNING: Jobs with tts_failed or render_failed status will fail validation!")

View file

@ -61,7 +61,7 @@ class Migration(Migration):
"validationLevel": "moderate",
"validationAction": "error"
})
print(" Updated jobs collection validator")
print(f" Updated jobs collection validator")
except Exception as e:
print(f" Could not update validator: {e}")
raise
@ -114,4 +114,4 @@ class Migration(Migration):
})
print(f" Rolled back migration {self.version}: {self.description}")
print(" WARNING: Jobs with rendering_qc status will fail validation!")
print(f" WARNING: Jobs with rendering_qc status will fail validation!")

View file

@ -64,7 +64,7 @@ class Migration(Migration):
"validationLevel": "moderate",
"validationAction": "error"
})
print("✅ Updated users collection validator")
print(f"✅ Updated users collection validator")
except Exception as e:
print(f"⚠️ Could not update validator: {e}")
try:
@ -74,7 +74,7 @@ class Migration(Migration):
validationLevel="moderate",
validationAction="error"
)
print("✅ Created users collection with validator")
print(f"✅ Created users collection with validator")
except Exception as e2:
print(f"⚠️ Could not create collection: {e2}")
@ -134,4 +134,4 @@ class Migration(Migration):
})
print(f"⚠️ Rolled back migration {self.version}: {self.description}")
print("⚠️ WARNING: Linguist role users will fail validation!")
print(f"⚠️ WARNING: Linguist role users will fail validation!")

View file

@ -69,7 +69,7 @@ class Migration(Migration):
"validationLevel": "moderate",
"validationAction": "error"
})
print("✅ Updated users collection validator")
print(f"✅ Updated users collection validator")
except Exception as e:
print(f"⚠️ Could not update validator: {e}")
try:
@ -79,7 +79,7 @@ class Migration(Migration):
validationLevel="moderate",
validationAction="error"
)
print("✅ Created users collection with validator")
print(f"✅ Created users collection with validator")
except Exception as e2:
print(f"⚠️ Could not create collection: {e2}")
@ -139,4 +139,4 @@ class Migration(Migration):
})
print(f"⚠️ Rolled back migration {self.version}: {self.description}")
print("⚠️ WARNING: project_manager role users will fail validation!")
print(f"⚠️ WARNING: project_manager role users will fail validation!")

View file

@ -1,6 +1,6 @@
"""Backfill memberships collection from existing pm_client_ids and team.member_user_ids."""
from datetime import UTC, datetime
from datetime import datetime, timezone
from app.migrations.migrator import Migration
@ -13,7 +13,7 @@ class Migration(Migration):
self.description = "Backfill memberships from pm_client_ids and team member lists"
async def up(self) -> None:
now = datetime.now(UTC)
now = datetime.now(timezone.utc)
upserted = 0
# 1. PROJECT_MANAGER users → MANAGER membership for each pm_client_id

View file

@ -1,53 +0,0 @@
"""Add PROCESSING_FAILED status to job schema validator and create failure indexes."""
from app.migrations.migrator import Migration
class Migration(Migration):
    """Allow the ``processing_failed`` job status and index failure lookups.

    ``up`` appends ``processing_failed`` to the status enum of the existing
    ``jobs`` ``$jsonSchema`` validator (when one is present) and creates two
    compound indexes on ``jobs``. ``down`` drops those two indexes only; the
    validator is left as it is.
    """

    version = "2026-04-29-000000"
    description = "Add processing_failed status and failure/status compound indexes on jobs"

    async def up(self) -> None:
        """Extend the jobs validator enum, then create the failure indexes."""
        db = self.db

        # Add processing_failed to the schema validator enum (if validator exists)
        try:
            listing = await db.command(
                "listCollections", filter={"name": "jobs"}
            )
            # The command reply is a plain document whose matches sit in
            # cursor.firstBatch; iterating it with ``async for`` raises
            # TypeError, which previously got swallowed and skipped the update.
            collections = listing.get("cursor", {}).get("firstBatch", [])
            if collections and collections[0].get("options", {}).get("validator"):
                existing_validator = collections[0]["options"]["validator"]
                status_path = (
                    existing_validator.get("$jsonSchema", {})
                    .get("properties", {})
                    .get("status", {})
                    .get("enum", [])
                )
                # status_path is the list inside existing_validator, so the
                # append below edits the validator document sent to collMod.
                if status_path and "processing_failed" not in status_path:
                    status_path.append("processing_failed")
                    await db.command(
                        "collMod",
                        "jobs",
                        validator=existing_validator,
                        validationAction="warn",
                    )
        except Exception as exc:
            # Validator maintenance stays best-effort so the indexes below are
            # still created, but the failure is reported instead of hidden.
            print(f"⚠️ Could not update jobs collection validator: {exc}")

        # Indexes for failure dashboard queries
        await db.jobs.create_index(
            [("failure.step", 1), ("status", 1)],
            name="idx_jobs_failure_step_status",
            background=True,
        )
        await db.jobs.create_index(
            [("status", 1), ("organization_id", 1), ("created_at", -1)],
            name="idx_jobs_status_org_created",
            background=True,
        )

    async def down(self) -> None:
        """Drop the two indexes created by ``up``."""
        db = self.db
        await db.jobs.drop_index("idx_jobs_failure_step_status")
        await db.jobs.drop_index("idx_jobs_status_org_created")

View file

@ -1,46 +0,0 @@
"""Create job_briefs collection with indexes."""
from app.migrations.migrator import Migration
class Migration(Migration):
    """Set up the ``job_briefs`` collection together with its named indexes."""

    version = "2026-04-29-000001"
    description = "Create job_briefs collection and indexes"

    async def up(self) -> None:
        """Create ``job_briefs`` explicitly, then build its four indexes."""
        database = self.db

        # Explicit creation; any failure is treated as "already exists".
        try:
            await database.create_collection("job_briefs")
        except Exception:
            pass

        # (key spec, index name, extra options) in creation order.
        index_specs = (
            (
                [("organization_id", 1), ("status", 1), ("created_at", -1)],
                "idx_briefs_org_status_created",
                {},
            ),
            ([("created_by", 1)], "idx_briefs_created_by", {}),
            ([("project_id", 1)], "idx_briefs_project_id", {"sparse": True}),
            ([("job_id", 1)], "idx_briefs_job_id", {"sparse": True}),
        )
        for key_spec, index_name, extra_options in index_specs:
            await database.job_briefs.create_index(
                key_spec,
                name=index_name,
                background=True,
                **extra_options,
            )

    async def down(self) -> None:
        """Drop the four indexes; the collection itself is kept."""
        database = self.db
        for index_name in (
            "idx_briefs_org_status_created",
            "idx_briefs_created_by",
            "idx_briefs_project_id",
            "idx_briefs_job_id",
        ):
            await database.job_briefs.drop_index(index_name)

View file

@ -1,44 +0,0 @@
"""Backfill Membership.team_ids from Team.member_user_ids (MT-17)."""
from app.migrations.migrator import Migration
class Migration(Migration):
    """Copy team membership from ``teams`` documents onto ``memberships``."""

    version = "2026-04-30-000000"
    description = "Backfill team_ids on Membership records from Team.member_user_ids"

    async def up(self) -> None:
        """Add each team's id to ``team_ids`` of its members' memberships."""
        database = self.db
        changed_memberships = 0

        # Only teams with a non-empty member list, and only the fields used here.
        teams_with_members = database.teams.find(
            {"member_user_ids": {"$exists": True, "$ne": []}},
            {"_id": 1, "client_id": 1, "member_user_ids": 1},
        )
        async for team_doc in teams_with_members:
            team_key = str(team_doc["_id"])
            organization_key = str(team_doc.get("client_id", ""))
            for member in team_doc.get("member_user_ids", []):
                # $addToSet leaves the document unmodified when the id is
                # already present, so such rows are not counted.
                outcome = await database.memberships.update_one(
                    {"user_id": str(member), "organization_id": organization_key},
                    {"$addToSet": {"team_ids": team_key}},
                )
                if outcome.modified_count:
                    changed_memberships += 1

        # Ensure index for efficient team-based lookups
        await database.memberships.create_index(
            [("team_ids", 1)],
            name="idx_memberships_team_ids",
            background=True,
            sparse=True,
        )

        print(f"✅ Backfilled team_ids on {changed_memberships} Membership records")

    async def down(self) -> None:
        """Remove ``team_ids`` from every membership and drop the index."""
        database = self.db
        await database.memberships.update_many({}, {"$unset": {"team_ids": ""}})
        try:
            await database.memberships.drop_index("idx_memberships_team_ids")
        except Exception:
            # Best effort: a failed index drop is ignored.
            pass

View file

@ -1,38 +0,0 @@
"""Add cancelled status to job schema validator."""
from app.migrations.migrator import Migration
class Migration(Migration):
    """Allow the ``cancelled`` job status in the ``jobs`` schema validator.

    ``up`` appends ``cancelled`` to the status enum of the existing
    ``$jsonSchema`` validator when one is present. ``down`` is a no-op.
    """

    version = "2026-04-30-000001"
    description = "Add cancelled status to jobs collection schema validator"

    async def up(self) -> None:
        """Append ``cancelled`` to the validator's status enum if missing."""
        db = self.db
        try:
            listing = await db.command(
                "listCollections", filter={"name": "jobs"}
            )
            # The command reply is a plain document whose matches sit in
            # cursor.firstBatch; iterating it with ``async for`` raises
            # TypeError, which previously got swallowed and skipped the update.
            collections = listing.get("cursor", {}).get("firstBatch", [])
            if collections and collections[0].get("options", {}).get("validator"):
                existing_validator = collections[0]["options"]["validator"]
                status_path = (
                    existing_validator.get("$jsonSchema", {})
                    .get("properties", {})
                    .get("status", {})
                    .get("enum", [])
                )
                # status_path is the list inside existing_validator, so the
                # append below edits the validator document sent to collMod.
                if status_path and "cancelled" not in status_path:
                    status_path.append("cancelled")
                    await db.command(
                        "collMod",
                        "jobs",
                        validator=existing_validator,
                        validationAction="warn",
                    )
        except Exception as exc:
            # Still non-fatal, but the failure is reported instead of hidden.
            print(f"⚠️ Could not update jobs collection validator: {exc}")

    async def down(self) -> None:
        """No rollback: the enum entry is left in place."""
        pass

View file

@ -1,47 +0,0 @@
"""Replace status enum in $jsonSchema validator with the full current list."""
from app.migrations.migrator import Migration
# Complete list of job status values written into the validator's enum.
ALL_STATUSES = [
    "created",
    "ingesting",
    "ai_processing",
    "pending_qc",
    "approved_english",
    "approved_source",
    "rejected",
    "qc_feedback",
    "translating",
    "tts_generating",
    "tts_failed",
    "rendering_video",
    "render_failed",
    "rendering_qc",
    "pending_final_review",
    "completed",
    "processing_failed",
    "cancelled",
]


class Migration(Migration):
    """Overwrite the status enum of the ``jobs`` validator with ALL_STATUSES."""

    version = "2026-04-30-000002"
    description = "Fix status enum in jobs $jsonSchema validator (add processing_failed + cancelled)"

    async def up(self) -> None:
        """Replace the validator's status enum; do nothing if it is absent.

        Returns early when the ``jobs`` collection is not listed, has no
        validator, or the validator has no ``status`` property.
        """
        database = self.db
        listing = await database.command("listCollections", filter={"name": "jobs"})
        first_batch = listing.get("cursor", {}).get("firstBatch", [])

        validator = None
        if first_batch:
            validator = first_batch[0].get("options", {}).get("validator")
        if not validator:
            return

        status_schema = (
            validator.get("$jsonSchema", {}).get("properties", {}).get("status")
        )
        if not status_schema:
            return

        # status_schema lives inside validator, so this edits the document
        # that is sent back with collMod.
        status_schema["enum"] = ALL_STATUSES
        await database.command(
            "collMod",
            "jobs",
            validator=validator,
            validationLevel="moderate",
            validationAction="error",
        )

    async def down(self) -> None:
        """No rollback: the enum is left as written by ``up``."""
        pass

View file

@ -1,26 +0,0 @@
"""Backfill source_has_ad=False on existing jobs and job_briefs."""
from app.migrations.migrator import Migration
class Migration(Migration):
    """Give existing jobs and job briefs an explicit ``source_has_ad`` flag."""

    version = "2026-05-08-000000"
    description = "Add source_has_ad field to jobs.source and job_briefs"

    async def up(self) -> None:
        """Set the flag to ``False`` wherever the field does not exist yet."""
        database = self.db

        # Jobs keep the flag nested under "source"; briefs keep it top-level.
        jobs_outcome = await database.jobs.update_many(
            {"source.source_has_ad": {"$exists": False}},
            {"$set": {"source.source_has_ad": False}},
        )
        briefs_outcome = await database.job_briefs.update_many(
            {"source_has_ad": {"$exists": False}},
            {"$set": {"source_has_ad": False}},
        )

        jobs_changed = jobs_outcome.modified_count
        briefs_changed = briefs_outcome.modified_count
        print(f"✅ Backfilled source_has_ad on {jobs_changed} jobs, {briefs_changed} job_briefs")

    async def down(self) -> None:
        """Remove the flag from every job and every job brief."""
        database = self.db
        for collection, field_path in (
            (database.jobs, "source.source_has_ad"),
            (database.job_briefs, "source_has_ad"),
        ):
            await collection.update_many({}, {"$unset": {field_path: ""}})

View file

@ -1,18 +1,17 @@
"""Audit log model for tracking sensitive operations."""
from datetime import datetime
from enum import StrEnum
from typing import Any
from enum import Enum
from typing import Any, Dict, Optional
from bson import ObjectId
from pydantic import BaseModel, Field
from .user import PyObjectId
class AuditAction(StrEnum):
class AuditAction(str, Enum):
"""Enumeration of auditable actions."""
# Authentication actions
LOGIN_SUCCESS = "auth.login.success"
LOGIN_FAILURE = "auth.login.failure"
@ -20,7 +19,7 @@ class AuditAction(StrEnum):
TOKEN_REFRESH = "auth.token.refresh"
PASSWORD_CHANGE = "auth.password.change"
PASSWORD_RESET = "auth.password.reset"
# User management actions
USER_CREATE = "user.create"
USER_UPDATE = "user.update"
@ -28,7 +27,7 @@ class AuditAction(StrEnum):
USER_ROLE_CHANGE = "user.role.change"
USER_ACTIVATE = "user.activate"
USER_DEACTIVATE = "user.deactivate"
# Job management actions
JOB_CREATE = "job.create"
JOB_UPDATE = "job.update"
@ -37,21 +36,17 @@ class AuditAction(StrEnum):
JOB_REJECT = "job.reject"
JOB_CANCEL = "job.cancel"
JOB_STATUS_CHANGE = "job.status.change"
JOB_TASK_FAILED = "job.task.failed"
JOB_RETRY = "job.retry"
JOB_BULK_RETRY = "job.bulk_retry"
# File operations
FILE_UPLOAD = "file.upload"
FILE_DOWNLOAD = "file.download"
FILE_DELETE = "file.delete"
FILE_ACCESS = "file.access"
# VTT editing actions
VTT_EDIT = "vtt.edit"
VTT_APPROVE = "vtt.approve"
VTT_REJECT = "vtt.reject"
VTT_RETRANSLATE = "vtt.retranslate"
# Per-language QC actions
LANGUAGE_QC_ASSIGN = "language_qc.assign"
@ -64,62 +59,19 @@ class AuditAction(StrEnum):
LANGUAGE_QC_REJECT = "language_qc.reject"
LANGUAGE_QC_REOPEN = "language_qc.reopen"
LANGUAGE_QC_COMMENT = "language_qc.comment"
# Admin actions
ADMIN_CONFIG_CHANGE = "admin.config.change"
ADMIN_SYSTEM_ACTION = "admin.system.action"
ADMIN_DATA_EXPORT = "admin.data.export"
ADMIN_AUDIT_ACCESS = "admin.audit.access"
# Glossary management
GLOSSARY_UPLOAD = "glossary.upload"
GLOSSARY_VERSION_UPLOAD = "glossary.version.upload"
GLOSSARY_ACTIVATE = "glossary.activate"
GLOSSARY_ARCHIVE = "glossary.archive"
# Client management
CLIENT_CREATE = "client.create"
CLIENT_UPDATE = "client.update"
CLIENT_DEACTIVATE = "client.deactivate"
CLIENT_PM_ASSIGN = "client.pm_assign"
CLIENT_PM_REMOVE = "client.pm_remove"
CLIENT_TEAM_CREATE = "client.team_create"
CLIENT_TEAM_UPDATE = "client.team_update"
CLIENT_TEAM_DELETE = "client.team_delete"
CLIENT_TEAM_MEMBER_ADD = "client.team_member_add"
CLIENT_TEAM_MEMBER_REMOVE = "client.team_member_remove"
CLIENT_PROJECT_CREATE = "client.project_create"
CLIENT_PROJECT_UPDATE = "client.project_update"
CLIENT_PROJECT_ARCHIVE = "client.project_archive"
# Organization management
ORG_CREATE = "org.create"
ORG_UPDATE = "org.update"
ORG_MEMBER_ADD = "org.member_add"
ORG_MEMBER_UPDATE = "org.member_update"
ORG_MEMBER_REMOVE = "org.member_remove"
# Invitations
INVITATION_CREATE = "invitation.create"
INVITATION_REVOKE = "invitation.revoke"
INVITATION_ACCEPT = "invitation.accept"
# Language QC (additional)
LANGUAGE_QC_BULK_ASSIGN = "language_qc.bulk_assign"
LANGUAGE_QC_START_WORK = "language_qc.start_work"
LANGUAGE_QC_MARK_CUE_REVIEWED = "language_qc.mark_cue_reviewed"
# Brief management
BRIEF_CREATE = "brief.create"
BRIEF_UPDATE = "brief.update"
BRIEF_SUBMIT = "brief.submit"
BRIEF_APPROVE = "brief.approve"
# Share tokens
SHARE_TOKEN_CREATE = "share.token_create"
SHARE_TOKEN_REVOKE = "share.token_revoke"
SHARE_CLIENT_DECISION = "share.client_decision"
# Security events
RATE_LIMIT_EXCEEDED = "security.rate_limit.exceeded"
VALIDATION_FAILURE = "security.validation.failure"
@ -127,9 +79,9 @@ class AuditAction(StrEnum):
SUSPICIOUS_ACTIVITY = "security.suspicious.activity"
class AuditLogSeverity(StrEnum):
class AuditLogSeverity(str, Enum):
"""Severity levels for audit events."""
INFO = "info" # Normal operations
WARNING = "warning" # Suspicious but not critical
ERROR = "error" # Failed operations
@ -138,43 +90,43 @@ class AuditLogSeverity(StrEnum):
class AuditLog(BaseModel):
"""Audit log entry model."""
id: PyObjectId | None = Field(default_factory=lambda: str(ObjectId()), alias="_id")
id: Optional[PyObjectId] = Field(default_factory=lambda: str(ObjectId()), alias="_id")
# Core audit fields
timestamp: datetime = Field(default_factory=datetime.utcnow)
action: AuditAction
severity: AuditLogSeverity = AuditLogSeverity.INFO
# Actor information
user_id: PyObjectId | None = None
user_email: str | None = None
user_role: str | None = None
user_id: Optional[PyObjectId] = None
user_email: Optional[str] = None
user_role: Optional[str] = None
# Request context
ip_address: str | None = None
user_agent: str | None = None
request_id: str | None = None
session_id: str | None = None
ip_address: Optional[str] = None
user_agent: Optional[str] = None
request_id: Optional[str] = None
session_id: Optional[str] = None
# Resource information
resource_type: str | None = None # e.g., "job", "user", "file"
resource_id: str | None = None
resource_name: str | None = None
resource_type: Optional[str] = None # e.g., "job", "user", "file"
resource_id: Optional[str] = None
resource_name: Optional[str] = None
# Action details
description: str
details: dict[str, Any] = Field(default_factory=dict)
details: Dict[str, Any] = Field(default_factory=dict)
# Outcome
success: bool = True
error_message: str | None = None
error_message: Optional[str] = None
# Additional metadata
environment: str = "prod"
service_name: str = "accessible-video-api"
api_version: str = "v1"
class Config:
populate_by_name = True
arbitrary_types_allowed = True
@ -183,49 +135,49 @@ class AuditLog(BaseModel):
class AuditLogCreate(BaseModel):
"""Schema for creating audit log entries."""
action: AuditAction
severity: AuditLogSeverity = AuditLogSeverity.INFO
description: str
# Optional fields that can be provided
user_id: PyObjectId | None = None
user_email: str | None = None
user_role: str | None = None
ip_address: str | None = None
user_agent: str | None = None
request_id: str | None = None
resource_type: str | None = None
resource_id: str | None = None
resource_name: str | None = None
details: dict[str, Any] = Field(default_factory=dict)
user_id: Optional[PyObjectId] = None
user_email: Optional[str] = None
user_role: Optional[str] = None
ip_address: Optional[str] = None
user_agent: Optional[str] = None
request_id: Optional[str] = None
resource_type: Optional[str] = None
resource_id: Optional[str] = None
resource_name: Optional[str] = None
details: Dict[str, Any] = Field(default_factory=dict)
success: bool = True
error_message: str | None = None
error_message: Optional[str] = None
class AuditLogQuery(BaseModel):
"""Schema for querying audit logs."""
# Time range
start_date: datetime | None = None
end_date: datetime | None = None
start_date: Optional[datetime] = None
end_date: Optional[datetime] = None
# Filters
action: AuditAction | None = None
severity: AuditLogSeverity | None = None
user_id: PyObjectId | None = None
user_email: str | None = None
resource_type: str | None = None
resource_id: str | None = None
success: bool | None = None
action: Optional[AuditAction] = None
severity: Optional[AuditLogSeverity] = None
user_id: Optional[PyObjectId] = None
user_email: Optional[str] = None
resource_type: Optional[str] = None
resource_id: Optional[str] = None
success: Optional[bool] = None
# Search
search: str | None = None # Full-text search in description and details
search: Optional[str] = None # Full-text search in description and details
# Pagination
skip: int = 0
limit: int = 100
# Sorting
sort_by: str = "timestamp"
sort_order: int = -1 # -1 for descending, 1 for ascending
@ -233,7 +185,7 @@ class AuditLogQuery(BaseModel):
class AuditLogResponse(BaseModel):
"""Response schema for audit log queries."""
logs: list[AuditLog]
total_count: int
page: int

View file

@ -1,5 +1,5 @@
from datetime import datetime
from typing import Annotated
from typing import Optional, Annotated
from bson import ObjectId
from pydantic import BaseModel, BeforeValidator
@ -17,12 +17,12 @@ PyObjectId = Annotated[str, BeforeValidator(validate_object_id)]
class Client(BaseModel):
id: str | None = None
id: Optional[str] = None
name: str
slug: str
is_active: bool = True
created_at: datetime | None = None
updated_at: datetime | None = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class ClientCreate(BaseModel):
@ -31,18 +31,18 @@ class ClientCreate(BaseModel):
class ClientUpdate(BaseModel):
name: str | None = None
slug: str | None = None
is_active: bool | None = None
name: Optional[str] = None
slug: Optional[str] = None
is_active: Optional[bool] = None
class Team(BaseModel):
id: str | None = None
id: Optional[str] = None
name: str
client_id: str
member_user_ids: list[str] = []
created_at: datetime | None = None
updated_at: datetime | None = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class TeamCreate(BaseModel):
@ -50,31 +50,31 @@ class TeamCreate(BaseModel):
class TeamUpdate(BaseModel):
name: str | None = None
name: Optional[str] = None
class Project(BaseModel):
id: str | None = None
id: Optional[str] = None
name: str
client_id: str
is_active: bool = True
default_languages: list[str] = []
default_linguist_id: str | None = None
default_reviewer_id: str | None = None
created_at: datetime | None = None
updated_at: datetime | None = None
default_linguist_id: Optional[str] = None
default_reviewer_id: Optional[str] = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class ProjectCreate(BaseModel):
name: str
default_languages: list[str] = []
default_linguist_id: str | None = None
default_reviewer_id: str | None = None
default_linguist_id: Optional[str] = None
default_reviewer_id: Optional[str] = None
class ProjectUpdate(BaseModel):
name: str | None = None
is_active: bool | None = None
default_languages: list[str] | None = None
default_linguist_id: str | None = None
default_reviewer_id: str | None = None
name: Optional[str] = None
is_active: Optional[bool] = None
default_languages: Optional[list[str]] = None
default_linguist_id: Optional[str] = None
default_reviewer_id: Optional[str] = None

View file

@ -91,9 +91,6 @@ class GlossaryResponse(BaseModel):
source: GlossarySource
status: GlossaryStatus
current_version_id: str | None = None
current_version_embedding_status: EmbeddingStatus | None = None
current_version_embedded_count: int | None = None
current_version_term_count: int | None = None
created_at: datetime
created_by: str

View file

@ -1,4 +1,5 @@
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, EmailStr
@ -6,7 +7,7 @@ from .organization import OrgRole
class Invitation(BaseModel):
id: str | None = None
id: Optional[str] = None
email: str
organization_id: str
role_in_org: OrgRole
@ -14,9 +15,9 @@ class Invitation(BaseModel):
token_hash: str
invited_by_user_id: str
expires_at: datetime
accepted_at: datetime | None = None
revoked_at: datetime | None = None
created_at: datetime | None = None
accepted_at: Optional[datetime] = None
revoked_at: Optional[datetime] = None
created_at: Optional[datetime] = None
class InvitationCreate(BaseModel):
@ -39,9 +40,9 @@ class InvitationPreviewResponse(BaseModel):
class InvitationAcceptRequest(BaseModel):
token: str
full_name: str | None = None
password: str | None = None
ms_id_token: str | None = None
full_name: Optional[str] = None
password: Optional[str] = None
ms_id_token: Optional[str] = None
class InvitationResponse(BaseModel):
@ -51,9 +52,9 @@ class InvitationResponse(BaseModel):
role_in_org: OrgRole
invited_by_user_id: str
expires_at: datetime
accepted_at: datetime | None = None
revoked_at: datetime | None = None
created_at: datetime | None = None
accepted_at: Optional[datetime] = None
revoked_at: Optional[datetime] = None
created_at: Optional[datetime] = None
is_expired: bool = False
is_accepted: bool = False
is_revoked: bool = False

View file

@ -1,13 +1,11 @@
from datetime import datetime
from enum import StrEnum
from typing import Any, Literal
from enum import Enum
from typing import Any, Literal, Optional
from pydantic import BaseModel, Field, constr
FailureStep = Literal["ingestion", "ai_processing", "translation", "tts", "render"]
class JobStatus(StrEnum):
class JobStatus(str, Enum):
CREATED = "created"
INGESTING = "ingesting"
AI_PROCESSING = "ai_processing"
@ -18,14 +16,12 @@ class JobStatus(StrEnum):
QC_FEEDBACK = "qc_feedback"
TRANSLATING = "translating"
TTS_GENERATING = "tts_generating"
TTS_FAILED = "tts_failed" # legacy: use PROCESSING_FAILED + failure.step="tts" for new failures
TTS_FAILED = "tts_failed" # TTS synthesis failed after retries, requires reprocessing
RENDERING_VIDEO = "rendering_video" # Accessible video rendering in progress
RENDER_FAILED = "render_failed" # legacy: use PROCESSING_FAILED + failure.step="render" for new failures
PROCESSING_FAILED = "processing_failed" # unified failure status; see Job.failure for step details
RENDER_FAILED = "render_failed" # Accessible video rendering failed, requires reprocessing
RENDERING_QC = "rendering_qc" # Re-rendering accessible video during QC review
PENDING_FINAL_REVIEW = "pending_final_review"
COMPLETED = "completed"
CANCELLED = "cancelled"
@classmethod
def is_approved(cls, status: str) -> bool:
@ -33,24 +29,14 @@ class JobStatus(StrEnum):
return status in [cls.APPROVED_ENGLISH.value, cls.APPROVED_SOURCE.value]
class JobFailure(BaseModel):
step: FailureStep
type: str
message: str
retriable: bool = True
occurred_at: datetime
retry_count: int = 0
class Source(BaseModel):
filename: str
original_filename: str | None = None
original_filename: Optional[str] = None
gcs_uri: str
duration_s: float | None = None
duration_s: Optional[float] = None
language: constr(min_length=2, max_length=10) = "en" # Final source language (from detection or explicit)
language_hint: str | None = None # User-provided hint for non-English videos
detected_language: str | None = None # AI-detected language from Gemini
source_has_ad: bool = False # Source video already contains professional audio descriptions
language_hint: Optional[str] = None # User-provided hint for non-English videos
detected_language: Optional[str] = None # AI-detected language from Gemini
class TTSPreferences(BaseModel):
@ -64,10 +50,10 @@ class TTSPreferences(BaseModel):
style_preset: Literal[
"neutral", "calm", "energetic", "professional", "warm", "documentary", "custom"
] = "neutral"
custom_style_prompt: str | None = None # Used when style_preset is "custom"
custom_style_prompt: Optional[str] = None # Used when style_preset is "custom"
# ElevenLabs-specific settings
stability: float | None = None # 0.0-1.0, default 0.5 when used
similarity_boost: float | None = None # 0.0-1.0, default 0.5 when used
stability: Optional[float] = None # 0.0-1.0, default 0.5 when used
similarity_boost: Optional[float] = None # 0.0-1.0, default 0.5 when used
class RequestedOutputs(BaseModel):
@ -75,24 +61,22 @@ class RequestedOutputs(BaseModel):
audio_description_vtt: bool = True
audio_description_mp3: bool = True
accessible_video_mp4: bool = False # Rendered video with embedded audio descriptions
accessible_video_method: Literal["overlay", "pause_insert"] | None = None # User-selected method
accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None # User-selected method
sdh_vtt: bool = False # SDH (Subtitles for Deaf and Hard of Hearing) captions with speaker labels, sound effects, music notation
descriptive_transcript: bool = False # WCAG-compliant combined speech+description transcript text file
languages: list[str] = []
transcreation: list[str] = []
tts_preferences: TTSPreferences | None = None
translation_mode: Literal["traditional", "video_native"] = "traditional"
tts_preferences: Optional[TTSPreferences] = None
translation_mode: Literal["traditional", "video_native"] = "video_native"
class PausePointData(BaseModel):
"""Pause point timing data for accessible video editing during QC."""
cue_index: int # AD cue index this pause point belongs to
original_ms: float # Rendered timeline position (ms) - for UI display
source_ms: float | None = None # Source video cut point (ms) - for re-rendering (None = use original_ms)
adjusted_ms: float | None = None # User-adjusted timestamp (ms), None = use original
source_ms: Optional[float] = None # Source video cut point (ms) - for re-rendering (None = use original_ms)
adjusted_ms: Optional[float] = None # User-adjusted timestamp (ms), None = use original
min_bound_ms: float # Minimum allowed value (end of previous AD segment)
max_bound_ms: float # Maximum allowed value (start of next AD segment)
natural_gap_ms: float = 0.0 # Natural silence already present at pause point (ms); used to size silence buffers
class VideoSegmentMetadata(BaseModel):
@ -103,16 +87,16 @@ class VideoSegmentMetadata(BaseModel):
gcs_uri: str # GCS path to segment MP4
duration_ms: float # Actual segment duration (ms)
is_freeze_frame: bool = False # True if this is a freeze frame segment with AD audio
cue_index: int | None = None # AD cue index (only for freeze frame segments)
cue_index: Optional[int] = None # AD cue index (only for freeze frame segments)
class TTSRegenerationRequest(BaseModel):
"""Request to regenerate TTS for a specific cue during QC."""
cue_index: int
requested_at: datetime
new_text: str | None = None # If provided, use this text instead of current VTT
new_text: Optional[str] = None # If provided, use this text instead of current VTT
status: Literal["pending", "processing", "completed", "failed"] = "pending"
error_message: str | None = None
error_message: Optional[str] = None
class AccessibleVideoEditState(BaseModel):
@ -120,45 +104,45 @@ class AccessibleVideoEditState(BaseModel):
pause_points: list[PausePointData] = []
video_segments: list[VideoSegmentMetadata] = []
tts_regeneration_queue: list[TTSRegenerationRequest] = []
last_render_at: datetime | None = None
last_render_at: Optional[datetime] = None
whisper_refine_enabled: bool = False # Default: off (user enables if cue positions changed)
class LangOutput(BaseModel):
captions_vtt_gcs: str | None = None
sdh_captions_vtt_gcs: str | None = None # SDH-format captions (speaker labels, sound effects, music)
ad_vtt_gcs: str | None = None
ad_mp3_gcs: str | None = None
captions_vtt_gcs: Optional[str] = None
sdh_captions_vtt_gcs: Optional[str] = None # SDH-format captions (speaker labels, sound effects, music)
ad_vtt_gcs: Optional[str] = None
ad_mp3_gcs: Optional[str] = None
# Accessible video outputs
accessible_video_gcs: str | None = None # Rendered accessible MP4
accessible_video_method: Literal["overlay", "pause_insert"] | None = None
retimed_captions_vtt_gcs: str | None = None # Re-timed captions for pause-insert method
ad_cues_gcs_prefix: str | None = None # GCS path prefix for per-cue MP3 segments
ad_cue_manifest: list[dict] | None = None # Per-cue manifest: [{cue_index, gcs_uri, text, duration_s}]
accessible_video_gcs: Optional[str] = None # Rendered accessible MP4
accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None
retimed_captions_vtt_gcs: Optional[str] = None # Re-timed captions for pause-insert method
ad_cues_gcs_prefix: Optional[str] = None # GCS path prefix for per-cue MP3 segments
ad_cue_manifest: Optional[list[dict]] = None # Per-cue manifest: [{cue_index, gcs_uri, text, duration_s}]
# QC editing state for accessible video
video_segments_gcs_prefix: str | None = None # GCS prefix for persisted video segments
accessible_video_edit_state: AccessibleVideoEditState | None = None
origin: Literal["translate", "transcreate", "gemini_translate", "video_native"] | None = None
qa_notes: str | None = None
descriptive_transcript_gcs: str | None = None # WCAG-compliant combined speech+description transcript
video_segments_gcs_prefix: Optional[str] = None # GCS prefix for persisted video segments
accessible_video_edit_state: Optional[AccessibleVideoEditState] = None
origin: Optional[Literal["translate", "transcreate", "gemini_translate", "video_native"]] = None
qa_notes: Optional[str] = None
descriptive_transcript_gcs: Optional[str] = None # WCAG-compliant combined speech+description transcript
class ReviewHistoryItem(BaseModel):
at: datetime
status: str
by: str | None = None
notes: str | None = None
by: Optional[str] = None
notes: Optional[str] = None
class Review(BaseModel):
notes: str | None = ""
reviewer_id: str | None = None
notes: Optional[str] = ""
reviewer_id: Optional[str] = None
history: list[ReviewHistoryItem] = []
# ── Per-language QC ───────────────────────────────────────────────────────────
class LanguageQCStatus(StrEnum):
class LanguageQCStatus(str, Enum):
PENDING = "pending"
IN_PROGRESS = "in_progress" # linguist is working
PENDING_REVIEW = "pending_review" # linguist submitted, awaiting reviewer
@ -178,8 +162,8 @@ class LanguageQCEvent(BaseModel):
"approve", "reject", "reopen",
"comment_added",
]
notes: str | None = None
previous_assignee_id: str | None = None
notes: Optional[str] = None
previous_assignee_id: Optional[str] = None
class LanguageQCComment(BaseModel):
@ -194,29 +178,25 @@ class LanguageQCComment(BaseModel):
class LanguageQCState(BaseModel):
status: LanguageQCStatus = LanguageQCStatus.PENDING
# Linguist slot
assigned_linguist_id: str | None = None
assigned_linguist_email: str | None = None
assigned_linguist_name: str | None = None
assigned_at: datetime | None = None
assigned_by_user_id: str | None = None
submitted_for_review_at: datetime | None = None
linguist_deadline: datetime | None = None # when linguist must submit
assigned_linguist_id: Optional[str] = None
assigned_linguist_email: Optional[str] = None
assigned_linguist_name: Optional[str] = None
assigned_at: Optional[datetime] = None
assigned_by_user_id: Optional[str] = None
submitted_for_review_at: Optional[datetime] = None
linguist_deadline: Optional[datetime] = None # when linguist must submit
# Reviewer slot
assigned_reviewer_id: str | None = None
assigned_reviewer_email: str | None = None
assigned_reviewer_name: str | None = None
assigned_reviewer_at: datetime | None = None
review_started_at: datetime | None = None
reviewer_deadline: datetime | None = None # when reviewer must decide
# Reviewer progress
total_cues: int | None = None # set when reviewer opens the job
reviewed_cues: int = 0 # incremented as reviewer marks cues reviewed
assigned_reviewer_id: Optional[str] = None
assigned_reviewer_email: Optional[str] = None
assigned_reviewer_name: Optional[str] = None
assigned_reviewer_at: Optional[datetime] = None
review_started_at: Optional[datetime] = None
reviewer_deadline: Optional[datetime] = None # when reviewer must decide
# Final outcome
reviewed_at: datetime | None = None
reviewed_by_user_id: str | None = None
reviewed_by_email: str | None = None
notes: str | None = None
reject_category: str | None = None # e.g. timing/mistranslation/terminology/profanity/length
reviewed_at: Optional[datetime] = None
reviewed_by_user_id: Optional[str] = None
reviewed_by_email: Optional[str] = None
notes: Optional[str] = None
history: list[LanguageQCEvent] = []
comments: list[LanguageQCComment] = []
@ -229,47 +209,39 @@ class QCAssignment(BaseModel):
class AISection(BaseModel):
ingestion_json: dict[str, Any] | None = None
confidence: float | None = None
ingestion_json: Optional[dict[str, Any]] = None
confidence: Optional[float] = None
class AccessibleVideoProgressItem(BaseModel):
"""Progress tracking for accessible video rendering per language."""
status: Literal["pending", "rendering", "completed", "failed"] = "pending"
method: Literal["overlay", "pause_insert"] | None = None
error_message: str | None = None
started_at: datetime | None = None
completed_at: datetime | None = None
method: Optional[Literal["overlay", "pause_insert"]] = None
error_message: Optional[str] = None
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
class Job(BaseModel):
id: str | None = Field(None, alias="_id")
id: Optional[str] = Field(None, alias="_id")
client_id: str
title: str
source: Source
requested_outputs: RequestedOutputs
status: JobStatus = JobStatus.CREATED
review: Review = Review()
outputs: dict[str, LangOutput] | None = None
accessible_video_progress: dict[str, AccessibleVideoProgressItem] | None = None
ai: AISection | None = None
error: dict[str, Any] | None = None
failure: JobFailure | None = None # structured failure info; see failure.step for pipeline stage
retry_count: int = 0 # total number of manual retries attempted
tts_rewrites: list[dict[str, Any]] | None = None # Track auto-rewritten TTS cues
project_id: str | None = None # Platform project this job belongs to (Client → Project → Job)
organization_id: str | None = None # org-tenant ID; backfilled by 2026-04-28-000003 migration
brief_id: str | None = None # JobBrief that originated this job (W-12)
gcs_prefix: str | None = None # GCS path prefix; None = legacy flat {job_id}/ layout
initial_linguist_id: str | None = None
initial_reviewer_id: str | None = None
brand_context: str | None = None # Brand names present in the video for accurate product identification
cost_tracker_project_id: str | None = None # External project ID for AI cost attribution
deadline: datetime | None = None # job-level PM deadline (overdue if past and not completed)
outputs: Optional[dict[str, LangOutput]] = None
accessible_video_progress: Optional[dict[str, AccessibleVideoProgressItem]] = None
ai: Optional[AISection] = None
error: Optional[dict[str, Any]] = None
tts_rewrites: Optional[list[dict[str, Any]]] = None # Track auto-rewritten TTS cues
project_id: Optional[str] = None # Platform project this job belongs to (Client → Project → Job)
brand_context: Optional[str] = None # Brand names present in the video for accurate product identification
cost_tracker_project_id: Optional[str] = None # External project ID for AI cost attribution
language_qc: dict[str, LanguageQCState] = {} # per-language QC state, keyed by lang code
qc_assignments: list[QCAssignment] = [] # denormalized for linguist-queue queries
created_at: datetime | None = None
updated_at: datetime | None = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class Config:
populate_by_name = True
@ -279,17 +251,15 @@ class Job(BaseModel):
class JobCreate(BaseModel):
title: str
source_is_english: bool = True # True = English source, False = other language (auto-detect)
language_hint: str | None = None # Optional hint when source_is_english=False
language_hint: Optional[str] = None # Optional hint when source_is_english=False
requested_outputs: RequestedOutputs
brand_context: str | None = None # Comma-separated brand names present in the video (e.g. "Sellotape, Coca-Cola")
source_has_ad: bool = False # Source video already contains professional audio descriptions
brand_context: Optional[str] = None # Comma-separated brand names present in the video (e.g. "Sellotape, Coca-Cola")
class JobUpdate(BaseModel):
title: str | None = None
status: JobStatus | None = None
review: Review | None = None
outputs: dict[str, LangOutput] | None = None
ai: AISection | None = None
error: dict[str, Any] | None = None
deadline: datetime | None = None
title: Optional[str] = None
status: Optional[JobStatus] = None
review: Optional[Review] = None
outputs: Optional[dict[str, LangOutput]] = None
ai: Optional[AISection] = None
error: Optional[dict[str, Any]] = None

View file

@ -1,75 +0,0 @@
"""Job Brief model — pre-approved work order submitted before job creation."""
from datetime import datetime
from enum import StrEnum
from pydantic import BaseModel, Field
from .job import RequestedOutputs
class BriefStatus(StrEnum):
DRAFT = "draft"
SUBMITTED = "submitted"
APPROVED = "approved"
REJECTED = "rejected"
FULFILLED = "fulfilled"
class JobBrief(BaseModel):
id: str | None = Field(None, alias="_id")
organization_id: str
project_id: str | None = None
title: str
description: str | None = None
requested_outputs: RequestedOutputs
languages: list[str] = []
deadline: datetime | None = None
status: BriefStatus = BriefStatus.DRAFT
created_by: str
job_id: str | None = None
created_at: datetime = Field(default_factory=datetime.utcnow)
updated_at: datetime = Field(default_factory=datetime.utcnow)
submitted_at: datetime | None = None
approved_by: str | None = None
reject_reason: str | None = None
class Config:
populate_by_name = True
class JobBriefCreate(BaseModel):
title: str
description: str | None = None
requested_outputs: RequestedOutputs
languages: list[str] = []
deadline: datetime | None = None
project_id: str | None = None
assignee_id: str | None = None
source_has_ad: bool = False # Source video already contains professional audio descriptions
class JobBriefUpdate(BaseModel):
title: str | None = None
description: str | None = None
requested_outputs: RequestedOutputs | None = None
languages: list[str] | None = None
deadline: datetime | None = None
class JobBriefResponse(BaseModel):
id: str
organization_id: str
project_id: str | None = None
title: str
description: str | None = None
requested_outputs: RequestedOutputs
languages: list[str]
deadline: datetime | None = None
status: BriefStatus
created_by: str
assignee_id: str | None = None
job_id: str | None = None
created_at: str
updated_at: str
submitted_at: str | None = None
approved_by: str | None = None

View file

@ -1,4 +1,5 @@
from datetime import datetime
from typing import Optional
from pydantic import BaseModel
@ -6,13 +7,12 @@ from .organization import OrgRole
class Membership(BaseModel):
id: str | None = None
id: Optional[str] = None
user_id: str
organization_id: str
role_in_org: OrgRole
team_ids: list[str] = [] # teams the user belongs to within this org (MT-17)
created_at: datetime | None = None
created_by: str | None = None
created_at: Optional[datetime] = None
created_by: Optional[str] = None
class MembershipCreate(BaseModel):
@ -31,4 +31,4 @@ class MemberDetail(BaseModel):
email: str
full_name: str
role_in_org: OrgRole
created_at: datetime | None = None
created_at: Optional[datetime] = None

View file

@ -1,10 +1,11 @@
from datetime import datetime
from enum import StrEnum
from enum import Enum
from typing import Optional
from pydantic import BaseModel
class OrgRole(StrEnum):
class OrgRole(str, Enum):
OWNER = "owner"
ADMIN = "admin"
MANAGER = "manager"
@ -29,13 +30,13 @@ class OrgRole(StrEnum):
class Organization(BaseModel):
id: str | None = None
id: Optional[str] = None
name: str
slug: str
is_active: bool = True
plan: str = "standard"
created_at: datetime | None = None
updated_at: datetime | None = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class OrganizationCreate(BaseModel):
@ -44,7 +45,7 @@ class OrganizationCreate(BaseModel):
class OrganizationUpdate(BaseModel):
name: str | None = None
slug: str | None = None
is_active: bool | None = None
plan: str | None = None
name: Optional[str] = None
slug: Optional[str] = None
is_active: Optional[bool] = None
plan: Optional[str] = None

View file

@ -1,6 +1,7 @@
"""Review Note model for timestamped video review notes."""
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field
@ -8,7 +9,7 @@ from pydantic import BaseModel, Field
class ReviewNote(BaseModel):
"""A timestamped note attached to a video asset during review."""
id: str | None = Field(None, alias="_id")
id: Optional[str] = Field(None, alias="_id")
job_id: str
asset_key: str # e.g., "en", "es", "en_accessible"
timestamp_seconds: float # Video timestamp when note was created
@ -16,7 +17,7 @@ class ReviewNote(BaseModel):
user_id: str # Author's user ID
user_name: str # Author's display name (denormalized for display)
created_at: datetime
updated_at: datetime | None = None
updated_at: Optional[datetime] = None
class Config:
populate_by_name = True

View file

@ -1,26 +0,0 @@
from datetime import datetime
from pydantic import BaseModel
class ShareToken(BaseModel):
id: str | None = None # token itself (32 hex chars), used as _id
job_id: str
organization_id: str
created_by_user_id: str
created_by_email: str
created_at: datetime | None = None
expires_at: datetime | None = None
is_active: bool = True
label: str | None = None # human-readable note e.g. "Sent to ACME 2026-05-01"
class ShareTokenResponse(BaseModel):
id: str
job_id: str
created_by_email: str
created_at: datetime
expires_at: datetime | None = None
is_active: bool
label: str | None = None
share_url: str # full public URL, assembled server-side

View file

@ -1,9 +1,9 @@
from datetime import datetime
from enum import StrEnum
from typing import Annotated
from enum import Enum
from typing import Optional, Annotated
from bson import ObjectId
from pydantic import BaseModel, BeforeValidator, EmailStr, Field
from pydantic import BaseModel, EmailStr, Field, BeforeValidator
def validate_object_id(v) -> str:
@ -18,7 +18,7 @@ def validate_object_id(v) -> str:
PyObjectId = Annotated[str, BeforeValidator(validate_object_id)]
class UserRole(StrEnum):
class UserRole(str, Enum):
CLIENT = "client"
REVIEWER = "reviewer"
LINGUIST = "linguist"
@ -27,23 +27,22 @@ class UserRole(StrEnum):
ADMIN = "admin"
class AuthProvider(StrEnum):
class AuthProvider(str, Enum):
LOCAL = "local"
MICROSOFT = "microsoft"
class User(BaseModel):
id: PyObjectId | None = Field(None, alias="_id")
id: Optional[PyObjectId] = Field(None, alias="_id")
email: EmailStr
hashed_password: str | None = None # Optional for Microsoft users
hashed_password: Optional[str] = None # Optional for Microsoft users
full_name: str
role: UserRole = UserRole.CLIENT
auth_provider: AuthProvider = AuthProvider.LOCAL
is_active: bool = True
pm_client_ids: list[str] = [] # Client IDs where this user is Project Manager (admin-assigned)
languages: list[str] = [] # BCP-47 language codes the user is competent in (R-8)
created_at: datetime | None = None
updated_at: datetime | None = None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class Config:
populate_by_name = True
@ -62,9 +61,8 @@ class UserCreate(BaseModel):
class UserUpdate(BaseModel):
email: EmailStr | None = None
full_name: str | None = None
role: UserRole | None = None
is_active: bool | None = None
pm_client_ids: list[str] | None = None
languages: list[str] | None = None
email: Optional[EmailStr] = None
full_name: Optional[str] = None
role: Optional[UserRole] = None
is_active: Optional[bool] = None
pm_client_ids: Optional[list[str]] = None

View file

@ -1,8 +1,9 @@
from datetime import datetime
from typing import Literal
from typing import Literal, Optional
from pydantic import BaseModel, Field
VttKind = Literal["captions", "ad"]
@ -12,7 +13,7 @@ class VttVersionActor(BaseModel):
class VttVersion(BaseModel):
id: str | None = Field(None, alias="_id")
id: Optional[str] = Field(None, alias="_id")
job_id: str
lang: str
kind: VttKind
@ -21,8 +22,8 @@ class VttVersion(BaseModel):
gcs_uri: str
created_at: datetime = Field(default_factory=datetime.utcnow)
created_by: VttVersionActor
note: str | None = None
parent_version: int | None = None
note: Optional[str] = None
parent_version: Optional[int] = None
cue_count: int = 0
byte_size: int = 0
@ -32,7 +33,7 @@ class VttVersion(BaseModel):
class VttVersionSummary(BaseModel):
"""Lightweight version entry for list responses (no content)."""
id: str | None = Field(None, alias="_id")
id: Optional[str] = Field(None, alias="_id")
job_id: str
lang: str
kind: VttKind
@ -40,8 +41,8 @@ class VttVersionSummary(BaseModel):
gcs_uri: str
created_at: datetime
created_by: VttVersionActor
note: str | None = None
parent_version: int | None = None
note: Optional[str] = None
parent_version: Optional[int] = None
cue_count: int = 0
byte_size: int = 0
@ -57,8 +58,8 @@ class VttVersionListResponse(BaseModel):
class DiffLine(BaseModel):
type: Literal["unchanged", "added", "removed"]
content: str
line_no_old: int | None = None
line_no_new: int | None = None
line_no_old: Optional[int] = None
line_no_new: Optional[int] = None
class VttDiffResponse(BaseModel):

View file

@ -10,7 +10,6 @@ You are given a video. Return a JSON object with:
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (in the detected language)
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (MUST be written in the detected language)
{SDH_FIELD}
{SOURCE_HAS_AD}
CRITICAL LANGUAGE REQUIREMENT:
- First, detect the language spoken in the video
@ -37,7 +36,7 @@ CRITICAL TIMING REQUIREMENTS:
- Each caption cue should end exactly when the speaker finishes that phrase/sentence
- Listen carefully to detect natural speech pauses and word boundaries
- Avoid starting captions too early or ending them too late
- Caption ALL audible speech — include off-screen narrators, voiceover, and any speaker not visible on screen. Do NOT omit speech because the speaker is not visible or because it plays over non-dialogue segments.
- Ensure captions align with lip movement and speech rhythm
- For audio descriptions, time them during natural speech gaps or over non-dialogue audio
- Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends)
@ -58,14 +57,6 @@ CAPTION FORMATTING (DCMP standard):
- Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds
- Use mixed case. Use ALL CAPS only for screaming or shouting
DISFLUENCY REMOVAL (DCMP §6.01):
- MANDATORY: Never include filler words, false starts, or hesitations in captions — remove them silently
- English fillers to remove: "um", "uh", "ah", "er", "hmm", "you know", "I mean", "sort of", "kind of", "basically", "literally", "honestly"
- Language-specific fillers: French "euh"/"beh"/"ben"/"genre", German "äh"/"ähm"/"halt"/"also", Spanish "eh"/"este"/"o sea"/"pues", Italian "ehm"/"allora"/"cioè"/"tipo", Dutch "eh"/"nou"/"zeg"/"eigenlijk", Portuguese "ahn"/"né"/"sabe"/"tipo"
- Remove false starts when the speaker self-corrects immediately (e.g., "I was — I went to the store" → "I went to the store")
- Do NOT remove meaningful repetition, emphasis, or intentional stylistic choices
- When in doubt whether a word is a filler or content: omit it — clean captions are preferred over over-inclusive ones
SOUND AND MUSIC FORMATTING (DCMP standard):
- Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
- Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks]
@ -78,9 +69,7 @@ SOUND AND MUSIC FORMATTING (DCMP standard):
CAPTION PLACEMENT:
- Captions are normally positioned at the bottom of the screen
- CRITICAL: When ANY of the following are visible at the BOTTOM of the frame during a caption cue — on-screen text, lower-thirds, name plates, location titles, graphics, logos, product labels, URLs, or any visual information — you MUST add the VTT cue setting "line:0%" to move that cue to the top of the screen. Format: "00:00:01.000 --> 00:00:03.000 line:0%"
- When in doubt whether bottom content conflicts with captions, use "line:0%" — it is better to be at the top than to obstruct important on-screen information
- Example: if a lower-third name plate is visible at seconds 0:05–0:08, all caption cues overlapping that range must have "line:0%"
- When visible text, graphics, logos, or on-screen information appear at the bottom of the frame during a caption cue, add the VTT cue setting "line:0%" to move that caption to the top — format: "00:00:01.000 --> 00:00:03.000 line:0%"
ETHICAL GUIDELINES FOR DESCRIBING PEOPLE (DCMP standard):
- Consistently identify people/characters by name. When a name is not yet known, identify by the most obvious visible attribute (e.g., "the person in the red jacket") until the name is established, then switch to the name and use it consistently

View file

@ -10,7 +10,6 @@ You are given a video. Return a JSON object with:
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE})
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE})
{SDH_FIELD}
{SOURCE_HAS_AD}
TARGET LANGUAGE: {TARGET_LANGUAGE}
@ -41,7 +40,7 @@ CRITICAL TIMING REQUIREMENTS:
- Each caption cue should end exactly when the speaker finishes that phrase/sentence
- Listen carefully to detect natural speech pauses and word boundaries
- Avoid starting captions too early or ending them too late
- Caption ALL audible speech — include off-screen narrators, voiceover, and any speaker not visible on screen. Do NOT omit speech because the speaker is not visible or because it plays over non-dialogue segments.
- Ensure captions align with lip movement and speech rhythm
- For audio descriptions, time them during natural speech gaps or over non-dialogue audio
- Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends)
@ -62,13 +61,6 @@ CAPTION FORMATTING (DCMP standard):
- Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds
- Use mixed case. Use ALL CAPS only for screaming or shouting
DISFLUENCY REMOVAL (DCMP §6.01):
- Do NOT include filler words, false starts, or hesitations in captions
- Remove: "um", "uh", "ah", "er", "hmm", "like" (as filler), "you know" (as filler), "I mean" (as filler)
- Also remove language-specific fillers (e.g., "euh"/"beh" in French, "äh"/"ähm" in German, "eh"/"este" in Spanish, "ehm"/"allora" in Italian)
- Remove false starts when the speaker self-corrects immediately (e.g., "I was — I went to the store" → "I went to the store")
- Do NOT remove meaningful repetition, emphasis, or intentional stylistic choices
SOUND AND MUSIC FORMATTING (DCMP standard):
- Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
- Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks]

View file

@ -1,11 +1,12 @@
"""Schemas for accessible video generation with embedded audio descriptions."""
from enum import StrEnum
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
class AccessibleVideoMethod(StrEnum):
class AccessibleVideoMethod(str, Enum):
"""Method used for integrating audio descriptions into video."""
OVERLAY = "overlay"
PAUSE_INSERT = "pause_insert"
@ -31,29 +32,29 @@ class ADPlacementCue(BaseModel):
target_start_time: float = Field(..., description="Target time in output video (seconds)")
ad_duration: float = Field(..., description="Duration of the AD TTS audio in seconds")
# For pause-insert method
pause_point: float | None = Field(
pause_point: Optional[float] = Field(
None,
description="Where to pause the video - just before the next sentence starts (gap.end - buffer). Used for pause-insert method."
)
resume_from: float | None = Field(
resume_from: Optional[float] = Field(
None,
description="Where to resume video after AD plays - just after the previous sentence ends (gap.start + buffer). Creates a small overlap for natural transitions."
)
pause_point_rationale: str | None = Field(
pause_point_rationale: Optional[str] = Field(
None,
description="Explanation of why this pause point was chosen, referencing the sentence boundary."
)
# Whisper refinement tracking
original_pause_point: float | None = Field(
original_pause_point: Optional[float] = Field(
None,
description="Original pause point from Gemini before Whisper refinement (seconds)."
)
# For overlay method
duck_start: float | None = Field(
duck_start: Optional[float] = Field(
None,
description="When to start ducking original audio (seconds). Used for overlay method."
)
duck_end: float | None = Field(
duck_end: Optional[float] = Field(
None,
description="When to end ducking original audio (seconds). Used for overlay method."
)
@ -117,10 +118,10 @@ class AccessibleVideoRenderRequest(BaseModel):
class AccessibleVideoProgress(BaseModel):
"""Progress status for accessible video rendering."""
status: str = Field(..., description="pending | rendering | completed | failed")
method: AccessibleVideoMethod | None = None
error_message: str | None = None
started_at: str | None = None
completed_at: str | None = None
method: Optional[AccessibleVideoMethod] = None
error_message: Optional[str] = None
started_at: Optional[str] = None
completed_at: Optional[str] = None
# === QC Review Accessible Video Editing Schemas ===
@ -130,8 +131,8 @@ class PausePointResponse(BaseModel):
"""Pause point timing data for QC editing."""
cue_index: int = Field(..., description="AD cue index this pause point belongs to")
original_ms: float = Field(..., description="Rendered timeline position (ms) - for display")
source_ms: float | None = Field(None, description="Source video cut point (ms) - for re-rendering (None = use original_ms)")
adjusted_ms: float | None = Field(None, description="User-adjusted timestamp (ms)")
source_ms: Optional[float] = Field(None, description="Source video cut point (ms) - for re-rendering (None = use original_ms)")
adjusted_ms: Optional[float] = Field(None, description="User-adjusted timestamp (ms)")
min_bound_ms: float = Field(..., description="Minimum allowed value (ms)")
max_bound_ms: float = Field(..., description="Maximum allowed value (ms)")
@ -144,16 +145,16 @@ class VideoSegmentResponse(BaseModel):
gcs_uri: str = Field(..., description="GCS path to segment MP4")
duration_ms: float = Field(..., description="Actual segment duration (ms)")
is_freeze_frame: bool = Field(False, description="True if freeze frame with AD audio")
cue_index: int | None = Field(None, description="AD cue index (freeze frames only)")
cue_index: Optional[int] = Field(None, description="AD cue index (freeze frames only)")
class TTSRegenerationItem(BaseModel):
"""A queued TTS regeneration request."""
cue_index: int = Field(..., description="AD cue index to regenerate")
requested_at: str = Field(..., description="ISO timestamp when requested")
new_text: str | None = Field(None, description="Override text (if provided)")
new_text: Optional[str] = Field(None, description="Override text (if provided)")
status: str = Field("pending", description="pending | processing | completed | failed")
error_message: str | None = None
error_message: Optional[str] = None
class AccessibleVideoEditStateResponse(BaseModel):
@ -170,12 +171,12 @@ class AccessibleVideoEditStateResponse(BaseModel):
default_factory=list,
description="Queued TTS regeneration requests"
)
last_render_at: str | None = Field(
last_render_at: Optional[str] = Field(
None,
description="ISO timestamp of last accessible video render"
)
total_duration_ms: float = Field(..., description="Total accessible video duration (ms)")
accessible_video_url: str | None = Field(
accessible_video_url: Optional[str] = Field(
None,
description="Signed URL for accessible video preview"
)

View file

@ -1,7 +1,6 @@
from typing import Optional
from pydantic import BaseModel, EmailStr
from ..models.user import AuthProvider, UserRole
from ..models.user import UserRole, AuthProvider
class LoginRequest(BaseModel):
@ -53,9 +52,8 @@ class UserResponse(BaseModel):
role: UserRole
auth_provider: AuthProvider
is_active: bool
created_at: str | None = None
created_at: Optional[str] = None
pm_client_ids: list[str] = []
languages: list[str] = [] # BCP-47 codes for R-8 linguist competence check
class UserListResponse(BaseModel):
@ -73,10 +71,10 @@ class CreateUserRequest(BaseModel):
class UpdateUserRequest(BaseModel):
email: EmailStr | None = None
full_name: str | None = None
role: UserRole | None = None
is_active: bool | None = None
email: Optional[EmailStr] = None
full_name: Optional[str] = None
role: Optional[UserRole] = None
is_active: Optional[bool] = None
class ChangePasswordRequest(BaseModel):

View file

@ -1,3 +1,4 @@
from typing import Optional
from pydantic import BaseModel
@ -5,10 +6,10 @@ from pydantic import BaseModel
class SignedUploadRequest(BaseModel):
filename: str
content_type: str
max_size: int | None = None
max_size: Optional[int] = None
class SignedUploadResponse(BaseModel):
upload_url: str
fields: dict[str, str]
blob_path: str
blob_path: str

View file

@ -1,10 +1,9 @@
from typing import Any
from typing import Any, Literal, Optional, Union
from pydantic import BaseModel, field_validator
from pydantic import BaseModel
from ..models.job import (
AccessibleVideoProgressItem,
JobFailure,
JobStatus,
LangOutput,
RequestedOutputs,
@ -16,20 +15,18 @@ from ..schemas.accessible_video import AccessibleVideoMethod
class JobResponse(BaseModel):
id: str
client_id: str | None = None # ID of the user who created the job
client_id: Optional[str] = None # ID of the user who created the job
title: str
status: JobStatus
source: dict[str, Any]
requested_outputs: RequestedOutputs
review: Review
outputs: dict[str, LangOutput] | None = None
accessible_video_progress: dict[str, AccessibleVideoProgressItem] | None = None
failure: JobFailure | None = None
error: dict[str, Any] | None = None
created_at: str | None = None
updated_at: str | None = None
created_by_name: str | None = None # User's full_name who created the job
cost_tracker_project_id: str | None = None
outputs: Optional[dict[str, LangOutput]] = None
accessible_video_progress: Optional[dict[str, AccessibleVideoProgressItem]] = None
created_at: Optional[str] = None
updated_at: Optional[str] = None
created_by_name: Optional[str] = None # User's full_name who created the job
cost_tracker_project_id: Optional[str] = None
class JobListResponse(BaseModel):
@ -45,20 +42,20 @@ class JobCreateRequest(BaseModel):
class JobUpdateRequest(BaseModel):
title: str | None = None
review_notes: str | None = None
cost_tracker_project_id: str | None = None
title: Optional[str] = None
review_notes: Optional[str] = None
cost_tracker_project_id: Optional[str] = None
class ApproveEnglishRequest(BaseModel):
notes: str | None = None
notes: Optional[str] = None
class ApproveSourceRequest(BaseModel):
"""Request to approve source language content (works for any language)"""
notes: str | None = None
tts_preferences: TTSPreferences | None = None # Override TTS voice settings
accessible_video_method: AccessibleVideoMethod | None = None # User-selected method for accessible video
notes: Optional[str] = None
tts_preferences: Optional[TTSPreferences] = None # Override TTS voice settings
accessible_video_method: Optional[AccessibleVideoMethod] = None # User-selected method for accessible video
class UpdateTTSPreferencesRequest(BaseModel):
@ -71,21 +68,13 @@ class RejectJobRequest(BaseModel):
class CompleteJobRequest(BaseModel):
notes: str | None = None
notes: Optional[str] = None
class VttUpdateRequest(BaseModel):
captions_vtt: str | None = None
audio_description_vtt: str | None = None
language: str | None = None # If None, defaults to source language
if_match: str | None = None # Optimistic locking — SHA1 of expected current content
retranslate_languages: bool = False # Re-translate all target languages from updated source VTT
note: str | None = None # Optional save message shown in version history
@field_validator('captions_vtt', 'audio_description_vtt', mode='before')
@classmethod
def empty_str_to_none(cls, v: Any) -> str | None:
return None if v == '' else v
captions_vtt: Optional[str] = None
audio_description_vtt: Optional[str] = None
language: Optional[str] = None # If None, defaults to source language
class VttTimingAdjustRequest(BaseModel):
@ -96,14 +85,13 @@ class VttTimingAdjustRequest(BaseModel):
class JobDownloadsResponse(BaseModel):
downloads: dict[str, dict[str, str] | str] # language -> {file_type: signed_url} OR source_video -> signed_url
downloads: dict[str, Union[dict[str, str], str]] # language -> {file_type: signed_url} OR source_video -> signed_url
class VttContentResponse(BaseModel):
captions_vtt: str | None = None
audio_description_vtt: str | None = None
retimed_captions_vtt: str | None = None # Re-timed captions for accessible videos
etag: str | None = None # SHA1 hash for optimistic locking (If-Match on PATCH)
captions_vtt: Optional[str] = None
audio_description_vtt: Optional[str] = None
retimed_captions_vtt: Optional[str] = None # Re-timed captions for accessible videos
class AssetValidationResponse(BaseModel):
@ -129,9 +117,9 @@ class BulkDeleteResponse(BaseModel):
class BulkApproveRequest(BaseModel):
"""Request to bulk approve multiple jobs with optional settings"""
job_ids: list[str]
notes: str | None = None
accessible_video_method: AccessibleVideoMethod | None = None # Method for accessible video
tts_preferences: TTSPreferences | None = None
notes: Optional[str] = None
accessible_video_method: Optional[AccessibleVideoMethod] = None # Method for accessible video
tts_preferences: Optional[TTSPreferences] = None
class BulkApproveResponse(BaseModel):
@ -159,42 +147,3 @@ class BulkReturnToQCResponse(BaseModel):
class BulkDownloadRequest(BaseModel):
"""Request to download multiple jobs as a single zip file"""
job_ids: list[str]
# Request body with a single required free-text field.
# NOTE(review): presumably used to flag a job as blocked because of a problem
# with its source video — confirm against the route that consumes it.
class BlockedOnSourceRequest(BaseModel):
    reason: str  # brief description of what is wrong with the source video
# Request body whose only field is optional; an omitted `notes` becomes "".
# NOTE(review): presumably sent when a job is promoted to QC — confirm against
# the route that consumes it.
class PromoteToQCRequest(BaseModel):
    notes: str = ""  # optional context for the QC team
# ── PR-3: Resumable / chunked upload ──────────────────────────────────────────
# Request body describing the file a client is about to upload.
# All three fields are required; the model itself performs no size check.
class UploadInitRequest(BaseModel):
    filename: str
    content_type: str
    file_size: int  # bytes — validated server-side against settings.upload_max_video_bytes
# Response body pairing a job id with the URL the client uploads to.
class UploadInitResponse(BaseModel):
    job_id: str
    upload_url: str  # GCS resumable session URI — browser uploads chunks directly here
# Request body with the job metadata for an uploaded file.
# NOTE(review): presumably sent after the chunked upload finishes, with
# `job_id` being the id returned in UploadInitResponse — confirm against the
# route handler.
class UploadCompleteRequest(BaseModel):
    # Required fields.
    job_id: str
    title: str
    original_filename: str
    requested_outputs: dict  # untyped dict; its expected keys are not visible here
    # Optional fields; each is None when the client omits it.
    brand_context: str | None = None
    project_id: str | None = None
    brief_id: str | None = None
    deadline: str | None = None  # plain string; no date format is enforced by this model
    initial_linguist_id: str | None = None
    initial_reviewer_id: str | None = None
# Request body naming one language, with an optional free-text reason.
# NOTE(review): presumably triggers re-translation of that language — confirm
# against the route that consumes it.
class RetranslateLanguageRequest(BaseModel):
    language: str  # required; format (e.g. BCP-47) is not enforced by this model
    reason: str | None = None  # None when the client gives no reason

View file

@ -1,6 +1,7 @@
"""Pydantic schemas for Review Note API requests and responses."""
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field
@ -30,7 +31,7 @@ class ReviewNoteResponse(BaseModel):
user_id: str
user_name: str
created_at: str # ISO format
updated_at: str | None = None # ISO format
updated_at: Optional[str] = None # ISO format
@classmethod
def from_model(cls, note: dict) -> "ReviewNoteResponse":

View file

@ -2,19 +2,19 @@
import uuid
from datetime import datetime, timedelta
from typing import Any
from typing import Any, Dict, List, Optional
from fastapi import Request
from motor.motor_asyncio import AsyncIOMotorCollection
from app.core.config import get_settings
from app.core.database import get_database
from app.core.config import get_settings
from app.models.audit_log import (
AuditAction,
AuditLog,
AuditLogQuery,
AuditLog,
AuditLogCreate,
AuditLogQuery,
AuditLogResponse,
AuditLogSeverity,
AuditAction,
AuditLogSeverity
)
from app.models.user import User
from app.telemetry.tracing import trace_async_operation
@ -22,45 +22,45 @@ from app.telemetry.tracing import trace_async_operation
class AuditLogger:
"""Service for managing audit logs."""
def __init__(self):
self.settings = get_settings()
self.collection: AsyncIOMotorCollection | None = None
self.collection: Optional[AsyncIOMotorCollection] = None
async def _get_collection(self) -> AsyncIOMotorCollection:
    """Return the audit-log collection, resolving it on first use.

    The handle is stored on ``self.collection`` so later calls skip the
    ``get_database()`` lookup.
    """
    if self.collection is not None:
        return self.collection
    database = await get_database()
    self.collection = database.audit_logs
    return self.collection
@trace_async_operation("audit_logger.log_action")
async def log_action(
self,
action: AuditAction,
description: str,
user: User | None = None,
request: Request | None = None,
resource_type: str | None = None,
resource_id: str | None = None,
resource_name: str | None = None,
details: dict[str, Any] | None = None,
user: Optional[User] = None,
request: Optional[Request] = None,
resource_type: Optional[str] = None,
resource_id: Optional[str] = None,
resource_name: Optional[str] = None,
details: Optional[Dict[str, Any]] = None,
severity: AuditLogSeverity = AuditLogSeverity.INFO,
success: bool = True,
error_message: str | None = None
error_message: Optional[str] = None
) -> str:
"""
Log an audit event.
Returns:
The ID of the created audit log entry.
"""
# Extract request context
ip_address = None
user_agent = None
request_id = None
if request:
# Get IP address (handle forwarded headers)
forwarded_for = request.headers.get("X-Forwarded-For")
@ -68,10 +68,10 @@ class AuditLogger:
ip_address = forwarded_for.split(',')[0].strip()
elif request.client:
ip_address = request.client.host
user_agent = request.headers.get("User-Agent")
request_id = request.headers.get("X-Request-ID", str(uuid.uuid4()))
# Create audit log entry
audit_log = AuditLog(
action=action,
@ -93,26 +93,22 @@ class AuditLogger:
service_name="accessible-video-api",
api_version="v1"
)
# Save to database — non-raising so audit failure never aborts the primary operation
# Save to database
collection = await self._get_collection()
try:
result = await collection.insert_one(audit_log.dict(by_alias=True))
return str(result.inserted_id)
except Exception as exc: # noqa: BLE001
import logging
logging.getLogger(__name__).error("audit log insert failed: %s", exc)
return ""
result = await collection.insert_one(audit_log.dict(by_alias=True))
return str(result.inserted_id)
@trace_async_operation("audit_logger.query_logs")
async def query_logs(self, query: AuditLogQuery) -> AuditLogResponse:
"""Query audit logs with filtering and pagination."""
collection = await self._get_collection()
# Build MongoDB query
mongo_query = {}
# Time range filter
if query.start_date or query.end_date:
timestamp_filter = {}
@ -121,7 +117,7 @@ class AuditLogger:
if query.end_date:
timestamp_filter["$lte"] = query.end_date
mongo_query["timestamp"] = timestamp_filter
# Exact match filters
if query.action:
mongo_query["action"] = query.action
@ -140,7 +136,7 @@ class AuditLogger:
mongo_query["resource_id"] = query.resource_id
if query.success is not None:
mongo_query["success"] = query.success
# Text search
if query.search:
mongo_query["$or"] = [
@ -148,23 +144,23 @@ class AuditLogger:
{"details": {"$regex": query.search, "$options": "i"}},
{"error_message": {"$regex": query.search, "$options": "i"}}
]
# Get total count
total_count = await collection.count_documents(mongo_query)
# Execute query with pagination and sorting
cursor = collection.find(mongo_query)
# Apply sorting
sort_direction = query.sort_order
cursor = cursor.sort(query.sort_by, sort_direction)
# Apply pagination
cursor = cursor.skip(query.skip).limit(query.limit)
# Execute query
documents = await cursor.to_list(length=query.limit)
# Convert to Pydantic models
logs = []
for doc in documents:
@ -174,11 +170,11 @@ class AuditLogger:
# Log conversion error but continue
print(f"Error converting audit log document: {e}")
continue
# Calculate pagination info
page = (query.skip // query.limit) + 1
has_more = (query.skip + len(logs)) < total_count
return AuditLogResponse(
logs=logs,
total_count=total_count,
@ -186,14 +182,14 @@ class AuditLogger:
page_size=len(logs),
has_more=has_more
)
async def get_user_activity(self, user_id: str, days: int = 30) -> list[AuditLog]:
async def get_user_activity(self, user_id: str, days: int = 30) -> List[AuditLog]:
"""Get recent activity for a specific user."""
from_date = datetime.utcnow().replace(
hour=0, minute=0, second=0, microsecond=0
) - timedelta(days=days)
query = AuditLogQuery(
user_id=user_id,
start_date=from_date,
@ -201,15 +197,15 @@ class AuditLogger:
sort_by="timestamp",
sort_order=-1
)
response = await self.query_logs(query)
return response.logs
async def get_security_events(self, hours: int = 24) -> list[AuditLog]:
async def get_security_events(self, hours: int = 24) -> List[AuditLog]:
"""Get recent security-related events."""
from_date = datetime.utcnow() - timedelta(hours=hours)
security_actions = [
AuditAction.LOGIN_FAILURE,
AuditAction.RATE_LIMIT_EXCEEDED,
@ -217,38 +213,38 @@ class AuditLogger:
AuditAction.UNAUTHORIZED_ACCESS,
AuditAction.SUSPICIOUS_ACTIVITY
]
collection = await self._get_collection()
query = {
"timestamp": {"$gte": from_date},
"action": {"$in": security_actions}
}
cursor = collection.find(query).sort("timestamp", -1).limit(1000)
documents = await cursor.to_list(length=1000)
logs = []
for doc in documents:
try:
logs.append(AuditLog(**doc))
except Exception:
continue
return logs
async def cleanup_old_logs(self, retention_days: int = 365) -> int:
    """Delete audit log entries that fall outside the retention window.

    The cutoff is today's midnight (per ``datetime.utcnow()``) minus
    ``retention_days`` days; every document whose ``timestamp`` is strictly
    earlier than the cutoff is removed.

    Returns:
        The number of documents deleted.
    """
    midnight_today = datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    cutoff_date = midnight_today - timedelta(days=retention_days)

    audit_logs = await self._get_collection()
    deletion = await audit_logs.delete_many({"timestamp": {"$lt": cutoff_date}})
    return deletion.deleted_count
@ -281,16 +277,16 @@ async def log_auth_failure(email: str, request: Request, reason: str):
)
async def log_job_action(action: AuditAction, job_id: str, user: User, request: Request, details: dict | None = None):
async def log_job_action(action: AuditAction, job_id: str, user: User, request: Request, details: Optional[Dict] = None):
"""Log job-related actions."""
action_descriptions = {
AuditAction.JOB_CREATE: "Job created",
AuditAction.JOB_APPROVE: "Job approved",
AuditAction.JOB_APPROVE: "Job approved",
AuditAction.JOB_REJECT: "Job rejected",
AuditAction.JOB_CANCEL: "Job cancelled",
AuditAction.JOB_UPDATE: "Job updated"
}
await audit_logger.log_action(
action=action,
description=f"{action_descriptions.get(action, str(action))} by {user.email}",
@ -302,7 +298,7 @@ async def log_job_action(action: AuditAction, job_id: str, user: User, request:
)
async def log_user_management(action: AuditAction, target_user_id: str, admin_user: User, request: Request, details: dict | None = None):
async def log_user_management(action: AuditAction, target_user_id: str, admin_user: User, request: Request, details: Optional[Dict] = None):
"""Log user management actions."""
action_descriptions = {
AuditAction.USER_CREATE: "User created",
@ -312,7 +308,7 @@ async def log_user_management(action: AuditAction, target_user_id: str, admin_us
AuditAction.USER_ACTIVATE: "User activated",
AuditAction.USER_DEACTIVATE: "User deactivated"
}
await audit_logger.log_action(
action=action,
description=f"{action_descriptions.get(action, str(action))} by admin {admin_user.email}",
@ -325,7 +321,7 @@ async def log_user_management(action: AuditAction, target_user_id: str, admin_us
)
async def log_security_event(action: AuditAction, description: str, request: Request, user: User | None = None, details: dict | None = None):
async def log_security_event(action: AuditAction, description: str, request: Request, user: Optional[User] = None, details: Optional[Dict] = None):
"""Log security-related events."""
await audit_logger.log_action(
action=action,
@ -335,4 +331,4 @@ async def log_security_event(action: AuditAction, description: str, request: Req
severity=AuditLogSeverity.WARNING if action != AuditAction.SUSPICIOUS_ACTIVITY else AuditLogSeverity.CRITICAL,
success=False,
details=details
)
)

View file

@ -1,135 +0,0 @@
"""Align Gemini caption VTT timings against Whisper word-level timestamps.
Algorithm:
For each VTT cue, tokenise its text and search for the token sequence in the
Whisper word stream starting from the cursor position (with a look-ahead window).
When a match of sufficient confidence is found the cue's start/end timestamps
are replaced with the matched Whisper words' start/end. Cues that cannot be
matched (music notation, sound effects, empty cues) keep their original Gemini
timestamps. The result has Whisper-accurate timings early in the video and
graceful fallbacks where Whisper didn't capture the audio.
"""
import bisect
import re
from dataclasses import dataclass
from ..core.logging import get_logger
from ..lib.vtt import VTTEditor, VTTParser
from ..services.whisper_service import WordTimestamp
logger = get_logger(__name__)
# Anything that is neither a word character nor an apostrophe is removed
# before tokens are compared.
_PUNCT = re.compile(r"[^\w']", re.UNICODE)

# Tokens shorter than this are treated as stop-words and left out of matching.
_MIN_TOKEN_LEN = 2

# Share of a cue's tokens that must be found among the Whisper words before
# the cue is re-timed. Was 0.5; lowered to 0.35 to cope with Gemini
# paraphrasing and short cues.
_MIN_MATCH_RATIO = 0.35

# Number of Whisper words, counted from the cursor, that are searched for a
# cue's tokens. Was 60; widened to 150 so the window stays valid even after
# several failed cues.
_SEARCH_WINDOW = 150


def _tokenise(text: str) -> list[str]:
    """Split text on whitespace into normalised comparison tokens.

    Each piece has punctuation stripped (apostrophes are kept) and is
    lower-cased; pieces left shorter than ``_MIN_TOKEN_LEN`` are discarded.
    """
    tokens: list[str] = []
    for piece in text.split():
        cleaned = _PUNCT.sub("", piece).lower()
        if len(cleaned) >= _MIN_TOKEN_LEN:
            tokens.append(cleaned)
    return tokens
@dataclass
class _Match:
    """Location and quality of a cue match within the Whisper word list."""

    # Index of the Whisper word where the match begins.
    first_word_idx: int
    # Index of the last Whisper word that matched a cue token.
    last_word_idx: int
    # matched_tokens / cue_tokens
    ratio: float
def _find_match(
    cue_tokens: list[str],
    whisper_words: list[WordTimestamp],
    cursor: int,
) -> _Match | None:
    """Match cue_tokens, in order, against the Whisper words after cursor.

    The search is forward-only: it looks at up to ``_SEARCH_WINDOW`` words
    starting at ``cursor``. Tokens are consumed greedily and in order; a
    Whisper word that does not equal the next expected token is skipped.

    Args:
        cue_tokens: Normalised tokens of one cue (see ``_tokenise``).
        whisper_words: Whisper word stream; only ``.word`` is read here.
        cursor: Index of the first Whisper word to consider.

    Returns:
        A ``_Match`` spanning the first and last matched words, or ``None``
        when there are no tokens, nothing matched, or fewer than
        ``_MIN_MATCH_RATIO`` of the tokens were found.
    """
    if not cue_tokens:
        return None

    window_end = min(cursor + _SEARCH_WINDOW, len(whisper_words))

    # A single greedy pass from the cursor is sufficient: starting the same
    # in-order scan from any later word can never match more tokens, so
    # retrying every start index only repeated work without changing the
    # match count or the last matched word.
    matched = 0
    first_idx = -1
    last_idx = -1
    for w_idx in range(cursor, window_end):
        if matched == len(cue_tokens):
            break
        w_tok = _PUNCT.sub("", whisper_words[w_idx].word).lower()
        if w_tok == cue_tokens[matched]:
            if matched == 0:
                # Anchor the match on the first word that actually matched,
                # not on the scan start, so unrelated words sitting between
                # the cursor and the cue do not pull the cue's start earlier.
                first_idx = w_idx
            last_idx = w_idx
            matched += 1

    ratio = matched / len(cue_tokens)
    if matched == 0 or ratio < _MIN_MATCH_RATIO:
        return None
    return _Match(first_idx, last_idx, ratio)
def _cursor_for_time(whisper_words: list[WordTimestamp], t: float, from_idx: int) -> int:
    """Return the index of the first Whisper word starting at or after t.

    The binary search begins at ``from_idx`` and the result is clamped to the
    last valid index, so a time past the final word yields that final word.
    An empty word list yields 0.

    NOTE(review): the bisection assumes ``whisper_words`` is ordered by
    ``.start`` — confirm against the Whisper service output.
    """
    if not whisper_words:
        return 0
    # Bisect on the words directly via ``key`` rather than materialising a
    # list of every start time on each call.
    idx = bisect.bisect_left(whisper_words, t, from_idx, key=lambda w: w.start)
    return min(idx, len(whisper_words) - 1)
def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    """Re-time VTT cues from Whisper word timestamps wherever a match exists.

    Cues are processed in order with a moving position into ``whisper_words``.
    A cue with no usable tokens is skipped. A cue that cannot be matched keeps
    its timing, and the position is moved forward based on the cue's start
    time so later cues do not search from a stale spot.

    Args:
        captions_vtt: Caption track as VTT text.
        whisper_words: Word-level timestamps; when empty the input is
            returned unchanged.

    Returns:
        VTT text built from the (possibly re-timed) cues.
    """
    if not whisper_words:
        logger.warning("caption_aligner: no Whisper words supplied — returning original VTT")
        return captions_vtt

    cues = VTTParser.parse(captions_vtt)
    position = 0
    retimed = 0

    for cue in cues:
        cue_tokens = _tokenise(cue.text)
        if not cue_tokens:
            continue

        hit = _find_match(cue_tokens, whisper_words, position)
        if hit is not None:
            start = whisper_words[hit.first_word_idx].start
            end = whisper_words[hit.last_word_idx].end
            # Only adopt the new timing when it describes a non-empty span.
            if end > start:
                cue.start_time = start
                cue.end_time = end
                retimed += 1
            position = hit.last_word_idx + 1
        else:
            position = _cursor_for_time(whisper_words, cue.start_time, position)

    logger.info(
        f"caption_aligner: aligned {retimed}/{len(cues)} cues "
        f"against {len(whisper_words)} Whisper words"
    )

    if retimed == 0:
        # Nothing was re-timed: rebuild from the original VTT and cue texts.
        # NOTE(review): presumably this leaves the original timings intact —
        # confirm against VTTEditor.translate_preserving_timing.
        return VTTEditor.translate_preserving_timing(
            captions_vtt, [c.text for c in cues]
        )
    return VTTParser.build(cues)

View file

@ -1,100 +0,0 @@
"""
Cloud Run Jobs dispatcher replaces Celery .delay() for heavy pipeline tasks.
Heavy tasks (ingest, translate, render, rerender) are dispatched as Cloud Run Job
executions. Each execution runs `python -m app.tasks.runner --task <name> --job-id <id>`.
Light tasks (notify, embed_glossary) stay on the local Celery worker.
Env vars:
CLOUD_RUN_WORKER_JOB Cloud Run Job name (default: va-worker)
GCP_PROJECT_ID GCP project (from settings)
GCP_REGION Cloud Run region (default: europe-west1)
USE_CELERY_FALLBACK set to "true" to use local Celery instead (local dev)
"""
from __future__ import annotations
import os
from typing import TYPE_CHECKING
from ..core.logging import get_logger
if TYPE_CHECKING:
pass
logger = get_logger(__name__)
# Deployment settings, read from the environment once at import time.
_JOB_NAME = os.getenv("CLOUD_RUN_WORKER_JOB", "va-worker")
_REGION = os.getenv("GCP_REGION", "europe-west1")
# True only when USE_CELERY_FALLBACK is "true" (compared case-insensitively).
_USE_CELERY = os.getenv("USE_CELERY_FALLBACK", "false").lower() == "true"


def _job_resource(project: str) -> str:
    """Build the resource path of the worker job for the given project."""
    return "projects/{}/locations/{}/jobs/{}".format(project, _REGION, _JOB_NAME)
async def dispatch(task: str, job_id: str, **extra_args: str | list) -> str:
    """Start a heavy pipeline task as a Cloud Run Job execution.

    When ``_USE_CELERY`` is set, the call is handed to ``_celery_fallback``
    instead and its return value is returned.

    Args:
        task: Task name, passed to the container as ``--task``.
        job_id: Job identifier, passed to the container as ``--job-id``.
        **extra_args: Further options. Each key becomes a ``--kebab-case``
            flag; list values are comma-joined, other values are stringified,
            and ``None`` values are left out.

    Returns:
        The name of the Cloud Run operation that was started.
    """
    if _USE_CELERY:
        return _celery_fallback(task, job_id, **extra_args)

    from google.cloud import run_v2  # type: ignore[import]

    from ..core.config import settings

    args = ["--task", task, "--job-id", job_id]
    for option, value in extra_args.items():
        if value is None:
            continue
        flag = "--" + option.replace("_", "-")
        if isinstance(value, list):
            rendered = ",".join(str(item) for item in value)
        else:
            rendered = str(value)
        args.extend([flag, rendered])

    client = run_v2.JobsAsyncClient()
    container = run_v2.RunJobRequest.Overrides.ContainerOverride(args=args)
    overrides = run_v2.RunJobRequest.Overrides(container_overrides=[container])
    request = run_v2.RunJobRequest(
        name=_job_resource(settings.gcp_project_id),
        overrides=overrides,
    )

    logger.info("Dispatching Cloud Run Job: task=%s job_id=%s args=%s", task, job_id, args)
    operation = await client.run_job(request=request)
    op_name = operation.operation.name
    logger.info("Cloud Run Job dispatched: %s", op_name)
    return op_name
def _celery_fallback(task: str, job_id: str, **extra_args) -> str:
    """Queue the task on the local Celery worker instead of Cloud Run.

    Supported task names are "ingest", "translate", "render" and "rerender";
    the matching Celery task module is imported only when it is needed.

    Returns:
        A handle of the form ``celery:<task>:<job_id>``.

    Raises:
        ValueError: If ``task`` is not one of the supported names.
    """
    logger.warning("USE_CELERY_FALLBACK=true — dispatching via local Celery: task=%s", task)
    handle = f"celery:{task}:{job_id}"

    if task == "ingest":
        from ..tasks.ingest_and_ai import ingest_and_ai_task

        ingest_and_ai_task.delay(job_id)
        return handle

    if task == "translate":
        from ..tasks.translate_and_synthesize import translate_and_synthesize_task

        languages = extra_args.get("languages")
        if isinstance(languages, str):
            # A comma-separated string is split, dropping empty entries.
            languages = [code for code in languages.split(",") if code]
        translate_and_synthesize_task.delay(job_id, languages=languages or None)
        return handle

    if task == "render":
        from ..tasks.render_accessible_video import render_accessible_video_task

        render_accessible_video_task.delay(job_id, extra_args.get("language", "en"))
        return handle

    if task == "rerender":
        from ..tasks.rerender_accessible_video import rerender_accessible_video_task

        language = extra_args.get("language", "en")
        regenerate_cues = extra_args.get("regenerate_cues", [])
        whisper_refine = extra_args.get("whisper_refine", False)
        rerender_accessible_video_task.delay(job_id, language, regenerate_cues, whisper_refine)
        return handle

    raise ValueError(f"Unknown task: {task}")

View file

@ -1,6 +1,7 @@
"""Thin HTTP client for the centralized Oliver AI Cost Tracker."""
import asyncio
from typing import Optional
import httpx
@ -18,7 +19,7 @@ def preflight(
*,
model: str,
user_external_id: str,
project_id: str | None = None,
project_id: Optional[str] = None,
) -> None:
if not settings.cost_tracker_base_url or not settings.cost_tracker_enabled:
return
@ -50,7 +51,7 @@ async def aio_preflight(
*,
model: str,
user_external_id: str,
project_id: str | None = None,
project_id: Optional[str] = None,
) -> None:
await asyncio.to_thread(preflight, model=model, user_external_id=user_external_id, project_id=project_id)
@ -60,11 +61,11 @@ def record(
model: str,
provider: str,
user_external_id: str,
project_id: str | None = None,
project_id: Optional[str] = None,
job_external_id: str = "",
input_tokens: int = 0,
output_tokens: int = 0,
chars: int | None = None,
chars: Optional[int] = None,
latency_ms: int = 0,
status: str = "success",
) -> None:
@ -75,10 +76,8 @@ def record(
if chars is not None:
units["char"] = chars
else:
if input_tokens:
units["token_input"] = input_tokens
if output_tokens:
units["token_output"] = output_tokens
if input_tokens: units["token_input"] = input_tokens
if output_tokens: units["token_output"] = output_tokens
payload: dict = {
"source_app": settings.cost_tracker_source_app,
@ -89,10 +88,8 @@ def record(
"latency_ms": latency_ms,
"status": status,
}
if project_id:
payload["project_external_id"] = project_id
if job_external_id:
payload["job_external_id"] = job_external_id
if project_id: payload["project_external_id"] = project_id
if job_external_id: payload["job_external_id"] = job_external_id
httpx.post(
f"{settings.cost_tracker_base_url}/usage/record",

View file

@ -16,8 +16,8 @@ Format:
Reference: WCAG 2.1 Success Criterion 1.2.1
"""
from ..core.logging import get_logger
from ..lib.vtt import VTTCue, VTTParser
from ..core.logging import get_logger
logger = get_logger(__name__)

View file

@ -6,6 +6,7 @@ Fetches and caches available voices from the ElevenLabs API.
import time
from dataclasses import dataclass, field
from typing import Optional
import aiohttp
@ -89,7 +90,7 @@ class ElevenLabsVoiceService:
return voices
async def get_voice_by_id(self, voice_id: str) -> ElevenLabsVoice | None:
async def get_voice_by_id(self, voice_id: str) -> Optional[ElevenLabsVoice]:
"""Look up a specific voice by ID."""
voices = await self.get_voices()
for v in voices:

View file

@ -1,7 +1,5 @@
import html as _html
from datetime import datetime
from jinja2 import Template
from ..core.config import settings
@ -387,7 +385,7 @@ class EmailService:
template = Template(template_str)
return template.render(
job_title=_html.escape(job_title),
job_title=job_title,
download_links=download_links
)

View file

@ -1,15 +1,13 @@
"""
Embedding service backed by Vertex AI text-multilingual-embedding-002.
Embedding service backed by Gemini text-embedding-004.
Uses the google-genai SDK in Vertex AI mode (Application Default Credentials)
instead of AI Studio so we get higher per-project quotas and no per-user limits.
Batch size: 100 texts per API call.
Provides batch embedding with retry/backoff for use in glossary ingestion.
Batch size: 100 texts per API call (API limit is 2048 but we keep it conservative
for memory and retry ergonomics with large glossaries).
"""
from __future__ import annotations
import asyncio
import re
from collections.abc import Sequence
from google import genai
@ -20,29 +18,15 @@ from ..core.logging import get_logger
logger = get_logger(__name__)
# Vertex AI multilingual model — 768-dim, 50+ languages, higher quota than AI Studio
_EMBED_MODEL = "text-multilingual-embedding-002"
_EMBED_MODEL = "gemini-embedding-001"
_BATCH_SIZE = 100
_MAX_RETRIES = 5
_INITIAL_BACKOFF = 4.0
# Matches the 'retryDelay': '7s' field in Gemini/Vertex 429 error bodies
_RETRY_DELAY_RE = re.compile(r"'retryDelay':\s*'(\d+)s'")
def _parse_retry_delay(exc: Exception) -> float | None:
"""Extract the server-suggested retry delay from a 429 error."""
m = _RETRY_DELAY_RE.search(str(exc))
return float(m.group(1)) if m else None
_MAX_RETRIES = 3
_INITIAL_BACKOFF = 2.0
class EmbeddingService:
def __init__(self) -> None:
self._client = genai.Client(
vertexai=True,
project=settings.gcp_project_id,
location=settings.gcp_location,
)
self._client = genai.Client(api_key=settings.gemini_api_key)
async def embed_texts(self, texts: Sequence[str]) -> list[list[float]]:
"""
@ -78,12 +62,8 @@ class EmbeddingService:
if attempt == _MAX_RETRIES:
logger.error(f"Embedding batch failed after {_MAX_RETRIES} attempts: {exc}")
raise
# Honour the server-suggested retryDelay when present (e.g. 429 RESOURCE_EXHAUSTED).
# Fall back to our own exponential backoff otherwise.
server_delay = _parse_retry_delay(exc)
delay = max(server_delay + 1.0, backoff) if server_delay else backoff
logger.warning(f"Embedding attempt {attempt} failed, retrying in {delay}s: {exc}")
await asyncio.sleep(delay)
logger.warning(f"Embedding attempt {attempt} failed, retrying in {backoff}s: {exc}")
await asyncio.sleep(backoff)
backoff *= 2
raise RuntimeError("unreachable") # makes type-checker happy

View file

@ -13,6 +13,8 @@ import logging
import os
import subprocess
import tempfile
import uuid
from typing import Any, Optional
from fastapi import FastAPI, HTTPException
from google.cloud import storage
@ -273,7 +275,7 @@ async def run_ffmpeg(request: RunFFmpegRequest):
raise
except Exception as e:
logger.error(f"FFmpeg operation failed: {e}")
raise HTTPException(status_code=500, detail=str(e)) from None
raise HTTPException(status_code=500, detail=str(e))
@app.post("/probe", response_model=ProbeResponse)
@ -328,7 +330,7 @@ async def probe_video(request: ProbeRequest):
raise
except Exception as e:
logger.error(f"Probe failed: {e}")
raise HTTPException(status_code=500, detail=str(e)) from None
raise HTTPException(status_code=500, detail=str(e))
@app.post("/encode-segment", response_model=RunFFmpegResponse)
@ -380,7 +382,7 @@ async def encode_segment(request: EncodeSegmentRequest):
raise
except Exception as e:
logger.error(f"Encode segment failed: {e}")
raise HTTPException(status_code=500, detail=str(e)) from None
raise HTTPException(status_code=500, detail=str(e))
@app.post("/extract-frame", response_model=RunFFmpegResponse)
@ -425,7 +427,7 @@ async def extract_frame(request: ExtractFrameRequest):
raise
except Exception as e:
logger.error(f"Extract frame failed: {e}")
raise HTTPException(status_code=500, detail=str(e)) from None
raise HTTPException(status_code=500, detail=str(e))
@app.post("/create-freeze-segment", response_model=RunFFmpegResponse)
@ -480,7 +482,7 @@ async def create_freeze_segment(request: CreateFreezeSegmentRequest):
raise
except Exception as e:
logger.error(f"Create freeze segment failed: {e}")
raise HTTPException(status_code=500, detail=str(e)) from None
raise HTTPException(status_code=500, detail=str(e))
@app.post("/concatenate", response_model=RunFFmpegResponse)
@ -534,4 +536,4 @@ async def concatenate_segments(request: ConcatenateRequest):
raise
except Exception as e:
logger.error(f"Concatenate failed: {e}")
raise HTTPException(status_code=500, detail=str(e)) from None
raise HTTPException(status_code=500, detail=str(e))

View file

@ -1,6 +1,7 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from typing import Optional
from fastapi import HTTPException, UploadFile
from google.cloud import storage
@ -12,27 +13,16 @@ from ..core.logging import get_logger
logger = get_logger(__name__)
class GCSService:
def __init__(self) -> None:
self._client: storage.Client | None = None
self._bucket = None
def __init__(self):
self.client = storage.Client(project=settings.gcp_project_id)
self.bucket = self.client.bucket(settings.gcs_bucket)
self.executor = ThreadPoolExecutor(max_workers=4)
@property
def bucket(self):
if self._bucket is None:
self._client = storage.Client(project=settings.gcp_project_id)
self._bucket = self._client.bucket(settings.gcs_bucket)
return self._bucket
@bucket.setter
def bucket(self, value) -> None:
self._bucket = value
async def upload_file_to_gcs(
self,
file: UploadFile,
destination_path: str,
content_type: str | None = None
content_type: Optional[str] = None
) -> str:
"""Upload file to GCS and return the GCS URI"""
def _upload():
@ -55,7 +45,7 @@ class GCSService:
return await loop.run_in_executor(self.executor, _upload)
except Exception as e:
logger.error(f"Failed to upload file to GCS: {e}")
raise HTTPException(status_code=500, detail="File upload failed") from None
raise HTTPException(status_code=500, detail="File upload failed")
async def upload_text_to_gcs(
self,
@ -76,7 +66,7 @@ class GCSService:
return await loop.run_in_executor(self.executor, _upload)
except Exception as e:
logger.error(f"Failed to upload text to GCS: {e}")
raise HTTPException(status_code=500, detail="Text upload failed") from None
raise HTTPException(status_code=500, detail="Text upload failed")
async def get_signed_url(
self,
@ -104,26 +94,10 @@ class GCSService:
try:
return await loop.run_in_executor(self.executor, _get_signed_url)
except NotFound:
raise HTTPException(status_code=404, detail="File not found") from None
raise HTTPException(status_code=404, detail="File not found")
except Exception as e:
logger.error(f"Failed to generate signed URL: {e}")
raise HTTPException(status_code=500, detail="Failed to generate download URL") from None
async def create_resumable_upload_session(self, blob_path: str, content_type: str) -> str:
    """Open a GCS resumable upload session for ``blob_path``.

    The blocking storage call runs on the service's thread-pool executor so
    the event loop is not blocked.

    Args:
        blob_path: Object path inside the service's bucket.
        content_type: Content type passed to the upload session.

    Returns:
        The session URI returned by the storage client.

    Raises:
        HTTPException: 500 when the session cannot be created; the
            underlying error is logged and not chained.
    """
    def _open_session():
        target_blob = self.bucket.blob(blob_path)
        return target_blob.create_resumable_upload_session(
            content_type=content_type,
            timeout=60,
        )

    running_loop = asyncio.get_running_loop()
    try:
        session_uri = await running_loop.run_in_executor(self.executor, _open_session)
    except Exception as e:
        logger.error(f"Failed to create resumable upload session: {e}")
        raise HTTPException(status_code=500, detail="Failed to initiate upload session") from None
    return session_uri
raise HTTPException(status_code=500, detail="Failed to generate download URL")
async def delete_file(self, blob_path: str) -> bool:
"""Delete a file from GCS"""
@ -139,7 +113,7 @@ class GCSService:
return False
except Exception as e:
logger.error(f"Failed to delete file from GCS: {e}")
raise HTTPException(status_code=500, detail="File deletion failed") from None
raise HTTPException(status_code=500, detail="File deletion failed")
async def file_exists(self, blob_path: str) -> bool:
"""Check if a file exists in GCS"""
@ -151,22 +125,6 @@ class GCSService:
return await loop.run_in_executor(self.executor, _exists)
def gcs_path(job: "dict | object", *parts: str) -> str:
    """Return a GCS object path rooted at the job's prefix.

    Jobs created before MT-14 have no gcs_prefix and use bare job_id/ as the
    prefix. New jobs get prefix=orgs/{org_id}/jobs/{job_id}/.

    Args:
        job: Either a job document (dict with ``_id`` and optional
            ``gcs_prefix``) or an object exposing ``gcs_prefix`` / ``id``
            attributes.
        *parts: Path segments appended below the prefix.

    Returns:
        The prefix without a trailing slash, joined with ``parts`` by "/".

    Raises:
        KeyError: If ``job`` is a dict with neither a truthy ``gcs_prefix``
            nor an ``_id`` key.
    """
    if isinstance(job, dict):
        prefix = job.get("gcs_prefix") or job["_id"]
    else:
        prefix = getattr(job, "gcs_prefix", None) or getattr(job, "id", str(job))
    # The id may not be a str (e.g. a database object id); coerce before
    # using string methods so legacy jobs without gcs_prefix do not crash.
    prefix = str(prefix).rstrip("/")
    return "/".join([prefix, *parts]) if parts else prefix
# Global GCS service instance
gcs_service = GCSService()
@ -183,9 +141,6 @@ async def upload_json_to_gcs(content: str, destination_path: str) -> str:
async def get_signed_download_url(blob_path: str, expiration_hours: int = 24) -> str:
return await gcs_service.get_signed_url(blob_path, expiration_hours)
async def create_resumable_upload_session(blob_path: str, content_type: str) -> str:
return await gcs_service.create_resumable_upload_session(blob_path, content_type)
async def generate_signed_upload_url(
blob_path: str,
content_type: str,
@ -194,7 +149,7 @@ async def generate_signed_upload_url(
"""Generate a signed URL for direct browser-to-GCS upload"""
def _generate():
blob = gcs_service.bucket.blob(blob_path)
# Generate signed POST URL
url, fields = blob.generate_signed_post_policy_v4(
expiration=timedelta(hours=1),
@ -206,8 +161,8 @@ async def generate_signed_upload_url(
"Content-Type": content_type
}
)
return {"url": url, "fields": fields}
loop = asyncio.get_event_loop()
return await loop.run_in_executor(gcs_service.executor, _generate)

View file

@ -1,8 +1,8 @@
import asyncio
import json
import asyncio
import time
from pathlib import Path
from typing import Any
from typing import Any, Optional
import google.genai as genai
@ -21,7 +21,7 @@ async def _record_gemini_usage(
model: str,
user_id: str,
job_id: str,
project_id: str | None,
project_id: Optional[str],
elapsed_ms: int,
) -> None:
try:
@ -44,39 +44,10 @@ async def _record_gemini_usage(
class GeminiService:
_fallback_models: list[str] = [
"gemini-3-flash-preview",
"gemini-2.5-pro",
]
def __init__(self):
self.model_name = 'gemini-3.1-pro-preview'
self.model_name = 'gemini-3-pro-preview' # Gemini 3 Pro preview model
self.prompts_dir = Path(__file__).parent.parent / "prompts"
async def _generate(self, contents: Any, config: Any = None) -> tuple[Any, str]:
    """Call generate_content, falling back on 429/503 transient errors.

    Models are tried in order: ``self.model_name`` first, then each entry of
    ``self._fallback_models``. A model is skipped when it raises an error
    whose message contains 429 / RESOURCE_EXHAUSTED / 503 / UNAVAILABLE, or
    when its response has no text. Any other exception propagates at once.

    Args:
        contents: Passed through as the ``contents`` argument.
        config: Optional generation config; omitted from the call when None.

    Returns:
        ``(response, model_used)`` for the first model that returned text.

    Raises:
        Exception: The last transient error (or a RuntimeError for an empty
            response) once every model has been tried.
    """
    # Bound up front so the final raise never depends on the loop body having
    # assigned it.
    last_exc: Exception | None = None
    for model in [self.model_name, *self._fallback_models]:
        try:
            kw: dict[str, Any] = {"model": model, "contents": contents}
            if config is not None:
                kw["config"] = config
            # The SDK call is blocking; run it off the event loop.
            response = await asyncio.to_thread(client.models.generate_content, **kw)
            if response.text is None:
                logger.warning(f"Model {model!r} returned empty response (safety block or overload), trying next fallback")
                last_exc = RuntimeError(f"Model {model!r} returned empty response")
                continue
            if model != self.model_name:
                logger.warning(f"Used fallback model {model!r} (primary unavailable)")
            return response, model
        except Exception as exc:
            msg = str(exc)
            if "429" in msg or "RESOURCE_EXHAUSTED" in msg or "503" in msg or "UNAVAILABLE" in msg:
                logger.warning(f"Model {model!r} unavailable, trying next fallback")
                last_exc = exc
                continue
            raise
    if last_exc is None:
        # Only reachable if the model list were empty.
        raise RuntimeError("No Gemini model was attempted")
    raise last_exc
def _load_prompt(self, prompt_file: str) -> str:
"""Load prompt template from prompts directory"""
prompt_path = self.prompts_dir / prompt_file
@ -90,31 +61,31 @@ class GeminiService:
"""Wait for uploaded file to become ACTIVE state"""
wait_time = 1 # Start with 1 second
total_waited = 0
while total_waited < max_wait_seconds:
try:
# Get file status - use asyncio.to_thread to avoid blocking event loop
file_info = await asyncio.to_thread(client.files.get, name=file_name)
logger.info(f"File {file_name} status: {file_info.state} (waited {total_waited}s)")
if file_info.state == "ACTIVE":
logger.info(f"File {file_name} is now ACTIVE!")
return True
elif file_info.state == "FAILED":
logger.error(f"File {file_name} processing FAILED")
return False
# Wait with exponential backoff (max 30s)
logger.info(f"File not ready, waiting {wait_time}s...")
await asyncio.sleep(wait_time)
total_waited += wait_time
wait_time = min(wait_time * 1.5, 30) # Exponential backoff, max 30s
except Exception as e:
logger.error(f"Error checking file status: {e}")
await asyncio.sleep(5) # Wait 5s on error
total_waited += 5
logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s")
return False
@ -136,25 +107,13 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
- Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm)
- Only add sound effect cues where they add meaningful context; do not annotate every minor sound"""
def _build_glossary_block(self, glossary_block: str | None) -> str:
def _build_glossary_block(self, glossary_block: Optional[str]) -> str:
"""Return the pre-built glossary block (from glossary_service.build_glossary_prompt_block), or empty string."""
if glossary_block and glossary_block.strip():
return glossary_block.strip()
return ""
def _build_source_has_ad_block(self, source_has_ad: bool) -> str:
if source_has_ad:
return (
"SOURCE AUDIO DESCRIPTION NOTICE: This video already has professional audio descriptions "
"embedded in its audio track. "
"1) Return an empty audio_description_vtt containing only the WEBVTT header (\"WEBVTT\\n\") — do NOT generate new audio descriptions. "
"2) For captions_vtt: transcribe ONLY the original program dialogue and relevant sound effects. "
"Do NOT caption the audio description narration — AD narration is spoken during natural pauses "
"and describes visual scenes rather than being part of the original dialogue."
)
return ""
def _build_brand_context_block(self, brand_context: str | None) -> str:
def _build_brand_context_block(self, brand_context: Optional[str]) -> str:
"""Build the brand context instruction block for injection into prompts."""
if brand_context and brand_context.strip():
brands = [b.strip() for b in brand_context.split(",") if b.strip()]
@ -166,7 +125,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
)
return "No specific brand names have been provided for this video."
async def extract_accessibility(self, video_file_path: str, brand_context: str | None = None, sdh_requested: bool = False, glossary_block: str | None = None, source_has_ad: bool = False, _cost_ctx: dict | None = None) -> dict[str, Any]:
async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False, glossary_block: Optional[str] = None, _cost_ctx: Optional[dict] = None) -> dict[str, Any]:
"""
Extract captions and audio descriptions from video using Gemini 2.0
Returns structured JSON with transcript, captions VTT, and audio description VTT
@ -178,13 +137,12 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
.replace("{GLOSSARY}", self._build_glossary_block(glossary_block))
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
.replace("{SOURCE_HAS_AD}", self._build_source_has_ad_block(source_has_ad))
)
uploaded_file = None
try:
logger.info(f"Starting Gemini processing for video: {video_file_path}")
# Upload video file to Gemini using new API - use asyncio.to_thread to avoid blocking
logger.info("Uploading video file to Gemini API...")
uploaded_file = await asyncio.to_thread(
@ -196,17 +154,19 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
}
)
logger.info(f"Successfully uploaded file: {uploaded_file.name} (URI: {uploaded_file.uri})")
# Wait for file to become ACTIVE before using it
logger.info("Waiting for file to become ACTIVE...")
file_ready = await self._wait_for_file_active(uploaded_file.name)
if not file_ready:
raise Exception("File failed to become ACTIVE within timeout")
# Generate content using new API - use asyncio.to_thread to avoid blocking
logger.info("Generating content with Gemini model...")
_t0 = time.monotonic()
response, _model_used = await self._generate(
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[
genai.types.Part.from_text(text=prompt),
genai.types.Part.from_uri(
@ -215,13 +175,13 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
)
],
config=genai.types.GenerateContentConfig(
temperature=0.2,
temperature=0.2, # Lower temperature for consistent, deterministic AD output
top_p=0.8,
top_k=40,
),
)
if _cost_ctx:
asyncio.create_task(_record_gemini_usage(response, _model_used, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
asyncio.create_task(_record_gemini_usage(response, self.model_name, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
# Parse JSON response
response_text = response.text.strip()
@ -231,10 +191,10 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
if response_text.startswith("```json"):
response_text = response_text.replace("```json", "").replace("```", "").strip()
logger.info("Cleaned markdown formatting from response")
# Additional cleanup for common JSON issues
response_text = response_text.strip()
logger.info("Parsing JSON response...")
try:
result = json.loads(response_text)
@ -293,7 +253,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
async def _self_heal_response(self, video_file_path: str, invalid_response: str) -> dict[str, Any]:
"""Attempt to self-heal invalid JSON response from Gemini"""
logger.info("Attempting to self-heal JSON response without re-uploading video")
# Try to fix common JSON issues first
try:
fixed_response = self._attempt_json_fix(invalid_response)
@ -302,7 +262,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
return fixed_response
except Exception as e:
logger.warning(f"JSON fix attempt failed: {e}")
# If simple fixes don't work, try a text-only self-heal prompt with more context
self_heal_prompt = f"""
SYSTEM: You are a JSON repair service. Fix the malformed JSON below and return ONLY the corrected JSON.
@ -322,24 +282,26 @@ Fix the JSON and return it:
"""
try:
response, _ = await self._generate(
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[genai.types.Part.from_text(text=self_heal_prompt)]
)
response_text = response.text.strip()
# Handle potential markdown formatting
if response_text.startswith("```json"):
response_text = response_text.replace("```json", "").replace("```", "").strip()
result = json.loads(response_text)
# Validate that all required fields are present after healing
required_fields = [
"language", "confidence", "summary",
"transcript_plaintext", "captions_vtt", "audio_description_vtt"
]
missing_fields = [field for field in required_fields if field not in result]
if missing_fields:
logger.error(f"Self-heal lost required fields: {missing_fields}")
@ -347,27 +309,27 @@ Fix the JSON and return it:
if "audio_description_vtt" in missing_fields:
logger.info("Creating fallback audio_description_vtt")
result["audio_description_vtt"] = "WEBVTT\n\n00:00:00.000 --> 00:00:05.000\nVideo content with visual elements described."
# If other critical fields are missing, raise an error
remaining_missing = [f for f in missing_fields if f != "audio_description_vtt"]
if remaining_missing:
raise ValueError(f"Self-heal failed to preserve required fields: {remaining_missing}")
logger.info("Successfully self-healed Gemini response with all required fields")
return result
except Exception as e:
logger.error(f"Self-heal attempt failed: {e}")
raise ValueError("Failed to get valid JSON from Gemini after self-heal attempt") from e
raise ValueError("Failed to get valid JSON from Gemini after self-heal attempt")
async def extract_accessibility_targeted(
self,
video_file_path: str,
target_language: str,
brand_context: str | None = None,
brand_context: Optional[str] = None,
sdh_requested: bool = False,
glossary_block: str | None = None,
_cost_ctx: dict | None = None,
glossary_block: Optional[str] = None,
_cost_ctx: Optional[dict] = None,
) -> dict[str, Any]:
"""
Extract captions and audio descriptions from video using Gemini,
@ -422,7 +384,9 @@ Fix the JSON and return it:
# Generate content using new API
logger.info(f"Generating content with Gemini model for {target_language}...")
_t0 = time.monotonic()
response, _model_used = await self._generate(
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[
genai.types.Part.from_text(text=prompt),
genai.types.Part.from_uri(
@ -432,7 +396,7 @@ Fix the JSON and return it:
]
)
if _cost_ctx:
asyncio.create_task(_record_gemini_usage(response, _model_used, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
asyncio.create_task(_record_gemini_usage(response, self.model_name, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
# Parse JSON response
response_text = response.text.strip()
@ -535,7 +499,9 @@ Fix the JSON and return it:
"""
try:
response, _ = await self._generate(
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[genai.types.Part.from_text(text=self_heal_prompt)]
)
@ -567,7 +533,7 @@ Fix the JSON and return it:
except Exception as e:
logger.error(f"Self-heal attempt failed for {target_language}: {e}")
raise ValueError(f"Failed to get valid JSON from Gemini targeted extraction for {target_language}") from e
raise ValueError(f"Failed to get valid JSON from Gemini targeted extraction for {target_language}")
def _attempt_json_fix(self, json_text: str) -> dict[str, Any] | None:
"""Attempt to fix common JSON syntax issues"""
@ -575,17 +541,17 @@ Fix the JSON and return it:
fixes_tried = []
fixed_text = json_text
import re
# Fix 1: Remove trailing commas
fixed_text = re.sub(r',(\s*[}\]])', r'\1', fixed_text)
fixes_tried.append("removed trailing commas")
# Fix 2: Try to fix unterminated strings by adding closing quote and brace
if fixed_text.count('"') % 2 != 0: # Odd number of quotes suggests unterminated string
# Find the last quote and see if we need to close the JSON
last_quote_pos = fixed_text.rfind('"')
remainder = fixed_text[last_quote_pos + 1:].strip()
# If there's no closing brace after the last quote, try to fix it
if remainder and not remainder.endswith('}'):
# Try to intelligently close the JSON
@ -596,27 +562,27 @@ Fix the JSON and return it:
else:
fixed_text += '"'
fixes_tried.append("closed unterminated string")
# Fix 3: Ensure JSON ends with closing brace
if not fixed_text.rstrip().endswith('}'):
fixed_text = fixed_text.rstrip() + '\n}'
fixes_tried.append("added closing brace")
try:
result = json.loads(fixed_text)
logger.info(f"JSON fixed with: {', '.join(fixes_tried)}")
# Validate that we have the required fields
required_fields = [
"language", "confidence", "summary",
"transcript_plaintext", "captions_vtt", "audio_description_vtt"
]
missing_fields = [field for field in required_fields if field not in result]
if missing_fields:
logger.warning(f"Fixed JSON is missing required fields: {missing_fields}")
return None # Let the more advanced self-healing handle this
return result
except json.JSONDecodeError as e:
logger.debug(f"JSON fix attempt failed: {e}")
@ -692,7 +658,9 @@ Fix the JSON and return it:
# Generate content with video and prompt
logger.info("Analyzing video with Gemini for accessible video placement...")
response, _ = await self._generate(
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[
genai.types.Part.from_text(text=prompt),
genai.types.Part.from_uri(
@ -774,7 +742,9 @@ Fix the JSON and return it:
"""
try:
response, _ = await self._generate(
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[genai.types.Part.from_text(text=self_heal_prompt)]
)
@ -788,16 +758,16 @@ Fix the JSON and return it:
except Exception as e:
logger.error(f"Self-heal attempt for accessible video analysis failed: {e}")
raise ValueError("Failed to get valid JSON from accessible video analysis after self-heal") from e
raise ValueError("Failed to get valid JSON from accessible video analysis after self-heal")
async def transcreate_content(
self,
captions_vtt: str,
ad_vtt: str,
target_language: str,
brief: str | None = None,
glossary_block: str | None = None,
_cost_ctx: dict | None = None,
brief: Optional[str] = None,
glossary_block: Optional[str] = None,
_cost_ctx: Optional[dict] = None,
) -> dict[str, str]:
"""
Transcreate English VTT content to target language with cultural adaptation
@ -822,11 +792,15 @@ JSON:
try:
_t0 = time.monotonic()
response, _model_used = await self._generate(
contents=[genai.types.Part.from_text(text=prompt + "\n\n" + user_prompt)]
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[
genai.types.Part.from_text(text=prompt + "\n\n" + user_prompt)
]
)
if _cost_ctx:
asyncio.create_task(_record_gemini_usage(response, _model_used, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
asyncio.create_task(_record_gemini_usage(response, self.model_name, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
response_text = response.text.strip()
@ -845,7 +819,7 @@ JSON:
except json.JSONDecodeError as e:
logger.error(f"Failed to parse transcreation JSON response: {e}")
raise ValueError("Invalid JSON response from transcreation") from e
raise ValueError("Invalid JSON response from transcreation")
except Exception as e:
logger.error(f"Transcreation failed: {e}")
raise
@ -855,9 +829,8 @@ JSON:
vtt_content: str,
target_language: str,
source_language: str = "en",
glossary_block: str | None = None,
style: str = "literal",
_cost_ctx: dict | None = None,
glossary_block: Optional[str] = None,
_cost_ctx: Optional[dict] = None,
) -> str:
"""
Translate VTT content using Gemini, preserving timing programmatically.
@ -866,10 +839,9 @@ JSON:
1. Send only the text cues (no timestamps) to Gemini as a numbered list
2. Apply translated texts back onto the original VTT using translate_preserving_timing()
style="literal": direct translation preserving meaning exactly
style="transcreate": culturally adapted but still returns EXACTLY N cues 1:1
This avoids any possibility of Gemini drifting or altering timestamps.
"""
from ..lib.vtt import VTTEditor, VTTParser
from ..lib.vtt import VTTParser, VTTEditor
source_cues = VTTParser.parse(vtt_content)
if not source_cues:
@ -878,13 +850,6 @@ JSON:
cue_count = len(source_cues)
_style_instruction = (
"- Culturally adapt the text for {tgt} audiences (brand voice, natural phrasing), "
"while keeping accessibility intent and line length (~3240 chars)\n"
if style == "transcreate"
else ""
)
async def _attempt_translation(extra_instruction: str = "") -> list[str]:
numbered_texts = "\n".join(
f"{i + 1}. {cue.text.replace(chr(10), ' ')}"
@ -894,29 +859,26 @@ JSON:
_tgt_label = locale_lib.get_gemini_label(target_language)
_glossary_section = self._build_glossary_block(glossary_block)
_glossary_line = f"\n\n{_glossary_section}" if _glossary_section else ""
_glossary_req = (
"\n- MUST use the exact approved terms from the glossary below — these override natural translation choices, even for English terms"
if _glossary_section else ""
)
_adapt_line = _style_instruction.format(tgt=_tgt_label) if style == "transcreate" else ""
prompt = f"""Translate the following {cue_count} numbered text segments from {_src_label} to {_tgt_label}.
REQUIREMENTS:
- Return EXACTLY {cue_count} numbered lines, one translation per line
- Format: "1. translated text", "2. translated text", etc.
- Preserve speaker labels like [Speaker 1]: unchanged
- {_adapt_line}Use natural, idiomatic {_tgt_label}
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_req}{_glossary_line}
- Use natural, idiomatic {_tgt_label}
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_line}
Segments to translate:
{numbered_texts}"""
_t0 = time.monotonic()
response, _model_used = await self._generate(
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[genai.types.Part.from_text(text=prompt)]
)
if _cost_ctx:
asyncio.create_task(_record_gemini_usage(response, _model_used, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
asyncio.create_task(_record_gemini_usage(response, self.model_name, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
return self._parse_numbered_translation(response.text.strip(), cue_count)
try:
@ -977,7 +939,7 @@ Segments to translate:
self,
original_text: str,
language: str = "en",
_cost_ctx: dict | None = None,
_cost_ctx: Optional[dict] = None,
) -> str:
"""
Rewrite an audio description cue to be TTS-friendly.
@ -1003,11 +965,13 @@ Segments to translate:
logger.info(f"Rewriting TTS cue for safety: '{original_text[:50]}...'")
_t0 = time.monotonic()
response, _model_used = await self._generate(
response = await asyncio.to_thread(
client.models.generate_content,
model=self.model_name,
contents=[genai.types.Part.from_text(text=prompt)]
)
if _cost_ctx:
asyncio.create_task(_record_gemini_usage(response, _model_used, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
asyncio.create_task(_record_gemini_usage(response, self.model_name, _cost_ctx.get("user_id", "system"), _cost_ctx.get("job_id", ""), _cost_ctx.get("project_id"), int((time.monotonic() - _t0) * 1000)))
result = response.text.strip()

View file

@ -1,7 +1,8 @@
import io
import re
import wave
from google.cloud import texttospeech
from google import genai
from google.genai import types
from pydub import AudioSegment
from ..core.config import settings
@ -22,26 +23,14 @@ class TTSSynthesisError(Exception):
class GeminiTTSService:
"""Text-to-Speech service using Google Cloud Text-to-Speech API with Gemini models."""
"""Text-to-Speech service using Gemini TTS API"""
def __init__(self):
self.client = texttospeech.TextToSpeechClient()
self.client = genai.Client(api_key=settings.gemini_api_key)
self.model = settings.gemini_tts_model
self.default_voice = settings.gemini_tts_default_voice
logger.info(f"Gemini TTS service initialized with model: {self.model}")
@staticmethod
def _extract_retry_after(error: Exception) -> float | None:
"""Return seconds to wait from a Google API 429 retryDelay, or None."""
msg = str(error)
m = re.search(r"retry in ([0-9.]+)s", msg, re.IGNORECASE)
if m:
return float(m.group(1)) + 5
m = re.search(r"'retryDelay':\s*'([0-9.]+)s'", msg)
if m:
return float(m.group(1)) + 5
return None
async def synthesize_text(
self,
text: str,
@ -52,56 +41,117 @@ class GeminiTTSService:
style_prompt: str = ""
) -> bytes:
"""
Synthesize text to MP3 using Google Cloud TTS with Gemini model.
Synthesize text to audio using Gemini TTS.
Returns MP3 audio bytes.
Args:
text: The text to synthesize
voice_name: Gemini voice name (e.g. "Kore", "Puck")
language: Language code (e.g. "en", "en-US", "fr")
model: Model variant key "flash" or "pro"
speed: Speech rate multiplier (0.25–4.0)
style_prompt: Natural-language style instruction sent as prompt
voice_name: Name of the voice to use
language: Language code (e.g., "en", "es")
model: Model variant - "flash" (fast) or "pro" (quality)
speed: Speech rate multiplier (0.5 to 2.0)
style_prompt: Style instructions to prepend (e.g., "Speak calmly...")
"""
if not text.strip():
raise ValueError("Text cannot be empty")
# Validate voice
if voice_name not in settings.gemini_tts_voices:
logger.warning(f"Unknown voice '{voice_name}', using default '{self.default_voice}'")
voice_name = self.default_voice
# Select model from config
model_id = settings.gemini_tts_models.get(model, settings.gemini_tts_model)
language_code = locale_lib.get_tts_lang(language)
# Build the full prompt with style and speed instructions
prompt_parts = []
# Add style prompt if provided
if style_prompt:
prompt_parts.append(style_prompt)
# Add speed instruction if not default
if speed != 1.0:
speed_pct = int(speed * 100)
if speed < 1.0:
prompt_parts.append(f"Speak slowly at approximately {speed_pct}% of normal speed. ")
else:
prompt_parts.append(f"Speak quickly at approximately {speed_pct}% of normal speed. ")
# Combine prompts with actual text
full_text = "".join(prompt_parts) + text
try:
synthesis_input = texttospeech.SynthesisInput(text=text)
if style_prompt:
synthesis_input = texttospeech.SynthesisInput(
text=text,
prompt=style_prompt,
# Generate audio using Gemini TTS
response = self.client.models.generate_content(
model=model_id,
contents=full_text,
config=types.GenerateContentConfig(
response_modalities=["AUDIO"],
speech_config=types.SpeechConfig(
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name=voice_name,
)
)
),
)
response = self.client.synthesize_speech(
input=synthesis_input,
voice=texttospeech.VoiceSelectionParams(
language_code=language_code,
name=voice_name,
model_name=model_id,
),
audio_config=texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3,
speaking_rate=speed,
),
)
if not response.audio_content:
raise ValueError("Empty audio content in Cloud TTS response")
# Extract PCM audio data from response with proper null-safe checks
if not response.candidates:
logger.error(
f"Gemini TTS response missing candidates. "
f"Response type: {type(response)}, Response: {response}"
)
raise ValueError("No candidates in Gemini TTS response")
return response.audio_content
candidate = response.candidates[0]
if candidate.content is None:
logger.error(
f"Gemini TTS candidate has no content. "
f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}, "
f"Safety ratings: {getattr(candidate, 'safety_ratings', 'unknown')}"
)
raise ValueError(
f"Candidate content is None in Gemini TTS response. "
f"Finish reason: {getattr(candidate, 'finish_reason', 'unknown')}"
)
if not candidate.content.parts:
logger.error(
f"Gemini TTS content has no parts. "
f"Content role: {getattr(candidate.content, 'role', 'unknown')}"
)
raise ValueError("No parts in Gemini TTS response content")
part = candidate.content.parts[0]
if not hasattr(part, 'inline_data') or part.inline_data is None:
logger.error(
f"Gemini TTS part missing inline_data. "
f"Part type: {type(part)}, Part: {part}"
)
raise ValueError("No inline_data in Gemini TTS response part")
pcm_data = part.inline_data.data
# Convert PCM to MP3
mp3_data = self._pcm_to_mp3(pcm_data)
return mp3_data
except Exception as e:
# Log comprehensive error information for debugging
error_context = {
"text_length": len(text),
"text_preview": text[:100] + "..." if len(text) > 100 else text,
"voice_name": voice_name,
"language": language,
"model_id": model_id,
}
logger.error(
f"Gemini TTS synthesis failed: {e}. "
f"text_len={len(text)}, voice={voice_name}, model={model_id}, lang={language_code}"
f"Gemini TTS synthesis failed: {e}. Context: {error_context}"
)
raise
@ -113,18 +163,23 @@ class GeminiTTSService:
speed: float = 1.0,
style_prompt: str = ""
) -> bytes:
"""Generate a preview audio sample for voice selection."""
"""
Generate a preview audio sample for voice selection.
Uses language-specific sample text and applies all TTS settings.
"""
# Get preview sample text — try settings override, then locale registry, then fallback
sample_text = (
settings.gemini_tts_preview_samples.get(language)
or locale_lib.get_preview_sample(language)
)
return await self.synthesize_text(
sample_text,
voice_name,
language,
model=model,
speed=speed,
style_prompt=style_prompt,
style_prompt=style_prompt
)
async def _synthesize_cue_with_retry(
@ -139,7 +194,26 @@ class GeminiTTSService:
max_attempts: int = 3,
base_delay: float = 1.0
) -> bytes:
"""Synthesize a single cue with retry, honouring API-provided retryDelay on 429."""
"""
Synthesize a single cue with exponential backoff retry.
Args:
cue_index: Index of the cue (for error reporting)
text: Text to synthesize
voice_name: TTS voice name
language: Language code
model: Model variant
speed: Speech rate
style_prompt: Style instructions
max_attempts: Total attempts (1 initial + retries)
base_delay: Base delay in seconds for backoff
Returns:
MP3 audio bytes
Raises:
TTSSynthesisError: If all attempts fail
"""
import asyncio
import random
@ -154,31 +228,32 @@ class GeminiTTSService:
language,
model=model,
speed=speed,
style_prompt=style_prompt,
style_prompt=style_prompt
)
except Exception as e:
last_exception = e
api_response_info = str(e)
if attempt < max_attempts - 1:
api_delay = self._extract_retry_after(e)
delay = api_delay if api_delay else base_delay * (2 ** attempt) + random.uniform(0, 1)
# Exponential backoff with jitter
delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
logger.warning(
f"TTS attempt {attempt + 1}/{max_attempts} failed for cue {cue_index}. "
f"TTS synthesis attempt {attempt + 1}/{max_attempts} failed for cue {cue_index}. "
f"Retrying in {delay:.2f}s. Error: {e}"
)
await asyncio.sleep(delay)
else:
logger.error(
f"TTS FAILED after {max_attempts} attempts for cue {cue_index}. "
f"text='{text[:50]}{'...' if len(text) > 50 else ''}'. Error: {e}"
f"TTS synthesis FAILED after {max_attempts} attempts for cue {cue_index}. "
f"Text: {text[:50]}{'...' if len(text) > 50 else ''}. Error: {e}"
)
# All retries exhausted - raise hard failure
raise TTSSynthesisError(
message=f"TTS synthesis failed after {max_attempts} attempts: {last_exception}",
cue_index=cue_index,
cue_text=text,
api_response_info=api_response_info,
api_response_info=api_response_info
)
async def synthesize_audio_description(
@ -193,38 +268,56 @@ class GeminiTTSService:
"""
Synthesize full audio description from VTT content.
Maintains timing alignment with original VTT cues.
Args:
ad_vtt_content: VTT content with audio description cues
language: Language code (e.g., "en", "es")
voice_name: Name of the voice to use (defaults to service default)
model: Model variant - "flash" (fast) or "pro" (quality)
speed: Speech rate multiplier (0.5 to 2.0)
style_prompt: Style instructions to prepend to each cue
"""
if voice_name is None:
voice_name = self.default_voice
# Validate voice
if voice_name not in settings.gemini_tts_voices:
logger.warning(f"Unknown voice '{voice_name}', using default '{self.default_voice}'")
voice_name = self.default_voice
# Parse VTT cues
cues = self._parse_ad_cues(ad_vtt_content)
if not cues:
raise ValueError("No audio description cues found in VTT content")
logger.info(
f"Synthesizing {len(cues)} AD cues: voice='{voice_name}', model='{model}', speed={speed}x"
f"Synthesizing {len(cues)} audio description cues with voice '{voice_name}', "
f"model '{model}', speed {speed}x"
)
# Synthesize each cue with precise timing anchoring
audio_segments = []
current_audio_position = 0.0
for i, cue in enumerate(cues):
target_start_time = cue["start_time"]
# Add silence to reach the exact VTT start time
if target_start_time > current_audio_position:
silence_duration = target_start_time - current_audio_position
audio_segments.append(AudioSegment.silent(duration=int(silence_duration * 1000)))
silence = AudioSegment.silent(duration=int(silence_duration * 1000))
audio_segments.append(silence)
current_audio_position = target_start_time
# Synthesize this cue's text
text = cue["text"].strip()
if text:
# Ensure proper punctuation for natural TTS flow
if not text.endswith(('.', '!', '?')):
text += "."
# Use retry helper - will raise TTSSynthesisError on failure after retries
audio_data = await self._synthesize_cue_with_retry(
cue_index=i,
text=text,
@ -234,62 +327,115 @@ class GeminiTTSService:
speed=speed,
style_prompt=style_prompt,
max_attempts=3,
base_delay=1.0,
base_delay=1.0
)
# Convert to AudioSegment and get actual duration
audio_segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
audio_segments.append(audio_segment)
current_audio_position += len(audio_segment) / 1000.0
final_audio = sum(audio_segments, AudioSegment.empty()) if audio_segments else AudioSegment.silent(duration=1000)
# Update position based on actual audio duration
actual_audio_duration = len(audio_segment) / 1000.0
current_audio_position += actual_audio_duration
# Combine all segments
if audio_segments:
final_audio = sum(audio_segments, AudioSegment.empty())
else:
final_audio = AudioSegment.silent(duration=1000)
# Export to MP3
output_buffer = io.BytesIO()
final_audio.export(output_buffer, format="mp3", bitrate="128k")
logger.info(f"Audio description synthesized: {len(output_buffer.getvalue())} bytes")
return output_buffer.getvalue()
def _pcm_to_mp3(self, pcm_data: bytes) -> bytes:
    """
    Transcode raw PCM samples into MP3 bytes.

    The samples are framed in memory as a mono, 16-bit, 24000 Hz WAV
    (Gemini TTS outputs PCM at 24000 Hz sample rate), loaded through
    pydub and exported as a 128k-bitrate MP3.
    """
    wav_io = io.BytesIO()
    with wave.open(wav_io, "wb") as writer:
        writer.setnchannels(1)      # mono
        writer.setsampwidth(2)      # 2 bytes per sample (16-bit)
        writer.setframerate(24000)  # 24 kHz
        writer.writeframes(pcm_data)

    # Rewind so pydub reads the WAV container from its first byte.
    wav_io.seek(0)

    mp3_io = io.BytesIO()
    AudioSegment.from_wav(wav_io).export(mp3_io, format="mp3", bitrate="128k")
    return mp3_io.getvalue()
def _parse_ad_cues(self, vtt_content: str) -> list[dict]:
"""Parse audio description VTT and extract timing + text."""
"""Parse audio description VTT and extract timing + text"""
lines = vtt_content.strip().split('\n')
cues = []
i = 0
while i < len(lines):
line = lines[i].strip()
if line in ("WEBVTT", "") or line.startswith("NOTE"):
# Skip header and empty lines
if line == "WEBVTT" or line == "" or line.startswith("NOTE"):
i += 1
continue
# Check for timing line
if " --> " in line:
timing_parts = line.split(" --> ")
start_time = self._parse_timestamp(timing_parts[0].strip())
end_time = self._parse_timestamp(timing_parts[1].strip())
# Get text from next line(s)
i += 1
text_lines = []
while i < len(lines) and lines[i].strip():
while i < len(lines) and lines[i].strip() != "":
text_lines.append(lines[i].strip())
i += 1
if text_lines:
cues.append({"start_time": start_time, "end_time": end_time, "text": " ".join(text_lines)})
cues.append({
"start_time": start_time,
"end_time": end_time,
"text": " ".join(text_lines)
})
else:
i += 1
return cues
def _parse_timestamp(self, timestamp: str) -> float:
"""Convert VTT timestamp to seconds."""
"""Convert VTT timestamp to seconds"""
parts = timestamp.split(":")
if len(parts) == 3:
if len(parts) == 3: # HH:MM:SS.mmm
hours, minutes, seconds = parts
elif len(parts) == 2:
elif len(parts) == 2: # MM:SS.mmm
hours, minutes, seconds = "0", parts[0], parts[1]
else:
raise ValueError(f"Invalid timestamp format: {timestamp}")
sec_parts = seconds.split(".")
return (
int(hours) * 3600
+ int(minutes) * 60
+ int(sec_parts[0])
+ (int(sec_parts[1]) / 1000.0 if len(sec_parts) > 1 else 0)
seconds_val = int(sec_parts[0])
milliseconds = int(sec_parts[1]) if len(sec_parts) > 1 else 0
total_seconds = (
int(hours) * 3600 +
int(minutes) * 60 +
seconds_val +
milliseconds / 1000.0
)
return total_seconds
# Global service instance
gemini_tts_service = GeminiTTSService()

View file

@ -26,6 +26,7 @@ from ..models.glossary import (
EmbeddingStatus,
Glossary,
GlossaryStatus,
GlossaryTerm,
GlossaryVersion,
MatchedTerm,
glossary_from_doc,
@ -334,24 +335,12 @@ async def activate_version(glossary_id: str, version_id: str) -> None:
async def archive_glossary(glossary_id: str) -> None:
"""Hard-delete the glossary and all its versions and terms."""
db = await get_database()
versions = await db[_COLL_VERSIONS].find(
{"glossary_id": glossary_id}, {"_id": 1}
).to_list(length=None)
version_ids = [str(v["_id"]) for v in versions]
if version_ids:
terms_result = await db[_COLL_TERMS].delete_many({"version_id": {"$in": version_ids}})
logger.info(f"Deleted {terms_result.deleted_count} terms for glossary {glossary_id}")
await db[_COLL_VERSIONS].delete_many({"glossary_id": glossary_id})
logger.info(f"Deleted {len(version_ids)} versions for glossary {glossary_id}")
await db[_COLL_GLOSSARIES].delete_one({"_id": ObjectId(glossary_id)})
await db[_COLL_GLOSSARIES].update_one(
{"_id": ObjectId(glossary_id)},
{"$set": {"status": GlossaryStatus.ARCHIVED.value}},
)
await _invalidate_cache(glossary_id)
logger.info(f"Deleted glossary {glossary_id}")
# ── Retrieval ─────────────────────────────────────────────────────────────────
@ -465,11 +454,8 @@ async def _exact_match(
# Build automaton
automaton = ahocorasick.Automaton()
for doc in terms:
stl = doc.get("source_term_lower") or doc.get("source_term", "")
if stl:
automaton.add_word(stl.lower(), (doc["source_term"], doc.get("translations", {})))
if not automaton:
return []
stl = doc["source_term_lower"]
automaton.add_word(stl, (doc["source_term"], doc["translations"]))
automaton.make_automaton()
text_lower = text.lower()
@ -559,26 +545,18 @@ async def _vector_match(
def _get_translation(translations: dict[str, str], target_locale: str) -> str | None:
"""Look up a translation with locale-fallback.
Specific → bare: fr-CA → fr-FR siblings → fr
Bare → specific: fr → fr-FR, fr-CA (first match)
"""
if not translations or not target_locale:
"""Look up a translation with locale-fallback: fr-CA → fr-FR → fr → None."""
if not translations:
return None
if target_locale in translations:
return translations[target_locale]
if "-" in target_locale:
# Specific locale: try sibling regions and bare parent (fr-CA → fr-FR → fr)
parent = target_locale.split("-")[0]
# Try parent language
parent = target_locale.split("-")[0] if "-" in target_locale else None
if parent:
# Try sibling locales, e.g. fr-CA not found → try fr-FR
for code, text in translations.items():
if code.startswith(parent + "-") or code == parent:
return text
else:
# Bare code (fr): try any fr-* region variant stored in the glossary
for code, text in translations.items():
if code == target_locale or code.startswith(target_locale + "-"):
return text
return None
@ -652,35 +630,28 @@ async def get_glossary_block_for_job(
Non-fatal: any failure returns "" so the pipeline continues without a glossary.
"""
try:
job_id_for_log = job_doc.get("_id", "unknown")
project_id = job_doc.get("project_id")
if not project_id:
logger.debug(f"Glossary skip job={job_id_for_log}: no project_id")
return ""
project = await db.projects.find_one({"_id": project_id})
if not project:
logger.warning(f"Glossary skip job={job_id_for_log}: project {project_id!r} not found")
return ""
client_id = project.get("client_id")
if not client_id:
logger.debug(f"Glossary skip job={job_id_for_log}: project has no client_id")
return ""
# Get active version id via our cache-backed helper (reuses Redis if available)
active_version_id = await _get_active_version_id(client_id)
if not active_version_id:
logger.debug(f"Glossary skip job={job_id_for_log}: no active glossary for client {client_id!r}")
return ""
# Combine source VTT texts for matching
source_text = job_doc.get("_glossary_source_text", "")
if not source_text:
logger.debug(f"Glossary skip job={job_id_for_log}: no source text provided for matching")
return ""
logger.info(f"Glossary lookup job={job_id_for_log} client={client_id!r} version={active_version_id!r} locale={target_locale!r}")
norm_target = locale_lib.normalize_code(target_locale)
exact_matches = await _exact_match(db, active_version_id, source_text, norm_target)
@ -705,8 +676,7 @@ async def get_glossary_block_for_job(
return build_glossary_prompt_block(combined, target_locale)
except Exception as e:
import traceback
logger.warning(f"Glossary lookup failed for job {job_doc.get('_id')} (non-fatal): {e}\n{traceback.format_exc()}")
logger.warning(f"Glossary lookup failed for job {job_doc.get('_id')} (non-fatal): {e}")
return ""
@ -728,17 +698,6 @@ async def get_glossary(glossary_id: str) -> Glossary | None:
return glossary_from_doc(doc) if doc else None
async def get_versions_by_ids(version_ids: list[str]) -> dict[str, GlossaryVersion]:
    """Fetch several glossary versions in one query, keyed by string id.

    An empty ``version_ids`` returns ``{}`` without touching the database.
    Ids that match no document are simply absent from the returned mapping.
    """
    if not version_ids:
        return {}

    database = await get_database()
    id_filter = {"_id": {"$in": [ObjectId(raw_id) for raw_id in version_ids]}}
    cursor = database[_COLL_VERSIONS].find(id_filter)
    # At most one document can match each requested id, so cap the fetch there.
    found = await cursor.to_list(length=len(version_ids))

    by_id: dict[str, GlossaryVersion] = {}
    for version_doc in found:
        by_id[str(version_doc["_id"])] = glossary_version_from_doc(version_doc)
    return by_id
async def get_versions(glossary_id: str) -> list[GlossaryVersion]:
db = await get_database()
cursor = db[_COLL_VERSIONS].find(

View file

@ -2,10 +2,10 @@
import asyncio
from datetime import datetime
from typing import Any
from typing import Any, Optional
from uuid import uuid4
from fastapi import HTTPException, status
from fastapi import HTTPException
from motor.motor_asyncio import AsyncIOMotorDatabase
from ..core.logging import get_logger
@ -26,32 +26,6 @@ logger = get_logger(__name__)
_JOBS = "jobs"
async def _assert_user_in_job_org(
    db: AsyncIOMotorDatabase,
    user_id: str,
    job_doc: dict,
) -> None:
    """Ensure ``user_id`` is a member of the organization that owns the job.

    The organization is taken from ``job_doc["organization_id"]``; when that
    is missing or empty it falls back to the ``client_id`` of the job's
    project document.

    Raises:
        HTTPException: 422 when no organization can be resolved for the job,
            403 when no membership document links the user to it.
    """
    organization = job_doc.get("organization_id")
    if not organization:
        # Fall back to the owning project's client_id.
        project_ref = job_doc.get("project_id")
        project_doc = (
            await db.projects.find_one({"_id": project_ref}, {"client_id": 1})
            if project_ref
            else None
        )
        if project_doc:
            organization = project_doc.get("client_id")
        if not organization:
            raise HTTPException(
                status_code=422,
                detail="Job is not bound to an organization; cannot validate cross-org assignment",
            )

    membership = await db.memberships.find_one(
        {"user_id": user_id, "organization_id": organization}
    )
    if membership:
        return
    raise HTTPException(
        status_code=403,
        detail="Assignee is not a member of this job's organization",
    )
# ── Helpers ───────────────────────────────────────────────────────────────────
def _job_languages(job_doc: dict) -> list[str]:
@ -98,13 +72,13 @@ def _rebuild_qc_assignments(language_qc: dict) -> list[dict]:
def _qc_recipients(
job_doc: dict,
lang_state: dict,
exclude_user_id: str | None,
exclude_user_id: Optional[str],
) -> list[tuple[str, str]]:
"""Return [(email, full_name)] for linguist + reviewer assigned to a language, minus the actor."""
seen: set[str] = set()
result: list[tuple[str, str]] = []
def _add(email: str | None, name: str | None) -> None:
def _add(email: Optional[str], name: Optional[str]) -> None:
if email and email not in seen and email != exclude_user_id:
seen.add(email)
result.append((email, name or email.split("@")[0]))
@ -116,73 +90,13 @@ def _qc_recipients(
def _deep_link(job_id: str, lang: str) -> str:
from ..core.config import settings
return f"{settings.app_url}/admin/qc/{job_id}#lang-{lang}"
# ── Auto-assignment ───────────────────────────────────────────────────────────
async def auto_assign_defaults(db: AsyncIOMotorDatabase, job_id: str) -> int:
    """Copy the job's default linguist/reviewer onto languages lacking a linguist.

    Reads ``initial_linguist_id`` / ``initial_reviewer_id`` from the job and,
    for every requested language whose ``language_qc`` entry has no
    ``assigned_linguist_id``, writes the corresponding assignment fields with
    one ``$set`` update per language.

    Called lazily when the language-QC map is first fetched in PENDING_QC state,
    so PM assignments made at job-creation time take effect without touching
    Celery tasks.

    Returns:
        The number of languages that were updated (0 when the job is missing,
        has no defaults, or requests no languages).
    """
    job = await db[_JOBS].find_one({"_id": job_id})
    if not job:
        return 0

    default_linguist: str | None = job.get("initial_linguist_id")
    default_reviewer: str | None = job.get("initial_reviewer_id")
    if not (default_linguist or default_reviewer):
        return 0

    requested = job.get("requested_outputs") or {}
    target_langs: list[str] = requested.get("languages") or []
    if not target_langs:
        return 0

    # Resolve the user documents once; an id without a matching user yields None.
    linguist_user = None
    if default_linguist:
        linguist_user = await db.users.find_one({"_id": default_linguist})
    reviewer_user = None
    if default_reviewer:
        reviewer_user = await db.users.find_one({"_id": default_reviewer})

    assigned_at = datetime.utcnow()
    touched = 0
    qc_map: dict = job.get("language_qc") or {}

    for code in target_langs:
        state: dict = qc_map.get(code) or {}
        # A language that already has a linguist is left completely untouched.
        if state.get("assigned_linguist_id"):
            continue

        fields: dict = {}
        if linguist_user:
            fields[f"language_qc.{code}.assigned_linguist_id"] = default_linguist
            fields[f"language_qc.{code}.assigned_linguist_email"] = linguist_user["email"]
            fields[f"language_qc.{code}.assigned_linguist_name"] = linguist_user.get("full_name", "")
            fields[f"language_qc.{code}.assigned_at"] = assigned_at
            fields[f"language_qc.{code}.assigned_by_user_id"] = "system"
            # Keep an existing status; otherwise start the language as pending.
            fields[f"language_qc.{code}.status"] = state.get(
                "status", LanguageQCStatus.PENDING.value
            )
        if reviewer_user:
            fields[f"language_qc.{code}.assigned_reviewer_id"] = default_reviewer
            fields[f"language_qc.{code}.assigned_reviewer_email"] = reviewer_user["email"]
            fields[f"language_qc.{code}.assigned_reviewer_name"] = reviewer_user.get("full_name", "")

        if not fields:
            continue
        await db[_JOBS].update_one({"_id": job_id}, {"$set": fields})
        touched += 1

    if touched:
        logger.info("auto_assign_defaults: assigned %d languages on job %s", touched, job_id)
    return touched
base = getattr(settings, "app_url", "https://ai-sandbox.oliver.solutions/video-accessibility")
return f"{base}/admin/qc/{job_id}#lang-{lang}"
# ── Core mutations ────────────────────────────────────────────────────────────
async def get_state(db: AsyncIOMotorDatabase, job_id: str, lang: str) -> LanguageQCState | None:
async def get_state(db: AsyncIOMotorDatabase, job_id: str, lang: str) -> Optional[LanguageQCState]:
job_doc = await db[_JOBS].find_one({"_id": job_id}, {f"language_qc.{lang}": 1})
if not job_doc:
return None
@ -212,8 +126,8 @@ async def assign_linguist(
actor: User,
*,
http_request=None,
notes: str | None = None,
deadline: datetime | None = None,
notes: Optional[str] = None,
deadline: Optional[datetime] = None,
) -> LanguageQCState:
"""PM/PROD/ADMIN assigns a linguist to a language. Creates per-lang state if missing."""
job_doc = await db[_JOBS].find_one({"_id": job_id})
@ -224,8 +138,6 @@ async def assign_linguist(
if not linguist_doc:
raise HTTPException(status_code=404, detail="Linguist not found")
await _assert_user_in_job_org(db, linguist_user_id, job_doc)
now = datetime.utcnow()
current_state_raw = (job_doc.get("language_qc") or {}).get(lang, {})
prev_assignee = current_state_raw.get("assigned_linguist_id") if isinstance(current_state_raw, dict) else None
@ -309,8 +221,8 @@ async def reassign_linguist(
actor: User,
*,
http_request=None,
notes: str | None = None,
deadline: datetime | None = None,
notes: Optional[str] = None,
deadline: Optional[datetime] = None,
) -> LanguageQCState:
"""Currently-assigned linguist OR PM/PROD/ADMIN hands off to a colleague."""
job_doc = await db[_JOBS].find_one({"_id": job_id})
@ -339,8 +251,8 @@ async def assign_reviewer(
actor: User,
*,
http_request=None,
notes: str | None = None,
deadline: datetime | None = None,
notes: Optional[str] = None,
deadline: Optional[datetime] = None,
) -> LanguageQCState:
"""PM/PROD/ADMIN assigns a reviewer to a language."""
job_doc = await db[_JOBS].find_one({"_id": job_id})
@ -351,8 +263,6 @@ async def assign_reviewer(
if not reviewer_doc:
raise HTTPException(status_code=404, detail="Reviewer not found")
await _assert_user_in_job_org(db, reviewer_user_id, job_doc)
now = datetime.utcnow()
current_state_raw = (job_doc.get("language_qc") or {}).get(lang, {})
prev_reviewer = current_state_raw.get("assigned_reviewer_id") if isinstance(current_state_raw, dict) else None
@ -425,8 +335,8 @@ async def reassign_reviewer(
actor: User,
*,
http_request=None,
notes: str | None = None,
deadline: datetime | None = None,
notes: Optional[str] = None,
deadline: Optional[datetime] = None,
) -> LanguageQCState:
if actor.role not in (UserRole.PRODUCTION, UserRole.ADMIN, UserRole.PROJECT_MANAGER):
raise HTTPException(status_code=403, detail="Only PM/PROD/ADMIN can reassign reviewer")
@ -514,7 +424,6 @@ async def submit_for_review(
**(current_state_raw if isinstance(current_state_raw, dict) else {}),
"status": LanguageQCStatus.PENDING_REVIEW.value,
"submitted_for_review_at": now,
"reviewed_cues": 0, # R-12: reviewer must re-acknowledge cues after each resubmit
"history": history,
}
@ -626,7 +535,7 @@ async def approve_language(
actor: User,
*,
http_request=None,
notes: str | None = None,
notes: Optional[str] = None,
) -> LanguageQCState:
job_doc = await db[_JOBS].find_one({"_id": job_id})
if not job_doc:
@ -692,76 +601,11 @@ async def approve_language(
logger.exception("Failed to send approval emails")
refreshed = await db[_JOBS].find_one({"_id": job_id})
# When the source language is approved, dispatch translation for any target
# languages that don't have VTTs yet (EN-first gate).
source_lang = (refreshed.get("source") or {}).get("language", "en")
if lang == source_lang:
target_langs = [lg for lg in _job_languages(refreshed) if lg != source_lang]
if target_langs:
outputs = refreshed.get("outputs") or {}
untranslated = [lg for lg in target_langs if not (outputs.get(lg) or {}).get("captions_vtt_gcs")]
if untranslated:
try:
from ..services.cloud_run_dispatch import dispatch as _cr_dispatch
await db[_JOBS].update_one(
{"_id": job_id},
{
"$set": {
"status": JobStatus.TRANSLATING.value,
"updated_at": datetime.utcnow(),
},
"$push": {
"review.history": {
"at": datetime.utcnow(),
"status": JobStatus.TRANSLATING.value,
"by": "system",
"notes": f"EN approved — dispatching translation for {untranslated}",
}
},
},
)
await _cr_dispatch("translate", job_id, languages=untranslated)
logger.info(f"Job {job_id}: EN approved, dispatched translation for {untranslated}")
return LanguageQCState(**updated_state)
except Exception as exc:
logger.error(f"Job {job_id}: failed to dispatch translation after EN approval: {exc}")
elif (refreshed.get("requested_outputs") or {}).get("accessible_video_mp4"):
# Source-only job requesting accessible video: no translation needed,
# but TTS+render pipeline must run to produce the accessible MP4.
try:
from ..services.cloud_run_dispatch import dispatch as _cr_dispatch
await db[_JOBS].update_one(
{"_id": job_id},
{
"$set": {
"status": JobStatus.TRANSLATING.value,
"updated_at": datetime.utcnow(),
},
"$push": {
"review.history": {
"at": datetime.utcnow(),
"status": JobStatus.TRANSLATING.value,
"by": "system",
"notes": "EN approved — dispatching TTS and accessible video render (source-only)",
}
},
},
)
await _cr_dispatch("translate", job_id)
logger.info(f"Job {job_id}: EN approved (source-only), dispatched TTS+render pipeline")
return LanguageQCState(**updated_state)
except Exception as exc:
logger.error(f"Job {job_id}: failed to dispatch TTS+render after EN approval: {exc}")
await _maybe_advance_job(db, refreshed)
return LanguageQCState(**updated_state)
REJECT_CATEGORIES = frozenset(["timing", "mistranslation", "terminology", "profanity", "length", "other"])
async def reject_language(
db: AsyncIOMotorDatabase,
job_id: str,
@ -769,13 +613,10 @@ async def reject_language(
actor: User,
notes: str,
*,
category: str | None = None,
http_request=None,
) -> LanguageQCState:
if not notes or not notes.strip():
raise HTTPException(status_code=422, detail="Rejection notes are required")
if category and category not in REJECT_CATEGORIES:
raise HTTPException(status_code=422, detail=f"Invalid reject category. Must be one of: {', '.join(sorted(REJECT_CATEGORIES))}")
job_doc = await db[_JOBS].find_one({"_id": job_id})
if not job_doc:
@ -798,8 +639,6 @@ async def reject_language(
"reviewed_by_user_id": str(actor.id),
"reviewed_by_email": actor.email,
"notes": notes,
"reject_category": category,
"reviewed_cues": 0,
"submitted_for_review_at": None,
"history": history,
}
@ -862,7 +701,7 @@ async def reopen_language(
actor: User,
*,
http_request=None,
notes: str | None = None,
notes: Optional[str] = None,
) -> LanguageQCState:
"""PROD/ADMIN only — resets an approved language back to pending for re-review."""
if actor.role not in (UserRole.PRODUCTION, UserRole.ADMIN):
@ -979,22 +818,7 @@ async def add_comment(
details={"lang": lang},
)
# WS broadcast — live comment indicator for everyone on this job
try:
await connection_manager.broadcast_to_job(job_id, {
"type": "language_qc_comment",
"job_id": job_id,
"lang": lang,
"data": {
"author_name": actor.full_name or actor.email,
"lang": lang,
"comment_id": comment.id,
},
})
except Exception:
pass
# Fan-out email to all other assignees
# Fan-out to all other assignees
recipients = _qc_recipients(job_doc, current_state_raw if isinstance(current_state_raw, dict) else {}, exclude_user_id=actor.email)
if recipients:
try:
@ -1021,15 +845,12 @@ async def list_for_linguist(
db: AsyncIOMotorDatabase,
linguist_id: str,
*,
accessible_org_ids: list[str] | None = None,
status_filter: str | None = None,
status_filter: Optional[str] = None,
skip: int = 0,
limit: int = 50,
) -> list[dict]:
"""Return jobs where the linguist has an assignment, along with which languages."""
query: dict = {"qc_assignments.linguist_id": linguist_id}
if accessible_org_ids is not None:
query["organization_id"] = {"$in": accessible_org_ids}
if status_filter:
query["qc_assignments"] = {"$elemMatch": {"linguist_id": linguist_id, "status": status_filter}}
@ -1047,18 +868,14 @@ async def list_for_reviewer(
db: AsyncIOMotorDatabase,
reviewer_id: str,
*,
accessible_org_ids: list[str] | None = None,
status_filter: str | None = None,
status_filter: Optional[str] = None,
skip: int = 0,
limit: int = 50,
) -> list[dict]:
"""Return jobs where the reviewer is assigned to at least one language."""
# language_qc is a dict keyed by lang; pre-filter by org then scan in Python for assigned reviewer
base_query: dict = {}
if accessible_org_ids is not None:
base_query["organization_id"] = {"$in": accessible_org_ids}
# language_qc is an embedded dict keyed by lang code; scan in Python
all_jobs_cursor = db[_JOBS].find(
base_query,
{},
{"title": 1, "status": 1, "language_qc": 1, "qc_assignments": 1, "created_at": 1, "updated_at": 1}
).sort("updated_at", -1).skip(skip).limit(limit * 5) # over-fetch, filter in Python
@ -1149,20 +966,8 @@ def _assert_can_approve(job_doc: dict, lang: str, actor: User) -> None:
"""Raise 403 if actor cannot approve this language.
Two-stage QC is enforced: linguist must submit before reviewer can approve.
English-first is enforced: source language must be approved before any target.
PRODUCTION and ADMIN may override both gates.
PRODUCTION and ADMIN may override (explicit admin action, logged separately).
"""
source_lang = (job_doc.get("source") or {}).get("language", "en")
if lang != source_lang and actor.role not in (UserRole.PRODUCTION, UserRole.ADMIN):
source_state = (job_doc.get("language_qc") or {}).get(source_lang, {})
if not isinstance(source_state, dict):
source_state = {}
if source_state.get("status") != LanguageQCStatus.APPROVED.value:
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail=f"Source language '{source_lang}' must be approved before approving '{lang}'",
)
if actor.role in (UserRole.PRODUCTION, UserRole.ADMIN):
return

View file

@ -1,28 +1,24 @@
"""Membership service — queries the memberships collection."""
from datetime import UTC, datetime
from datetime import datetime, timezone
from typing import Optional
from motor.motor_asyncio import AsyncIOMotorDatabase
from ..models.membership import MemberDetail, Membership
from ..models.membership import Membership, MemberDetail
from ..models.organization import OrgRole
def _now() -> datetime:
return datetime.now(UTC)
return datetime.now(timezone.utc)
def _membership_from_doc(doc: dict) -> Membership:
raw_role = doc.get("role_in_org", "member")
try:
role = OrgRole(raw_role)
except ValueError:
role = OrgRole.MEMBER
return Membership(
id=str(doc["_id"]),
user_id=doc.get("user_id", ""),
organization_id=doc.get("organization_id", ""),
role_in_org=role,
user_id=doc["user_id"],
organization_id=doc["organization_id"],
role_in_org=OrgRole(doc["role_in_org"]),
created_at=doc.get("created_at"),
created_by=doc.get("created_by"),
)
@ -40,7 +36,7 @@ async def get_membership(
user_id: str,
organization_id: str,
db: AsyncIOMotorDatabase,
) -> Membership | None:
) -> Optional[Membership]:
doc = await db.memberships.find_one(
{"user_id": user_id, "organization_id": organization_id}
)
@ -63,7 +59,7 @@ async def upsert_membership(
user_id: str,
organization_id: str,
role_in_org: OrgRole,
created_by: str | None,
created_by: Optional[str],
db: AsyncIOMotorDatabase,
) -> Membership:
now = _now()
@ -109,24 +105,19 @@ async def list_org_members(
"as": "user_doc",
}
},
{"$unwind": {"path": "$user_doc", "preserveNullAndEmptyArrays": False}},
{"$unwind": {"path": "$user_doc", "preserveNullAndEmpty": False}},
{"$sort": {"created_at": 1}},
]
details = []
async for doc in db.memberships.aggregate(pipeline):
u = doc["user_doc"]
raw_role = doc.get("role_in_org", "member")
try:
role = OrgRole(raw_role)
except ValueError:
role = OrgRole.MEMBER
details.append(
MemberDetail(
membership_id=str(doc["_id"]),
user_id=doc.get("user_id", ""),
email=u.get("email") or "",
full_name=u.get("full_name") or "",
role_in_org=role,
user_id=doc["user_id"],
email=u.get("email", ""),
full_name=u.get("full_name", ""),
role_in_org=OrgRole(doc["role_in_org"]),
created_at=doc.get("created_at"),
)
)

View file

@ -3,6 +3,7 @@
Validates Microsoft ID tokens and extracts user information.
"""
import time
from typing import Dict, Optional
import httpx
from jose import JWTError, jwt
@ -49,11 +50,11 @@ class MicrosoftAuthService:
self.openid_config_url = f"{self.authority}/v2.0/.well-known/openid-configuration"
# Cache for JWKS (public keys)
self._jwks_cache: dict | None = None
self._jwks_cache: Optional[Dict] = None
self._jwks_cache_time: float = 0
self._jwks_cache_ttl: int = 3600 # Cache for 1 hour
async def _get_openid_config(self) -> dict:
async def _get_openid_config(self) -> Dict:
"""Fetch OpenID Connect configuration from Microsoft."""
try:
async with httpx.AsyncClient(timeout=10) as client:
@ -62,9 +63,9 @@ class MicrosoftAuthService:
return response.json()
except httpx.HTTPError as e:
logger.error(f"Failed to fetch OpenID configuration: {e}")
raise MicrosoftAuthError("Failed to fetch Microsoft authentication configuration") from e
raise MicrosoftAuthError("Failed to fetch Microsoft authentication configuration")
async def _get_jwks(self, force_refresh: bool = False) -> dict:
async def _get_jwks(self, force_refresh: bool = False) -> Dict:
"""Fetch JSON Web Key Set (JWKS) from Microsoft.
Args:
@ -97,7 +98,7 @@ class MicrosoftAuthService:
except httpx.HTTPError as e:
logger.error(f"Failed to fetch JWKS: {e}")
raise MicrosoftAuthError("Failed to fetch Microsoft public keys") from e
raise MicrosoftAuthError("Failed to fetch Microsoft public keys")
async def validate_token(self, id_token: str) -> MicrosoftUserInfo:
"""Validate Microsoft ID token and extract user information.
@ -120,7 +121,7 @@ class MicrosoftAuthService:
if not kid:
raise MicrosoftTokenValidationError("Token header missing 'kid' claim")
def _find_key(keys: list) -> dict | None:
def _find_key(keys: list) -> Optional[Dict]:
for key in keys:
if key.get('kid') == kid:
return {'kty': key['kty'], 'kid': key['kid'], 'use': key.get('use'),
@ -145,7 +146,7 @@ class MicrosoftAuthService:
issuer=f"https://login.microsoftonline.com/{self.tenant_id}/v2.0"
)
except JWTError as e:
raise MicrosoftTokenValidationError(f"Token validation failed: {str(e)}") from e
raise MicrosoftTokenValidationError(f"Token validation failed: {str(e)}")
email = payload.get('email') or payload.get('preferred_username')
if not email:
@ -176,12 +177,12 @@ class MicrosoftAuthService:
except JWKError as e:
logger.error(f"JWK error during token validation: {e}")
raise MicrosoftTokenValidationError(f"Key processing error: {str(e)}") from e
raise MicrosoftTokenValidationError(f"Key processing error: {str(e)}")
except Exception as e:
if isinstance(e, (MicrosoftAuthError, MicrosoftTokenValidationError)):
raise
logger.error(f"Unexpected error during token validation: {e}")
raise MicrosoftTokenValidationError(f"Token validation failed: {str(e)}") from e
raise MicrosoftTokenValidationError(f"Token validation failed: {str(e)}")
# Singleton instance

View file

@ -1,10 +1,11 @@
"""Google Cloud Secret Manager integration service."""
import asyncio
import os
from google.api_core import exceptions as gcp_exceptions
import asyncio
from typing import Dict, List, Optional, Any
from functools import lru_cache
from google.cloud import secretmanager
from google.api_core import exceptions as gcp_exceptions
from app.core.config import get_settings
from app.core.logging import get_logger
@ -20,14 +21,14 @@ class SecretManagerError(Exception):
class SecretsManager:
"""Service for managing secrets via Google Cloud Secret Manager."""
def __init__(self):
self.settings = get_settings()
self.client: secretmanager.SecretManagerServiceClient | None = None
self.client: Optional[secretmanager.SecretManagerServiceClient] = None
self.project_id = self.settings.google_cloud_project
self._cache: dict[str, str] = {}
self._cache: Dict[str, str] = {}
self._cache_ttl = 300 # 5 minutes cache
def _get_client(self) -> secretmanager.SecretManagerServiceClient:
"""Get or create Secret Manager client."""
if not self.client:
@ -36,93 +37,93 @@ class SecretsManager:
logger.info("Secret Manager client initialized")
except Exception as e:
logger.error(f"Failed to initialize Secret Manager client: {e}")
raise SecretManagerError(f"Failed to initialize Secret Manager: {e}") from e
raise SecretManagerError(f"Failed to initialize Secret Manager: {e}")
return self.client
@trace_async_operation("secrets_manager.get_secret")
async def get_secret(self, secret_name: str, version: str = "latest") -> str:
"""
Retrieve a secret from Google Cloud Secret Manager.
Args:
secret_name: Name of the secret
version: Version of the secret (default: "latest")
Returns:
The secret value as a string
Raises:
SecretManagerError: If secret cannot be retrieved
"""
cache_key = f"{secret_name}:{version}"
# Check cache first
if cache_key in self._cache:
logger.debug(f"Secret {secret_name} retrieved from cache")
return self._cache[cache_key]
try:
# Build the secret name
name = f"projects/{self.project_id}/secrets/{secret_name}/versions/{version}"
# Get the secret
client = self._get_client()
# Run in thread pool since Secret Manager client is synchronous
loop = asyncio.get_event_loop()
response = await loop.run_in_executor(
None,
None,
client.access_secret_version,
{"name": name}
)
secret_value = response.payload.data.decode("UTF-8")
# Cache the secret (with TTL handled by application restart)
self._cache[cache_key] = secret_value
logger.info(f"Successfully retrieved secret: {secret_name}")
return secret_value
except gcp_exceptions.NotFound:
error_msg = f"Secret not found: {secret_name}"
logger.error(error_msg)
raise SecretManagerError(error_msg) from None
raise SecretManagerError(error_msg)
except gcp_exceptions.PermissionDenied:
error_msg = f"Permission denied accessing secret: {secret_name}"
logger.error(error_msg)
raise SecretManagerError(error_msg) from None
raise SecretManagerError(error_msg)
except Exception as e:
error_msg = f"Failed to retrieve secret {secret_name}: {e}"
logger.error(error_msg)
raise SecretManagerError(error_msg) from e
raise SecretManagerError(error_msg)
@trace_async_operation("secrets_manager.get_secrets_batch")
async def get_secrets_batch(self, secret_names: list[str]) -> dict[str, str]:
async def get_secrets_batch(self, secret_names: List[str]) -> Dict[str, str]:
"""
Retrieve multiple secrets efficiently.
Args:
secret_names: List of secret names to retrieve
Returns:
Dictionary mapping secret names to their values
"""
secrets = {}
tasks = []
for secret_name in secret_names:
task = asyncio.create_task(
self.get_secret(secret_name),
name=f"get_secret_{secret_name}"
)
tasks.append((secret_name, task))
# Wait for all tasks to complete
for secret_name, task in tasks:
try:
@ -131,34 +132,34 @@ class SecretsManager:
logger.warning(f"Failed to retrieve secret {secret_name}: {e}")
# Continue with other secrets
continue
return secrets
async def create_secret(self, secret_name: str, secret_value: str, labels: dict[str, str] | None = None) -> str:
async def create_secret(self, secret_name: str, secret_value: str, labels: Optional[Dict[str, str]] = None) -> str:
"""
Create a new secret in Secret Manager.
Args:
secret_name: Name of the secret
secret_value: Value to store
labels: Optional labels for the secret
Returns:
The full secret resource name
"""
try:
client = self._get_client()
parent = f"projects/{self.project_id}"
# Create the secret
secret = {
"labels": labels or {},
"replication": {"automatic": {}}
}
loop = asyncio.get_event_loop()
# Create secret resource
create_response = await loop.run_in_executor(
None,
@ -169,7 +170,7 @@ class SecretsManager:
"secret": secret
}
)
# Add secret version with the actual value
version_response = await loop.run_in_executor(
None,
@ -179,20 +180,20 @@ class SecretsManager:
"payload": {"data": secret_value.encode("UTF-8")}
}
)
logger.info(f"Successfully created secret: {secret_name}")
return version_response.name
except gcp_exceptions.AlreadyExists:
error_msg = f"Secret already exists: {secret_name}"
logger.error(error_msg)
raise SecretManagerError(error_msg) from None
raise SecretManagerError(error_msg)
except Exception as e:
error_msg = f"Failed to create secret {secret_name}: {e}"
logger.error(error_msg)
raise SecretManagerError(error_msg) from e
raise SecretManagerError(error_msg)
def clear_cache(self) -> None:
"""Clear the secrets cache."""
self._cache.clear()
@ -217,7 +218,7 @@ async def get_database_url() -> str:
# Fallback to environment variable
url = os.getenv("MONGODB_URL")
if not url:
raise SecretManagerError("MongoDB URL not available in secrets or environment") from None
raise SecretManagerError("MongoDB URL not available in secrets or environment")
return url
@ -229,11 +230,11 @@ async def get_redis_url() -> str:
# Fallback to environment variable
url = os.getenv("REDIS_URL")
if not url:
raise SecretManagerError("Redis URL not available in secrets or environment") from None
raise SecretManagerError("Redis URL not available in secrets or environment")
return url
async def get_jwt_secrets() -> dict[str, str]:
async def get_jwt_secrets() -> Dict[str, str]:
"""Get JWT secrets from Secret Manager."""
try:
return await secrets_manager.get_secrets_batch([
@ -248,22 +249,22 @@ async def get_jwt_secrets() -> dict[str, str]:
}
async def get_api_keys() -> dict[str, str]:
async def get_api_keys() -> Dict[str, str]:
"""Get all API keys from Secret Manager."""
api_keys = {}
secret_names = [
"gemini-api-key",
"sendgrid-api-key",
"sendgrid-api-key",
"elevenlabs-api-key",
"sentry-dsn"
]
try:
api_keys = await secrets_manager.get_secrets_batch(secret_names)
except SecretManagerError:
logger.warning("Failed to retrieve some API keys from Secret Manager, using environment fallback")
# Fallback to environment variables for missing keys
env_mapping = {
"gemini-api-key": "GEMINI_API_KEY",
@ -271,7 +272,7 @@ async def get_api_keys() -> dict[str, str]:
"elevenlabs-api-key": "ELEVENLABS_API_KEY",
"sentry-dsn": "SENTRY_DSN"
}
for secret_name, env_var in env_mapping.items():
if secret_name not in api_keys:
env_value = os.getenv(env_var)
@ -279,5 +280,5 @@ async def get_api_keys() -> dict[str, str]:
api_keys[secret_name] = env_value
else:
logger.warning(f"API key {secret_name} not available in secrets or environment")
return api_keys
return api_keys

View file

@ -1,5 +1,6 @@
import io
from dataclasses import dataclass
from typing import Optional
import aiohttp
from google.cloud import texttospeech
@ -46,8 +47,8 @@ class TTSService:
self,
ad_vtt_content: str,
language_code: str = "en-US",
voice_name: str | None = None,
provider: str | None = None,
voice_name: Optional[str] = None,
provider: Optional[str] = None,
model: str = "flash",
speed: float = 1.0,
style_prompt: str = "",
@ -113,8 +114,8 @@ class TTSService:
self,
ad_vtt_content: str,
language_code: str = "en-US",
voice_name: str | None = None,
provider: str | None = None,
voice_name: Optional[str] = None,
provider: Optional[str] = None,
model: str = "flash",
speed: float = 1.0,
style_prompt: str = "",
@ -218,7 +219,7 @@ class TTSService:
self,
ad_vtt_content: str,
language_code: str = "en-US",
voice_name: str | None = None
voice_name: Optional[str] = None
) -> bytes:
"""Generate MP3 using Google TTS with 2-second pauses between passages"""
@ -232,7 +233,7 @@ class TTSService:
audio_segments = []
current_audio_position = 0.0 # Track actual audio timeline position
for _i, cue in enumerate(cues):
for i, cue in enumerate(cues):
# Calculate where this cue should start (anchored to VTT timing)
target_start_time = cue["start_time"]
@ -280,7 +281,7 @@ class TTSService:
self,
ad_vtt_content: str,
language_code: str = "en-US",
voice_name: str | None = None,
voice_name: Optional[str] = None,
stability: float = 0.5,
similarity_boost: float = 0.5,
) -> bytes:
@ -298,7 +299,7 @@ class TTSService:
audio_segments = []
current_audio_position = 0.0 # Track actual audio timeline position
for _i, cue in enumerate(cues):
for i, cue in enumerate(cues):
# Calculate where this cue should start (anchored to VTT timing)
target_start_time = cue["start_time"]
@ -338,7 +339,7 @@ class TTSService:
self,
text: str,
language_code: str,
voice_name: str | None = None
voice_name: Optional[str] = None
) -> bytes:
"""Synthesize a single text string to audio using Google TTS"""
# Configure voice
@ -403,7 +404,7 @@ class TTSService:
error_text = await response.text()
raise ValueError(f"ElevenLabs TTS failed: {response.status} - {error_text}")
def _get_elevenlabs_voice(self, language_code: str, voice_name: str | None = None) -> str:
def _get_elevenlabs_voice(self, language_code: str, voice_name: Optional[str] = None) -> str:
"""Get ElevenLabs voice ID for language"""
if voice_name:
return voice_name
@ -451,32 +452,28 @@ class TTSService:
def _parse_timestamp(self, timestamp: str) -> float:
"""Convert VTT timestamp to seconds"""
# Format: HH:MM:SS.mmm or MM:SS.mmm
try:
parts = timestamp.split(":")
parts = timestamp.split(":")
if len(parts) == 3: # HH:MM:SS.mmm
hours, minutes, seconds = parts
elif len(parts) == 2: # MM:SS.mmm
hours, minutes, seconds = "0", parts[0], parts[1]
else:
raise ValueError(f"Invalid timestamp format: {timestamp}")
if len(parts) == 3: # HH:MM:SS.mmm
hours, minutes, seconds = parts
elif len(parts) == 2: # MM:SS.mmm
hours, minutes, seconds = "0", parts[0], parts[1]
else:
raise ValueError(f"Invalid timestamp format: {timestamp}")
# Parse seconds and milliseconds
sec_parts = seconds.split(".")
seconds_int = int(sec_parts[0])
milliseconds = int(sec_parts[1]) if len(sec_parts) > 1 else 0
# Parse seconds and milliseconds
sec_parts = seconds.split(".")
seconds = int(sec_parts[0])
milliseconds = int(sec_parts[1]) if len(sec_parts) > 1 else 0
total_seconds = (
int(hours) * 3600 +
int(minutes) * 60 +
seconds_int +
milliseconds / 1000.0
)
return total_seconds
except (ValueError, IndexError) as e:
if "Invalid timestamp format" in str(e):
raise
raise ValueError(f"Invalid timestamp format: {timestamp}") from e
total_seconds = (
int(hours) * 3600 +
int(minutes) * 60 +
seconds +
milliseconds / 1000.0
)
return total_seconds
# Global service instance

View file

@ -1,4 +1,4 @@
from typing import Any
from typing import Dict, List, Any
from ..core.logging import get_logger
from ..lib.vtt import VTTEditor
@ -11,7 +11,7 @@ class AssetValidationService:
"""Service for validating job assets before completion"""
@staticmethod
async def validate_job_assets(job_doc: dict[str, Any]) -> tuple[bool, list[str]]:
async def validate_job_assets(job_doc: Dict[str, Any]) -> tuple[bool, List[str]]:
"""
Validate all assets for a job before allowing completion
Returns (is_valid, list_of_errors)
@ -19,7 +19,7 @@ class AssetValidationService:
errors = []
outputs = job_doc.get("outputs", {})
requested_outputs = job_doc.get("requested_outputs", {})
if not outputs:
errors.append("No outputs generated for this job")
return False, errors
@ -88,13 +88,13 @@ class AssetValidationService:
# Download and validate VTT content
blob_path = gcs_uri.replace(f"gs://{gcs_service.bucket.name}/", "")
blob = gcs_service.bucket.blob(blob_path)
if not blob.exists():
return f"{asset_name} file not found in storage"
vtt_content = blob.download_as_text()
is_valid, vtt_errors = VTTEditor.validate_vtt(vtt_content)
if not is_valid:
return f"{asset_name} validation failed: {'; '.join(vtt_errors[:3])}"
@ -118,13 +118,13 @@ class AssetValidationService:
try:
blob_path = gcs_uri.replace(f"gs://{gcs_service.bucket.name}/", "")
blob = gcs_service.bucket.blob(blob_path)
if not blob.exists():
return f"{asset_name} file not found in storage"
# Reload blob to get metadata (including size)
blob.reload()
# Check file size (should be reasonable for audio)
size_mb = blob.size / (1024 * 1024) if blob.size else 0
if size_mb < 0.01: # Less than 10KB
@ -169,4 +169,4 @@ class AssetValidationService:
# Global service instance
asset_validation_service = AssetValidationService()
asset_validation_service = AssetValidationService()

View file

@ -23,6 +23,7 @@ from google.oauth2 import id_token
from ..core.config import settings
from ..core.logging import get_logger
from ..models.job import PausePointData, VideoSegmentMetadata
from ..schemas.accessible_video import AccessibleVideoMethod, GeminiAccessibleVideoAnalysis
logger = get_logger(__name__)
@ -54,9 +55,6 @@ class VideoRendererService:
# Audio ducking settings
self.duck_level = getattr(settings, 'accessible_video_duck_level', 0.3)
self.duck_fade_ms = getattr(settings, 'accessible_video_duck_fade_ms', 200)
# Adaptive silence buffer settings (A1)
self._silence_buffer_default = getattr(settings, 'ad_silence_buffer_default', 0.5)
self._silence_buffer_min_after = getattr(settings, 'ad_silence_buffer_min_after', 0.1)
# Cloud Run support
self._gcs_client: storage.Client | None = None
# Source video caching for Cloud Run (uploaded once, reused across operations)
@ -231,7 +229,7 @@ class VideoRendererService:
error_detail = e.response.json().get("detail", str(e))
except Exception:
error_detail = str(e)
raise FFmpegExecutionError(f"Cloud Run {endpoint} failed: {error_detail}") from e
raise FFmpegExecutionError(f"Cloud Run {endpoint} failed: {error_detail}")
async def _dispatch_ffmpeg(self, cmd: list[str], timeout: int = 3600) -> dict[str, Any]:
"""
@ -251,7 +249,6 @@ class VideoRendererService:
FFmpegExecutionError: If the command fails
"""
from celery.result import allow_join_result
from ..tasks.ffmpeg_operations import run_ffmpeg_command
# Dispatch to ffmpeg queue
@ -295,7 +292,6 @@ class VideoRendererService:
FFmpegExecutionError: If the command fails
"""
from celery.result import allow_join_result
from ..tasks.ffmpeg_operations import run_ffprobe_command
# Dispatch to ffmpeg queue
@ -391,7 +387,8 @@ class VideoRendererService:
logger.info(f"Starting overlay render for {source_video_path}")
placements = analysis.get("placements", [])
with tempfile.TemporaryDirectory() as _temp_dir:
with tempfile.TemporaryDirectory() as temp_dir:
temp_dir_path = Path(temp_dir)
# Get source video duration
duration = await self._get_video_duration(source_video_path)
@ -414,7 +411,7 @@ class VideoRendererService:
filter_parts = []
# Add each AD segment as input
for _cue_index, mp3_path in ad_segments:
for cue_index, mp3_path in ad_segments:
inputs.extend(["-i", mp3_path])
# Build complex filter
@ -428,7 +425,7 @@ class VideoRendererService:
# Add delay to each AD segment and mix
ad_labels = []
for i, (cue_index, _mp3_path) in enumerate(ad_segments):
for i, (cue_index, mp3_path) in enumerate(ad_segments):
# Find the placement for this cue
placement = next(
(p for p in placements if p.get("ad_cue_index") == cue_index),
@ -481,7 +478,7 @@ class VideoRendererService:
output_path
])
logger.info("Running ffmpeg overlay command...")
logger.info(f"Running ffmpeg overlay command...")
await self._run_ffmpeg(cmd)
logger.info(f"Overlay render complete: {output_path}")
@ -563,7 +560,7 @@ class VideoRendererService:
logger.info(f"Source Properties: {video_props}, Duration: {source_duration:.2f}s")
# Create a mapping of cue_index to mp3_path
cue_to_mp3 = dict(ad_segments)
cue_to_mp3 = {cue_index: mp3_path for cue_index, mp3_path in ad_segments}
# Pre-process placements and validate
valid_placements = []
@ -601,38 +598,18 @@ class VideoRendererService:
final_segment_needed = final_segment_start < source_duration
# ============================================================
# PARALLEL PHASE 1: Generate per-cue silence files + extract all frames + video segments
# PARALLEL PHASE 1: Generate shared silence + extract all frames + all video segments
# ============================================================
logger.info(f"Phase 1: Parallel extraction of {len(valid_placements)} frames and video segments")
# Compute adaptive silence buffers per cue (A1):
# natural_gap_ms already present at the pause point reduces how much silence we add.
_buf_default = self._silence_buffer_default
_buf_min_after = self._silence_buffer_min_after
silence_pre_paths: dict[int, str] = {}
silence_post_paths: dict[int, str] = {}
for p in valid_placements:
i = p["index"]
natural_gap = (p.get("natural_gap_ms") or 0.0) / 1000.0
silence_before = max(0.05, _buf_default - natural_gap * 0.5)
silence_after = max(_buf_min_after, _buf_default - natural_gap * 0.3)
p["silence_before"] = silence_before
p["silence_after"] = silence_after
silence_pre_paths[i] = str(temp_dir_path / f"silence_pre_{i}.m4a")
silence_post_paths[i] = str(temp_dir_path / f"silence_post_{i}.m4a")
logger.debug(
f"Cue {p['cue_index']}: natural_gap={natural_gap*1000:.0f}ms → "
f"silence_before={silence_before*1000:.0f}ms silence_after={silence_after*1000:.0f}ms"
)
silence_duration = 0.5 # 500ms shared by all
silence_path = temp_dir_path / "silence_shared.m4a"
# Build tasks for phase 1
phase1_tasks = []
# Tasks: Generate per-cue silence files
for p in valid_placements:
i = p["index"]
phase1_tasks.append(self._generate_silence(p["silence_before"], silence_pre_paths[i], video_props))
phase1_tasks.append(self._generate_silence(p["silence_after"], silence_post_paths[i], video_props))
# Task: Generate silence (just once, shared by all)
phase1_tasks.append(self._generate_silence(silence_duration, str(silence_path), video_props))
# Tasks: Extract all video segments
video_segment_paths = {}
@ -689,7 +666,7 @@ class VideoRendererService:
combined_audio_path = temp_dir_path / f"combined_audio_{i}.m4a"
combined_audio_paths[i] = str(combined_audio_path)
phase2_tasks.append(self._concatenate_audio(
[silence_pre_paths[i], p["ad_mp3_path"], silence_post_paths[i]],
[str(silence_path), p["ad_mp3_path"], str(silence_path)],
str(combined_audio_path),
video_props
))
@ -708,14 +685,11 @@ class VideoRendererService:
i = p["index"]
cue_index = p["cue_index"]
ad_duration = p["ad_duration"]
silence_before = p["silence_before"]
silence_after = p["silence_after"]
total_freeze_duration = ad_duration + silence_before + silence_after
total_freeze_duration = ad_duration + (2 * silence_duration)
logger.info(
f"Cue {cue_index}: Freeze segment — "
f"pre={silence_before*1000:.0f}ms + AD={ad_duration:.2f}s + "
f"post={silence_after*1000:.0f}ms = {total_freeze_duration:.2f}s"
f"Cue {cue_index}: Freeze segment with silence buffers - "
f"500ms + AD={ad_duration:.2f}s + 500ms = {total_freeze_duration:.2f}s"
)
freeze_segment_path = temp_dir_path / f"freeze_segment_{i}.mp4"
@ -734,17 +708,29 @@ class VideoRendererService:
# ============================================================
# PHASE 3.5: Measure actual freeze segment durations for VTT retiming
# ============================================================
# Use the pre-computed expected duration for each freeze segment.
# Cloud Run-generated freeze segments are created to this exact duration,
# so probing is unnecessary and avoids dispatching to the Celery ffmpeg
# queue (which caused FFprobe code 1 failures on Cloud Run output files).
logger.info("Setting freeze segment durations from pre-computed values...")
# NOTE: Use _get_video_duration_local directly since freeze segments are
# local files. Using _get_video_duration would incorrectly use the cached
# source video URI in Cloud Run mode instead of measuring the freeze segment.
logger.info("Measuring actual freeze segment durations...")
for p in valid_placements:
expected = p["ad_duration"] + p["silence_before"] + p["silence_after"]
p["actual_freeze_duration"] = expected
logger.debug(
f"Freeze segment cue {p['cue_index']}: duration={expected:.3f}s"
)
i = p["index"]
freeze_path = freeze_segment_paths[i]
actual_duration = await self._get_video_duration_local(freeze_path)
p["actual_freeze_duration"] = actual_duration
# Log any discrepancy between expected and actual duration
expected = p["ad_duration"] + (2 * silence_duration)
discrepancy = actual_duration - expected
if abs(discrepancy) > 0.01: # 10ms threshold
logger.warning(
f"Freeze segment duration mismatch for cue {p['cue_index']}: "
f"expected={expected:.3f}s, actual={actual_duration:.3f}s, "
f"discrepancy={discrepancy:+.3f}s"
)
else:
logger.debug(
f"Freeze segment cue {p['cue_index']}: duration={actual_duration:.3f}s (expected={expected:.3f}s)"
)
# ============================================================
# PHASE 4: Assemble segment list in correct order
@ -883,6 +869,9 @@ class VideoRendererService:
# Pause point is at the START of the freeze frame in the rendered timeline
pause_ms = freeze_frame_starts.get(cue_index, p["pause_point"] * 1000)
# Find the freeze segment for this cue to get its end position
freeze_seg = next((s for s in segment_metadata_list if s.is_freeze_frame and s.cue_index == cue_index), None)
# Compute min bound: end of previous AD segment (or 0 for first)
if idx == 0:
min_bound_ms = 0.0
@ -1545,7 +1534,7 @@ class VideoRendererService:
"""
Generate a silent audio file of specified duration.
Used to create adaptive silence buffers before/after AD audio.
Used to create 500ms silence buffers before/after AD audio.
"""
if self._use_cloud_run:
await self._generate_silence_cloud_run(duration, output_path, props)

Some files were not shown because too many files have changed in this diff Show more