# ── Patch context that shares these physical lines (preserved verbatim) ──────
# | diff --git a/backend/.gitignore b/backend/.gitignore
# | index 4b42251..2b2b48f 100644
# | --- a/backend/.gitignore
# | +++ b/backend/.gitignore
# | @@ -23,6 +23,7 @@ eggs/
# |  .eggs/
# |  lib/
# |  lib64/
# | +!app/lib/
# |  parts/
# |  sdist/
# |  var/
# | diff --git a/backend/app/api/v1/routes_glossaries.py b/backend/app/api/v1/routes_glossaries.py
# | new file mode 100644
# | index 0000000..5eaf993
# | --- /dev/null
# | +++ b/backend/app/api/v1/routes_glossaries.py
# | @@ -0,0 +1,288 @@
# ── backend/app/api/v1/routes_glossaries.py ──────────────────────────────────
"""
Glossary management endpoints.

Access:
  - All glossary mutations (upload, activate, archive) → Admin or PM of the client
  - Glossary reads (list, detail, terms) → Admin, PM, or staff members

Routes are nested under /clients/{client_id}/glossaries to keep ownership clear.
"""
from __future__ import annotations

from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile

from ...core.dependencies import get_current_user, require_pm_for_client, require_roles
from ...core.logging import get_logger
from ...models.audit_log import AuditAction
from ...models.glossary import (
    GlossaryDetailResponse,
    GlossaryResponse,
    GlossaryVersionResponse,
)
from ...models.user import User, UserRole
from ...services import audit_logger as audit_svc
from ...services import glossary_service as svc

logger = get_logger(__name__)

router = APIRouter(
    prefix="/clients/{client_id}/glossaries",
    tags=["glossaries"],
)

_ALLOWED_CONTENT_TYPES = {
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
}
_MAX_FILE_SIZE_MB = 50

# NOTE(review): the module docstring promises "Admin or PM of the client" for
# mutations, but the mutating routes below depend on require_roles(...), which
# as used here is given only role names and no client id.
# _require_client_staff() exists for the client-scoped check but is not wired
# into any route. Confirm the contract of require_pm_for_client before
# switching the dependencies over.


def _require_client_staff(client_id: str):
    """Dependency: admin or PM of this client."""
    return require_pm_for_client(client_id_param="client_id")


# ── List glossaries ───────────────────────────────────────────────────────────

@router.get("", response_model=list[GlossaryResponse])
async def list_glossaries(
    client_id: str,
    current_user: User = Depends(get_current_user),
):
    """List all active glossaries for a client."""
    _assert_can_read(current_user)
    glossaries = await svc.get_glossaries_for_client(client_id)
    return [_to_response(g) for g in glossaries]


# ── Upload new glossary ───────────────────────────────────────────────────────

@router.post("", response_model=GlossaryDetailResponse, status_code=201)
async def upload_glossary(
    client_id: str,
    file: UploadFile = File(..., description="xlsx glossary file"),
    name: str = Form(...),
    source_locale: str = Form(..., description="BCP-47 source locale, e.g. en-GB"),
    source_locale_col: str = Form(..., description="xlsx column header for the source language, e.g. en_gb"),
    description: str | None = Form(None),
    change_note: str | None = Form(None),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
    """Upload a new glossary xlsx file and associate it with a client."""
    _validate_xlsx(file)

    try:
        glossary, version = await svc.ingest_glossary(
            client_id=client_id,
            name=name,
            source_locale=source_locale,
            source_locale_col=source_locale_col,
            file=file,
            user_id=str(current_user.id),
            description=description,
            change_note=change_note,
        )
    except ValueError as exc:
        # The service signals bad spreadsheet content with ValueError.
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_UPLOAD,
        description=f"Glossary '{name}' uploaded for client {client_id}",
        user=current_user,
        resource_type="glossary",
        resource_id=glossary.id,
        details={"term_count": version.term_count, "source_locale": source_locale},
    )

    versions = await svc.get_versions(glossary.id)
    return _to_detail_response(glossary, versions)


# ── Get glossary detail ───────────────────────────────────────────────────────

@router.get("/{glossary_id}", response_model=GlossaryDetailResponse)
async def get_glossary(
    client_id: str,
    glossary_id: str,
    current_user: User = Depends(get_current_user),
):
    """Return one glossary with its versions; 404 if it is not this client's."""
    _assert_can_read(current_user)
    glossary = await svc.get_glossary(glossary_id)
    if not glossary or glossary.client_id != client_id:
        raise HTTPException(status_code=404, detail="Glossary not found")
    versions = await svc.get_versions(glossary_id)
    return _to_detail_response(glossary, versions)


# ── Browse terms ──────────────────────────────────────────────────────────────

@router.get("/{glossary_id}/terms")
async def list_terms(
    client_id: str,
    glossary_id: str,
    version_id: str | None = Query(None, description="Specific version; defaults to active"),
    search: str | None = Query(None),
    page: int = Query(1, ge=1),
    page_size: int = Query(50, ge=1, le=200),
    current_user: User = Depends(get_current_user),
):
    """
    Return one page of terms for a glossary version.

    Uses the explicitly requested ``version_id`` when given, otherwise the
    glossary's current version. Responds with an empty page when the glossary
    has no current version yet.

    Raises:
        HTTPException 404: glossary not found for this client, or the
            requested version does not belong to this glossary.
    """
    _assert_can_read(current_user)
    glossary = await svc.get_glossary(glossary_id)
    if not glossary or glossary.client_id != client_id:
        raise HTTPException(status_code=404, detail="Glossary not found")

    if version_id:
        # A caller-supplied version id must belong to the glossary that was
        # just checked against client_id; otherwise the client check above
        # could be bypassed by naming a version of a different glossary.
        versions = await svc.get_versions(glossary_id)
        if version_id not in {str(v.id) for v in versions}:
            raise HTTPException(status_code=404, detail="Glossary version not found")

    vid = version_id or glossary.current_version_id
    if not vid:
        return {"terms": [], "total": 0, "page": page, "page_size": page_size}

    terms, total = await svc.get_terms_page(vid, search=search, page=page, page_size=page_size)
    return {
        "terms": [{"source_term": t.source_term, "translations": t.translations} for t in terms],
        "total": total,
        "page": page,
        "page_size": page_size,
    }


# ── Upload new version ────────────────────────────────────────────────────────

@router.post("/{glossary_id}/versions", response_model=GlossaryVersionResponse, status_code=201)
async def upload_version(
    client_id: str,
    glossary_id: str,
    file: UploadFile = File(...),
    source_locale_col: str = Form(...),
    change_note: str | None = Form(None),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
    """Upload a new xlsx file as a new version of an existing glossary."""
    _validate_xlsx(file)
    glossary = await svc.get_glossary(glossary_id)
    if not glossary or glossary.client_id != client_id:
        raise HTTPException(status_code=404, detail="Glossary not found")

    try:
        version = await svc.ingest_new_version(
            glossary_id=glossary_id,
            source_locale_col=source_locale_col,
            file=file,
            user_id=str(current_user.id),
            change_note=change_note,
        )
    except ValueError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_VERSION_UPLOAD,
        description=f"New glossary version uploaded for glossary {glossary_id}",
        user=current_user,
        resource_type="glossary_version",
        resource_id=version.id,
        details={"term_count": version.term_count, "version_number": version.version_number},
    )
    return _version_to_response(version)


# ── Activate a version ────────────────────────────────────────────────────────

@router.post("/{glossary_id}/activate")
async def activate_version(
    client_id: str,
    glossary_id: str,
    version_id: str = Form(...),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
    """Make ``version_id`` the active version of the glossary and audit it."""
    glossary = await svc.get_glossary(glossary_id)
    if not glossary or glossary.client_id != client_id:
        raise HTTPException(status_code=404, detail="Glossary not found")

    try:
        await svc.activate_version(glossary_id, version_id)
    except ValueError as exc:
        # Here a ValueError from the service is reported as "not found".
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_ACTIVATE,
        description=f"Glossary version {version_id} activated",
        user=current_user,
        resource_type="glossary",
        resource_id=glossary_id,
        details={"version_id": version_id},
    )
    return {"status": "ok", "active_version_id": version_id}


# ── Archive (soft-delete) ─────────────────────────────────────────────────────

@router.delete("/{glossary_id}", status_code=204)
async def archive_glossary(
    client_id: str,
    glossary_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
):
    """Archive a glossary (admin only) and record the action; returns no body."""
    glossary = await svc.get_glossary(glossary_id)
    if not glossary or glossary.client_id != client_id:
        raise HTTPException(status_code=404, detail="Glossary not found")
    await svc.archive_glossary(glossary_id)
    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_ARCHIVE,
        description=f"Glossary {glossary_id} archived",
        user=current_user,
        resource_type="glossary",
        resource_id=glossary_id,
    )


# ── Helpers ───────────────────────────────────────────────────────────────────

def _assert_can_read(user: User) -> None:
    """Raise 403 unless the user's role is one of the staff roles listed here."""
    allowed = {UserRole.ADMIN, UserRole.PROJECT_MANAGER, UserRole.REVIEWER,
               UserRole.LINGUIST, UserRole.PRODUCTION}
    if user.role not in allowed:
        raise HTTPException(status_code=403, detail="Insufficient permissions")


def _validate_xlsx(file: UploadFile) -> None:
    """
    Reject uploads that do not look like a spreadsheet or that are too large.

    An upload passes the type check when either its declared content type is
    in ``_ALLOWED_CONTENT_TYPES`` or its filename ends in ``.xlsx`` (compared
    case-insensitively, so ``TERMS.XLSX`` is accepted).

    Raises:
        HTTPException 422: neither the content type nor the filename matches.
        HTTPException 413: the reported size exceeds ``_MAX_FILE_SIZE_MB``.
    """
    filename = file.filename or ""
    looks_like_xlsx = filename.lower().endswith(".xlsx")
    if file.content_type not in _ALLOWED_CONTENT_TYPES and not looks_like_xlsx:
        raise HTTPException(
            status_code=422,
            detail="Only .xlsx files are accepted",
        )

    # UploadFile.size is not present on older Starlette releases and may be
    # None when the size is unknown; only enforce the cap when it is reported.
    size = getattr(file, "size", None)
    if size is not None and size > _MAX_FILE_SIZE_MB * 1024 * 1024:
        raise HTTPException(
            status_code=413,
            detail=f"File exceeds the {_MAX_FILE_SIZE_MB} MB limit",
        )


def _to_response(g) -> GlossaryResponse:
    """Map a glossary object onto the API response model."""
    return GlossaryResponse(
        id=str(g.id),
        client_id=g.client_id,
        name=g.name,
        description=g.description,
        source_locale=g.source_locale,
        source=g.source,
        status=g.status,
        current_version_id=g.current_version_id,
        created_at=g.created_at,
        created_by=g.created_by,
    )


def _version_to_response(v) -> GlossaryVersionResponse:
    """Map a glossary version object onto the API response model."""
    return GlossaryVersionResponse(
        id=str(v.id),
        glossary_id=v.glossary_id,
        version_number=v.version_number,
        term_count=v.term_count,
        embedded_count=v.embedded_count,
        embedding_status=v.embedding_status,
        created_at=v.created_at,
        created_by=v.created_by,
        change_note=v.change_note,
    )


def _to_detail_response(glossary, versions) -> GlossaryDetailResponse:
    """Build the detail response: the glossary fields plus all its versions."""
    return GlossaryDetailResponse(
        **_to_response(glossary).model_dump(),
        versions=[_version_to_response(v) for v in versions],
    )


# ── Patch context that shares these physical lines (preserved verbatim) ──────
# The "--- " header below continues on the next physical line of the patch.
# | diff --git a/backend/app/core/config.py b/backend/app/core/config.py
# | index 4a4576c..788188e 100644
# | ---
a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -93,7 +93,24 @@ class Settings(BaseSettings): "sv": "sv-SE", "es-419": "es-US", "pt-BR": "pt-BR", - "fr-CA": "fr-CA" + "fr-CA": "fr-CA", + # Explicit region variants (added for locale-aware glossary support) + "de-DE": "de-DE", + "en-US": "en-US", + "en-GB": "en-GB", + "en-CA": "en-CA", + "es-ES": "es-ES", + "es-MX": "es-US", + "fr-FR": "fr-FR", + "it-IT": "it-IT", + "ja-JP": "ja-JP", + "ko-KR": "ko-KR", + "nl-NL": "nl-NL", + "pl-PL": "pl-PL", + "cs-CZ": "cs-CZ", + "tr-TR": "tr-TR", + "id-ID": "id-ID", + "pt-PT": "pt-PT", } gemini_tts_language_names: dict[str, str] = { "en": "English", @@ -129,7 +146,24 @@ class Settings(BaseSettings): "sv": "Swedish", "es-419": "Spanish (Latin America)", "pt-BR": "Portuguese (Brazil)", - "fr-CA": "French (Canada)" + "fr-CA": "French (Canada)", + # Explicit region variants + "de-DE": "German (Germany)", + "en-US": "English (US)", + "en-GB": "English (UK)", + "en-CA": "English (Canada)", + "es-ES": "Spanish (Spain)", + "es-MX": "Spanish (Mexico)", + "fr-FR": "French (France)", + "it-IT": "Italian (Italy)", + "ja-JP": "Japanese (Japan)", + "ko-KR": "Korean (Korea)", + "nl-NL": "Dutch (Netherlands)", + "pl-PL": "Polish (Poland)", + "cs-CZ": "Czech (Czech Republic)", + "tr-TR": "Turkish (Turkey)", + "id-ID": "Indonesian (Indonesia)", + "pt-PT": "Portuguese (Portugal)", } gemini_tts_preview_samples: dict[str, str] = { "en": "This is a preview of the audio description voice.", @@ -165,7 +199,24 @@ class Settings(BaseSettings): "sv": "Det här är en förhandsgranskning av ljudbeskrivningsrösten.", "es-419": "Esta es una vista previa de la voz de audiodescripción.", "pt-BR": "Esta é uma prévia da voz da audiodescrição.", - "fr-CA": "Ceci est un aperçu de la voix de l'audiodescription." 
# ── Patch context that shares these physical lines (preserved verbatim) ──────
# Continuation of the backend/app/core/config.py hunk (gemini_tts_preview_samples):
# | + "fr-CA": "Ceci est un aperçu de la voix de l'audiodescription.",
# | + # Explicit region variants
# | + "de-DE": "Dies ist eine Vorschau der Audiodeskriptionsstimme.",
# | + "en-US": "This is a preview of the audio description voice.",
# | + "en-GB": "This is a preview of the audio description voice.",
# | + "en-CA": "This is a preview of the audio description voice.",
# | + "es-ES": "Esta es una vista previa de la voz de audiodescripción.",
# | + "es-MX": "Esta es una vista previa de la voz de audiodescripción.",
# | + "fr-FR": "Ceci est un aperçu de la voix de l'audiodescription.",
# | + "it-IT": "Questa è un'anteprima della voce dell'audiodescrizione.",
# | + "ja-JP": "これは音声解説の声のプレビューです。",
# | + "ko-KR": "이것은 오디오 설명 음성의 미리보기입니다.",
# | + "nl-NL": "Dit is een voorbeeld van de audiodescriptiestem.",
# | + "pl-PL": "To jest podgląd głosu audiodeskrypcji.",
# | + "cs-CZ": "Toto je náhled hlasu zvukového popisu.",
# | + "tr-TR": "Bu, sesli betimleme sesinin bir önizlemesidir.",
# | + "id-ID": "Ini adalah pratinjau suara deskripsi audio.",
# | + "pt-PT": "Esta é uma pré-visualização da voz da audiodescrição.",
# |   }
# |   # Gemini TTS Model Options
# | diff --git a/backend/app/lib/locales.py b/backend/app/lib/locales.py
# | new file mode 100644
# | index 0000000..62d4013
# | --- /dev/null
# | +++ b/backend/app/lib/locales.py
# | @@ -0,0 +1,245 @@
# ── backend/app/lib/locales.py ───────────────────────────────────────────────
"""
Central locale registry.

Provides a single source of truth for BCP-47 codes, display names,
and Gemini-friendly labels used throughout the translation/TTS pipeline.

Convention: BCP-47 with hyphen separator (fr-FR, en-GB, pt-BR).
xlsx underscore format (fr_fr, en_gb) is normalized at import time.
Bare language-only codes (fr, en) remain valid for legacy compat.
"""
# The original `from __future__ import annotations` is not needed: the only
# forward-looking annotation below is written as a string.
from dataclasses import dataclass


@dataclass(frozen=True)
class Locale:
    code: str            # canonical BCP-47 (e.g. "fr-FR")
    display_name: str    # human-readable (e.g. "French (France)")
    gemini_label: str    # what to pass to Gemini prompts (e.g. "French (France)")
    tts_lang: str        # BCP-47 for TTS API (may differ, e.g. es-MX → es-US)
    preview_sample: str  # sample sentence for TTS preview


# Sentence returned when no locale (or parent language) is registered.
_DEFAULT_PREVIEW_SAMPLE = "This is a preview of the audio description voice."

# Master locale registry. Bare language codes (legacy) + explicit region variants.
# Bare-language samples carry the same diacritics as their region variants.
_REGISTRY: dict[str, Locale] = {loc.code: loc for loc in [
    # ── English ──────────────────────────────────────────────────────────────
    Locale("en", "English", "English", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-US", "English (US)", "English (United States)", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-GB", "English (UK)", "English (United Kingdom)", "en-GB",
           "This is a preview of the audio description voice."),
    Locale("en-CA", "English (Canada)", "English (Canada)", "en-CA",
           "This is a preview of the audio description voice."),
    # ── Spanish ──────────────────────────────────────────────────────────────
    Locale("es", "Spanish", "Spanish", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-ES", "Spanish (Spain)", "Spanish (Spain)", "es-ES",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-MX", "Spanish (Mexico)", "Spanish (Mexico)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-419", "Spanish (Latin America)", "Spanish (Latin America)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    # ── French ───────────────────────────────────────────────────────────────
    Locale("fr", "French", "French", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-FR", "French (France)", "French (France)", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-CA", "French (Canada)", "French (Canada)", "fr-CA",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    # ── German ───────────────────────────────────────────────────────────────
    Locale("de", "German", "German", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    Locale("de-DE", "German (Germany)", "German (Germany)", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    # ── Italian ──────────────────────────────────────────────────────────────
    Locale("it", "Italian", "Italian", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    Locale("it-IT", "Italian (Italy)", "Italian (Italy)", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    # ── Portuguese ───────────────────────────────────────────────────────────
    Locale("pt", "Portuguese", "Portuguese", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-BR", "Portuguese (Brazil)", "Portuguese (Brazil)", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-PT", "Portuguese (Portugal)", "Portuguese (Portugal)", "pt-PT",
           "Esta é uma pré-visualização da voz da audiodescrição."),
    # ── Japanese ─────────────────────────────────────────────────────────────
    Locale("ja", "Japanese", "Japanese", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    Locale("ja-JP", "Japanese (Japan)", "Japanese (Japan)", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    # ── Korean ───────────────────────────────────────────────────────────────
    Locale("ko", "Korean", "Korean", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    Locale("ko-KR", "Korean (Korea)", "Korean (South Korea)", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    # ── Arabic ───────────────────────────────────────────────────────────────
    Locale("ar", "Arabic", "Arabic", "ar-EG",
           "هذه معاينة لصوت الوصف الصوتي."),
    # ── Hindi ────────────────────────────────────────────────────────────────
    Locale("hi", "Hindi", "Hindi", "hi-IN",
           "यह ऑडियो विवरण आवाज का पूर्वावलोकन है।"),
    # ── Indonesian ───────────────────────────────────────────────────────────
    Locale("id", "Indonesian", "Indonesian", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    Locale("id-ID", "Indonesian (Indonesia)", "Indonesian (Indonesia)", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    # ── Dutch ────────────────────────────────────────────────────────────────
    Locale("nl", "Dutch", "Dutch", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    Locale("nl-NL", "Dutch (Netherlands)", "Dutch (Netherlands)", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    # ── Polish ───────────────────────────────────────────────────────────────
    Locale("pl", "Polish", "Polish", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    Locale("pl-PL", "Polish (Poland)", "Polish (Poland)", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    # ── Russian ──────────────────────────────────────────────────────────────
    Locale("ru", "Russian", "Russian", "ru-RU",
           "Это предварительный просмотр голоса аудиоописания."),
    # ── Thai ─────────────────────────────────────────────────────────────────
    Locale("th", "Thai", "Thai", "th-TH",
           "นี่คือตัวอย่างเสียงบรรยายภาพ"),
    # ── Turkish ──────────────────────────────────────────────────────────────
    Locale("tr", "Turkish", "Turkish", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    Locale("tr-TR", "Turkish (Turkey)", "Turkish (Turkey)", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    # ── Vietnamese ───────────────────────────────────────────────────────────
    Locale("vi", "Vietnamese", "Vietnamese", "vi-VN",
           "Đây là bản xem trước giọng mô tả âm thanh."),
    # ── Romanian ─────────────────────────────────────────────────────────────
    Locale("ro", "Romanian", "Romanian", "ro-RO",
           "Aceasta este o previzualizare a vocii descrierii audio."),
    # ── Ukrainian ────────────────────────────────────────────────────────────
    Locale("uk", "Ukrainian", "Ukrainian", "uk-UA",
           "Це попередній перегляд голосу аудіоопису."),
    # ── Bengali ──────────────────────────────────────────────────────────────
    Locale("bn", "Bengali", "Bengali", "bn-BD",
           "এটি অডিও বর্ণনা ভয়েসের একটি প্রিভিউ।"),
    # ── Marathi ──────────────────────────────────────────────────────────────
    Locale("mr", "Marathi", "Marathi", "mr-IN",
           "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे."),
    # ── Tamil ────────────────────────────────────────────────────────────────
    Locale("ta", "Tamil", "Tamil", "ta-IN",
           "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்."),
    # ── Telugu ───────────────────────────────────────────────────────────────
    Locale("te", "Telugu", "Telugu", "te-IN",
           "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ."),
    # ── Chinese ──────────────────────────────────────────────────────────────
    Locale("zh", "Chinese", "Chinese (Simplified)", "zh-CN",
           "这是音频描述语音的预览。"),
    # ── Czech ────────────────────────────────────────────────────────────────
    Locale("cs", "Czech", "Czech", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    Locale("cs-CZ", "Czech (Czech Republic)", "Czech (Czech Republic)", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    # ── Danish ───────────────────────────────────────────────────────────────
    Locale("da", "Danish", "Danish", "da-DK",
           "Dette er en forhåndsvisning af lydbeskrivelsesstemmen."),
    # ── Finnish ──────────────────────────────────────────────────────────────
    Locale("fi", "Finnish", "Finnish", "fi-FI",
           "Tämä on äänikuvauksen äänen esikatselu."),
    # ── Hungarian ────────────────────────────────────────────────────────────
    Locale("hu", "Hungarian", "Hungarian", "hu-HU",
           "Ez a hangos leírás hangjának előnézete."),
    # ── Norwegian ────────────────────────────────────────────────────────────
    Locale("no", "Norwegian", "Norwegian", "nb-NO",
           "Dette er en forhåndsvisning av lydbeskrivelsesstemmen."),
    # ── Slovak ───────────────────────────────────────────────────────────────
    Locale("sk", "Slovak", "Slovak", "sk-SK",
           "Toto je náhľad hlasu zvukového popisu."),
    # ── Swedish ──────────────────────────────────────────────────────────────
    Locale("sv", "Swedish", "Swedish", "sv-SE",
           "Det här är en förhandsgranskning av ljudbeskrivningsrösten."),
]}

# xlsx uses underscores; normalize to BCP-47 hyphen form
_XLSX_ALIASES: dict[str, str] = {
    code.replace("-", "_").lower(): code
    for code in _REGISTRY
    if "-" in code
}
# a few extra mappings for edge cases
_XLSX_ALIASES.update({
    "id": "id",  # Indonesian column header is just "id" (no region)
})


def normalize_code(code: str) -> str:
    """
    Normalize an arbitrary locale code to the canonical BCP-47 form used in this registry.

    Handles:
    - xlsx underscore form:  "fr_fr"  → "fr-FR"
    - Unregistered underscore form: "zh_tw" → "zh-TW"
    - Bare language code:    "FR"     → "fr"  (legacy compat)
    - Already canonical:     "fr-FR"  → "fr-FR"

    Surrounding whitespace is ignored. Falsy input (``""``/``None``) is
    returned unchanged. The language subtag is always lower-cased and
    everything after the first separator upper-cased, so the result can be
    split on "-" to find the parent language.

    >>> normalize_code("zh_tw")
    'zh-TW'
    """
    if not code:
        return code
    lowered = code.strip().lower()
    if "_" in lowered:
        alias = _XLSX_ALIASES.get(lowered)
        if alias is not None:
            return alias
        # Unregistered xlsx-style code: fall through to the hyphen handling
        # instead of upper-casing the whole tag.
        lowered = lowered.replace("_", "-")
    if "-" in lowered:
        language, _, region = lowered.partition("-")
        return f"{language}-{region.upper()}"
    # Bare language code (legacy)
    return lowered


def get(code: str) -> "Locale | None":
    """
    Return Locale for the given code, or None if unknown.

    Falls back to the parent language entry ("de-AT" → "de") when the exact
    region variant is not registered. Falsy input yields None.
    """
    canonical = normalize_code(code)
    if not canonical:
        return None
    return _REGISTRY.get(canonical) or _REGISTRY.get(canonical.split("-")[0])


def get_display_name(code: str) -> str:
    """Human-readable display name, e.g. 'French (Canada)'; the input code if unknown."""
    locale = get(code)
    return locale.display_name if locale else code


def get_gemini_label(code: str) -> str:
    """
    Label to use inside Gemini prompts, e.g. 'French (Canada)'.
    Gemini models respond more reliably to human-readable language names
    than to bare BCP-47 codes when used inside instruction prompts.
    Returns the input code unchanged when the locale is unknown.
    """
    locale = get(code)
    return locale.gemini_label if locale else code


def get_tts_lang(code: str) -> str:
    """BCP-47 code for the TTS API (may differ from canonical, e.g. es-MX → es-US)."""
    locale = get(code)
    return locale.tts_lang if locale else code


def get_preview_sample(code: str) -> str:
    """
    Language-appropriate TTS preview sentence.

    get() already falls back to the parent language, so the only remaining
    fallback is the English sentence for completely unknown codes.
    """
    locale = get(code)
    return locale.preview_sample if locale else _DEFAULT_PREVIEW_SAMPLE


def all_codes() -> list[str]:
    """Return all registered locale codes, sorted."""
    return sorted(_REGISTRY)


def all_display_map() -> dict[str, str]:
    """Return {code: display_name} for all registered locales."""
    return {code: locale.display_name for code, locale in _REGISTRY.items()}


# ── Patch context that shares these physical lines (preserved verbatim) ──────
# The "+++ " header at the end continues on the next physical line of the patch.
# | diff --git a/backend/app/main.py b/backend/app/main.py
# | index 0b60f2c..86b8c79 100644
# | --- a/backend/app/main.py
# | +++ b/backend/app/main.py
# | @@ -23,6 +23,7 @@ from .api.v1.routes_tts import router as tts_router
# |  from .api.v1.routes_websockets import router as websockets_router
# |  from .api.v1.routes_vtt_versions import router as vtt_versions_router
# |  from .api.v1.routes_language_qc import router as language_qc_router
# | +from .api.v1.routes_glossaries import router as glossaries_router
# |  from .services.websocket import connection_manager
# |  from .core.config import settings
# |  from .core.secrets_config import initialize_config
# | @@ -273,6 +274,7 @@ app.include_router(jobs_router, prefix="/api/v1")
# |  app.include_router(review_notes_router, prefix="/api/v1")
# |  app.include_router(vtt_versions_router, prefix="/api/v1")
# |  app.include_router(language_qc_router, prefix="/api/v1")
# | +app.include_router(glossaries_router, prefix="/api/v1")
# |  app.include_router(tts_router, prefix="/api/v1")
# |  app.include_router(admin_router, prefix="/api/v1")
# |  app.include_router(websockets_router, prefix="/api/v1")
# | diff --git a/backend/app/models/audit_log.py b/backend/app/models/audit_log.py
# | index 1c96738..0a84153 100644
# | --- a/backend/app/models/audit_log.py
# | +++
# ── Patch context that shares these physical lines (preserved verbatim) ──────
# Continuation of the "+++ " header of the backend/app/models/audit_log.py diff:
# | b/backend/app/models/audit_log.py
# | @@ -61,6 +61,12 @@ class AuditAction(str, Enum):
# |   ADMIN_DATA_EXPORT = "admin.data.export"
# |   ADMIN_AUDIT_ACCESS = "admin.audit.access"
# |
# | + # Glossary management
# | + GLOSSARY_UPLOAD = "glossary.upload"
# | + GLOSSARY_VERSION_UPLOAD = "glossary.version.upload"
# | + GLOSSARY_ACTIVATE = "glossary.activate"
# | + GLOSSARY_ARCHIVE = "glossary.archive"
# | +
# |   # Security events
# |   RATE_LIMIT_EXCEEDED = "security.rate_limit.exceeded"
# |   VALIDATION_FAILURE = "security.validation.failure"
# | diff --git a/backend/app/models/glossary.py b/backend/app/models/glossary.py
# | new file mode 100644
# | index 0000000..0dfc60a
# | --- /dev/null
# | +++ b/backend/app/models/glossary.py
# | @@ -0,0 +1,139 @@
# ── backend/app/models/glossary.py ───────────────────────────────────────────
from __future__ import annotations

from datetime import datetime
from enum import StrEnum

from pydantic import BaseModel, Field


# NOTE: classes that had no docstring in the original are documented with
# comments rather than docstrings, so the generated schema descriptions stay
# exactly as they were.

# Where a glossary's content came from.
class GlossarySource(StrEnum):
    XLSX_UPLOAD = "xlsx_upload"
    FRAZE_API = "fraze_api"  # reserved for future FRAZE integration


# Lifecycle state of a glossary.
class GlossaryStatus(StrEnum):
    ACTIVE = "active"
    ARCHIVED = "archived"


# Progress of the embedding job for one glossary version.
class EmbeddingStatus(StrEnum):
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    DONE = "done"
    FAILED = "failed"


# ── Stored documents ──────────────────────────────────────────────────────────

# A client-owned glossary; `current_version_id` points at the active version.
class Glossary(BaseModel):
    id: str | None = Field(None, alias="_id")
    client_id: str
    name: str
    description: str | None = None
    source_locale: str  # BCP-47 source column, e.g. "en-GB"
    source: GlossarySource = GlossarySource.XLSX_UPLOAD
    status: GlossaryStatus = GlossaryStatus.ACTIVE
    current_version_id: str | None = None
    created_at: datetime = Field(default_factory=datetime.utcnow)
    created_by: str  # user_id

    model_config = {"populate_by_name": True, "arbitrary_types_allowed": True}


# One uploaded revision of a glossary, with embedding progress counters.
class GlossaryVersion(BaseModel):
    id: str | None = Field(None, alias="_id")
    glossary_id: str
    version_number: int
    source_xlsx_gcs_path: str | None = None  # GCS path to original file
    term_count: int = 0
    embedded_count: int = 0
    embedding_status: EmbeddingStatus = EmbeddingStatus.PENDING
    created_at: datetime = Field(default_factory=datetime.utcnow)
    created_by: str
    change_note: str | None = None

    model_config = {"populate_by_name": True}


class GlossaryTerm(BaseModel):
    """One source term with its per-locale translations."""
    id: str | None = Field(None, alias="_id")
    glossary_id: str
    version_id: str
    cid: str | None = None  # 3M Content ID from xlsx
    tid: str | None = None  # 3M Term ID from xlsx
    source_term: str  # canonical source text (whitespace-normalised)
    source_term_lower: str  # lowercase for case-insensitive index
    translations: dict[str, str] = {}  # {locale_code: translated_text}
    embedding: list[float] | None = None  # 768-dim Gemini embedding

    model_config = {"populate_by_name": True}


# ── Schema models (API request/response) ──────────────────────────────────────

# Request fields for creating a glossary.
class GlossaryCreate(BaseModel):
    name: str
    description: str | None = None
    source_locale: str
    change_note: str | None = None


# Request fields for adding a version to an existing glossary.
class GlossaryVersionCreate(BaseModel):
    source_locale: str
    change_note: str | None = None


# API view of a glossary (no version list).
class GlossaryResponse(BaseModel):
    id: str
    client_id: str
    name: str
    description: str | None = None
    source_locale: str
    source: GlossarySource
    status: GlossaryStatus
    current_version_id: str | None = None
    created_at: datetime
    created_by: str


# API view of a single glossary version.
class GlossaryVersionResponse(BaseModel):
    id: str
    glossary_id: str
    version_number: int
    term_count: int
    embedded_count: int
    embedding_status: EmbeddingStatus
    created_at: datetime
    created_by: str
    change_note: str | None = None


# GlossaryResponse plus the list of its versions.
class GlossaryDetailResponse(GlossaryResponse):
    versions: list[GlossaryVersionResponse] = []


class GlossaryTermPreview(BaseModel):
    """Subset of GlossaryTerm for UI previews."""
    source_term: str
    translations: dict[str, str]


class MatchedTerm(BaseModel):
    """A term matched against VTT source text, with the target-locale translation."""
    source_term: str
    target_translation: str
    match_kind: str  # "exact" | "vector"
    score: float  # 1.0 for exact, cosine similarity for vector


# ── Document → model conversion ───────────────────────────────────────────────

def _with_string_id(doc: dict) -> dict:
    """Return a shallow copy of *doc* whose ``_id`` (if present) is a ``str``."""
    copied = dict(doc)  # never mutate the caller's document
    if "_id" in copied:
        copied["_id"] = str(copied["_id"])
    return copied


def glossary_from_doc(doc: dict) -> Glossary:
    """Validate a stored document as a Glossary, stringifying its ``_id``."""
    return Glossary.model_validate(_with_string_id(doc))


def glossary_version_from_doc(doc: dict) -> GlossaryVersion:
    """Validate a stored document as a GlossaryVersion, stringifying its ``_id``."""
    return GlossaryVersion.model_validate(_with_string_id(doc))


# ── Patch context that shares these physical lines (preserved verbatim) ──────
# This prompt hunk continues on the next physical line of the patch.
# | diff --git a/backend/app/prompts/gemini_ingestion.md b/backend/app/prompts/gemini_ingestion.md
# | index f52a745..b471d66 100644
# | --- a/backend/app/prompts/gemini_ingestion.md
# | +++ b/backend/app/prompts/gemini_ingestion.md
# | @@ -47,6 +47,8 @@ BRAND NAMES AND PRODUCTS:
# |  - If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification
# |  - If a product is NOT on the list or is completely unclear, use a generic descriptor — do not invent brand names
# |
# | +{GLOSSARY}
# | +
# |  CAPTION FORMATTING (DCMP standard):
# |  - Maximum TWO lines per caption. Never use three or more lines.
- Each line should be no longer than ~37 characters where possible (42 absolute max) diff --git a/backend/app/prompts/gemini_ingestion_targeted.md b/backend/app/prompts/gemini_ingestion_targeted.md index 4af276b..f9f0efd 100644 --- a/backend/app/prompts/gemini_ingestion_targeted.md +++ b/backend/app/prompts/gemini_ingestion_targeted.md @@ -51,6 +51,8 @@ BRAND NAMES AND PRODUCTS: - If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification - If a product is NOT on the list or is completely unclear, use a generic descriptor — do not invent brand names +{GLOSSARY} + CAPTION FORMATTING (DCMP standard): - Maximum TWO lines per caption. Never use three or more lines. - Each line should be no longer than ~37 characters where possible (42 absolute max) diff --git a/backend/app/prompts/gemini_transcreation.md b/backend/app/prompts/gemini_transcreation.md index 13f38d4..0255cc1 100644 --- a/backend/app/prompts/gemini_transcreation.md +++ b/backend/app/prompts/gemini_transcreation.md @@ -7,6 +7,8 @@ Rewrite the following English captions and audio descriptions into {TARGET_LANGU - timing boundaries (same cue timestamps), - line lengths friendly for readability (~32–40 chars). +{GLOSSARY} + Input: - captions_vtt_en: - ad_vtt_en: diff --git a/backend/app/services/embedding_service.py b/backend/app/services/embedding_service.py new file mode 100644 index 0000000..3c703a7 --- /dev/null +++ b/backend/app/services/embedding_service.py @@ -0,0 +1,72 @@ +""" +Embedding service backed by Gemini text-embedding-004. + +Provides batch embedding with retry/backoff for use in glossary ingestion. +Batch size: 100 texts per API call (API limit is 2048 but we keep it conservative +for memory and retry ergonomics with large glossaries). 
class EmbeddingService:
    """Batch text-embedding client backed by the Gemini embedding model.

    Texts are sent in chunks of ``_BATCH_SIZE``; each chunk is retried with
    exponential backoff up to ``_MAX_RETRIES`` times before the error is
    propagated to the caller.
    """

    def __init__(self) -> None:
        self._client = genai.Client(api_key=settings.gemini_api_key)

    async def embed_texts(
        self,
        texts: Sequence[str],
        task_type: str = "RETRIEVAL_DOCUMENT",
    ) -> list[list[float]]:
        """
        Embed a list of texts and return one float vector per input text.

        Args:
            texts: Texts to embed. An empty sequence returns an empty list
                without calling the API.
            task_type: Embedding task type forwarded to the API. Defaults to
                ``"RETRIEVAL_DOCUMENT"`` (the previous hard-coded value);
                callers embedding a search query can pass ``"RETRIEVAL_QUERY"``.

        Returns:
            Vectors in the same order as ``texts``.

        Raises:
            Exception: whatever the last failed attempt raised once a batch
                has exhausted its retries.
        """
        results: list[list[float]] = []
        for start in range(0, len(texts), _BATCH_SIZE):
            batch = list(texts[start:start + _BATCH_SIZE])
            results.extend(await self._embed_batch_with_retry(batch, task_type))
        return results

    async def embed_text(
        self,
        text: str,
        task_type: str = "RETRIEVAL_DOCUMENT",
    ) -> list[float]:
        """Embed a single text and return its vector."""
        vectors = await self.embed_texts([text], task_type=task_type)
        return vectors[0]

    async def _embed_batch_with_retry(
        self,
        texts: list[str],
        task_type: str = "RETRIEVAL_DOCUMENT",
    ) -> list[list[float]]:
        """Embed one batch, retrying with exponential backoff on any failure."""
        backoff = _INITIAL_BACKOFF
        for attempt in range(1, _MAX_RETRIES + 1):
            try:
                # The SDK call is blocking; run it off the event loop.
                response = await asyncio.to_thread(
                    self._client.models.embed_content,
                    model=_EMBED_MODEL,
                    contents=texts,
                    config=genai_types.EmbedContentConfig(
                        task_type=task_type,
                    ),
                )
                embeddings = response.embeddings or []
                # Callers pair vectors with inputs by position, so a short or
                # empty response must fail loudly rather than misalign terms.
                if len(embeddings) != len(texts):
                    raise RuntimeError(
                        f"Embedding API returned {len(embeddings)} vectors "
                        f"for {len(texts)} texts"
                    )
                return [list(emb.values) for emb in embeddings]
            except Exception as exc:
                if attempt == _MAX_RETRIES:
                    logger.error(f"Embedding batch failed after {_MAX_RETRIES} attempts: {exc}")
                    raise
                logger.warning(f"Embedding attempt {attempt} failed, retrying in {backoff}s: {exc}")
                await asyncio.sleep(backoff)
                backoff *= 2

        raise RuntimeError("unreachable")  # makes type-checker happy
72b351e..b281175 100644 --- a/backend/app/services/gemini.py +++ b/backend/app/services/gemini.py @@ -8,6 +8,7 @@ import google.genai as genai from ..core.config import settings from ..core.logging import get_logger +from ..lib import locales as locale_lib logger = get_logger(__name__) @@ -106,6 +107,12 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w - Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm) - Only add sound effect cues where they add meaningful context; do not annotate every minor sound""" + def _build_glossary_block(self, glossary_block: Optional[str]) -> str: + """Return the pre-built glossary block (from glossary_service.build_glossary_prompt_block), or empty string.""" + if glossary_block and glossary_block.strip(): + return glossary_block.strip() + return "" + def _build_brand_context_block(self, brand_context: Optional[str]) -> str: """Build the brand context instruction block for injection into prompts.""" if brand_context and brand_context.strip(): @@ -118,7 +125,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w ) return "No specific brand names have been provided for this video." 
- async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False, _cost_ctx: Optional[dict] = None) -> dict[str, Any]: + async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False, glossary_block: Optional[str] = None, _cost_ctx: Optional[dict] = None) -> dict[str, Any]: """ Extract captions and audio descriptions from video using Gemini 2.0 Returns structured JSON with transcript, captions VTT, and audio description VTT @@ -127,6 +134,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w prompt = ( prompt_template .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)) + .replace("{GLOSSARY}", self._build_glossary_block(glossary_block)) .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested)) .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested)) ) @@ -320,6 +328,7 @@ Fix the JSON and return it: target_language: str, brand_context: Optional[str] = None, sdh_requested: bool = False, + glossary_block: Optional[str] = None, _cost_ctx: Optional[dict] = None, ) -> dict[str, Any]: """ @@ -343,8 +352,9 @@ Fix the JSON and return it: prompt_template = self._load_prompt("gemini_ingestion_targeted.md") prompt = ( prompt_template - .replace("{TARGET_LANGUAGE}", target_language) + .replace("{TARGET_LANGUAGE}", locale_lib.get_gemini_label(target_language)) .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)) + .replace("{GLOSSARY}", self._build_glossary_block(glossary_block)) .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested)) .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested)) ) @@ -756,6 +766,7 @@ Fix the JSON and return it: ad_vtt: str, target_language: str, brief: Optional[str] = None, + glossary_block: Optional[str] = None, _cost_ctx: Optional[dict] = None, ) -> dict[str, str]: """ @@ -765,7 +776,8 @@ Fix the JSON and 
return it: # Format prompt with actual content prompt = prompt_template.format( - TARGET_LANGUAGE=target_language + TARGET_LANGUAGE=locale_lib.get_gemini_label(target_language), + GLOSSARY=self._build_glossary_block(glossary_block), ) user_prompt = f""" @@ -817,6 +829,7 @@ JSON: vtt_content: str, target_language: str, source_language: str = "en", + glossary_block: Optional[str] = None, _cost_ctx: Optional[dict] = None, ) -> str: """ @@ -842,14 +855,18 @@ JSON: f"{i + 1}. {cue.text.replace(chr(10), ' ')}" for i, cue in enumerate(source_cues) ) - prompt = f"""Translate the following {cue_count} numbered text segments from {source_language} to {target_language}. + _src_label = locale_lib.get_gemini_label(source_language) + _tgt_label = locale_lib.get_gemini_label(target_language) + _glossary_section = self._build_glossary_block(glossary_block) + _glossary_line = f"\n\n{_glossary_section}" if _glossary_section else "" + prompt = f"""Translate the following {cue_count} numbered text segments from {_src_label} to {_tgt_label}. REQUIREMENTS: - Return EXACTLY {cue_count} numbered lines, one translation per line - Format: "1. translated text", "2. translated text", etc. 
- Preserve speaker labels like [Speaker 1]: unchanged -- Use natural, idiomatic {target_language} -- Do NOT add any explanation, preamble, or extra lines{extra_instruction} +- Use natural, idiomatic {_tgt_label} +- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_line} Segments to translate: {numbered_texts}""" diff --git a/backend/app/services/gemini_tts.py b/backend/app/services/gemini_tts.py index 3a2a1a6..c559866 100644 --- a/backend/app/services/gemini_tts.py +++ b/backend/app/services/gemini_tts.py @@ -7,6 +7,7 @@ from pydub import AudioSegment from ..core.config import settings from ..core.logging import get_logger +from ..lib import locales as locale_lib logger = get_logger(__name__) @@ -166,10 +167,10 @@ class GeminiTTSService: Generate a preview audio sample for voice selection. Uses language-specific sample text and applies all TTS settings. """ - # Get preview sample text for the language - sample_text = settings.gemini_tts_preview_samples.get( - language, - settings.gemini_tts_preview_samples.get("en", "This is a voice preview.") + # Get preview sample text — try settings override, then locale registry, then fallback + sample_text = ( + settings.gemini_tts_preview_samples.get(language) + or locale_lib.get_preview_sample(language) ) return await self.synthesize_text( diff --git a/backend/app/services/glossary_service.py b/backend/app/services/glossary_service.py new file mode 100644 index 0000000..5c96742 --- /dev/null +++ b/backend/app/services/glossary_service.py @@ -0,0 +1,736 @@ +""" +Glossary service — per-client terminology management. + +Responsibilities: + • parse_xlsx(bytes, source_col) → list of (source_term, {locale: translation}) + • ingest_glossary(...) → create Glossary + GlossaryVersion + GlossaryTerms in Mongo + • activate_version(...) → atomic swap of current_version_id + • match_terms_for_text(...) → hybrid exact + vector retrieval for prompt injection + • build_glossary_prompt_block(...) 
def parse_xlsx(file_bytes: bytes, source_locale_col: str) -> list[_ParsedTerm]:
    """
    Parse an xlsx glossary file.

    The first row of the active sheet is the header. The source column is
    located by case-insensitive name; optional ``CID`` / ``TID`` columns are
    carried through; every other header that ``locale_lib.normalize_code``
    accepts becomes a translation column.

    Args:
        file_bytes: Raw xlsx bytes.
        source_locale_col: The column header that contains the source text,
            e.g. "en_gb" or "en-GB". Case-insensitive.

    Returns:
        List of parsed terms. Rows where the source column is empty are skipped.
        An empty sheet yields an empty list.

    Raises:
        ValueError: if no header matches ``source_locale_col``.
    """
    import openpyxl  # local import — only used during ingest

    wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
    # Read-only workbooks must be closed explicitly; the finally block covers
    # the empty-sheet return and the missing-column error as well.
    try:
        ws = wb.active
        rows = ws.iter_rows(values_only=True)

        header_row = next(rows, None)
        if header_row is None:
            return []

        # Header cells as stripped strings (None for blank cells)
        headers: list[str | None] = [
            None if h is None else str(h).strip() for h in header_row
        ]

        # Locate the source column (case-insensitive)
        wanted = source_locale_col.strip().lower()
        src_idx = next(
            (i for i, h in enumerate(headers) if h and h.lower() == wanted),
            None,
        )
        if src_idx is None:
            raise ValueError(f"Source column '{source_locale_col}' not found in xlsx. Available: {[h for h in headers if h]}")

        cid_idx = next((i for i, h in enumerate(headers) if h and h.upper() == "CID"), None)
        tid_idx = next((i for i, h in enumerate(headers) if h and h.upper() == "TID"), None)

        # All other columns with valid locale-like names become translation columns
        locale_cols: list[tuple[int, str]] = []  # [(col_index, normalized_locale_code)]
        for i, h in enumerate(headers):
            if h is None or i == src_idx or i == cid_idx or i == tid_idx:
                continue
            norm = locale_lib.normalize_code(h)
            if norm:
                locale_cols.append((i, norm))

        terms: list[_ParsedTerm] = []
        for row in rows:
            if not row or all(v is None for v in row):
                continue

            source = _cell(row, src_idx)
            if not source:
                continue

            translations: dict[str, str] = {}
            for col_idx, locale_code in locale_cols:
                val = _cell(row, col_idx)
                if val:
                    translations[locale_code] = val

            terms.append(_ParsedTerm(
                cid=_cell(row, cid_idx),
                tid=_cell(row, tid_idx),
                source_term=source,
                translations=translations,
            ))

        return terms
    finally:
        wb.close()
ingest_glossary( + client_id: str, + name: str, + source_locale: str, + source_locale_col: str, + file: UploadFile, + user_id: str, + description: str | None = None, + change_note: str | None = None, +) -> tuple[Glossary, GlossaryVersion]: + """ + Full glossary ingestion pipeline: + 1. Upload xlsx to GCS + 2. Parse terms + 3. Create Glossary + GlossaryVersion + GlossaryTerm documents in Mongo + 4. Kick off background embedding task + + Returns (Glossary, GlossaryVersion) on success. + """ + db = await get_database() + + # ── Upload original xlsx to GCS ── + file_bytes = await file.read() + glossary_id = str(ObjectId()) + version_id = str(ObjectId()) + gcs_path = f"glossaries/{client_id}/{glossary_id}/{version_id}/source.xlsx" + await _upload_bytes_to_gcs(file_bytes, gcs_path, + content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") + + # ── Parse ── + logger.info(f"Parsing xlsx for glossary {glossary_id}, source_col={source_locale_col}") + parsed_terms = parse_xlsx(file_bytes, source_locale_col) + logger.info(f"Parsed {len(parsed_terms)} terms") + + # ── Create Glossary doc ── + now = datetime.utcnow() + glossary_doc = { + "_id": ObjectId(glossary_id), + "client_id": client_id, + "name": name, + "description": description, + "source_locale": locale_lib.normalize_code(source_locale), + "source": "xlsx_upload", + "status": GlossaryStatus.ACTIVE.value, + "current_version_id": version_id, + "created_at": now, + "created_by": user_id, + } + await db[_COLL_GLOSSARIES].insert_one(glossary_doc) + + # ── Create GlossaryVersion doc ── + version_doc = { + "_id": ObjectId(version_id), + "glossary_id": glossary_id, + "version_number": 1, + "source_xlsx_gcs_path": gcs_path, + "term_count": len(parsed_terms), + "embedded_count": 0, + "embedding_status": EmbeddingStatus.PENDING.value, + "created_at": now, + "created_by": user_id, + "change_note": change_note, + } + await db[_COLL_VERSIONS].insert_one(version_doc) + + # ── Bulk insert GlossaryTerms ── + 
if parsed_terms: + term_docs = [ + { + "_id": ObjectId(), + "glossary_id": glossary_id, + "version_id": version_id, + "cid": t.cid, + "tid": t.tid, + "source_term": t.source_term, + "source_term_lower": t.source_term.lower(), + "translations": t.translations, + "embedding": None, + } + for t in parsed_terms + ] + await db[_COLL_TERMS].insert_many(term_docs, ordered=False) + + # ── Create collection indexes (idempotent) ── + await _ensure_indexes(db) + + # ── Kick off embedding Celery task ── + try: + from ..tasks.embed_glossary import embed_glossary_version_task + embed_glossary_version_task.delay(version_id) + logger.info(f"Queued embedding task for version {version_id}") + except Exception as e: + logger.warning(f"Could not queue embedding task: {e}") + + glossary = glossary_from_doc(glossary_doc) + version = glossary_version_from_doc(version_doc) + return glossary, version + + +async def ingest_new_version( + glossary_id: str, + source_locale_col: str, + file: UploadFile, + user_id: str, + change_note: str | None = None, +) -> GlossaryVersion: + """Add a new version to an existing glossary without replacing it as active.""" + db = await get_database() + + glossary_doc = await db[_COLL_GLOSSARIES].find_one({"_id": ObjectId(glossary_id)}) + if not glossary_doc: + raise ValueError(f"Glossary {glossary_id} not found") + + client_id = glossary_doc["client_id"] + + # Find next version number + last_version = await db[_COLL_VERSIONS].find_one( + {"glossary_id": glossary_id}, + sort=[("version_number", -1)], + ) + next_version_num = (last_version["version_number"] + 1) if last_version else 1 + + file_bytes = await file.read() + version_id = str(ObjectId()) + gcs_path = f"glossaries/{client_id}/{glossary_id}/{version_id}/source.xlsx" + await _upload_bytes_to_gcs(file_bytes, gcs_path, + content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") + + parsed_terms = parse_xlsx(file_bytes, source_locale_col) + + now = datetime.utcnow() + version_doc = { 
async def activate_version(glossary_id: str, version_id: str) -> None:
    """
    Set the active version of a glossary.

    The version must exist and belong to ``glossary_id``; otherwise the
    glossary could end up pointing at a missing or foreign version and term
    matching would silently return nothing. The pointer swap itself is a
    single-document update.

    Raises:
        ValueError: if the glossary does not exist, or if ``version_id`` is
            malformed, unknown, or belongs to another glossary.
    """
    db = await get_database()
    glossary_oid = ObjectId(glossary_id)

    # Check the glossary first so the original "not found" message is kept.
    glossary_exists = await db[_COLL_GLOSSARIES].find_one({"_id": glossary_oid}, {"_id": 1})
    if glossary_exists is None:
        raise ValueError(f"Glossary {glossary_id} not found")

    # Version docs store the glossary id as a plain string (see the ingest code).
    version_doc = None
    if ObjectId.is_valid(version_id):
        version_doc = await db[_COLL_VERSIONS].find_one(
            {"_id": ObjectId(version_id), "glossary_id": glossary_id},
            {"_id": 1},
        )
    if version_doc is None:
        raise ValueError(f"Version {version_id} not found for glossary {glossary_id}")

    result = await db[_COLL_GLOSSARIES].update_one(
        {"_id": glossary_oid},
        {"$set": {"current_version_id": version_id}},
    )
    # The glossary may have been deleted between the check and the update.
    if result.matched_count == 0:
        raise ValueError(f"Glossary {glossary_id} not found")
    # Invalidate Redis cache
    await _invalidate_cache(glossary_id)
async def _get_active_version_id(client_id: str) -> str | None:
    """
    Return the active version_id for the active glossary of a client, or None.

    Redis is tried first (key ``glossary:active_version:{client_id}``). On a
    miss the most recently created ACTIVE glossary is read from Mongo and its
    ``current_version_id`` is cached for one hour. Redis is best-effort: any
    cache failure is logged at debug level and the Mongo path is used.
    """
    cache_key = f"glossary:active_version:{client_id}"

    redis_client = None
    try:
        from ..core.redis import redis_client  # lazy import
        cached = await redis_client.get(cache_key)
        if cached:
            return cached.decode() if isinstance(cached, bytes) else cached
    except Exception as e:
        # Cache problems must never block glossary lookup.
        logger.debug(f"Glossary cache read skipped: {e}")

    db = await get_database()
    glossary_doc = await db[_COLL_GLOSSARIES].find_one(
        {"client_id": client_id, "status": GlossaryStatus.ACTIVE.value},
        sort=[("created_at", -1)],
    )
    if not glossary_doc or not glossary_doc.get("current_version_id"):
        return None

    version_id = glossary_doc["current_version_id"]

    # redis_client stays None when the lazy import itself failed.
    if redis_client is not None:
        try:
            await redis_client.setex(cache_key, 3600, version_id)
        except Exception as e:
            logger.debug(f"Glossary cache write skipped: {e}")

    return version_id
match_kind="exact", + score=1.0, + )) + + return matched + + +async def _vector_match( + db, + version_id: str, + text: str, + target_locale: str, + top_k: int = 20, + exclude_terms: set[str] | None = None, +) -> list[MatchedTerm]: + """Semantic search via Atlas Vector Search ($vectorSearch).""" + from ..services.embedding_service import embedding_service + + query_embedding = await embedding_service.embed_text(text[:2000]) # cap input length + + pipeline = [ + { + "$vectorSearch": { + "index": _VECTOR_INDEX, + "path": "embedding", + "queryVector": query_embedding, + "numCandidates": top_k * 4, + "limit": top_k, + "filter": {"version_id": version_id}, + } + }, + { + "$project": { + "source_term": 1, + "translations": 1, + "score": {"$meta": "vectorSearchScore"}, + } + }, + ] + + cursor = db[_COLL_TERMS].aggregate(pipeline) + results = await cursor.to_list(length=top_k) + + matched: list[MatchedTerm] = [] + for doc in results: + score = doc.get("score", 0.0) + if score < _VECTOR_SIMILARITY_THRESHOLD: + continue + source_term = doc["source_term"] + if exclude_terms and source_term.lower() in exclude_terms: + continue + target_text = _get_translation(doc["translations"], target_locale) + if not target_text: + continue + matched.append(MatchedTerm( + source_term=source_term, + target_translation=target_text, + match_kind="vector", + score=score, + )) + + return matched + + +def _get_translation(translations: dict[str, str], target_locale: str) -> str | None: + """Look up a translation with locale-fallback: fr-CA → fr-FR → fr → None.""" + if not translations: + return None + if target_locale in translations: + return translations[target_locale] + # Try parent language + parent = target_locale.split("-")[0] if "-" in target_locale else None + if parent: + # Try sibling locales, e.g. 
fr-CA not found → try fr-FR + for code, text in translations.items(): + if code.startswith(parent + "-") or code == parent: + return text + return None + + +# ── Prompt block ────────────────────────────────────────────────────────────── + +def build_glossary_prompt_block( + matched_terms: Sequence[MatchedTerm], + target_locale: str, +) -> str: + """ + Format matched terms for injection into a Gemini prompt. + Returns an empty string if no terms were matched. + """ + if not matched_terms: + return "" + + target_label = locale_lib.get_gemini_label(target_locale) + lines = [ + f"## Approved {target_label} terminology", + "Use these exact translations when the source terms appear — do not deviate:", + ] + for term in matched_terms: + lines.append(f'- "{term.source_term}" → "{term.target_translation}"') + + return "\n".join(lines) + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +async def _upload_bytes_to_gcs(data: bytes, gcs_path: str, content_type: str) -> None: + import asyncio + loop = asyncio.get_event_loop() + + def _upload() -> None: + from google.cloud import storage as gcs_storage + + from ..core.config import settings + client = gcs_storage.Client(project=settings.gcp_project_id) + bucket = client.bucket(settings.gcs_bucket) + blob = bucket.blob(gcs_path) + blob.content_type = content_type + blob.upload_from_string(data, content_type=content_type) + + await loop.run_in_executor(None, _upload) + + +async def _ensure_indexes(db) -> None: + try: + await db[_COLL_GLOSSARIES].create_index([("client_id", 1), ("status", 1)]) + await db[_COLL_VERSIONS].create_index([("glossary_id", 1), ("version_number", -1)]) + await db[_COLL_TERMS].create_index([("version_id", 1), ("source_term_lower", 1)]) + await db[_COLL_TERMS].create_index([("glossary_id", 1)]) + except Exception as e: + logger.debug(f"Index creation skipped (likely already exist): {e}") + + +# ── Task helpers ───────────────────────────────────────────────────────────── 
async def get_glossary_block_for_job(
    job_doc: dict,
    target_locale: str,
    db,
) -> str:
    """
    Build the glossary prompt block for a job, for use from Celery tasks.

    Resolution chain:
        job_doc["project_id"] → db.projects → client_id → active glossary version

    The text to match against is read from ``job_doc["_glossary_source_text"]``.
    Exact matches come first, then vector matches, capped at
    ``_MAX_TERMS_IN_PROMPT`` terms in total.

    Never raises: any failure is logged and "" is returned so the pipeline
    continues without a glossary.
    """
    try:
        project_id = job_doc.get("project_id")
        project = await db.projects.find_one({"_id": project_id}) if project_id else None
        client_id = project.get("client_id") if project else None
        if not client_id:
            return ""

        # Cache-backed lookup (reuses Redis if available)
        version_id = await _get_active_version_id(client_id)
        if not version_id:
            return ""

        haystack = job_doc.get("_glossary_source_text", "")
        if not haystack:
            return ""

        locale_code = locale_lib.normalize_code(target_locale)
        exact_hits = await _exact_match(db, version_id, haystack, locale_code)

        semantic_hits: list[MatchedTerm] = []
        # Only run the vector pass while there is still room in the prompt.
        if len(exact_hits) < _MAX_TERMS_IN_PROMPT:
            known = {hit.source_term.lower() for hit in exact_hits}
            try:
                semantic_hits = await _vector_match(
                    db,
                    version_id,
                    haystack,
                    locale_code,
                    top_k=_VECTOR_TOP_K,
                    exclude_terms=known,
                )
            except Exception as ve:
                logger.debug(f"Vector search skipped in task context: {ve}")

        selected = exact_hits + semantic_hits
        if len(selected) > _MAX_TERMS_IN_PROMPT:
            logger.info(f"glossary_terms_truncated: capped at {_MAX_TERMS_IN_PROMPT}")
            selected = selected[:_MAX_TERMS_IN_PROMPT]

        return build_glossary_prompt_block(selected, target_locale)

    except Exception as e:
        logger.warning(f"Glossary lookup failed for job {job_doc.get('_id')} (non-fatal): {e}")
        return ""
async def get_terms_page(
    version_id: str,
    search: str | None = None,
    page: int = 1,
    page_size: int = 50,
) -> tuple[list[GlossaryTerm], int]:
    """
    Return ``(terms, total_count)`` for the paginated UI preview.

    Args:
        version_id: Glossary version whose terms are listed.
        search: Optional substring filter, matched against the lower-cased
            source term (escaped, so it is treated literally).
        page: 1-based page number. Values below 1 are treated as 1.
        page_size: Terms per page. Values below 1 are treated as 1.

    Returns:
        The page of terms sorted by lower-cased source term, and the total
        number of terms matching the filter.
    """
    # A negative skip is rejected by the driver and limit=0 means "no limit",
    # so clamp both before building the query.
    page = max(page, 1)
    page_size = max(page_size, 1)

    db = await get_database()
    query: dict = {"version_id": version_id}
    if search:
        query["source_term_lower"] = {"$regex": re.escape(search.lower())}

    total = await db[_COLL_TERMS].count_documents(query)
    cursor = db[_COLL_TERMS].find(
        query,
        {"_id": 1, "source_term": 1, "translations": 1},
        skip=(page - 1) * page_size,
        limit=page_size,
        sort=[("source_term_lower", 1)],
    )
    docs = await cursor.to_list(length=page_size)
    terms = []
    for d in docs:
        d["_id"] = str(d["_id"])
        terms.append(GlossaryTerm.model_validate(d))
    return terms, total
settings.tts_provider - # Extract simple language code for Gemini (e.g., "en-US" -> "en") - simple_lang = language_code.split("-")[0] if "-" in language_code else language_code - # Try the configured provider first, then fallback if active_provider == "gemini" and self.gemini_available: try: logger.info( - f"Using Gemini TTS for language: {simple_lang}, voice: {voice_name}, " + f"Using Gemini TTS for language: {language_code}, voice: {voice_name}, " f"model: {model}, speed: {speed}x" ) return await gemini_tts_service.synthesize_audio_description( ad_vtt_content, - simple_lang, + language_code, voice_name, model=model, speed=speed, @@ -135,9 +132,6 @@ class TTSService: # Determine which provider to use active_provider = provider or settings.tts_provider - # Extract simple language code for Gemini (e.g., "en-US" -> "en") - simple_lang = language_code.split("-")[0] if "-" in language_code else language_code - # Parse VTT cues first cues = self._parse_ad_cues(ad_vtt_content) if not cues: @@ -169,7 +163,7 @@ class TTSService: if active_provider == "gemini" and self.gemini_available: audio_data = await gemini_tts_service.synthesize_text( text, voice_name or gemini_tts_service.default_voice, - simple_lang, model=model, speed=speed, style_prompt=style_prompt + language_code, model=model, speed=speed, style_prompt=style_prompt ) elif self.google_client: audio_data = await self._synthesize_text_google(text, language_code, voice_name) diff --git a/backend/app/tasks/__init__.py b/backend/app/tasks/__init__.py index de9d225..14e0287 100644 --- a/backend/app/tasks/__init__.py +++ b/backend/app/tasks/__init__.py @@ -128,6 +128,7 @@ def import_task_modules(): from . import notify # noqa: E402, F401 from . import ffmpeg_operations # noqa: E402, F401 from . import whisper_transcribe # noqa: E402, F401 + from . 
import embed_glossary # noqa: E402, F401 logger.info("Successfully imported all task modules") except Exception as e: logger.error(f"Error importing task modules: {e}") diff --git a/backend/app/tasks/embed_glossary.py b/backend/app/tasks/embed_glossary.py new file mode 100644 index 0000000..da90f20 --- /dev/null +++ b/backend/app/tasks/embed_glossary.py @@ -0,0 +1,102 @@ +""" +Celery task: compute and store Gemini embeddings for all terms in a glossary version. + +Runs as a background job after glossary ingestion so the API response is fast. +Processes terms in batches of 100 and updates embedded_count incrementally. +""" +from __future__ import annotations + +import asyncio + +from bson import ObjectId +from motor.motor_asyncio import AsyncIOMotorClient + +from ..core.config import settings +from ..core.logging import get_logger +from ..models.glossary import EmbeddingStatus +from . import celery_app + +logger = get_logger(__name__) + +_BATCH_SIZE = 100 + + +@celery_app.task(name="embed_glossary_version", bind=True, max_retries=3) +def embed_glossary_version_task(self, version_id: str) -> dict: + """ + Compute embeddings for all GlossaryTerms of `version_id`. + Updates embedded_count and embedding_status on the GlossaryVersion doc. 
+ """ + try: + result = asyncio.run(_async_embed_version(version_id)) + return result + except Exception as exc: + logger.error(f"embed_glossary_version_task failed for {version_id}: {exc}") + raise self.retry(exc=exc, countdown=60) from None + + +async def _async_embed_version(version_id: str) -> dict: + from ..services.embedding_service import embedding_service + + mongo_client = AsyncIOMotorClient(settings.mongodb_uri) + db = mongo_client[settings.mongodb_db] + + try: + # Mark in-progress + await db.glossary_versions.update_one( + {"_id": ObjectId(version_id)}, + {"$set": {"embedding_status": EmbeddingStatus.IN_PROGRESS.value}}, + ) + + # Fetch all terms without embeddings + cursor = db.glossary_terms.find( + {"version_id": version_id, "embedding": None}, + {"_id": 1, "source_term": 1}, + ) + terms = await cursor.to_list(length=None) + total = len(terms) + logger.info(f"Embedding {total} terms for version {version_id}") + + embedded_count = 0 + for i in range(0, total, _BATCH_SIZE): + batch = terms[i: i + _BATCH_SIZE] + texts = [t["source_term"] for t in batch] + ids = [t["_id"] for t in batch] + + embeddings = await embedding_service.embed_texts(texts) + + # Bulk update + ops = [] + from pymongo import UpdateOne + for term_id, embedding in zip(ids, embeddings, strict=False): + ops.append(UpdateOne({"_id": term_id}, {"$set": {"embedding": embedding}})) + + if ops: + await db.glossary_terms.bulk_write(ops, ordered=False) + + embedded_count += len(batch) + await db.glossary_versions.update_one( + {"_id": ObjectId(version_id)}, + {"$set": {"embedded_count": embedded_count}}, + ) + logger.info(f"Version {version_id}: embedded {embedded_count}/{total}") + + # Mark done + await db.glossary_versions.update_one( + {"_id": ObjectId(version_id)}, + {"$set": { + "embedding_status": EmbeddingStatus.DONE.value, + "embedded_count": total, + }}, + ) + logger.info(f"Embedding complete for version {version_id}: {total} terms") + return {"version_id": version_id, "total": total} 
+ + except Exception: + await db.glossary_versions.update_one( + {"_id": ObjectId(version_id)}, + {"$set": {"embedding_status": EmbeddingStatus.FAILED.value}}, + ) + raise + finally: + mongo_client.close() diff --git a/backend/app/tasks/translate_and_synthesize.py b/backend/app/tasks/translate_and_synthesize.py index 674f45a..4a84a32 100644 --- a/backend/app/tasks/translate_and_synthesize.py +++ b/backend/app/tasks/translate_and_synthesize.py @@ -219,6 +219,9 @@ async def _async_translate_and_synthesize(job_id: str): # Get translation mode (default to "traditional" for backwards compatibility) translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional") + + # Glossary: lazy-loaded per target language during the loop + from ..services.glossary_service import get_glossary_block_for_job logger.info(f"Translation mode for job {job_id}: {translation_mode}") sdh_requested = job_doc["requested_outputs"].get("sdh_vtt", False) @@ -293,12 +296,17 @@ async def _async_translate_and_synthesize(job_id: str): project_id=_cost_ctx["project_id"], ) + # Build glossary block from source VTT for this language + _job_for_glossary = {**job_doc, "_glossary_source_text": ""} + _glossary = await get_glossary_block_for_job(_job_for_glossary, lang, db) + async def extract_targeted(): return await gemini_service.extract_accessibility_targeted( video_local_path, lang, brand_context=job_brand_context, sdh_requested=sdh_requested, + glossary_block=_glossary, _cost_ctx=_cost_ctx, ) @@ -382,6 +390,9 @@ async def _async_translate_and_synthesize(job_id: str): logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)") else: + # Combine source VTTs for glossary term matching + _source_text_for_glossary = " ".join(filter(None, [source_captions_vtt, source_ad_vtt])) + # TRADITIONAL MODE: Process languages sequentially for language in target_languages: logger.info(f"Processing language: {language} (from source: {source_language}, mode: 
{translation_mode})") @@ -392,6 +403,10 @@ async def _async_translate_and_synthesize(job_id: str): project_id=_cost_ctx["project_id"], ) + # Lookup glossary terms for this target language + _job_for_glossary = {**job_doc, "_glossary_source_text": _source_text_for_glossary} + _glossary = await get_glossary_block_for_job(_job_for_glossary, language, db) + try: if language in transcreation_languages: # TRADITIONAL MODE with transcreation: cultural adaptation @@ -401,6 +416,7 @@ async def _async_translate_and_synthesize(job_id: str): source_ad_vtt, language, brief="Standard accessibility content", + glossary_block=_glossary, _cost_ctx=_cost_ctx, ) @@ -414,12 +430,14 @@ async def _async_translate_and_synthesize(job_id: str): async def translate_captions(): return await gemini_service.translate_vtt( source_captions_vtt, language, source_language=source_language, + glossary_block=_glossary, _cost_ctx=_cost_ctx, ) async def translate_ad(): return await gemini_service.translate_vtt( source_ad_vtt, language, source_language=source_language, + glossary_block=_glossary, _cost_ctx=_cost_ctx, ) @@ -448,6 +466,7 @@ async def _async_translate_and_synthesize(job_id: str): async def translate_sdh(): return await gemini_service.translate_vtt( source_sdh_vtt, language, source_language=source_language, + glossary_block=_glossary, _cost_ctx=_cost_ctx, ) translated_sdh = await retry_with_backoff(translate_sdh, max_retries=3) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 943a1b2..bda03bf 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -42,6 +42,8 @@ python-magic = "^0.4.27" aiohttp = "^3.12.15" jinja2 = "^3.1.6" audioop-lts = {version = "^0.2.2", python = ">=3.13"} +openpyxl = "^3.1.2" +pyahocorasick = "^2.1.1" [tool.poetry.group.dev.dependencies] pytest = "^7.4.3" diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index d49867e..d444ced 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -18,6 +18,9 @@ import { UserList } from 
'./routes/admin/UserList'; import { UserDetail } from './routes/admin/UserDetail'; import { ClientList } from './routes/admin/ClientList'; import { ClientDetail } from './routes/admin/ClientDetail'; +import { GlossaryList } from './routes/admin/glossaries/GlossaryList'; +import { GlossaryUpload } from './routes/admin/glossaries/GlossaryUpload'; +import { GlossaryDetail } from './routes/admin/glossaries/GlossaryDetail'; import { AuditLog } from './routes/admin/AuditLog'; import { LinguistQueue } from './routes/jobs/LinguistQueue'; import { Downloads } from './routes/Downloads'; @@ -149,6 +152,27 @@ function AppContent() { } /> + + + + + + } /> + + + + + + } /> + + + + + + } /> diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index b506dd7..7995106 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -59,6 +59,10 @@ import type { LanguageQCMapResponse, LanguageQCStateResponse, QueueResponse, + Glossary, + GlossaryDetail, + GlossaryVersion, + GlossaryTermsResponse, } from '../types/api'; const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || 'http://localhost:8000'; @@ -761,6 +765,84 @@ class ApiClient { const r = await this.client.get(`/me/language-qc-queue?${params.toString()}`); return r.data; } + + // ── Glossary endpoints ────────────────────────────────────────────────────── + + async getGlossaries(clientId: string): Promise { + const r = await this.client.get(`/clients/${clientId}/glossaries`); + return r.data; + } + + async getGlossary(clientId: string, glossaryId: string): Promise { + const r = await this.client.get(`/clients/${clientId}/glossaries/${glossaryId}`); + return r.data; + } + + async uploadGlossary( + clientId: string, + file: File, + name: string, + sourceLocale: string, + sourceLocaleCol: string, + description?: string, + changeNote?: string, + ): Promise { + const form = new FormData(); + form.append('file', file); + form.append('name', name); + form.append('source_locale', sourceLocale); + 
form.append('source_locale_col', sourceLocaleCol); + if (description) form.append('description', description); + if (changeNote) form.append('change_note', changeNote); + const r = await this.client.post(`/clients/${clientId}/glossaries`, form, { + headers: { 'Content-Type': 'multipart/form-data' }, + timeout: 120000, + }); + return r.data; + } + + async uploadGlossaryVersion( + clientId: string, + glossaryId: string, + file: File, + sourceLocaleCol: string, + changeNote?: string, + ): Promise { + const form = new FormData(); + form.append('file', file); + form.append('source_locale_col', sourceLocaleCol); + if (changeNote) form.append('change_note', changeNote); + const r = await this.client.post(`/clients/${clientId}/glossaries/${glossaryId}/versions`, form, { + headers: { 'Content-Type': 'multipart/form-data' }, + timeout: 120000, + }); + return r.data; + } + + async activateGlossaryVersion(clientId: string, glossaryId: string, versionId: string): Promise<{ status: string; active_version_id: string }> { + const form = new FormData(); + form.append('version_id', versionId); + const r = await this.client.post(`/clients/${clientId}/glossaries/${glossaryId}/activate`, form); + return r.data; + } + + async getGlossaryTerms( + clientId: string, + glossaryId: string, + opts?: { versionId?: string; search?: string; page?: number; pageSize?: number }, + ): Promise { + const params = new URLSearchParams(); + if (opts?.versionId) params.append('version_id', opts.versionId); + if (opts?.search) params.append('search', opts.search); + if (opts?.page) params.append('page', String(opts.page)); + if (opts?.pageSize) params.append('page_size', String(opts.pageSize)); + const r = await this.client.get(`/clients/${clientId}/glossaries/${glossaryId}/terms?${params.toString()}`); + return r.data; + } + + async archiveGlossary(clientId: string, glossaryId: string): Promise { + await this.client.delete(`/clients/${clientId}/glossaries/${glossaryId}`); + } } export const apiClient = 
new ApiClient(); diff --git a/frontend/src/routes/admin/ClientDetail.tsx b/frontend/src/routes/admin/ClientDetail.tsx index 84c2875..43ae232 100644 --- a/frontend/src/routes/admin/ClientDetail.tsx +++ b/frontend/src/routes/admin/ClientDetail.tsx @@ -1,5 +1,8 @@ import { useState } from 'react'; -import { useParams } from 'react-router-dom'; +import { useParams, Link } from 'react-router-dom'; +import { useQuery } from '@tanstack/react-query'; +import { apiClient } from '../../lib/api'; +import type { Glossary } from '../../types/api'; import { useClient, useTeams, useCreateTeam, useUpdateTeam, useDeleteTeam, @@ -50,6 +53,12 @@ export function ClientDetail() { const [pmUserId, setPmUserId] = useState(''); + const { data: glossaries = [] } = useQuery({ + queryKey: ['glossaries', clientId], + queryFn: () => apiClient.getGlossaries(clientId!), + enabled: !!clientId, + }); + if (clientLoading) { return
; } @@ -337,6 +346,53 @@ export function ClientDetail() { + {/* Glossaries */} +
+
+

Glossaries

+ + View all → + +
+ {glossaries.length === 0 ? ( +

No glossaries yet

+ ) : ( +
+ {glossaries.slice(0, 3).map(g => ( +
+ + {g.name} + + + {g.status} + +
+ ))} + {glossaries.length > 3 && ( +

+{glossaries.length - 3} more

+ )} +
+ )} + {(isAdmin || user?.role === 'project_manager') && ( +
+ + + Upload glossary + +
+ )} +
+ {/* Rename team modal */} {editingTeam && (
diff --git a/frontend/src/routes/admin/glossaries/GlossaryDetail.tsx b/frontend/src/routes/admin/glossaries/GlossaryDetail.tsx new file mode 100644 index 0000000..622126c --- /dev/null +++ b/frontend/src/routes/admin/glossaries/GlossaryDetail.tsx @@ -0,0 +1,335 @@ +import { useState, useRef } from 'react'; +import { useParams, Link } from 'react-router-dom'; +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; +import { apiClient } from '../../../lib/api'; +import { useToastContext } from '../../../contexts/ToastContext'; +import { useAuthStore } from '../../../lib/auth'; +import type { GlossaryVersion, GlossaryDetail as GlossaryDetailType } from '../../../types/api'; + +type Tab = 'terms' | 'versions'; + +function EmbeddingPill({ v }: { v: GlossaryVersion }) { + const pct = v.term_count > 0 ? Math.round((v.embedded_count / v.term_count) * 100) : 0; + switch (v.embedding_status) { + case 'done': return Embedded ({v.embedded_count}/{v.term_count}); + case 'in_progress': return ( + + Embedding… {pct}% ({v.embedded_count}/{v.term_count}) + + ); + case 'failed': return Embedding failed; + default: return Pending embedding; + } +} + +export function GlossaryDetail() { + const { clientId, glossaryId } = useParams<{ clientId: string; glossaryId: string }>(); + const { user } = useAuthStore(); + const toast = useToastContext(); + const qc = useQueryClient(); + const isAdmin = user?.role === 'admin'; + const isPM = user?.role === 'project_manager'; + + const [tab, setTab] = useState('terms'); + const [search, setSearch] = useState(''); + const [page, setPage] = useState(1); + + // New version upload state + const [showVersionUpload, setShowVersionUpload] = useState(false); + const [versionFile, setVersionFile] = useState(null); + const [versionSourceCol, setVersionSourceCol] = useState(''); + const [versionChangeNote, setVersionChangeNote] = useState(''); + const versionFileRef = useRef(null); + + const PAGE_SIZE = 50; + + const { data: glossary, 
isLoading } = useQuery({ + queryKey: ['glossary', clientId, glossaryId], + queryFn: () => apiClient.getGlossary(clientId!, glossaryId!), + enabled: !!clientId && !!glossaryId, + refetchInterval: (q) => { + const g = q.state.data as GlossaryDetailType | undefined; + if (!g) return false; + const hasInProgress = g.versions.some(v => v.embedding_status === 'in_progress' || v.embedding_status === 'pending'); + return hasInProgress ? 5000 : false; + }, + }); + + const { data: termsData, isLoading: termsLoading } = useQuery({ + queryKey: ['glossary-terms', clientId, glossaryId, search, page], + queryFn: () => apiClient.getGlossaryTerms(clientId!, glossaryId!, { search: search || undefined, page, pageSize: PAGE_SIZE }), + enabled: !!clientId && !!glossaryId && tab === 'terms', + placeholderData: (prev) => prev, + }); + + const activateMut = useMutation({ + mutationFn: (versionId: string) => apiClient.activateGlossaryVersion(clientId!, glossaryId!, versionId), + onSuccess: () => { + qc.invalidateQueries({ queryKey: ['glossary', clientId, glossaryId] }); + qc.invalidateQueries({ queryKey: ['glossaries', clientId] }); + toast.success('Version activated'); + }, + onError: () => toast.error('Failed to activate version'), + }); + + const uploadVersionMut = useMutation({ + mutationFn: () => apiClient.uploadGlossaryVersion(clientId!, glossaryId!, versionFile!, versionSourceCol.trim(), versionChangeNote.trim() || undefined), + onSuccess: () => { + qc.invalidateQueries({ queryKey: ['glossary', clientId, glossaryId] }); + setShowVersionUpload(false); + setVersionFile(null); + setVersionSourceCol(''); + setVersionChangeNote(''); + toast.success('New version uploaded — embedding in background'); + }, + onError: (err: unknown) => { + const msg = (err as { response?: { data?: { detail?: string } } })?.response?.data?.detail ?? 'Upload failed'; + toast.error(msg); + }, + }); + + if (isLoading || !glossary) { + return ( +
+
+
+
+ ); + } + + const activeVersion = glossary.versions.find(v => v.id === glossary.current_version_id); + const totalPages = termsData ? Math.ceil(termsData.total / PAGE_SIZE) : 1; + + return ( +
+ {/* Header */} +
+

+ ← Glossaries +

+
+
+

{glossary.name}

+ {glossary.description &&

{glossary.description}

} +

+ Source: {glossary.source_locale} + {activeVersion && ( + <> · Active: v{activeVersion.version_number} · {activeVersion.term_count.toLocaleString()} terms + )} +

+ {activeVersion && ( +
+ )} +
+ {(isAdmin || isPM) && ( + + )} +
+
+ + {/* New version upload panel */} + {showVersionUpload && ( +
+

Upload new version

+
versionFileRef.current?.click()} + > + setVersionFile(e.target.files?.[0] ?? null)} + /> + {versionFile + ?

{versionFile.name}

+ :

Click to select .xlsx file

+ } +
+
+ + setVersionSourceCol(e.target.value)} + placeholder="e.g. en_gb" + className="w-full border border-gray-300 rounded-lg px-3 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500" + /> +
+
+ + setVersionChangeNote(e.target.value)} + placeholder="e.g. Updated Q2 terms" + className="w-full border border-gray-300 rounded-lg px-3 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500" + /> +
+
+ + +
+
+ )} + + {/* Tabs */} +
+ +
+ + {/* Terms tab */} + {tab === 'terms' && ( +
+ { setSearch(e.target.value); setPage(1); }} + placeholder="Search terms…" + className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500" + /> + + {termsLoading ? ( +
+ {[1,2,3,4,5].map(i =>
)} +
+ ) : termsData?.terms.length === 0 ? ( +

+ {search ? 'No terms match your search' : 'No terms in this glossary yet'} +

+ ) : ( + <> +
{termsData?.total.toLocaleString()} terms total
+
+ + + + + + + + + {termsData?.terms.map((term, idx) => ( + + + + + ))} + +
Source termTranslations
{term.source_term} +
+ {Object.entries(term.translations).slice(0, 6).map(([locale, text]) => ( + + {locale} + {text} + + ))} + {Object.keys(term.translations).length > 6 && ( + +{Object.keys(term.translations).length - 6} more + )} +
+
+
+ + {totalPages > 1 && ( +
+ + Page {page} of {totalPages} + +
+ )} + + )} +
+ )} + + {/* Versions tab */} + {tab === 'versions' && ( +
+ {glossary.versions.map((v) => { + const isActive = v.id === glossary.current_version_id; + return ( +
+
+
+ Version {v.version_number} + {isActive && ( + Active + )} +
+

+ {v.term_count.toLocaleString()} terms · uploaded {new Date(v.created_at).toLocaleDateString()} +

+ {v.change_note &&

"{v.change_note}"

} +
+
+ {(isAdmin || isPM) && !isActive && ( + + )} +
+ ); + })} +
+ )} +
+ ); +} diff --git a/frontend/src/routes/admin/glossaries/GlossaryList.tsx b/frontend/src/routes/admin/glossaries/GlossaryList.tsx new file mode 100644 index 0000000..31c717a --- /dev/null +++ b/frontend/src/routes/admin/glossaries/GlossaryList.tsx @@ -0,0 +1,131 @@ +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; +import { Link, useParams } from 'react-router-dom'; +import { apiClient } from '../../../lib/api'; +import { useToastContext } from '../../../contexts/ToastContext'; +import { useAuthStore } from '../../../lib/auth'; +import type { Glossary } from '../../../types/api'; + +function statusBadge(status: string) { + return status === 'active' + ? 'bg-green-100 text-green-700' + : 'bg-gray-100 text-gray-500'; +} + +function embeddingBadge(status: string) { + switch (status) { + case 'done': return Embedded ✓; + case 'in_progress': return Embedding…; + case 'failed': return Embed failed; + default: return Pending embed; + } +} + +export function GlossaryList() { + const { clientId } = useParams<{ clientId: string }>(); + const { user } = useAuthStore(); + const toast = useToastContext(); + const qc = useQueryClient(); + const isAdmin = user?.role === 'admin'; + const isPM = user?.role === 'project_manager'; + + const { data: glossaries = [], isLoading } = useQuery({ + queryKey: ['glossaries', clientId], + queryFn: () => apiClient.getGlossaries(clientId!), + enabled: !!clientId, + refetchInterval: (q) => { + const data = q.state.data; + return Array.isArray(data) ? 5000 : false; + }, + }); + + const archiveMut = useMutation({ + mutationFn: (id: string) => apiClient.archiveGlossary(clientId!, id), + onSuccess: () => { + qc.invalidateQueries({ queryKey: ['glossaries', clientId] }); + toast.success('Glossary archived'); + }, + onError: () => toast.error('Failed to archive glossary'), + }); + + if (isLoading) { + return ( +
+ {[1, 2].map(i =>
)} +
+ ); + } + + return ( +
+
+
+

+ ← Client +

+

Glossaries

+
+ {(isAdmin || isPM) && ( + + + Upload glossary + + )} +
+ + {glossaries.length === 0 ? ( +
+

No glossaries yet

+ {(isAdmin || isPM) && ( + + Upload the first glossary + + )} +
+ ) : ( +
+ {glossaries.map((g) => ( +
+
+
+ + {g.name} + + + {g.status} + +
+ {g.description &&

{g.description}

} +

+ Source: {g.source_locale} + {' · '}Created {new Date(g.created_at).toLocaleDateString()} +

+
+
+
+ {g.current_version_id ? embeddingBadge('') : null} +
+ {isAdmin && g.status === 'active' && ( + + )} +
+
+ ))} +
+ )} +
+ ); +} diff --git a/frontend/src/routes/admin/glossaries/GlossaryUpload.tsx b/frontend/src/routes/admin/glossaries/GlossaryUpload.tsx new file mode 100644 index 0000000..362a6f3 --- /dev/null +++ b/frontend/src/routes/admin/glossaries/GlossaryUpload.tsx @@ -0,0 +1,204 @@ +import { useState, useRef } from 'react'; +import { useNavigate, useParams, Link } from 'react-router-dom'; +import { useMutation, useQueryClient } from '@tanstack/react-query'; +import { apiClient } from '../../../lib/api'; +import { useToastContext } from '../../../contexts/ToastContext'; + +const KNOWN_LOCALES: { code: string; label: string }[] = [ + { code: 'en-GB', label: 'English (UK)' }, + { code: 'en-US', label: 'English (US)' }, + { code: 'en-CA', label: 'English (Canada)' }, + { code: 'de-DE', label: 'German' }, + { code: 'fr-FR', label: 'French (France)' }, + { code: 'fr-CA', label: 'French (Canada)' }, + { code: 'es-ES', label: 'Spanish (Spain)' }, + { code: 'es-MX', label: 'Spanish (Mexico)' }, + { code: 'es-419', label: 'Spanish (Latin America)' }, + { code: 'it-IT', label: 'Italian' }, + { code: 'pt-BR', label: 'Portuguese (Brazil)' }, + { code: 'pt-PT', label: 'Portuguese (Portugal)' }, + { code: 'nl-NL', label: 'Dutch' }, + { code: 'pl-PL', label: 'Polish' }, + { code: 'cs-CZ', label: 'Czech' }, + { code: 'tr-TR', label: 'Turkish' }, + { code: 'ko-KR', label: 'Korean' }, + { code: 'ja-JP', label: 'Japanese' }, + { code: 'id-ID', label: 'Indonesian' }, +]; + +export function GlossaryUpload() { + const { clientId } = useParams<{ clientId: string }>(); + const navigate = useNavigate(); + const toast = useToastContext(); + const qc = useQueryClient(); + + const [file, setFile] = useState(null); + const [name, setName] = useState(''); + const [sourceLocale, setSourceLocale] = useState('en-GB'); + const [sourceLocaleCol, setSourceLocaleCol] = useState(''); + const [description, setDescription] = useState(''); + const [changeNote, setChangeNote] = useState(''); + const [dragOver, 
setDragOver] = useState(false); + const fileInputRef = useRef(null); + + const uploadMut = useMutation({ + mutationFn: () => apiClient.uploadGlossary( + clientId!, + file!, + name.trim(), + sourceLocale, + sourceLocaleCol.trim(), + description.trim() || undefined, + changeNote.trim() || undefined, + ), + onSuccess: (g) => { + qc.invalidateQueries({ queryKey: ['glossaries', clientId] }); + toast.success(`Glossary "${g.name}" uploaded — embedding in background`); + navigate(`/admin/clients/${clientId}/glossaries/${g.id}`); + }, + onError: (err: unknown) => { + const msg = (err as { response?: { data?: { detail?: string } } })?.response?.data?.detail ?? 'Upload failed'; + toast.error(msg); + }, + }); + + const handleDrop = (e: React.DragEvent) => { + e.preventDefault(); + setDragOver(false); + const f = e.dataTransfer.files[0]; + if (f && f.name.endsWith('.xlsx')) setFile(f); + else toast.error('Only .xlsx files are accepted'); + }; + + const canSubmit = !!file && !!name.trim() && !!sourceLocale && !!sourceLocaleCol.trim() && !uploadMut.isPending; + + return ( +
+
+

+ ← Glossaries +

+

Upload glossary

+

Upload an xlsx file with terminology translations.

+
+ + {/* Drop zone */} +
fileInputRef.current?.click()} + onDragOver={(e) => { e.preventDefault(); setDragOver(true); }} + onDragLeave={() => setDragOver(false)} + onDrop={handleDrop} + > + { + const f = e.target.files?.[0]; + if (f) setFile(f); + }} + /> + {file ? ( +
+

{file.name}

+

{(file.size / 1024 / 1024).toFixed(1)} MB

+ +
+ ) : ( +
+

Drop .xlsx file here or click to browse

+

Max 50 MB

+
+ )} +
+ + {/* Form fields */} +
+
+ + setName(e.target.value)} + placeholder="e.g. 3M Master Terminology" + className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500" + /> +
+ +
+ + +
+ +
+ + setSourceLocaleCol(e.target.value)} + placeholder="e.g. en_gb or English (GB)" + className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500" + /> +

Must exactly match the column header in the xlsx file (case-insensitive).

+
+ +
+ +