video-accessibility/backend/app/lib/locales.py
Vadym Samoilenko fa351e4d25 feat: per-client glossary — hybrid exact/vector retrieval + AI injection
Adds full glossary system so Gemini uses client-approved terminology
when generating subtitles and translations (critical for 3M brand names
and product codes across 16 target locales).

Backend:
- lib/locales.py: BCP-47 locale registry, normalises xlsx fr_fr → fr-FR
- models/glossary.py: Glossary / GlossaryVersion / GlossaryTerm + enums
- services/glossary_service.py: xlsx parse (openpyxl), ingest to Mongo,
  hybrid retrieval (Aho-Corasick exact + Atlas Vector Search), prompt block
- services/embedding_service.py: Gemini text-embedding-004, batch 100, retry
- tasks/embed_glossary.py: Celery background task for async embedding
- api/v1/routes_glossaries.py: CRUD endpoints under /clients/{id}/glossaries
- gemini.py: _build_glossary_block(), {GLOSSARY} injection in all 4 call sites
- tts.py / gemini_tts.py: pass full locale codes (no split("-")[0] truncation)
- tasks/translate_and_synthesize.py: glossary lookup + injection per language
- prompts: {GLOSSARY} placeholder in ingestion, targeted, transcreation prompts
- pyproject.toml: +openpyxl, +pyahocorasick

Frontend:
- routes/admin/glossaries/: GlossaryList, GlossaryUpload, GlossaryDetail
- App.tsx: 3 new routes under /admin/clients/:clientId/glossaries
- ClientDetail.tsx: Glossaries card with count + quick links
- types/api.ts: Glossary, GlossaryVersion, GlossaryDetail, GlossaryTerm types
- lib/api.ts: 7 new API methods (upload, list, detail, terms, versions, activate, archive)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-29 13:03:38 +01:00

245 lines
16 KiB
Python

"""
Central locale registry.
Provides a single source of truth for BCP-47 codes, display names,
and Gemini-friendly labels used throughout the translation/TTS pipeline.
Convention: BCP-47 with hyphen separator (fr-FR, en-GB, pt-BR).
xlsx underscore format (fr_fr, en_gb) is normalized at import time.
Bare language-only codes (fr, en) remain valid for legacy compat.
"""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class Locale:
    """Immutable record describing one entry of the locale registry."""

    # Canonical BCP-47 code, e.g. "fr-FR".
    code: str
    # Human-readable name, e.g. "French (France)".
    display_name: str
    # Label passed to Gemini prompts, e.g. "French (France)".
    gemini_label: str
    # BCP-47 code for the TTS API; may differ from `code` (e.g. es-MX → es-US).
    tts_lang: str
    # Sample sentence used for the TTS preview.
    preview_sample: str
# Master locale registry, keyed by canonical code.
# Contains bare language codes (legacy) plus explicit region variants.
# Positional fields are: code, display_name, gemini_label, tts_lang, preview_sample.
# Preview samples are sent to TTS, so they are spelled with full diacritics;
# a bare language code uses the same sentence as its default regional variant.
_REGISTRY: dict[str, Locale] = {loc.code: loc for loc in [
    # ── English ──────────────────────────────────────────────────────────────
    Locale("en", "English", "English", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-US", "English (US)", "English (United States)", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-GB", "English (UK)", "English (United Kingdom)", "en-GB",
           "This is a preview of the audio description voice."),
    Locale("en-CA", "English (Canada)", "English (Canada)", "en-CA",
           "This is a preview of the audio description voice."),
    # ── Spanish ──────────────────────────────────────────────────────────────
    Locale("es", "Spanish", "Spanish", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-ES", "Spanish (Spain)", "Spanish (Spain)", "es-ES",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-MX", "Spanish (Mexico)", "Spanish (Mexico)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-419", "Spanish (Latin America)", "Spanish (Latin America)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    # ── French ───────────────────────────────────────────────────────────────
    Locale("fr", "French", "French", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-FR", "French (France)", "French (France)", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-CA", "French (Canada)", "French (Canada)", "fr-CA",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    # ── German ───────────────────────────────────────────────────────────────
    Locale("de", "German", "German", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    Locale("de-DE", "German (Germany)", "German (Germany)", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    # ── Italian ──────────────────────────────────────────────────────────────
    Locale("it", "Italian", "Italian", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    Locale("it-IT", "Italian (Italy)", "Italian (Italy)", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    # ── Portuguese ───────────────────────────────────────────────────────────
    Locale("pt", "Portuguese", "Portuguese", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-BR", "Portuguese (Brazil)", "Portuguese (Brazil)", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-PT", "Portuguese (Portugal)", "Portuguese (Portugal)", "pt-PT",
           "Esta é uma pré-visualização da voz da audiodescrição."),
    # ── Japanese ─────────────────────────────────────────────────────────────
    Locale("ja", "Japanese", "Japanese", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    Locale("ja-JP", "Japanese (Japan)", "Japanese (Japan)", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    # ── Korean ───────────────────────────────────────────────────────────────
    Locale("ko", "Korean", "Korean", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    Locale("ko-KR", "Korean (Korea)", "Korean (South Korea)", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    # ── Arabic ───────────────────────────────────────────────────────────────
    Locale("ar", "Arabic", "Arabic", "ar-EG",
           "هذه معاينة لصوت الوصف الصوتي."),
    # ── Hindi ────────────────────────────────────────────────────────────────
    Locale("hi", "Hindi", "Hindi", "hi-IN",
           "यह ऑडियो विवरण आवाज का पूर्वावलोकन है।"),
    # ── Indonesian ───────────────────────────────────────────────────────────
    Locale("id", "Indonesian", "Indonesian", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    Locale("id-ID", "Indonesian (Indonesia)", "Indonesian (Indonesia)", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    # ── Dutch ────────────────────────────────────────────────────────────────
    Locale("nl", "Dutch", "Dutch", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    Locale("nl-NL", "Dutch (Netherlands)", "Dutch (Netherlands)", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    # ── Polish ───────────────────────────────────────────────────────────────
    Locale("pl", "Polish", "Polish", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    Locale("pl-PL", "Polish (Poland)", "Polish (Poland)", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    # ── Russian ──────────────────────────────────────────────────────────────
    Locale("ru", "Russian", "Russian", "ru-RU",
           "Это предварительный просмотр голоса аудиоописания."),
    # ── Thai ─────────────────────────────────────────────────────────────────
    Locale("th", "Thai", "Thai", "th-TH",
           "นี่คือตัวอย่างเสียงบรรยายภาพ"),
    # ── Turkish ──────────────────────────────────────────────────────────────
    Locale("tr", "Turkish", "Turkish", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    Locale("tr-TR", "Turkish (Turkey)", "Turkish (Turkey)", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    # ── Vietnamese ───────────────────────────────────────────────────────────
    Locale("vi", "Vietnamese", "Vietnamese", "vi-VN",
           "Đây là bản xem trước giọng mô tả âm thanh."),
    # ── Romanian ─────────────────────────────────────────────────────────────
    Locale("ro", "Romanian", "Romanian", "ro-RO",
           "Aceasta este o previzualizare a vocii descrierii audio."),
    # ── Ukrainian ────────────────────────────────────────────────────────────
    Locale("uk", "Ukrainian", "Ukrainian", "uk-UA",
           "Це попередній перегляд голосу аудіоопису."),
    # ── Bengali ──────────────────────────────────────────────────────────────
    Locale("bn", "Bengali", "Bengali", "bn-BD",
           "এটি অডিও বর্ণনা ভয়েসের একটি প্রিভিউ।"),
    # ── Marathi ──────────────────────────────────────────────────────────────
    Locale("mr", "Marathi", "Marathi", "mr-IN",
           "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे."),
    # ── Tamil ────────────────────────────────────────────────────────────────
    Locale("ta", "Tamil", "Tamil", "ta-IN",
           "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்."),
    # ── Telugu ───────────────────────────────────────────────────────────────
    Locale("te", "Telugu", "Telugu", "te-IN",
           "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ."),
    # ── Chinese ──────────────────────────────────────────────────────────────
    Locale("zh", "Chinese", "Chinese (Simplified)", "zh-CN",
           "这是音频描述语音的预览。"),
    # ── Czech ────────────────────────────────────────────────────────────────
    Locale("cs", "Czech", "Czech", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    Locale("cs-CZ", "Czech (Czech Republic)", "Czech (Czech Republic)", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    # ── Danish ───────────────────────────────────────────────────────────────
    Locale("da", "Danish", "Danish", "da-DK",
           "Dette er en forhåndsvisning af lydbeskrivelsesstemmen."),
    # ── Finnish ──────────────────────────────────────────────────────────────
    Locale("fi", "Finnish", "Finnish", "fi-FI",
           "Tämä on äänikuvauksen äänen esikatselu."),
    # ── Hungarian ────────────────────────────────────────────────────────────
    Locale("hu", "Hungarian", "Hungarian", "hu-HU",
           "Ez a hangos leírás hangjának előnézete."),
    # ── Norwegian ────────────────────────────────────────────────────────────
    Locale("no", "Norwegian", "Norwegian", "nb-NO",
           "Dette er en forhåndsvisning av lydbeskrivelsesstemmen."),
    # ── Slovak ───────────────────────────────────────────────────────────────
    Locale("sk", "Slovak", "Slovak", "sk-SK",
           "Toto je náhľad hlasu zvukového popisu."),
    # ── Swedish ──────────────────────────────────────────────────────────────
    Locale("sv", "Swedish", "Swedish", "sv-SE",
           "Det här är en förhandsgranskning av ljudbeskrivningsrösten."),
]}
# Lookup from xlsx-style column headers (lowercase, underscore-separated)
# to the canonical hyphenated code, derived from every regional registry entry.
_XLSX_ALIASES: dict[str, str] = {
    canonical.lower().replace("-", "_"): canonical
    for canonical in _REGISTRY
    if "-" in canonical
}
# Edge case: the Indonesian column header is just "id" (no region).
_XLSX_ALIASES["id"] = "id"
def normalize_code(code: str) -> str:
    """
    Normalize an arbitrary locale code to the canonical form used in this registry.

    Handles:
    - xlsx underscore form: "fr_fr" → "fr-FR"
    - Bare language code: "fr" → "fr" (passthrough, legacy compat)
    - Already canonical: "fr-FR" → "fr-FR"

    Surrounding whitespace is ignored. Codes that are not in the alias table
    are still case-canonicalised (language part lowercase, remainder
    uppercase), e.g. "fr_be" → "fr-BE", so a caller can split on "-" to
    recover the bare language.

    Args:
        code: Locale code in any of the forms above.

    Returns:
        The normalized code. Falsy input (e.g. "") is returned unchanged.
    """
    if not code:
        return code
    stripped = code.strip()
    lowered = stripped.lower()
    if "_" in lowered:
        # e.g. "fr_fr" -> check the alias table first
        alias = _XLSX_ALIASES.get(lowered)
        if alias is not None:
            return alias
        if len(lowered) <= 3:
            # Too short to be a language_REGION pair; pass it through.
            return stripped
        # Unknown underscore code: treat it like its hyphenated equivalent
        # instead of upper-casing the language part as well.
        lowered = lowered.replace("_", "-")
    if "-" in lowered:
        # Hyphen form: lowercase language, uppercase everything after it.
        language, _, remainder = lowered.partition("-")
        return f"{language}-{remainder.upper()}"
    # Bare language code: return lowercased (legacy)
    return lowered
def get(code: str) -> Locale | None:
    """Look up the Locale for *code*.

    The normalized code is tried first, then the part before its first
    hyphen. Returns None when neither is registered.
    """
    normalized = normalize_code(code)
    exact = _REGISTRY.get(normalized)
    if exact:
        return exact
    language = normalized.split("-")[0]
    return _REGISTRY.get(language)
def get_display_name(code: str) -> str:
    """Return the human-readable name (e.g. 'French (Canada)'), or *code* itself if unknown."""
    match = get(code)
    if not match:
        return code
    return match.display_name
def get_gemini_label(code: str) -> str:
    """
    Return the label to place inside Gemini prompts, e.g. 'French (Canada)'.

    Human-readable language names are used because Gemini models respond
    more reliably to them than to bare BCP-47 codes inside instruction
    prompts. Unknown codes are returned unchanged.
    """
    match = get(code)
    if not match:
        return code
    return match.gemini_label
def get_tts_lang(code: str) -> str:
    """Return the BCP-47 code for the TTS API, or *code* itself if unknown.

    The TTS code may differ from the canonical one (e.g. es-MX → es-US).
    """
    match = get(code)
    if not match:
        return code
    return match.tts_lang
def get_preview_sample(code: str) -> str:
    """Return a TTS preview sentence in the language of *code*.

    Falls back to the parent language's sample, then to the English sample.
    """
    found = get(code)
    if found:
        return found.preview_sample
    # Not resolved directly: retry with the part before the first hyphen.
    if "-" in code:
        parent = get(code.split("-")[0])
        if parent:
            return parent.preview_sample
    return "This is a preview of the audio description voice."
def all_codes() -> list[str]:
    """Return every registered locale code in sorted order."""
    return sorted(_REGISTRY)
def all_display_map() -> dict[str, str]:
    """Return a {code: display_name} mapping covering every registered locale."""
    return {
        key: entry.display_name
        for key, entry in _REGISTRY.items()
    }