Adds full glossary system so Gemini uses client-approved terminology
when generating subtitles and translations (critical for 3M brand names
and product codes across 16 target locales).
Backend:
- lib/locales.py: BCP-47 locale registry, normalises xlsx fr_fr → fr-FR
- models/glossary.py: Glossary / GlossaryVersion / GlossaryTerm + enums
- services/glossary_service.py: xlsx parse (openpyxl), ingest to Mongo,
hybrid retrieval (Aho-Corasick exact + Atlas Vector Search), prompt block
- services/embedding_service.py: Gemini text-embedding-004, batch 100, retry
- tasks/embed_glossary.py: Celery background task for async embedding
- api/v1/routes_glossaries.py: CRUD endpoints under /clients/{id}/glossaries
- gemini.py: _build_glossary_block(), {GLOSSARY} injection in all 4 call sites
- tts.py / gemini_tts.py: pass full locale codes (no split("-")[0] truncation)
- tasks/translate_and_synthesize.py: glossary lookup + injection per language
- prompts: {GLOSSARY} placeholder in ingestion, targeted, transcreation prompts
- pyproject.toml: +openpyxl, +pyahocorasick
Frontend:
- routes/admin/glossaries/: GlossaryList, GlossaryUpload, GlossaryDetail
- App.tsx: 3 new routes under /admin/clients/:clientId/glossaries
- ClientDetail.tsx: Glossaries card with count + quick links
- types/api.ts: Glossary, GlossaryVersion, GlossaryDetail, GlossaryTerm types
- lib/api.ts: 7 new API methods (upload, list, detail, terms, versions, activate, archive)
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
245 lines
16 KiB
Python
245 lines
16 KiB
Python
"""
|
|
Central locale registry.
|
|
|
|
Provides a single source of truth for BCP-47 codes, display names,
|
|
and Gemini-friendly labels used throughout the translation/TTS pipeline.
|
|
|
|
Convention: BCP-47 with hyphen separator (fr-FR, en-GB, pt-BR).
|
|
xlsx underscore format (fr_fr, en_gb) is normalized at import time.
|
|
Bare language-only codes (fr, en) remain valid for legacy compat.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass(frozen=True)
class Locale:
    """Immutable description of one entry in the locale registry.

    Attributes:
        code: Canonical BCP-47 code, e.g. "fr-FR".
        display_name: Human-readable name, e.g. "French (France)".
        gemini_label: Text to pass to Gemini prompts, e.g. "French (France)".
        tts_lang: BCP-47 code for the TTS API; may differ from ``code``
            (e.g. es-MX is mapped to es-US).
        preview_sample: Sample sentence used for TTS previews.
    """

    code: str
    display_name: str
    gemini_label: str
    tts_lang: str
    preview_sample: str
|
|
|
|
|
|
# Master locale registry. Bare language codes (legacy) + explicit region variants.
|
|
# Keyed by Locale.code. Each language has a bare (legacy) entry and, where
# available, explicit region variants. Positional arguments are:
# code, display_name, gemini_label, tts_lang, preview_sample.
#
# NOTE: preview samples are sent to TTS, so they must carry proper diacritics.
# The bare-language samples for es/fr/it/pt/pl/tr/vi were previously
# ASCII-stripped; they now match their regional siblings.
_REGISTRY: dict[str, Locale] = {loc.code: loc for loc in [
    # ── English ──────────────────────────────────────────────────────────────
    Locale("en", "English", "English", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-US", "English (US)", "English (United States)", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-GB", "English (UK)", "English (United Kingdom)", "en-GB",
           "This is a preview of the audio description voice."),
    Locale("en-CA", "English (Canada)", "English (Canada)", "en-CA",
           "This is a preview of the audio description voice."),
    # ── Spanish ──────────────────────────────────────────────────────────────
    Locale("es", "Spanish", "Spanish", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-ES", "Spanish (Spain)", "Spanish (Spain)", "es-ES",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-MX", "Spanish (Mexico)", "Spanish (Mexico)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-419", "Spanish (Latin America)", "Spanish (Latin America)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    # ── French ───────────────────────────────────────────────────────────────
    Locale("fr", "French", "French", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-FR", "French (France)", "French (France)", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-CA", "French (Canada)", "French (Canada)", "fr-CA",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    # ── German ───────────────────────────────────────────────────────────────
    Locale("de", "German", "German", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    Locale("de-DE", "German (Germany)", "German (Germany)", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    # ── Italian ──────────────────────────────────────────────────────────────
    Locale("it", "Italian", "Italian", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    Locale("it-IT", "Italian (Italy)", "Italian (Italy)", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    # ── Portuguese ───────────────────────────────────────────────────────────
    Locale("pt", "Portuguese", "Portuguese", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-BR", "Portuguese (Brazil)", "Portuguese (Brazil)", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-PT", "Portuguese (Portugal)", "Portuguese (Portugal)", "pt-PT",
           "Esta é uma pré-visualização da voz da audiodescrição."),
    # ── Japanese ─────────────────────────────────────────────────────────────
    Locale("ja", "Japanese", "Japanese", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    Locale("ja-JP", "Japanese (Japan)", "Japanese (Japan)", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    # ── Korean ───────────────────────────────────────────────────────────────
    Locale("ko", "Korean", "Korean", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    Locale("ko-KR", "Korean (Korea)", "Korean (South Korea)", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    # ── Arabic ───────────────────────────────────────────────────────────────
    Locale("ar", "Arabic", "Arabic", "ar-EG",
           "هذه معاينة لصوت الوصف الصوتي."),
    # ── Hindi ────────────────────────────────────────────────────────────────
    Locale("hi", "Hindi", "Hindi", "hi-IN",
           "यह ऑडियो विवरण आवाज का पूर्वावलोकन है।"),
    # ── Indonesian ───────────────────────────────────────────────────────────
    Locale("id", "Indonesian", "Indonesian", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    Locale("id-ID", "Indonesian (Indonesia)", "Indonesian (Indonesia)", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    # ── Dutch ────────────────────────────────────────────────────────────────
    Locale("nl", "Dutch", "Dutch", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    Locale("nl-NL", "Dutch (Netherlands)", "Dutch (Netherlands)", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    # ── Polish ───────────────────────────────────────────────────────────────
    Locale("pl", "Polish", "Polish", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    Locale("pl-PL", "Polish (Poland)", "Polish (Poland)", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    # ── Russian ──────────────────────────────────────────────────────────────
    Locale("ru", "Russian", "Russian", "ru-RU",
           "Это предварительный просмотр голоса аудиоописания."),
    # ── Thai ─────────────────────────────────────────────────────────────────
    Locale("th", "Thai", "Thai", "th-TH",
           "นี่คือตัวอย่างเสียงบรรยายภาพ"),
    # ── Turkish ──────────────────────────────────────────────────────────────
    Locale("tr", "Turkish", "Turkish", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    Locale("tr-TR", "Turkish (Turkey)", "Turkish (Turkey)", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    # ── Vietnamese ───────────────────────────────────────────────────────────
    Locale("vi", "Vietnamese", "Vietnamese", "vi-VN",
           "Đây là bản xem trước giọng mô tả âm thanh."),
    # ── Romanian ─────────────────────────────────────────────────────────────
    Locale("ro", "Romanian", "Romanian", "ro-RO",
           "Aceasta este o previzualizare a vocii descrierii audio."),
    # ── Ukrainian ────────────────────────────────────────────────────────────
    Locale("uk", "Ukrainian", "Ukrainian", "uk-UA",
           "Це попередній перегляд голосу аудіоопису."),
    # ── Bengali ──────────────────────────────────────────────────────────────
    Locale("bn", "Bengali", "Bengali", "bn-BD",
           "এটি অডিও বর্ণনা ভয়েসের একটি প্রিভিউ।"),
    # ── Marathi ──────────────────────────────────────────────────────────────
    Locale("mr", "Marathi", "Marathi", "mr-IN",
           "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे."),
    # ── Tamil ────────────────────────────────────────────────────────────────
    Locale("ta", "Tamil", "Tamil", "ta-IN",
           "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்."),
    # ── Telugu ───────────────────────────────────────────────────────────────
    Locale("te", "Telugu", "Telugu", "te-IN",
           "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ."),
    # ── Chinese ──────────────────────────────────────────────────────────────
    Locale("zh", "Chinese", "Chinese (Simplified)", "zh-CN",
           "这是音频描述语音的预览。"),
    # ── Czech ────────────────────────────────────────────────────────────────
    Locale("cs", "Czech", "Czech", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    Locale("cs-CZ", "Czech (Czech Republic)", "Czech (Czech Republic)", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    # ── Danish ───────────────────────────────────────────────────────────────
    Locale("da", "Danish", "Danish", "da-DK",
           "Dette er en forhåndsvisning af lydbeskrivelsesstemmen."),
    # ── Finnish ──────────────────────────────────────────────────────────────
    Locale("fi", "Finnish", "Finnish", "fi-FI",
           "Tämä on äänikuvauksen äänen esikatselu."),
    # ── Hungarian ────────────────────────────────────────────────────────────
    Locale("hu", "Hungarian", "Hungarian", "hu-HU",
           "Ez a hangos leírás hangjának előnézete."),
    # ── Norwegian ────────────────────────────────────────────────────────────
    Locale("no", "Norwegian", "Norwegian", "nb-NO",
           "Dette er en forhåndsvisning av lydbeskrivelsesstemmen."),
    # ── Slovak ───────────────────────────────────────────────────────────────
    Locale("sk", "Slovak", "Slovak", "sk-SK",
           "Toto je náhľad hlasu zvukového popisu."),
    # ── Swedish ──────────────────────────────────────────────────────────────
    Locale("sv", "Swedish", "Swedish", "sv-SE",
           "Det här är en förhandsgranskning av ljudbeskrivningsrösten."),
]}
|
|
|
|
# xlsx uses underscores; normalize to BCP-47 hyphen form
|
|
# Maps the lower-case underscore spelling of every region-qualified registry
# code (e.g. "fr_fr") back to its canonical hyphenated form ("fr-FR").
_XLSX_ALIASES: dict[str, str] = dict(
    (canonical.lower().replace("-", "_"), canonical)
    for canonical in _REGISTRY
    if "-" in canonical
)
# Edge case: the Indonesian column header is just "id" (no region).
_XLSX_ALIASES["id"] = "id"
|
|
|
|
|
|
def normalize_code(code: str) -> str:
    """
    Normalize an arbitrary locale code to the canonical BCP-47 form used in this registry.

    Surrounding whitespace is ignored. Handles:
    - xlsx underscore form:  "fr_fr"  → "fr-FR" (via the alias table; unknown
      pairs such as "de_at" are canonicalised to "de-AT")
    - Bare language code:    "FR"     → "fr"    (legacy compat)
    - Hyphen form, any case: "fr-fr"  → "fr-FR"

    Falsy input (None, "") is returned unchanged. The result is not
    guaranteed to be a registered code; this only fixes separators and case.
    """
    if not code:
        return code

    cleaned = code.strip()
    lowered = cleaned.lower()

    if "_" in lowered:
        alias = _XLSX_ALIASES.get(lowered)
        if alias is not None:
            return alias
        if len(lowered) <= 3:
            # Too short to be a language_REGION pair; leave it alone.
            return cleaned
        # Unknown underscore code: convert the separator and fall through so
        # it gets the same lang-lower / region-upper casing as hyphen input.
        cleaned = cleaned.replace("_", "-")

    if "-" in cleaned:
        language, _, subtags = cleaned.partition("-")
        return f"{language.lower()}-{subtags.upper()}"

    # Bare language code (legacy): only case-fold.
    return lowered
|
|
|
|
|
|
def get(code: str) -> Locale | None:
    """Return the Locale registered for *code*, or None if unknown.

    The code is first passed through normalize_code. If the normalized code
    is not registered, the lookup falls back to its language part (the text
    before the first hyphen), so an unregistered region variant resolves to
    the bare-language entry when one exists.
    """
    if not code:
        # normalize_code hands falsy input back unchanged; without this guard
        # a None would reach .split() below and raise AttributeError.
        return None

    canonical = normalize_code(code)
    exact = _REGISTRY.get(canonical)
    if exact is not None:
        return exact
    return _REGISTRY.get(canonical.split("-")[0])
|
|
|
|
|
|
def get_display_name(code: str) -> str:
    """Return the human-readable name for *code* (e.g. 'French (Canada)').

    Unknown codes are returned unchanged.
    """
    entry = get(code)
    if entry:
        return entry.display_name
    return code
|
|
|
|
|
|
def get_gemini_label(code: str) -> str:
    """
    Return the label to embed in Gemini prompts for *code*, e.g. 'French (Canada)'.

    Human-readable language names are used because Gemini models respond
    more reliably to them than to bare BCP-47 codes inside instruction
    prompts. Unknown codes are returned unchanged.
    """
    entry = get(code)
    if entry:
        return entry.gemini_label
    return code
|
|
|
|
|
|
def get_tts_lang(code: str) -> str:
    """Return the BCP-47 code to send to the TTS API for *code*.

    This may differ from the canonical code (e.g. es-MX → es-US).
    Unknown codes are returned unchanged.
    """
    entry = get(code)
    if entry:
        return entry.tts_lang
    return code
|
|
|
|
|
|
def get_preview_sample(code: str) -> str:
    """Return a TTS preview sentence in the language of *code*.

    Resolution order: the locale itself, then its parent language (the part
    before the first hyphen), then the English sentence.
    """
    entry = get(code)
    if not entry and "-" in code:
        # Retry with just the language part of a hyphenated code.
        entry = get(code.split("-")[0])
    if not entry:
        return "This is a preview of the audio description voice."
    return entry.preview_sample
|
|
|
|
|
|
def all_codes() -> list[str]:
    """Return every registered locale code as a new, sorted list."""
    codes = list(_REGISTRY)
    codes.sort()
    return codes
|
|
|
|
|
|
def all_display_map() -> dict[str, str]:
    """Return a new {code: display_name} dict covering all registered locales."""
    display_by_code: dict[str, str] = {}
    for key, entry in _REGISTRY.items():
        display_by_code[key] = entry.display_name
    return display_by_code
|