Adds a full glossary system so that Gemini uses client-approved terminology
when generating subtitles and translations (critical for 3M brand names
and product codes across 16 target locales).
Backend:
- lib/locales.py: BCP-47 locale registry, normalises xlsx fr_fr → fr-FR
- models/glossary.py: Glossary / GlossaryVersion / GlossaryTerm + enums
- services/glossary_service.py: xlsx parse (openpyxl), ingest to Mongo,
hybrid retrieval (Aho-Corasick exact + Atlas Vector Search), prompt block
- services/embedding_service.py: Gemini text-embedding-004, batch 100, retry
- tasks/embed_glossary.py: Celery background task for async embedding
- api/v1/routes_glossaries.py: CRUD endpoints under /clients/{id}/glossaries
- gemini.py: _build_glossary_block(), {GLOSSARY} injection in all 4 call sites
- tts.py / gemini_tts.py: pass full locale codes (no split("-")[0] truncation)
- tasks/translate_and_synthesize.py: glossary lookup + injection per language
- prompts: {GLOSSARY} placeholder in ingestion, targeted, transcreation prompts
- pyproject.toml: +openpyxl, +pyahocorasick
Frontend:
- routes/admin/glossaries/: GlossaryList, GlossaryUpload, GlossaryDetail
- App.tsx: 3 new routes under /admin/clients/:clientId/glossaries
- ClientDetail.tsx: Glossaries card with count + quick links
- types/api.ts: Glossary, GlossaryVersion, GlossaryDetail, GlossaryTerm types
- lib/api.ts: 7 new API methods (upload, list, detail, terms, versions, activate, archive)
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
139 lines
3.9 KiB
Python
139 lines
3.9 KiB
Python
from __future__ import annotations

from datetime import UTC, datetime
from enum import StrEnum

from pydantic import BaseModel, Field
|
|
|
|
|
|
class GlossarySource(StrEnum):
|
|
XLSX_UPLOAD = "xlsx_upload"
|
|
FRAZE_API = "fraze_api" # reserved for future FRAZE integration
|
|
|
|
|
|
class GlossaryStatus(StrEnum):
|
|
ACTIVE = "active"
|
|
ARCHIVED = "archived"
|
|
|
|
|
|
class EmbeddingStatus(StrEnum):
|
|
PENDING = "pending"
|
|
IN_PROGRESS = "in_progress"
|
|
DONE = "done"
|
|
FAILED = "failed"
|
|
|
|
|
|
class Glossary(BaseModel):
    """A named glossary owned by one client.

    ``id`` is aliased to ``_id``; because ``populate_by_name`` is enabled the
    model accepts either key on input.
    """

    id: str | None = Field(default=None, alias="_id")
    client_id: str
    name: str
    description: str | None = None
    # BCP-47 code of the source column, e.g. "en-GB".
    source_locale: str
    source: GlossarySource = GlossarySource.XLSX_UPLOAD
    status: GlossaryStatus = GlossaryStatus.ACTIVE
    current_version_id: str | None = None
    # Naive UTC timestamp. `datetime.utcnow` is deprecated since Python 3.12;
    # this factory yields the same naive-UTC value without the deprecated call.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(UTC).replace(tzinfo=None)
    )
    # user_id of the creator.
    created_by: str

    model_config = {"populate_by_name": True, "arbitrary_types_allowed": True}
|
|
|
|
|
|
class GlossaryVersion(BaseModel):
    """One numbered version of a glossary, with its term and embedding counters.

    ``id`` is aliased to ``_id``; because ``populate_by_name`` is enabled the
    model accepts either key on input.
    """

    id: str | None = Field(default=None, alias="_id")
    glossary_id: str
    version_number: int
    # GCS path to the original uploaded file, when there is one.
    source_xlsx_gcs_path: str | None = None
    term_count: int = 0
    embedded_count: int = 0
    embedding_status: EmbeddingStatus = EmbeddingStatus.PENDING
    # Naive UTC timestamp. `datetime.utcnow` is deprecated since Python 3.12;
    # this factory yields the same naive-UTC value without the deprecated call.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(UTC).replace(tzinfo=None)
    )
    created_by: str
    change_note: str | None = None

    model_config = {"populate_by_name": True}
|
|
|
|
|
|
class GlossaryTerm(BaseModel):
    """A single source term together with its translations keyed by locale."""

    id: str | None = Field(default=None, alias="_id")
    glossary_id: str
    version_id: str
    # 3M Content ID taken from the xlsx.
    cid: str | None = None
    # 3M Term ID taken from the xlsx.
    tid: str | None = None
    # Canonical source text (whitespace-normalised).
    source_term: str
    # Lowercase form, used for the case-insensitive index.
    source_term_lower: str
    # Mapping of {locale_code: translated_text}.
    translations: dict[str, str] = {}
    # 768-dim Gemini embedding; None when no embedding is stored.
    embedding: list[float] | None = None

    model_config = {"populate_by_name": True}
|
|
|
|
|
|
# ── Schema models (API request/response) ──────────────────────────────────────
|
|
|
|
class GlossaryCreate(BaseModel):
    """Request schema carrying the fields supplied when creating a glossary."""

    name: str
    description: str | None = None
    source_locale: str
    change_note: str | None = None
|
|
|
|
|
|
class GlossaryVersionCreate(BaseModel):
    """Request schema carrying the fields supplied when creating a glossary version."""

    source_locale: str
    change_note: str | None = None
|
|
|
|
|
|
class GlossaryResponse(BaseModel):
    """Response schema for a glossary; unlike ``Glossary``, ``id`` is required."""

    id: str
    client_id: str
    name: str
    description: str | None = None
    source_locale: str
    source: GlossarySource
    status: GlossaryStatus
    current_version_id: str | None = None
    created_at: datetime
    created_by: str
|
|
|
|
|
|
class GlossaryVersionResponse(BaseModel):
    """Response schema for a glossary version.

    Carries the same fields as ``GlossaryVersion`` except
    ``source_xlsx_gcs_path``, and every counter/status field is required.
    """

    id: str
    glossary_id: str
    version_number: int
    term_count: int
    embedded_count: int
    embedding_status: EmbeddingStatus
    created_at: datetime
    created_by: str
    change_note: str | None = None
|
|
|
|
|
|
class GlossaryDetailResponse(GlossaryResponse):
    """``GlossaryResponse`` extended with the list of the glossary's versions."""

    # Defaults to an empty list when no versions are supplied.
    versions: list[GlossaryVersionResponse] = []
|
|
|
|
|
|
class GlossaryTermPreview(BaseModel):
    """Reduced view of a ``GlossaryTerm`` intended for UI previews."""

    source_term: str
    translations: dict[str, str]
|
|
|
|
|
|
class MatchedTerm(BaseModel):
    """A glossary term found in VTT source text, paired with its target-locale translation."""

    source_term: str
    target_translation: str
    # How the term was matched: "exact" | "vector".
    match_kind: str
    # 1.0 for an exact match; cosine similarity for a vector match.
    score: float
|
|
|
|
|
|
def glossary_from_doc(doc: dict) -> Glossary:
    """Build a ``Glossary`` from a raw document.

    Works on a shallow copy so the caller's ``doc`` is not mutated. When an
    ``_id`` key is present its value is converted with ``str()`` before
    validation, since the model declares ``id`` as a string.

    Args:
        doc: Raw document mapping.

    Returns:
        The validated ``Glossary`` instance.
    """
    normalised = dict(doc)
    has_object_id = "_id" in normalised
    if has_object_id:
        normalised["_id"] = str(normalised["_id"])
    return Glossary.model_validate(normalised)
|
|
|
|
|
|
def glossary_version_from_doc(doc: dict) -> GlossaryVersion:
    """Build a ``GlossaryVersion`` from a raw document.

    Works on a shallow copy so the caller's ``doc`` is not mutated. When an
    ``_id`` key is present its value is converted with ``str()`` before
    validation, since the model declares ``id`` as a string.

    Args:
        doc: Raw document mapping.

    Returns:
        The validated ``GlossaryVersion`` instance.
    """
    normalised = dict(doc)
    has_object_id = "_id" in normalised
    if has_object_id:
        normalised["_id"] = str(normalised["_id"])
    return GlossaryVersion.model_validate(normalised)
|