video-accessibility/backend/app/models/glossary.py
Vadym Samoilenko fa351e4d25 feat: per-client glossary — hybrid exact/vector retrieval + AI injection
Adds full glossary system so Gemini uses client-approved terminology
when generating subtitles and translations (critical for 3M brand names
and product codes across 16 target locales).

Backend:
- lib/locales.py: BCP-47 locale registry, normalises xlsx fr_fr → fr-FR
- models/glossary.py: Glossary / GlossaryVersion / GlossaryTerm + enums
- services/glossary_service.py: xlsx parse (openpyxl), ingest to Mongo,
  hybrid retrieval (Aho-Corasick exact + Atlas Vector Search), prompt block
- services/embedding_service.py: Gemini text-embedding-004, batch 100, retry
- tasks/embed_glossary.py: Celery background task for async embedding
- api/v1/routes_glossaries.py: CRUD endpoints under /clients/{id}/glossaries
- gemini.py: _build_glossary_block(), {GLOSSARY} injection in all 4 call sites
- tts.py / gemini_tts.py: pass full locale codes (no split("-")[0] truncation)
- tasks/translate_and_synthesize.py: glossary lookup + injection per language
- prompts: {GLOSSARY} placeholder in ingestion, targeted, transcreation prompts
- pyproject.toml: +openpyxl, +pyahocorasick

Frontend:
- routes/admin/glossaries/: GlossaryList, GlossaryUpload, GlossaryDetail
- App.tsx: 3 new routes under /admin/clients/:clientId/glossaries
- ClientDetail.tsx: Glossaries card with count + quick links
- types/api.ts: Glossary, GlossaryVersion, GlossaryDetail, GlossaryTerm types
- lib/api.ts: 7 new API methods (upload, list, detail, terms, versions, activate, archive)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-29 13:03:38 +01:00

139 lines
3.9 KiB
Python

from __future__ import annotations

from datetime import datetime, timezone
from enum import StrEnum

from pydantic import BaseModel, Field
class GlossarySource(StrEnum):
XLSX_UPLOAD = "xlsx_upload"
FRAZE_API = "fraze_api" # reserved for future FRAZE integration
class GlossaryStatus(StrEnum):
ACTIVE = "active"
ARCHIVED = "archived"
class EmbeddingStatus(StrEnum):
PENDING = "pending"
IN_PROGRESS = "in_progress"
DONE = "done"
FAILED = "failed"
class Glossary(BaseModel):
    """Stored glossary record belonging to one client.

    ``id`` is read from the ``_id`` key (field alias). Because
    ``populate_by_name`` is enabled, it may also be supplied as ``id``.
    """

    id: str | None = Field(None, alias="_id")
    client_id: str
    name: str
    description: str | None = None
    source_locale: str  # BCP-47 source column, e.g. "en-GB"
    source: GlossarySource = GlossarySource.XLSX_UPLOAD
    status: GlossaryStatus = GlossaryStatus.ACTIVE
    current_version_id: str | None = None
    # Naive UTC timestamp, i.e. the same value ``datetime.utcnow()`` produced.
    # Built from an aware "now" with tzinfo stripped because ``utcnow`` is
    # deprecated since Python 3.12; staying naive keeps existing comparisons
    # and stored values compatible.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    created_by: str  # user_id

    model_config = {"populate_by_name": True, "arbitrary_types_allowed": True}
class GlossaryVersion(BaseModel):
    """Stored record for one numbered version of a glossary.

    ``id`` is read from the ``_id`` key (field alias). Because
    ``populate_by_name`` is enabled, it may also be supplied as ``id``.
    """

    id: str | None = Field(None, alias="_id")
    glossary_id: str
    version_number: int
    source_xlsx_gcs_path: str | None = None  # GCS path to original file
    term_count: int = 0
    embedded_count: int = 0
    embedding_status: EmbeddingStatus = EmbeddingStatus.PENDING
    # Naive UTC timestamp, i.e. the same value ``datetime.utcnow()`` produced.
    # Built from an aware "now" with tzinfo stripped because ``utcnow`` is
    # deprecated since Python 3.12; staying naive keeps existing comparisons
    # and stored values compatible.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    created_by: str
    change_note: str | None = None

    model_config = {"populate_by_name": True}
class GlossaryTerm(BaseModel):
    """A single source term together with its translations keyed by locale."""

    id: str | None = Field(None, alias="_id")
    glossary_id: str
    version_id: str
    # 3M Content ID from xlsx
    cid: str | None = None
    # 3M Term ID from xlsx
    tid: str | None = None
    # canonical source text (whitespace-normalised)
    source_term: str
    # lowercase for case-insensitive index
    source_term_lower: str
    # {locale_code: translated_text}
    translations: dict[str, str] = {}
    # 768-dim Gemini embedding
    embedding: list[float] | None = None

    model_config = {"populate_by_name": True}
# ── Schema models (API request/response) ──────────────────────────────────────
class GlossaryCreate(BaseModel):
    """Request schema carrying the fields supplied when creating a glossary."""

    name: str
    description: str | None = None
    source_locale: str
    change_note: str | None = None
class GlossaryVersionCreate(BaseModel):
    """Request schema carrying the fields supplied when creating a glossary version."""

    source_locale: str
    change_note: str | None = None
class GlossaryResponse(BaseModel):
    """Response schema for a glossary; ``id`` is required and has no ``_id`` alias."""

    id: str
    client_id: str
    name: str
    description: str | None = None
    source_locale: str
    source: GlossarySource
    status: GlossaryStatus
    current_version_id: str | None = None
    created_at: datetime
    created_by: str
class GlossaryVersionResponse(BaseModel):
    """Response schema for a glossary version, including its embedding counters."""

    id: str
    glossary_id: str
    version_number: int
    term_count: int
    embedded_count: int
    embedding_status: EmbeddingStatus
    created_at: datetime
    created_by: str
    change_note: str | None = None
class GlossaryDetailResponse(GlossaryResponse):
    """Glossary response extended with a list of its versions (empty by default)."""

    versions: list[GlossaryVersionResponse] = []
class GlossaryTermPreview(BaseModel):
    """Reduced view of a GlossaryTerm used for UI previews."""

    source_term: str
    translations: dict[str, str]
class MatchedTerm(BaseModel):
    """Term found in VTT source text, paired with its target-locale translation."""

    source_term: str
    target_translation: str
    # "exact" | "vector"
    match_kind: str
    # 1.0 for exact, cosine similarity for vector
    score: float
def glossary_from_doc(doc: dict) -> Glossary:
    """Build a ``Glossary`` from a raw document mapping.

    Works on a shallow copy so the caller's ``doc`` is left untouched. When an
    ``_id`` key is present its value is converted with ``str()`` before
    validation.
    """
    fields = dict(doc)
    if "_id" in fields:
        # The model declares ``id`` (alias ``_id``) as a string.
        fields["_id"] = str(fields["_id"])
    return Glossary.model_validate(fields)
def glossary_version_from_doc(doc: dict) -> GlossaryVersion:
    """Build a ``GlossaryVersion`` from a raw document mapping.

    Works on a shallow copy so the caller's ``doc`` is left untouched. When an
    ``_id`` key is present its value is converted with ``str()`` before
    validation.
    """
    fields = dict(doc)
    if "_id" in fields:
        # The model declares ``id`` (alias ``_id``) as a string.
        fields["_id"] = str(fields["_id"])
    return GlossaryVersion.model_validate(fields)