feat: per-client glossary — hybrid exact/vector retrieval + AI injection

Adds full glossary system so Gemini uses client-approved terminology
when generating subtitles and translations (critical for 3M brand names
and product codes across 16 target locales).

Backend:
- lib/locales.py: BCP-47 locale registry, normalises xlsx fr_fr → fr-FR
- models/glossary.py: Glossary / GlossaryVersion / GlossaryTerm + enums
- services/glossary_service.py: xlsx parse (openpyxl), ingest to Mongo,
  hybrid retrieval (Aho-Corasick exact + Atlas Vector Search), prompt block
- services/embedding_service.py: Gemini text-embedding-004, batch 100, retry
- tasks/embed_glossary.py: Celery background task for async embedding
- api/v1/routes_glossaries.py: CRUD endpoints under /clients/{id}/glossaries
- gemini.py: _build_glossary_block(), {GLOSSARY} injection in all 4 call sites
- tts.py / gemini_tts.py: pass full locale codes (no split("-")[0] truncation)
- tasks/translate_and_synthesize.py: glossary lookup + injection per language
- prompts: {GLOSSARY} placeholder in ingestion, targeted, transcreation prompts
- pyproject.toml: +openpyxl, +pyahocorasick

Frontend:
- routes/admin/glossaries/: GlossaryList, GlossaryUpload, GlossaryDetail
- App.tsx: 3 new routes under /admin/clients/:clientId/glossaries
- ClientDetail.tsx: Glossaries card with count + quick links
- types/api.ts: Glossary, GlossaryVersion, GlossaryDetail, GlossaryTerm types
- lib/api.ts: 7 new API methods (upload, list, detail, terms, versions, activate, archive)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-04-29 13:03:38 +01:00
parent 05f25a1141
commit fa351e4d25
26 changed files with 2593 additions and 23 deletions

1
backend/.gitignore vendored
View file

@ -23,6 +23,7 @@ eggs/
.eggs/
lib/
lib64/
!app/lib/
parts/
sdist/
var/

View file

@ -0,0 +1,288 @@
"""
Glossary management endpoints.
Access:
- All glossary mutations (upload, activate, archive): Admin or PM of the client
- Glossary reads (list, detail, terms): Admin, PM, or staff members
Routes are nested under /clients/{client_id}/glossaries to keep ownership clear.
"""
from __future__ import annotations
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
from ...core.dependencies import get_current_user, require_pm_for_client, require_roles
from ...core.logging import get_logger
from ...models.audit_log import AuditAction
from ...models.glossary import (
GlossaryDetailResponse,
GlossaryResponse,
GlossaryVersionResponse,
)
from ...models.user import User, UserRole
from ...services import audit_logger as audit_svc
from ...services import glossary_service as svc
# Module-level logger named after this module.
logger = get_logger(__name__)
# Every route declared below is served under a client-scoped prefix.
router = APIRouter(
    prefix="/clients/{client_id}/glossaries",
    tags=["glossaries"],
)
# Content types that _validate_xlsx accepts without inspecting the filename.
_ALLOWED_CONTENT_TYPES = {
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
}
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm that the upload size limit is enforced somewhere else.
_MAX_FILE_SIZE_MB = 50
def _require_client_staff(client_id: str):
    """Build the "admin or PM of this client" dependency.

    The ``client_id`` argument itself is not used: the dependency factory is
    only told the *name* of the path parameter to read.
    """
    # NOTE(review): no route in the visible part of this module uses this helper.
    dependency = require_pm_for_client(client_id_param="client_id")
    return dependency
# ── List glossaries ───────────────────────────────────────────────────────────
@router.get("", response_model=list[GlossaryResponse])
async def list_glossaries(
    client_id: str,
    current_user: User = Depends(get_current_user),
):
    """List the glossaries of a client.

    Raises a 403 ``HTTPException`` when the caller's role may not read glossaries.
    """
    _assert_can_read(current_user)
    # NOTE(review): whether archived glossaries are filtered out is decided by
    # the service call, which is not visible here.
    responses = []
    for glossary in await svc.get_glossaries_for_client(client_id):
        responses.append(_to_response(glossary))
    return responses
# ── Upload new glossary ───────────────────────────────────────────────────────
@router.post("", response_model=GlossaryDetailResponse, status_code=201)
async def upload_glossary(
    client_id: str,
    file: UploadFile = File(..., description="xlsx glossary file"),
    name: str = Form(...),
    source_locale: str = Form(..., description="BCP-47 source locale, e.g. en-GB"),
    source_locale_col: str = Form(..., description="xlsx column header for the source language, e.g. en_gb"),
    description: str | None = Form(None),
    change_note: str | None = Form(None),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
    """Create a glossary for a client from an uploaded xlsx file.

    The file is validated, ingested through the glossary service, the upload
    is written to the audit log, and the new glossary is returned together
    with its versions.

    Raises:
        HTTPException: 422 when the file is not accepted as xlsx or when the
            service rejects the upload with a ``ValueError``.
    """
    _validate_xlsx(file)

    ingest_args = {
        "client_id": client_id,
        "name": name,
        "source_locale": source_locale,
        "source_locale_col": source_locale_col,
        "file": file,
        "user_id": str(current_user.id),
        "description": description,
        "change_note": change_note,
    }
    try:
        glossary, version = await svc.ingest_glossary(**ingest_args)
    except ValueError as exc:
        # Service-level validation problems are reported as unprocessable input.
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    audit_details = {"term_count": version.term_count, "source_locale": source_locale}
    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_UPLOAD,
        description=f"Glossary '{name}' uploaded for client {client_id}",
        user=current_user,
        resource_type="glossary",
        resource_id=glossary.id,
        details=audit_details,
    )

    return _to_detail_response(glossary, await svc.get_versions(glossary.id))
# ── Get glossary detail ───────────────────────────────────────────────────────
@router.get("/{glossary_id}", response_model=GlossaryDetailResponse)
async def get_glossary(
    client_id: str,
    glossary_id: str,
    current_user: User = Depends(get_current_user),
):
    """Return a glossary together with its versions.

    Raises:
        HTTPException: 403 when the caller's role may not read glossaries;
            404 when the glossary does not exist or belongs to another client.
    """
    _assert_can_read(current_user)

    glossary = await svc.get_glossary(glossary_id)
    owned_by_client = bool(glossary) and glossary.client_id == client_id
    if not owned_by_client:
        # The same 404 is used for "missing" and "other client's glossary".
        raise HTTPException(status_code=404, detail="Glossary not found")

    return _to_detail_response(glossary, await svc.get_versions(glossary_id))
# ── Browse terms ──────────────────────────────────────────────────────────────
@router.get("/{glossary_id}/terms")
async def list_terms(
    client_id: str,
    glossary_id: str,
    version_id: str | None = Query(None, description="Specific version; defaults to active"),
    search: str | None = Query(None),
    page: int = Query(1, ge=1),
    page_size: int = Query(50, ge=1, le=200),
    current_user: User = Depends(get_current_user),
):
    """Return one page of terms from a glossary version.

    When ``version_id`` is omitted the glossary's current version is used; a
    glossary without any current version yields an empty page.

    Raises:
        HTTPException: 403 when the caller's role may not read glossaries;
            404 when the glossary does not exist, belongs to another client,
            or the requested version is not a version of this glossary.
    """
    _assert_can_read(current_user)
    glossary = await svc.get_glossary(glossary_id)
    if not glossary or glossary.client_id != client_id:
        raise HTTPException(status_code=404, detail="Glossary not found")

    if version_id:
        # An explicit version id is caller-controlled: make sure it really is
        # a version of this glossary, otherwise terms of an unrelated glossary
        # could be read through this route.
        # Assumes get_versions returns every version of the glossary.
        versions = await svc.get_versions(glossary_id)
        known_version_ids = {str(v.id) for v in versions}
        if version_id not in known_version_ids:
            raise HTTPException(status_code=404, detail="Glossary version not found")

    vid = version_id or glossary.current_version_id
    if not vid:
        return {"terms": [], "total": 0, "page": page, "page_size": page_size}

    terms, total = await svc.get_terms_page(vid, search=search, page=page, page_size=page_size)
    return {
        "terms": [{"source_term": t.source_term, "translations": t.translations} for t in terms],
        "total": total,
        "page": page,
        "page_size": page_size,
    }
# ── Upload new version ────────────────────────────────────────────────────────
@router.post("/{glossary_id}/versions", response_model=GlossaryVersionResponse, status_code=201)
async def upload_version(
    client_id: str,
    glossary_id: str,
    file: UploadFile = File(...),
    source_locale_col: str = Form(...),
    change_note: str | None = Form(None),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
    """Add a new version to an existing glossary from an uploaded xlsx file.

    Raises:
        HTTPException: 422 when the file is not accepted as xlsx or the
            service rejects it with a ``ValueError``; 404 when the glossary
            does not exist or belongs to another client.
    """
    # File validation happens before the glossary lookup.
    _validate_xlsx(file)

    glossary = await svc.get_glossary(glossary_id)
    owned_by_client = bool(glossary) and glossary.client_id == client_id
    if not owned_by_client:
        raise HTTPException(status_code=404, detail="Glossary not found")

    ingest_args = {
        "glossary_id": glossary_id,
        "source_locale_col": source_locale_col,
        "file": file,
        "user_id": str(current_user.id),
        "change_note": change_note,
    }
    try:
        version = await svc.ingest_new_version(**ingest_args)
    except ValueError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    audit_details = {"term_count": version.term_count, "version_number": version.version_number}
    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_VERSION_UPLOAD,
        description=f"New glossary version uploaded for glossary {glossary_id}",
        user=current_user,
        resource_type="glossary_version",
        resource_id=version.id,
        details=audit_details,
    )
    return _version_to_response(version)
# ── Activate a version ────────────────────────────────────────────────────────
@router.post("/{glossary_id}/activate")
async def activate_version(
    client_id: str,
    glossary_id: str,
    version_id: str = Form(...),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
    """Make the given version the active one for a glossary.

    Raises:
        HTTPException: 404 when the glossary does not exist, belongs to
            another client, or the service rejects the version with a
            ``ValueError``.
    """
    glossary = await svc.get_glossary(glossary_id)
    owned_by_client = bool(glossary) and glossary.client_id == client_id
    if not owned_by_client:
        raise HTTPException(status_code=404, detail="Glossary not found")

    try:
        await svc.activate_version(glossary_id, version_id)
    except ValueError as exc:
        # The service's ValueError is surfaced as "not found".
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_ACTIVATE,
        description=f"Glossary version {version_id} activated",
        user=current_user,
        resource_type="glossary",
        resource_id=glossary_id,
        details={"version_id": version_id},
    )

    result = {"status": "ok", "active_version_id": version_id}
    return result
# ── Archive (soft-delete) ─────────────────────────────────────────────────────
@router.delete("/{glossary_id}", status_code=204)
async def archive_glossary(
    client_id: str,
    glossary_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
):
    """Archive a glossary (admin only) and record the action in the audit log.

    Raises:
        HTTPException: 404 when the glossary does not exist or belongs to
            another client.
    """
    glossary = await svc.get_glossary(glossary_id)
    owned_by_client = bool(glossary) and glossary.client_id == client_id
    if not owned_by_client:
        raise HTTPException(status_code=404, detail="Glossary not found")

    await svc.archive_glossary(glossary_id)

    audit_entry = {
        "action": AuditAction.GLOSSARY_ARCHIVE,
        "description": f"Glossary {glossary_id} archived",
        "user": current_user,
        "resource_type": "glossary",
        "resource_id": glossary_id,
    }
    await audit_svc.audit_logger.log_action(**audit_entry)
# ── Helpers ───────────────────────────────────────────────────────────────────
def _assert_can_read(user: User) -> None:
    """Raise a 403 ``HTTPException`` unless the user's role may read glossaries."""
    readers = frozenset((
        UserRole.ADMIN,
        UserRole.PROJECT_MANAGER,
        UserRole.REVIEWER,
        UserRole.LINGUIST,
        UserRole.PRODUCTION,
    ))
    if user.role in readers:
        return
    raise HTTPException(status_code=403, detail="Insufficient permissions")
def _validate_xlsx(file: UploadFile) -> None:
    """Reject uploads that do not look like an xlsx workbook.

    A file is accepted when its declared content type is one of
    ``_ALLOWED_CONTENT_TYPES`` or when its filename ends in ``.xlsx``. The
    extension is compared case-insensitively so that e.g. ``GLOSSARY.XLSX``
    sent with a generic content type is not rejected.

    Raises:
        HTTPException: 422 when neither the content type nor the filename
            identifies the upload as xlsx.
    """
    has_allowed_type = file.content_type in _ALLOWED_CONTENT_TYPES
    has_xlsx_name = bool(file.filename) and file.filename.lower().endswith(".xlsx")
    if not (has_allowed_type or has_xlsx_name):
        raise HTTPException(
            status_code=422,
            detail="Only .xlsx files are accepted",
        )
def _to_response(g) -> GlossaryResponse:
    """Convert a glossary object into its API response model."""
    payload = {
        "id": str(g.id),  # the response model requires a string id
        "client_id": g.client_id,
        "name": g.name,
        "description": g.description,
        "source_locale": g.source_locale,
        "source": g.source,
        "status": g.status,
        "current_version_id": g.current_version_id,
        "created_at": g.created_at,
        "created_by": g.created_by,
    }
    return GlossaryResponse(**payload)
def _version_to_response(v) -> GlossaryVersionResponse:
    """Convert a glossary version object into its API response model."""
    payload = {
        "id": str(v.id),  # the response model requires a string id
        "glossary_id": v.glossary_id,
        "version_number": v.version_number,
        "term_count": v.term_count,
        "embedded_count": v.embedded_count,
        "embedding_status": v.embedding_status,
        "created_at": v.created_at,
        "created_by": v.created_by,
        "change_note": v.change_note,
    }
    return GlossaryVersionResponse(**payload)
def _to_detail_response(glossary, versions) -> GlossaryDetailResponse:
    """Combine a glossary's response fields with its serialised versions."""
    base_fields = _to_response(glossary).model_dump()
    version_items = [_version_to_response(version) for version in versions]
    return GlossaryDetailResponse(**base_fields, versions=version_items)

View file

@ -93,7 +93,24 @@ class Settings(BaseSettings):
"sv": "sv-SE",
"es-419": "es-US",
"pt-BR": "pt-BR",
"fr-CA": "fr-CA"
"fr-CA": "fr-CA",
# Explicit region variants (added for locale-aware glossary support)
"de-DE": "de-DE",
"en-US": "en-US",
"en-GB": "en-GB",
"en-CA": "en-CA",
"es-ES": "es-ES",
"es-MX": "es-US",
"fr-FR": "fr-FR",
"it-IT": "it-IT",
"ja-JP": "ja-JP",
"ko-KR": "ko-KR",
"nl-NL": "nl-NL",
"pl-PL": "pl-PL",
"cs-CZ": "cs-CZ",
"tr-TR": "tr-TR",
"id-ID": "id-ID",
"pt-PT": "pt-PT",
}
gemini_tts_language_names: dict[str, str] = {
"en": "English",
@ -129,7 +146,24 @@ class Settings(BaseSettings):
"sv": "Swedish",
"es-419": "Spanish (Latin America)",
"pt-BR": "Portuguese (Brazil)",
"fr-CA": "French (Canada)"
"fr-CA": "French (Canada)",
# Explicit region variants
"de-DE": "German (Germany)",
"en-US": "English (US)",
"en-GB": "English (UK)",
"en-CA": "English (Canada)",
"es-ES": "Spanish (Spain)",
"es-MX": "Spanish (Mexico)",
"fr-FR": "French (France)",
"it-IT": "Italian (Italy)",
"ja-JP": "Japanese (Japan)",
"ko-KR": "Korean (Korea)",
"nl-NL": "Dutch (Netherlands)",
"pl-PL": "Polish (Poland)",
"cs-CZ": "Czech (Czech Republic)",
"tr-TR": "Turkish (Turkey)",
"id-ID": "Indonesian (Indonesia)",
"pt-PT": "Portuguese (Portugal)",
}
gemini_tts_preview_samples: dict[str, str] = {
"en": "This is a preview of the audio description voice.",
@ -165,7 +199,24 @@ class Settings(BaseSettings):
"sv": "Det här är en förhandsgranskning av ljudbeskrivningsrösten.",
"es-419": "Esta es una vista previa de la voz de audiodescripción.",
"pt-BR": "Esta é uma prévia da voz da audiodescrição.",
"fr-CA": "Ceci est un aperçu de la voix de l'audiodescription."
"fr-CA": "Ceci est un aperçu de la voix de l'audiodescription.",
# Explicit region variants
"de-DE": "Dies ist eine Vorschau der Audiodeskriptionsstimme.",
"en-US": "This is a preview of the audio description voice.",
"en-GB": "This is a preview of the audio description voice.",
"en-CA": "This is a preview of the audio description voice.",
"es-ES": "Esta es una vista previa de la voz de audiodescripción.",
"es-MX": "Esta es una vista previa de la voz de audiodescripción.",
"fr-FR": "Ceci est un aperçu de la voix de l'audiodescription.",
"it-IT": "Questa è un'anteprima della voce dell'audiodescrizione.",
"ja-JP": "これは音声解説の声のプレビューです。",
"ko-KR": "이것은 오디오 설명 음성의 미리보기입니다.",
"nl-NL": "Dit is een voorbeeld van de audiodescriptiestem.",
"pl-PL": "To jest podgląd głosu audiodeskrypcji.",
"cs-CZ": "Toto je náhled hlasu zvukového popisu.",
"tr-TR": "Bu, sesli betimleme sesinin bir önizlemesidir.",
"id-ID": "Ini adalah pratinjau suara deskripsi audio.",
"pt-PT": "Esta é uma pré-visualização da voz da audiodescrição.",
}
# Gemini TTS Model Options

245
backend/app/lib/locales.py Normal file
View file

@ -0,0 +1,245 @@
"""
Central locale registry.
Provides a single source of truth for BCP-47 codes, display names,
and Gemini-friendly labels used throughout the translation/TTS pipeline.
Convention: BCP-47 with hyphen separator (fr-FR, en-GB, pt-BR).
xlsx underscore format (fr_fr, en_gb) is normalized at import time.
Bare language-only codes (fr, en) remain valid for legacy compat.
"""
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class Locale:
    """Immutable registry entry describing one locale.

    Instances are created positionally in the registry below, so the field
    order here is part of the constructor contract.
    """

    code: str  # canonical BCP-47 (e.g. "fr-FR")
    display_name: str  # human-readable (e.g. "French (France)")
    gemini_label: str  # what to pass to Gemini prompts (e.g. "French (France)")
    tts_lang: str  # BCP-47 for TTS API (may differ, e.g. es-MX → es-US)
    preview_sample: str  # sample sentence for TTS preview
# Master locale registry. Bare language codes (legacy) + explicit region variants.
# Keyed by canonical code. Each entry is
# Locale(code, display_name, gemini_label, tts_lang, preview_sample).
# Preview samples are spoken by TTS, so they must carry the language's real
# diacritics: the bare-language entries use the same spelling as their
# region variants.
_REGISTRY: dict[str, Locale] = {loc.code: loc for loc in [
    # ── English ──────────────────────────────────────────────────────────────
    Locale("en", "English", "English", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-US", "English (US)", "English (United States)", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-GB", "English (UK)", "English (United Kingdom)", "en-GB",
           "This is a preview of the audio description voice."),
    Locale("en-CA", "English (Canada)", "English (Canada)", "en-CA",
           "This is a preview of the audio description voice."),
    # ── Spanish ──────────────────────────────────────────────────────────────
    Locale("es", "Spanish", "Spanish", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-ES", "Spanish (Spain)", "Spanish (Spain)", "es-ES",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-MX", "Spanish (Mexico)", "Spanish (Mexico)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-419", "Spanish (Latin America)", "Spanish (Latin America)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    # ── French ───────────────────────────────────────────────────────────────
    Locale("fr", "French", "French", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-FR", "French (France)", "French (France)", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-CA", "French (Canada)", "French (Canada)", "fr-CA",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    # ── German ───────────────────────────────────────────────────────────────
    Locale("de", "German", "German", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    Locale("de-DE", "German (Germany)", "German (Germany)", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    # ── Italian ──────────────────────────────────────────────────────────────
    Locale("it", "Italian", "Italian", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    Locale("it-IT", "Italian (Italy)", "Italian (Italy)", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    # ── Portuguese ───────────────────────────────────────────────────────────
    Locale("pt", "Portuguese", "Portuguese", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-BR", "Portuguese (Brazil)", "Portuguese (Brazil)", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-PT", "Portuguese (Portugal)", "Portuguese (Portugal)", "pt-PT",
           "Esta é uma pré-visualização da voz da audiodescrição."),
    # ── Japanese ─────────────────────────────────────────────────────────────
    Locale("ja", "Japanese", "Japanese", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    Locale("ja-JP", "Japanese (Japan)", "Japanese (Japan)", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    # ── Korean ───────────────────────────────────────────────────────────────
    Locale("ko", "Korean", "Korean", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    Locale("ko-KR", "Korean (Korea)", "Korean (South Korea)", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    # ── Arabic ───────────────────────────────────────────────────────────────
    Locale("ar", "Arabic", "Arabic", "ar-EG",
           "هذه معاينة لصوت الوصف الصوتي."),
    # ── Hindi ────────────────────────────────────────────────────────────────
    Locale("hi", "Hindi", "Hindi", "hi-IN",
           "यह ऑडियो विवरण आवाज का पूर्वावलोकन है।"),
    # ── Indonesian ───────────────────────────────────────────────────────────
    Locale("id", "Indonesian", "Indonesian", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    Locale("id-ID", "Indonesian (Indonesia)", "Indonesian (Indonesia)", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    # ── Dutch ────────────────────────────────────────────────────────────────
    Locale("nl", "Dutch", "Dutch", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    Locale("nl-NL", "Dutch (Netherlands)", "Dutch (Netherlands)", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    # ── Polish ───────────────────────────────────────────────────────────────
    Locale("pl", "Polish", "Polish", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    Locale("pl-PL", "Polish (Poland)", "Polish (Poland)", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    # ── Russian ──────────────────────────────────────────────────────────────
    Locale("ru", "Russian", "Russian", "ru-RU",
           "Это предварительный просмотр голоса аудиоописания."),
    # ── Thai ─────────────────────────────────────────────────────────────────
    Locale("th", "Thai", "Thai", "th-TH",
           "นี่คือตัวอย่างเสียงบรรยายภาพ"),
    # ── Turkish ──────────────────────────────────────────────────────────────
    Locale("tr", "Turkish", "Turkish", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    Locale("tr-TR", "Turkish (Turkey)", "Turkish (Turkey)", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    # ── Vietnamese ───────────────────────────────────────────────────────────
    Locale("vi", "Vietnamese", "Vietnamese", "vi-VN",
           "Đây là bản xem trước giọng mô tả âm thanh."),
    # ── Romanian ─────────────────────────────────────────────────────────────
    Locale("ro", "Romanian", "Romanian", "ro-RO",
           "Aceasta este o previzualizare a vocii descrierii audio."),
    # ── Ukrainian ────────────────────────────────────────────────────────────
    Locale("uk", "Ukrainian", "Ukrainian", "uk-UA",
           "Це попередній перегляд голосу аудіоопису."),
    # ── Bengali ──────────────────────────────────────────────────────────────
    Locale("bn", "Bengali", "Bengali", "bn-BD",
           "এটি অডিও বর্ণনা ভয়েসের একটি প্রিভিউ।"),
    # ── Marathi ──────────────────────────────────────────────────────────────
    Locale("mr", "Marathi", "Marathi", "mr-IN",
           "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे."),
    # ── Tamil ────────────────────────────────────────────────────────────────
    Locale("ta", "Tamil", "Tamil", "ta-IN",
           "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்."),
    # ── Telugu ───────────────────────────────────────────────────────────────
    Locale("te", "Telugu", "Telugu", "te-IN",
           "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ."),
    # ── Chinese ──────────────────────────────────────────────────────────────
    Locale("zh", "Chinese", "Chinese (Simplified)", "zh-CN",
           "这是音频描述语音的预览。"),
    # ── Czech ────────────────────────────────────────────────────────────────
    Locale("cs", "Czech", "Czech", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    Locale("cs-CZ", "Czech (Czech Republic)", "Czech (Czech Republic)", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    # ── Danish ───────────────────────────────────────────────────────────────
    Locale("da", "Danish", "Danish", "da-DK",
           "Dette er en forhåndsvisning af lydbeskrivelsesstemmen."),
    # ── Finnish ──────────────────────────────────────────────────────────────
    Locale("fi", "Finnish", "Finnish", "fi-FI",
           "Tämä on äänikuvauksen äänen esikatselu."),
    # ── Hungarian ────────────────────────────────────────────────────────────
    Locale("hu", "Hungarian", "Hungarian", "hu-HU",
           "Ez a hangos leírás hangjának előnézete."),
    # ── Norwegian ────────────────────────────────────────────────────────────
    Locale("no", "Norwegian", "Norwegian", "nb-NO",
           "Dette er en forhåndsvisning av lydbeskrivelsesstemmen."),
    # ── Slovak ───────────────────────────────────────────────────────────────
    Locale("sk", "Slovak", "Slovak", "sk-SK",
           "Toto je náhľad hlasu zvukového popisu."),
    # ── Swedish ──────────────────────────────────────────────────────────────
    Locale("sv", "Swedish", "Swedish", "sv-SE",
           "Det här är en förhandsgranskning av ljudbeskrivningsrösten."),
]}
# xlsx uses underscores; normalize to BCP-47 hyphen form
# Lower-case underscore spelling (as used in xlsx column headers) mapped to the
# canonical registry code, for every registered code that has a region part.
_XLSX_ALIASES: dict[str, str] = {
    **{
        canonical.lower().replace("-", "_"): canonical
        for canonical in _REGISTRY
        if "-" in canonical
    },
    # Edge case: the Indonesian column header is just "id" (no region).
    "id": "id",
}
def normalize_code(code: str) -> str:
    """
    Normalize an arbitrary locale code to the canonical BCP-47 form used in this registry.

    Handles:
    - xlsx underscore form: "fr_fr" -> "fr-FR"
    - Bare language code:   "FR"    -> "fr" (lower-cased, legacy compat)
    - Hyphen form:          "fr-fr" -> "fr-FR"

    Surrounding whitespace is ignored. Codes that are not in the registry are
    still canonicalised (language lower-case, remainder upper-case), so that
    callers can fall back to the parent language via ``code.split("-")[0]``.
    Empty input is returned unchanged.
    """
    if not code:
        return code

    cleaned = code.strip()
    lowered = cleaned.lower()

    if "_" in lowered:
        # Known xlsx column headers map straight to their registry code.
        alias = _XLSX_ALIASES.get(lowered)
        if alias is not None:
            return alias
        # Very short underscore strings are not locale codes; leave them alone.
        if len(lowered) <= 3:
            return cleaned

    unified = cleaned.replace("_", "-")
    if "-" not in unified:
        # Bare language code — lower-cased passthrough (legacy).
        return lowered

    language, _, remainder = unified.partition("-")
    return f"{language.lower()}-{remainder.upper()}"
def get(code: str) -> Locale | None:
    """Look up a locale, falling back to its bare language code.

    Returns None when neither the normalized code nor its language part is
    registered.
    """
    canonical = normalize_code(code)
    exact = _REGISTRY.get(canonical)
    if exact:
        return exact
    # Fall back to the part before the first hyphen, e.g. "fr-BE" -> "fr".
    language_only = canonical.split("-")[0]
    return _REGISTRY.get(language_only)
def get_display_name(code: str) -> str:
    """Return the display name for a code, e.g. 'French (Canada)'.

    Unknown codes are returned unchanged.
    """
    entry = get(code)
    if entry:
        return entry.display_name
    return code
def get_gemini_label(code: str) -> str:
    """
    Return the language label to embed in Gemini prompts, e.g. 'French (Canada)'.

    A human-readable name is used instead of the bare BCP-47 code because
    models follow instructions phrased with language names more reliably.
    Unknown codes are returned unchanged.
    """
    entry = get(code)
    if entry:
        return entry.gemini_label
    return code
def get_tts_lang(code: str) -> str:
    """Return the language code to send to the TTS API.

    This may differ from the canonical code (e.g. es-MX → es-US). Unknown
    codes are returned unchanged.
    """
    entry = get(code)
    if entry:
        return entry.tts_lang
    return code
def get_preview_sample(code: str) -> str:
    """Return a TTS preview sentence in the language of ``code``.

    Resolution order: the locale itself, then the language part of a
    hyphenated code, then the English sentence.
    """
    resolved = get(code)
    if not resolved and "-" in code:
        # Retry with only the part before the first hyphen.
        resolved = get(code.split("-")[0])
    if resolved:
        return resolved.preview_sample
    return "This is a preview of the audio description voice."
def all_codes() -> list[str]:
    """Return every registered locale code in sorted order."""
    codes = list(_REGISTRY)
    codes.sort()
    return codes
def all_display_map() -> dict[str, str]:
    """Return a ``{code: display_name}`` mapping covering the whole registry."""
    display_names: dict[str, str] = {}
    for code, entry in _REGISTRY.items():
        display_names[code] = entry.display_name
    return display_names

View file

@ -23,6 +23,7 @@ from .api.v1.routes_tts import router as tts_router
from .api.v1.routes_websockets import router as websockets_router
from .api.v1.routes_vtt_versions import router as vtt_versions_router
from .api.v1.routes_language_qc import router as language_qc_router
from .api.v1.routes_glossaries import router as glossaries_router
from .services.websocket import connection_manager
from .core.config import settings
from .core.secrets_config import initialize_config
@ -273,6 +274,7 @@ app.include_router(jobs_router, prefix="/api/v1")
app.include_router(review_notes_router, prefix="/api/v1")
app.include_router(vtt_versions_router, prefix="/api/v1")
app.include_router(language_qc_router, prefix="/api/v1")
app.include_router(glossaries_router, prefix="/api/v1")
app.include_router(tts_router, prefix="/api/v1")
app.include_router(admin_router, prefix="/api/v1")
app.include_router(websockets_router, prefix="/api/v1")

View file

@ -61,6 +61,12 @@ class AuditAction(str, Enum):
ADMIN_DATA_EXPORT = "admin.data.export"
ADMIN_AUDIT_ACCESS = "admin.audit.access"
# Glossary management
GLOSSARY_UPLOAD = "glossary.upload"
GLOSSARY_VERSION_UPLOAD = "glossary.version.upload"
GLOSSARY_ACTIVATE = "glossary.activate"
GLOSSARY_ARCHIVE = "glossary.archive"
# Security events
RATE_LIMIT_EXCEEDED = "security.rate_limit.exceeded"
VALIDATION_FAILURE = "security.validation.failure"

View file

@ -0,0 +1,139 @@
from __future__ import annotations
from datetime import datetime
from enum import StrEnum
from pydantic import BaseModel, Field
class GlossarySource(StrEnum):
    """Where a glossary's content came from."""

    XLSX_UPLOAD = "xlsx_upload"
    FRAZE_API = "fraze_api"  # reserved for future FRAZE integration
class GlossaryStatus(StrEnum):
    """Lifecycle state of a glossary."""

    ACTIVE = "active"
    ARCHIVED = "archived"
class EmbeddingStatus(StrEnum):
    """Progress of the embedding step for a glossary version."""

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    DONE = "done"
    FAILED = "failed"
class Glossary(BaseModel):
    """A client-owned glossary; its terms live in versions referenced by id."""

    # Populated from the document's "_id" key via the alias.
    id: str | None = Field(None, alias="_id")
    client_id: str
    name: str
    description: str | None = None
    source_locale: str  # BCP-47 source column, e.g. "en-GB"
    source: GlossarySource = GlossarySource.XLSX_UPLOAD
    status: GlossaryStatus = GlossaryStatus.ACTIVE
    current_version_id: str | None = None
    # Naive UTC timestamp taken when the model instance is created.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    created_by: str  # user_id
    # populate_by_name lets callers pass either "id" or the "_id" alias.
    model_config = {"populate_by_name": True, "arbitrary_types_allowed": True}
class GlossaryVersion(BaseModel):
    """One numbered version of a glossary, with term and embedding counters."""

    # Populated from the document's "_id" key via the alias.
    id: str | None = Field(None, alias="_id")
    glossary_id: str
    version_number: int
    source_xlsx_gcs_path: str | None = None  # GCS path to original file
    term_count: int = 0
    embedded_count: int = 0
    embedding_status: EmbeddingStatus = EmbeddingStatus.PENDING
    # Naive UTC timestamp taken when the model instance is created.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    created_by: str
    change_note: str | None = None
    # populate_by_name lets callers pass either "id" or the "_id" alias.
    model_config = {"populate_by_name": True}
class GlossaryTerm(BaseModel):
    """One source term with its per-locale translations."""

    # Populated from the document's "_id" key via the alias.
    id: str | None = Field(None, alias="_id")
    glossary_id: str
    version_id: str
    cid: str | None = None  # 3M Content ID from xlsx
    tid: str | None = None  # 3M Term ID from xlsx
    source_term: str  # canonical source text (whitespace-normalised)
    source_term_lower: str  # lowercase for case-insensitive index
    translations: dict[str, str] = {}  # {locale_code: translated_text}
    embedding: list[float] | None = None  # 768-dim Gemini embedding
    # populate_by_name lets callers pass either "id" or the "_id" alias.
    model_config = {"populate_by_name": True}
# ── Schema models (API request/response) ──────────────────────────────────────
class GlossaryCreate(BaseModel):
    """Fields supplied when creating a glossary."""

    # NOTE(review): not referenced by the visible upload route, which reads
    # these values from form fields — confirm where this schema is used.
    name: str
    description: str | None = None
    source_locale: str
    change_note: str | None = None
class GlossaryVersionCreate(BaseModel):
    """Fields supplied when adding a version to an existing glossary."""

    # NOTE(review): not referenced by the visible version-upload route —
    # confirm where this schema is used.
    source_locale: str
    change_note: str | None = None
class GlossaryResponse(BaseModel):
    """API representation of a glossary (without its versions)."""

    id: str  # always a string here, unlike the optional id on Glossary
    client_id: str
    name: str
    description: str | None = None
    source_locale: str
    source: GlossarySource
    status: GlossaryStatus
    current_version_id: str | None = None
    created_at: datetime
    created_by: str
class GlossaryVersionResponse(BaseModel):
    """API representation of a glossary version."""

    id: str  # always a string here, unlike the optional id on GlossaryVersion
    glossary_id: str
    version_number: int
    term_count: int
    embedded_count: int
    embedding_status: EmbeddingStatus
    created_at: datetime
    created_by: str
    change_note: str | None = None
class GlossaryDetailResponse(GlossaryResponse):
    """A glossary response extended with the list of its versions."""

    versions: list[GlossaryVersionResponse] = []
class GlossaryTermPreview(BaseModel):
    """Subset of GlossaryTerm for UI previews."""

    source_term: str
    translations: dict[str, str]  # same {locale_code: text} shape as GlossaryTerm
class MatchedTerm(BaseModel):
    """A term matched against VTT source text, with the target-locale translation."""

    source_term: str
    target_translation: str
    match_kind: str  # "exact" | "vector"
    score: float  # 1.0 for exact, cosine similarity for vector
def glossary_from_doc(doc: dict) -> Glossary:
    """Build a ``Glossary`` from a stored document.

    The input mapping is not modified; an ``_id`` value, when present, is
    converted to ``str`` before validation.
    """
    payload = {
        key: (str(value) if key == "_id" else value)
        for key, value in doc.items()
    }
    return Glossary.model_validate(payload)
def glossary_version_from_doc(doc: dict) -> GlossaryVersion:
    """Build a ``GlossaryVersion`` from a stored document.

    The input mapping is not modified; an ``_id`` value, when present, is
    converted to ``str`` before validation.
    """
    payload = {
        key: (str(value) if key == "_id" else value)
        for key, value in doc.items()
    }
    return GlossaryVersion.model_validate(payload)

View file

@ -47,6 +47,8 @@ BRAND NAMES AND PRODUCTS:
- If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification
- If a product is NOT on the list or is completely unclear, use a generic descriptor — do not invent brand names
{GLOSSARY}
CAPTION FORMATTING (DCMP standard):
- Maximum TWO lines per caption. Never use three or more lines.
- Each line should be no longer than ~37 characters where possible (42 absolute max)

View file

@ -51,6 +51,8 @@ BRAND NAMES AND PRODUCTS:
- If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification
- If a product is NOT on the list or is completely unclear, use a generic descriptor — do not invent brand names
{GLOSSARY}
CAPTION FORMATTING (DCMP standard):
- Maximum TWO lines per caption. Never use three or more lines.
- Each line should be no longer than ~37 characters where possible (42 absolute max)

View file

@ -7,6 +7,8 @@ Rewrite the following English captions and audio descriptions into {TARGET_LANGU
- timing boundaries (same cue timestamps),
- line lengths friendly for readability (~32–40 chars).
{GLOSSARY}
Input:
- captions_vtt_en: <VTT text>
- ad_vtt_en: <VTT text>

View file

@ -0,0 +1,72 @@
"""
Embedding service backed by Gemini text-embedding-004.
Provides batch embedding with retry/backoff for use in glossary ingestion.
Batch size: 100 texts per API call (API limit is 2048 but we keep it conservative
for memory and retry ergonomics with large glossaries).
"""
from __future__ import annotations
import asyncio
from collections.abc import Sequence
from google import genai
from google.genai import types as genai_types
from ..core.config import settings
from ..core.logging import get_logger
# Module-level logger named after this module.
logger = get_logger(__name__)
# Model name passed to every embed_content call.
_EMBED_MODEL = "text-embedding-004"
# Number of texts sent per embed_content call.
_BATCH_SIZE = 100
# Total attempts per batch before the last error is re-raised.
_MAX_RETRIES = 3
# Seconds slept after the first failed attempt; doubled after each further failure.
_INITIAL_BACKOFF = 2.0
class EmbeddingService:
    """Thin wrapper around the Gemini embedding endpoint with batching and retries."""

    def __init__(self) -> None:
        self._client = genai.Client(api_key=settings.gemini_api_key)

    async def embed_texts(self, texts: Sequence[str]) -> list[list[float]]:
        """
        Embed ``texts`` and return one vector per text, in input order.

        The input is sent in consecutive slices of ``_BATCH_SIZE`` texts; each
        slice is retried independently. An empty input yields an empty list.
        """
        vectors: list[list[float]] = []
        total = len(texts)
        offset = 0
        while offset < total:
            chunk = list(texts[offset: offset + _BATCH_SIZE])
            vectors.extend(await self._embed_batch_with_retry(chunk))
            offset += _BATCH_SIZE
        return vectors

    async def embed_text(self, text: str) -> list[float]:
        """Embed a single text and return its vector."""
        return (await self.embed_texts([text]))[0]

    async def _embed_batch_with_retry(self, texts: list[str]) -> list[list[float]]:
        """Embed one batch, retrying failed calls with a doubling delay.

        Any exception counts as a failure. After ``_MAX_RETRIES`` failed
        attempts the last exception is re-raised.
        """
        delay = _INITIAL_BACKOFF
        attempt = 0
        while attempt < _MAX_RETRIES:
            attempt += 1
            try:
                # The SDK call is synchronous, so it runs in a worker thread.
                response = await asyncio.to_thread(
                    self._client.models.embed_content,
                    model=_EMBED_MODEL,
                    contents=texts,
                    config=genai_types.EmbedContentConfig(
                        task_type="RETRIEVAL_DOCUMENT",
                    ),
                )
                return [list(item.values) for item in response.embeddings]
            except Exception as exc:
                if attempt == _MAX_RETRIES:
                    logger.error(f"Embedding batch failed after {_MAX_RETRIES} attempts: {exc}")
                    raise
                logger.warning(f"Embedding attempt {attempt} failed, retrying in {delay}s: {exc}")
                await asyncio.sleep(delay)
                delay *= 2
        raise RuntimeError("unreachable")  # makes type-checker happy
embedding_service = EmbeddingService()

View file

@ -8,6 +8,7 @@ import google.genai as genai
from ..core.config import settings
from ..core.logging import get_logger
from ..lib import locales as locale_lib
logger = get_logger(__name__)
@ -106,6 +107,12 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
- Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm)
- Only add sound effect cues where they add meaningful context; do not annotate every minor sound"""
def _build_glossary_block(self, glossary_block: Optional[str]) -> str:
"""Return the pre-built glossary block (from glossary_service.build_glossary_prompt_block), or empty string."""
if glossary_block and glossary_block.strip():
return glossary_block.strip()
return ""
def _build_brand_context_block(self, brand_context: Optional[str]) -> str:
"""Build the brand context instruction block for injection into prompts."""
if brand_context and brand_context.strip():
@ -118,7 +125,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
)
return "No specific brand names have been provided for this video."
async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False, _cost_ctx: Optional[dict] = None) -> dict[str, Any]:
async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False, glossary_block: Optional[str] = None, _cost_ctx: Optional[dict] = None) -> dict[str, Any]:
"""
Extract captions and audio descriptions from video using Gemini 2.0
Returns structured JSON with transcript, captions VTT, and audio description VTT
@ -127,6 +134,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
prompt = (
prompt_template
.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
.replace("{GLOSSARY}", self._build_glossary_block(glossary_block))
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
)
@ -320,6 +328,7 @@ Fix the JSON and return it:
target_language: str,
brand_context: Optional[str] = None,
sdh_requested: bool = False,
glossary_block: Optional[str] = None,
_cost_ctx: Optional[dict] = None,
) -> dict[str, Any]:
"""
@ -343,8 +352,9 @@ Fix the JSON and return it:
prompt_template = self._load_prompt("gemini_ingestion_targeted.md")
prompt = (
prompt_template
.replace("{TARGET_LANGUAGE}", target_language)
.replace("{TARGET_LANGUAGE}", locale_lib.get_gemini_label(target_language))
.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
.replace("{GLOSSARY}", self._build_glossary_block(glossary_block))
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
)
@ -756,6 +766,7 @@ Fix the JSON and return it:
ad_vtt: str,
target_language: str,
brief: Optional[str] = None,
glossary_block: Optional[str] = None,
_cost_ctx: Optional[dict] = None,
) -> dict[str, str]:
"""
@ -765,7 +776,8 @@ Fix the JSON and return it:
# Format prompt with actual content
prompt = prompt_template.format(
TARGET_LANGUAGE=target_language
TARGET_LANGUAGE=locale_lib.get_gemini_label(target_language),
GLOSSARY=self._build_glossary_block(glossary_block),
)
user_prompt = f"""
@ -817,6 +829,7 @@ JSON:
vtt_content: str,
target_language: str,
source_language: str = "en",
glossary_block: Optional[str] = None,
_cost_ctx: Optional[dict] = None,
) -> str:
"""
@ -842,14 +855,18 @@ JSON:
f"{i + 1}. {cue.text.replace(chr(10), ' ')}"
for i, cue in enumerate(source_cues)
)
prompt = f"""Translate the following {cue_count} numbered text segments from {source_language} to {target_language}.
_src_label = locale_lib.get_gemini_label(source_language)
_tgt_label = locale_lib.get_gemini_label(target_language)
_glossary_section = self._build_glossary_block(glossary_block)
_glossary_line = f"\n\n{_glossary_section}" if _glossary_section else ""
prompt = f"""Translate the following {cue_count} numbered text segments from {_src_label} to {_tgt_label}.
REQUIREMENTS:
- Return EXACTLY {cue_count} numbered lines, one translation per line
- Format: "1. translated text", "2. translated text", etc.
- Preserve speaker labels like [Speaker 1]: unchanged
- Use natural, idiomatic {target_language}
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}
- Use natural, idiomatic {_tgt_label}
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_line}
Segments to translate:
{numbered_texts}"""

View file

@ -7,6 +7,7 @@ from pydub import AudioSegment
from ..core.config import settings
from ..core.logging import get_logger
from ..lib import locales as locale_lib
logger = get_logger(__name__)
@ -166,10 +167,10 @@ class GeminiTTSService:
Generate a preview audio sample for voice selection.
Uses language-specific sample text and applies all TTS settings.
"""
# Get preview sample text for the language
sample_text = settings.gemini_tts_preview_samples.get(
language,
settings.gemini_tts_preview_samples.get("en", "This is a voice preview.")
# Get preview sample text — try settings override, then locale registry, then fallback
sample_text = (
settings.gemini_tts_preview_samples.get(language)
or locale_lib.get_preview_sample(language)
)
return await self.synthesize_text(

View file

@ -0,0 +1,736 @@
"""
Glossary service — per-client terminology management.

Responsibilities:
  parse_xlsx(bytes, source_col)     → list of (source_term, {locale: translation})
  ingest_glossary(...)              → create Glossary + GlossaryVersion + GlossaryTerms in Mongo
  activate_version(...)             → atomic swap of current_version_id
  match_terms_for_text(...)         → hybrid exact + vector retrieval for prompt injection
  build_glossary_prompt_block(...)  → formats matched terms for the Gemini prompt
"""
from __future__ import annotations
import io
import re
from collections.abc import Sequence
from dataclasses import dataclass
from datetime import datetime
from bson import ObjectId
from fastapi import UploadFile
from ..core.database import get_database
from ..core.logging import get_logger
from ..lib import locales as locale_lib
from ..models.glossary import (
EmbeddingStatus,
Glossary,
GlossaryStatus,
GlossaryTerm,
GlossaryVersion,
MatchedTerm,
glossary_from_doc,
glossary_version_from_doc,
)
logger = get_logger(__name__)
_COLL_GLOSSARIES = "glossaries"
_COLL_VERSIONS = "glossary_versions"
_COLL_TERMS = "glossary_terms"
# Maximum number of terms injected into a single Gemini prompt
_MAX_TERMS_IN_PROMPT = 50
# Atlas Vector Search index name (must exist on the collection)
_VECTOR_INDEX = "glossary_embedding_index"
_VECTOR_DIMS = 768
_VECTOR_SIMILARITY_THRESHOLD = 0.75
_VECTOR_TOP_K = 20
# ── xlsx parsing ─────────────────────────────────────────────────────────────
@dataclass
class _ParsedTerm:
cid: str | None
tid: str | None
source_term: str
translations: dict[str, str] # {normalized_locale: text}
def _cell(row: tuple, idx: int | None) -> str | None:
if idx is None or idx >= len(row):
return None
v = row[idx]
return str(v).strip() if v is not None else None
def parse_xlsx(file_bytes: bytes, source_locale_col: str) -> list[_ParsedTerm]:
"""
Parse an xlsx glossary file.
Args:
file_bytes: Raw xlsx bytes.
source_locale_col: The column header that contains the source text,
e.g. "en_gb" or "en-GB". Case-insensitive.
Returns:
List of parsed terms. Rows where the source column is empty are skipped.
"""
import openpyxl # local import — only used during ingest
wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
ws = wb.active
rows = ws.iter_rows(values_only=True)
try:
header_row = next(rows)
except StopIteration:
return []
# Normalise header names to canonical locale codes
headers: list[str | None] = []
for h in header_row:
if h is None:
headers.append(None)
continue
s = str(h).strip()
headers.append(s)
# Find column indices
src_col_name = source_locale_col.strip()
# Try exact match first, then case-insensitive
src_idx: int | None = None
for i, h in enumerate(headers):
if h and h.lower() == src_col_name.lower():
src_idx = i
break
if src_idx is None:
raise ValueError(f"Source column '{source_locale_col}' not found in xlsx. Available: {[h for h in headers if h]}")
cid_idx = next((i for i, h in enumerate(headers) if h and h.upper() == "CID"), None)
tid_idx = next((i for i, h in enumerate(headers) if h and h.upper() == "TID"), None)
# All other columns with valid locale-like names become translation columns
locale_cols: list[tuple[int, str]] = [] # [(col_index, normalized_locale_code)]
for i, h in enumerate(headers):
if h is None or i == src_idx or i == cid_idx or i == tid_idx:
continue
norm = locale_lib.normalize_code(h)
if norm:
locale_cols.append((i, norm))
terms: list[_ParsedTerm] = []
for row in rows:
if not row or all(v is None for v in row):
continue
source = _cell(row, src_idx)
if not source:
continue
translations: dict[str, str] = {}
for col_idx, locale_code in locale_cols:
val = _cell(row, col_idx)
if val:
translations[locale_code] = val
terms.append(_ParsedTerm(
cid=_cell(row, cid_idx),
tid=_cell(row, tid_idx),
source_term=source,
translations=translations,
))
wb.close()
return terms
# ── Ingest ────────────────────────────────────────────────────────────────────
async def ingest_glossary(
    client_id: str,
    name: str,
    source_locale: str,
    source_locale_col: str,
    file: UploadFile,
    user_id: str,
    description: str | None = None,
    change_note: str | None = None,
) -> tuple[Glossary, GlossaryVersion]:
    """
    Full glossary ingestion pipeline:
    1. Parse terms (before any side effect, so a bad file leaves nothing behind)
    2. Upload the original xlsx to GCS
    3. Create Glossary + GlossaryVersion + GlossaryTerm documents in Mongo
    4. Drop the client's cached active-version id
    5. Kick off background embedding task

    Returns (Glossary, GlossaryVersion) on success.

    Raises:
        ValueError: propagated from parse_xlsx when the source column is missing.
    """
    db = await get_database()

    file_bytes = await file.read()
    glossary_id = str(ObjectId())
    version_id = str(ObjectId())

    # ── Parse ──
    logger.info(f"Parsing xlsx for glossary {glossary_id}, source_col={source_locale_col}")
    parsed_terms = parse_xlsx(file_bytes, source_locale_col)
    logger.info(f"Parsed {len(parsed_terms)} terms")

    # ── Upload original xlsx to GCS (only once the file is known to be usable) ──
    gcs_path = f"glossaries/{client_id}/{glossary_id}/{version_id}/source.xlsx"
    await _upload_bytes_to_gcs(
        file_bytes,
        gcs_path,
        content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    # ── Create Glossary doc ──
    now = datetime.utcnow()
    glossary_doc = {
        "_id": ObjectId(glossary_id),
        "client_id": client_id,
        "name": name,
        "description": description,
        "source_locale": locale_lib.normalize_code(source_locale),
        "source": "xlsx_upload",
        "status": GlossaryStatus.ACTIVE.value,
        "current_version_id": version_id,
        "created_at": now,
        "created_by": user_id,
    }
    await db[_COLL_GLOSSARIES].insert_one(glossary_doc)

    # ── Create GlossaryVersion doc ──
    version_doc = {
        "_id": ObjectId(version_id),
        "glossary_id": glossary_id,
        "version_number": 1,
        "source_xlsx_gcs_path": gcs_path,
        "term_count": len(parsed_terms),
        "embedded_count": 0,
        "embedding_status": EmbeddingStatus.PENDING.value,
        "created_at": now,
        "created_by": user_id,
        "change_note": change_note,
    }
    await db[_COLL_VERSIONS].insert_one(version_doc)

    # ── Bulk insert GlossaryTerms ──
    if parsed_terms:
        term_docs = [
            {
                "_id": ObjectId(),
                "glossary_id": glossary_id,
                "version_id": version_id,
                "cid": t.cid,
                "tid": t.tid,
                "source_term": t.source_term,
                "source_term_lower": t.source_term.lower(),
                "translations": t.translations,
                "embedding": None,
            }
            for t in parsed_terms
        ]
        await db[_COLL_TERMS].insert_many(term_docs, ordered=False)

    # ── Create collection indexes (idempotent) ──
    await _ensure_indexes(db)

    # The newest ACTIVE glossary is the one _get_active_version_id resolves for
    # the client, so a previously cached version id is stale from here on.
    await _invalidate_cache(glossary_id)

    # ── Kick off embedding Celery task ──
    try:
        from ..tasks.embed_glossary import embed_glossary_version_task
        embed_glossary_version_task.delay(version_id)
        logger.info(f"Queued embedding task for version {version_id}")
    except Exception as e:
        # Best effort: the glossary remains usable for exact matching.
        logger.warning(f"Could not queue embedding task: {e}")

    glossary = glossary_from_doc(glossary_doc)
    version = glossary_version_from_doc(version_doc)
    return glossary, version
async def ingest_new_version(
    glossary_id: str,
    source_locale_col: str,
    file: UploadFile,
    user_id: str,
    change_note: str | None = None,
) -> GlossaryVersion:
    """Add a new version to an existing glossary without replacing it as active.

    The xlsx is parsed before it is uploaded to GCS, so an unusable file does
    not leave an orphaned blob behind.

    Raises:
        ValueError: if the glossary does not exist, or (from parse_xlsx) if the
            source column is missing.
    """
    db = await get_database()

    glossary_doc = await db[_COLL_GLOSSARIES].find_one({"_id": ObjectId(glossary_id)})
    if not glossary_doc:
        raise ValueError(f"Glossary {glossary_id} not found")
    client_id = glossary_doc["client_id"]

    # Validate the file first; everything below has side effects.
    file_bytes = await file.read()
    parsed_terms = parse_xlsx(file_bytes, source_locale_col)

    # Find next version number
    last_version = await db[_COLL_VERSIONS].find_one(
        {"glossary_id": glossary_id},
        sort=[("version_number", -1)],
    )
    next_version_num = (last_version["version_number"] + 1) if last_version else 1

    version_id = str(ObjectId())
    gcs_path = f"glossaries/{client_id}/{glossary_id}/{version_id}/source.xlsx"
    await _upload_bytes_to_gcs(
        file_bytes,
        gcs_path,
        content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    now = datetime.utcnow()
    version_doc = {
        "_id": ObjectId(version_id),
        "glossary_id": glossary_id,
        "version_number": next_version_num,
        "source_xlsx_gcs_path": gcs_path,
        "term_count": len(parsed_terms),
        "embedded_count": 0,
        "embedding_status": EmbeddingStatus.PENDING.value,
        "created_at": now,
        "created_by": user_id,
        "change_note": change_note,
    }
    await db[_COLL_VERSIONS].insert_one(version_doc)

    if parsed_terms:
        term_docs = [
            {
                "_id": ObjectId(),
                "glossary_id": glossary_id,
                "version_id": version_id,
                "cid": t.cid,
                "tid": t.tid,
                "source_term": t.source_term,
                "source_term_lower": t.source_term.lower(),
                "translations": t.translations,
                "embedding": None,
            }
            for t in parsed_terms
        ]
        await db[_COLL_TERMS].insert_many(term_docs, ordered=False)

    try:
        from ..tasks.embed_glossary import embed_glossary_version_task
        embed_glossary_version_task.delay(version_id)
    except Exception as e:
        # Best effort: the version is stored even if embedding cannot be queued.
        logger.warning(f"Could not queue embedding task: {e}")

    return glossary_version_from_doc(version_doc)
async def activate_version(glossary_id: str, version_id: str) -> None:
    """Set the active version of a glossary.

    The version must belong to the glossary; otherwise a version of a different
    glossary could be activated here. The pointer swap itself is a single
    ``update_one`` on the glossary document.

    Raises:
        ValueError: if the version does not belong to the glossary, or the
            glossary does not exist.
    """
    db = await get_database()

    version_doc = None
    if ObjectId.is_valid(version_id):
        version_doc = await db[_COLL_VERSIONS].find_one(
            {"_id": ObjectId(version_id), "glossary_id": glossary_id},
            {"_id": 1},
        )
    if not version_doc:
        raise ValueError(f"Version {version_id} not found for glossary {glossary_id}")

    result = await db[_COLL_GLOSSARIES].update_one(
        {"_id": ObjectId(glossary_id)},
        {"$set": {"current_version_id": version_id}},
    )
    if result.matched_count == 0:
        raise ValueError(f"Glossary {glossary_id} not found")
    # Invalidate Redis cache
    await _invalidate_cache(glossary_id)
async def archive_glossary(glossary_id: str) -> None:
    """Mark a glossary as archived and drop its client's cached active version."""
    db = await get_database()
    archived_status = {"status": GlossaryStatus.ARCHIVED.value}
    glossary_filter = {"_id": ObjectId(glossary_id)}
    await db[_COLL_GLOSSARIES].update_one(glossary_filter, {"$set": archived_status})
    await _invalidate_cache(glossary_id)
# ── Retrieval ─────────────────────────────────────────────────────────────────
async def match_terms_for_text(
    client_id: str,
    text: str,
    target_locale: str,
    top_k: int = _MAX_TERMS_IN_PROMPT,
) -> list[MatchedTerm]:
    """
    Hybrid retrieval: exact-match (Aho-Corasick) + semantic (Atlas Vector Search).

    Resolves the client's active glossary version, collects exact matches,
    then tops the list up with vector matches that were not already found.
    At most `top_k` MatchedTerm objects are returned, exact matches first.
    A failing vector search is logged and ignored.
    """
    db = await get_database()
    norm_target = locale_lib.normalize_code(target_locale)

    version_id = await _get_active_version_id(client_id)
    if not version_id:
        return []

    # ── Exact pass ──
    exact = await _exact_match(db, version_id, text, norm_target)

    # ── Vector pass (only when the exact pass left room) ──
    semantic: list[MatchedTerm] = []
    if len(exact) < top_k:
        exact_keys = {m.source_term.lower() for m in exact}
        try:
            semantic = await _vector_match(
                db, version_id, text, norm_target,
                top_k=_VECTOR_TOP_K, exclude_terms=exact_keys,
            )
        except Exception as e:
            logger.warning(f"Vector search failed (non-fatal): {e}")

    combined = exact + semantic
    if len(combined) > top_k:
        logger.info(f"glossary_terms_truncated: had {len(combined)}, capped at {top_k}")
        combined = combined[:top_k]
    return combined
async def _get_active_version_id(client_id: str) -> str | None:
    """Return the active version_id for the active glossary of a client, or None.

    Redis is consulted first and refreshed after a Mongo lookup (TTL 3600 s).
    Redis is best-effort: any cache failure is logged at debug level and the
    lookup falls through to Mongo. When a client has several ACTIVE glossaries
    the most recently created one is used.
    """
    cache_key = f"glossary:active_version:{client_id}"

    try:
        from ..core.redis import redis_client  # lazy import
        cached = await redis_client.get(cache_key)
        if cached:
            return cached.decode() if isinstance(cached, bytes) else cached
    except Exception as e:
        logger.debug(f"Glossary cache read skipped: {e}")

    db = await get_database()
    glossary_doc = await db[_COLL_GLOSSARIES].find_one(
        {"client_id": client_id, "status": GlossaryStatus.ACTIVE.value},
        sort=[("created_at", -1)],
    )
    if not glossary_doc or not glossary_doc.get("current_version_id"):
        return None

    version_id = glossary_doc["current_version_id"]
    try:
        from ..core.redis import redis_client
        await redis_client.setex(cache_key, 3600, version_id)
    except Exception as e:
        logger.debug(f"Glossary cache write skipped: {e}")
    return version_id
async def _invalidate_cache(glossary_id: str) -> None:
    """Delete the cached active-version id of the client owning `glossary_id`.

    Best effort: an unknown glossary is a no-op, and any error is logged at
    debug level instead of being raised.
    """
    try:
        db = await get_database()
        owner = await db[_COLL_GLOSSARIES].find_one({"_id": ObjectId(glossary_id)})
        if not owner:
            return
        from ..core.redis import redis_client
        await redis_client.delete(f"glossary:active_version:{owner['client_id']}")
    except Exception as e:
        logger.debug(f"Cache invalidation skipped: {e}")
async def _exact_match(
    db,
    version_id: str,
    text: str,
    target_locale: str,
) -> list[MatchedTerm]:
    """Find glossary terms that literally occur in `text`.

    All terms of the version are loaded into an Aho-Corasick automaton keyed by
    their lower-cased source term and run over the lower-cased text. A hit
    counts only when it is not directly preceded or followed by an alphanumeric
    character and a translation for `target_locale` can be resolved. Each
    source term is reported at most once, with score 1.0.
    """
    import ahocorasick  # pyahocorasick

    # Only the fields needed for matching and for building the result.
    projection = {"source_term": 1, "source_term_lower": 1, "translations": 1}
    term_docs = await db[_COLL_TERMS].find({"version_id": version_id}, projection).to_list(length=None)
    if not term_docs:
        return []

    automaton = ahocorasick.Automaton()
    for term_doc in term_docs:
        payload = (term_doc["source_term"], term_doc["translations"])
        automaton.add_word(term_doc["source_term_lower"], payload)
    automaton.make_automaton()

    haystack = text.lower()
    haystack_len = len(haystack)

    def _touches_alnum(first: int, last: int) -> bool:
        """True when the span [first, last] is glued to an alphanumeric neighbour."""
        if first > 0 and haystack[first - 1].isalnum():
            return True
        following = last + 1
        return following < haystack_len and haystack[following].isalnum()

    reported: set[str] = set()
    hits: list[MatchedTerm] = []
    for last_idx, (source_term, translations) in automaton.iter(haystack):
        if source_term in reported:
            continue
        # iter() yields the index of the last matched character.
        first_idx = last_idx - len(source_term.lower()) + 1
        if _touches_alnum(first_idx, last_idx):
            continue
        target_text = _get_translation(translations, target_locale)
        if not target_text:
            continue
        reported.add(source_term)
        hits.append(MatchedTerm(
            source_term=source_term,
            target_translation=target_text,
            match_kind="exact",
            score=1.0,
        ))
    return hits
async def _vector_match(
    db,
    version_id: str,
    text: str,
    target_locale: str,
    top_k: int = 20,
    exclude_terms: set[str] | None = None,
) -> list[MatchedTerm]:
    """Semantic search via Atlas Vector Search ($vectorSearch).

    Embeds the first 2000 characters of `text` and queries the terms of
    `version_id`. Results scoring below `_VECTOR_SIMILARITY_THRESHOLD`, terms
    listed (lower-cased) in `exclude_terms`, terms without a translation for
    `target_locale`, and repeated source terms are dropped. Blank text returns
    [] without calling the embedding service.
    """
    from ..services.embedding_service import embedding_service

    if not text or not text.strip():
        return []

    query_embedding = await embedding_service.embed_text(text[:2000])  # cap input length

    pipeline = [
        {
            "$vectorSearch": {
                "index": _VECTOR_INDEX,
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": top_k * 4,
                "limit": top_k,
                "filter": {"version_id": version_id},
            }
        },
        {
            "$project": {
                "source_term": 1,
                "translations": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    cursor = db[_COLL_TERMS].aggregate(pipeline)
    results = await cursor.to_list(length=top_k)

    # Copy so the caller's set is never mutated while de-duplicating.
    seen: set[str] = set(exclude_terms or ())
    matched: list[MatchedTerm] = []
    for doc in results:
        score = doc.get("score", 0.0)
        if score < _VECTOR_SIMILARITY_THRESHOLD:
            continue
        source_term = doc["source_term"]
        key = source_term.lower()
        if key in seen:
            continue
        target_text = _get_translation(doc["translations"], target_locale)
        if not target_text:
            continue
        seen.add(key)
        matched.append(MatchedTerm(
            source_term=source_term,
            target_translation=target_text,
            match_kind="vector",
            score=score,
        ))
    return matched
def _get_translation(translations: dict[str, str], target_locale: str) -> str | None:
"""Look up a translation with locale-fallback: fr-CA → fr-FR → fr → None."""
if not translations:
return None
if target_locale in translations:
return translations[target_locale]
# Try parent language
parent = target_locale.split("-")[0] if "-" in target_locale else None
if parent:
# Try sibling locales, e.g. fr-CA not found → try fr-FR
for code, text in translations.items():
if code.startswith(parent + "-") or code == parent:
return text
return None
# ── Prompt block ──────────────────────────────────────────────────────────────
def build_glossary_prompt_block(
matched_terms: Sequence[MatchedTerm],
target_locale: str,
) -> str:
"""
Format matched terms for injection into a Gemini prompt.
Returns an empty string if no terms were matched.
"""
if not matched_terms:
return ""
target_label = locale_lib.get_gemini_label(target_locale)
lines = [
f"## Approved {target_label} terminology",
"Use these exact translations when the source terms appear — do not deviate:",
]
for term in matched_terms:
lines.append(f'- "{term.source_term}""{term.target_translation}"')
return "\n".join(lines)
# ── Helpers ───────────────────────────────────────────────────────────────────
async def _upload_bytes_to_gcs(data: bytes, gcs_path: str, content_type: str) -> None:
    """Upload `data` to `gcs_path` in the configured bucket.

    The google-cloud-storage call is blocking, so it runs in the default
    executor rather than on the event loop.
    """
    import asyncio

    def _upload() -> None:
        from google.cloud import storage as gcs_storage
        from ..core.config import settings

        client = gcs_storage.Client(project=settings.gcp_project_id)
        bucket = client.bucket(settings.gcs_bucket)
        blob = bucket.blob(gcs_path)
        blob.content_type = content_type
        blob.upload_from_string(data, content_type=content_type)

    # get_running_loop() is the supported way to reach the loop from inside a
    # coroutine; get_event_loop() is deprecated for this use.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _upload)
async def _ensure_indexes(db) -> None:
    """Create the Mongo indexes used by the glossary queries.

    Indexes are created one after another; the first failure stops the
    sequence and is logged at debug level instead of being raised.
    """
    index_specs = (
        (_COLL_GLOSSARIES, [("client_id", 1), ("status", 1)]),
        (_COLL_VERSIONS, [("glossary_id", 1), ("version_number", -1)]),
        (_COLL_TERMS, [("version_id", 1), ("source_term_lower", 1)]),
        (_COLL_TERMS, [("glossary_id", 1)]),
    )
    try:
        for collection_name, keys in index_specs:
            await db[collection_name].create_index(keys)
    except Exception as e:
        logger.debug(f"Index creation skipped (likely already exist): {e}")
# ── Task helpers ─────────────────────────────────────────────────────────────
async def get_glossary_block_for_job(
    job_doc: dict,
    target_locale: str,
    db,
) -> str:
    """
    Convenience function for Celery tasks: given a job document and a target locale,
    return the formatted glossary block for Gemini prompt injection (or empty string).

    Looks up:
        job_doc.project_id → db.projects → client_id → active glossary version

    The text to match against is read from job_doc["_glossary_source_text"];
    when it is empty nothing can match, so "" is returned before any lookup.

    Non-fatal: any failure returns "" so the pipeline continues without a glossary.
    """
    try:
        # Cheapest check first: avoids the project and glossary lookups
        # whenever there is no source text to match against.
        source_text = job_doc.get("_glossary_source_text", "")
        if not source_text:
            return ""

        project_id = job_doc.get("project_id")
        if not project_id:
            return ""

        project = await db.projects.find_one({"_id": project_id})
        if not project:
            return ""

        client_id = project.get("client_id")
        if not client_id:
            return ""

        # Get active version id via our cache-backed helper (reuses Redis if available)
        active_version_id = await _get_active_version_id(client_id)
        if not active_version_id:
            return ""

        norm_target = locale_lib.normalize_code(target_locale)
        exact_matches = await _exact_match(db, active_version_id, source_text, norm_target)

        vector_matches: list[MatchedTerm] = []
        if len(exact_matches) < _MAX_TERMS_IN_PROMPT:
            already_found = {m.source_term.lower() for m in exact_matches}
            try:
                vector_matches = await _vector_match(
                    db, active_version_id, source_text, norm_target,
                    top_k=_VECTOR_TOP_K, exclude_terms=already_found,
                )
            except Exception as ve:
                logger.debug(f"Vector search skipped in task context: {ve}")

        combined = exact_matches + vector_matches
        if len(combined) > _MAX_TERMS_IN_PROMPT:
            logger.info(f"glossary_terms_truncated: capped at {_MAX_TERMS_IN_PROMPT}")
            combined = combined[:_MAX_TERMS_IN_PROMPT]

        return build_glossary_prompt_block(combined, target_locale)

    except Exception as e:
        logger.warning(f"Glossary lookup failed for job {job_doc.get('_id')} (non-fatal): {e}")
        return ""
# ── Listing helpers ───────────────────────────────────────────────────────────
async def get_glossaries_for_client(client_id: str) -> list[Glossary]:
    """Return up to 100 non-archived glossaries of a client, newest first."""
    db = await get_database()
    not_archived = {"$ne": GlossaryStatus.ARCHIVED.value}
    cursor = db[_COLL_GLOSSARIES].find(
        {"client_id": client_id, "status": not_archived},
        sort=[("created_at", -1)],
    )
    glossaries: list[Glossary] = []
    for doc in await cursor.to_list(length=100):
        glossaries.append(glossary_from_doc(doc))
    return glossaries
async def get_glossary(glossary_id: str) -> Glossary | None:
    """Fetch a single glossary by id; None when no such document exists."""
    db = await get_database()
    doc = await db[_COLL_GLOSSARIES].find_one({"_id": ObjectId(glossary_id)})
    if not doc:
        return None
    return glossary_from_doc(doc)
async def get_versions(glossary_id: str) -> list[GlossaryVersion]:
    """Return up to 50 versions of a glossary, highest version number first."""
    db = await get_database()
    newest_first = [("version_number", -1)]
    cursor = db[_COLL_VERSIONS].find({"glossary_id": glossary_id}, sort=newest_first)
    versions: list[GlossaryVersion] = []
    for doc in await cursor.to_list(length=50):
        versions.append(glossary_version_from_doc(doc))
    return versions
async def get_terms_page(
    version_id: str,
    search: str | None = None,
    page: int = 1,
    page_size: int = 50,
) -> tuple[list[GlossaryTerm], int]:
    """Returns (terms, total_count) for paginated UI preview.

    Args:
        version_id: Glossary version whose terms are listed.
        search: Optional substring, matched literally (regex-escaped) against
            the lower-cased source term.
        page: 1-based page number; values below 1 are treated as 1.
        page_size: Terms per page; values below 1 are treated as 1.
    """
    # A negative skip is rejected by Mongo and limit=0 means "no limit",
    # so clamp both before building the query.
    page = max(page, 1)
    page_size = max(page_size, 1)

    db = await get_database()
    query: dict = {"version_id": version_id}
    if search:
        query["source_term_lower"] = {"$regex": re.escape(search.lower())}

    total = await db[_COLL_TERMS].count_documents(query)
    cursor = db[_COLL_TERMS].find(
        query,
        {"_id": 1, "source_term": 1, "translations": 1},
        skip=(page - 1) * page_size,
        limit=page_size,
        # _id breaks ties so terms with equal source text keep a stable order
        # across pages.
        sort=[("source_term_lower", 1), ("_id", 1)],
    )
    docs = await cursor.to_list(length=page_size)
    terms = []
    for d in docs:
        d["_id"] = str(d["_id"])
        terms.append(GlossaryTerm.model_validate(d))
    return terms, total

View file

@ -74,19 +74,16 @@ class TTSService:
# Determine which provider to use
active_provider = provider or settings.tts_provider
# Extract simple language code for Gemini (e.g., "en-US" -> "en")
simple_lang = language_code.split("-")[0] if "-" in language_code else language_code
# Try the configured provider first, then fallback
if active_provider == "gemini" and self.gemini_available:
try:
logger.info(
f"Using Gemini TTS for language: {simple_lang}, voice: {voice_name}, "
f"Using Gemini TTS for language: {language_code}, voice: {voice_name}, "
f"model: {model}, speed: {speed}x"
)
return await gemini_tts_service.synthesize_audio_description(
ad_vtt_content,
simple_lang,
language_code,
voice_name,
model=model,
speed=speed,
@ -135,9 +132,6 @@ class TTSService:
# Determine which provider to use
active_provider = provider or settings.tts_provider
# Extract simple language code for Gemini (e.g., "en-US" -> "en")
simple_lang = language_code.split("-")[0] if "-" in language_code else language_code
# Parse VTT cues first
cues = self._parse_ad_cues(ad_vtt_content)
if not cues:
@ -169,7 +163,7 @@ class TTSService:
if active_provider == "gemini" and self.gemini_available:
audio_data = await gemini_tts_service.synthesize_text(
text, voice_name or gemini_tts_service.default_voice,
simple_lang, model=model, speed=speed, style_prompt=style_prompt
language_code, model=model, speed=speed, style_prompt=style_prompt
)
elif self.google_client:
audio_data = await self._synthesize_text_google(text, language_code, voice_name)

View file

@ -128,6 +128,7 @@ def import_task_modules():
from . import notify # noqa: E402, F401
from . import ffmpeg_operations # noqa: E402, F401
from . import whisper_transcribe # noqa: E402, F401
from . import embed_glossary # noqa: E402, F401
logger.info("Successfully imported all task modules")
except Exception as e:
logger.error(f"Error importing task modules: {e}")

View file

@ -0,0 +1,102 @@
"""
Celery task: compute and store Gemini embeddings for all terms in a glossary version.
Runs as a background job after glossary ingestion so the API response is fast.
Processes terms in batches of 100 and updates embedded_count incrementally.
"""
from __future__ import annotations
import asyncio
from bson import ObjectId
from motor.motor_asyncio import AsyncIOMotorClient
from ..core.config import settings
from ..core.logging import get_logger
from ..models.glossary import EmbeddingStatus
from . import celery_app
logger = get_logger(__name__)
_BATCH_SIZE = 100
@celery_app.task(name="embed_glossary_version", bind=True, max_retries=3)
def embed_glossary_version_task(self, version_id: str) -> dict:
    """
    Celery entry point: embed every GlossaryTerm of `version_id`.

    Runs the async worker to completion and returns its summary dict.
    Any failure is logged and handed to Celery's retry with a 60 s countdown.
    """
    try:
        return asyncio.run(_async_embed_version(version_id))
    except Exception as exc:
        logger.error(f"embed_glossary_version_task failed for {version_id}: {exc}")
        raise self.retry(exc=exc, countdown=60) from None
async def _async_embed_version(version_id: str) -> dict:
    """Embed all not-yet-embedded terms of a glossary version.

    Marks the version IN_PROGRESS, embeds the remaining terms in batches of
    `_BATCH_SIZE`, updates `embedded_count` after each batch, and finally marks
    the version DONE. On any error the version is marked FAILED and the error
    is re-raised. `embedded_count` includes terms embedded by an earlier,
    partially successful run, so a retry ends at the true total.

    Returns:
        {"version_id": ..., "total": <number of terms embedded in this run>}
    """
    from pymongo import UpdateOne

    from ..services.embedding_service import embedding_service

    # Built before the client is opened so an invalid id fails without
    # leaving a connection behind.
    version_filter = {"_id": ObjectId(version_id)}

    mongo_client = AsyncIOMotorClient(settings.mongodb_uri)
    db = mongo_client[settings.mongodb_db]

    try:
        # Mark in-progress
        await db.glossary_versions.update_one(
            version_filter,
            {"$set": {"embedding_status": EmbeddingStatus.IN_PROGRESS.value}},
        )

        # Terms embedded by a previous (failed and retried) run still count.
        already_embedded = await db.glossary_terms.count_documents(
            {"version_id": version_id, "embedding": {"$ne": None}},
        )

        # Fetch all terms without embeddings
        cursor = db.glossary_terms.find(
            {"version_id": version_id, "embedding": None},
            {"_id": 1, "source_term": 1},
        )
        terms = await cursor.to_list(length=None)
        total = len(terms)
        logger.info(f"Embedding {total} terms for version {version_id}")

        embedded_count = already_embedded
        for i in range(0, total, _BATCH_SIZE):
            batch = terms[i: i + _BATCH_SIZE]
            texts = [t["source_term"] for t in batch]
            ids = [t["_id"] for t in batch]

            embeddings = await embedding_service.embed_texts(texts)

            # strict=True: a count mismatch must fail loudly instead of
            # silently leaving terms without an embedding.
            ops = [
                UpdateOne({"_id": term_id}, {"$set": {"embedding": embedding}})
                for term_id, embedding in zip(ids, embeddings, strict=True)
            ]
            if ops:
                await db.glossary_terms.bulk_write(ops, ordered=False)

            embedded_count += len(batch)
            await db.glossary_versions.update_one(
                version_filter,
                {"$set": {"embedded_count": embedded_count}},
            )
            logger.info(f"Version {version_id}: embedded {embedded_count}/{already_embedded + total}")

        # Mark done
        await db.glossary_versions.update_one(
            version_filter,
            {"$set": {
                "embedding_status": EmbeddingStatus.DONE.value,
                "embedded_count": already_embedded + total,
            }},
        )
        logger.info(f"Embedding complete for version {version_id}: {total} terms")
        return {"version_id": version_id, "total": total}

    except Exception:
        try:
            await db.glossary_versions.update_one(
                version_filter,
                {"$set": {"embedding_status": EmbeddingStatus.FAILED.value}},
            )
        except Exception as status_exc:
            # Do not let a failing status write hide the original error.
            logger.error(f"Could not mark version {version_id} as failed: {status_exc}")
        raise
    finally:
        mongo_client.close()

View file

@ -219,6 +219,9 @@ async def _async_translate_and_synthesize(job_id: str):
# Get translation mode (default to "traditional" for backwards compatibility)
translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional")
# Glossary: lazy-loaded per target language during the loop
from ..services.glossary_service import get_glossary_block_for_job
logger.info(f"Translation mode for job {job_id}: {translation_mode}")
sdh_requested = job_doc["requested_outputs"].get("sdh_vtt", False)
@ -293,12 +296,17 @@ async def _async_translate_and_synthesize(job_id: str):
project_id=_cost_ctx["project_id"],
)
# Build glossary block from source VTT for this language
_job_for_glossary = {**job_doc, "_glossary_source_text": ""}
_glossary = await get_glossary_block_for_job(_job_for_glossary, lang, db)
async def extract_targeted():
return await gemini_service.extract_accessibility_targeted(
video_local_path,
lang,
brand_context=job_brand_context,
sdh_requested=sdh_requested,
glossary_block=_glossary,
_cost_ctx=_cost_ctx,
)
@ -382,6 +390,9 @@ async def _async_translate_and_synthesize(job_id: str):
logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)")
else:
# Combine source VTTs for glossary term matching
_source_text_for_glossary = " ".join(filter(None, [source_captions_vtt, source_ad_vtt]))
# TRADITIONAL MODE: Process languages sequentially
for language in target_languages:
logger.info(f"Processing language: {language} (from source: {source_language}, mode: {translation_mode})")
@ -392,6 +403,10 @@ async def _async_translate_and_synthesize(job_id: str):
project_id=_cost_ctx["project_id"],
)
# Lookup glossary terms for this target language
_job_for_glossary = {**job_doc, "_glossary_source_text": _source_text_for_glossary}
_glossary = await get_glossary_block_for_job(_job_for_glossary, language, db)
try:
if language in transcreation_languages:
# TRADITIONAL MODE with transcreation: cultural adaptation
@ -401,6 +416,7 @@ async def _async_translate_and_synthesize(job_id: str):
source_ad_vtt,
language,
brief="Standard accessibility content",
glossary_block=_glossary,
_cost_ctx=_cost_ctx,
)
@ -414,12 +430,14 @@ async def _async_translate_and_synthesize(job_id: str):
async def translate_captions():
return await gemini_service.translate_vtt(
source_captions_vtt, language, source_language=source_language,
glossary_block=_glossary,
_cost_ctx=_cost_ctx,
)
async def translate_ad():
return await gemini_service.translate_vtt(
source_ad_vtt, language, source_language=source_language,
glossary_block=_glossary,
_cost_ctx=_cost_ctx,
)
@ -448,6 +466,7 @@ async def _async_translate_and_synthesize(job_id: str):
async def translate_sdh():
return await gemini_service.translate_vtt(
source_sdh_vtt, language, source_language=source_language,
glossary_block=_glossary,
_cost_ctx=_cost_ctx,
)
translated_sdh = await retry_with_backoff(translate_sdh, max_retries=3)

View file

@ -42,6 +42,8 @@ python-magic = "^0.4.27"
aiohttp = "^3.12.15"
jinja2 = "^3.1.6"
audioop-lts = {version = "^0.2.2", python = ">=3.13"}
openpyxl = "^3.1.2"
pyahocorasick = "^2.1.1"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.3"

View file

@ -18,6 +18,9 @@ import { UserList } from './routes/admin/UserList';
import { UserDetail } from './routes/admin/UserDetail';
import { ClientList } from './routes/admin/ClientList';
import { ClientDetail } from './routes/admin/ClientDetail';
import { GlossaryList } from './routes/admin/glossaries/GlossaryList';
import { GlossaryUpload } from './routes/admin/glossaries/GlossaryUpload';
import { GlossaryDetail } from './routes/admin/glossaries/GlossaryDetail';
import { AuditLog } from './routes/admin/AuditLog';
import { LinguistQueue } from './routes/jobs/LinguistQueue';
import { Downloads } from './routes/Downloads';
@ -149,6 +152,27 @@ function AppContent() {
</RoleGate>
</AuthenticatedRoute>
} />
<Route path="/admin/clients/:clientId/glossaries" element={
<AuthenticatedRoute>
<RoleGate allowedRoles={['admin', 'project_manager', 'linguist', 'reviewer', 'production']}>
<GlossaryList />
</RoleGate>
</AuthenticatedRoute>
} />
<Route path="/admin/clients/:clientId/glossaries/upload" element={
<AuthenticatedRoute>
<RoleGate allowedRoles={['admin', 'project_manager']}>
<GlossaryUpload />
</RoleGate>
</AuthenticatedRoute>
} />
<Route path="/admin/clients/:clientId/glossaries/:glossaryId" element={
<AuthenticatedRoute>
<RoleGate allowedRoles={['admin', 'project_manager', 'linguist', 'reviewer', 'production']}>
<GlossaryDetail />
</RoleGate>
</AuthenticatedRoute>
} />
<Route path="/admin/audit-log" element={
<AuthenticatedRoute>
<RoleGate allowedRoles={['production', 'admin']}>

View file

@ -59,6 +59,10 @@ import type {
LanguageQCMapResponse,
LanguageQCStateResponse,
QueueResponse,
Glossary,
GlossaryDetail,
GlossaryVersion,
GlossaryTermsResponse,
} from '../types/api';
const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || 'http://localhost:8000';
@ -761,6 +765,84 @@ class ApiClient {
const r = await this.client.get(`/me/language-qc-queue?${params.toString()}`);
return r.data;
}
// ── Glossary endpoints ──────────────────────────────────────────────────────
/** List the glossaries of one client (GET /clients/{clientId}/glossaries). */
async getGlossaries(clientId: string): Promise<Glossary[]> {
  const { data } = await this.client.get(`/clients/${clientId}/glossaries`);
  return data;
}
/** Fetch one glossary (GET /clients/{clientId}/glossaries/{glossaryId}). */
async getGlossary(clientId: string, glossaryId: string): Promise<GlossaryDetail> {
  const { data } = await this.client.get(`/clients/${clientId}/glossaries/${glossaryId}`);
  return data;
}
/**
 * Create a glossary by posting an xlsx file plus its metadata as multipart form data.
 *
 * `description` and `changeNote` are only sent when non-empty. The request
 * uses a 120 s timeout.
 */
async uploadGlossary(
  clientId: string,
  file: File,
  name: string,
  sourceLocale: string,
  sourceLocaleCol: string,
  description?: string,
  changeNote?: string,
): Promise<GlossaryDetail> {
  const payload = new FormData();
  payload.append('file', file);
  payload.append('name', name);
  payload.append('source_locale', sourceLocale);
  payload.append('source_locale_col', sourceLocaleCol);

  // Optional fields are skipped entirely when empty/undefined.
  const optionalFields: Array<[string, string | undefined]> = [
    ['description', description],
    ['change_note', changeNote],
  ];
  for (const [field, value] of optionalFields) {
    if (value) payload.append(field, value);
  }

  const { data } = await this.client.post(`/clients/${clientId}/glossaries`, payload, {
    headers: { 'Content-Type': 'multipart/form-data' },
    timeout: 120000,
  });
  return data;
}
/**
 * Upload a new xlsx version for an existing glossary as multipart form data.
 *
 * `changeNote` is only sent when non-empty. The request uses a 120 s timeout.
 */
async uploadGlossaryVersion(
  clientId: string,
  glossaryId: string,
  file: File,
  sourceLocaleCol: string,
  changeNote?: string,
): Promise<GlossaryVersion> {
  const payload = new FormData();
  payload.append('file', file);
  payload.append('source_locale_col', sourceLocaleCol);
  if (changeNote) {
    payload.append('change_note', changeNote);
  }

  const endpoint = `/clients/${clientId}/glossaries/${glossaryId}/versions`;
  const { data } = await this.client.post(endpoint, payload, {
    headers: { 'Content-Type': 'multipart/form-data' },
    timeout: 120000,
  });
  return data;
}
/** Activate a glossary version; the version id is posted as the form field `version_id`. */
async activateGlossaryVersion(
  clientId: string,
  glossaryId: string,
  versionId: string,
): Promise<{ status: string; active_version_id: string }> {
  const payload = new FormData();
  payload.append('version_id', versionId);
  const endpoint = `/clients/${clientId}/glossaries/${glossaryId}/activate`;
  const { data } = await this.client.post(endpoint, payload);
  return data;
}
/**
 * Fetch a page of glossary terms.
 *
 * Each option is added to the query string only when truthy, in the order
 * version_id, search, page, page_size.
 */
async getGlossaryTerms(
  clientId: string,
  glossaryId: string,
  opts?: { versionId?: string; search?: string; page?: number; pageSize?: number },
): Promise<GlossaryTermsResponse> {
  const query = new URLSearchParams();
  const candidates: Array<[string, string | number | undefined]> = [
    ['version_id', opts?.versionId],
    ['search', opts?.search],
    ['page', opts?.page],
    ['page_size', opts?.pageSize],
  ];
  for (const [key, value] of candidates) {
    if (value) query.append(key, String(value));
  }

  const { data } = await this.client.get(
    `/clients/${clientId}/glossaries/${glossaryId}/terms?${query.toString()}`,
  );
  return data;
}
/** Archive a glossary by issuing DELETE on its resource URL; the response body is ignored. */
async archiveGlossary(clientId: string, glossaryId: string): Promise<void> {
  const endpoint = `/clients/${clientId}/glossaries/${glossaryId}`;
  await this.client.delete(endpoint);
}
}
export const apiClient = new ApiClient();

View file

@ -1,5 +1,8 @@
import { useState } from 'react';
import { useParams } from 'react-router-dom';
import { useParams, Link } from 'react-router-dom';
import { useQuery } from '@tanstack/react-query';
import { apiClient } from '../../lib/api';
import type { Glossary } from '../../types/api';
import {
useClient,
useTeams, useCreateTeam, useUpdateTeam, useDeleteTeam,
@ -50,6 +53,12 @@ export function ClientDetail() {
const [pmUserId, setPmUserId] = useState('');
const { data: glossaries = [] } = useQuery<Glossary[]>({
queryKey: ['glossaries', clientId],
queryFn: () => apiClient.getGlossaries(clientId!),
enabled: !!clientId,
});
if (clientLoading) {
return <div className="container mx-auto px-4 py-8 animate-pulse"><div className="h-8 bg-gray-200 rounded w-1/3" /></div>;
}
@ -337,6 +346,53 @@ export function ClientDetail() {
</form>
</section>
{/* Glossaries */}
<section className="bg-white rounded-xl border border-gray-200 p-5">
<div className="flex items-center justify-between mb-4">
<h2 className="text-base font-semibold text-gray-800">Glossaries</h2>
<Link
to={`/admin/clients/${clientId}/glossaries`}
className="text-sm text-blue-600 hover:text-blue-700"
>
View all
</Link>
</div>
{glossaries.length === 0 ? (
<p className="text-sm text-gray-400">No glossaries yet</p>
) : (
<div className="space-y-2">
{glossaries.slice(0, 3).map(g => (
<div key={g.id} className="flex items-center justify-between py-1.5">
<Link
to={`/admin/clients/${clientId}/glossaries/${g.id}`}
className="text-sm text-gray-800 hover:text-blue-600"
>
{g.name}
</Link>
<span className={`text-xs px-2 py-0.5 rounded-full font-medium ${
g.status === 'active' ? 'bg-green-100 text-green-700' : 'bg-gray-100 text-gray-500'
}`}>
{g.status}
</span>
</div>
))}
{glossaries.length > 3 && (
<p className="text-xs text-gray-400">+{glossaries.length - 3} more</p>
)}
</div>
)}
{(isAdmin || user?.role === 'project_manager') && (
<div className="mt-4">
<Link
to={`/admin/clients/${clientId}/glossaries/upload`}
className="text-sm text-blue-600 hover:underline"
>
+ Upload glossary
</Link>
</div>
)}
</section>
{/* Rename team modal */}
{editingTeam && (
<div className="fixed inset-0 bg-black/50 flex items-center justify-center z-50">

View file

@ -0,0 +1,335 @@
import { useState, useRef } from 'react';
import { useParams, Link } from 'react-router-dom';
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query';
import { apiClient } from '../../../lib/api';
import { useToastContext } from '../../../contexts/ToastContext';
import { useAuthStore } from '../../../lib/auth';
import type { GlossaryVersion, GlossaryDetail as GlossaryDetailType } from '../../../types/api';
// Identifiers of the two tabs shown on the glossary detail page.
type Tab = 'terms' | 'versions';
/**
 * Inline text label describing the embedding state of one glossary version.
 *
 * Shows embedded/total counts when done, a pulsing percentage while in
 * progress, a failure label on error, and "Pending embedding" otherwise.
 */
function EmbeddingPill({ v }: { v: GlossaryVersion }) {
  const { embedding_status: status, embedded_count: embedded, term_count: total } = v;

  if (status === 'done') {
    return <span className="text-xs text-green-600 font-medium">Embedded ({embedded}/{total})</span>;
  }

  if (status === 'in_progress') {
    // Guard against division by zero for versions without terms.
    const pct = total > 0 ? Math.round((embedded / total) * 100) : 0;
    return (
      <span className="text-xs text-blue-600 animate-pulse font-medium">
        Embedding {pct}% ({embedded}/{total})
      </span>
    );
  }

  if (status === 'failed') {
    return <span className="text-xs text-red-500 font-medium">Embedding failed</span>;
  }

  return <span className="text-xs text-gray-400">Pending embedding</span>;
}
/**
 * Detail page for a single client glossary.
 *
 * Renders the glossary header, an optional "new version" upload panel
 * (admins and project managers only), and two tabs: a searchable, paginated
 * term table and the list of versions with an Activate action.
 *
 * The glossary query re-polls every 5 s while any version is still pending
 * or being embedded.
 */
export function GlossaryDetail() {
  const { clientId, glossaryId } = useParams<{ clientId: string; glossaryId: string }>();
  const { user } = useAuthStore();
  const toast = useToastContext();
  const qc = useQueryClient();
  const isAdmin = user?.role === 'admin';
  const isPM = user?.role === 'project_manager';

  const [tab, setTab] = useState<Tab>('terms');
  const [search, setSearch] = useState('');
  const [page, setPage] = useState(1);

  // New version upload state
  const [showVersionUpload, setShowVersionUpload] = useState(false);
  const [versionFile, setVersionFile] = useState<File | null>(null);
  const [versionSourceCol, setVersionSourceCol] = useState('');
  const [versionChangeNote, setVersionChangeNote] = useState('');
  const versionFileRef = useRef<HTMLInputElement>(null);

  const PAGE_SIZE = 50;

  const { data: glossary, isLoading, isError } = useQuery<GlossaryDetailType>({
    queryKey: ['glossary', clientId, glossaryId],
    queryFn: () => apiClient.getGlossary(clientId!, glossaryId!),
    enabled: !!clientId && !!glossaryId,
    // Poll only while at least one version has unfinished embedding work.
    refetchInterval: (q) => {
      const g = q.state.data as GlossaryDetailType | undefined;
      if (!g) return false;
      const hasInProgress = g.versions.some(v => v.embedding_status === 'in_progress' || v.embedding_status === 'pending');
      return hasInProgress ? 5000 : false;
    },
  });

  const { data: termsData, isLoading: termsLoading } = useQuery({
    queryKey: ['glossary-terms', clientId, glossaryId, search, page],
    queryFn: () => apiClient.getGlossaryTerms(clientId!, glossaryId!, { search: search || undefined, page, pageSize: PAGE_SIZE }),
    enabled: !!clientId && !!glossaryId && tab === 'terms',
    // Keep the previous page visible while the next one loads.
    placeholderData: (prev) => prev,
  });

  const activateMut = useMutation({
    mutationFn: (versionId: string) => apiClient.activateGlossaryVersion(clientId!, glossaryId!, versionId),
    onSuccess: () => {
      qc.invalidateQueries({ queryKey: ['glossary', clientId, glossaryId] });
      qc.invalidateQueries({ queryKey: ['glossaries', clientId] });
      // The terms list is requested without a version id, so it presumably
      // follows the active version; drop every cached search/page variant so
      // the Terms tab does not keep showing the previously active version.
      qc.invalidateQueries({ queryKey: ['glossary-terms', clientId, glossaryId] });
      toast.success('Version activated');
    },
    onError: () => toast.error('Failed to activate version'),
  });

  const uploadVersionMut = useMutation({
    mutationFn: () => apiClient.uploadGlossaryVersion(clientId!, glossaryId!, versionFile!, versionSourceCol.trim(), versionChangeNote.trim() || undefined),
    onSuccess: () => {
      qc.invalidateQueries({ queryKey: ['glossary', clientId, glossaryId] });
      setShowVersionUpload(false);
      setVersionFile(null);
      setVersionSourceCol('');
      setVersionChangeNote('');
      toast.success('New version uploaded — embedding in background');
    },
    onError: (err: unknown) => {
      const msg = (err as { response?: { data?: { detail?: string } } })?.response?.data?.detail ?? 'Upload failed';
      toast.error(msg);
    },
  });

  // A failed initial load would otherwise leave the skeleton on screen
  // forever. A failed background poll keeps the already-loaded data visible.
  if (isError && !glossary) {
    return (
      <div className="container mx-auto px-4 py-8 max-w-4xl space-y-2">
        <p className="text-sm text-red-500">Failed to load glossary.</p>
        <Link to={`/admin/clients/${clientId}/glossaries`} className="text-sm text-blue-600 hover:underline">
          Back to glossaries
        </Link>
      </div>
    );
  }

  if (isLoading || !glossary) {
    return (
      <div className="container mx-auto px-4 py-8 max-w-4xl animate-pulse space-y-4">
        <div className="h-8 bg-gray-200 rounded w-1/3" />
        <div className="h-48 bg-gray-200 rounded-xl" />
      </div>
    );
  }

  const activeVersion = glossary.versions.find(v => v.id === glossary.current_version_id);
  const totalPages = termsData ? Math.ceil(termsData.total / PAGE_SIZE) : 1;

  return (
    <div className="container mx-auto px-4 py-8 max-w-4xl space-y-6">
      {/* Header */}
      <div>
        <p className="text-sm text-gray-400 mb-1">
          <Link to={`/admin/clients/${clientId}/glossaries`} className="hover:text-blue-600">Glossaries</Link>
        </p>
        <div className="flex items-start justify-between gap-4">
          <div>
            <h1 className="text-2xl font-bold text-gray-900">{glossary.name}</h1>
            {glossary.description && <p className="text-sm text-gray-500 mt-0.5">{glossary.description}</p>}
            <p className="text-xs text-gray-400 mt-1">
              Source: <span className="font-mono">{glossary.source_locale}</span>
              {activeVersion && (
                <> · Active: v{activeVersion.version_number} · {activeVersion.term_count.toLocaleString()} terms</>
              )}
            </p>
            {activeVersion && (
              <div className="mt-1"><EmbeddingPill v={activeVersion} /></div>
            )}
          </div>
          {(isAdmin || isPM) && (
            <button
              onClick={() => setShowVersionUpload(!showVersionUpload)}
              className="px-3 py-1.5 border border-gray-300 text-sm rounded-lg hover:bg-gray-50 shrink-0"
            >
              + New version
            </button>
          )}
        </div>
      </div>

      {/* New version upload panel */}
      {showVersionUpload && (
        <div className="bg-blue-50 border border-blue-200 rounded-xl p-5 space-y-4">
          <h3 className="text-sm font-semibold text-blue-900">Upload new version</h3>
          <div
            className="border-2 border-dashed border-blue-300 rounded-lg p-6 text-center cursor-pointer hover:border-blue-400"
            onClick={() => versionFileRef.current?.click()}
          >
            <input
              ref={versionFileRef}
              type="file"
              accept=".xlsx"
              className="hidden"
              onChange={(e) => setVersionFile(e.target.files?.[0] ?? null)}
            />
            {versionFile
              ? <p className="text-sm font-medium text-gray-900">{versionFile.name}</p>
              : <p className="text-sm text-gray-500">Click to select .xlsx file</p>
            }
          </div>
          <div>
            <label className="block text-xs font-medium text-gray-700 mb-1">Source column header *</label>
            <input
              type="text"
              value={versionSourceCol}
              onChange={e => setVersionSourceCol(e.target.value)}
              placeholder="e.g. en_gb"
              className="w-full border border-gray-300 rounded-lg px-3 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
            />
          </div>
          <div>
            <label className="block text-xs font-medium text-gray-700 mb-1">Change note (optional)</label>
            <input
              type="text"
              value={versionChangeNote}
              onChange={e => setVersionChangeNote(e.target.value)}
              placeholder="e.g. Updated Q2 terms"
              className="w-full border border-gray-300 rounded-lg px-3 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
            />
          </div>
          <div className="flex gap-2">
            <button
              type="button"
              onClick={() => setShowVersionUpload(false)}
              className="px-3 py-1.5 border border-gray-300 text-sm rounded-lg hover:bg-gray-50"
            >
              Cancel
            </button>
            <button
              type="button"
              disabled={!versionFile || !versionSourceCol.trim() || uploadVersionMut.isPending}
              onClick={() => uploadVersionMut.mutate()}
              className="px-4 py-1.5 bg-blue-600 text-white text-sm font-medium rounded-lg hover:bg-blue-700 disabled:opacity-50"
            >
              {uploadVersionMut.isPending ? 'Uploading…' : 'Upload'}
            </button>
          </div>
        </div>
      )}

      {/* Tabs */}
      <div className="border-b border-gray-200">
        <nav className="flex gap-6">
          {(['terms', 'versions'] as Tab[]).map(t => (
            <button
              key={t}
              onClick={() => setTab(t)}
              className={`pb-3 text-sm font-medium border-b-2 transition-colors capitalize ${
                tab === t ? 'border-blue-600 text-blue-600' : 'border-transparent text-gray-500 hover:text-gray-700'
              }`}
            >
              {t}
              {t === 'versions' && <span className="ml-1 text-xs text-gray-400">({glossary.versions.length})</span>}
            </button>
          ))}
        </nav>
      </div>

      {/* Terms tab */}
      {tab === 'terms' && (
        <div className="space-y-4">
          <input
            type="text"
            value={search}
            // A new search always starts from the first page.
            onChange={e => { setSearch(e.target.value); setPage(1); }}
            placeholder="Search terms…"
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
          />
          {termsLoading ? (
            <div className="animate-pulse space-y-2">
              {[1,2,3,4,5].map(i => <div key={i} className="h-10 bg-gray-100 rounded" />)}
            </div>
          ) : termsData?.terms.length === 0 ? (
            <p className="text-sm text-gray-400 text-center py-8">
              {search ? 'No terms match your search' : 'No terms in this glossary yet'}
            </p>
          ) : (
            <>
              <div className="text-xs text-gray-400">{termsData?.total.toLocaleString()} terms total</div>
              <div className="border border-gray-200 rounded-xl overflow-hidden">
                <table className="w-full text-sm">
                  <thead className="bg-gray-50 border-b border-gray-200">
                    <tr>
                      <th className="text-left px-4 py-2.5 font-medium text-gray-600 w-1/3">Source term</th>
                      <th className="text-left px-4 py-2.5 font-medium text-gray-600">Translations</th>
                    </tr>
                  </thead>
                  <tbody className="divide-y divide-gray-100">
                    {termsData?.terms.map((term, idx) => (
                      // Prefer the stable term id as key; fall back to the row
                      // index only when the payload carries no id.
                      <tr key={term.id ?? idx} className="hover:bg-gray-50">
                        <td className="px-4 py-2.5 font-medium text-gray-900 align-top">{term.source_term}</td>
                        <td className="px-4 py-2.5 text-gray-600">
                          <div className="flex flex-wrap gap-2">
                            {Object.entries(term.translations).slice(0, 6).map(([locale, text]) => (
                              <span key={locale} className="inline-flex items-center gap-1 text-xs bg-gray-100 rounded px-1.5 py-0.5">
                                <span className="font-mono text-gray-400">{locale}</span>
                                <span className="text-gray-700 truncate max-w-[120px]">{text}</span>
                              </span>
                            ))}
                            {Object.keys(term.translations).length > 6 && (
                              <span className="text-xs text-gray-400">+{Object.keys(term.translations).length - 6} more</span>
                            )}
                          </div>
                        </td>
                      </tr>
                    ))}
                  </tbody>
                </table>
              </div>
              {totalPages > 1 && (
                <div className="flex items-center justify-between pt-2">
                  <button
                    disabled={page <= 1}
                    onClick={() => setPage(p => p - 1)}
                    className="px-3 py-1 text-sm border rounded disabled:opacity-40"
                  >
                    Previous
                  </button>
                  <span className="text-sm text-gray-500">Page {page} of {totalPages}</span>
                  <button
                    disabled={page >= totalPages}
                    onClick={() => setPage(p => p + 1)}
                    className="px-3 py-1 text-sm border rounded disabled:opacity-40"
                  >
                    Next
                  </button>
                </div>
              )}
            </>
          )}
        </div>
      )}

      {/* Versions tab */}
      {tab === 'versions' && (
        <div className="space-y-3">
          {glossary.versions.map((v) => {
            const isActive = v.id === glossary.current_version_id;
            return (
              <div
                key={v.id}
                className={`rounded-xl border p-4 flex items-start justify-between gap-4 ${
                  isActive ? 'border-blue-300 bg-blue-50' : 'border-gray-200 bg-white'
                }`}
              >
                <div>
                  <div className="flex items-center gap-2 mb-1">
                    <span className="text-sm font-semibold text-gray-900">Version {v.version_number}</span>
                    {isActive && (
                      <span className="text-xs bg-blue-600 text-white px-2 py-0.5 rounded-full font-medium">Active</span>
                    )}
                  </div>
                  <p className="text-xs text-gray-500">
                    {v.term_count.toLocaleString()} terms · uploaded {new Date(v.created_at).toLocaleDateString()}
                  </p>
                  {v.change_note && <p className="text-xs text-gray-400 mt-0.5 italic">"{v.change_note}"</p>}
                  <div className="mt-1"><EmbeddingPill v={v} /></div>
                </div>
                {(isAdmin || isPM) && !isActive && (
                  <button
                    onClick={() => {
                      if (confirm(`Activate version ${v.version_number}? AI translations will start using this version.`)) {
                        activateMut.mutate(v.id);
                      }
                    }}
                    disabled={activateMut.isPending}
                    className="text-xs px-3 py-1.5 bg-blue-600 text-white rounded-lg hover:bg-blue-700 disabled:opacity-50 shrink-0"
                  >
                    Activate
                  </button>
                )}
              </div>
            );
          })}
        </div>
      )}
    </div>
  );
}

View file

@ -0,0 +1,131 @@
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query';
import { Link, useParams } from 'react-router-dom';
import { apiClient } from '../../../lib/api';
import { useToastContext } from '../../../contexts/ToastContext';
import { useAuthStore } from '../../../lib/auth';
import type { Glossary } from '../../../types/api';
/** Tailwind colour classes for a glossary status chip: green for 'active', grey for anything else. */
function statusBadge(status: string) {
  if (status === 'active') {
    return 'bg-green-100 text-green-700';
  }
  return 'bg-gray-100 text-gray-500';
}
/**
 * Small text label for an embedding status string.
 *
 * Recognises 'done', 'in_progress' and 'failed'; any other value (including
 * the empty string) yields the "Pending embed" label.
 */
function embeddingBadge(status: string) {
  if (status === 'done') {
    return <span className="text-xs text-green-600">Embedded </span>;
  }
  if (status === 'in_progress') {
    return <span className="text-xs text-blue-600 animate-pulse">Embedding</span>;
  }
  if (status === 'failed') {
    return <span className="text-xs text-red-500">Embed failed</span>;
  }
  return <span className="text-xs text-gray-400">Pending embed</span>;
}
/**
 * List page for the glossaries of one client.
 *
 * Shows each glossary with its status chip, source locale and creation date.
 * Admins and project managers get upload links; admins can additionally
 * archive an active glossary after a confirmation prompt.
 */
export function GlossaryList() {
  const { clientId } = useParams<{ clientId: string }>();
  const { user } = useAuthStore();
  const toast = useToastContext();
  const qc = useQueryClient();
  const isAdmin = user?.role === 'admin';
  const isPM = user?.role === 'project_manager';

  const { data: glossaries = [], isLoading } = useQuery<Glossary[]>({
    queryKey: ['glossaries', clientId],
    queryFn: () => apiClient.getGlossaries(clientId!),
    enabled: !!clientId,
    // NOTE(review): once a list has been fetched this polls every 5 s
    // unconditionally. The list payload exposes no embedding status to gate
    // the polling on, so consider narrowing it if the API gains that field.
    refetchInterval: (q) => {
      const data = q.state.data;
      return Array.isArray(data) ? 5000 : false;
    },
  });

  const archiveMut = useMutation({
    mutationFn: (id: string) => apiClient.archiveGlossary(clientId!, id),
    onSuccess: () => {
      qc.invalidateQueries({ queryKey: ['glossaries', clientId] });
      toast.success('Glossary archived');
    },
    onError: () => toast.error('Failed to archive glossary'),
  });

  if (isLoading) {
    return (
      <div className="container mx-auto px-4 py-8 max-w-4xl animate-pulse space-y-3">
        {[1, 2].map(i => <div key={i} className="h-16 bg-gray-200 rounded-xl" />)}
      </div>
    );
  }

  return (
    <div className="container mx-auto px-4 py-8 max-w-4xl space-y-6">
      <div className="flex items-center justify-between">
        <div>
          <p className="text-sm text-gray-400 mb-1">
            <Link to={`/admin/clients/${clientId}`} className="hover:text-blue-600">Client</Link>
          </p>
          <h1 className="text-2xl font-bold text-gray-900">Glossaries</h1>
        </div>
        {(isAdmin || isPM) && (
          <Link
            to={`/admin/clients/${clientId}/glossaries/upload`}
            className="px-4 py-2 bg-blue-600 text-white text-sm font-medium rounded-lg hover:bg-blue-700"
          >
            + Upload glossary
          </Link>
        )}
      </div>

      {glossaries.length === 0 ? (
        <div className="text-center py-16 text-gray-400">
          <p className="text-lg mb-2">No glossaries yet</p>
          {(isAdmin || isPM) && (
            <Link to={`/admin/clients/${clientId}/glossaries/upload`} className="text-blue-500 hover:underline text-sm">
              Upload the first glossary
            </Link>
          )}
        </div>
      ) : (
        <div className="space-y-3">
          {glossaries.map((g) => (
            <div key={g.id} className="bg-white rounded-xl border border-gray-200 p-5 flex items-start justify-between gap-4">
              <div className="flex-1 min-w-0">
                <div className="flex items-center gap-2 mb-1">
                  <Link
                    to={`/admin/clients/${clientId}/glossaries/${g.id}`}
                    className="text-base font-semibold text-gray-900 hover:text-blue-600 truncate"
                  >
                    {g.name}
                  </Link>
                  <span className={`text-xs px-2 py-0.5 rounded-full font-medium ${statusBadge(g.status)}`}>
                    {g.status}
                  </span>
                </div>
                {g.description && <p className="text-sm text-gray-500 mb-1 truncate">{g.description}</p>}
                <p className="text-xs text-gray-400">
                  Source: <span className="font-mono">{g.source_locale}</span>
                  {' · '}Created {new Date(g.created_at).toLocaleDateString()}
                </p>
              </div>
              <div className="flex items-center gap-4 shrink-0">
                <div className="text-right text-xs text-gray-400">
                  {
                    // NOTE(review): the status argument is hard-coded to '',
                    // so this always renders the "Pending embed" label for any
                    // glossary that has a current version. The list payload
                    // would need to carry the real embedding status to fix it.
                    g.current_version_id ? embeddingBadge('') : null
                  }
                </div>
                {isAdmin && g.status === 'active' && (
                  <button
                    onClick={() => {
                      if (confirm('Archive this glossary? It will no longer be used for AI translations.')) {
                        archiveMut.mutate(g.id);
                      }
                    }}
                    // Block repeat clicks while an archive request is in flight,
                    // so the same glossary is not deleted twice.
                    disabled={archiveMut.isPending}
                    className="text-xs text-red-500 hover:text-red-700 disabled:opacity-50"
                  >
                    Archive
                  </button>
                )}
              </div>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}

View file

@ -0,0 +1,204 @@
import { useState, useRef } from 'react';
import { useNavigate, useParams, Link } from 'react-router-dom';
import { useMutation, useQueryClient } from '@tanstack/react-query';
import { apiClient } from '../../../lib/api';
import { useToastContext } from '../../../contexts/ToastContext';
// Locale options offered in the "Source language" dropdown of the upload form;
// `code` is the option value and `label` the human-readable name shown next to it.
const KNOWN_LOCALES: { code: string; label: string }[] = [
{ code: 'en-GB', label: 'English (UK)' },
{ code: 'en-US', label: 'English (US)' },
{ code: 'en-CA', label: 'English (Canada)' },
{ code: 'de-DE', label: 'German' },
{ code: 'fr-FR', label: 'French (France)' },
{ code: 'fr-CA', label: 'French (Canada)' },
{ code: 'es-ES', label: 'Spanish (Spain)' },
{ code: 'es-MX', label: 'Spanish (Mexico)' },
{ code: 'es-419', label: 'Spanish (Latin America)' },
{ code: 'it-IT', label: 'Italian' },
{ code: 'pt-BR', label: 'Portuguese (Brazil)' },
{ code: 'pt-PT', label: 'Portuguese (Portugal)' },
{ code: 'nl-NL', label: 'Dutch' },
{ code: 'pl-PL', label: 'Polish' },
{ code: 'cs-CZ', label: 'Czech' },
{ code: 'tr-TR', label: 'Turkish' },
{ code: 'ko-KR', label: 'Korean' },
{ code: 'ja-JP', label: 'Japanese' },
{ code: 'id-ID', label: 'Indonesian' },
];
/**
 * Upload form for creating a new glossary from an xlsx file.
 *
 * The file can be dropped onto the drop zone or picked through the hidden
 * file input. Both paths go through the same case-insensitive `.xlsx` check.
 * On success the glossary list cache is invalidated and the user is sent to
 * the new glossary's detail page.
 */
export function GlossaryUpload() {
  const { clientId } = useParams<{ clientId: string }>();
  const navigate = useNavigate();
  const toast = useToastContext();
  const qc = useQueryClient();

  const [file, setFile] = useState<File | null>(null);
  const [name, setName] = useState('');
  const [sourceLocale, setSourceLocale] = useState('en-GB');
  const [sourceLocaleCol, setSourceLocaleCol] = useState('');
  const [description, setDescription] = useState('');
  const [changeNote, setChangeNote] = useState('');
  const [dragOver, setDragOver] = useState(false);
  const fileInputRef = useRef<HTMLInputElement>(null);

  const uploadMut = useMutation({
    mutationFn: () => apiClient.uploadGlossary(
      clientId!,
      file!,
      name.trim(),
      sourceLocale,
      sourceLocaleCol.trim(),
      description.trim() || undefined,
      changeNote.trim() || undefined,
    ),
    onSuccess: (g) => {
      qc.invalidateQueries({ queryKey: ['glossaries', clientId] });
      toast.success(`Glossary "${g.name}" uploaded — embedding in background`);
      navigate(`/admin/clients/${clientId}/glossaries/${g.id}`);
    },
    onError: (err: unknown) => {
      const msg = (err as { response?: { data?: { detail?: string } } })?.response?.data?.detail ?? 'Upload failed';
      toast.error(msg);
    },
  });

  // Single acceptance gate for dropped and browsed files. The extension check
  // is case-insensitive, so "TERMS.XLSX" is accepted like "terms.xlsx".
  const selectFile = (candidate: File | undefined) => {
    if (candidate && candidate.name.toLowerCase().endsWith('.xlsx')) setFile(candidate);
    else toast.error('Only .xlsx files are accepted');
  };

  const handleDrop = (e: React.DragEvent) => {
    e.preventDefault();
    setDragOver(false);
    selectFile(e.dataTransfer.files[0]);
  };

  const canSubmit = !!file && !!name.trim() && !!sourceLocale && !!sourceLocaleCol.trim() && !uploadMut.isPending;

  return (
    <div className="container mx-auto px-4 py-8 max-w-xl space-y-6">
      <div>
        <p className="text-sm text-gray-400 mb-1">
          <Link to={`/admin/clients/${clientId}/glossaries`} className="hover:text-blue-600">Glossaries</Link>
        </p>
        <h1 className="text-2xl font-bold text-gray-900">Upload glossary</h1>
        <p className="text-sm text-gray-500 mt-1">Upload an xlsx file with terminology translations.</p>
      </div>

      {/* Drop zone */}
      <div
        className={`border-2 border-dashed rounded-xl p-8 text-center cursor-pointer transition-colors ${
          dragOver ? 'border-blue-400 bg-blue-50' : 'border-gray-300 hover:border-gray-400'
        }`}
        onClick={() => fileInputRef.current?.click()}
        onDragOver={(e) => { e.preventDefault(); setDragOver(true); }}
        onDragLeave={() => setDragOver(false)}
        onDrop={handleDrop}
      >
        <input
          ref={fileInputRef}
          type="file"
          accept=".xlsx,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
          className="hidden"
          onChange={(e) => {
            const f = e.target.files?.[0];
            // A cancelled dialog yields no file; leave the current selection alone.
            if (f) selectFile(f);
            // Clear the native value so picking the same file again (e.g. after
            // "Remove") still fires a change event.
            e.target.value = '';
          }}
        />
        {file ? (
          <div>
            <p className="text-sm font-medium text-gray-900">{file.name}</p>
            <p className="text-xs text-gray-400 mt-1">{(file.size / 1024 / 1024).toFixed(1)} MB</p>
            <button
              type="button"
              // Stop the click from bubbling to the drop zone, which would reopen the file dialog.
              onClick={(e) => { e.stopPropagation(); setFile(null); }}
              className="mt-2 text-xs text-red-500 hover:text-red-700"
            >
              Remove
            </button>
          </div>
        ) : (
          <div className="text-gray-400">
            <p className="text-sm font-medium">Drop .xlsx file here or click to browse</p>
            <p className="text-xs mt-1">Max 50 MB</p>
          </div>
        )}
      </div>

      {/* Form fields */}
      <div className="space-y-4">
        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Glossary name *</label>
          <input
            type="text"
            value={name}
            onChange={e => setName(e.target.value)}
            placeholder="e.g. 3M Master Terminology"
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
          />
        </div>
        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Source language *</label>
          <select
            value={sourceLocale}
            onChange={e => setSourceLocale(e.target.value)}
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
          >
            {KNOWN_LOCALES.map(l => (
              <option key={l.code} value={l.code}>{l.label} ({l.code})</option>
            ))}
          </select>
        </div>
        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Source column header in xlsx *</label>
          <input
            type="text"
            value={sourceLocaleCol}
            onChange={e => setSourceLocaleCol(e.target.value)}
            placeholder="e.g. en_gb or English (GB)"
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
          />
          <p className="text-xs text-gray-400 mt-1">Must exactly match the column header in the xlsx file (case-insensitive).</p>
        </div>
        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Description (optional)</label>
          <textarea
            value={description}
            onChange={e => setDescription(e.target.value)}
            rows={2}
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500 resize-none"
          />
        </div>
        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Change note (optional)</label>
          <input
            type="text"
            value={changeNote}
            onChange={e => setChangeNote(e.target.value)}
            placeholder="e.g. Initial import Q1 2026"
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
          />
        </div>
      </div>

      <div className="flex gap-3 pt-2">
        <button
          type="button"
          onClick={() => navigate(`/admin/clients/${clientId}/glossaries`)}
          className="px-4 py-2 border border-gray-300 text-sm rounded-lg hover:bg-gray-50"
        >
          Cancel
        </button>
        <button
          type="button"
          disabled={!canSubmit}
          onClick={() => uploadMut.mutate()}
          className="flex-1 px-4 py-2 bg-blue-600 text-white text-sm font-medium rounded-lg hover:bg-blue-700 disabled:opacity-50"
        >
          {uploadMut.isPending ? 'Uploading…' : 'Upload glossary'}
        </button>
      </div>
    </div>
  );
}

View file

@ -741,4 +741,60 @@ export interface AuditLogQuery {
limit?: number;
sort_by?: string;
sort_order?: number;
}
// ── Glossary ────────────────────────────────────────────────────────────────
/** Status values a glossary can carry; the UI renders 'active' in green and anything else in grey. */
export type GlossaryStatus = 'active' | 'archived';
/** State of the background embedding work for one glossary version. */
export type EmbeddingStatus = 'pending' | 'in_progress' | 'done' | 'failed';
/** One uploaded version of a glossary, including its embedding progress. */
export interface GlossaryVersion {
id: string;
glossary_id: string;
// Rendered in the UI as "Version N" / "vN".
version_number: number;
// NOTE(review): presumably the storage path of the uploaded xlsx — confirm against the backend model.
source_xlsx_gcs_path?: string;
// Number of terms in this version (rendered as "N terms").
term_count: number;
// Number of terms embedded so far; shown against term_count as progress.
embedded_count: number;
embedding_status: EmbeddingStatus;
// Date string; the UI parses it with `new Date(...)`.
created_at: string;
created_by: string;
// Optional free-text note supplied at upload time (form field `change_note`).
change_note?: string;
}
/** A client glossary as returned by the glossary list endpoint. */
export interface Glossary {
id: string;
client_id: string;
name: string;
description?: string;
// Locale code of the source language, e.g. the 'en-GB' default of the upload form.
source_locale: string;
// NOTE(review): meaning not visible from the frontend code — confirm against the backend model.
source: string;
status: GlossaryStatus;
// Id of the version the UI marks as "Active"; absent when no version is current.
current_version_id?: string;
// Date string; the UI parses it with `new Date(...)`.
created_at: string;
created_by: string;
}
/** A glossary together with all of its versions, as used by the glossary detail page. */
export interface GlossaryDetail extends Glossary {
versions: GlossaryVersion[];
}
/** One glossary entry: a source term plus its translations. */
export interface GlossaryTerm {
id?: string;
source_term: string;
// Map from a key the UI labels as the locale to the translated text.
translations: Record<string, string>;
}
/** One page of glossary terms returned by the terms endpoint. */
export interface GlossaryTermsResponse {
terms: GlossaryTerm[];
// Total term count used by the UI to compute the number of pages.
total: number;
page: number;
page_size: number;
}
/**
 * Text fields of a glossary upload; the names match the multipart form fields
 * sent alongside the xlsx file when a glossary is created.
 */
export interface GlossaryUploadRequest {
name: string;
source_locale: string;
// Header of the xlsx column that holds the source-language terms.
source_locale_col: string;
description?: string;
change_note?: string;
}