feat: per-client glossary — hybrid exact/vector retrieval + AI injection
Adds full glossary system so Gemini uses client-approved terminology
when generating subtitles and translations (critical for 3M brand names
and product codes across 16 target locales).
Backend:
- lib/locales.py: BCP-47 locale registry, normalises xlsx fr_fr → fr-FR
- models/glossary.py: Glossary / GlossaryVersion / GlossaryTerm + enums
- services/glossary_service.py: xlsx parse (openpyxl), ingest to Mongo,
hybrid retrieval (Aho-Corasick exact + Atlas Vector Search), prompt block
- services/embedding_service.py: Gemini text-embedding-004, batch 100, retry
- tasks/embed_glossary.py: Celery background task for async embedding
- api/v1/routes_glossaries.py: CRUD endpoints under /clients/{id}/glossaries
- gemini.py: _build_glossary_block(), {GLOSSARY} injection in all 4 call sites
- tts.py / gemini_tts.py: pass full locale codes (no split("-")[0] truncation)
- tasks/translate_and_synthesize.py: glossary lookup + injection per language
- prompts: {GLOSSARY} placeholder in ingestion, targeted, transcreation prompts
- pyproject.toml: +openpyxl, +pyahocorasick
Frontend:
- routes/admin/glossaries/: GlossaryList, GlossaryUpload, GlossaryDetail
- App.tsx: 3 new routes under /admin/clients/:clientId/glossaries
- ClientDetail.tsx: Glossaries card with count + quick links
- types/api.ts: Glossary, GlossaryVersion, GlossaryDetail, GlossaryTerm types
- lib/api.ts: 7 new API methods (upload, list, detail, terms, versions, activate, archive)
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
05f25a1141
commit
fa351e4d25
26 changed files with 2593 additions and 23 deletions
1
backend/.gitignore
vendored
1
backend/.gitignore
vendored
|
|
@ -23,6 +23,7 @@ eggs/
|
|||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
!app/lib/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
|
|
|
|||
288
backend/app/api/v1/routes_glossaries.py
Normal file
288
backend/app/api/v1/routes_glossaries.py
Normal file
|
|
@ -0,0 +1,288 @@
|
|||
"""
|
||||
Glossary management endpoints.
|
||||
|
||||
Access:
|
||||
- All glossary mutations (upload, activate, archive) → Admin or PM of the client
|
||||
- Glossary reads (list, detail, terms) → Admin, PM, or staff members
|
||||
|
||||
Routes are nested under /clients/{client_id}/glossaries to keep ownership clear.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
|
||||
|
||||
from ...core.dependencies import get_current_user, require_pm_for_client, require_roles
|
||||
from ...core.logging import get_logger
|
||||
from ...models.audit_log import AuditAction
|
||||
from ...models.glossary import (
|
||||
GlossaryDetailResponse,
|
||||
GlossaryResponse,
|
||||
GlossaryVersionResponse,
|
||||
)
|
||||
from ...models.user import User, UserRole
|
||||
from ...services import audit_logger as audit_svc
|
||||
from ...services import glossary_service as svc
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
router = APIRouter(
|
||||
prefix="/clients/{client_id}/glossaries",
|
||||
tags=["glossaries"],
|
||||
)
|
||||
|
||||
_ALLOWED_CONTENT_TYPES = {
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"application/vnd.ms-excel",
|
||||
}
|
||||
_MAX_FILE_SIZE_MB = 50
|
||||
|
||||
|
||||
def _require_client_staff(client_id: str):
    """Return the "admin or PM of this client" dependency.

    The ``client_id`` argument is not read here; the dependency factory is
    given the *name* of the path parameter ("client_id") instead.
    """
    pm_dependency = require_pm_for_client(client_id_param="client_id")
    return pm_dependency
|
||||
|
||||
|
||||
# ── List glossaries ───────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("", response_model=list[GlossaryResponse])
async def list_glossaries(
    client_id: str,
    current_user: User = Depends(get_current_user),
):
    """List all active glossaries for a client."""
    # Role gate first; the lookup only runs for users allowed to read.
    _assert_can_read(current_user)
    responses = []
    for glossary in await svc.get_glossaries_for_client(client_id):
        responses.append(_to_response(glossary))
    return responses
|
||||
|
||||
|
||||
# ── Upload new glossary ───────────────────────────────────────────────────────
|
||||
|
||||
@router.post("", response_model=GlossaryDetailResponse, status_code=201)
async def upload_glossary(
    client_id: str,
    file: UploadFile = File(..., description="xlsx glossary file"),
    name: str = Form(...),
    source_locale: str = Form(..., description="BCP-47 source locale, e.g. en-GB"),
    source_locale_col: str = Form(..., description="xlsx column header for the source language, e.g. en_gb"),
    description: str | None = Form(None),
    change_note: str | None = Form(None),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
    """Upload a new glossary xlsx file and associate it with a client."""
    _validate_xlsx(file)

    ingest_args = {
        "client_id": client_id,
        "name": name,
        "source_locale": source_locale,
        "source_locale_col": source_locale_col,
        "file": file,
        "user_id": str(current_user.id),
        "description": description,
        "change_note": change_note,
    }
    try:
        glossary, version = await svc.ingest_glossary(**ingest_args)
    except ValueError as exc:
        # Ingestion signals bad input with ValueError; report it as a 422.
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    audit_details = {"term_count": version.term_count, "source_locale": source_locale}
    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_UPLOAD,
        description=f"Glossary '{name}' uploaded for client {client_id}",
        user=current_user,
        resource_type="glossary",
        resource_id=glossary.id,
        details=audit_details,
    )

    # Re-read the versions so the response includes the one just created.
    all_versions = await svc.get_versions(glossary.id)
    return _to_detail_response(glossary, all_versions)
|
||||
|
||||
|
||||
# ── Get glossary detail ───────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/{glossary_id}", response_model=GlossaryDetailResponse)
async def get_glossary(
    client_id: str,
    glossary_id: str,
    current_user: User = Depends(get_current_user),
):
    """Return one glossary of the client together with its versions."""
    _assert_can_read(current_user)

    found = await svc.get_glossary(glossary_id)
    # A glossary owned by a different client is reported as missing.
    if not (found and found.client_id == client_id):
        raise HTTPException(status_code=404, detail="Glossary not found")

    return _to_detail_response(found, await svc.get_versions(glossary_id))
|
||||
|
||||
|
||||
# ── Browse terms ──────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/{glossary_id}/terms")
async def list_terms(
    client_id: str,
    glossary_id: str,
    version_id: str | None = Query(None, description="Specific version; defaults to active"),
    search: str | None = Query(None),
    page: int = Query(1, ge=1),
    page_size: int = Query(50, ge=1, le=200),
    current_user: User = Depends(get_current_user),
):
    """Return one page of terms for a glossary version.

    Without ``version_id`` the glossary's current version is used; a glossary
    with no current version yields an empty page.

    Raises:
        HTTPException: 403 if the user's role may not read glossaries; 404 if
            the glossary does not exist, belongs to another client, or the
            requested version is not a version of this glossary.
    """
    _assert_can_read(current_user)
    glossary = await svc.get_glossary(glossary_id)
    if not glossary or glossary.client_id != client_id:
        raise HTTPException(status_code=404, detail="Glossary not found")

    if version_id:
        # The version id comes from the caller: make sure it belongs to the
        # glossary in the path, otherwise any version's terms could be read
        # through a glossary the caller is allowed to see.
        known_version_ids = {str(v.id) for v in await svc.get_versions(glossary_id)}
        if version_id not in known_version_ids:
            raise HTTPException(status_code=404, detail="Glossary version not found")

    vid = version_id or glossary.current_version_id
    if not vid:
        return {"terms": [], "total": 0, "page": page, "page_size": page_size}

    terms, total = await svc.get_terms_page(vid, search=search, page=page, page_size=page_size)
    return {
        "terms": [{"source_term": t.source_term, "translations": t.translations} for t in terms],
        "total": total,
        "page": page,
        "page_size": page_size,
    }
|
||||
|
||||
|
||||
# ── Upload new version ────────────────────────────────────────────────────────
|
||||
|
||||
@router.post("/{glossary_id}/versions", response_model=GlossaryVersionResponse, status_code=201)
async def upload_version(
    client_id: str,
    glossary_id: str,
    file: UploadFile = File(...),
    source_locale_col: str = Form(...),
    change_note: str | None = Form(None),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
    """Upload a new xlsx file as a new version of an existing glossary."""
    _validate_xlsx(file)

    target = await svc.get_glossary(glossary_id)
    # A glossary owned by a different client is reported as missing.
    if not (target and target.client_id == client_id):
        raise HTTPException(status_code=404, detail="Glossary not found")

    try:
        new_version = await svc.ingest_new_version(
            glossary_id=glossary_id,
            source_locale_col=source_locale_col,
            file=file,
            user_id=str(current_user.id),
            change_note=change_note,
        )
    except ValueError as exc:
        # Ingestion signals bad input with ValueError; report it as a 422.
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_VERSION_UPLOAD,
        description=f"New glossary version uploaded for glossary {glossary_id}",
        user=current_user,
        resource_type="glossary_version",
        resource_id=new_version.id,
        details={
            "term_count": new_version.term_count,
            "version_number": new_version.version_number,
        },
    )
    return _version_to_response(new_version)
|
||||
|
||||
|
||||
# ── Activate a version ────────────────────────────────────────────────────────
|
||||
|
||||
@router.post("/{glossary_id}/activate")
async def activate_version(
    client_id: str,
    glossary_id: str,
    version_id: str = Form(...),
    current_user: User = Depends(require_roles(UserRole.ADMIN, UserRole.PROJECT_MANAGER)),
):
    """Make ``version_id`` the active version of the glossary."""
    target = await svc.get_glossary(glossary_id)
    # A glossary owned by a different client is reported as missing.
    if not (target and target.client_id == client_id):
        raise HTTPException(status_code=404, detail="Glossary not found")

    try:
        await svc.activate_version(glossary_id, version_id)
    except ValueError as exc:
        # The service rejects the request with ValueError; surfaced as a 404.
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    await audit_svc.audit_logger.log_action(
        action=AuditAction.GLOSSARY_ACTIVATE,
        description=f"Glossary version {version_id} activated",
        user=current_user,
        resource_type="glossary",
        resource_id=glossary_id,
        details={"version_id": version_id},
    )

    result = {"status": "ok", "active_version_id": version_id}
    return result
|
||||
|
||||
|
||||
# ── Archive (soft-delete) ─────────────────────────────────────────────────────
|
||||
|
||||
@router.delete("/{glossary_id}", status_code=204)
async def archive_glossary(
    client_id: str,
    glossary_id: str,
    current_user: User = Depends(require_roles(UserRole.ADMIN)),
):
    """Archive a glossary of the client and record the action in the audit log."""
    target = await svc.get_glossary(glossary_id)
    # A glossary owned by a different client is reported as missing.
    if not (target and target.client_id == client_id):
        raise HTTPException(status_code=404, detail="Glossary not found")

    await svc.archive_glossary(glossary_id)

    audit_entry = {
        "action": AuditAction.GLOSSARY_ARCHIVE,
        "description": f"Glossary {glossary_id} archived",
        "user": current_user,
        "resource_type": "glossary",
        "resource_id": glossary_id,
    }
    await audit_svc.audit_logger.log_action(**audit_entry)
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def _assert_can_read(user: User) -> None:
    """Raise a 403 unless the user's role is one that may read glossaries.

    Only the role is checked; the user's relation to a specific client is not.
    """
    readable_roles = frozenset(
        (
            UserRole.ADMIN,
            UserRole.PROJECT_MANAGER,
            UserRole.REVIEWER,
            UserRole.LINGUIST,
            UserRole.PRODUCTION,
        )
    )
    if user.role in readable_roles:
        return
    raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||||
|
||||
|
||||
def _validate_xlsx(file: UploadFile) -> None:
    """Reject uploads that do not look like an xlsx spreadsheet or are too large.

    The type check passes when the declared content type is one of
    ``_ALLOWED_CONTENT_TYPES`` or the filename ends with ``.xlsx``. The
    extension is compared case-insensitively, so ``TERMS.XLSX`` is accepted.
    When the upload reports its size, it must not exceed ``_MAX_FILE_SIZE_MB``.

    Raises:
        HTTPException: 422 for an unaccepted file type, 413 for an oversized
            file.
    """
    filename = (file.filename or "").lower()
    type_accepted = (
        file.content_type in _ALLOWED_CONTENT_TYPES or filename.endswith(".xlsx")
    )
    if not type_accepted:
        raise HTTPException(
            status_code=422,
            detail="Only .xlsx files are accepted",
        )

    # ``size`` can be absent or None (older Starlette, unknown length); the
    # limit is only enforced when a size is actually reported.
    size = getattr(file, "size", None)
    if size is not None and size > _MAX_FILE_SIZE_MB * 1024 * 1024:
        raise HTTPException(
            status_code=413,
            detail=f"File exceeds the {_MAX_FILE_SIZE_MB} MB limit",
        )
|
||||
|
||||
|
||||
def _to_response(g) -> GlossaryResponse:
    """Convert a glossary object into its API response model.

    ``id`` is stringified; every other field is copied across unchanged.
    """
    copied_fields = (
        "client_id",
        "name",
        "description",
        "source_locale",
        "source",
        "status",
        "current_version_id",
        "created_at",
        "created_by",
    )
    payload = {field: getattr(g, field) for field in copied_fields}
    return GlossaryResponse(id=str(g.id), **payload)
|
||||
|
||||
|
||||
def _version_to_response(v) -> GlossaryVersionResponse:
    """Convert a glossary version object into its API response model.

    ``id`` is stringified; every other field is copied across unchanged.
    """
    copied_fields = (
        "glossary_id",
        "version_number",
        "term_count",
        "embedded_count",
        "embedding_status",
        "created_at",
        "created_by",
        "change_note",
    )
    payload = {field: getattr(v, field) for field in copied_fields}
    return GlossaryVersionResponse(id=str(v.id), **payload)
|
||||
|
||||
|
||||
def _to_detail_response(glossary, versions) -> GlossaryDetailResponse:
    """Build the detail response: the glossary's fields plus its versions."""
    glossary_fields = _to_response(glossary).model_dump()
    version_responses = [_version_to_response(version) for version in versions]
    return GlossaryDetailResponse(**glossary_fields, versions=version_responses)
|
||||
|
|
@ -93,7 +93,24 @@ class Settings(BaseSettings):
|
|||
"sv": "sv-SE",
|
||||
"es-419": "es-US",
|
||||
"pt-BR": "pt-BR",
|
||||
"fr-CA": "fr-CA"
|
||||
"fr-CA": "fr-CA",
|
||||
# Explicit region variants (added for locale-aware glossary support)
|
||||
"de-DE": "de-DE",
|
||||
"en-US": "en-US",
|
||||
"en-GB": "en-GB",
|
||||
"en-CA": "en-CA",
|
||||
"es-ES": "es-ES",
|
||||
"es-MX": "es-US",
|
||||
"fr-FR": "fr-FR",
|
||||
"it-IT": "it-IT",
|
||||
"ja-JP": "ja-JP",
|
||||
"ko-KR": "ko-KR",
|
||||
"nl-NL": "nl-NL",
|
||||
"pl-PL": "pl-PL",
|
||||
"cs-CZ": "cs-CZ",
|
||||
"tr-TR": "tr-TR",
|
||||
"id-ID": "id-ID",
|
||||
"pt-PT": "pt-PT",
|
||||
}
|
||||
gemini_tts_language_names: dict[str, str] = {
|
||||
"en": "English",
|
||||
|
|
@ -129,7 +146,24 @@ class Settings(BaseSettings):
|
|||
"sv": "Swedish",
|
||||
"es-419": "Spanish (Latin America)",
|
||||
"pt-BR": "Portuguese (Brazil)",
|
||||
"fr-CA": "French (Canada)"
|
||||
"fr-CA": "French (Canada)",
|
||||
# Explicit region variants
|
||||
"de-DE": "German (Germany)",
|
||||
"en-US": "English (US)",
|
||||
"en-GB": "English (UK)",
|
||||
"en-CA": "English (Canada)",
|
||||
"es-ES": "Spanish (Spain)",
|
||||
"es-MX": "Spanish (Mexico)",
|
||||
"fr-FR": "French (France)",
|
||||
"it-IT": "Italian (Italy)",
|
||||
"ja-JP": "Japanese (Japan)",
|
||||
"ko-KR": "Korean (Korea)",
|
||||
"nl-NL": "Dutch (Netherlands)",
|
||||
"pl-PL": "Polish (Poland)",
|
||||
"cs-CZ": "Czech (Czech Republic)",
|
||||
"tr-TR": "Turkish (Turkey)",
|
||||
"id-ID": "Indonesian (Indonesia)",
|
||||
"pt-PT": "Portuguese (Portugal)",
|
||||
}
|
||||
gemini_tts_preview_samples: dict[str, str] = {
|
||||
"en": "This is a preview of the audio description voice.",
|
||||
|
|
@ -165,7 +199,24 @@ class Settings(BaseSettings):
|
|||
"sv": "Det här är en förhandsgranskning av ljudbeskrivningsrösten.",
|
||||
"es-419": "Esta es una vista previa de la voz de audiodescripción.",
|
||||
"pt-BR": "Esta é uma prévia da voz da audiodescrição.",
|
||||
"fr-CA": "Ceci est un aperçu de la voix de l'audiodescription."
|
||||
"fr-CA": "Ceci est un aperçu de la voix de l'audiodescription.",
|
||||
# Explicit region variants
|
||||
"de-DE": "Dies ist eine Vorschau der Audiodeskriptionsstimme.",
|
||||
"en-US": "This is a preview of the audio description voice.",
|
||||
"en-GB": "This is a preview of the audio description voice.",
|
||||
"en-CA": "This is a preview of the audio description voice.",
|
||||
"es-ES": "Esta es una vista previa de la voz de audiodescripción.",
|
||||
"es-MX": "Esta es una vista previa de la voz de audiodescripción.",
|
||||
"fr-FR": "Ceci est un aperçu de la voix de l'audiodescription.",
|
||||
"it-IT": "Questa è un'anteprima della voce dell'audiodescrizione.",
|
||||
"ja-JP": "これは音声解説の声のプレビューです。",
|
||||
"ko-KR": "이것은 오디오 설명 음성의 미리보기입니다.",
|
||||
"nl-NL": "Dit is een voorbeeld van de audiodescriptiestem.",
|
||||
"pl-PL": "To jest podgląd głosu audiodeskrypcji.",
|
||||
"cs-CZ": "Toto je náhled hlasu zvukového popisu.",
|
||||
"tr-TR": "Bu, sesli betimleme sesinin bir önizlemesidir.",
|
||||
"id-ID": "Ini adalah pratinjau suara deskripsi audio.",
|
||||
"pt-PT": "Esta é uma pré-visualização da voz da audiodescrição.",
|
||||
}
|
||||
|
||||
# Gemini TTS Model Options
|
||||
|
|
|
|||
245
backend/app/lib/locales.py
Normal file
245
backend/app/lib/locales.py
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
"""
|
||||
Central locale registry.
|
||||
|
||||
Provides a single source of truth for BCP-47 codes, display names,
|
||||
and Gemini-friendly labels used throughout the translation/TTS pipeline.
|
||||
|
||||
Convention: BCP-47 with hyphen separator (fr-FR, en-GB, pt-BR).
|
||||
xlsx underscore format (fr_fr, en_gb) is normalized at import time.
|
||||
Bare language-only codes (fr, en) remain valid for legacy compat.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Locale:
    """Immutable description of one locale known to the registry.

    Attributes:
        code: Canonical BCP-47 code, e.g. "fr-FR".
        display_name: Human-readable name, e.g. "French (France)".
        gemini_label: Label to pass to Gemini prompts, e.g. "French (France)".
        tts_lang: BCP-47 code for the TTS API; may differ from ``code``
            (e.g. es-MX uses es-US).
        preview_sample: Sample sentence for TTS preview.
    """

    code: str
    display_name: str
    gemini_label: str
    tts_lang: str
    preview_sample: str
|
||||
|
||||
|
||||
# Master locale registry. Bare language codes (legacy) + explicit region variants.
|
||||
# Positional fields of each entry: code, display_name, gemini_label, tts_lang,
# preview_sample. Preview samples are sent to TTS, so they must carry the
# language's real diacritics (the bare-language entries share the text of
# their regional variants where one exists).
_REGISTRY: dict[str, Locale] = {loc.code: loc for loc in [
    # ── English ──
    Locale("en", "English", "English", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-US", "English (US)", "English (United States)", "en-US",
           "This is a preview of the audio description voice."),
    Locale("en-GB", "English (UK)", "English (United Kingdom)", "en-GB",
           "This is a preview of the audio description voice."),
    Locale("en-CA", "English (Canada)", "English (Canada)", "en-CA",
           "This is a preview of the audio description voice."),
    # ── Spanish ──
    Locale("es", "Spanish", "Spanish", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-ES", "Spanish (Spain)", "Spanish (Spain)", "es-ES",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-MX", "Spanish (Mexico)", "Spanish (Mexico)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    Locale("es-419", "Spanish (Latin America)", "Spanish (Latin America)", "es-US",
           "Esta es una vista previa de la voz de audiodescripción."),
    # ── French ──
    Locale("fr", "French", "French", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-FR", "French (France)", "French (France)", "fr-FR",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    Locale("fr-CA", "French (Canada)", "French (Canada)", "fr-CA",
           "Ceci est un aperçu de la voix de l'audiodescription."),
    # ── German ──
    Locale("de", "German", "German", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    Locale("de-DE", "German (Germany)", "German (Germany)", "de-DE",
           "Dies ist eine Vorschau der Audiodeskriptionsstimme."),
    # ── Italian ──
    Locale("it", "Italian", "Italian", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    Locale("it-IT", "Italian (Italy)", "Italian (Italy)", "it-IT",
           "Questa è un'anteprima della voce dell'audiodescrizione."),
    # ── Portuguese ──
    Locale("pt", "Portuguese", "Portuguese", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-BR", "Portuguese (Brazil)", "Portuguese (Brazil)", "pt-BR",
           "Esta é uma prévia da voz da audiodescrição."),
    Locale("pt-PT", "Portuguese (Portugal)", "Portuguese (Portugal)", "pt-PT",
           "Esta é uma pré-visualização da voz da audiodescrição."),
    # ── Japanese ──
    Locale("ja", "Japanese", "Japanese", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    Locale("ja-JP", "Japanese (Japan)", "Japanese (Japan)", "ja-JP",
           "これは音声解説の声のプレビューです。"),
    # ── Korean ──
    Locale("ko", "Korean", "Korean", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    Locale("ko-KR", "Korean (Korea)", "Korean (South Korea)", "ko-KR",
           "이것은 오디오 설명 음성의 미리보기입니다."),
    # ── Arabic ──
    Locale("ar", "Arabic", "Arabic", "ar-EG",
           "هذه معاينة لصوت الوصف الصوتي."),
    # ── Hindi ──
    Locale("hi", "Hindi", "Hindi", "hi-IN",
           "यह ऑडियो विवरण आवाज का पूर्वावलोकन है।"),
    # ── Indonesian ──
    Locale("id", "Indonesian", "Indonesian", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    Locale("id-ID", "Indonesian (Indonesia)", "Indonesian (Indonesia)", "id-ID",
           "Ini adalah pratinjau suara deskripsi audio."),
    # ── Dutch ──
    Locale("nl", "Dutch", "Dutch", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    Locale("nl-NL", "Dutch (Netherlands)", "Dutch (Netherlands)", "nl-NL",
           "Dit is een voorbeeld van de audiodescriptiestem."),
    # ── Polish ──
    Locale("pl", "Polish", "Polish", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    Locale("pl-PL", "Polish (Poland)", "Polish (Poland)", "pl-PL",
           "To jest podgląd głosu audiodeskrypcji."),
    # ── Russian ──
    Locale("ru", "Russian", "Russian", "ru-RU",
           "Это предварительный просмотр голоса аудиоописания."),
    # ── Thai ──
    Locale("th", "Thai", "Thai", "th-TH",
           "นี่คือตัวอย่างเสียงบรรยายภาพ"),
    # ── Turkish ──
    Locale("tr", "Turkish", "Turkish", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    Locale("tr-TR", "Turkish (Turkey)", "Turkish (Turkey)", "tr-TR",
           "Bu, sesli betimleme sesinin bir önizlemesidir."),
    # ── Vietnamese ──
    Locale("vi", "Vietnamese", "Vietnamese", "vi-VN",
           "Đây là bản xem trước giọng mô tả âm thanh."),
    # ── Romanian ──
    Locale("ro", "Romanian", "Romanian", "ro-RO",
           "Aceasta este o previzualizare a vocii descrierii audio."),
    # ── Ukrainian ──
    Locale("uk", "Ukrainian", "Ukrainian", "uk-UA",
           "Це попередній перегляд голосу аудіоопису."),
    # ── Bengali ──
    Locale("bn", "Bengali", "Bengali", "bn-BD",
           "এটি অডিও বর্ণনা ভয়েসের একটি প্রিভিউ।"),
    # ── Marathi ──
    Locale("mr", "Marathi", "Marathi", "mr-IN",
           "हे ऑडिओ वर्णन आवाजाचे पूर्वावलोकन आहे."),
    # ── Tamil ──
    Locale("ta", "Tamil", "Tamil", "ta-IN",
           "இது ஆடியோ விளக்க குரலின் முன்னோட்டம்."),
    # ── Telugu ──
    Locale("te", "Telugu", "Telugu", "te-IN",
           "ఇది ఆడియో వివరణ స్వరం యొక్క ప్రివ్యూ."),
    # ── Chinese ──
    Locale("zh", "Chinese", "Chinese (Simplified)", "zh-CN",
           "这是音频描述语音的预览。"),
    # ── Czech ──
    Locale("cs", "Czech", "Czech", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    Locale("cs-CZ", "Czech (Czech Republic)", "Czech (Czech Republic)", "cs-CZ",
           "Toto je náhled hlasu zvukového popisu."),
    # ── Danish ──
    Locale("da", "Danish", "Danish", "da-DK",
           "Dette er en forhåndsvisning af lydbeskrivelsesstemmen."),
    # ── Finnish ──
    Locale("fi", "Finnish", "Finnish", "fi-FI",
           "Tämä on äänikuvauksen äänen esikatselu."),
    # ── Hungarian ──
    Locale("hu", "Hungarian", "Hungarian", "hu-HU",
           "Ez a hangos leírás hangjának előnézete."),
    # ── Norwegian ──
    Locale("no", "Norwegian", "Norwegian", "nb-NO",
           "Dette er en forhåndsvisning av lydbeskrivelsesstemmen."),
    # ── Slovak ──
    Locale("sk", "Slovak", "Slovak", "sk-SK",
           "Toto je náhľad hlasu zvukového popisu."),
    # ── Swedish ──
    Locale("sv", "Swedish", "Swedish", "sv-SE",
           "Det här är en förhandsgranskning av ljudbeskrivningsrösten."),
]}
|
||||
|
||||
# xlsx uses underscores; normalize to BCP-47 hyphen form
|
||||
# Maps the xlsx column spelling (lowercase, underscore) of every regional
# registry code to its canonical form, e.g. "fr_fr" -> "fr-FR".
_XLSX_ALIASES: dict[str, str] = dict(
    (code.lower().replace("-", "_"), code)
    for code in _REGISTRY
    if "-" in code
)
# Edge case: the Indonesian column header is just "id" (no region).
_XLSX_ALIASES["id"] = "id"
|
||||
|
||||
|
||||
def normalize_code(code: str) -> str:
    """Normalize an arbitrary locale code to canonical BCP-47 casing.

    Surrounding whitespace is dropped and underscores are treated as hyphens.
    The language subtag is lower-cased; later subtags follow the conventional
    BCP-47 casing: 4-letter scripts are title-cased, 2-letter regions are
    upper-cased, and everything else (digits, variants) is lower-cased.

    Examples:
        - xlsx underscore form: "fr_fr" -> "fr-FR"
        - Bare language code:   "FR"    -> "fr" (legacy compat)
        - Already canonical:    "fr-FR" -> "fr-FR"
        - Unregistered code:    "zh_tw" -> "zh-TW"

    Empty or ``None`` input is returned unchanged.
    """
    if not code:
        return code

    cleaned = code.strip()
    subtags = [part for part in cleaned.replace("_", "-").split("-") if part]
    if not subtags:
        # Nothing but separators/whitespace: there is no subtag to normalize.
        return cleaned.lower()

    language, *others = subtags
    normalized = [language.lower()]
    for subtag in others:
        if len(subtag) == 4 and subtag.isalpha():
            normalized.append(subtag.capitalize())  # script, e.g. "Hans"
        elif len(subtag) == 2 and subtag.isalpha():
            normalized.append(subtag.upper())  # region, e.g. "FR"
        else:
            normalized.append(subtag.lower())  # "419", variants, ...
    return "-".join(normalized)
|
||||
|
||||
|
||||
def get(code: str) -> Locale | None:
    """Look up a locale, falling back to its bare language; None if unknown."""
    canonical = normalize_code(code)
    exact_match = _REGISTRY.get(canonical)
    if exact_match:
        return exact_match
    # No entry for the full code: try the part before the first hyphen.
    language_only = canonical.split("-")[0]
    return _REGISTRY.get(language_only)
|
||||
|
||||
|
||||
def get_display_name(code: str) -> str:
    """Human-readable name such as 'French (Canada)'; the code itself if unknown."""
    match = get(code)
    if match:
        return match.display_name
    return code
|
||||
|
||||
|
||||
def get_gemini_label(code: str) -> str:
    """
    Return the label to put inside Gemini prompts, e.g. 'French (Canada)'.

    Human-readable language names are used because Gemini models respond
    more reliably to them than to bare BCP-47 codes in instruction prompts.
    Unknown codes are returned unchanged.
    """
    match = get(code)
    if match:
        return match.gemini_label
    return code
|
||||
|
||||
|
||||
def get_tts_lang(code: str) -> str:
    """Code to send to the TTS API (can differ, e.g. es-MX uses es-US); the input if unknown."""
    match = get(code)
    if match:
        return match.tts_lang
    return code
|
||||
|
||||
|
||||
def get_preview_sample(code: str) -> str:
    """Return the TTS preview sentence for a locale, defaulting to English."""
    exact = get(code)
    if exact:
        return exact.preview_sample

    # Fallback: retry with only the part before the first hyphen.
    if "-" in code:
        language_only = get(code.split("-")[0])
        if language_only:
            return language_only.preview_sample

    return "This is a preview of the audio description voice."
|
||||
|
||||
|
||||
def all_codes() -> list[str]:
    """List every registered locale code in sorted order."""
    codes = list(_REGISTRY)
    codes.sort()
    return codes
|
||||
|
||||
|
||||
def all_display_map() -> dict[str, str]:
    """Map each registered locale code to its display name."""
    display_names: dict[str, str] = {}
    for code, locale in _REGISTRY.items():
        display_names[code] = locale.display_name
    return display_names
|
||||
|
|
@ -23,6 +23,7 @@ from .api.v1.routes_tts import router as tts_router
|
|||
from .api.v1.routes_websockets import router as websockets_router
|
||||
from .api.v1.routes_vtt_versions import router as vtt_versions_router
|
||||
from .api.v1.routes_language_qc import router as language_qc_router
|
||||
from .api.v1.routes_glossaries import router as glossaries_router
|
||||
from .services.websocket import connection_manager
|
||||
from .core.config import settings
|
||||
from .core.secrets_config import initialize_config
|
||||
|
|
@ -273,6 +274,7 @@ app.include_router(jobs_router, prefix="/api/v1")
|
|||
app.include_router(review_notes_router, prefix="/api/v1")
|
||||
app.include_router(vtt_versions_router, prefix="/api/v1")
|
||||
app.include_router(language_qc_router, prefix="/api/v1")
|
||||
app.include_router(glossaries_router, prefix="/api/v1")
|
||||
app.include_router(tts_router, prefix="/api/v1")
|
||||
app.include_router(admin_router, prefix="/api/v1")
|
||||
app.include_router(websockets_router, prefix="/api/v1")
|
||||
|
|
|
|||
|
|
@ -61,6 +61,12 @@ class AuditAction(str, Enum):
|
|||
ADMIN_DATA_EXPORT = "admin.data.export"
|
||||
ADMIN_AUDIT_ACCESS = "admin.audit.access"
|
||||
|
||||
# Glossary management
|
||||
GLOSSARY_UPLOAD = "glossary.upload"
|
||||
GLOSSARY_VERSION_UPLOAD = "glossary.version.upload"
|
||||
GLOSSARY_ACTIVATE = "glossary.activate"
|
||||
GLOSSARY_ARCHIVE = "glossary.archive"
|
||||
|
||||
# Security events
|
||||
RATE_LIMIT_EXCEEDED = "security.rate_limit.exceeded"
|
||||
VALIDATION_FAILURE = "security.validation.failure"
|
||||
|
|
|
|||
139
backend/app/models/glossary.py
Normal file
139
backend/app/models/glossary.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class GlossarySource(StrEnum):
|
||||
XLSX_UPLOAD = "xlsx_upload"
|
||||
FRAZE_API = "fraze_api" # reserved for future FRAZE integration
|
||||
|
||||
|
||||
class GlossaryStatus(StrEnum):
|
||||
ACTIVE = "active"
|
||||
ARCHIVED = "archived"
|
||||
|
||||
|
||||
class EmbeddingStatus(StrEnum):
|
||||
PENDING = "pending"
|
||||
IN_PROGRESS = "in_progress"
|
||||
DONE = "done"
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
class Glossary(BaseModel):
    """A client's glossary; the field aliased to ``_id`` is exposed as ``id``."""

    id: str | None = Field(default=None, alias="_id")
    client_id: str
    name: str
    description: str | None = None
    # BCP-47 source column, e.g. "en-GB"
    source_locale: str
    source: GlossarySource = GlossarySource.XLSX_UPLOAD
    status: GlossaryStatus = GlossaryStatus.ACTIVE
    current_version_id: str | None = None
    # NOTE(review): datetime.utcnow produces a naive timestamp and is
    # deprecated since Python 3.12.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    # user_id
    created_by: str

    model_config = {"populate_by_name": True, "arbitrary_types_allowed": True}
|
||||
|
||||
|
||||
class GlossaryVersion(BaseModel):
    """One uploaded revision of a glossary, with its embedding progress."""

    # Filled from the "_id" key of the input (alias); the field name is also accepted.
    id: str | None = Field(default=None, alias="_id")
    glossary_id: str
    version_number: int
    # GCS path to the original uploaded file.
    source_xlsx_gcs_path: str | None = None
    term_count: int = 0
    embedded_count: int = 0
    embedding_status: EmbeddingStatus = EmbeddingStatus.PENDING
    created_at: datetime = Field(default_factory=datetime.utcnow)
    created_by: str
    change_note: str | None = None

    model_config = {"populate_by_name": True}
|
||||
|
||||
|
||||
class GlossaryTerm(BaseModel):
    """A single source term together with its translations keyed by locale."""

    # Filled from the "_id" key of the input (alias); the field name is also accepted.
    id: str | None = Field(default=None, alias="_id")
    glossary_id: str
    version_id: str
    # 3M Content ID from the xlsx.
    cid: str | None = None
    # 3M Term ID from the xlsx.
    tid: str | None = None
    # Canonical source text (whitespace-normalised).
    source_term: str
    # Lowercased copy used for the case-insensitive index.
    source_term_lower: str
    # {locale_code: translated_text}
    translations: dict[str, str] = {}
    # 768-dim Gemini embedding; None until it has been computed.
    embedding: list[float] | None = None

    model_config = {"populate_by_name": True}
|
||||
|
||||
|
||||
# ── Schema models (API request/response) ──────────────────────────────────────
|
||||
|
||||
class GlossaryCreate(BaseModel):
    """Request payload for creating a glossary."""

    name: str
    description: str | None = None
    source_locale: str
    change_note: str | None = None
|
||||
|
||||
|
||||
class GlossaryVersionCreate(BaseModel):
    """Request payload for adding a new version to a glossary."""

    source_locale: str
    change_note: str | None = None
|
||||
|
||||
|
||||
class GlossaryResponse(BaseModel):
    """API representation of a glossary (no version list)."""

    id: str
    client_id: str
    name: str
    description: str | None = None
    source_locale: str
    source: GlossarySource
    status: GlossaryStatus
    current_version_id: str | None = None
    created_at: datetime
    created_by: str
|
||||
|
||||
|
||||
class GlossaryVersionResponse(BaseModel):
    """API representation of a glossary version and its embedding progress."""

    id: str
    glossary_id: str
    version_number: int
    term_count: int
    embedded_count: int
    embedding_status: EmbeddingStatus
    created_at: datetime
    created_by: str
    change_note: str | None = None
|
||||
|
||||
|
||||
class GlossaryDetailResponse(GlossaryResponse):
    """GlossaryResponse extended with the glossary's versions."""

    versions: list[GlossaryVersionResponse] = []
|
||||
|
||||
|
||||
class GlossaryTermPreview(BaseModel):
    """Reduced view of a GlossaryTerm intended for UI previews."""

    source_term: str
    translations: dict[str, str]
|
||||
|
||||
|
||||
class MatchedTerm(BaseModel):
    """A glossary term found for a piece of VTT source text, with its target-locale translation."""

    source_term: str
    target_translation: str
    # Either "exact" or "vector".
    match_kind: str
    # 1.0 for exact matches; cosine similarity for vector matches.
    score: float
|
||||
|
||||
|
||||
def glossary_from_doc(doc: dict) -> Glossary:
    """Build a Glossary from a raw document, stringifying its "_id" when present.

    The document is copied first, so the caller's dict is not modified.
    """
    payload = dict(doc)
    try:
        payload["_id"] = str(payload["_id"])
    except KeyError:
        # No "_id" in the document; the model's default applies.
        pass
    return Glossary.model_validate(payload)
|
||||
|
||||
|
||||
def glossary_version_from_doc(doc: dict) -> GlossaryVersion:
    """Build a GlossaryVersion from a raw document, stringifying its "_id" when present.

    The document is copied first, so the caller's dict is not modified.
    """
    payload = dict(doc)
    try:
        payload["_id"] = str(payload["_id"])
    except KeyError:
        # No "_id" in the document; the model's default applies.
        pass
    return GlossaryVersion.model_validate(payload)
|
||||
|
|
@ -47,6 +47,8 @@ BRAND NAMES AND PRODUCTS:
|
|||
- If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification
|
||||
- If a product is NOT on the list or is completely unclear, use a generic descriptor — do not invent brand names
|
||||
|
||||
{GLOSSARY}
|
||||
|
||||
CAPTION FORMATTING (DCMP standard):
|
||||
- Maximum TWO lines per caption. Never use three or more lines.
|
||||
- Each line should be no longer than ~37 characters where possible (42 absolute max)
|
||||
|
|
|
|||
|
|
@ -51,6 +51,8 @@ BRAND NAMES AND PRODUCTS:
|
|||
- If a product is on the brand list, use the brand name even if the label is partially obscured — use your best confident identification
|
||||
- If a product is NOT on the list or is completely unclear, use a generic descriptor — do not invent brand names
|
||||
|
||||
{GLOSSARY}
|
||||
|
||||
CAPTION FORMATTING (DCMP standard):
|
||||
- Maximum TWO lines per caption. Never use three or more lines.
|
||||
- Each line should be no longer than ~37 characters where possible (42 absolute max)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@ Rewrite the following English captions and audio descriptions into {TARGET_LANGU
|
|||
- timing boundaries (same cue timestamps),
|
||||
- line lengths friendly for readability (~32–40 chars).
|
||||
|
||||
{GLOSSARY}
|
||||
|
||||
Input:
|
||||
- captions_vtt_en: <VTT text>
|
||||
- ad_vtt_en: <VTT text>
|
||||
|
|
|
|||
72
backend/app/services/embedding_service.py
Normal file
72
backend/app/services/embedding_service.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
"""
|
||||
Embedding service backed by Gemini text-embedding-004.
|
||||
|
||||
Provides batch embedding with retry/backoff for use in glossary ingestion.
|
||||
Batch size: 100 texts per API call (API limit is 2048 but we keep it conservative
|
||||
for memory and retry ergonomics with large glossaries).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from collections.abc import Sequence
|
||||
|
||||
from google import genai
|
||||
from google.genai import types as genai_types
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
_EMBED_MODEL = "text-embedding-004"
|
||||
_BATCH_SIZE = 100
|
||||
_MAX_RETRIES = 3
|
||||
_INITIAL_BACKOFF = 2.0
|
||||
|
||||
|
||||
class EmbeddingService:
    """Async wrapper around the Gemini embedding endpoint with batching and retry."""

    def __init__(self) -> None:
        self._client = genai.Client(api_key=settings.gemini_api_key)

    async def embed_texts(
        self,
        texts: Sequence[str],
        *,
        task_type: str = "RETRIEVAL_DOCUMENT",
    ) -> list[list[float]]:
        """
        Embed a list of texts and return one float vector per text.

        Processes in batches of _BATCH_SIZE; each batch is retried with
        exponential backoff. Order is preserved.

        Args:
            texts: Texts to embed. An empty sequence returns an empty list.
            task_type: Embedding task type forwarded to the API. Defaults to
                "RETRIEVAL_DOCUMENT" (the previous hard-coded value); callers
                embedding a search query may pass "RETRIEVAL_QUERY".

        Raises:
            Exception: Whatever the last failed attempt of a batch raised.
        """
        results: list[list[float]] = []
        for start in range(0, len(texts), _BATCH_SIZE):
            batch = list(texts[start: start + _BATCH_SIZE])
            results.extend(await self._embed_batch_with_retry(batch, task_type=task_type))
        return results

    async def embed_text(self, text: str, *, task_type: str = "RETRIEVAL_DOCUMENT") -> list[float]:
        """Embed a single text and return its vector (see embed_texts for arguments)."""
        vectors = await self.embed_texts([text], task_type=task_type)
        return vectors[0]

    async def _embed_batch_with_retry(
        self,
        texts: list[str],
        *,
        task_type: str = "RETRIEVAL_DOCUMENT",
    ) -> list[list[float]]:
        """Embed one batch, retrying up to _MAX_RETRIES times with doubling backoff."""
        backoff = _INITIAL_BACKOFF
        for attempt in range(1, _MAX_RETRIES + 1):
            try:
                # The SDK call is synchronous; run it off the event loop.
                response = await asyncio.to_thread(
                    self._client.models.embed_content,
                    model=_EMBED_MODEL,
                    contents=texts,
                    config=genai_types.EmbedContentConfig(
                        task_type=task_type,
                    ),
                )
                embeddings = response.embeddings or []
                # Callers pair vectors with inputs by position, so a short or
                # missing response must fail loudly instead of misaligning them.
                if len(embeddings) != len(texts):
                    raise RuntimeError(
                        f"Embedding API returned {len(embeddings)} vectors for {len(texts)} texts"
                    )
                return [list(emb.values) for emb in embeddings]
            except Exception as exc:
                if attempt == _MAX_RETRIES:
                    logger.error(f"Embedding batch failed after {_MAX_RETRIES} attempts: {exc}")
                    raise
                logger.warning(f"Embedding attempt {attempt} failed, retrying in {backoff}s: {exc}")
                await asyncio.sleep(backoff)
                backoff *= 2

        raise RuntimeError("unreachable")  # makes type-checker happy
|
||||
|
||||
|
||||
embedding_service = EmbeddingService()
|
||||
|
|
@ -8,6 +8,7 @@ import google.genai as genai
|
|||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
from ..lib import locales as locale_lib
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
|
@ -106,6 +107,12 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
|
|||
- Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm)
|
||||
- Only add sound effect cues where they add meaningful context; do not annotate every minor sound"""
|
||||
|
||||
def _build_glossary_block(self, glossary_block: Optional[str]) -> str:
|
||||
"""Return the pre-built glossary block (from glossary_service.build_glossary_prompt_block), or empty string."""
|
||||
if glossary_block and glossary_block.strip():
|
||||
return glossary_block.strip()
|
||||
return ""
|
||||
|
||||
def _build_brand_context_block(self, brand_context: Optional[str]) -> str:
|
||||
"""Build the brand context instruction block for injection into prompts."""
|
||||
if brand_context and brand_context.strip():
|
||||
|
|
@ -118,7 +125,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
|
|||
)
|
||||
return "No specific brand names have been provided for this video."
|
||||
|
||||
async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False, _cost_ctx: Optional[dict] = None) -> dict[str, Any]:
|
||||
async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False, glossary_block: Optional[str] = None, _cost_ctx: Optional[dict] = None) -> dict[str, Any]:
|
||||
"""
|
||||
Extract captions and audio descriptions from video using Gemini 2.0
|
||||
Returns structured JSON with transcript, captions VTT, and audio description VTT
|
||||
|
|
@ -127,6 +134,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
|
|||
prompt = (
|
||||
prompt_template
|
||||
.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
|
||||
.replace("{GLOSSARY}", self._build_glossary_block(glossary_block))
|
||||
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
|
||||
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
|
||||
)
|
||||
|
|
@ -320,6 +328,7 @@ Fix the JSON and return it:
|
|||
target_language: str,
|
||||
brand_context: Optional[str] = None,
|
||||
sdh_requested: bool = False,
|
||||
glossary_block: Optional[str] = None,
|
||||
_cost_ctx: Optional[dict] = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
|
|
@ -343,8 +352,9 @@ Fix the JSON and return it:
|
|||
prompt_template = self._load_prompt("gemini_ingestion_targeted.md")
|
||||
prompt = (
|
||||
prompt_template
|
||||
.replace("{TARGET_LANGUAGE}", target_language)
|
||||
.replace("{TARGET_LANGUAGE}", locale_lib.get_gemini_label(target_language))
|
||||
.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
|
||||
.replace("{GLOSSARY}", self._build_glossary_block(glossary_block))
|
||||
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
|
||||
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
|
||||
)
|
||||
|
|
@ -756,6 +766,7 @@ Fix the JSON and return it:
|
|||
ad_vtt: str,
|
||||
target_language: str,
|
||||
brief: Optional[str] = None,
|
||||
glossary_block: Optional[str] = None,
|
||||
_cost_ctx: Optional[dict] = None,
|
||||
) -> dict[str, str]:
|
||||
"""
|
||||
|
|
@ -765,7 +776,8 @@ Fix the JSON and return it:
|
|||
|
||||
# Format prompt with actual content
|
||||
prompt = prompt_template.format(
|
||||
TARGET_LANGUAGE=target_language
|
||||
TARGET_LANGUAGE=locale_lib.get_gemini_label(target_language),
|
||||
GLOSSARY=self._build_glossary_block(glossary_block),
|
||||
)
|
||||
|
||||
user_prompt = f"""
|
||||
|
|
@ -817,6 +829,7 @@ JSON:
|
|||
vtt_content: str,
|
||||
target_language: str,
|
||||
source_language: str = "en",
|
||||
glossary_block: Optional[str] = None,
|
||||
_cost_ctx: Optional[dict] = None,
|
||||
) -> str:
|
||||
"""
|
||||
|
|
@ -842,14 +855,18 @@ JSON:
|
|||
f"{i + 1}. {cue.text.replace(chr(10), ' ')}"
|
||||
for i, cue in enumerate(source_cues)
|
||||
)
|
||||
prompt = f"""Translate the following {cue_count} numbered text segments from {source_language} to {target_language}.
|
||||
_src_label = locale_lib.get_gemini_label(source_language)
|
||||
_tgt_label = locale_lib.get_gemini_label(target_language)
|
||||
_glossary_section = self._build_glossary_block(glossary_block)
|
||||
_glossary_line = f"\n\n{_glossary_section}" if _glossary_section else ""
|
||||
prompt = f"""Translate the following {cue_count} numbered text segments from {_src_label} to {_tgt_label}.
|
||||
|
||||
REQUIREMENTS:
|
||||
- Return EXACTLY {cue_count} numbered lines, one translation per line
|
||||
- Format: "1. translated text", "2. translated text", etc.
|
||||
- Preserve speaker labels like [Speaker 1]: unchanged
|
||||
- Use natural, idiomatic {target_language}
|
||||
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}
|
||||
- Use natural, idiomatic {_tgt_label}
|
||||
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_line}
|
||||
|
||||
Segments to translate:
|
||||
{numbered_texts}"""
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ from pydub import AudioSegment
|
|||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
from ..lib import locales as locale_lib
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
|
@ -166,10 +167,10 @@ class GeminiTTSService:
|
|||
Generate a preview audio sample for voice selection.
|
||||
Uses language-specific sample text and applies all TTS settings.
|
||||
"""
|
||||
# Get preview sample text for the language
|
||||
sample_text = settings.gemini_tts_preview_samples.get(
|
||||
language,
|
||||
settings.gemini_tts_preview_samples.get("en", "This is a voice preview.")
|
||||
# Get preview sample text — try settings override, then locale registry, then fallback
|
||||
sample_text = (
|
||||
settings.gemini_tts_preview_samples.get(language)
|
||||
or locale_lib.get_preview_sample(language)
|
||||
)
|
||||
|
||||
return await self.synthesize_text(
|
||||
|
|
|
|||
736
backend/app/services/glossary_service.py
Normal file
736
backend/app/services/glossary_service.py
Normal file
|
|
@ -0,0 +1,736 @@
|
|||
"""
|
||||
Glossary service — per-client terminology management.
|
||||
|
||||
Responsibilities:
|
||||
• parse_xlsx(bytes, source_col) → list of (source_term, {locale: translation})
|
||||
• ingest_glossary(...) → create Glossary + GlossaryVersion + GlossaryTerms in Mongo
|
||||
• activate_version(...) → atomic swap of current_version_id
|
||||
• match_terms_for_text(...) → hybrid exact + vector retrieval for prompt injection
|
||||
• build_glossary_prompt_block(...) → formats matched terms for the Gemini prompt
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import re
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
from bson import ObjectId
|
||||
from fastapi import UploadFile
|
||||
|
||||
from ..core.database import get_database
|
||||
from ..core.logging import get_logger
|
||||
from ..lib import locales as locale_lib
|
||||
from ..models.glossary import (
|
||||
EmbeddingStatus,
|
||||
Glossary,
|
||||
GlossaryStatus,
|
||||
GlossaryTerm,
|
||||
GlossaryVersion,
|
||||
MatchedTerm,
|
||||
glossary_from_doc,
|
||||
glossary_version_from_doc,
|
||||
)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
_COLL_GLOSSARIES = "glossaries"
|
||||
_COLL_VERSIONS = "glossary_versions"
|
||||
_COLL_TERMS = "glossary_terms"
|
||||
|
||||
# Maximum number of terms injected into a single Gemini prompt
|
||||
_MAX_TERMS_IN_PROMPT = 50
|
||||
|
||||
# Atlas Vector Search index name (must exist on the collection)
|
||||
_VECTOR_INDEX = "glossary_embedding_index"
|
||||
_VECTOR_DIMS = 768
|
||||
_VECTOR_SIMILARITY_THRESHOLD = 0.75
|
||||
_VECTOR_TOP_K = 20
|
||||
|
||||
|
||||
# ── xlsx parsing ─────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class _ParsedTerm:
|
||||
cid: str | None
|
||||
tid: str | None
|
||||
source_term: str
|
||||
translations: dict[str, str] # {normalized_locale: text}
|
||||
|
||||
|
||||
def _cell(row: tuple, idx: int | None) -> str | None:
|
||||
if idx is None or idx >= len(row):
|
||||
return None
|
||||
v = row[idx]
|
||||
return str(v).strip() if v is not None else None
|
||||
|
||||
|
||||
def parse_xlsx(file_bytes: bytes, source_locale_col: str) -> list[_ParsedTerm]:
    """
    Parse an xlsx glossary file.

    The first row of the active sheet is treated as the header. Columns named
    "CID" / "TID" (case-insensitive) supply the ids; every other header that
    locale_lib.normalize_code accepts becomes a translation column.

    Args:
        file_bytes: Raw xlsx bytes.
        source_locale_col: The column header that contains the source text,
            e.g. "en_gb" or "en-GB". Matched case-insensitively against the
            raw header first, then by normalised locale code.

    Returns:
        List of parsed terms. Rows where the source column is empty are skipped.
        An empty sheet yields an empty list.

    Raises:
        ValueError: If no header matches `source_locale_col`.
    """
    import openpyxl  # local import — only used during ingest

    wb = openpyxl.load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
    # Close the workbook on every exit path, including the early return and
    # the ValueError below.
    try:
        ws = wb.active
        rows = ws.iter_rows(values_only=True)
        header_row = next(rows, None)
        if header_row is None:
            return []

        # Header cells as stripped strings; None marks an empty header cell.
        headers: list[str | None] = [
            str(h).strip() if h is not None else None for h in header_row
        ]

        # Locate the source column: raw header text first (case-insensitive) ...
        wanted = source_locale_col.strip()
        wanted_lower = wanted.lower()
        src_idx: int | None = next(
            (i for i, h in enumerate(headers) if h and h.lower() == wanted_lower),
            None,
        )
        # ... then by normalised locale code, so "en-GB" finds an "en_gb" header.
        if src_idx is None:
            wanted_norm = locale_lib.normalize_code(wanted)
            if wanted_norm:
                src_idx = next(
                    (
                        i for i, h in enumerate(headers)
                        if h and locale_lib.normalize_code(h) == wanted_norm
                    ),
                    None,
                )

        if src_idx is None:
            raise ValueError(f"Source column '{source_locale_col}' not found in xlsx. Available: {[h for h in headers if h]}")

        cid_idx = next((i for i, h in enumerate(headers) if h and h.upper() == "CID"), None)
        tid_idx = next((i for i, h in enumerate(headers) if h and h.upper() == "TID"), None)

        # All other columns with valid locale-like names become translation columns
        locale_cols: list[tuple[int, str]] = []  # [(col_index, normalized_locale_code)]
        for i, h in enumerate(headers):
            if h is None or i == src_idx or i == cid_idx or i == tid_idx:
                continue
            norm = locale_lib.normalize_code(h)
            if norm:
                locale_cols.append((i, norm))

        terms: list[_ParsedTerm] = []
        for row in rows:
            if not row or all(v is None for v in row):
                continue

            source = _cell(row, src_idx)
            if not source:
                continue

            translations: dict[str, str] = {}
            for col_idx, locale_code in locale_cols:
                val = _cell(row, col_idx)
                if val:
                    translations[locale_code] = val

            terms.append(_ParsedTerm(
                cid=_cell(row, cid_idx),
                tid=_cell(row, tid_idx),
                source_term=source,
                translations=translations,
            ))

        return terms
    finally:
        wb.close()
|
||||
|
||||
|
||||
# ── Ingest ────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def ingest_glossary(
    client_id: str,
    name: str,
    source_locale: str,
    source_locale_col: str,
    file: UploadFile,
    user_id: str,
    description: str | None = None,
    change_note: str | None = None,
) -> tuple[Glossary, GlossaryVersion]:
    """
    Full glossary ingestion pipeline:
      1. Validate the source locale and parse terms
      2. Upload xlsx to GCS
      3. Create Glossary + GlossaryVersion + GlossaryTerm documents in Mongo
      4. Kick off background embedding task

    Validation and parsing happen before anything is persisted, so a bad
    locale, an unreadable workbook or a missing source column cannot leave an
    orphaned GCS object or half-written Mongo documents behind.

    Returns (Glossary, GlossaryVersion) on success.

    Raises:
        ValueError: If `source_locale` cannot be normalised, or (from
            parse_xlsx) the source column is not present in the workbook.
    """
    # NOTE(review): assumes normalize_code returns a falsy value for codes it
    # does not recognise, as parse_xlsx's `if norm:` check implies — confirm.
    norm_source_locale = locale_lib.normalize_code(source_locale)
    if not norm_source_locale:
        raise ValueError(f"Unrecognised source locale '{source_locale}'")

    db = await get_database()

    file_bytes = await file.read()
    glossary_id = str(ObjectId())
    version_id = str(ObjectId())

    # ── Parse (before any write) ──
    logger.info(f"Parsing xlsx for glossary {glossary_id}, source_col={source_locale_col}")
    parsed_terms = parse_xlsx(file_bytes, source_locale_col)
    logger.info(f"Parsed {len(parsed_terms)} terms")

    # ── Upload original xlsx to GCS ──
    gcs_path = f"glossaries/{client_id}/{glossary_id}/{version_id}/source.xlsx"
    await _upload_bytes_to_gcs(file_bytes, gcs_path,
                               content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")

    # ── Create Glossary doc ──
    now = datetime.utcnow()
    glossary_doc = {
        "_id": ObjectId(glossary_id),
        "client_id": client_id,
        "name": name,
        "description": description,
        "source_locale": norm_source_locale,
        "source": "xlsx_upload",
        "status": GlossaryStatus.ACTIVE.value,
        "current_version_id": version_id,
        "created_at": now,
        "created_by": user_id,
    }
    await db[_COLL_GLOSSARIES].insert_one(glossary_doc)

    # ── Create GlossaryVersion doc ──
    version_doc = {
        "_id": ObjectId(version_id),
        "glossary_id": glossary_id,
        "version_number": 1,
        "source_xlsx_gcs_path": gcs_path,
        "term_count": len(parsed_terms),
        "embedded_count": 0,
        "embedding_status": EmbeddingStatus.PENDING.value,
        "created_at": now,
        "created_by": user_id,
        "change_note": change_note,
    }
    await db[_COLL_VERSIONS].insert_one(version_doc)

    # ── Bulk insert GlossaryTerms ──
    if parsed_terms:
        term_docs = [
            {
                "_id": ObjectId(),
                "glossary_id": glossary_id,
                "version_id": version_id,
                "cid": t.cid,
                "tid": t.tid,
                "source_term": t.source_term,
                "source_term_lower": t.source_term.lower(),
                "translations": t.translations,
                "embedding": None,
            }
            for t in parsed_terms
        ]
        await db[_COLL_TERMS].insert_many(term_docs, ordered=False)

    # ── Create collection indexes (idempotent) ──
    await _ensure_indexes(db)

    # ── Kick off embedding Celery task ──
    # Best effort: a queueing failure is logged and the version stays PENDING.
    try:
        from ..tasks.embed_glossary import embed_glossary_version_task
        embed_glossary_version_task.delay(version_id)
        logger.info(f"Queued embedding task for version {version_id}")
    except Exception as e:
        logger.warning(f"Could not queue embedding task: {e}")

    glossary = glossary_from_doc(glossary_doc)
    version = glossary_version_from_doc(version_doc)
    return glossary, version
|
||||
|
||||
|
||||
async def ingest_new_version(
    glossary_id: str,
    source_locale_col: str,
    file: UploadFile,
    user_id: str,
    change_note: str | None = None,
) -> GlossaryVersion:
    """
    Add a new version to an existing glossary without replacing it as active.

    The xlsx is parsed before it is uploaded or any document is written, so a
    workbook that fails to parse leaves no orphaned GCS object behind.

    Returns the new GlossaryVersion; `current_version_id` on the glossary is
    not touched.

    Raises:
        ValueError: If the glossary does not exist, or (from parse_xlsx) the
            source column is not present in the workbook.
    """
    db = await get_database()

    glossary_doc = await db[_COLL_GLOSSARIES].find_one({"_id": ObjectId(glossary_id)})
    if not glossary_doc:
        raise ValueError(f"Glossary {glossary_id} not found")

    client_id = glossary_doc["client_id"]

    # ── Parse (before any write) ──
    file_bytes = await file.read()
    parsed_terms = parse_xlsx(file_bytes, source_locale_col)

    # Find next version number
    last_version = await db[_COLL_VERSIONS].find_one(
        {"glossary_id": glossary_id},
        sort=[("version_number", -1)],
    )
    next_version_num = (last_version["version_number"] + 1) if last_version else 1

    # ── Upload original xlsx to GCS ──
    version_id = str(ObjectId())
    gcs_path = f"glossaries/{client_id}/{glossary_id}/{version_id}/source.xlsx"
    await _upload_bytes_to_gcs(file_bytes, gcs_path,
                               content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")

    now = datetime.utcnow()
    version_doc = {
        "_id": ObjectId(version_id),
        "glossary_id": glossary_id,
        "version_number": next_version_num,
        "source_xlsx_gcs_path": gcs_path,
        "term_count": len(parsed_terms),
        "embedded_count": 0,
        "embedding_status": EmbeddingStatus.PENDING.value,
        "created_at": now,
        "created_by": user_id,
        "change_note": change_note,
    }
    await db[_COLL_VERSIONS].insert_one(version_doc)

    if parsed_terms:
        term_docs = [
            {
                "_id": ObjectId(),
                "glossary_id": glossary_id,
                "version_id": version_id,
                "cid": t.cid,
                "tid": t.tid,
                "source_term": t.source_term,
                "source_term_lower": t.source_term.lower(),
                "translations": t.translations,
                "embedding": None,
            }
            for t in parsed_terms
        ]
        await db[_COLL_TERMS].insert_many(term_docs, ordered=False)

    # Best effort: a queueing failure is logged and the version stays PENDING.
    try:
        from ..tasks.embed_glossary import embed_glossary_version_task
        embed_glossary_version_task.delay(version_id)
    except Exception as e:
        logger.warning(f"Could not queue embedding task: {e}")

    return glossary_version_from_doc(version_doc)
|
||||
|
||||
|
||||
async def activate_version(glossary_id: str, version_id: str) -> None:
    """
    Set the active version of a glossary.

    The version must exist and belong to `glossary_id`; otherwise the glossary
    could end up pointing at another glossary's terms or at nothing at all.
    The switch itself is a single update on the glossary document, after which
    the cached active version for the glossary's client is cleared.

    Raises:
        ValueError: If the version does not exist for this glossary, or the
            glossary itself is not found.
    """
    db = await get_database()

    # Version documents are stored with an ObjectId "_id" and a string
    # "glossary_id", so both must match.
    version_doc = None
    if ObjectId.is_valid(version_id):
        version_doc = await db[_COLL_VERSIONS].find_one(
            {"_id": ObjectId(version_id), "glossary_id": glossary_id},
            {"_id": 1},
        )
    if not version_doc:
        raise ValueError(f"Version {version_id} not found for glossary {glossary_id}")

    result = await db[_COLL_GLOSSARIES].update_one(
        {"_id": ObjectId(glossary_id)},
        {"$set": {"current_version_id": version_id}},
    )
    if result.matched_count == 0:
        raise ValueError(f"Glossary {glossary_id} not found")
    # Invalidate Redis cache
    await _invalidate_cache(glossary_id)
|
||||
|
||||
|
||||
async def archive_glossary(glossary_id: str) -> None:
    """Mark a glossary as archived, then clear its client's cached active version.

    No error is raised when the id matches no glossary.
    """
    database = await get_database()
    selector = {"_id": ObjectId(glossary_id)}
    change = {"$set": {"status": GlossaryStatus.ARCHIVED.value}}
    await database[_COLL_GLOSSARIES].update_one(selector, change)
    await _invalidate_cache(glossary_id)
|
||||
|
||||
|
||||
# ── Retrieval ─────────────────────────────────────────────────────────────────
|
||||
|
||||
async def match_terms_for_text(
    client_id: str,
    text: str,
    target_locale: str,
    top_k: int = _MAX_TERMS_IN_PROMPT,
) -> list[MatchedTerm]:
    """
    Hybrid retrieval: exact-match (Aho-Corasick) + semantic (Atlas Vector Search).

    Returns a ranked, deduplicated list of up to `top_k` MatchedTerm objects,
    each with the source term and its translation in `target_locale`.
    Exact matches rank before vector matches.

    Returns an empty list without querying anything when `text` is blank,
    `top_k` is not positive, the target locale cannot be normalised, or the
    client has no active glossary version. A failing vector search is logged
    and ignored, so exact matches are still returned.
    """
    if top_k <= 0 or not text or not text.strip():
        return []

    # NOTE(review): assumes normalize_code returns a falsy value for codes it
    # does not recognise, as parse_xlsx's `if norm:` check implies — confirm.
    norm_target = locale_lib.normalize_code(target_locale)
    if not norm_target:
        # Without a usable locale no translation can be looked up, and the
        # lookup helper would fail on a None locale.
        logger.warning(f"Glossary lookup skipped: unrecognised target locale '{target_locale}'")
        return []

    db = await get_database()

    active_version_id = await _get_active_version_id(client_id)
    if not active_version_id:
        return []

    # ── Exact pass ──
    exact_matches = await _exact_match(db, active_version_id, text, norm_target)

    # ── Vector pass (if we haven't hit the limit yet) ──
    remaining = top_k - len(exact_matches)
    already_found = {m.source_term.lower() for m in exact_matches}
    vector_matches: list[MatchedTerm] = []

    if remaining > 0:
        try:
            vector_matches = await _vector_match(
                db, active_version_id, text, norm_target,
                top_k=_VECTOR_TOP_K, exclude_terms=already_found,
            )
        except Exception as e:
            logger.warning(f"Vector search failed (non-fatal): {e}")

    combined = exact_matches + vector_matches
    if len(combined) > top_k:
        logger.info(f"glossary_terms_truncated: had {len(combined)}, capped at {top_k}")
        combined = combined[:top_k]

    return combined
|
||||
|
||||
|
||||
async def _get_active_version_id(client_id: str) -> str | None:
    """
    Return the active version_id for the active glossary of a client, or None.

    Redis is consulted first and refreshed (1 hour TTL) after a database
    lookup. The cache is strictly best effort: any Redis failure is logged at
    debug level and the database result is used instead.

    When a client has several active glossaries, the most recently created one
    wins.
    """
    cache_key = f"glossary:active_version:{client_id}"

    try:
        from ..core.redis import redis_client  # lazy import
        cached = await redis_client.get(cache_key)
        if cached:
            return cached.decode() if isinstance(cached, bytes) else cached
    except Exception as e:
        logger.debug(f"Glossary cache read skipped: {e}")

    db = await get_database()
    glossary_doc = await db[_COLL_GLOSSARIES].find_one(
        {"client_id": client_id, "status": GlossaryStatus.ACTIVE.value},
        sort=[("created_at", -1)],
    )
    if not glossary_doc or not glossary_doc.get("current_version_id"):
        return None

    version_id = glossary_doc["current_version_id"]

    try:
        from ..core.redis import redis_client
        await redis_client.setex(cache_key, 3600, version_id)
    except Exception as e:
        logger.debug(f"Glossary cache write skipped: {e}")

    return version_id
|
||||
|
||||
|
||||
async def _invalidate_cache(glossary_id: str) -> None:
    """Drop the cached active-version entry of the client that owns `glossary_id`.

    Best effort: any failure (lookup, import or Redis) is logged at debug
    level and swallowed. Nothing happens when the glossary is not found.
    """
    try:
        database = await get_database()
        glossary_doc = await database[_COLL_GLOSSARIES].find_one({"_id": ObjectId(glossary_id)})
        if not glossary_doc:
            return
        from ..core.redis import redis_client
        await redis_client.delete(f"glossary:active_version:{glossary_doc['client_id']}")
    except Exception as e:
        logger.debug(f"Cache invalidation skipped: {e}")
|
||||
|
||||
|
||||
async def _exact_match(
    db,
    version_id: str,
    text: str,
    target_locale: str,
) -> list[MatchedTerm]:
    """Return glossary terms of `version_id` that occur in `text` as whole words/phrases.

    All terms of the version are loaded and fed into an Aho-Corasick automaton
    keyed by their lowercased form; the lowercased text is then scanned once.
    A hit counts only when the characters directly before and after it are not
    alphanumeric. Each source term is reported at most once, and only when a
    translation for `target_locale` can be found.
    """
    import ahocorasick  # pyahocorasick

    # Only the fields needed for matching and output are fetched.
    term_docs = await db[_COLL_TERMS].find(
        {"version_id": version_id},
        {"source_term": 1, "source_term_lower": 1, "translations": 1},
    ).to_list(length=None)
    if not term_docs:
        return []

    automaton = ahocorasick.Automaton()
    for term_doc in term_docs:
        payload = (term_doc["source_term"], term_doc["translations"])
        automaton.add_word(term_doc["source_term_lower"], payload)
    automaton.make_automaton()

    haystack = text.lower()
    haystack_len = len(haystack)
    hits: list[MatchedTerm] = []
    emitted: set[str] = set()

    for last_idx, (source_term, translations) in automaton.iter(haystack):
        if source_term in emitted:
            continue

        # iter() reports the index of the match's last character.
        first_idx = last_idx - len(source_term.lower()) + 1
        bounded_left = first_idx <= 0 or not haystack[first_idx - 1].isalnum()
        bounded_right = last_idx + 1 >= haystack_len or not haystack[last_idx + 1].isalnum()
        if not (bounded_left and bounded_right):
            continue

        target_text = _get_translation(translations, target_locale)
        if not target_text:
            continue

        emitted.add(source_term)
        hits.append(MatchedTerm(
            source_term=source_term,
            target_translation=target_text,
            match_kind="exact",
            score=1.0,
        ))

    return hits
|
||||
|
||||
|
||||
async def _vector_match(
    db,
    version_id: str,
    text: str,
    target_locale: str,
    top_k: int = 20,
    exclude_terms: set[str] | None = None,
) -> list[MatchedTerm]:
    """Find semantically related terms of `version_id` via Atlas `$vectorSearch`.

    The first 2000 characters of `text` are embedded and used as the query
    vector. Results scoring below _VECTOR_SIMILARITY_THRESHOLD, terms whose
    lowercased source is in `exclude_terms`, and terms without a translation
    for `target_locale` are dropped.
    """
    from ..services.embedding_service import embedding_service

    query_embedding = await embedding_service.embed_text(text[:2000])  # cap input length

    search_stage = {
        "$vectorSearch": {
            "index": _VECTOR_INDEX,
            "path": "embedding",
            "queryVector": query_embedding,
            "numCandidates": top_k * 4,
            "limit": top_k,
            "filter": {"version_id": version_id},
        }
    }
    project_stage = {
        "$project": {
            "source_term": 1,
            "translations": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    }
    candidates = await db[_COLL_TERMS].aggregate([search_stage, project_stage]).to_list(length=top_k)

    hits: list[MatchedTerm] = []
    for candidate in candidates:
        similarity = candidate.get("score", 0.0)
        if similarity < _VECTOR_SIMILARITY_THRESHOLD:
            continue

        source_term = candidate["source_term"]
        already_matched = bool(exclude_terms) and source_term.lower() in exclude_terms
        if already_matched:
            continue

        target_text = _get_translation(candidate["translations"], target_locale)
        if target_text:
            hits.append(MatchedTerm(
                source_term=source_term,
                target_translation=target_text,
                match_kind="vector",
                score=similarity,
            ))

    return hits
|
||||
|
||||
|
||||
def _get_translation(translations: dict[str, str], target_locale: str) -> str | None:
|
||||
"""Look up a translation with locale-fallback: fr-CA → fr-FR → fr → None."""
|
||||
if not translations:
|
||||
return None
|
||||
if target_locale in translations:
|
||||
return translations[target_locale]
|
||||
# Try parent language
|
||||
parent = target_locale.split("-")[0] if "-" in target_locale else None
|
||||
if parent:
|
||||
# Try sibling locales, e.g. fr-CA not found → try fr-FR
|
||||
for code, text in translations.items():
|
||||
if code.startswith(parent + "-") or code == parent:
|
||||
return text
|
||||
return None
|
||||
|
||||
|
||||
# ── Prompt block ──────────────────────────────────────────────────────────────
|
||||
|
||||
def build_glossary_prompt_block(
    matched_terms: Sequence[MatchedTerm],
    target_locale: str,
) -> str:
    """
    Render matched glossary terms as a text block for a Gemini prompt.

    The block is a heading naming the target language (label taken from
    ``locale_lib.get_gemini_label``), one instruction line, and one
    ``- "source" → "translation"`` bullet per term, joined with newlines.

    Returns an empty string when ``matched_terms`` is empty.
    """
    if not matched_terms:
        return ""

    header = [
        f"## Approved {locale_lib.get_gemini_label(target_locale)} terminology",
        "Use these exact translations when the source terms appear — do not deviate:",
    ]
    bullets = [
        f'- "{entry.source_term}" → "{entry.target_translation}"'
        for entry in matched_terms
    ]
    return "\n".join(header + bullets)
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _upload_bytes_to_gcs(data: bytes, gcs_path: str, content_type: str) -> None:
    """Upload ``data`` to ``gcs_path`` in the configured GCS bucket.

    The google-cloud-storage client is blocking, so the upload runs in the
    event loop's default executor instead of on the loop itself.

    Args:
        data: Raw bytes to store.
        gcs_path: Object name inside ``settings.gcs_bucket``.
        content_type: MIME type recorded on the blob.
    """
    import asyncio

    # Inside a coroutine the running loop is the one to use;
    # asyncio.get_event_loop() is discouraged here by the asyncio docs.
    loop = asyncio.get_running_loop()

    def _upload() -> None:
        from google.cloud import storage as gcs_storage

        from ..core.config import settings

        client = gcs_storage.Client(project=settings.gcp_project_id)
        bucket = client.bucket(settings.gcs_bucket)
        blob = bucket.blob(gcs_path)
        blob.content_type = content_type
        blob.upload_from_string(data, content_type=content_type)

    await loop.run_in_executor(None, _upload)
|
||||
|
||||
|
||||
async def _ensure_indexes(db) -> None:
    """Create the indexes used by glossary queries (best effort).

    Each index is attempted on its own so that one failure does not prevent
    the remaining indexes from being created. Failures are logged and
    swallowed; this function never raises.
    """
    index_specs = (
        (_COLL_GLOSSARIES, [("client_id", 1), ("status", 1)]),
        (_COLL_VERSIONS, [("glossary_id", 1), ("version_number", -1)]),
        (_COLL_TERMS, [("version_id", 1), ("source_term_lower", 1)]),
        (_COLL_TERMS, [("glossary_id", 1)]),
    )
    for collection_name, keys in index_specs:
        try:
            await db[collection_name].create_index(keys)
        except Exception as e:
            # create_index is a no-op for an index that already exists with the
            # same spec, so an exception here is a real problem worth surfacing.
            logger.warning(
                f"Index creation failed for {collection_name} {keys} (non-fatal): {e}"
            )
|
||||
|
||||
|
||||
# ── Task helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
async def get_glossary_block_for_job(
    job_doc: dict,
    target_locale: str,
    db,
) -> str:
    """
    Convenience function for Celery tasks: given a job document and a target locale,
    return the formatted glossary block for Gemini prompt injection (or empty string).

    Looks up:
      job_doc.project_id → db.projects → client_id → active glossary version

    The text to match against is read from ``job_doc["_glossary_source_text"]``;
    when it is missing or empty no lookup is performed at all.

    Exact matches come first; vector matches fill the remaining slots up to
    ``_MAX_TERMS_IN_PROMPT``. A vector-search failure only drops the vector part.

    Non-fatal: any failure returns "" so the pipeline continues without a glossary.
    """
    try:
        # Cheap in-memory guards first, so a job without a project or without
        # source text costs no database / cache round-trips.
        project_id = job_doc.get("project_id")
        source_text = job_doc.get("_glossary_source_text", "")
        if not project_id or not source_text:
            return ""

        # NOTE(review): the job's project_id is used as-is for the _id lookup —
        # confirm it is stored in the same type as projects._id.
        project = await db.projects.find_one({"_id": project_id})
        if not project:
            return ""

        client_id = project.get("client_id")
        if not client_id:
            return ""

        active_version_id = await _get_active_version_id(client_id)
        if not active_version_id:
            return ""

        norm_target = locale_lib.normalize_code(target_locale)
        exact_matches = await _exact_match(db, active_version_id, source_text, norm_target)

        remaining = _MAX_TERMS_IN_PROMPT - len(exact_matches)
        already_found = {m.source_term.lower() for m in exact_matches}
        vector_matches: list[MatchedTerm] = []

        if remaining > 0:
            try:
                vector_matches = await _vector_match(
                    db, active_version_id, source_text, norm_target,
                    top_k=_VECTOR_TOP_K, exclude_terms=already_found,
                )
            except Exception as ve:
                # Exact matches are still useful on their own.
                logger.debug(f"Vector search skipped in task context: {ve}")

        combined = exact_matches + vector_matches
        if len(combined) > _MAX_TERMS_IN_PROMPT:
            logger.info(f"glossary_terms_truncated: capped at {_MAX_TERMS_IN_PROMPT}")
            combined = combined[:_MAX_TERMS_IN_PROMPT]

        return build_glossary_prompt_block(combined, target_locale)

    except Exception as e:
        logger.warning(f"Glossary lookup failed for job {job_doc.get('_id')} (non-fatal): {e}")
        return ""
|
||||
|
||||
|
||||
# ── Listing helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
async def get_glossaries_for_client(client_id: str) -> list[Glossary]:
    """Return up to 100 non-archived glossaries of ``client_id``, newest first."""
    db = await get_database()
    not_archived = {
        "client_id": client_id,
        "status": {"$ne": GlossaryStatus.ARCHIVED.value},
    }
    rows = await db[_COLL_GLOSSARIES].find(
        not_archived,
        sort=[("created_at", -1)],
    ).to_list(length=100)
    return list(map(glossary_from_doc, rows))
|
||||
|
||||
|
||||
async def get_glossary(glossary_id: str) -> Glossary | None:
    """Fetch one glossary by id.

    Returns ``None`` when no such document exists or when ``glossary_id`` is
    not a valid ObjectId string (instead of raising ``InvalidId``).
    """
    if not ObjectId.is_valid(glossary_id):
        return None
    db = await get_database()
    doc = await db[_COLL_GLOSSARIES].find_one({"_id": ObjectId(glossary_id)})
    return glossary_from_doc(doc) if doc else None
|
||||
|
||||
|
||||
async def get_versions(glossary_id: str) -> list[GlossaryVersion]:
    """Return up to 50 versions of ``glossary_id``, highest version number first."""
    db = await get_database()
    rows = await db[_COLL_VERSIONS].find(
        {"glossary_id": glossary_id},
        sort=[("version_number", -1)],
    ).to_list(length=50)
    return list(map(glossary_version_from_doc, rows))
|
||||
|
||||
|
||||
async def get_terms_page(
    version_id: str,
    search: str | None = None,
    page: int = 1,
    page_size: int = 50,
) -> tuple[list[GlossaryTerm], int]:
    """Return ``(terms, total_count)`` for the paginated UI preview.

    Args:
        version_id: Glossary version whose terms are listed.
        search: Optional substring; matched literally (regex-escaped) against
            the lower-cased source term.
        page: 1-based page number; values below 1 are treated as 1.
        page_size: Terms per page; values below 1 are treated as 1.

    Returns:
        The terms of the requested page sorted by lower-cased source term, and
        the total number of terms matching the query.
    """
    # Clamp so a bad query parameter cannot produce a negative skip or an
    # unbounded (limit=0) query.
    page = max(page, 1)
    page_size = max(page_size, 1)

    db = await get_database()
    query: dict = {"version_id": version_id}
    if search:
        query["source_term_lower"] = {"$regex": re.escape(search.lower())}

    total = await db[_COLL_TERMS].count_documents(query)
    cursor = db[_COLL_TERMS].find(
        query,
        {"_id": 1, "source_term": 1, "translations": 1},
        skip=(page - 1) * page_size,
        limit=page_size,
        sort=[("source_term_lower", 1)],
    )
    docs = await cursor.to_list(length=page_size)

    terms = []
    for d in docs:
        d["_id"] = str(d["_id"])  # the model is validated with a string id
        terms.append(GlossaryTerm.model_validate(d))
    return terms, total
|
||||
|
|
@ -74,19 +74,16 @@ class TTSService:
|
|||
# Determine which provider to use
|
||||
active_provider = provider or settings.tts_provider
|
||||
|
||||
# Extract simple language code for Gemini (e.g., "en-US" -> "en")
|
||||
simple_lang = language_code.split("-")[0] if "-" in language_code else language_code
|
||||
|
||||
# Try the configured provider first, then fallback
|
||||
if active_provider == "gemini" and self.gemini_available:
|
||||
try:
|
||||
logger.info(
|
||||
f"Using Gemini TTS for language: {simple_lang}, voice: {voice_name}, "
|
||||
f"Using Gemini TTS for language: {language_code}, voice: {voice_name}, "
|
||||
f"model: {model}, speed: {speed}x"
|
||||
)
|
||||
return await gemini_tts_service.synthesize_audio_description(
|
||||
ad_vtt_content,
|
||||
simple_lang,
|
||||
language_code,
|
||||
voice_name,
|
||||
model=model,
|
||||
speed=speed,
|
||||
|
|
@ -135,9 +132,6 @@ class TTSService:
|
|||
# Determine which provider to use
|
||||
active_provider = provider or settings.tts_provider
|
||||
|
||||
# Extract simple language code for Gemini (e.g., "en-US" -> "en")
|
||||
simple_lang = language_code.split("-")[0] if "-" in language_code else language_code
|
||||
|
||||
# Parse VTT cues first
|
||||
cues = self._parse_ad_cues(ad_vtt_content)
|
||||
if not cues:
|
||||
|
|
@ -169,7 +163,7 @@ class TTSService:
|
|||
if active_provider == "gemini" and self.gemini_available:
|
||||
audio_data = await gemini_tts_service.synthesize_text(
|
||||
text, voice_name or gemini_tts_service.default_voice,
|
||||
simple_lang, model=model, speed=speed, style_prompt=style_prompt
|
||||
language_code, model=model, speed=speed, style_prompt=style_prompt
|
||||
)
|
||||
elif self.google_client:
|
||||
audio_data = await self._synthesize_text_google(text, language_code, voice_name)
|
||||
|
|
|
|||
|
|
@ -128,6 +128,7 @@ def import_task_modules():
|
|||
from . import notify # noqa: E402, F401
|
||||
from . import ffmpeg_operations # noqa: E402, F401
|
||||
from . import whisper_transcribe # noqa: E402, F401
|
||||
from . import embed_glossary # noqa: E402, F401
|
||||
logger.info("Successfully imported all task modules")
|
||||
except Exception as e:
|
||||
logger.error(f"Error importing task modules: {e}")
|
||||
|
|
|
|||
102
backend/app/tasks/embed_glossary.py
Normal file
102
backend/app/tasks/embed_glossary.py
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
"""
|
||||
Celery task: compute and store Gemini embeddings for all terms in a glossary version.
|
||||
|
||||
Runs as a background job after glossary ingestion so the API response is fast.
|
||||
Processes terms in batches of 100 and updates embedded_count incrementally.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
||||
from bson import ObjectId
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
from ..models.glossary import EmbeddingStatus
|
||||
from . import celery_app
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
_BATCH_SIZE = 100
|
||||
|
||||
|
||||
@celery_app.task(name="embed_glossary_version", bind=True, max_retries=3)
def embed_glossary_version_task(self, version_id: str) -> dict:
    """
    Celery entry point: embed every GlossaryTerm of `version_id`.

    Runs the async worker `_async_embed_version` to completion and returns its
    result dict. On any exception the failure is logged and the task is
    re-queued via `self.retry` with a 60 second countdown.
    """
    try:
        return asyncio.run(_async_embed_version(version_id))
    except Exception as exc:
        logger.error(f"embed_glossary_version_task failed for {version_id}: {exc}")
        raise self.retry(exc=exc, countdown=60) from None
|
||||
|
||||
|
||||
async def _async_embed_version(version_id: str) -> dict:
    """Compute and store embeddings for all not-yet-embedded terms of a version.

    Marks the GlossaryVersion as in-progress, embeds the terms in batches of
    ``_BATCH_SIZE``, updates ``embedded_count`` after every batch and finally
    marks the version as done. On any error the version is marked as failed
    and the exception is re-raised so the Celery task can retry.

    ``embedded_count`` always reflects the number of terms of the version that
    carry an embedding, including terms embedded by an earlier, interrupted run.

    Args:
        version_id: String form of the GlossaryVersion ``_id``.

    Returns:
        ``{"version_id": ..., "total": <terms embedded by this run>}``.

    Raises:
        ValueError: if the embedding service returns a different number of
            vectors than texts were sent.
    """
    from pymongo import UpdateOne

    from ..services.embedding_service import embedding_service

    # Built before the client is opened so an invalid id cannot leak a connection.
    version_filter = {"_id": ObjectId(version_id)}

    mongo_client = AsyncIOMotorClient(settings.mongodb_uri)
    db = mongo_client[settings.mongodb_db]

    try:
        # Mark in-progress
        await db.glossary_versions.update_one(
            version_filter,
            {"$set": {"embedding_status": EmbeddingStatus.IN_PROGRESS.value}},
        )

        # Terms that already carry an embedding (e.g. from a run that failed
        # part-way and is now being retried) still count towards the total.
        embedded_count = await db.glossary_terms.count_documents(
            {"version_id": version_id, "embedding": {"$ne": None}}
        )

        # Fetch all terms without embeddings (null or missing field)
        cursor = db.glossary_terms.find(
            {"version_id": version_id, "embedding": None},
            {"_id": 1, "source_term": 1},
        )
        terms = await cursor.to_list(length=None)
        total = len(terms)
        target_count = embedded_count + total
        logger.info(f"Embedding {total} terms for version {version_id}")

        for i in range(0, total, _BATCH_SIZE):
            batch = terms[i: i + _BATCH_SIZE]
            embeddings = await embedding_service.embed_texts(
                [t["source_term"] for t in batch]
            )

            # A short response would silently leave terms without embeddings
            # while the version is still reported as fully embedded.
            if len(embeddings) != len(batch):
                raise ValueError(
                    f"Embedding service returned {len(embeddings)} vectors "
                    f"for {len(batch)} terms (version {version_id})"
                )

            ops = [
                UpdateOne({"_id": term["_id"]}, {"$set": {"embedding": embedding}})
                for term, embedding in zip(batch, embeddings)
            ]
            if ops:
                await db.glossary_terms.bulk_write(ops, ordered=False)

            embedded_count += len(ops)
            await db.glossary_versions.update_one(
                version_filter,
                {"$set": {"embedded_count": embedded_count}},
            )
            logger.info(f"Version {version_id}: embedded {embedded_count}/{target_count}")

        # Mark done
        await db.glossary_versions.update_one(
            version_filter,
            {"$set": {
                "embedding_status": EmbeddingStatus.DONE.value,
                "embedded_count": embedded_count,
            }},
        )
        logger.info(f"Embedding complete for version {version_id}: {total} terms")
        return {"version_id": version_id, "total": total}

    except Exception:
        # Best effort: a failure to record the FAILED status must not replace
        # the original exception that the caller needs for its retry decision.
        try:
            await db.glossary_versions.update_one(
                version_filter,
                {"$set": {"embedding_status": EmbeddingStatus.FAILED.value}},
            )
        except Exception as status_exc:
            logger.error(
                f"Could not mark version {version_id} as failed: {status_exc}"
            )
        raise
    finally:
        mongo_client.close()
|
||||
|
|
@ -219,6 +219,9 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
|
||||
# Get translation mode (default to "traditional" for backwards compatibility)
|
||||
translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional")
|
||||
|
||||
# Glossary: lazy-loaded per target language during the loop
|
||||
from ..services.glossary_service import get_glossary_block_for_job
|
||||
logger.info(f"Translation mode for job {job_id}: {translation_mode}")
|
||||
|
||||
sdh_requested = job_doc["requested_outputs"].get("sdh_vtt", False)
|
||||
|
|
@ -293,12 +296,17 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
project_id=_cost_ctx["project_id"],
|
||||
)
|
||||
|
||||
# Build glossary block from source VTT for this language
|
||||
_job_for_glossary = {**job_doc, "_glossary_source_text": ""}
|
||||
_glossary = await get_glossary_block_for_job(_job_for_glossary, lang, db)
|
||||
|
||||
async def extract_targeted():
|
||||
return await gemini_service.extract_accessibility_targeted(
|
||||
video_local_path,
|
||||
lang,
|
||||
brand_context=job_brand_context,
|
||||
sdh_requested=sdh_requested,
|
||||
glossary_block=_glossary,
|
||||
_cost_ctx=_cost_ctx,
|
||||
)
|
||||
|
||||
|
|
@ -382,6 +390,9 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)")
|
||||
|
||||
else:
|
||||
# Combine source VTTs for glossary term matching
|
||||
_source_text_for_glossary = " ".join(filter(None, [source_captions_vtt, source_ad_vtt]))
|
||||
|
||||
# TRADITIONAL MODE: Process languages sequentially
|
||||
for language in target_languages:
|
||||
logger.info(f"Processing language: {language} (from source: {source_language}, mode: {translation_mode})")
|
||||
|
|
@ -392,6 +403,10 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
project_id=_cost_ctx["project_id"],
|
||||
)
|
||||
|
||||
# Lookup glossary terms for this target language
|
||||
_job_for_glossary = {**job_doc, "_glossary_source_text": _source_text_for_glossary}
|
||||
_glossary = await get_glossary_block_for_job(_job_for_glossary, language, db)
|
||||
|
||||
try:
|
||||
if language in transcreation_languages:
|
||||
# TRADITIONAL MODE with transcreation: cultural adaptation
|
||||
|
|
@ -401,6 +416,7 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
source_ad_vtt,
|
||||
language,
|
||||
brief="Standard accessibility content",
|
||||
glossary_block=_glossary,
|
||||
_cost_ctx=_cost_ctx,
|
||||
)
|
||||
|
||||
|
|
@ -414,12 +430,14 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
async def translate_captions():
|
||||
return await gemini_service.translate_vtt(
|
||||
source_captions_vtt, language, source_language=source_language,
|
||||
glossary_block=_glossary,
|
||||
_cost_ctx=_cost_ctx,
|
||||
)
|
||||
|
||||
async def translate_ad():
|
||||
return await gemini_service.translate_vtt(
|
||||
source_ad_vtt, language, source_language=source_language,
|
||||
glossary_block=_glossary,
|
||||
_cost_ctx=_cost_ctx,
|
||||
)
|
||||
|
||||
|
|
@ -448,6 +466,7 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
async def translate_sdh():
|
||||
return await gemini_service.translate_vtt(
|
||||
source_sdh_vtt, language, source_language=source_language,
|
||||
glossary_block=_glossary,
|
||||
_cost_ctx=_cost_ctx,
|
||||
)
|
||||
translated_sdh = await retry_with_backoff(translate_sdh, max_retries=3)
|
||||
|
|
|
|||
|
|
@ -42,6 +42,8 @@ python-magic = "^0.4.27"
|
|||
aiohttp = "^3.12.15"
|
||||
jinja2 = "^3.1.6"
|
||||
audioop-lts = {version = "^0.2.2", python = ">=3.13"}
|
||||
openpyxl = "^3.1.2"
|
||||
pyahocorasick = "^2.1.1"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^7.4.3"
|
||||
|
|
|
|||
|
|
@ -18,6 +18,9 @@ import { UserList } from './routes/admin/UserList';
|
|||
import { UserDetail } from './routes/admin/UserDetail';
|
||||
import { ClientList } from './routes/admin/ClientList';
|
||||
import { ClientDetail } from './routes/admin/ClientDetail';
|
||||
import { GlossaryList } from './routes/admin/glossaries/GlossaryList';
|
||||
import { GlossaryUpload } from './routes/admin/glossaries/GlossaryUpload';
|
||||
import { GlossaryDetail } from './routes/admin/glossaries/GlossaryDetail';
|
||||
import { AuditLog } from './routes/admin/AuditLog';
|
||||
import { LinguistQueue } from './routes/jobs/LinguistQueue';
|
||||
import { Downloads } from './routes/Downloads';
|
||||
|
|
@ -149,6 +152,27 @@ function AppContent() {
|
|||
</RoleGate>
|
||||
</AuthenticatedRoute>
|
||||
} />
|
||||
<Route path="/admin/clients/:clientId/glossaries" element={
|
||||
<AuthenticatedRoute>
|
||||
<RoleGate allowedRoles={['admin', 'project_manager', 'linguist', 'reviewer', 'production']}>
|
||||
<GlossaryList />
|
||||
</RoleGate>
|
||||
</AuthenticatedRoute>
|
||||
} />
|
||||
<Route path="/admin/clients/:clientId/glossaries/upload" element={
|
||||
<AuthenticatedRoute>
|
||||
<RoleGate allowedRoles={['admin', 'project_manager']}>
|
||||
<GlossaryUpload />
|
||||
</RoleGate>
|
||||
</AuthenticatedRoute>
|
||||
} />
|
||||
<Route path="/admin/clients/:clientId/glossaries/:glossaryId" element={
|
||||
<AuthenticatedRoute>
|
||||
<RoleGate allowedRoles={['admin', 'project_manager', 'linguist', 'reviewer', 'production']}>
|
||||
<GlossaryDetail />
|
||||
</RoleGate>
|
||||
</AuthenticatedRoute>
|
||||
} />
|
||||
<Route path="/admin/audit-log" element={
|
||||
<AuthenticatedRoute>
|
||||
<RoleGate allowedRoles={['production', 'admin']}>
|
||||
|
|
|
|||
|
|
@ -59,6 +59,10 @@ import type {
|
|||
LanguageQCMapResponse,
|
||||
LanguageQCStateResponse,
|
||||
QueueResponse,
|
||||
Glossary,
|
||||
GlossaryDetail,
|
||||
GlossaryVersion,
|
||||
GlossaryTermsResponse,
|
||||
} from '../types/api';
|
||||
|
||||
const API_BASE_URL = import.meta.env.VITE_API_BASE_URL || 'http://localhost:8000';
|
||||
|
|
@ -761,6 +765,84 @@ class ApiClient {
|
|||
const r = await this.client.get(`/me/language-qc-queue?${params.toString()}`);
|
||||
return r.data;
|
||||
}
|
||||
|
||||
// ── Glossary endpoints ──────────────────────────────────────────────────────
|
||||
|
||||
/** List the glossaries of a client (GET /clients/{clientId}/glossaries). */
async getGlossaries(clientId: string): Promise<Glossary[]> {
  const { data } = await this.client.get(`/clients/${clientId}/glossaries`);
  return data;
}
|
||||
|
||||
/** Fetch one glossary of a client (GET /clients/{clientId}/glossaries/{glossaryId}). */
async getGlossary(clientId: string, glossaryId: string): Promise<GlossaryDetail> {
  const { data } = await this.client.get(`/clients/${clientId}/glossaries/${glossaryId}`);
  return data;
}
|
||||
|
||||
/**
 * Create a glossary for a client by uploading a file as multipart form data
 * (POST /clients/{clientId}/glossaries, 120 s timeout).
 * `description` and `changeNote` are only sent when non-empty.
 */
async uploadGlossary(
  clientId: string,
  file: File,
  name: string,
  sourceLocale: string,
  sourceLocaleCol: string,
  description?: string,
  changeNote?: string,
): Promise<GlossaryDetail> {
  const payload = new FormData();
  payload.append('file', file);
  payload.append('name', name);
  payload.append('source_locale', sourceLocale);
  payload.append('source_locale_col', sourceLocaleCol);
  if (description) {
    payload.append('description', description);
  }
  if (changeNote) {
    payload.append('change_note', changeNote);
  }

  const requestConfig = {
    headers: { 'Content-Type': 'multipart/form-data' },
    timeout: 120000,
  };
  const { data } = await this.client.post(`/clients/${clientId}/glossaries`, payload, requestConfig);
  return data;
}
|
||||
|
||||
/**
 * Upload a new version of an existing glossary as multipart form data
 * (POST /clients/{clientId}/glossaries/{glossaryId}/versions, 120 s timeout).
 * `changeNote` is only sent when non-empty.
 */
async uploadGlossaryVersion(
  clientId: string,
  glossaryId: string,
  file: File,
  sourceLocaleCol: string,
  changeNote?: string,
): Promise<GlossaryVersion> {
  const payload = new FormData();
  payload.append('file', file);
  payload.append('source_locale_col', sourceLocaleCol);
  if (changeNote) {
    payload.append('change_note', changeNote);
  }

  const requestConfig = {
    headers: { 'Content-Type': 'multipart/form-data' },
    timeout: 120000,
  };
  const { data } = await this.client.post(
    `/clients/${clientId}/glossaries/${glossaryId}/versions`,
    payload,
    requestConfig,
  );
  return data;
}
|
||||
|
||||
/**
 * Activate a glossary version; the version id is sent as the form field
 * `version_id` (POST /clients/{clientId}/glossaries/{glossaryId}/activate).
 */
async activateGlossaryVersion(
  clientId: string,
  glossaryId: string,
  versionId: string,
): Promise<{ status: string; active_version_id: string }> {
  const payload = new FormData();
  payload.append('version_id', versionId);
  const { data } = await this.client.post(`/clients/${clientId}/glossaries/${glossaryId}/activate`, payload);
  return data;
}
|
||||
|
||||
/**
 * Fetch one page of glossary terms
 * (GET /clients/{clientId}/glossaries/{glossaryId}/terms).
 * Each option is only added to the query string when it is truthy.
 */
async getGlossaryTerms(
  clientId: string,
  glossaryId: string,
  opts?: { versionId?: string; search?: string; page?: number; pageSize?: number },
): Promise<GlossaryTermsResponse> {
  const { versionId, search, page, pageSize } = opts ?? {};
  const query = new URLSearchParams();
  if (versionId) {
    query.append('version_id', versionId);
  }
  if (search) {
    query.append('search', search);
  }
  if (page) {
    query.append('page', String(page));
  }
  if (pageSize) {
    query.append('page_size', String(pageSize));
  }
  const { data } = await this.client.get(
    `/clients/${clientId}/glossaries/${glossaryId}/terms?${query.toString()}`,
  );
  return data;
}
|
||||
|
||||
/** Archive a glossary (DELETE /clients/{clientId}/glossaries/{glossaryId}). */
async archiveGlossary(clientId: string, glossaryId: string): Promise<void> {
  const url = `/clients/${clientId}/glossaries/${glossaryId}`;
  await this.client.delete(url);
}
|
||||
}
|
||||
|
||||
export const apiClient = new ApiClient();
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
import { useState } from 'react';
|
||||
import { useParams } from 'react-router-dom';
|
||||
import { useParams, Link } from 'react-router-dom';
|
||||
import { useQuery } from '@tanstack/react-query';
|
||||
import { apiClient } from '../../lib/api';
|
||||
import type { Glossary } from '../../types/api';
|
||||
import {
|
||||
useClient,
|
||||
useTeams, useCreateTeam, useUpdateTeam, useDeleteTeam,
|
||||
|
|
@ -50,6 +53,12 @@ export function ClientDetail() {
|
|||
|
||||
const [pmUserId, setPmUserId] = useState('');
|
||||
|
||||
const { data: glossaries = [] } = useQuery<Glossary[]>({
|
||||
queryKey: ['glossaries', clientId],
|
||||
queryFn: () => apiClient.getGlossaries(clientId!),
|
||||
enabled: !!clientId,
|
||||
});
|
||||
|
||||
if (clientLoading) {
|
||||
return <div className="container mx-auto px-4 py-8 animate-pulse"><div className="h-8 bg-gray-200 rounded w-1/3" /></div>;
|
||||
}
|
||||
|
|
@ -337,6 +346,53 @@ export function ClientDetail() {
|
|||
</form>
|
||||
</section>
|
||||
|
||||
{/* Glossaries */}
|
||||
<section className="bg-white rounded-xl border border-gray-200 p-5">
|
||||
<div className="flex items-center justify-between mb-4">
|
||||
<h2 className="text-base font-semibold text-gray-800">Glossaries</h2>
|
||||
<Link
|
||||
to={`/admin/clients/${clientId}/glossaries`}
|
||||
className="text-sm text-blue-600 hover:text-blue-700"
|
||||
>
|
||||
View all →
|
||||
</Link>
|
||||
</div>
|
||||
{glossaries.length === 0 ? (
|
||||
<p className="text-sm text-gray-400">No glossaries yet</p>
|
||||
) : (
|
||||
<div className="space-y-2">
|
||||
{glossaries.slice(0, 3).map(g => (
|
||||
<div key={g.id} className="flex items-center justify-between py-1.5">
|
||||
<Link
|
||||
to={`/admin/clients/${clientId}/glossaries/${g.id}`}
|
||||
className="text-sm text-gray-800 hover:text-blue-600"
|
||||
>
|
||||
{g.name}
|
||||
</Link>
|
||||
<span className={`text-xs px-2 py-0.5 rounded-full font-medium ${
|
||||
g.status === 'active' ? 'bg-green-100 text-green-700' : 'bg-gray-100 text-gray-500'
|
||||
}`}>
|
||||
{g.status}
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
{glossaries.length > 3 && (
|
||||
<p className="text-xs text-gray-400">+{glossaries.length - 3} more</p>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
{(isAdmin || user?.role === 'project_manager') && (
|
||||
<div className="mt-4">
|
||||
<Link
|
||||
to={`/admin/clients/${clientId}/glossaries/upload`}
|
||||
className="text-sm text-blue-600 hover:underline"
|
||||
>
|
||||
+ Upload glossary
|
||||
</Link>
|
||||
</div>
|
||||
)}
|
||||
</section>
|
||||
|
||||
{/* Rename team modal */}
|
||||
{editingTeam && (
|
||||
<div className="fixed inset-0 bg-black/50 flex items-center justify-center z-50">
|
||||
|
|
|
|||
335
frontend/src/routes/admin/glossaries/GlossaryDetail.tsx
Normal file
335
frontend/src/routes/admin/glossaries/GlossaryDetail.tsx
Normal file
|
|
@ -0,0 +1,335 @@
|
|||
import { useState, useRef } from 'react';
|
||||
import { useParams, Link } from 'react-router-dom';
|
||||
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query';
|
||||
import { apiClient } from '../../../lib/api';
|
||||
import { useToastContext } from '../../../contexts/ToastContext';
|
||||
import { useAuthStore } from '../../../lib/auth';
|
||||
import type { GlossaryVersion, GlossaryDetail as GlossaryDetailType } from '../../../types/api';
|
||||
|
||||
type Tab = 'terms' | 'versions';
|
||||
|
||||
/**
 * Small status label for a glossary version's embedding state:
 * done (with counts), in progress (with percentage and counts), failed,
 * or — for any other status — pending.
 */
function EmbeddingPill({ v }: { v: GlossaryVersion }) {
  const { embedding_status: status, embedded_count: embedded, term_count: termCount } = v;

  if (status === 'done') {
    return <span className="text-xs text-green-600 font-medium">Embedded ({embedded}/{termCount})</span>;
  }

  if (status === 'in_progress') {
    // Guard against division by zero for a version without terms.
    const pct = termCount > 0 ? Math.round((embedded / termCount) * 100) : 0;
    return (
      <span className="text-xs text-blue-600 animate-pulse font-medium">
        Embedding… {pct}% ({embedded}/{termCount})
      </span>
    );
  }

  if (status === 'failed') {
    return <span className="text-xs text-red-500 font-medium">Embedding failed</span>;
  }

  return <span className="text-xs text-gray-400">Pending embedding</span>;
}
|
||||
|
||||
export function GlossaryDetail() {
|
||||
const { clientId, glossaryId } = useParams<{ clientId: string; glossaryId: string }>();
|
||||
const { user } = useAuthStore();
|
||||
const toast = useToastContext();
|
||||
const qc = useQueryClient();
|
||||
const isAdmin = user?.role === 'admin';
|
||||
const isPM = user?.role === 'project_manager';
|
||||
|
||||
const [tab, setTab] = useState<Tab>('terms');
|
||||
const [search, setSearch] = useState('');
|
||||
const [page, setPage] = useState(1);
|
||||
|
||||
// New version upload state
|
||||
const [showVersionUpload, setShowVersionUpload] = useState(false);
|
||||
const [versionFile, setVersionFile] = useState<File | null>(null);
|
||||
const [versionSourceCol, setVersionSourceCol] = useState('');
|
||||
const [versionChangeNote, setVersionChangeNote] = useState('');
|
||||
const versionFileRef = useRef<HTMLInputElement>(null);
|
||||
|
||||
const PAGE_SIZE = 50;
|
||||
|
||||
const { data: glossary, isLoading } = useQuery<GlossaryDetailType>({
|
||||
queryKey: ['glossary', clientId, glossaryId],
|
||||
queryFn: () => apiClient.getGlossary(clientId!, glossaryId!),
|
||||
enabled: !!clientId && !!glossaryId,
|
||||
refetchInterval: (q) => {
|
||||
const g = q.state.data as GlossaryDetailType | undefined;
|
||||
if (!g) return false;
|
||||
const hasInProgress = g.versions.some(v => v.embedding_status === 'in_progress' || v.embedding_status === 'pending');
|
||||
return hasInProgress ? 5000 : false;
|
||||
},
|
||||
});
|
||||
|
||||
const { data: termsData, isLoading: termsLoading } = useQuery({
|
||||
queryKey: ['glossary-terms', clientId, glossaryId, search, page],
|
||||
queryFn: () => apiClient.getGlossaryTerms(clientId!, glossaryId!, { search: search || undefined, page, pageSize: PAGE_SIZE }),
|
||||
enabled: !!clientId && !!glossaryId && tab === 'terms',
|
||||
placeholderData: (prev) => prev,
|
||||
});
|
||||
|
||||
const activateMut = useMutation({
|
||||
mutationFn: (versionId: string) => apiClient.activateGlossaryVersion(clientId!, glossaryId!, versionId),
|
||||
onSuccess: () => {
|
||||
qc.invalidateQueries({ queryKey: ['glossary', clientId, glossaryId] });
|
||||
qc.invalidateQueries({ queryKey: ['glossaries', clientId] });
|
||||
toast.success('Version activated');
|
||||
},
|
||||
onError: () => toast.error('Failed to activate version'),
|
||||
});
|
||||
|
||||
const uploadVersionMut = useMutation({
|
||||
mutationFn: () => apiClient.uploadGlossaryVersion(clientId!, glossaryId!, versionFile!, versionSourceCol.trim(), versionChangeNote.trim() || undefined),
|
||||
onSuccess: () => {
|
||||
qc.invalidateQueries({ queryKey: ['glossary', clientId, glossaryId] });
|
||||
setShowVersionUpload(false);
|
||||
setVersionFile(null);
|
||||
setVersionSourceCol('');
|
||||
setVersionChangeNote('');
|
||||
toast.success('New version uploaded — embedding in background');
|
||||
},
|
||||
onError: (err: unknown) => {
|
||||
const msg = (err as { response?: { data?: { detail?: string } } })?.response?.data?.detail ?? 'Upload failed';
|
||||
toast.error(msg);
|
||||
},
|
||||
});
|
||||
|
||||
if (isLoading || !glossary) {
|
||||
return (
|
||||
<div className="container mx-auto px-4 py-8 max-w-4xl animate-pulse space-y-4">
|
||||
<div className="h-8 bg-gray-200 rounded w-1/3" />
|
||||
<div className="h-48 bg-gray-200 rounded-xl" />
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
const activeVersion = glossary.versions.find(v => v.id === glossary.current_version_id);
|
||||
const totalPages = termsData ? Math.ceil(termsData.total / PAGE_SIZE) : 1;
|
||||
|
||||
return (
|
||||
<div className="container mx-auto px-4 py-8 max-w-4xl space-y-6">
|
||||
{/* Header */}
|
||||
<div>
|
||||
<p className="text-sm text-gray-400 mb-1">
|
||||
← <Link to={`/admin/clients/${clientId}/glossaries`} className="hover:text-blue-600">Glossaries</Link>
|
||||
</p>
|
||||
<div className="flex items-start justify-between gap-4">
|
||||
<div>
|
||||
<h1 className="text-2xl font-bold text-gray-900">{glossary.name}</h1>
|
||||
{glossary.description && <p className="text-sm text-gray-500 mt-0.5">{glossary.description}</p>}
|
||||
<p className="text-xs text-gray-400 mt-1">
|
||||
Source: <span className="font-mono">{glossary.source_locale}</span>
|
||||
{activeVersion && (
|
||||
<> · Active: v{activeVersion.version_number} · {activeVersion.term_count.toLocaleString()} terms</>
|
||||
)}
|
||||
</p>
|
||||
{activeVersion && (
|
||||
<div className="mt-1"><EmbeddingPill v={activeVersion} /></div>
|
||||
)}
|
||||
</div>
|
||||
{(isAdmin || isPM) && (
|
||||
<button
|
||||
onClick={() => setShowVersionUpload(!showVersionUpload)}
|
||||
className="px-3 py-1.5 border border-gray-300 text-sm rounded-lg hover:bg-gray-50 shrink-0"
|
||||
>
|
||||
+ New version
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* New version upload panel */}
|
||||
{showVersionUpload && (
|
||||
<div className="bg-blue-50 border border-blue-200 rounded-xl p-5 space-y-4">
|
||||
<h3 className="text-sm font-semibold text-blue-900">Upload new version</h3>
|
||||
<div
|
||||
className="border-2 border-dashed border-blue-300 rounded-lg p-6 text-center cursor-pointer hover:border-blue-400"
|
||||
onClick={() => versionFileRef.current?.click()}
|
||||
>
|
||||
<input
|
||||
ref={versionFileRef}
|
||||
type="file"
|
||||
accept=".xlsx"
|
||||
className="hidden"
|
||||
onChange={(e) => setVersionFile(e.target.files?.[0] ?? null)}
|
||||
/>
|
||||
{versionFile
|
||||
? <p className="text-sm font-medium text-gray-900">{versionFile.name}</p>
|
||||
: <p className="text-sm text-gray-500">Click to select .xlsx file</p>
|
||||
}
|
||||
</div>
|
||||
<div>
|
||||
<label className="block text-xs font-medium text-gray-700 mb-1">Source column header *</label>
|
||||
<input
|
||||
type="text"
|
||||
value={versionSourceCol}
|
||||
onChange={e => setVersionSourceCol(e.target.value)}
|
||||
placeholder="e.g. en_gb"
|
||||
className="w-full border border-gray-300 rounded-lg px-3 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
|
||||
/>
|
||||
</div>
|
||||
<div>
|
||||
<label className="block text-xs font-medium text-gray-700 mb-1">Change note (optional)</label>
|
||||
<input
|
||||
type="text"
|
||||
value={versionChangeNote}
|
||||
onChange={e => setVersionChangeNote(e.target.value)}
|
||||
placeholder="e.g. Updated Q2 terms"
|
||||
className="w-full border border-gray-300 rounded-lg px-3 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
|
||||
/>
|
||||
</div>
|
||||
<div className="flex gap-2">
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setShowVersionUpload(false)}
|
||||
className="px-3 py-1.5 border border-gray-300 text-sm rounded-lg hover:bg-gray-50"
|
||||
>
|
||||
Cancel
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
disabled={!versionFile || !versionSourceCol.trim() || uploadVersionMut.isPending}
|
||||
onClick={() => uploadVersionMut.mutate()}
|
||||
className="px-4 py-1.5 bg-blue-600 text-white text-sm font-medium rounded-lg hover:bg-blue-700 disabled:opacity-50"
|
||||
>
|
||||
{uploadVersionMut.isPending ? 'Uploading…' : 'Upload'}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Tabs */}
|
||||
<div className="border-b border-gray-200">
|
||||
<nav className="flex gap-6">
|
||||
{(['terms', 'versions'] as Tab[]).map(t => (
|
||||
<button
|
||||
key={t}
|
||||
onClick={() => setTab(t)}
|
||||
className={`pb-3 text-sm font-medium border-b-2 transition-colors capitalize ${
|
||||
tab === t ? 'border-blue-600 text-blue-600' : 'border-transparent text-gray-500 hover:text-gray-700'
|
||||
}`}
|
||||
>
|
||||
{t}
|
||||
{t === 'versions' && <span className="ml-1 text-xs text-gray-400">({glossary.versions.length})</span>}
|
||||
</button>
|
||||
))}
|
||||
</nav>
|
||||
</div>
|
||||
|
||||
{/* Terms tab */}
|
||||
{tab === 'terms' && (
|
||||
<div className="space-y-4">
|
||||
<input
|
||||
type="text"
|
||||
value={search}
|
||||
onChange={e => { setSearch(e.target.value); setPage(1); }}
|
||||
placeholder="Search terms…"
|
||||
className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
|
||||
/>
|
||||
|
||||
{termsLoading ? (
|
||||
<div className="animate-pulse space-y-2">
|
||||
{[1,2,3,4,5].map(i => <div key={i} className="h-10 bg-gray-100 rounded" />)}
|
||||
</div>
|
||||
) : termsData?.terms.length === 0 ? (
|
||||
<p className="text-sm text-gray-400 text-center py-8">
|
||||
{search ? 'No terms match your search' : 'No terms in this glossary yet'}
|
||||
</p>
|
||||
) : (
|
||||
<>
|
||||
<div className="text-xs text-gray-400">{termsData?.total.toLocaleString()} terms total</div>
|
||||
<div className="border border-gray-200 rounded-xl overflow-hidden">
|
||||
<table className="w-full text-sm">
|
||||
<thead className="bg-gray-50 border-b border-gray-200">
|
||||
<tr>
|
||||
<th className="text-left px-4 py-2.5 font-medium text-gray-600 w-1/3">Source term</th>
|
||||
<th className="text-left px-4 py-2.5 font-medium text-gray-600">Translations</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody className="divide-y divide-gray-100">
|
||||
{termsData?.terms.map((term, idx) => (
|
||||
<tr key={idx} className="hover:bg-gray-50">
|
||||
<td className="px-4 py-2.5 font-medium text-gray-900 align-top">{term.source_term}</td>
|
||||
<td className="px-4 py-2.5 text-gray-600">
|
||||
<div className="flex flex-wrap gap-2">
|
||||
{Object.entries(term.translations).slice(0, 6).map(([locale, text]) => (
|
||||
<span key={locale} className="inline-flex items-center gap-1 text-xs bg-gray-100 rounded px-1.5 py-0.5">
|
||||
<span className="font-mono text-gray-400">{locale}</span>
|
||||
<span className="text-gray-700 truncate max-w-[120px]">{text}</span>
|
||||
</span>
|
||||
))}
|
||||
{Object.keys(term.translations).length > 6 && (
|
||||
<span className="text-xs text-gray-400">+{Object.keys(term.translations).length - 6} more</span>
|
||||
)}
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
))}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
{totalPages > 1 && (
|
||||
<div className="flex items-center justify-between pt-2">
|
||||
<button
|
||||
disabled={page <= 1}
|
||||
onClick={() => setPage(p => p - 1)}
|
||||
className="px-3 py-1 text-sm border rounded disabled:opacity-40"
|
||||
>
|
||||
← Previous
|
||||
</button>
|
||||
<span className="text-sm text-gray-500">Page {page} of {totalPages}</span>
|
||||
<button
|
||||
disabled={page >= totalPages}
|
||||
onClick={() => setPage(p => p + 1)}
|
||||
className="px-3 py-1 text-sm border rounded disabled:opacity-40"
|
||||
>
|
||||
Next →
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Versions tab */}
|
||||
{tab === 'versions' && (
|
||||
<div className="space-y-3">
|
||||
{glossary.versions.map((v) => {
|
||||
const isActive = v.id === glossary.current_version_id;
|
||||
return (
|
||||
<div
|
||||
key={v.id}
|
||||
className={`rounded-xl border p-4 flex items-start justify-between gap-4 ${
|
||||
isActive ? 'border-blue-300 bg-blue-50' : 'border-gray-200 bg-white'
|
||||
}`}
|
||||
>
|
||||
<div>
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
<span className="text-sm font-semibold text-gray-900">Version {v.version_number}</span>
|
||||
{isActive && (
|
||||
<span className="text-xs bg-blue-600 text-white px-2 py-0.5 rounded-full font-medium">Active</span>
|
||||
)}
|
||||
</div>
|
||||
<p className="text-xs text-gray-500">
|
||||
{v.term_count.toLocaleString()} terms · uploaded {new Date(v.created_at).toLocaleDateString()}
|
||||
</p>
|
||||
{v.change_note && <p className="text-xs text-gray-400 mt-0.5 italic">"{v.change_note}"</p>}
|
||||
<div className="mt-1"><EmbeddingPill v={v} /></div>
|
||||
</div>
|
||||
{(isAdmin || isPM) && !isActive && (
|
||||
<button
|
||||
onClick={() => {
|
||||
if (confirm(`Activate version ${v.version_number}? AI translations will start using this version.`)) {
|
||||
activateMut.mutate(v.id);
|
||||
}
|
||||
}}
|
||||
disabled={activateMut.isPending}
|
||||
className="text-xs px-3 py-1.5 bg-blue-600 text-white rounded-lg hover:bg-blue-700 disabled:opacity-50 shrink-0"
|
||||
>
|
||||
Activate
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
131
frontend/src/routes/admin/glossaries/GlossaryList.tsx
Normal file
131
frontend/src/routes/admin/glossaries/GlossaryList.tsx
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query';
|
||||
import { Link, useParams } from 'react-router-dom';
|
||||
import { apiClient } from '../../../lib/api';
|
||||
import { useToastContext } from '../../../contexts/ToastContext';
|
||||
import { useAuthStore } from '../../../lib/auth';
|
||||
import type { Glossary } from '../../../types/api';
|
||||
|
||||
/**
 * Pick the Tailwind colour classes for a glossary status pill.
 *
 * `'active'` gets the green treatment; every other value falls back to grey.
 */
function statusBadge(status: string) {
  if (status === 'active') {
    return 'bg-green-100 text-green-700';
  }
  return 'bg-gray-100 text-gray-500';
}
|
||||
|
||||
/**
 * Render a small inline label describing an embedding status.
 *
 * Recognised values are `'done'`, `'in_progress'` and `'failed'`; anything
 * else (including an empty string) is shown as "Pending embed".
 */
function embeddingBadge(status: string) {
  if (status === 'done') {
    return <span className="text-xs text-green-600">Embedded ✓</span>;
  }
  if (status === 'in_progress') {
    return <span className="text-xs text-blue-600 animate-pulse">Embedding…</span>;
  }
  if (status === 'failed') {
    return <span className="text-xs text-red-500">Embed failed</span>;
  }
  // Fallback for 'pending' and for any unrecognised value.
  return <span className="text-xs text-gray-400">Pending embed</span>;
}
|
||||
|
||||
/**
 * Admin page listing the glossaries of the client named in the route.
 *
 * - Reads `clientId` from the URL and loads the list through
 *   `apiClient.getGlossaries`; the query stays disabled until `clientId` is set.
 * - Once a list has been loaded it is re-fetched every 5 seconds.
 * - Admins and project managers see the "Upload glossary" links.
 * - Only admins see the "Archive" action, and only on glossaries whose status
 *   is `'active'`.
 */
export function GlossaryList() {
  const { clientId } = useParams<{ clientId: string }>();
  const { user } = useAuthStore();
  const toast = useToastContext();
  const qc = useQueryClient();
  const isAdmin = user?.role === 'admin';
  const isPM = user?.role === 'project_manager';
  const canUpload = isAdmin || isPM;

  const { data: glossaries = [], isLoading } = useQuery<Glossary[]>({
    queryKey: ['glossaries', clientId],
    queryFn: () => apiClient.getGlossaries(clientId!),
    enabled: !!clientId,
    // Poll every 5 s as soon as a list is available; no polling before the
    // first successful load.
    refetchInterval: (q) => {
      const data = q.state.data;
      return Array.isArray(data) ? 5000 : false;
    },
  });

  const archiveMut = useMutation({
    mutationFn: (id: string) => apiClient.archiveGlossary(clientId!, id),
    onSuccess: () => {
      // Refresh the list so the archived glossary shows its new status.
      qc.invalidateQueries({ queryKey: ['glossaries', clientId] });
      toast.success('Glossary archived');
    },
    onError: () => toast.error('Failed to archive glossary'),
  });

  if (isLoading) {
    // Skeleton placeholder while the first fetch is in flight.
    return (
      <div className="container mx-auto px-4 py-8 max-w-4xl animate-pulse space-y-3">
        {[1, 2].map(i => <div key={i} className="h-16 bg-gray-200 rounded-xl" />)}
      </div>
    );
  }

  return (
    <div className="container mx-auto px-4 py-8 max-w-4xl space-y-6">
      <div className="flex items-center justify-between">
        <div>
          <p className="text-sm text-gray-400 mb-1">
            ← <Link to={`/admin/clients/${clientId}`} className="hover:text-blue-600">Client</Link>
          </p>
          <h1 className="text-2xl font-bold text-gray-900">Glossaries</h1>
        </div>
        {canUpload && (
          <Link
            to={`/admin/clients/${clientId}/glossaries/upload`}
            className="px-4 py-2 bg-blue-600 text-white text-sm font-medium rounded-lg hover:bg-blue-700"
          >
            + Upload glossary
          </Link>
        )}
      </div>

      {glossaries.length === 0 ? (
        <div className="text-center py-16 text-gray-400">
          <p className="text-lg mb-2">No glossaries yet</p>
          {canUpload && (
            <Link to={`/admin/clients/${clientId}/glossaries/upload`} className="text-blue-500 hover:underline text-sm">
              Upload the first glossary
            </Link>
          )}
        </div>
      ) : (
        <div className="space-y-3">
          {glossaries.map((g) => (
            <div key={g.id} className="bg-white rounded-xl border border-gray-200 p-5 flex items-start justify-between gap-4">
              <div className="flex-1 min-w-0">
                <div className="flex items-center gap-2 mb-1">
                  <Link
                    to={`/admin/clients/${clientId}/glossaries/${g.id}`}
                    className="text-base font-semibold text-gray-900 hover:text-blue-600 truncate"
                  >
                    {g.name}
                  </Link>
                  <span className={`text-xs px-2 py-0.5 rounded-full font-medium ${statusBadge(g.status)}`}>
                    {g.status}
                  </span>
                </div>
                {g.description && <p className="text-sm text-gray-500 mb-1 truncate">{g.description}</p>}
                <p className="text-xs text-gray-400">
                  Source: <span className="font-mono">{g.source_locale}</span>
                  {' · '}Created {new Date(g.created_at).toLocaleDateString()}
                </p>
              </div>
              <div className="flex items-center gap-4 shrink-0">
                <div className="text-right text-xs text-gray-400">
                  {/* NOTE(review): the status passed here is a hard-coded empty string, so the
                      badge always reads "Pending embed". Showing the real state needs the
                      current version's embedding status in the list payload — TODO confirm
                      with the backend response shape. */}
                  {g.current_version_id ? embeddingBadge('') : null}
                </div>
                {isAdmin && g.status === 'active' && (
                  <button
                    type="button"
                    onClick={() => {
                      if (confirm('Archive this glossary? It will no longer be used for AI translations.')) {
                        archiveMut.mutate(g.id);
                      }
                    }}
                    // Block repeat clicks while an archive request is in flight.
                    disabled={archiveMut.isPending}
                    className="text-xs text-red-500 hover:text-red-700 disabled:opacity-50 disabled:cursor-not-allowed"
                  >
                    Archive
                  </button>
                )}
              </div>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}
|
||||
204
frontend/src/routes/admin/glossaries/GlossaryUpload.tsx
Normal file
204
frontend/src/routes/admin/glossaries/GlossaryUpload.tsx
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
import { useState, useRef } from 'react';
|
||||
import { useNavigate, useParams, Link } from 'react-router-dom';
|
||||
import { useMutation, useQueryClient } from '@tanstack/react-query';
|
||||
import { apiClient } from '../../../lib/api';
|
||||
import { useToastContext } from '../../../contexts/ToastContext';
|
||||
|
||||
// Locale choices offered in the "Source language" dropdown, in display order.
// Each row is [locale code, human-readable label].
const KNOWN_LOCALES: { code: string; label: string }[] = (
  [
    ['en-GB', 'English (UK)'],
    ['en-US', 'English (US)'],
    ['en-CA', 'English (Canada)'],
    ['de-DE', 'German'],
    ['fr-FR', 'French (France)'],
    ['fr-CA', 'French (Canada)'],
    ['es-ES', 'Spanish (Spain)'],
    ['es-MX', 'Spanish (Mexico)'],
    ['es-419', 'Spanish (Latin America)'],
    ['it-IT', 'Italian'],
    ['pt-BR', 'Portuguese (Brazil)'],
    ['pt-PT', 'Portuguese (Portugal)'],
    ['nl-NL', 'Dutch'],
    ['pl-PL', 'Polish'],
    ['cs-CZ', 'Czech'],
    ['tr-TR', 'Turkish'],
    ['ko-KR', 'Korean'],
    ['ja-JP', 'Japanese'],
    ['id-ID', 'Indonesian'],
  ] as [string, string][]
).map(([code, label]) => ({ code, label }));
|
||||
|
||||
/**
 * Admin form for uploading a new glossary (.xlsx) for the client in the route.
 *
 * The user picks or drops a file, then supplies a name, the source locale, the
 * header of the source column in the spreadsheet, and optional description and
 * change note. On success the glossary list query is invalidated and the user
 * is sent to the new glossary's detail page.
 */
export function GlossaryUpload() {
  const { clientId } = useParams<{ clientId: string }>();
  const navigate = useNavigate();
  const toast = useToastContext();
  const qc = useQueryClient();

  const [file, setFile] = useState<File | null>(null);
  const [name, setName] = useState('');
  const [sourceLocale, setSourceLocale] = useState('en-GB');
  const [sourceLocaleCol, setSourceLocaleCol] = useState('');
  const [description, setDescription] = useState('');
  const [changeNote, setChangeNote] = useState('');
  const [dragOver, setDragOver] = useState(false);
  const fileInputRef = useRef<HTMLInputElement>(null);

  const uploadMut = useMutation({
    mutationFn: () => apiClient.uploadGlossary(
      clientId!,
      file!,
      name.trim(),
      sourceLocale,
      sourceLocaleCol.trim(),
      description.trim() || undefined,
      changeNote.trim() || undefined,
    ),
    onSuccess: (g) => {
      qc.invalidateQueries({ queryKey: ['glossaries', clientId] });
      toast.success(`Glossary "${g.name}" uploaded — embedding in background`);
      navigate(`/admin/clients/${clientId}/glossaries/${g.id}`);
    },
    onError: (err: unknown) => {
      // `detail` is only shown when it is a non-empty string; structured
      // payloads (e.g. a list of validation errors) fall back to the generic
      // message instead of being handed to the toast as a non-string.
      const detail = (err as { response?: { data?: { detail?: unknown } } })?.response?.data?.detail;
      toast.error(typeof detail === 'string' && detail ? detail : 'Upload failed');
    },
  });

  // Single gate for both the drop zone and the file picker: accept only
  // .xlsx files, matching the extension case-insensitively.
  const acceptFile = (candidate: File | undefined) => {
    if (candidate && candidate.name.toLowerCase().endsWith('.xlsx')) setFile(candidate);
    else toast.error('Only .xlsx files are accepted');
  };

  const handleDrop = (e: React.DragEvent) => {
    e.preventDefault();
    setDragOver(false);
    acceptFile(e.dataTransfer.files[0]);
  };

  const canSubmit = !!file && !!name.trim() && !!sourceLocale && !!sourceLocaleCol.trim() && !uploadMut.isPending;

  return (
    <div className="container mx-auto px-4 py-8 max-w-xl space-y-6">
      <div>
        <p className="text-sm text-gray-400 mb-1">
          ← <Link to={`/admin/clients/${clientId}/glossaries`} className="hover:text-blue-600">Glossaries</Link>
        </p>
        <h1 className="text-2xl font-bold text-gray-900">Upload glossary</h1>
        <p className="text-sm text-gray-500 mt-1">Upload an xlsx file with terminology translations.</p>
      </div>

      {/* Drop zone */}
      <div
        className={`border-2 border-dashed rounded-xl p-8 text-center cursor-pointer transition-colors ${
          dragOver ? 'border-blue-400 bg-blue-50' : 'border-gray-300 hover:border-gray-400'
        }`}
        onClick={() => fileInputRef.current?.click()}
        onDragOver={(e) => { e.preventDefault(); setDragOver(true); }}
        onDragLeave={() => setDragOver(false)}
        onDrop={handleDrop}
      >
        <input
          ref={fileInputRef}
          type="file"
          accept=".xlsx,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
          className="hidden"
          onChange={(e) => {
            const picked = e.target.files?.[0];
            // A cancelled dialog yields no file; ignore it silently.
            if (picked) acceptFile(picked);
            // Clear the native value so choosing the same file again (for
            // example after "Remove") still fires a change event.
            e.target.value = '';
          }}
        />
        {file ? (
          <div>
            <p className="text-sm font-medium text-gray-900">{file.name}</p>
            <p className="text-xs text-gray-400 mt-1">{(file.size / 1024 / 1024).toFixed(1)} MB</p>
            <button
              type="button"
              // stopPropagation keeps the click from re-opening the file picker.
              onClick={(e) => { e.stopPropagation(); setFile(null); }}
              className="mt-2 text-xs text-red-500 hover:text-red-700"
            >
              Remove
            </button>
          </div>
        ) : (
          <div className="text-gray-400">
            <p className="text-sm font-medium">Drop .xlsx file here or click to browse</p>
            <p className="text-xs mt-1">Max 50 MB</p>
          </div>
        )}
      </div>

      {/* Form fields */}
      <div className="space-y-4">
        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Glossary name *</label>
          <input
            type="text"
            value={name}
            onChange={e => setName(e.target.value)}
            placeholder="e.g. 3M Master Terminology"
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
          />
        </div>

        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Source language *</label>
          <select
            value={sourceLocale}
            onChange={e => setSourceLocale(e.target.value)}
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
          >
            {KNOWN_LOCALES.map(l => (
              <option key={l.code} value={l.code}>{l.label} ({l.code})</option>
            ))}
          </select>
        </div>

        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Source column header in xlsx *</label>
          <input
            type="text"
            value={sourceLocaleCol}
            onChange={e => setSourceLocaleCol(e.target.value)}
            placeholder="e.g. en_gb or English (GB)"
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
          />
          <p className="text-xs text-gray-400 mt-1">Must exactly match the column header in the xlsx file (case-insensitive).</p>
        </div>

        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Description (optional)</label>
          <textarea
            value={description}
            onChange={e => setDescription(e.target.value)}
            rows={2}
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500 resize-none"
          />
        </div>

        <div>
          <label className="block text-sm font-medium text-gray-700 mb-1">Change note (optional)</label>
          <input
            type="text"
            value={changeNote}
            onChange={e => setChangeNote(e.target.value)}
            placeholder="e.g. Initial import Q1 2026"
            className="w-full border border-gray-300 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
          />
        </div>
      </div>

      <div className="flex gap-3 pt-2">
        <button
          type="button"
          onClick={() => navigate(`/admin/clients/${clientId}/glossaries`)}
          className="px-4 py-2 border border-gray-300 text-sm rounded-lg hover:bg-gray-50"
        >
          Cancel
        </button>
        <button
          type="button"
          disabled={!canSubmit}
          onClick={() => uploadMut.mutate()}
          className="flex-1 px-4 py-2 bg-blue-600 text-white text-sm font-medium rounded-lg hover:bg-blue-700 disabled:opacity-50"
        >
          {uploadMut.isPending ? 'Uploading…' : 'Upload glossary'}
        </button>
      </div>
    </div>
  );
}
|
||||
|
|
@ -741,4 +741,60 @@ export interface AuditLogQuery {
|
|||
limit?: number;
|
||||
sort_by?: string;
|
||||
sort_order?: number;
|
||||
}
|
||||
|
||||
// ── Glossary ────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Lifecycle state of a glossary. */
export type GlossaryStatus = 'active' | 'archived';

/** Progress of the embedding job for one glossary version. */
export type EmbeddingStatus = 'pending' | 'in_progress' | 'done' | 'failed';

/** One uploaded revision of a glossary. */
export interface GlossaryVersion {
  id: string;
  glossary_id: string;
  version_number: number;
  // Presumably the storage location of the uploaded spreadsheet — TODO confirm.
  source_xlsx_gcs_path?: string;
  term_count: number;
  embedded_count: number;
  embedding_status: EmbeddingStatus;
  // Date string; the UI parses it with `new Date(...)`.
  created_at: string;
  created_by: string;
  change_note?: string;
}

/** A client glossary as returned in list responses. */
export interface Glossary {
  id: string;
  client_id: string;
  name: string;
  description?: string;
  source_locale: string;
  source: string;
  status: GlossaryStatus;
  // Id of the version currently in use, when one is set.
  current_version_id?: string;
  // Date string; the UI parses it with `new Date(...)`.
  created_at: string;
  created_by: string;
}

/** A glossary together with all of its versions. */
export interface GlossaryDetail extends Glossary {
  versions: GlossaryVersion[];
}

/** A single term with its translations. */
export interface GlossaryTerm {
  id?: string;
  source_term: string;
  // Translated text keyed by locale code.
  translations: { [locale: string]: string };
}

/** One page of terms plus paging metadata. */
export interface GlossaryTermsResponse {
  terms: GlossaryTerm[];
  total: number;
  page: number;
  page_size: number;
}

/** Form fields that accompany a glossary upload. */
export interface GlossaryUploadRequest {
  name: string;
  source_locale: string;
  source_locale_col: string;
  description?: string;
  change_note?: string;
}
|
||||
Loading…
Add table
Reference in a new issue