video-accessibility/backend/app/models/glossary.py
Vadym Samoilenko 4645e67611 fix(glossary-list): show real embedding progress in glossary list view
- Add current_version_embedding_status/embedded_count/term_count to GlossaryResponse
- Batch-fetch current versions in list endpoint (single extra query, not N queries)
- Add get_versions_by_ids() helper to glossary_service
- Fix GlossaryList.tsx: embeddingBadge('') → embeddingBadge(g) with real status + pct

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 19:00:56 +01:00

142 lines
4 KiB
Python

from __future__ import annotations

from datetime import datetime, timezone
from enum import StrEnum

from pydantic import BaseModel, Field
class GlossarySource(StrEnum):
XLSX_UPLOAD = "xlsx_upload"
FRAZE_API = "fraze_api" # reserved for future FRAZE integration
class GlossaryStatus(StrEnum):
ACTIVE = "active"
ARCHIVED = "archived"
class EmbeddingStatus(StrEnum):
PENDING = "pending"
IN_PROGRESS = "in_progress"
DONE = "done"
FAILED = "failed"
class Glossary(BaseModel):
    """Glossary record belonging to a client.

    ``id`` is populated from the ``_id`` key through its alias; because
    ``populate_by_name`` is enabled it may also be supplied as ``id``.
    ``source`` and ``status`` default to an XLSX upload and active state.
    """

    id: str | None = Field(None, alias="_id")
    client_id: str
    name: str
    description: str | None = None
    source_locale: str  # BCP-47 source column, e.g. "en-GB"
    source: GlossarySource = GlossarySource.XLSX_UPLOAD
    status: GlossaryStatus = GlossaryStatus.ACTIVE
    current_version_id: str | None = None
    # Naive UTC timestamp, i.e. the same value ``datetime.utcnow()`` produced.
    # Built from an aware ``now`` because ``utcnow`` is deprecated since
    # Python 3.12; tzinfo is stripped so stored/serialised values are unchanged.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    created_by: str  # user_id

    model_config = {"populate_by_name": True, "arbitrary_types_allowed": True}
class GlossaryVersion(BaseModel):
    """One numbered version of a glossary.

    ``id`` is populated from the ``_id`` key through its alias (or from ``id``,
    since ``populate_by_name`` is enabled). A new version starts with zero
    terms, zero embedded terms and a ``PENDING`` embedding status.
    """

    id: str | None = Field(None, alias="_id")
    glossary_id: str
    version_number: int
    source_xlsx_gcs_path: str | None = None  # GCS path to original file
    term_count: int = 0
    # NOTE(review): presumably the number of terms already embedded, out of
    # ``term_count`` — confirm against the embedding worker.
    embedded_count: int = 0
    embedding_status: EmbeddingStatus = EmbeddingStatus.PENDING
    # Naive UTC timestamp, i.e. the same value ``datetime.utcnow()`` produced.
    # Built from an aware ``now`` because ``utcnow`` is deprecated since
    # Python 3.12; tzinfo is stripped so stored/serialised values are unchanged.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    created_by: str
    change_note: str | None = None

    model_config = {"populate_by_name": True}
class GlossaryTerm(BaseModel):
    """One source term with its per-locale translations.

    ``id`` is populated from the ``_id`` key through its alias (or from ``id``,
    since ``populate_by_name`` is enabled). Each term is tied to a glossary
    and to one of its versions by ``glossary_id`` / ``version_id``.
    """

    id: str | None = Field(None, alias="_id")
    glossary_id: str
    version_id: str
    cid: str | None = None  # 3M Content ID from xlsx
    tid: str | None = None  # 3M Term ID from xlsx
    source_term: str  # canonical source text (whitespace-normalised)
    source_term_lower: str  # lowercase for case-insensitive index
    # {locale_code: translated_text}; a factory gives each instance its own
    # dict instead of relying on a shared mutable class-level literal.
    translations: dict[str, str] = Field(default_factory=dict)
    embedding: list[float] | None = None  # 768-dim Gemini embedding

    model_config = {"populate_by_name": True}
# ── Schema models (API request/response) ──────────────────────────────────────
class GlossaryCreate(BaseModel):
    """Request schema for creating a glossary.

    ``name`` and ``source_locale`` are required; ``description`` and
    ``change_note`` are optional and default to ``None``.
    """

    name: str
    description: str | None = None
    source_locale: str
    change_note: str | None = None
class GlossaryVersionCreate(BaseModel):
    """Request schema for creating a new glossary version.

    ``source_locale`` is required; ``change_note`` is optional.
    """

    source_locale: str
    change_note: str | None = None
class GlossaryResponse(BaseModel):
    """Response schema describing a glossary.

    Unlike ``Glossary``, ``id`` is a required plain field with no ``_id``
    alias. The three ``current_version_*`` fields are optional and default to
    ``None``.
    """

    id: str
    client_id: str
    name: str
    description: str | None = None
    source_locale: str
    source: GlossarySource
    status: GlossaryStatus
    current_version_id: str | None = None
    # NOTE(review): presumably copied from the version referenced by
    # ``current_version_id`` so clients can show embedding progress — confirm
    # against the endpoint that builds this response.
    current_version_embedding_status: EmbeddingStatus | None = None
    current_version_embedded_count: int | None = None
    current_version_term_count: int | None = None
    created_at: datetime
    created_by: str
class GlossaryVersionResponse(BaseModel):
    """Response schema describing one glossary version.

    Carries the fields of ``GlossaryVersion`` except ``source_xlsx_gcs_path``;
    every field other than ``change_note`` is required.
    """

    id: str
    glossary_id: str
    version_number: int
    term_count: int
    embedded_count: int
    embedding_status: EmbeddingStatus
    created_at: datetime
    created_by: str
    change_note: str | None = None
class GlossaryDetailResponse(GlossaryResponse):
    """``GlossaryResponse`` extended with a list of the glossary's versions."""

    # Empty by default; a factory gives each instance its own list instead of
    # relying on a shared mutable class-level literal.
    versions: list[GlossaryVersionResponse] = Field(default_factory=list)
class GlossaryTermPreview(BaseModel):
    """Subset of GlossaryTerm for UI previews.

    Only the source text and its translations are exposed; both are required.
    """

    source_term: str
    translations: dict[str, str]
class MatchedTerm(BaseModel):
    """A term matched against VTT source text, with the target-locale translation."""

    source_term: str
    target_translation: str
    # Declared as a free-form ``str``; the two documented values are not
    # enforced by validation.
    match_kind: str  # "exact" | "vector"
    score: float  # 1.0 for exact, cosine similarity for vector
def glossary_from_doc(doc: dict) -> Glossary:
    """Build a ``Glossary`` from a raw document mapping.

    Args:
        doc: Mapping of field names to values. It is shallow-copied, so the
            caller's mapping is never mutated. A non-``None`` ``_id`` is
            converted with ``str()`` (presumably a database ObjectId —
            confirm against callers).

    Returns:
        The validated ``Glossary``.

    Raises:
        pydantic.ValidationError: If the document does not satisfy the model.
    """
    data = dict(doc)
    # Leave a missing or ``None`` ``_id`` untouched: ``str(None)`` would yield
    # the literal string "None" even though ``Glossary.id`` accepts ``None``.
    if data.get("_id") is not None:
        data["_id"] = str(data["_id"])
    return Glossary.model_validate(data)
def glossary_version_from_doc(doc: dict) -> GlossaryVersion:
    """Build a ``GlossaryVersion`` from a raw document mapping.

    Args:
        doc: Mapping of field names to values. It is shallow-copied, so the
            caller's mapping is never mutated. A non-``None`` ``_id`` is
            converted with ``str()`` (presumably a database ObjectId —
            confirm against callers).

    Returns:
        The validated ``GlossaryVersion``.

    Raises:
        pydantic.ValidationError: If the document does not satisfy the model.
    """
    data = dict(doc)
    # Leave a missing or ``None`` ``_id`` untouched: ``str(None)`` would yield
    # the literal string "None" even though ``GlossaryVersion.id`` accepts
    # ``None``.
    if data.get("_id") is not None:
        data["_id"] = str(data["_id"])
    return GlossaryVersion.model_validate(data)