Adds full glossary system so Gemini uses client-approved terminology
when generating subtitles and translations (critical for 3M brand names
and product codes across 16 target locales).
Backend:
- lib/locales.py: BCP-47 locale registry, normalises xlsx fr_fr → fr-FR
- models/glossary.py: Glossary / GlossaryVersion / GlossaryTerm + enums
- services/glossary_service.py: xlsx parse (openpyxl), ingest to Mongo,
hybrid retrieval (Aho-Corasick exact + Atlas Vector Search), prompt block
- services/embedding_service.py: Gemini text-embedding-004, batch 100, retry
- tasks/embed_glossary.py: Celery background task for async embedding
- api/v1/routes_glossaries.py: CRUD endpoints under /clients/{id}/glossaries
- gemini.py: _build_glossary_block(), {GLOSSARY} injection in all 4 call sites
- tts.py / gemini_tts.py: pass full locale codes (no split("-")[0] truncation)
- tasks/translate_and_synthesize.py: glossary lookup + injection per language
- prompts: {GLOSSARY} placeholder in ingestion, targeted, transcreation prompts
- pyproject.toml: +openpyxl, +pyahocorasick
Frontend:
- routes/admin/glossaries/: GlossaryList, GlossaryUpload, GlossaryDetail
- App.tsx: 3 new routes under /admin/clients/:clientId/glossaries
- ClientDetail.tsx: Glossaries card with count + quick links
- types/api.ts: Glossary, GlossaryVersion, GlossaryDetail, GlossaryTerm types
- lib/api.ts: 7 new API methods (upload, list, detail, terms, versions, activate, archive)
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
102 lines
3.4 KiB
Python
102 lines
3.4 KiB
Python
"""
|
|
Celery task: compute and store Gemini embeddings for all terms in a glossary version.
|
|
|
|
Runs as a background job after glossary ingestion so the API response is fast.
|
|
Processes terms in batches of 100 and updates embedded_count incrementally.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
|
|
from bson import ObjectId
|
|
from motor.motor_asyncio import AsyncIOMotorClient
|
|
|
|
from ..core.config import settings
|
|
from ..core.logging import get_logger
|
|
from ..models.glossary import EmbeddingStatus
|
|
from . import celery_app
|
|
|
|
# Module-level logger, named after this module; used for progress and failure messages below.
logger = get_logger(__name__)
|
|
|
|
# Number of terms handed to the embedding service per call and written back
# per bulk update; embedded_count is refreshed after each such batch.
_BATCH_SIZE = 100
|
|
|
|
|
|
@celery_app.task(name="embed_glossary_version", bind=True, max_retries=3)
def embed_glossary_version_task(self, version_id: str) -> dict:
    """
    Compute embeddings for all GlossaryTerms of `version_id`.

    Synchronous Celery entry point: runs the async implementation
    (`_async_embed_version`) to completion with `asyncio.run`. That coroutine
    updates embedded_count and embedding_status on the GlossaryVersion doc.

    Args:
        version_id: String form of the GlossaryVersion `_id`.

    Returns:
        The dict returned by `_async_embed_version`
        (`{"version_id": ..., "total": ...}`).

    Raises:
        Whatever `self.retry` raises. Any exception from the embedding run is
        logged and the task is re-queued with a 60-second countdown; the
        decorator caps this at `max_retries=3`.
    """
    try:
        return asyncio.run(_async_embed_version(version_id))
    except Exception as exc:
        # logger.exception records the traceback as well as the message, so a
        # failure can be diagnosed from the logs even though the retry below
        # is raised with `from None`.
        logger.exception(f"embed_glossary_version_task failed for {version_id}: {exc}")
        raise self.retry(exc=exc, countdown=60) from None
|
|
|
|
|
|
async def _async_embed_version(version_id: str) -> dict:
    """
    Embed every not-yet-embedded term of a glossary version and persist the vectors.

    Workflow:
      1. Mark the GlossaryVersion as IN_PROGRESS.
      2. Load the `_id` / `source_term` of all terms of the version whose
         `embedding` is null or absent.
      3. For each batch of `_BATCH_SIZE` terms: request embeddings, bulk-write
         them onto the term documents, and update `embedded_count`.
      4. Mark the version DONE.

    `embedded_count` is cumulative across attempts: terms that already carry an
    embedding (e.g. from an earlier attempt that failed part-way) are counted
    up front, so a retry does not reset the stored progress.

    Args:
        version_id: String form of the GlossaryVersion `_id`. Also the value
            stored in each term's `version_id` field.

    Returns:
        `{"version_id": version_id, "total": n}` where `n` is the number of
        terms embedded during this call.

    Raises:
        Exception: any error from `ObjectId` parsing, MongoDB, or the embedding
            service is propagated. For errors raised after the connection is
            opened, the version is first marked FAILED (best effort).
        ValueError: if the embedding service returns a different number of
            vectors than the number of texts in a batch.
    """
    from pymongo import UpdateOne

    # Imported lazily, as in the original; presumably to avoid an import cycle
    # or heavy start-up work -- TODO confirm.
    from ..services.embedding_service import embedding_service

    # Parse once, before opening a connection: a malformed id fails fast and
    # the error handler below never has to re-parse it.
    version_filter = {"_id": ObjectId(version_id)}

    mongo_client = AsyncIOMotorClient(settings.mongodb_uri)
    db = mongo_client[settings.mongodb_db]

    try:
        # Mark in-progress
        await db.glossary_versions.update_one(
            version_filter,
            {"$set": {"embedding_status": EmbeddingStatus.IN_PROGRESS.value}},
        )

        # Terms embedded by a previous attempt still count towards progress.
        already_embedded = await db.glossary_terms.count_documents(
            {"version_id": version_id, "embedding": {"$ne": None}}
        )

        # Fetch all terms without embeddings (only the fields needed here).
        cursor = db.glossary_terms.find(
            {"version_id": version_id, "embedding": None},
            {"_id": 1, "source_term": 1},
        )
        terms = await cursor.to_list(length=None)
        total = len(terms)
        logger.info(f"Embedding {total} terms for version {version_id}")

        embedded_this_run = 0
        for start in range(0, total, _BATCH_SIZE):
            batch = terms[start:start + _BATCH_SIZE]
            texts = [term["source_term"] for term in batch]

            # Assumes embed_texts returns one vector per input text, in input
            # order -- the pairing below relies on that.
            embeddings = await embedding_service.embed_texts(texts)

            # strict=True: a short or long response raises ValueError instead
            # of silently leaving some terms without an embedding while they
            # are still counted as embedded. The list is built in full before
            # anything is written.
            ops = [
                UpdateOne({"_id": term["_id"]}, {"$set": {"embedding": vector}})
                for term, vector in zip(batch, embeddings, strict=True)
            ]
            await db.glossary_terms.bulk_write(ops, ordered=False)

            embedded_this_run += len(batch)
            await db.glossary_versions.update_one(
                version_filter,
                {"$set": {"embedded_count": already_embedded + embedded_this_run}},
            )
            logger.info(f"Version {version_id}: embedded {embedded_this_run}/{total}")

        # Mark done
        await db.glossary_versions.update_one(
            version_filter,
            {"$set": {
                "embedding_status": EmbeddingStatus.DONE.value,
                "embedded_count": already_embedded + total,
            }},
        )
        logger.info(f"Embedding complete for version {version_id}: {total} terms")
        return {"version_id": version_id, "total": total}

    except Exception:
        # Best effort: if recording the failure also fails (e.g. the database
        # is the thing that is down), log it and still surface the original
        # error rather than the secondary one.
        try:
            await db.glossary_versions.update_one(
                version_filter,
                {"$set": {"embedding_status": EmbeddingStatus.FAILED.value}},
            )
        except Exception:
            logger.exception(f"Could not mark version {version_id} as failed")
        raise
    finally:
        mongo_client.close()
|