diff --git a/deploy.sh b/deploy.sh
index 777c238..34686af 100755
--- a/deploy.sh
+++ b/deploy.sh
@@ -139,6 +139,10 @@ if [ "$INIT" = true ]; then
     $COMPOSE exec -T backend python -m seed.create_default_client
     $COMPOSE exec -T backend python -m seed.create_test_users
 
+    # Register existing TM + reference files in the database
+    log "Registering storage files in database..."
+    $COMPOSE exec -T backend python -m seed.register_storage_files
+
     # Start all remaining services
     log "Starting all services..."
     $COMPOSE up -d --remove-orphans
diff --git a/seed/register_storage_files.py b/seed/register_storage_files.py
new file mode 100644
index 0000000..a8d08cf
--- /dev/null
+++ b/seed/register_storage_files.py
@@ -0,0 +1,257 @@
+#!/usr/bin/env python3
+"""
+Register Storage Files
+======================
+Scans storage/amazon/tm/ and storage/amazon/ref/ and creates database
+registry entries for every JSON file found on disk.
+
+Designed to run inside the Docker container:
+    docker compose exec backend python -m seed.register_storage_files
+
+Idempotent: skips files that already have a registry entry (matched by file_path).
+"""
+
+import asyncio
+import os
+import re
+import sys
+from pathlib import Path
+
+_seed_dir = Path(__file__).resolve().parent
+_backend_dir = _seed_dir.parent / "backend"
+if _backend_dir.is_dir():
+    sys.path.insert(0, str(_backend_dir))
+else:
+    sys.path.insert(0, str(_seed_dir.parent))
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
+
+from app.config import settings
+from app.models.base import Base
+from app.models.client import Client
+from app.models.files import ReferenceFile, ReferenceFileType, TMFileRegistry
+from app.models.user import User
+
+
+# ── Channel extraction from TM filenames ─────────────────────────────
+# flat_MASS_de-de.json -> channel = "MASS"
+# flat_value_de-de.json -> channel = "value"
+# flat_PrimeDualBenefit_de-de.json -> channel = "PrimeDualBenefit"
+
+_TM_FILENAME_RE = re.compile(r"^flat_(.+?)_[a-z]{2}-[a-z]{2}\.json$")
+
+
+def _extract_channel(filename: str) -> str:
+    """Return the channel embedded in a ``flat_<channel>_<xx-xx>.json`` name.
+
+    Falls back to ``"unknown"`` when the filename does not match the pattern.
+    """
+    m = _TM_FILENAME_RE.match(filename)
+    return m.group(1) if m else "unknown"
+
+
+# ── Locale extraction from ref filenames ─────────────────────────────
+# de_DE_glossary.json -> locale = "de-DE"
+# ca_ES_blacklist.json -> locale = "ca-ES"
+
+_REF_LOCALE_RE = re.compile(r"^([a-z]{2})_([A-Z]{2})_")
+
+
+def _extract_ref_locale(filename: str) -> str:
+    """Return the ``xx-YY`` locale encoded as an ``xx_YY_`` filename prefix.
+
+    Falls back to ``"global"`` when the filename has no such prefix.
+    """
+    m = _REF_LOCALE_RE.match(filename)
+    if m:
+        return f"{m.group(1)}-{m.group(2)}"
+    return "global"
+
+
+# ── File type mapping from directory name ────────────────────────────
+
+DIR_TO_FILE_TYPE: dict[str, ReferenceFileType] = {
+    "glossary": ReferenceFileType.glossary,
+    "blacklist": ReferenceFileType.blacklist,
+    "tov_global": ReferenceFileType.tov_global,
+    "tov_supplement": ReferenceFileType.tov_supplement,
+    "locale_considerations": ReferenceFileType.locale_considerations,
+    "date_pct_formats": ReferenceFileType.date_pct_formats,
+}
+
+
+def _count_jsonl_lines(filepath: str) -> int:
+    """Count non-empty lines in a JSONL file.
+
+    Best effort: if the file cannot be opened, read or decoded, a warning is
+    printed and the number of lines counted so far (possibly 0) is returned.
+    """
+    count = 0
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            for line in f:
+                if line.strip():
+                    count += 1
+    except (OSError, UnicodeError) as exc:
+        # Keep the registration going, but do not hide why the count is off.
+        print(f" WARN: Could not count lines in {filepath}: {exc}")
+    return count
+
+
+def _list_subdirs(base: str) -> list[str]:
+    """Return the sorted names of the directories directly inside *base*."""
+    return sorted(
+        name for name in os.listdir(base) if os.path.isdir(os.path.join(base, name))
+    )
+
+
+def _list_json_files(directory: str) -> list[str]:
+    """Return the sorted names of the ``.json`` entries inside *directory*."""
+    return sorted(name for name in os.listdir(directory) if name.endswith(".json"))
+
+
+async def _registered_paths(db: AsyncSession, model: type) -> set[str]:
+    """Return every ``file_path`` value already stored for *model*.
+
+    One query per table replaces a lookup per file, and a set lookup cannot
+    fail when the table happens to hold the same path more than once.
+    """
+    result = await db.execute(select(model.file_path))
+    return set(result.scalars().all())
+
+
+async def _register_tm_files(
+    db: AsyncSession, client_id, admin_id, storage_root: str
+) -> tuple[int, int]:
+    """Add a TMFileRegistry row for each unregistered file under amazon/tm/.
+
+    Rows are only added to the session; the caller commits.
+
+    Returns:
+        ``(registered, skipped)`` counts.
+    """
+    tm_base = os.path.join(storage_root, "amazon", "tm")
+    if not os.path.isdir(tm_base):
+        return 0, 0
+
+    known = await _registered_paths(db, TMFileRegistry)
+    registered = skipped = 0
+    for locale_code in _list_subdirs(tm_base):  # directory name, e.g. "de-DE"
+        locale_path = os.path.join(tm_base, locale_code)
+        for filename in _list_json_files(locale_path):
+            rel_path = f"amazon/tm/{locale_code}/{filename}"
+            if rel_path in known:
+                skipped += 1
+                continue
+
+            channel = _extract_channel(filename)
+            segment_count = _count_jsonl_lines(os.path.join(locale_path, filename))
+            db.add(
+                TMFileRegistry(
+                    client_id=client_id,
+                    locale_code=locale_code,
+                    channel=channel,
+                    filename=filename,
+                    file_path=rel_path,
+                    segment_count=segment_count,
+                    uploaded_by=admin_id,
+                )
+            )
+            registered += 1
+            print(f" TM: {locale_code}/{filename} ({channel}, {segment_count} entries)")
+    return registered, skipped
+
+
+async def _register_ref_files(
+    db: AsyncSession, client_id, admin_id, storage_root: str
+) -> tuple[int, int]:
+    """Add a ReferenceFile row for each unregistered file under amazon/ref/.
+
+    Sub-directories that are not keys of DIR_TO_FILE_TYPE are reported and
+    skipped. Rows are only added to the session; the caller commits.
+
+    Returns:
+        ``(registered, skipped)`` counts.
+    """
+    ref_base = os.path.join(storage_root, "amazon", "ref")
+    if not os.path.isdir(ref_base):
+        return 0, 0
+
+    known = await _registered_paths(db, ReferenceFile)
+    registered = skipped = 0
+    for type_dir in _list_subdirs(ref_base):
+        file_type = DIR_TO_FILE_TYPE.get(type_dir)
+        if file_type is None:
+            print(f" WARN: Unknown ref type directory: {type_dir}")
+            continue
+
+        type_path = os.path.join(ref_base, type_dir)
+        for filename in _list_json_files(type_path):
+            rel_path = f"amazon/ref/{type_dir}/{filename}"
+            if rel_path in known:
+                skipped += 1
+                continue
+
+            locale_scope = _extract_ref_locale(filename)
+            db.add(
+                ReferenceFile(
+                    client_id=client_id,
+                    file_type=file_type,
+                    locale_scope=locale_scope,
+                    filename=filename,
+                    file_path=rel_path,
+                    uploaded_by=admin_id,
+                )
+            )
+            registered += 1
+            print(f" REF: {type_dir}/{filename} ({file_type.value}, {locale_scope})")
+    return registered, skipped
+
+
+async def register_files() -> None:
+    """Register the on-disk TM and reference files of the Amazon client.
+
+    Prints an error and returns without changes when no client named
+    "Amazon" exists. The engine is disposed on every exit path.
+    """
+    engine = create_async_engine(settings.DATABASE_URL)
+    session_factory = async_sessionmaker(engine, expire_on_commit=False)
+
+    try:
+        async with session_factory() as db:
+            # Find the Amazon client
+            result = await db.execute(select(Client).where(Client.name == "Amazon"))
+            client = result.scalar_one_or_none()
+            if client is None:
+                print("ERROR: Amazon client not found. Run create_default_client first.")
+                return
+
+            # Find an admin user for uploaded_by
+            result = await db.execute(
+                select(User).where(User.role == "admin").limit(1)
+            )
+            admin = result.scalar_one_or_none()
+            admin_id = admin.id if admin else None
+
+            storage_root = settings.STORAGE_ROOT
+            tm_registered, tm_skipped = await _register_tm_files(
+                db, client.id, admin_id, storage_root
+            )
+            ref_registered, ref_skipped = await _register_ref_files(
+                db, client.id, admin_id, storage_root
+            )
+
+            await db.commit()
+    finally:
+        # Release pooled connections even on the early return or an error.
+        await engine.dispose()
+
+    print()
+    print(f"TM files: {tm_registered} registered, {tm_skipped} already existed")
+    print(f"Ref files: {ref_registered} registered, {ref_skipped} already existed")
+    print("Done.")
+
+
+if __name__ == "__main__":
+    asyncio.run(register_files())