From ff4e7e768e29ff2a2f9eea9d7a26e673ead144da Mon Sep 17 00:00:00 2001
From: DJP
Date: Tue, 14 Apr 2026 10:08:23 -0400
Subject: [PATCH] Add seed script to register existing TM/ref files in database

Scans storage/amazon/tm/ and storage/amazon/ref/, creates DB registry
entries for each JSON file so they appear in the TM Registry and
Reference Library pages. Extracts channel from TM filenames, locale
from ref filenames, counts JSONL segments. Idempotent (skips duplicates).

Also added to deploy.sh --init flow.

Co-Authored-By: Claude Opus 4.6
---
 deploy.sh                      |   4 +
 seed/register_storage_files.py | 261 +++++++++++++++++++++++++++++++++
 2 files changed, 265 insertions(+)
 create mode 100644 seed/register_storage_files.py

diff --git a/deploy.sh b/deploy.sh
index 777c238..34686af 100755
--- a/deploy.sh
+++ b/deploy.sh
@@ -139,6 +139,10 @@ if [ "$INIT" = true ]; then
     $COMPOSE exec -T backend python -m seed.create_default_client
     $COMPOSE exec -T backend python -m seed.create_test_users
 
+    # Register existing TM + reference files in the database
+    log "Registering storage files in database..."
+    $COMPOSE exec -T backend python -m seed.register_storage_files
+
     # Start all remaining services
     log "Starting all services..."
     $COMPOSE up -d --remove-orphans
diff --git a/seed/register_storage_files.py b/seed/register_storage_files.py
new file mode 100644
index 0000000..a8d08cf
--- /dev/null
+++ b/seed/register_storage_files.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+"""
+Register Storage Files
+======================
+Scans storage/amazon/tm/ and storage/amazon/ref/ and creates database
+registry entries for every JSON file found on disk.
+
+Designed to run inside the Docker container:
+    docker compose exec backend python -m seed.register_storage_files
+
+Idempotent: skips files that already have a registry entry (matched by file_path).
+"""
+
+import asyncio
+import os
+import re
+import sys
+from pathlib import Path
+
+# Put the directory holding the ``app`` package on sys.path: ../backend when
+# that directory exists, otherwise the parent of seed/.
+_seed_dir = Path(__file__).resolve().parent
+_backend_dir = _seed_dir.parent / "backend"
+if _backend_dir.is_dir():
+    sys.path.insert(0, str(_backend_dir))
+else:
+    sys.path.insert(0, str(_seed_dir.parent))
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
+
+from app.config import settings
+from app.models.base import Base
+from app.models.client import Client
+from app.models.files import ReferenceFile, ReferenceFileType, TMFileRegistry
+from app.models.user import User
+
+
+# ── Channel extraction from TM filenames ─────────────────────────────
+# flat_MASS_de-de.json -> channel = "MASS"
+# flat_value_de-de.json -> channel = "value"
+# flat_PrimeDualBenefit_de-de.json -> channel = "PrimeDualBenefit"
+
+_TM_FILENAME_RE = re.compile(r"^flat_(.+?)_[a-z]{2}-[a-z]{2}\.json$")
+
+
+def _extract_channel(filename: str) -> str:
+    """Return the channel embedded in a TM filename, or "unknown" if it does not match."""
+    m = _TM_FILENAME_RE.match(filename)
+    return m.group(1) if m else "unknown"
+
+
+# ── Locale extraction from ref filenames ─────────────────────────────
+# de_DE_glossary.json -> locale = "de-DE"
+# ca_ES_blacklist.json -> locale = "ca-ES"
+
+_REF_LOCALE_RE = re.compile(r"^([a-z]{2})_([A-Z]{2})_")
+
+
+def _extract_ref_locale(filename: str) -> str:
+    """Return the "xx-YY" locale prefix of a ref filename, or "global" if absent."""
+    m = _REF_LOCALE_RE.match(filename)
+    if m:
+        return f"{m.group(1)}-{m.group(2)}"
+    return "global"
+
+
+# ── File type mapping from directory name ────────────────────────────
+
+DIR_TO_FILE_TYPE: dict[str, ReferenceFileType] = {
+    "glossary": ReferenceFileType.glossary,
+    "blacklist": ReferenceFileType.blacklist,
+    "tov_global": ReferenceFileType.tov_global,
+    "tov_supplement": ReferenceFileType.tov_supplement,
+    "locale_considerations": ReferenceFileType.locale_considerations,
+    "date_pct_formats": ReferenceFileType.date_pct_formats,
+}
+
+
+def _count_jsonl_lines(filepath: str) -> int:
+    """Count non-empty lines in a JSONL file.
+
+    Best effort: if the file cannot be read or decoded, a warning is printed
+    and the number of lines counted so far (possibly 0) is returned.
+    """
+    count = 0
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            for line in f:
+                if line.strip():
+                    count += 1
+    except (OSError, UnicodeDecodeError) as exc:
+        print(f" WARN: Could not count entries in {filepath}: {exc}")
+    return count
+
+
+async def _existing_file_paths(db: AsyncSession, model) -> set[str]:
+    """Return every file_path already stored in *model*'s table."""
+    result = await db.execute(select(model.file_path))
+    return set(result.scalars().all())
+
+
+async def _register_tm_files(
+    db: AsyncSession, storage_root: str, client_id, admin_id
+) -> tuple[int, int]:
+    """Add a TMFileRegistry row for each unregistered amazon/tm/LOCALE/*.json file.
+
+    Returns (registered, skipped) counts. Rows are added to the session only;
+    the caller commits.
+    """
+    registered = skipped = 0
+    tm_base = os.path.join(storage_root, "amazon", "tm")
+    if not os.path.isdir(tm_base):
+        return registered, skipped
+
+    # One query up front instead of one per file; a set lookup also cannot
+    # fail if the table already holds duplicate rows for a path.
+    known_paths = await _existing_file_paths(db, TMFileRegistry)
+
+    for locale_dir in sorted(os.listdir(tm_base)):
+        locale_path = os.path.join(tm_base, locale_dir)
+        if not os.path.isdir(locale_path):
+            continue
+
+        locale_code = locale_dir  # e.g. "de-DE"
+
+        for filename in sorted(os.listdir(locale_path)):
+            if not filename.endswith(".json"):
+                continue
+
+            rel_path = f"amazon/tm/{locale_dir}/{filename}"
+            abs_path = os.path.join(locale_path, filename)
+
+            if rel_path in known_paths:
+                skipped += 1
+                continue
+
+            channel = _extract_channel(filename)
+            segment_count = _count_jsonl_lines(abs_path)
+
+            db.add(
+                TMFileRegistry(
+                    client_id=client_id,
+                    locale_code=locale_code,
+                    channel=channel,
+                    filename=filename,
+                    file_path=rel_path,
+                    segment_count=segment_count,
+                    uploaded_by=admin_id,
+                )
+            )
+            registered += 1
+            print(f" TM: {locale_code}/{filename} ({channel}, {segment_count} entries)")
+
+    return registered, skipped
+
+
+async def _register_ref_files(
+    db: AsyncSession, storage_root: str, client_id, admin_id
+) -> tuple[int, int]:
+    """Add a ReferenceFile row for each unregistered amazon/ref/TYPE/*.json file.
+
+    Directories not listed in DIR_TO_FILE_TYPE are reported and skipped.
+    Returns (registered, skipped) counts. Rows are added to the session only;
+    the caller commits.
+    """
+    registered = skipped = 0
+    ref_base = os.path.join(storage_root, "amazon", "ref")
+    if not os.path.isdir(ref_base):
+        return registered, skipped
+
+    known_paths = await _existing_file_paths(db, ReferenceFile)
+
+    for type_dir in sorted(os.listdir(ref_base)):
+        type_path = os.path.join(ref_base, type_dir)
+        if not os.path.isdir(type_path):
+            continue
+
+        file_type = DIR_TO_FILE_TYPE.get(type_dir)
+        if file_type is None:
+            print(f" WARN: Unknown ref type directory: {type_dir}")
+            continue
+
+        for filename in sorted(os.listdir(type_path)):
+            if not filename.endswith(".json"):
+                continue
+
+            rel_path = f"amazon/ref/{type_dir}/{filename}"
+
+            if rel_path in known_paths:
+                skipped += 1
+                continue
+
+            locale_scope = _extract_ref_locale(filename)
+
+            db.add(
+                ReferenceFile(
+                    client_id=client_id,
+                    file_type=file_type,
+                    locale_scope=locale_scope,
+                    filename=filename,
+                    file_path=rel_path,
+                    uploaded_by=admin_id,
+                )
+            )
+            registered += 1
+            print(f" REF: {type_dir}/{filename} ({file_type.value}, {locale_scope})")
+
+    return registered, skipped
+
+
+async def register_files() -> None:
+    """Register every TM and reference JSON file on disk for the Amazon client.
+
+    Prints one line per new entry and a summary at the end. Prints an error
+    and returns without changes when the Amazon client row is missing.
+    """
+    engine = create_async_engine(settings.DATABASE_URL)
+    session_factory = async_sessionmaker(engine, expire_on_commit=False)
+
+    # try/finally so the engine's connection pool is released on the early
+    # return and on errors, not only on the success path.
+    try:
+        async with session_factory() as db:
+            # Find the Amazon client
+            result = await db.execute(
+                select(Client).where(Client.name == "Amazon")
+            )
+            client = result.scalar_one_or_none()
+            if client is None:
+                print("ERROR: Amazon client not found. Run create_default_client first.")
+                return
+
+            # Find an admin user for uploaded_by
+            result = await db.execute(
+                select(User).where(User.role == "admin").limit(1)
+            )
+            admin = result.scalar_one_or_none()
+            admin_id = admin.id if admin else None
+
+            storage_root = settings.STORAGE_ROOT
+
+            tm_registered, tm_skipped = await _register_tm_files(
+                db, storage_root, client.id, admin_id
+            )
+            ref_registered, ref_skipped = await _register_ref_files(
+                db, storage_root, client.id, admin_id
+            )
+
+            await db.commit()
+    finally:
+        await engine.dispose()
+
+    print()
+    print(f"TM files: {tm_registered} registered, {tm_skipped} already existed")
+    print(f"Ref files: {ref_registered} registered, {ref_skipped} already existed")
+    print("Done.")
+
+
+if __name__ == "__main__":
+    asyncio.run(register_files())