Add seed script to register existing TM/ref files in database

Scans storage/amazon/tm/ and storage/amazon/ref/, creates DB registry
entries for each JSON file so they appear in the TM Registry and
Reference Library pages. Extracts channel from TM filenames, locale
from ref filenames, counts JSONL segments. Idempotent (skips duplicates).

Also added to deploy.sh --init flow.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
DJP 2026-04-14 10:08:23 -04:00
parent 8d4dc65993
commit ff4e7e768e
2 changed files with 219 additions and 0 deletions

View file

@@ -139,6 +139,10 @@ if [ "$INIT" = true ]; then
$COMPOSE exec -T backend python -m seed.create_default_client
$COMPOSE exec -T backend python -m seed.create_test_users
# Register existing TM + reference files in the database
log "Registering storage files in database..."
$COMPOSE exec -T backend python -m seed.register_storage_files
# Start all remaining services
log "Starting all services..."
$COMPOSE up -d --remove-orphans

View file

@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
Register Storage Files
======================
Scans storage/amazon/tm/ and storage/amazon/ref/ and creates database
registry entries for every JSON file found on disk.
Designed to run inside the Docker container:
docker compose exec backend python -m seed.register_storage_files
Idempotent: skips files that already have a registry entry (matched by file_path).
"""
import asyncio
import os
import re
import sys
from pathlib import Path
# Put the application root at the front of sys.path before the project
# imports run: use a "backend" directory beside this script's parent when it
# exists, otherwise the parent directory itself.
_seed_dir = Path(__file__).resolve().parent
_backend_dir = _seed_dir.parent / "backend"
sys.path.insert(
    0,
    str(_backend_dir if _backend_dir.is_dir() else _seed_dir.parent),
)
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from app.config import settings
from app.models.base import Base
from app.models.client import Client
from app.models.files import ReferenceFile, ReferenceFileType, TMFileRegistry
from app.models.user import User
# ── Channel extraction from TM filenames ─────────────────────────────
# The channel is whatever sits between the "flat_" prefix and the trailing
# lower-case locale suffix:
#   flat_MASS_de-de.json             -> channel = "MASS"
#   flat_value_de-de.json            -> channel = "value"
#   flat_PrimeDualBenefit_de-de.json -> channel = "PrimeDualBenefit"
_TM_FILENAME_RE = re.compile(r"^flat_(.+?)_[a-z]{2}-[a-z]{2}\.json$")


def _extract_channel(filename: str) -> str:
    """Return the channel embedded in a TM filename.

    Filenames that do not follow the ``flat_<channel>_<ll-cc>.json`` pattern
    yield the placeholder ``"unknown"``.
    """
    match = _TM_FILENAME_RE.match(filename)
    if match is None:
        return "unknown"
    return match.group(1)
# ── Locale extraction from ref filenames ─────────────────────────────
# A leading "<ll>_<CC>_" prefix is turned into a hyphenated locale code:
#   de_DE_glossary.json  -> locale = "de-DE"
#   ca_ES_blacklist.json -> locale = "ca-ES"
_REF_LOCALE_RE = re.compile(r"^([a-z]{2})_([A-Z]{2})_")


def _extract_ref_locale(filename: str) -> str:
    """Return the locale scope encoded at the start of a reference filename.

    Filenames without a ``<ll>_<CC>_`` prefix are scoped as ``"global"``.
    """
    match = _REF_LOCALE_RE.match(filename)
    if match is None:
        return "global"
    language, region = match.groups()
    return f"{language}-{region}"
# ── File type mapping from directory name ────────────────────────────
# Each recognised directory name maps to the ReferenceFileType member that
# carries the same name.
DIR_TO_FILE_TYPE: dict[str, ReferenceFileType] = {
    type_name: getattr(ReferenceFileType, type_name)
    for type_name in (
        "glossary",
        "blacklist",
        "tov_global",
        "tov_supplement",
        "locale_considerations",
        "date_pct_formats",
    )
}
def _count_jsonl_lines(filepath: str) -> int:
    """Count non-empty lines in a JSONL file.

    The count is best-effort: a file that cannot be opened or decoded never
    raises. Instead a warning is printed and the number of lines counted
    before the failure (0 if the file could not be opened) is returned.

    Args:
        filepath: Path of the file to read as UTF-8 text.

    Returns:
        The number of lines that contain something other than whitespace.
    """
    count = 0
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    count += 1
    except (OSError, ValueError) as exc:
        # OSError covers missing/unreadable paths; ValueError covers
        # UnicodeDecodeError and malformed path strings. Report the problem
        # instead of hiding it, but keep going with what was counted.
        print(f" WARN: Could not count lines in {filepath}: {exc}")
    return count
async def _is_registered(db: AsyncSession, model, rel_path: str) -> bool:
    """Return True if *model* has at least one row whose file_path is *rel_path*.

    Uses ``first()`` rather than ``scalar_one_or_none()`` so that a registry
    which already contains several rows for the same path is treated as
    "registered" instead of raising and aborting the whole run.
    """
    result = await db.execute(
        select(model).where(model.file_path == rel_path).limit(1)
    )
    return result.scalars().first() is not None


async def _register_tm_files(
    db: AsyncSession, tm_base: Path, client_id, admin_id
) -> tuple[int, int]:
    """Add a TMFileRegistry row for every unregistered ``<locale>/<file>.json``.

    Returns a ``(registered, skipped)`` pair. Rows are only added to the
    session; committing is left to the caller.
    """
    registered = skipped = 0
    if not tm_base.is_dir():
        return registered, skipped

    for locale_path in sorted(tm_base.iterdir(), key=lambda p: p.name):
        if not locale_path.is_dir():
            continue
        locale_code = locale_path.name  # e.g. "de-DE"
        for file_path in sorted(locale_path.iterdir(), key=lambda p: p.name):
            filename = file_path.name
            # Only regular *.json files; a directory named "x.json" is ignored.
            if not filename.endswith(".json") or not file_path.is_file():
                continue

            rel_path = f"amazon/tm/{locale_code}/{filename}"
            if await _is_registered(db, TMFileRegistry, rel_path):
                skipped += 1
                continue

            channel = _extract_channel(filename)
            segment_count = _count_jsonl_lines(str(file_path))
            db.add(
                TMFileRegistry(
                    client_id=client_id,
                    locale_code=locale_code,
                    channel=channel,
                    filename=filename,
                    file_path=rel_path,
                    segment_count=segment_count,
                    uploaded_by=admin_id,
                )
            )
            registered += 1
            print(f" TM: {locale_code}/{filename} ({channel}, {segment_count} entries)")

    return registered, skipped


async def _register_ref_files(
    db: AsyncSession, ref_base: Path, client_id, admin_id
) -> tuple[int, int]:
    """Add a ReferenceFile row for every unregistered ``<type>/<file>.json``.

    Sub-directories whose name is not a key of DIR_TO_FILE_TYPE are reported
    and skipped. Returns a ``(registered, skipped)`` pair; committing is left
    to the caller.
    """
    registered = skipped = 0
    if not ref_base.is_dir():
        return registered, skipped

    for type_path in sorted(ref_base.iterdir(), key=lambda p: p.name):
        if not type_path.is_dir():
            continue
        type_dir = type_path.name
        file_type = DIR_TO_FILE_TYPE.get(type_dir)
        if file_type is None:
            print(f" WARN: Unknown ref type directory: {type_dir}")
            continue

        for file_path in sorted(type_path.iterdir(), key=lambda p: p.name):
            filename = file_path.name
            # Only regular *.json files; a directory named "x.json" is ignored.
            if not filename.endswith(".json") or not file_path.is_file():
                continue

            rel_path = f"amazon/ref/{type_dir}/{filename}"
            if await _is_registered(db, ReferenceFile, rel_path):
                skipped += 1
                continue

            locale_scope = _extract_ref_locale(filename)
            db.add(
                ReferenceFile(
                    client_id=client_id,
                    file_type=file_type,
                    locale_scope=locale_scope,
                    filename=filename,
                    file_path=rel_path,
                    uploaded_by=admin_id,
                )
            )
            registered += 1
            print(f" REF: {type_dir}/{filename} ({file_type.value}, {locale_scope})")

    return registered, skipped


async def register_files() -> None:
    """Register the TM and reference files found on disk in the database.

    Looks under ``<STORAGE_ROOT>/amazon/tm`` and ``<STORAGE_ROOT>/amazon/ref``
    and adds one registry row per ``.json`` file that does not yet have a row
    with the same ``file_path``, so repeated runs skip what is already there.
    All new rows are committed in a single transaction, and a summary is
    printed at the end.

    If no client named "Amazon" exists, an error is printed and nothing is
    registered. The database engine is disposed on every exit path.
    """
    engine = create_async_engine(settings.DATABASE_URL)
    try:
        session_factory = async_sessionmaker(engine, expire_on_commit=False)
        async with session_factory() as db:
            # Find the Amazon client
            result = await db.execute(
                select(Client).where(Client.name == "Amazon")
            )
            client = result.scalar_one_or_none()
            if client is None:
                # NOTE(review): this returns normally, so the process exit
                # status stays 0 — confirm that is what deploy.sh expects.
                print("ERROR: Amazon client not found. Run create_default_client first.")
                return

            # Find an admin user for uploaded_by (None when there is none)
            result = await db.execute(
                select(User).where(User.role == "admin").limit(1)
            )
            admin = result.scalar_one_or_none()
            admin_id = admin.id if admin else None

            amazon_root = Path(settings.STORAGE_ROOT) / "amazon"
            tm_registered, tm_skipped = await _register_tm_files(
                db, amazon_root / "tm", client.id, admin_id
            )
            ref_registered, ref_skipped = await _register_ref_files(
                db, amazon_root / "ref", client.id, admin_id
            )

            await db.commit()
    finally:
        # Release pooled connections even on the early return or an error.
        await engine.dispose()

    print()
    print(f"TM files: {tm_registered} registered, {tm_skipped} already existed")
    print(f"Ref files: {ref_registered} registered, {ref_skipped} already existed")
    print("Done.")
if __name__ == "__main__":
    # Script entry point: drive the async registration to completion on a
    # fresh event loop.
    _registration = register_files()
    asyncio.run(_registration)