Add seed script to register existing TM/ref files in database
Scans storage/amazon/tm/ and storage/amazon/ref/, creates DB registry entries for each JSON file so they appear in the TM Registry and Reference Library pages. Extracts channel from TM filenames, locale from ref filenames, counts JSONL segments. Idempotent (skips duplicates). Also added to deploy.sh --init flow. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
8d4dc65993
commit
ff4e7e768e
2 changed files with 219 additions and 0 deletions
|
|
@ -139,6 +139,10 @@ if [ "$INIT" = true ]; then
|
|||
$COMPOSE exec -T backend python -m seed.create_default_client
|
||||
$COMPOSE exec -T backend python -m seed.create_test_users
|
||||
|
||||
# Register existing TM + reference files in the database
|
||||
log "Registering storage files in database..."
|
||||
$COMPOSE exec -T backend python -m seed.register_storage_files
|
||||
|
||||
# Start all remaining services
|
||||
log "Starting all services..."
|
||||
$COMPOSE up -d --remove-orphans
|
||||
|
|
|
|||
215
seed/register_storage_files.py
Normal file
215
seed/register_storage_files.py
Normal file
|
|
@ -0,0 +1,215 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Register Storage Files
|
||||
======================
|
||||
Scans storage/amazon/tm/ and storage/amazon/ref/ and creates database
|
||||
registry entries for every JSON file found on disk.
|
||||
|
||||
Designed to run inside the Docker container:
|
||||
docker compose exec backend python -m seed.register_storage_files
|
||||
|
||||
Idempotent: skips files that already have a registry entry (matched by file_path).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Make the ``app`` package importable: use the sibling ``backend`` directory
# when it exists, otherwise fall back to the parent of the seed directory.
_seed_dir = Path(__file__).resolve().parent
_backend_dir = _seed_dir.parent / "backend"
sys.path.insert(
    0,
    str(_backend_dir if _backend_dir.is_dir() else _seed_dir.parent),
)
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
|
||||
from app.config import settings
|
||||
from app.models.base import Base
|
||||
from app.models.client import Client
|
||||
from app.models.files import ReferenceFile, ReferenceFileType, TMFileRegistry
|
||||
from app.models.user import User
|
||||
|
||||
|
||||
# ── Channel extraction from TM filenames ─────────────────────────────
|
||||
# flat_MASS_de-de.json -> channel = "MASS"
|
||||
# flat_value_de-de.json -> channel = "value"
|
||||
# flat_PrimeDualBenefit_de-de.json -> channel = "PrimeDualBenefit"
|
||||
|
||||
_TM_FILENAME_RE = re.compile(r"^flat_(.+?)_[a-z]{2}-[a-z]{2}\.json$")


def _extract_channel(filename: str) -> str:
    """Pull the channel name out of a TM filename.

    Filenames are expected to look like ``flat_<channel>_<ll>-<cc>.json``
    with a lowercase locale suffix. Anything that does not fit that shape
    yields ``"unknown"``.
    """
    found = _TM_FILENAME_RE.match(filename)
    if found is None:
        return "unknown"
    return found.group(1)
|
||||
|
||||
|
||||
# ── Locale extraction from ref filenames ─────────────────────────────
|
||||
# de_DE_glossary.json -> locale = "de-DE"
|
||||
# ca_ES_blacklist.json -> locale = "ca-ES"
|
||||
|
||||
_REF_LOCALE_RE = re.compile(r"^([a-z]{2})_([A-Z]{2})_")


def _extract_ref_locale(filename: str) -> str:
    """Derive the locale scope from a reference filename.

    A leading ``ll_CC_`` prefix (two lowercase letters, two uppercase
    letters) is turned into ``ll-CC``. Filenames without such a prefix are
    reported as ``"global"``.
    """
    found = _REF_LOCALE_RE.match(filename)
    if not found:
        return "global"
    language, region = found.groups()
    return f"{language}-{region}"
|
||||
|
||||
|
||||
# ── File type mapping from directory name ────────────────────────────
|
||||
|
||||
# Maps a reference sub-directory name to the ReferenceFileType attribute of
# the same name; directory names not listed here have no mapping.
DIR_TO_FILE_TYPE: dict[str, ReferenceFileType] = {
    dir_name: getattr(ReferenceFileType, dir_name)
    for dir_name in (
        "glossary",
        "blacklist",
        "tov_global",
        "tov_supplement",
        "locale_considerations",
        "date_pct_formats",
    )
}
|
||||
|
||||
|
||||
def _count_jsonl_lines(filepath: str) -> int:
    """Count non-empty lines in a JSONL file.

    Counting is best-effort so that one unreadable file cannot abort the
    caller: if the file cannot be opened or decoded as UTF-8, a warning is
    written to stderr and the number of lines counted up to that point
    (possibly 0) is returned.

    Args:
        filepath: Path of the file to scan.

    Returns:
        The number of lines containing at least one non-whitespace character.
    """
    count = 0
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    count += 1
    except (OSError, ValueError) as exc:
        # OSError covers missing/unreadable files; ValueError covers
        # UnicodeDecodeError and malformed paths. Report instead of hiding
        # the failure, but keep the best-effort contract.
        print(f"  WARN: Could not count lines in {filepath}: {exc}", file=sys.stderr)
    return count
|
||||
|
||||
|
||||
async def _existing_file_paths(db: "AsyncSession", model) -> set[str]:
    """Return the ``file_path`` of every row already stored for *model*."""
    result = await db.execute(select(model.file_path))
    return set(result.scalars().all())


async def _register_tm_files(
    db: "AsyncSession", storage_root, client_id, admin_id
) -> tuple[int, int]:
    """Add a TMFileRegistry row for each unregistered TM file on disk.

    Walks ``<storage_root>/amazon/tm/<locale>/*.json``. Returns a
    ``(registered, skipped)`` pair; rows are only added to the session, the
    caller is responsible for committing.
    """
    tm_base = os.path.join(storage_root, "amazon", "tm")
    if not os.path.isdir(tm_base):
        return 0, 0

    # One query up front instead of one per file; a set lookup also cannot
    # fail when the table already holds duplicate paths.
    known_paths = await _existing_file_paths(db, TMFileRegistry)
    registered = 0
    skipped = 0

    for locale_dir in sorted(os.listdir(tm_base)):
        locale_path = os.path.join(tm_base, locale_dir)
        if not os.path.isdir(locale_path):
            continue

        locale_code = locale_dir  # e.g. "de-DE"

        for filename in sorted(os.listdir(locale_path)):
            if not filename.endswith(".json"):
                continue

            rel_path = f"amazon/tm/{locale_dir}/{filename}"
            if rel_path in known_paths:
                skipped += 1
                continue

            channel = _extract_channel(filename)
            segment_count = _count_jsonl_lines(os.path.join(locale_path, filename))

            db.add(
                TMFileRegistry(
                    client_id=client_id,
                    locale_code=locale_code,
                    channel=channel,
                    filename=filename,
                    file_path=rel_path,
                    segment_count=segment_count,
                    uploaded_by=admin_id,
                )
            )
            registered += 1
            print(f" TM: {locale_code}/{filename} ({channel}, {segment_count} entries)")

    return registered, skipped


async def _register_reference_files(
    db: "AsyncSession", storage_root, client_id, admin_id
) -> tuple[int, int]:
    """Add a ReferenceFile row for each unregistered reference file on disk.

    Walks ``<storage_root>/amazon/ref/<type>/*.json``; directories whose name
    is not a key of ``DIR_TO_FILE_TYPE`` are reported and ignored. Returns a
    ``(registered, skipped)`` pair; the caller is responsible for committing.
    """
    ref_base = os.path.join(storage_root, "amazon", "ref")
    if not os.path.isdir(ref_base):
        return 0, 0

    known_paths = await _existing_file_paths(db, ReferenceFile)
    registered = 0
    skipped = 0

    for type_dir in sorted(os.listdir(ref_base)):
        type_path = os.path.join(ref_base, type_dir)
        if not os.path.isdir(type_path):
            continue

        file_type = DIR_TO_FILE_TYPE.get(type_dir)
        if file_type is None:
            print(f" WARN: Unknown ref type directory: {type_dir}")
            continue

        for filename in sorted(os.listdir(type_path)):
            if not filename.endswith(".json"):
                continue

            rel_path = f"amazon/ref/{type_dir}/{filename}"
            if rel_path in known_paths:
                skipped += 1
                continue

            locale_scope = _extract_ref_locale(filename)

            db.add(
                ReferenceFile(
                    client_id=client_id,
                    file_type=file_type,
                    locale_scope=locale_scope,
                    filename=filename,
                    file_path=rel_path,
                    uploaded_by=admin_id,
                )
            )
            registered += 1
            print(f" REF: {type_dir}/{filename} ({file_type.value}, {locale_scope})")

    return registered, skipped


async def register_files() -> None:
    """Register on-disk TM and reference files in the database.

    Looks up the client named "Amazon" and the first user whose role is
    "admin" (used as ``uploaded_by``; ``None`` when there is no such user),
    then scans ``amazon/tm`` and ``amazon/ref`` under ``settings.STORAGE_ROOT``
    and adds a registry row for every ``.json`` file whose relative path is
    not stored yet. All rows are committed in a single transaction and a
    summary is printed.

    If the Amazon client does not exist, an error message is printed and the
    function returns without writing anything.
    """
    engine = create_async_engine(settings.DATABASE_URL)
    session_factory = async_sessionmaker(engine, expire_on_commit=False)

    try:
        async with session_factory() as db:
            # Find the Amazon client
            result = await db.execute(select(Client).where(Client.name == "Amazon"))
            client = result.scalar_one_or_none()
            if client is None:
                print("ERROR: Amazon client not found. Run create_default_client first.")
                return

            # Find an admin user for uploaded_by
            result = await db.execute(
                select(User).where(User.role == "admin").limit(1)
            )
            admin = result.scalar_one_or_none()
            admin_id = admin.id if admin else None

            client_id = client.id
            storage_root = settings.STORAGE_ROOT

            tm_registered, tm_skipped = await _register_tm_files(
                db, storage_root, client_id, admin_id
            )
            ref_registered, ref_skipped = await _register_reference_files(
                db, storage_root, client_id, admin_id
            )

            await db.commit()
    finally:
        # Release pooled connections on every exit path, including the early
        # return above and any exception raised while scanning or committing.
        await engine.dispose()

    print()
    print(f"TM files: {tm_registered} registered, {tm_skipped} already existed")
    print(f"Ref files: {ref_registered} registered, {ref_skipped} already existed")
    print("Done.")
|
||||
|
||||
|
||||
# Script entry point: drive the async registration to completion when the
# module is executed directly (e.g. ``python -m seed.register_storage_files``).
if __name__ == "__main__":
    asyncio.run(register_files())
|
||||
Loading…
Add table
Reference in a new issue