#!/usr/bin/env python3 """ TM Format Migrator ================== Converts compact TM files (single "t" field) to the multi-field JSONL format. Compact format: {"t": "{seg_key} {note_type} {locale_code} {EN_source} {TX_target}"} Multi-field output: {"seg_key": "...", "date": "", "en": "", "lc": "...", "tx": "", "nt": "...", "_text": "..."} Since the EN/TX boundary is ambiguous (no delimiter between languages), both en and tx are left empty with a combined _text field for manual review. Usage: python scripts/tm_format_migrator.py """ import argparse import json import re import sys from pathlib import Path # Regex to find locale code in xx-xx format (e.g., es-es, de-de, fr-fr) LOCALE_PATTERN = re.compile(r"\b([a-z]{2}-[a-z]{2})\b") # Known note types that appear between the sequence number and the locale code KNOWN_NOTE_TYPES = [ "BVO", "VO", "SRT", "SUPER", "GFX", "BANNER", "CTA", "HEADLINE", "SUBHEAD", "BODY", "COPY", "LEGAL", "DISCLAIMER", "TITLE", "DESC", "ENDCARD", "PRE-ROLL", "MID-ROLL", "POST-ROLL", "OLV", "RADIO", "DISPLAY", "SOCIAL", "PRINT", "OOH", "DIGITAL", "AUDIO", ] def parse_compact_line(raw_text: str) -> dict | None: """ Parse a compact TM entry's 't' field into structured fields. Returns a dict with seg_key, date, en, lc, tx, nt, _text or None on failure. """ match = LOCALE_PATTERN.search(raw_text) if not match: return None locale_code = match.group(1) locale_start = match.start() locale_end = match.end() # Everything before the locale code contains seg_key + note_type prefix = raw_text[:locale_start].rstrip() # Everything after the locale code is the combined EN + TX text combined_text = raw_text[locale_end:].strip() # Split prefix into seg_key and note_type # The note_type is typically the last whitespace-separated token(s) before the locale # Strategy: walk backwards from the end of prefix to find note_type tokens prefix_tokens = prefix.split() if not prefix_tokens: return None # Find where the note_type begins by looking for known note types # or by finding the last numeric token (sequence number) note_type_tokens = [] seg_key_tokens = [] # Walk backwards through tokens to find the note_type boundary found_seq_number = False for i in range(len(prefix_tokens) - 1, -1, -1): token = prefix_tokens[i] # A sequence number is a 3-digit number like 001, 002, etc. if re.match(r"^\d{2,4}$", token) and not found_seq_number: found_seq_number = True seg_key_tokens = prefix_tokens[: i + 1] note_type_tokens = prefix_tokens[i + 1 :] break # If we didn't find a clear sequence number, fall back to putting # everything in seg_key with empty note_type if not found_seq_number: seg_key_tokens = prefix_tokens note_type_tokens = [] seg_key = " ".join(seg_key_tokens) note_type = " ".join(note_type_tokens) return { "seg_key": seg_key, "date": "", "en": "", "lc": locale_code, "tx": "", "nt": note_type, "_text": combined_text, # TODO: manually split into en/tx } def migrate_file(input_path: Path, output_path: Path) -> dict: """ Migrate a single compact TM file to multi-field JSONL format. Returns a stats dict with counts of processed, skipped, and error lines. """ stats = {"processed": 0, "skipped": 0, "errors": 0, "locales": set()} output_path.parent.mkdir(parents=True, exist_ok=True) with open(input_path, "r", encoding="utf-8") as infile, \ open(output_path, "w", encoding="utf-8") as outfile: for line_num, line in enumerate(infile, start=1): line = line.strip() if not line: stats["skipped"] += 1 continue try: entry = json.loads(line) except json.JSONDecodeError as exc: print( f" WARNING: {input_path.name}:{line_num} - " f"Invalid JSON: {exc}", file=sys.stderr, ) stats["errors"] += 1 continue raw_text = entry.get("t") if raw_text is None: # Not a compact format line; check if already multi-field if "seg_key" in entry: # Already in new format, pass through outfile.write(json.dumps(entry, ensure_ascii=False) + "\n") stats["processed"] += 1 else: print( f" WARNING: {input_path.name}:{line_num} - " f"No 't' or 'seg_key' field found", file=sys.stderr, ) stats["errors"] += 1 continue parsed = parse_compact_line(raw_text) if parsed is None: print( f" WARNING: {input_path.name}:{line_num} - " f"Could not parse locale from: {raw_text[:80]}...", file=sys.stderr, ) stats["errors"] += 1 continue outfile.write(json.dumps(parsed, ensure_ascii=False) + "\n") stats["processed"] += 1 stats["locales"].add(parsed["lc"]) return stats def migrate_directory(input_dir: Path, output_dir: Path) -> None: """Recursively migrate all .json TM files from input_dir to output_dir.""" json_files = sorted(input_dir.rglob("*.json")) if not json_files: print(f"No .json files found in {input_dir}") return print(f"Found {len(json_files)} .json file(s) in {input_dir}\n") total_stats = {"files": 0, "processed": 0, "skipped": 0, "errors": 0, "locales": set()} for json_file in json_files: # Preserve relative directory structure relative_path = json_file.relative_to(input_dir) output_path = output_dir / relative_path.with_suffix(".jsonl") print(f"Processing: {relative_path}") stats = migrate_file(json_file, output_path) total_stats["files"] += 1 total_stats["processed"] += stats["processed"] total_stats["skipped"] += stats["skipped"] total_stats["errors"] += stats["errors"] total_stats["locales"].update(stats["locales"]) print( f" -> {stats['processed']} entries migrated, " f"{stats['skipped']} skipped, " f"{stats['errors']} errors" ) print("\n" + "=" * 60) print("Migration Summary") print("=" * 60) print(f" Files processed: {total_stats['files']}") print(f" Total entries: {total_stats['processed']}") print(f" Skipped lines: {total_stats['skipped']}") print(f" Errors: {total_stats['errors']}") print(f" Locales found: {', '.join(sorted(total_stats['locales']))}") print("=" * 60) def main() -> None: parser = argparse.ArgumentParser( description="Migrate compact TM files to multi-field JSONL format.", ) parser.add_argument( "input_dir", type=Path, help="Directory containing compact .json TM files", ) parser.add_argument( "output_dir", type=Path, help="Directory to write migrated .jsonl files", ) parser.add_argument( "--dry-run", action="store_true", help="Parse and report without writing output files", ) args = parser.parse_args() if not args.input_dir.is_dir(): print(f"Error: Input directory does not exist: {args.input_dir}", file=sys.stderr) sys.exit(1) if args.dry_run: print("[DRY RUN] No files will be written.\n") args.output_dir.mkdir(parents=True, exist_ok=True) migrate_directory(args.input_dir, args.output_dir) if __name__ == "__main__": main()