"""
Dow Jones / OLIVER file naming convention validator.

Implements the playbook spec (slides 38-39):

    [OMGID] - [Domain]-[Subteam]?-[Brand or Event]-[Initiative or Event]?-[YY]-[Sequence]_[AssetName]_v##

Examples from the playbook:

    000000 - PMKT-ACQ-WSJ-BIC-26-01_MetaBanner_v1
    000001 - PMKT-ENGRT-WSJ-BIC-26-01_Email_v1
    000002 - EVNT-WSJ-GFF-26-02_Agenda_v3

This module is called by api_server.process_single_check when
check_name == 'dj_file_naming'. It runs deterministically (no LLM call) and
returns a result dict shaped like an LLM check result so it slots into the
existing scoring + report pipeline.
"""

import os
import re

# Closed vocabularies for each tag slot.
DOMAINS = {"PMKT", "BRND", "EVNT", "B2B", "UX"}
DOMAINS_WITH_SUBTEAM = {"PMKT", "B2B"}
SUBTEAMS = {"ACQ", "CMKT", "ENGRT", "ENT"}
BRANDS = {
    "WSJ", "WSJ+", "BAR", "MW", "DF", "DJE", "FAC", "FE", "GRI", "NWS",
    "OA", "RSK", "RSKC", "DJRJ", "R&C",
}
EVENTS = {
    "GH", "DJRJS", "WECR", "FEOE", "FOH", "GFF", "JH", "TL", "TLQ", "TLCYB",
    "FOE", "WSJIL", "BODC", "CCOC", "CEOC", "CFOC", "CMOC", "CPOC", "TECC",
    "WSJLI",
}

# "<OMGID> - <rest>": an OMG ID of at least four digits, a hyphen with
# optional surrounding whitespace, then everything else.
_HEAD_RE = re.compile(r"^(\d{4,})\s*-\s*(.+)$")
# "_<AssetName>_v<digits>" anchored at the end of the name.
_SUFFIX_RE = re.compile(r"_([^_]+)_v(\d+)$")
# Fallback when the version is missing: a trailing "_<AssetName>" only.
_LOOSE_SUFFIX_RE = re.compile(r"_([^_]+)$")
_YEAR_RE = re.compile(r"\d{2}")
_SEQUENCE_RE = re.compile(r"\d{1,3}")
_NUMERIC_RE = re.compile(r"\d+")


def _strip_extension(filename):
    """Return *filename* without its final extension (per os.path.splitext)."""
    base, _ = os.path.splitext(filename)
    return base


def _split_asset_suffix(remainder):
    """Separate the tag block from the trailing asset name / version.

    Returns a 4-tuple ``(tag_block, asset_name, version, issue)``.
    ``asset_name`` and ``version`` are ``None`` when they could not be found;
    ``issue`` is a human-readable message, or ``None`` when the suffix is
    complete.
    """
    strict = _SUFFIX_RE.search(remainder)
    if strict:
        return (
            remainder[: strict.start()],
            strict.group(1),
            f"v{strict.group(2)}",
            None,
        )

    # No clean version suffix: try a looser match for "_AssetName" only.
    loose = _LOOSE_SUFFIX_RE.search(remainder)
    if loose:
        return (
            remainder[: loose.start()],
            loose.group(1),
            None,
            "Missing version suffix '_v##' (e.g. '_v1', '_v3'). "
            "Every asset filename should end with a version number.",
        )

    return (
        remainder,
        None,
        None,
        "Missing the '_AssetName_v##' suffix. "
        "Asset name and version number must be appended.",
    )


def validate_filename(filename: str) -> dict:
    """
    Validate a filename against the Dow Jones / OLIVER naming convention.

    The file extension (if any) is stripped before validation.

    Returns a dict containing:
        score (int 1-10)
        passed (bool)             - score >= 6
        parts (dict)              - extracted structural pieces
        issues (list[str])        - human-readable problems
        explanation (str)         - summary suitable for the QC report
        recommendations (list[str])

    Scoring: a name without a leading OMG ID scores 1 and a name with an
    empty tag block scores 2; otherwise the score starts at 10 and loses
    2 points per issue, with a floor of 1.
    """
    base = _strip_extension(filename)
    issues = []
    parts = {
        "omg_id": None,
        "domain": None,
        "subteam": None,
        "brand": None,
        "event_or_initiative": None,
        "year": None,
        "sequence": None,
        "asset_name": None,
        "version": None,
    }

    # Top-level shape: "<OMGID> - <tag block>_<AssetName>_v##"
    head = _HEAD_RE.match(base)
    if head is None:
        # The example quoted here must itself be a valid name.
        issues.append(
            "Filename must start with the OMG ID followed by ' - ' "
            "(e.g. '2382033 - PMKT-ACQ-WSJ-26-01_AssetName_v1')."
        )
        return _build_result(score=1, parts=parts, issues=issues, base=base)
    parts["omg_id"], remainder = head.groups()

    # Split tag block from "_AssetName_v##" suffix.
    tag_block, parts["asset_name"], parts["version"], suffix_issue = (
        _split_asset_suffix(remainder)
    )
    if suffix_issue:
        issues.append(suffix_issue)

    # Tag block: hyphen-separated pieces (empty pieces are discarded).
    tags = [t for t in tag_block.split("-") if t]
    if not tags:
        issues.append(
            "Tag block is empty. Expected Domain-[Subteam-]Brand-[Event/Initiative-]YY-Sequence."
        )
        return _build_result(score=2, parts=parts, issues=issues, base=base)

    # First tag is always the domain.
    domain = tags.pop(0)
    parts["domain"] = domain
    if domain not in DOMAINS:
        issues.append(
            f"Unrecognised domain code '{domain}'. "
            f"Allowed domains: {sorted(DOMAINS)}."
        )

    # Subteam: only consumed for PMKT / B2B, and reported as missing for
    # those domains when absent.
    # NOTE(review): the spec line in the module docstring marks the subteam
    # as optional ("?"), while this check treats it as required - confirm.
    if domain in DOMAINS_WITH_SUBTEAM:
        if tags and tags[0] in SUBTEAMS:
            parts["subteam"] = tags.pop(0)
        else:
            issues.append(
                f"Domain '{domain}' requires a subteam ({sorted(SUBTEAMS)}) "
                "between the domain and the brand."
            )

    # Brand (required).
    # NOTE(review): the spec labels this slot "[Brand or Event]" but only
    # BRANDS are accepted here - confirm whether an event code is legal.
    if not tags:
        issues.append("Missing brand code (e.g. WSJ, BAR, MW).")
    else:
        brand = tags.pop(0)
        parts["brand"] = brand
        if brand not in BRANDS:
            issues.append(
                f"Unrecognised brand code '{brand}'. "
                f"Allowed brands: {sorted(BRANDS)}."
            )

    # Last two tags are conventionally YY-Sequence (2-digit year + sequence).
    tail_ok = (
        len(tags) >= 2
        and _YEAR_RE.fullmatch(tags[-2]) is not None
        and _SEQUENCE_RE.fullmatch(tags[-1]) is not None
    )
    if tail_ok:
        parts["sequence"] = tags.pop()
        parts["year"] = tags.pop()
    else:
        issues.append(
            "Missing or malformed 'YY-Sequence' tail (e.g. '26-01' for fiscal year 2026, sequence 01)."
        )

    # Anything left between brand and YY-Sequence is the event-or-initiative
    # slot. When the tail was malformed its fragments are still in `tags`;
    # a purely numeric leading fragment (e.g. a lone "26") is part of that
    # broken tail, so it is not reported as an event/initiative and is not
    # penalised a second time.
    if tags and (tail_ok or _NUMERIC_RE.fullmatch(tags[0]) is None):
        token, *extras = tags
        parts["event_or_initiative"] = token
        if domain == "EVNT" and token not in EVENTS:
            issues.append(
                f"Domain is 'EVNT' but '{token}' is not a recognised event code. "
                f"Allowed events: {sorted(EVENTS)}."
            )
        # The spec allows a single optional slot here. Extras are only
        # flagged when the tail parsed, otherwise they are just the
        # already-reported malformed tail.
        if tail_ok and extras:
            listed = ", ".join(f"'{t}'" for t in extras)
            issues.append(
                f"Unexpected extra tag(s) {listed} between the event/initiative "
                "and 'YY-Sequence'. Only one event or initiative code is allowed."
            )

    # Score: start at 10, deduct for each issue, floor at 1.
    score = max(1, 10 - 2 * len(issues))
    return _build_result(score=score, parts=parts, issues=issues, base=base)


def _build_result(score, parts, issues, base):
    """Assemble the result dict returned by validate_filename.

    ``passed`` is ``score >= 6``. The explanation lists the detected pieces
    when the name passed with no issues, otherwise it concatenates the
    issues. The naming-helper link is always included in the
    recommendations; the "update the filename" template is added only when
    there are issues.
    """
    passed = score >= 6

    if passed and not issues:
        optional_fields = (
            "subteam", "brand", "event_or_initiative", "year", "sequence",
        )
        tag_path = f"{parts['domain']}" + "".join(
            "-" + parts[key] for key in optional_fields if parts[key]
        )
        explanation = (
            f"Filename '{base}' matches the Dow Jones / OLIVER naming convention. "
            f"Detected: OMG {parts['omg_id']}, "
            f"{tag_path}"
            f", asset '{parts['asset_name']}', version {parts['version']}."
        )
    else:
        joined = " ".join(issues) if issues else "No structural issues found."
        explanation = (
            f"Filename '{base}' was checked against the Dow Jones / OLIVER naming convention. "
            f"{joined}"
        )

    recommendations = []
    if issues:
        # Template uses the same bracket notation as the playbook spec
        # quoted in the module docstring.
        recommendations.append(
            "Update the filename to match: "
            "'[OMGID] - [Domain]-[Subteam]?-[Brand or Event]-[Initiative or Event]?"
            "-[YY]-[Sequence]_[AssetName]_v##'."
        )
    recommendations.append(
        "Naming helper: https://ai-sandbox.oliver.solutions/wsj-filenaming/index.php"
    )

    return {
        "score": score,
        "passed": passed,
        "parts": parts,
        "issues": issues,
        "explanation": explanation,
        "recommendations": recommendations,
    }