ai_qc/backend/file_naming_validator.py

203 lines
7 KiB
Python

"""
Dow Jones / OLIVER file naming convention validator.
Implements the playbook spec (slides 38-39):
[OMGID] - [Domain]-[Subteam]?-[Brand or Event]-[Initiative or Event]?-[YY]-[Sequence]_[AssetName]_v##
Examples from the playbook:
000000 - PMKT-ACQ-WSJ-BIC-26-01_MetaBanner_v1
000001 - PMKT-ENGRT-WSJ-BIC-26-01_Email_v1
000002 - EVNT-WSJ-GFF-26-02_Agenda_v3
This module is called by api_server.process_single_check when check_name == 'dj_file_naming'.
It runs deterministically (no LLM call) and returns a result dict shaped like an LLM check
result so it slots into the existing scoring + report pipeline.
"""
import os
import re
# Codes accepted as the first tag of the tag block.
DOMAINS = {
    "B2B",
    "BRND",
    "EVNT",
    "PMKT",
    "UX",
}

# Domains for which the validator expects a subteam tag right after the domain.
DOMAINS_WITH_SUBTEAM = {"B2B", "PMKT"}

# Subteam codes accepted after a domain listed in DOMAINS_WITH_SUBTEAM.
SUBTEAMS = {
    "ACQ",
    "CMKT",
    "ENGRT",
    "ENT",
}

# Codes accepted in the brand slot.
BRANDS = {
    "BAR",
    "DF",
    "DJE",
    "DJRJ",
    "FAC",
    "FE",
    "GRI",
    "MW",
    "NWS",
    "OA",
    "R&C",
    "RSK",
    "RSKC",
    "WSJ",
    "WSJ+",
}

# Event codes; checked against the event-or-initiative slot when the domain is 'EVNT'.
EVENTS = {
    "BODC",
    "CCOC",
    "CEOC",
    "CFOC",
    "CMOC",
    "CPOC",
    "DJRJS",
    "FEOE",
    "FOE",
    "FOH",
    "GFF",
    "GH",
    "JH",
    "TECC",
    "TL",
    "TLCYB",
    "TLQ",
    "WECR",
    "WSJIL",
    "WSJLI",
}
def _strip_extension(filename):
base, _ = os.path.splitext(filename)
return base
def validate_filename(filename):
    """
    Validate a filename against the Dow Jones / OLIVER naming convention.

    Expected shape (any file extension is stripped before checking):
        <OMGID> - <Domain>-<Subteam?>-<Brand>-<Event/Initiative?>-<YY>-<Sequence>_<AssetName>_v##

    Scoring: starts at 10 and loses 2 points per issue, floored at 1. Two
    structural failures return early with a fixed score: 1 when the
    '<OMGID> - ' prefix is missing, 2 when the tag block is empty.

    Args:
        filename: File name to check, with or without an extension.

    Returns a dict containing:
        score (int 1-10)
        passed (bool) — score >= 6
        parts (dict) — extracted structural pieces
        issues (list[str]) — human-readable problems
        explanation (str) — summary suitable for the QC report
        recommendations (list[str])
    """
    base = _strip_extension(filename)
    issues = []
    parts = {
        "omg_id": None,
        "domain": None,
        "subteam": None,
        "brand": None,
        "event_or_initiative": None,
        "year": None,
        "sequence": None,
        "asset_name": None,
        "version": None,
    }

    # Top-level shape: "<OMGID> - <TAG_BLOCK>_<AssetName>_v##"
    head_match = re.match(r"^(\d{4,})\s*-\s*(.+)$", base)
    if not head_match:
        # The example shown here is one that passes this validator.
        issues.append(
            "Filename must start with the OMG ID followed by ' - ' "
            "(e.g. '000000 - PMKT-ACQ-WSJ-BIC-26-01_MetaBanner_v1')."
        )
        return _build_result(score=1, parts=parts, issues=issues, base=base)
    parts["omg_id"] = head_match.group(1)
    remainder = head_match.group(2)

    # Split tag block from "_AssetName_v##" suffix.
    suffix_match = re.search(r"_([^_]+)_v(\d+)$", remainder)
    if suffix_match:
        parts["asset_name"] = suffix_match.group(1)
        parts["version"] = f"v{suffix_match.group(2)}"
        tag_block = remainder[: suffix_match.start()]
    else:
        # No clean "_AssetName_v##" suffix — look at the last "_token" only.
        loose_match = re.search(r"_([^_]+)$", remainder)
        if loose_match and re.fullmatch(r"v\d+", loose_match.group(1)):
            # The trailing token is a version ("_v3"), so the asset name is
            # the missing piece; do not record the version as the asset name.
            parts["version"] = loose_match.group(1)
            tag_block = remainder[: loose_match.start()]
            issues.append(
                "Missing asset name before the version suffix. "
                "Expected '_AssetName_v##' (e.g. '_MetaBanner_v1')."
            )
        elif loose_match:
            parts["asset_name"] = loose_match.group(1)
            tag_block = remainder[: loose_match.start()]
            issues.append(
                "Missing version suffix '_v##' (e.g. '_v1', '_v3'). "
                "Every asset filename should end with a version number."
            )
        else:
            tag_block = remainder
            issues.append(
                "Missing the '_AssetName_v##' suffix. "
                "Asset name and version number must be appended."
            )

    # Tag block: hyphen-separated pieces (empty pieces are dropped).
    tags = [t for t in tag_block.split("-") if t]
    if not tags:
        issues.append(
            "Tag block is empty. Expected Domain-[Subteam-]Brand-[Event/Initiative-]YY-Sequence."
        )
        return _build_result(score=2, parts=parts, issues=issues, base=base)

    # First tag is always the domain.
    domain = tags.pop(0)
    parts["domain"] = domain
    if domain not in DOMAINS:
        issues.append(
            f"Unrecognised domain code '{domain}'. "
            f"Allowed domains: {sorted(DOMAINS)}."
        )

    # Subteam: consumed only for PMKT / B2B, and reported when absent there.
    if domain in DOMAINS_WITH_SUBTEAM and tags and tags[0] in SUBTEAMS:
        parts["subteam"] = tags.pop(0)
    elif domain in DOMAINS_WITH_SUBTEAM:
        issues.append(
            f"Domain '{domain}' requires a subteam ({sorted(SUBTEAMS)}) "
            "between the domain and the brand."
        )

    # Brand (required).
    # NOTE(review): the playbook shape names this slot "Brand or Event", but
    # only BRANDS codes are accepted here — confirm that is intended.
    if not tags:
        issues.append("Missing brand code (e.g. WSJ, BAR, MW).")
    else:
        brand = tags.pop(0)
        parts["brand"] = brand
        if brand not in BRANDS:
            issues.append(
                f"Unrecognised brand code '{brand}'. "
                f"Allowed brands: {sorted(BRANDS)}."
            )

    # Last two tags are conventionally YY-Sequence (2-digit year + sequence).
    tail_ok = (
        len(tags) >= 2
        and re.fullmatch(r"\d{2}", tags[-2])
        and re.fullmatch(r"\d{1,3}", tags[-1])
    )
    if tail_ok:
        parts["sequence"] = tags.pop()
        parts["year"] = tags.pop()
    else:
        issues.append(
            "Missing or malformed 'YY-Sequence' tail (e.g. '26-01' for fiscal year 2026, sequence 01)."
        )

    # Anything left between brand and YY-Sequence is the event-or-initiative slot.
    if tags:
        token = tags.pop(0)
        parts["event_or_initiative"] = token
        if domain == "EVNT" and token not in EVENTS:
            issues.append(
                f"Domain is 'EVNT' but '{token}' is not a recognised event code. "
                f"Allowed events: {sorted(EVENTS)}."
            )

    # The convention allows a single event/initiative tag. Surplus tags are
    # reported only when the YY-Sequence tail parsed: otherwise the leftovers
    # are most likely the malformed tail, which has already been reported.
    if tags and tail_ok:
        issues.append(
            f"Unexpected extra tag(s) '{'-'.join(tags)}' before the 'YY-Sequence' tail. "
            "Only one event/initiative code is allowed between the brand and the year."
        )

    # Score: start at 10, deduct for each issue, floor at 1.
    score = max(1, 10 - 2 * len(issues))
    return _build_result(score=score, parts=parts, issues=issues, base=base)
def _build_result(score, parts, issues, base):
passed = score >= 6
if passed and not issues:
explanation = (
f"Filename '{base}' matches the Dow Jones / OLIVER naming convention. "
f"Detected: OMG {parts['omg_id']}, "
f"{parts['domain']}"
f"{'-' + parts['subteam'] if parts['subteam'] else ''}"
f"{'-' + parts['brand'] if parts['brand'] else ''}"
f"{'-' + parts['event_or_initiative'] if parts['event_or_initiative'] else ''}"
f"{'-' + parts['year'] if parts['year'] else ''}"
f"{'-' + parts['sequence'] if parts['sequence'] else ''}"
f", asset '{parts['asset_name']}', version {parts['version']}."
)
else:
joined = " ".join(issues) if issues else "No structural issues found."
explanation = (
f"Filename '{base}' was checked against the Dow Jones / OLIVER naming convention. "
f"{joined}"
)
recommendations = []
if issues:
recommendations.append(
"Update the filename to match: "
"'<OMGID> - <Domain>-<Subteam?>-<Brand>-<Event/Initiative?>-<YY>-<Sequence>_<AssetName>_v##'."
)
recommendations.append(
"Naming helper: https://ai-sandbox.oliver.solutions/wsj-filenaming/index.php"
)
return {
"score": score,
"passed": passed,
"parts": parts,
"issues": issues,
"explanation": explanation,
"recommendations": recommendations,
}