203 lines
7 KiB
Python
203 lines
7 KiB
Python
"""
|
|
Dow Jones / OLIVER file naming convention validator.
|
|
|
|
Implements the playbook spec (slides 38-39):
|
|
[OMGID] - [Domain]-[Subteam]?-[Brand or Event]-[Initiative or Event]?-[YY]-[Sequence]_[AssetName]_v##
|
|
|
|
Examples from the playbook:
|
|
000000 - PMKT-ACQ-WSJ-BIC-26-01_MetaBanner_v1
|
|
000001 - PMKT-ENGRT-WSJ-BIC-26-01_Email_v1
|
|
000002 - EVNT-WSJ-GFF-26-02_Agenda_v3
|
|
|
|
This module is called by api_server.process_single_check when check_name == 'dj_file_naming'.
|
|
It runs deterministically (no LLM call) and returns a result dict shaped like an LLM check
|
|
result so it slots into the existing scoring + report pipeline.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
|
|
# Domain codes accepted in the first slot of the tag block.
DOMAINS = {"B2B", "BRND", "EVNT", "PMKT", "UX"}

# Domains for which validate_filename expects a subteam code right after the domain.
DOMAINS_WITH_SUBTEAM = {"B2B", "PMKT"}

# Subteam codes accepted after a domain listed in DOMAINS_WITH_SUBTEAM.
SUBTEAMS = {"ACQ", "CMKT", "ENGRT", "ENT"}

# Brand codes accepted in the brand slot.
BRANDS = {
    "BAR",
    "DF",
    "DJE",
    "DJRJ",
    "FAC",
    "FE",
    "GRI",
    "MW",
    "NWS",
    "OA",
    "R&C",
    "RSK",
    "RSKC",
    "WSJ",
    "WSJ+",
}

# Event codes checked in the event-or-initiative slot when the domain is EVNT.
EVENTS = {
    "BODC",
    "CCOC",
    "CEOC",
    "CFOC",
    "CMOC",
    "CPOC",
    "DJRJS",
    "FEOE",
    "FOE",
    "FOH",
    "GFF",
    "GH",
    "JH",
    "TECC",
    "TL",
    "TLCYB",
    "TLQ",
    "WECR",
    "WSJIL",
    "WSJLI",
}
|
|
|
|
|
|
def _strip_extension(filename):
|
|
base, _ = os.path.splitext(filename)
|
|
return base
|
|
|
|
|
|
def validate_filename(filename):
    """
    Validate a filename against the Dow Jones / OLIVER naming convention.

    The file extension is stripped first; the remaining name is expected to
    look like:

        <OMGID> - <Domain>-<Subteam?>-<Brand>-<Event/Initiative?>-<YY>-<Sequence>_<AssetName>_v##

    Each problem found appends one message to ``issues`` and costs two
    points from a starting score of 10 (floored at 1). Two structural
    failures short-circuit with a fixed score: a missing OMG ID prefix
    (score 1) and an empty tag block (score 2).

    Args:
        filename: The filename to check, with or without an extension.

    Returns a dict containing:
        score (int 1-10)
        passed (bool) — score >= 6
        parts (dict) — extracted structural pieces
        issues (list[str]) — human-readable problems
        explanation (str) — summary suitable for the QC report
        recommendations (list[str])
    """
    base = _strip_extension(filename)

    issues = []
    parts = {
        "omg_id": None,
        "domain": None,
        "subteam": None,
        "brand": None,
        "event_or_initiative": None,
        "year": None,
        "sequence": None,
        "asset_name": None,
        "version": None,
    }

    # Top-level shape: "<OMGID> - <TAG_BLOCK>_<AssetName>_v##"
    head_match = re.match(r"^(\d{4,})\s*-\s*(.+)$", base)
    if not head_match:
        # The example shown here must itself satisfy every rule below,
        # otherwise the hint steers users towards a name that still fails.
        issues.append(
            "Filename must start with the OMG ID followed by ' - ' "
            "(e.g. '000000 - PMKT-ACQ-WSJ-BIC-26-01_MetaBanner_v1')."
        )
        return _build_result(score=1, parts=parts, issues=issues, base=base)

    parts["omg_id"] = head_match.group(1)
    remainder = head_match.group(2)

    # Split tag block from "_AssetName_v##" suffix.
    suffix_match = re.search(r"_([^_]+)_v(\d+)$", remainder)
    if suffix_match:
        parts["asset_name"] = suffix_match.group(1)
        parts["version"] = f"v{suffix_match.group(2)}"
        tag_block = remainder[: suffix_match.start()]
    else:
        # No clean version suffix — try a looser match for "_AssetName" only.
        loose_match = re.search(r"_([^_]+)$", remainder)
        if loose_match:
            parts["asset_name"] = loose_match.group(1)
            tag_block = remainder[: loose_match.start()]
            issues.append(
                "Missing version suffix '_v##' (e.g. '_v1', '_v3'). "
                "Every asset filename should end with a version number."
            )
        else:
            tag_block = remainder
            issues.append(
                "Missing the '_AssetName_v##' suffix. "
                "Asset name and version number must be appended."
            )

    # Tag block: hyphen-separated pieces (empty pieces are discarded).
    tags = [t for t in tag_block.split("-") if t]
    if not tags:
        issues.append(
            "Tag block is empty. Expected Domain-[Subteam-]Brand-[Event/Initiative-]YY-Sequence."
        )
        return _build_result(score=2, parts=parts, issues=issues, base=base)

    # First tag is always the domain.
    domain = tags.pop(0)
    parts["domain"] = domain
    if domain not in DOMAINS:
        issues.append(
            f"Unrecognised domain code '{domain}'. "
            f"Allowed domains: {sorted(DOMAINS)}."
        )

    # Optional subteam (only for PMKT / B2B).
    if domain in DOMAINS_WITH_SUBTEAM and tags and tags[0] in SUBTEAMS:
        parts["subteam"] = tags.pop(0)
    elif domain in DOMAINS_WITH_SUBTEAM:
        issues.append(
            f"Domain '{domain}' requires a subteam ({sorted(SUBTEAMS)}) "
            "between the domain and the brand."
        )

    # Brand (required).
    if not tags:
        issues.append("Missing brand code (e.g. WSJ, BAR, MW).")
    else:
        brand = tags.pop(0)
        parts["brand"] = brand
        if brand not in BRANDS:
            issues.append(
                f"Unrecognised brand code '{brand}'. "
                f"Allowed brands: {sorted(BRANDS)}."
            )

    # Last two tags are conventionally YY-Sequence (2-digit year + sequence).
    tail_ok = (
        len(tags) >= 2
        and re.fullmatch(r"\d{2}", tags[-2]) is not None
        and re.fullmatch(r"\d{1,3}", tags[-1]) is not None
    )
    if tail_ok:
        parts["sequence"] = tags.pop()
        parts["year"] = tags.pop()
    else:
        issues.append(
            "Missing or malformed 'YY-Sequence' tail (e.g. '26-01' for fiscal year 2026, sequence 01)."
        )

    # Anything left between brand and YY-Sequence is the event-or-initiative slot.
    if tags:
        token = tags.pop(0)
        parts["event_or_initiative"] = token
        if domain == "EVNT" and token not in EVENTS:
            issues.append(
                f"Domain is 'EVNT' but '{token}' is not a recognised event code. "
                f"Allowed events: {sorted(EVENTS)}."
            )

    # The convention has exactly one optional slot between brand and YY, so
    # any tag still unconsumed is surplus and must not pass silently.
    # Only reported when the tail parsed: with a malformed tail the leftovers
    # are most likely fragments of it, and that problem is already listed.
    if tags and tail_ok:
        issues.append(
            f"Unexpected extra tag(s) {tags} between the event/initiative and the year. "
            "Expected Domain-[Subteam-]Brand-[Event/Initiative-]YY-Sequence."
        )

    # Score: start at 10, deduct for each issue, floor at 1.
    score = max(1, 10 - 2 * len(issues))
    return _build_result(score=score, parts=parts, issues=issues, base=base)
|
|
|
|
|
|
def _build_result(score, parts, issues, base):
|
|
passed = score >= 6
|
|
if passed and not issues:
|
|
explanation = (
|
|
f"Filename '{base}' matches the Dow Jones / OLIVER naming convention. "
|
|
f"Detected: OMG {parts['omg_id']}, "
|
|
f"{parts['domain']}"
|
|
f"{'-' + parts['subteam'] if parts['subteam'] else ''}"
|
|
f"{'-' + parts['brand'] if parts['brand'] else ''}"
|
|
f"{'-' + parts['event_or_initiative'] if parts['event_or_initiative'] else ''}"
|
|
f"{'-' + parts['year'] if parts['year'] else ''}"
|
|
f"{'-' + parts['sequence'] if parts['sequence'] else ''}"
|
|
f", asset '{parts['asset_name']}', version {parts['version']}."
|
|
)
|
|
else:
|
|
joined = " ".join(issues) if issues else "No structural issues found."
|
|
explanation = (
|
|
f"Filename '{base}' was checked against the Dow Jones / OLIVER naming convention. "
|
|
f"{joined}"
|
|
)
|
|
|
|
recommendations = []
|
|
if issues:
|
|
recommendations.append(
|
|
"Update the filename to match: "
|
|
"'<OMGID> - <Domain>-<Subteam?>-<Brand>-<Event/Initiative?>-<YY>-<Sequence>_<AssetName>_v##'."
|
|
)
|
|
recommendations.append(
|
|
"Naming helper: https://ai-sandbox.oliver.solutions/wsj-filenaming/index.php"
|
|
)
|
|
|
|
return {
|
|
"score": score,
|
|
"passed": passed,
|
|
"parts": parts,
|
|
"issues": issues,
|
|
"explanation": explanation,
|
|
"recommendations": recommendations,
|
|
}
|