# ai_qc/backend/file_naming_validator.py

"""
Dow Jones / OLIVER file naming convention validator.

Implements the playbook spec (slides 38-39):
    [OMGID] - [Domain]-[Subteam]?-[Brand or Event]-[Initiative or Event]?-[YY]-[Sequence]_[AssetName]_v##

Examples from the playbook:
    000000 - PMKT-ACQ-WSJ-BIC-26-01_MetaBanner_v1
    000001 - PMKT-ENGRT-WSJ-BIC-26-01_Email_v1
    000002 - EVNT-WSJ-GFF-26-02_Agenda_v3

This module is called by api_server.process_single_check when check_name == 'dj_file_naming'.
It runs deterministically (no LLM call) and returns a result dict shaped like an LLM check
result so it slots into the existing scoring + report pipeline.
"""

import os
import re

# Allowed codes for the first tag of the tag block.
DOMAINS = {
    "B2B",
    "BRND",
    "EVNT",
    "PMKT",
    "UX",
}

# Domains for which the validator expects a subteam tag right after the domain.
DOMAINS_WITH_SUBTEAM = {
    "B2B",
    "PMKT",
}

# Allowed subteam codes.
SUBTEAMS = {
    "ACQ",
    "CMKT",
    "ENGRT",
    "ENT",
}

# Allowed brand codes (the tag following the domain / subteam).
BRANDS = {
    "BAR", "DF", "DJE", "DJRJ", "FAC",
    "FE", "GRI", "MW", "NWS", "OA",
    "R&C", "RSK", "RSKC", "WSJ", "WSJ+",
}

# Allowed event codes; checked against the event/initiative slot only when
# the domain is "EVNT".
EVENTS = {
    "BODC", "CCOC", "CEOC", "CFOC", "CMOC",
    "CPOC", "DJRJS", "FEOE", "FOE", "FOH",
    "GFF", "GH", "JH", "TECC", "TL",
    "TLCYB", "TLQ", "WECR", "WSJIL", "WSJLI",
}


def _strip_extension(filename):
    base, _ = os.path.splitext(filename)
    return base


def validate_filename(filename):
    """
    Validate a filename against the Dow Jones / OLIVER naming convention.

    The extension is stripped first, then the name is parsed as
    "<OMGID> - <tag block>_<AssetName>_v##", where the tag block is
    hyphen-separated: Domain-[Subteam-]Brand-[Event/Initiative-]YY-Sequence.

    Scoring: a missing OMG ID prefix returns score 1 immediately and an empty
    tag block returns score 2; otherwise the score starts at 10 and loses
    2 points per issue, with a floor of 1.

    Args:
        filename: File name to check, with or without an extension.

    Returns a dict containing:
        score (int 1-10)
        passed (bool)        — score >= 6
        parts (dict)         — extracted structural pieces (None when absent)
        issues (list[str])   — human-readable problems
        explanation (str)    — summary suitable for the QC report
        recommendations (list[str])
    """
    base = _strip_extension(filename)

    issues = []
    parts = {
        "omg_id": None,
        "domain": None,
        "subteam": None,
        "brand": None,
        "event_or_initiative": None,
        "year": None,
        "sequence": None,
        "asset_name": None,
        "version": None,
    }

    # Top-level shape: "<OMGID> - <TAG_BLOCK>_<AssetName>_v##"
    head_match = re.match(r"^(\d{4,})\s*-\s*(.+)$", base)
    if not head_match:
        issues.append(
            "Filename must start with the OMG ID followed by ' - ' "
            "(e.g. '000000 - PMKT-ACQ-WSJ-BIC-26-01_MetaBanner_v1')."
        )
        return _build_result(score=1, parts=parts, issues=issues, base=base)

    parts["omg_id"] = head_match.group(1)
    remainder = head_match.group(2)

    # Split tag block from "_AssetName_v##" suffix.
    suffix_match = re.search(r"_([^_]+)_v(\d+)$", remainder)
    # A bare "_v##" tail means the version is present but the asset name is
    # not; check for it before the looser "_AssetName" fallback so the version
    # token is not mistaken for an asset name.
    version_only_match = None if suffix_match else re.search(r"_v(\d+)$", remainder)
    if suffix_match:
        parts["asset_name"] = suffix_match.group(1)
        parts["version"] = f"v{suffix_match.group(2)}"
        tag_block = remainder[: suffix_match.start()]
    elif version_only_match:
        parts["version"] = f"v{version_only_match.group(1)}"
        tag_block = remainder[: version_only_match.start()]
        issues.append(
            "Missing asset name before the version suffix "
            "(expected '_AssetName_v##', e.g. '_MetaBanner_v1')."
        )
    else:
        # No clean version suffix — try a looser match for "_AssetName" only.
        loose_match = re.search(r"_([^_]+)$", remainder)
        if loose_match:
            parts["asset_name"] = loose_match.group(1)
            tag_block = remainder[: loose_match.start()]
            issues.append(
                "Missing version suffix '_v##' (e.g. '_v1', '_v3'). "
                "Every asset filename should end with a version number."
            )
        else:
            tag_block = remainder
            issues.append(
                "Missing the '_AssetName_v##' suffix. "
                "Asset name and version number must be appended."
            )

    # Tag block: hyphen-separated pieces (empty pieces from doubled hyphens are dropped).
    tags = [t for t in tag_block.split("-") if t]
    if not tags:
        issues.append(
            "Tag block is empty. Expected Domain-[Subteam-]Brand-[Event/Initiative-]YY-Sequence."
        )
        return _build_result(score=2, parts=parts, issues=issues, base=base)

    # First tag is always the domain.
    domain = tags.pop(0)
    parts["domain"] = domain
    if domain not in DOMAINS:
        issues.append(
            f"Unrecognised domain code '{domain}'. "
            f"Allowed domains: {sorted(DOMAINS)}."
        )

    # Subteam slot (only for PMKT / B2B); its absence is reported as an issue.
    if domain in DOMAINS_WITH_SUBTEAM and tags and tags[0] in SUBTEAMS:
        parts["subteam"] = tags.pop(0)
    elif domain in DOMAINS_WITH_SUBTEAM:
        issues.append(
            f"Domain '{domain}' requires a subteam ({sorted(SUBTEAMS)}) "
            "between the domain and the brand."
        )

    # Brand (required).
    if not tags:
        issues.append("Missing brand code (e.g. WSJ, BAR, MW).")
    else:
        brand = tags.pop(0)
        parts["brand"] = brand
        if brand not in BRANDS:
            issues.append(
                f"Unrecognised brand code '{brand}'. "
                f"Allowed brands: {sorted(BRANDS)}."
            )

    # Last two tags are conventionally YY-Sequence (2-digit year + sequence).
    tail_ok = (
        len(tags) >= 2
        and re.fullmatch(r"\d{2}", tags[-2]) is not None
        and re.fullmatch(r"\d{1,3}", tags[-1]) is not None
    )
    if tail_ok:
        parts["sequence"] = tags.pop()
        parts["year"] = tags.pop()
    else:
        issues.append(
            "Missing or malformed 'YY-Sequence' tail (e.g. '26-01' for fiscal year 2026, sequence 01)."
        )

    # Anything left between brand and YY-Sequence is the event-or-initiative slot.
    if tags:
        token = tags.pop(0)
        parts["event_or_initiative"] = token
        if domain == "EVNT" and token not in EVENTS:
            issues.append(
                f"Domain is 'EVNT' but '{token}' is not a recognised event code. "
                f"Allowed events: {sorted(EVENTS)}."
            )

    # The convention allows a single optional event/initiative tag. Only flag
    # leftovers when the YY-Sequence tail parsed; otherwise the leftovers are
    # the malformed tail itself, which has already been reported above.
    if tail_ok and tags:
        issues.append(
            f"Unexpected extra tag(s) {tags} between the brand and 'YY-Sequence'. "
            "At most one event/initiative code is allowed there."
        )

    # Score: start at 10, deduct for each issue, floor at 1.
    score = max(1, 10 - 2 * len(issues))
    return _build_result(score=score, parts=parts, issues=issues, base=base)


def _build_result(score, parts, issues, base):
    passed = score >= 6
    if passed and not issues:
        explanation = (
            f"Filename '{base}' matches the Dow Jones / OLIVER naming convention. "
            f"Detected: OMG {parts['omg_id']}, "
            f"{parts['domain']}"
            f"{'-' + parts['subteam'] if parts['subteam'] else ''}"
            f"{'-' + parts['brand'] if parts['brand'] else ''}"
            f"{'-' + parts['event_or_initiative'] if parts['event_or_initiative'] else ''}"
            f"{'-' + parts['year'] if parts['year'] else ''}"
            f"{'-' + parts['sequence'] if parts['sequence'] else ''}"
            f", asset '{parts['asset_name']}', version {parts['version']}."
        )
    else:
        joined = " ".join(issues) if issues else "No structural issues found."
        explanation = (
            f"Filename '{base}' was checked against the Dow Jones / OLIVER naming convention. "
            f"{joined}"
        )

    recommendations = []
    if issues:
        recommendations.append(
            "Update the filename to match: "
            "'<OMGID> - <Domain>-<Subteam?>-<Brand>-<Event/Initiative?>-<YY>-<Sequence>_<AssetName>_v##'."
        )
        recommendations.append(
            "Naming helper: https://ai-sandbox.oliver.solutions/wsj-filenaming/index.php"
        )

    return {
        "score": score,
        "passed": passed,
        "parts": parts,
        "issues": issues,
        "explanation": explanation,
        "recommendations": recommendations,
    }