# hm_qc/checks/HM_filename_parse.py

import os
import json
from checks.analyze_with_gpt import analyze_with_gpt  # Imports the FUNCTION
import re

# Components reported for every parsed filename, in output order.
_COMPONENT_KEYS = ("dimensions", "format", "year", "reference", "language")

# Matches the first five underscore-terminated segments of a base name.
# When fewer than five such segments exist nothing matches and the name is
# left unchanged.
_PREFIX_PATTERN = r'^(?:[^_]+_){5}'


def _build_prompt(base_name: str, short_name: str) -> str:
    """Return the GPT prompt describing the filename formats to parse."""
    return f"""
    Parse this H&M artwork filename.

    Full filename: {base_name}
    Shortened filename (prefixes removed): {short_name}

    H&M filenames can follow these formats:

    Format 1: dimensions_format_year_reference-number_language-country.pdf
    Example: 21.6x27.9cm_letter_2028_10062-01_en-us.pdf
    - year = 2028
    - reference = 10062-01
    - language = en-us

    Format 2: dimensions_format_prefix_reference-number_language-country.pdf
    Example: 50x70cm_Poster_9000_10107-06_el-CY.pdf
    - reference = 9000_10107-06 (INCLUDE the prefix number before the dash code!)
    - language = el-CY

    Format 3: dimensions_format_reference-number_(GEN|CEN).pdf
    Example: 04_10.8x14cm_quarter_letter_1001D_10004-02_GEN.pdf
    - reference = 10004-02
    - language = GEN

    Format 4 (OOH files): campaigncode_OOH_identifiers_dimensions_codes_reference_language.pdf
    Example: 1022A_OOH_Static_AMS_BL18_600x300cm_PL-pl_prio1_pl-PL.pdf
    - reference = 1022A (campaign code at start, before OOH)
    - language = pl-PL (last segment after final underscore)

    CRITICAL INSTRUCTIONS:
    - The "reference" field should include ALL numeric/alphanumeric codes that identify the document
    - If there are multiple numeric segments before the language code (e.g., 9000_10107-06),
      combine them with underscore as the reference (e.g., reference = "9000_10107-06")
    - The language is ALWAYS at the END of the filename (e.g., en-us, pl-PL, GEN, CEN)
    - Do NOT use "prio1" or similar priority indicators as part of the reference
    - Only use "year" field if it's clearly a 4-digit year (2024, 2025, 2028, etc.)
    - If unsure whether a number is year or reference prefix, include it in the reference

    Return only a JSON object with these exact keys:
    dimensions, format, year, reference, language

    For any component that can't be identified, use an empty string.
    """


def _response_preview(raw, limit: int = 200):
    """Return a short, always-string preview of a raw GPT response (or None)."""
    if not raw:
        return None
    if isinstance(raw, (bytes, bytearray)):
        # json.loads accepts bytes, so the preview must cope with them too.
        raw = bytes(raw).decode("utf-8", errors="replace")
    return str(raw)[:limit] + "..."


def _extract_components(parsed: dict) -> dict:
    """Pick the expected keys out of *parsed*, mapping missing/null to ''."""
    components = {}
    for key in _COMPONENT_KEYS:
        value = parsed.get(key, "")
        # A JSON null would otherwise surface as None; the prompt's contract
        # is an empty string for unidentified components.
        components[key] = "" if value is None else value
    return components


def run_check(config: dict, context: dict, check_id: str) -> dict:
    """
    QC check that parses a filename using GPT and shares results via context.

    The filename is read from ``context["HM_parse"]["filename"]`` rather than
    from direct file access. On success the parsed components are stored
    under ``context[check_id]`` and echoed in the returned details.

    Args:
        config: Check configuration; not used by this check.
        context: Shared check context. Read for ``"HM_parse"`` and written
            at ``check_id`` on success only.
        check_id: Key under which this check's results are stored.

    Returns:
        A dict with ``"status"`` of ``"passed"`` (plus ``"details"``) or
        ``"error"`` (plus ``"error_message"`` and, for invalid JSON, a
        truncated ``"raw_response"``). Invalid context data and GPT failures
        are reported through the error dict instead of being raised.
    """
    # `or {}` also covers an "HM_parse" key that is present but None.
    hm_parse_data = context.get("HM_parse") or {}
    getter = getattr(hm_parse_data, "get", None)
    filename = getter("filename") if callable(getter) else None

    if not filename:
        return {
            "status": "error",
            "error_message": "Filename not found in HM_parse context. Ensure HM_parse check runs first."
        }

    # Accept str and os.PathLike values; anything else cannot be a filename.
    try:
        path_text = os.fspath(filename)
    except TypeError:
        path_text = None
    if not isinstance(path_text, str):
        return {
            "status": "error",
            "error_message": (
                "Filename in HM_parse context must be a string path, "
                f"got {type(filename).__name__}."
            )
        }

    # Drop directory and extension, then the leading prefix segments.
    base_name = os.path.splitext(os.path.basename(path_text))[0]
    short_name = re.sub(_PREFIX_PATTERN, '', base_name)

    prompt = _build_prompt(base_name, short_name)

    # Boundary with the external GPT helper: any failure becomes an error
    # result so a single check cannot abort the whole QC run.
    try:
        gpt_response = analyze_with_gpt(
            prompt=prompt,
            content="",
            images=None,
            expect_json=True
        )
    except Exception as e:
        return {
            "status": "error",
            "error_message": f"Filename parsing failed: {str(e)}"
        }

    if isinstance(gpt_response, dict):
        # NOTE(review): with expect_json=True the helper may already return a
        # decoded object - confirm against analyze_with_gpt.
        parsed = gpt_response
    else:
        try:
            parsed = json.loads(gpt_response)
        except json.JSONDecodeError as e:
            return {
                "status": "error",
                "error_message": f"GPT returned invalid JSON: {str(e)}",
                "raw_response": _response_preview(gpt_response)
            }
        except (TypeError, ValueError) as e:
            # Unsupported response type or undecodable bytes.
            return {
                "status": "error",
                "error_message": f"Filename parsing failed: {str(e)}"
            }

    if not isinstance(parsed, dict):
        return {
            "status": "error",
            "error_message": (
                "Filename parsing failed: GPT returned JSON that is not an "
                f"object (got {type(parsed).__name__})"
            )
        }

    components = _extract_components(parsed)

    context[check_id] = {
        "filename": filename,
        "short_name": short_name,
        "parsed": components
    }

    return {
        "status": "passed",
        "details": {
            "message": "Filename parsed and stored in context",
            "filename_source": "HM_parse context",
            "gpt_response_summary": f"Parsed {len(parsed)} components",
            # Separate copy so callers mutating the result cannot alter the
            # data stored in the shared context.
            "parsed": dict(components)
        }
    }