hm_qc/checks/HM_filename_parse.py
2025-11-13 13:41:31 +02:00

121 lines
4.5 KiB
Python
Executable file

import os
import json
from checks.analyze_with_gpt import analyze_with_gpt # Imports the FUNCTION
import re
# Keys the GPT response is expected to contain, in reporting order.
_COMPONENT_KEYS = ("dimensions", "format", "year", "reference", "language")


def _build_prompt(base_name: str, short_name: str) -> str:
    """Return the GPT prompt asking for the components of a filename."""
    return f"""
Parse this H&M artwork filename.
Full filename: {base_name}
Shortened filename (prefixes removed): {short_name}
H&M filenames can follow these formats:
Format 1: dimensions_format_year_reference-number_language-country.pdf
Example: 21.6x27.9cm_letter_2028_10062-01_en-us.pdf
- year = 2028
- reference = 10062-01
- language = en-us
Format 2: dimensions_format_prefix_reference-number_language-country.pdf
Example: 50x70cm_Poster_9000_10107-06_el-CY.pdf
- reference = 9000_10107-06 (INCLUDE the prefix number before the dash code!)
- language = el-CY
Format 3: dimensions_format_reference-number_(GEN|CEN).pdf
Example: 04_10.8x14cm_quarter_letter_1001D_10004-02_GEN.pdf
- reference = 10004-02
- language = GEN
Format 4 (OOH files): campaigncode_OOH_identifiers_dimensions_codes_reference_language.pdf
Example: 1022A_OOH_Static_AMS_BL18_600x300cm_PL-pl_prio1_pl-PL.pdf
- reference = 1022A (campaign code at start, before OOH)
- language = pl-PL (last segment after final underscore)
CRITICAL INSTRUCTIONS:
- The "reference" field should include ALL numeric/alphanumeric codes that identify the document
- If there are multiple numeric segments before the language code (e.g., 9000_10107-06),
combine them with underscore as the reference (e.g., reference = "9000_10107-06")
- The language is ALWAYS at the END of the filename (e.g., en-us, pl-PL, GEN, CEN)
- Do NOT use "prio1" or similar priority indicators as part of the reference
- Only use "year" field if it's clearly a 4-digit year (2024, 2025, 2028, etc.)
- If unsure whether a number is year or reference prefix, include it in the reference
Return only a JSON object with these exact keys:
dimensions, format, year, reference, language
For any component that can't be identified, use an empty string.
"""


def _extract_components(parsed: dict) -> dict:
    """Pick the expected keys out of *parsed*, using "" for missing/null ones."""
    components = {}
    for key in _COMPONENT_KEYS:
        value = parsed.get(key, "")
        # The prompt asks for empty strings, but a JSON null would arrive as
        # None; normalise it so consumers see one "missing" representation.
        components[key] = "" if value is None else value
    return components


def run_check(config: dict, context: dict, check_id: str) -> dict:
    """
    QC check that parses a filename using GPT and shares results via context.

    The filename is read from ``context["HM_parse"]["filename"]`` rather than
    from the file system.

    Args:
        config: Check configuration. Not read by this check.
        context: Shared dict between checks. On success the result is stored
            under ``context[check_id]`` with keys ``filename``, ``short_name``
            and ``parsed``.
        check_id: Key under which this check stores its result in *context*.

    Returns:
        ``{"status": "passed", "details": {...}}`` on success, otherwise
        ``{"status": "error", "error_message": ...}`` (plus ``raw_response``
        when the GPT output was not valid JSON). *context* is left unchanged
        on every error path.
    """
    # Get parsed filename from HM_parse context. The entry may be present but
    # None / not a dict, so do not rely on the .get() default alone.
    hm_parse_data = context.get("HM_parse")
    if not isinstance(hm_parse_data, dict):
        hm_parse_data = {}
    filename = hm_parse_data.get("filename")
    if not filename:
        return {
            "status": "error",
            "error_message": "Filename not found in HM_parse context. Ensure HM_parse check runs first."
        }

    # Drop directory and extension, then strip the first five
    # underscore-delimited segments to get the shortened name. Names with
    # fewer than five such segments are left unchanged by the substitution.
    pattern = r'^(?:[^_]+_){5}'
    base_name, _ext = os.path.splitext(os.path.basename(filename))
    short_name = re.sub(pattern, '', base_name)

    gpt_response = None
    try:
        # Get GPT analysis
        gpt_response = analyze_with_gpt(
            prompt=_build_prompt(base_name, short_name),
            content="",
            images=None,
            expect_json=True
        )
        parsed = json.loads(gpt_response)

        # Valid JSON is not necessarily an object (could be a list, string,
        # null, ...); report that explicitly instead of failing on .get().
        if not isinstance(parsed, dict):
            return {
                "status": "error",
                "error_message": (
                    "Filename parsing failed: expected a JSON object from GPT, "
                    f"got {type(parsed).__name__}"
                )
            }

        components = _extract_components(parsed)

        # Store results in context; the returned details get their own copy so
        # later mutation of one does not leak into the other.
        context[check_id] = {
            "filename": filename,
            "short_name": short_name,
            "parsed": components
        }
        return {
            "status": "passed",
            "details": {
                "message": "Filename parsed and stored in context",
                "filename_source": "HM_parse context",
                "gpt_response_summary": f"Parsed {len(parsed)} components",
                "parsed": dict(components)
            }
        }
    except json.JSONDecodeError as e:
        # f-string formatting keeps this safe even if the response is bytes.
        preview = f"{gpt_response[:200]}..." if gpt_response else None
        return {
            "status": "error",
            "error_message": f"GPT returned invalid JSON: {str(e)}",
            "raw_response": preview
        }
    except Exception as e:
        # Top-level boundary for the check: report the failure as a result
        # rather than letting it propagate out of the check.
        return {
            "status": "error",
            "error_message": f"Filename parsing failed: {str(e)}"
        }