hm_qc/checks/HM_image_filename_parse.py
2025-11-13 13:41:31 +02:00

210 lines
8.3 KiB
Python

import os
import json
import re
from checks.analyze_with_gpt import analyze_with_gpt
# Leading "MARKET_language_" prefix used by social (SOME) statics, e.g. "AT_de_".
_SOME_STATIC_RE = re.compile(r'^([A-Z]{2})_([a-z]{2})_')

# Trailing locale token in either order: "language-MARKET" (en-GB) or
# "MARKET-language" (NO-no). A leftover ".ext" after the token is tolerated.
_LOCALE_SUFFIX_RE = re.compile(
    r'_(?:([a-z]{2})-([A-Z]{2})|([A-Z]{2})-([a-z]{2}))(?:\.[^.]+)?$'
)

# Trailing generic marker used by POS GEN files ("_GEN" / "_CEN").
_GENERIC_SUFFIX_RE = re.compile(r'_(GEN|CEN)(?:\.[^.]+)?$', re.IGNORECASE)

# A size/resolution token anywhere in the name: 1080x1920, 21x29.7cm, 9x16 ...
_SIZE_RE = re.compile(
    r'(\d+(?:\.\d+)?x\d+(?:\.\d+)?(?:px|cm|mm)?)', re.IGNORECASE
)

# The same size token, but as the very first underscore-separated field (POS).
_LEADING_SIZE_RE = re.compile(
    r'^\d+(?:\.\d+)?x\d+(?:\.\d+)?(?:px|cm|mm)?_', re.IGNORECASE
)

# Campaign number as a whole underscore-delimited field, e.g. "4068A".
_CAMPAIGN_TOKEN_RE = re.compile(r'(?:^|_)(\d{4}[A-Z]?)(?=_|$)')

# Original, unanchored campaign search; kept only as a fallback.
_CAMPAIGN_LOOSE_RE = re.compile(r'(\d{4}[A-Z]?)')

# Campaign number as the very first field (DS and DOOH/OOH/Display names).
_LEADING_CAMPAIGN_RE = re.compile(r'^\d{4}[A-Z]?_')

# What counts as a real file extension: a dot, a letter, then up to four
# alphanumerics. This rejects things like ".7cm_A4_..._GEN".
_EXTENSION_RE = re.compile(r'^\.[A-Za-z][A-Za-z0-9]{0,4}$')


def _strip_extension(filename: str) -> str:
    """Return the basename of *filename* without its file extension.

    ``os.path.splitext`` splits on the last dot, which cuts names such as
    ``21x29.7cm_A4_4068A_10065-01_GEN`` in half when no extension is present.
    The suffix is therefore only dropped when it looks like an extension.
    """
    name = os.path.basename(filename)
    stem, ext = os.path.splitext(name)
    if ext and _EXTENSION_RE.match(ext):
        return stem
    return name


def _classify_locale_format(base_name: str):
    """Pick the format label for a name that ends with a locale token.

    Returns one of "DOOH", "OOH", "DISPLAY_BANNER", "POS_COUNTRY", "DS",
    or None when the name gives no usable hint.
    """
    # Explicit channel markers win; "_DOOH_" is tested before "_OOH_".
    if "_DOOH_" in base_name:
        return "DOOH"
    if "_OOH_" in base_name:
        return "OOH"
    if "_Display_" in base_name:
        return "DISPLAY_BANNER"
    # POS names start with the physical size, DS names with the campaign number.
    if _LEADING_SIZE_RE.match(base_name):
        return "POS_COUNTRY"
    if _LEADING_CAMPAIGN_RE.match(base_name):
        return "DS"
    return None


def _detect_language_and_format(base_name: str) -> tuple:
    """Extract ``(language_code, detected_format)`` by pattern matching.

    Either element may be None. A None language code means no pattern
    applied and the caller should fall back to GPT.
    """
    # SOME STATIC: AT_de_4116A_Halloween_Stories_fb_9x16_1 -> de-AT
    some_static = _SOME_STATIC_RE.match(base_name)
    if some_static:
        market, language = some_static.groups()
        return f"{language}-{market}", "SOME_STATIC"

    # DOOH / OOH / Display / POS country / DS: locale is the last field.
    # Both orders are normalised to "language-MARKET".
    locale = _LOCALE_SUFFIX_RE.search(base_name)
    if locale:
        language = locale.group(1) or locale.group(4)
        market = locale.group(2) or locale.group(3)
        return f"{language}-{market}", _classify_locale_format(base_name)

    # POS GEN: 21x29.7cm_A4_4068A_10065-01_GEN
    generic = _GENERIC_SUFFIX_RE.search(base_name)
    if generic:
        return generic.group(1).upper(), "POS_GEN"

    # DS generic assets: GEN_LOGO or a "_GEN" marker anywhere in the name.
    upper_name = base_name.upper()
    if "GEN_LOGO" in upper_name or "_GEN" in upper_name:
        return "GEN", "DS_GEN"

    return None, None


def _extract_campaign_number(base_name: str) -> str:
    """Return the campaign number found in *base_name*, or "" if none.

    A whole underscore-delimited field is preferred so that the digits of a
    pixel size (``1080x1920``) are not mistaken for a campaign number. The
    original unanchored search is kept as a fallback.
    """
    found = _CAMPAIGN_TOKEN_RE.search(base_name) or _CAMPAIGN_LOOSE_RE.search(base_name)
    return found.group(1) if found else ""


def _build_gpt_prompt(base_name: str) -> str:
    """Build the fallback prompt asking GPT to parse *base_name*."""
    return (
        "\n"
        f"Parse this H&M static image filename: {base_name}\n"
        "H&M static image filenames follow various formats:\n"
        "1. SOME STATIC: Market_Language_campaignnumber_campaignname_format_creativetype_ratio_ImageNumber\n"
        "Example: AT_de_4116A_Halloween_Stories_fb_9x16_1 (language = de-AT)\n"
        "2. DOOH/OOH/Display: CampaignNumber_Type_Static_..._FormatSize_Language-Market\n"
        "Example: 4045_DOOH_Static_PRIO1_EyeMediaGiant_Noreport_1080x1920_NO-no (language = no-NO)\n"
        "3. POS GEN: Size_Format_CampaignNumber_POPNumber_GEN\n"
        "Example: 21x29.7cm_A4_4068A_10065-01_GEN (language = GEN)\n"
        "4. POS Country: Size_Format_CampaignNumber_POPNumber_Language-Market\n"
        "Example: 50x70cm_Poster_4068A_10107-01_en-GB (language = en-GB)\n"
        "5. DS: CampaignNumber_CampaignName_Index_BU_Resolution_language-COUNTRY\n"
        "Example: 1019_SPRINGBRAND_00_W_1400x1050_fr-CA (language = fr-CA)\n"
        "Return only a JSON object with these exact keys:\n"
        "format_type, campaign_number, language\n"
        "For language: Use format 'xx-YY' (e.g., 'en-GB') or 'GEN' or 'CEN'\n"
        "For any component that can't be identified, use an empty string.\n"
    )


def run_check(config: dict, context: dict, check_id: str) -> dict:
    """
    QC check that parses static image filenames using pattern matching and GPT.

    Static image naming conventions (from cheat sheet):
    - SOME STATIC: Market_Language_campaignnumber_campaignname_format_creativetype_ratio_ImageNumber
    - DOOH Static: CampaignNumber_DOOH_Static_CreativeOption_MediaOwner_FormatName_FormatSize_Market-Language
    - OOH: CampaignNumber_OOH_Static_CreativeOption_MediaOwner_FormatName_FormatSize_Language-Market
    - Display Banners: CampaignNumber_Display_Static_CreativeOption_MediaOwner_FormatName_FormatSize_Language-Market
    - POS GEN: Size_Format_CampaignNumber_POPNumber_GenCode
    - POS Country Specific: Size_Format_CampaignNumber_POPNumber_Language-Market
    - DS: CampaignNumber_CampaignName_Index_BU_Resolution_language-COUNTRY

    Key extraction: language/country code, normalised to ``language-MARKET``
    (e.g. ``no-NO``) or the generic markers ``GEN`` / ``CEN``.

    Args:
        config: Check configuration. Not read by this check.
        context: Shared check context. The filename is read from
            ``context["HM_image_parse"]["filename"]`` and the parse result is
            written to ``context[check_id]``.
        check_id: Key under which this check stores its result in *context*.

    Returns:
        ``{"status": "passed", "details": {...}}`` with the parsed fields, or
        ``{"status": "error", "error_message": ...}`` when the filename is
        missing or the GPT fallback fails.
    """
    # Get parsed filename from HM_image_parse context (tolerate a None entry).
    hm_parse_data = context.get("HM_image_parse") or {}
    filename = hm_parse_data.get("filename")
    if not filename:
        return {
            "status": "error",
            "error_message": "Filename not found in HM_image_parse context. Ensure HM_image_parse check runs first."
        }

    base_name = _strip_extension(filename)

    # Dimensions are a pure regex lookup, so they are available on every path
    # (e.g. 1080x1920, 21x29.7cm, 1400x1050).
    dim_match = _SIZE_RE.search(base_name)
    dimensions = dim_match.group(1) if dim_match else ""

    language_code, detected_format = _detect_language_and_format(base_name)

    if language_code:
        campaign_number = _extract_campaign_number(base_name)
    else:
        # Pattern matching failed: use GPT as fallback.
        try:
            gpt_response = analyze_with_gpt(
                prompt=_build_gpt_prompt(base_name),
                content="",
                images=None,
                expect_json=True
            )
            # NOTE(review): the return type of analyze_with_gpt is not visible
            # here; accept an already-decoded dict as well as a JSON string.
            if isinstance(gpt_response, dict):
                parsed = gpt_response
            else:
                parsed = json.loads(gpt_response)
            language_code = parsed.get("language", "")
            detected_format = parsed.get("format_type", "UNKNOWN")
            campaign_number = parsed.get("campaign_number", "")
        except Exception as e:
            # Boundary around an external service: store partial results so
            # downstream checks still find an entry, then report the error.
            context[check_id] = {
                "filename": filename,
                "base_name": base_name,
                "detected_format": "PARSE_FAILED",
                "parsed": {
                    "language": "",
                    "format_type": "UNKNOWN",
                    "campaign_number": "",
                    "dimensions": dimensions
                }
            }
            return {
                "status": "error",
                "error_message": f"Failed to parse filename: {str(e)}"
            }

    parsed_fields = {
        "language": language_code or "",
        "format_type": detected_format or "UNKNOWN",
        "campaign_number": campaign_number,
        "dimensions": dimensions
    }

    # Store results in context. Each consumer gets its own copy so that a
    # later mutation of one does not leak into the other.
    context[check_id] = {
        "filename": filename,
        "base_name": base_name,
        "detected_format": detected_format,
        "parsed": dict(parsed_fields)
    }
    return {
        "status": "passed",
        "details": {
            "message": "Image filename parsed successfully",
            "filename_source": "HM_image_parse context",
            "detected_format": detected_format,
            "parsed": dict(parsed_fields)
        }
    }