Rewrite filename check + add price/currency check to image QC

Filename check: - Rewritten to flexibly parse multiple H&M naming conventions (Display, DOOH, OOH, SOME STATIC, Social, POS, DS) - Extracts country code, language code, dimensions, campaign number - Scores based on how much metadata was extracted (not rigid pattern) - Tested against real filenames: BG_bg, ES_es, NO-no formats Price/currency check (new): - Detects prices in images via LLM vision API - Validates currency against global pricing reference (deterministic) - Falls back to LLM validation for unknown countries - Optional campaign pricing sheet validation when has_pricing=True - Added to profile with weight 30 Profile weights rebalanced: filename 30, quality 40, price 30 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 18:39:54 +02:00 · 2026-03-26 18:39:54 +02:00 · fc15a2dda3
commit fc15a2dda3
parent dc73268309
5 changed files with 454 additions and 87 deletions
--- a/modules/hm_qc/checks/init.py
+++ b/modules/hm_qc/checks/init.py
@ -7,5 +7,6 @@ from .base_check import BaseCheck
 from .sample_filename_check import FilenameCheck
 from .sample_quality_check import QualityCheck
 from .dimension_check import DimensionCheck
+from .price_currency_check import PriceCurrencyCheck

-__all__ = ['BaseCheck', 'FilenameCheck', 'QualityCheck', 'DimensionCheck']
+__all__ = ['BaseCheck', 'FilenameCheck', 'QualityCheck', 'DimensionCheck', 'PriceCurrencyCheck']
--- a/modules/hm_qc/checks/price_currency_check.py
+++ b/modules/hm_qc/checks/price_currency_check.py
@ -0,0 +1,271 @@
+"""
+Price & Currency Check.
+
+Validates that prices/currency in the image match the expected country
+using the global pricing reference and optional campaign pricing sheet.
+"""
+import os
+import json
+import logging
+from typing import Dict, Any
+from PIL import Image
+from .base_check import BaseCheck
+from core.services.llm_config import LLMConfig
+
+logger = logging.getLogger(__name__)
+
+
+class PriceCurrencyCheck(BaseCheck):
+    """
+    Check that price/currency in the image matches the country code from filename.
+
+    Uses:
+    1. Global pricing reference (storage/reference/global_pricing.json) for
+       deterministic currency symbol/format validation
+    2. Campaign pricing sheet (if has_pricing=True) for actual price validation
+    3. LLM fallback for unknown countries
+    """
+
+    def __init__(self, name: str = "price_currency", weight: float = 25.0, config: Dict[str, Any] = None):
+        super().__init__(name, weight, config)
+        self.llm_provider = self.config.get('llm_provider', 'openai')
+        self.llm_model = self.config.get('llm_model', 'gpt-4o')
+
+    def run(self, file_path: str, context: Dict[str, Any]) -> Dict[str, Any]:
+        try:
+            # Get language/country from filename check
+            filename_data = context.get('filename_data', {})
+            language = filename_data.get('language', '')
+            country_code = filename_data.get('country_code', '')
+
+            if not language and not country_code:
+                return self._create_result(
+                    status='skipped',
+                    score=100.0,
+                    message='No country/language code found in filename — skipping price check',
+                    details={'reason': 'no_country_code'}
+                )
+
+            # Skip for GEN/CEN files
+            if language and language.upper() in ('GEN', 'CEN'):
+                return self._create_result(
+                    status='skipped',
+                    score=100.0,
+                    message=f'Price check skipped for {language.upper()} files',
+                    details={'reason': 'gen_cen_file'}
+                )
+
+            # Check file is an image
+            ext = os.path.splitext(file_path)[1].lower()
+            if ext not in ('.jpg', '.jpeg', '.png'):
+                return self._create_result(
+                    status='skipped',
+                    score=100.0,
+                    message='Price check only applies to image files',
+                    details={'file_type': ext}
+                )
+
+            self.logger.info(f"Running price/currency check for {language} ({country_code})")
+
+            # Step 1: Detect prices in the image using LLM
+            price_info = self._detect_prices(file_path, context)
+
+            if not price_info or price_info.get('currency_found') == 'NOT_FOUND':
+                return self._create_result(
+                    status='skipped',
+                    score=100.0,
+                    message='No price/currency detected in image — skipping validation',
+                    details={'price_detection': price_info or {}}
+                )
+
+            # Step 2: Validate against global pricing reference
+            currency = price_info.get('currency_found', '')
+            validation = self._validate_currency(currency, language, country_code)
+
+            # Step 3: Campaign-specific price validation
+            campaign_result = self._validate_campaign_price(price_info, language, context)
+
+            # Build result
+            score = 100.0
+            issues = []
+            details = {
+                'language': language,
+                'country_code': country_code,
+                'detected_currency': currency,
+                'detected_price': price_info.get('price_value'),
+                'confidence': price_info.get('confidence', 0),
+                **validation
+            }
+
+            if campaign_result:
+                details['campaign_price'] = campaign_result
+
+            # Score deductions
+            if price_info.get('confidence', 0) < 0.7:
+                score -= 20
+                issues.append(f"Low confidence in price detection ({price_info.get('confidence', 0):.0%})")
+
+            if not validation.get('matches_region', True):
+                score -= 40
+                issues.append(
+                    f"Currency mismatch: found {currency}, "
+                    f"expected {validation.get('expected_currency', '?')} for {language}"
+                )
+
+            if campaign_result and campaign_result.get('price_matches_campaign') is False:
+                score -= 20
+                issues.append(
+                    f"Price does not match campaign sheet: "
+                    f"expected {campaign_result.get('expected_price', '?')}"
+                )
+
+            details['issues'] = issues
+
+            if score >= 90:
+                status = 'passed'
+                message = f'Price/currency validation passed — {currency} correct for {language}'
+            elif score >= 70:
+                status = 'warning'
+                message = f'Price/currency has minor issues: {", ".join(issues)}'
+            else:
+                status = 'failed'
+                message = f'Price/currency validation failed: {", ".join(issues)}'
+
+            return self._create_result(
+                status=status,
+                score=score,
+                message=message,
+                details=details,
+                recommendations=[
+                    f'Expected currency for {language}: {validation.get("expected_currency", "unknown")} ({validation.get("expected_symbol", "")})'
+                ] if issues else None
+            )
+
+        except Exception as e:
+            self.logger.error(f"Price/currency check error: {e}", exc_info=True)
+            return self._create_result(
+                status='error',
+                score=0.0,
+                message=f'Error running price check: {str(e)}',
+                details={'error': str(e)}
+            )
+
+    def _detect_prices(self, file_path: str, context: Dict[str, Any]) -> dict:
+        """Use LLM to detect prices and currency in the image."""
+        prompt = """Analyze this image for price and currency information.
+
+Extract any prices shown and identify the currency used.
+Be flexible in recognizing different formats (e.g., "$100", "LE 699", "€20", "29,99 лв.").
+
+Return JSON with:
+- currency_found: 3-letter currency code (e.g., "BGN", "EUR", "USD") or "NOT_FOUND"
+- currency_symbol: the actual symbol shown (e.g., "лв.", "€", "$") or null
+- price_value: detected numerical value as string or null
+- symbol_position: "before" or "after" the price, or null
+- format_valid: boolean - is the price properly formatted
+- confidence: confidence score 0-1
+"""
+        try:
+            response = LLMConfig.call_vision_api(
+                prompt=prompt,
+                image_asset=file_path,
+                provider=self.llm_provider,
+                model=self.llm_model,
+                usage_context={
+                    'module': 'hm_qc',
+                    'check_name': 'price_currency',
+                    'user': context.get('user'),
+                    'session_id': context.get('session_id')
+                }
+            )
+
+            text = response.get('text', '')
+            # Extract JSON from response
+            start = text.find('{')
+            end = text.rfind('}') + 1
+            if start != -1 and end > start:
+                return json.loads(text[start:end])
+        except Exception as e:
+            self.logger.warning(f"Price detection failed: {e}")
+
+        return None
+
+    def _validate_currency(self, currency: str, language: str, country_code: str) -> dict:
+        """Validate detected currency against global pricing reference."""
+        result = {'matches_region': True, 'validation_method': 'none'}
+
+        # Try global pricing reference
+        pricing_path = os.path.join('storage', 'reference', 'global_pricing.json')
+        if os.path.exists(pricing_path):
+            try:
+                with open(pricing_path, 'r', encoding='utf-8') as f:
+                    global_pricing = json.load(f)
+
+                # Try language code (e.g., "bg-BG"), then country code (e.g., "BG")
+                entry = global_pricing.get(language) or global_pricing.get(country_code)
+
+                if entry:
+                    expected_currency = entry.get('currency_code', '')
+                    expected_symbol = entry.get('symbol', '')
+                    matches = (
+                        currency.upper() == expected_currency.upper() or
+                        currency == expected_symbol
+                    )
+                    return {
+                        'matches_region': matches,
+                        'expected_currency': expected_currency,
+                        'expected_symbol': expected_symbol,
+                        'expected_position': entry.get('position', ''),
+                        'expected_country': entry.get('country', ''),
+                        'validation_method': 'global_pricing_reference',
+                        'reason': (
+                            f"{'Match' if matches else 'Mismatch'}: "
+                            f"{language} expects {expected_currency} ({expected_symbol}), found {currency}"
+                        )
+                    }
+            except (json.JSONDecodeError, OSError):
+                pass
+
+        # No global pricing available — don't penalize
+        result['validation_method'] = 'skipped_no_reference'
+        result['reason'] = 'No global pricing reference available'
+        return result
+
+    def _validate_campaign_price(self, price_info: dict, language: str, context: Dict[str, Any]) -> dict:
+        """Validate actual price against campaign pricing sheet if available."""
+        campaign_ctx = context.get('campaign_presentation', {})
+        if not campaign_ctx.get('has_pricing') or not campaign_ctx.get('parsed_content'):
+            return None
+
+        try:
+            currency = price_info.get('currency_found', '')
+            price_value = price_info.get('price_value', '')
+
+            prompt = f"""Compare the price detected in this asset against the campaign pricing sheet.
+
+Detected price: {price_value} {currency}
+Region/Language: {language}
+
+Campaign pricing reference:
+{campaign_ctx['parsed_content'][:6000]}
+
+Return JSON with:
+- price_matches_campaign: true/false
+- expected_price: the expected price from the campaign sheet for this region (or null)
+- reason: brief explanation
+"""
+            client = LLMConfig.get_client('openai', 'gpt-4o')
+            response = client.chat.completions.create(
+                model='gpt-4o',
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=1024
+            )
+            text = response.choices[0].message.content or ''
+            start = text.find('{')
+            end = text.rfind('}') + 1
+            if start != -1 and end > start:
+                return json.loads(text[start:end])
+        except Exception as e:
+            self.logger.warning(f"Campaign price validation failed: {e}")
+
+        return None
--- a/modules/hm_qc/checks/sample_filename_check.py
+++ b/modules/hm_qc/checks/sample_filename_check.py
@ -1,106 +1,97 @@
 """
-Sample Filename Check.
+H&M Filename Check.

-Validates H&M filename conventions (simple check, no AI needed).
-Demonstrates scoring with deductions for issues.
+Extracts key metadata from H&M filenames using flexible pattern matching.
+Supports multiple naming conventions used across H&M campaigns.
 """
 import os
 import re
-from typing import Dict, Any
+from typing import Dict, Any, Optional
 from .base_check import BaseCheck


 class FilenameCheck(BaseCheck):
    """
-    Check filename against H&M conventions.
+    Flexible filename parser for H&M marketing assets.

-    Expected format: {dimensions}_{format}_{year}_{reference}_{language}.{ext}
-    Example: 1080x1920_jpg_2024_spring_en-GB.pdf
+    Extracts country code, language code, dimensions, campaign number,
+    and format type from various H&M naming conventions. Scores based
+    on how much metadata could be extracted.
+
+    Known H&M filename formats:
+    - SOME STATIC: Market_Language_CampaignNum_CampaignName_Format_CreativeType_Ratio_ImageNum
+      e.g., AT_de_4116A_Halloween_Stories_fb_9x16_1.jpg
+    - Display Banners: ID_ID_CampaignNum_Display_Static_MediaOwner_Size_Version_Market_Lang_Priority
+      e.g., 6186653_1910358_1022B_Display_Static_Netinfo_300x600_V1_BG_bg_PRIO4.jpg
+    - DOOH/OOH: CampaignNum_DOOH_Static_Option_MediaOwner_FormatName_Size_Language-Market
+      e.g., 4045_DOOH_Static_PRIO1_EyeMediaGiant_Noreport_1080x1920_NO-no.jpg
+    - POS GEN: Size_Format_CampaignNum_POPNumber_GEN
+      e.g., 21x29.7cm_A4_4068A_10065-01_GEN.jpg
+    - POS Country: Size_Format_CampaignNum_POPNumber_Language-Market
+      e.g., 50x70cm_Poster_4068A_10107-01_en-GB.jpg
+    - Video: Market_Lang_CampaignNum_CampaignName_Platform_Ratio_Duration_Version_ID
+      e.g., ES_es_1013A_Spring_W_Bumper_YT_16x9_6_A_6889135.mp4
    """

    def __init__(self, name: str = "filename_check", weight: float = 10.0, config: Dict[str, Any] = None):
        super().__init__(name, weight, config)

-        # Filename pattern (simplified for demo)
-        self.pattern = re.compile(
-            r'^(\d+x\d+)_([a-zA-Z0-9]+)_(\d{4})_([a-zA-Z0-9_-]+)_([a-zA-Z]{2}-[A-Z]{2}|GEN|CEN)\.(pdf|jpg|jpeg|png|psd)$',
-            re.IGNORECASE
-        )
-
    def run(self, file_path: str, context: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Run filename validation check.
-
-        Args:
-            file_path: Path to file
-            context: Shared context
-
-        Returns:
-            Check result with score
-        """
        try:
            filename = os.path.basename(file_path)
+            base_name, ext = os.path.splitext(filename)
+            ext_clean = ext.lstrip('.').lower()
            self.logger.info(f"Checking filename: {filename}")

-            # Parse filename
-            match = self.pattern.match(filename)
+            # Extract metadata using flexible patterns
+            parsed = self._parse_filename(base_name, ext_clean)

-            if not match:
-                return self._create_result(
-                    status='failed',
-                    score=30.0,
-                    message='Filename does not match H&M conventions',
-                    details={
-                        'filename': filename,
-                        'expected_format': '{dimensions}_{format}_{year}_{reference}_{language}.{ext}',
-                        'example': '1080x1920_jpg_2024_spring_en-GB.pdf'
-                    },
-                    recommendations=[
-                        'Rename file to follow H&M naming convention',
-                        'Ensure all required fields are present',
-                        'Use correct language code format (e.g., en-GB, GEN, CEN)'
-                    ]
-                )
+            # Store in context for other checks (quality, pricing)
+            context['filename_data'] = parsed

-            # Extract components
-            dimensions, format_type, year, reference, language, ext = match.groups()
-
-            # Calculate score with deductions
-            score = 100.0
+            # Score based on what was extracted
+            score = 50.0  # Base score for having a parseable filename
            issues = []
+            found = []

-            # Check year (should be current or recent)
-            current_year = 2025
-            year_int = int(year)
-            if year_int < current_year - 2:
-                score -= 10
-                issues.append(f"Year {year} is more than 2 years old")
+            if parsed.get('country_code'):
+                score += 15
+                found.append(f"Country: {parsed['country_code']}")
+            else:
+                issues.append("Could not extract country code")

-            # Check format consistency
-            if format_type.lower() != ext.lower() and ext.lower() != 'pdf':
-                score -= 15
-                issues.append(f"Format '{format_type}' doesn't match extension '{ext}'")
+            if parsed.get('language_code'):
+                score += 10
+                found.append(f"Language: {parsed['language_code']}")
+            else:
+                issues.append("Could not extract language code")
+
+            if parsed.get('dimensions'):
+                score += 10
+                found.append(f"Dimensions: {parsed['dimensions']}")
+            else:
+                issues.append("Could not extract dimensions/size")
+
+            if parsed.get('campaign_number'):
+                score += 10
+                found.append(f"Campaign: {parsed['campaign_number']}")
+            else:
+                issues.append("Could not extract campaign number")
+
+            if parsed.get('format_type'):
+                score += 5
+                found.append(f"Format: {parsed['format_type']}")

            # Determine status
            if score >= 90:
                status = 'passed'
-                message = 'Filename follows H&M conventions'
+                message = 'Filename parsed successfully — all key metadata extracted'
            elif score >= 70:
                status = 'warning'
-                message = f'Filename valid but has minor issues: {", ".join(issues)}'
+                message = f'Filename partially parsed — missing: {", ".join(issues)}'
            else:
                status = 'failed'
-                message = f'Filename has significant issues: {", ".join(issues)}'
-
-            # Store parsed data in context for other checks
-            context['filename_data'] = {
-                'dimensions': dimensions,
-                'format': format_type,
-                'year': year,
-                'reference': reference,
-                'language': language,
-                'extension': ext
-            }
+                message = f'Filename could not be parsed — missing: {", ".join(issues)}'

            return self._create_result(
                status=status,
@ -108,18 +99,16 @@ class FilenameCheck(BaseCheck):
                message=message,
                details={
                    'filename': filename,
-                    'parsed': {
-                        'dimensions': dimensions,
-                        'format': format_type,
-                        'year': year,
-                        'reference': reference,
-                        'language': language,
-                        'extension': ext
-                    },
-                    'issues': issues if issues else []
+                    'detected_format': parsed.get('detected_format', 'unknown'),
+                    'extracted': {k: v for k, v in parsed.items() if v and k != 'detected_format'},
+                    'found': found,
+                    'issues': issues
                },
-                recommendations=['Update year to current year'] if issues else None,
-                analysis=f"Filename components parsed successfully. {len(issues)} issue(s) found." if issues else "Filename is valid and follows conventions."
+                recommendations=[
+                    'Ensure filename contains country code (e.g., BG, GB, ES)',
+                    'Ensure filename contains language code (e.g., bg, en, es)',
+                    'Ensure filename contains dimensions (e.g., 300x600, 1080x1920)'
+                ] if issues else None
            )

        except Exception as e:
@ -130,3 +119,101 @@ class FilenameCheck(BaseCheck):
                message=f'Error checking filename: {str(e)}',
                details={'error': str(e)}
            )
+
+    def _parse_filename(self, base_name: str, ext: str) -> Dict[str, Any]:
+        """
+        Extract metadata from filename using multiple pattern strategies.
+
+        Returns dict with: country_code, language_code, language (combined),
+        dimensions, campaign_number, format_type, detected_format, extension
+        """
+        result = {
+            'country_code': None,
+            'language_code': None,
+            'language': None,  # Combined "lang-COUNTRY" for other checks
+            'dimensions': None,
+            'campaign_number': None,
+            'format_type': None,
+            'detected_format': None,
+            'extension': ext,
+            'ratio': None,
+        }
+
+        parts = base_name.split('_')
+
+        # --- Extract dimensions (NxN pattern) ---
+        dim_match = re.search(r'(\d+x\d+)', base_name, re.IGNORECASE)
+        if dim_match:
+            result['dimensions'] = dim_match.group(1)
+            # Derive ratio from dimensions
+            try:
+                w, h = dim_match.group(1).split('x')
+                result['ratio'] = f"{w}x{h}"
+            except ValueError:
+                pass
+
+        # --- Extract campaign number (digits + optional letter, e.g., 1022B, 4116A) ---
+        campaign_match = re.search(r'(?:^|_)(\d{3,5}[A-Z]?)(?:_|$)', base_name)
+        if campaign_match:
+            result['campaign_number'] = campaign_match.group(1)
+
+        # --- Detect format type ---
+        name_upper = base_name.upper()
+        if '_DOOH_' in name_upper:
+            result['format_type'] = 'DOOH'
+            result['detected_format'] = 'DOOH'
+        elif '_OOH_' in name_upper:
+            result['format_type'] = 'OOH'
+            result['detected_format'] = 'OOH'
+        elif '_DISPLAY_' in name_upper:
+            result['format_type'] = 'Display'
+            result['detected_format'] = 'DISPLAY_BANNER'
+        elif any(x in name_upper for x in ['_FB_', '_TK_', '_PN_', '_YT_']):
+            result['format_type'] = 'Social'
+            result['detected_format'] = 'SOCIAL_MEDIA'
+        elif '_POS_' in name_upper or re.match(r'^\d+x\d+cm', base_name, re.IGNORECASE):
+            result['format_type'] = 'POS'
+            result['detected_format'] = 'POS'
+
+        # --- Extract country/language codes ---
+
+        # Strategy 1: SOME STATIC — Market_Language at start (e.g., AT_de_...)
+        start_match = re.match(r'^([A-Z]{2})_([a-z]{2})_', base_name)
+        if start_match:
+            result['country_code'] = start_match.group(1)
+            result['language_code'] = start_match.group(2)
+            result['detected_format'] = result['detected_format'] or 'SOME_STATIC'
+
+        # Strategy 2: Display banner — _MARKET_lang_ near end (e.g., _BG_bg_)
+        if not result['country_code']:
+            market_lang = re.search(r'_([A-Z]{2})_([a-z]{2})(?:_|$)', base_name)
+            if market_lang:
+                result['country_code'] = market_lang.group(1)
+                result['language_code'] = market_lang.group(2)
+
+        # Strategy 3: Hyphenated — lang-MARKET or MARKET-lang at end (e.g., _en-GB, _NO-no)
+        if not result['country_code']:
+            hyph_match = re.search(r'_([a-z]{2})-([A-Z]{2})(?:\.|_|$)', base_name)
+            if hyph_match:
+                result['language_code'] = hyph_match.group(1)
+                result['country_code'] = hyph_match.group(2)
+            else:
+                hyph_match2 = re.search(r'_([A-Z]{2})-([a-z]{2})(?:\.|_|$)', base_name)
+                if hyph_match2:
+                    result['country_code'] = hyph_match2.group(1)
+                    result['language_code'] = hyph_match2.group(2)
+
+        # Strategy 4: GEN/CEN marker
+        if not result['country_code']:
+            gen_match = re.search(r'_(GEN|CEN)(?:\.|_|$)', base_name, re.IGNORECASE)
+            if gen_match:
+                result['language'] = gen_match.group(1).upper()
+                result['country_code'] = gen_match.group(1).upper()
+                result['language_code'] = gen_match.group(1).upper()
+                result['detected_format'] = result['detected_format'] or 'POS_GEN'
+
+        # Build combined language code (e.g., "bg-BG")
+        if result['country_code'] and result['language_code'] and not result['language']:
+            result['language'] = f"{result['language_code']}-{result['country_code']}"
+
+        return result
--- a/modules/hm_qc/executor.py
+++ b/modules/hm_qc/executor.py
@ -11,7 +11,7 @@ import logging
 from datetime import datetime
 from typing import Dict, List, Any
 from .scoring import ScoringEngine
-from .checks import FilenameCheck, QualityCheck, DimensionCheck
+from .checks import FilenameCheck, QualityCheck, DimensionCheck, PriceCurrencyCheck
 from core.utils.progress_tracker import UnifiedProgressTracker
 from core.models.qc_report import QCReport
 from core.models.database import db
@ -171,7 +171,8 @@ class QCExecutor:
            'filename_parse': FilenameCheck,
            'quality_check': QualityCheck,
            'image_quality': QualityCheck,
-            'dimension_check': DimensionCheck
+            'dimension_check': DimensionCheck,
+            'price_currency': PriceCurrencyCheck
        }

        for check_config in profile_checks:
--- a/modules/hm_qc/profiles/profiles.yaml
+++ b/modules/hm_qc/profiles/profiles.yaml
@ -11,17 +11,24 @@ profiles:
    description: "Quality checks for H&M image assets (JPG, PNG, PSD)"
    checks:
      - name: "filename_parse"
-        weight: 50
+        weight: 30
        enabled: true
        llm_provider: null
-        description: "Validate H&M filename conventions"
+        description: "Extract and validate country code, language, dimensions from filename"

      - name: "image_quality"
-        weight: 50
+        weight: 40
        enabled: true
        llm_provider: "openai"
        llm_model: "gpt-4o"
-        description: "AI-powered image quality and legibility assessment"
+        description: "AI-powered image quality, legibility, and campaign guideline assessment"
+
+      - name: "price_currency"
+        weight: 30
+        enabled: true
+        llm_provider: "openai"
+        llm_model: "gpt-4o"
+        description: "Validate price/currency matches country using global pricing reference"

 # Note: Weights should sum to 100 for each profile
 # Higher weight = more important to overall score