""" Ferrero filename parser - Extract metadata from Ferrero naming convention Format: [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[DURATION]_[RATIO]_[SPOT]_[COUNTRY]_[LANGUAGE]_[SOCIAL]_[TRACKING] Example: 1234567_RAF_ME-MOMENT_OLV_6S_1x1_REF_GL_it_IGF_pOiJ9s.mp4 """ import re from pathlib import Path from typing import Optional from core.data_loader import DataLoader class FerreroFilenameParser: """Parse Ferrero filenames using sequential detection algorithm""" def __init__(self, data_loader: DataLoader): """ Initialize parser with data loader Args: data_loader: DataLoader instance for validation and resolution """ self.data_loader = data_loader # Regex patterns for field detection self.patterns = { 'job_number': re.compile(r'^\d{7,10}$'), 'duration': re.compile(r'^\d{1,3}S$'), 'aspect_ratio': re.compile(r'^\d+[x:]\d+$'), 'spot_version': re.compile(r'^(MST|REF)$'), 'tracking_id': re.compile(r'^[a-zA-Z0-9]{6}(-N)?$'), } def parse(self, filename: str) -> dict: """ Parse filename and extract all metadata fields Args: filename: Filename to parse (with or without path) Returns: dict: Parsed metadata with all fields Raises: ValueError: If filename format is invalid or required fields missing """ # Extract just the filename if full path provided filename = Path(filename).name # Store original filename original_filename = filename # Remove extension name_no_ext = Path(filename).stem # Split by underscore parts = name_no_ext.split('_') if len(parts) < 6: raise ValueError( f"Invalid filename format: too few components. " f"Expected at least 6 parts, got {len(parts)}. " f"Format: [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[RATIO]_[COUNTRY]_[LANGUAGE]" ) # Initialize result dict result = { 'job_number': '', 'brand_code': '', 'brand_name': '', 'subject': '', 'asset_type': '', 'asset_type_name': '', 'duration': '', 'aspect_ratio': '', 'spot_version': '', 'country_code': '', 'country_name': '', 'language_code': '', 'language_name': '', 'social_media': '', 'channel': '', 'tracking_id': '', 'original_filename': original_filename, 'parsed_filename': name_no_ext, } # Track current position in parts list idx = 0 # 1. Check for optional job number (first part if 7-10 digits) if self.patterns['job_number'].match(parts[idx]): result['job_number'] = parts[idx] idx += 1 # 2. Extract brand (required) if idx >= len(parts): raise ValueError("Missing brand code") brand_code = parts[idx].upper() if not self.data_loader.validate_brand_code(brand_code): similar = self.data_loader.get_similar_codes(brand_code, 'brand') suggestions = f" Did you mean: {', '.join(similar)}?" if similar else "" raise ValueError(f"Invalid brand code: '{brand_code}'.{suggestions}") result['brand_code'] = brand_code result['brand_name'] = self.data_loader.get_brand_name(brand_code) idx += 1 # 3. Extract subject (required) if idx >= len(parts): raise ValueError("Missing subject title") result['subject'] = parts[idx] idx += 1 # 4. Extract asset type (required) if idx >= len(parts): raise ValueError("Missing asset type") asset_type = parts[idx].upper() if not self.data_loader.validate_asset_type(asset_type): raise ValueError(f"Invalid asset type: '{asset_type}'") result['asset_type'] = asset_type result['asset_type_name'] = self.data_loader.get_asset_type_name(asset_type) or asset_type idx += 1 # Now parse optional fields using sequential detection found_ratio = False found_country = False found_language = False while idx < len(parts): part = parts[idx] # Duration: Ends with 'S' and starts with digits (e.g., "6S", "30S") if not found_ratio and self.patterns['duration'].match(part): result['duration'] = part.rstrip('S') idx += 1 # Aspect Ratio: Contains 'x' or ':' (e.g., "16x9", "1x1") elif not found_ratio and self.patterns['aspect_ratio'].match(part): result['aspect_ratio'] = part found_ratio = True idx += 1 # Spot Version: Exactly "MST" or "REF" elif found_ratio and not found_country and self.patterns['spot_version'].match(part): result['spot_version'] = part idx += 1 # Country Code: 2 uppercase alphabetic chars (after ratio) elif found_ratio and not found_country and len(part) == 2 and part.isalpha() and part.isupper(): country_code = part.upper() if not self.data_loader.validate_country_code(country_code): similar = self.data_loader.get_similar_codes(country_code, 'country') suggestions = f" Did you mean: {', '.join(similar)}?" if similar else "" raise ValueError(f"Invalid country code: '{country_code}'.{suggestions}") result['country_code'] = country_code result['country_name'] = self.data_loader.get_country_name(country_code) found_country = True idx += 1 # Language Code: 2-3 lowercase alphabetic chars (after country) elif found_country and not found_language and len(part) in [2, 3] and part.isalpha() and part.islower(): language_code = part.lower() if not self.data_loader.validate_language_code(language_code): raise ValueError(f"Invalid language code: '{language_code}'") result['language_code'] = language_code result['language_name'] = self.data_loader.get_language_name(language_code) found_language = True idx += 1 # Social Media: 3 uppercase chars (after language) elif found_language and len(part) == 3 and part.isalpha() and part.isupper(): social_code = part.upper() if self.data_loader.validate_social_code(social_code): result['social_media'] = social_code idx += 1 else: # Might be tracking ID or unknown, continue idx += 1 # Tracking ID: 6 alphanumeric, optionally with -N suffix elif self.patterns['tracking_id'].match(part): result['tracking_id'] = part idx += 1 # Fallback: Skip unknown parts else: idx += 1 # Validate required fields required_fields = { 'brand_code': result['brand_code'], 'subject': result['subject'], 'asset_type': result['asset_type'], 'aspect_ratio': result['aspect_ratio'], 'country_code': result['country_code'], 'language_code': result['language_code'], } missing_fields = [name for name, value in required_fields.items() if not value] if missing_fields: raise ValueError( f"Missing required fields: {', '.join(missing_fields)}. " f"Parsed so far: {result}" ) # Determine channel if result['social_media']: # Use social media name result['channel'] = self.data_loader.get_social_media_name(result['social_media']) or result['social_media'] else: # Use asset type as channel result['channel'] = result['asset_type'] return result def validate_required_fields(self, parsed_data: dict) -> bool: """ Validate that all required fields are present Args: parsed_data: Parsed filename data Returns: bool: True if all required fields present Raises: ValueError: If any required field is missing """ required = ['brand_code', 'subject', 'asset_type', 'aspect_ratio', 'country_code', 'language_code'] missing = [field for field in required if not parsed_data.get(field)] if missing: raise ValueError(f"Missing required fields: {', '.join(missing)}") return True