creative-x-ferrero/core/filename_parser.py
DJP 72e9b54ff6 Initial commit: CreativeX API integration for Ferrero assets
- Ferrero filename parser with NEW format support
- CreativeX API client with retry logic
- State persistence with resume capability
- Upload and status checking scripts
- Comprehensive documentation
- Virtual environment support
2026-01-09 14:33:00 -05:00

245 lines
8.8 KiB
Python

"""
Ferrero filename parser - Extract metadata from Ferrero naming convention
Format: [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[DURATION]_[RATIO]_[SPOT]_[COUNTRY]_[LANGUAGE]_[SOCIAL]_[TRACKING]
Example: 1234567_RAF_ME-MOMENT_OLV_6S_1x1_REF_GL_it_IGF_pOiJ9s.mp4
"""
import re
from pathlib import Path
from typing import Optional
from core.data_loader import DataLoader
class FerreroFilenameParser:
    """Parse Ferrero filenames using a sequential detection algorithm.

    The filename stem is split on underscores. An optional job number, then
    the brand, subject and asset type are read positionally. Every remaining
    part is classified one at a time by its shape (regex, length, letter
    case) combined with which fields have already been found.
    """

    # Keys of the parsed dict that must be non-empty for a filename to be
    # accepted. Shared by parse() and validate_required_fields() so the two
    # checks cannot drift apart.
    REQUIRED_FIELDS = (
        'brand_code',
        'subject',
        'asset_type',
        'aspect_ratio',
        'country_code',
        'language_code',
    )

    def __init__(self, data_loader: "DataLoader"):
        """
        Initialize parser with data loader

        Args:
            data_loader: DataLoader instance for validation and resolution
        """
        self.data_loader = data_loader

        # Regex patterns for field detection
        self.patterns = {
            'job_number': re.compile(r'^\d{7,10}$'),
            'duration': re.compile(r'^\d{1,3}S$'),
            'aspect_ratio': re.compile(r'^\d+[x:]\d+$'),
            'spot_version': re.compile(r'^(MST|REF)$'),
            'tracking_id': re.compile(r'^[a-zA-Z0-9]{6}(-N)?$'),
        }

    def parse(self, filename: str) -> dict:
        """
        Parse filename and extract all metadata fields

        Args:
            filename: Filename to parse (with or without path)

        Returns:
            dict: Parsed metadata with all fields. Fields that were not
                found in the filename are left as empty strings.

        Raises:
            ValueError: If filename format is invalid or required fields missing
        """
        # Keep only the final path component, then drop its extension.
        original_filename = Path(filename).name
        name_no_ext = Path(original_filename).stem

        parts = name_no_ext.split('_')
        if len(parts) < 6:
            raise ValueError(
                "Invalid filename format: too few components. "
                f"Expected at least 6 parts, got {len(parts)}. "
                "Format: [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[RATIO]_[COUNTRY]_[LANGUAGE]"
            )

        result = self._empty_result(original_filename, name_no_ext)
        next_idx = self._parse_leading_fields(parts, result)
        self._parse_trailing_fields(parts[next_idx:], result)

        missing_fields = self._missing_required(result)
        if missing_fields:
            raise ValueError(
                f"Missing required fields: {', '.join(missing_fields)}. "
                f"Parsed so far: {result}"
            )

        # Channel: the social media name when a social code was recognised
        # (falling back to the raw code), otherwise the asset type code.
        social_code = result['social_media']
        if social_code:
            result['channel'] = (
                self.data_loader.get_social_media_name(social_code) or social_code
            )
        else:
            result['channel'] = result['asset_type']

        return result

    def validate_required_fields(self, parsed_data: dict) -> bool:
        """
        Validate that all required fields are present

        Args:
            parsed_data: Parsed filename data

        Returns:
            bool: True if all required fields present

        Raises:
            ValueError: If any required field is missing
        """
        missing = self._missing_required(parsed_data)
        if missing:
            raise ValueError(f"Missing required fields: {', '.join(missing)}")
        return True

    @staticmethod
    def _empty_result(original_filename: str, parsed_filename: str) -> dict:
        """Return the result dict with every metadata field blank."""
        # Key order is kept stable because the dict is rendered verbatim in
        # the "Parsed so far" error message.
        return {
            'job_number': '',
            'brand_code': '',
            'brand_name': '',
            'subject': '',
            'asset_type': '',
            'asset_type_name': '',
            'duration': '',
            'aspect_ratio': '',
            'spot_version': '',
            'country_code': '',
            'country_name': '',
            'language_code': '',
            'language_name': '',
            'social_media': '',
            'channel': '',
            'tracking_id': '',
            'original_filename': original_filename,
            'parsed_filename': parsed_filename,
        }

    def _missing_required(self, data: dict) -> list:
        """Return the names of REQUIRED_FIELDS that are absent or empty in data."""
        return [name for name in self.REQUIRED_FIELDS if not data.get(name)]

    def _invalid_code_error(self, kind: str, code: str) -> ValueError:
        """Build the ValueError for an unknown brand/country code, with suggestions."""
        similar = self.data_loader.get_similar_codes(code, kind)
        suggestions = f" Did you mean: {', '.join(similar)}?" if similar else ""
        return ValueError(f"Invalid {kind} code: '{code}'.{suggestions}")

    def _parse_leading_fields(self, parts: list, result: dict) -> int:
        """Read job number (optional), brand, subject and asset type into result.

        Args:
            parts: Underscore-separated filename components. The caller has
                already checked that there are at least 6, so the (at most)
                four positions read here always exist and need no bounds
                checks.
            result: Result dict to fill in place.

        Returns:
            int: Index of the first part not consumed by this method.

        Raises:
            ValueError: If the brand code or asset type fails validation.
        """
        loader = self.data_loader
        idx = 0

        # 1. Optional job number: only present when the first part is 7-10 digits.
        if self.patterns['job_number'].match(parts[idx]):
            result['job_number'] = parts[idx]
            idx += 1

        # 2. Brand (required, matched case-insensitively).
        brand_code = parts[idx].upper()
        if not loader.validate_brand_code(brand_code):
            raise self._invalid_code_error('brand', brand_code)
        result['brand_code'] = brand_code
        result['brand_name'] = loader.get_brand_name(brand_code)
        idx += 1

        # 3. Subject (required, taken verbatim).
        result['subject'] = parts[idx]
        idx += 1

        # 4. Asset type (required, matched case-insensitively).
        asset_type = parts[idx].upper()
        if not loader.validate_asset_type(asset_type):
            raise ValueError(f"Invalid asset type: '{asset_type}'")
        result['asset_type'] = asset_type
        result['asset_type_name'] = loader.get_asset_type_name(asset_type) or asset_type
        idx += 1

        return idx

    def _parse_trailing_fields(self, parts: list, result: dict) -> None:
        """Classify the parts that follow the asset type and store them in result.

        Each part is tested against the branches below in order and consumed
        by the first one that applies. Which branches are eligible depends on
        what has been found so far (ratio -> country -> language). Parts that
        match nothing are skipped silently.

        Args:
            parts: Filename components after the asset type.
            result: Result dict to fill in place.

        Raises:
            ValueError: If a part shaped like a country or language code
                fails validation.
        """
        loader = self.data_loader
        patterns = self.patterns
        found_ratio = False
        found_country = False
        found_language = False

        for part in parts:
            is_alpha = part.isalpha()

            # Duration (e.g. "6S", "30S"): only recognised before the ratio.
            if not found_ratio and patterns['duration'].match(part):
                result['duration'] = part.rstrip('S')

            # Aspect ratio (e.g. "16x9", "1x1").
            elif not found_ratio and patterns['aspect_ratio'].match(part):
                result['aspect_ratio'] = part
                found_ratio = True

            # Spot version: exactly "MST" or "REF", between ratio and country.
            elif found_ratio and not found_country and patterns['spot_version'].match(part):
                result['spot_version'] = part

            # Country code: 2 upper-case letters after the ratio. The branch
            # condition already guarantees upper case, so no normalisation.
            elif (found_ratio and not found_country
                  and len(part) == 2 and is_alpha and part.isupper()):
                if not loader.validate_country_code(part):
                    raise self._invalid_code_error('country', part)
                result['country_code'] = part
                result['country_name'] = loader.get_country_name(part)
                found_country = True

            # Language code: 2-3 lower-case letters after the country.
            elif (found_country and not found_language
                  and len(part) in (2, 3) and is_alpha and part.islower()):
                if not loader.validate_language_code(part):
                    raise ValueError(f"Invalid language code: '{part}'")
                result['language_code'] = part
                result['language_name'] = loader.get_language_name(part)
                found_language = True

            # Social media: 3 upper-case letters after the language. An
            # unrecognised code is skipped rather than treated as an error.
            elif found_language and len(part) == 3 and is_alpha and part.isupper():
                if loader.validate_social_code(part):
                    result['social_media'] = part

            # Tracking ID: 6 alphanumerics, optionally with a "-N" suffix.
            elif patterns['tracking_id'].match(part):
                result['tracking_id'] = part

            # Anything else is an unknown part and is ignored.