- Ferrero filename parser with NEW format support - CreativeX API client with retry logic - State persistence with resume capability - Upload and status checking scripts - Comprehensive documentation - Virtual environment support
245 lines
8.8 KiB
Python
245 lines
8.8 KiB
Python
"""
Ferrero filename parser - Extract metadata from Ferrero naming convention

Format: [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[DURATION]_[RATIO]_[SPOT]_[COUNTRY]_[LANGUAGE]_[SOCIAL]_[TRACKING]
Example: 1234567_RAF_ME-MOMENT_OLV_6S_1x1_REF_GL_it_IGF_pOiJ9s.mp4
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
from core.data_loader import DataLoader
|
|
|
|
|
|
class FerreroFilenameParser:
    """Parse Ferrero filenames using a sequential detection algorithm.

    The filename stem is split on ``_``. The leading components are
    positional (an optional job number, then brand, subject and asset type).
    Every remaining component is classified one at a time by its shape and by
    which fields have already been found: duration, aspect ratio, spot
    version, country, language, social media code and tracking ID.
    Components that match nothing are ignored.
    """

    # Result keys that must be non-empty for a filename to be accepted.
    # Shared by parse() and validate_required_fields() so the two checks
    # cannot drift apart; the order here is the order used in error messages.
    REQUIRED_FIELDS = (
        'brand_code',
        'subject',
        'asset_type',
        'aspect_ratio',
        'country_code',
        'language_code',
    )

    def __init__(self, data_loader: "DataLoader"):
        """
        Initialize parser with data loader

        Args:
            data_loader: DataLoader instance for validation and resolution.
                The parser calls its ``validate_*_code`` / ``validate_asset_type``
                methods, its ``get_*_name`` methods and ``get_similar_codes``.
        """
        self.data_loader = data_loader

        # Regex patterns for the fields that are recognised purely by shape.
        self.patterns = {
            'job_number': re.compile(r'^\d{7,10}$'),
            'duration': re.compile(r'^\d{1,3}S$'),
            'aspect_ratio': re.compile(r'^\d+[x:]\d+$'),
            'spot_version': re.compile(r'^(MST|REF)$'),
            'tracking_id': re.compile(r'^[a-zA-Z0-9]{6}(-N)?$'),
        }

    def parse(self, filename: str) -> dict:
        """
        Parse filename and extract all metadata fields

        Args:
            filename: Filename to parse (with or without path)

        Returns:
            dict: Parsed metadata. All keys are always present; fields that
            were not found in the filename are empty strings.

        Raises:
            ValueError: If the filename has fewer than 6 underscore-separated
                components, a brand/asset/country/language code is rejected
                by the data loader, or a required field is missing.
        """
        # .name drops any directory prefix, .stem drops the final extension.
        original_filename = Path(filename).name
        name_no_ext = Path(original_filename).stem
        parts = name_no_ext.split('_')

        if len(parts) < 6:
            raise ValueError(
                "Invalid filename format: too few components. "
                f"Expected at least 6 parts, got {len(parts)}. "
                "Format: [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[RATIO]_[COUNTRY]_[LANGUAGE]"
            )

        result = self._new_result(original_filename, name_no_ext)

        # Positional fields first, then shape-detected fields on the rest.
        next_idx = self._parse_leading_fields(parts, result)
        self._parse_trailing_fields(parts[next_idx:], result)

        missing_fields = self._missing_required(result)
        if missing_fields:
            raise ValueError(
                f"Missing required fields: {', '.join(missing_fields)}. "
                f"Parsed so far: {result}"
            )

        # Channel is the social media name when a social code was found
        # (falling back to the raw code), otherwise the asset type code.
        if result['social_media']:
            result['channel'] = (
                self.data_loader.get_social_media_name(result['social_media'])
                or result['social_media']
            )
        else:
            result['channel'] = result['asset_type']

        return result

    def validate_required_fields(self, parsed_data: dict) -> bool:
        """
        Validate that all required fields are present

        Args:
            parsed_data: Parsed filename data

        Returns:
            bool: True if all required fields present

        Raises:
            ValueError: If any required field is missing or empty
        """
        missing = self._missing_required(parsed_data)
        if missing:
            raise ValueError(f"Missing required fields: {', '.join(missing)}")
        return True

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _new_result(original_filename: str, parsed_filename: str) -> dict:
        """Return the result dict with every metadata field set to ''."""
        return {
            'job_number': '',
            'brand_code': '',
            'brand_name': '',
            'subject': '',
            'asset_type': '',
            'asset_type_name': '',
            'duration': '',
            'aspect_ratio': '',
            'spot_version': '',
            'country_code': '',
            'country_name': '',
            'language_code': '',
            'language_name': '',
            'social_media': '',
            'channel': '',
            'tracking_id': '',
            'original_filename': original_filename,
            'parsed_filename': parsed_filename,
        }

    def _missing_required(self, data: dict) -> list:
        """Return the REQUIRED_FIELDS names that are absent or empty in data."""
        return [name for name in self.REQUIRED_FIELDS if not data.get(name)]

    def _suggestion_suffix(self, code: str, kind: str) -> str:
        """Return a ' Did you mean: ...?' suffix, or '' if nothing is similar."""
        similar = self.data_loader.get_similar_codes(code, kind)
        return f" Did you mean: {', '.join(similar)}?" if similar else ""

    def _parse_leading_fields(self, parts: list, result: dict) -> int:
        """Fill job number, brand, subject and asset type from the front of parts.

        Returns the index of the first component that was not consumed.
        The bounds checks keep this helper safe on short lists even though
        parse() already rejects filenames with fewer than 6 components.
        """
        idx = 0

        # 1. Optional job number: only consumed if the first part is 7-10 digits.
        if self.patterns['job_number'].match(parts[idx]):
            result['job_number'] = parts[idx]
            idx += 1

        # 2. Brand (required); validated case-insensitively via upper-casing.
        if idx >= len(parts):
            raise ValueError("Missing brand code")
        brand_code = parts[idx].upper()
        if not self.data_loader.validate_brand_code(brand_code):
            suggestions = self._suggestion_suffix(brand_code, 'brand')
            raise ValueError(f"Invalid brand code: '{brand_code}'.{suggestions}")
        result['brand_code'] = brand_code
        result['brand_name'] = self.data_loader.get_brand_name(brand_code)
        idx += 1

        # 3. Subject (required); taken verbatim.
        if idx >= len(parts):
            raise ValueError("Missing subject title")
        result['subject'] = parts[idx]
        idx += 1

        # 4. Asset type (required); validated case-insensitively.
        if idx >= len(parts):
            raise ValueError("Missing asset type")
        asset_type = parts[idx].upper()
        if not self.data_loader.validate_asset_type(asset_type):
            raise ValueError(f"Invalid asset type: '{asset_type}'")
        result['asset_type'] = asset_type
        result['asset_type_name'] = (
            self.data_loader.get_asset_type_name(asset_type) or asset_type
        )
        idx += 1

        return idx

    def _parse_trailing_fields(self, parts: list, result: dict) -> None:
        """Classify each remaining component by shape and fill result in place.

        The branch order matters: a component is tested against each field in
        turn and the first match wins. Which branches are eligible depends on
        whether ratio, country and language have been seen yet.
        """
        found_ratio = False
        found_country = False
        found_language = False

        for part in parts:
            # Duration: 1-3 digits followed by 'S', only before the ratio.
            if not found_ratio and self.patterns['duration'].match(part):
                result['duration'] = part.rstrip('S')

            # Aspect ratio: digits, 'x' or ':', digits (e.g. "16x9").
            elif not found_ratio and self.patterns['aspect_ratio'].match(part):
                result['aspect_ratio'] = part
                found_ratio = True

            # Spot version: exactly "MST" or "REF", between ratio and country.
            elif (found_ratio and not found_country
                  and self.patterns['spot_version'].match(part)):
                result['spot_version'] = part

            # Country: 2 uppercase letters after the ratio. The part is
            # already uppercase here, so no case normalisation is needed.
            elif (found_ratio and not found_country
                  and len(part) == 2 and part.isalpha() and part.isupper()):
                if not self.data_loader.validate_country_code(part):
                    suggestions = self._suggestion_suffix(part, 'country')
                    raise ValueError(f"Invalid country code: '{part}'.{suggestions}")
                result['country_code'] = part
                result['country_name'] = self.data_loader.get_country_name(part)
                found_country = True

            # Language: 2-3 lowercase letters after the country.
            elif (found_country and not found_language
                  and len(part) in (2, 3) and part.isalpha() and part.islower()):
                if not self.data_loader.validate_language_code(part):
                    raise ValueError(f"Invalid language code: '{part}'")
                result['language_code'] = part
                result['language_name'] = self.data_loader.get_language_name(part)
                found_language = True

            # Social media: 3 uppercase letters after the language. A code the
            # data loader does not recognise is ignored rather than rejected.
            elif (found_language
                  and len(part) == 3 and part.isalpha() and part.isupper()):
                if self.data_loader.validate_social_code(part):
                    result['social_media'] = part

            # Tracking ID: 6 alphanumerics, optionally with a "-N" suffix.
            # Not position-gated; a later match overwrites an earlier one.
            elif self.patterns['tracking_id'].match(part):
                result['tracking_id'] = part

            # Anything else is an unknown component and is skipped.
|