ferrero-opentext/Python-Version/scripts/shared/filename_parser.py
nickviljoen 444ac7ac6d Fix: PPR multiple master asset IDs now correctly populate MASTERASSETIDS field
Fixed issue where only 1 of 3 master asset IDs was being added to the
FERRERO.MASTERASSETIDS tabular field. The bug was caused by calling
_add_master_asset_id_field() before _add_master_asset_ids_field(),
which created the field with a single value and blocked the multi-value
method from adding all IDs.

Changes:
- metadata_extractor_mvp.py: Prioritize master_opentext_ids parameter
  using if/elif logic to prevent single-ID method from blocking multi-ID
- a2_to_a3_upload_polling.py: Load multiple master assets in PPR mode
- filename_parser.py: Parse multiple tracking IDs (e.g., ID1+ID2+ID3)
- query_db.py: Fix .env loading path
- Added documentation and test files for multiple master asset IDs

Tested in PPR with 3 tracking IDs (BqB8vo+SfUQ7m+laRJo0) - all 3 master
asset IDs now correctly appear in the metadata structure.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-03 21:02:09 +02:00

331 lines
13 KiB
Python

"""
Filename Parser - V2.1 Naming Convention Parser
Updated November 2025 with new field positions
Compatible with Python 3.6+
"""
import re
import logging
logger = logging.getLogger('FilenameParser')


class FilenameParser:
    """
    Parse V2.1 naming convention filenames.

    Structure:
        [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[DUR]_[RATIO]_[SPOT]_[COUNTRY]_[LANG]_[SOCIAL]_[TRACKING]

    Example:
        1234567_RAF_ME-MOMENT_OLV_6S_1x1_REF_GL_it_IGF_pOiJ9s

    PPR Environment: Supports multiple tracking IDs (e.g., pOiJ9s+BqB8vo+laRJo0)
    PROD Environment: Single tracking ID only (backward compatible)
    """

    # Known social media platform codes
    SOCIAL_MEDIA_CODES = ['FBP', 'FBR', 'IGF', 'IGR']  # Expandable

    # Accepted spot-version markers; 'MST' additionally sets has_master.
    SPOT_VERSIONS = ('MST', 'REF')

    # Minimum 7 parts: JOB + BRAND + SUBJECT + ASSET + RATIO + COUNTRY + LANG
    MIN_PARTS = 7

    # JOB, BRAND, SUBJECT and ASSET always occupy the first four positions.
    FIXED_FIELD_COUNT = 4

    # Suffix that switches a tracking ID to 'folder_only' mode.
    _FOLDER_ONLY_SUFFIX = '-N'

    # Compiled once here instead of on every loop iteration.
    _DURATION_RE = re.compile(r'^\d+S$', re.IGNORECASE)
    _TRACKING_RE = re.compile(r'^[a-zA-Z0-9]{6}(-N)?(\+[a-zA-Z0-9]{6}(-N)?)*$')

    def __init__(self, dam_base_url=None):
        """
        Initialize parser with optional environment detection.

        Args:
            dam_base_url: DAM base URL for environment detection (optional)
        """
        self.dam_base_url = dam_base_url
        self.is_ppr = self._is_ppr_environment()

    def _is_ppr_environment(self):
        """Return True when the DAM base URL contains 'ppr.dam.ferrero.com'."""
        if not self.dam_base_url:
            return False
        return 'ppr.dam.ferrero.com' in self.dam_base_url.lower()

    def parse_filename(self, filename):
        """
        Parse a V2.1 filename into its components.

        Structure (V2.1):
            [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[DUR]_[RATIO]_[SPOT]_[COUNTRY]_[LANG]_[SOCIAL]_[TRACKING]

        The first four fields are positional; the remaining ones are
        recognised by pattern, so optional fields may be absent.

        Args:
            filename: Filename to parse (with or without extension)

        Returns:
            dict with the parsed components and validation results. Every
            key is always present, including on the early return for a
            filename with fewer than MIN_PARTS parts:
              - 'original_filename', 'filename_without_ext', 'extension'
              - 'omg_job_number', 'brand_code', 'subject_title', 'asset_type'
              - 'seconds', 'aspect_ratio', 'spot_version', 'has_master'
              - 'country_code', 'language_code', 'social_media_version'
              - 'tracking_id', 'tracking_mode', 'tracking_id_with_suffix'
                (primary / first tracking ID)
              - 'tracking_ids', 'tracking_modes', 'tracking_ids_with_suffix'
                (parallel lists, empty when no tracking ID was found)
              - 'has_multiple_masters'
              - 'validation_errors', 'warnings', 'is_valid'
        """
        validation_errors = []
        warnings = []

        # Remove extension (everything after the last dot)
        if '.' in filename:
            filename_without_ext, extension = filename.rsplit('.', 1)
            extension = '.' + extension
        else:
            filename_without_ext = filename
            extension = ''

        # Split by underscore
        parts = filename_without_ext.split('_')
        parsed = self._empty_result(filename, filename_without_ext, extension)

        if len(parts) < self.MIN_PARTS:
            validation_errors.append(
                "Invalid structure: expected min 7 parts, got {}".format(len(parts)))
            parsed['validation_errors'] = validation_errors
            return parsed

        self._parse_fixed_fields(parts, parsed, validation_errors, warnings)
        self._parse_variable_fields(parts[self.FIXED_FIELD_COUNT:], parsed, warnings)

        # Set validation status
        parsed['validation_errors'] = validation_errors
        parsed['warnings'] = warnings
        parsed['is_valid'] = len(validation_errors) == 0
        return parsed

    @staticmethod
    def _empty_result(filename, filename_without_ext, extension):
        """Build the result dict with every key present and unset fields defaulted."""
        return {
            'original_filename': filename,
            'filename_without_ext': filename_without_ext,
            'extension': extension,
            'omg_job_number': None,
            'brand_code': None,
            'subject_title': None,
            'asset_type': None,
            'seconds': None,
            'aspect_ratio': None,
            'spot_version': None,
            'country_code': None,
            'language_code': None,
            'social_media_version': None,
            'tracking_id': None,
            'tracking_mode': 'full',
            'tracking_id_with_suffix': None,
            # Always present so callers can index them without a KeyError.
            'tracking_ids': [],
            'tracking_modes': [],
            'tracking_ids_with_suffix': [],
            'has_multiple_masters': False,
            'has_master': False,
            'validation_errors': [],
            'warnings': [],
            'is_valid': False
        }

    @staticmethod
    def _parse_fixed_fields(parts, parsed, validation_errors, warnings):
        """Fill JOB, BRAND, SUBJECT and ASSET from the first four parts.

        The caller guarantees that ``parts`` holds at least MIN_PARTS items.
        """
        # 1. OMG Job Number (digits only, max 10)
        job_number = parts[0]
        if job_number.isdigit():
            if len(job_number) > 10:
                validation_errors.append(
                    "OMG Job Number too long: {} (max 10)".format(job_number))
            else:
                parsed['omg_job_number'] = job_number
        else:
            validation_errors.append(
                "OMG Job Number missing or invalid: {}".format(job_number))

        # 2. Brand Code (2-5 chars, uppercase)
        brand = parts[1].upper()
        if 2 <= len(brand) <= 5:
            parsed['brand_code'] = brand
        else:
            validation_errors.append(
                "Brand Code invalid: {} (must be 2-5 chars)".format(brand))

        # 3. Subject Title - an over-long title is only a warning, it is still stored
        subject = parts[2]
        if len(subject) > 15:
            warnings.append("Subject title exceeds 15 chars: {}".format(subject))
        parsed['subject_title'] = subject

        # 4. Asset Type (exactly 3 chars, uppercase)
        asset = parts[3].upper()
        if len(asset) == 3:
            parsed['asset_type'] = asset
        else:
            validation_errors.append(
                "Asset Type invalid: {} (must be 3 chars)".format(asset))

    def _parse_variable_fields(self, remaining, parsed, warnings):
        """Classify the optional trailing parts by pattern.

        Fields are expected in this order, but some may be missing:
        [DURATION] [RATIO] [SPOT] [COUNTRY] [LANG] [SOCIAL] [TRACKING]

        The order of the branches below is significant: each part is claimed
        by the first rule that matches.
        """
        found_ratio = False
        found_country = False
        found_language = False

        for part in remaining:
            upper = part.upper()

            # Duration: Digits + 'S' (e.g., "6S", "30S") - BEFORE ratio
            if not found_ratio and self._DURATION_RE.match(part):
                parsed['seconds'] = part[:-1]  # Remove 'S'
                logger.debug("Found duration: %s", part)

            # Aspect Ratio: Contains 'x' or ':' (e.g., "16x9", "1x1")
            elif not found_ratio and ('x' in part.lower() or ':' in part):
                parsed['aspect_ratio'] = part
                found_ratio = True
                logger.debug("Found aspect ratio: %s", part)

            # Spot Version: Exactly "MST" or "REF" - AFTER ratio, BEFORE country
            elif found_ratio and not found_country and upper in self.SPOT_VERSIONS:
                parsed['spot_version'] = upper
                parsed['has_master'] = (upper == 'MST')
                logger.debug("Found spot version: %s", part)

            # Country Code: 2 uppercase alpha - AFTER ratio/spot
            elif (found_ratio and not found_country and len(part) == 2
                    and part.isalpha() and part.isupper()):
                parsed['country_code'] = upper
                found_country = True
                logger.debug("Found country: %s", part)

            # Language Code: 2-3 lowercase alpha - AFTER country
            elif (found_country and not found_language and len(part) in (2, 3)
                    and part.isalpha() and part.islower()):
                parsed['language_code'] = part.lower()
                found_language = True
                logger.debug("Found language: %s", part)

            # Social Media: One of known codes - AFTER language
            elif found_language and upper in self.SOCIAL_MEDIA_CODES:
                parsed['social_media_version'] = upper
                logger.debug("Found social media: %s", part)

            # Tracking ID(s): 6 alphanumeric, optionally with -N suffix
            elif self._TRACKING_RE.match(part):
                self._store_tracking_ids(part, parsed)

            # Unknown part - could be aspect ratio fallback
            elif not found_ratio:
                parsed['aspect_ratio'] = part
                found_ratio = True
                warnings.append("Aspect ratio in unexpected format: {}".format(part))

            else:
                # Unknown component - skip it
                warnings.append("Unknown component skipped: {}".format(part))

    def _store_tracking_ids(self, part, parsed):
        """Record the tracking ID(s) contained in ``part`` on ``parsed``.

        PPR: all '+'-separated IDs are kept (e.g., "BqB8vo+SfUQ7m+laRJo0").
        PROD: only the first ID is kept; extra IDs are dropped with a
        logged warning (backward compatible).
        """
        tokens = part.split('+')
        if len(tokens) > 1:
            if self.is_ppr:
                logger.info(
                    "PPR Environment - Multiple tracking IDs detected: %s", len(tokens))
            else:
                logger.warning(
                    "PROD Environment - Multiple tracking IDs not supported, using first ID only")
                tokens = tokens[:1]  # Take only first ID

        tracking_ids = []
        tracking_modes = []
        for token in tokens:
            base_id, mode = self._split_tracking_suffix(token)
            tracking_ids.append(base_id)
            tracking_modes.append(mode)

        # Primary (first) ID kept in the scalar keys for backward compatibility
        parsed['tracking_id'] = tracking_ids[0]
        parsed['tracking_mode'] = tracking_modes[0]
        parsed['tracking_id_with_suffix'] = tokens[0]

        # Parallel lists are filled for single IDs too, so both environments
        # return the same schema.
        parsed['tracking_ids'] = tracking_ids
        parsed['tracking_modes'] = tracking_modes
        parsed['tracking_ids_with_suffix'] = list(tokens)
        parsed['has_multiple_masters'] = len(tokens) > 1

        if len(tokens) > 1:
            logger.info(
                "Parsed %s tracking IDs: %s", len(tracking_ids), ', '.join(tracking_ids))
        else:
            logger.debug("Found tracking ID: %s", tokens[0])

    @classmethod
    def _split_tracking_suffix(cls, token):
        """Return ``(base_id, mode)`` for one tracking token.

        A trailing '-N' yields mode 'folder_only' and is removed from the
        base ID; any other token yields mode 'full' unchanged.
        """
        if token.endswith(cls._FOLDER_ONLY_SUFFIX):
            base_id = token[:-len(cls._FOLDER_ONLY_SUFFIX)]
            logger.info("Folder-only tracking ID: %s (base: %s)", token, base_id)
            return base_id, 'folder_only'
        return token, 'full'

    def strip_upload_components(self, filename):
        """
        Strip OMG Job Number and Tracking ID from filename.

        Rebuilds the name from the parsed fields in V2.1 order:
        [BRAND]_[SUBJECT]_[ASSET]_[DUR]_[RATIO]_[SPOT]_[COUNTRY]_[LANG]_[SOCIAL]
        Components that the parser did not recognise are not carried over.

        Args:
            filename: Original filename

        Returns:
            Clean filename for upload (no job number, no tracking ID). If
            no component could be parsed (e.g. the name does not follow the
            convention), the original filename is returned unchanged rather
            than a bare extension.

        Example:
            Input: 1234567_RAF_TEST_OLV_6S_1x1_REF_GL_it_IGF_abc123.mp4
            Output: RAF_TEST_OLV_6S_1x1_REF_GL_it_IGF.mp4
        """
        parsed = self.parse_filename(filename)

        seconds = parsed['seconds']
        ordered_fields = [
            parsed['brand_code'],
            parsed['subject_title'],
            parsed['asset_type'],
            seconds + 'S' if seconds else None,  # parser stores digits only
            parsed['aspect_ratio'],
            parsed['spot_version'],
            parsed['country_code'],
            parsed['language_code'],
            parsed['social_media_version'],
        ]
        clean_parts = [value for value in ordered_fields if value]

        # parse_filename never returns an empty dict, so "nothing usable"
        # has to be detected from the extracted parts.
        if not clean_parts:
            return filename

        return '_'.join(clean_parts) + parsed['extension']