""" Filename Parser - V2.1 Naming Convention Parser Updated November 2025 with new field positions Compatible with Python 3.6+ """ import re import logging logger = logging.getLogger('FilenameParser') class FilenameParser: """ Parse V2.1 naming convention filenames: [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[DUR]_[RATIO]_[SPOT]_[COUNTRY]_[LANG]_[SOCIAL]_[TRACKING] Example: 1234567_RAF_ME-MOMENT_OLV_6S_1x1_REF_GL_it_IGF_pOiJ9s PPR Environment: Supports multiple tracking IDs (e.g., pOiJ9s+BqB8vo+laRJo0) PROD Environment: Single tracking ID only (backward compatible) """ # Known social media platform codes (from Ferrero naming tool data.json) SOCIAL_MEDIA_CODES = [ # Facebook 'FBD', 'FGF', 'FBR', 'FRO', 'FBS', 'FBF', 'FBP', 'FIA', 'FIV', 'FMP', 'FPF', 'FRC', 'FSE', 'FSS', 'FSV', 'FUK', 'FVF', # Instagram 'IGF', 'IGE', 'IGG', 'IGT', 'IPF', 'IPR', 'IGR', 'IGO', 'IGS', 'ISH', 'IST', # Audience Network 'ANC', 'ANI', 'ANR', # Messenger 'MSI', 'MSS', # YouTube 'YTA', 'YTB', 'YTS', # Other platforms 'AMZ', 'DV3', 'GOO', 'PIN', 'SNA', 'SPT', 'TIK', 'TWI', 'VOD', ] def __init__(self, dam_base_url=None): """ Initialize parser with optional environment detection Args: dam_base_url: DAM base URL for environment detection (optional) """ self.dam_base_url = dam_base_url self.is_ppr = self._is_ppr_environment() def _is_ppr_environment(self): """Check if running in PPR environment""" if not self.dam_base_url: return False return 'ppr.dam.ferrero.com' in self.dam_base_url.lower() def parse_filename(self, filename): """ Parse V2.1 filename into components New Structure (V2.1): [JOB]_[BRAND]_[SUBJECT]_[ASSET]_[DUR]_[RATIO]_[SPOT]_[COUNTRY]_[LANG]_[SOCIAL]_[TRACKING] Args: filename: Filename to parse (with or without extension) Returns: dict with parsed components and validation results """ validation_errors = [] warnings = [] # Remove extension if '.' in filename: filename_without_ext, extension = filename.rsplit('.', 1) extension = '.' + extension else: filename_without_ext = filename extension = '' # Split by underscore parts = filename_without_ext.split('_') # Minimum 7 parts: JOB + BRAND + SUBJECT + ASSET + RATIO + COUNTRY + LANG if len(parts) < 7: validation_errors.append("Invalid structure: expected min 7 parts, got {}".format(len(parts))) parsed = { 'original_filename': filename, 'filename_without_ext': filename_without_ext, 'extension': extension, 'omg_job_number': None, 'brand_code': None, 'subject_title': None, 'asset_type': None, 'seconds': None, 'aspect_ratio': None, 'spot_version': None, 'country_code': None, 'language_code': None, 'social_media_version': None, 'tracking_id': None, 'tracking_mode': 'full', 'tracking_id_with_suffix': None, 'has_master': False, 'validation_errors': [], 'warnings': [], 'is_valid': False } if len(parts) < 7: parsed['validation_errors'] = validation_errors return parsed index = 0 # =================================================================== # FIXED POSITIONS (Always in these positions) # =================================================================== # 1. OMG Job Number (digits only, max 10) if index < len(parts) and parts[index].isdigit(): omg = parts[index] if len(omg) > 10: validation_errors.append("OMG Job Number too long: {} (max 10)".format(omg)) else: parsed['omg_job_number'] = omg index += 1 else: if index < len(parts): validation_errors.append("OMG Job Number missing or invalid: {}".format(parts[index])) index += 1 # 2. Brand Code (2-5 chars, uppercase) if index < len(parts): brand = parts[index].upper() if 2 <= len(brand) <= 5: parsed['brand_code'] = brand else: validation_errors.append("Brand Code invalid: {} (must be 2-5 chars)".format(brand)) index += 1 # 3. Subject Title (NEW POSITION - was 5, now 3) if index < len(parts): subject = parts[index] if len(subject) > 15: warnings.append("Subject title exceeds 15 chars: {}".format(subject)) parsed['subject_title'] = subject index += 1 # 4. Asset Type (NEW POSITION - was 6, now 4) if index < len(parts): asset = parts[index].upper() if len(asset) == 3: parsed['asset_type'] = asset else: validation_errors.append("Asset Type invalid: {} (must be 3 chars)".format(asset)) index += 1 # =================================================================== # VARIABLE/OPTIONAL POSITIONS (Pattern-based detection) # =================================================================== # Now parse remaining parts using pattern detection # Fields can appear in this order but some may be missing: # [DURATION] [RATIO] [SPOT] [COUNTRY] [LANG] [SOCIAL] [TRACKING] found_ratio = False found_country = False found_language = False while index < len(parts): part = parts[index] # Duration: Digits + 'S' (e.g., "6S", "30S") - BEFORE ratio if not found_ratio and re.match(r'^\d+S$', part, re.IGNORECASE): parsed['seconds'] = part[:-1] # Remove 'S' logger.debug("Found duration: {}".format(part)) index += 1 # Aspect Ratio: Contains 'x' or ':' (e.g., "16x9", "1x1") elif not found_ratio and ('x' in part.lower() or ':' in part): parsed['aspect_ratio'] = part found_ratio = True logger.debug("Found aspect ratio: {}".format(part)) index += 1 # Spot Version: Exactly "MST" or "REF" - AFTER ratio, BEFORE country elif found_ratio and not found_country and part.upper() in ['MST', 'REF']: parsed['spot_version'] = part.upper() parsed['has_master'] = (part.upper() == 'MST') logger.debug("Found spot version: {}".format(part)) index += 1 # Country Code: 2 uppercase alpha - AFTER ratio/spot elif found_ratio and not found_country and len(part) == 2 and part.isalpha() and part.isupper(): parsed['country_code'] = part.upper() found_country = True logger.debug("Found country: {}".format(part)) index += 1 # Language Code: 2-3 lowercase alpha - AFTER country elif found_country and not found_language and len(part) in [2, 3] and part.isalpha() and part.islower(): parsed['language_code'] = part.lower() found_language = True logger.debug("Found language: {}".format(part)) index += 1 # Social Media: One of known codes - AFTER language elif found_language and part.upper() in self.SOCIAL_MEDIA_CODES: parsed['social_media_version'] = part.upper() logger.debug("Found social media: {}".format(part)) index += 1 # Tracking ID(s): 6 alphanumeric, optionally with -N suffix # PPR: Supports multiple IDs (e.g., "BqB8vo+SfUQ7m+laRJo0") # PROD: Single ID only (backward compatible) elif re.match(r'^[a-zA-Z0-9]{6}(-N)?(\+[a-zA-Z0-9]{6}(-N)?)*$', part): # Check if multiple IDs provided if '+' in part and self.is_ppr: # PPR ONLY: Parse multiple tracking IDs tracking_ids = [] tracking_modes = [] tracking_ids_with_suffix = [] id_parts = part.split('+') logger.info("PPR Environment - Multiple tracking IDs detected: {}".format(len(id_parts))) for tracking in id_parts: tracking_mode = 'full' base_tracking_id = tracking if tracking.endswith('-N'): tracking_mode = 'folder_only' base_tracking_id = tracking[:-2] logger.info("Folder-only tracking ID: {} (base: {})".format(tracking, base_tracking_id)) tracking_ids.append(base_tracking_id) tracking_modes.append(tracking_mode) tracking_ids_with_suffix.append(tracking) # Store primary (first) for backward compatibility parsed['tracking_id'] = tracking_ids[0] parsed['tracking_mode'] = tracking_modes[0] parsed['tracking_id_with_suffix'] = tracking_ids_with_suffix[0] # Store all IDs for multi-master support parsed['tracking_ids'] = tracking_ids parsed['tracking_modes'] = tracking_modes parsed['tracking_ids_with_suffix'] = tracking_ids_with_suffix parsed['has_multiple_masters'] = True logger.info("Parsed {} tracking IDs: {}".format(len(tracking_ids), ', '.join(tracking_ids))) else: # PROD or Single ID: Use only first tracking ID if '+' in part: logger.warning("PROD Environment - Multiple tracking IDs not supported, using first ID only") part = part.split('+')[0] # Take only first ID tracking = part tracking_mode = 'full' base_tracking_id = tracking if tracking.endswith('-N'): tracking_mode = 'folder_only' base_tracking_id = tracking[:-2] logger.info("Folder-only tracking ID: {} (base: {})".format(tracking, base_tracking_id)) parsed['tracking_id'] = base_tracking_id parsed['tracking_mode'] = tracking_mode parsed['tracking_id_with_suffix'] = tracking parsed['tracking_ids'] = [base_tracking_id] # Single item list for compatibility parsed['has_multiple_masters'] = False logger.debug("Found tracking ID: {}".format(tracking)) index += 1 # Unknown part - could be aspect ratio fallback elif not found_ratio: # Might be aspect ratio in unexpected format parsed['aspect_ratio'] = part found_ratio = True warnings.append("Aspect ratio in unexpected format: {}".format(part)) index += 1 else: # Unknown component - skip it warnings.append("Unknown component skipped: {}".format(part)) index += 1 # Set validation status parsed['validation_errors'] = validation_errors parsed['warnings'] = warnings parsed['is_valid'] = len(validation_errors) == 0 return parsed def strip_upload_components(self, filename): """ Strip OMG Job Number from front and Tracking ID from back of filename. Keeps everything else as-is (including social media codes, DV3, etc.) Args: filename: Original filename Returns: Clean filename for upload (no job number, no tracking ID) Example: Input: 6662777_NUT_XMAS-SHARETHELOVE-GLAS_OLV_6S_16X9_PL_pl_YTA_EvQJrM.mp4 Output: NUT_XMAS-SHARETHELOVE-GLAS_OLV_6S_16X9_PL_pl_YTA.mp4 """ import os base, ext = os.path.splitext(filename) parts = base.split('_') if len(parts) < 3: return filename # Strip job number from front (digits only) if parts[0].isdigit(): parts = parts[1:] # Strip tracking ID(s) from back (6 alphanumeric chars, optionally with +joined IDs or -N suffix) if parts and re.match(r'^[a-zA-Z0-9]{6}(-N)?(\+[a-zA-Z0-9]{6}(-N)?)*$', parts[-1]): parts = parts[:-1] return '_'.join(parts) + ext