ferrero-opentext/Python-Version/scripts/shared/metadata_extractor_mvp.py
DJP 15eb47fc43 Add CreativeX fields to asset representation if missing from master metadata
Fixes issue where CreativeX score field was not appearing in final upload
because it didn't exist in the master metadata from DAM.

Problem:
- Master metadata from A1→A2 doesn't include CREATIVEX fields (new fields)
- _update_creativex_fields() only UPDATED existing fields
- If field not present, it logged error but didn't add the field
- Result: CREATIVEX score missing from upload, only URL appeared

Solution:
- Check if CREATIVEX Score field exists in mvp_fields
- If NOT found: Create and append field with proper structure
- If found: Update value as before
- Same logic for CREATIVEX URL field

Field Structures Created:

CREATIVEX Score (FERRERO.TAB.FIELD.CREATIVEX):
- Type: MetadataTableField (tabular field)
- Parent: FERRERO.TABULAR.FIELD.PLATFORMRATING
- Data type: INTEGER
- Value structure: {'value': {'value': score}}

CREATIVEX URL (FERRERO.FIELD.CREATIVEX LINK):
- Type: MetadataField (regular field)
- Data type: CHAR
- Value structure: {'value': {'value': url}}

Logging:
- Changed from ERROR to WARNING when field not found
- Logs "adding it now" instead of just error
- Confirms field added with value

Impact:
Both CreativeX fields will now appear in uploads even if master
metadata doesn't have them (common for older campaigns downloaded
before CreativeX integration).

Testing:
Run with --dryrun to verify both CREATIVEX fields in JSON output.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 12:44:17 -05:00

465 lines
20 KiB
Python

"""
Metadata Extractor MVP - Extract MVP fields from master metadata
Ported from PHP MetadataExtractorMVP.php
Compatible with Python 3.6+
"""
import logging
from shared.config_loader import load_country_code_mappings
logger = logging.getLogger('MetadataExtractorMVP')


class MetadataExtractorMVP:
    """
    Build the reduced ("MVP") metadata payload for an asset upload.

    The extractor selects the configured MVP fields from a master asset's
    metadata, overrides some of them from the parsed filename, fills in
    missing fields from configured defaults and, optionally, adds the
    CreativeX score/link taken from Box metadata.
    """

    # DAM field IDs referenced in more than one method
    ASSET_NAME_FIELD_ID = 'ARTESIA.FIELD.ASSET NAME'
    DESCRIPTION_FIELD_ID = 'ARTESIA.FIELD.ASSET DESCRIPTION'
    COUNTRY_FIELD_ID = 'FERRERO.FIELD.COUNTRY'
    STATE_FIELD_ID = 'FERRERO.FIELD.STATE'
    CREATIVEX_SCORE_FIELD_ID = 'FERRERO.TAB.FIELD.CREATIVEX'  # Platform > Rating (%)
    CREATIVEX_URL_FIELD_ID = 'FERRERO.FIELD.CREATIVEX LINK'   # CreativeX Hyperlink

    def __init__(self, field_mappings):
        """
        Initialize with field mappings from config

        Args:
            field_mappings: dict from field_mappings.yaml; must contain
                'mvp_fields', may contain 'filename_updates',
                'forced_values' and 'defaults'
        """
        self.mvp_field_ids = field_mappings['mvp_fields']
        self.filename_updates = field_mappings.get('filename_updates', {})
        self.forced_values = field_mappings.get('forced_values', {})
        self.defaults = field_mappings.get('defaults', {})

        # Load country code mappings (ISO -> DAM codes). Normalise a falsy
        # result to an empty dict so membership tests never hit None.
        self.country_mappings = load_country_code_mappings() or {}
        if self.country_mappings:
            logger.info("Loaded {} country code mappings (ISO->DAM)".format(len(self.country_mappings)))

    def extract_mvp_fields(self, master_metadata):
        """
        Extract only MVP fields from full master metadata

        The returned field dicts are deep copies, so later in-place updates
        never modify the caller's master metadata.

        Args:
            master_metadata: Complete DAM asset metadata; fields are looked
                up under metadata.metadata_element_list

        Returns:
            List of MVP field objects (copies)
        """
        import copy

        # Navigate to metadata structure: metadata.metadata_element_list
        metadata_list = []
        if isinstance(master_metadata, dict):
            metadata = master_metadata.get('metadata')
            if isinstance(metadata, dict) and 'metadata_element_list' in metadata:
                metadata_list = metadata['metadata_element_list'] or []
                logger.info("Using master_metadata['metadata']['metadata_element_list']")

        logger.info("Searching through {} categories for MVP fields".format(len(metadata_list)))

        extracted_fields = []
        found_field_ids = set()
        for item in metadata_list:
            if not isinstance(item, dict):
                continue
            if 'metadata_element_list' in item:
                # Category with nested fields
                for field in item['metadata_element_list'] or []:
                    if not isinstance(field, dict):
                        continue
                    field_id = field.get('id')
                    if field_id in self.mvp_field_ids:
                        extracted_fields.append(copy.deepcopy(field))
                        found_field_ids.add(field_id)
                        logger.debug("Found MVP field: {}".format(field_id))
            elif 'id' in item and item['id'] in self.mvp_field_ids:
                # Direct field
                extracted_fields.append(copy.deepcopy(item))
                found_field_ids.add(item['id'])
                logger.debug("Found direct MVP field: {}".format(item['id']))

        # Log results (only the first five missing IDs are listed)
        missing = [f for f in self.mvp_field_ids if f not in found_field_ids]
        logger.info("Found {}/{} MVP fields".format(len(found_field_ids), len(self.mvp_field_ids)))
        if missing:
            logger.info("Missing fields: {}".format(', '.join(missing[:5])))
        return extracted_fields

    def build_mvp_asset_representation(self, master_metadata, clean_filename, parsed_filename,
                                       box_metadata=None, tracking_mode='full'):
        """
        Build asset representation with MVP fields + updates from filename

        Args:
            master_metadata: Full master asset metadata
            clean_filename: Clean filename (stripped)
            parsed_filename: Parsed V2 filename dict
            box_metadata: Optional Box metadata
            tracking_mode: 'full' (inherit all metadata) or 'folder_only' (only use folder)

        Returns:
            Asset representation dict ready for upload

        Raises:
            ValueError: if tracking_mode is neither 'full' nor 'folder_only'
        """
        if tracking_mode == 'full':
            # FULL INHERITANCE MODE - Standard behavior
            logger.info("Full inheritance mode - using master metadata")
            mvp_fields = self.extract_mvp_fields(master_metadata)
            # Update fields from filename and forced values
            mvp_fields = self._update_fields(mvp_fields, clean_filename, parsed_filename)
        elif tracking_mode == 'folder_only':
            # FOLDER ONLY MODE - New asset, only use upload folder
            logger.info("Folder-only mode (-N suffix) - building metadata from filename only")
            logger.warning("Note: Upload folder comes from master, all other metadata from filename")
            mvp_fields = self._build_fields_from_filename(parsed_filename, clean_filename)
        else:
            # Without this branch mvp_fields would be unbound below.
            raise ValueError(
                "Unknown tracking_mode: {!r} (expected 'full' or 'folder_only')".format(tracking_mode)
            )

        # Add missing MVP fields with defaults (both modes)
        mvp_fields = self._add_missing_fields(mvp_fields, parsed_filename)

        # Update CreativeX fields from Box metadata if provided
        if box_metadata:
            mvp_fields = self._update_creativex_fields(mvp_fields, box_metadata)

        asset_rep = {
            'asset_resource': {
                'asset': {
                    'metadata': {
                        'metadata_element_list': mvp_fields
                    },
                    'metadata_model_id': 'ECOMMERCE',
                    'security_policy_list': [
                        {'id': 1594}
                    ]
                }
            }
        }
        logger.info("Built MVP asset representation with {} fields".format(len(mvp_fields)))
        return asset_rep

    def _update_fields(self, mvp_fields, clean_filename, parsed_filename):
        """
        Update specific fields from filename and forced values

        ASSET NAME is set to clean_filename, DESCRIPTION to the parsed
        subject_title (when present), COUNTRY is mapped from ISO to DAM code
        and STATE is forced to 'Local'. Fields are modified in place.
        """
        subject_title = parsed_filename.get('subject_title') if parsed_filename else None

        for field in mvp_fields:
            field_id = field.get('id')
            if field_id == self.ASSET_NAME_FIELD_ID:
                self._set_field_value(field, clean_filename)
                logger.info("Updated ASSET NAME: {}".format(clean_filename))
            elif field_id == self.DESCRIPTION_FIELD_ID:
                if subject_title:
                    self._set_field_value(field, subject_title)
                    logger.info("Updated DESCRIPTION: {}".format(subject_title))
            elif field_id == self.COUNTRY_FIELD_ID:
                # Apply country code mapping (ISO -> DAM codes)
                current_value = self._get_field_value(field)
                if current_value:
                    mapped_value = self._map_country_code(current_value)
                    if mapped_value != current_value:
                        self._set_field_value(field, mapped_value)
                        logger.info("Mapped country code: {} -> {}".format(current_value, mapped_value))
            elif field_id == self.STATE_FIELD_ID:
                # Force STATE to Local
                self._set_field_value(field, 'Local')
                logger.info("Set STATE to Local")
        return mvp_fields

    @staticmethod
    def _domain_table_field(field_id, parent_table_id, value):
        """Return a tabular (MetadataTableField) field holding one domain value."""
        return {
            'id': field_id,
            'parent_table_id': parent_table_id,
            'type': 'com.artesia.metadata.MetadataTableField',
            'values': [
                {
                    'cascading_domain_value': False,
                    'domain_value': True,
                    'value': {
                        'field_value': {
                            'type': 'string',
                            'value': value
                        },
                        'type': 'com.artesia.metadata.DomainValue'
                    }
                }
            ]
        }

    def _add_missing_fields(self, mvp_fields, parsed_filename):
        """
        Add missing MVP fields from filename or defaults

        MAIN_LANGUAGES is added from the parsed language_code (upper-cased)
        when absent; every configured default whose field ID is absent is
        appended as well. The list is extended in place and returned.
        """
        present_ids = {f.get('id') for f in mvp_fields}

        # Add MAIN_LANGUAGES if missing
        language_code = parsed_filename.get('language_code') if parsed_filename else None
        if 'MAIN_LANGUAGES' not in present_ids and language_code:
            language = language_code.upper()
            logger.info("Adding MAIN_LANGUAGES: {}".format(language))
            mvp_fields.append(self._domain_table_field(
                'MAIN_LANGUAGES', 'FERRERO.TABULAR.FIELD.MAIN LANGUAGES', language))
            present_ids.add('MAIN_LANGUAGES')

        # Add other missing fields with defaults
        for field_id, default_value in (self.defaults or {}).items():
            if field_id in present_ids:
                continue
            logger.info("Adding {} with default: {}".format(field_id, default_value))
            # A field is treated as tabular when its ID contains 'TABULAR'
            # or it is one of the explicitly listed tabular field IDs.
            is_tabular = 'TABULAR' in field_id or field_id in (
                'FERRERO.FIELD.ASSETCOMPLIANCE', 'MARKETING_TAG'
            )
            if is_tabular:
                mvp_fields.append(self._domain_table_field(
                    field_id,
                    'FERRERO.TABULAR.FIELD.' + field_id.split('.')[-1],
                    default_value))
            else:
                mvp_fields.append({
                    'id': field_id,
                    'type': 'com.artesia.metadata.MetadataField',
                    'value': {
                        'cascading_domain_value': False,
                        'domain_value': True,
                        'value': {
                            'type': 'string',
                            'value': default_value
                        }
                    }
                })
        return mvp_fields

    def _map_country_code(self, iso_code):
        """
        Map ISO country code to DAM country code

        Args:
            iso_code: ISO 3166-1 Alpha-2 code (e.g., 'BD', 'DE')

        Returns:
            str: DAM country code (e.g., 'BG' for BD, 'DE' for DE). The
            upper-cased input is returned when no mapping exists; falsy or
            non-string input is returned unchanged.
        """
        if not iso_code or not isinstance(iso_code, str):
            return iso_code

        iso_upper = iso_code.upper()
        mappings = self.country_mappings or {}
        if iso_upper in mappings:
            dam_code = mappings[iso_upper]
            if dam_code != iso_upper:
                logger.info("Country code mapping: {} (ISO) -> {} (DAM)".format(iso_upper, dam_code))
            return dam_code

        # No mapping found, use ISO code as-is
        logger.debug("No mapping for country code: {} (using as-is)".format(iso_upper))
        return iso_upper

    @staticmethod
    def _plain_field(field_id, value):
        """Return a minimal field dict: {'id': ..., 'value': {'value': {'value': ...}}}."""
        return {
            'id': field_id,
            'value': {'value': {'value': value}}
        }

    def _build_fields_from_filename(self, parsed_filename, clean_filename):
        """
        Build ALL metadata fields from parsed filename

        Used in folder-only mode (tracking ID with -N suffix)
        Note: Uses codes directly for now. Can add lookup tables later
        for brand_code->brand_name, country_code->country_name, etc.

        Args:
            parsed_filename: Parsed filename dict; None is treated as empty
            clean_filename: Value for the ASSET NAME field

        Returns:
            List of field dicts; ASSET NAME and STATE are always present
        """
        parsed = parsed_filename or {}

        # (DAM field ID, parsed-filename key, optional value transform)
        optional_fields = (
            (self.DESCRIPTION_FIELD_ID, 'subject_title', None),
            ('FERRERO.FIELD.BRAND', 'brand_code', None),
            (self.COUNTRY_FIELD_ID, 'country_code', self._map_country_code),  # ISO -> DAM
            ('FERRERO.FIELD.LANGUAGES', 'language_code', None),
            ('FERRERO.FIELD.ASSET TYPE', 'asset_type', None),
        )

        fields = [self._plain_field(self.ASSET_NAME_FIELD_ID, clean_filename)]
        for field_id, key, transform in optional_fields:
            raw_value = parsed.get(key)
            if not raw_value:
                continue
            value = transform(raw_value) if transform else raw_value
            fields.append(self._plain_field(field_id, value))

        # STATE (force to Local)
        fields.append(self._plain_field(self.STATE_FIELD_ID, 'Local'))

        logger.info("Built {} fields from filename (folder-only mode)".format(len(fields)))
        return fields

    @staticmethod
    def _value_wrapper(field):
        """
        Return the dict that wraps the field's value, or None.

        For a field with a 'value' key this is field['value'] (when it is a
        dict); otherwise, for a field with a non-empty 'values' list, it is
        the first row of that list (when it is a dict).
        """
        if 'value' in field:
            wrapper = field['value']
            return wrapper if isinstance(wrapper, dict) else None
        rows = field.get('values')
        if isinstance(rows, list) and rows and isinstance(rows[0], dict):
            return rows[0]
        return None

    def _get_field_value(self, field):
        """
        Get field value handling different structures

        Reads wrapper['value']['value'] or
        wrapper['value']['field_value']['value'], where wrapper is
        field['value'] or the first entry of field['values'].
        Returns None when none of these shapes is present.
        """
        wrapper = self._value_wrapper(field)
        if wrapper is None:
            return None
        inner = wrapper.get('value')
        if not isinstance(inner, dict):
            return None
        if 'value' in inner:
            return inner['value']
        field_value = inner.get('field_value')
        if isinstance(field_value, dict):
            return field_value.get('value')
        return None

    @staticmethod
    def _dump_for_log(value):
        """Render a value for log output; falsy values are shown as 'None'."""
        import json
        # default=str keeps logging from raising on non-JSON-serializable values
        return json.dumps(value, indent=2, default=str) if value else 'None'

    @staticmethod
    def _write_into_wrapper(wrapper, value, label):
        """
        Write value below wrapper['value'] when that entry is a dict.

        Returns True when the value was written, False when wrapper['value']
        is missing or not a dict (the caller then creates the structure).
        """
        inner = wrapper.get('value')
        if not isinstance(inner, dict):
            return False
        if 'value' in inner:
            inner['value'] = value
            logger.info("Set via {}['value']['value']".format(label))
        elif isinstance(inner.get('field_value'), dict):
            inner['field_value']['value'] = value
            logger.info("Set via {}['value']['field_value']['value']".format(label))
        else:
            # Nested dict has no usable slot: replace it with a value structure
            wrapper['value'] = {'value': value}
            logger.info("Created {}['value'] = {{'value': {}}}".format(label, value))
        return True

    def _set_field_value(self, field, value):
        """
        Set field value handling different structures

        An existing value slot is updated in place. A field that only has a
        'values' list (tabular structure) is updated in its first row. When
        no usable structure exists, field['value'] is (re)created as
        {'value': {'value': value}} so the call never silently does nothing.
        """
        field_id = field.get('id', 'UNKNOWN')
        logger.info("_set_field_value called for: {} with value: {}".format(field_id, value))
        logger.info("Current field['value']: {}".format(self._dump_for_log(field.get('value'))))

        if 'value' not in field and isinstance(field.get('values'), list):
            # Tabular structure: the value lives in the first row of 'values'
            rows = field['values']
            if rows and isinstance(rows[0], dict):
                if not self._write_into_wrapper(rows[0], value, "field['values'][0]"):
                    rows[0]['value'] = {'value': value}
                    logger.info("Created field['values'][0]['value'] = {{'value': {}}}".format(value))
            else:
                new_row = {'value': {'value': value}}
                if rows:
                    rows[0] = new_row
                else:
                    rows.append(new_row)
                logger.info("Created field['values'][0] = {{'value': {{'value': {}}}}}".format(value))
            logger.info("After setting, field['values']: {}".format(self._dump_for_log(field.get('values'))))
            return

        wrapper = field.get('value')
        if not (isinstance(wrapper, dict) and self._write_into_wrapper(wrapper, value, "field['value']")):
            # Missing, non-dict or slot-less value: create the full structure
            field['value'] = {'value': {'value': value}}
            logger.info("Created field['value'] = {{'value': {{'value': {}}}}}".format(value))
        logger.info("After setting, field['value']: {}".format(self._dump_for_log(field.get('value'))))

    def _upsert_creativex_field(self, mvp_fields, new_field, value, label, set_label):
        """
        Set value on the first field whose id equals new_field['id'], or
        append new_field when no such field exists.

        label / set_label only select the wording of the log messages.
        """
        field_id = new_field['id']
        logger.info("Updating CreativeX {} from database: {}".format(label, value))

        existing = next((f for f in mvp_fields if f.get('id') == field_id), None)
        if existing is None:
            logger.warning("CREATIVEX {} field not found in master metadata - adding it now".format(label))
            mvp_fields.append(new_field)
            logger.info("Added CREATIVEX {} field with value: {}".format(label, value))
            return

        try:
            # Log field structure before setting
            current = existing.get('value')
            logger.info("CREATIVEX {} field structure: {}".format(
                label, current.keys() if isinstance(current, dict) else 'not a dict'))
            self._set_field_value(existing, value)
            logger.info("Set CREATIVEX {} to: {}".format(set_label, value))
        except Exception as e:
            # Best effort: a failure here must not abort the whole upload build
            logger.exception("Failed to set CreativeX {}: {}".format(label, str(e)))

    def _update_creativex_fields(self, mvp_fields, box_metadata):
        """
        Update CreativeX fields from Box metadata template

        Existing fields are updated; fields absent from mvp_fields are
        created and appended. A score of 0 is a valid value and is kept;
        only None / empty string are treated as "no score".

        Args:
            mvp_fields: List of MVP fields
            box_metadata: dict with 'score' and 'url' from Box template

        Returns:
            Updated mvp_fields list
        """
        if not box_metadata:
            return mvp_fields

        score = box_metadata.get('score')
        if score is not None and score != '':
            # CreativeX Score (tabular field structure)
            self._upsert_creativex_field(mvp_fields, {
                'id': self.CREATIVEX_SCORE_FIELD_ID,
                'name': 'Rating (%)',
                'type': 'com.artesia.metadata.MetadataTableField',
                'parent_table_id': 'FERRERO.TABULAR.FIELD.PLATFORMRATING',
                'value': {
                    'value': {
                        'value': score
                    }
                },
                'data_type': 'INTEGER',
                'required': False
            }, score, 'Score', 'Score')

        url = box_metadata.get('url')
        if url:
            # CreativeX URL (text field)
            self._upsert_creativex_field(mvp_fields, {
                'id': self.CREATIVEX_URL_FIELD_ID,
                'name': 'CreativeX Hyperlink',
                'type': 'com.artesia.metadata.MetadataField',
                'value': {
                    'value': {
                        'value': url
                    }
                },
                'data_type': 'CHAR',
                'required': False
            }, url, 'URL', 'LINK')

        return mvp_fields