ferrero-opentext/Python-Version/scripts/shared/metadata_extractor_mvp.py.backup
nickviljoen f83b4fae3e PPR Environment: Use SIMPLE metadata structure for tabular fields
Key Changes:
- Updated metadata_extractor_mvp.py to use SIMPLE structure for all tabular fields
- All tabular fields now use direct value objects (no MetadataTableFieldRow wrapper)
- MAIN_LANGUAGES, ASSETCOMPLIANCE, MARKETING_TAG, CREATIVEX all use SIMPLE structure
- Master Asset ID field updated to SIMPLE structure
- Date fields now use type 'string' instead of 'long'
- Matches DAM reference structure from asset_representation.json

Added Files:
- metadata_extractor_mvp_PROD.py: PROD-specific version with same SIMPLE structure
- Backup files for safety
- Analysis and comparison documentation

Environment:
- Tested and working in PPR environment (ppr.dam.ferrero.com)
- All tabular fields match DAM-supplied reference structure
- Successful uploads confirmed

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-23 16:52:50 +02:00

758 lines
32 KiB
Text

"""
Metadata Extractor MVP - Extract MVP fields from master metadata
Ported from PHP MetadataExtractorMVP.php
Compatible with Python 3.6+
"""
import logging
from datetime import datetime, timedelta
import os
from shared.config_loader import load_country_code_mappings
# Module-level logger used by MetadataExtractorMVP below.
logger = logging.getLogger('MetadataExtractorMVP')
class MetadataExtractorMVP:
    """
    Build the metadata payload for an asset upload from a master asset.

    The class selects the configured "MVP" fields from a master asset's
    metadata, overrides some of them from the parsed filename and from the
    configuration, adds defaults for missing fields and wraps the result in
    the asset representation dict that is uploaded.

    Attributes (all set in ``__init__``):
        mvp_field_ids: IDs of the fields taken over from the master asset.
        filename_updates: field ID -> {'source': ..., 'transform': ...}.
        forced_values: field ID -> value that always replaces the current one.
        defaults: field ID -> value used when the field is missing.
        country_mappings: upper-case ISO country code -> DAM country code.
        asset_type_mappings: upper-case 3-letter asset type -> DAM code.

    NOTE(review): the tabular structures built here still use the
    MetadataTableFieldRow wrapper for CREATIVEX and the master asset ID.
    Confirm against the target DAM environment before changing them.
    """

    # Written into every asset representation. Class attributes so that an
    # environment-specific subclass can override them without copying code.
    METADATA_MODEL_ID = 'ECOMMERCE'
    SECURITY_POLICY_ID = 1594

    def __init__(self, field_mappings):
        """
        Initialize with field mappings from config

        Args:
            field_mappings: dict from field_mappings.yaml. 'mvp_fields' is
                required; 'filename_updates', 'forced_values' and 'defaults'
                are optional.

        Raises:
            KeyError: if 'mvp_fields' is missing.
        """
        self.mvp_field_ids = field_mappings['mvp_fields']
        # "or {}" also covers a section that is present but empty (None),
        # so the later .items() loops cannot fail on it.
        self.filename_updates = field_mappings.get('filename_updates') or {}
        self.forced_values = field_mappings.get('forced_values') or {}
        self.defaults = field_mappings.get('defaults') or {}

        # Load country code mappings (ISO -> DAM codes)
        self.country_mappings = load_country_code_mappings() or {}
        if self.country_mappings:
            logger.info("Loaded {} country code mappings (ISO->DAM)".format(len(self.country_mappings)))

        # Load asset type mappings (3-letter codes -> DAM codes)
        self.asset_type_mappings = self._load_asset_type_mappings()
        if self.asset_type_mappings:
            logger.info("Loaded {} asset type mappings (3-letter->DAM)".format(len(self.asset_type_mappings)))

    def extract_mvp_fields(self, master_metadata):
        """
        Extract only MVP fields from full master metadata

        The returned field dicts are deep copies. They are modified in place
        by the later update steps, and copying keeps those modifications out
        of the caller's ``master_metadata``.

        Args:
            master_metadata: Complete DAM asset metadata. The fields are read
                from master_metadata['metadata']['metadata_element_list'],
                whose items are either categories (with their own
                'metadata_element_list') or direct fields.

        Returns:
            List of MVP field objects (empty if the structure is missing)
        """
        import copy

        wanted_ids = set(self.mvp_field_ids)
        extracted_fields = []
        found_field_ids = set()

        # Navigate to metadata structure
        metadata_list = []
        if isinstance(master_metadata, dict):
            metadata = master_metadata.get('metadata')
            if isinstance(metadata, dict) and 'metadata_element_list' in metadata:
                metadata_list = metadata['metadata_element_list'] or []
                logger.info("Using master_metadata['metadata']['metadata_element_list']")
        logger.info("Searching through {} categories for MVP fields".format(len(metadata_list)))

        # Search through categories for MVP fields
        for item in metadata_list:
            if not isinstance(item, dict):
                continue
            if 'metadata_element_list' in item:
                # Category with nested fields
                for field in item['metadata_element_list'] or []:
                    if not isinstance(field, dict):
                        continue
                    field_id = field.get('id')
                    if field_id in wanted_ids:
                        extracted_fields.append(copy.deepcopy(field))
                        found_field_ids.add(field_id)
                        logger.debug("Found MVP field: {}".format(field_id))
            elif item.get('id') in wanted_ids:
                # Direct field
                extracted_fields.append(copy.deepcopy(item))
                found_field_ids.add(item['id'])
                logger.debug("Found direct MVP field: {}".format(item['id']))

        # Log results (only the first five missing IDs, to keep the line short)
        missing = [f for f in self.mvp_field_ids if f not in found_field_ids]
        logger.info("Found {}/{} MVP fields".format(len(found_field_ids), len(self.mvp_field_ids)))
        if missing:
            logger.info("Missing fields: {}".format(', '.join(missing[:5])))
        return extracted_fields

    def build_mvp_asset_representation(self, master_metadata, clean_filename, parsed_filename, box_metadata=None, tracking_mode='full', master_opentext_id=None):
        """
        Build asset representation with MVP fields + updates from filename

        Args:
            master_metadata: Full master asset metadata
            clean_filename: Clean filename (stripped)
            parsed_filename: Parsed V2 filename dict
            box_metadata: Optional Box metadata
            tracking_mode: 'full' (inherit all metadata) or 'folder_only' (only use folder)
            master_opentext_id: Optional DAM Asset ID of master asset (for derivative tracking)

        Returns:
            Asset representation dict ready for upload

        Raises:
            ValueError: if tracking_mode is neither 'full' nor 'folder_only'.
        """
        if tracking_mode == 'full':
            # FULL INHERITANCE MODE - Standard behavior
            logger.info("Full inheritance mode - using master metadata")
            mvp_fields = self.extract_mvp_fields(master_metadata)
            # Update fields from filename and forced values
            mvp_fields = self._update_fields(mvp_fields, clean_filename, parsed_filename)
        elif tracking_mode == 'folder_only':
            # FOLDER ONLY MODE - New asset, only use upload folder
            logger.info("Folder-only mode (-N suffix) - building metadata from filename only")
            logger.warning("Note: Upload folder comes from master, all other metadata from filename")
            mvp_fields = self._build_fields_from_filename(parsed_filename, clean_filename)
        else:
            # Fail with a clear message instead of an unbound local further down.
            raise ValueError(
                "Unknown tracking_mode: {!r} (expected 'full' or 'folder_only')".format(tracking_mode)
            )

        # Add missing MVP fields with defaults (both modes)
        mvp_fields = self._add_missing_fields(mvp_fields, parsed_filename)

        # Update CreativeX fields from Box metadata if provided
        if box_metadata:
            mvp_fields = self._update_creativex_fields(mvp_fields, box_metadata)

        # Add Master Asset ID field if provided (derivative tracking)
        if master_opentext_id:
            mvp_fields = self._add_master_asset_id_field(mvp_fields, master_opentext_id)
            logger.info("Added Master Asset ID field: {}".format(master_opentext_id))

        # Build asset representation
        asset_rep = {
            'asset_resource': {
                'asset': {
                    'metadata': {
                        'metadata_element_list': mvp_fields
                    },
                    'metadata_model_id': self.METADATA_MODEL_ID,
                    'security_policy_list': [
                        {'id': self.SECURITY_POLICY_ID}
                    ]
                }
            }
        }
        logger.info("Built MVP asset representation with {} fields".format(len(mvp_fields)))
        return asset_rep

    def _update_fields(self, mvp_fields, clean_filename, parsed_filename):
        """
        Update specific fields from filename and forced values

        Steps run in this order, each modifying ``mvp_fields`` in place:
        filename updates, country code mapping, forced values, validity dates.

        Returns:
            The same ``mvp_fields`` list.
        """
        self._apply_filename_updates(mvp_fields, clean_filename, parsed_filename)
        self._apply_country_mapping(mvp_fields)
        self._apply_forced_values(mvp_fields)
        self._apply_validity_dates(mvp_fields)
        return mvp_fields

    def _apply_filename_updates(self, mvp_fields, clean_filename, parsed_filename):
        """Overwrite fields listed in ``filename_updates`` with filename values."""
        for field_id, config in self.filename_updates.items():
            config = config or {}
            source = config.get('source')
            transform = config.get('transform', '')

            # Get value from appropriate source
            if source == 'clean_filename':
                value = clean_filename
            elif source and parsed_filename:
                value = parsed_filename.get(source)
            else:
                continue
            if not value:
                continue

            # Apply transform if specified (only meaningful for strings)
            if isinstance(value, str):
                if transform == 'uppercase':
                    value = value.upper()
                elif transform == 'lowercase':
                    value = value.lower()

            # Apply asset type mapping if this is the asset type field
            if field_id == 'FERRERO.FIELD.MKTG.ASSET TYPE' and source == 'asset_type':
                value = self._map_asset_type(value)

            field = self._find_field(mvp_fields, field_id)
            if field is not None and self._set_field_value(field, value):
                logger.info("Updated {} from filename: {}".format(field_id, value))

    def _apply_country_mapping(self, mvp_fields):
        """Translate the value of every country field from ISO to DAM code."""
        for field in mvp_fields:
            if self._get_field_id(field) != 'FERRERO.FIELD.COUNTRY':
                continue
            current_value = self._get_field_value(field)
            if not current_value:
                continue
            mapped_value = self._map_country_code(current_value)
            if mapped_value != current_value:
                self._set_field_value(field, mapped_value)
                logger.info("Mapped country code: {} -> {}".format(current_value, mapped_value))

    def _apply_forced_values(self, mvp_fields):
        """Overwrite fields listed in ``forced_values`` (only if present)."""
        for field_id, forced_value in self.forced_values.items():
            field = self._find_field(mvp_fields, field_id)
            if field is not None and self._set_field_value(field, forced_value):
                logger.info("Set {} to {}".format(field_id, forced_value))

    def _apply_validity_dates(self, mvp_fields):
        """
        Set Asset Validity Dates (Start = Today, End = Today + 1 Year)

        Existing fields are updated; missing ones are appended. Failures are
        logged and swallowed so that a date problem does not block the upload
        (best-effort, as in the original implementation).
        """
        try:
            today = datetime.now()
            # US Date Format (MM/DD/YYYY) - the format the DAM expects for
            # date fields, stored as a string value.
            date_fields = (
                ('FERRERO.FIELD.ASSET VALIDITY START PERIOD', today.strftime('%m/%d/%Y')),
                ('FERRERO.FIELD.ASSET VALIDITY END PERIOD', self._one_year_after(today).strftime('%m/%d/%Y')),
            )
            for field_id, value in date_fields:
                field = self._find_field(mvp_fields, field_id)
                if field is not None:
                    self._set_date_field_value(field, value)
                    logger.info("Set {} to {} (Upload Date Logic)".format(field_id, value))
                else:
                    # Same shape and type that _set_date_field_value writes.
                    mvp_fields.append({
                        'id': field_id,
                        'type': 'com.artesia.metadata.MetadataField',
                        'value': {
                            'value': {
                                'type': 'string',
                                'value': value
                            }
                        }
                    })
                    logger.info("Added {} with value {} (Upload Date Logic)".format(field_id, value))
        except Exception as e:
            logger.exception("Failed to set validity dates: {}".format(str(e)))

    @staticmethod
    def _one_year_after(moment):
        """Return ``moment`` moved one calendar year ahead (29 Feb -> 28 Feb)."""
        try:
            return moment.replace(year=moment.year + 1)
        except ValueError:
            # 29 February does not exist in the following year.
            return moment.replace(year=moment.year + 1, day=28)

    def _find_field(self, mvp_fields, field_id):
        """Return the first field dict in ``mvp_fields`` with this ID, or None."""
        for field in mvp_fields:
            if isinstance(field, dict) and field.get('id') == field_id:
                return field
        return None

    def _add_missing_fields(self, mvp_fields, parsed_filename):
        """
        Add missing MVP fields from filename or defaults

        MAIN_LANGUAGES is added from parsed_filename['language_code'] when it
        is absent; afterwards every entry of ``defaults`` whose field ID is
        still absent is appended.

        Returns:
            The same ``mvp_fields`` list.
        """
        field_ids = {self._get_field_id(f) for f in mvp_fields}

        # Add MAIN_LANGUAGES if missing
        if 'MAIN_LANGUAGES' not in field_ids and parsed_filename:
            if parsed_filename.get('language_code'):
                language = parsed_filename['language_code'].upper()
                logger.info("Adding MAIN_LANGUAGES: {}".format(language))
                mvp_fields.append(self._tabular_domain_field(
                    'MAIN_LANGUAGES', 'FERRERO.TABULAR.FIELD.MAIN LANGUAGES', language
                ))
                field_ids.add('MAIN_LANGUAGES')

        # Add other missing fields with defaults
        for field_id, default_value in self.defaults.items():
            if field_id in field_ids:
                continue
            logger.info("Adding {} with default: {}".format(field_id, default_value))
            # Tabular fields are recognised by name or by this fixed list.
            is_tabular = 'TABULAR' in field_id or field_id in [
                'FERRERO.FIELD.ASSETCOMPLIANCE', 'MARKETING_TAG'
            ]
            if is_tabular:
                mvp_fields.append(self._tabular_domain_field(
                    field_id,
                    'FERRERO.TABULAR.FIELD.' + field_id.split('.')[-1],
                    default_value
                ))
            else:
                mvp_fields.append({
                    'id': field_id,
                    'type': 'com.artesia.metadata.MetadataField',
                    'value': {
                        'cascading_domain_value': False,
                        'domain_value': True,
                        'value': {
                            'type': 'string',
                            'value': default_value
                        }
                    }
                })
        return mvp_fields

    @staticmethod
    def _tabular_domain_field(field_id, parent_table_id, value):
        """Build a single-value tabular field holding one domain value."""
        return {
            'id': field_id,
            'parent_table_id': parent_table_id,
            'type': 'com.artesia.metadata.MetadataTableField',
            'values': [
                {
                    'cascading_domain_value': False,
                    'domain_value': True,
                    'value': {
                        'field_value': {
                            'type': 'string',
                            'value': value
                        },
                        'type': 'com.artesia.metadata.DomainValue'
                    }
                }
            ]
        }

    def _map_country_code(self, iso_code):
        """
        Map ISO country code to DAM country code

        Args:
            iso_code: ISO 3166-1 Alpha-2 code (e.g., 'DE'); case-insensitive

        Returns:
            str: the mapped DAM country code, or the upper-cased input when
            no mapping exists. A falsy input is returned unchanged.
        """
        if not iso_code:
            return iso_code
        iso_upper = iso_code.upper()
        # The mapping loader may have returned nothing; treat that as empty.
        mappings = self.country_mappings or {}
        if iso_upper in mappings:
            dam_code = mappings[iso_upper]
            if dam_code != iso_upper:
                logger.info("Country code mapping: {} (ISO) -> {} (DAM)".format(iso_upper, dam_code))
            return dam_code
        # No mapping found, use ISO code as-is
        logger.debug("No mapping for country code: {} (using as-is)".format(iso_upper))
        return iso_upper

    def _load_asset_type_mappings(self):
        """
        Load asset type mappings: 3-letter codes -> DAM codes

        The file is read from 'config/asset_type_mappings.yaml', which is
        resolved against the current working directory.

        Returns:
            dict: 3-letter code -> DAM code mapping; empty if the file cannot
            be read or does not contain a mapping.
        """
        import yaml
        mapping_path = 'config/asset_type_mappings.yaml'
        try:
            with open(mapping_path, 'r', encoding='utf-8') as f:
                mappings = yaml.safe_load(f)
        except Exception as e:
            logger.warning("Could not load asset type mappings: {}".format(str(e)))
            return {}
        if not mappings:
            return {}
        if not isinstance(mappings, dict):
            # Lookups below index by code, so anything but a dict is unusable.
            logger.warning("Could not load asset type mappings: {} does not contain a mapping".format(mapping_path))
            return {}
        return mappings

    def _map_asset_type(self, three_letter_code):
        """
        Map 3-letter asset type code to DAM code

        Args:
            three_letter_code: 3-letter code (e.g., 'EHI', 'IMG', 'TVC')

        Returns:
            DAM code (e.g., 'heroimage', 'keyvisual', 'tvc'), or the input
            unchanged when no mapping exists.
        """
        if not three_letter_code:
            return three_letter_code
        code_upper = three_letter_code.upper()
        mappings = self.asset_type_mappings or {}
        if code_upper in mappings:
            dam_code = mappings[code_upper]
            logger.info("Asset type mapping: {} -> {}".format(code_upper, dam_code))
            return dam_code
        # No mapping - return as-is
        logger.warning("No mapping for asset type: {} - using as-is (may fail DAM validation)".format(code_upper))
        return three_letter_code

    def _build_fields_from_filename(self, parsed_filename, clean_filename):
        """
        Build ALL metadata fields from parsed filename
        Used in folder-only mode (tracking ID with -N suffix)

        Note: Uses codes directly for now. Can add lookup tables later
        for brand_code->brand_name, country_code->country_name, etc.
        Only the country code is mapped (ISO -> DAM).

        NOTE(review): the asset type is written to 'FERRERO.FIELD.ASSET TYPE'
        here while _update_fields maps 'FERRERO.FIELD.MKTG.ASSET TYPE' -
        confirm which ID the DAM expects.

        Returns:
            List of field dicts; ASSET NAME and STATE are always present.
        """
        parsed = parsed_filename or {}

        def plain_field(field_id, value):
            return {'id': field_id, 'value': {'value': {'value': value}}}

        fields = [plain_field('ARTESIA.FIELD.ASSET NAME', clean_filename)]

        # (field ID, parsed filename key) in the order the fields are emitted
        optional_sources = (
            ('ARTESIA.FIELD.ASSET DESCRIPTION', 'subject_title'),
            ('FERRERO.FIELD.BRAND', 'brand_code'),
            ('FERRERO.FIELD.COUNTRY', 'country_code'),
            ('FERRERO.FIELD.LANGUAGES', 'language_code'),
            ('FERRERO.FIELD.ASSET TYPE', 'asset_type'),
        )
        for field_id, key in optional_sources:
            value = parsed.get(key)
            if not value:
                continue
            if key == 'country_code':
                # COUNTRY (map ISO code to DAM code)
                value = self._map_country_code(value)
            fields.append(plain_field(field_id, value))

        # STATE (force to Local)
        fields.append(plain_field('FERRERO.FIELD.STATE', 'Local'))

        logger.info("Built {} fields from filename (folder-only mode)".format(len(fields)))
        return fields

    def _get_field_value(self, field):
        """
        Get field value handling different structures

        Reads field['value']['value']['value'] or, failing that,
        field['value']['value']['field_value']['value'].

        Returns:
            The stored value, or None if neither structure is present.
        """
        outer = field.get('value') if isinstance(field, dict) else None
        if not isinstance(outer, dict):
            return None
        inner = outer.get('value')
        if not isinstance(inner, dict):
            return None
        if 'value' in inner:
            return inner['value']
        if 'field_value' in inner:
            field_value = inner['field_value']
            return field_value.get('value') if isinstance(field_value, dict) else None
        return None

    def _set_field_value(self, field, value):
        """
        Set field value handling different structures

        Writes into whichever of the structures read by _get_field_value is
        present; if field['value'] is a dict without a usable nested dict, a
        fresh {'type': 'string', 'value': ...} structure is created.

        Returns:
            bool: True if a value was written, False if the field has no
            'value' dict (e.g. a tabular field with 'values') and was left
            untouched.
        """
        import json
        field_id = field.get('id', 'UNKNOWN')
        logger.info("_set_field_value called for: {} with value: {}".format(field_id, value))
        logger.info("Current field['value']: {}".format(json.dumps(field.get('value'), indent=2, default=str) if field.get('value') else 'None'))

        outer = field.get('value')
        if not isinstance(outer, dict):
            logger.warning("Field {} has no 'value' dict - value not changed".format(field_id))
            return False

        inner = outer.get('value')
        if isinstance(inner, dict):
            # Try nested structure first (most common)
            if 'value' in inner:
                inner['value'] = value
                # Ensure type is set for CreativeX URL field
                if field_id == 'FERRERO.FIELD.CREATIVEX LINK' and 'type' not in inner:
                    inner['type'] = 'string'
                logger.info("Set via field['value']['value']['value']")
            elif 'field_value' in inner:
                inner['field_value']['value'] = value
                logger.info("Set via field['value']['value']['field_value']['value']")
            else:
                # If nested dict is empty, create the value structure with type
                outer['value'] = {'type': 'string', 'value': value}
                logger.info("Created field['value']['value'] = {{'type': 'string', 'value': {}}}".format(value))
        else:
            # If value dict is empty or doesn't have nested value, create it with type
            field['value'] = {'value': {'type': 'string', 'value': value}}
            logger.info("Created field['value'] = {{'value': {{'type': 'string', 'value': {}}}}}".format(value))

        logger.info("After setting, field['value']: {}".format(json.dumps(field.get('value'), indent=2, default=str) if field.get('value') else 'None'))
        return True

    def _set_date_field_value(self, field, date_string):
        """
        Set date field value with proper type for DAM API

        The value is always stored with type 'string'. If the field has no
        nested field['value']['value'] dict (missing, None or another
        type), the structure is created from scratch.

        Args:
            field: Field dict to update
            date_string: Date string to store (this class passes MM/DD/YYYY)
        """
        field_id = field.get('id', 'UNKNOWN')
        logger.info("_set_date_field_value called for: {} with value: {}".format(
            field_id, date_string
        ))
        outer = field.get('value')
        if isinstance(outer, dict) and isinstance(outer.get('value'), dict):
            # Update existing nested structure
            outer['value']['type'] = 'string'
            outer['value']['value'] = date_string
            logger.info("Set via field['value']['value'] with type 'string'")
        else:
            # Create nested structure
            field['value'] = {
                'value': {
                    'type': 'string',
                    'value': date_string
                }
            }
            logger.info("Created field['value'] with type 'string'")

    def _update_creativex_fields(self, mvp_fields, box_metadata):
        """
        Update CreativeX fields from Box metadata template

        Args:
            mvp_fields: List of MVP fields
            box_metadata: dict with optional 'score', 'platforms' and 'url'
                from Box template. 'platforms' may be a list or one string.

        Returns:
            Updated mvp_fields list
        """
        score_val = box_metadata.get('score')
        # Explicit test so that a score of 0 is still written.
        if score_val is not None and score_val != '':
            # New structure: Platform^Score (e.g., "Google Ads^100")
            platforms = box_metadata.get('platforms') or []
            if isinstance(platforms, str):
                # A bare string would otherwise be iterated per character.
                platforms = [platforms]
            if not platforms:
                logger.warning("No Platforms mapped for CreativeX score - using 'Unknown'")
                platforms = ["Unknown"]

            # Construct value objects for each platform
            value_objects = []
            for platform in platforms:
                combined_value = "{}^{}".format(platform, score_val)
                value_objects.append(self._creativex_row(combined_value))
                logger.info("Constructed CreativeX value: {}".format(combined_value))

            score_field = self._find_field(mvp_fields, 'FERRERO.TAB.FIELD.CREATIVEX')
            if score_field is not None:
                # Replace values list with new list of objects
                score_field['values'] = value_objects
                logger.info("Set CREATIVEX field with {} values".format(len(value_objects)))
            else:
                logger.warning("CREATIVEX Score field not found in master metadata - adding it now")
                # Create the field structure (tabular field)
                mvp_fields.append({
                    "type": "com.artesia.metadata.MetadataTableField",
                    "id": "FERRERO.TAB.FIELD.CREATIVEX",
                    "parent_table_id": "FERRERO.TABULAR.FIELD.CREATIVEX",
                    "values": value_objects
                })
                logger.info("Added CREATIVEX Score field with {} values".format(len(value_objects)))

        url = box_metadata.get('url')
        if url:
            # Update CreativeX URL field
            logger.info("Updating CreativeX URL from database: {}".format(url))
            url_field = self._find_field(mvp_fields, 'FERRERO.FIELD.CREATIVEX LINK')
            if url_field is not None:
                try:
                    if self._set_field_value(url_field, url):
                        logger.info("Set CREATIVEX LINK to: {}".format(url))
                except Exception as e:
                    # Best-effort: a malformed URL field must not abort the upload.
                    logger.exception("Failed to set CreativeX URL: {}".format(str(e)))
            else:
                logger.warning("CREATIVEX URL field not found in master metadata - adding it now")
                # Create the field structure (text field)
                mvp_fields.append({
                    'id': 'FERRERO.FIELD.CREATIVEX LINK',
                    'name': 'CreativeX Hyperlink',
                    'type': 'com.artesia.metadata.MetadataField',
                    'value': {
                        'value': {
                            'type': 'string',
                            'value': url
                        }
                    },
                    'data_type': 'CHAR',
                    'required': False
                })
                logger.info("Added CREATIVEX URL field with value: {}".format(url))
        return mvp_fields

    @staticmethod
    def _creativex_row(combined_value):
        """Build one CREATIVEX table row holding a 'Platform^Score' value."""
        return {
            "type": "com.artesia.metadata.MetadataTableFieldRow",
            "fields": [
                {
                    "id": "FERRERO.TAB.FIELD.CREATIVEX",
                    "type": "com.artesia.metadata.MetadataField",
                    "value": {
                        "cascading_domain_value": True,
                        "domain_value": False,
                        "is_locked": False,
                        "value": {
                            "type": "com.artesia.metadata.CascadingDomainValue",
                            "field_value": {
                                "type": "string",
                                "value": combined_value
                            }
                        }
                    }
                }
            ]
        }

    def _add_master_asset_id_field(self, mvp_fields, master_opentext_id):
        """
        Add Master Asset ID field (configurable via MASTER_ASSET_ID_FIELD in .env)

        The field ID is read from the MASTER_ASSET_ID_FIELD environment
        variable (default 'ARTESIA.FIELD.ASSET_ID'). An ID containing
        'TABULAR' produces a tabular field, anything else a text field. If
        the field already exists its value (or its rows, for a tabular
        field) is replaced.

        Args:
            mvp_fields: List of MVP fields
            master_opentext_id: DAM Asset ID of the master asset

        Returns:
            Updated mvp_fields list
        """
        # "or" also falls back to the default when the variable is set but empty.
        master_field_id = os.environ.get('MASTER_ASSET_ID_FIELD') or 'ARTESIA.FIELD.ASSET_ID'
        logger.info("Using Master Asset ID field: {} (Value: {})".format(master_field_id, master_opentext_id))

        is_tabular = 'TABULAR' in master_field_id
        child_column_id = None
        rows = None
        if is_tabular:
            # Based on Staging Definition:
            #   Parent: FERRERO.TABULAR.FIELD.MASTERASSETIDS
            #   Child Column: FERRERO.MASTERASSETIDS
            if master_field_id == 'FERRERO.TABULAR.FIELD.MASTERASSETIDS':
                child_column_id = 'FERRERO.MASTERASSETIDS'
            else:
                # Fallback for other potential tabular fields
                child_column_id = master_field_id
            rows = [
                {
                    'type': 'com.artesia.metadata.MetadataTableFieldRow',
                    'fields': [
                        {
                            'id': child_column_id,
                            'type': 'com.artesia.metadata.MetadataField',
                            'value': {
                                'field_value': {
                                    'type': 'string',
                                    'value': master_opentext_id
                                },
                                'type': 'com.artesia.metadata.DomainValue'
                            }
                        }
                    ]
                }
            ]

        # Check if field already exists in MVP fields (update scenario)
        existing = self._find_field(mvp_fields, master_field_id)
        if existing is not None:
            if is_tabular:
                # A tabular field has no scalar 'value'; replace its rows.
                existing['values'] = rows
            else:
                self._set_field_value(existing, master_opentext_id)
            logger.info("Updated existing Master Asset ID field: {}".format(master_opentext_id))
            return mvp_fields

        # Field doesn't exist - add new field
        if is_tabular:
            mvp_fields.append({
                'id': master_field_id,
                'parent_table_id': master_field_id,
                'type': 'com.artesia.metadata.MetadataTableField',
                'values': rows
            })
            logger.info("Added new TABULAR Master Asset ID field: {} (Column: {})".format(master_field_id, child_column_id))
        else:
            # Standard Text Field
            mvp_fields.append({
                'id': master_field_id,
                'type': 'com.artesia.metadata.MetadataField',
                'value': {
                    'value': {
                        'type': 'string',
                        'value': master_opentext_id
                    }
                }
            })
            logger.info("Added new Master Asset ID field: {}".format(master_field_id))
        return mvp_fields

    def _get_field_id(self, field):
        """Extract field ID from field dict ('' if absent or not a dict)"""
        if isinstance(field, dict):
            return field.get('id', '')
        return ''