ferrero-opentext/Python-Version/scripts/shared/metadata_extractor_mvp.py
DJP 80d5757bbb Add Box metadata extraction for CreativeX fields in A2→A3 workflow
Major Feature: Box Metadata Integration

box_client.py:
 Added get_file_metadata() method
 Reads 'Ferrero-DAM-Metadata' template from Box files
 Extracts 'CreativeX Score' and 'CreativeX URL' fields
 Returns dict with score and url

a2_to_a3_upload_polling.py:
 Calls box.get_file_metadata() before download
 Logs Box metadata retrieved
 Passes box_metadata to build_mvp_asset_representation()

metadata_extractor_mvp.py:
 Added box_metadata parameter to build_mvp_asset_representation()
 Added _update_creativex_fields() method
 Updates FERRERO.FIELD.CREATIVEX LINK with URL from Box
 Logs CreativeX Score (tabular field - needs special handling)

Flow:
1. File uploaded to Box by agency
2. Agency adds metadata using Ferrero-DAM-Metadata template
3. Script reads CreativeX Score and URL from Box metadata
4. Updates MVP fields with Box metadata values
5. Uploads to DAM with CreativeX data

Field Mapping:
- Box: 'CreativeX URL' → DAM: FERRERO.FIELD.CREATIVEX LINK
- Box: 'CreativeX Score' → DAM: FERRERO.TAB.FIELD.CREATIVEX (currently only logged; writing it requires the tabular field structure, which is not implemented yet)

Next: Test with a file that has the Box metadata template applied

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-03 14:26:00 -05:00

258 lines
10 KiB
Python

"""
Metadata Extractor MVP - Extract MVP fields from master metadata
Ported from PHP MetadataExtractorMVP.php
Compatible with Python 3.6+
"""
import copy
import logging
logger = logging.getLogger('MetadataExtractorMVP')


class MetadataExtractorMVP:
    """Reduce a master asset's metadata to the configured MVP fields.

    The extractor picks the fields listed under ``mvp_fields`` out of a
    master asset's metadata, overrides a few of them (asset name,
    description, state), appends fields that are missing, optionally applies
    CreativeX values read from Box, and wraps the result in an asset
    representation dict.
    """

    # Field IDs that are rewritten from the filename / forced values.
    ASSET_NAME_FIELD_ID = 'ARTESIA.FIELD.ASSET NAME'
    ASSET_DESCRIPTION_FIELD_ID = 'ARTESIA.FIELD.ASSET DESCRIPTION'
    STATE_FIELD_ID = 'FERRERO.FIELD.STATE'

    # Language field appended from the parsed filename when absent.
    MAIN_LANGUAGES_FIELD_ID = 'MAIN_LANGUAGES'
    MAIN_LANGUAGES_TABLE_ID = 'FERRERO.TABULAR.FIELD.MAIN LANGUAGES'

    # Default-valued fields that are built as tabular fields even though
    # their ID does not contain 'TABULAR'.
    TABULAR_DEFAULT_FIELD_IDS = ('FERRERO.FIELD.ASSETCOMPLIANCE', 'MARKETING_TAG')

    # Box metadata key -> DAM field ID.
    # NOTE(review): the exact DAM field IDs still need to be confirmed.
    CREATIVEX_FIELD_IDS = {
        'score': 'FERRERO.TAB.FIELD.CREATIVEX',   # Platform > Rating (%)
        'url': 'FERRERO.FIELD.CREATIVEX LINK',    # CreativeX Hyperlink
    }

    # Fixed values placed in every asset representation built here.
    METADATA_MODEL_ID = 'ECOMMERCE'
    SECURITY_POLICY_ID = 1594

    def __init__(self, field_mappings):
        """
        Initialize with field mappings from config
        Args:
            field_mappings: dict from field_mappings.yaml. ``mvp_fields`` is
                required; ``filename_updates``, ``forced_values`` and
                ``defaults`` are optional.
        Raises:
            KeyError: if ``mvp_fields`` is missing.
        """
        self.mvp_field_ids = field_mappings['mvp_fields']
        # ``or {}`` also covers a key that is present but empty in the YAML
        # (which loads as None) so later ``.items()`` calls cannot fail.
        self.filename_updates = field_mappings.get('filename_updates') or {}
        self.forced_values = field_mappings.get('forced_values') or {}
        self.defaults = field_mappings.get('defaults') or {}

    def extract_mvp_fields(self, master_metadata):
        """
        Extract only MVP fields from full master metadata
        Args:
            master_metadata: Complete DAM asset metadata. The fields are read
                from ``master_metadata['metadata']['metadata_element_list']``;
                anything else yields an empty result.
        Returns:
            List of MVP field objects. These are the same dict objects that
            live inside ``master_metadata`` (no copy is made here).
        """
        wanted_ids = set(self.mvp_field_ids)
        extracted_fields = []
        found_ids = set()

        # Navigate to metadata.metadata_element_list, tolerating a missing or
        # null 'metadata' section instead of raising TypeError.
        metadata_list = []
        container = master_metadata.get('metadata') if isinstance(master_metadata, dict) else None
        if isinstance(container, dict) and 'metadata_element_list' in container:
            metadata_list = container['metadata_element_list'] or []
            logger.info("Using master_metadata['metadata']['metadata_element_list']")
        logger.info("Searching through {} categories for MVP fields".format(len(metadata_list)))

        for item in metadata_list:
            if not isinstance(item, dict):
                continue
            if 'metadata_element_list' in item:
                # Category with nested fields
                for field in item['metadata_element_list'] or []:
                    if not isinstance(field, dict):
                        continue
                    field_id = field.get('id')
                    if field_id in wanted_ids:
                        extracted_fields.append(field)
                        found_ids.add(field_id)
                        logger.debug("Found MVP field: {}".format(field_id))
            elif 'id' in item and item['id'] in wanted_ids:
                # Direct field
                extracted_fields.append(item)
                found_ids.add(item['id'])
                logger.debug("Found direct MVP field: {}".format(item['id']))

        # Report unique hits so duplicated fields cannot inflate the count.
        missing = [f for f in self.mvp_field_ids if f not in found_ids]
        logger.info("Found {}/{} MVP fields".format(len(found_ids), len(self.mvp_field_ids)))
        if missing:
            # Only the first five missing IDs are listed to keep the log short.
            logger.info("Missing fields: {}".format(', '.join(str(f) for f in missing[:5])))
        return extracted_fields

    def build_mvp_asset_representation(self, master_metadata, clean_filename, parsed_filename, box_metadata=None):
        """
        Build asset representation with MVP fields + updates from filename
        Args:
            master_metadata: Full master asset metadata (left unmodified)
            clean_filename: Clean filename (stripped)
            parsed_filename: Parsed V2 filename dict
            box_metadata: Optional dict with 'score' and 'url' from the Box
                metadata template; ignored when empty or None
        Returns:
            Asset representation dict ready for upload
        """
        # Work on a deep copy: the update steps below rewrite field values in
        # place and must not leak into the caller's master metadata.
        mvp_fields = copy.deepcopy(self.extract_mvp_fields(master_metadata))
        # Update fields from filename and forced values
        mvp_fields = self._update_fields(mvp_fields, clean_filename, parsed_filename)
        # Add missing MVP fields with defaults
        mvp_fields = self._add_missing_fields(mvp_fields, parsed_filename)
        # Update CreativeX fields from Box metadata if provided
        if box_metadata:
            mvp_fields = self._update_creativex_fields(mvp_fields, box_metadata)

        asset_rep = {
            'asset_resource': {
                'asset': {
                    'metadata': {
                        'metadata_element_list': mvp_fields
                    },
                    'metadata_model_id': self.METADATA_MODEL_ID,
                    'security_policy_list': [
                        {'id': self.SECURITY_POLICY_ID}
                    ]
                }
            }
        }
        logger.info("Built MVP asset representation with {} fields".format(len(mvp_fields)))
        return asset_rep

    def _set_matching_fields(self, mvp_fields, field_id, value, success_message):
        """Set ``value`` on every field whose id is ``field_id`` and log the outcome."""
        for field in mvp_fields:
            if field.get('id') != field_id:
                continue
            if self._set_field_value(field, value):
                logger.info(success_message)
            else:
                # Previously this case was logged as a successful update.
                logger.warning(
                    "Could not set {}: unsupported field value structure".format(field_id)
                )

    def _update_fields(self, mvp_fields, clean_filename, parsed_filename):
        """Update specific fields from filename and forced values

        Sets the asset name to ``clean_filename``, the description to
        ``parsed_filename['subject_title']`` when that is present and truthy,
        and the state to 'Local'. Fields are modified in place; the same list
        is returned.
        """
        self._set_matching_fields(
            mvp_fields,
            self.ASSET_NAME_FIELD_ID,
            clean_filename,
            "Updated ASSET NAME: {}".format(clean_filename),
        )

        if parsed_filename and parsed_filename.get('subject_title'):
            subject_title = parsed_filename['subject_title']
            self._set_matching_fields(
                mvp_fields,
                self.ASSET_DESCRIPTION_FIELD_ID,
                subject_title,
                "Updated DESCRIPTION: {}".format(subject_title),
            )

        # Force STATE to Local
        self._set_matching_fields(
            mvp_fields, self.STATE_FIELD_ID, 'Local', "Set STATE to Local"
        )
        return mvp_fields

    @staticmethod
    def _make_tabular_field(field_id, parent_table_id, value):
        """Return a new single-value tabular field dict holding a string domain value."""
        return {
            'id': field_id,
            'parent_table_id': parent_table_id,
            'type': 'com.artesia.metadata.MetadataTableField',
            'values': [
                {
                    'cascading_domain_value': False,
                    'domain_value': True,
                    'value': {
                        'field_value': {
                            'type': 'string',
                            'value': value
                        },
                        'type': 'com.artesia.metadata.DomainValue'
                    }
                }
            ]
        }

    @staticmethod
    def _make_scalar_field(field_id, value):
        """Return a new non-tabular field dict holding a string domain value."""
        return {
            'id': field_id,
            'type': 'com.artesia.metadata.MetadataField',
            'value': {
                'cascading_domain_value': False,
                'domain_value': True,
                'value': {
                    'type': 'string',
                    'value': value
                }
            }
        }

    def _add_missing_fields(self, mvp_fields, parsed_filename):
        """Add missing MVP fields from filename or defaults

        Appends MAIN_LANGUAGES (upper-cased ``language_code`` from the parsed
        filename) when it is absent, then appends every entry of
        ``self.defaults`` whose field ID is not present yet. The list is
        extended in place and returned.
        """
        present_ids = {f.get('id') for f in mvp_fields}

        # Add MAIN_LANGUAGES if missing
        if (self.MAIN_LANGUAGES_FIELD_ID not in present_ids and parsed_filename
                and parsed_filename.get('language_code')):
            language = parsed_filename['language_code'].upper()
            logger.info("Adding MAIN_LANGUAGES: {}".format(language))
            mvp_fields.append(self._make_tabular_field(
                self.MAIN_LANGUAGES_FIELD_ID, self.MAIN_LANGUAGES_TABLE_ID, language
            ))
            present_ids.add(self.MAIN_LANGUAGES_FIELD_ID)

        # Add other missing fields with defaults
        for field_id, default_value in self.defaults.items():
            if field_id in present_ids:
                continue
            logger.info("Adding {} with default: {}".format(field_id, default_value))
            # A default is built as a tabular field when its ID contains
            # 'TABULAR' or it is one of the explicitly listed IDs.
            is_tabular = 'TABULAR' in field_id or field_id in self.TABULAR_DEFAULT_FIELD_IDS
            if is_tabular:
                # Parent table ID is derived from the last dotted segment of
                # the field ID.
                parent_table_id = 'FERRERO.TABULAR.FIELD.' + field_id.split('.')[-1]
                mvp_fields.append(
                    self._make_tabular_field(field_id, parent_table_id, default_value)
                )
            else:
                mvp_fields.append(self._make_scalar_field(field_id, default_value))
        return mvp_fields

    def _set_field_value(self, field, value):
        """Set field value handling different structures

        Supported shapes are ``field['value']['value']['value']`` and
        ``field['value']['value']['field_value']['value']``. Any other shape
        (including tabular fields that use a ``values`` list) is left
        untouched.

        Returns:
            True if the value was written, False if the structure was not
            recognised.
        """
        outer = field.get('value')
        if not isinstance(outer, dict):
            return False
        inner = outer.get('value')
        if not isinstance(inner, dict):
            return False
        if 'value' in inner:
            inner['value'] = value
            return True
        field_value = inner.get('field_value')
        if isinstance(field_value, dict):
            field_value['value'] = value
            return True
        return False

    def _update_creativex_fields(self, mvp_fields, box_metadata):
        """
        Update CreativeX fields from Box metadata template
        Args:
            mvp_fields: List of MVP fields
            box_metadata: dict with 'score' and 'url' from Box template
        Returns:
            Updated mvp_fields list (the same list object, modified in place)
        """
        score = box_metadata.get('score')
        # Compare against None/'' rather than truthiness so a score of 0 is
        # still reported.
        if score is not None and score != '':
            logger.info("Updating CreativeX Score from Box: {}".format(score))
            # TODO: the score is only logged. Writing it to
            # CREATIVEX_FIELD_IDS['score'] may need special handling for the
            # tabular field structure.

        url = box_metadata.get('url')
        if url:
            logger.info("Updating CreativeX URL from Box: {}".format(url))
            link_field_id = self.CREATIVEX_FIELD_IDS['url']
            for field in mvp_fields:
                if field.get('id') != link_field_id:
                    continue
                if self._set_field_value(field, url):
                    logger.info("Set CREATIVEX LINK to: {}".format(url))
                else:
                    logger.warning(
                        "Could not set {}: unsupported field value structure".format(link_field_id)
                    )
                break  # only the first matching field is updated
            else:
                # Surface the dropped value instead of failing silently.
                logger.warning(
                    "{} not present in MVP fields; CreativeX URL from Box not applied".format(link_field_id)
                )
        return mvp_fields