ferrero-opentext/Python-Version/scripts/test-metadata-diagnostic.py
nickviljoen f83b4fae3e PPR Environment: Use SIMPLE metadata structure for tabular fields
Key Changes:
- Updated metadata_extractor_mvp.py to use SIMPLE structure for all tabular fields
- All tabular fields now use direct value objects (no MetadataTableFieldRow wrapper)
- MAIN_LANGUAGES, ASSETCOMPLIANCE, MARKETING_TAG, CREATIVEX all use SIMPLE structure
- Master Asset ID field updated to SIMPLE structure
- Date fields now use type 'string' instead of 'long'
- Matches DAM reference structure from asset_representation.json

Added Files:
- metadata_extractor_mvp_PROD.py: PROD-specific version with same SIMPLE structure
- Backup files for safety
- Analysis and comparison documentation

Environment:
- Tested and working in PPR environment (ppr.dam.ferrero.com)
- All tabular fields match DAM-supplied reference structure
- Successful uploads confirmed

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-23 16:52:50 +02:00

253 lines
8.3 KiB
Python

#!/usr/bin/env python3
"""
Metadata Diagnostic Tool
Analyzes a file's metadata to show what was in the original DAM asset vs what was sent
Usage: python scripts/test-metadata-diagnostic.py "filename.jpg"
"""
import sys
import os
import json
import argparse
# Add shared library to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from shared.config_loader import load_config, load_field_mappings
from shared.database import Database
from shared.metadata_extractor_mvp import MetadataExtractorMVP
from shared.filename_parser import FilenameParser
def extract_field_value(field):
    """Extract the scalar value from a DAM metadata field structure.

    Two nestings under ``field['value']['value']`` are recognised:

    * ``{'value': X}``                    -> ``X``
    * ``{'field_value': {'value': X}}``   -> ``X``

    When the inner dict has a ``'value'`` key it wins, even if its content
    is ``None``; the ``'field_value'`` form is only consulted otherwise.

    Args:
        field: One entry of a ``metadata_element_list``.

    Returns:
        The extracted value, or ``None`` when ``field`` does not follow
        either recognised shape (including fields with no ``'value'`` key
        and inputs that are not dicts at any of the expected levels).
    """
    if not isinstance(field, dict):
        return None

    outer = field.get('value')
    if not isinstance(outer, dict):
        return None

    inner = outer.get('value')
    if not isinstance(inner, dict):
        return None

    if 'value' in inner:
        return inner['value']

    # Guard the type before looking inside: a string or None here would
    # otherwise raise TypeError on the membership test / subscript.
    wrapped = inner.get('field_value')
    if isinstance(wrapped, dict):
        return wrapped.get('value')
    return None
def _print_banner(title):
    """Print a section title framed by 80-column rules, then a blank line."""
    print("=" * 80)
    print(title)
    print("=" * 80)
    print("")


def _print_field_status(field_id, value):
    """Print a field's POPULATED/EMPTY status and, if truthy, a 100-char preview."""
    status = "✓ POPULATED" if value else "✗ EMPTY"
    print("{:60} {}".format(field_id, status))
    if value:
        print(" Value: {}".format(str(value)[:100]))


def _iter_fields(document):
    """Yield ``(field_id, value)`` for each field with an id in *document*.

    Walks ``document['metadata']['metadata_element_list']`` (categories) and
    each category's own ``metadata_element_list`` (fields). Missing levels
    are treated as empty.
    """
    metadata = document.get('metadata') or {}
    for category in metadata.get('metadata_element_list') or []:
        for field in category.get('metadata_element_list') or []:
            field_id = field.get('id')
            if field_id:
                yield field_id, extract_field_value(field)


def _collect_and_print_fields(document):
    """Print the status of every field in *document* and return ``{id: value}``."""
    fields = {}
    for field_id, value in _iter_fields(document):
        fields[field_id] = value
        _print_field_status(field_id, value)
    return fields


def _print_comparison(original_fields, sent_fields):
    """Print the three comparison sections; return (lost_data, empty_both, new_fields)."""
    print("Fields that were POPULATED in original but EMPTY in sent:")
    print("-" * 80)
    lost_data = []
    for field_id, orig_value in original_fields.items():
        # Only fields present on both sides are compared here.
        if orig_value and field_id in sent_fields and not sent_fields[field_id]:
            lost_data.append(field_id)
            print("⚠️ {}".format(field_id))
            print(" Original: {}".format(str(orig_value)[:100]))
            print(" Sent: EMPTY")
            print("")
    if not lost_data:
        print("✓ No data loss detected")
    print("")

    print("Fields that were EMPTY in both original and sent:")
    print("-" * 80)
    empty_both = []
    for field_id, orig_value in original_fields.items():
        if not orig_value and field_id in sent_fields and not sent_fields[field_id]:
            empty_both.append(field_id)
            print(" {}".format(field_id))
    if not empty_both:
        print("✓ No fields empty in both")
    print("")

    print("Fields ONLY in sent (not in original):")
    print("-" * 80)
    new_fields = []
    for field_id in sent_fields:
        if field_id not in original_fields:
            new_fields.append(field_id)
            _print_field_status(field_id, sent_fields[field_id])
    if not new_fields:
        print("✓ No new fields added")
    print("")

    return lost_data, empty_both, new_fields


def _run_diagnostic(filename, db, parser, mvp_extractor):
    """Run the diagnostic report for *filename* using already-built collaborators."""
    # Parse filename to get tracking ID
    parsed = parser.parse_filename(filename)
    if not parsed['is_valid']:
        print("❌ ERROR: Invalid V2 filename")
        print("Validation errors: {}".format(', '.join(parsed['validation_errors'])))
        return

    tracking_id = parsed['tracking_id']
    print("✓ Tracking ID: {}".format(tracking_id))
    print("")

    # Get master asset from database
    master_asset = db.get_master_asset(tracking_id)
    if not master_asset:
        print("❌ ERROR: No master asset found for tracking ID: {}".format(tracking_id))
        return

    print("✓ Master asset found in database")
    print(" OpenText ID: {}".format(master_asset['opentext_id']))
    print("")

    full_metadata = master_asset['full_metadata']

    # Fields as stored for the original DAM asset
    _print_banner("ORIGINAL DAM METADATA FIELDS")
    original_fields = _collect_and_print_fields(full_metadata)
    print("")
    print("Total fields in original metadata: {}".format(len(original_fields)))
    print("")

    # Build asset representation (what we would send to DAM)
    _print_banner("BUILDING ASSET REPRESENTATION")
    clean_filename = parser.strip_upload_components(filename)

    # Mock box_metadata for testing: these are fixed sample values, not data
    # read from anywhere, so fields derived from them are not real.
    box_metadata = {
        'score': '85',
        'url': 'https://app.creativex.com/test',
        'platforms': ['Facebook', 'Instagram'],
    }

    asset_rep = mvp_extractor.build_mvp_asset_representation(
        master_metadata=full_metadata,
        clean_filename=clean_filename,
        parsed_filename=parsed,
        box_metadata=box_metadata,
        tracking_mode='full'
    )
    print("✓ Asset representation built")
    print("")

    # Fields in the representation that would be sent
    _print_banner("FIELDS IN ASSET REPRESENTATION (WHAT WE SEND)")
    sent_fields = _collect_and_print_fields(asset_rep)
    print("")
    print("Total fields in asset representation: {}".format(len(sent_fields)))
    print("")

    # Compare original vs sent
    _print_banner("COMPARISON: ORIGINAL vs SENT")
    lost_data, empty_both, new_fields = _print_comparison(original_fields, sent_fields)

    # Summary
    _print_banner("SUMMARY")
    print("Original metadata fields: {}".format(len(original_fields)))
    print("Sent metadata fields: {}".format(len(sent_fields)))
    print("Data loss (populated → empty): {}".format(len(lost_data)))
    print("Empty in both: {}".format(len(empty_both)))
    print("New fields added: {}".format(len(new_fields)))
    print("")

    # Save full JSON for inspection
    output_dir = 'temp/metadata_diagnostic'
    os.makedirs(output_dir, exist_ok=True)
    original_json = os.path.join(output_dir, 'original_metadata_{}.json'.format(tracking_id))
    sent_json = os.path.join(output_dir, 'sent_asset_rep_{}.json'.format(tracking_id))
    with open(original_json, 'w', encoding='utf-8') as f:
        json.dump(full_metadata, f, indent=2)
    with open(sent_json, 'w', encoding='utf-8') as f:
        json.dump(asset_rep, f, indent=2)

    print("Full JSON files saved:")
    print(" Original: {}".format(original_json))
    print(" Sent: {}".format(sent_json))
    print("")


def analyze_metadata(filename):
    """Analyze metadata for a given filename.

    Prints a report to stdout that:

    1. parses ``filename`` to obtain its tracking ID,
    2. loads the matching master asset from the database,
    3. lists every field found in the stored original metadata,
    4. builds the asset representation (using mock ``box_metadata``) and
       lists its fields,
    5. compares the two field sets and prints a summary, and
    6. writes both JSON documents under ``temp/metadata_diagnostic/``.

    An invalid filename or a missing master asset is reported on stdout and
    the function returns early without raising.

    Args:
        filename: Name of the file to diagnose.

    Returns:
        None.
    """
    _print_banner("METADATA DIAGNOSTIC TOOL")
    print("Filename: {}".format(filename))
    print("")

    # Load config and initialize
    config = load_config('config/config.yaml')
    field_mappings = load_field_mappings(config)
    db = Database(config)
    try:
        parser = FilenameParser()
        mvp_extractor = MetadataExtractorMVP(field_mappings, config=config)
        _run_diagnostic(filename, db, parser, mvp_extractor)
    finally:
        # Close on every exit path: early returns and exceptions included.
        db.close()
def main():
    """Command-line entry point: parse the filename argument and run the diagnostic.

    Any exception raised by the analysis is reported on stdout under an
    ERROR banner, the traceback is printed via ``traceback.print_exc()``,
    and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(description='Metadata Diagnostic Tool')
    cli.add_argument(
        'filename',
        help='Filename to analyze (e.g., "C000000078_KIN_IT_IT_1920x1080_09Dux1.jpg")',
    )
    options = cli.parse_args()

    try:
        analyze_metadata(options.filename)
    except Exception as exc:
        rule = "=" * 80
        for line in ("", rule, "ERROR", rule):
            print(line)
        print(str(exc))

        import traceback
        traceback.print_exc()
        sys.exit(1)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()