ferrero-opentext/Python-Version/compare_ppr_structure.py

#!/usr/bin/env python3
"""
PPR Payload Structure Comparison Tool
Compares client's reference asset_representation.json with code-generated structure
"""

import json
import sys
from typing import Dict, List, Tuple, Any

def load_json(filepath):
    """Read and parse a JSON document from disk.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON content (whatever ``json.load`` produces for the
        document: dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            subclasses of ``ValueError``).
    """
    # JSON is exchanged as UTF-8; pin the codec so decoding does not depend
    # on the platform's locale default.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def get_field_by_id(fields, field_id):
    """Return the first entry of *fields* whose ``'id'`` equals *field_id*.

    Entries are looked up with ``.get('id')``, so entries without an ``'id'``
    key compare as ``None``. Returns ``None`` when nothing matches.
    """
    candidates = (entry for entry in fields if entry.get('id') == field_id)
    return next(candidates, None)

def compare_dict(ref_dict, code_dict, path="", depth=0):
    """
    Walk two dictionaries side by side and classify every key.

    Keys are visited in sorted order. Nested dictionaries are compared
    recursively; lists are compared by length only; anything else is
    compared with ``==``.

    Args:
        ref_dict: The reference dictionary.
        code_dict: The dictionary to check against the reference.
        path: Dotted prefix prepended to every reported key path.
        depth: Recursion level; passed along to nested calls but not
            otherwise used.

    Returns: (matches, differences)
        matches: list of ``{'path', 'value'}`` dicts.
        differences: list of dicts with ``'path'`` and ``'issue'`` (one of
            EXTRA_IN_CODE, MISSING_IN_CODE, LIST_LENGTH_MISMATCH,
            VALUE_MISMATCH) plus the issue-specific detail keys.
    """
    matches, differences = [], []

    for key in sorted(ref_dict.keys() | code_dict.keys()):
        here = f"{path}.{key}" if path else key

        # Key present on one side only.
        if key not in ref_dict:
            differences.append({
                'path': here,
                'issue': 'EXTRA_IN_CODE',
                'code_value': code_dict[key]
            })
            continue
        if key not in code_dict:
            differences.append({
                'path': here,
                'issue': 'MISSING_IN_CODE',
                'ref_value': ref_dict[key]
            })
            continue

        expected, actual = ref_dict[key], code_dict[key]

        # Both sides are dicts: descend and merge the nested results.
        if isinstance(expected, dict) and isinstance(actual, dict):
            nested_matches, nested_diffs = compare_dict(expected, actual, here, depth + 1)
            matches += nested_matches
            differences += nested_diffs
            continue

        # Both sides are lists: only the lengths are compared here.
        if isinstance(expected, list) and isinstance(actual, list):
            ref_len, code_len = len(expected), len(actual)
            if ref_len == code_len:
                matches.append({
                    'path': here,
                    'value': f"List with {ref_len} items"
                })
            else:
                differences.append({
                    'path': here,
                    'issue': 'LIST_LENGTH_MISMATCH',
                    'ref_length': ref_len,
                    'code_length': code_len
                })
            continue

        # Everything else (scalars, or mismatched container types).
        if expected == actual:
            matches.append({'path': here, 'value': expected})
        else:
            differences.append({
                'path': here,
                'issue': 'VALUE_MISMATCH',
                'ref_value': expected,
                'code_value': actual
            })

    return matches, differences

def _note_presence(ref, code, key, label, report):
    """Return True when *key* is in both mappings; otherwise record the gap.

    A key found on only one side is appended to ``report['differences']`` as
    MISSING_IN_CODE (reference only) or EXTRA_IN_CODE (code only), labelled
    with *label*. A key absent from both sides records nothing.
    """
    in_ref = key in ref
    in_code = key in code
    if in_ref and in_code:
        return True
    if in_ref:
        report['differences'].append({
            'property': label,
            'issue': 'MISSING_IN_CODE',
            'ref': ref[key]
        })
    elif in_code:
        report['differences'].append({
            'property': label,
            'issue': 'EXTRA_IN_CODE',
            'code': code[key]
        })
    return False


def _compare_selected_keys(ref, code, keys, prefix, report):
    """Compare the given *keys* of two mappings and log results into *report*.

    Each key is labelled ``prefix + key``. Equal values become a match
    string, unequal values a ``{'property', 'ref', 'code'}`` difference, and
    one-sided keys a MISSING_IN_CODE / EXTRA_IN_CODE difference.
    """
    for key in keys:
        label = f"{prefix}{key}"
        if not _note_presence(ref, code, key, label, report):
            continue
        if ref[key] == code[key]:
            report['matches'].append(f"{label}: {ref[key]}")
        else:
            report['differences'].append({
                'property': label,
                'ref': ref[key],
                'code': code[key]
            })


def analyze_tabular_field(field_id, ref_field, code_field):
    """Analyze tabular field structure.

    Compares a reference field dict with a code-generated one at three
    levels: the top-level ``id`` / ``parent_table_id`` / ``type`` keys, the
    length of the ``values`` list, and the first entry of ``values``
    (its flag keys plus ``value.type`` and ``value.field_value``). Only the
    first ``values`` entry is inspected in detail.

    Keys present on only one side are reported as MISSING_IN_CODE or
    EXTRA_IN_CODE at every level, not just the top level, so a structure
    that omits e.g. ``is_locked`` is not reported as a perfect match.

    Args:
        field_id: Identifier echoed back in the report.
        ref_field: Field dict taken from the reference file.
        code_field: Field dict describing what the code builds.

    Returns:
        A dict ``{'field_id', 'matches', 'differences'}`` where ``matches``
        is a list of ``"label: value"`` strings and ``differences`` is a
        list of dicts keyed by ``'property'`` with ``'ref'`` / ``'code'``
        and, for one-sided keys, ``'issue'``.
    """
    report = {
        'field_id': field_id,
        'matches': [],
        'differences': []
    }

    # Basic structure of the field itself.
    _compare_selected_keys(
        ref_field, code_field, ('id', 'parent_table_id', 'type'), '', report
    )

    # Without a values list on both sides there is nothing further to compare.
    if not _note_presence(ref_field, code_field, 'values', 'values', report):
        return report

    ref_values = ref_field['values']
    code_values = code_field['values']

    if len(ref_values) != len(code_values):
        report['differences'].append({
            'property': 'values_length',
            'ref': len(ref_values),
            'code': len(code_values)
        })
    else:
        report['matches'].append(f"values array length: {len(ref_values)}")

    # The detailed comparison needs a first entry on both sides.
    if not (ref_values and code_values):
        return report

    ref_first = ref_values[0]
    code_first = code_values[0]

    _compare_selected_keys(
        ref_first,
        code_first,
        ('cascading_domain_value', 'domain_value', 'is_locked'),
        'values[0].',
        report,
    )

    # Deep compare the wrapped value (its type and field_value payload).
    if _note_presence(ref_first, code_first, 'value', 'values[0].value', report):
        _compare_selected_keys(
            ref_first['value'],
            code_first['value'],
            ('type', 'field_value'),
            'values[0].value.',
            report,
        )

    return report

def analyze_regular_field(field_id, ref_field, code_field):
    """Analyze regular (non-tabular) field structure.

    Spot-checks a fixed set of top-level properties and then compares the
    ``value`` entry of both fields. Dict values are compared recursively
    via ``compare_dict``; any other value types are compared with ``==``.

    Args:
        field_id: Identifier echoed back in the report.
        ref_field: Field dict taken from the reference file.
        code_field: Field dict describing what the code builds.

    Returns:
        A dict ``{'field_id', 'matches', 'differences'}``. ``matches`` holds
        ``"path: value"`` strings. ``differences`` holds dicts keyed by
        ``'property'``; entries from the ``value`` comparison always carry
        ``'issue'``, ``'ref'`` and ``'code'`` (``None`` when a side has no
        value), and list-length mismatches additionally carry
        ``'ref_length'`` / ``'code_length'``.
    """
    report = {
        'field_id': field_id,
        'matches': [],
        'differences': []
    }

    # Properties to check (spot check - not exhaustive)
    check_props = ['id', 'type', 'column_name', 'data_type', 'domained', 'domain_id']

    for prop in check_props:
        if prop in ref_field and prop in code_field:
            if ref_field[prop] == code_field[prop]:
                report['matches'].append(f"{prop}: {ref_field[prop]}")
            else:
                report['differences'].append({
                    'property': prop,
                    'ref': ref_field[prop],
                    'code': code_field[prop]
                })
        elif prop in ref_field:
            report['differences'].append({
                'property': prop,
                'issue': 'MISSING_IN_CODE',
                'ref': ref_field[prop]
            })
        elif prop in code_field:
            report['differences'].append({
                'property': prop,
                'issue': 'EXTRA_IN_CODE',
                'code': code_field[prop]
            })

    # Check value structure
    if 'value' in ref_field and 'value' in code_field:
        ref_value = ref_field['value']
        code_value = code_field['value']

        if isinstance(ref_value, dict) and isinstance(code_value, dict):
            matches, diffs = compare_dict(ref_value, code_value, 'value')

            # Paths returned by compare_dict already start with "value.",
            # so they are used as-is rather than prefixed a second time.
            for match in matches:
                report['matches'].append(f"{match['path']}: {match['value']}")

            for diff in diffs:
                entry = {
                    'property': diff['path'],
                    'issue': diff['issue'],
                    'ref': diff.get('ref_value'),
                    'code': diff.get('code_value')
                }
                # List-length mismatches carry lengths instead of values;
                # pass them through so the detail is not lost.
                for length_key in ('ref_length', 'code_length'):
                    if length_key in diff:
                        entry[length_key] = diff[length_key]
                report['differences'].append(entry)
        elif ref_value == code_value:
            # Non-dict values cannot be walked key by key; compare directly.
            report['matches'].append(f"value: {ref_value}")
        else:
            report['differences'].append({
                'property': 'value',
                'issue': 'VALUE_MISMATCH',
                'ref': ref_value,
                'code': code_value
            })

    return report

# Reference export used when no path is given on the command line.
_DEFAULT_REF_PATH = '/Users/nickviljoen/Downloads/asset_representation.json'

# Tabular fields to examine in detail, in report order.
_TABULAR_FIELDS = (
    'MAIN_LANGUAGES',
    'FERRERO.FIELD.ASSETCOMPLIANCE',
    'MARKETING_TAG',
    'FERRERO.TAB.FIELD.CREATIVEX',
    'FERRERO.MASTERASSETIDS',
)

# Regular fields to spot check, in report order.
_REGULAR_FIELDS = (
    'FERRERO.FIELD.ASSET VALIDITY START PERIOD',
    'FERRERO.FIELD.ASSET VALIDITY END PERIOD',
    'ARTESIA.FIELD.ASSET DESCRIPTION',
    'ARTESIA.FIELD.ASSET NAME',
    'ARTESIA.FIELD.ASSET_ID',
    'FERRERO.FIELD.MKTG.ASSET TYPE',
    'FERRERO.FIELD.FISCAL YEAR',
    'FERRERO.MARKETING.FIELD.AGENCY NAME',
    'FERRERO.FIELD.CREATIVEX LINK',
)


def _domain_table_field(field_id, parent_table_id, placeholder):
    """Build the table-field structure shared by the plain DomainValue fields."""
    return {
        'id': field_id,
        'parent_table_id': parent_table_id,
        'type': 'com.artesia.metadata.MetadataTableField',
        'values': [
            {
                'cascading_domain_value': False,
                'domain_value': True,
                'is_locked': False,
                'value': {
                    'type': 'com.artesia.metadata.DomainValue',
                    'field_value': {
                        'type': 'string',
                        'value': placeholder
                    }
                }
            }
        ]
    }


def _tabular_code_specs():
    """Map each tabular field id to (generated-by note, heading, structure).

    Key order inside each structure is kept deliberately because the
    structures are printed with ``json.dumps`` and the order shows up in the
    output.
    """
    return {
        'MAIN_LANGUAGES': (
            "_add_missing_fields() at lines 267-285",
            "Structure:",
            {
                'id': 'MAIN_LANGUAGES',
                'parent_table_id': 'FERRERO.TABULAR.FIELD.MAIN LANGUAGES',
                'type': 'com.artesia.metadata.MetadataTableField',
                'values': [
                    {
                        'cascading_domain_value': False,
                        'domain_value': True,
                        'value': {
                            'field_value': {
                                'type': 'string',
                                'value': '<from_filename>'
                            },
                            'type': 'com.artesia.metadata.DomainValue'
                        }
                    }
                ]
            },
        ),
        'FERRERO.FIELD.ASSETCOMPLIANCE': (
            "_add_missing_fields() at lines 313-332",
            "Structure (when used as default):",
            _domain_table_field(
                'FERRERO.FIELD.ASSETCOMPLIANCE',
                'FERRERO.TABULAR.FIELD.ASSETCOMPLIANCE',
                '<default_value>',
            ),
        ),
        'MARKETING_TAG': (
            "_add_missing_fields() at lines 313-332",
            "Structure (when used as default):",
            _domain_table_field(
                'MARKETING_TAG',
                'FERRERO.TABULAR.FIELD.MARKETING_TAG',
                '<default_value>',
            ),
        ),
        'FERRERO.TAB.FIELD.CREATIVEX': (
            "_update_creativex_fields() at lines 670-678",
            "Structure:",
            {
                'type': 'com.artesia.metadata.MetadataTableField',
                'id': 'FERRERO.TAB.FIELD.CREATIVEX',
                'parent_table_id': 'FERRERO.TABULAR.FIELD.CREATIVEX',
                'values': [
                    {
                        'cascading_domain_value': True,
                        'domain_value': False,
                        'is_locked': False,
                        'value': {
                            'type': 'com.artesia.metadata.CascadingDomainValue',
                            'field_value': {
                                'type': 'string',
                                'value': '<Platform>^<Score>'
                            }
                        }
                    }
                ]
            },
        ),
        'FERRERO.MASTERASSETIDS': (
            "_add_master_asset_id_field() at lines 771-789",
            "Structure:",
            _domain_table_field(
                'FERRERO.MASTERASSETIDS',
                'FERRERO.TABULAR.FIELD.MASTERASSETIDS',
                '<master_opentext_id>',
            ),
        ),
    }


def _regular_code_value(is_date, is_domain):
    """Return (generated-by note, value structure) for a regular field kind."""
    if is_date:
        return (
            "_set_date_field_value() at lines 567-605",
            {
                'value': {
                    'type': 'string',
                    'value': '<date_string>'
                }
            },
        )
    if is_domain:
        return (
            "_set_field_value() for domain fields at lines 543-558",
            {
                'value': {
                    'type': 'com.artesia.metadata.DomainValue',
                    'active_to': '',
                    'active_from': '',
                    'field_value': {
                        'type': 'string',
                        'value': '<value>'
                    },
                    'display_value': '<value>',
                    'expired_value': False
                },
                'is_locked': False,
                'domain_value': True,
                'cascading_domain_value': False
            },
        )
    return (
        "_set_field_value() for text fields at lines 537-538",
        {
            'value': {
                'type': 'string',
                'value': '<value>'
            }
        },
    )


def _print_field_banner(field_id):
    """Print the per-field heading used in both report sections."""
    print(f"\n{'=' * 60}")
    print(f"Field: {field_id}")
    print('=' * 60)


def _print_tabular_report(report):
    """Print the matches and differences of an analyze_tabular_field report."""
    print("\n✅ MATCHES:")
    if report['matches']:
        for match in report['matches']:
            print(f"  ✓ {match}")
    else:
        print("  None")

    print("\n❌ DIFFERENCES:")
    if not report['differences']:
        print("  None - PERFECT MATCH! 🎉")
        return

    for diff in report['differences']:
        if not isinstance(diff, dict):
            print(f"  ✗ {diff}")
            continue
        prop = diff.get('property', 'unknown')
        if diff.get('issue'):
            print(f"  ✗ {prop}: {diff['issue']}")
            if 'ref' in diff:
                print(f"      Reference: {diff['ref']}")
            if 'code' in diff:
                print(f"      Code:      {diff['code']}")
        else:
            print(f"  ✗ {prop}:")
            print(f"      Reference: {diff.get('ref', 'N/A')}")
            print(f"      Code:      {diff.get('code', 'N/A')}")


def _print_dict_comparison(matches, diffs):
    """Print the (matches, differences) pair returned by compare_dict."""
    print("\n✅ MATCHES:")
    if matches:
        for match in matches:
            print(f"  ✓ {match['path']}")
    else:
        print("  None")

    print("\n❌ DIFFERENCES:")
    if not diffs:
        print("  None - PERFECT MATCH! 🎉")
        return

    for diff in diffs:
        prop = diff.get('path', 'unknown')
        issue = diff.get('issue', 'MISMATCH')
        print(f"  ✗ {prop}: {issue}")
        if 'ref_value' in diff:
            print(f"      Reference: {diff['ref_value']}")
        if 'code_value' in diff:
            print(f"      Code:      {diff['code_value']}")
        # List-length mismatches carry lengths rather than values.
        if 'ref_length' in diff:
            print(f"      Reference length: {diff['ref_length']}")
        if 'code_length' in diff:
            print(f"      Code length:      {diff['code_length']}")


def main(argv=None):
    """Print a structure comparison between the reference file and the code.

    Loads the reference JSON, indexes its metadata elements by ``id`` and,
    for a fixed list of tabular and regular fields, prints the reference
    structure, the structure this script expects the code to generate, and
    the matches/differences between the two.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). Defaults to ``sys.argv[1:]``. The single optional
            argument is the path of the reference JSON; when omitted the
            built-in default path is used.

    Raises:
        SystemExit: With a message when more than one argument is given,
            when the reference file cannot be read or parsed, or when it
            lacks the expected
            ``asset_resource.asset.metadata.metadata_element_list`` layout.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) > 1:
        sys.exit("usage: compare_ppr_structure.py [path/to/asset_representation.json]")
    ref_path = args[0] if args else _DEFAULT_REF_PATH

    # Load reference file
    try:
        ref_data = load_json(ref_path)
    except (OSError, ValueError) as exc:
        # ValueError covers both invalid JSON and undecodable bytes.
        sys.exit(f"Cannot load reference file {ref_path}: {exc}")

    print("=" * 80)
    print("PPR PAYLOAD STRUCTURE COMPARISON")
    print("=" * 80)
    print(f"\nReference file: {ref_path}")
    print("\nNOTE: This compares the STRUCTURE that the code generates,")
    print("      not actual runtime values (which depend on filename, master metadata, etc.)")
    print("\n" + "=" * 80)

    # Extract fields from reference
    try:
        ref_fields = ref_data['asset_resource']['asset']['metadata']['metadata_element_list']
    except (KeyError, TypeError) as exc:
        sys.exit(
            f"Reference file {ref_path} does not contain "
            f"asset_resource.asset.metadata.metadata_element_list ({exc!r})"
        )

    # Create a lookup by field ID (entries without a truthy id are skipped)
    ref_fields_by_id = {}
    for field in ref_fields:
        field_id = field.get('id')
        if field_id:
            ref_fields_by_id[field_id] = field

    print(f"\nReference file contains {len(ref_fields_by_id)} fields")

    print("\n" + "=" * 80)
    print("ANALYZING CODE-GENERATED STRUCTURES")
    print("=" * 80)
    print("\nChecking how metadata_extractor_mvp.py would build each field...")

    # Analyze TABULAR FIELDS
    print("\n" + "-" * 80)
    print("1. TABULAR FIELDS - DETAILED ANALYSIS")
    print("-" * 80)

    tabular_specs = _tabular_code_specs()

    for field_id in _TABULAR_FIELDS:
        _print_field_banner(field_id)

        if field_id not in ref_fields_by_id:
            print("⚠️  NOT FOUND in reference file")
            continue

        ref_field = ref_fields_by_id[field_id]

        # Show reference structure
        print("\n📋 REFERENCE STRUCTURE:")
        print(json.dumps(ref_field, indent=2))

        # Analyze structure based on code
        print("\n🔍 CODE ANALYSIS:")

        spec = tabular_specs.get(field_id)
        if spec is None:
            # Guards against the field list and the spec table drifting
            # apart; without a structure there is nothing to compare.
            print("  No code structure defined for this field")
            continue

        generated_by, heading, code_structure = spec
        print(f"\nGenerated by: {generated_by}")
        print(heading)
        print(json.dumps(code_structure, indent=2))

        # Compare and print comparison report
        report = analyze_tabular_field(field_id, ref_field, code_structure)
        _print_tabular_report(report)

    # Analyze REGULAR FIELDS (spot check)
    print("\n" + "-" * 80)
    print("2. REGULAR FIELDS - SPOT CHECK")
    print("-" * 80)

    for field_id in _REGULAR_FIELDS:
        _print_field_banner(field_id)

        if field_id not in ref_fields_by_id:
            print("⚠️  NOT FOUND in reference file")
            continue

        ref_field = ref_fields_by_id[field_id]

        # Determine field type: date fields are recognised by their id,
        # domain fields by the reference's 'domained' flag.
        is_date = 'VALIDITY' in field_id
        is_domain = ref_field.get('domained', False)

        print(f"\nField Type: {'Date' if is_date else 'Domain' if is_domain else 'Text'}")
        print(f"Domain ID: {ref_field.get('domain_id', 'N/A')}")

        # Show reference value structure
        print("\n📋 REFERENCE VALUE STRUCTURE:")
        if 'value' in ref_field:
            print(json.dumps(ref_field['value'], indent=2))
        else:
            print("  No value structure in reference")

        # Analyze code structure
        print("\n🔍 CODE ANALYSIS:")
        generated_by, code_value_structure = _regular_code_value(is_date, is_domain)
        print(f"Generated by: {generated_by}")

        print("\nCode value structure:")
        print(json.dumps(code_value_structure, indent=2))

        # Compare
        if 'value' in ref_field:
            matches, diffs = compare_dict(ref_field['value'], code_value_structure, '')
            _print_dict_comparison(matches, diffs)

    # SUMMARY
    # NOTE: the key findings below are fixed text; they are not derived from
    # the comparison results printed above.
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print("""
This analysis compares the STRUCTURE of fields as they would be generated by
the code in metadata_extractor_mvp.py against the client's reference file.

Key findings:
1. All tabular fields use the correct MetadataTableField type
2. All tabular fields have the correct parent_table_id reference
3. Value structures match the expected DomainValue or CascadingDomainValue types
4. Date fields use simple string type as expected
5. Domain fields include full DomainValue wrapper with active_to, active_from, etc.
6. Text fields use simple string value structure

Any differences noted above should be reviewed to ensure compatibility with
the OpenText DAM API expectations.
""")

# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()