creative-x-ferrero/scripts/generate_brand_mappings.py

#!/usr/bin/env python3
"""
Generate brand mappings by matching Ferrero codes to Creative X brands

Usage:
    python generate_brand_mappings.py
    python generate_brand_mappings.py --output mappings_generated.json
"""

import argparse
import sys
import json
from pathlib import Path
from difflib import SequenceMatcher

# Put the directory two levels above this file (the parent of the folder that
# contains this script) at the front of sys.path. The `config` and `core`
# imports below are presumably resolved from there -- verify against the
# repository layout.
sys.path.insert(0, str(Path(__file__).parent.parent))

from config import load_config
from core.data_loader import DataLoader
from core.api_client import CreativeXAPIClient


def similarity(a: str, b: str) -> float:
    """Return the case-insensitive similarity ratio of two strings.

    Both inputs are lower-cased and compared with
    ``difflib.SequenceMatcher`` (no junk heuristic).

    Args:
        a: First string.
        b: Second string.

    Returns:
        float: Ratio in the range [0.0, 1.0]; 1.0 means the lower-cased
        strings are identical.
    """
    left = a.lower()
    right = b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()


def normalize_name(name: str) -> str:
    """Normalize brand name for comparison

    The name is lower-cased, '&' is spelled out as the word "and", and
    '-' / '_' are treated as word separators. Every run of whitespace is
    then collapsed to a single space and the ends are trimmed.

    Args:
        name: Raw brand name.

    Returns:
        str: Normalized name with words separated by exactly one space.
    """
    spaced = (name.lower()
              # Pad "and" so an unspaced "A&B" becomes the same words as "A and B".
              .replace('&', ' and ')
              .replace('-', ' ')
              .replace('_', ' '))
    # split()/join collapses runs of any length; a single
    # replace('  ', ' ') pass would leave "a - b" as "a  b".
    return ' '.join(spaced.split())


def find_best_match(ferrero_name: str, creativex_brands: list) -> tuple:
    """
    Find best matching Creative X brand for Ferrero brand

    Each candidate is scored as 60% string similarity of the normalized
    names plus 40% word overlap (the share of the Ferrero name's words that
    also appear in the Creative X name).

    Args:
        ferrero_name: Ferrero brand name
        creativex_brands: List of Creative X brands; each item is indexed
            with ['name']

    Returns:
        tuple: (best_match, alternatives)
            best_match: dict with keys 'brand' (the original list item),
                'score' (combined score), 'similarity' and 'word_overlap',
                or None when creativex_brands is empty
            alternatives: list of up to three next-best matches in the same
                dict form, best first
    """
    ferrero_normalized = normalize_name(ferrero_name)

    # These depend only on the Ferrero name, so compute them once
    # rather than once per candidate.
    ferrero_words = set(ferrero_normalized.split())
    ferrero_word_count = max(len(ferrero_words), 1)  # avoid division by zero

    matches = []
    for brand in creativex_brands:
        creativex_normalized = normalize_name(brand['name'])

        # Character-level similarity of the normalized names
        score = similarity(ferrero_normalized, creativex_normalized)

        # Share of the Ferrero words that appear verbatim in the candidate
        creativex_words = set(creativex_normalized.split())
        word_overlap = len(ferrero_words & creativex_words) / ferrero_word_count

        # Combine scores
        combined_score = (score * 0.6) + (word_overlap * 0.4)

        matches.append({
            'brand': brand,
            'score': combined_score,
            'similarity': score,
            'word_overlap': word_overlap
        })

    if not matches:
        return None, []

    # Best first; the sort is stable, so ties keep their input order.
    matches.sort(key=lambda x: x['score'], reverse=True)

    return matches[0], matches[1:4]


def _confidence_tier(confidence: float) -> str:
    """Return the 'match_type' label for a combined match score."""
    # Thresholds are also quoted in the section headers printed by
    # generate_mappings; keep the two in sync.
    if confidence >= 0.9:
        return 'high_confidence'
    if confidence >= 0.6:
        return 'medium_confidence'
    return 'low_confidence'


def _print_matches(entries, limit=None, max_alternatives=0, show_ferrero=False):
    """Print one report line per (code, mapping, alternatives) entry.

    Args:
        entries: List of (code, mapping, alternatives) tuples.
        limit: Show only the first `limit` entries followed by an
            "... and N more" line; None shows everything.
        max_alternatives: How many alternative brand names to list under
            each entry; 0 hides the alternatives line.
        show_ferrero: Whether to print the Ferrero name under each entry.
    """
    shown = entries if limit is None else entries[:limit]
    for code, mapping, alts in shown:
        print(f"  {code:10s} → {mapping['creativex_name']:40s} ({mapping['confidence']:.1%})")
        if show_ferrero:
            print(f"             Ferrero: {mapping['ferrero_name']}")
        if max_alternatives and alts:
            alt_names = [a['brand']['name'] for a in alts[:max_alternatives]]
            print(f"             Alternatives: {', '.join(alt_names)}")

    if limit is not None and len(entries) > limit:
        print(f"  ... and {len(entries) - limit} more")


def generate_mappings(data_loader: DataLoader, api_client: CreativeXAPIClient):
    """
    Generate brand mappings by matching Ferrero to Creative X

    Fetches the Creative X brands from the '/dimensions' endpoint, picks the
    best-scoring Creative X brand for every Ferrero brand, prints a report
    grouped by confidence tier and returns the result.

    Args:
        data_loader: Ferrero data loader; get_all_brands() is iterated with
            .items() as code -> brand name
        api_client: Creative X API client

    Returns:
        dict: {'brand_mappings': {code: mapping}, 'summary': {tier: count}}.
        Each mapping has the keys 'creativex_name', 'creativex_id',
        'ferrero_name', 'confidence' (rounded to 3 decimals) and
        'match_type'. Brands without any candidate are counted under
        'no_match' and are absent from 'brand_mappings'.
    """
    print("=" * 70)
    print("BRAND MAPPING GENERATOR")
    print("=" * 70)

    # Get Ferrero brands
    ferrero_brands = data_loader.get_all_brands()
    print(f"\nFerrero brands: {len(ferrero_brands)}")

    # Get Creative X brands
    print("\nFetching Creative X brands...")
    # NOTE(review): this reaches into a private method of the API client;
    # switch to a public accessor if the client offers one.
    response = api_client._make_request('GET', '/dimensions')
    # `or []` also covers a response where 'brands' is present but null.
    creativex_brands = response.get('brands') or []
    print(f"Creative X brands: {len(creativex_brands)}")

    # Generate mappings
    print("\n" + "=" * 70)
    print("MATCHING BRANDS")
    print("=" * 70)

    generated_mappings = {}
    tiers = {
        'high_confidence': [],
        'medium_confidence': [],
        'low_confidence': [],
    }
    no_match = []

    for code, ferrero_name in ferrero_brands.items():
        best_match, alternatives = find_best_match(ferrero_name, creativex_brands)

        if not best_match:
            no_match.append((code, ferrero_name))
            continue

        # The tier is decided on the unrounded score; the stored value is rounded.
        confidence = best_match['score']
        matched_brand = best_match['brand']
        tier = _confidence_tier(confidence)

        mapping = {
            'creativex_name': matched_brand['name'],
            'creativex_id': matched_brand['id'],
            'ferrero_name': ferrero_name,
            'confidence': round(confidence, 3),
            'match_type': tier
        }

        tiers[tier].append((code, mapping, alternatives))
        generated_mappings[code] = mapping

    high_confidence = tiers['high_confidence']
    medium_confidence = tiers['medium_confidence']
    low_confidence = tiers['low_confidence']

    # Display results
    print(f"\n✅ High Confidence Matches ({len(high_confidence)}): >= 90% match")
    print("-" * 70)
    _print_matches(high_confidence, limit=10)  # Show first 10

    print(f"\n⚠️  Medium Confidence Matches ({len(medium_confidence)}): 60-90% match")
    print("-" * 70)
    _print_matches(medium_confidence, limit=10, max_alternatives=2)

    print(f"\n❌ Low Confidence Matches ({len(low_confidence)}): < 60% match")
    print("-" * 70)
    # Low-confidence matches need manual work, so list all of them in full.
    _print_matches(low_confidence, max_alternatives=3, show_ferrero=True)

    print(f"\n🚫 No Match Found ({len(no_match)})")
    print("-" * 70)
    for code, name in no_match:
        print(f"  {code:10s} → {name}")

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Total Ferrero brands:     {len(ferrero_brands)}")
    print(f"Total Creative X brands:  {len(creativex_brands)}")
    print("")
    print(f"✅ High confidence:       {len(high_confidence)} (review and approve)")
    print(f"⚠️  Medium confidence:     {len(medium_confidence)} (needs validation)")
    print(f"❌ Low confidence:        {len(low_confidence)} (needs manual matching)")
    print(f"🚫 No match:              {len(no_match)} (may not exist in Creative X)")
    print("=" * 70)

    return {
        'brand_mappings': generated_mappings,
        'summary': {
            'high_confidence': len(high_confidence),
            'medium_confidence': len(medium_confidence),
            'low_confidence': len(low_confidence),
            'no_match': len(no_match)
        }
    }


def main():
    """CLI entry point

    Parses the command line, loads the configuration, builds the data
    loader and API client, generates the mappings and writes them as JSON
    to the --output path. Any failing step prints an error message and
    exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Generate brand mappings by matching Ferrero to Creative X'
    )

    parser.add_argument('--output', default='mappings_generated.json',
                        help='Output file for generated mappings')
    parser.add_argument('--auto-approve', action='store_true',
                        help='Auto-approve high confidence matches only')

    args = parser.parse_args()

    if args.auto_approve:
        # NOTE(review): nothing in this script consumes the flag yet. Tell
        # the user instead of silently ignoring it, until the intended
        # approval behaviour is implemented.
        print("Warning: --auto-approve currently has no effect; "
              "all matches are written for manual review")

    # Load configuration
    try:
        config = load_config()
    except Exception as e:
        print(f"Error loading configuration: {e}")
        sys.exit(1)

    # Initialize components
    try:
        data_loader = DataLoader(str(config.data_json_path))
        api_client = CreativeXAPIClient(
            config.api_base_url,
            config.access_token,
            config.api_max_retries,
            config.api_timeout
        )
    except Exception as e:
        print(f"Error initializing components: {e}")
        sys.exit(1)

    # Generate mappings (performs the API request), handled like the steps above
    try:
        result = generate_mappings(data_loader, api_client)
    except Exception as e:
        print(f"Error generating mappings: {e}")
        sys.exit(1)

    # Save to file
    output_path = Path(args.output)
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
    except OSError as e:
        print(f"Error writing {output_path}: {e}")
        sys.exit(1)

    print(f"\n✓ Generated mappings saved to: {output_path}")
    print("\nNext steps:")
    print(f"  1. Review {output_path}")
    print("  2. Validate medium/low confidence matches")
    print("  3. Manually add any missing brands")
    print("  4. Merge into mappings.json")
    print("  5. Test with: python scripts/validate_mappings.py --show-supported")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()