# solventum-image-metadata/backend/app/processors/field_mapper.py

"""Field mapping with automatic detection and manual override."""

import json
from typing import Dict, List, Optional, Tuple
from difflib import SequenceMatcher
from pathlib import Path
from .utils import get_logger

logger = get_logger(__name__)


class FieldMapper:
    """Map source fields to standard metadata fields with fuzzy matching.

    Source field names are normalised (trimmed, lower-cased, spaces and
    hyphens turned into underscores) and scored against the aliases of every
    standard field. Mappings can additionally be validated, applied to data
    rows and stored as named presets in a JSON file.
    """

    # Standard metadata fields used in Oliver Metadata Tool
    STANDARD_FIELDS = ['title', 'subject', 'keywords', 'description']

    # Common aliases for fuzzy matching (case-insensitive).
    # Some aliases are listed under more than one standard field; ties are
    # resolved in declaration order, except that a source field named exactly
    # like a standard field always maps to that field.
    FIELD_ALIASES = {
        'title': [
            'title', 'name', 'heading', 'filename', 'file_name', 'document_title',
            'asset_title', 'resource_title', 'object_name', 'label'
        ],
        'subject': [
            'subject', 'description', 'summary', 'abstract', 'alt_text',
            'external_description', 'caption', 'about', 'overview', 'details',
            'desc', 'long_description', 'content'
        ],
        'keywords': [
            'keywords', 'tags', 'categories', 'labels', 'subjects', 'topics',
            'taxonomy', 'classification', 'key_words', 'search_terms'
        ],
        'description': [
            'description', 'desc', 'summary', 'notes', 'comments', 'remarks',
            'details', 'about', 'information', 'info'
        ]
    }

    # Similarity threshold for fuzzy matching (0.0 to 1.0)
    SIMILARITY_THRESHOLD = 0.6

    # Threshold used by auto_map() when strict=True
    STRICT_SIMILARITY_THRESHOLD = 0.8

    # Minimum score granted when one name contains the other
    SUBSTRING_MATCH_SCORE = 0.85

    # A source name shorter than this never earns the "source is contained in
    # alias" bonus; otherwise names such as "x" or "h" would match almost
    # every alias with high confidence.
    MIN_SUBSTRING_LENGTH = 3

    def __init__(self, presets_path: Optional[str] = None) -> None:
        """
        Initialize field mapper.

        Args:
            presets_path: Path to JSON file for saving/loading mapping presets.
                Defaults to 'field_mapping_presets.json' in the working directory.
        """
        self.presets_path = presets_path or 'field_mapping_presets.json'

    # ------------------------------------------------------------------
    # Matching
    # ------------------------------------------------------------------

    def auto_map(self, source_fields: List[str], strict: bool = False) -> Dict[str, Tuple[str, float]]:
        """
        Automatically map source fields to standard fields using fuzzy matching.

        Args:
            source_fields: List of field names from source data
            strict: If True, only accept matches above high confidence threshold (0.8)

        Returns:
            Dictionary mapping {source_field: (target_field, confidence_score)}
            Example: {'File Name': ('title', 1.0), 'Image Title': ('title', 0.85)}
            Source fields without a sufficiently good match are omitted.
        """
        threshold = self.STRICT_SIMILARITY_THRESHOLD if strict else self.SIMILARITY_THRESHOLD
        mapping = {}

        for source_field in source_fields:
            best_match = self._find_best_match(source_field, threshold)
            if best_match is None:
                continue
            target_field, score = best_match
            mapping[source_field] = (target_field, score)
            logger.info(f"Auto-mapped '{source_field}' -> '{target_field}' (confidence: {score:.2f})")

        return mapping

    @staticmethod
    def _normalize_field_name(name: str) -> str:
        """Return the canonical form of a field name used for comparisons.

        Non-string input normalises to '' so that it never matches anything.
        """
        if not isinstance(name, str):
            return ''
        return name.strip().lower().replace(' ', '_').replace('-', '_')

    def _score_alias(self, normalized: str, alias: str) -> float:
        """Score a normalised source name against a single alias (0.0 to 1.0)."""
        if normalized == alias:
            return 1.0

        score = SequenceMatcher(None, normalized, alias).ratio()

        # Substring bonus. The "source inside alias" direction is only
        # trusted for names long enough to be meaningful.
        contained = alias in normalized or (
            len(normalized) >= self.MIN_SUBSTRING_LENGTH and normalized in alias
        )
        if contained:
            score = max(score, self.SUBSTRING_MATCH_SCORE)
        return score

    def _rank_fields(self, source_field: str) -> List[Tuple[str, float]]:
        """Return (standard_field, best_alias_score) pairs, best first.

        Ties keep the declaration order of FIELD_ALIASES, except that a
        standard field whose own name equals the normalised source name is
        ranked ahead of fields that merely list that name as an alias.
        An empty (or non-string) source name yields an empty list.
        """
        normalized = self._normalize_field_name(source_field)
        if not normalized:
            return []

        ranked = []
        for standard_field, aliases in self.FIELD_ALIASES.items():
            if normalized == standard_field:
                best = 1.0
            else:
                best = max(
                    (self._score_alias(normalized, alias) for alias in aliases),
                    default=0.0,
                )
            ranked.append((standard_field, best))

        # list.sort is stable, so equal keys keep FIELD_ALIASES order.
        ranked.sort(key=lambda item: (-item[1], item[0] != normalized))
        return ranked

    def _find_best_match(self, source_field: str, threshold: float = 0.6) -> Optional[Tuple[str, float]]:
        """
        Find best matching standard field for source field.

        Args:
            source_field: Source field name
            threshold: Minimum similarity score (0.0 to 1.0)

        Returns:
            Tuple of (target_field, confidence_score) or None when no field
            reaches the threshold (a score of 0.0 is never a match).
        """
        ranked = self._rank_fields(source_field)
        if not ranked:
            return None

        best_field, best_score = ranked[0]
        if best_score <= 0.0 or best_score < threshold:
            return None
        return (best_field, best_score)

    def suggest_mapping(self, source_fields: List[str]) -> Dict:
        """
        Generate mapping suggestions with confidence scores and alternatives.

        Args:
            source_fields: List of source field names

        Returns:
            Dictionary with one entry per source field:
            {
                'source_field': {
                    'best_match': 'target_field',
                    'confidence': 0.85,
                    'alternatives': [
                        {'field': 'other_target', 'confidence': 0.65},
                        ...
                    ]
                }
            }
            Fields without any candidate get best_match None, confidence 0.0
            and an empty alternatives list.
        """
        suggestions = {}

        for source_field in source_fields:
            matches = self._find_all_matches(source_field)

            if not matches:
                suggestions[source_field] = {
                    'best_match': None,
                    'confidence': 0.0,
                    'alternatives': []
                }
                continue

            best_field, best_score = matches[0]
            suggestions[source_field] = {
                'best_match': best_field,
                'confidence': best_score,
                'alternatives': [
                    {'field': field, 'confidence': score}
                    for field, score in matches[1:3]  # Top 2 alternatives
                ]
            }

        return suggestions

    def _find_all_matches(self, source_field: str, min_threshold: float = 0.4) -> List[Tuple[str, float]]:
        """
        Find all matching standard fields above threshold, sorted by score.

        Args:
            source_field: Source field name
            min_threshold: Minimum similarity score

        Returns:
            List of (target_field, score) tuples sorted by score descending
        """
        return [
            (field, score)
            for field, score in self._rank_fields(source_field)
            if score >= min_threshold
        ]

    # ------------------------------------------------------------------
    # Validation and application
    # ------------------------------------------------------------------

    def validate_mapping(self, mapping: Dict[str, str]) -> Dict[str, List[str]]:
        """
        Validate a field mapping configuration.

        Args:
            mapping: Dictionary mapping {source_field: target_field}

        Returns:
            Dictionary with validation results:
            {
                'valid': [list of valid mappings],
                'invalid': [list of invalid mappings],
                'warnings': [list of warnings]
            }
            A warning is emitted for every target field that receives more
            than one source field.
        """
        result = {
            'valid': [],
            'invalid': [],
            'warnings': []
        }

        # Sources grouped by the (valid) target they map to
        target_usage = {}

        for source_field, target_field in mapping.items():
            if target_field not in self.STANDARD_FIELDS:
                result['invalid'].append(
                    f"'{target_field}' is not a valid target field (source: '{source_field}')"
                )
                continue

            result['valid'].append(f"'{source_field}' -> '{target_field}'")
            target_usage.setdefault(target_field, []).append(source_field)

        for target_field, sources in target_usage.items():
            if len(sources) > 1:
                result['warnings'].append(
                    f"Multiple source fields map to '{target_field}': {', '.join(sources)}"
                )

        return result

    def apply_mapping(self, data: Dict[str, str], mapping: Dict[str, str]) -> Dict[str, str]:
        """
        Apply field mapping to transform source data to standard format.

        Args:
            data: Source data dictionary
            mapping: Field mapping {source_field: target_field}

        Returns:
            Transformed data with standard field names. Every standard field
            is present; fields without data are ''. When several source
            fields map to the same target their values are joined with '; '.
            None and empty values are skipped and non-string values are
            converted with str().
        """
        result = {field: '' for field in self.STANDARD_FIELDS}

        for source_field, target_field in mapping.items():
            if source_field not in data or target_field not in self.STANDARD_FIELDS:
                continue

            value = data[source_field]
            if value is None:
                continue

            text = value if isinstance(value, str) else str(value)
            if not text:
                # Nothing to contribute; avoids dangling "; " separators.
                continue

            existing = result[target_field]
            result[target_field] = f"{existing}; {text}" if existing else text

        return result

    # ------------------------------------------------------------------
    # Presets
    # ------------------------------------------------------------------

    def save_preset(self, name: str, mapping: Dict[str, str], description: str = "") -> None:
        """
        Save mapping preset to file.

        An existing preset with the same name is replaced.

        Args:
            name: Preset name
            mapping: Field mapping dictionary
            description: Optional description

        Raises:
            Exception: Whatever serialising or writing the presets file
                raised; the failure is logged before being re-raised.
        """
        presets = self._load_presets()

        presets[name] = {
            'mapping': mapping,
            'description': description,
            'created_at': self._get_timestamp()
        }

        try:
            self._write_presets(presets)
            logger.info(f"Saved mapping preset: {name}")
        except Exception as e:
            logger.error(f"Failed to save preset '{name}': {e}")
            raise

    def load_preset(self, name: str) -> Optional[Dict[str, str]]:
        """
        Load mapping preset from file.

        Args:
            name: Preset name

        Returns:
            Mapping dictionary ({} when the stored preset has no mapping),
            or None if the preset is not found
        """
        presets = self._load_presets()

        if name in presets:
            logger.info(f"Loaded mapping preset: {name}")
            entry = presets[name]
            if not isinstance(entry, dict):
                return {}
            return entry.get('mapping', {})

        logger.warning(f"Preset not found: {name}")
        return None

    def list_presets(self) -> List[Dict[str, object]]:
        """
        List all saved presets.

        Returns:
            List of preset information dictionaries with the keys 'name',
            'description', 'created_at' and 'fields' (number of mapped fields)
        """
        summaries = []
        for name, data in self._load_presets().items():
            # Tolerate hand-edited files whose entries are not objects.
            entry = data if isinstance(data, dict) else {}
            summaries.append({
                'name': name,
                'description': entry.get('description', ''),
                'created_at': entry.get('created_at', ''),
                'fields': len(entry.get('mapping') or {})
            })
        return summaries

    def delete_preset(self, name: str) -> bool:
        """
        Delete a mapping preset.

        Args:
            name: Preset name

        Returns:
            True if deleted, False if not found

        Raises:
            Exception: Whatever writing the presets file raised; the failure
                is logged before being re-raised.
        """
        presets = self._load_presets()

        if name not in presets:
            return False

        del presets[name]

        try:
            self._write_presets(presets)
            logger.info(f"Deleted mapping preset: {name}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete preset '{name}': {e}")
            raise

    def _write_presets(self, presets: Dict) -> None:
        """Write all presets to the presets file.

        The payload is serialised before the file is opened: opening with
        'w' truncates immediately, so a serialisation error raised afterwards
        would otherwise destroy every stored preset.
        """
        payload = json.dumps(presets, indent=2)
        with open(self.presets_path, 'w', encoding='utf-8') as f:
            f.write(payload)

    def _load_presets(self) -> Dict:
        """Load all presets from file.

        Returns:
            The stored presets, or {} when the file does not exist, cannot be
            read or parsed, or does not contain a JSON object at top level.
        """
        try:
            with open(self.presets_path, 'r', encoding='utf-8') as f:
                presets = json.load(f)
        except FileNotFoundError:
            return {}
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.error(f"Failed to load presets: {e}")
            return {}

        if not isinstance(presets, dict):
            logger.error(
                f"Failed to load presets: expected a JSON object, got {type(presets).__name__}"
            )
            return {}
        return presets

    def _get_timestamp(self) -> str:
        """Get current local timestamp as ISO format string."""
        from datetime import datetime
        return datetime.now().isoformat()

    # ------------------------------------------------------------------
    # Coverage
    # ------------------------------------------------------------------

    def get_unmapped_fields(self, source_fields: List[str], mapping: Dict[str, str]) -> List[str]:
        """
        Get list of source fields that are not mapped.

        Args:
            source_fields: All source field names
            mapping: Current mapping dictionary

        Returns:
            List of unmapped source fields, in their original order
        """
        return [field for field in source_fields if field not in mapping]

    def get_mapping_coverage(self, source_fields: List[str], mapping: Dict[str, str]) -> Dict:
        """
        Calculate mapping coverage statistics.

        Args:
            source_fields: All source field names
            mapping: Current mapping dictionary

        Returns:
            Statistics dictionary with coverage info. 'mapped_fields' counts
            the source fields that have a mapping entry, so mapping entries
            for fields absent from source_fields do not inflate the numbers
            and mapped + unmapped always equals the total.
            'unique_targets_used' counts distinct targets across the whole
            mapping.
        """
        unmapped = self.get_unmapped_fields(source_fields, mapping)
        total_fields = len(source_fields)
        mapped_fields = total_fields - len(unmapped)

        return {
            'total_source_fields': total_fields,
            'mapped_fields': mapped_fields,
            'unmapped_fields': len(unmapped),
            'coverage_percent': (mapped_fields / total_fields * 100) if total_fields > 0 else 0.0,
            'unique_targets_used': len(set(mapping.values())),
            'unmapped_field_list': unmapped
        }