solventum-image-metadata/src/metadata_importer.py

"""Metadata importer for external files (CSV, Excel, JSON)."""

import pandas as pd
import json
from pathlib import Path
from typing import Dict, Optional, List, Tuple
from .utils import get_logger
from .field_mapper import FieldMapper

logger = get_logger(__name__)


class MetadataImporter:
    """Import metadata from various file formats (CSV, Excel, JSON)."""

    def import_from_csv(self, csv_path: str) -> Dict[str, Dict]:
        """
        Import metadata from CSV file.
        Expected columns: filename, title, subject/description, keywords

        The file is decoded as UTF-8 first. If decoding fails, latin1,
        iso-8859-1 and cp1252 are tried in that order. Only decoding errors
        trigger a retry, so a file that decodes but cannot be parsed (for
        example because it has no filename column) surfaces its real error
        instead of a misleading "unsupported encoding" message.

        Args:
            csv_path: Path to CSV file

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If no supported encoding can decode the file, or if
                the parsed data has no recognizable filename column.
        """
        try:
            df = None
            for encoding in ('utf-8', 'latin1', 'iso-8859-1', 'cp1252'):
                try:
                    df = pd.read_csv(csv_path, encoding=encoding)
                except UnicodeDecodeError:
                    # Wrong encoding guess: move on to the next candidate.
                    continue

                if encoding == 'utf-8':
                    logger.info(f"Loaded CSV with {len(df)} rows from {csv_path}")
                else:
                    logger.info(f"Loaded CSV with {len(df)} rows using {encoding} encoding")
                break

            if df is None:
                raise ValueError("Could not read CSV file with any supported encoding")

            # Parsing happens outside the encoding loop so its errors are
            # never mistaken for decoding failures.
            return self._parse_dataframe(df)

        except Exception as e:
            logger.error(f"Error importing from CSV: {e}")
            raise

    def import_from_excel(self, excel_path: str, sheet_name: Optional[str] = None) -> Dict[str, Dict]:
        """
        Load one sheet of an Excel workbook and convert it to a metadata map.

        Args:
            excel_path: Path to Excel file (.xlsx, .xls)
            sheet_name: Sheet to read; any falsy value (None, '') means
                the first sheet

        Returns:
            Dictionary mapping filename stems to metadata dicts
        """
        try:
            if not sheet_name:
                # No sheet requested: rely on pandas' default sheet.
                frame = pd.read_excel(excel_path)
                logger.info(f"Loaded Excel with {len(frame)} rows from first sheet")
            else:
                frame = pd.read_excel(excel_path, sheet_name=sheet_name)
                logger.info(f"Loaded Excel sheet '{sheet_name}' with {len(frame)} rows")

            return self._parse_dataframe(frame)

        except Exception as exc:
            # Log for diagnostics, then let the caller decide what to do.
            logger.error(f"Error importing from Excel: {exc}")
            raise

    def import_from_json(self, json_path: str) -> Dict[str, Dict]:
        """
        Import metadata from JSON file.

        Expected format:
        {
            "filename.pdf": {"title": "...", "subject": "...", "keywords": "..."},
            "image.jpg": {"title": "...", "subject": "...", "keywords": "..."}
        }

        Or array format:
        [
            {"filename": "file.pdf", "title": "...", "subject": "...", "keywords": "..."},
            {"filename": "image.jpg", "title": "...", "subject": "...", "keywords": "..."}
        ]

        Entries that cannot be interpreted are skipped rather than aborting
        the whole import: object-format values that are not JSON objects,
        array items that are not JSON objects, and array items without a
        usable filename.

        Args:
            json_path: Path to JSON file

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If the top-level JSON value is neither an object
                nor an array.
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            metadata_map = {}

            if isinstance(data, dict):
                # Object format: {"filename": {metadata}}
                for filename, metadata in data.items():
                    if not isinstance(metadata, dict):
                        # e.g. null or a bare string; nothing to normalize.
                        logger.warning(f"Skipping '{filename}': metadata is not an object")
                        continue

                    filename_stem = Path(filename).stem.lower()
                    metadata_map[filename_stem] = self._normalize_metadata(metadata)

            elif isinstance(data, list):
                # Array format: [{filename, metadata}]
                for item in data:
                    if not isinstance(item, dict):
                        continue

                    # The first filename-like key that is present wins.
                    filename = next(
                        (item[key] for key in ['filename', 'file', 'name', 'file_name'] if key in item),
                        None,
                    )

                    if not filename:
                        logger.warning(f"Skipping item without filename: {item}")
                        continue

                    # Coerce to str so numeric filenames (e.g. 1001) do not
                    # crash Path().
                    filename_stem = Path(str(filename)).stem.lower()
                    metadata_map[filename_stem] = self._normalize_metadata(item)

            else:
                raise ValueError("JSON must be an object or array")

            logger.info(f"Loaded {len(metadata_map)} metadata records from JSON")
            return metadata_map

        except Exception as e:
            logger.error(f"Error importing from JSON: {e}")
            raise

    def _parse_dataframe(self, df: pd.DataFrame) -> Dict[str, Dict]:
        """
        Parse pandas DataFrame into metadata map.

        Column names are detected case-insensitively from lists of common
        aliases. Rows whose filename cell is missing (NaN/None) or blank are
        skipped. If several rows share the same filename stem, the last row
        wins.

        Args:
            df: DataFrame with metadata

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If no filename column can be detected.
        """
        metadata_map = {}

        # Detect filename column (try common names)
        filename_col = self._detect_column(df, ['filename', 'file', 'name', 'file_name', 'path'])

        if not filename_col:
            raise ValueError("Could not find filename column in data. Tried: filename, file, name, file_name, path")

        # Detect metadata columns
        title_col = self._detect_column(df, ['title', 'heading', 'name', 'document_title'])
        subject_col = self._detect_column(df, ['subject', 'description', 'summary', 'desc', 'external_description', 'alt_text'])
        keywords_col = self._detect_column(df, ['keywords', 'tags', 'categories', 'labels'])

        logger.info(f"Detected columns - filename: {filename_col}, title: {title_col}, subject: {subject_col}, keywords: {keywords_col}")

        # Parse rows
        for _, row in df.iterrows():
            raw_filename = row.get(filename_col, '')

            # Test for missing values BEFORE converting to str: str(NaN) is
            # the truthy string 'nan', which would otherwise be stored as a
            # bogus 'nan' record.
            if pd.isna(raw_filename):
                continue

            filename = str(raw_filename).strip()
            if not filename:
                continue

            filename_stem = Path(filename).stem.lower()

            metadata_map[filename_stem] = {
                'title': self._get_value(row, title_col),
                'subject': self._get_value(row, subject_col),
                'keywords': self._get_value(row, keywords_col)
            }

        logger.info(f"Parsed {len(metadata_map)} metadata records from DataFrame")
        return metadata_map

    def _detect_column(self, df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
        """
        Detect column name from a list of candidates.

        Matching is case-insensitive and ignores whitespace around the
        column header. Column labels that are not strings (for example
        integer headers) are compared by their string form instead of
        raising. Candidates are tried in order, so earlier candidates take
        priority over later ones.

        Args:
            df: DataFrame to search
            candidates: List of possible column names

        Returns:
            Actual column label as it appears in the DataFrame if found,
            None otherwise
        """
        # Normalized header -> original label. str() guards against
        # non-string labels, which have no .lower().
        col_map = {str(col).strip().lower(): col for col in df.columns}

        # Try each candidate in priority order
        for candidate in candidates:
            key = candidate.strip().lower()
            if key in col_map:
                return col_map[key]

        return None

    def _get_value(self, row: pd.Series, column: Optional[str]) -> str:
        """
        Read one cell from a row as a stripped string.

        Args:
            row: DataFrame row
            column: Column name, or None when the column was not detected

        Returns:
            The cell's string form with surrounding whitespace removed, or
            an empty string when the column is None, absent from the row,
            or holds a missing value (NaN/None)
        """
        if column is not None:
            cell = row.get(column, '')
            if not pd.isna(cell):
                return str(cell).strip()

        # No column, or the cell is missing.
        return ''

    def _normalize_metadata(self, metadata: Dict) -> Dict[str, str]:
        """
        Reduce a raw metadata dict to the standard title/subject/keywords form.

        Each output field is filled from the first alias key that is present
        with a truthy value; fields with no such key stay empty strings.
        Keyword lists are joined with ', '.

        Args:
            metadata: Raw metadata dict

        Returns:
            Normalized metadata with title, subject, keywords keys
        """
        def first_populated(aliases):
            # First truthy value stored under any alias, or None.
            for alias in aliases:
                if alias in metadata and metadata[alias]:
                    return metadata[alias]
            return None

        result = {'title': '', 'subject': '', 'keywords': ''}

        title = first_populated(('title', 'heading', 'name', 'document_title'))
        if title is not None:
            result['title'] = str(title).strip()

        subject = first_populated(
            ('subject', 'description', 'summary', 'desc', 'external_description', 'alt_text')
        )
        if subject is not None:
            result['subject'] = str(subject).strip()

        keywords = first_populated(('keywords', 'tags', 'categories', 'labels'))
        if keywords is not None:
            if isinstance(keywords, list):
                # Arrays become a comma-separated string.
                result['keywords'] = ', '.join(str(entry) for entry in keywords)
            else:
                result['keywords'] = str(keywords).strip()

        return result

    def get_metadata_for_file(self, metadata_map: Dict[str, Dict], filename: str) -> Optional[Dict[str, str]]:
        """
        Look up the imported metadata belonging to a file.

        The lookup key is the lowercased filename stem, i.e. directory
        components and the final extension are ignored.

        Args:
            metadata_map: Dictionary returned by import_* methods
            filename: Filename to look up (with or without extension)

        Returns:
            Metadata dict if found, None otherwise
        """
        lookup_key = Path(filename).stem.lower()
        if lookup_key in metadata_map:
            return metadata_map[lookup_key]
        return None

    def validate_import(self, metadata_map: Dict[str, Dict]) -> Dict:
        """
        Summarize how complete an imported metadata map is.

        Args:
            metadata_map: Dictionary returned by import_* methods

        Returns:
            Dict with the total record count, the number of records having a
            non-empty title, subject and keywords respectively, and the
            number of records where all three are empty
        """
        records = list(metadata_map.values())

        def count_with(field):
            # Number of records whose value for this field is truthy.
            return sum(1 for record in records if record.get(field))

        empty = sum(
            1
            for record in records
            if not (record.get('title') or record.get('subject') or record.get('keywords'))
        )

        return {
            'total_records': len(metadata_map),
            'with_title': count_with('title'),
            'with_subject': count_with('subject'),
            'with_keywords': count_with('keywords'),
            'empty_records': empty
        }

    def preview_file_structure(self, file_path: str, file_type: str = 'auto') -> Tuple[List[str], List[Dict], Dict]:
        """
        Preview file structure and suggest field mappings without importing.

        At most the first 10 rows/entries are loaded, and the first 5 of
        those are returned as samples. CSV files are decoded as UTF-8 first,
        then latin1, iso-8859-1 and cp1252 on decoding errors, matching
        import_from_csv.

        Args:
            file_path: Path to file (CSV, Excel, JSON)
            file_type: File type ('csv', 'excel', 'json', or 'auto' to
                infer it from the file extension)

        Returns:
            Tuple of (column_names, sample_rows, suggested_mapping), where
            suggested_mapping is whatever FieldMapper.suggest_mapping
            returns for the column names

        Raises:
            ValueError: If the file type is unsupported, the CSV cannot be
                decoded, or the JSON is neither a non-empty array nor an
                object.
        """
        if file_type == 'auto':
            ext = Path(file_path).suffix.lower()
            if ext == '.csv':
                file_type = 'csv'
            elif ext in ['.xlsx', '.xls']:
                file_type = 'excel'
            elif ext == '.json':
                file_type = 'json'
            else:
                raise ValueError(f"Unsupported file type: {ext}")

        # Load file
        if file_type == 'csv':
            df = None
            for encoding in ('utf-8', 'latin1', 'iso-8859-1', 'cp1252'):
                try:
                    df = pd.read_csv(file_path, encoding=encoding, nrows=10)
                    break
                except UnicodeDecodeError:
                    continue
            if df is None:
                raise ValueError("Could not read CSV file with any supported encoding")
        elif file_type == 'excel':
            df = pd.read_excel(file_path, nrows=10)
        elif file_type == 'json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, list) and len(data) > 0:
                df = pd.DataFrame(data[:10])
            elif isinstance(data, dict):
                # Convert {"filename": {metadata}} to a list of flat rows.
                # Values that are not objects contribute only the filename
                # instead of crashing the ** expansion.
                items = [
                    {'filename': k, **(v if isinstance(v, dict) else {})}
                    for k, v in list(data.items())[:10]
                ]
                df = pd.DataFrame(items)
            else:
                raise ValueError("JSON format not supported for preview")
        else:
            # An explicit but unknown file_type previously left df unbound.
            raise ValueError(f"Unsupported file type: {file_type}")

        # Get column names
        columns = df.columns.tolist()

        # Get sample rows
        sample_rows = df.head(5).to_dict('records')

        # Suggest field mapping
        mapper = FieldMapper()
        suggestions = mapper.suggest_mapping(columns)

        return (columns, sample_rows, suggestions)

    def import_with_mapping(self, file_path: str, mapping: Dict[str, str], file_type: str = 'auto') -> Dict[str, Dict]:
        """
        Import file with custom field mapping.

        Each row is passed, as a dict, to FieldMapper.apply_mapping together
        with the mapping; the 'title', 'subject' and 'keywords' entries of
        the result are stored as stripped strings. Rows whose filename cell
        is missing (NaN/None) or blank are skipped, and missing mapped values
        become empty strings rather than the text 'nan' or 'None'.

        Args:
            file_path: Path to file
            mapping: Field mapping {source_field: target_field}
            file_type: File type ('csv', 'excel', 'json', or 'auto' to
                infer it from the file extension)

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If the file type is unsupported, the CSV cannot be
                decoded, the JSON is neither an object nor an array, or no
                filename column is found.
        """
        if file_type == 'auto':
            ext = Path(file_path).suffix.lower()
            if ext == '.csv':
                file_type = 'csv'
            elif ext in ['.xlsx', '.xls']:
                file_type = 'excel'
            elif ext == '.json':
                file_type = 'json'
            else:
                raise ValueError(f"Unsupported file type: {ext}")

        # Load file
        if file_type == 'csv':
            df = None
            # Same encoding fallback order as import_from_csv.
            for encoding in ('utf-8', 'latin1', 'iso-8859-1', 'cp1252'):
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
            if df is None:
                raise ValueError("Could not read CSV file with any supported encoding")
        elif file_type == 'excel':
            df = pd.read_excel(file_path)
        elif file_type == 'json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, list):
                df = pd.DataFrame(data)
            elif isinstance(data, dict):
                # Values that are not objects contribute only the filename
                # instead of crashing the ** expansion.
                items = [
                    {'filename': k, **(v if isinstance(v, dict) else {})}
                    for k, v in data.items()
                ]
                df = pd.DataFrame(items)
            else:
                raise ValueError("JSON must be an object or array")
        else:
            # Previously fell through and failed later on an unbound df.
            raise ValueError(f"Unsupported file type: {file_type}")

        def as_text(value) -> str:
            # Scalar missing values (None, NaN, NA) become ''. Containers
            # are passed to str() untouched because pd.isna() would return
            # an array for them.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return ''
            return str(value).strip()

        # Apply field mapper
        mapper = FieldMapper()
        metadata_map = {}

        # Find filename column: first column, in file order, with a known
        # name. str() guards against non-string column labels.
        filename_col = None
        for col in df.columns:
            if str(col).lower() in ['filename', 'file', 'name', 'file_name']:
                filename_col = col
                break

        if filename_col is None:
            raise ValueError("Could not find filename column")

        # Process each row
        for _, row in df.iterrows():
            raw_filename = row.get(filename_col, '')

            # Test for missing values BEFORE str(): str(NaN) is the truthy
            # string 'nan' and would otherwise create a bogus 'nan' record.
            if pd.isna(raw_filename):
                continue

            filename = str(raw_filename).strip()
            if not filename:
                continue

            filename_stem = Path(filename).stem.lower()

            # Apply mapping to transform row data
            row_dict = row.to_dict()
            metadata = mapper.apply_mapping(row_dict, mapping)

            metadata_map[filename_stem] = {
                'title': as_text(metadata.get('title', '')),
                'subject': as_text(metadata.get('subject', '')),
                'keywords': as_text(metadata.get('keywords', ''))
            }

        logger.info(f"Imported {len(metadata_map)} records with custom mapping")
        return metadata_map