Phase 2.4: Metadata import from external files (CSV, Excel, JSON)

Created comprehensive metadata_importer.py module:
- CSV import with multiple encoding support (UTF-8, Latin1, ISO-8859-1, CP1252)
- Excel import (.xlsx, .xls) with sheet selection
- JSON import (object and array formats)
- Intelligent column detection for filename, title, subject, keywords
- Fuzzy column matching (case-insensitive, multiple aliases)
- Metadata normalization to standard format
- Import validation with statistics
- File lookup by filename stem (case-insensitive)

Web interface enhancements:
- /import-metadata endpoint for file uploads
- Import section UI (appears when Import source selected)
- Real-time import statistics display (records, title/subject/keywords counts)
- Import session management with unique session IDs
- Visual feedback (active state, success/error messages)
- Validation: requires import file before processing with import source

Import workflow:
1. User selects "Import from File" metadata source
2. Import section appears with file chooser
3. User uploads CSV/Excel/JSON with metadata
4. System validates and shows statistics
5. User uploads files to process
6. System matches files to imported metadata by filename

Supported import formats:
- CSV: filename, title, subject/description, keywords columns
- Excel: Any sheet with filename and metadata columns
- JSON: {filename: {metadata}} or [{filename, metadata}] formats

Technical features:
- Pandas DataFrame parsing for CSV/Excel
- Flexible column name detection (10+ aliases per field)
- NaN/null value handling
- List/array keyword support
- Unicode filename support

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-25 15:39:27 +00:00 · 2026-01-25 15:39:27 +00:00 · 03079080d8
commit 03079080d8
parent 1bf2483f2d
3 changed files with 524 additions and 9 deletions
--- a/src/metadata_importer.py
+++ b/src/metadata_importer.py
@ -0,0 +1,305 @@
+"""Metadata importer for external files (CSV, Excel, JSON)."""
+
+import pandas as pd
+import json
+from pathlib import Path
+from typing import Dict, Optional, List
+from .utils import get_logger
+
+logger = get_logger(__name__)
+
+
+class MetadataImporter:
+    """Import metadata from external files (CSV, Excel, JSON).
+
+    Every ``import_*`` method returns a dictionary keyed by the lower-cased
+    filename stem (``"Report.PDF"`` -> ``"report"``). Each value is a dict
+    with the string keys ``title``, ``subject`` and ``keywords``.
+    """
+
+    # Accepted column / key aliases, listed in priority order.
+    _FILENAME_COLUMNS = ('filename', 'file', 'name', 'file_name', 'path')
+    _FILENAME_KEYS = ('filename', 'file', 'name', 'file_name')
+    _TITLE_KEYS = ('title', 'heading', 'name', 'document_title')
+    _SUBJECT_KEYS = ('subject', 'description', 'summary', 'desc', 'external_description', 'alt_text')
+    _KEYWORD_KEYS = ('keywords', 'tags', 'categories', 'labels')
+
+    # Tried in order when a CSV is not valid UTF-8. cp1252 must come before
+    # latin1: latin1 maps every possible byte and therefore never fails, so
+    # anything listed after it would be unreachable. 'iso-8859-1' is only
+    # another name for the latin1 codec, so it is not listed separately.
+    _FALLBACK_ENCODINGS = ('cp1252', 'latin1')
+
+    def import_from_csv(self, csv_path: str) -> Dict[str, Dict]:
+        """
+        Import metadata from CSV file.
+        Expected columns: filename, title, subject/description, keywords
+
+        Args:
+            csv_path: Path to CSV file
+
+        Returns:
+            Dictionary mapping filename stems to metadata dicts
+
+        Raises:
+            ValueError: If the file cannot be decoded with any supported
+                encoding, or if no filename column is present
+        """
+        try:
+            # Decoding and parsing are separate steps so that a parse error
+            # (e.g. missing filename column) is never mistaken for an
+            # encoding problem.
+            df = self._read_csv(csv_path)
+            return self._parse_dataframe(df)
+
+        except Exception as e:
+            logger.error(f"Error importing from CSV: {e}")
+            raise
+
+    def _read_csv(self, csv_path: str) -> pd.DataFrame:
+        """Read a CSV as UTF-8, falling back to the legacy encodings."""
+        try:
+            df = pd.read_csv(csv_path, encoding='utf-8')
+            logger.info(f"Loaded CSV with {len(df)} rows from {csv_path}")
+            return df
+        except UnicodeDecodeError:
+            pass
+
+        for encoding in self._FALLBACK_ENCODINGS:
+            try:
+                df = pd.read_csv(csv_path, encoding=encoding)
+            except UnicodeDecodeError:
+                continue
+            logger.info(f"Loaded CSV with {len(df)} rows using {encoding} encoding")
+            return df
+
+        raise ValueError("Could not read CSV file with any supported encoding")
+
+    def import_from_excel(self, excel_path: str, sheet_name: Optional[str] = None) -> Dict[str, Dict]:
+        """
+        Import metadata from Excel file.
+
+        Args:
+            excel_path: Path to Excel file (.xlsx, .xls)
+            sheet_name: Name of sheet to read (None = first sheet)
+
+        Returns:
+            Dictionary mapping filename stems to metadata dicts
+
+        Raises:
+            ValueError: If no filename column is present
+        """
+        try:
+            if sheet_name:
+                df = pd.read_excel(excel_path, sheet_name=sheet_name)
+                logger.info(f"Loaded Excel sheet '{sheet_name}' with {len(df)} rows")
+            else:
+                df = pd.read_excel(excel_path)
+                logger.info(f"Loaded Excel with {len(df)} rows from first sheet")
+
+            return self._parse_dataframe(df)
+
+        except Exception as e:
+            logger.error(f"Error importing from Excel: {e}")
+            raise
+
+    def import_from_json(self, json_path: str) -> Dict[str, Dict]:
+        """
+        Import metadata from JSON file.
+
+        Expected format:
+        {
+            "filename.pdf": {"title": "...", "subject": "...", "keywords": "..."},
+            "image.jpg": {"title": "...", "subject": "...", "keywords": "..."}
+        }
+
+        Or array format:
+        [
+            {"filename": "file.pdf", "title": "...", "subject": "...", "keywords": "..."},
+            {"filename": "image.jpg", "title": "...", "subject": "...", "keywords": "..."}
+        ]
+
+        Entries whose metadata is not a JSON object, or that carry no usable
+        filename, are skipped with a warning instead of aborting the import.
+
+        Args:
+            json_path: Path to JSON file
+
+        Returns:
+            Dictionary mapping filename stems to metadata dicts
+
+        Raises:
+            ValueError: If the top-level JSON value is neither object nor array
+        """
+        try:
+            # 'utf-8-sig' also accepts plain UTF-8 but tolerates the BOM that
+            # some Windows editors prepend (json.load rejects a leading BOM).
+            with open(json_path, 'r', encoding='utf-8-sig') as f:
+                data = json.load(f)
+
+            metadata_map = {}
+
+            if isinstance(data, dict):
+                # Object format: {"filename": {metadata}}
+                for filename, metadata in data.items():
+                    stem = self._stem_key(filename)
+                    if not stem or not isinstance(metadata, dict):
+                        logger.warning(f"Skipping invalid entry for '{filename}'")
+                        continue
+                    metadata_map[stem] = self._normalize_metadata(metadata)
+
+            elif isinstance(data, list):
+                # Array format: [{filename, metadata}]
+                for item in data:
+                    if not isinstance(item, dict):
+                        continue
+
+                    # First filename alias that holds a non-empty value
+                    filename = next(
+                        (item[key] for key in self._FILENAME_KEYS if item.get(key)),
+                        None,
+                    )
+                    stem = self._stem_key(filename) if filename else ''
+
+                    if not stem:
+                        logger.warning(f"Skipping item without filename: {item}")
+                        continue
+
+                    metadata_map[stem] = self._normalize_metadata(item)
+
+            else:
+                raise ValueError("JSON must be an object or array")
+
+            logger.info(f"Loaded {len(metadata_map)} metadata records from JSON")
+            return metadata_map
+
+        except Exception as e:
+            logger.error(f"Error importing from JSON: {e}")
+            raise
+
+    @staticmethod
+    def _stem_key(filename) -> str:
+        """Return the lookup key for a filename: its stem, stripped and lower-cased."""
+        # str() guards against non-string names such as a numeric JSON value.
+        return Path(str(filename).strip()).stem.lower()
+
+    def _parse_dataframe(self, df: pd.DataFrame) -> Dict[str, Dict]:
+        """
+        Parse pandas DataFrame into metadata map.
+
+        Rows whose filename cell is empty or NaN are skipped.
+
+        Args:
+            df: DataFrame with metadata
+
+        Returns:
+            Dictionary mapping filename stems to metadata dicts
+
+        Raises:
+            ValueError: If no filename column can be detected
+        """
+        metadata_map = {}
+
+        # Detect filename column (try common names)
+        filename_col = self._detect_column(df, self._FILENAME_COLUMNS)
+
+        # Compare with None: a detected column label may itself be falsy (e.g. 0)
+        if filename_col is None:
+            raise ValueError("Could not find filename column in data. Tried: filename, file, name, file_name, path")
+
+        # Detect metadata columns
+        title_col = self._detect_column(df, self._TITLE_KEYS)
+        subject_col = self._detect_column(df, self._SUBJECT_KEYS)
+        keywords_col = self._detect_column(df, self._KEYWORD_KEYS)
+
+        logger.info(f"Detected columns - filename: {filename_col}, title: {title_col}, subject: {subject_col}, keywords: {keywords_col}")
+
+        # Parse rows
+        for _, row in df.iterrows():
+            # _get_value tests for NaN *before* converting to str, so a blank
+            # cell yields '' rather than the literal text 'nan'.
+            filename = self._get_value(row, filename_col)
+            if not filename:
+                continue
+
+            filename_stem = self._stem_key(filename)
+            if not filename_stem:
+                continue
+
+            metadata_map[filename_stem] = {
+                'title': self._get_value(row, title_col),
+                'subject': self._get_value(row, subject_col),
+                'keywords': self._get_value(row, keywords_col)
+            }
+
+        logger.info(f"Parsed {len(metadata_map)} metadata records from DataFrame")
+        return metadata_map
+
+    def _detect_column(self, df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
+        """
+        Detect column name from a list of candidates.
+
+        Matching ignores case and surrounding whitespace. Column labels that
+        are not strings (e.g. numeric Excel headers) are compared by their
+        string form.
+
+        Args:
+            df: DataFrame to search
+            candidates: Possible column names, in priority order
+
+        Returns:
+            Actual column label if found, None otherwise
+        """
+        # Normalized label -> original label
+        col_map = {str(col).strip().lower(): col for col in df.columns}
+
+        for candidate in candidates:
+            normalized = candidate.strip().lower()
+            if normalized in col_map:
+                return col_map[normalized]
+
+        return None
+
+    def _get_value(self, row: pd.Series, column: Optional[str]) -> str:
+        """
+        Get value from row, handling None column and NaN values.
+
+        Args:
+            row: DataFrame row
+            column: Column name (can be None)
+
+        Returns:
+            Stripped string value, or empty string when missing/NaN
+        """
+        if column is None:
+            return ''
+
+        value = row.get(column, '')
+
+        if pd.isna(value):
+            return ''
+
+        return str(value).strip()
+
+    @staticmethod
+    def _first_value(metadata: Dict, keys):
+        """Return the first truthy value stored under any of ``keys``, else ''."""
+        for key in keys:
+            value = metadata.get(key)
+            if value:
+                return value
+        return ''
+
+    def _normalize_metadata(self, metadata: Dict) -> Dict[str, str]:
+        """
+        Normalize metadata dictionary to standard format.
+
+        For each field the first alias holding a non-empty value wins.
+        Keyword lists are joined with ", "; empty and null items are dropped.
+
+        Args:
+            metadata: Raw metadata dict
+
+        Returns:
+            Normalized metadata with title, subject, keywords keys
+        """
+        keywords = self._first_value(metadata, self._KEYWORD_KEYS)
+        if isinstance(keywords, list):
+            items = (str(item).strip() for item in keywords if item is not None)
+            keywords_text = ', '.join(item for item in items if item)
+        else:
+            keywords_text = str(keywords).strip()
+
+        return {
+            'title': str(self._first_value(metadata, self._TITLE_KEYS)).strip(),
+            'subject': str(self._first_value(metadata, self._SUBJECT_KEYS)).strip(),
+            'keywords': keywords_text
+        }
+
+    def get_metadata_for_file(self, metadata_map: Dict[str, Dict], filename: str) -> Optional[Dict[str, str]]:
+        """
+        Get metadata for a specific file from imported map.
+
+        Args:
+            metadata_map: Dictionary returned by import_* methods
+            filename: Filename to look up (with or without extension)
+
+        Returns:
+            Metadata dict if found, None otherwise
+        """
+        return metadata_map.get(self._stem_key(filename))
+
+    def validate_import(self, metadata_map: Dict[str, Dict]) -> Dict:
+        """
+        Validate imported metadata and return statistics.
+
+        Args:
+            metadata_map: Dictionary returned by import_* methods
+
+        Returns:
+            Dict with total_records, with_title, with_subject, with_keywords
+            and empty_records (records where all three fields are empty)
+        """
+        stats = {
+            'total_records': len(metadata_map),
+            'with_title': 0,
+            'with_subject': 0,
+            'with_keywords': 0,
+            'empty_records': 0
+        }
+
+        for metadata in metadata_map.values():
+            filled = [field for field in ('title', 'subject', 'keywords') if metadata.get(field)]
+            for field in filled:
+                stats[f'with_{field}'] += 1
+            if not filled:
+                stats['empty_records'] += 1
+
+        return stats
--- a/templates/index.html
+++ b/templates/index.html
@ -358,6 +358,45 @@
            background: #5a6268;
        }

+        /* Import Metadata Section */
+        .import-section {
+            background: white;
+            border-radius: 8px;
+            padding: 15px;
+            margin-bottom: 15px;
+            border: 2px dashed #dee2e6;
+        }
+
+        .import-section.active {
+            border-color: #28a745;
+            background: #f0fff4;
+        }
+
+        .btn-import {
+            background: linear-gradient(135deg, #17a2b8 0%, #138496 100%);
+            color: white;
+            border: none;
+            padding: 8px 20px;
+            border-radius: 20px;
+            cursor: pointer;
+            font-size: 14px;
+            font-weight: 600;
+            transition: transform 0.2s;
+        }
+
+        .btn-import:hover {
+            transform: translateY(-2px);
+        }
+
+        .import-stats {
+            font-size: 12px;
+            color: #28a745;
+            margin-top: 10px;
+            padding: 8px;
+            background: white;
+            border-radius: 5px;
+        }
+
        @media (max-width: 768px) {
            .metadata-comparison {
                grid-template-columns: 1fr;
@ -383,7 +422,7 @@
            <div class="upload-section">
                <div class="metadata-source-selector">
                    <label for="metadataSource">Metadata Source:</label>
-                    <select id="metadataSource" class="source-select">
+                    <select id="metadataSource" class="source-select" onchange="handleSourceChange()">
                        <option value="excel" selected>📊 Excel Lookup (Fastest)</option>
                        <option value="manual">✏️ Manual Entry</option>
                        <option value="import">📂 Import from File</option>
@ -392,6 +431,18 @@
                    <span class="source-info">ℹ️ Choose how to generate metadata</span>
                </div>

+                <div class="import-section" id="importSection" style="display: none;">
+                    <h4 style="margin-bottom: 10px; color: #495057;">📂 Import Metadata File</h4>
+                    <p style="font-size: 13px; color: #6c757d; margin-bottom: 10px;">
+                        Upload a CSV, Excel, or JSON file with metadata (columns: filename, title, subject, keywords)
+                    </p>
+                    <input type="file" id="importFileInput" accept=".csv,.xlsx,.xls,.json" style="display: none;">
+                    <button class="btn-import" onclick="document.getElementById('importFileInput').click()">
+                        📤 Choose Import File
+                    </button>
+                    <div id="importStats" class="import-stats" style="display: none;"></div>
+                </div>
+
                <div class="upload-area" id="uploadArea">
                    <div class="upload-icon">📁</div>
                    <h3>Drop files here or click to browse</h3>
@ -438,6 +489,7 @@
    <script>
        let currentFiles = [];
        let sessionId = null;
+        let importSessionId = null;

        const uploadArea = document.getElementById('uploadArea');
        const fileInput = document.getElementById('fileInput');
@ -475,6 +527,70 @@
            }
        });

+        // Import file input
+        const importFileInput = document.getElementById('importFileInput');
+        importFileInput.addEventListener('change', handleImportFile);
+
+        /**
+         * Show the import panel only while "Import from File" is the
+         * selected metadata source; hide it for every other source.
+         */
+        function handleSourceChange() {
+            const selectedSource = document.getElementById('metadataSource').value;
+            const panel = document.getElementById('importSection');
+
+            panel.style.display = selectedSource === 'import' ? 'block' : 'none';
+        }
+
+        /**
+         * Upload the chosen metadata file to /import-metadata and show the
+         * resulting import statistics.
+         *
+         * On success the returned import session ID is stored in
+         * `importSessionId` so the next file upload can reference it.
+         *
+         * @param {Event} e - change event from the import file input
+         */
+        async function handleImportFile(e) {
+            const input = e.target;
+            const file = input.files[0];
+            if (!file) return;
+
+            hideAlerts();
+            showInfo(`Importing metadata from ${file.name}...`);
+
+            const formData = new FormData();
+            formData.append('import_file', file);
+
+            try {
+                const response = await fetch('/import-metadata', {
+                    method: 'POST',
+                    body: formData
+                });
+
+                const data = await response.json();
+
+                if (data.error) {
+                    showError(data.error);
+                    return;
+                }
+
+                // Store import session ID
+                importSessionId = data.import_session_id;
+
+                // Display stats. The server message contains the uploaded
+                // file's name, so it is inserted as text (never as HTML) to
+                // keep markup in a filename from being interpreted.
+                const stats = data.stats;
+                const summary = document.createElement('small');
+                summary.textContent =
+                    `Title: ${stats.with_title}/${stats.total_records} • ` +
+                    `Subject: ${stats.with_subject}/${stats.total_records} • ` +
+                    `Keywords: ${stats.with_keywords}/${stats.total_records}`;
+
+                const importStats = document.getElementById('importStats');
+                importStats.textContent = '';
+                importStats.append(`✅ ${data.message}`, document.createElement('br'), summary);
+                importStats.style.display = 'block';
+
+                // Mark import section as active
+                document.getElementById('importSection').classList.add('active');
+
+                showSuccess(`✅ ${data.message}`);
+
+            } catch (error) {
+                showError(`Import failed: ${error.message}`);
+            } finally {
+                // Clear the selection so picking the same file again (e.g.
+                // after correcting it) still fires a change event.
+                input.value = '';
+            }
+        }
+
        function handleFileSelect(e) {
            const files = e.target.files;
            if (files.length > 0) {
@ -492,6 +608,13 @@

            const metadataSource = document.getElementById('metadataSource').value;

+            // Validate import source
+            if (metadataSource === 'import' && !importSessionId) {
+                showError('Please import a metadata file first using the "Choose Import File" button');
+                hideSpinner();
+                return;
+            }
+
            // Show specific message for AI processing
            if (metadataSource === 'ai') {
                showInfo(`🤖 Generating AI metadata for ${files.length} file(s)... This may take 10-30 seconds per file.`);
@ -501,6 +624,9 @@

            const formData = new FormData();
            formData.append('metadata_source', metadataSource);
+            if (importSessionId) {
+                formData.append('import_session_id', importSessionId);
+            }
            for (let file of files) {
                formData.append('files', file);
            }
--- a/web_app.py
+++ b/web_app.py
@ -21,6 +21,7 @@ from src.file_detector import FileDetector, FileType
 from src.excel_metadata_lookup import ExcelMetadataLookup
 from src.config import Config
 from src.metadata_analyzer import MetadataAnalyzer
+from src.metadata_importer import MetadataImporter

 def safe_filename(filename):
    """Sanitize filename while preserving Unicode characters (Chinese, Japanese, Korean)."""
@ -78,6 +79,9 @@ updaters = {
 # Store file processing sessions
 sessions = {}

+# Store imported metadata from external files
+imported_metadata = {}
+
 def get_metadata_lookup():
    """Get or create metadata lookup instance."""
    global metadata_lookup
@ -119,14 +123,28 @@ def upload_file():

    # Get metadata source choice (excel, manual, ai, import)
    metadata_source = request.form.get('metadata_source', 'excel')
+    import_session_id = request.form.get('import_session_id', '')  # For import source

    results = []
    session_id = str(len(sessions) + 1)
-    sessions[session_id] = {'files': [], 'metadata_source': metadata_source}
+    sessions[session_id] = {
+        'files': [],
+        'metadata_source': metadata_source,
+        'import_session_id': import_session_id
+    }

    # Get metadata lookup (only if using Excel source)
    lookup = get_metadata_lookup() if metadata_source == 'excel' else None

+    # Get imported metadata (only if using import source)
+    import_map = None
+    if metadata_source == 'import' and import_session_id and import_session_id in imported_metadata:
+        import_map = imported_metadata[import_session_id]
+        importer = MetadataImporter()
+    elif metadata_source == 'import':
+        # Import source selected but no import session available
+        return jsonify({'error': 'Please import a metadata file first using the Import button'}), 400
+
    for file in files:
        try:
            # Save uploaded file
@ -234,13 +252,28 @@ def upload_file():
                    }

            elif metadata_source == 'import':
-                # Import from file - will be implemented in Phase 2.4
-                # For now, return placeholder
-                new_metadata = {
-                    'title': Path(filename).stem,
-                    'subject': 'Import feature not yet implemented',
-                    'keywords': ''
-                }
+                # Import from external file (CSV, Excel, JSON)
+                if import_map and importer:
+                    # Look up metadata for this file
+                    imported = importer.get_metadata_for_file(import_map, filename)
+
+                    if imported:
+                        new_metadata = imported
+                        excel_found = True  # Mark as found in import
+                    else:
+                        # No metadata found in import file
+                        new_metadata = {
+                            'title': Path(filename).stem,
+                            'subject': f'No metadata found in imported file for {filename}',
+                            'keywords': ''
+                        }
+                else:
+                    # Import source not available
+                    new_metadata = {
+                        'title': Path(filename).stem,
+                        'subject': 'Import metadata not loaded',
+                        'keywords': ''
+                    }

            file_info = {
                'success': True,
@ -405,6 +438,57 @@ def download_file(filename):
        return send_file(filepath, as_attachment=True)
    return jsonify({'error': 'File not found'}), 404

+@app.route('/import-metadata', methods=['POST'])
+def import_metadata():
+    """Import metadata from an uploaded external file (CSV, Excel, JSON).
+
+    Expects a multipart upload in the ``import_file`` form field. The file is
+    written to a uniquely named temporary file in the upload folder, parsed
+    with MetadataImporter, and the parsed records are kept in the
+    module-level ``imported_metadata`` dict under a new import session ID.
+
+    Returns:
+        JSON with ``success``, ``import_session_id``, ``stats`` and
+        ``message`` on success; JSON with ``error`` and status 400 for a
+        missing file or unsupported extension, or status 500 if the import
+        itself fails.
+    """
+    import logging
+    import uuid
+
+    if 'import_file' not in request.files:
+        return jsonify({'error': 'No file provided'}), 400
+
+    file = request.files['import_file']
+    if file.filename == '':
+        return jsonify({'error': 'No file selected'}), 400
+
+    temp_path = None
+    try:
+        import_filename = safe_filename(file.filename)
+        file_ext = Path(import_filename).suffix.lower()
+
+        # Reject unsupported formats before anything is written to disk
+        if file_ext not in ('.csv', '.xlsx', '.xls', '.json'):
+            return jsonify({'error': f'Unsupported file format: {file_ext}. Supported: .csv, .xlsx, .xls, .json'}), 400
+
+        # Save under a unique name: reusing the client's filename could
+        # overwrite (and then delete) an uploaded document with the same name.
+        temp_path = Path(app.config['UPLOAD_FOLDER']) / f"import_{uuid.uuid4().hex}{file_ext}"
+        file.save(str(temp_path))
+
+        # Import based on file type
+        importer = MetadataImporter()
+        if file_ext == '.csv':
+            metadata_map = importer.import_from_csv(str(temp_path))
+        elif file_ext == '.json':
+            metadata_map = importer.import_from_json(str(temp_path))
+        else:
+            metadata_map = importer.import_from_excel(str(temp_path))
+
+        # Validate import
+        stats = importer.validate_import(metadata_map)
+
+        # Random ID: a counter based on len() is guessable and can collide
+        # when requests overlap or entries are ever removed.
+        import_session_id = f"import_{uuid.uuid4().hex}"
+        imported_metadata[import_session_id] = metadata_map
+
+        return jsonify({
+            'success': True,
+            'import_session_id': import_session_id,
+            'stats': stats,
+            'message': f'Imported {stats["total_records"]} metadata records from {import_filename}'
+        })
+
+    except Exception as e:
+        logging.getLogger(__name__).error(f"Import failed: {e}")
+        return jsonify({'error': f'Import failed: {str(e)}'}), 500
+
+    finally:
+        # Always remove the temp file, including when the import failed
+        if temp_path is not None and temp_path.exists():
+            temp_path.unlink()
+
@app.route('/stats')
 def get_stats():
    """Get Excel metadata statistics."""