Phase 2.4: Metadata import from external files (CSV, Excel, JSON)

Created comprehensive metadata_importer.py module:
- CSV import with multiple encoding support (UTF-8, Latin1, ISO-8859-1, CP1252)
- Excel import (.xlsx, .xls) with sheet selection
- JSON import (object and array formats)
- Intelligent column detection for filename, title, subject, keywords
- Fuzzy column matching (case-insensitive, multiple aliases)
- Metadata normalization to standard format
- Import validation with statistics
- File lookup by filename stem (case-insensitive)

Web interface enhancements:
- /import-metadata endpoint for file uploads
- Import section UI (appears when Import source selected)
- Real-time import statistics display (records, title/subject/keywords counts)
- Import session management with unique session IDs
- Visual feedback (active state, success/error messages)
- Validation: requires import file before processing with import source

Import workflow:
1. User selects "Import from File" metadata source
2. Import section appears with file chooser
3. User uploads CSV/Excel/JSON with metadata
4. System validates and shows statistics
5. User uploads files to process
6. System matches files to imported metadata by filename

Supported import formats:
- CSV: filename, title, subject/description, keywords columns
- Excel: Any sheet with filename and metadata columns
- JSON: {filename: {metadata}} or [{filename, metadata}] formats

Technical features:
- Pandas DataFrame parsing for CSV/Excel
- Flexible column name detection (4–6 aliases per field)
- NaN/null value handling
- List/array keyword support
- Unicode filename support

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
SamoilenkoVadym 2026-01-25 15:39:27 +00:00
parent 1bf2483f2d
commit 03079080d8
3 changed files with 524 additions and 9 deletions

305
src/metadata_importer.py Normal file
View file

@ -0,0 +1,305 @@
"""Metadata importer for external files (CSV, Excel, JSON)."""
import pandas as pd
import json
from pathlib import Path
from typing import Dict, Optional, List
from .utils import get_logger
logger = get_logger(__name__)
class MetadataImporter:
    """Import metadata from various file formats (CSV, Excel, JSON).

    Every ``import_*`` method returns a dictionary keyed by the lower-cased
    filename stem (the name without directory or extension). Each value is a
    dict with the string keys ``title``, ``subject`` and ``keywords``.
    """

    # Accepted spellings for each logical field, in priority order. They are
    # shared by the DataFrame (CSV/Excel) and JSON code paths so both accept
    # exactly the same names.
    # NOTE(review): 'name' is an alias for both the filename and the title, so
    # a source whose only naming field is 'name' gets the filename as its
    # title as well. Confirm this is intended.
    _FILENAME_ALIASES = ('filename', 'file', 'name', 'file_name', 'path')
    _TITLE_ALIASES = ('title', 'heading', 'name', 'document_title')
    _SUBJECT_ALIASES = ('subject', 'description', 'summary', 'desc',
                        'external_description', 'alt_text')
    _KEYWORD_ALIASES = ('keywords', 'tags', 'categories', 'labels')

    # Encodings tried in order when reading CSV files.
    # NOTE: latin1 (alias iso-8859-1) can decode any byte sequence, so the
    # entries after it are only a safety net.
    _CSV_ENCODINGS = ('utf-8', 'latin1', 'iso-8859-1', 'cp1252')

    def import_from_csv(self, csv_path: str) -> Dict[str, Dict]:
        """
        Import metadata from CSV file.

        Expected columns: filename, title, subject/description, keywords

        Args:
            csv_path: Path to CSV file

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If the file cannot be decoded with any supported
                encoding, or if no filename column is present.
        """
        try:
            # Decoding and parsing are separate steps so that a structural
            # problem (e.g. a missing filename column) is reported as such
            # instead of being mistaken for an encoding failure.
            df = self._read_csv(csv_path)
            return self._parse_dataframe(df)
        except Exception as e:
            logger.error(f"Error importing from CSV: {e}")
            raise

    def _read_csv(self, csv_path: str) -> pd.DataFrame:
        """Read *csv_path* with the first encoding that decodes it."""
        for encoding in self._CSV_ENCODINGS:
            try:
                df = pd.read_csv(csv_path, encoding=encoding)
            except UnicodeDecodeError:
                # Only a decoding failure justifies trying the next encoding;
                # any other error is a real problem and must propagate.
                continue
            if encoding == 'utf-8':
                logger.info(f"Loaded CSV with {len(df)} rows from {csv_path}")
            else:
                logger.info(f"Loaded CSV with {len(df)} rows using {encoding} encoding")
            return df
        raise ValueError("Could not read CSV file with any supported encoding")

    def import_from_excel(self, excel_path: str, sheet_name: Optional[str] = None) -> Dict[str, Dict]:
        """
        Import metadata from Excel file.

        Args:
            excel_path: Path to Excel file (.xlsx, .xls)
            sheet_name: Name of sheet to read (None = first sheet)

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If no filename column is present.
        """
        try:
            if sheet_name:
                df = pd.read_excel(excel_path, sheet_name=sheet_name)
                logger.info(f"Loaded Excel sheet '{sheet_name}' with {len(df)} rows")
            else:
                # Do not forward None: pandas would then return every sheet
                # as a dict instead of a single DataFrame.
                df = pd.read_excel(excel_path)
                logger.info(f"Loaded Excel with {len(df)} rows from first sheet")
            return self._parse_dataframe(df)
        except Exception as e:
            logger.error(f"Error importing from Excel: {e}")
            raise

    def import_from_json(self, json_path: str) -> Dict[str, Dict]:
        """
        Import metadata from JSON file.

        Expected format:
            {
                "filename.pdf": {"title": "...", "subject": "...", "keywords": "..."},
                "image.jpg": {"title": "...", "subject": "...", "keywords": "..."}
            }

        Or array format:
            [
                {"filename": "file.pdf", "title": "...", "subject": "...", "keywords": "..."},
                {"filename": "image.jpg", "title": "...", "subject": "...", "keywords": "..."}
            ]

        Array items that are not objects, or that carry no usable filename,
        are skipped.

        Args:
            json_path: Path to JSON file

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If the top-level JSON value is neither an object nor
                an array (invalid JSON also surfaces as a ValueError subclass).
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            metadata_map: Dict[str, Dict] = {}

            if isinstance(data, dict):
                # Object format: {"filename": {metadata}}
                for filename, metadata in data.items():
                    key = self._filename_key(filename)
                    if not key:
                        logger.warning(f"Skipping item without filename: {metadata}")
                        continue
                    metadata_map[key] = self._normalize_metadata(metadata)
            elif isinstance(data, list):
                # Array format: [{filename, metadata}]
                for item in data:
                    if not isinstance(item, dict):
                        continue
                    filename = self._first_value(item, self._FILENAME_ALIASES)
                    key = '' if filename is None else self._filename_key(filename)
                    if not key:
                        logger.warning(f"Skipping item without filename: {item}")
                        continue
                    metadata_map[key] = self._normalize_metadata(item)
            else:
                raise ValueError("JSON must be an object or array")

            logger.info(f"Loaded {len(metadata_map)} metadata records from JSON")
            return metadata_map
        except Exception as e:
            logger.error(f"Error importing from JSON: {e}")
            raise

    def _parse_dataframe(self, df: pd.DataFrame) -> Dict[str, Dict]:
        """
        Parse pandas DataFrame into metadata map.

        Rows whose filename cell is missing (NaN/None) or blank are skipped.

        Args:
            df: DataFrame with metadata

        Returns:
            Dictionary mapping filename stems to metadata dicts

        Raises:
            ValueError: If no filename column can be detected.
        """
        filename_col = self._detect_column(df, self._FILENAME_ALIASES)
        if filename_col is None:
            tried = ', '.join(self._FILENAME_ALIASES)
            raise ValueError(f"Could not find filename column in data. Tried: {tried}")

        title_col = self._detect_column(df, self._TITLE_ALIASES)
        subject_col = self._detect_column(df, self._SUBJECT_ALIASES)
        keywords_col = self._detect_column(df, self._KEYWORD_ALIASES)
        logger.info(f"Detected columns - filename: {filename_col}, title: {title_col}, subject: {subject_col}, keywords: {keywords_col}")

        metadata_map: Dict[str, Dict] = {}
        for _, row in df.iterrows():
            # _get_value checks for NaN *before* converting to text, so an
            # empty cell is not turned into the literal filename "nan".
            filename = self._get_value(row, filename_col)
            if not filename:
                continue
            key = self._filename_key(filename)
            if not key:
                continue
            metadata_map[key] = {
                'title': self._get_value(row, title_col),
                'subject': self._get_value(row, subject_col),
                'keywords': self._get_value(row, keywords_col),
            }

        logger.info(f"Parsed {len(metadata_map)} metadata records from DataFrame")
        return metadata_map

    def _detect_column(self, df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
        """
        Detect column name from a list of candidates.

        Matching ignores case and surrounding whitespace. Column labels that
        are not strings (e.g. numeric headers) are compared by their text form
        rather than raising.

        Args:
            df: DataFrame to search
            candidates: Possible column names, in priority order

        Returns:
            Actual column label if found, None otherwise
        """
        col_map = {str(col).strip().lower(): col for col in df.columns}
        for candidate in candidates:
            wanted = candidate.lower()
            if wanted in col_map:
                return col_map[wanted]
        return None

    def _get_value(self, row: pd.Series, column: Optional[str]) -> str:
        """
        Get value from row, handling None column and NaN values.

        Args:
            row: DataFrame row
            column: Column name (can be None)

        Returns:
            Stripped string value, or empty string when the column is absent
            or the cell is missing
        """
        if column is None:
            return ''
        value = row.get(column, '')
        if pd.isna(value):
            return ''
        return str(value).strip()

    @staticmethod
    def _filename_key(filename) -> str:
        """Return the lookup key for *filename*: its stripped, lower-cased stem."""
        return Path(str(filename).strip()).stem.lower()

    @staticmethod
    def _first_value(metadata: Dict, aliases):
        """Return the first non-empty value stored under any of *aliases*, else None."""
        for key in aliases:
            value = metadata.get(key)
            # JSON may contain the NaN literal, which is truthy; treat it as missing.
            if value and not (isinstance(value, float) and value != value):
                return value
        return None

    def _normalize_metadata(self, metadata: Dict) -> Dict[str, str]:
        """
        Normalize metadata dictionary to standard format.

        Args:
            metadata: Raw metadata dict; any non-dict value (e.g. a JSON
                null or bare string) yields an all-empty record

        Returns:
            Normalized metadata with title, subject, keywords keys
        """
        normalized = {'title': '', 'subject': '', 'keywords': ''}
        if not isinstance(metadata, dict):
            return normalized

        title = self._first_value(metadata, self._TITLE_ALIASES)
        if title is not None:
            normalized['title'] = str(title).strip()

        subject = self._first_value(metadata, self._SUBJECT_ALIASES)
        if subject is not None:
            normalized['subject'] = str(subject).strip()

        keywords = self._first_value(metadata, self._KEYWORD_ALIASES)
        if isinstance(keywords, (list, tuple)):
            # Join array keywords, dropping null/blank entries so they do not
            # show up as the text "None" or as empty list items.
            parts = (str(item).strip() for item in keywords if item is not None)
            normalized['keywords'] = ', '.join(part for part in parts if part)
        elif keywords is not None:
            normalized['keywords'] = str(keywords).strip()

        return normalized

    def get_metadata_for_file(self, metadata_map: Dict[str, Dict], filename: str) -> Optional[Dict[str, str]]:
        """
        Get metadata for a specific file from imported map.

        Args:
            metadata_map: Dictionary returned by import_* methods
            filename: Filename to look up (with or without extension)

        Returns:
            Metadata dict if found, None otherwise
        """
        return metadata_map.get(self._filename_key(filename))

    def validate_import(self, metadata_map: Dict[str, Dict]) -> Dict:
        """
        Validate imported metadata and return statistics.

        Args:
            metadata_map: Dictionary returned by import_* methods

        Returns:
            Dict with the counters total_records, with_title, with_subject,
            with_keywords and empty_records (records with none of the three)
        """
        stats = {
            'total_records': len(metadata_map),
            'with_title': 0,
            'with_subject': 0,
            'with_keywords': 0,
            'empty_records': 0,
        }
        for metadata in metadata_map.values():
            has_title = bool(metadata.get('title'))
            has_subject = bool(metadata.get('subject'))
            has_keywords = bool(metadata.get('keywords'))
            stats['with_title'] += has_title
            stats['with_subject'] += has_subject
            stats['with_keywords'] += has_keywords
            if not (has_title or has_subject or has_keywords):
                stats['empty_records'] += 1
        return stats

View file

@ -358,6 +358,45 @@
background: #5a6268;
}
/* Import Metadata Section: styles for the metadata-file import panel,
   its trigger button and the statistics line shown after an import. */
/* Panel container; the dashed border marks it as a file-drop style area. */
.import-section {
background: white;
border-radius: 8px;
padding: 15px;
margin-bottom: 15px;
border: 2px dashed #dee2e6;
}
/* Green highlight applied when the panel carries the "active" class. */
.import-section.active {
border-color: #28a745;
background: #f0fff4;
}
/* Pill-shaped gradient button that opens the import file chooser. */
.btn-import {
background: linear-gradient(135deg, #17a2b8 0%, #138496 100%);
color: white;
border: none;
padding: 8px 20px;
border-radius: 20px;
cursor: pointer;
font-size: 14px;
font-weight: 600;
transition: transform 0.2s;
}
/* Lift the button slightly on hover (animated by the transition above). */
.btn-import:hover {
transform: translateY(-2px);
}
/* Small green summary box for the import statistics. */
.import-stats {
font-size: 12px;
color: #28a745;
margin-top: 10px;
padding: 8px;
background: white;
border-radius: 5px;
}
@media (max-width: 768px) {
.metadata-comparison {
grid-template-columns: 1fr;
@ -383,7 +422,7 @@
<div class="upload-section">
<div class="metadata-source-selector">
<label for="metadataSource">Metadata Source:</label>
<select id="metadataSource" class="source-select">
<select id="metadataSource" class="source-select" onchange="handleSourceChange()">
<option value="excel" selected>📊 Excel Lookup (Fastest)</option>
<option value="manual">✏️ Manual Entry</option>
<option value="import">📂 Import from File</option>
@ -392,6 +431,18 @@
<span class="source-info"> Choose how to generate metadata</span>
</div>
<!-- Import panel: hidden by default (display: none). The visible button
     forwards its click to the hidden file input, and #importStats starts
     hidden as a placeholder for the import summary. -->
<div class="import-section" id="importSection" style="display: none;">
<h4 style="margin-bottom: 10px; color: #495057;">📂 Import Metadata File</h4>
<p style="font-size: 13px; color: #6c757d; margin-bottom: 10px;">
Upload a CSV, Excel, or JSON file with metadata (columns: filename, title, subject, keywords)
</p>
<input type="file" id="importFileInput" accept=".csv,.xlsx,.xls,.json" style="display: none;">
<button class="btn-import" onclick="document.getElementById('importFileInput').click()">
📤 Choose Import File
</button>
<div id="importStats" class="import-stats" style="display: none;"></div>
</div>
<div class="upload-area" id="uploadArea">
<div class="upload-icon">📁</div>
<h3>Drop files here or click to browse</h3>
@ -438,6 +489,7 @@
<script>
let currentFiles = [];
let sessionId = null;
let importSessionId = null;
const uploadArea = document.getElementById('uploadArea');
const fileInput = document.getElementById('fileInput');
@ -475,6 +527,70 @@
}
});
// Import file input: run handleImportFile whenever the user picks a file
// in the element with id "importFileInput".
const importFileInput = document.getElementById('importFileInput');
importFileInput.addEventListener('change', handleImportFile);
function handleSourceChange() {
    // Show the import panel only while "import" is the selected metadata source.
    const selectedSource = document.getElementById('metadataSource').value;
    const panel = document.getElementById('importSection');
    panel.style.display = selectedSource === 'import' ? 'block' : 'none';
}
/**
 * Upload the chosen metadata file to /import-metadata and show the result.
 *
 * On success the returned import session ID is stored in `importSessionId`,
 * the statistics summary is rendered into #importStats and the import panel
 * is marked active. On failure an error alert is shown and any previously
 * stored import session ID is left untouched.
 *
 * @param {Event} e - change event from the import file input.
 */
async function handleImportFile(e) {
    const input = e.target;
    const file = input.files[0];
    if (!file) return;

    hideAlerts();
    showInfo(`Importing metadata from ${file.name}...`);

    const formData = new FormData();
    formData.append('import_file', file);

    try {
        const response = await fetch('/import-metadata', {
            method: 'POST',
            body: formData
        });
        const data = await response.json();

        // Treat a non-2xx status as a failure even if the body has no "error" field.
        if (!response.ok || data.error) {
            showError(data.error || `Import failed: HTTP ${response.status}`);
            return;
        }

        // Store import session ID
        importSessionId = data.import_session_id;

        // Display stats. The message embeds the uploaded file's name, so the
        // summary is built from text nodes rather than innerHTML; markup in
        // a file name is then shown literally instead of being parsed as HTML.
        const stats = data.stats;
        const total = stats.total_records;
        const summary = document.createElement('small');
        summary.textContent =
            `Title: ${stats.with_title}/${total} • ` +
            `Subject: ${stats.with_subject}/${total} • ` +
            `Keywords: ${stats.with_keywords}/${total}`;

        const importStats = document.getElementById('importStats');
        importStats.textContent = '';
        importStats.append(`✅ ${data.message}`, document.createElement('br'), summary);
        importStats.style.display = 'block';

        // Mark import section as active
        document.getElementById('importSection').classList.add('active');

        showSuccess(`✅ ${data.message}`);
    } catch (error) {
        showError(`Import failed: ${error.message}`);
    } finally {
        // Clear the selection so choosing the same file again (e.g. after
        // fixing it) still fires a "change" event.
        input.value = '';
    }
}
function handleFileSelect(e) {
const files = e.target.files;
if (files.length > 0) {
@ -492,6 +608,13 @@
const metadataSource = document.getElementById('metadataSource').value;
// Validate import source
if (metadataSource === 'import' && !importSessionId) {
showError('Please import a metadata file first using the "Choose Import File" button');
hideSpinner();
return;
}
// Show specific message for AI processing
if (metadataSource === 'ai') {
showInfo(`🤖 Generating AI metadata for ${files.length} file(s)... This may take 10-30 seconds per file.`);
@ -501,6 +624,9 @@
const formData = new FormData();
formData.append('metadata_source', metadataSource);
if (importSessionId) {
formData.append('import_session_id', importSessionId);
}
for (let file of files) {
formData.append('files', file);
}

View file

@ -21,6 +21,7 @@ from src.file_detector import FileDetector, FileType
from src.excel_metadata_lookup import ExcelMetadataLookup
from src.config import Config
from src.metadata_analyzer import MetadataAnalyzer
from src.metadata_importer import MetadataImporter
def safe_filename(filename):
"""Sanitize filename while preserving Unicode characters (Chinese, Japanese, Korean)."""
@ -78,6 +79,9 @@ updaters = {
# Store file processing sessions
sessions = {}
# Store imported metadata from external files
imported_metadata = {}
def get_metadata_lookup():
"""Get or create metadata lookup instance."""
global metadata_lookup
@ -119,14 +123,28 @@ def upload_file():
# Get metadata source choice (excel, manual, ai, import)
metadata_source = request.form.get('metadata_source', 'excel')
import_session_id = request.form.get('import_session_id', '') # For import source
results = []
session_id = str(len(sessions) + 1)
sessions[session_id] = {'files': [], 'metadata_source': metadata_source}
sessions[session_id] = {
'files': [],
'metadata_source': metadata_source,
'import_session_id': import_session_id
}
# Get metadata lookup (only if using Excel source)
lookup = get_metadata_lookup() if metadata_source == 'excel' else None
# Get imported metadata (only if using import source)
import_map = None
if metadata_source == 'import' and import_session_id and import_session_id in imported_metadata:
import_map = imported_metadata[import_session_id]
importer = MetadataImporter()
elif metadata_source == 'import':
# Import source selected but no import session available
return jsonify({'error': 'Please import a metadata file first using the Import button'}), 400
for file in files:
try:
# Save uploaded file
@ -234,13 +252,28 @@ def upload_file():
}
elif metadata_source == 'import':
# Import from file - will be implemented in Phase 2.4
# For now, return placeholder
new_metadata = {
'title': Path(filename).stem,
'subject': 'Import feature not yet implemented',
'keywords': ''
}
# Import from external file (CSV, Excel, JSON)
if import_map and importer:
# Look up metadata for this file
imported = importer.get_metadata_for_file(import_map, filename)
if imported:
new_metadata = imported
excel_found = True # Mark as found in import
else:
# No metadata found in import file
new_metadata = {
'title': Path(filename).stem,
'subject': f'No metadata found in imported file for {filename}',
'keywords': ''
}
else:
# Import source not available
new_metadata = {
'title': Path(filename).stem,
'subject': 'Import metadata not loaded',
'keywords': ''
}
file_info = {
'success': True,
@ -405,6 +438,57 @@ def download_file(filename):
return send_file(filepath, as_attachment=True)
return jsonify({'error': 'File not found'}), 404
@app.route('/import-metadata', methods=['POST'])
def import_metadata():
    """Import metadata from external file (CSV, Excel, JSON).

    Expects a multipart upload under the form field ``import_file``. The file
    is parsed with ``MetadataImporter`` and the resulting map is stored in
    ``imported_metadata`` under a newly generated import session ID.

    Returns:
        JSON with ``success``, ``import_session_id``, ``stats`` and
        ``message`` on success. On failure, JSON with ``error`` and status
        400 (missing file, unsupported extension, unreadable or invalid
        content) or 500 (unexpected error).
    """
    import logging
    import uuid

    if 'import_file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['import_file']
    if not file.filename:
        return jsonify({'error': 'No file selected'}), 400

    temp_path = None
    try:
        import_filename = safe_filename(file.filename)
        file_ext = Path(import_filename).suffix.lower()

        # Reject unsupported formats before anything is written to disk.
        if file_ext not in ('.csv', '.xlsx', '.xls', '.json'):
            return jsonify({'error': f'Unsupported file format: {file_ext}. Supported: .csv, .xlsx, .xls, .json'}), 400

        # Save under a unique name: the upload folder is shared with the
        # documents being processed, so reusing the client's filename could
        # overwrite (and then delete) a real upload with the same name.
        temp_path = Path(app.config['UPLOAD_FOLDER']) / f"import_{uuid.uuid4().hex}{file_ext}"
        file.save(str(temp_path))

        # Import based on file type
        importer = MetadataImporter()
        if file_ext == '.csv':
            metadata_map = importer.import_from_csv(str(temp_path))
        elif file_ext == '.json':
            metadata_map = importer.import_from_json(str(temp_path))
        else:
            metadata_map = importer.import_from_excel(str(temp_path))

        # Validate import
        stats = importer.validate_import(metadata_map)

        # Random ID: a counter based on len() is guessable and two concurrent
        # imports could be assigned the same value.
        import_session_id = f"import_{uuid.uuid4().hex}"
        imported_metadata[import_session_id] = metadata_map

        return jsonify({
            'success': True,
            'import_session_id': import_session_id,
            'stats': stats,
            'message': f'Imported {stats["total_records"]} metadata records from {import_filename}'
        })
    except ValueError as e:
        # Raised for problems with the uploaded content (missing filename
        # column, undecodable text, malformed JSON): a client error.
        logging.getLogger(__name__).error(f"Import failed: {e}")
        return jsonify({'error': f'Import failed: {str(e)}'}), 400
    except Exception as e:
        logging.getLogger(__name__).error(f"Import failed: {e}")
        return jsonify({'error': f'Import failed: {str(e)}'}), 500
    finally:
        # Clean up the temp file on every path, including failures.
        if temp_path is not None:
            try:
                temp_path.unlink()
            except OSError:
                # Best effort: the file may never have been written.
                pass
@app.route('/stats')
def get_stats():
"""Get Excel metadata statistics."""