solventum-image-metadata/web_app.py
SamoilenkoVadym 03079080d8 Phase 2.4: Metadata import from external files (CSV, Excel, JSON)
Created comprehensive metadata_importer.py module:
- CSV import with multiple encoding support (UTF-8, Latin1, ISO-8859-1, CP1252)
- Excel import (.xlsx, .xls) with sheet selection
- JSON import (object and array formats)
- Intelligent column detection for filename, title, subject, keywords
- Fuzzy column matching (case-insensitive, multiple aliases)
- Metadata normalization to standard format
- Import validation with statistics
- File lookup by filename stem (case-insensitive)

Web interface enhancements:
- /import-metadata endpoint for file uploads
- Import section UI (appears when Import source selected)
- Real-time import statistics display (records, title/subject/keywords counts)
- Import session management with unique session IDs
- Visual feedback (active state, success/error messages)
- Validation: requires import file before processing with import source

Import workflow:
1. User selects "Import from File" metadata source
2. Import section appears with file chooser
3. User uploads CSV/Excel/JSON with metadata
4. System validates and shows statistics
5. User uploads files to process
6. System matches files to imported metadata by filename

Supported import formats:
- CSV: filename, title, subject/description, keywords columns
- Excel: Any sheet with filename and metadata columns
- JSON: {filename: {metadata}} or [{filename, metadata}] formats

Technical features:
- Pandas DataFrame parsing for CSV/Excel
- Flexible column name detection (10+ aliases per field)
- NaN/null value handling
- List/array keyword support
- Unicode filename support

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-25 15:39:27 +00:00

554 lines
20 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Oliver Metadata Tool - Web Interface
Universal metadata creation and management tool for files.
Flask-based web app for local or server deployment.
Supports multiple metadata sources: Excel, AI, manual entry, and file import.
"""
from flask import Flask, render_template, request, jsonify, send_file
from werkzeug.utils import secure_filename # noqa: F401 - kept as fallback
from pathlib import Path
import os
import tempfile
import threading
import webbrowser
from time import sleep
import shutil
import unicodedata
from src.file_detector import FileDetector, FileType
from src.excel_metadata_lookup import ExcelMetadataLookup
from src.config import Config
from src.metadata_analyzer import MetadataAnalyzer
from src.metadata_importer import MetadataImporter
def safe_filename(filename):
    """Sanitize a filename while preserving Unicode characters (Chinese, Japanese, Korean).

    The name is NFC-normalized, path separators are replaced with
    underscores, NUL bytes are dropped, and leading/trailing dots and
    spaces are stripped so the result cannot escape the target directory.

    Args:
        filename: Client-supplied name. May be ``None`` or empty (an upload
            can arrive without a filename).

    Returns:
        The sanitized name, or ``'unnamed_file'`` when nothing usable remains.
    """
    # A missing filename must not crash unicodedata.normalize().
    if not filename:
        return 'unnamed_file'
    # Normalize so visually identical names map to the same stored file.
    filename = unicodedata.normalize('NFC', str(filename))
    # Remove path separators and null bytes.
    filename = filename.replace('/', '_').replace('\\', '_').replace('\x00', '')
    # Remove leading/trailing dots and spaces ("..", hidden files, padding).
    filename = filename.strip('. ')
    return filename or 'unnamed_file'
from src.extractors.pdf_extractor import PDFExtractor
from src.extractors.image_extractor import ImageExtractor
from src.extractors.office_extractor import OfficeExtractor
from src.extractors.video_extractor import VideoExtractor
from src.updaters.pdf_updater import PDFUpdater
from src.updaters.image_updater import ImageUpdater
from src.updaters.office_updater import OfficeUpdater
from src.updaters.video_updater import VideoUpdater
# Flask application; uploads are staged in a temporary directory created at import time.
app = Flask(__name__)
app.config.update(
    MAX_CONTENT_LENGTH=500 * 1024 * 1024,  # 500MB max file size
    UPLOAD_FOLDER=tempfile.mkdtemp(),
)

# Spreadsheet used by the Excel metadata source; it sits next to this module.
EXCEL_PATH = Path(__file__).parent.joinpath(
    "Celum ID to Adobe Asset Path Mapping Spreadsheet (1).xlsx"
)

# Lazily created singletons (see get_metadata_lookup / get_ai_analyzer).
metadata_lookup = None
ai_analyzer = None

# The three Office file types are all handled by the same extractor/updater classes.
_OFFICE_TYPES = (
    FileType.OFFICE_DOC,
    FileType.OFFICE_SHEET,
    FileType.OFFICE_PRESENTATION,
)

# File type -> object that reads metadata/content from that kind of file.
extractors = {
    FileType.PDF: PDFExtractor(),
    FileType.IMAGE: ImageExtractor(),
    **{office_type: OfficeExtractor() for office_type in _OFFICE_TYPES},
    FileType.VIDEO: VideoExtractor(),
}

# File type -> object that writes metadata into that kind of file.
updaters = {
    FileType.PDF: PDFUpdater(),
    FileType.IMAGE: ImageUpdater(),
    **{office_type: OfficeUpdater() for office_type in _OFFICE_TYPES},
    FileType.VIDEO: VideoUpdater(),
}

# Upload sessions keyed by session id; each holds the processed file records.
sessions = dict()

# Imported metadata maps keyed by import session id.
imported_metadata = dict()
def get_metadata_lookup():
    """Return the shared Excel lookup, constructing it from EXCEL_PATH on first use."""
    global metadata_lookup
    if metadata_lookup is not None:
        return metadata_lookup
    metadata_lookup = ExcelMetadataLookup(str(EXCEL_PATH))
    return metadata_lookup
def get_ai_analyzer():
    """Return the shared AI analyzer, creating it on first use.

    Returns None when no OpenAI API key is configured or when the
    analyzer cannot be constructed (the failure is logged).
    """
    global ai_analyzer
    # Already built on an earlier call.
    if ai_analyzer is not None:
        return ai_analyzer
    # Without a key there is nothing to initialize.
    if not Config.OPENAI_API_KEY:
        return None

    import logging
    log = logging.getLogger(__name__)
    try:
        ai_analyzer = MetadataAnalyzer()
        log.info("AI analyzer initialized successfully")
    except Exception as e:
        log.error(f"Failed to initialize AI analyzer: {e}")
        return None
    return ai_analyzer
@app.route('/')
def index():
    """Render the application's main page from the index.html template."""
    page = render_template('index.html')
    return page
def _fallback_metadata(filename, subject='', **extra):
    """Build placeholder metadata that suggests the filename stem as the title."""
    metadata = {'title': Path(filename).stem, 'subject': subject, 'keywords': ''}
    metadata.update(extra)
    return metadata


def _metadata_from_excel(lookup, filename):
    """Look up *filename* in the Excel mapping; return ``(metadata, found)``."""
    excel_data = lookup.lookup_by_filename(filename)
    if excel_data:
        return {
            'title': excel_data.get('title', ''),
            'subject': excel_data.get('description', ''),
            'keywords': ''
        }, True
    # No Excel data found - use filename as fallback.
    return _fallback_metadata(filename, f'No metadata found in Excel for {filename}'), False


def _metadata_from_ai(extractor, filepath, filename, file_type):
    """Generate metadata with the AI analyzer, degrading to a placeholder on any failure."""
    import logging
    logger = logging.getLogger(__name__)

    analyzer = get_ai_analyzer()
    if not analyzer:
        # AI not configured.
        return _fallback_metadata(
            filename,
            'AI generation not available (OpenAI API key not configured)',
            _ai_error='OpenAI API key not configured',
        )
    try:
        content = extractor.extract_content(str(filepath))
        if not content or len(content.strip()) < 10:
            # Not enough content for AI analysis.
            return _fallback_metadata(
                filename,
                'Insufficient content for AI analysis',
                _ai_error='Not enough text content extracted',
            )
        generated = analyzer.analyze_content(content, filename, file_type)
        # Log token usage if available.
        if '_tokens_used' in generated:
            logger.info(f"AI tokens used for {filename}: {generated['_tokens_used']}")
        return generated
    except Exception as e:
        logger.error(f"AI generation failed for {filename}: {e}")
        return _fallback_metadata(
            filename, f'AI generation error: {str(e)}', _ai_error=str(e)
        )


def _metadata_from_import(importer, import_map, filename):
    """Match *filename* against imported metadata; return ``(metadata, found)``."""
    if not (import_map and importer):
        # Import source not available.
        return _fallback_metadata(filename, 'Import metadata not loaded'), False
    imported = importer.get_metadata_for_file(import_map, filename)
    if imported:
        return imported, True
    return _fallback_metadata(
        filename, f'No metadata found in imported file for {filename}'
    ), False


@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle multiple file uploads and suggest metadata for each file.

    Form fields:
        files: one or more uploaded files.
        metadata_source: 'excel' (default), 'manual', 'ai' or 'import'.
        import_session_id: id returned by /import-metadata; required when
            metadata_source is 'import'.

    Returns:
        JSON ``{'success': True, 'session_id': ..., 'files': [...]}``. Each
        entry in ``files`` is either a full file record or
        ``{'filename', 'error'}`` when that file could not be processed.
        Responds 400 when no files are sent or the import session is missing.
    """
    import logging
    logger = logging.getLogger(__name__)

    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400
    files = request.files.getlist('files')
    if not files or files[0].filename == '':
        return jsonify({'error': 'No files selected'}), 400

    # Get metadata source choice (excel, manual, ai, import).
    metadata_source = request.form.get('metadata_source', 'excel')
    import_session_id = request.form.get('import_session_id', '')  # For import source

    # Resolve the import source before a session is registered, so a rejected
    # request does not leave an empty session behind. Both names are always
    # bound, whichever source is selected.
    import_map = None
    importer = None
    if metadata_source == 'import':
        if not import_session_id or import_session_id not in imported_metadata:
            # Import source selected but no import session available.
            return jsonify({'error': 'Please import a metadata file first using the Import button'}), 400
        import_map = imported_metadata[import_session_id]
        importer = MetadataImporter()

    results = []
    session_id = str(len(sessions) + 1)
    sessions[session_id] = {
        'files': [],
        'metadata_source': metadata_source,
        'import_session_id': import_session_id
    }

    # Get metadata lookup (only if using Excel source).
    lookup = get_metadata_lookup() if metadata_source == 'excel' else None

    for file in files:
        try:
            # Save uploaded file.
            filename = safe_filename(file.filename)
            filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(filepath)

            # Detect file type.
            file_type = FileDetector.detect_file_type(filepath)
            if file_type == FileType.UNSUPPORTED:
                results.append({'filename': filename, 'error': 'Unsupported file type'})
                continue

            # Get extractor for this file type.
            extractor = extractors.get(file_type)
            if not extractor:
                results.append({'filename': filename, 'error': 'No extractor available'})
                continue

            # Read current metadata from file.
            old_metadata = extractor.read_metadata(filepath)

            # Generate metadata based on chosen source. An unrecognized source
            # leaves the blank suggestion in place.
            excel_found = False
            new_metadata = {'title': '', 'subject': '', 'keywords': ''}
            if metadata_source == 'excel' and lookup:
                new_metadata, excel_found = _metadata_from_excel(lookup, filename)
            elif metadata_source == 'manual':
                # Empty metadata for the user to fill in; filename suggested as title.
                new_metadata = _fallback_metadata(filename)
            elif metadata_source == 'ai':
                new_metadata = _metadata_from_ai(extractor, filepath, filename, file_type)
            elif metadata_source == 'import':
                # 'excel_found' doubles as "found in the import file" here.
                new_metadata, excel_found = _metadata_from_import(
                    importer, import_map, filename
                )

            file_info = {
                'success': True,
                'filename': filename,
                'file_type': file_type.value,
                'current_metadata': old_metadata,
                'suggested_metadata': new_metadata,
                'filepath': filepath,
                'metadata_source': metadata_source,
                'excel_found': excel_found
            }
            results.append(file_info)
            sessions[session_id]['files'].append(file_info)
        except Exception as e:
            # One bad file must not abort the batch; record it and keep the
            # traceback in the server log.
            logger.exception(f"Failed to process upload {file.filename}")
            results.append({
                'filename': file.filename,
                'error': str(e)
            })

    return jsonify({
        'success': True,
        'session_id': session_id,
        'files': results
    })
def _is_in_upload_folder(path):
    """Return True when *path* resolves to a location inside the upload folder."""
    upload_root = os.path.realpath(app.config['UPLOAD_FOLDER'])
    candidate = os.path.realpath(path)
    try:
        return os.path.commonpath([upload_root, candidate]) == upload_root
    except ValueError:
        # Raised e.g. for paths on different drives: cannot be inside the folder.
        return False


@app.route('/update', methods=['POST'])
def update_metadata():
    """Update file metadata from Excel and save to chosen location.

    JSON body:
        filepath: path of a previously uploaded file (as returned by /upload).
        output_dir: optional existing directory; when given, the file is
            copied there and the copy is updated instead of the original.

    Returns:
        JSON with ``success``, ``message``, ``verified``, ``output_path`` and
        ``metadata`` on success; ``{'error': ...}`` with status 400/404/500
        otherwise.
    """
    # A missing or non-JSON body yields None rather than an exception here.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    filepath = data.get('filepath')
    output_dir = data.get('output_dir', '')  # User-selected output directory

    # The path comes from the client: accept only a string that names an
    # existing file inside the upload folder, so the endpoint cannot be used
    # to rewrite arbitrary files on the server.
    if (
        not isinstance(filepath, str)
        or not filepath
        or not os.path.isfile(filepath)
        or not _is_in_upload_folder(filepath)
    ):
        return jsonify({'error': 'File not found'}), 404

    try:
        # Detect file type.
        file_type = FileDetector.detect_file_type(filepath)
        if file_type == FileType.UNSUPPORTED:
            return jsonify({'error': 'Unsupported file type'}), 400

        # Get updater.
        updater = updaters.get(file_type)
        if not updater:
            return jsonify({'error': 'No updater available for this file type'}), 400

        # Lookup metadata from Excel.
        filename = Path(filepath).name
        excel_data = get_metadata_lookup().lookup_by_filename(filename)
        if not excel_data:
            return jsonify({'error': f'No metadata found in Excel for {filename}'}), 400
        new_metadata = {
            'title': excel_data.get('title', ''),
            'subject': excel_data.get('description', ''),  # External Description/Alt Text
            'keywords': ''
        }

        # Copy file to output directory if specified; otherwise update in place.
        target_file = filepath
        if isinstance(output_dir, str) and output_dir and os.path.isdir(output_dir):
            output_path = os.path.join(output_dir, filename)
            # Copying a file onto itself raises SameFileError; skip the copy then.
            if os.path.realpath(output_path) != os.path.realpath(filepath):
                shutil.copy2(filepath, output_path)
            target_file = output_path

        # Update the file metadata WITHOUT changing filename.
        success = updater.update_metadata(target_file, new_metadata, backup=False)
        if not success:
            return jsonify({'error': 'Failed to update metadata'}), 500

        # Verify update.
        verified = updater.verify_metadata(target_file, new_metadata)
        return jsonify({
            'success': True,
            'message': 'Metadata updated successfully',
            'verified': verified,
            'output_path': target_file,
            'metadata': new_metadata
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
def _clean_metadata_field(value, max_length):
    """Coerce a submitted field to a stripped string of at most *max_length* characters."""
    if value is None:
        return ''
    return str(value).strip()[:max_length]


@app.route('/update-manual', methods=['POST'])
def update_manual_metadata():
    """Update file with manually entered metadata.

    JSON body:
        session_id: id returned by /upload.
        file_index: zero-based position of the file within that session.
        title, subject, keywords: values to write (truncated to 200, 300
            and 500 characters respectively).

    Returns:
        JSON with ``success``, ``status``, ``message``, ``verified`` and
        ``metadata`` on success; ``{'error': ...}`` with status 400/404/500
        otherwise.
    """
    # A missing or non-JSON body yields None rather than an exception here.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    session_id = data.get('session_id')
    file_index = data.get('file_index')

    # Validate and sanitize metadata; JSON null or non-string values must not crash.
    custom_metadata = {
        'title': _clean_metadata_field(data.get('title'), 200),
        'subject': _clean_metadata_field(data.get('subject'), 300),
        'keywords': _clean_metadata_field(data.get('keywords'), 500)
    }

    # Validate session (a non-string id, e.g. a list, can never be a valid key).
    if not isinstance(session_id, str) or session_id not in sessions:
        return jsonify({'error': 'Invalid or expired session'}), 400

    # Validate file index: a real integer within range. Negative values are
    # rejected because Python indexing would silently address another file.
    session_files = sessions[session_id]['files']
    if (
        not isinstance(file_index, int)
        or isinstance(file_index, bool)
        or not 0 <= file_index < len(session_files)
    ):
        return jsonify({'error': 'Invalid file index'}), 400

    try:
        # Get file info from session.
        file_info = session_files[file_index]
        filepath = file_info.get('filepath')
        if not filepath or not os.path.exists(filepath):
            return jsonify({'error': 'File not found'}), 404

        # Detect file type.
        file_type = FileDetector.detect_file_type(filepath)
        if file_type == FileType.UNSUPPORTED:
            return jsonify({'error': 'Unsupported file type'}), 400

        # Get updater for this file type.
        updater = updaters.get(file_type)
        if not updater:
            return jsonify({'error': 'No updater available for this file type'}), 400

        # Update metadata.
        success = updater.update_metadata(filepath, custom_metadata, backup=True)
        if not success:
            return jsonify({'error': 'Failed to update metadata'}), 500

        # Update session with new metadata.
        file_info['suggested_metadata'] = custom_metadata

        # Verify update.
        verified = updater.verify_metadata(filepath, custom_metadata)
        return jsonify({
            # 'success' matches the other endpoints; 'status' is kept for
            # clients that already read it.
            'success': True,
            'status': 'success',
            'message': 'Metadata updated successfully',
            'verified': verified,
            'metadata': custom_metadata
        })
    except Exception as e:
        return jsonify({'error': f'Error updating metadata: {str(e)}'}), 500
@app.route('/download/<filename>')
def download_file(filename):
    """Send a file from the upload folder as an attachment, or 404 if it is absent."""
    # The requested name is sanitized the same way uploads are before lookup.
    stored_name = safe_filename(filename)
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], stored_name)
    if not os.path.exists(filepath):
        return jsonify({'error': 'File not found'}), 404
    return send_file(filepath, as_attachment=True)
@app.route('/import-metadata', methods=['POST'])
def import_metadata():
    """Import metadata from external file (CSV, Excel, JSON).

    Form fields:
        import_file: the .csv, .xlsx, .xls or .json file to import.

    Returns:
        JSON with ``success``, ``import_session_id``, ``stats`` and
        ``message`` on success; ``{'error': ...}`` with status 400 for a
        missing file or unsupported extension, 500 when the import fails.
    """
    import logging

    if 'import_file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400
    file = request.files['import_file']
    if not file.filename:
        return jsonify({'error': 'No file selected'}), 400

    # Reject unsupported formats before anything is written to disk.
    import_filename = safe_filename(file.filename)
    file_ext = Path(import_filename).suffix.lower()
    loader_names = {
        '.csv': 'import_from_csv',
        '.xlsx': 'import_from_excel',
        '.xls': 'import_from_excel',
        '.json': 'import_from_json',
    }
    if file_ext not in loader_names:
        return jsonify({'error': f'Unsupported file format: {file_ext}. Supported: .csv, .xlsx, .xls, .json'}), 400

    temp_path = None
    try:
        # Save to a uniquely named temp file. Reusing the uploaded name could
        # overwrite, and then delete, a same-named file staged for processing
        # in the shared upload folder.
        fd, temp_name = tempfile.mkstemp(suffix=file_ext, dir=app.config['UPLOAD_FOLDER'])
        os.close(fd)
        temp_path = Path(temp_name)
        file.save(str(temp_path))

        # Import based on file type.
        importer = MetadataImporter()
        metadata_map = getattr(importer, loader_names[file_ext])(str(temp_path))

        # Validate import.
        stats = importer.validate_import(metadata_map)

        # Store in global dict with unique session ID.
        import_session_id = f"import_{len(imported_metadata) + 1}"
        imported_metadata[import_session_id] = metadata_map

        return jsonify({
            'success': True,
            'import_session_id': import_session_id,
            'stats': stats,
            'message': f'Imported {stats["total_records"]} metadata records from {import_filename}'
        })
    except Exception as e:
        logging.getLogger(__name__).error(f"Import failed: {e}")
        return jsonify({'error': f'Import failed: {str(e)}'}), 500
    finally:
        # Clean up temp file on success and on failure alike.
        if temp_path is not None and temp_path.exists():
            temp_path.unlink()
@app.route('/stats')
def get_stats():
    """Return statistics reported by the Excel metadata lookup as JSON.

    Any failure while loading the lookup or building the response is
    reported as ``{'error': ...}`` with status 500.
    """
    try:
        payload = {
            'success': True,
            'stats': get_metadata_lookup().get_stats(),
        }
        return jsonify(payload)
    except Exception as err:
        return jsonify({'error': str(err)}), 500
def open_browser():
    """Pause for 1.5 seconds, then open the app's local URL in the default browser."""
    startup_delay = 1.5
    sleep(startup_delay)
    webbrowser.open('http://localhost:5001')
if __name__ == '__main__':
    # Startup banner.
    banner = "="*60
    print(banner)
    print(f"{Config.APP_NAME} v{Config.APP_VERSION} - Web Interface")
    print(banner)

    # Report which optional capabilities are available.
    print("\n🔍 Checking dependencies...")

    # Excel mapping spreadsheet.
    if EXCEL_PATH.exists():
        print(f"✓ Excel file found: {EXCEL_PATH.name}")
    else:
        print(f"⚠️ Warning: Excel file not found at {EXCEL_PATH}")
        print(" Excel metadata lookup will not be available")
        print(" Please ensure the Excel file is in the project root")

    # OpenAI API key (optional).
    ai_enabled = bool(Config.OPENAI_API_KEY)
    if ai_enabled:
        print("✓ OpenAI API key configured (AI metadata generation available)")
    else:
        print(" OpenAI API key not configured (AI generation disabled)")

    # ExifTool (optional).
    if Config.check_exiftool():
        print("✓ ExifTool available for enhanced metadata operations")
    else:
        print(" ExifTool not installed (using Python libraries)")

    # List the metadata sources; AI is shown only when a key is configured.
    print("\nMetadata sources available:")
    print(" • Excel lookup (Celum ID mapping)")
    if ai_enabled:
        print(" • AI generation (OpenAI)")
    print(" • Manual entry")
    print(" • File import (CSV/Excel/JSON)")

    print("\nStarting server...")
    print("Opening browser at http://localhost:5001")
    print("\nPress Ctrl+C to stop the server")
    print(banner)

    # Open the browser from a daemon thread so it does not block the server.
    browser_thread = threading.Thread(target=open_browser, daemon=True)
    browser_thread.start()

    # Run Flask app, bound to the loopback interface only.
    app.run(debug=False, port=5001, host='127.0.0.1')