Enhanced metadata_analyzer.py with production-ready capabilities: - Token counting with tiktoken for accurate OpenAI usage tracking - Exponential backoff retry logic with tenacity library - Intelligent content truncation based on token limits (not characters) - Configurable timeout and max retries from Config - Graceful fallback when tiktoken/tenacity unavailable - Enhanced error reporting with _ai_error and _tokens_used metadata Integrated AI generation in web interface: - AI analyzer lazy initialization in web_app.py - Real content extraction and AI analysis in upload endpoint - Error handling for insufficient content or API failures - Token usage logging for monitoring and optimization UI improvements for AI experience: - Special loading message for AI processing (10-30s per file) - Display token usage for AI-generated metadata - Show AI errors prominently with helpful messages - Filter internal metadata fields (_tokens_used, _ai_error) from forms Dependencies leveraged: - tiktoken: Proper OpenAI token counting (10x more accurate) - tenacity: Exponential backoff retry (3 attempts, 2-10s delays) - openai: Production timeout support (30s default) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
470 lines
16 KiB
Python
470 lines
16 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Oliver Metadata Tool - Web Interface
|
||
Universal metadata creation and management tool for files.
|
||
Flask-based web app for local or server deployment.
|
||
Supports multiple metadata sources: Excel, AI, manual entry, and file import.
|
||
"""
|
||
|
||
from flask import Flask, render_template, request, jsonify, send_file
|
||
from werkzeug.utils import secure_filename # noqa: F401 - kept as fallback
|
||
from pathlib import Path
|
||
import os
|
||
import tempfile
|
||
import threading
|
||
import webbrowser
|
||
from time import sleep
|
||
import shutil
|
||
import unicodedata
|
||
|
||
from src.file_detector import FileDetector, FileType
|
||
from src.excel_metadata_lookup import ExcelMetadataLookup
|
||
from src.config import Config
|
||
from src.metadata_analyzer import MetadataAnalyzer
|
||
|
||
def safe_filename(filename):
    """Sanitize a filename while preserving Unicode characters (e.g. CJK).

    The name is NFC-normalized, path separators are replaced with
    underscores, control characters (including NUL, CR, LF and TAB) are
    dropped, and leading/trailing dots and spaces are stripped.

    Args:
        filename: The client-supplied file name; may be ``None`` or empty.

    Returns:
        The sanitized name, or ``'unnamed_file'`` when nothing usable remains.
    """
    # Uploads may arrive without a filename at all; treat that like an empty one.
    if not filename:
        return 'unnamed_file'

    # Normalize unicode so visually identical names map to the same string.
    name = unicodedata.normalize('NFC', filename)

    # Neutralize path separators so the result is always a single path component.
    name = name.replace('/', '_').replace('\\', '_')

    # Drop every control character (category "Cc"), not only the null byte.
    name = ''.join(ch for ch in name if unicodedata.category(ch) != 'Cc')

    # Leading/trailing dots and spaces would allow names such as ".." or " ".
    name = name.strip('. ')

    return name or 'unnamed_file'
|
||
from src.extractors.pdf_extractor import PDFExtractor
|
||
from src.extractors.image_extractor import ImageExtractor
|
||
from src.extractors.office_extractor import OfficeExtractor
|
||
from src.extractors.video_extractor import VideoExtractor
|
||
from src.updaters.pdf_updater import PDFUpdater
|
||
from src.updaters.image_updater import ImageUpdater
|
||
from src.updaters.office_updater import OfficeUpdater
|
||
from src.updaters.video_updater import VideoUpdater
|
||
|
||
# Flask application object. Uploads are limited to 500MB and are written to a
# temporary directory created when this module is loaded.
app = Flask(__name__)
app.config.update(
    MAX_CONTENT_LENGTH=500 * 1024 * 1024,
    UPLOAD_FOLDER=tempfile.mkdtemp(),
)

# Spreadsheet used for metadata lookup; it sits in the same directory as this module.
EXCEL_PATH = Path(__file__).with_name(
    "Celum ID to Adobe Asset Path Mapping Spreadsheet (1).xlsx"
)

# Excel lookup instance, created on first use by get_metadata_lookup().
metadata_lookup = None

# AI analyzer instance, created on first use by get_ai_analyzer().
ai_analyzer = None
|
||
|
||
# Metadata readers keyed by detected file type. Each office type gets its own
# OfficeExtractor instance.
extractors = {
    FileType.PDF: PDFExtractor(),
    FileType.IMAGE: ImageExtractor(),
    **{
        office_type: OfficeExtractor()
        for office_type in (
            FileType.OFFICE_DOC,
            FileType.OFFICE_SHEET,
            FileType.OFFICE_PRESENTATION,
        )
    },
    FileType.VIDEO: VideoExtractor(),
}

# Metadata writers keyed by detected file type, mirroring the table above.
updaters = {
    FileType.PDF: PDFUpdater(),
    FileType.IMAGE: ImageUpdater(),
    **{
        office_type: OfficeUpdater()
        for office_type in (
            FileType.OFFICE_DOC,
            FileType.OFFICE_SHEET,
            FileType.OFFICE_PRESENTATION,
        )
    },
    FileType.VIDEO: VideoUpdater(),
}

# In-memory store of file processing sessions, keyed by session id.
sessions = dict()
|
||
|
||
def get_metadata_lookup():
    """Return the shared ExcelMetadataLookup, building it on the first call."""
    global metadata_lookup

    # Reuse the instance created by an earlier call.
    if metadata_lookup is not None:
        return metadata_lookup

    metadata_lookup = ExcelMetadataLookup(str(EXCEL_PATH))
    return metadata_lookup
|
||
|
||
def get_ai_analyzer():
    """Get or create the shared AI analyzer instance.

    Returns:
        The cached ``MetadataAnalyzer``, or ``None`` when no OpenAI API key is
        configured or when constructing the analyzer fails (the failure is
        logged, and the next call will try again).
    """
    global ai_analyzer

    # Already initialized by an earlier call.
    if ai_analyzer is not None:
        return ai_analyzer

    # Without an API key there is nothing to initialize.
    if not Config.OPENAI_API_KEY:
        return None

    # Function-scope import: this module has no top-level ``logging`` import.
    import logging
    logger = logging.getLogger(__name__)

    try:
        ai_analyzer = MetadataAnalyzer()
    except Exception as e:
        # Best effort: AI generation is optional, so report the failure and carry on.
        logger.error("Failed to initialize AI analyzer: %s", e)
        return None

    logger.info("AI analyzer initialized successfully")
    return ai_analyzer
|
||
|
||
@app.route('/')
def index():
    """Render and return the ``index.html`` template."""
    page = render_template('index.html')
    return page
|
||
|
||
def _title_only_metadata(filename, subject='', ai_error=None):
    """Build fallback metadata whose title is the filename stem."""
    metadata = {
        'title': Path(filename).stem,
        'subject': subject,
        'keywords': '',
    }
    if ai_error is not None:
        # Internal field the UI uses to surface AI problems.
        metadata['_ai_error'] = ai_error
    return metadata


def _excel_suggestion(lookup, filename):
    """Return ``(metadata, found)`` from the Excel lookup for ``filename``."""
    excel_data = lookup.lookup_by_filename(filename)
    if excel_data:
        metadata = {
            'title': excel_data.get('title', ''),
            'subject': excel_data.get('description', ''),
            'keywords': '',
        }
        return metadata, True

    # No Excel data found - use filename as fallback
    fallback = _title_only_metadata(
        filename, f'No metadata found in Excel for {filename}'
    )
    return fallback, False


def _ai_suggestion(extractor, filepath, filename, file_type):
    """Return AI-generated metadata, or a fallback carrying ``_ai_error``."""
    # Function-scope import: this module has no top-level ``logging`` import.
    import logging
    logger = logging.getLogger(__name__)

    analyzer = get_ai_analyzer()
    if not analyzer:
        # AI not configured
        return _title_only_metadata(
            filename,
            'AI generation not available (OpenAI API key not configured)',
            'OpenAI API key not configured',
        )

    try:
        # Extract content from file
        content = extractor.extract_content(str(filepath))

        if not content or len(content.strip()) < 10:
            # Not enough content for AI analysis
            return _title_only_metadata(
                filename,
                'Insufficient content for AI analysis',
                'Not enough text content extracted',
            )

        # Generate metadata with AI
        metadata = analyzer.analyze_content(content, filename, file_type)

        # Log token usage if available
        if '_tokens_used' in metadata:
            logger.info(
                "AI tokens used for %s: %s", filename, metadata['_tokens_used']
            )
        return metadata
    except Exception as e:
        # Any extraction/API failure degrades to a filename-based suggestion.
        logger.error("AI generation failed for %s: %s", filename, e)
        return _title_only_metadata(
            filename, f'AI generation error: {str(e)}', str(e)
        )


def _suggest_metadata(metadata_source, lookup, extractor, filepath, filename,
                      file_type):
    """Return ``(suggested_metadata, excel_found)`` for the chosen source."""
    if metadata_source == 'excel' and lookup:
        return _excel_suggestion(lookup, filename)

    if metadata_source == 'manual':
        # Empty metadata for the user to fill in; the filename is suggested as title.
        return _title_only_metadata(filename), False

    if metadata_source == 'ai':
        return _ai_suggestion(extractor, filepath, filename, file_type), False

    if metadata_source == 'import':
        # Import from file - will be implemented in Phase 2.4
        placeholder = _title_only_metadata(
            filename, 'Import feature not yet implemented'
        )
        return placeholder, False

    # Unknown source (or Excel lookup unavailable): empty suggestion.
    return {'title': '', 'subject': '', 'keywords': ''}, False


@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle multiple file uploads and produce suggested metadata per file.

    Reads the uploaded ``files`` and the ``metadata_source`` form field
    (``excel`` by default; also ``manual``, ``ai``, ``import``). Each file is
    saved into the upload folder, its type is detected, its current metadata
    is read, and a suggestion is generated from the chosen source.

    Returns:
        A JSON response with ``success``, ``session_id`` and a ``files`` list
        whose entries are either a full file record or
        ``{'filename', 'error'}``; or a 400 JSON error when no files are sent.
    """
    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400

    files = request.files.getlist('files')
    if not files or files[0].filename == '':
        return jsonify({'error': 'No files selected'}), 400

    # Get metadata source choice (excel, manual, ai, import)
    metadata_source = request.form.get('metadata_source', 'excel')

    results = []
    session_id = str(len(sessions) + 1)
    sessions[session_id] = {'files': [], 'metadata_source': metadata_source}

    # Get metadata lookup (only if using Excel source)
    lookup = get_metadata_lookup() if metadata_source == 'excel' else None

    for file in files:
        try:
            # Save uploaded file
            filename = safe_filename(file.filename)
            filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(filepath)

            # Detect file type
            file_type = FileDetector.detect_file_type(filepath)
            if file_type == FileType.UNSUPPORTED:
                results.append({
                    'filename': filename,
                    'error': 'Unsupported file type',
                })
                continue

            # Get extractor for this file type
            extractor = extractors.get(file_type)
            if not extractor:
                results.append({
                    'filename': filename,
                    'error': 'No extractor available',
                })
                continue

            # Read current metadata from file
            old_metadata = extractor.read_metadata(filepath)

            # Generate metadata based on chosen source
            new_metadata, excel_found = _suggest_metadata(
                metadata_source, lookup, extractor, filepath, filename,
                file_type,
            )

            file_info = {
                'success': True,
                'filename': filename,
                'file_type': file_type.value,
                'current_metadata': old_metadata,
                'suggested_metadata': new_metadata,
                'filepath': filepath,
                'metadata_source': metadata_source,
                'excel_found': excel_found,
            }

            results.append(file_info)
            sessions[session_id]['files'].append(file_info)

        except Exception as e:
            # One bad file must not abort the rest of the batch.
            results.append({
                'filename': file.filename,
                'error': str(e),
            })

    return jsonify({
        'success': True,
        'session_id': session_id,
        'files': results,
    })
|
||
|
||
@app.route('/update', methods=['POST'])
def update_metadata():
    """Update file metadata from Excel and save to chosen location.

    Expects a JSON object with ``filepath`` (a file previously stored in the
    upload folder) and an optional ``output_dir``. When ``output_dir`` is an
    existing directory the file is copied there and the copy is updated;
    otherwise the uploaded file is updated in place.

    Returns:
        A JSON success payload with ``verified``, ``output_path`` and
        ``metadata``; or a JSON ``error`` with status 400 (bad body,
        unsupported type, no updater, no Excel row), 404 (file missing or
        outside the upload folder) or 500 (update failure or exception).
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # A missing, malformed or non-object JSON body cannot describe a file.
        return jsonify({'error': 'Invalid request body'}), 400

    filepath = data.get('filepath')
    output_dir = data.get('output_dir', '')  # User-selected output directory

    if not filepath or not isinstance(filepath, str) or not os.path.exists(filepath):
        return jsonify({'error': 'File not found'}), 404

    # The path comes from the client, so only files inside the upload folder
    # may be touched. Real paths are compared so symlinked temp dirs still match.
    upload_root = os.path.realpath(app.config['UPLOAD_FOLDER'])
    try:
        inside_uploads = os.path.commonpath(
            [upload_root, os.path.realpath(filepath)]
        ) == upload_root
    except ValueError:
        # Raised e.g. for paths on different drives.
        inside_uploads = False
    if not inside_uploads:
        return jsonify({'error': 'File not found'}), 404

    try:
        # Detect file type
        file_type = FileDetector.detect_file_type(filepath)

        if file_type == FileType.UNSUPPORTED:
            return jsonify({'error': 'Unsupported file type'}), 400

        # Get updater
        updater = updaters.get(file_type)

        if not updater:
            return jsonify({'error': 'No updater available for this file type'}), 400

        # Lookup metadata from Excel
        filename = Path(filepath).name
        lookup = get_metadata_lookup()
        excel_data = lookup.lookup_by_filename(filename)

        if excel_data:
            new_metadata = {
                'title': excel_data.get('title', ''),
                'subject': excel_data.get('description', ''),  # External Description/Alt Text
                'keywords': ''
            }
        else:
            return jsonify({'error': f'No metadata found in Excel for {filename}'}), 400

        # Copy file to output directory if specified
        if output_dir and os.path.isdir(output_dir):
            output_path = os.path.join(output_dir, filename)
            shutil.copy2(filepath, output_path)
            target_file = output_path
        else:
            target_file = filepath

        # Update the file metadata WITHOUT changing filename
        success = updater.update_metadata(target_file, new_metadata, backup=False)

        if not success:
            return jsonify({'error': 'Failed to update metadata'}), 500

        # Verify update
        verified = updater.verify_metadata(target_file, new_metadata)

        return jsonify({
            'success': True,
            'message': 'Metadata updated successfully',
            'verified': verified,
            'output_path': target_file,
            'metadata': new_metadata
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500
|
||
|
||
@app.route('/update-manual', methods=['POST'])
def update_manual_metadata():
    """Update file with manually entered metadata.

    Expects a JSON object with ``session_id``, ``file_index`` and the optional
    text fields ``title``, ``subject`` and ``keywords`` (trimmed and capped
    at 200, 300 and 500 characters respectively).

    Returns:
        A JSON payload with ``status``, ``message``, ``verified`` and
        ``metadata`` on success; or a JSON ``error`` with status 400 (bad
        body, session, index, or file type), 404 (file missing) or 500.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # A missing, malformed or non-object JSON body has no usable fields.
        return jsonify({'error': 'Invalid request body'}), 400

    session_id = data.get('session_id')
    file_index = data.get('file_index')

    def _clean(field, limit):
        """Return the field as trimmed text capped at ``limit`` characters."""
        value = data.get(field)
        if value is None:
            # JSON null is treated like a missing field.
            value = ''
        elif not isinstance(value, str):
            value = str(value)
        return value.strip()[:limit]

    # Validate and sanitize metadata
    custom_metadata = {
        'title': _clean('title', 200),
        'subject': _clean('subject', 300),
        'keywords': _clean('keywords', 500),
    }

    # Validate session (ids are strings; anything else cannot be a valid key)
    if not isinstance(session_id, str) or session_id not in sessions:
        return jsonify({'error': 'Invalid or expired session'}), 400

    session_files = sessions[session_id]['files']

    # Validate file index: it must be a real integer within range. Negative
    # values are rejected so they cannot silently address files from the end.
    if (
        not isinstance(file_index, int)
        or isinstance(file_index, bool)
        or not 0 <= file_index < len(session_files)
    ):
        return jsonify({'error': 'Invalid file index'}), 400

    try:
        # Get file info from session
        file_info = session_files[file_index]
        filepath = file_info.get('filepath')

        if not filepath or not os.path.exists(filepath):
            return jsonify({'error': 'File not found'}), 404

        # Detect file type
        file_type = FileDetector.detect_file_type(filepath)

        if file_type == FileType.UNSUPPORTED:
            return jsonify({'error': 'Unsupported file type'}), 400

        # Get updater for this file type
        updater = updaters.get(file_type)

        if not updater:
            return jsonify({'error': 'No updater available for this file type'}), 400

        # Update metadata
        success = updater.update_metadata(filepath, custom_metadata, backup=True)

        if not success:
            return jsonify({'error': 'Failed to update metadata'}), 500

        # Update session with new metadata
        session_files[file_index]['suggested_metadata'] = custom_metadata

        # Verify update
        verified = updater.verify_metadata(filepath, custom_metadata)

        return jsonify({
            'status': 'success',
            'message': 'Metadata updated successfully',
            'verified': verified,
            'metadata': custom_metadata
        })

    except Exception as e:
        return jsonify({'error': f'Error updating metadata: {str(e)}'}), 500
|
||
|
||
@app.route('/download/<filename>')
def download_file(filename):
    """Send a file from the upload folder as an attachment, or a 404 JSON error."""
    # The requested name is sanitized the same way uploaded names are.
    target = os.path.join(app.config['UPLOAD_FOLDER'], safe_filename(filename))

    if not os.path.exists(target):
        return jsonify({'error': 'File not found'}), 404

    return send_file(target, as_attachment=True)
|
||
|
||
@app.route('/stats')
def get_stats():
    """Return the Excel lookup statistics as JSON, or a 500 JSON error."""
    try:
        stats = get_metadata_lookup().get_stats()
        payload = {'success': True, 'stats': stats}
        return jsonify(payload)
    except Exception as e:
        return jsonify({'error': str(e)}), 500
|
||
|
||
def open_browser():
    """Wait 1.5 seconds, then open the app URL in the default web browser."""
    startup_delay = 1.5
    app_url = 'http://localhost:5001'

    sleep(startup_delay)
    webbrowser.open(app_url)
|
||
|
||
if __name__ == '__main__':
    # Startup banner.
    banner = "=" * 60
    print(banner)
    print(f"{Config.APP_NAME} v{Config.APP_VERSION} - Web Interface")
    print(banner)

    # Report which optional dependencies are present.
    print("\n🔍 Checking dependencies...")

    # Excel spreadsheet used for lookups.
    if EXCEL_PATH.exists():
        print(f"✓ Excel file found: {EXCEL_PATH.name}")
    else:
        print(f"⚠️ Warning: Excel file not found at {EXCEL_PATH}")
        print(" Excel metadata lookup will not be available")
        print(" Please ensure the Excel file is in the project root")

    # OpenAI API key (optional).
    if not Config.OPENAI_API_KEY:
        print("ℹ️ OpenAI API key not configured (AI generation disabled)")
    else:
        print("✓ OpenAI API key configured (AI metadata generation available)")

    # ExifTool (optional).
    if not Config.check_exiftool():
        print("ℹ️ ExifTool not installed (using Python libraries)")
    else:
        print("✓ ExifTool available for enhanced metadata operations")

    # List the metadata sources; the AI line only appears when a key is set.
    print("\nMetadata sources available:")
    print(" • Excel lookup (Celum ID mapping)")
    if Config.OPENAI_API_KEY:
        print(" • AI generation (OpenAI)")
    print(" • Manual entry")
    print(" • File import (CSV/Excel/JSON)")

    print("\nStarting server...")
    print("Opening browser at http://localhost:5001")
    print("\nPress Ctrl+C to stop the server")
    print(banner)

    # Launch the browser from a daemon thread so it does not block the server.
    browser_thread = threading.Thread(target=open_browser, daemon=True)
    browser_thread.start()

    # Run the Flask app on the loopback interface.
    app.run(debug=False, port=5001, host='127.0.0.1')
|