# solventum-image-metadata/web_app.py

#!/usr/bin/env python3
"""
Oliver Metadata Tool - Web Interface
Universal metadata creation and management tool for files.
Flask-based web app for local or server deployment.
Supports multiple metadata sources: Excel, AI, manual entry, and file import.
"""

from flask import Flask, render_template, request, jsonify, send_file
from werkzeug.utils import secure_filename  # noqa: F401 - kept as fallback
from pathlib import Path
import os
import tempfile
import threading
import webbrowser
from time import sleep
import shutil
import unicodedata

from src.file_detector import FileDetector, FileType
from src.excel_metadata_lookup import ExcelMetadataLookup
from src.config import Config
from src.metadata_analyzer import MetadataAnalyzer

def safe_filename(filename):
    """Sanitize an uploaded filename while preserving Unicode characters.

    Unlike an ASCII-only sanitizer, this keeps non-ASCII letters (Chinese,
    Japanese, Korean, ...) intact and only neutralizes what is unsafe in a
    single path component.

    Args:
        filename: The client-supplied name. ``None`` or an empty string is
            accepted and treated as "no name".

    Returns:
        An NFC-normalized name with path separators replaced by ``_``,
        control characters (including NUL, CR, LF and TAB) removed, and
        leading/trailing dots and spaces stripped. Falls back to
        ``'unnamed_file'`` when nothing usable remains.
    """
    if not filename:
        return 'unnamed_file'

    # Normalize so visually identical names compare (and store) identically.
    cleaned = unicodedata.normalize('NFC', filename)
    # Path separators would let the name escape the target directory.
    cleaned = cleaned.replace('/', '_').replace('\\', '_')
    # Category "Cc" covers NUL and every other control character; these have
    # no place in a filename and can corrupt headers or log lines.
    cleaned = ''.join(ch for ch in cleaned if unicodedata.category(ch) != 'Cc')
    # Stripping dots also collapses "." and ".." to the empty string.
    cleaned = cleaned.strip('. ')
    return cleaned or 'unnamed_file'
from src.extractors.pdf_extractor import PDFExtractor
from src.extractors.image_extractor import ImageExtractor
from src.extractors.office_extractor import OfficeExtractor
from src.extractors.video_extractor import VideoExtractor
from src.updaters.pdf_updater import PDFUpdater
from src.updaters.image_updater import ImageUpdater
from src.updaters.office_updater import OfficeUpdater
from src.updaters.video_updater import VideoUpdater

app = Flask(__name__)
app.config.update(
    # Upper bound, in bytes, on an incoming request body (500MB).
    MAX_CONTENT_LENGTH=500 * 1024 * 1024,
    # Uploads are written to a fresh temporary directory created at import.
    UPLOAD_FOLDER=tempfile.mkdtemp(),
)

# Spreadsheet backing the Excel metadata lookup; expected next to this module.
EXCEL_PATH = Path(__file__).parent.joinpath(
    "Celum ID to Adobe Asset Path Mapping Spreadsheet (1).xlsx"
)

# Lazily created singletons; see get_metadata_lookup() / get_ai_analyzer().
metadata_lookup = None
ai_analyzer = None

# Metadata readers keyed by detected file type. Every Office flavour gets
# its own OfficeExtractor instance.
extractors = {}
extractors[FileType.PDF] = PDFExtractor()
extractors[FileType.IMAGE] = ImageExtractor()
extractors[FileType.OFFICE_DOC] = OfficeExtractor()
extractors[FileType.OFFICE_SHEET] = OfficeExtractor()
extractors[FileType.OFFICE_PRESENTATION] = OfficeExtractor()
extractors[FileType.VIDEO] = VideoExtractor()

# Metadata writers keyed by detected file type, mirroring `extractors`.
updaters = {}
updaters[FileType.PDF] = PDFUpdater()
updaters[FileType.IMAGE] = ImageUpdater()
updaters[FileType.OFFICE_DOC] = OfficeUpdater()
updaters[FileType.OFFICE_SHEET] = OfficeUpdater()
updaters[FileType.OFFICE_PRESENTATION] = OfficeUpdater()
updaters[FileType.VIDEO] = VideoUpdater()

# In-memory upload sessions: session id -> {'files': [...], 'metadata_source': ...}
sessions = {}

def get_metadata_lookup():
    """Return the shared Excel lookup, building it from EXCEL_PATH on first use."""
    global metadata_lookup
    if metadata_lookup is not None:
        return metadata_lookup

    # First call: construct once and cache in the module-level global.
    metadata_lookup = ExcelMetadataLookup(str(EXCEL_PATH))
    return metadata_lookup

def get_ai_analyzer():
    """Return the shared AI analyzer, creating it on first use.

    Returns:
        The cached ``MetadataAnalyzer`` instance, or ``None`` when
        ``Config.OPENAI_API_KEY`` is not set or when constructing the
        analyzer raises (the failure is logged, not propagated).
    """
    global ai_analyzer
    if ai_analyzer is not None:
        return ai_analyzer
    if not Config.OPENAI_API_KEY:
        return None

    # Function-scope import: `logging` is not among the imports visible at
    # the top of this module.
    import logging
    logger = logging.getLogger(__name__)

    try:
        ai_analyzer = MetadataAnalyzer()
    except Exception as e:
        # Best effort: callers treat None as "AI unavailable".
        logger.error(f"Failed to initialize AI analyzer: {e}")
        return None

    logger.info("AI analyzer initialized successfully")
    return ai_analyzer

@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

def _upload_filename_fallback(filename, subject='', ai_error=None):
    """Build a suggestion titled with the filename stem; ``ai_error`` adds an ``_ai_error`` key."""
    suggestion = {
        'title': Path(filename).stem,
        'subject': subject,
        'keywords': ''
    }
    if ai_error is not None:
        suggestion['_ai_error'] = ai_error
    return suggestion


def _upload_excel_suggestion(lookup, filename):
    """Look ``filename`` up in the Excel source; return ``(metadata, found)``."""
    excel_data = lookup.lookup_by_filename(filename)
    if excel_data:
        return {
            'title': excel_data.get('title', ''),
            'subject': excel_data.get('description', ''),
            'keywords': ''
        }, True
    # No Excel row for this file: suggest the filename stem instead.
    return _upload_filename_fallback(
        filename, f'No metadata found in Excel for {filename}'
    ), False


def _upload_ai_suggestion(extractor, filepath, filename, file_type):
    """Generate metadata with the AI analyzer, degrading to a filename-based fallback."""
    import logging
    logger = logging.getLogger(__name__)

    analyzer = get_ai_analyzer()
    if not analyzer:
        return _upload_filename_fallback(
            filename,
            'AI generation not available (OpenAI API key not configured)',
            'OpenAI API key not configured'
        )

    try:
        content = extractor.extract_content(str(filepath))

        if not content or len(content.strip()) < 10:
            # Too little extracted text to be worth sending to the analyzer.
            return _upload_filename_fallback(
                filename,
                'Insufficient content for AI analysis',
                'Not enough text content extracted'
            )

        suggestion = analyzer.analyze_content(content, filename, file_type)
        if '_tokens_used' in suggestion:
            logger.info(
                f"AI tokens used for {filename}: {suggestion['_tokens_used']}"
            )
        return suggestion

    except Exception as e:
        # Extraction or analysis failed; report it in the suggestion rather
        # than failing the whole file.
        logger.error(f"AI generation failed for {filename}: {e}")
        return _upload_filename_fallback(
            filename, f'AI generation error: {str(e)}', str(e)
        )


def _upload_suggest_metadata(metadata_source, lookup, extractor, filepath, filename, file_type):
    """Dispatch on ``metadata_source``; return ``(suggested_metadata, excel_found)``."""
    if metadata_source == 'excel' and lookup:
        return _upload_excel_suggestion(lookup, filename)
    if metadata_source == 'manual':
        # Empty fields for the user to fill in; the stem is only a title hint.
        return _upload_filename_fallback(filename), False
    if metadata_source == 'ai':
        return _upload_ai_suggestion(extractor, filepath, filename, file_type), False
    if metadata_source == 'import':
        # Import from file is a placeholder for now.
        return _upload_filename_fallback(
            filename, 'Import feature not yet implemented'
        ), False
    # Unknown source (or Excel source without a usable lookup): blank suggestion.
    return {'title': '', 'subject': '', 'keywords': ''}, False


def _upload_process_one(file, metadata_source, lookup):
    """Save one uploaded file and describe it; returns a success or an error entry."""
    filename = safe_filename(file.filename)
    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(filepath)

    file_type = FileDetector.detect_file_type(filepath)
    if file_type == FileType.UNSUPPORTED:
        return {'filename': filename, 'error': 'Unsupported file type'}

    extractor = extractors.get(file_type)
    if not extractor:
        return {'filename': filename, 'error': 'No extractor available'}

    # Metadata currently embedded in the file, shown next to the suggestion.
    old_metadata = extractor.read_metadata(filepath)
    new_metadata, excel_found = _upload_suggest_metadata(
        metadata_source, lookup, extractor, filepath, filename, file_type
    )

    return {
        'success': True,
        'filename': filename,
        'file_type': file_type.value,
        'current_metadata': old_metadata,
        'suggested_metadata': new_metadata,
        'filepath': filepath,
        'metadata_source': metadata_source,
        'excel_found': excel_found
    }


@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle multiple file uploads and suggest metadata for each file.

    Form fields:
        files: One or more uploaded files.
        metadata_source: ``excel`` (default), ``manual``, ``ai`` or ``import``.

    Returns:
        JSON ``{'success': True, 'session_id': ..., 'files': [...]}``. Each
        entry in ``files`` is either a success record (with current and
        suggested metadata) or ``{'filename': ..., 'error': ...}``.
        Responds 400 when no files are supplied and 500 (as JSON) when the
        Excel lookup cannot be loaded.

    NOTE(review): failed files appear in the response list but are not
    stored in the session, so response positions and session indices can
    diverge. Confirm how the client derives ``file_index`` for
    ``/update-manual``.
    """
    import logging
    logger = logging.getLogger(__name__)

    if 'files' not in request.files:
        return jsonify({'error': 'No files provided'}), 400

    files = request.files.getlist('files')
    if not files or files[0].filename == '':
        return jsonify({'error': 'No files selected'}), 400

    # Get metadata source choice (excel, manual, ai, import)
    metadata_source = request.form.get('metadata_source', 'excel')

    # Load the lookup before creating a session so a failure here returns a
    # JSON error (like the other endpoints) and leaves no empty session.
    try:
        lookup = get_metadata_lookup() if metadata_source == 'excel' else None
    except Exception as e:
        logger.error(f"Failed to load Excel metadata lookup: {e}")
        return jsonify({'error': str(e)}), 500

    session_id = str(len(sessions) + 1)
    session_files = []
    sessions[session_id] = {'files': session_files, 'metadata_source': metadata_source}

    results = []
    for file in files:
        try:
            entry = _upload_process_one(file, metadata_source, lookup)
        except Exception as e:
            # One bad file must not abort the batch; record and move on.
            logger.exception("Upload processing failed for %s", file.filename)
            results.append({
                'filename': file.filename,
                'error': str(e)
            })
            continue

        results.append(entry)
        # Only successfully processed files are addressable in the session.
        if entry.get('success'):
            session_files.append(entry)

    return jsonify({
        'success': True,
        'session_id': session_id,
        'files': results
    })

def _update_path_in_upload_folder(path):
    """Return True when ``path`` resolves to a location inside the upload folder."""
    upload_root = os.path.normcase(os.path.realpath(app.config['UPLOAD_FOLDER']))
    candidate = os.path.normcase(os.path.realpath(path))
    try:
        return os.path.commonpath([upload_root, candidate]) == upload_root
    except ValueError:
        # commonpath raises when the paths cannot be compared
        # (for example, different drives on Windows).
        return False


@app.route('/update', methods=['POST'])
def update_metadata():
    """Update file metadata from Excel and save to chosen location.

    JSON body:
        filepath: Path of a previously uploaded file (as returned by
            ``/upload``). Must resolve inside the upload folder.
        output_dir: Optional directory to copy the file into before
            updating. When empty or not an existing directory, the
            uploaded file is updated in place.

    Returns:
        JSON with ``success``, ``verified``, ``output_path`` and the applied
        ``metadata`` on success. Responds 400 for an invalid body, an
        unsupported type, a missing updater or a file with no Excel entry;
        404 when the file is missing or outside the upload folder; 500 on
        update failure or unexpected errors.
    """
    # silent=True yields None instead of raising on a missing/invalid body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Invalid JSON body'}), 400

    filepath = data.get('filepath')
    output_dir = data.get('output_dir', '')  # User-selected output directory

    # Require a string: os.path.exists() would treat an int as a file descriptor.
    if not filepath or not isinstance(filepath, str) or not os.path.exists(filepath):
        return jsonify({'error': 'File not found'}), 404

    # The path comes from the client, so refuse anything outside the upload
    # folder. Answer exactly like a missing file to avoid leaking existence.
    if not _update_path_in_upload_folder(filepath):
        return jsonify({'error': 'File not found'}), 404

    try:
        # Detect file type
        file_type = FileDetector.detect_file_type(filepath)

        if file_type == FileType.UNSUPPORTED:
            return jsonify({'error': 'Unsupported file type'}), 400

        # Get updater
        updater = updaters.get(file_type)

        if not updater:
            return jsonify({'error': 'No updater available for this file type'}), 400

        # Lookup metadata from Excel
        filename = Path(filepath).name
        excel_data = get_metadata_lookup().lookup_by_filename(filename)

        if not excel_data:
            return jsonify({'error': f'No metadata found in Excel for {filename}'}), 400

        new_metadata = {
            'title': excel_data.get('title', ''),
            'subject': excel_data.get('description', ''),  # External Description/Alt Text
            'keywords': ''
        }

        # Copy file to output directory if specified
        if output_dir and os.path.isdir(output_dir):
            target_file = os.path.join(output_dir, filename)
            shutil.copy2(filepath, target_file)
        else:
            target_file = filepath

        # Update the file metadata WITHOUT changing filename
        success = updater.update_metadata(target_file, new_metadata, backup=False)

        if not success:
            return jsonify({'error': 'Failed to update metadata'}), 500

        # Verify update
        verified = updater.verify_metadata(target_file, new_metadata)

        return jsonify({
            'success': True,
            'message': 'Metadata updated successfully',
            'verified': verified,
            'output_path': target_file,
            'metadata': new_metadata
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500

def _manual_clean_text(value, max_length):
    """Coerce a submitted field to trimmed text of at most ``max_length`` characters."""
    if value is None:
        return ''
    return str(value).strip()[:max_length]


@app.route('/update-manual', methods=['POST'])
def update_manual_metadata():
    """Update file with manually entered metadata.

    JSON body:
        session_id: Id returned by ``/upload`` (string or number).
        file_index: Zero-based index into the session's stored files.
        title, subject, keywords: Text fields, truncated to 200, 300 and
            500 characters respectively. Missing or ``null`` values are
            treated as empty.

    Returns:
        JSON with ``status``/``success``, ``verified`` and the applied
        ``metadata``. Responds 400 for an invalid body, session, index or
        file type; 404 when the stored file no longer exists; 500 on update
        failure or unexpected errors.
    """
    # silent=True yields None instead of raising on a missing/invalid body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Invalid JSON body'}), 400

    session_id = data.get('session_id')
    if session_id is not None:
        # Session keys are strings; accept a numeric id from the client too.
        session_id = str(session_id)
    file_index = data.get('file_index')

    # Validate and sanitize metadata
    custom_metadata = {
        'title': _manual_clean_text(data.get('title'), 200),
        'subject': _manual_clean_text(data.get('subject'), 300),
        'keywords': _manual_clean_text(data.get('keywords'), 500)
    }

    # Validate session
    if not session_id or session_id not in sessions:
        return jsonify({'error': 'Invalid or expired session'}), 400

    # Validate file index: must be a real int within range. Negative values
    # would otherwise silently index from the end of the list.
    session_files = sessions[session_id]['files']
    if (
        isinstance(file_index, bool)
        or not isinstance(file_index, int)
        or not 0 <= file_index < len(session_files)
    ):
        return jsonify({'error': 'Invalid file index'}), 400

    try:
        # Get file info from session
        file_info = session_files[file_index]
        filepath = file_info.get('filepath')

        if not filepath or not os.path.exists(filepath):
            return jsonify({'error': 'File not found'}), 404

        # Detect file type
        file_type = FileDetector.detect_file_type(filepath)

        if file_type == FileType.UNSUPPORTED:
            return jsonify({'error': 'Unsupported file type'}), 400

        # Get updater for this file type
        updater = updaters.get(file_type)

        if not updater:
            return jsonify({'error': 'No updater available for this file type'}), 400

        # Update metadata
        success = updater.update_metadata(filepath, custom_metadata, backup=True)

        if not success:
            return jsonify({'error': 'Failed to update metadata'}), 500

        # Update session with new metadata
        file_info['suggested_metadata'] = custom_metadata

        # Verify update
        verified = updater.verify_metadata(filepath, custom_metadata)

        return jsonify({
            'status': 'success',
            # Same success flag the other endpoints return.
            'success': True,
            'message': 'Metadata updated successfully',
            'verified': verified,
            'metadata': custom_metadata
        })

    except Exception as e:
        return jsonify({'error': f'Error updating metadata: {str(e)}'}), 500

@app.route('/download/<filename>')
def download_file(filename):
    """Send a file from the upload folder as an attachment, or 404 if absent."""
    # The requested name is sanitized the same way as at upload time.
    target = os.path.join(app.config['UPLOAD_FOLDER'], safe_filename(filename))
    if not os.path.exists(target):
        return jsonify({'error': 'File not found'}), 404
    return send_file(target, as_attachment=True)

@app.route('/stats')
def get_stats():
    """Return the Excel lookup's statistics as JSON, or a 500 JSON error."""
    try:
        stats = get_metadata_lookup().get_stats()
        return jsonify(success=True, stats=stats)
    except Exception as exc:
        # Any failure (loading the lookup, computing or serializing the
        # stats) is reported to the client as a JSON error.
        return jsonify(error=str(exc)), 500

def open_browser():
    """Wait 1.5 seconds, then open the app's local URL in the web browser."""
    startup_delay_seconds = 1.5
    sleep(startup_delay_seconds)
    webbrowser.open('http://localhost:5001')

if __name__ == '__main__':
    banner = "=" * 60
    ai_enabled = bool(Config.OPENAI_API_KEY)

    print(banner)
    print(f"{Config.APP_NAME} v{Config.APP_VERSION} - Web Interface")
    print(banner)

    # Report which optional dependencies are present.
    print("\n🔍 Checking dependencies...")

    # Excel spreadsheet used by the lookup source
    if EXCEL_PATH.exists():
        print(f"✓ Excel file found: {EXCEL_PATH.name}")
    else:
        for line in (
            f"⚠️  Warning: Excel file not found at {EXCEL_PATH}",
            "   Excel metadata lookup will not be available",
            "   Please ensure the Excel file is in the project root",
        ):
            print(line)

    # OpenAI API key (optional)
    print(
        "✓ OpenAI API key configured (AI metadata generation available)"
        if ai_enabled
        else "ℹ️  OpenAI API key not configured (AI generation disabled)"
    )

    # ExifTool (optional)
    print(
        "✓ ExifTool available for enhanced metadata operations"
        if Config.check_exiftool()
        else "ℹ️  ExifTool not installed (using Python libraries)"
    )

    # List the metadata sources; the AI entry appears only with an API key.
    source_lines = ["  • Excel lookup (Celum ID mapping)"]
    if ai_enabled:
        source_lines.append("  • AI generation (OpenAI)")
    source_lines += ["  • Manual entry", "  • File import (CSV/Excel/JSON)"]

    print("\nMetadata sources available:")
    for line in source_lines:
        print(line)

    print("\nStarting server...")
    print("Opening browser at http://localhost:5001")
    print("\nPress Ctrl+C to stop the server")
    print(banner)

    # Launch the browser from a daemon thread so it does not block startup.
    browser_thread = threading.Thread(target=open_browser, daemon=True)
    browser_thread.start()

    # Serve on the loopback interface only.
    app.run(host='127.0.0.1', port=5001, debug=False)