From 3deaa5ef40a6a329c663ee7e1e56cd4542e2e3d8 Mon Sep 17 00:00:00 2001 From: SamoilenkoVadym Date: Mon, 9 Feb 2026 21:23:42 +0000 Subject: [PATCH] Initial commit: Oliver Metadata Tool (FastAPI) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete Flask → FastAPI migration with: - FastAPI app with session auth, Azure AD SSO, rate limiting - SQLite-backed session store (survives restarts) - Bulk AI metadata generation with SSE progress - Admin panel (user management, audit log, AI usage) - Subpath deployment support (ROOT_PATH config) - Docker + deploy.sh for production deployment - Test suite (auth, upload, templates, imports, admin, sessions) Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.example | 29 + .gitignore | 105 ++ DOCKER.md | 385 +++++++ Dockerfile | 64 ++ README.md | 515 +++++++++ app/__init__.py | 0 app/config.py | 101 ++ app/dependencies.py | 107 ++ app/main.py | 126 +++ app/models/__init__.py | 0 app/models/requests.py | 67 ++ app/models/responses.py | 70 ++ app/routers/__init__.py | 0 app/routers/admin.py | 126 +++ app/routers/auth.py | 251 +++++ app/routers/downloads.py | 116 ++ app/routers/imports.py | 201 ++++ app/routers/metadata.py | 179 +++ app/routers/sse.py | 67 ++ app/routers/templates.py | 182 +++ app/routers/upload.py | 302 +++++ app/security.py | 7 + app/services/__init__.py | 0 app/services/admin_service.py | 108 ++ app/services/ai_service.py | 189 ++++ app/services/auth_service.py | 207 ++++ app/services/file_service.py | 99 ++ app/services/metadata_service.py | 186 ++++ app/session/__init__.py | 0 app/session/store.py | 298 +++++ deploy.sh | 78 ++ deploy/apache-solventum-metadata.conf | 17 + deploy/deploy.sh | 94 ++ deploy/oliver-metadata.conf | 57 + deploy/oliver-metadata.service | 37 + docker-compose.yml | 44 + docker-run.sh | 165 +++ docs/EXIFTOOL_SETUP.md | 243 ++++ requirements.txt | 54 + run.py | 13 + src/__init__.py | 4 + src/auth.py | 324 ++++++ src/base_extractor.py | 64 ++ 
src/base_updater.py | 60 + src/config.py | 70 ++ src/database.py | 525 +++++++++ src/excel_metadata_lookup.py | 171 +++ src/extractors/__init__.py | 1 + src/extractors/exiftool_extractor.py | 174 +++ src/extractors/image_extractor.py | 179 +++ src/extractors/office_extractor.py | 207 ++++ src/extractors/pdf_extractor.py | 228 ++++ src/extractors/video_extractor.py | 153 +++ src/field_mapper.py | 409 +++++++ src/file_detector.py | 97 ++ src/main.py | 293 +++++ src/metadata_analyzer.py | 424 +++++++ src/metadata_importer.py | 427 +++++++ src/template_manager.py | 410 +++++++ src/updaters/__init__.py | 1 + src/updaters/exiftool_updater.py | 223 ++++ src/updaters/image_updater.py | 221 ++++ src/updaters/office_updater.py | 253 +++++ src/updaters/pdf_updater.py | 132 +++ src/updaters/video_updater.py | 185 +++ src/utils.py | 175 +++ static/css/admin.css | 204 ++++ static/css/app.css | 811 ++++++++++++++ static/js/admin.js | 265 +++++ static/js/app.js | 1488 +++++++++++++++++++++++++ templates/admin.html | 187 ++++ templates/index.html | 184 +++ templates/login.html | 302 +++++ tests/__init__.py | 0 tests/conftest.py | 95 ++ tests/test_admin.py | 30 + tests/test_auth.py | 68 ++ tests/test_imports.py | 36 + tests/test_session_store.py | 95 ++ tests/test_templates.py | 93 ++ tests/test_upload.py | 52 + web_app.py | 1381 +++++++++++++++++++++++ 82 files changed, 15590 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 DOCKER.md create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 app/__init__.py create mode 100644 app/config.py create mode 100644 app/dependencies.py create mode 100644 app/main.py create mode 100644 app/models/__init__.py create mode 100644 app/models/requests.py create mode 100644 app/models/responses.py create mode 100644 app/routers/__init__.py create mode 100644 app/routers/admin.py create mode 100644 app/routers/auth.py create mode 100644 app/routers/downloads.py create mode 100644 
app/routers/imports.py create mode 100644 app/routers/metadata.py create mode 100644 app/routers/sse.py create mode 100644 app/routers/templates.py create mode 100644 app/routers/upload.py create mode 100644 app/security.py create mode 100644 app/services/__init__.py create mode 100644 app/services/admin_service.py create mode 100644 app/services/ai_service.py create mode 100644 app/services/auth_service.py create mode 100644 app/services/file_service.py create mode 100644 app/services/metadata_service.py create mode 100644 app/session/__init__.py create mode 100644 app/session/store.py create mode 100755 deploy.sh create mode 100644 deploy/apache-solventum-metadata.conf create mode 100755 deploy/deploy.sh create mode 100644 deploy/oliver-metadata.conf create mode 100644 deploy/oliver-metadata.service create mode 100644 docker-compose.yml create mode 100755 docker-run.sh create mode 100644 docs/EXIFTOOL_SETUP.md create mode 100644 requirements.txt create mode 100644 run.py create mode 100644 src/__init__.py create mode 100644 src/auth.py create mode 100644 src/base_extractor.py create mode 100644 src/base_updater.py create mode 100644 src/config.py create mode 100644 src/database.py create mode 100644 src/excel_metadata_lookup.py create mode 100644 src/extractors/__init__.py create mode 100644 src/extractors/exiftool_extractor.py create mode 100644 src/extractors/image_extractor.py create mode 100644 src/extractors/office_extractor.py create mode 100644 src/extractors/pdf_extractor.py create mode 100644 src/extractors/video_extractor.py create mode 100644 src/field_mapper.py create mode 100644 src/file_detector.py create mode 100644 src/main.py create mode 100644 src/metadata_analyzer.py create mode 100644 src/metadata_importer.py create mode 100644 src/template_manager.py create mode 100644 src/updaters/__init__.py create mode 100644 src/updaters/exiftool_updater.py create mode 100644 src/updaters/image_updater.py create mode 100644 src/updaters/office_updater.py 
create mode 100644 src/updaters/pdf_updater.py create mode 100644 src/updaters/video_updater.py create mode 100644 src/utils.py create mode 100644 static/css/admin.css create mode 100644 static/css/app.css create mode 100644 static/js/admin.js create mode 100644 static/js/app.js create mode 100644 templates/admin.html create mode 100644 templates/index.html create mode 100644 templates/login.html create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_admin.py create mode 100644 tests/test_auth.py create mode 100644 tests/test_imports.py create mode 100644 tests/test_session_store.py create mode 100644 tests/test_templates.py create mode 100644 tests/test_upload.py create mode 100644 web_app.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..06842cd --- /dev/null +++ b/.env.example @@ -0,0 +1,29 @@ +# Solventum Image Metadata Tool — Environment Configuration +# Copy this file to .env and fill in your secrets: +# cp .env.example .env + +# === Required === +# Generate with: python3 -c "import secrets; print(secrets.token_hex(32))" +SECRET_KEY=CHANGE_ME_GENERATE_A_RANDOM_KEY +DOCKER_MODE=true +# Subpath prefix (must match Apache reverse proxy config, no trailing slash) +ROOT_PATH=/solventum-image-metadata + +# === Azure AD / SSO === +AZURE_TENANT_ID=e519c2e6-bc6d-4fdf-8d9c-923c2f002385 +AZURE_CLIENT_ID=9079054c-9620-4757-a256-23413042f1ef +AZURE_CLIENT_SECRET=YOUR_AZURE_CLIENT_SECRET_HERE +# Must match Azure AD App Registration > Authentication > Redirect URIs exactly +REDIRECT_URI=https://ai-sandbox.oliver.solutions/solventum-image-metadata/auth/callback + +# === OpenAI (optional — for AI metadata generation) === +OPENAI_API_KEY= + +# === Admin === +# This email will be auto-created as admin on first startup (SSO login) +SUPERADMIN_EMAIL=vadymsamoilenko@oliver.agency + +# === Options === +ENABLE_TEST_USER=false +HTTPS_ONLY=true +DEBUG=false diff --git a/.gitignore b/.gitignore new file mode 
100644 index 0000000..38c241d --- /dev/null +++ b/.gitignore @@ -0,0 +1,105 @@ +# These are some examples of commonly ignored file patterns. +# You should customize this list as applicable to your project. +# Learn more about .gitignore: +# https://www.atlassian.com/git/tutorials/saving-changes/gitignore + +# Node artifact files +node_modules/ +dist/ + +# Compiled Java class files +*.class + +# Compiled Python bytecode +*.py[cod] + +# Log files +*.log + +# Package files +*.jar + +# Maven +target/ +dist/ + +# JetBrains IDE +.idea/ + +# Unit test reports +TEST*.xml + +# Generated by MacOS +.DS_Store + +# Generated by Windows +Thumbs.db + +# Applications +*.app +*.exe +*.war + +# Large media files +*.mp4 +*.tiff +*.avi +*.flv +*.mov +*.wmv + +# Python virtual environments +venv/ +venv_new/ +venv_local/ +env/ +ENV/ +.venv/ + +# Python cache +__pycache__/ +*.pyc + +# Environment variables +.env +.env.local + +# Excel files with data +*.xlsx +*.xls + +# Uploads and output directories +uploads/ +output/ +Files/ + +# IDE +.vscode/ +.claude/ +CLAUDE.md + +# Database files +*.db +*.sqlite +*.sqlite3 + +# Server files +server.pid +server.log +nohup.out + +# Test files +test_*.csv +test_*.xlsx +test_*.json +TEST_REPORT.md + +# Docker +.dockerignore +docker-compose.override.yml + +# Backup files +*.tar.gz +*.zip +backup-*/ + diff --git a/DOCKER.md b/DOCKER.md new file mode 100644 index 0000000..721ce88 --- /dev/null +++ b/DOCKER.md @@ -0,0 +1,385 @@ +# Docker Deployment Guide + +Complete guide for deploying Oliver Metadata Tool using Docker. + +## Prerequisites + +- Docker 20.10+ installed +- Docker Compose 2.0+ installed +- 2GB+ available disk space +- Network access for pulling base images + +## Quick Start + +### 1. Build and Start + +```bash +# Using docker-compose directly +docker-compose up -d + +# Or using the helper script +./docker-run.sh build +./docker-run.sh start +``` + +### 2. 
Access Application + +Open browser at: **http://localhost:5001** + +Default credentials: +- Username: `tester` +- Password: `oliveradmin` + +### 3. View Logs + +```bash +# Using docker-compose +docker-compose logs -f + +# Or using the helper script +./docker-run.sh logs +``` + +## Configuration + +### Environment Variables + +Create `.env` file in project root (optional): + +```env +# Required for AI metadata generation +OPENAI_API_KEY=your-openai-api-key-here + +# Optional: AI Configuration +AI_MODEL=gpt-4o-mini +MAX_TOKENS=500 +TEMPERATURE=0.5 + +# Optional: Microsoft SSO +AZURE_CLIENT_ID=your-azure-client-id +AZURE_CLIENT_SECRET=your-azure-client-secret +AZURE_TENANT_ID=your-azure-tenant-id +REDIRECT_URI=http://localhost:5001/auth/callback + +# Optional: Flask secret key +SECRET_KEY=your-secret-key-here +``` + +### Docker Compose Configuration + +The `docker-compose.yml` file includes: + +- **Port mapping**: `5001:5001` +- **Persistent volumes**: + - `uploads:/app/uploads` - Temporary file uploads + - `database:/app/data` - SQLite database + - `output:/app/output` - Processed files, backups, reports +- **Auto-restart**: Container restarts unless explicitly stopped +- **Health checks**: Every 30 seconds + +## Management Commands + +### Using docker-run.sh Script + +```bash +# Build image +./docker-run.sh build + +# Start application +./docker-run.sh start + +# Stop application +./docker-run.sh stop + +# Restart application +./docker-run.sh restart + +# View logs +./docker-run.sh logs + +# Show status +./docker-run.sh status + +# Clean up (removes data!) +./docker-run.sh clean +``` + +### Using Docker Compose Directly + +```bash +# Build image +docker-compose build + +# Start in background +docker-compose up -d + +# Start with logs +docker-compose up + +# Stop +docker-compose down + +# Restart +docker-compose restart + +# View logs +docker-compose logs -f + +# Check status +docker-compose ps + +# Remove containers and volumes (deletes data!) 
+docker-compose down -v +``` + +## Data Persistence + +### Volumes + +Three Docker volumes persist data between container restarts: + +1. **uploads** - `/app/uploads` + - Temporary file uploads during processing + - Cleared when files are downloaded + +2. **database** - `/app/data` + - SQLite database (`oliver_metadata.db`) + - User accounts, sessions, audit logs + +3. **output** - `/app/output` + - Processed files + - Backups + - Reports + - Templates + +### Backup Data + +```bash +# Backup database +docker-compose exec oliver-metadata tar -czf /tmp/backup.tar.gz /app/data +docker cp oliver-metadata-tool:/tmp/backup.tar.gz ./backup-$(date +%Y%m%d).tar.gz + +# Or backup entire volumes +docker run --rm -v oliver-metadata_database:/data -v $(pwd):/backup alpine tar -czf /backup/database-backup.tar.gz -C /data . +``` + +### Restore Data + +```bash +# Stop container +docker-compose down + +# Remove old volume +docker volume rm oliver-metadata_database + +# Recreate volume and restore +docker run --rm -v oliver-metadata_database:/data -v $(pwd):/backup alpine tar -xzf /backup/database-backup.tar.gz -C /data + +# Start container +docker-compose up -d +``` + +## Troubleshooting + +### Container won't start + +```bash +# Check logs +docker-compose logs + +# Check if port is in use +lsof -i :5001 + +# Rebuild image +docker-compose build --no-cache +``` + +### Permission issues + +```bash +# Check volume permissions +docker-compose exec oliver-metadata ls -la /app/uploads /app/data /app/output + +# Fix permissions (if needed) +docker-compose exec oliver-metadata chown -R root:root /app/uploads /app/data /app/output +``` + +### Database locked errors + +```bash +# Stop container +docker-compose down + +# Start with fresh database +docker volume rm oliver-metadata_database +docker-compose up -d +``` + +### ExifTool not found + +ExifTool is installed in the Docker image. 
Verify: + +```bash +docker-compose exec oliver-metadata exiftool -ver +``` + +Should output version 12.15+ + +### Memory issues + +Increase Docker memory allocation: +- Docker Desktop → Settings → Resources → Memory +- Recommended: 2GB minimum, 4GB+ for large batches + +## Production Deployment + +### Security Recommendations + +1. **Change default credentials** + - Create new users via web interface + - Disable or remove test account + +2. **Use environment variables** + - Never commit `.env` to git + - Use secrets management (Docker secrets, Kubernetes secrets) + +3. **Enable HTTPS** + - Use reverse proxy (nginx, Traefik, Caddy) + - Terminate SSL at proxy level + +4. **Set custom secret key** + ```env + SECRET_KEY=$(openssl rand -hex 32) + ``` + +5. **Limit file upload size** + - Default: 500MB + - Adjust via nginx/proxy if needed + +### Reverse Proxy Example (nginx) + +```nginx +server { + listen 80; + server_name metadata.example.com; + + location / { + proxy_pass http://localhost:5001; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # Increase timeouts for large file uploads + proxy_read_timeout 300; + proxy_connect_timeout 300; + proxy_send_timeout 300; + } +} +``` + +### Resource Limits + +Add to `docker-compose.yml`: + +```yaml +services: + oliver-metadata: + # ... existing config ... 
+ deploy: + resources: + limits: + cpus: '2.0' + memory: 4G + reservations: + cpus: '1.0' + memory: 2G +``` + +## System Requirements + +### Container Resources + +- **CPU**: 1-2 cores (AI generation can use more) +- **Memory**: 2GB minimum, 4GB recommended +- **Disk**: 5GB+ (depends on file volume) + +### Host Requirements + +- **OS**: Linux, macOS, Windows with WSL2 +- **Docker**: 20.10+ +- **Architecture**: x86_64/amd64 (ARM64 may work but untested) + +## Updates + +### Update to latest version + +```bash +# Pull latest code +git pull origin main + +# Rebuild image +docker-compose build + +# Restart containers +docker-compose up -d +``` + +### Update Python dependencies + +```bash +# Rebuild without cache +docker-compose build --no-cache + +# Restart +docker-compose up -d +``` + +## Monitoring + +### Health Checks + +Built-in health check runs every 30 seconds: + +```bash +# Check health status +docker ps + +# View health check logs +docker inspect oliver-metadata-tool | jq '.[0].State.Health' +``` + +### Resource Usage + +```bash +# Real-time stats +docker stats oliver-metadata-tool + +# Container info +docker inspect oliver-metadata-tool +``` + +## Support + +For issues or questions: +1. Check logs: `docker-compose logs -f` +2. Verify configuration: `docker-compose config` +3. Test connection: `curl http://localhost:5001/login` +4. Open GitHub issue with logs and configuration + +## FAQ + +**Q: Can I change the port?** +A: Yes, edit `docker-compose.yml` port mapping: `"8080:5001"` + +**Q: Does this work on ARM (Apple Silicon)?** +A: Should work but untested. 
# Oliver Metadata Tool - Docker Image
# Single-stage build on python:3.11-slim (there is only one FROM below):
# installs the system tools the app shells out to, then the Python
# dependencies, and serves app.main:app with gunicorn + uvicorn workers.

FROM python:3.11-slim AS base

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    # ExifTool - critical for metadata operations (300+ formats)
    libimage-exiftool-perl \
    # Tesseract OCR with CJK language support
    tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-chi-sim \
    tesseract-ocr-chi-tra \
    tesseract-ocr-jpn \
    tesseract-ocr-kor \
    # Poppler for PDF to image conversion
    poppler-utils \
    # FFmpeg for video processing
    ffmpeg \
    # curl for health check
    curl \
    # Build dependencies
    gcc \
    && rm -rf /var/lib/apt/lists/*

# Verify ExifTool installation (fails the build early if it is missing)
RUN exiftool -ver

# Copy requirements first for better layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p /app/uploads /app/output /app/data /app/templates_saved

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV DOCKER_MODE=true

# Expose port
EXPOSE 5001

# Health check: the container is healthy while the login page answers
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
    CMD curl -sf http://localhost:5001/login || exit 1

# Run application with gunicorn + uvicorn workers
CMD ["gunicorn", "app.main:app", \
     "--worker-class", "uvicorn.workers.UvicornWorker", \
     "--workers", "2", \
     "--bind", "0.0.0.0:5001", \
     "--timeout", "120", \
     "--graceful-timeout", "30", \
     "--access-logfile", "-", \
     "--error-logfile", "-"]
+ +**Developer:** Vadym Samoilenko +**License:** Corporate License - Oliver Marketing +**Version:** 3.1 (Enterprise Edition) + +--- + +## Features + +### Multiple Metadata Sources +- **📂 File Import**: Import metadata from CSV, Excel, or JSON with smart column mapping and sheet selection +- **🤖 AI Generation**: OpenAI-powered intelligent metadata generation +- **✏️ Manual Entry**: Direct editing with real-time validation +- **📋 Templates**: Reusable metadata templates with variables + +### Enterprise Features +- **🔐 Authentication**: Local user authentication + Microsoft SSO support +- **👥 User Management**: SQLite database for users and sessions +- **📊 Audit Logging**: Track all user actions and metadata changes +- **🔍 AI Usage Tracking**: Monitor OpenAI token usage and costs + +### File Support +- **300+ File Formats** via ExifTool integration +- **PDF Files**: Full metadata support (title, subject, keywords, author, copyright) +- **Images**: JPEG, PNG, GIF, HEIC, TIFF, RAW formats +- **Office Documents**: Word, Excel, PowerPoint +- **Video Files**: MP4, MOV, AVI, MKV +- **Unicode Support**: Full support for Chinese, Japanese, Korean characters + +### Advanced Capabilities +- **Smart Field Mapping**: Auto-detect columns with fuzzy matching +- **Batch Processing**: Process multiple files with selective updates +- **Custom Metadata Fields**: Add unlimited custom fields +- **CSV Export**: Export metadata and processing results +- **Template Variables**: {filename}, {date}, {user}, custom variables + +--- + +## Requirements + +### System Dependencies +- **Python 3.8+** +- **ExifTool 12.15+** (required for 300+ format support) +- **Tesseract OCR** (optional - for image text extraction) +- **Poppler** (optional - for PDF content extraction) + +### Python Dependencies +All listed in `requirements.txt`: +- Flask 2.3.0+ (Web framework) +- pandas, openpyxl (Excel/CSV processing) +- PyExifTool 0.5.6+ (Metadata operations) +- openai 1.0.0+ (AI generation) +- tiktoken 0.5.0+ 
(Token counting) +- tenacity 8.2.0+ (Retry logic) +- msal (Microsoft SSO - optional) + +--- + +## Installation + +### 1. Install System Dependencies + +**macOS:** +```bash +brew install exiftool tesseract tesseract-lang poppler +``` + +**Linux (Ubuntu/Debian):** +```bash +sudo apt-get install libimage-exiftool-perl tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra tesseract-ocr-jpn tesseract-ocr-kor poppler-utils +``` + +**Windows:** +```bash +# Install ExifTool from: https://exiftool.org/ +choco install exiftool tesseract +``` + +**Verify ExifTool Installation:** +```bash +exiftool -ver +# Should show version 12.15 or higher +``` + +See [docs/EXIFTOOL_SETUP.md](docs/EXIFTOOL_SETUP.md) for detailed setup instructions. + +### 2. Create Virtual Environment + +```bash +python3 -m venv venv_local +source venv_local/bin/activate # On Windows: venv_local\Scripts\activate +``` + +### 3. Install Python Dependencies + +```bash +pip install -r requirements.txt +``` + +### 4. Configure Environment Variables + +Create a `.env` file in the project root: + +```env +# Required: OpenAI API Key (for AI metadata generation) +OPENAI_API_KEY=your-openai-api-key-here + +# Optional: Microsoft SSO (for enterprise authentication) +# AZURE_CLIENT_ID=your-azure-client-id +# AZURE_CLIENT_SECRET=your-azure-client-secret +# AZURE_TENANT_ID=your-azure-tenant-id +# REDIRECT_URI=http://localhost:5001/auth/callback + +# Optional: Flask secret key (auto-generated if not set) +# SECRET_KEY=your-secret-key-here + +# Optional: AI settings (defaults shown) +# AI_MODEL=gpt-4o-mini +# MAX_TOKENS=500 +# TEMPERATURE=0.5 +# API_TIMEOUT=30 +# API_MAX_RETRIES=3 +``` + +### 5. Initialize Database + +The database will be created automatically on first run. 
To manually initialize: + +```bash +python -c "from src.database import Database; db = Database(); print('Database initialized')" +``` + +--- + +## Docker Deployment (Recommended) + +### Quick Start with Docker + +```bash +# Build and start +docker-compose up -d + +# Or use the helper script +./docker-run.sh build +./docker-run.sh start + +# Access at http://localhost:5001 +``` + +**Benefits:** +- ✅ No manual dependency installation +- ✅ Consistent environment across systems +- ✅ Persistent data storage via volumes +- ✅ Easy updates and rollbacks +- ✅ Production-ready configuration + +**See [DOCKER.md](DOCKER.md) for complete Docker deployment guide.** + +--- + +## Usage + +### Starting the Web Application + +**Local Development:** +```bash +python web_app.py +``` + +**Docker:** +```bash +docker-compose up -d +``` + +The application will: +1. ✅ Check for ExifTool availability +2. ✅ Initialize SQLite database (users, sessions, audit_log) +3. ✅ Start Flask server on http://localhost:5001 +4. 🌐 Open browser automatically (local mode only) + +### Login + +**Test Account:** +- Username: `tester` +- Password: `oliveradmin` + +**Microsoft SSO** (if configured): +- Click "Sign in with Microsoft" button +- Authenticate via Azure AD +- Users auto-created on first login + +### Using Metadata Sources + +#### 1. Import from File +1. Select "Import from File (CSV/Excel/JSON)" from metadata source dropdown (default) +2. Click "Choose File" and select your metadata file +3. Configure mapping modal: + - For Excel files: Select sheet name + - Map columns: Filename (required), Title, Description, Keywords + - Auto-detection suggests best matches + - Preview first 3 rows +4. Confirm mapping +5. Upload files to process - tool matches files by filename + +#### 2. AI Generation +1. Select "AI Generation" from metadata source dropdown +2. Upload files +3. AI generates metadata (10-30 seconds per file) +4. Review and edit generated metadata +5. Save changes + +#### 3. Manual Entry +1. 
Select "Manual Entry" +2. Upload files +3. Fill in metadata fields manually +4. Save changes + +#### 4. Templates +1. Create template with variables +2. Select template from dropdown +3. Apply to selected files +4. Review and save + +### Batch Operations + +1. Upload multiple files +2. Use checkboxes to select files +3. "Select All" / "Deselect All" buttons +4. Edit metadata individually +5. Click "Update Selected Files" to save all at once +6. Export results to CSV + +--- + +## Configuration + +### Database Schema + +**Users Table:** +- id, username, password_hash, email, full_name +- auth_method (local/sso) +- created_at, last_login, is_active + +**Sessions Table:** +- session_id, user_id, created_at, expires_at +- ip_address, user_agent + +**Audit Log Table:** +- id, user_id, action, details, timestamp + +### AI Usage Tracking + +Every AI metadata generation is logged with: +- User ID +- Timestamp +- Tokens used (prompt + completion) +- Cost estimate (based on gpt-4o-mini pricing) + +View logs in database: +```sql +SELECT * FROM audit_log WHERE action = 'ai_generation' ORDER BY timestamp DESC; +``` + +### User Management + +**Create New User:** +```python +from src.database import Database +db = Database() +db.create_user( + username='newuser', + password='password123', + email='user@example.com', + full_name='New User', + auth_method='local' +) +``` + +**List All Users:** +```python +users = db.get_all_users() +for user in users: + print(f"{user['username']} - Last login: {user['last_login']}") +``` + +--- + +## Architecture + +### File Structure + +``` +oliver-metadata-tool/ +├── web_app.py # Flask web application (main entry point) +├── requirements.txt # Python dependencies +├── .env # Environment configuration +├── oliver_metadata.db # SQLite database (auto-created) +├── src/ +│ ├── config.py # Configuration management +│ ├── database.py # Database operations +│ ├── auth.py # Authentication logic +│ ├── metadata_analyzer.py # AI metadata generation +│ ├── 
metadata_importer.py # Import from files +│ ├── template_manager.py # Template system +│ ├── field_mapper.py # Column mapping +│ ├── excel_metadata_lookup.py # Excel lookup +│ ├── extractors/ +│ │ ├── pdf_extractor.py +│ │ ├── image_extractor.py +│ │ ├── office_extractor.py +│ │ ├── video_extractor.py +│ │ └── exiftool_extractor.py +│ └── updaters/ +│ ├── pdf_updater.py +│ ├── image_updater.py +│ ├── office_updater.py +│ ├── video_updater.py +│ └── exiftool_updater.py +├── templates/ +│ ├── index.html # Main UI +│ └── login.html # Login page +└── docs/ + └── EXIFTOOL_SETUP.md # ExifTool setup guide +``` + +### Technology Stack + +- **Backend:** Flask (Python) +- **Database:** SQLite +- **Frontend:** HTML5, CSS3, JavaScript (Vanilla) +- **Design:** Montserrat font, Dark & Gold theme +- **Authentication:** Flask-Session, werkzeug.security, MSAL +- **AI:** OpenAI API (gpt-4o-mini) +- **Metadata:** PyExifTool, pypdf, python-docx, openpyxl + +--- + +## API Endpoints + +### Authentication +- `GET /login` - Login page +- `POST /login` - Authenticate user +- `GET /logout` - Destroy session +- `GET /login/microsoft` - Microsoft SSO redirect +- `GET /auth/callback` - SSO callback + +### File Operations +- `POST /upload` - Upload files and generate metadata +- `POST /update-manual` - Update file metadata manually +- `GET /download/` - Download processed file + +### Metadata Sources +- `POST /upload-excel` - Upload Excel file for mapping +- `POST /preview-excel-sheet` - Preview Excel sheet structure +- `POST /configure-excel-mapping` - Configure Excel column mapping +- `POST /import-metadata` - Upload import file for mapping +- `POST /configure-import-mapping` - Configure import column mapping + +### Templates +- `GET /templates/list` - List all templates +- `POST /templates/save` - Save new template +- `POST /templates/load` - Load template by name +- `DELETE /templates/delete` - Delete template +- `POST /templates/apply` - Apply template to files +- `POST /templates/preview` 
- Preview template output + +--- + +## Security & Privacy + +### Authentication +- Passwords hashed with werkzeug.security (pbkdf2:sha256) +- Session tokens: 32-byte cryptographically secure random strings +- Sessions expire after 24 hours +- Microsoft SSO via OAuth2 + Azure AD + +### Data Protection +- All credentials stored in `.env` (excluded from git) +- Database file excluded from git +- API keys never logged or exposed to frontend +- Audit trail for all user actions + +### Production Recommendations +1. **HTTPS:** Use SSL/TLS certificates in production +2. **Database:** Migrate to PostgreSQL for better concurrency +3. **Rate Limiting:** Add rate limits to prevent abuse +4. **CSRF Protection:** Enable Flask-WTF for form security +5. **Error Tracking:** Integrate Sentry or similar service +6. **Backups:** Regular database backups +7. **Monitoring:** Track AI token usage for cost management + +--- + +## Troubleshooting + +### Common Issues + +**ExifTool not found:** +```bash +# Verify installation +exiftool -ver + +# macOS: Reinstall with Homebrew +brew reinstall exiftool + +# Linux: Reinstall with apt +sudo apt-get install --reinstall libimage-exiftool-perl +``` + +**Database locked error:** +```bash +# Stop all instances +lsof -ti:5001 | xargs kill -9 + +# Restart application +python web_app.py +``` + +**OpenAI API errors:** +- Check API key in `.env` file +- Verify API key is valid at https://platform.openai.com/api-keys +- Check token usage limits on OpenAI dashboard + +**Import failed - column not found:** +- Use the mapping modal to manually select columns +- Check that your file has headers in the first row +- Verify file encoding is UTF-8 + +--- + +## Development + +### Running Tests + +```bash +# Unit tests (if implemented) +pytest tests/ + +# Manual integration test +python -c "from src.database import Database; from src.config import Config; print('✅ All imports successful')" +``` + +### Git Workflow + +```bash +# Check status +git status + +# Add 
changes +git add . + +# Commit with message +git commit -m "Your commit message" + +# Push to remote +git push origin main +``` + +--- + +## License & Credits + +**License:** Corporate License - Oliver Marketing +All rights reserved. Unauthorized copying, distribution, or modification is prohibited. + +**Developer:** Vadym Samoilenko +**Company:** Oliver Marketing +**Version:** 3.1 Enterprise Edition +**Release Date:** January 2026 + +**Third-Party Software:** +- ExifTool by Phil Harvey (Perl Artistic License) +- Flask by Pallets (BSD License) +- OpenAI API (Commercial License) +- PyExifTool (LGPL License) + +--- + +## Support + +For issues, questions, or feature requests: +- **Internal Support:** Contact IT department +- **Developer:** Vadym Samoilenko +- **Documentation:** See `docs/` folder + +--- + +## Changelog + +### v3.1 (January 2026) - Enterprise Edition +- ✅ User authentication (local + Microsoft SSO) +- ✅ SQLite database with audit logging +- ✅ Unified import from file (CSV/Excel/JSON) with smart column mapping +- ✅ Excel sheet selection and preview +- ✅ Custom metadata fields support +- ✅ AI usage tracking and cost monitoring +- ✅ Dark & Gold UI redesign +- ✅ Template variables and preview +- ✅ Batch selection and CSV export +- ✅ Consolidated metadata sources (removed redundant Excel Lookup) + +### v3.0 (January 2026) +- ✅ ExifTool integration (300+ formats) +- ✅ Multiple metadata sources (Import, AI, Manual) +- ✅ Field mapping with fuzzy matching +- ✅ Metadata templates system +- ✅ Rebranded to Oliver Metadata Tool + +### v2.x (Prior) +- Basic Excel lookup functionality +- Multi-format file support +- Web interface diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..87e8cb7 --- /dev/null +++ b/app/config.py @@ -0,0 +1,101 @@ +"""Application settings via pydantic-settings.""" + +import secrets +import os +from pathlib import Path +from 
class Settings(BaseSettings):
    """Application settings loaded from environment variables and the .env file.

    Path settings left empty are filled in during construction: Docker
    deployments use fixed ``/app`` locations, everything else is placed under
    the project root. Construction also creates the upload directory and the
    directories that hold both SQLite database files.
    """

    # App
    APP_NAME: str = "Oliver Metadata Tool"
    APP_VERSION: str = "4.0.0"
    DEBUG: bool = False
    DOCKER_MODE: bool = False
    ROOT_PATH: str = ""  # Subpath prefix, e.g. "/solventum-image-metadata"

    # Security
    # NOTE(review): this default is generated once per process at import time.
    # Unless SECRET_KEY is provided through the environment or .env, each
    # restart (and each worker process) gets a different key, so cookies
    # signed with the previous key stop validating. Set it explicitly in
    # production.
    SECRET_KEY: str = secrets.token_hex(32)
    HTTPS_ONLY: bool = False
    ENABLE_TEST_USER: bool = False

    # Paths (empty means "derive a default in __init__")
    UPLOAD_FOLDER: str = ""
    DB_PATH: str = ""
    SESSION_DB_PATH: str = ""
    TEMPLATES_DIR: str = ""

    # OpenAI
    OPENAI_API_KEY: str = ""
    AI_MODEL: str = "gpt-4o-mini"
    MAX_TOKENS: int = 500
    TEMPERATURE: float = 0.5
    MAX_TEXT_LENGTH: int = 4000
    API_TIMEOUT: int = 30
    API_MAX_RETRIES: int = 3

    # Azure SSO
    AZURE_CLIENT_ID: str = ""
    AZURE_CLIENT_SECRET: str = ""
    AZURE_TENANT_ID: str = ""
    REDIRECT_URI: str = "http://localhost:5001/auth/callback"

    # OCR
    OCR_LANGUAGES: str = "eng+chi_sim+chi_tra+jpn+kor"
    TESSERACT_PATH: str = ""
    FFMPEG_PATH: str = ""

    # Limits
    MAX_UPLOAD_SIZE_MB: int = 500
    SESSION_EXPIRE_HOURS: int = 24
    FILE_CLEANUP_HOURS: int = 24

    # Superadmin
    SUPERADMIN_EMAIL: str = "vadymsamoilenko@oliver.agency"

    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "extra": "ignore",
    }

    def __init__(self, **kwargs):
        """Load settings, fill in unset paths and create required directories."""
        super().__init__(**kwargs)
        project_root = Path(__file__).parent.parent

        if self.DOCKER_MODE:
            if not self.UPLOAD_FOLDER:
                self.UPLOAD_FOLDER = "/app/uploads"
            if not self.DB_PATH:
                self.DB_PATH = "/app/data/oliver_metadata.db"
            if not self.SESSION_DB_PATH:
                self.SESSION_DB_PATH = "/app/data/oliver_sessions.db"
        else:
            if not self.UPLOAD_FOLDER:
                self.UPLOAD_FOLDER = str(project_root / "uploads")
            if not self.DB_PATH:
                self.DB_PATH = str(project_root / "oliver_metadata.db")
            if not self.SESSION_DB_PATH:
                self.SESSION_DB_PATH = str(project_root / "oliver_sessions.db")

        if not self.TEMPLATES_DIR:
            self.TEMPLATES_DIR = str(project_root / "templates")

        # Ensure upload directory exists
        Path(self.UPLOAD_FOLDER).mkdir(parents=True, exist_ok=True)

        # Ensure the directory of *each* database file exists. The session
        # database can be configured independently of the main one, so its
        # parent directory has to be created as well.
        for db_file in (self.DB_PATH, self.SESSION_DB_PATH):
            Path(db_file).parent.mkdir(parents=True, exist_ok=True)


_settings = None


def get_settings() -> Settings:
    """Return the process-wide Settings instance, creating it on first use."""
    global _settings
    if _settings is None:
        _settings = Settings()
    return _settings
def get_database():
    """Return the shared Database singleton.

    Raises:
        RuntimeError: if init_dependencies() has not populated it yet.
    """
    if _database is not None:
        return _database
    raise RuntimeError("Database not initialized")


def get_session_store() -> SessionStore:
    """Return the shared SessionStore singleton.

    Raises:
        RuntimeError: if init_dependencies() has not populated it yet.
    """
    if _session_store is not None:
        return _session_store
    raise RuntimeError("SessionStore not initialized")


def get_auth_service() -> AuthService:
    """Return the shared AuthService singleton.

    Raises:
        RuntimeError: if init_dependencies() has not populated it yet.
    """
    if _auth_service is not None:
        return _auth_service
    raise RuntimeError("AuthService not initialized")


async def get_current_user(request: Request) -> Dict:
    """FastAPI dependency: require an authenticated user.

    Replaces Flask's @login_required decorator. The ``session_id`` stored in
    the cookie session is validated through the auth service and the owning
    user is loaded; any failure raises HTTP 401. When a session id is present
    but no longer valid (or its user is gone) the cookie session is cleared
    before raising.
    """

    def _unauthorized(detail: str) -> HTTPException:
        return HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=detail,
        )

    cookie_session_id = request.session.get("session_id")
    if not cookie_session_id:
        raise _unauthorized("Not authenticated")

    auth_service = get_auth_service()

    stored_session = auth_service.validate_session(cookie_session_id)
    if not stored_session:
        # Session expired or invalid — clear it
        request.session.clear()
        raise _unauthorized("Session expired")

    current_user = auth_service.get_user_by_id(stored_session["user_id"])
    if not current_user:
        request.session.clear()
        raise _unauthorized("User not found")

    return current_user


async def get_current_user_optional(request: Request) -> Optional[Dict]:
    """Like get_current_user, but yield None instead of raising HTTPException."""
    try:
        current_user = await get_current_user(request)
    except HTTPException:
        return None
    else:
        return current_user
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup/shutdown lifecycle: initialise shared dependencies, log both ends."""
    settings = get_settings()
    init_dependencies(settings)
    logger.info(f"{settings.APP_NAME} v{settings.APP_VERSION} starting")
    yield
    logger.info("Shutting down")


def create_app() -> FastAPI:
    """Build the FastAPI application.

    Wires up rate limiting, CORS, the signed-cookie session middleware, static
    files, Jinja templates, all routers, the index page and the HTTP exception
    handler that sends unauthenticated requests to the login page.
    """
    # Local import: keeps this block self-contained without touching the
    # module's import section.
    from urllib.parse import quote, urlsplit

    settings = get_settings()

    app = FastAPI(
        title=settings.APP_NAME,
        version=settings.APP_VERSION,
        root_path=settings.ROOT_PATH,
        docs_url="/docs" if settings.DEBUG else None,
        redoc_url=None,
        lifespan=lifespan,
    )

    app.state.limiter = limiter
    app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

    # CORS — same origin only (restrict in production).
    # An origin is exactly "scheme://host[:port]". Stripping only the last
    # path segment of REDIRECT_URI would leave ".../auth", which can never
    # equal a browser's Origin header, so derive it from the parsed URL.
    if settings.DEBUG:
        # NOTE(review): wildcard origins together with allow_credentials=True
        # is only acceptable for local development.
        allowed_origins = ["*"]
    else:
        redirect = urlsplit(settings.REDIRECT_URI)
        if redirect.scheme and redirect.netloc:
            allowed_origins = [f"{redirect.scheme}://{redirect.netloc}"]
        else:
            allowed_origins = []
    app.add_middleware(
        CORSMiddleware,
        allow_origins=allowed_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Session middleware (cookie-based)
    app.add_middleware(
        SessionMiddleware,
        secret_key=settings.SECRET_KEY,
        session_cookie="oliver_session",
        max_age=settings.SESSION_EXPIRE_HOURS * 3600,
        same_site="lax",
        https_only=settings.HTTPS_ONLY,
    )

    # Static files
    project_root = Path(__file__).parent.parent
    static_dir = project_root / "static"
    if static_dir.exists():
        app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")

    # Templates
    templates = Jinja2Templates(directory=settings.TEMPLATES_DIR)

    # Register routers
    from .routers import auth as auth_router
    from .routers import upload as upload_router
    from .routers import metadata as metadata_router
    from .routers import templates as templates_router
    from .routers import imports as imports_router
    from .routers import downloads as downloads_router
    from .routers import sse as sse_router
    from .routers import admin as admin_router

    auth_router.set_templates(templates)
    admin_router.set_templates(templates)
    app.include_router(auth_router.router)
    app.include_router(upload_router.router)
    app.include_router(metadata_router.router)
    app.include_router(templates_router.router)
    app.include_router(imports_router.router)
    app.include_router(downloads_router.router)
    app.include_router(sse_router.router)
    app.include_router(admin_router.router)

    # Main page
    @app.get("/", response_class=HTMLResponse)
    async def index(request: Request, user=Depends(get_current_user)):
        return templates.TemplateResponse(
            "index.html",
            {
                "request": request,
                "username": user["username"],
                "docker_mode": settings.DOCKER_MODE,
            },
        )

    # Redirect unauthenticated users to login
    @app.exception_handler(HTTPException)
    async def http_exception_handler(request: Request, exc: HTTPException):
        if exc.status_code == 401:
            root = request.scope.get("root_path", "")
            # Percent-encode the path so characters such as "&" or "#" stay
            # part of the `next` value instead of splitting the query string.
            next_path = quote(request.url.path, safe="/")
            return RedirectResponse(url=f"{root}/login?next={next_path}", status_code=302)
        # Return other HTTP exceptions as JSON, keeping any headers the
        # exception carries.
        from fastapi.responses import JSONResponse
        return JSONResponse(
            status_code=exc.status_code,
            content={"detail": exc.detail},
            headers=getattr(exc, "headers", None),
        )

    return app


app = create_app()
session_id: str + file_indices: List[int] + custom_vars: Optional[Dict[str, str]] = None + + +class TemplatePreviewRequest(BaseModel): + """Request to preview template output.""" + title: str = "" + subject: str = "" + keywords: str = "" + sample_filename: str = "example.pdf" + custom_vars: Optional[Dict[str, str]] = None + + +class DownloadSelectedRequest(BaseModel): + """Request to download selected files as ZIP.""" + session_id: str + file_indices: List[int] diff --git a/app/models/responses.py b/app/models/responses.py new file mode 100644 index 0000000..9c03d3f --- /dev/null +++ b/app/models/responses.py @@ -0,0 +1,70 @@ +"""Pydantic response models.""" + +from typing import Optional, Dict, List, Any +from pydantic import BaseModel + + +class FileResult(BaseModel): + """Result for a single processed file.""" + success: bool = True + filename: str + file_type: Optional[str] = None + current_metadata: Optional[Dict[str, str]] = None + suggested_metadata: Optional[Dict[str, str]] = None + metadata_source: Optional[str] = None + excel_found: bool = False + error: Optional[str] = None + + +class UploadResponse(BaseModel): + """Response from file upload endpoint.""" + success: bool + session_id: Optional[str] = None + files: List[FileResult] = [] + error: Optional[str] = None + + +class UpdateResponse(BaseModel): + """Response from metadata update endpoint.""" + success: bool = True + message: str = "" + verified: bool = False + metadata: Optional[Dict[str, str]] = None + error: Optional[str] = None + + +class ExcelUploadResponse(BaseModel): + """Response from Excel file upload.""" + success: bool + excel_session_id: Optional[str] = None + filename: Optional[str] = None + sheets: Optional[List[str]] = None + preview: Optional[Dict[str, Any]] = None + message: Optional[str] = None + error: Optional[str] = None + + +class ImportUploadResponse(BaseModel): + """Response from import file upload.""" + success: bool + import_session_id: Optional[str] = None + filename: 
Optional[str] = None + columns: Optional[List[str]] = None + sample_data: Optional[List[Dict[str, Any]]] = None + message: Optional[str] = None + error: Optional[str] = None + + +class MappingConfigResponse(BaseModel): + """Response from mapping configuration.""" + success: bool + excel_session_id: Optional[str] = None + import_session_id: Optional[str] = None + stats: Optional[Dict[str, int]] = None + message: Optional[str] = None + error: Optional[str] = None + + +class ErrorResponse(BaseModel): + """Standard error response.""" + error: str diff --git a/app/routers/__init__.py b/app/routers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/routers/admin.py b/app/routers/admin.py new file mode 100644 index 0000000..41699c4 --- /dev/null +++ b/app/routers/admin.py @@ -0,0 +1,126 @@ +"""Admin router: user management, audit log, AI usage stats.""" + +import logging +from typing import Dict + +from fastapi import APIRouter, Request, Depends +from fastapi.responses import HTMLResponse, JSONResponse +from fastapi.templating import Jinja2Templates + +from ..config import get_settings +from ..dependencies import get_current_admin, get_database +from ..services.admin_service import AdminService + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/admin", tags=["admin"]) + +_templates: Jinja2Templates = None +_admin_service: AdminService = None + + +def set_templates(templates: Jinja2Templates): + global _templates + _templates = templates + + +def _get_admin_service() -> AdminService: + global _admin_service + if _admin_service is None: + _admin_service = AdminService(database=get_database()) + return _admin_service + + +@router.get("", response_class=HTMLResponse) +async def admin_dashboard(request: Request, user: Dict = Depends(get_current_admin)): + """Admin dashboard page.""" + svc = _get_admin_service() + stats = svc.get_dashboard_stats() + return _templates.TemplateResponse( + "admin.html", + { + "request": request, + 
# Substrings that mark a payload key as sensitive; matching values are masked
# before the payload is written to the audit log.
_AUDIT_SENSITIVE_MARKERS = ("password", "secret", "token")


def _audit_safe(payload: Dict) -> Dict:
    """Return a copy of *payload* with sensitive values replaced by ``"***"``."""
    return {
        key: "***" if any(m in str(key).lower() for m in _AUDIT_SENSITIVE_MARKERS) else value
        for key, value in payload.items()
    }


def _clean_text(value) -> str:
    """Coerce an optional JSON value to a stripped string (``None`` becomes "")."""
    return "" if value is None else str(value).strip()


@router.get("/users")
async def list_users(
    include_inactive: bool = False,
    user: Dict = Depends(get_current_admin),
):
    """List all users."""
    svc = _get_admin_service()
    users = svc.list_users(include_inactive=include_inactive)
    return {"success": True, "users": users}


@router.post("/users")
async def create_user(
    request: Request,
    user: Dict = Depends(get_current_admin),
):
    """Create a new user from a JSON object body.

    Responds with 400 when the body is not a JSON object or the service
    reports that no user was created, and with 500 on unexpected errors.
    """
    try:
        try:
            data = await request.json()
        except ValueError:
            # json.JSONDecodeError is a ValueError: malformed body is a client error.
            return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
        if not isinstance(data, dict):
            return JSONResponse({"error": "Request body must be a JSON object"}, status_code=400)

        username = _clean_text(data.get("username"))
        svc = _get_admin_service()
        user_id = svc.create_user(
            username=username,
            email=_clean_text(data.get("email")),
            full_name=_clean_text(data.get("full_name")),
            role=data.get("role", "user"),
            password=data.get("password"),
            auth_method=data.get("auth_method", "local"),
        )
        if user_id:
            db = get_database()
            db.log_action(user["id"], "admin_create_user", f"Created user {username} (ID: {user_id})")
            return {"success": True, "user_id": user_id}
        return JSONResponse({"error": "Failed to create user (username may already exist)"}, status_code=400)
    except Exception as e:
        # Keep the response shape, but leave a traceback for operators.
        logger.exception("admin create_user failed")
        return JSONResponse({"error": str(e)}, status_code=500)


@router.put("/users/{user_id}")
async def update_user(
    user_id: int,
    request: Request,
    admin: Dict = Depends(get_current_admin),
):
    """Update user (role, is_active, full_name, email).

    The request payload is recorded in the audit log with sensitive values
    masked. Responds with 400 when the body is not a JSON object or the
    service applied no change, and with 500 on unexpected errors.
    """
    try:
        try:
            data = await request.json()
        except ValueError:
            return JSONResponse({"error": "Invalid JSON body"}, status_code=400)
        if not isinstance(data, dict):
            return JSONResponse({"error": "Request body must be a JSON object"}, status_code=400)

        svc = _get_admin_service()
        success = svc.update_user(user_id, data)
        if success:
            db = get_database()
            # Never persist raw credentials in the audit trail.
            db.log_action(admin["id"], "admin_update_user", f"Updated user {user_id}: {_audit_safe(data)}")
            return {"success": True}
        return JSONResponse({"error": "No changes applied"}, status_code=400)
    except Exception as e:
        logger.exception("admin update_user failed for user_id=%s", user_id)
        return JSONResponse({"error": str(e)}, status_code=500)
Depends(get_current_admin), +): + """Get audit log with optional filters.""" + svc = _get_admin_service() + entries = svc.get_audit_log(user_id=user_id, action=action, limit=limit, offset=offset) + return {"success": True, "entries": entries, "count": len(entries)} + + +@router.get("/ai-usage") +async def get_ai_usage(admin: Dict = Depends(get_current_admin)): + """Get AI usage statistics.""" + svc = _get_admin_service() + stats = svc.get_ai_usage_stats() + by_user = svc.get_ai_usage_by_user() + return {"success": True, "stats": stats, "by_user": by_user} diff --git a/app/routers/auth.py b/app/routers/auth.py new file mode 100644 index 0000000..d3748b4 --- /dev/null +++ b/app/routers/auth.py @@ -0,0 +1,251 @@ +"""Authentication router: login, logout, Microsoft SSO.""" + +import secrets +import logging +from typing import Dict +from fastapi import APIRouter, Request, Depends, Form +from fastapi.responses import HTMLResponse, RedirectResponse +from fastapi.templating import Jinja2Templates + +from ..config import get_settings, Settings +from ..dependencies import get_auth_service, get_current_user_optional +from ..security import limiter +from ..services.auth_service import AuthService + +logger = logging.getLogger(__name__) + +router = APIRouter(tags=["auth"]) + +# Templates are set from main.py after mounting +_templates: Jinja2Templates = None + + +def set_templates(templates: Jinja2Templates): + global _templates + _templates = templates + + +@router.get("/login", response_class=HTMLResponse) +async def login_page( + request: Request, + error: str = None, + info: str = None, + settings: Settings = Depends(get_settings), + auth: AuthService = Depends(get_auth_service), +): + """Render login page.""" + # If already logged in, redirect to index + user = await get_current_user_optional(request) + if user: + root = request.scope.get("root_path", "") + return RedirectResponse(url=f"{root}/", status_code=302) + + return _templates.TemplateResponse( + "login.html", + { + 
def _login_error_response(request: Request, auth: AuthService, settings: Settings, error, sso_enabled=None):
    """Render login.html with an error message and the standard template context."""
    return _templates.TemplateResponse(
        "login.html",
        {
            "request": request,
            "error": error,
            "sso_enabled": auth.sso_enabled if sso_enabled is None else sso_enabled,
            "enable_test_user": settings.ENABLE_TEST_USER,
            "app_version": settings.APP_VERSION,
        },
    )


def _safe_next_url(next_url: str, root: str) -> str:
    """Turn a user-supplied ``next`` value into a same-site redirect target.

    Only absolute local paths are accepted. Absolute URLs, scheme-relative
    URLs ("//host") and values containing a backslash (which browsers may
    treat as "/") fall back to "/". The result is prefixed with *root*
    unless it already lives under it.
    """
    is_local_path = (
        next_url.startswith("/")
        and not next_url.startswith("//")
        and "\\" not in next_url
    )
    if not is_local_path:
        next_url = "/"
    # Compare on a path-segment boundary so "/application" is not mistaken
    # for something already under root "/app".
    if root and next_url != root and not next_url.startswith(f"{root}/"):
        next_url = f"{root}{next_url}"
    return next_url


@router.post("/login")
@limiter.limit("5/minute")
async def login_submit(
    request: Request,
    username: str = Form(...),
    password: str = Form(...),
    settings: Settings = Depends(get_settings),
    auth: AuthService = Depends(get_auth_service),
):
    """Process login form. Rate limited to 5 attempts per minute.

    On success the user id, username and server-side session id are stored in
    the cookie session and the client is redirected to the validated ``next``
    query parameter (default "/"). On failure the login page is re-rendered
    with an error message.
    """
    username = username.strip()
    if not username or not password:
        return _login_error_response(request, auth, settings, "Please enter both username and password")

    result = auth.authenticate_user(username, password)
    if not result["success"]:
        return _login_error_response(request, auth, settings, result.get("error"))

    user = result["user"]
    session_id = auth.create_session(
        user=user,
        ip_address=request.client.host if request.client else None,
        user_agent=request.headers.get("user-agent"),
    )
    if not session_id:
        return _login_error_response(request, auth, settings, "Failed to create session")

    # Set session data
    request.session["user_id"] = user["id"]
    request.session["username"] = user["username"]
    request.session["session_id"] = session_id

    root = request.scope.get("root_path", "")
    # `next` comes from the client: never redirect to another site.
    next_url = _safe_next_url(request.query_params.get("next", "/"), root)
    return RedirectResponse(url=next_url, status_code=302)


@router.get("/logout")
async def logout(
    request: Request,
    auth: AuthService = Depends(get_auth_service),
):
    """Logout and destroy session."""
    user_id = request.session.get("user_id")
    session_id = request.session.get("session_id")

    if session_id:
        auth.destroy_session(session_id, user_id)

    request.session.clear()
    root = request.scope.get("root_path", "")
    return RedirectResponse(url=f"{root}/login", status_code=302)


@router.get("/login/microsoft")
async def login_microsoft(
    request: Request,
    settings: Settings = Depends(get_settings),
    auth: AuthService = Depends(get_auth_service),
):
    """Redirect to Microsoft SSO.

    A random ``state`` value is stored in the cookie session so the callback
    can verify it. Re-renders the login page with an error when SSO is not
    configured or no authorization URL could be produced.
    """
    if not auth.sso_enabled:
        return _login_error_response(
            request, auth, settings, "Microsoft SSO not configured", sso_enabled=False
        )

    state = secrets.token_urlsafe(16)
    request.session["oauth_state"] = state

    auth_url = auth.sso.get_auth_url(state=state)
    if auth_url:
        return RedirectResponse(url=auth_url, status_code=302)

    return _login_error_response(request, auth, settings, "Failed to generate SSO URL")
"error": "Invalid state parameter", + "sso_enabled": auth.sso_enabled, + "enable_test_user": settings.ENABLE_TEST_USER, + "app_version": settings.APP_VERSION, + }, + ) + + if not code: + error_msg = error_description or "No authorization code" + return _templates.TemplateResponse( + "login.html", + { + "request": request, + "error": f"SSO failed: {error_msg}", + "sso_enabled": auth.sso_enabled, + "enable_test_user": settings.ENABLE_TEST_USER, + "app_version": settings.APP_VERSION, + }, + ) + + # Exchange code for token + result = auth.sso.acquire_token(code) + + if result and "access_token" in result: + user_info = auth.sso.get_user_info(result["access_token"]) + + if user_info: + db = get_database() + user = auth.sso.create_or_update_user(user_info, db) + + if user: + session_id = auth.create_session( + user=user, + ip_address=request.client.host if request.client else None, + user_agent=request.headers.get("user-agent"), + ) + + if session_id: + request.session["user_id"] = user["id"] + request.session["username"] = user["username"] + request.session["session_id"] = session_id + root = request.scope.get("root_path", "") + return RedirectResponse(url=f"{root}/", status_code=302) + + return _templates.TemplateResponse( + "login.html", + { + "request": request, + "error": "SSO authentication failed", + "sso_enabled": auth.sso_enabled, + "enable_test_user": settings.ENABLE_TEST_USER, + "app_version": settings.APP_VERSION, + }, + ) diff --git a/app/routers/downloads.py b/app/routers/downloads.py new file mode 100644 index 0000000..3fad3f4 --- /dev/null +++ b/app/routers/downloads.py @@ -0,0 +1,116 @@ +"""Download router: single file, ZIP batch, session cleanup.""" + +import os +import io +import zipfile +import logging +from pathlib import Path +from typing import Dict +from datetime import datetime + +from fastapi import APIRouter, Request, Depends, BackgroundTasks +from fastapi.responses import FileResponse, StreamingResponse, JSONResponse + +from ..dependencies 
@router.post("/download-selected")
async def download_selected_files(
    request: Request,
    user: Dict = Depends(get_current_user),
    store: SessionStore = Depends(get_session_store),
):
    """Download selected files from a file session as one ZIP archive.

    The JSON body must contain ``session_id`` and ``file_indices`` (a list of
    integer positions into the session's ``files`` list). Out-of-range
    indices and entries whose file no longer exists on disk are skipped.

    Returns:
        A streamed ``application/zip`` attachment, or a JSON error with
        status 400 (bad request body), 404 (unknown session / nothing to
        download) or 500 (unexpected failure).
    """
    try:
        data = await request.json()
        if not isinstance(data, dict):
            return JSONResponse({"error": "Invalid request body"}, status_code=400)

        session_id = data.get("session_id")
        file_indices = data.get("file_indices", [])

        session_data = store.get_file_session(session_id)
        if not session_data:
            return JSONResponse({"error": "Session not found"}, status_code=404)

        if not file_indices:
            return JSONResponse({"error": "No files selected"}, status_code=400)

        # Reject malformed selections up front; comparing a str/None index
        # against an int would otherwise raise TypeError and surface as 500.
        indices_valid = isinstance(file_indices, list) and all(
            isinstance(i, int) and not isinstance(i, bool) for i in file_indices
        )
        if not indices_valid:
            return JSONResponse(
                {"error": "file_indices must be a list of integers"}, status_code=400
            )

        files = session_data.get("files", [])
        if not files:
            return JSONResponse({"error": "No files in session"}, status_code=404)

        # Create in-memory ZIP
        zip_buffer = io.BytesIO()
        added = 0
        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
            # dict.fromkeys() drops repeated indices while keeping request
            # order, so the same file is not written to the archive twice.
            for index in dict.fromkeys(file_indices):
                if not 0 <= index < len(files):
                    continue
                file_info = files[index]
                filepath = file_info.get("filepath", "")
                if not filepath or not os.path.isfile(filepath):
                    continue
                # Fall back to the on-disk name so an entry is never stored
                # under an empty archive name.
                arcname = file_info.get("filename", "") or os.path.basename(filepath)
                zf.write(filepath, arcname)
                added += 1

        if not added:
            return JSONResponse(
                {"error": "None of the selected files are available"}, status_code=404
            )

        zip_buffer.seek(0)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_filename = f"oliver_metadata_files_{timestamp}.zip"

        return StreamingResponse(
            zip_buffer,
            media_type="application/zip",
            headers={"Content-Disposition": f'attachment; filename="{zip_filename}"'},
        )

    except Exception as e:
        logger.error(f"Download error: {e}", exc_info=True)
        return JSONResponse({"error": f"Error creating ZIP archive: {e}"}, status_code=500)
@router.post("/import-metadata")
async def import_metadata(
    request: Request,
    import_file: UploadFile = File(...),
    user: Dict = Depends(get_current_user),
    store: SessionStore = Depends(get_session_store),
):
    """Upload an import file and preview its structure for column mapping.

    The upload is saved through the file service, its first rows are parsed
    according to the file extension (``.csv``, ``.xlsx``/``.xls``, ``.json``)
    and an import session referencing the saved file is created.

    Returns:
        On success a dict with ``import_session_id``, ``filename``,
        ``columns`` and ``sample_data``. A JSON error with status 400 for an
        unsupported or invalid file, 500 for any other failure.
    """
    file_svc = None
    filepath = None
    # Set once an import session references the saved file; until then the
    # file is an orphan and is removed in ``finally``.
    keep_file = False
    try:
        import json

        import pandas as pd

        file_svc = _get_file_service()
        filepath = await file_svc.save_upload(import_file, user["id"])
        file_ext = Path(filepath).suffix.lower()

        if file_ext == ".csv":
            df = pd.read_csv(filepath, nrows=5, encoding="utf-8")
        elif file_ext in [".xlsx", ".xls"]:
            df = pd.read_excel(filepath, nrows=5)
        elif file_ext == ".json":
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, list):
                df = pd.DataFrame(data[:5])
            elif isinstance(data, dict):
                df = pd.DataFrame([data])
            else:
                return JSONResponse({"error": "Invalid JSON format"}, status_code=400)
        else:
            return JSONResponse({"error": f"Unsupported file format: {file_ext}"}, status_code=400)

        columns = df.columns.tolist()
        sample_data = df.fillna("").to_dict("records")

        import_session_id = store.create_import_session(
            user_id=user["id"],
            session_type="import",
            file_info={"path": filepath, "filename": Path(filepath).name, "file_type": file_ext},
        )
        keep_file = True

        return {
            "success": True,
            "import_session_id": import_session_id,
            "filename": Path(filepath).name,
            "columns": columns,
            "sample_data": sample_data,
            "message": "Import file uploaded. Please configure column mapping.",
        }

    except Exception as e:
        logger.error(f"Import upload failed: {e}")
        return JSONResponse({"error": f"Import upload failed: {e}"}, status_code=500)
    finally:
        # Rejected or unparseable uploads are not referenced by any session,
        # so leaving them on disk would leak storage.
        if file_svc is not None and filepath and not keep_file:
            try:
                file_svc.delete_file(filepath)
            except Exception as cleanup_error:
                logger.warning(f"Could not remove rejected import file: {cleanup_error}")
@router.post("/preview-import")
async def preview_import(
    request: Request,
    import_file: UploadFile = File(...),
    user: Dict = Depends(get_current_user),
):
    """Preview an import file's structure and suggest field mappings.

    The upload is saved temporarily, inspected with
    ``MetadataImporter.preview_file_structure`` and then removed again,
    whether or not the inspection succeeded.

    Returns:
        On success a dict with ``columns``, up to five ``sample_rows``, the
        per-column ``suggestions`` (confidences rounded to two decimals) and
        the stored ``filename``; otherwise a JSON error with status 500.
    """
    file_svc = None
    filepath = None
    try:
        file_svc = _get_file_service()
        filepath = await file_svc.save_upload(import_file, user["id"])

        from src.metadata_importer import MetadataImporter

        importer = MetadataImporter()
        columns, sample_rows, suggestions = importer.preview_file_structure(filepath)

        formatted_suggestions = {
            source_field: {
                "best_match": suggestion_data["best_match"],
                "confidence": round(suggestion_data["confidence"], 2),
                "alternatives": [
                    {"field": alt["field"], "confidence": round(alt["confidence"], 2)}
                    for alt in suggestion_data.get("alternatives", [])
                ],
            }
            for source_field, suggestion_data in suggestions.items()
        }

        return {
            "success": True,
            "columns": columns,
            "sample_rows": sample_rows[:5],
            "suggestions": formatted_suggestions,
            "filename": Path(filepath).name,
        }

    except Exception as e:
        logger.error(f"Preview failed: {e}")
        return JSONResponse({"error": f"Preview failed: {e}"}, status_code=500)
    finally:
        # Clean up the temp file on every path; previously a parse failure
        # skipped the deletion and left the upload behind.
        if file_svc is not None and filepath:
            try:
                file_svc.delete_file(filepath)
            except Exception as cleanup_error:
                logger.warning(f"Could not remove preview temp file: {cleanup_error}")
@router.post("/update-manual")
async def update_manual_metadata(
    request: Request,
    user: Dict = Depends(get_current_user),
    store: SessionStore = Depends(get_session_store),
):
    """Update a session file with manually entered metadata.

    The JSON body carries ``session_id``, ``file_index``, the standard text
    fields (``title``, ``subject``, ``keywords``, ``author``, ``copyright``,
    ``comments``) and an optional ``custom_fields`` dict. Every value is
    stripped and truncated to a fixed maximum length before being written.

    Returns:
        On success a dict with ``status``/``success``, ``verified`` and the
        ``metadata`` that was written. A JSON error with status 400 for an
        invalid body, session, index or file type, 404 when the file is
        missing on disk, and 500 when the update fails.
    """
    data = await request.json()
    if not isinstance(data, dict):
        return JSONResponse({"error": "Invalid request body"}, status_code=400)

    def clean(value, limit: int) -> str:
        """Return *value* as stripped text capped at *limit* characters."""
        # JSON null must become an empty field, not the literal text "None".
        if value is None:
            return ""
        return str(value).strip()[:limit]

    session_id = data.get("session_id")
    file_index = data.get("file_index")

    custom_metadata = {
        "title": clean(data.get("title"), 200),
        "subject": clean(data.get("subject"), 300),
        "keywords": clean(data.get("keywords"), 500),
        "author": clean(data.get("author"), 100),
        "copyright": clean(data.get("copyright"), 150),
        "comments": clean(data.get("comments"), 500),
    }

    # Handle custom fields
    custom_fields = data.get("custom_fields", {})
    if custom_fields and isinstance(custom_fields, dict):
        for field_name, field_value in custom_fields.items():
            safe_name = clean(field_name, 50)
            safe_value = clean(field_value, 200)
            if safe_name and safe_value:
                custom_metadata[safe_name] = safe_value

    if not session_id:
        return JSONResponse({"error": "Invalid or expired session"}, status_code=400)

    session_data = store.get_file_session(session_id)
    if not session_data:
        return JSONResponse({"error": "Invalid or expired session"}, status_code=400)

    files = session_data.get("files", [])
    # Require a real integer: a string index would raise TypeError in the
    # range comparison, and bool is excluded because it is an int subclass.
    index_ok = (
        isinstance(file_index, int)
        and not isinstance(file_index, bool)
        and 0 <= file_index < len(files)
    )
    if not index_ok:
        return JSONResponse({"error": "Invalid file index"}, status_code=400)

    try:
        file_info = files[file_index]
        filepath = file_info.get("filepath")

        if not filepath or not os.path.exists(filepath):
            return JSONResponse({"error": "File not found"}, status_code=404)

        from src.file_detector import FileDetector, FileType

        file_type = FileDetector.detect_file_type(filepath)
        if file_type == FileType.UNSUPPORTED:
            return JSONResponse({"error": "Unsupported file type"}, status_code=400)

        success = metadata_service.update_file_metadata(
            filepath, file_type, custom_metadata, backup=True
        )
        if not success:
            return JSONResponse({"error": "Failed to update metadata"}, status_code=500)

        # Update session with new metadata
        store.update_file_in_session(
            session_id, file_index, {"suggested_metadata": custom_metadata}
        )

        verified = metadata_service.verify_file_metadata(filepath, file_type, custom_metadata)

        return {
            "status": "success",
            # Mirrors the ``success`` flag returned by the other routes;
            # ``status`` is kept for existing clients.
            "success": True,
            "message": "Metadata updated successfully",
            "verified": verified,
            "metadata": custom_metadata,
        }

    except Exception as e:
        logger.error(f"Manual update error: {e}")
        return JSONResponse({"error": f"Error updating metadata: {e}"}, status_code=500)
@router.get("/events/ai-progress/{session_id}")
async def ai_progress_stream(
    session_id: str,
    request: Request,
    user: Dict = Depends(get_current_user),
):
    """Stream AI processing progress for *session_id* as Server-Sent Events.

    Event types forwarded from the progress queue:
    - processing: {file_index, filename, current, total}
    - file_complete: {file_index, filename, metadata}
    - error: {file_index, filename, error}
    - done: {total_processed, total_errors}

    The stream ends after a ``done`` event or once the client disconnects;
    the session's progress queue is removed in either case.
    """
    import json

    async def event_generator():
        progress_queue = get_progress_queue(session_id)
        try:
            # Re-check the connection before each wait so a closed client
            # does not keep the generator alive.
            while not await request.is_disconnected():
                try:
                    payload = await asyncio.wait_for(progress_queue.get(), timeout=30.0)
                except asyncio.TimeoutError:
                    # Nothing arrived within 30 seconds: send an SSE comment
                    # line as a keepalive and wait again.
                    yield ": keepalive\n\n"
                    continue

                kind = payload.get("type", "message")
                yield f"event: {kind}\ndata: {json.dumps(payload)}\n\n"

                # 'done' is the final event of a run.
                if kind == "done":
                    break
        finally:
            remove_progress_queue(session_id)

    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers=sse_headers,
    )
@router.post("/save")
async def save_template(
    request: Request,
    user: Dict = Depends(get_current_user),
):
    """Save a new template built from the JSON request body.

    The body must contain a non-empty string ``name``; ``title``,
    ``subject``, ``keywords`` and ``description`` are optional and default
    to empty strings.

    Returns:
        On success a dict with ``success``, ``message`` and the created
        ``template``. A JSON error with status 400 when the body or name is
        invalid, and 500 when saving fails.
    """
    try:
        data = await request.json()
        if not isinstance(data, dict):
            return JSONResponse({"error": "Invalid request body"}, status_code=400)

        # A null or non-string name is a client error, not a server fault:
        # calling .strip() on it would raise and be reported as 500.
        raw_name = data.get("name", "")
        name = raw_name.strip() if isinstance(raw_name, str) else ""
        if not name:
            return JSONResponse({"error": "Template name is required"}, status_code=400)

        tm = _get_template_manager()
        template = tm.create_template(
            name=name,
            title_template=data.get("title", ""),
            subject_template=data.get("subject", ""),
            keywords_template=data.get("keywords", ""),
            description=data.get("description", ""),
        )

        if tm.save_template(template):
            return {
                "success": True,
                "message": f'Template "{name}" saved successfully',
                "template": template,
            }
        return JSONResponse({"error": "Failed to save template"}, status_code=500)
    except Exception as e:
        # Record the failure server-side; previously it was only echoed to
        # the client and left no trace in the logs.
        logger.error(f"Template save failed: {e}", exc_info=True)
        return JSONResponse({"error": str(e)}, status_code=500)
@router.post("/apply")
async def apply_template(
    request: Request,
    user: Dict = Depends(get_current_user),
    store: SessionStore = Depends(get_session_store),
):
    """Apply a saved template to selected files of a file session.

    The JSON body carries ``template_name``, ``session_id``,
    ``file_indices`` (integer positions into the session's ``files`` list)
    and optional ``custom_vars``. For each valid index the generated
    metadata is stored as the file's ``suggested_metadata``. Indices that
    are not integers or fall outside the list are skipped.

    Returns:
        On success a dict with ``success``, ``message`` and per-file
        ``results``. A JSON error with status 400 for an invalid body,
        template name or session, 404 for an unknown template, and 500 for
        unexpected failures.
    """
    try:
        data = await request.json()
        if not isinstance(data, dict):
            return JSONResponse({"error": "Invalid request body"}, status_code=400)

        raw_name = data.get("template_name", "")
        template_name = raw_name.strip() if isinstance(raw_name, str) else ""
        file_indices = data.get("file_indices", [])
        session_id = data.get("session_id")
        custom_vars = data.get("custom_vars", {})

        if not template_name:
            return JSONResponse({"error": "Template name is required"}, status_code=400)

        if not isinstance(file_indices, list):
            return JSONResponse({"error": "file_indices must be a list"}, status_code=400)

        session_data = store.get_file_session(session_id)
        if not session_data:
            return JSONResponse({"error": "Invalid or expired session"}, status_code=400)

        tm = _get_template_manager()
        template = tm.load_template(template_name)
        if not template:
            return JSONResponse({"error": f'Template "{template_name}" not found'}, status_code=404)

        files = session_data.get("files", [])
        results = []

        for file_index in file_indices:
            # Only true integers are usable as positions (bool is excluded
            # because it is an int subclass).
            if isinstance(file_index, bool) or not isinstance(file_index, int):
                continue
            # Check both bounds: a negative index would otherwise wrap around
            # and silently target a file counted from the end of the list.
            if not 0 <= file_index < len(files):
                continue

            file_info = files[file_index]
            filename = file_info.get("filename", "unknown")

            metadata = tm.apply_template(
                template=template,
                filename=filename,
                user="web_user",
                custom_vars=custom_vars,
            )

            # Update session
            store.update_file_in_session(session_id, file_index, {"suggested_metadata": metadata})

            results.append({
                "file_index": file_index,
                "filename": filename,
                "metadata": metadata,
            })

        return {
            "success": True,
            "message": f"Template applied to {len(results)} file(s)",
            "results": results,
        }
    except Exception as e:
        logger.error(f"Template apply failed: {e}", exc_info=True)
        return JSONResponse({"error": str(e)}, status_code=500)
# Strong references to in-flight background AI tasks. The event loop keeps
# only weak references to tasks, so a task nobody else references may be
# garbage-collected before it finishes.
_background_tasks = set()


@router.post("/upload")
@limiter.limit("10/minute")
async def upload_files(
    request: Request,
    files: List[UploadFile] = File(...),
    metadata_source: str = Form("manual"),
    import_session_id: str = Form(""),
    excel_session_id: str = Form(""),
    user: Dict = Depends(get_current_user),
    store: SessionStore = Depends(get_session_store),
):
    """Handle multiple file uploads with metadata source selection.

    Args:
        request: Incoming request (required by the rate limiter decorator).
        files: Uploaded files.
        metadata_source: ``"excel"`` and ``"import"`` require a previously
            configured mapping session; ``"ai"`` defers metadata generation
            to a background task; any other value is passed through to
            ``metadata_service.process_uploaded_file``.
        import_session_id: Import session holding a ``metadata_map``
            (used when ``metadata_source == "import"``).
        excel_session_id: Import session holding a ``metadata_map``
            (used when ``metadata_source == "excel"``).
        user: Authenticated user record.
        store: Session store.

    Returns:
        A dict with ``success``, the new ``session_id``, one entry per
        upload in ``files`` (server paths removed; failed uploads carry an
        ``error`` key) and ``ai_processing``. A JSON error with status 400
        when no files were sent or the required mapping session is missing.
    """
    if not files or (len(files) == 1 and not files[0].filename):
        return JSONResponse({"error": "No files provided"}, status_code=400)

    file_svc = _get_file_service()
    user_id = user["id"]

    # Resolve lookup / import_map based on source
    lookup = None
    import_map = None

    if metadata_source == "excel":
        if excel_session_id:
            session_data = store.get_import_session(excel_session_id)
            if session_data and "metadata_map" in session_data:
                # Wrap metadata_map as a lookup-like object
                lookup = _ExcelLookupAdapter(session_data["metadata_map"])
        if not lookup:
            return JSONResponse(
                {"error": "Please upload an Excel file first using the Upload Excel File button"},
                status_code=400,
            )

    elif metadata_source == "import":
        if import_session_id:
            session_data = store.get_import_session(import_session_id)
            if session_data and "metadata_map" in session_data:
                import_map = session_data["metadata_map"]
        if not import_map:
            return JSONResponse(
                {"error": "Please import a metadata file first using the Import button"},
                status_code=400,
            )

    # Create file session
    session_id = store.create_file_session(
        user_id=user_id,
        metadata_source=metadata_source,
        import_session_id=import_session_id,
    )

    results = []
    ai_pending = []  # Files needing background AI processing

    for upload_file in files:
        try:
            filepath = await file_svc.save_upload(upload_file, user_id)
            filename = Path(filepath).name

            if metadata_source == "ai":
                # For AI source: save files first, process AI in background
                file_type = metadata_service.detect_file(filepath)
                old_metadata = metadata_service.extract_metadata(filepath, file_type)
                file_result = {
                    "success": True,
                    "filename": filename,
                    "file_type": file_type.value,
                    "current_metadata": old_metadata,
                    "suggested_metadata": {"title": "", "subject": "AI processing...", "keywords": ""},
                    "filepath": filepath,
                    "metadata_source": "ai",
                    "ai_status": "pending",
                }
                store.add_file_to_session(session_id, file_result)
                ai_pending.append({
                    # NOTE(review): this is the position in ``results``, which
                    # also counts failed uploads; failed uploads are never
                    # added to the session, so confirm process_bulk_ai does
                    # not treat this value as a session file index.
                    "file_index": len(results),
                    "filepath": filepath,
                    "filename": filename,
                    "file_type": file_type,
                })
                results.append(file_result)
            else:
                file_result = await metadata_service.process_uploaded_file(
                    filepath=filepath,
                    filename=filename,
                    metadata_source=metadata_source,
                    lookup=lookup,
                    import_map=import_map,
                )
                store.add_file_to_session(session_id, file_result)
                results.append(file_result)

        except ValueError as e:
            # Reported to the client as a per-file error without logging.
            results.append({"filename": upload_file.filename, "error": str(e)})
        except Exception as e:
            logger.error(f"Upload error for {upload_file.filename}: {e}")
            results.append({"filename": upload_file.filename, "error": str(e)})

    # Start background AI processing if needed
    if ai_pending:
        import asyncio
        from ..services.ai_service import process_bulk_ai

        task = asyncio.create_task(process_bulk_ai(session_id, ai_pending, store, user_id))
        # Hold a reference until the task completes, then release it.
        _background_tasks.add(task)
        task.add_done_callback(_background_tasks.discard)

    # Strip server paths from client response
    safe_results = [{k: v for k, v in r.items() if k != "filepath"} for r in results]

    return {"success": True, "session_id": session_id, "files": safe_results, "ai_processing": bool(ai_pending)}
@router.post("/preview-excel-sheet")
async def preview_excel_sheet(
    request: Request,
    user: Dict = Depends(get_current_user),
    store: SessionStore = Depends(get_session_store),
):
    """Preview a specific sheet of a previously uploaded Excel file.

    The JSON body carries ``excel_session_id`` and ``sheet_name``.

    Returns:
        On success a dict with the sheet's ``columns`` and up to five rows
        of ``sample_data``. A JSON error with status 400 for an invalid
        body, session or sheet name, and 500 when reading the file fails.
    """
    try:
        import pandas as pd

        data = await request.json()
        if not isinstance(data, dict):
            return JSONResponse({"error": "Invalid request body"}, status_code=400)

        excel_session_id = data.get("excel_session_id")
        sheet_name = data.get("sheet_name")

        session_data = store.get_import_session(excel_session_id)
        if not session_data:
            return JSONResponse({"error": "Invalid session ID"}, status_code=400)

        # A sheet name is mandatory: with sheet_name=None pandas returns a
        # dict of all sheets instead of a DataFrame, which has no .columns.
        if not isinstance(sheet_name, str) or not sheet_name:
            return JSONResponse({"error": "Sheet name is required"}, status_code=400)

        file_info = session_data["file_info"]
        # The Excel upload route records the workbook's sheet names in the
        # session; when present, use them to reject unknown sheets.
        known_sheets = file_info.get("sheet_names")
        if known_sheets and sheet_name not in known_sheets:
            return JSONResponse({"error": f'Sheet "{sheet_name}" not found'}, status_code=400)

        excel_path = file_info.get("path", "")
        df = pd.read_excel(excel_path, sheet_name=sheet_name, nrows=10)

        return {
            "success": True,
            "columns": df.columns.tolist(),
            "sample_data": df.head(5).fillna("").to_dict("records"),
        }

    except Exception as e:
        logger.error(f"Sheet preview failed: {e}")
        return JSONResponse({"error": f"Sheet preview failed: {e}"}, status_code=500)
data.get("excel_session_id") + sheet_name = data.get("sheet_name") + column_mapping = data.get("column_mapping", {}) + + session_data = store.get_import_session(excel_session_id) + if not session_data: + return JSONResponse({"error": "Invalid session ID"}, status_code=400) + + excel_path = session_data["file_info"].get("path", "") + df = pd.read_excel(excel_path, sheet_name=sheet_name) + + filename_col = column_mapping.get("filename") + title_col = column_mapping.get("title") + description_col = column_mapping.get("description") + keywords_col = column_mapping.get("keywords") + + if not filename_col: + return JSONResponse({"error": "Filename column is required"}, status_code=400) + + metadata_map = {} + for _, row in df.iterrows(): + fname = row.get(filename_col) + if pd.notna(fname) and str(fname).strip(): + stem = Path(str(fname).strip()).stem.lower() + metadata_map[stem] = { + "title": str(row.get(title_col, "")).strip() if title_col and pd.notna(row.get(title_col)) else "", + "description": str(row.get(description_col, "")).strip() if description_col and pd.notna(row.get(description_col)) else "", + "keywords": str(row.get(keywords_col, "")).strip() if keywords_col and pd.notna(row.get(keywords_col)) else "", + "original_filename": str(fname).strip(), + } + + # Store the built metadata_map in the session + store.update_import_session(excel_session_id, metadata_map=metadata_map) + + stats = { + "total_records": len(metadata_map), + "with_title": sum(1 for v in metadata_map.values() if v.get("title")), + "with_description": sum(1 for v in metadata_map.values() if v.get("description")), + "with_keywords": sum(1 for v in metadata_map.values() if v.get("keywords")), + } + + return { + "success": True, + "excel_session_id": excel_session_id, + "stats": stats, + "message": f"Configured mapping for {stats['total_records']} records from sheet \"{sheet_name}\"", + } + + except Exception as e: + logger.error(f"Excel configuration failed: {e}") + return 
JSONResponse({"error": f"Excel configuration failed: {e}"}, status_code=500) + + +class _ExcelLookupAdapter: + """Wraps a metadata_map dict to behave like ExcelMetadataLookup.""" + + def __init__(self, metadata_map: dict): + self.metadata_map = metadata_map + + def lookup_by_filename(self, filename: str): + stem = Path(filename).stem.lower() + return self.metadata_map.get(stem) diff --git a/app/security.py b/app/security.py new file mode 100644 index 0000000..86493a1 --- /dev/null +++ b/app/security.py @@ -0,0 +1,7 @@ +"""Security utilities: rate limiter, audit helper.""" + +from slowapi import Limiter +from slowapi.util import get_remote_address + +# Shared rate limiter instance +limiter = Limiter(key_func=get_remote_address) diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/services/admin_service.py b/app/services/admin_service.py new file mode 100644 index 0000000..f913519 --- /dev/null +++ b/app/services/admin_service.py @@ -0,0 +1,108 @@ +"""Admin service: user management, audit log, AI usage stats.""" + +import logging +from typing import Dict, List, Optional +from datetime import datetime + +logger = logging.getLogger(__name__) + + +class AdminService: + """Business logic for admin operations.""" + + def __init__(self, database): + self.db = database + + # --- User Management --- + + def list_users(self, include_inactive: bool = False) -> List[Dict]: + """Get all users with sanitized output (no password hashes).""" + users = self.db.get_all_users(include_inactive=include_inactive) + for user in users: + user.pop("password_hash", None) + return users + + def get_user(self, user_id: int) -> Optional[Dict]: + """Get single user by ID.""" + user = self.db.get_user_by_id(user_id) + if user: + user.pop("password_hash", None) + return user + + def create_user( + self, + username: str, + email: str = "", + full_name: str = "", + role: str = "user", + password: str = None, + auth_method: str 
= "local", + ) -> Optional[int]: + """Create a new user.""" + password_hash = None + if password: + from werkzeug.security import generate_password_hash + password_hash = generate_password_hash(password) + + return self.db.create_user( + username=username, + password_hash=password_hash, + email=email, + full_name=full_name, + auth_method=auth_method, + role=role, + ) + + def update_user(self, user_id: int, updates: Dict) -> bool: + """Update user fields (role, is_active, full_name, email).""" + allowed_fields = {"role", "is_active", "full_name", "email"} + filtered = {k: v for k, v in updates.items() if k in allowed_fields} + if not filtered: + return False + return self.db.update_user(user_id, filtered) + + def deactivate_user(self, user_id: int) -> bool: + """Deactivate a user account.""" + return self.db.update_user(user_id, {"is_active": 0}) + + def activate_user(self, user_id: int) -> bool: + """Reactivate a user account.""" + return self.db.update_user(user_id, {"is_active": 1}) + + # --- Audit Log --- + + def get_audit_log( + self, + user_id: Optional[int] = None, + action: Optional[str] = None, + limit: int = 100, + offset: int = 0, + ) -> List[Dict]: + """Get audit log with optional filters.""" + return self.db.get_audit_log( + user_id=user_id, + action=action, + limit=limit, + offset=offset, + ) + + # --- AI Usage Stats --- + + def get_ai_usage_stats(self) -> Dict: + """Get aggregate AI usage statistics.""" + return self.db.get_ai_usage_stats() + + def get_ai_usage_by_user(self, limit: int = 50) -> List[Dict]: + """Get AI usage broken down by user.""" + return self.db.get_ai_usage_by_user(limit=limit) + + # --- Dashboard Stats --- + + def get_dashboard_stats(self) -> Dict: + """Get combined statistics for admin dashboard.""" + db_stats = self.db.get_stats() + ai_stats = self.db.get_ai_usage_stats() + return { + **db_stats, + "ai_usage": ai_stats, + } diff --git a/app/services/ai_service.py b/app/services/ai_service.py new file mode 100644 index 
"""Async wrapper around MetadataAnalyzer for non-blocking AI generation."""

import asyncio
import logging
from pathlib import Path
from typing import Dict, Optional

logger = logging.getLogger(__name__)

# Lazy-initialized singleton
_analyzer = None

# Progress queues per session (for SSE streaming)
_progress_queues: Dict[str, asyncio.Queue] = {}


def _get_analyzer():
    """Return the shared MetadataAnalyzer, creating it on first use.

    Returns None when no OpenAI API key is configured or when construction
    fails; construction is retried on the next call in that case.
    """
    global _analyzer
    if _analyzer is None:
        from app.config import get_settings
        settings = get_settings()
        if settings.OPENAI_API_KEY:
            try:
                from src.metadata_analyzer import MetadataAnalyzer
                _analyzer = MetadataAnalyzer()
                logger.info("MetadataAnalyzer initialized")
            except Exception as e:
                logger.error(f"Failed to initialize MetadataAnalyzer: {e}")
    return _analyzer


def get_progress_queue(session_id: str) -> asyncio.Queue:
    """Get or create a progress queue for a session."""
    if session_id not in _progress_queues:
        _progress_queues[session_id] = asyncio.Queue()
    return _progress_queues[session_id]


def remove_progress_queue(session_id: str):
    """Remove a progress queue when SSE connection closes."""
    _progress_queues.pop(session_id, None)


async def generate_metadata_async(
    content: str,
    filename: str,
    file_type,
) -> Dict[str, str]:
    """Run AI metadata generation in a thread pool (non-blocking).

    Never raises for analysis failures: every failure mode is reported through
    the ``_ai_error`` key of the returned dict.

    Args:
        content: Extracted text content from the file.
        filename: Original filename.
        file_type: FileType enum value.

    Returns:
        Dict with 'title', 'subject', 'keywords' and internal fields.
    """
    analyzer = _get_analyzer()
    if not analyzer:
        return {
            "title": "",
            "subject": "AI generation not available (OpenAI API key not configured)",
            "keywords": "",
            "_ai_error": "OpenAI API key not configured",
        }

    if not content or len(content.strip()) < 10:
        return {
            "title": Path(filename).stem,
            "subject": "Insufficient content for AI analysis",
            "keywords": "",
            "_ai_error": "Not enough text content extracted",
        }

    # get_running_loop() is the supported call inside a coroutine;
    # get_event_loop() is deprecated there.
    loop = asyncio.get_running_loop()
    try:
        result = await loop.run_in_executor(
            None, analyzer.analyze_content, content, filename, file_type
        )
        if "_tokens_used" in result:
            logger.info(f"AI tokens used for {filename}: {result['_tokens_used']}")
        return result
    except Exception as e:
        logger.error(f"AI generation failed for {filename}: {e}")
        return {
            "title": Path(filename).stem,
            "subject": f"AI generation error: {e}",
            "keywords": "",
            "_ai_error": str(e),
        }


def _log_ai_usage(user_id: int, filename: str, metadata: Dict) -> None:
    """Record token usage for one file; best-effort, never raises."""
    tokens_used = metadata.get("_tokens_used", 0)
    if not (tokens_used and user_id):
        return
    try:
        from app.dependencies import get_database
        db = get_database()
        db.log_ai_usage(
            user_id=user_id,
            filename=filename,
            tokens_total=tokens_used,
            model=metadata.get("_model", ""),
        )
    except Exception as e:
        # Usage accounting must not fail the file, but a silent pass would
        # hide a broken audit trail.
        logger.warning(f"Failed to log AI usage for {filename}: {e}")


async def process_bulk_ai(
    session_id: str,
    files_data: list,
    store,
    user_id: int,
):
    """Process multiple files with AI in background, sending progress via SSE.

    For every file a 'processing' event is queued, followed by either
    'file_complete' or 'error'.  A final 'done' event is always queued, even
    if processing aborts unexpectedly, so the SSE consumer can terminate.

    Args:
        session_id: File session ID.
        files_data: List of dicts with {file_index, filepath, filename, file_type}.
        store: SessionStore instance.
        user_id: User ID for AI usage logging.
    """
    from .metadata_service import extract_content

    queue = get_progress_queue(session_id)
    loop = asyncio.get_running_loop()
    total = len(files_data)
    processed = 0
    errors = 0

    try:
        for position, file_info in enumerate(files_data, start=1):
            file_index = file_info["file_index"]
            filename = file_info["filename"]
            filepath = file_info["filepath"]
            file_type = file_info["file_type"]

            # Send 'processing' event
            await queue.put({
                "type": "processing",
                "file_index": file_index,
                "filename": filename,
                "current": position,
                "total": total,
            })

            try:
                # Content extraction parses the file synchronously; run it in
                # the thread pool so other requests keep being served.
                content = await loop.run_in_executor(
                    None, extract_content, filepath, file_type
                )
                metadata = await generate_metadata_async(content, filename, file_type)

                # Update session with result
                store.update_file_in_session(session_id, file_index, {
                    "suggested_metadata": metadata,
                    "ai_status": "complete",
                })

                _log_ai_usage(user_id, filename, metadata)

                # Send 'file_complete' event
                await queue.put({
                    "type": "file_complete",
                    "file_index": file_index,
                    "filename": filename,
                    "metadata": {
                        "title": metadata.get("title", ""),
                        "subject": metadata.get("subject", ""),
                        "keywords": metadata.get("keywords", ""),
                    },
                })
                processed += 1

            except Exception as e:
                logger.error(f"Bulk AI error for {filename}: {e}")
                errors += 1
                try:
                    store.update_file_in_session(session_id, file_index, {
                        "ai_status": "error",
                        "ai_error": str(e),
                    })
                except Exception as store_error:
                    # The store itself may be what failed; keep going so the
                    # client still receives the error and 'done' events.
                    logger.error(
                        f"Could not record AI error for {filename}: {store_error}"
                    )
                await queue.put({
                    "type": "error",
                    "file_index": file_index,
                    "filename": filename,
                    "error": str(e),
                })
    finally:
        # Send 'done' event
        await queue.put({
            "type": "done",
            "total_processed": processed,
            "total_errors": errors,
        })
"""Framework-agnostic authentication service."""

import os
import secrets
import logging
from typing import Dict, Optional

logger = logging.getLogger(__name__)


class AuthService:
    """Authentication logic extracted from src/auth.py, without Flask dependencies."""

    def __init__(self, database):
        # `database` is the project's data-access object; only the methods
        # called below are relied upon.
        self.db = database
        self._sso = None

    def authenticate_user(self, username: str, password: str) -> Dict:
        """Authenticate user with username and password.

        Accounts without a password hash and accounts whose ``is_active``
        flag is falsy are rejected with the same generic error as a wrong
        password, so the response does not reveal which check failed.

        Returns dict with 'success' bool and either 'user' dict or 'error' message.
        """
        try:
            from werkzeug.security import check_password_hash

            user = self.db.get_user_by_username(username)
            if (
                user
                and user.get("password_hash")
                and check_password_hash(user["password_hash"], password)
            ):
                # Admins deactivate accounts by setting is_active to 0.
                # NOTE(review): the database layer may already filter these
                # out; a missing key is treated as active to stay compatible.
                if user.get("is_active", 1):
                    logger.info(f"User '{username}' authenticated successfully")
                    return {"success": True, "user": user}
                logger.warning(f"Login rejected for deactivated user '{username}'")

            logger.warning(f"Authentication failed for user '{username}'")
            return {"success": False, "error": "Invalid username or password"}

        except ImportError:
            logger.error("werkzeug not available - cannot verify passwords")
            return {"success": False, "error": "Authentication system not available"}
        except Exception as e:
            logger.error(f"Authentication error: {e}")
            return {"success": False, "error": "Authentication error occurred"}

    def create_session(
        self,
        user: Dict,
        ip_address: Optional[str] = None,
        user_agent: Optional[str] = None,
    ) -> Optional[str]:
        """Create a new auth session for an authenticated user.

        Returns the new random session ID, or None when the database refused
        to store the session.
        """
        session_id = secrets.token_urlsafe(32)
        user_id = user["id"]

        success = self.db.create_session(
            user_id=user_id,
            session_id=session_id,
            expires_in_hours=24,
            ip_address=ip_address,
            user_agent=user_agent,
        )

        if not success:
            logger.error(f"Failed to create session for user {user_id}")
            return None

        self.db.update_last_login(user_id)
        self.db.log_action(user_id, "login", f"IP: {ip_address}")
        logger.info(f"Created session for user {user['username']} (ID: {user_id})")
        return session_id

    def destroy_session(self, session_id: str, user_id: Optional[int] = None):
        """Destroy an auth session (logout)."""
        self.db.delete_session(session_id)
        if user_id:
            self.db.log_action(user_id, "logout", f"Session: {session_id}")
            logger.info(f"User {user_id} logged out")

    def validate_session(self, session_id: str) -> Optional[Dict]:
        """Validate a session and return session data if valid."""
        return self.db.get_session(session_id)

    def get_user_by_id(self, user_id: int) -> Optional[Dict]:
        """Get user by ID."""
        return self.db.get_user_by_id(user_id)

    def cleanup_expired_sessions(self):
        """Clean up expired auth sessions."""
        self.db.cleanup_expired_sessions()

    # --- Microsoft SSO ---

    @property
    def sso(self):
        """Lazy-initialize Microsoft SSO."""
        if self._sso is None:
            # MicrosoftSSO is defined further down in this module.
            self._sso = MicrosoftSSO()
        return self._sso

    @property
    def sso_enabled(self) -> bool:
        """Whether Microsoft SSO is configured and usable."""
        return self.sso.enabled
Exception as e: + self.enabled = False + logger.error(f"Failed to initialize Microsoft SSO: {e}") + + def get_auth_url(self, state: Optional[str] = None) -> Optional[str]: + if not self.enabled: + return None + try: + return self.app.get_authorization_request_url( + scopes=["User.Read"], + state=state, + redirect_uri=self.redirect_uri, + ) + except Exception as e: + logger.error(f"Error generating auth URL: {e}") + return None + + def acquire_token(self, auth_code: str) -> Optional[Dict]: + if not self.enabled: + return None + try: + return self.app.acquire_token_by_authorization_code( + auth_code, + scopes=["User.Read"], + redirect_uri=self.redirect_uri, + ) + except Exception as e: + logger.error(f"Error acquiring token: {e}") + return None + + def get_user_info(self, access_token: str) -> Optional[Dict]: + if not self.enabled: + return None + try: + import requests + + headers = {"Authorization": f"Bearer {access_token}"} + response = requests.get( + "https://graph.microsoft.com/v1.0/me", + headers=headers, + timeout=10, + ) + if response.status_code == 200: + return response.json() + logger.error(f"Graph API error: {response.status_code}") + return None + except Exception as e: + logger.error(f"Error fetching user info: {e}") + return None + + def create_or_update_user(self, user_info: Dict, database) -> Optional[Dict]: + """Create or update user from SSO login.""" + try: + email = user_info.get("mail") or user_info.get("userPrincipalName") + username = email.split("@")[0] if email else user_info.get("displayName", "unknown") + full_name = user_info.get("displayName") + + user = database.get_user_by_username(username) + if not user: + user_id = database.create_user( + username=username, + email=email, + full_name=full_name, + auth_method="sso", + ) + if user_id: + user = database.get_user_by_id(user_id) + logger.info(f"Created new SSO user: {username}") + else: + logger.error(f"Failed to create SSO user: {username}") + return None + else: + 
"""File handling: upload, naming, cleanup."""

import os
import shutil
import unicodedata
import logging
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

# Uploads are copied in pieces of this size so the size limit can be enforced
# while streaming instead of after the whole file has hit the disk.
_UPLOAD_CHUNK_BYTES = 1024 * 1024


def safe_filename(filename: str) -> str:
    """Sanitize filename while preserving Unicode characters (CJK, etc.).

    Path separators become underscores, NUL bytes are dropped and leading or
    trailing dots/spaces are stripped; an empty result becomes
    ``"unnamed_file"``.
    """
    filename = unicodedata.normalize("NFC", filename)
    filename = filename.replace("/", "_").replace("\\", "_").replace("\x00", "")
    filename = filename.strip(". ")
    if not filename:
        filename = "unnamed_file"
    return filename


def _remove_quietly(path: Path) -> None:
    """Delete *path* if it exists; used to discard partial uploads."""
    try:
        path.unlink()
    except FileNotFoundError:
        pass


class FileService:
    """Handles file uploads, per-user storage, and cleanup."""

    def __init__(self, upload_folder: str, max_size_mb: int = 500):
        self.upload_folder = Path(upload_folder)
        self.upload_folder.mkdir(parents=True, exist_ok=True)
        self.max_size_bytes = max_size_mb * 1024 * 1024

    async def save_upload(self, upload_file, user_id: int) -> str:
        """Save an uploaded file to disk using streaming.

        The file is stored under ``<upload_folder>/<user_id>/``.  When the
        name is already taken, ``_1``, ``_2``, ... is appended to the stem.

        Returns the path to the saved file.

        Raises:
            ValueError: If the upload exceeds the configured size limit; the
                partially written file is removed first.
        """
        filename = safe_filename(upload_file.filename or "unnamed")
        user_dir = self.upload_folder / str(user_id)
        user_dir.mkdir(parents=True, exist_ok=True)

        # Claim a free name atomically: exclusive-create fails if the file
        # appeared between a check and the open, so two concurrent uploads of
        # the same name can never overwrite each other.
        filepath = user_dir / filename
        stem, suffix = filepath.stem, filepath.suffix
        counter = 0
        while True:
            try:
                destination = open(filepath, "xb")
                break
            except FileExistsError:
                counter += 1
                filepath = user_dir / f"{stem}_{counter}{suffix}"

        # Stream to disk (handles large files without loading into memory),
        # stopping as soon as the limit is crossed.
        size = 0
        too_large = False
        try:
            with destination:
                for chunk in iter(lambda: upload_file.file.read(_UPLOAD_CHUNK_BYTES), b""):
                    size += len(chunk)
                    if size > self.max_size_bytes:
                        too_large = True
                        break
                    destination.write(chunk)
        except Exception:
            _remove_quietly(filepath)
            raise

        if too_large:
            _remove_quietly(filepath)
            raise ValueError(f"File exceeds {self.max_size_bytes // (1024*1024)}MB limit")

        logger.info(f"Saved upload: {filepath.name} ({size} bytes) for user {user_id}")
        return str(filepath)

    def delete_file(self, filepath: str):
        """Delete a file from disk; failures are logged, not raised."""
        try:
            path = Path(filepath)
            if path.exists() and path.is_file():
                path.unlink()
                logger.info(f"Deleted file: {filepath}")
        except Exception as e:
            logger.warning(f"Failed to delete {filepath}: {e}")

    def cleanup_user_files(self, user_id: int):
        """Delete all files for a user."""
        user_dir = self.upload_folder / str(user_id)
        if user_dir.exists():
            shutil.rmtree(user_dir, ignore_errors=True)
            logger.info(f"Cleaned up files for user {user_id}")

    def get_filepath(self, filename: str, user_id: Optional[int] = None) -> Optional[str]:
        """Resolve filepath from filename. Checks user dir first, then root."""
        if user_id:
            user_path = self.upload_folder / str(user_id) / safe_filename(filename)
            if user_path.exists():
                return str(user_path)

        root_path = self.upload_folder / safe_filename(filename)
        if root_path.exists():
            return str(root_path)

        return None

    def validate_filepath(self, filepath: str) -> bool:
        """Validate that filepath is within upload folder (prevent traversal).

        The comparison is done on resolved path components, not on string
        prefixes, so a sibling such as ``uploads_evil`` is not mistaken for
        being inside ``uploads``.
        """
        try:
            resolved = Path(filepath).resolve()
            upload_resolved = self.upload_folder.resolve()
        except Exception:
            return False
        return resolved == upload_resolved or upload_resolved in resolved.parents
def detect_file(filepath: str) -> FileType:
    """Detect the type of a file."""
    return FileDetector.detect_file_type(filepath)


def extract_metadata(filepath: str, file_type: FileType) -> Dict[str, str]:
    """Read current metadata from file.

    Returns an empty dict when no extractor is registered for *file_type* or
    when extraction fails (the failure is logged).
    """
    extractor = EXTRACTORS.get(file_type)
    if not extractor:
        return {}
    try:
        return extractor.read_metadata(filepath)
    except Exception as e:
        logger.error(f"Failed to extract metadata from {filepath}: {e}")
        return {}


def extract_content(filepath: str, file_type: FileType) -> str:
    """Extract text content for AI analysis.

    Returns an empty string when no extractor is registered for *file_type*
    or when extraction fails (the failure is logged).
    """
    extractor = EXTRACTORS.get(file_type)
    if not extractor:
        return ""
    try:
        return extractor.extract_content(filepath)
    except Exception as e:
        logger.error(f"Failed to extract content from {filepath}: {e}")
        return ""


def update_file_metadata(
    filepath: str,
    file_type: FileType,
    metadata: Dict[str, str],
    backup: bool = False,
) -> bool:
    """Write metadata to file. Returns True on success."""
    updater = UPDATERS.get(file_type)
    if not updater:
        logger.error(f"No updater for file type: {file_type}")
        return False
    try:
        return updater.update_metadata(filepath, metadata, backup=backup)
    except Exception as e:
        logger.error(f"Failed to update metadata for {filepath}: {e}")
        return False


def verify_file_metadata(
    filepath: str,
    file_type: FileType,
    metadata: Dict[str, str],
) -> bool:
    """Verify metadata was written correctly.

    Returns False when no updater is registered for *file_type* or when the
    verification itself fails (the failure is logged).
    """
    updater = UPDATERS.get(file_type)
    if not updater:
        return False
    try:
        return updater.verify_metadata(filepath, metadata)
    except Exception as e:
        logger.error(f"Failed to verify metadata for {filepath}: {e}")
        return False


def _placeholder_metadata(filename: str, subject: str = "") -> Dict[str, str]:
    """Return fallback metadata titled after the filename stem."""
    return {"title": Path(filename).stem, "subject": subject, "keywords": ""}


async def process_uploaded_file(
    filepath: str,
    filename: str,
    metadata_source: str,
    lookup=None,
    import_map=None,
) -> Dict:
    """Process a single uploaded file through the full pipeline.

    Args:
        filepath: Path to uploaded file on disk.
        filename: Original filename.
        metadata_source: One of 'excel', 'ai', 'manual', 'import'.
        lookup: Excel lookup instance (for excel source).
        import_map: Metadata map dict (for import source).

    Returns:
        Dict with file processing results.  ``excel_found`` is True when the
        excel or import source contained an entry for this file.
    """
    file_type = detect_file(filepath)

    if file_type == FileType.UNSUPPORTED:
        return {"success": False, "filename": filename, "error": "Unsupported file type"}

    # Read current metadata
    old_metadata = extract_metadata(filepath, file_type)

    # Generate new metadata based on source; an unknown source (or a source
    # whose lookup/map is missing) leaves the suggestion empty.
    excel_found = False
    new_metadata = {"title": "", "subject": "", "keywords": ""}

    if metadata_source == "excel" and lookup:
        excel_data = lookup.lookup_by_filename(filename)
        if excel_data:
            new_metadata = {
                "title": excel_data.get("title", ""),
                "subject": excel_data.get("description", ""),
                # Carry over the keywords column instead of discarding it;
                # lookups without keywords still yield "".
                "keywords": excel_data.get("keywords", ""),
            }
            excel_found = True
        else:
            new_metadata = _placeholder_metadata(
                filename, f"No metadata found in Excel for {filename}"
            )

    elif metadata_source == "manual":
        new_metadata = _placeholder_metadata(filename)

    elif metadata_source == "ai":
        from .ai_service import generate_metadata_async

        content = extract_content(filepath, file_type)
        new_metadata = await generate_metadata_async(content, filename, file_type)

    elif metadata_source == "import" and import_map:
        from src.metadata_importer import MetadataImporter

        importer = MetadataImporter()
        imported = importer.get_metadata_for_file(import_map, filename)
        if imported:
            new_metadata = imported
            excel_found = True
        else:
            new_metadata = _placeholder_metadata(
                filename, f"No metadata found in imported file for {filename}"
            )

    return {
        "success": True,
        "filename": filename,
        "file_type": file_type.value,
        "current_metadata": old_metadata,
        "suggested_metadata": new_metadata,
        "filepath": filepath,
        "metadata_source": metadata_source,
        "excel_found": excel_found,
    }
0000000..e0c6487 --- /dev/null +++ b/app/session/store.py @@ -0,0 +1,298 @@ +"""SQLite-backed session store for file processing and import sessions.""" + +import json +import sqlite3 +import secrets +import logging +from datetime import datetime, timedelta +from typing import Optional, Dict, List, Any +from pathlib import Path + +logger = logging.getLogger(__name__) + + +class SessionStore: + """Persistent session store replacing in-memory dicts. + + Stores file processing sessions and imported metadata maps in SQLite, + surviving server restarts and supporting multi-worker deployments. + """ + + def __init__(self, db_path: str): + self.db_path = db_path + Path(db_path).parent.mkdir(parents=True, exist_ok=True) + self._init_tables() + + def _get_conn(self) -> sqlite3.Connection: + """Create a new connection per call (thread-safe).""" + conn = sqlite3.connect(self.db_path, timeout=10) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA journal_mode=WAL") + return conn + + def _init_tables(self): + conn = self._get_conn() + try: + conn.execute(""" + CREATE TABLE IF NOT EXISTS file_sessions ( + session_id TEXT PRIMARY KEY, + user_id INTEGER NOT NULL, + metadata_source TEXT DEFAULT 'manual', + import_session_id TEXT DEFAULT '', + files_json TEXT DEFAULT '[]', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + expires_at TIMESTAMP NOT NULL + ) + """) + conn.execute(""" + CREATE TABLE IF NOT EXISTS import_sessions ( + session_id TEXT PRIMARY KEY, + user_id INTEGER NOT NULL, + session_type TEXT DEFAULT 'import', + metadata_json TEXT DEFAULT '{}', + file_info_json TEXT DEFAULT '{}', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + expires_at TIMESTAMP NOT NULL + ) + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_fs_user ON file_sessions(user_id)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_fs_expires ON file_sessions(expires_at)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_is_user ON import_sessions(user_id)") + conn.execute("CREATE INDEX IF NOT EXISTS 
idx_is_expires ON import_sessions(expires_at)") + conn.commit() + logger.info(f"Session store initialized at {self.db_path}") + finally: + conn.close() + + # --- File Sessions --- + + def create_file_session( + self, + user_id: int, + metadata_source: str = "manual", + import_session_id: str = "", + expires_hours: int = 24, + ) -> str: + """Create a new file processing session with a secure random ID.""" + session_id = secrets.token_urlsafe(32) + expires_at = datetime.now() + timedelta(hours=expires_hours) + conn = self._get_conn() + try: + conn.execute( + "INSERT INTO file_sessions (session_id, user_id, metadata_source, import_session_id, expires_at) VALUES (?,?,?,?,?)", + (session_id, user_id, metadata_source, import_session_id, expires_at), + ) + conn.commit() + logger.info(f"Created file session {session_id[:8]}... for user {user_id}") + return session_id + finally: + conn.close() + + def get_file_session(self, session_id: str) -> Optional[Dict[str, Any]]: + """Get file session by ID. Returns None if expired or not found.""" + conn = self._get_conn() + try: + row = conn.execute( + "SELECT * FROM file_sessions WHERE session_id = ? AND expires_at > datetime('now')", + (session_id,), + ).fetchone() + if row: + result = dict(row) + result["files"] = json.loads(result.pop("files_json")) + return result + return None + finally: + conn.close() + + def add_file_to_session(self, session_id: str, file_entry: Dict[str, Any]): + """Add a processed file entry to a session.""" + conn = self._get_conn() + try: + row = conn.execute( + "SELECT files_json FROM file_sessions WHERE session_id = ?", + (session_id,), + ).fetchone() + if row: + files = json.loads(row["files_json"]) + files.append(file_entry) + conn.execute( + "UPDATE file_sessions SET files_json = ? 
WHERE session_id = ?", + (json.dumps(files, ensure_ascii=False), session_id), + ) + conn.commit() + finally: + conn.close() + + def update_file_in_session( + self, session_id: str, file_index: int, updates: Dict[str, Any] + ): + """Update specific fields of a file entry within a session.""" + conn = self._get_conn() + try: + row = conn.execute( + "SELECT files_json FROM file_sessions WHERE session_id = ?", + (session_id,), + ).fetchone() + if row: + files = json.loads(row["files_json"]) + if 0 <= file_index < len(files): + files[file_index].update(updates) + conn.execute( + "UPDATE file_sessions SET files_json = ? WHERE session_id = ?", + (json.dumps(files, ensure_ascii=False), session_id), + ) + conn.commit() + finally: + conn.close() + + def get_file_session_files(self, session_id: str) -> List[Dict[str, Any]]: + """Get just the files list from a session.""" + session = self.get_file_session(session_id) + if session: + return session["files"] + return [] + + def delete_file_session(self, session_id: str): + """Delete a file session.""" + conn = self._get_conn() + try: + conn.execute("DELETE FROM file_sessions WHERE session_id = ?", (session_id,)) + conn.commit() + finally: + conn.close() + + def get_user_file_sessions(self, user_id: int) -> List[str]: + """Get all active session IDs for a user.""" + conn = self._get_conn() + try: + rows = conn.execute( + "SELECT session_id FROM file_sessions WHERE user_id = ? 
AND expires_at > datetime('now')", + (user_id,), + ).fetchall() + return [row["session_id"] for row in rows] + finally: + conn.close() + + # --- Import Sessions --- + + def create_import_session( + self, + user_id: int, + session_type: str = "import", + metadata_map: Optional[Dict] = None, + file_info: Optional[Dict] = None, + expires_hours: int = 24, + ) -> str: + """Create an import/excel session.""" + session_id = f"{session_type}_{secrets.token_urlsafe(8)}" + expires_at = datetime.now() + timedelta(hours=expires_hours) + conn = self._get_conn() + try: + conn.execute( + "INSERT INTO import_sessions (session_id, user_id, session_type, metadata_json, file_info_json, expires_at) VALUES (?,?,?,?,?,?)", + ( + session_id, + user_id, + session_type, + json.dumps(metadata_map or {}, ensure_ascii=False), + json.dumps(file_info or {}, ensure_ascii=False), + expires_at, + ), + ) + conn.commit() + logger.info(f"Created {session_type} session {session_id} for user {user_id}") + return session_id + finally: + conn.close() + + def get_import_session(self, session_id: str) -> Optional[Dict[str, Any]]: + """Get import session by ID.""" + conn = self._get_conn() + try: + row = conn.execute( + "SELECT * FROM import_sessions WHERE session_id = ? 
AND expires_at > datetime('now')", + (session_id,), + ).fetchone() + if row: + result = dict(row) + result["metadata_map"] = json.loads(result.pop("metadata_json")) + result["file_info"] = json.loads(result.pop("file_info_json")) + return result + return None + finally: + conn.close() + + def update_import_session( + self, + session_id: str, + metadata_map: Optional[Dict] = None, + file_info: Optional[Dict] = None, + ): + """Update an import session's metadata map or file info.""" + conn = self._get_conn() + try: + updates = [] + params = [] + if metadata_map is not None: + updates.append("metadata_json = ?") + params.append(json.dumps(metadata_map, ensure_ascii=False)) + if file_info is not None: + updates.append("file_info_json = ?") + params.append(json.dumps(file_info, ensure_ascii=False)) + if updates: + params.append(session_id) + conn.execute( + f"UPDATE import_sessions SET {', '.join(updates)} WHERE session_id = ?", + params, + ) + conn.commit() + finally: + conn.close() + + def delete_import_session(self, session_id: str): + """Delete an import session.""" + conn = self._get_conn() + try: + conn.execute("DELETE FROM import_sessions WHERE session_id = ?", (session_id,)) + conn.commit() + finally: + conn.close() + + # --- Cleanup --- + + def cleanup_expired(self) -> int: + """Remove all expired sessions. Returns count of deleted rows.""" + conn = self._get_conn() + try: + c1 = conn.execute("DELETE FROM file_sessions WHERE expires_at < datetime('now')") + c2 = conn.execute("DELETE FROM import_sessions WHERE expires_at < datetime('now')") + conn.commit() + total = c1.rowcount + c2.rowcount + if total > 0: + logger.info(f"Cleaned up {total} expired sessions") + return total + finally: + conn.close() + + def cleanup_user_sessions(self, user_id: int) -> List[str]: + """Delete all sessions for a user. 
#!/bin/bash
# Solventum Image Metadata — Idempotent Deployment Script
# Usage: ./deploy.sh
#
# First run:
#   cd /opt/oliver-metadata-tool
#   cp .env.example .env   # edit with your secrets
#   chmod +x deploy.sh
#   ./deploy.sh
#
# Subsequent updates:
#   cd /opt/oliver-metadata-tool && ./deploy.sh
#
# Optional environment overrides (defaults are the previously hard-coded values):
#   HEALTH_URL       URL polled to decide that the app is up
#   HEALTH_RETRIES   number of polls before giving up
#   HEALTH_INTERVAL  seconds to sleep between polls

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
COMPOSE_PROJECT="solventum-image-metadata"
HEALTH_URL="${HEALTH_URL:-http://127.0.0.1:5001/login}"
HEALTH_RETRIES="${HEALTH_RETRIES:-20}"
HEALTH_INTERVAL="${HEALTH_INTERVAL:-3}"

cd "$SCRIPT_DIR"

echo "=== Solventum Image Metadata — Deploy ==="
echo "Directory: $SCRIPT_DIR"
echo ""

# The health check below relies on curl. Without this guard a missing curl is
# indistinguishable from an unhealthy app (every poll silently fails).
if ! command -v curl > /dev/null 2>&1; then
    echo "ERROR: curl is required for the post-deploy health check but was not found." >&2
    exit 1
fi

# 1. Pull latest code from Bitbucket
echo ">>> Pulling latest code..."
git pull

# 2. Check .env exists (first-run guard)
if [ ! -f .env ]; then
    echo ""
    echo "ERROR: .env file not found!"
    echo ""
    echo "  cp .env.example .env"
    echo "  Then edit .env with your secrets (AZURE_CLIENT_SECRET, SECRET_KEY, etc.)"
    echo ""
    exit 1
fi

# 3. Build Docker image (uses layer cache, picks up code changes via COPY . .)
echo ">>> Building Docker image..."
docker compose -p "$COMPOSE_PROJECT" build

# 4. Start containers (idempotent — creates them if missing, recreates them
#    when the image or configuration changed, otherwise leaves them running)
echo ">>> Starting containers..."
docker compose -p "$COMPOSE_PROJECT" up -d

# 5. Wait for health check
# Database auto-initializes on first container startup:
# - Tables created via CREATE TABLE IF NOT EXISTS
# - Migrations run in-code (check-before-act pattern)
# - Superadmin created if SUPERADMIN_EMAIL is set
echo ">>> Waiting for app to be healthy..."
HEALTHY=false
for i in $(seq 1 "$HEALTH_RETRIES"); do
    if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
        echo ">>> App is healthy!"
        HEALTHY=true
        break
    fi
    echo "  Waiting... ($i/$HEALTH_RETRIES)"
    sleep "$HEALTH_INTERVAL"
done

if [ "$HEALTHY" = false ]; then
    echo ""
    # Derived from the settings above so the message stays truthful when overridden.
    echo "WARNING: App may not be healthy after $((HEALTH_RETRIES * HEALTH_INTERVAL)) seconds."
    echo "Check logs:"
    echo "  docker compose -p $COMPOSE_PROJECT logs --tail 50"
    echo ""
    exit 1
fi

echo ""
echo "=== Deploy complete ==="
echo "URL: https://ai-sandbox.oliver.solutions/solventum-image-metadata/"
echo ""
docker compose -p "$COMPOSE_PROJECT" ps
Oliver Metadata Tool Deployment ===" +echo "Directory: $APP_DIR" +echo "Service: $SERVICE_NAME" +echo "" + +# Check we're running as root or with sudo +if [ "$EUID" -ne 0 ]; then + echo "Please run with sudo" + exit 1 +fi + +cd "$APP_DIR" + +# First run setup +if [ "${1:-}" = "--first-run" ]; then + echo ">>> First-run setup..." + + # System dependencies + apt-get update + apt-get install -y python3.11 python3.11-venv python3.11-dev \ + libimage-exiftool-perl tesseract-ocr tesseract-ocr-eng \ + tesseract-ocr-chi-sim tesseract-ocr-chi-tra tesseract-ocr-jpn tesseract-ocr-kor \ + poppler-utils ffmpeg gcc + + # Create venv + python3.11 -m venv "$VENV_DIR" + + # Create directories + mkdir -p "$APP_DIR/uploads" "$APP_DIR/data" "$APP_DIR/templates_saved" + + # Set permissions + chown -R www-data:www-data "$APP_DIR" + + # Install systemd service + cp "$APP_DIR/deploy/oliver-metadata.service" /etc/systemd/system/ + systemctl daemon-reload + systemctl enable "$SERVICE_NAME" + + # Install Apache config (if Apache is installed) + if command -v apache2 &> /dev/null; then + cp "$APP_DIR/deploy/oliver-metadata.conf" /etc/apache2/sites-available/ + a2enmod proxy proxy_http headers rewrite ssl expires + a2ensite oliver-metadata + echo ">>> Apache config installed. Update SSL paths and restart Apache." + fi + + echo ">>> First-run setup complete." + echo ">>> Edit $APP_DIR/.env before starting the service." + echo "" +fi + +# Pull latest code +echo ">>> Pulling latest code..." +sudo -u www-data git pull origin "$REPO_BRANCH" + +# Install/update Python deps +echo ">>> Installing Python dependencies..." +"$VENV_DIR/bin/pip" install --upgrade pip +"$VENV_DIR/bin/pip" install -r requirements.txt + +# Restart service +echo ">>> Restarting service..." +systemctl restart "$SERVICE_NAME" + +# Wait for health +echo ">>> Waiting for service to start..." 
+sleep 3 + +# Health check +for i in {1..10}; do + if curl -sf http://127.0.0.1:5001/login > /dev/null 2>&1; then + echo ">>> Service is healthy!" + systemctl status "$SERVICE_NAME" --no-pager -l + echo "" + echo "=== Deployment complete ===" + exit 0 + fi + echo " Waiting... ($i/10)" + sleep 2 +done + +echo ">>> WARNING: Service may not be healthy. Check logs:" +echo " journalctl -u $SERVICE_NAME -n 50 --no-pager" +exit 1 diff --git a/deploy/oliver-metadata.conf b/deploy/oliver-metadata.conf new file mode 100644 index 0000000..680d67b --- /dev/null +++ b/deploy/oliver-metadata.conf @@ -0,0 +1,57 @@ + + ServerName metadata.oliver.agency + + # SSL — provide your own certificates + SSLEngine on + SSLCertificateFile /etc/ssl/certs/oliver-metadata.crt + SSLCertificateKeyFile /etc/ssl/private/oliver-metadata.key + # SSLCertificateChainFile /etc/ssl/certs/ca-bundle.crt + + # Serve static files directly via Apache (bypass gunicorn) + Alias /static /var/www/oliver/static + + Require all granted + Options -Indexes + ExpiresActive On + ExpiresDefault "access plus 1 week" + Header set Cache-Control "public, max-age=604800" + + + # Proxy to gunicorn/uvicorn + ProxyPreserveHost On + ProxyPass /static ! 
+ ProxyPass / http://127.0.0.1:5001/ + ProxyPassReverse / http://127.0.0.1:5001/ + + # SSE support — disable buffering for event streams + + ProxyPass http://127.0.0.1:5001 + ProxyPassReverse http://127.0.0.1:5001 + SetEnv proxy-sendchunked 1 + SetEnv proxy-interim-response RFC + + + # Timeouts (AI generation can take 30+ seconds per file) + ProxyTimeout 120 + Timeout 120 + + # Upload size limit (500MB) + LimitRequestBody 524288000 + + # Security headers + Header always set X-Content-Type-Options "nosniff" + Header always set X-Frame-Options "DENY" + Header always set X-XSS-Protection "1; mode=block" + Header always set Referrer-Policy "strict-origin-when-cross-origin" + + # Logging + ErrorLog ${APACHE_LOG_DIR}/oliver-metadata-error.log + CustomLog ${APACHE_LOG_DIR}/oliver-metadata-access.log combined + + +# Redirect HTTP to HTTPS + + ServerName metadata.oliver.agency + RewriteEngine On + RewriteRule ^(.*)$ https://%{HTTP_HOST}$1 [R=301,L] + diff --git a/deploy/oliver-metadata.service b/deploy/oliver-metadata.service new file mode 100644 index 0000000..072a427 --- /dev/null +++ b/deploy/oliver-metadata.service @@ -0,0 +1,37 @@ +[Unit] +Description=Oliver Metadata Tool (FastAPI) +After=network.target +Wants=network-online.target + +[Service] +Type=notify +User=www-data +Group=www-data +WorkingDirectory=/var/www/oliver +Environment="PATH=/var/www/oliver/venv/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile=/var/www/oliver/.env + +ExecStart=/var/www/oliver/venv/bin/gunicorn app.main:app \ + --worker-class uvicorn.workers.UvicornWorker \ + --workers 2 \ + --bind 127.0.0.1:5001 \ + --timeout 120 \ + --graceful-timeout 30 \ + --access-logfile - \ + --error-logfile - + +ExecReload=/bin/kill -s HUP $MAINPID +KillMode=mixed +TimeoutStopSec=10 +Restart=on-failure +RestartSec=5 + +# Security hardening +NoNewPrivileges=yes +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/www/oliver/uploads /var/www/oliver/data /var/www/oliver/oliver_metadata.db 
#!/bin/bash
# Oliver Metadata Tool - Docker Management Script
#
# Thin wrapper around Docker Compose (build/start/stop/restart/logs/status/clean).
# Works with either the standalone `docker-compose` binary or the
# `docker compose` v2 plugin; check_docker picks whichever is installed.

set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Compose command as an array, filled in by check_docker.
COMPOSE=()

# Functions
print_header() {
    echo -e "${BLUE}============================================${NC}"
    echo -e "${BLUE} Oliver Metadata Tool - Docker Manager${NC}"
    echo -e "${BLUE}============================================${NC}"
}

print_success() {
    echo -e "${GREEN}✓ $1${NC}"
}

print_error() {
    echo -e "${RED}✗ $1${NC}"
}

print_info() {
    echo -e "${YELLOW}ℹ $1${NC}"
}

# Check that Docker is installed and decide which Compose command to use.
# The standalone binary is preferred (that is what this script always called);
# the v2 plugin is the fallback, so hosts that only have `docker compose`
# no longer pass this check and then fail on the first real command.
check_docker() {
    if ! command -v docker &> /dev/null; then
        print_error "Docker is not installed. Please install Docker first."
        exit 1
    fi

    if command -v docker-compose &> /dev/null; then
        COMPOSE=(docker-compose)
    elif docker compose version &> /dev/null; then
        COMPOSE=(docker compose)
    else
        print_error "Docker Compose is not installed. Please install Docker Compose first."
        exit 1
    fi
}

# Build Docker image
build() {
    print_header
    print_info "Building Docker image..."
    "${COMPOSE[@]}" build
    print_success "Docker image built successfully"
}

# Start containers
start() {
    print_header
    print_info "Starting Oliver Metadata Tool..."
    "${COMPOSE[@]}" up -d
    print_success "Application started successfully"
    print_info "Access the application at: http://localhost:5001"
    print_info "Default credentials: tester / oliveradmin"
}

# Stop containers
stop() {
    print_header
    print_info "Stopping Oliver Metadata Tool..."
    "${COMPOSE[@]}" down
    print_success "Application stopped successfully"
}

# View logs
logs() {
    print_header
    print_info "Showing application logs (Ctrl+C to exit)..."
    "${COMPOSE[@]}" logs -f
}

# Restart containers
restart() {
    print_header
    print_info "Restarting Oliver Metadata Tool..."
    "${COMPOSE[@]}" restart
    print_success "Application restarted successfully"
}

# Show status
status() {
    print_header
    "${COMPOSE[@]}" ps
}

# Clean up (remove containers and volumes) after an explicit "yes"
clean() {
    print_header
    print_error "WARNING: This will remove all containers, volumes, and data!"
    read -p "Are you sure? (yes/no): " confirm
    if [ "$confirm" == "yes" ]; then
        print_info "Cleaning up..."
        "${COMPOSE[@]}" down -v
        print_success "Cleanup completed"
    else
        print_info "Cleanup cancelled"
    fi
}

# Show help
show_help() {
    print_header
    echo ""
    echo "Usage: ./docker-run.sh [command]"
    echo ""
    echo "Commands:"
    echo " build - Build Docker image"
    echo " start - Start the application"
    echo " stop - Stop the application"
    echo " restart - Restart the application"
    echo " logs - View application logs"
    echo " status - Show container status"
    echo " clean - Remove containers and volumes (WARNING: deletes data)"
    echo " help - Show this help message"
    echo ""
    echo "Examples:"
    echo " ./docker-run.sh build # Build image"
    echo " ./docker-run.sh start # Start application"
    echo " ./docker-run.sh logs # View logs"
    echo ""
}

# Main script
check_docker

case "$1" in
    build)
        build
        ;;
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        restart
        ;;
    logs)
        logs
        ;;
    status)
        status
        ;;
    clean)
        clean
        ;;
    help|--help|-h)
        show_help
        ;;
    "")
        show_help
        ;;
    *)
        print_error "Unknown command: $1"
        show_help
        exit 1
        ;;
esac
+ +- **Unified API**: Single tool handles images, videos, PDFs, and more +- **300+ formats**: Support for virtually all media file types +- **Better performance**: Optimized batch operations (10-60x faster) +- **Battle-tested**: 20+ years of development and widespread use +- **PDF writing support**: Can write PDF metadata (unlike pypdf) + +## Installation + +### macOS + +```bash +brew install exiftool +``` + +Verify installation: +```bash +exiftool -ver +# Should show version 12.15 or higher +``` + +### Linux (Ubuntu/Debian) + +```bash +sudo apt-get update +sudo apt-get install libimage-exiftool-perl +``` + +Verify installation: +```bash +exiftool -ver +``` + +### Linux (Fedora/RHEL/CentOS) + +```bash +sudo yum install perl-Image-ExifTool +``` + +### Windows + +**Option 1: Chocolatey** +```powershell +choco install exiftool +``` + +**Option 2: Manual installation** +1. Download from https://exiftool.org/ +2. Extract the `.zip` file +3. Rename `exiftool(-k).exe` to `exiftool.exe` +4. Add the directory to your PATH + +Verify installation: +```powershell +exiftool -ver +``` + +## Verification + +After installation, verify ExifTool is accessible: + +```bash +# Check version +exiftool -ver + +# Check location +which exiftool # macOS/Linux +where exiftool # Windows + +# Test with a file +exiftool your-image.jpg +``` + +## What Oliver Metadata Tool Uses ExifTool For + +### Supported Operations + +1. **Images (JPEG, PNG, GIF, TIFF, HEIC, RAW formats)** + - Read/write Title, Description, Keywords + - Access EXIF, IPTC, XMP metadata + - Support for camera metadata + +2. **Videos (MP4, MOV, AVI, MKV)** + - Read/write Title, Description, Keywords + - QuickTime metadata support + - Unified API across formats + +3. 
**PDFs** + - Read/write PDF metadata fields + - Better than pypdf for metadata writing + - Preserves document structure + +### Format Coverage + +ExifTool provides support for these additional formats beyond Python libraries: + +- **Images**: HEIC, CR2, NEF, ARW, DNG (RAW formats) +- **Video**: MKV, WebM, FLV, WMV (extended video formats) +- **Audio**: MP3, FLAC, WAV, OGG (audio files) +- **Documents**: EPUB, MOBI (ebook formats) +- **3D/CAD**: STL, DWG, DXF +- And 250+ more formats + +## PyExifTool Python Wrapper + +Oliver Metadata Tool uses the PyExifTool library to interact with ExifTool from Python: + +```python +from exiftool import ExifToolHelper + +# Read metadata +with ExifToolHelper() as et: + metadata = et.get_metadata(["image.jpg"]) + print(metadata[0]) + +# Write metadata +with ExifToolHelper() as et: + et.set_tags( + ["image.jpg"], + tags={"EXIF:ImageDescription": "New Title"}, + params=["-overwrite_original"] + ) +``` + +### Batch Mode Performance + +PyExifTool uses ExifTool's `-stay_open` mode, which keeps one ExifTool process running for multiple operations: + +- **Single file operations**: ~50-100ms overhead +- **Batch operations (100 files)**: 10-60x faster than spawning new processes +- **Memory efficient**: One process handles all operations + +## Troubleshooting + +### ExifTool not found + +**Error:** `ExifTool not found` or `exiftool command not available` + +**Solution:** +1. Install ExifTool using the instructions above +2. Restart your terminal/command prompt +3. Verify with `exiftool -ver` +4. 
If still not found, check your PATH environment variable + +### Permission denied + +**Error:** `Permission denied when executing exiftool` + +**Solution (macOS/Linux):** +```bash +chmod +x /path/to/exiftool +``` + +### PyExifTool import error + +**Error:** `ModuleNotFoundError: No module named 'exiftool'` + +**Solution:** +```bash +pip install "PyExifTool>=0.5.6" +``` + +### Encoding issues with Unicode filenames + +ExifTool handles Unicode filenames natively. If you encounter issues: + +1. Ensure your terminal supports UTF-8 +2. Use the PyExifTool wrapper (handles encoding automatically) +3. Check that the file system supports Unicode filenames + +## Performance Tips + +### Use batch mode for multiple files + +```python +# Good: Process multiple files in one batch +with ExifToolHelper() as et: + et.set_tags( + ["file1.jpg", "file2.jpg", "file3.jpg"], + tags={"EXIF:ImageDescription": "Title"}, + params=["-overwrite_original"] + ) + +# Avoid: Processing files one at a time +for file in files: + with ExifToolHelper() as et: + et.set_tags([file], tags={...}) +``` + +### Use specific tag names + +```python +# Good: Specific tag queries +et.get_tags(["image.jpg"], tags=["EXIF:ImageDescription", "XMP:Title"]) + +# Slower: Extract all tags +et.get_metadata(["image.jpg"]) # Returns 100+ tags +``` + +### Skip unnecessary tags with -fast + +For read-only operations where you only need basic metadata: + +```python +et.execute("-fast", "-json", "image.jpg") +``` + +## Integration with Oliver Metadata Tool + +Oliver Metadata Tool automatically detects ExifTool and uses it when available: + +1. **On startup**: Checks for ExifTool installation +2. **Hybrid approach**: Uses ExifTool for images/video/PDF, Python libraries for Office docs +3.
**Graceful fallback**: Falls back to pure Python if ExifTool unavailable + +### Check ExifTool status + +```python +from src.config import Config + +if Config.check_exiftool(): + print("ExifTool available") +else: + print("Using Python libraries") +``` + +## References + +- [ExifTool Official Website](https://exiftool.org/) +- [ExifTool Documentation](https://exiftool.org/exiftool_pod.html) +- [PyExifTool GitHub](https://github.com/sylikc/pyexiftool) +- [PyExifTool Documentation](https://sylikc.github.io/pyexiftool/) +- [Supported File Types](https://exiftool.org/#supported) +- [Tag Names Reference](https://exiftool.org/TagNames/) + +## License + +ExifTool is free software licensed under the Perl Artistic License or GPL version 1 or later. diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c54a80d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,54 @@ +# Core Libraries +python-magic>=0.4.27 +python-dotenv>=1.0.1 +tqdm>=4.66.0 + +# Excel Processing +pandas>=2.0.0 +openpyxl>=3.1.0 + +# PDF Processing +pypdf>=4.0.0 +pdfplumber>=0.11.0 +PyPDF2>=3.0.0 + +# Image Processing +Pillow>=10.2.0 +pytesseract>=0.3.0 +pdf2image>=1.16.0 +piexif>=1.1.0 +iptcinfo3>=2.1.0 + +# Office Documents +python-docx>=1.0.0 +python-pptx>=0.6.0 + +# Video Processing +mutagen>=1.45.0 +ffmpeg-python>=0.2.0 +pymediainfo>=7.0.0 + +# AI & Metadata Generation +openai>=1.0.0 +tiktoken>=0.5.0 +tenacity>=8.2.0 + +# ExifTool Integration (optional but recommended) +PyExifTool>=0.5.6 + +# Web Framework (FastAPI) +fastapi>=0.109.0 +uvicorn[standard]>=0.27.0 +gunicorn>=21.2.0 +python-multipart>=0.0.6 +pydantic-settings>=2.1.0 +jinja2>=3.1.0 + +# Password Hashing (from Flask ecosystem, still needed) +Werkzeug>=3.0.0 + +# Authentication & SSO +msal>=1.20.0 # Microsoft Authentication Library for SSO (optional) + +# Security +slowapi>=0.1.9 diff --git a/run.py b/run.py new file mode 100644 index 0000000..5a6c699 --- /dev/null +++ b/run.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 
def authenticate_user(username: str, password: str) -> Dict:
    """
    Authenticate user with username and password.

    Two paths can succeed:
    1. The built-in test account (username 'tester' with its fixed password),
       provided that user exists in the database and the shortcut has not
       been disabled through the ALLOW_TEST_LOGIN environment variable.
    2. Any database user whose stored password hash matches the password.

    Args:
        username: Username
        password: Plain text password

    Returns:
        Dictionary with 'success' boolean and either 'user' dict or 'error' message
    """
    try:
        # Imported here so a missing werkzeug is reported as an auth failure
        # (ImportError branch below) instead of breaking module import.
        from werkzeug.security import check_password_hash

        user = db.get_user_by_username(username)

        # SECURITY(review): hard-coded credential. It is kept enabled by default
        # for backward compatibility, but production deployments should set
        # ALLOW_TEST_LOGIN=false (or remove the 'tester' user) so this shortcut
        # cannot be used to bypass the stored password hash.
        test_login_enabled = os.getenv('ALLOW_TEST_LOGIN', 'true').strip().lower() not in (
            '0', 'false', 'no', 'off'
        )
        if (
            test_login_enabled
            and user
            and username == 'tester'
            and isinstance(password, str)
            # Constant-time comparison; bytes are used because compare_digest
            # rejects non-ASCII str input.
            and secrets.compare_digest(password.encode('utf-8'), b'oliveradmin')
        ):
            logger.info(f"Test user '{username}' authenticated successfully")
            return {'success': True, 'user': user}

        # Regular path: verify against the hash stored in the database.
        if user and user.get('password_hash'):
            if check_password_hash(user['password_hash'], password):
                logger.info(f"User '{username}' authenticated successfully (database)")
                return {'success': True, 'user': user}

        logger.warning(f"Authentication failed for user '{username}'")
        return {'success': False, 'error': 'Invalid username or password'}

    except ImportError:
        logger.error("werkzeug not available - cannot verify passwords")
        return {'success': False, 'error': 'Authentication system not available'}
    except Exception as e:
        # Boundary handler: never leak internals to the caller, only log them.
        logger.error(f"Authentication error: {e}")
        return {'success': False, 'error': 'Authentication error occurred'}
def get_current_user() -> Optional[Dict]:
    """
    Look up the user record for the ID stored in the current session.

    Returns:
        User dictionary, or None when the session carries no user ID
    """
    current_id = session.get('user_id')
    # A missing/falsy ID means nobody is logged in; skip the database call.
    return db.get_user_by_id(current_id) if current_id else None
def get_user_info(self, access_token: str) -> Optional[Dict]:
    """
    Fetch the signed-in user's profile from the Microsoft Graph `/me` endpoint.

    Args:
        access_token: Access token from Microsoft

    Returns:
        Decoded JSON profile, or None when SSO is disabled, the response
        status is not 200, or the request raises
    """
    if not self.enabled:
        return None

    try:
        import requests

        graph_reply = requests.get(
            'https://graph.microsoft.com/v1.0/me',
            headers={'Authorization': f'Bearer {access_token}'},
            timeout=10,
        )

        # Guard clause: anything but 200 is logged and treated as "no user".
        if graph_reply.status_code != 200:
            logger.error(f"Graph API error: {graph_reply.status_code}")
            return None

        return graph_reply.json()

    except Exception as e:
        logger.error(f"Error fetching user info: {e}")
        return None
class BaseExtractor(ABC):
    """Abstract base class for content extractors.

    Subclasses implement ``extract_content`` and ``read_metadata``; the
    text helpers below are shared by all of them.
    """

    @abstractmethod
    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from file.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content
        """
        pass

    @abstractmethod
    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read existing metadata from file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary of metadata fields
        """
        pass

    def truncate_content(self, content: str, max_length: int = 3000) -> str:
        """
        Truncate content to maximum length for AI processing.

        Args:
            content: Text content
            max_length: Maximum length

        Returns:
            The content unchanged when it fits, otherwise its first
            ``max_length`` characters followed by "..."
        """
        if len(content) <= max_length:
            return content
        return content[:max_length] + "..."

    def clean_text(self, text: str) -> str:
        """
        Clean extracted text (remove excessive whitespace, etc.).

        Runs of spaces/tabs inside a line are collapsed to a single space and
        blank lines are dropped, while the remaining line breaks are kept.

        Args:
            text: Raw text

        Returns:
            Cleaned text (empty string for empty or missing input)
        """
        if not text:
            return ""
        # Normalise each line on its own. Splitting the whole text on
        # whitespace first would also swallow the newlines, which made the
        # blank-line removal a no-op and flattened everything to one line.
        normalised = (' '.join(line.split()) for line in text.split('\n'))
        return '\n'.join(line for line in normalised if line)
def _env_number(name, default, cast):
    """Read a numeric setting from the environment.

    Args:
        name: Environment variable name
        default: Value returned when the variable is unset or blank
        cast: ``int`` or ``float``

    Returns:
        The converted value, or ``default`` when the variable is unset/blank

    Raises:
        ValueError: If the variable holds a non-numeric value (the message
            names the offending variable)
    """
    raw = os.getenv(name)
    # A blank entry such as "MAX_TOKENS=" in a .env file means "not configured".
    if raw is None or not raw.strip():
        return default
    try:
        return cast(raw.strip())
    except ValueError as exc:
        raise ValueError(f"Invalid value for {name}: {raw!r}") from exc


class Config:
    """Configuration class for managing settings.

    All values are class attributes resolved once, when the module is
    imported; environment variables override the defaults shown here.
    """

    # App Info
    APP_NAME = "Oliver Metadata Tool"
    APP_VERSION = "3.0.0"
    APP_DESCRIPTION = "Universal metadata creation and management tool"

    # Paths
    PROJECT_ROOT = Path(__file__).parent.parent
    OUTPUT_DIR = PROJECT_ROOT / 'output'
    BACKUP_DIR = OUTPUT_DIR / 'backup'
    REPORTS_DIR = OUTPUT_DIR / 'reports'

    # External tool paths (optional)
    TESSERACT_PATH = os.getenv('TESSERACT_PATH')
    FFMPEG_PATH = os.getenv('FFMPEG_PATH')

    # Processing Settings
    PDF_MAX_PAGES = 3  # Maximum pages to extract from PDF

    # OCR Settings - languages for Tesseract (CGA region support)
    # eng=English, chi_sim=Chinese Simplified, chi_tra=Chinese Traditional,
    # jpn=Japanese, kor=Korean
    OCR_LANGUAGES = os.getenv('OCR_LANGUAGES', 'eng+chi_sim+chi_tra+jpn+kor')

    # AI Settings (for CLI and Web AI mode)
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    AI_MODEL = os.getenv('AI_MODEL', 'gpt-4o-mini')  # Better than gpt-3.5-turbo
    MAX_TOKENS = _env_number('MAX_TOKENS', 500, int)
    TEMPERATURE = _env_number('TEMPERATURE', 0.5, float)  # 0.5 better for factual content
    MAX_TEXT_LENGTH = _env_number('MAX_TEXT_LENGTH', 4000, int)

    # API Rate Limiting & Retry (from open source analysis)
    API_TIMEOUT = _env_number('API_TIMEOUT', 30, int)
    API_MAX_RETRIES = _env_number('API_MAX_RETRIES', 3, int)
    API_RETRY_DELAY = _env_number('API_RETRY_DELAY', 1.0, float)  # exponential backoff multiplier

    @classmethod
    def ensure_directories(cls):
        """Ensure required directories exist."""
        for directory in (cls.OUTPUT_DIR, cls.BACKUP_DIR, cls.REPORTS_DIR):
            # parents=True keeps this safe even if a parent was removed.
            directory.mkdir(parents=True, exist_ok=True)

    @classmethod
    def check_exiftool(cls):
        """Check if ExifTool is installed.

        Returns:
            True when an ``exiftool`` executable is found on PATH, else False
        """
        exiftool_path = shutil.which('exiftool')
        if not exiftool_path:
            logger.warning("⚠️ ExifTool not found. Install with: brew install exiftool (macOS) or apt-get install libimage-exiftool-perl (Linux)")
            return False
        logger.info(f"✓ ExifTool found at {exiftool_path}")
        return True
+ """ + + def __init__(self, db_path: str = None): + # Auto-detect database path based on environment + if db_path is None: + DOCKER_MODE = os.getenv('DOCKER_MODE', 'false').lower() == 'true' + if DOCKER_MODE: + db_dir = Path('/app/data') + db_dir.mkdir(parents=True, exist_ok=True) + db_path = str(db_dir / 'oliver_metadata.db') + else: + db_path = 'oliver_metadata.db' + + self.db_path = db_path + Path(db_path).parent.mkdir(parents=True, exist_ok=True) + self._create_tables() + logger.info(f"Database initialized at {db_path}") + + def _get_conn(self) -> sqlite3.Connection: + """Create a new connection per call (thread-safe).""" + conn = sqlite3.connect(self.db_path, timeout=10) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA journal_mode=WAL") + return conn + + def _create_tables(self): + """Create database tables if they don't exist.""" + conn = self._get_conn() + try: + # Users table (with role column) + conn.execute(''' + CREATE TABLE IF NOT EXISTS users ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + username TEXT UNIQUE NOT NULL, + password_hash TEXT, + email TEXT, + full_name TEXT, + role TEXT DEFAULT 'user', + auth_method TEXT DEFAULT 'local', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + last_login TIMESTAMP, + is_active BOOLEAN DEFAULT 1 + ) + ''') + + # Sessions table + conn.execute(''' + CREATE TABLE IF NOT EXISTS sessions ( + session_id TEXT PRIMARY KEY, + user_id INTEGER NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + expires_at TIMESTAMP NOT NULL, + ip_address TEXT, + user_agent TEXT, + FOREIGN KEY (user_id) REFERENCES users (id) + ) + ''') + + # Audit log table + conn.execute(''' + CREATE TABLE IF NOT EXISTS audit_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + user_id INTEGER NOT NULL, + action TEXT NOT NULL, + details TEXT, + timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (user_id) REFERENCES users (id) + ) + ''') + + # AI usage table + conn.execute(''' + CREATE TABLE IF NOT EXISTS ai_usage ( + id INTEGER 
PRIMARY KEY AUTOINCREMENT, + user_id INTEGER NOT NULL, + filename TEXT, + tokens_total INTEGER DEFAULT 0, + model TEXT DEFAULT '', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (user_id) REFERENCES users (id) + ) + ''') + + # Indexes + conn.execute('CREATE INDEX IF NOT EXISTS idx_sessions_user_id ON sessions(user_id)') + conn.execute('CREATE INDEX IF NOT EXISTS idx_sessions_expires_at ON sessions(expires_at)') + conn.execute('CREATE INDEX IF NOT EXISTS idx_audit_user_id ON audit_log(user_id)') + conn.execute('CREATE INDEX IF NOT EXISTS idx_audit_timestamp ON audit_log(timestamp)') + conn.execute('CREATE INDEX IF NOT EXISTS idx_ai_usage_user_id ON ai_usage(user_id)') + conn.execute('CREATE INDEX IF NOT EXISTS idx_ai_usage_created ON ai_usage(created_at)') + + conn.commit() + logger.info("Database tables created/verified") + + # Add role column to existing databases (migration) + self._migrate_add_role_column(conn) + + # Create test user if enabled + enable_test = os.getenv('ENABLE_TEST_USER', 'false').lower() == 'true' + if enable_test: + self._create_test_user(conn) + + # Create superadmin if configured + superadmin_email = os.getenv('SUPERADMIN_EMAIL', '') + if superadmin_email: + self._create_superadmin(conn, superadmin_email) + + finally: + conn.close() + + def _migrate_add_role_column(self, conn: sqlite3.Connection): + """Add role column if it doesn't exist (for existing databases).""" + try: + cursor = conn.execute("PRAGMA table_info(users)") + columns = [row['name'] for row in cursor.fetchall()] + if 'role' not in columns: + conn.execute("ALTER TABLE users ADD COLUMN role TEXT DEFAULT 'user'") + conn.commit() + logger.info("Added 'role' column to users table") + except Exception as e: + logger.error(f"Error migrating role column: {e}") + + def _create_test_user(self, conn: sqlite3.Connection): + """Create test user (tester/oliveradmin) if doesn't exist.""" + try: + cursor = conn.execute('SELECT id FROM users WHERE username = ?', ('tester',)) 
+ if not cursor.fetchone(): + try: + from werkzeug.security import generate_password_hash + password_hash = generate_password_hash('oliveradmin') + conn.execute( + 'INSERT INTO users (username, password_hash, email, full_name, role, auth_method) VALUES (?, ?, ?, ?, ?, ?)', + ('tester', password_hash, 'tester@oliver.local', 'Test User', 'user', 'local'), + ) + conn.commit() + logger.info("Test user 'tester' created") + except ImportError: + logger.warning("werkzeug not available - test user not created") + except Exception as e: + logger.error(f"Error creating test user: {e}") + + def _create_superadmin(self, conn: sqlite3.Connection, email: str): + """Create or promote superadmin user.""" + try: + username = email.split('@')[0] + cursor = conn.execute('SELECT id, role FROM users WHERE username = ? OR email = ?', (username, email)) + row = cursor.fetchone() + if row: + if row['role'] != 'admin': + conn.execute('UPDATE users SET role = ? WHERE id = ?', ('admin', row['id'])) + conn.commit() + logger.info(f"Promoted user '{username}' to admin") + else: + conn.execute( + 'INSERT INTO users (username, email, full_name, role, auth_method) VALUES (?, ?, ?, ?, ?)', + (username, email, username, 'admin', 'sso'), + ) + conn.commit() + logger.info(f"Created superadmin user '{username}' ({email})") + except Exception as e: + logger.error(f"Error creating superadmin: {e}") + + # --- User Operations --- + + def get_user_by_username(self, username: str) -> Optional[Dict]: + """Get user by username.""" + conn = self._get_conn() + try: + cursor = conn.execute('SELECT * FROM users WHERE username = ? 
def create_session(
    self,
    user_id: int,
    session_id: str,
    expires_in_hours: int = 24,
    ip_address: Optional[str] = None,
    user_agent: Optional[str] = None,
) -> bool:
    """Create new session for user.

    Args:
        user_id: ID of the user owning the session
        session_id: Unique session identifier (primary key)
        expires_in_hours: Session lifetime in hours
        ip_address: Client IP address, if known
        user_agent: Client user agent string, if known

    Returns:
        True if the session row was stored, False on any error
    """
    # Function-scope import: the module header only imports datetime/timedelta.
    from datetime import timezone

    conn = self._get_conn()
    try:
        # Expiry is compared against SQLite's CURRENT_TIMESTAMP, which is UTC
        # text in 'YYYY-MM-DD HH:MM:SS' form. Store the same clock and format:
        # a naive local datetime.now() shifts every expiry by the server's UTC
        # offset, and passing a datetime object relies on sqlite3's deprecated
        # default adapter.
        expires_at = (
            datetime.now(timezone.utc) + timedelta(hours=expires_in_hours)
        ).strftime('%Y-%m-%d %H:%M:%S')
        conn.execute(
            'INSERT INTO sessions (session_id, user_id, expires_at, ip_address, user_agent) VALUES (?, ?, ?, ?, ?)',
            (session_id, user_id, expires_at, ip_address, user_agent),
        )
        conn.commit()
        return True
    except Exception as e:
        logger.error(f"Error creating session: {e}")
        return False
    finally:
        conn.close()
def get_all_users(self, include_inactive: bool = False) -> List[Dict]:
    """Get all users, newest first.

    Args:
        include_inactive: When False (default) only rows with
            ``is_active = 1`` are returned

    Returns:
        List of user rows as dictionaries; empty list on error
    """
    conn = self._get_conn()
    try:
        active_filter = '' if include_inactive else ' WHERE is_active = 1'
        sql = 'SELECT * FROM users' + active_filter + ' ORDER BY created_at DESC'
        return list(map(dict, conn.execute(sql).fetchall()))
    except Exception as err:
        logger.error(f"Error fetching users: {err}")
        return []
    finally:
        conn.close()
def update_user(self, user_id: int, updates: Dict) -> bool:
    """Update user fields. Returns True on success.

    Only ``role``, ``is_active``, ``full_name`` and ``email`` may be changed;
    any other key in ``updates`` is ignored.

    Args:
        user_id: ID of the user to update
        updates: Mapping of column name to new value

    Returns:
        True if a row was modified; False when nothing updatable was
        supplied, no row matched, or an error occurred
    """
    editable = {'role', 'is_active', 'full_name', 'email'}
    columns = [name for name in updates if name in editable]
    if not columns:
        return False

    conn = self._get_conn()
    try:
        # Column names come from the fixed whitelist above, values are bound.
        set_clause = ', '.join(name + ' = ?' for name in columns)
        params = [updates[name] for name in columns]
        params.append(user_id)
        conn.execute(f'UPDATE users SET {set_clause} WHERE id = ?', params)
        conn.commit()
        return conn.total_changes > 0
    except Exception as err:
        logger.error(f"Error updating user {user_id}: {err}")
        return False
    finally:
        conn.close()
def get_ai_usage_stats(self) -> Dict:
    """Get aggregate AI usage statistics.

    Returns:
        Dictionary with request counts and token sums for all time
        (``total_requests``/``total_tokens``), the last 24 hours
        (``requests_24h``/``tokens_24h``) and the last 7 days
        (``requests_7d``/``tokens_7d``); empty dictionary on error
    """
    # (query, key for the row count, key for the token sum, token column alias)
    reports = (
        (
            'SELECT COUNT(*) as count, COALESCE(SUM(tokens_total), 0) as total_tokens FROM ai_usage',
            'total_requests', 'total_tokens', 'total_tokens',
        ),
        (
            "SELECT COUNT(*) as count, COALESCE(SUM(tokens_total), 0) as tokens FROM ai_usage WHERE created_at > datetime('now', '-24 hours')",
            'requests_24h', 'tokens_24h', 'tokens',
        ),
        (
            "SELECT COUNT(*) as count, COALESCE(SUM(tokens_total), 0) as tokens FROM ai_usage WHERE created_at > datetime('now', '-7 days')",
            'requests_7d', 'tokens_7d', 'tokens',
        ),
    )
    conn = self._get_conn()
    try:
        summary = {}
        for sql, requests_key, tokens_key, tokens_column in reports:
            totals = conn.execute(sql).fetchone()
            summary[requests_key] = totals['count']
            summary[tokens_key] = totals[tokens_column]
        return summary
    except Exception as err:
        logger.error(f"Error fetching AI usage stats: {err}")
        return {}
    finally:
        conn.close()
def close(self):
    """Do nothing: every operation opens and closes its own connection.

    Kept so callers written against a long-lived-connection API can still
    call ``close()``.
    """
    return None
+ + Args: + excel_path: Path to the Excel file with metadata + """ + self.excel_path = Path(excel_path) + self.filename_to_metadata = {} + self._load_excel() + + def _load_excel(self): + """Load and index the Excel file from multiple sheets.""" + try: + logger.info(f"Loading metadata from: {self.excel_path}") + + # Load Sheet 1: DSB Celum ID to Path mapping + self._load_dsb_sheet() + + # Load Sheet 2: Medsurg Metadata Cheat (fallback) + self._load_medsurg_sheet() + + logger.info(f"✅ Total loaded: {len(self.filename_to_metadata)} metadata records") + + except Exception as e: + logger.error(f"Failed to load Excel file: {e}", exc_info=True) + raise + + def _load_dsb_sheet(self): + """Load DSB Celum ID to Path mapping sheet.""" + try: + df = pd.read_excel( + self.excel_path, + sheet_name="DSB Celum ID to Path mapping" + ) + + # Skip header row (first row contains template) + df = df[df['Celum ID'].notna()][1:] + + count = 0 + for _, row in df.iterrows(): + filename = row.get('File Name') + if pd.notna(filename): + # Get filename without extension for indexing + filename_stem = Path(str(filename).strip()).stem.lower() + + metadata = { + 'celum_id': str(row['Celum ID']) if pd.notna(row.get('Celum ID')) else '', + 'title': str(row['Title']) if pd.notna(row.get('Title')) else '', + 'description': str(row['External Description/Alt Text']) if pd.notna(row.get('External Description/Alt Text')) else '', + 'business': str(row['Business']) if pd.notna(row.get('Business')) else '', + 'original_filename': str(filename).strip(), + 'source_sheet': 'DSB' + } + + # Only add if not already exists + if filename_stem not in self.filename_to_metadata: + self.filename_to_metadata[filename_stem] = metadata + count += 1 + + logger.info(f"✅ Loaded {count} records from DSB sheet") + + except Exception as e: + logger.warning(f"Failed to load DSB sheet: {e}") + + def _load_medsurg_sheet(self): + """Load Medsurg Metadata Cheat sheet.""" + try: + df = pd.read_excel( + self.excel_path, + 
def lookup_by_filename(self, filename: str) -> Optional[Dict[str, str]]:
    """
    Lookup metadata by filename (ignoring extension).

    Only the final path component is used; its last extension is removed
    and the rest is lower-cased before the index is consulted.

    Args:
        filename: Name of the file (with or without extension)

    Returns:
        Dictionary with metadata fields, or None if not found
    """
    key = Path(filename).stem.lower()

    try:
        record = self.filename_to_metadata[key]
    except KeyError:
        logger.warning(f"⚠️ No metadata found for: {filename} (searched: {key})")
        return None

    logger.info(f"✅ Found match for: {filename} (from {record.get('source_sheet', 'unknown')} sheet)")
    return record
a/src/extractors/exiftool_extractor.py b/src/extractors/exiftool_extractor.py new file mode 100644 index 0000000..7ac7fdd --- /dev/null +++ b/src/extractors/exiftool_extractor.py @@ -0,0 +1,174 @@ +"""Unified metadata extractor using ExifTool for images, video, and PDF files.""" + +from typing import Dict, Optional +from pathlib import Path +import logging + +try: + from exiftool import ExifToolHelper + EXIFTOOL_AVAILABLE = True +except ImportError: + EXIFTOOL_AVAILABLE = False + +from ..base_extractor import BaseExtractor +from ..utils import get_logger + +logger = get_logger(__name__) + + +class ExifToolExtractor(BaseExtractor): + """ + Extract metadata using ExifTool. + + Supports images (JPEG, PNG, GIF, TIFF, HEIC, RAW), + videos (MP4, MOV, AVI, MKV), and PDF metadata extraction. + + Note: This does NOT extract content (text) from files - only metadata. + For content extraction, use the regular extractors (PDFExtractor, ImageExtractor with OCR). + """ + + # Map ExifTool tags to our standard metadata fields + TAG_MAPPING = { + # Images (JPEG/PNG/TIFF) + 'EXIF:ImageDescription': 'title', + 'XMP:Description': 'subject', + 'IPTC:Caption-Abstract': 'subject', + 'IPTC:Headline': 'title', + 'XMP:Title': 'title', + 'EXIF:XPSubject': 'subject', + 'EXIF:XPKeywords': 'keywords', + 'IPTC:Keywords': 'keywords', + 'XMP:Subject': 'keywords', + + # PDF + 'PDF:Title': 'title', + 'PDF:Subject': 'subject', + 'PDF:Keywords': 'keywords', + + # Video (QuickTime/MP4) + 'QuickTime:Title': 'title', + 'QuickTime:Description': 'subject', + 'QuickTime:Keywords': 'keywords', + 'UserData:Title': 'title', + 'UserData:Description': 'subject', + } + + def __init__(self): + """Initialize ExifTool extractor.""" + if not EXIFTOOL_AVAILABLE: + raise ImportError( + "PyExifTool not installed. Install with: pip install PyExifTool>=0.5.6\n" + "Also ensure ExifTool is installed on your system." 
def read_metadata(self, file_path: str) -> Dict[str, str]:
    """
    Read metadata using ExifTool.

    Walks ``TAG_MAPPING`` in order and, for each standard field, keeps the
    first tag whose value is non-blank. List values are joined with ", ".

    Args:
        file_path: Path to the file

    Returns:
        Dictionary with metadata (title, subject, keywords); every value is
        an empty string when ExifTool returns nothing or fails
    """
    field_names = ('title', 'subject', 'keywords')
    try:
        with ExifToolHelper() as et:
            records = et.get_metadata([file_path])
            if not records:
                logger.warning(f"No metadata returned by ExifTool for {file_path}")
                return dict.fromkeys(field_names, '')

            tags = records[0]
            found = dict.fromkeys(field_names, '')

            for tag_name, field in self.TAG_MAPPING.items():
                raw = tags[tag_name] if tag_name in tags else None
                if not raw:
                    continue

                # Keywords often arrive as arrays
                if isinstance(raw, list):
                    text = ', '.join(map(str, raw))
                else:
                    text = str(raw)
                text = text.strip()

                # Earlier TAG_MAPPING entries take priority
                if not found[field] and text:
                    found[field] = text

            logger.info(f"Extracted metadata from {Path(file_path).name}: "
                        f"title={bool(found['title'])}, "
                        f"subject={bool(found['subject'])}, "
                        f"keywords={bool(found['keywords'])}")

            return found

    except Exception as err:
        logger.error(f"ExifTool extraction failed for {file_path}: {err}")
        return {'title': '', 'subject': '', 'keywords': ''}
def __init__(self):
    """Initialize image extractor.

    Points pytesseract at the Tesseract binary named by
    ``Config.TESSERACT_PATH`` (when that file exists) and records the OCR
    language string from ``Config.OCR_LANGUAGES``.
    """
    self.tesseract_path = Config.TESSERACT_PATH
    if self.tesseract_path:
        if os.path.exists(self.tesseract_path):
            # pytesseract reads the executable location from `tesseract_cmd`.
            # The attribute previously assigned here (`pytesseract_cmd`) does
            # not exist, so the configured path was silently ignored.
            pytesseract.pytesseract.tesseract_cmd = self.tesseract_path
        else:
            logger.warning(
                f"TESSERACT_PATH does not exist: {self.tesseract_path} - using pytesseract default"
            )
    # Get OCR languages from config (supports Chinese, Japanese, Korean, etc.)
    self.ocr_lang = Config.OCR_LANGUAGES
def _read_exif_metadata(self, file_path: str) -> Dict[str, str]:
    """
    Read EXIF metadata from image.

    JPEG files are read with piexif first (0th IFD only). If that fails, or
    for any other format, Pillow's EXIF reader is used instead. Keys are
    lower-cased tag names (``tag_<id>`` for ids piexif does not know);
    values are stripped strings.

    Args:
        file_path: Path to image file

    Returns:
        Dictionary of EXIF metadata; empty when nothing could be read
    """
    def tag_label(tag_id):
        # Unknown ids get a synthetic name. Previously an unknown id in the
        # piexif branch raised KeyError and discarded every tag already read.
        return piexif.TAGS["0th"].get(tag_id, {}).get("name", f"tag_{tag_id}").lower()

    def as_text(value):
        if isinstance(value, bytes):
            value = value.decode('utf-8', errors='ignore')
        return str(value).strip()

    try:
        # Try piexif first for JPEG
        if file_path.lower().endswith(('.jpg', '.jpeg')):
            try:
                exif_dict = piexif.load(file_path)
                return {
                    tag_label(tag): as_text(value)
                    for tag, value in exif_dict.get("0th", {}).items()
                }
            except Exception as e:
                logger.debug(f"piexif extraction failed: {e}")

        # Fallback to PIL for all image types. The context manager closes the
        # file handle, and the EXIF block is parsed only once.
        with Image.open(file_path) as image:
            read_exif = getattr(image, '_getexif', None)
            exif_data = read_exif() if read_exif is not None else None

        if not exif_data:
            return {}
        return {tag_label(tag_id): as_text(value) for tag_id, value in exif_data.items()}

    except Exception as e:
        logger.debug(f"EXIF metadata extraction failed: {e}")
        return {}
def _extract_xlsx_content(self, file_path: str) -> str:
    """
    Extract text content from XLSX file.

    Each sheet contributes a "Sheet: <name>" header followed by one line
    per non-empty row, with cell values joined by " | ".

    Args:
        file_path: Path to the XLSX file

    Returns:
        Cleaned text content, or an empty string if extraction fails
    """
    try:
        logger.info(f"Extracting content from XLSX: {file_path}")
        # Read-only mode streams rows instead of building the full cell
        # model; the workbook then holds the file open until close().
        workbook = load_workbook(file_path, read_only=True)
        content_parts = []

        try:
            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                content_parts.append(f"Sheet: {sheet_name}")

                for row in sheet.iter_rows(values_only=True):
                    cells = [str(cell) if cell is not None else "" for cell in row]
                    # Test the cells, not the joined text: a blank row still
                    # joins to "|  |", which is truthy.
                    if any(text.strip() for text in cells):
                        content_parts.append(" | ".join(cells))
        finally:
            workbook.close()

        content = "\n".join(content_parts)
        cleaned_content = self.clean_text(content)
        logger.info(f"Successfully extracted {len(cleaned_content)} characters from XLSX")
        return cleaned_content
    except Exception as e:
        logger.error(f"Failed to extract XLSX content: {e}", exc_info=True)
        return ""
def _read_xlsx_metadata(self, file_path: str) -> Dict[str, str]:
    """
    Read metadata from XLSX file.

    Args:
        file_path: Path to the XLSX file

    Returns:
        Dictionary with any non-empty values among: title, subject,
        keywords, author, comments, category. Empty on failure.
    """
    try:
        logger.info(f"Reading metadata from XLSX: {file_path}")
        # Only the document properties are needed, so avoid loading cells.
        workbook = load_workbook(file_path, read_only=True)
        try:
            props = workbook.properties

            # openpyxl names these Dublin Core properties "creator" and
            # "description"; it has no "author" / "comments" attributes,
            # so reading those names always produced empty values.
            metadata = {
                'title': getattr(props, 'title', '') or '',
                'subject': getattr(props, 'subject', '') or '',
                'keywords': getattr(props, 'keywords', '') or '',
                'author': getattr(props, 'creator', '') or '',
                'comments': getattr(props, 'description', '') or '',
                'category': getattr(props, 'category', '') or '',
            }
        finally:
            workbook.close()

        # Remove empty values
        metadata = {k: v for k, v in metadata.items() if v}
        logger.info(f"Successfully read metadata from XLSX")
        return metadata
    except Exception as e:
        logger.error(f"Failed to read XLSX metadata: {e}", exc_info=True)
        return {}
class PDFExtractor(BaseExtractor):
    """Extractor for PDF files with fallback to OCR."""

    # Minimum number of stripped characters for a strategy's output to be
    # accepted; below this the next strategy is tried.
    MIN_TEXT_CHARS = 100
    MIN_OCR_CHARS = 50

    def __init__(self):
        """Initialize PDF extractor from Config (Tesseract path, page limit)."""
        self.tesseract_path = Config.TESSERACT_PATH
        if self.tesseract_path and os.path.exists(self.tesseract_path):
            # The module attribute pytesseract reads is `tesseract_cmd`;
            # assigning any other name is silently ignored.
            pytesseract.pytesseract.tesseract_cmd = self.tesseract_path
        self.max_pages = Config.PDF_MAX_PAGES

    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from PDF using multiple fallback strategies.

        First tries pypdf, then pdfplumber, then OCR if both return too
        little text. Extraction is limited to the first
        ``Config.PDF_MAX_PAGES`` pages.

        Args:
            file_path: Path to the PDF file

        Returns:
            Cleaned text content, or an empty string when every strategy
            fails or yields too little text (errors are logged, not raised)
        """
        try:
            logger.info(f"Starting PDF extraction from {file_path}")

            # Strategy 1: Try pypdf
            content = self._extract_with_pypdf(file_path)
            if content and len(content.strip()) > self.MIN_TEXT_CHARS:
                logger.info(f"Successfully extracted {len(content)} characters using pypdf")
                return self.clean_text(content)

            logger.debug("pypdf returned minimal content, trying pdfplumber")

            # Strategy 2: Try pdfplumber
            content = self._extract_with_pdfplumber(file_path)
            if content and len(content.strip()) > self.MIN_TEXT_CHARS:
                logger.info(f"Successfully extracted {len(content)} characters using pdfplumber")
                return self.clean_text(content)

            logger.debug("pdfplumber returned minimal content, attempting OCR")

            # Strategy 3: Try OCR as last resort
            content = self._extract_with_ocr(file_path)
            if content and len(content.strip()) > self.MIN_OCR_CHARS:
                logger.info(f"Successfully extracted {len(content)} characters using OCR")
                return self.clean_text(content)

            logger.warning(f"All extraction methods returned minimal content for {file_path}")
            return ""

        except Exception as e:
            logger.error(f"Failed to extract PDF content from {file_path}: {e}", exc_info=True)
            return ""

    def _extract_with_pypdf(self, file_path: str) -> str:
        """
        Extract text using pypdf library.

        Pages that fail to extract are skipped.

        Args:
            file_path: Path to PDF file

        Returns:
            Extracted text (pages joined by newlines), or "" on failure
        """
        try:
            content = []
            with open(file_path, 'rb') as f:
                pdf_reader = pypdf.PdfReader(f)
                num_pages = min(len(pdf_reader.pages), self.max_pages)

                for page_num in range(num_pages):
                    try:
                        text = pdf_reader.pages[page_num].extract_text()
                        if text:
                            content.append(text)
                    except Exception as e:
                        logger.debug(f"Error extracting page {page_num} with pypdf: {e}")
                        continue

            return "\n".join(content)

        except Exception as e:
            logger.debug(f"pypdf extraction failed: {e}")
            return ""

    def _extract_with_pdfplumber(self, file_path: str) -> str:
        """
        Extract text using pdfplumber library.

        Pages that fail to extract are skipped.

        Args:
            file_path: Path to PDF file

        Returns:
            Extracted text (pages joined by newlines), or "" on failure
        """
        try:
            content = []
            with pdfplumber.open(file_path) as pdf:
                num_pages = min(len(pdf.pages), self.max_pages)

                for page_num in range(num_pages):
                    try:
                        text = pdf.pages[page_num].extract_text()
                        if text:
                            content.append(text)
                    except Exception as e:
                        logger.debug(f"Error extracting page {page_num} with pdfplumber: {e}")
                        continue

            return "\n".join(content)

        except Exception as e:
            logger.debug(f"pdfplumber extraction failed: {e}")
            return ""

    def _extract_with_ocr(self, file_path: str) -> str:
        """
        Extract text using OCR via pdf2image and pytesseract.

        Args:
            file_path: Path to PDF file

        Returns:
            Recognised text (pages joined by newlines), or "" on failure
        """
        try:
            content = []

            # Rasterise only the pages that will be OCR'd; converting the
            # whole document first wastes time and memory on large PDFs.
            images = convert_from_path(file_path, last_page=self.max_pages)

            # Limit to max_pages (kept as a guard on the returned list)
            images = images[:self.max_pages]

            # Get OCR languages from config (supports Chinese, Japanese, Korean, etc.)
            ocr_lang = Config.OCR_LANGUAGES

            # Apply OCR to each image
            for page_num, image in enumerate(images):
                try:
                    text = pytesseract.image_to_string(image, lang=ocr_lang)
                    if text:
                        content.append(text)
                except Exception as e:
                    logger.debug(f"Error running OCR on page {page_num}: {e}")
                    continue

            return "\n".join(content)

        except Exception as e:
            logger.debug(f"OCR extraction failed: {e}")
            return ""

    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read PDF metadata from document properties.

        Extracts standard PDF metadata fields: Title, Subject, Keywords, Author, Creator.

        Args:
            file_path: Path to PDF file

        Returns:
            Dictionary of metadata fields with lowercase keys; empty when
            the document has no metadata or reading fails (errors are
            logged, not raised)
        """
        metadata = {}

        # Map PDF metadata fields to standardized keys
        field_mapping = {
            '/Title': 'title',
            '/Subject': 'subject',
            '/Keywords': 'keywords',
            '/Author': 'author',
            '/Creator': 'creator',
        }

        try:
            with open(file_path, 'rb') as f:
                pdf_reader = pypdf.PdfReader(f)

                # Get document information
                doc_info = pdf_reader.metadata

                if doc_info:
                    for pdf_field, standard_field in field_mapping.items():
                        try:
                            value = doc_info.get(pdf_field)
                            if value:
                                # Convert bytes to string if necessary
                                if isinstance(value, bytes):
                                    value = value.decode('utf-8', errors='ignore')
                                metadata[standard_field] = str(value).strip()
                        except Exception as e:
                            logger.debug(f"Error reading field {pdf_field}: {e}")
                            continue

            logger.info(f"Successfully read metadata from {file_path}")
            return metadata

        except Exception as e:
            logger.error(f"Failed to read PDF metadata from {file_path}: {e}", exc_info=True)
            return {}
def _read_with_mutagen(self, file_path: str) -> Dict[str, str]:
    """
    Read video metadata using mutagen.

    Known tag ids are mapped to standard names; any other tag is kept
    under its lower-cased id. Falls back to pymediainfo when mutagen is
    not installed or fails.

    Args:
        file_path: Path to video file

    Returns:
        Dictionary of metadata (text values only, no empty entries)
    """
    try:
        from mutagen import File
    except ImportError:
        logger.warning("mutagen not installed, attempting pymediainfo fallback")
        return self._read_with_pymediainfo(file_path)

    # Extract common tags
    tag_mapping = {
        'TIT2': 'title',
        '\xa9nam': 'title',
        'Title': 'title',
        'TIT3': 'subtitle',
        '\xa9cmt': 'comments',
        'Comments': 'comments',
        'TPE1': 'artist',
        '\xa9ART': 'artist',
        'Artist': 'artist',
        'TALB': 'album',
        '\xa9alb': 'album',
        'Album': 'album',
        'TXXX:KEYWORDS': 'keywords',
        'TXXX:Description': 'description',
    }

    try:
        audio = File(file_path)
        metadata = {}

        if audio is None:
            return metadata

        for key, value in audio.items():
            # Multi-valued tags: use the first value only.
            if isinstance(value, (list, tuple)):
                if not value:
                    continue
                value = value[0]

            if isinstance(value, (bytes, bytearray)):
                # Byte payloads are either UTF-8 text or binary blobs such
                # as cover art. str() on them would store a b'...' repr,
                # so decode text and drop anything that is not valid UTF-8.
                try:
                    value = bytes(value).decode('utf-8')
                except UnicodeDecodeError:
                    continue

            text = str(value).strip()
            if not text:
                continue

            # Direct mapping first, generic lower-cased key otherwise
            metadata[tag_mapping.get(key, key.lower())] = text

        return metadata

    except Exception as e:
        logger.debug(f"Mutagen extraction failed: {e}")
        return self._read_with_pymediainfo(file_path)
class FieldMapper:
    """
    Map source fields to standard metadata fields with fuzzy matching.

    Source names are normalised (trimmed, lower-cased, spaces and hyphens
    turned into underscores) and compared against FIELD_ALIASES using
    difflib similarity, with fixed scores for exact and substring matches.
    Named mapping presets are persisted as JSON at ``presets_path``.
    """

    # Standard metadata fields used in Oliver Metadata Tool
    STANDARD_FIELDS = ['title', 'subject', 'keywords', 'description']

    # Common aliases for fuzzy matching (case-insensitive)
    FIELD_ALIASES = {
        'title': [
            'title', 'name', 'heading', 'filename', 'file_name', 'document_title',
            'asset_title', 'resource_title', 'object_name', 'label'
        ],
        'subject': [
            'subject', 'description', 'summary', 'abstract', 'alt_text',
            'external_description', 'caption', 'about', 'overview', 'details',
            'desc', 'long_description', 'content'
        ],
        'keywords': [
            'keywords', 'tags', 'categories', 'labels', 'subjects', 'topics',
            'taxonomy', 'classification', 'key_words', 'search_terms'
        ],
        'description': [
            'description', 'desc', 'summary', 'notes', 'comments', 'remarks',
            'details', 'about', 'information', 'info'
        ]
    }

    # Similarity threshold for fuzzy matching (0.0 to 1.0)
    SIMILARITY_THRESHOLD = 0.6

    # Threshold used by auto_map(strict=True)
    STRICT_THRESHOLD = 0.8

    # Minimum score granted when the source name contains an alias or
    # vice versa
    SUBSTRING_SCORE = 0.85

    def __init__(self, presets_path: Optional[str] = None):
        """
        Initialize field mapper.

        Args:
            presets_path: Path to JSON file for saving/loading mapping presets
                (defaults to 'field_mapping_presets.json')
        """
        self.presets_path = presets_path or 'field_mapping_presets.json'

    @staticmethod
    def _normalize(source_field) -> str:
        """Return the comparison form of a source field name."""
        # str() tolerates non-string headers (e.g. numeric column names).
        return str(source_field).strip().lower().replace(' ', '_').replace('-', '_')

    def _score_against(self, normalized: str, standard_field: str) -> float:
        """Return the best similarity of *normalized* to any alias of *standard_field*."""
        best_score = 0.0
        for alias in self.FIELD_ALIASES[standard_field]:
            # Exact match
            if normalized == alias:
                return 1.0

            score = SequenceMatcher(None, normalized, alias).ratio()

            # Substring match bonus
            if alias in normalized or normalized in alias:
                score = max(score, self.SUBSTRING_SCORE)

            best_score = max(best_score, score)
        return best_score

    def auto_map(self, source_fields: List[str], strict: bool = False) -> Dict[str, Tuple[str, float]]:
        """
        Automatically map source fields to standard fields using fuzzy matching.

        Args:
            source_fields: List of field names from source data
            strict: If True, only accept matches above high confidence threshold (0.8)

        Returns:
            Dictionary mapping {source_field: (target_field, confidence_score)}
            Example: {'File Name': ('title', 1.0), 'Alt Text': ('subject', 1.0)}
            Source fields with no match at or above the threshold are omitted.
        """
        mapping = {}
        threshold = self.STRICT_THRESHOLD if strict else self.SIMILARITY_THRESHOLD

        for source_field in source_fields:
            best_match = self._find_best_match(source_field, threshold)
            if best_match:
                target_field, score = best_match
                mapping[source_field] = (target_field, score)
                logger.info(f"Auto-mapped '{source_field}' -> '{target_field}' (confidence: {score:.2f})")

        return mapping

    def _find_best_match(self, source_field: str, threshold: float = 0.6) -> Optional[Tuple[str, float]]:
        """
        Find best matching standard field for source field.

        Args:
            source_field: Source field name
            threshold: Minimum similarity score (0.0 to 1.0)

        Returns:
            Tuple of (target_field, confidence_score) or None
        """
        normalized = self._normalize(source_field)

        # An empty name is a substring of every alias and would otherwise
        # "match" the first field at the substring score.
        if not normalized:
            return None

        # A source named exactly like a standard field maps to that field,
        # even when the same word is also listed as another field's alias
        # (e.g. 'description' under 'subject').
        if normalized in self.STANDARD_FIELDS and threshold <= 1.0:
            return (normalized, 1.0)

        best_score = 0.0
        best_field = None

        for standard_field in self.FIELD_ALIASES:
            score = self._score_against(normalized, standard_field)
            # Strict '>' keeps the first field on ties (dict order).
            if score > best_score and score >= threshold:
                best_score = score
                best_field = standard_field

        if best_field:
            return (best_field, best_score)
        return None

    def validate_mapping(self, mapping: Dict[str, str]) -> Dict[str, List[str]]:
        """
        Validate a field mapping configuration.

        Args:
            mapping: Dictionary mapping {source_field: target_field}

        Returns:
            Dictionary with validation results:
            {
                'valid': [list of valid mappings],
                'invalid': [list of invalid mappings],
                'warnings': [list of warnings]
            }
        """
        result = {
            'valid': [],
            'invalid': [],
            'warnings': []
        }

        # Track which target fields are used
        target_usage = {}

        for source_field, target_field in mapping.items():
            # Check if target field is valid
            if target_field not in self.STANDARD_FIELDS:
                result['invalid'].append(
                    f"'{target_field}' is not a valid target field (source: '{source_field}')"
                )
                continue

            result['valid'].append(f"'{source_field}' -> '{target_field}'")
            target_usage.setdefault(target_field, []).append(source_field)

        # Warn about multiple sources mapping to same target
        for target_field, sources in target_usage.items():
            if len(sources) > 1:
                result['warnings'].append(
                    f"Multiple source fields map to '{target_field}': {', '.join(sources)}"
                )

        return result

    def apply_mapping(self, data: Dict[str, str], mapping: Dict[str, str]) -> Dict[str, str]:
        """
        Apply field mapping to transform source data to standard format.

        Values are converted to strings. Missing, None and blank values
        are skipped so they neither overwrite nor pad a target. Several
        sources mapped to the same target are joined with "; ".

        Args:
            data: Source data dictionary
            mapping: Field mapping {source_field: target_field}

        Returns:
            Transformed data with every standard field name present
            ('' when nothing was mapped to it)
        """
        result = {field: '' for field in self.STANDARD_FIELDS}

        for source_field, target_field in mapping.items():
            if source_field not in data or target_field not in self.STANDARD_FIELDS:
                continue

            value = data[source_field]
            if value is None:
                continue
            value = str(value)
            if not value.strip():
                continue

            # Handle multiple values mapping to same target (concatenate)
            if result[target_field]:
                result[target_field] += f"; {value}"
            else:
                result[target_field] = value

        return result

    def _write_presets(self, presets: Dict) -> None:
        """Write the complete presets dictionary to ``presets_path`` as JSON."""
        with open(self.presets_path, 'w', encoding='utf-8') as f:
            json.dump(presets, f, indent=2)

    def save_preset(self, name: str, mapping: Dict[str, str], description: str = ""):
        """
        Save mapping preset to file.

        Args:
            name: Preset name (an existing preset of that name is replaced)
            mapping: Field mapping dictionary
            description: Optional description

        Raises:
            Exception: Re-raised after logging if the file cannot be written
        """
        presets = self._load_presets()

        presets[name] = {
            'mapping': mapping,
            'description': description,
            'created_at': self._get_timestamp()
        }

        try:
            self._write_presets(presets)
            logger.info(f"Saved mapping preset: {name}")
        except Exception as e:
            logger.error(f"Failed to save preset '{name}': {e}")
            raise

    def load_preset(self, name: str) -> Optional[Dict[str, str]]:
        """
        Load mapping preset from file.

        Args:
            name: Preset name

        Returns:
            Mapping dictionary or None if not found
        """
        presets = self._load_presets()

        if name in presets:
            logger.info(f"Loaded mapping preset: {name}")
            return presets[name].get('mapping', {})

        logger.warning(f"Preset not found: {name}")
        return None

    def list_presets(self) -> List[Dict[str, object]]:
        """
        List all saved presets.

        Returns:
            List of preset information dictionaries with keys 'name',
            'description', 'created_at' and 'fields' (number of mapped fields)
        """
        presets = self._load_presets()

        return [
            {
                'name': name,
                'description': data.get('description', ''),
                'created_at': data.get('created_at', ''),
                'fields': len(data.get('mapping', {}))
            }
            for name, data in presets.items()
        ]

    def delete_preset(self, name: str) -> bool:
        """
        Delete a mapping preset.

        Args:
            name: Preset name

        Returns:
            True if deleted, False if not found

        Raises:
            Exception: Re-raised after logging if the file cannot be written
        """
        presets = self._load_presets()

        if name not in presets:
            return False

        del presets[name]

        try:
            self._write_presets(presets)
            logger.info(f"Deleted mapping preset: {name}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete preset '{name}': {e}")
            raise

    def suggest_mapping(self, source_fields: List[str]) -> Dict:
        """
        Generate mapping suggestions with confidence scores and alternatives.

        Args:
            source_fields: List of source field names

        Returns:
            Dictionary with suggestions:
            {
                'source_field': {
                    'best_match': 'target_field',
                    'confidence': 0.85,
                    'alternatives': [
                        {'field': 'other_target', 'confidence': 0.65},
                        ...
                    ]
                }
            }
            Fields without any candidate get best_match None, confidence 0.0
            and no alternatives.
        """
        suggestions = {}

        for source_field in source_fields:
            # Find all potential matches
            matches = self._find_all_matches(source_field)

            if matches:
                best_field, best_score = matches[0]
                suggestions[source_field] = {
                    'best_match': best_field,
                    'confidence': best_score,
                    'alternatives': [
                        {'field': field, 'confidence': score}
                        for field, score in matches[1:3]  # Top 2 alternatives
                    ]
                }
            else:
                suggestions[source_field] = {
                    'best_match': None,
                    'confidence': 0.0,
                    'alternatives': []
                }

        return suggestions

    def _find_all_matches(self, source_field: str, min_threshold: float = 0.4) -> List[Tuple[str, float]]:
        """
        Find all matching standard fields above threshold, sorted by score.

        Args:
            source_field: Source field name
            min_threshold: Minimum similarity score

        Returns:
            List of (target_field, score) tuples sorted by score descending;
            empty for a blank source name
        """
        normalized = self._normalize(source_field)
        if not normalized:
            return []

        matches = []
        for standard_field in self.FIELD_ALIASES:
            score = self._score_against(normalized, standard_field)
            if score >= min_threshold:
                matches.append((standard_field, score))

        # Sort by score descending. On ties, a field whose own name equals
        # the source comes first; otherwise declaration order is kept
        # because the sort is stable.
        matches.sort(key=lambda match: (-match[1], match[0] != normalized))
        return matches

    def _load_presets(self) -> Dict:
        """
        Load all presets from file.

        Returns:
            The presets dictionary, or {} when the file is missing,
            unreadable or does not contain a JSON object
        """
        if not Path(self.presets_path).exists():
            return {}

        try:
            with open(self.presets_path, 'r', encoding='utf-8') as f:
                presets = json.load(f)
        except Exception as e:
            logger.error(f"Failed to load presets: {e}")
            return {}

        if not isinstance(presets, dict):
            logger.error(f"Failed to load presets: expected a JSON object, got {type(presets).__name__}")
            return {}
        return presets

    def _get_timestamp(self) -> str:
        """Get current local timestamp as ISO format string."""
        from datetime import datetime
        return datetime.now().isoformat()

    def get_unmapped_fields(self, source_fields: List[str], mapping: Dict[str, str]) -> List[str]:
        """
        Get list of source fields that are not mapped.

        Args:
            source_fields: All source field names
            mapping: Current mapping dictionary

        Returns:
            List of unmapped source fields (original order preserved)
        """
        return [field for field in source_fields if field not in mapping]

    def get_mapping_coverage(self, source_fields: List[str], mapping: Dict[str, str]) -> Dict:
        """
        Calculate mapping coverage statistics.

        Only mapping entries whose source is in *source_fields* are
        counted, so mapped + unmapped always equals the total and coverage
        cannot exceed 100%.

        Args:
            source_fields: All source field names
            mapping: Current mapping dictionary

        Returns:
            Statistics dictionary with coverage info
        """
        total_fields = len(source_fields)
        mapped = [field for field in source_fields if field in mapping]
        unmapped = self.get_unmapped_fields(source_fields, mapping)

        # Count unique target fields used by the mapped source fields
        unique_targets = len({mapping[field] for field in mapped})

        return {
            'total_source_fields': total_fields,
            'mapped_fields': len(mapped),
            'unmapped_fields': len(unmapped),
            'coverage_percent': (len(mapped) / total_fields * 100) if total_fields > 0 else 0,
            'unique_targets_used': unique_targets,
            'unmapped_field_list': unmapped
        }
@classmethod
def get_file_type_name(cls, file_type: FileType) -> str:
    """
    Return the display label for a file type.

    Args:
        file_type: FileType enum value

    Returns:
        Human-readable name, or "Unknown" for a value with no label
    """
    labels = dict((
        (FileType.PDF, "PDF Document"),
        (FileType.IMAGE, "Image"),
        (FileType.OFFICE_DOC, "Word Document"),
        (FileType.OFFICE_SHEET, "Excel Spreadsheet"),
        (FileType.OFFICE_PRESENTATION, "PowerPoint Presentation"),
        (FileType.VIDEO, "Video"),
        (FileType.UNSUPPORTED, "Unsupported File"),
    ))
    try:
        return labels[file_type]
    except KeyError:
        return "Unknown"
b/src/main.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 +"""Main CLI application for metadata automation.""" + +import sys +import argparse +from pathlib import Path +from typing import List, Dict +from tqdm import tqdm +import csv +from datetime import datetime + +# Import project modules +from .config import Config +from .file_detector import FileDetector, FileType +from .metadata_analyzer import MetadataAnalyzer +from .utils import ( + create_backup, get_logger, format_metadata_comparison, + validate_file_path, create_report_entry +) + +# Import extractors +from .extractors.pdf_extractor import PDFExtractor +from .extractors.image_extractor import ImageExtractor +from .extractors.office_extractor import OfficeExtractor +from .extractors.video_extractor import VideoExtractor + +# Import updaters +from .updaters.pdf_updater import PDFUpdater +from .updaters.image_updater import ImageUpdater +from .updaters.office_updater import OfficeUpdater +from .updaters.video_updater import VideoUpdater + +logger = get_logger(__name__) + +class MetadataProcessor: + """Main processor for metadata automation.""" + + def __init__(self, preview_mode: bool = False): + """ + Initialize the processor. 
def process_file(self, file_path: str) -> bool:
    """
    Process a single file: extract, analyze, report and (unless previewing) update.

    A report row is appended to ``self.report_data`` once new metadata has
    been generated. Its status ends up as "preview", "success" or "failed";
    it is never left at "pending" when the update step does not complete.

    Args:
        file_path: Path to the file

    Returns:
        True if successful
    """
    # Report row for this file; stays None until the analysis stage is done.
    report_entry = None
    # Flips to True once the updater reports a successful write.
    metadata_written = False

    try:
        logger.info(f"\nProcessing: {file_path}")

        # Validate file
        if not validate_file_path(file_path):
            logger.error(f"Invalid file path: {file_path}")
            return False

        # Detect file type
        file_type = FileDetector.detect_file_type(file_path)

        if file_type == FileType.UNSUPPORTED:
            logger.warning(f"Unsupported file type: {file_path}")
            return False

        logger.info(f"File type: {FileDetector.get_file_type_name(file_type)}")

        # Get appropriate extractor
        extractor = self.extractors.get(file_type)
        if not extractor:
            logger.error(f"No extractor found for {file_type}")
            return False

        # Extract content and current metadata
        logger.info("Extracting content...")
        content = extractor.extract_content(file_path)

        if not content or len(content.strip()) < 10:
            logger.warning("Insufficient content extracted, using filename only")
            content = Path(file_path).stem

        logger.info(f"Extracted {len(content)} characters")

        logger.info("Reading current metadata...")
        old_metadata = extractor.read_metadata(file_path)

        # Analyze content and generate new metadata
        logger.info("Analyzing content with AI...")
        filename = Path(file_path).name
        new_metadata = self.analyzer.analyze_content(content, filename, file_type)

        # Display comparison
        print(format_metadata_comparison(old_metadata, new_metadata))

        # Store report data (keep a handle so later stages can set the status)
        report_entry = create_report_entry(
            file_path, file_type.value, old_metadata, new_metadata,
            "preview" if self.preview_mode else "pending"
        )
        self.report_data.append(report_entry)

        if self.preview_mode:
            logger.info("[PREVIEW MODE] Changes not applied")
            return True

        # Update metadata
        updater = self.updaters.get(file_type)
        if not updater:
            logger.error(f"No updater found for {file_type}")
            report_entry['status'] = 'failed'
            return False

        logger.info("Updating metadata...")
        if not updater.update_metadata(file_path, new_metadata, backup=True):
            logger.error("✗ Failed to update metadata")
            report_entry['status'] = 'failed'
            return False

        metadata_written = True
        logger.info("✓ Metadata updated successfully!")
        report_entry['status'] = 'success'

        # Verify metadata (a failed verification is only a warning)
        if updater.verify_metadata(file_path, new_metadata):
            logger.info("✓ Metadata verified!")
        else:
            logger.warning("⚠ Metadata verification failed")

        return True

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}", exc_info=True)
        # If the row exists but the write never succeeded, record the failure
        # instead of leaving the row at its initial status.
        if report_entry is not None and not metadata_written:
            report_entry['status'] = 'failed'
        return False
def main():
    """
    Main CLI entry point.

    Parses the command line, processes either a directory (batch mode) or a
    single file, and then saves the CSV report. The report is written before
    the process exits, so ``--report`` also works in single-file mode. A
    directory passed as the positional ``input`` is processed in batch mode.

    Exit status: in single-file mode 0 on success and 1 on failure; 1 when
    no input is given, on Ctrl-C and on a fatal error. Batch mode returns
    normally.
    """
    parser = argparse.ArgumentParser(
        description='Universal Metadata Automation Tool',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process single file
  python -m src.main file.pdf

  # Preview changes without applying
  python -m src.main --preview file.pdf

  # Process entire directory
  python -m src.main --directory ./files

  # Process directory recursively
  python -m src.main --directory ./files --recursive

  # Save report
  python -m src.main file.pdf --report report.csv
        """
    )

    parser.add_argument('input', nargs='?', help='Input file or directory')
    parser.add_argument('--directory', '-d', help='Process entire directory')
    parser.add_argument('--recursive', '-r', action='store_true', help='Process subdirectories')
    parser.add_argument('--preview', '-p', action='store_true', help='Preview mode (no changes)')
    parser.add_argument('--report', help='Save report to CSV file')

    args = parser.parse_args()

    # Validate input
    if not args.input and not args.directory:
        parser.print_help()
        sys.exit(1)

    # Initialize processor
    processor = MetadataProcessor(preview_mode=args.preview)

    # None in batch mode; True/False once a single file has been processed.
    single_file_ok = None

    try:
        # --directory wins; otherwise a positional directory is also batch mode
        batch_dir = args.directory
        if not batch_dir and Path(args.input).is_dir():
            batch_dir = args.input

        # Process input
        if batch_dir:
            stats = processor.process_directory(batch_dir, args.recursive)
            print(f"\n{'='*60}")
            print("BATCH PROCESSING RESULTS")
            print(f"{'='*60}")
            print(f"Total files: {stats.get('total', 0)}")
            print(f"Successful: {stats.get('success', 0)}")
            print(f"Failed: {stats.get('failed', 0)}")
            print(f"{'='*60}\n")
        else:
            single_file_ok = processor.process_file(args.input)

        # Save report (must happen before exiting in single-file mode)
        if args.report:
            processor.save_report(args.report)
        elif processor.report_data:
            processor.save_report()

    except KeyboardInterrupt:
        print("\n\nOperation cancelled by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Fatal error: {e}", exc_info=True)
        sys.exit(1)

    if single_file_ok is not None:
        sys.exit(0 if single_file_ok else 1)
get_logger, sanitize_metadata_value + +# Production-ready imports +try: + import tiktoken + TIKTOKEN_AVAILABLE = True +except ImportError: + TIKTOKEN_AVAILABLE = False + +try: + from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type + TENACITY_AVAILABLE = True +except ImportError: + TENACITY_AVAILABLE = False + +logger = get_logger(__name__) + +class MetadataAnalyzer: + """Analyze content and generate metadata using OpenAI GPT with production-ready error handling.""" + + # Valid OpenAI models (as of January 2026) + VALID_MODELS = [ + # GPT-5 models (2026 release) + 'gpt-5', 'gpt-5-mini', 'gpt-5-nano', + 'gpt-5-mini-2025-08-07', 'gpt-5-nano-2025-08-07', + # GPT-4 models + 'gpt-4o', 'gpt-4o-mini', 'gpt-4o-mini-2024-07-18', + 'gpt-4-turbo', 'gpt-4', 'gpt-3.5-turbo', + # Reasoning models + 'o1', 'o1-mini', 'o1-preview' + ] + + def __init__(self): + """Initialize the analyzer with OpenAI client.""" + if not Config.OPENAI_API_KEY: + raise ValueError("OpenAI API key not configured") + + self.client = OpenAI(api_key=Config.OPENAI_API_KEY) + self.model = Config.AI_MODEL + + # Validate model name + if not self._is_valid_model(self.model): + logger.warning(f"⚠️ Model '{self.model}' may not be valid. 
Valid models: {', '.join(self.VALID_MODELS)}") + logger.warning(f"⚠️ Using fallback model: gpt-4o-mini") + self.model = 'gpt-4o-mini' + + self.max_tokens = Config.MAX_TOKENS + self.temperature = Config.TEMPERATURE + + logger.info(f"Initialized MetadataAnalyzer with model: {self.model}") + + # Initialize tiktoken encoding for proper token counting + if TIKTOKEN_AVAILABLE: + try: + self.encoding = tiktoken.encoding_for_model(self.model) + except KeyError: + # Fallback for models not in tiktoken registry + self.encoding = tiktoken.get_encoding("cl100k_base") + else: + self.encoding = None + logger.warning("tiktoken not available - using character-based truncation") + + def _count_tokens(self, text: str) -> int: + """Count tokens using tiktoken (proper tokenization).""" + if self.encoding: + return len(self.encoding.encode(text)) + else: + # Fallback: rough estimate (1 token ≈ 4 characters) + return len(text) // 4 + + def _truncate_content(self, content: str, max_tokens: int = 3000) -> str: + """Intelligently truncate content to fit token limit.""" + if not self.encoding: + # Character-based fallback + max_chars = max_tokens * 4 + if len(content) <= max_chars: + return content + return content[:max_chars] + + tokens = self.encoding.encode(content) + if len(tokens) <= max_tokens: + return content + + # Truncate and decode back + truncated_tokens = tokens[:max_tokens] + return self.encoding.decode(truncated_tokens) + + def _is_valid_model(self, model: str) -> bool: + """Check if model name is valid.""" + # Exact match + if model in self.VALID_MODELS: + return True + # Check if it starts with a valid prefix (for dated versions) + for valid_model in self.VALID_MODELS: + if model.startswith(valid_model): + return True + return False + + def _is_new_model(self) -> bool: + """ + Check if model is a new generation model. + New models (GPT-5, GPT-4o, o1) use max_completion_tokens and don't support custom temperature. 
def _get_api_params(self) -> dict:
    """
    Get the correct API parameters based on model.

    Token limit: models matched by ``_is_new_model()`` receive
    ``max_completion_tokens``; all others receive ``max_tokens``.

    Temperature: only the reasoning families (``o1*``, ``gpt-5*``) are
    restricted to the default temperature, so it is omitted for them.
    Every other model, including gpt-4o / gpt-4o-mini / gpt-4-turbo,
    accepts a custom value and gets the configured ``self.temperature``.

    Returns:
        Keyword arguments to merge into the chat-completion call
    """
    params = {}

    # Token parameter
    if self._is_new_model():
        params['max_completion_tokens'] = self.max_tokens
        token_param = 'max_completion_tokens'
    else:
        params['max_tokens'] = self.max_tokens
        token_param = 'max_tokens'

    # Temperature parameter: skip it only where the API rejects custom values
    fixed_temperature_prefixes = ('gpt-5', 'o1')
    if self.model.startswith(fixed_temperature_prefixes):
        logger.debug(f"Using {token_param} for {self.model}")
    else:
        params['temperature'] = self.temperature
        logger.debug(f"Using {token_param} + temperature for {self.model}")

    return params
def analyze_content(self, content: str, filename: str, file_type: FileType) -> Dict[str, str]:
    """
    Analyze content and generate appropriate metadata with production-ready error handling.

    Every return path carries ``_tokens_used``. The two fallback paths
    (empty API answer, exception) additionally carry ``_ai_error`` so callers
    can tell generated metadata from filename-derived metadata.

    Args:
        content: Extracted text content
        filename: Original filename
        file_type: Type of file

    Returns:
        Dictionary with metadata (title, subject, keywords, _tokens_used) plus
        _confidence on success or _ai_error on fallback
    """
    try:
        # Truncate content if needed with proper token counting
        content_tokens = self._count_tokens(content)
        if content_tokens > Config.MAX_TEXT_LENGTH:
            content = self._truncate_content(content, Config.MAX_TEXT_LENGTH)
            logger.info(f"Truncated content from {content_tokens} to {self._count_tokens(content)} tokens")

        # Generate prompt based on file type
        prompt = self._create_prompt(content, filename, file_type)

        # Count total tokens before API call
        prompt_tokens = self._count_tokens(prompt)
        logger.info(f"API call for {filename}: {prompt_tokens} prompt tokens")

        # Call API with retry logic
        response = self._call_openai_api([
            {"role": "system", "content": "You are a metadata expert who generates professional, accurate metadata for documents in English."},
            {"role": "user", "content": prompt}
        ])

        # Parse response with detailed logging
        logger.info(f"API Response for {filename}:")
        logger.info(f" - Model used: {response.model}")
        logger.info(f" - Finish reason: {response.choices[0].finish_reason}")
        logger.info(f" - Tokens: prompt={response.usage.prompt_tokens}, completion={response.usage.completion_tokens}, total={response.usage.total_tokens}")

        # The call has been billed whether or not the answer is usable
        tokens_used = response.usage.total_tokens

        metadata_text = response.choices[0].message.content
        logger.info(f" - Content length: {len(metadata_text) if metadata_text else 0} chars")
        logger.info(f" - Content preview: {metadata_text[:200] if metadata_text else '(empty)'}")

        # Check if content is None or empty
        if not metadata_text or len(metadata_text.strip()) == 0:
            logger.error(f"❌ API returned empty content for {filename}!")
            logger.error(f" This usually means:")
            logger.error(f" 1. Invalid model name: {self.model}")
            logger.error(f" 2. Model doesn't support this request type")
            logger.error(f" 3. Content was filtered/refused")
            logger.error(f" Using fallback metadata instead.")
            fallback = self._generate_fallback_metadata(filename, file_type)
            # Same shape as the exception fallback, but with the real token cost
            fallback['_ai_error'] = "API returned empty content"
            fallback['_tokens_used'] = tokens_used
            return fallback

        metadata = self._parse_metadata_response(metadata_text)

        # Sanitize metadata values
        metadata = {
            key: sanitize_metadata_value(value)
            for key, value in metadata.items()
        }

        # Add metadata about the generation
        metadata['_tokens_used'] = tokens_used
        metadata['_confidence'] = 0.9  # Could calculate based on response

        logger.info(f"Generated metadata for {filename} (tokens used: {metadata['_tokens_used']})")
        return metadata

    except Exception as e:
        logger.error(f"Error analyzing content for {filename}: {e}")
        # Return fallback metadata with error info
        fallback = self._generate_fallback_metadata(filename, file_type)
        fallback['_ai_error'] = str(e)
        fallback['_tokens_used'] = 0
        return fallback
def _parse_metadata_response(self, response_text: str) -> Dict[str, str]:
    """
    Parse AI response into metadata dictionary.

    The response is expected to hold one JSON object, optionally wrapped in
    a markdown code fence or surrounded by extra text. The three required
    fields (title, subject, keywords) are always present in the result and
    always strings: missing/None becomes "", a list is joined with ", ".
    Anything that is not a usable JSON object, or has a title shorter than
    three characters, is handed to ``_parse_metadata_text``.

    Args:
        response_text: Raw text returned by the model

    Returns:
        Metadata dictionary with at least title, subject and keywords
    """
    try:
        # Try to parse as JSON first
        response_text = response_text.strip()
        logger.info(f"Parsing response (length={len(response_text)}): {response_text[:200]}")

        # Remove markdown code blocks if present: keep what lies between the
        # opening fence and the next fence (or the end of the text)
        if response_text.startswith('```'):
            fenced_lines = []
            for line in response_text.split('\n')[1:]:
                if line.startswith('```'):
                    break
                fenced_lines.append(line)
            response_text = '\n'.join(fenced_lines)

        # Try to find JSON object in text
        # Look for { ... } pattern
        start = response_text.find('{')
        end = response_text.rfind('}')
        if start != -1 and end > start:
            metadata = json.loads(response_text[start:end + 1])
        else:
            metadata = json.loads(response_text)

        # A JSON array/string/number is valid JSON but not usable metadata;
        # raising here routes it to the text parser below.
        if not isinstance(metadata, dict):
            raise ValueError("JSON response is not an object")

        # Ensure all required fields are present and are plain strings
        required_fields = ['title', 'subject', 'keywords']
        for field in required_fields:
            value = metadata.get(field)
            if value is None:
                value = ""
            elif isinstance(value, (list, tuple)):
                value = ", ".join(str(item).strip() for item in value)
            elif not isinstance(value, str):
                value = str(value)
            metadata[field] = value

        # Validate that we got actual content
        if len(metadata['title'].strip()) < 3:
            logger.warning("JSON parsed but title is empty or too short, using text parsing")
            return self._parse_metadata_text(response_text)

        return metadata

    except (json.JSONDecodeError, ValueError, KeyError) as e:
        logger.warning(f"Failed to parse JSON response ({str(e)}), using text parsing")
        return self._parse_metadata_text(response_text)
line.lower().startswith(('title', 'subject', 'keyword')): + metadata['title'] = line[:200] # Limit length + break + + logger.info(f"Text parsing result: title='{metadata['title'][:50]}...', subject='{metadata['subject'][:50]}...'") + return metadata + + def _generate_fallback_metadata(self, filename: str, file_type: FileType) -> Dict[str, str]: + """Generate basic metadata based on filename when AI fails.""" + # Remove extension and clean filename + from pathlib import Path + clean_name = Path(filename).stem.replace('_', ' ').replace('-', ' ') + + return { + 'title': clean_name, + 'subject': f"{clean_name} - {FileType(file_type).value}", + 'keywords': clean_name.replace(' ', ', ') + } + + def generate_metadata_for_pdf(self, text: str) -> Dict[str, str]: + """Specialized metadata generation for PDF documents.""" + # Wrapper for PDF-specific logic if needed + return self.analyze_content(text, "document.pdf", FileType.PDF) + + def generate_metadata_for_image(self, text: str) -> Dict[str, str]: + """Specialized metadata generation for images.""" + return self.analyze_content(text, "image.jpg", FileType.IMAGE) + + def generate_metadata_for_office(self, text: str) -> Dict[str, str]: + """Specialized metadata generation for Office documents.""" + return self.analyze_content(text, "document.docx", FileType.OFFICE_DOC) + + def generate_metadata_for_video(self, metadata: Dict[str, str]) -> Dict[str, str]: + """Specialized metadata generation for videos.""" + # For videos, we might use existing metadata as input + text = f"Video title: {metadata.get('title', 'N/A')}" + return self.analyze_content(text, "video.mp4", FileType.VIDEO) diff --git a/src/metadata_importer.py b/src/metadata_importer.py new file mode 100644 index 0000000..a8bfe1d --- /dev/null +++ b/src/metadata_importer.py @@ -0,0 +1,427 @@ +"""Metadata importer for external files (CSV, Excel, JSON).""" + +import pandas as pd +import json +from pathlib import Path +from typing import Dict, Optional, List, Tuple +from 
def import_from_csv(self, csv_path: str) -> Dict[str, Dict]:
    """
    Import metadata from CSV file.
    Expected columns: filename, title, subject/description, keywords

    The file is read as UTF-8 first. If that fails to decode, cp1252 and
    then latin1 are tried. latin1 comes last because it maps every byte and
    would otherwise hide the other fallbacks. Only decoding failures trigger
    a fallback; parse errors and missing-column errors are logged and
    re-raised unchanged.

    Args:
        csv_path: Path to CSV file

    Returns:
        Dictionary mapping filename stems to metadata dicts

    Raises:
        ValueError: If no supported encoding can decode the file, or the
            data has no filename column
    """
    encodings = ('utf-8', 'cp1252', 'latin1')

    try:
        for encoding in encodings:
            try:
                df = pd.read_csv(csv_path, encoding=encoding)
            except UnicodeDecodeError:
                # Wrong guess: try the next encoding
                continue

            if encoding == 'utf-8':
                logger.info(f"Loaded CSV with {len(df)} rows from {csv_path}")
            else:
                logger.info(f"Loaded CSV with {len(df)} rows using {encoding} encoding")
            return self._parse_dataframe(df)

        raise ValueError(f"Could not read CSV file with any supported encoding")

    except Exception as e:
        logger.error(f"Error importing from CSV: {e}")
        raise
def _parse_dataframe(self, df: pd.DataFrame) -> Dict[str, Dict]:
    """
    Parse pandas DataFrame into metadata map.

    Columns are located case-insensitively from lists of common names. Rows
    whose filename cell is empty or missing (NaN) are skipped. When several
    rows share a filename stem, the last one wins.

    Args:
        df: DataFrame with metadata

    Returns:
        Dictionary mapping filename stems to metadata dicts

    Raises:
        ValueError: If no filename column can be found
    """
    metadata_map = {}

    # Detect filename column (try common names)
    filename_col = self._detect_column(df, ['filename', 'file', 'name', 'file_name', 'path'])

    if not filename_col:
        raise ValueError("Could not find filename column in data. Tried: filename, file, name, file_name, path")

    # Detect metadata columns
    title_col = self._detect_column(df, ['title', 'heading', 'name', 'document_title'])
    subject_col = self._detect_column(df, ['subject', 'description', 'summary', 'desc', 'external_description', 'alt_text'])
    keywords_col = self._detect_column(df, ['keywords', 'tags', 'categories', 'labels'])

    logger.info(f"Detected columns - filename: {filename_col}, title: {title_col}, subject: {subject_col}, keywords: {keywords_col}")

    # Parse rows
    for _, row in df.iterrows():
        # _get_value turns NaN into '' *before* stringifying, so a missing
        # filename is skipped instead of being stored under the key 'nan'.
        filename = self._get_value(row, filename_col)
        if not filename:
            continue

        filename_stem = Path(filename).stem.lower()

        metadata_map[filename_stem] = {
            'title': self._get_value(row, title_col),
            'subject': self._get_value(row, subject_col),
            'keywords': self._get_value(row, keywords_col)
        }

    logger.info(f"Parsed {len(metadata_map)} metadata records from DataFrame")
    return metadata_map
+ + Args: + df: DataFrame to search + candidates: List of possible column names + + Returns: + Actual column name if found, None otherwise + """ + # Create lowercase mapping + col_map = {col.lower(): col for col in df.columns} + + # Try each candidate + for candidate in candidates: + if candidate.lower() in col_map: + return col_map[candidate.lower()] + + return None + + def _get_value(self, row: pd.Series, column: Optional[str]) -> str: + """ + Get value from row, handling None column and NaN values. + + Args: + row: DataFrame row + column: Column name (can be None) + + Returns: + String value or empty string + """ + if column is None: + return '' + + value = row.get(column, '') + + if pd.isna(value): + return '' + + return str(value).strip() + + def _normalize_metadata(self, metadata: Dict) -> Dict[str, str]: + """ + Normalize metadata dictionary to standard format. + + Args: + metadata: Raw metadata dict + + Returns: + Normalized metadata with title, subject, keywords keys + """ + normalized = { + 'title': '', + 'subject': '', + 'keywords': '' + } + + # Map title + for key in ['title', 'heading', 'name', 'document_title']: + if key in metadata and metadata[key]: + normalized['title'] = str(metadata[key]).strip() + break + + # Map subject/description + for key in ['subject', 'description', 'summary', 'desc', 'external_description', 'alt_text']: + if key in metadata and metadata[key]: + normalized['subject'] = str(metadata[key]).strip() + break + + # Map keywords + for key in ['keywords', 'tags', 'categories', 'labels']: + if key in metadata and metadata[key]: + value = metadata[key] + # Handle arrays + if isinstance(value, list): + normalized['keywords'] = ', '.join(str(v) for v in value) + else: + normalized['keywords'] = str(value).strip() + break + + return normalized + + def get_metadata_for_file(self, metadata_map: Dict[str, Dict], filename: str) -> Optional[Dict[str, str]]: + """ + Get metadata for a specific file from imported map. 
def validate_import(self, metadata_map: Dict[str, Dict]) -> Dict:
    """
    Summarize how complete an imported metadata map is.

    Args:
        metadata_map: Dictionary returned by import_* methods

    Returns:
        Counts of all records, of records with a non-empty title, subject
        and keywords respectively, and of records where all three are empty
    """
    records = list(metadata_map.values())

    def count_with(field):
        # Number of records whose *field* holds a truthy value
        return sum(1 for record in records if record.get(field))

    blank = sum(
        1 for record in records
        if not (record.get('title') or record.get('subject') or record.get('keywords'))
    )

    return {
        'total_records': len(metadata_map),
        'with_title': count_with('title'),
        'with_subject': count_with('subject'),
        'with_keywords': count_with('keywords'),
        'empty_records': blank,
    }
def import_with_mapping(self, file_path: str, mapping: Dict[str, str], file_type: str = 'auto') -> Dict[str, Dict]:
    """
    Import file with custom field mapping.

    Rows without a filename are skipped. Mapped values that are missing
    (None / NaN) become empty strings rather than the text "None" / "nan".

    Args:
        file_path: Path to file
        mapping: Field mapping {source_field: target_field}
        file_type: File type ('csv', 'excel', 'json', or 'auto')

    Returns:
        Dictionary mapping filename stems to metadata dicts

    Raises:
        ValueError: If the file type or JSON layout is not supported, or no
            filename column can be found
    """
    def as_text(value) -> str:
        # Missing cells arrive as None/NaN; str() would turn them into text.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value).strip()

    # Resolve file type from the extension
    if file_type == 'auto':
        ext = Path(file_path).suffix.lower()
        if ext == '.csv':
            file_type = 'csv'
        elif ext in ['.xlsx', '.xls']:
            file_type = 'excel'
        elif ext == '.json':
            file_type = 'json'
        else:
            raise ValueError(f"Unsupported file type: {ext}")

    # Load file
    if file_type == 'csv':
        df = pd.read_csv(file_path, encoding='utf-8')
    elif file_type == 'excel':
        df = pd.read_excel(file_path)
    elif file_type == 'json':
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):
            df = pd.DataFrame(data)
        elif isinstance(data, dict):
            # Object format: the key is the filename
            items = [{'filename': k, **v} for k, v in data.items()]
            df = pd.DataFrame(items)
        else:
            raise ValueError("JSON must be an object or array")
    else:
        raise ValueError(f"Unsupported file type: {file_type}")

    # Apply field mapper
    mapper = FieldMapper()
    metadata_map = {}

    # Find filename column (first column, in file order, with a known name)
    filename_col = None
    for col in df.columns:
        if str(col).lower() in ['filename', 'file', 'name', 'file_name']:
            filename_col = col
            break

    if filename_col is None:
        raise ValueError("Could not find filename column")

    # Process each row
    for _, row in df.iterrows():
        filename = as_text(row.get(filename_col, ''))
        if not filename:
            continue

        filename_stem = Path(filename).stem.lower()

        # Apply mapping to transform row data
        row_dict = row.to_dict()
        metadata = mapper.apply_mapping(row_dict, mapping)

        metadata_map[filename_stem] = {
            'title': as_text(metadata.get('title', '')),
            'subject': as_text(metadata.get('subject', '')),
            'keywords': as_text(metadata.get('keywords', ''))
        }

    logger.info(f"Imported {len(metadata_map)} records with custom mapping")
    return metadata_map
def create_template(
    self,
    name: str,
    title_template: str,
    subject_template: str,
    keywords_template: str,
    description: str = ''
) -> Dict:
    """
    Build (but do not persist) a new metadata template.

    Args:
        name: Template name
        title_template: Title text, may contain variables such as "{filename}"
        subject_template: Subject text, may contain variables
        keywords_template: Keywords text, may contain variables
        description: Optional note on what the template is for

    Returns:
        The template dictionary, stamped with creation/update times.
        Unknown variables are only logged as a warning; the template is
        returned regardless.
    """
    template = dict(
        name=name,
        description=description,
        title=title_template,
        subject=subject_template,
        keywords=keywords_template,
        created_at=self._get_timestamp(),
        updated_at=self._get_timestamp(),
    )

    unknown_variables = self.validate_template(template)['invalid']
    if unknown_variables:
        logger.warning(f"Template '{name}' has invalid variables: {unknown_variables}")

    return template
def list_templates(self) -> List[Dict]:
    """
    Summarise every stored template.

    Returns:
        One summary dict per stored template, in storage order, holding
        its name, description, timestamps and the variables it uses.
    """
    def _summarise(template_name: str, stored: Dict) -> Dict:
        # Missing optional fields fall back to an empty string.
        summary = {'name': template_name}
        for field in ('description', 'created_at', 'updated_at'):
            summary[field] = stored.get(field, '')
        summary['variables_used'] = self._extract_variables(stored)
        return summary

    all_templates = self._load_templates()
    return [_summarise(name, data) for name, data in all_templates.items()]
def apply_template(
    self,
    template: Dict,
    filename: str,
    user: str = 'Unknown',
    custom_vars: Optional[Dict[str, str]] = None
) -> Dict[str, str]:
    """
    Apply template to generate metadata for a file.

    Args:
        template: Template dictionary
        filename: Filename to process
        user: Username for {user} variable
        custom_vars: Additional custom variables (e.g., {'product_line': 'Dental'})

    Returns:
        Dictionary with title, subject, keywords
    """
    # Build variable substitution map
    variables = self._build_variable_map(filename, user, custom_vars)

    # `or ''` also covers fields stored as JSON null, which would otherwise
    # reach the substitution step as None.
    metadata = {
        field: self._substitute_variables(template.get(field) or '', variables)
        for field in ('title', 'subject', 'keywords')
    }

    # A template without a 'name' (e.g. an unsaved preview) must not fail
    # in the log line after the metadata has already been generated.
    template_name = template.get('name', '<unnamed>')
    logger.info(f"Applied template '{template_name}' to {filename}")
    return metadata
+ + Args: + template: Template dictionary + + Returns: + Dictionary with 'valid' and 'invalid' variable lists + """ + result = { + 'valid': [], + 'invalid': [] + } + + # Extract all variables from template + all_text = ( + template.get('title', '') + + template.get('subject', '') + + template.get('keywords', '') + ) + + # Find all {variable} patterns + import re + variables = re.findall(r'\{[^}]+\}', all_text) + + for var in variables: + if var in self.AVAILABLE_VARIABLES: + if var not in result['valid']: + result['valid'].append(var) + else: + if var not in result['invalid']: + result['invalid'].append(var) + + return result + + def _load_templates(self) -> Dict: + """Load all templates from file.""" + if Path(self.templates_path).exists(): + try: + with open(self.templates_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + logger.error(f"Failed to load templates: {e}") + return {} + return {} + + def _get_timestamp(self) -> str: + """Get current timestamp as ISO format string.""" + return datetime.now().isoformat() + + def _build_variable_map( + self, + filename: str, + user: str, + custom_vars: Optional[Dict[str, str]] + ) -> Dict[str, str]: + """ + Build variable substitution map. 
+ + Args: + filename: Filename (with or without extension) + user: Username + custom_vars: Custom variables + + Returns: + Dictionary mapping variable names to values + """ + # Get filename without extension + filename_stem = Path(filename).stem + + # Current date/time + now = datetime.now() + + variables = { + '{filename}': filename_stem, + '{date}': now.strftime('%Y-%m-%d'), + '{datetime}': now.strftime('%Y-%m-%d %H:%M:%S'), + '{user}': user, + '{year}': str(now.year), + '{month}': now.strftime('%m'), + '{day}': now.strftime('%d') + } + + # Add custom variables + if custom_vars: + for key, value in custom_vars.items(): + # Ensure custom variables are wrapped in {} + var_key = f'{{{key}}}' if not key.startswith('{') else key + variables[var_key] = value + + return variables + + def _substitute_variables(self, template_text: str, variables: Dict[str, str]) -> str: + """ + Substitute variables in template text. + + Args: + template_text: Text with {variable} placeholders + variables: Variable substitution map + + Returns: + Text with variables replaced + """ + result = template_text + + for var, value in variables.items(): + result = result.replace(var, value) + + return result + + def _extract_variables(self, template: Dict) -> List[str]: + """ + Extract all variables used in a template. + + Args: + template: Template dictionary + + Returns: + List of variable names (e.g., ['{filename}', '{date}']) + """ + import re + all_text = ( + template.get('title', '') + + template.get('subject', '') + + template.get('keywords', '') + ) + + variables = re.findall(r'\{[^}]+\}', all_text) + return list(set(variables)) + + def get_available_variables(self) -> Dict[str, str]: + """ + Get list of available variables with descriptions. 
def import_template(self, import_path: str) -> Optional[Dict]:
    """
    Import template from JSON file.

    The template is only read and validated; it is not added to the
    template store.

    Args:
        import_path: Path to template JSON file

    Returns:
        Imported template dictionary, or None when the file cannot be
        read, is not valid JSON, is not a JSON object, or lacks any of
        the required fields (name, title, subject, keywords).
    """
    try:
        with open(import_path, 'r', encoding='utf-8') as f:
            template = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.error(f"Failed to import template: {e}")
        return None

    # A list or string root would make the `in` checks below test list
    # membership or substrings instead of keys.
    if not isinstance(template, dict):
        logger.error("Invalid template file: expected a JSON object")
        return None

    # Validate required fields
    required_fields = ('name', 'title', 'subject', 'keywords')
    missing = [field for field in required_fields if field not in template]
    if missing:
        logger.error(f"Invalid template file: missing required fields: {', '.join(missing)}")
        return None

    logger.info(f"Imported template from {import_path}")
    return template
+ ) + + def update_metadata(self, file_path: str, metadata: Dict[str, str], backup: bool = True) -> bool: + """ + Update file metadata using ExifTool. + + Writes title, subject, and keywords to appropriate metadata fields + based on file type (images use EXIF/IPTC/XMP, PDFs use PDF fields, etc.). + + Args: + file_path: Path to the file + metadata: Dictionary with 'title', 'subject', 'keywords' keys + backup: Whether to create backup before updating (default: True) + + Returns: + True if successful, False otherwise + """ + try: + # Validate metadata + if not self.validate_metadata(metadata): + logger.error(f"Invalid metadata for {file_path}") + return False + + # Create backup if requested + if backup: + backup_path = create_backup(file_path) + if not backup_path: + logger.warning(f"Failed to create backup for {file_path}, proceeding anyway") + + # Build ExifTool tags dict + updates = {} + + # Determine file type and set appropriate tags + file_ext = Path(file_path).suffix.lower() + + if self._is_image(file_ext): + updates = self._build_image_tags(metadata) + elif self._is_video(file_ext): + updates = self._build_video_tags(metadata) + elif self._is_pdf(file_ext): + updates = self._build_pdf_tags(metadata) + else: + logger.warning(f"Unknown file type {file_ext}, trying generic metadata tags") + updates = self._build_generic_tags(metadata) + + # Apply updates using ExifTool + if not updates: + logger.warning(f"No metadata tags to update for {file_path}") + return True + + with ExifToolHelper() as et: + et.set_tags( + [file_path], + tags=updates, + params=["-overwrite_original", "-P"] # Preserve file modification date + ) + + logger.info(f"Successfully updated metadata for {Path(file_path).name}") + + # Verify the update + if self.verify_update(file_path, metadata): + logger.info(f"Metadata verification passed for {Path(file_path).name}") + return True + else: + logger.warning(f"Metadata verification failed for {Path(file_path).name}, but update succeeded") + return 
def verify_update(self, file_path: str, expected_metadata: Dict[str, str]) -> bool:
    """
    Verify that metadata was successfully written to the file.

    Each expected non-empty field must appear as a substring of the value
    read back from the file.

    Args:
        file_path: Path to the file
        expected_metadata: Metadata that was supposed to be written

    Returns:
        True if verification passes, False otherwise (including when the
        file cannot be read back)
    """
    try:
        # The extractor lives in the sibling `extractors` package, not in
        # `updaters`. The former `.exiftool_extractor` import always raised
        # ImportError, which the handler below turned into a permanent
        # verification failure.
        from ..extractors.exiftool_extractor import ExifToolExtractor

        extractor = ExifToolExtractor()
        # NOTE(review): assumes read_metadata returns a dict keyed by
        # 'title'/'subject'/'keywords' - confirm against the extractor.
        actual_metadata = extractor.read_metadata(file_path)

        # Check each field (allow partial matches for verification)
        for key in ('title', 'subject', 'keywords'):
            # Normalise to text so a missing/None value cannot break .strip()
            expected = str(expected_metadata.get(key) or '').strip()
            actual = str(actual_metadata.get(key) or '').strip()

            if expected and expected not in actual:
                logger.warning(f"Verification mismatch for {key}: expected '{expected}', got '{actual}'")
                return False

        return True

    except Exception as e:
        logger.error(f"Verification failed for {file_path}: {e}")
        return False
+ """ + tags = {} + + if metadata.get('title'): + tags['EXIF:ImageDescription'] = metadata['title'] + tags['IPTC:Headline'] = metadata['title'] + tags['XMP:Title'] = metadata['title'] + + if metadata.get('subject'): + tags['EXIF:XPSubject'] = metadata['subject'] + tags['IPTC:Caption-Abstract'] = metadata['subject'] + tags['XMP:Description'] = metadata['subject'] + + if metadata.get('keywords'): + tags['EXIF:XPKeywords'] = metadata['keywords'] + tags['IPTC:Keywords'] = metadata['keywords'] + tags['XMP:Subject'] = metadata['keywords'] + + return tags + + def _build_video_tags(self, metadata: Dict[str, str]) -> Dict[str, str]: + """Build ExifTool tags for video files.""" + tags = {} + + if metadata.get('title'): + tags['QuickTime:Title'] = metadata['title'] + tags['UserData:Title'] = metadata['title'] + + if metadata.get('subject'): + tags['QuickTime:Description'] = metadata['subject'] + tags['UserData:Description'] = metadata['subject'] + + if metadata.get('keywords'): + tags['QuickTime:Keywords'] = metadata['keywords'] + + return tags + + def _build_pdf_tags(self, metadata: Dict[str, str]) -> Dict[str, str]: + """Build ExifTool tags for PDF files.""" + tags = {} + + if metadata.get('title'): + tags['PDF:Title'] = metadata['title'] + + if metadata.get('subject'): + tags['PDF:Subject'] = metadata['subject'] + + if metadata.get('keywords'): + tags['PDF:Keywords'] = metadata['keywords'] + + return tags + + def _build_generic_tags(self, metadata: Dict[str, str]) -> Dict[str, str]: + """Build generic metadata tags for unknown file types.""" + tags = {} + + # Try common tags that might work + if metadata.get('title'): + tags['Title'] = metadata['title'] + + if metadata.get('subject'): + tags['Description'] = metadata['subject'] + tags['Subject'] = metadata['subject'] + + if metadata.get('keywords'): + tags['Keywords'] = metadata['keywords'] + + return tags diff --git a/src/updaters/image_updater.py b/src/updaters/image_updater.py new file mode 100644 index 0000000..14c69f6 
--- /dev/null +++ b/src/updaters/image_updater.py @@ -0,0 +1,221 @@ +"""Image metadata updater.""" + +import piexif +from PIL import Image +from PIL.PngImagePlugin import PngInfo +from typing import Dict +from pathlib import Path + +from ..base_updater import BaseUpdater +from ..utils import get_logger, create_backup, sanitize_metadata_value + +logger = get_logger(__name__) + + +class ImageUpdater(BaseUpdater): + """Updater for image file metadata (JPEG, PNG).""" + + SUPPORTED_FORMATS = ['jpg', 'jpeg', 'png', 'gif', 'bmp'] + + def update_metadata(self, file_path: str, metadata: Dict[str, str], backup: bool = True) -> bool: + """ + Update image metadata using EXIF for JPEG and PIL for PNG. + + Args: + file_path: Path to the image file + metadata: Dictionary with 'title', 'subject', 'keywords' keys + backup: Whether to create backup before updating + + Returns: + True if successful, False otherwise + """ + try: + # Validate metadata + if not self.validate_metadata(metadata): + logger.error(f"Invalid metadata for {file_path}") + return False + + # Check file format + file_ext = file_path.lower().split('.')[-1] + if file_ext not in self.SUPPORTED_FORMATS: + logger.error(f"Unsupported image format: {file_ext}") + return False + + # Create backup if requested + if backup: + backup_path = create_backup(file_path) + if not backup_path: + logger.warning(f"Failed to create backup for {file_path}, proceeding anyway") + + # Route to appropriate update method + if file_ext in ['jpg', 'jpeg']: + success = self._update_jpeg_metadata(file_path, metadata) + elif file_ext == 'png': + success = self._update_png_metadata(file_path, metadata) + else: + # For GIF, BMP and other formats - skip metadata update + # These formats don't support metadata in the same way + logger.warning(f"Metadata update not supported for {file_ext} format") + return True # Return success to not block the workflow + + if success: + logger.info(f"Successfully updated metadata for {file_path}") + else: + 
def _update_jpeg_metadata(self, file_path: str, metadata: Dict[str, str]) -> bool:
    """
    Update JPEG metadata using EXIF.

    The EXIF segment is replaced in place with ``piexif.insert``, which
    leaves the compressed image data untouched. Re-saving through PIL (the
    previous approach) re-encodes the picture and degrades it on every
    metadata edit; it is now only a fallback for files piexif cannot patch.

    Args:
        file_path: Path to JPEG file
        metadata: Metadata dictionary

    Returns:
        True if successful
    """
    try:
        # Sanitize metadata
        title = sanitize_metadata_value(metadata.get('title', ''), max_length=200)
        subject = sanitize_metadata_value(metadata.get('subject', ''), max_length=300)
        keywords = sanitize_metadata_value(metadata.get('keywords', ''), max_length=500)

        # Read existing EXIF so unrelated tags survive the update
        try:
            exif_dict = piexif.load(file_path)
        except (piexif.InvalidImageDataError, FileNotFoundError):
            exif_dict = {"0th": {}, "Exif": {}, "GPS": {}, "1st": {}}

        # Update metadata fields (encoding kept in step with _verify_jpeg_metadata)
        exif_dict["0th"][piexif.ImageIFD.ImageDescription] = title.encode('utf-8')
        exif_dict["0th"][piexif.ImageIFD.XPSubject] = subject.encode('utf-8')
        exif_dict["0th"][piexif.ImageIFD.XPKeywords] = keywords.encode('utf-8')

        # Encode EXIF data
        exif_bytes = piexif.dump(exif_dict)

        try:
            # Lossless: only the EXIF segment of the file is rewritten.
            piexif.insert(exif_bytes, file_path)
        except piexif.InvalidImageDataError:
            # Not a stream piexif can patch; keep the old PIL re-save
            # behaviour, but close the image handle afterwards.
            with Image.open(file_path) as image:
                image.save(file_path, exif=exif_bytes)

        logger.debug(f"Updated JPEG metadata - Title: {title}, Subject: {subject}, Keywords: {keywords}")
        return True

    except Exception as e:
        logger.error(f"Failed to update JPEG metadata: {e}", exc_info=True)
        return False
def verify_metadata(self, file_path: str, expected_metadata: Dict[str, str]) -> bool:
    """
    Check that an image file carries the expected metadata.

    Files ending in jpg/jpeg go to the JPEG check; every other extension
    is handed to the PNG check.

    Args:
        file_path: Path to the image file
        expected_metadata: Expected metadata values

    Returns:
        Result of the format-specific check, or False if it raised
    """
    try:
        extension = file_path.lower().split('.')[-1]
        if extension in ('jpg', 'jpeg'):
            checker = self._verify_jpeg_metadata
        else:
            checker = self._verify_png_metadata
        return checker(file_path, expected_metadata)

    except Exception as e:
        logger.error(f"Failed to verify image metadata for {file_path}: {e}", exc_info=True)
        return False
def update_metadata(self, file_path: str, metadata: Dict[str, str], backup: bool = True) -> bool:
    """
    Write title, subject and keywords into an Office document.

    Handles DOCX, XLSX and PPTX by delegating to the matching
    ``_update_<ext>_metadata`` method.

    Args:
        file_path: Path to the Office file
        metadata: Dictionary with 'title', 'subject', 'keywords' keys
        backup: Whether to create backup before updating

    Returns:
        True if successful, False otherwise
    """
    writer_names = {
        'docx': '_update_docx_metadata',
        'xlsx': '_update_xlsx_metadata',
        'pptx': '_update_pptx_metadata',
    }

    try:
        # Guard: metadata must pass the base-class validation.
        if not self.validate_metadata(metadata):
            logger.error(f"Invalid metadata for {file_path}")
            return False

        # Guard: extension must be one this updater declares.
        file_ext = file_path.lower().rsplit('.', 1)[-1]
        if file_ext not in self.SUPPORTED_FORMATS:
            logger.error(f"Unsupported Office format: {file_ext}")
            return False

        # A failed backup is only warned about; the update still runs.
        if backup and not create_backup(file_path):
            logger.warning(f"Failed to create backup for {file_path}, proceeding anyway")

        # Declared as supported but no writer exists for it.
        writer_name = writer_names.get(file_ext)
        if writer_name is None:
            return False

        success = getattr(self, writer_name)(file_path, metadata)

        if not success:
            logger.error(f"Failed to update metadata for {file_path}")
        else:
            logger.info(f"Successfully updated metadata for {file_path}")

        return success

    except Exception as e:
        logger.error(f"Failed to update Office metadata for {file_path}: {e}", exc_info=True)
        return False
def _update_xlsx_metadata(self, file_path: str, metadata: Dict[str, str]) -> bool:
    """
    Write sanitized title/subject/keywords into an XLSX workbook's
    document properties and save the workbook over the original file.

    Returns:
        True on success, False if anything raised (the error is logged)
    """
    length_limits = (('title', 200), ('subject', 300), ('keywords', 500))

    try:
        # Sanitize every field before the workbook is opened.
        cleaned = {
            field: sanitize_metadata_value(metadata.get(field, ''), max_length=limit)
            for field, limit in length_limits
        }

        book = load_workbook(file_path)
        doc_props = book.properties
        for field, value in cleaned.items():
            setattr(doc_props, field, value)

        book.save(file_path)

        logger.debug(
            f"Updated XLSX metadata - Title: {cleaned['title']}, "
            f"Subject: {cleaned['subject']}, Keywords: {cleaned['keywords']}"
        )
        return True

    except Exception as e:
        logger.error(f"Failed to update XLSX metadata: {e}", exc_info=True)
        return False
def _verify_docx_metadata(self, file_path: str, expected_metadata: Dict[str, str]) -> bool:
    """
    Compare a DOCX file's core properties with the expected metadata.

    Expected values are sanitized with the same length limits used when
    writing; stored values are stripped before comparison.

    Returns:
        True only if title, subject and keywords all match exactly
    """
    try:
        props = DocxDocument(file_path).core_properties

        wanted = (
            sanitize_metadata_value(expected_metadata.get('title', ''), max_length=200),
            sanitize_metadata_value(expected_metadata.get('subject', ''), max_length=300),
            sanitize_metadata_value(expected_metadata.get('keywords', ''), max_length=500),
        )
        # Unset properties are treated as empty strings.
        found = (
            (props.title or '').strip(),
            (props.subject or '').strip(),
            (props.keywords or '').strip(),
        )

        if found != wanted:
            logger.warning(f"Metadata verification failed for {file_path}")
            return False

        logger.info(f"Metadata verification successful for {file_path}")
        return True

    except Exception as e:
        logger.debug(f"DOCX metadata verification failed: {e}")
        return False
load_workbook(file_path) + props = workbook.properties + + expected_title = sanitize_metadata_value(expected_metadata.get('title', ''), max_length=200) + expected_subject = sanitize_metadata_value(expected_metadata.get('subject', ''), max_length=300) + expected_keywords = sanitize_metadata_value(expected_metadata.get('keywords', ''), max_length=500) + + actual_title = (props.title or '').strip() + actual_subject = (props.subject or '').strip() + actual_keywords = (props.keywords or '').strip() + + if actual_title == expected_title and actual_subject == expected_subject and actual_keywords == expected_keywords: + logger.info(f"Metadata verification successful for {file_path}") + return True + else: + logger.warning(f"Metadata verification failed for {file_path}") + return False + + except Exception as e: + logger.debug(f"XLSX metadata verification failed: {e}") + return False + + def _verify_pptx_metadata(self, file_path: str, expected_metadata: Dict[str, str]) -> bool: + """Verify PPTX metadata.""" + try: + presentation = Presentation(file_path) + core_props = presentation.core_properties + + expected_title = sanitize_metadata_value(expected_metadata.get('title', ''), max_length=200) + expected_subject = sanitize_metadata_value(expected_metadata.get('subject', ''), max_length=300) + expected_keywords = sanitize_metadata_value(expected_metadata.get('keywords', ''), max_length=500) + + actual_title = (core_props.title or '').strip() + actual_subject = (core_props.subject or '').strip() + actual_keywords = (core_props.keywords or '').strip() + + if actual_title == expected_title and actual_subject == expected_subject and actual_keywords == expected_keywords: + logger.info(f"Metadata verification successful for {file_path}") + return True + else: + logger.warning(f"Metadata verification failed for {file_path}") + return False + + except Exception as e: + logger.debug(f"PPTX metadata verification failed: {e}") + return False diff --git a/src/updaters/pdf_updater.py 
class PDFUpdater(BaseUpdater):
    """Updater for PDF file metadata."""

    def update_metadata(self, file_path: str, metadata: Dict[str, str], backup: bool = True) -> bool:
        """
        Update PDF metadata fields.

        Updates /Title, /Subject, /Keywords fields in the PDF document information
        dictionary. Other existing document-information entries are carried over
        to the rewritten file.

        Args:
            file_path: Path to the PDF file
            metadata: Dictionary with 'title', 'subject', 'keywords' keys
            backup: Whether to create backup before updating

        Returns:
            True if successful, False otherwise
        """
        # Local import keeps this change self-contained within the method.
        from io import BytesIO

        try:
            # Validate metadata
            if not self.validate_metadata(metadata):
                logger.error(f"Invalid metadata for {file_path}")
                return False

            # Create backup if requested
            if backup:
                backup_path = create_backup(file_path)
                if not backup_path:
                    logger.warning(f"Failed to create backup for {file_path}, proceeding anyway")

            # Sanitize metadata values
            title = sanitize_metadata_value(metadata.get('title', ''), max_length=200)
            subject = sanitize_metadata_value(metadata.get('subject', ''), max_length=300)
            keywords = sanitize_metadata_value(metadata.get('keywords', ''), max_length=500)

            with open(file_path, 'rb') as f:
                pdf_reader = pypdf.PdfReader(f)
                pdf_writer = pypdf.PdfWriter()

                # Copy all pages
                for page in pdf_reader.pages:
                    pdf_writer.add_page(page)

                # A fresh writer starts with an empty info dictionary, so copy the
                # existing entries first; otherwise they are lost on rewrite.
                existing_info = pdf_reader.metadata
                if existing_info is not None:
                    try:
                        pdf_writer.add_metadata(existing_info)
                    except Exception as e:
                        # Best effort: unusual info values must not block the update.
                        logger.warning(f"Could not preserve existing metadata for {file_path}: {e}")

                # Update metadata (overrides any copied values for these keys)
                pdf_writer.add_metadata({
                    '/Title': title,
                    '/Subject': subject,
                    '/Keywords': keywords,
                })

                # Serialise to memory while the source is still open and intact.
                # Opening file_path with 'wb' first would truncate the original
                # before we know the document can be written.
                buffer = BytesIO()
                pdf_writer.write(buffer)

            # Write updated PDF
            with open(file_path, 'wb') as f:
                f.write(buffer.getvalue())

            logger.info(f"Successfully updated metadata for {file_path}")
            logger.debug(f"Updated fields - Title: {title}, Subject: {subject}, Keywords: {keywords}")

            return True

        except Exception as e:
            logger.error(f"Failed to update PDF metadata for {file_path}: {e}", exc_info=True)
            return False

    def verify_metadata(self, file_path: str, expected_metadata: Dict[str, str]) -> bool:
        """
        Verify that metadata was written correctly to PDF.

        Checks if the written metadata matches the expected values.

        Args:
            file_path: Path to the PDF file
            expected_metadata: Expected metadata values

        Returns:
            True if metadata matches expected values, False otherwise
        """
        try:
            # Read the updated PDF
            with open(file_path, 'rb') as f:
                pdf_reader = pypdf.PdfReader(f)
                doc_info = pdf_reader.metadata

                if not doc_info:
                    logger.warning(f"No metadata found in {file_path}")
                    return False

                def read_field(key: str) -> str:
                    """Return the info entry as a stripped string ('' when missing/empty)."""
                    raw = doc_info.get(key)
                    if isinstance(raw, bytes):
                        raw = raw.decode('utf-8', errors='ignore')
                    return str(raw).strip() if raw else ""

                # Expected values go through the same sanitization as the writer
                expected_title = sanitize_metadata_value(expected_metadata.get('title', ''), max_length=200)
                expected_subject = sanitize_metadata_value(expected_metadata.get('subject', ''), max_length=300)
                expected_keywords = sanitize_metadata_value(expected_metadata.get('keywords', ''), max_length=500)

                # Get actual values and handle bytes
                actual_title = read_field('/Title')
                actual_subject = read_field('/Subject')
                actual_keywords = read_field('/Keywords')

            # Compare
            if (actual_title == expected_title
                    and actual_subject == expected_subject
                    and actual_keywords == expected_keywords):
                logger.info(f"Metadata verification successful for {file_path}")
                return True

            logger.warning(f"Metadata verification failed for {file_path}")
            logger.debug(f"Expected - Title: {expected_title}, Subject: {expected_subject}, Keywords: {expected_keywords}")
            logger.debug(f"Actual - Title: {actual_title}, Subject: {actual_subject}, Keywords: {actual_keywords}")
            return False

        except Exception as e:
            logger.error(f"Failed to verify PDF metadata for {file_path}: {e}", exc_info=True)
            return False
class VideoUpdater(BaseUpdater):
    """Updater for video file metadata (MP4, MOV, AVI)."""

    SUPPORTED_FORMATS = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm']

    # Tag keys used for MP4/MOV containers.
    # NOTE(review): keywords were previously stored under 'TXXX:Keywords', which is
    # an ID3 frame id rather than an MP4 atom name. 'keyw' is the text atom mutagen
    # documents for keywords - confirm the extractor reads the same key.
    _MP4_TITLE_KEY = '\xa9nam'
    _MP4_SUBJECT_KEY = '\xa9cmt'
    _MP4_KEYWORDS_KEY = 'keyw'

    def update_metadata(self, file_path: str, metadata: Dict[str, str], backup: bool = True) -> bool:
        """
        Update video metadata using mutagen.

        Args:
            file_path: Path to the video file
            metadata: Dictionary with 'title', 'subject', 'keywords' keys
            backup: Whether to create backup before updating

        Returns:
            True if successful, False otherwise
        """
        try:
            # Validate metadata
            if not self.validate_metadata(metadata):
                logger.error(f"Invalid metadata for {file_path}")
                return False

            # Check file format
            file_ext = file_path.lower().split('.')[-1]
            if file_ext not in self.SUPPORTED_FORMATS:
                logger.error(f"Unsupported video format: {file_ext}")
                return False

            # Create backup if requested
            if backup:
                backup_path = create_backup(file_path)
                if not backup_path:
                    logger.warning(f"Failed to create backup for {file_path}, proceeding anyway")

            # Update using mutagen
            success = self._update_with_mutagen(file_path, metadata)

            if success:
                logger.info(f"Successfully updated metadata for {file_path}")
            else:
                logger.error(f"Failed to update metadata for {file_path}")

            return success

        except Exception as e:
            logger.error(f"Failed to update video metadata for {file_path}: {e}", exc_info=True)
            return False

    def _update_with_mutagen(self, file_path: str, metadata: Dict[str, str]) -> bool:
        """
        Update video metadata using mutagen.

        Args:
            file_path: Path to video file
            metadata: Metadata dictionary

        Returns:
            True if successful
        """
        try:
            from mutagen import File
        except ImportError:
            logger.error("mutagen not installed, cannot update video metadata")
            return False

        try:
            # Sanitize metadata
            title = sanitize_metadata_value(metadata.get('title', ''), max_length=200)
            subject = sanitize_metadata_value(metadata.get('subject', ''), max_length=300)
            keywords = sanitize_metadata_value(metadata.get('keywords', ''), max_length=500)

            # Let mutagen detect the container type
            audio = File(file_path)

            if audio is None:
                logger.warning(f"mutagen could not identify file format: {file_path}")
                return False

            # Update tags based on file format
            file_ext = file_path.lower().split('.')[-1]

            if file_ext in ('mp4', 'mov'):
                # MP4 and MOV share the same atom names. All three fields are
                # always overwritten: skipping keywords when a value already
                # exists would leave stale keywords and fail verification.
                audio[self._MP4_TITLE_KEY] = [title]
                audio[self._MP4_SUBJECT_KEY] = [subject]
                audio[self._MP4_KEYWORDS_KEY] = [keywords]
            else:
                # For other formats (AVI, MKV, etc.), use generic ID3/Vorbis tags
                if hasattr(audio, 'add'):
                    # ID3v2 style
                    # NOTE(review): presumably meant for ID3-backed files; confirm this
                    # branch is reachable and that plain strings are accepted here.
                    if getattr(audio, 'tags', None) is None:
                        # Only create a tag container when none exists yet.
                        audio.add_tags()
                    audio['TIT2'] = title
                    audio['TXXX:Subject'] = subject
                    audio['TXXX:Keywords'] = keywords
                else:
                    # Vorbis Comment style
                    audio['title'] = title
                    audio['subject'] = subject
                    audio['keywords'] = keywords

            # Save file
            audio.save()

            logger.debug(f"Updated video metadata - Title: {title}, Subject: {subject}, Keywords: {keywords}")
            return True

        except Exception as e:
            logger.error(f"Failed to update video metadata with mutagen: {e}", exc_info=True)
            return False

    @staticmethod
    def _first_tag_value(audio, *keys: str) -> str:
        """
        Return the first value stored under the first of `keys` present in `audio`.

        Args:
            audio: Tag mapping returned by mutagen.File
            keys: Candidate tag keys, checked in order

        Returns:
            The stripped string value, or '' when no key is present or it is empty
        """
        for key in keys:
            if key in audio:
                values = audio[key]
                if isinstance(values, str):
                    return values.strip()
                # Tag values are sequences; an empty one counts as "no value".
                return str(values[0]).strip() if values else ""
        return ""

    def verify_metadata(self, file_path: str, expected_metadata: Dict[str, str]) -> bool:
        """
        Verify that metadata was written correctly to video.

        Args:
            file_path: Path to the video file
            expected_metadata: Expected metadata values

        Returns:
            True if metadata matches expected values, False otherwise
        """
        try:
            from mutagen import File
        except ImportError:
            logger.error("mutagen not installed, cannot verify video metadata")
            return False

        try:
            audio = File(file_path)

            if audio is None:
                logger.warning(f"Could not read file for verification: {file_path}")
                return False

            expected_title = sanitize_metadata_value(expected_metadata.get('title', ''), max_length=200)
            expected_subject = sanitize_metadata_value(expected_metadata.get('subject', ''), max_length=300)
            expected_keywords = sanitize_metadata_value(expected_metadata.get('keywords', ''), max_length=500)

            # Get actual values using the same keys the writer uses
            file_ext = file_path.lower().split('.')[-1]

            if file_ext in ('mp4', 'mov'):
                actual_title = self._first_tag_value(audio, self._MP4_TITLE_KEY)
                actual_subject = self._first_tag_value(audio, self._MP4_SUBJECT_KEY)
                actual_keywords = self._first_tag_value(audio, self._MP4_KEYWORDS_KEY)
            else:
                # ID3-style key first, Vorbis-style key as fallback
                actual_title = self._first_tag_value(audio, 'TIT2', 'title')
                actual_subject = self._first_tag_value(audio, 'TXXX:Subject', 'subject')
                actual_keywords = self._first_tag_value(audio, 'TXXX:Keywords', 'keywords')

            if (actual_title == expected_title
                    and actual_subject == expected_subject
                    and actual_keywords == expected_keywords):
                logger.info(f"Metadata verification successful for {file_path}")
                return True

            logger.warning(f"Metadata verification failed for {file_path}")
            logger.debug(f"Expected - Title: {expected_title}, Subject: {expected_subject}, Keywords: {expected_keywords}")
            logger.debug(f"Actual - Title: {actual_title}, Subject: {actual_subject}, Keywords: {actual_keywords}")
            return False

        except Exception as e:
            logger.error(f"Failed to verify video metadata for {file_path}: {e}", exc_info=True)
            return False
# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def create_backup(file_path: str) -> Optional[Path]:
    """
    Create a backup of the file before modification.

    The copy is placed in Config.BACKUP_DIR and named
    "<stem>_<YYYYmmdd_HHMMSS><suffix>". If that name is already taken, a numeric
    counter is appended so an earlier backup is never overwritten.

    Args:
        file_path: Path to the file to backup

    Returns:
        Path to the backup file, or None if backup failed
    """
    try:
        source = Path(file_path)
        if not source.exists():
            logger.error(f"File not found for backup: {file_path}")
            return None

        # Ensure backup directory exists
        Config.BACKUP_DIR.mkdir(parents=True, exist_ok=True)

        # Create backup filename with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_path = Config.BACKUP_DIR / f"{source.stem}_{timestamp}{source.suffix}"

        # The name only has one-second resolution and ignores the source
        # directory, so same-named files (or two updates within a second) would
        # otherwise clobber each other's backup.
        counter = 1
        while backup_path.exists():
            backup_path = Config.BACKUP_DIR / f"{source.stem}_{timestamp}_{counter}{source.suffix}"
            counter += 1

        # Copy file (copy2 also copies file metadata such as timestamps)
        shutil.copy2(source, backup_path)
        logger.info(f"Backup created: {backup_path}")

        return backup_path

    except Exception as e:
        logger.error(f"Failed to create backup for {file_path}: {e}")
        return None

def get_logger(name: str) -> logging.Logger:
    """
    Get a logger instance.

    Args:
        name: Logger name

    Returns:
        Logger instance
    """
    return logging.getLogger(name)
def format_metadata_comparison(old_metadata: dict, new_metadata: dict) -> str:
    """
    Format metadata comparison for display.

    Every key present in either dictionary is listed in sorted order with its
    old and new value ("N/A" when missing); differing values are flagged.

    Args:
        old_metadata: Old metadata dictionary
        new_metadata: New metadata dictionary

    Returns:
        Formatted comparison string
    """
    rule = "="*60
    lines = ["\n" + rule, "METADATA COMPARISON", rule]

    for key in sorted(set(old_metadata) | set(new_metadata)):
        old_value = old_metadata.get(key, "N/A")
        new_value = new_metadata.get(key, "N/A")

        lines.append(f"\n{key.upper()}:")
        lines.append(f" Old: {old_value}")
        lines.append(f" New: {new_value}")

        if old_value != new_value:
            lines.append(" [CHANGED]")

    lines.append(rule + "\n")
    return "\n".join(lines)

def sanitize_metadata_value(value: str, max_length: int = 500) -> str:
    """
    Sanitize and truncate metadata value.

    Control characters are removed, runs of whitespace (including tabs and
    newlines) are collapsed to a single space, and values longer than
    max_length are cut and suffixed with "...".

    Args:
        value: Metadata value; falsy values yield "", non-strings are
            converted with str()
        max_length: Maximum length of the returned string

    Returns:
        Sanitized value
    """
    if not value:
        return ""

    text = value if isinstance(value, str) else str(value)

    # Remove control characters (U+0000-U+001F, U+007F-U+009F). Whitespace
    # controls such as tab/newline are kept here and collapsed below.
    text = "".join(
        ch for ch in text
        if ch.isspace() or not (ord(ch) < 32 or 127 <= ord(ch) <= 159)
    )

    # Collapse excessive whitespace
    text = ' '.join(text.split())

    # Truncate if too long
    if len(text) > max_length:
        if max_length >= 3:
            text = text[:max_length-3] + "..."
        else:
            # No room for the ellipsis: a negative slice bound would keep
            # almost the whole string, so cut hard instead.
            text = text[:max(max_length, 0)]

    return text.strip()

def validate_file_path(file_path: str) -> bool:
    """
    Validate file path exists and is accessible.

    Args:
        file_path: Path to validate

    Returns:
        True if the path exists and is a regular file, False otherwise
        (including when the path cannot be interpreted at all)
    """
    try:
        # is_file() is already False for paths that do not exist
        return Path(file_path).is_file()
    except Exception:
        return False

def get_file_size_mb(file_path: str) -> float:
    """
    Get file size in MB.

    Args:
        file_path: Path to file

    Returns:
        File size in MB (bytes / 1024 / 1024), or 0.0 if the size cannot be read
    """
    try:
        size_bytes = Path(file_path).stat().st_size
        return size_bytes / (1024 * 1024)
    except Exception:
        return 0.0
def create_report_entry(file_path: str, file_type: str, old_metadata: dict,
                        new_metadata: dict, status: str) -> dict:
    """
    Build one row of the CSV processing report.

    Args:
        file_path: Path to file
        file_type: Type of file
        old_metadata: Metadata before the update
        new_metadata: Metadata after the update
        status: Processing status (success/failed)

    Returns:
        Dictionary with a timestamp, the file details, the old/new value of
        each metadata field ('N/A' when a field is missing) and the status
    """
    entry = {
        'timestamp': datetime.now().isoformat(),
        'file_path': file_path,
        'file_type': file_type,
    }

    # Old/new columns are emitted pairwise per field, in report column order.
    for field in ('title', 'subject', 'keywords'):
        entry[f'old_{field}'] = old_metadata.get(field, 'N/A')
        entry[f'new_{field}'] = new_metadata.get(field, 'N/A')

    entry['status'] = status
    return entry
solid transparent; + margin-bottom: -2px; + transition: all 0.2s; +} + +.admin-tab:hover { + color: #1f2937; +} + +.admin-tab.active { + color: var(--primary-gold-dark, #e6b007); + border-bottom-color: var(--primary-gold, #FFC407); +} + +.admin-panel { + background: white; + border-radius: 12px; + padding: 20px; + box-shadow: 0 2px 8px rgba(0,0,0,0.06); + border: 1px solid #e5e7eb; +} + +.panel-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 15px; +} + +.panel-header h3 { + margin: 0; + font-size: 18px; + color: #1f2937; +} + +.admin-table-container { + overflow-x: auto; +} + +.admin-table { + width: 100%; + border-collapse: collapse; + font-size: 13px; +} + +.admin-table th, +.admin-table td { + padding: 10px 12px; + text-align: left; + border-bottom: 1px solid #e5e7eb; +} + +.admin-table th { + background: #f9fafb; + font-weight: 600; + color: #374151; + white-space: nowrap; +} + +.admin-table tr:hover { + background: #f9fafb; +} + +.badge { + display: inline-block; + padding: 2px 8px; + border-radius: 10px; + font-size: 11px; + font-weight: 600; +} + +.badge-admin { + background: #fef3c7; + color: #92400e; +} + +.badge-user { + background: #dbeafe; + color: #1e40af; +} + +.badge-active { + background: #d1fae5; + color: #065f46; +} + +.badge-inactive { + background: #fee2e2; + color: #991b1b; +} + +.btn-sm { + padding: 6px 14px; + font-size: 12px; + border-radius: 6px; +} + +.btn-action { + padding: 4px 10px; + font-size: 11px; + border: 1px solid #d1d5db; + background: white; + border-radius: 4px; + cursor: pointer; + color: #374151; +} + +.btn-action:hover { + background: #f3f4f6; +} + +.btn-action.danger { + color: #dc2626; + border-color: #fca5a5; +} + +.btn-action.danger:hover { + background: #fef2f2; +} + +.audit-filters { + display: flex; + gap: 10px; + align-items: center; +} + +.audit-filters select { + padding: 6px 10px; + border: 1px solid #d1d5db; + border-radius: 6px; + font-size: 13px; +} + 
+.ai-stats-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); + gap: 12px; + margin-bottom: 20px; +} + +.ai-stat-card { + background: #f9fafb; + border-radius: 8px; + padding: 15px; + text-align: center; +} + +.ai-stat-value { + font-size: 22px; + font-weight: 600; + color: #1f2937; +} + +.ai-stat-label { + font-size: 12px; + color: #6b7280; + margin-top: 3px; +} diff --git a/static/css/app.css b/static/css/app.css new file mode 100644 index 0000000..254b82c --- /dev/null +++ b/static/css/app.css @@ -0,0 +1,811 @@ + /* ========== CSS VARIABLES ========== */ + :root { + /* Main colors */ + --primary-gold: #FFC407; + --primary-gold-dark: #e6b007; + --primary-gold-light: #ffcf33; + + /* Dark colors */ + --dark-primary: #2c2c2c; + --dark-secondary: #1a1a1a; + + /* Light colors */ + --white: #ffffff; + --light-bg: #fafafa; + --light-bg-gradient: #f8fafc; + + /* Text colors */ + --text-primary: #1f2937; + --text-secondary: #374151; + --text-muted: #6b7280; + + /* Status colors */ + --success-green: #4ade80; + --error-red: #ef4444; + + /* Opacity */ + --overlay-light: rgba(255, 255, 255, 0.95); + --overlay-dark: rgba(0, 0, 0, 0.5); + --border-light: rgba(255, 255, 255, 0.2); + --border-subtle: rgba(0, 0, 0, 0.05); + + /* Shadows */ + --shadow-sm: 0 2px 8px rgba(0, 0, 0, 0.1); + --shadow-md: 0 10px 25px rgba(0, 0, 0, 0.15); + --shadow-lg: 0 20px 40px rgba(0, 0, 0, 0.1); + + /* Radius */ + --radius-sm: 4px; + --radius-md: 12px; + --radius-lg: 18px; + --radius-xl: 20px; + + /* Spacing */ + --spacing-xs: 4px; + --spacing-sm: 8px; + --spacing-md: 12px; + --spacing-lg: 16px; + --spacing-xl: 20px; + --spacing-2xl: 25px; + + /* Fonts */ + --font-family: 'Montserrat', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; + + /* Transitions */ + --transition-fast: 0.15s ease; + --transition-normal: 0.3s ease; + --transition-slow: 0.5s ease; + } + + * { margin: 0; padding: 0; box-sizing: border-box; } + + body { + font-family: 
var(--font-family); + background: linear-gradient(135deg, var(--dark-primary) 0%, var(--dark-secondary) 100%); + min-height: 100vh; + padding: 20px; + } + .container { + max-width: 1200px; + margin: 0 auto; + background: var(--overlay-light); + backdrop-filter: blur(20px); + border-radius: var(--radius-xl); + box-shadow: var(--shadow-lg); + overflow: hidden; + border: 1px solid var(--border-light); + } + .header { + background: linear-gradient(135deg, var(--primary-gold) 0%, var(--primary-gold-dark) 100%); + color: var(--dark-secondary); + padding: 30px; + text-align: center; + position: relative; + } + .header::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: linear-gradient(45deg, transparent 30%, rgba(255,255,255,0.1) 50%, transparent 70%); + animation: shimmer 3s infinite; + pointer-events: none; + } + .header h1 { + font-size: 28px; + margin-bottom: 10px; + font-weight: 600; + position: relative; + z-index: 1; + } + .header p { + opacity: 0.9; + font-size: 14px; + position: relative; + z-index: 1; + } + @keyframes shimmer { + 0% { transform: translateX(-100%); } + 100% { transform: translateX(100%); } + } + .content { + padding: 40px; + background: linear-gradient(180deg, var(--light-bg) 0%, var(--light-bg-gradient) 100%); + } + + @keyframes slideIn { + from { + opacity: 0; + transform: translateY(20px); + } + to { + opacity: 1; + transform: translateY(0); + } + } + + @keyframes fadeIn { + from { opacity: 0; } + to { opacity: 1; } + } + + @keyframes pulse { + 0%, 100% { transform: scale(1); } + 50% { transform: scale(1.05); } + } + + .upload-section { + background: var(--white); + border-radius: var(--radius-md); + padding: 20px; + margin-bottom: 30px; + box-shadow: var(--shadow-sm); + } + + .upload-area { + border: 3px dashed var(--primary-gold); + border-radius: var(--radius-md); + padding: 60px 20px; + text-align: center; + cursor: pointer; + transition: all var(--transition-normal); + background: 
var(--light-bg); + margin-bottom: 20px; + } + .upload-area:hover { + background: #fffbf0; + border-color: var(--primary-gold-dark); + transform: translateY(-2px); + } + .upload-area.dragover { + background: #fff9e6; + transform: scale(1.02); + border-color: var(--primary-gold-dark); + } + + #fileInput { display: none; } + .upload-icon { font-size: 48px; margin-bottom: 15px; } + + .output-dir-section { + display: flex; + align-items: center; + gap: 15px; + margin-bottom: 20px; + padding: 15px; + background: white; + border-radius: 8px; + } + + .output-dir-section label { + font-weight: 600; + color: #495057; + min-width: 120px; + } + + #outputDir { + flex: 1; + padding: 10px; + border: 2px solid #dee2e6; + border-radius: var(--radius-sm); + font-size: 14px; + font-family: var(--font-family); + transition: border-color var(--transition-fast); + } + + #outputDir:focus { + outline: none; + border-color: var(--primary-gold); + } + + .output-dir-hint { + font-size: 12px; + color: #6c757d; + margin-top: 5px; + } + + .btn { + background: linear-gradient(135deg, var(--primary-gold), var(--primary-gold-dark)); + color: var(--dark-secondary); + border: none; + padding: 12px 30px; + border-radius: var(--radius-md); + cursor: pointer; + font-size: 16px; + font-weight: 600; + font-family: var(--font-family); + transition: all var(--transition-fast); + margin: 5px; + } + .btn:hover:not(:disabled) { + transform: translateY(-2px); + box-shadow: 0 4px 12px rgba(255, 196, 7, 0.4); + } + .btn:active:not(:disabled) { + transform: translateY(0); + } + .btn:disabled { + opacity: 0.5; + cursor: not-allowed; + transform: none; + } + + .btn-small { + padding: 8px 20px; + font-size: 14px; + } + + .progress-bar { + width: 100%; + height: 30px; + background: #e9ecef; + border-radius: 15px; + overflow: hidden; + margin: 20px 0; + display: none; + } + + .progress-fill { + height: 100%; + background: linear-gradient(135deg, var(--primary-gold), var(--primary-gold-dark)); + transition: width 
var(--transition-normal); + display: flex; + align-items: center; + justify-content: center; + color: var(--dark-secondary); + font-weight: 600; + font-size: 14px; + } + + .file-list { + margin-top: 30px; + display: none; + } + + .batch-toolbar { + background: var(--white); + border-radius: var(--radius-md); + padding: 15px; + margin-bottom: 20px; + display: flex; + justify-content: space-between; + align-items: center; + gap: 15px; + box-shadow: var(--shadow-sm); + } + + .batch-toolbar-left { + display: flex; + gap: 10px; + align-items: center; + } + + .batch-toolbar-right { + display: flex; + gap: 10px; + } + + .btn-toolbar { + background: #6c757d; + color: white; + border: none; + padding: 8px 16px; + border-radius: 20px; + cursor: pointer; + font-size: 13px; + font-weight: 600; + transition: transform 0.2s; + } + + .btn-toolbar:hover { + transform: translateY(-2px); + background: #5a6268; + } + + .btn-export { + background: linear-gradient(135deg, #28a745 0%, #20c997 100%); + } + + .btn-export:hover { + background: linear-gradient(135deg, #218838 0%, #1fa589 100%); + } + + .selection-count { + font-size: 13px; + color: #495057; + font-weight: 600; + } + + .file-item { + background: var(--white); + border-radius: var(--radius-md); + padding: 20px; + margin-bottom: 20px; + border-left: 4px solid var(--primary-gold); + box-shadow: var(--shadow-sm); + transition: all var(--transition-fast); + } + + .file-item:hover { + box-shadow: var(--shadow-md); + transform: translateX(2px); + } + + .file-item.selected { + background: #fffbf0; + border-left-color: var(--success-green); + } + + .file-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 15px; + } + + .file-header-left { + display: flex; + align-items: center; + gap: 12px; + } + + .file-checkbox { + width: 20px; + height: 20px; + cursor: pointer; + } + + .file-name { + font-weight: 600; + font-size: 16px; + color: #495057; + } + + .file-type { + background: 
linear-gradient(135deg, var(--primary-gold), var(--primary-gold-dark)); + color: var(--dark-secondary); + padding: 4px 12px; + border-radius: 12px; + font-size: 12px; + font-weight: 600; + } + + .metadata-comparison { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 15px; + } + + .metadata-box { + background: var(--light-bg); + border-radius: var(--radius-sm); + padding: 15px; + border: 1px solid var(--border-subtle); + } + + .metadata-box h4 { + color: var(--primary-gold-dark); + margin-bottom: 10px; + font-size: 14px; + font-weight: 600; + } + + .metadata-item { + display: flex; + flex-direction: column; + padding: 8px 0; + border-bottom: 1px solid #dee2e6; + } + + .metadata-item:last-child { border-bottom: none; } + .metadata-label { font-weight: 600; color: #495057; font-size: 12px; margin-bottom: 4px; } + .metadata-value { color: #6c757d; font-size: 13px; } + + .alert { + padding: 15px; + border-radius: 8px; + margin: 15px 0; + display: none; + } + .alert-error { background: #f8d7da; color: #721c24; border: 1px solid #f5c6cb; } + .alert-success { background: #d4edda; color: #155724; border: 1px solid #c3e6cb; } + .alert-info { background: #d1ecf1; color: #0c5460; border: 1px solid #bee5eb; } + + .actions { + text-align: center; + margin-top: 20px; + } + + .spinner { + border: 3px solid #f3f3f3; + border-top: 3px solid var(--primary-gold); + border-radius: 50%; + width: 40px; + height: 40px; + animation: spin 1s linear infinite; + margin: 20px auto; + display: none; + } + + @keyframes spin { + 0% { transform: rotate(0deg); } + 100% { transform: rotate(360deg); } + } + + .footer { + text-align: center; + padding: 20px; + color: #6c757d; + font-size: 12px; + border-top: 1px solid #dee2e6; + } + + /* Metadata Source Selector */ + .metadata-source-selector { + background: white; + border-radius: 8px; + padding: 15px; + margin-bottom: 20px; + display: flex; + align-items: center; + gap: 15px; + } + + .metadata-source-selector label { + font-weight: 600; + 
color: #495057; + min-width: 140px; + } + + .source-select { + flex: 1; + padding: 10px; + border: 2px solid var(--primary-gold); + border-radius: var(--radius-sm); + font-size: 14px; + font-family: var(--font-family); + cursor: pointer; + background: var(--white); + transition: border-color var(--transition-fast); + } + .source-select:focus { + outline: none; + border-color: var(--primary-gold-dark); + } + + .source-info { + font-size: 12px; + color: #6c757d; + margin-left: 10px; + } + + /* Editable Metadata Fields */ + .editable-field { + width: 100%; + padding: 8px; + border: 2px solid #dee2e6; + border-radius: 5px; + font-size: 13px; + font-family: inherit; + transition: border-color 0.3s; + } + + .editable-field:focus { + outline: none; + border-color: var(--primary-gold); + box-shadow: 0 0 0 3px rgba(255, 196, 7, 0.1); + } + + .editable-field.invalid { + border-color: #dc3545; + } + + textarea.editable-field { + min-height: 60px; + resize: vertical; + } + + .char-count { + font-size: 11px; + color: #6c757d; + margin-top: 4px; + display: block; + } + + .char-count.warning { + color: #ffc107; + } + + .char-count.danger { + color: #dc3545; + } + + .metadata-field { + margin-bottom: 15px; + } + + .metadata-field label { + display: block; + font-weight: 600; + color: #495057; + font-size: 12px; + margin-bottom: 5px; + } + + /* File Action Buttons */ + .file-actions { + display: flex; + gap: 10px; + margin-top: 15px; + } + + .btn-save { + background: linear-gradient(135deg, #28a745 0%, #20c997 100%); + color: white; + border: none; + padding: 8px 20px; + border-radius: 20px; + cursor: pointer; + font-size: 14px; + font-weight: 600; + transition: transform 0.2s; + } + + .btn-save:hover { + transform: translateY(-2px); + } + + .btn-save:disabled { + opacity: 0.5; + cursor: not-allowed; + transform: none; + } + + .btn-reset { + background: #6c757d; + color: white; + border: none; + padding: 8px 20px; + border-radius: 20px; + cursor: pointer; + font-size: 14px; + 
font-weight: 600; + transition: transform 0.2s; + } + + .btn-reset:hover { + transform: translateY(-2px); + background: #5a6268; + } + + /* Import Metadata Section */ + .import-section { + background: white; + border-radius: 8px; + padding: 15px; + margin-bottom: 15px; + border: 2px dashed #dee2e6; + } + + .import-section.active { + border-color: var(--success-green); + background: #f0fff4; + } + + .btn-import { + background: linear-gradient(135deg, #17a2b8 0%, #138496 100%); + color: white; + border: none; + padding: 8px 20px; + border-radius: 20px; + cursor: pointer; + font-size: 14px; + font-weight: 600; + transition: transform 0.2s; + } + + .btn-import:hover { + transform: translateY(-2px); + } + + .import-stats { + font-size: 12px; + color: #28a745; + margin-top: 10px; + padding: 8px; + background: white; + border-radius: 5px; + } + + /* Template Section */ + .template-section { + background: white; + border-radius: 8px; + padding: 15px; + margin-bottom: 15px; + border: 2px dashed #dee2e6; + } + + .template-section.active { + border-color: var(--primary-gold); + background: #fffbf0; + } + + .template-controls { + display: flex; + gap: 10px; + align-items: center; + flex-wrap: wrap; + } + + .template-select { + flex: 1; + min-width: 200px; + padding: 8px; + border: 2px solid var(--primary-gold); + border-radius: var(--radius-sm); + font-size: 13px; + font-family: var(--font-family); + cursor: pointer; + transition: border-color var(--transition-fast); + } + + .template-select:focus { + outline: none; + border-color: var(--primary-gold-dark); + } + + .btn-template { + background: linear-gradient(135deg, var(--primary-gold), var(--primary-gold-dark)); + color: var(--dark-secondary); + border: none; + padding: 8px 16px; + border-radius: var(--radius-md); + cursor: pointer; + font-size: 13px; + font-weight: 600; + font-family: var(--font-family); + transition: all var(--transition-fast); + } + + .btn-template:hover:not(:disabled) { + transform: translateY(-2px); + 
box-shadow: 0 4px 12px rgba(255, 196, 7, 0.3); + } + + .btn-template:disabled { + opacity: 0.5; + cursor: not-allowed; + transform: none; + } + + .template-preview { + margin-top: 10px; + padding: 10px; + background: white; + border-radius: 5px; + font-size: 12px; + color: #495057; + display: none; + } + + .template-preview-item { + margin-bottom: 5px; + } + + .template-preview-label { + font-weight: 600; + color: var(--primary-gold-dark); + } + + /* Modal Styles */ + .modal { + display: none; + position: fixed; + z-index: 1000; + left: 0; + top: 0; + width: 100%; + height: 100%; + background-color: rgba(0,0,0,0.5); + } + + .modal-content { + background-color: white; + margin: 5% auto; + padding: 30px; + border-radius: 15px; + width: 90%; + max-width: 600px; + box-shadow: 0 20px 60px rgba(0,0,0,0.3); + } + + .modal-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 20px; + } + + .modal-header h3 { + color: var(--primary-gold-dark); + margin: 0; + font-weight: 600; + } + + .close-modal { + font-size: 28px; + font-weight: bold; + color: #aaa; + cursor: pointer; + } + + .close-modal:hover { + color: #000; + } + + .form-group { + margin-bottom: 15px; + } + + .form-group label { + display: block; + font-weight: 600; + color: #495057; + margin-bottom: 5px; + font-size: 13px; + } + + .form-group input, + .form-group textarea { + width: 100%; + padding: 10px; + border: 2px solid #dee2e6; + border-radius: var(--radius-sm); + font-size: 13px; + font-family: var(--font-family); + transition: border-color var(--transition-fast); + } + + .form-group input:focus, + .form-group textarea:focus { + outline: none; + border-color: var(--primary-gold); + box-shadow: 0 0 0 3px rgba(255, 196, 7, 0.1); + } + + .form-group textarea { + min-height: 60px; + resize: vertical; + } + + .form-group small { + font-size: 11px; + color: #6c757d; + margin-top: 3px; + display: block; + } + + .variable-hint { + background: #fffbf0; + padding: 8px; + 
border-radius: var(--radius-sm); + font-size: 11px; + color: var(--primary-gold-dark); + margin-top: 5px; + border: 1px solid rgba(255, 196, 7, 0.2); + } + + @media (max-width: 768px) { + .metadata-comparison { + grid-template-columns: 1fr; + } + .metadata-source-selector { + flex-direction: column; + align-items: flex-start; + } + .metadata-source-selector label { + min-width: auto; + } + } diff --git a/static/js/admin.js b/static/js/admin.js new file mode 100644 index 0000000..1dcc740 --- /dev/null +++ b/static/js/admin.js @@ -0,0 +1,265 @@ +// Admin Dashboard JavaScript + +document.addEventListener('DOMContentLoaded', () => { + loadUsers(); +}); + +function switchTab(tab) { + document.querySelectorAll('.admin-tab').forEach(t => t.classList.remove('active')); + document.querySelectorAll('.admin-panel').forEach(p => p.style.display = 'none'); + + event.target.classList.add('active'); + + if (tab === 'users') { + document.getElementById('usersPanel').style.display = 'block'; + loadUsers(); + } else if (tab === 'audit') { + document.getElementById('auditPanel').style.display = 'block'; + loadAuditLog(); + } else if (tab === 'ai-usage') { + document.getElementById('aiUsagePanel').style.display = 'block'; + loadAiUsage(); + } +} + +// --- Users --- + +async function loadUsers() { + try { + const resp = await fetch(BASE_PATH + '/admin/users?include_inactive=true'); + const data = await resp.json(); + if (data.success) { + renderUsersTable(data.users); + populateAuditUserFilter(data.users); + } + } catch (err) { + console.error('Failed to load users:', err); + } +} + +function renderUsersTable(users) { + const tbody = document.getElementById('usersTableBody'); + if (!users.length) { + tbody.innerHTML = 'No users found'; + return; + } + tbody.innerHTML = users.map(u => ` + + ${u.id} + ${escapeHtml(u.username)} + ${escapeHtml(u.email || '-')} + ${u.role} + ${u.auth_method || 'local'} + ${u.last_login ? formatDate(u.last_login) : 'Never'} + ${u.is_active ? 
'Active' : 'Inactive'} + + ${u.is_active + ? `` + : `` + } + + + + `).join(''); +} + +async function toggleUser(userId, activate) { + try { + const resp = await fetch(`${BASE_PATH}/admin/users/${userId}`, { + method: 'PUT', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({is_active: activate ? 1 : 0}), + }); + const data = await resp.json(); + if (data.success) loadUsers(); + else alert(data.error || 'Failed to update user'); + } catch (err) { + alert('Error: ' + err.message); + } +} + +async function toggleRole(userId, currentRole) { + const newRole = currentRole === 'admin' ? 'user' : 'admin'; + if (!confirm(`Change user role to "${newRole}"?`)) return; + try { + const resp = await fetch(`${BASE_PATH}/admin/users/${userId}`, { + method: 'PUT', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({role: newRole}), + }); + const data = await resp.json(); + if (data.success) loadUsers(); + else alert(data.error || 'Failed to update role'); + } catch (err) { + alert('Error: ' + err.message); + } +} + +function showCreateUserModal() { + document.getElementById('createUserModal').style.display = 'flex'; +} + +function closeCreateUserModal() { + document.getElementById('createUserModal').style.display = 'none'; + document.getElementById('newUsername').value = ''; + document.getElementById('newEmail').value = ''; + document.getElementById('newFullName').value = ''; + document.getElementById('newPassword').value = ''; + document.getElementById('newRole').value = 'user'; + document.getElementById('newAuthMethod').value = 'local'; +} + +async function createUser() { + const username = document.getElementById('newUsername').value.trim(); + if (!username) { alert('Username is required'); return; } + + const payload = { + username, + email: document.getElementById('newEmail').value.trim(), + full_name: document.getElementById('newFullName').value.trim(), + password: document.getElementById('newPassword').value || null, + role: 
document.getElementById('newRole').value, + auth_method: document.getElementById('newAuthMethod').value, + }; + + try { + const resp = await fetch(BASE_PATH + '/admin/users', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify(payload), + }); + const data = await resp.json(); + if (data.success) { + closeCreateUserModal(); + loadUsers(); + } else { + alert(data.error || 'Failed to create user'); + } + } catch (err) { + alert('Error: ' + err.message); + } +} + +// --- Audit Log --- + +function populateAuditUserFilter(users) { + const select = document.getElementById('auditUserFilter'); + const currentVal = select.value; + select.innerHTML = ''; + users.forEach(u => { + select.innerHTML += ``; + }); + select.value = currentVal; +} + +async function loadAuditLog() { + const userId = document.getElementById('auditUserFilter').value; + let url = BASE_PATH + '/admin/audit?limit=200'; + if (userId) url += `&user_id=${userId}`; + + try { + const resp = await fetch(url); + const data = await resp.json(); + if (data.success) { + renderAuditTable(data.entries); + } + } catch (err) { + console.error('Failed to load audit log:', err); + } +} + +function renderAuditTable(entries) { + const tbody = document.getElementById('auditTableBody'); + if (!entries.length) { + tbody.innerHTML = 'No audit entries'; + return; + } + tbody.innerHTML = entries.map(e => ` + + ${formatDate(e.timestamp)} + ${escapeHtml(e.username || 'Unknown')} + ${escapeHtml(e.action)} + ${escapeHtml(e.details || '-')} + + `).join(''); +} + +// --- AI Usage --- + +async function loadAiUsage() { + try { + const resp = await fetch(BASE_PATH + '/admin/ai-usage'); + const data = await resp.json(); + if (data.success) { + renderAiStats(data.stats); + renderAiUsageTable(data.by_user); + } + } catch (err) { + console.error('Failed to load AI usage:', err); + } +} + +function renderAiStats(stats) { + const grid = document.getElementById('aiStatsGrid'); + grid.innerHTML = ` +
+
${stats.total_requests || 0}
+
Total Requests
+
+
+
${(stats.total_tokens || 0).toLocaleString()}
+
Total Tokens
+
+
+
${stats.requests_24h || 0}
+
Requests (24h)
+
+
+
${(stats.tokens_24h || 0).toLocaleString()}
+
Tokens (24h)
+
+
+
${stats.requests_7d || 0}
+
Requests (7d)
+
+
+
${(stats.tokens_7d || 0).toLocaleString()}
+
Tokens (7d)
+
+ `; +} + +function renderAiUsageTable(byUser) { + const tbody = document.getElementById('aiUsageTableBody'); + if (!byUser.length) { + tbody.innerHTML = 'No AI usage data'; + return; + } + tbody.innerHTML = byUser.map(u => ` + + ${escapeHtml(u.username)} + ${u.request_count} + ${(u.total_tokens || 0).toLocaleString()} + ${u.last_used ? formatDate(u.last_used) : '-'} + + `).join(''); +} + +// --- Helpers --- + +function escapeHtml(str) { + if (!str) return ''; + const div = document.createElement('div'); + div.textContent = str; + return div.innerHTML; +} + +function formatDate(dateStr) { + if (!dateStr) return '-'; + try { + const d = new Date(dateStr); + return d.toLocaleString(); + } catch { + return dateStr; + } +} diff --git a/static/js/app.js b/static/js/app.js new file mode 100644 index 0000000..dbcde96 --- /dev/null +++ b/static/js/app.js @@ -0,0 +1,1488 @@ + let currentFiles = []; + let sessionId = null; + let importSessionId = null; + let selectedFiles = new Set(); + + const uploadArea = document.getElementById('uploadArea'); + const fileInput = document.getElementById('fileInput'); + const spinner = document.getElementById('spinner'); + const progressBar = document.getElementById('progressBar'); + const progressFill = document.getElementById('progressFill'); + const fileList = document.getElementById('fileList'); + const actions = document.getElementById('actions'); + const errorAlert = document.getElementById('errorAlert'); + const successAlert = document.getElementById('successAlert'); + const infoAlert = document.getElementById('infoAlert'); + + // Click to upload + uploadArea.addEventListener('click', () => fileInput.click()); + + // File selection + fileInput.addEventListener('change', handleFileSelect); + + // Drag and drop + uploadArea.addEventListener('dragover', (e) => { + e.preventDefault(); + uploadArea.classList.add('dragover'); + }); + + uploadArea.addEventListener('dragleave', () => { + uploadArea.classList.remove('dragover'); + }); + + 
uploadArea.addEventListener('drop', (e) => { + e.preventDefault(); + uploadArea.classList.remove('dragover'); + const files = e.dataTransfer.files; + if (files.length > 0) { + handleFiles(files); + } + }); + + // Import file input + const importFileInput = document.getElementById('importFileInput'); + importFileInput.addEventListener('change', handleImportFile); + + function handleSourceChange() { + const source = document.getElementById('metadataSource').value; + const importSection = document.getElementById('importSection'); + + // Show/hide import section + if (source === 'import') { + importSection.style.display = 'block'; + } else { + importSection.style.display = 'none'; + } + } + + let currentImportData = null; + + async function handleImportFile(e) { + const file = e.target.files[0]; + if (!file) return; + + hideAlerts(); + showInfo(`Uploading import file: ${file.name}...`); + + const formData = new FormData(); + formData.append('import_file', file); + + try { + const response = await fetch(BASE_PATH + '/import-metadata', { + method: 'POST', + body: formData + }); + + const data = await response.json(); + + if (data.error) { + showError(data.error); + return; + } + + // Store import data and show mapping modal + currentImportData = data; + showImportMappingModal(data); + + } catch (error) { + showError(`Import upload failed: ${error.message}`); + } + } + + function showImportMappingModal(data) { + const modal = document.getElementById('importMappingModal'); + const content = document.getElementById('importMappingContent'); + + // Build sheet selector for Excel files + let sheetsHTML = ''; + if (data.sheets && data.sheets.length > 0) { + sheetsHTML = '
'; + sheetsHTML += ''; + sheetsHTML += '
'; + } + + content.innerHTML = ` +
+ +
+ + ${sheetsHTML} + +
+ +
+
+ + + Column containing filenames (with or without extension) +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+
+
+ +
+ +
+ ${buildImportPreviewTable(data)} +
+
+ +
+ + +
+ `; + + // Auto-select likely columns + autoSelectImportColumns(data.columns); + + modal.style.display = 'flex'; + } + + async function updateImportSheetPreview() { + const sheetName = document.getElementById('importSheetSelect').value; + if (!currentImportData || !sheetName) return; + + try { + showInfo('Loading sheet preview...'); + + const response = await fetch(BASE_PATH + '/preview-excel-sheet', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({ + excel_session_id: currentImportData.import_session_id, + sheet_name: sheetName + }) + }); + + const data = await response.json(); + + if (data.error) { + showError(data.error); + return; + } + + // Update column dropdowns + const columns = data.columns; + ['importFilenameColumn', 'importTitleColumn', 'importSubjectColumn', 'importKeywordsColumn'].forEach(id => { + const select = document.getElementById(id); + const currentValue = select.value; + select.innerHTML = id === 'importFilenameColumn' + ? 
'' + : ''; + columns.forEach(col => { + select.innerHTML += ``; + }); + // Try to restore selection + if (columns.includes(currentValue)) { + select.value = currentValue; + } + }); + + // Update preview table + document.getElementById('importPreviewTable').innerHTML = buildImportPreviewTable(data); + + // Auto-select columns again + autoSelectImportColumns(columns); + + hideAlerts(); + + } catch (error) { + showError(`Failed to load sheet preview: ${error.message}`); + } + } + + function autoSelectImportColumns(columns) { + // Try to auto-detect filename column + const filenameCandidates = ['filename', 'file name', 'file', 'name', 'path']; + for (const col of columns) { + if (filenameCandidates.some(c => col.toLowerCase().includes(c))) { + document.getElementById('importFilenameColumn').value = col; + break; + } + } + + // Try to auto-detect title column + const titleCandidates = ['title', 'heading', 'name']; + for (const col of columns) { + if (titleCandidates.some(c => col.toLowerCase() === c)) { + document.getElementById('importTitleColumn').value = col; + break; + } + } + + // Try to auto-detect subject column + const subjectCandidates = ['description', 'desc', 'subject', 'summary']; + for (const col of columns) { + if (subjectCandidates.some(c => col.toLowerCase().includes(c))) { + document.getElementById('importSubjectColumn').value = col; + break; + } + } + + // Try to auto-detect keywords column + const keywordsCandidates = ['keywords', 'keyword', 'tags', 'tag']; + for (const col of columns) { + if (keywordsCandidates.some(c => col.toLowerCase().includes(c))) { + document.getElementById('importKeywordsColumn').value = col; + break; + } + } + } + + function buildImportPreviewTable(data) { + if (!data || !data.sample_data || data.sample_data.length === 0) { + return '

No preview data available

'; + } + + let html = ''; + + // Header + html += ''; + data.columns.forEach(col => { + html += ``; + }); + html += ''; + + // Rows + html += ''; + data.sample_data.slice(0, 3).forEach((row, idx) => { + html += ``; + data.columns.forEach(col => { + const value = row[col] || ''; + html += ``; + }); + html += ''; + }); + html += '
${col}
${value}
'; + + return html; + } + + async function confirmImportMapping() { + const filenameColumn = document.getElementById('importFilenameColumn').value; + const titleColumn = document.getElementById('importTitleColumn').value; + const subjectColumn = document.getElementById('importSubjectColumn').value; + const keywordsColumn = document.getElementById('importKeywordsColumn').value; + + if (!filenameColumn) { + showError('Please select a filename column'); + return; + } + + try { + showInfo('Configuring import mapping...'); + + const response = await fetch(BASE_PATH + '/configure-import-mapping', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({ + import_session_id: currentImportData.import_session_id, + column_mapping: { + filename: filenameColumn, + title: titleColumn, + subject: subjectColumn, + keywords: keywordsColumn + } + }) + }); + + const data = await response.json(); + + if (data.error) { + showError(data.error); + return; + } + + // Store import session ID + importSessionId = data.import_session_id; + + // Display stats + const importStats = document.getElementById('importStats'); + const stats = data.stats; + importStats.innerHTML = ` + ✅ ${data.message}
+ + Title: ${stats.with_title}/${stats.total_records} • + Subject: ${stats.with_subject}/${stats.total_records} • + Keywords: ${stats.with_keywords}/${stats.total_records} + + `; + importStats.style.display = 'block'; + + // Mark import section as active + document.getElementById('importSection').classList.add('active'); + + closeImportMappingModal(); + showSuccess(`✅ ${data.message}`); + + } catch (error) { + showError(`Import configuration failed: ${error.message}`); + } + } + + function closeImportMappingModal() { + const modal = document.getElementById('importMappingModal'); + modal.style.display = 'none'; + currentImportData = null; + } + + function handleFileSelect(e) { + const files = e.target.files; + if (files.length > 0) { + handleFiles(files); + } + } + + async function handleFiles(files) { + hideAlerts(); + showSpinner(); + showProgress(0); + fileList.style.display = 'none'; + actions.style.display = 'none'; + currentFiles = []; + + const metadataSource = document.getElementById('metadataSource').value; + + // Validate metadata sources + if (metadataSource === 'import' && !importSessionId) { + showError('Please import a metadata file first using the "Choose Import File" button'); + hideSpinner(); + return; + } + + // Show specific message for AI processing + if (metadataSource === 'ai') { + showInfo(`🤖 Generating AI metadata for ${files.length} file(s)... 
This may take 10-30 seconds per file.`); + // Start animated progress for AI + startProgressAnimation(); + } else { + showInfo(`Processing ${files.length} file(s) with ${metadataSource} source...`); + } + + const formData = new FormData(); + formData.append('metadata_source', metadataSource); + if (importSessionId) { + formData.append('import_session_id', importSessionId); + } + for (let file of files) { + formData.append('files', file); + } + + try { + const response = await fetch(BASE_PATH + '/upload', { + method: 'POST', + body: formData + }); + + const data = await response.json(); + hideSpinner(); + hideProgress(); + stopProgressAnimation(); + + if (data.error) { + showError(data.error); + return; + } + + sessionId = data.session_id; + currentFiles = data.files; + displayFiles(data.files); + showSuccess(`Successfully analyzed ${data.files.length} file(s)!`); + actions.style.display = 'block'; + + } catch (error) { + hideSpinner(); + hideProgress(); + stopProgressAnimation(); + showError('Error processing files: ' + error.message); + } + } + + function displayFiles(files) { + const batchToolbar = ` +
+
+ + + 0 selected +
+
+ +
+
+ `; + + fileList.innerHTML = batchToolbar; + fileList.style.display = 'block'; + + // Reset selected files + selectedFiles.clear(); + + files.forEach((file, index) => { + if (file.error) { + const errorItem = document.createElement('div'); + errorItem.className = 'file-item'; + errorItem.style.borderLeftColor = '#dc3545'; + errorItem.innerHTML = ` +
+
❌ ${file.filename}
+
+
${file.error}
+ `; + fileList.appendChild(errorItem); + return; + } + + const fileItem = document.createElement('div'); + fileItem.className = 'file-item'; + fileItem.id = `file-${index}`; + + // Build AI info section if available + let aiInfoHtml = ''; + if (file.suggested_metadata._tokens_used) { + aiInfoHtml = `
+ ✓ AI generated (${file.suggested_metadata._tokens_used} tokens used) +
`; + } + if (file.suggested_metadata._ai_error) { + aiInfoHtml = `
+ ⚠️ AI Error: ${file.suggested_metadata._ai_error} +
`; + } + + fileItem.innerHTML = ` +
+
+ +
📄 ${file.filename}
+
+
${file.file_type}
+
+ + + +
+ +
+ `; + + fileList.appendChild(fileItem); + + // Initialize character counters + initCharCounters(index); + + // Select by default + selectedFiles.add(index); + fileItem.classList.add('selected'); + }); + + updateSelectionCount(); + } + + function toggleFileSelection(index) { + const checkbox = document.getElementById(`checkbox-${index}`); + const fileItem = document.getElementById(`file-${index}`); + + if (checkbox.checked) { + selectedFiles.add(index); + fileItem.classList.add('selected'); + } else { + selectedFiles.delete(index); + fileItem.classList.remove('selected'); + } + + updateSelectionCount(); + } + + function selectAllFiles() { + selectedFiles.clear(); + currentFiles.forEach((file, index) => { + if (!file.error) { + selectedFiles.add(index); + const checkbox = document.getElementById(`checkbox-${index}`); + const fileItem = document.getElementById(`file-${index}`); + if (checkbox) checkbox.checked = true; + if (fileItem) fileItem.classList.add('selected'); + } + }); + updateSelectionCount(); + } + + function deselectAllFiles() { + selectedFiles.clear(); + currentFiles.forEach((file, index) => { + if (!file.error) { + const checkbox = document.getElementById(`checkbox-${index}`); + const fileItem = document.getElementById(`file-${index}`); + if (checkbox) checkbox.checked = false; + if (fileItem) fileItem.classList.remove('selected'); + } + }); + updateSelectionCount(); + } + + function updateSelectionCount() { + const countElement = document.getElementById('selectionCount'); + if (countElement) { + countElement.textContent = `${selectedFiles.size} selected`; + } + + // Update download button text if it exists + const downloadBtn = document.getElementById('download-selected-btn'); + if (downloadBtn) { + downloadBtn.innerHTML = `📦 Download Selected Files (${selectedFiles.size}) as ZIP`; + } + } + + function displayMetadata(metadata) { + if (!metadata || Object.keys(metadata).length === 0) { + return '

(empty)

'; + } + + let html = ''; + for (const [key, value] of Object.entries(metadata)) { + html += ` + + `; + } + return html; + } + + function displayEditableMetadata(metadata, index) { + // Filter out internal fields (starting with _) + const title = metadata?.title || ''; + const subject = metadata?.subject || ''; + const keywords = metadata?.keywords || ''; + const author = metadata?.author || metadata?.creator || ''; + const copyright = metadata?.copyright || ''; + const comments = metadata?.comments || ''; + + return ` + + + + + + +
+ +
+ + `; + } + + function escapeHtml(text) { + const div = document.createElement('div'); + div.textContent = text; + return div.innerHTML; + } + + function initCharCounters(index) { + const fields = ['title', 'subject', 'keywords', 'author', 'copyright', 'comments']; + const limits = { + title: 200, + subject: 300, + keywords: 500, + author: 100, + copyright: 150, + comments: 500 + }; + + fields.forEach(field => { + const input = document.getElementById(`${field}-${index}`); + const counter = document.getElementById(`${field}-count-${index}`); + + if (input && counter) { + // Initial count + updateCharCount(input, counter, limits[field]); + + // Listen for changes + input.addEventListener('input', () => { + updateCharCount(input, counter, limits[field]); + }); + } + }); + } + + function updateCharCount(input, counter, limit) { + const length = input.value.length; + counter.textContent = `${length}/${limit}`; + + // Remove all classes first + counter.classList.remove('warning', 'danger'); + input.classList.remove('invalid'); + + // Add warning/danger classes + if (length >= limit) { + counter.classList.add('danger'); + input.classList.add('invalid'); + } else if (length >= limit * 0.9) { + counter.classList.add('warning'); + } + } + + async function saveMetadata(index) { + const file = currentFiles[index]; + if (!file || file.error) return; + + const saveBtn = document.getElementById(`saveBtn-${index}`); + saveBtn.disabled = true; + saveBtn.textContent = '💾 Saving...'; + + // Get edited metadata + const title = document.getElementById(`title-${index}`).value.trim(); + const subject = document.getElementById(`subject-${index}`).value.trim(); + const keywords = document.getElementById(`keywords-${index}`).value.trim(); + const author = document.getElementById(`author-${index}`).value.trim(); + const copyright = document.getElementById(`copyright-${index}`).value.trim(); + const comments = document.getElementById(`comments-${index}`).value.trim(); + + // Get custom 
fields
+        const customFields = getCustomFields(index);
+
+        try {
+            const response = await fetch(BASE_PATH + '/update-manual', {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({
+                    session_id: sessionId,
+                    file_index: index,
+                    title: title,
+                    subject: subject,
+                    keywords: keywords,
+                    author: author,
+                    copyright: copyright,
+                    comments: comments,
+                    custom_fields: customFields
+                })
+            });
+
+            const data = await response.json();
+
+            if (data.error) {
+                showError(`Failed to update ${file.filename}: ${data.error}`);
+                saveBtn.textContent = '💾 Save Changes';
+                saveBtn.disabled = false;
+                return;
+            }
+
+            // Update the file in currentFiles
+            currentFiles[index].suggested_metadata = {
+                title, subject, keywords, author, copyright, comments
+            };
+
+            // Show success indicator
+            const fileItem = document.getElementById(`file-${index}`);
+            if (fileItem) {
+                fileItem.style.borderLeftColor = '#28a745';
+
+                // Remove old success message if exists
+                const oldSuccess = fileItem.querySelector('.save-success');
+                if (oldSuccess) oldSuccess.remove();
+
+                // Add success message
+                const successDiv = document.createElement('div');
+                successDiv.className = 'alert alert-success save-success';
+                successDiv.style.display = 'block';
+                successDiv.style.marginTop = '10px';
+                successDiv.textContent = `✅ Metadata saved successfully!`;
+                fileItem.appendChild(successDiv);
+
+                // Remove success message after 3 seconds
+                setTimeout(() => {
+                    successDiv.remove();
+                    fileItem.style.borderLeftColor = '#667eea';
+                }, 3000);
+            }
+
+            saveBtn.textContent = '✅ Saved!';
+            setTimeout(() => {
+                saveBtn.textContent = '💾 Save Changes';
+                saveBtn.disabled = false;
+            }, 2000);
+
+        } catch (error) {
+            showError(`Error saving metadata: ${error.message}`);
+            saveBtn.textContent = '💾 Save Changes';
+            saveBtn.disabled = false;
+        }
+    }
+
+
+    /**
+     * Write the stored metadata into every selected file, one request per file.
+     *
+     * Each selected entry of `currentFiles` is POSTed to `/update`. While the
+     * loop runs the progress bar is advanced; every file row is annotated with
+     * the outcome (and a download link on success), and a summary alert is
+     * shown at the end. Entries that already carry an `error` are counted as
+     * failures without contacting the server.
+     *
+     * @returns {Promise<void>}
+     */
+    async function updateAllFiles() {
+        if (selectedFiles.size === 0) {
+            showError('Please select at least one file to update');
+            return;
+        }
+
+        const outputDirEl = document.getElementById('outputDir');
+        const outputDir = outputDirEl ? outputDirEl.value.trim() : '';
+        const updateBtn = document.getElementById('updateAllBtn');
+        updateBtn.disabled = true;
+        hideAlerts();
+        showProgress(0);
+
+        showInfo(`Updating ${selectedFiles.size} selected file(s)...`);
+
+        let successCount = 0;
+        let errorCount = 0;
+
+        const selectedIndices = Array.from(selectedFiles);
+
+        // Attach a result banner to a file row, replacing the banner of an
+        // earlier run instead of stacking a new one underneath it.
+        const showFileResult = (fileIndex, borderColor, resultDiv) => {
+            const fileItem = document.getElementById(`file-${fileIndex}`);
+            if (!fileItem) return;
+            fileItem.querySelectorAll('.update-result').forEach(el => el.remove());
+            fileItem.style.borderLeftColor = borderColor;
+            resultDiv.classList.add('update-result');
+            fileItem.appendChild(resultDiv);
+        };
+
+        const showFileError = (fileIndex, message) => {
+            const errorDiv = document.createElement('div');
+            errorDiv.className = 'alert alert-error';
+            errorDiv.style.display = 'block';
+            errorDiv.textContent = `Error: ${message}`;
+            showFileResult(fileIndex, '#dc3545', errorDiv);
+        };
+
+        try {
+            for (let idx = 0; idx < selectedIndices.length; idx++) {
+                const i = selectedIndices[idx];
+                const file = currentFiles[i];
+
+                // Advance the bar for skipped entries too, so it always reaches 100%.
+                showProgress(((idx + 1) / selectedIndices.length) * 100);
+
+                if (!file || file.error) {
+                    errorCount++;
+                    continue;
+                }
+
+                try {
+                    const response = await fetch(BASE_PATH + '/update', {
+                        method: 'POST',
+                        headers: { 'Content-Type': 'application/json' },
+                        body: JSON.stringify({
+                            filepath: file.filepath,
+                            session_id: sessionId,
+                            file_index: i,
+                            output_dir: outputDir
+                        })
+                    });
+
+                    // An error page from a proxy is not JSON; fall back to the status code.
+                    const data = await response.json().catch(() => ({}));
+
+                    // Framework-level errors arrive as a non-2xx reply with a
+                    // `detail` key rather than `error`; they must not count as success.
+                    let failure = data.error;
+                    if (!failure && !response.ok) {
+                        failure = typeof data.detail === 'string'
+                            ? data.detail
+                            : `HTTP ${response.status}`;
+                    }
+
+                    if (failure) {
+                        errorCount++;
+                        showFileError(i, failure);
+                        continue;
+                    }
+
+                    successCount++;
+
+                    // Create success message with download button
+                    const successDiv = document.createElement('div');
+                    successDiv.className = 'alert alert-success';
+                    successDiv.style.display = 'flex';
+                    successDiv.style.alignItems = 'center';
+                    successDiv.style.justifyContent = 'space-between';
+                    successDiv.style.gap = '15px';
+
+                    const messageSpan = document.createElement('span');
+                    messageSpan.textContent = data.verified
+                        ? `✅ Updated and verified!`
+                        : `✅ Updated!`;
+
+                    const downloadBtn = document.createElement('a');
+                    // Encode the name: '#', '?', '%' or spaces would otherwise change the URL.
+                    downloadBtn.href = `${BASE_PATH}/download/${encodeURIComponent(file.filename)}`;
+                    downloadBtn.className = 'btn';
+                    downloadBtn.style.padding = '8px 16px';
+                    downloadBtn.style.fontSize = '14px';
+                    downloadBtn.style.backgroundColor = '#28a745';
+                    downloadBtn.style.color = 'white';
+                    downloadBtn.style.textDecoration = 'none';
+                    downloadBtn.style.borderRadius = '4px';
+                    downloadBtn.style.display = 'inline-block';
+                    downloadBtn.download = file.filename;
+                    downloadBtn.textContent = '⬇️ Download';
+
+                    successDiv.appendChild(messageSpan);
+                    successDiv.appendChild(downloadBtn);
+                    showFileResult(i, '#28a745', successDiv);
+                } catch (error) {
+                    errorCount++;
+                    console.error('Error updating file:', error);
+                    showFileError(i, error.message);
+                }
+            }
+        } finally {
+            // Never leave the page with a stuck progress bar or a dead button.
+            hideProgress();
+            updateBtn.disabled = false;
+        }
+
+        // Show final message and Download All button
+        if (successCount > 0 && errorCount === 0) {
+            showSuccess(`✅ All ${successCount} file(s) updated successfully!`);
+            showDownloadAllButton();
+        } else if (successCount > 0 && errorCount > 0) {
+            showInfo(`⚠️ Updated ${successCount} file(s), ${errorCount} failed.`);
+            showDownloadAllButton();
+        } else {
+            showError(`❌ Failed to update files. Check individual file errors above.`);
+        }
+    }
+
+    /**
+     * Show (or refresh) the "Download Selected Files" panel after the file list.
+     *
+     * A panel left over from a previous run is removed first, so the count in
+     * the button label always reflects the current selection.
+     */
+    function showDownloadAllButton() {
+        // Remove existing Download All button if present
+        const existingBtn = document.getElementById('download-all-btn');
+        if (existingBtn) {
+            existingBtn.remove();
+        }
+
+        // Create Download All button container
+        const btnContainer = document.createElement('div');
+        btnContainer.id = 'download-all-btn';
+        btnContainer.style.marginTop = '30px';
+        btnContainer.style.marginBottom = '20px';
+        btnContainer.style.textAlign = 'center';
+        btnContainer.style.padding = '20px';
+        btnContainer.style.backgroundColor = '#f8f9fa';
+        btnContainer.style.borderRadius = '8px';
+        btnContainer.style.border = '2px solid #007bff';
+
+        const downloadAllBtn = document.createElement('button');
+        downloadAllBtn.id = 'download-selected-btn';
+        downloadAllBtn.className = 'btn';
+        downloadAllBtn.style.padding = '15px 30px';
+        downloadAllBtn.style.fontSize = '18px';
+        downloadAllBtn.style.fontWeight = 'bold';
+        downloadAllBtn.style.backgroundColor = '#007bff';
+        downloadAllBtn.style.color = 'white';
+        downloadAllBtn.style.border = 'none';
+        downloadAllBtn.style.borderRadius = '8px';
+        downloadAllBtn.style.cursor = 'pointer';
+        downloadAllBtn.style.boxShadow = '0 2px 4px rgba(0,123,255,0.3)';
+        // The label is plain text, so there is no reason to go through the HTML parser.
+        downloadAllBtn.textContent = `📦 Download Selected Files (${selectedFiles.size}) as ZIP`;
+        downloadAllBtn.onmouseover = function() { this.style.backgroundColor = '#0056b3'; };
+        downloadAllBtn.onmouseout = function() { this.style.backgroundColor = '#007bff'; };
+        downloadAllBtn.onclick = downloadSelectedFiles;
+
+        btnContainer.appendChild(downloadAllBtn);
+
+        // Insert after the file list
+        const fileList = document.getElementById('fileList');
+        if (fileList && fileList.parentNode) {
+            fileList.parentNode.insertBefore(btnContainer, fileList.nextSibling);
+        }
+    }
+
+    async function downloadSelectedFiles() {
+        if (!sessionId) {
+            showError('No active session');
+            return;
+        }
+
+        if (selectedFiles.size === 0) {
+
showError('Please select at least one file to download');
+            return;
+        }
+
+        const selectedIndices = Array.from(selectedFiles);
+
+        try {
+            showInfo('📦 Preparing ZIP archive for download...');
+
+            // Send POST request with selected file indices
+            const response = await fetch(BASE_PATH + '/download-selected', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json',
+                },
+                body: JSON.stringify({
+                    session_id: sessionId,
+                    file_indices: selectedIndices
+                })
+            });
+
+            if (!response.ok) {
+                // The error body may not be JSON (e.g. a proxy error page); do not
+                // let the parse failure hide the real problem.
+                const error = await response.json().catch(() => ({}));
+                const detail = typeof error.detail === 'string' ? error.detail : '';
+                throw new Error(error.error || detail || 'Failed to create ZIP archive');
+            }
+
+            // Get the blob from response
+            const blob = await response.blob();
+
+            // Create download link
+            const url = window.URL.createObjectURL(blob);
+            const link = document.createElement('a');
+            link.href = url;
+            link.download = `oliver_metadata_files_${new Date().getTime()}.zip`;
+            document.body.appendChild(link);
+            link.click();
+            document.body.removeChild(link);
+            // Revoke on a later tick: revoking synchronously can cancel the
+            // download before the browser has started reading the blob.
+            setTimeout(() => window.URL.revokeObjectURL(url), 0);
+
+            showSuccess(`✅ Downloaded ${selectedIndices.length} file(s) as ZIP archive`);
+        } catch (error) {
+            showError(`Error: ${error.message}`);
+        }
+    }
+
+    /**
+     * Export the current file list with its suggested metadata as a CSV download.
+     *
+     * One row is written per entry of `currentFiles`: filename, title, subject,
+     * keywords and a status column. The status is "Selected"/"Not selected",
+     * or the error text for entries that carry an `error`.
+     *
+     * @returns {Promise<void>}
+     */
+    async function exportResults() {
+        if (currentFiles.length === 0) {
+            showError('No files to export');
+            return;
+        }
+
+        // Quote every field and double embedded quotes (RFC 4180). This is applied
+        // to error rows as well: a quote in a filename or error message would
+        // otherwise break the row.
+        const escapeCsv = (value) => `"${String(value).replace(/"/g, '""')}"`;
+
+        const lines = ['Filename,Title,Subject,Keywords,Status'];
+
+        currentFiles.forEach((file, index) => {
+            let row;
+            if (file.error) {
+                row = [file.filename, '', '', '', `Error: ${file.error}`];
+            } else {
+                row = [
+                    file.filename,
+                    file.suggested_metadata?.title || '',
+                    file.suggested_metadata?.subject || '',
+                    file.suggested_metadata?.keywords || '',
+                    selectedFiles.has(index) ? 'Selected' : 'Not selected'
+                ];
+            }
+            lines.push(row.map(escapeCsv).join(','));
+        });
+
+        const csvContent = lines.join('\n').concat('\n');
+
+        // Create download
+        const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' });
+        const link = document.createElement('a');
+        const url = URL.createObjectURL(blob);
+
+        const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
+        link.setAttribute('href', url);
+        link.setAttribute('download', `oliver_metadata_export_${timestamp}.csv`);
+        link.style.visibility = 'hidden';
+
+        document.body.appendChild(link);
+        link.click();
+        document.body.removeChild(link);
+        // Release the blob once the download has been handed to the browser.
+        setTimeout(() => URL.revokeObjectURL(url), 0);
+
+        showSuccess('✅ Results exported to CSV');
+    }
+
+    /** Return the page to its initial state: no files, no session, no alerts. */
+    function resetForm() {
+        fileInput.value = '';
+        fileList.style.display = 'none';
+        actions.style.display = 'none';
+        hideAlerts();
+        hideProgress();
+        currentFiles = [];
+        sessionId = null;
+        selectedFiles.clear();
+    }
+
+    /** Make the `spinner` element visible. */
+    function showSpinner() {
+        spinner.style.display = 'block';
+    }
+
+    /** Hide the `spinner` element. */
+    function hideSpinner() {
+        spinner.style.display = 'none';
+    }
+
+    /**
+     * Show the progress bar at the given fill level.
+     *
+     * @param {number} percent Fill level; used as-is for the width and rounded for the label.
+     */
+    function showProgress(percent) {
+        progressBar.style.display = 'block';
+        progressFill.style.width = percent + '%';
+        progressFill.textContent = Math.round(percent) + '%';
+    }
+
+    /** Hide the progress bar. */
+    function hideProgress() {
+        progressBar.style.display = 'none';
+    }
+
+    // Animated progress for long-running operations like AI generation
+    let progressAnimationInterval = null;
+    let animatedProgress = 0;
+
+    /**
+     * Start a simulated progress animation that advances every 500 ms and
+     * slows down in stages, stopping short of completion at 90%.
+     */
+    function startProgressAnimation() {
+        // A second start must not leave the previous timer running: it could
+        // no longer be cleared and would keep driving the bar.
+        if (progressAnimationInterval) {
+            clearInterval(progressAnimationInterval);
+        }
+
+        animatedProgress = 0;
+        showProgress(0);
+
+        progressAnimationInterval = setInterval(() => {
+            // Slow down as we approach 90%
+            if (animatedProgress < 30) {
+                animatedProgress += 2;
+            } else if (animatedProgress < 60) {
+                animatedProgress += 1;
+            } else if (animatedProgress < 90) {
+                animatedProgress += 0.5;
+            }
+
+            showProgress(animatedProgress);
+        }, 500);
+    }
+
+    function
stopProgressAnimation() { + if (progressAnimationInterval) { + clearInterval(progressAnimationInterval); + progressAnimationInterval = null; + } + animatedProgress = 0; + } + + function showError(message) { + errorAlert.textContent = message; + errorAlert.style.display = 'block'; + } + + function showSuccess(message) { + successAlert.textContent = message; + successAlert.style.display = 'block'; + } + + function showInfo(message) { + infoAlert.textContent = message; + infoAlert.style.display = 'block'; + } + + function hideAlerts() { + errorAlert.style.display = 'none'; + successAlert.style.display = 'none'; + infoAlert.style.display = 'none'; + } + + // Custom Fields Management + let customFieldCounters = {}; // Track number of custom fields per file + + function addCustomField(index) { + if (!customFieldCounters[index]) { + customFieldCounters[index] = 0; + } + + const fieldId = customFieldCounters[index]++; + const container = document.getElementById(`custom-fields-${index}`); + + const customFieldDiv = document.createElement('div'); + customFieldDiv.className = 'metadata-field'; + customFieldDiv.id = `custom-field-container-${index}-${fieldId}`; + customFieldDiv.style.border = '1px dashed #17a2b8'; + customFieldDiv.style.padding = '10px'; + customFieldDiv.style.borderRadius = '5px'; + customFieldDiv.style.marginTop = '10px'; + customFieldDiv.style.background = '#f0f9ff'; + + customFieldDiv.innerHTML = ` +
+
+ + + + + + 0/200 +
+ +
+ `; + + container.appendChild(customFieldDiv); + + // Initialize character counter for value field + const valueInput = document.getElementById(`custom-field-value-${index}-${fieldId}`); + const counter = document.getElementById(`custom-field-count-${index}-${fieldId}`); + if (valueInput && counter) { + valueInput.addEventListener('input', () => { + updateCharCount(valueInput, counter, 200); + }); + updateCharCount(valueInput, counter, 200); + } + } + + function removeCustomField(index, fieldId) { + const container = document.getElementById(`custom-field-container-${index}-${fieldId}`); + if (container) { + container.remove(); + } + } + + function getCustomFields(index) { + const customFields = {}; + const container = document.getElementById(`custom-fields-${index}`); + + if (container) { + const fieldContainers = container.querySelectorAll('[id^="custom-field-container-"]'); + fieldContainers.forEach(fieldContainer => { + const match = fieldContainer.id.match(/custom-field-container-(\d+)-(\d+)/); + if (match && match[1] === String(index)) { + const fieldId = match[2]; + const nameInput = document.getElementById(`custom-field-name-${index}-${fieldId}`); + const valueInput = document.getElementById(`custom-field-value-${index}-${fieldId}`); + + if (nameInput && valueInput && nameInput.value.trim()) { + customFields[nameInput.value.trim()] = valueInput.value.trim(); + } + } + }); + } + + return customFields; + } + + // Template Management Functions + async function loadTemplateList() { + try { + const response = await fetch(BASE_PATH + '/templates/list'); + const data = await response.json(); + + if (data.success) { + const templateSelect = document.getElementById('templateSelect'); + templateSelect.innerHTML = ''; + + data.templates.forEach(template => { + const option = document.createElement('option'); + option.value = template.name; + option.textContent = template.name; + if (template.description) { + option.title = template.description; + } + 
templateSelect.appendChild(option);
+                });
+
+                // Enable/disable apply button
+                document.getElementById('applyTemplateBtn').disabled = !data.templates.length;
+            }
+        } catch (error) {
+            console.error('Failed to load templates:', error);
+        }
+    }
+
+    // Load templates on page load
+    loadTemplateList();
+
+    /** Open the "create template" dialog. */
+    function showCreateTemplateModal() {
+        document.getElementById('createTemplateModal').style.display = 'block';
+    }
+
+    /** Close the "create template" dialog and empty its form fields. */
+    function closeCreateTemplateModal() {
+        document.getElementById('createTemplateModal').style.display = 'none';
+        // Clear form
+        document.getElementById('templateName').value = '';
+        document.getElementById('templateDescription').value = '';
+        document.getElementById('templateTitle').value = '';
+        document.getElementById('templateSubject').value = '';
+        document.getElementById('templateKeywords').value = '';
+    }
+
+    /**
+     * Validate the "create template" form and POST it to `/templates/save`.
+     *
+     * Name, title, subject and keywords are required; the description is
+     * optional. On success the dialog is closed and the template list reloaded.
+     *
+     * @returns {Promise<void>}
+     */
+    async function saveNewTemplate() {
+        const name = document.getElementById('templateName').value.trim();
+        const description = document.getElementById('templateDescription').value.trim();
+        const title = document.getElementById('templateTitle').value.trim();
+        const subject = document.getElementById('templateSubject').value.trim();
+        const keywords = document.getElementById('templateKeywords').value.trim();
+
+        if (!name || !title || !subject || !keywords) {
+            showError('Please fill in all required fields (Name, Title, Subject, Keywords)');
+            return;
+        }
+
+        hideAlerts();
+
+        try {
+            const response = await fetch(BASE_PATH + '/templates/save', {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({ name, description, title, subject, keywords })
+            });
+
+            const data = await response.json();
+
+            if (data.success) {
+                showSuccess(`Template "${name}" created successfully!`);
+                closeCreateTemplateModal();
+                loadTemplateList();
+            } else {
+                // Framework-level rejections carry their message in `detail`.
+                const detail = typeof data.detail === 'string' ? data.detail : '';
+                showError(data.error || detail || 'Failed to save template');
+            }
+        } catch (error) {
+            showError(`Failed to save template: ${error.message}`);
+        }
+    }
+
+    /**
+     * Apply the template chosen in `#templateSelect` to every selected file.
+     *
+     * The server renders the template per file; each returned result is copied
+     * into the matching title/subject/keywords inputs and into `currentFiles`.
+     *
+     * @returns {Promise<void>}
+     */
+    async function applyTemplate() {
+        const templateName = document.getElementById('templateSelect').value;
+
+        if (!templateName) {
+            showError('Please select a template');
+            return;
+        }
+
+        if (selectedFiles.size === 0) {
+            showError('Please select at least one file to apply template');
+            return;
+        }
+
+        if (!sessionId) {
+            showError('No active session. Please upload files first.');
+            return;
+        }
+
+        hideAlerts();
+        showInfo(`Applying template "${templateName}" to ${selectedFiles.size} file(s)...`);
+
+        try {
+            const response = await fetch(BASE_PATH + '/templates/apply', {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({
+                    template_name: templateName,
+                    file_indices: Array.from(selectedFiles),
+                    session_id: sessionId,
+                    custom_vars: {}
+                })
+            });
+
+            const data = await response.json();
+
+            if (data.success) {
+                // Update UI with new metadata
+                data.results.forEach(result => {
+                    const index = result.file_index;
+
+                    // A row may have no editable inputs. Skip just that field
+                    // instead of throwing, which would abort the remaining
+                    // results and report a failure the server never had.
+                    ['title', 'subject', 'keywords'].forEach(field => {
+                        const input = document.getElementById(`${field}-${index}`);
+                        if (input) {
+                            input.value = result.metadata[field];
+                        }
+                    });
+
+                    // Update character counters
+                    initCharCounters(index);
+
+                    // Update currentFiles
+                    if (currentFiles[index]) {
+                        currentFiles[index].suggested_metadata = result.metadata;
+                    }
+                });
+
+                showSuccess(`✅ Template applied to ${data.results.length} file(s)`);
+            } else {
+                const detail = typeof data.detail === 'string' ? data.detail : '';
+                showError(data.error || detail || 'Failed to apply template');
+            }
+        } catch (error) {
+            showError(`Failed to apply template: ${error.message}`);
+        }
+    }
+
+    async function manageTemplates() {
+        try {
+            const response = await fetch(BASE_PATH + '/templates/list');
+            const data = await response.json();
+
+            if (!data.success) {
+                showError('Failed to load templates');
+                return;
+            }
+
+            if (data.templates.length === 0) {
+                showInfo('No templates available. 
Create a new template to get started!'); + return; + } + + let message = 'Available Templates:\n\n'; + data.templates.forEach((template, index) => { + message += `${index + 1}. ${template.name}\n`; + if (template.description) { + message += ` ${template.description}\n`; + } + message += ` Created: ${new Date(template.created_at).toLocaleDateString()}\n`; + message += ` Variables: ${template.variables_used.join(', ') || 'None'}\n\n`; + }); + + const templateName = prompt(message + '\nEnter template name to delete (or Cancel):'); + + if (templateName) { + const confirmDelete = confirm(`Are you sure you want to delete template "${templateName}"?`); + if (confirmDelete) { + await deleteTemplate(templateName); + } + } + } catch (error) { + showError(`Failed to manage templates: ${error.message}`); + } + } + + async function deleteTemplate(name) { + try { + const response = await fetch(`${BASE_PATH}/templates/delete/${encodeURIComponent(name)}`, { + method: 'DELETE' + }); + + const data = await response.json(); + + if (data.success) { + showSuccess(`Template "${name}" deleted successfully`); + loadTemplateList(); + } else { + showError(data.error || 'Failed to delete template'); + } + } catch (error) { + showError(`Failed to delete template: ${error.message}`); + } + } + + // Preview template when selected + document.getElementById('templateSelect').addEventListener('change', async function() { + const templateName = this.value; + const previewDiv = document.getElementById('templatePreview'); + + if (!templateName) { + previewDiv.style.display = 'none'; + return; + } + + try { + const response = await fetch(`${BASE_PATH}/templates/load/${encodeURIComponent(templateName)}`); + const data = await response.json(); + + if (data.success) { + const template = data.template; + previewDiv.innerHTML = ` +
+ Title: ${template.title} +
+
+ Subject: ${template.subject} +
+
+ Keywords: ${template.keywords} +
+ `; + previewDiv.style.display = 'block'; + } else { + previewDiv.style.display = 'none'; + } + } catch (error) { + console.error('Failed to preview template:', error); + previewDiv.style.display = 'none'; + } + }); diff --git a/templates/admin.html b/templates/admin.html new file mode 100644 index 0000000..0754819 --- /dev/null +++ b/templates/admin.html @@ -0,0 +1,187 @@ + + + + + + Admin - Oliver Metadata Tool + + + + + +{% set base = request.scope.get('root_path', '') %} +
+
+

Admin Dashboard

+

Oliver Metadata Tool - Administration

+
+ {{ username }} | + Home | + Logout +
+
+ +
+ +
+
+
{{ stats.active_users | default(0) }}
+
Active Users
+
+
+
{{ stats.active_sessions | default(0) }}
+
Active Sessions
+
+
+
{{ stats.recent_activity | default(0) }}
+
Activity (24h)
+
+
+
{{ stats.ai_usage.total_tokens | default(0) }}
+
AI Tokens Used
+
+
+ + +
+ + + +
+ + +
+
+

User Management

+ +
+
+ + + + + + + + + + + + + + + + +
IDUsernameEmailRoleAuthLast LoginStatusActions
Loading...
+
+
+ + + + + + +
+ + +
+ + + + + + + + diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..a448418 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,184 @@ + + + + + + Oliver Metadata Tool + + + + +{% set base = request.scope.get('root_path', '') %} +
+
+

Oliver Metadata Tool

+

Universal metadata creation and management for all file types

+
+ {{ username }} | Logout +
+
+ +
+
+ + +
+

Import Metadata File

+

+ Upload a CSV, Excel (.xlsx, .xls), or JSON file with metadata. You'll configure column mapping after upload. +

+ + + +
+ +
+

Metadata Templates

+

+ Use templates with variables like {filename}, {date}, {user} for quick metadata generation +

+
+ + + + +
+
+
+ +
+
📁
+

Drop files here or click to browse

+

Supported: PDF, JPG, PNG, DOCX, XLSX, PPTX, MP4, MOV

+

Multiple files supported!

+ +
+ + {% if not docker_mode %} +
+ + +
+
+ How to copy folder path:
+ + Mac: Right-click folder in Finder → hold Option key → click "Copy ... as Pathname"
+ Windows: Shift + Right-click folder → "Copy as path" (remove quotes after pasting) +
+
+ {% else %} +
+ Docker Mode: Files will be updated and available for download from your browser after processing. +
+ {% endif %} +
+ +
+
0%
+
+ +
+
+
+
+ +
+ +
+ + +
+ + +
+ + + + + + + + + + + diff --git a/templates/login.html b/templates/login.html new file mode 100644 index 0000000..65c9d8f --- /dev/null +++ b/templates/login.html @@ -0,0 +1,302 @@ + + + + + + Login - Oliver Metadata Tool + + + + +{% set base = request.scope.get('root_path', '') %} + + + diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c02a654 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,95 @@ +"""Test fixtures for Oliver Metadata Tool.""" + +import os +import tempfile +import shutil +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient + +# Set test environment BEFORE importing app +os.environ["SECRET_KEY"] = "test-secret-key-for-testing-only" +os.environ["ENABLE_TEST_USER"] = "true" +os.environ["DOCKER_MODE"] = "false" +os.environ["OPENAI_API_KEY"] = "" # No AI in tests + + +@pytest.fixture(scope="session") +def temp_dir(): + """Create a temporary directory for test artifacts.""" + d = tempfile.mkdtemp(prefix="oliver_test_") + yield d + shutil.rmtree(d, ignore_errors=True) + + +@pytest.fixture(scope="session") +def app(temp_dir): + """Create test FastAPI application.""" + os.environ["UPLOAD_FOLDER"] = str(Path(temp_dir) / "uploads") + os.environ["DB_PATH"] = str(Path(temp_dir) / "test.db") + os.environ["SESSION_DB_PATH"] = str(Path(temp_dir) / "test_sessions.db") + os.environ["TEMPLATES_DIR"] = str(Path(__file__).parent.parent / "templates") + + # Force settings reload + from app.config import get_settings + import app.config as config_module + config_module._settings = None + + from app.main import create_app + return create_app() + + +@pytest.fixture(scope="session") +def client(app): + """Create test HTTP client.""" + return TestClient(app) + + +@pytest.fixture +def auth_client(client): + """Authenticated test client (logged in as tester).""" + # Login as test user + response = client.post( + 
"/login", + data={"username": "tester", "password": "oliveradmin"}, + follow_redirects=False, + ) + assert response.status_code == 302 + return client + + +@pytest.fixture +def sample_pdf(temp_dir): + """Create a minimal PDF for testing.""" + pdf_path = Path(temp_dir) / "test.pdf" + # Minimal valid PDF + pdf_content = b"""%PDF-1.4 +1 0 obj<>endobj +2 0 obj<>endobj +3 0 obj<>endobj +xref +0 4 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +trailer<> +startxref +190 +%%EOF""" + pdf_path.write_bytes(pdf_content) + return str(pdf_path) + + +@pytest.fixture +def sample_csv(temp_dir): + """Create a sample CSV for import testing.""" + csv_path = Path(temp_dir) / "metadata.csv" + csv_path.write_text( + "filename,title,subject,keywords\n" + "test.pdf,Test Title,Test Subject,keyword1 keyword2\n" + "image.jpg,Image Title,Image Subject,photo landscape\n", + encoding="utf-8", + ) + return str(csv_path) diff --git a/tests/test_admin.py b/tests/test_admin.py new file mode 100644 index 0000000..ce406bb --- /dev/null +++ b/tests/test_admin.py @@ -0,0 +1,30 @@ +"""Tests for admin endpoints.""" + + +class TestAdminAccess: + def test_admin_requires_auth(self, client): + """GET /admin requires authentication.""" + client.cookies.clear() + response = client.get("/admin", follow_redirects=False) + assert response.status_code == 302 + + def test_admin_requires_admin_role(self, auth_client): + """GET /admin returns 403 for non-admin users.""" + response = auth_client.get("/admin") + # tester user has role='user', should get 403 + assert response.status_code == 403 or "detail" in response.json() + + def test_admin_users_requires_admin(self, auth_client): + """GET /admin/users returns 403 for non-admin users.""" + response = auth_client.get("/admin/users") + assert response.status_code == 403 + + def test_admin_audit_requires_admin(self, auth_client): + """GET /admin/audit returns 403 for non-admin users.""" + response = auth_client.get("/admin/audit") + 
assert response.status_code == 403
+
+    def test_admin_ai_usage_requires_admin(self, auth_client):
+        """GET /admin/ai-usage returns 403 for non-admin users."""
+        response = auth_client.get("/admin/ai-usage")
+        assert response.status_code == 403
diff --git a/tests/test_auth.py b/tests/test_auth.py
new file mode 100644
index 0000000..91b3721
--- /dev/null
+++ b/tests/test_auth.py
@@ -0,0 +1,68 @@
+"""Tests for authentication endpoints."""
+
+
+class TestLoginPage:
+    def test_login_page_renders(self, client):
+        """GET /login returns login form."""
+        # The client is shared by the whole test session; drop any login
+        # left behind by an earlier test so this runs as a fresh visitor.
+        client.cookies.clear()
+        response = client.get("/login")
+        assert response.status_code == 200
+        assert "login" in response.text.lower()
+
+    def test_unauthenticated_redirect(self, client):
+        """Unauthenticated access to / redirects to /login."""
+        # Without this the result depends on test order: a session cookie
+        # from a previous auth_client test would make "/" answer 200.
+        client.cookies.clear()
+        response = client.get("/", follow_redirects=False)
+        assert response.status_code == 302
+        assert "/login" in response.headers.get("location", "")
+
+
+class TestLogin:
+    def test_login_success(self, client):
+        """POST /login with valid credentials redirects to /."""
+        response = client.post(
+            "/login",
+            data={"username": "tester", "password": "oliveradmin"},
+            follow_redirects=False,
+        )
+        assert response.status_code == 302
+        assert response.headers.get("location") == "/"
+
+    def test_login_wrong_password(self, client):
+        """POST /login with wrong password shows error."""
+        response = client.post(
+            "/login",
+            data={"username": "tester", "password": "wrongpass"},
+        )
+        assert response.status_code == 200
+        # Should show error message on the login page
+        assert "error" in response.text.lower() or "invalid" in response.text.lower() or "incorrect" in response.text.lower()
+
+    def test_login_empty_fields(self, client):
+        """POST /login with empty fields shows error."""
+        response = client.post(
+            "/login",
+            data={"username": "", "password": ""},
+        )
+        assert response.status_code == 200
+
+
+class TestLogout:
+    def test_logout_redirects(self, auth_client):
+        """GET /logout redirects to /login."""
+        response
= auth_client.get("/logout", follow_redirects=False) + assert response.status_code == 302 + assert "/login" in response.headers.get("location", "") + + +class TestProtectedRoutes: + def test_index_requires_auth(self, client): + """/ requires authentication.""" + # Clear any existing session + client.cookies.clear() + response = client.get("/", follow_redirects=False) + assert response.status_code == 302 + + def test_index_accessible_when_authenticated(self, auth_client): + """/ is accessible after login.""" + response = auth_client.get("/") + assert response.status_code == 200 + assert "Oliver Metadata Tool" in response.text diff --git a/tests/test_imports.py b/tests/test_imports.py new file mode 100644 index 0000000..89bf906 --- /dev/null +++ b/tests/test_imports.py @@ -0,0 +1,36 @@ +"""Tests for import endpoints.""" + + +class TestImport: + def test_import_csv(self, auth_client, sample_csv): + """POST /import-metadata with CSV file returns columns and sample data.""" + with open(sample_csv, "rb") as f: + response = auth_client.post( + "/import-metadata", + files={"import_file": ("metadata.csv", f, "text/csv")}, + ) + data = response.json() + assert data.get("success") is True + assert "columns" in data + assert "filename" in data["columns"] + assert "title" in data["columns"] + assert len(data["sample_data"]) > 0 + + def test_import_unsupported_format(self, auth_client, temp_dir): + """POST /import-metadata with unsupported file returns error.""" + import io + response = auth_client.post( + "/import-metadata", + files={"import_file": ("data.txt", io.BytesIO(b"hello"), "text/plain")}, + ) + assert response.status_code == 400 or "error" in response.json() + + def test_import_requires_auth(self, client): + """POST /import-metadata requires authentication.""" + client.cookies.clear() + response = client.post( + "/import-metadata", + files={"import_file": ("data.csv", b"a,b\n1,2", "text/csv")}, + follow_redirects=False, + ) + assert response.status_code == 302 diff 
--git a/tests/test_session_store.py b/tests/test_session_store.py new file mode 100644 index 0000000..2c22536 --- /dev/null +++ b/tests/test_session_store.py @@ -0,0 +1,95 @@ +"""Tests for the SQLite-backed session store.""" + +import tempfile +import os +from pathlib import Path + +import pytest + +from app.session.store import SessionStore + + +@pytest.fixture +def store(): + """Create a temporary session store.""" + fd, path = tempfile.mkstemp(suffix=".db") + os.close(fd) + s = SessionStore(db_path=path) + yield s + os.unlink(path) + + +class TestFileSession: + def test_create_and_get(self, store): + """Create and retrieve a file session.""" + sid = store.create_file_session(user_id=1, metadata_source="manual") + assert sid + session = store.get_file_session(sid) + assert session is not None + assert session["user_id"] == 1 + assert session["files"] == [] + + def test_add_file_to_session(self, store): + """Add files to a session.""" + sid = store.create_file_session(user_id=1) + store.add_file_to_session(sid, {"filename": "test.pdf", "success": True}) + store.add_file_to_session(sid, {"filename": "img.jpg", "success": True}) + + session = store.get_file_session(sid) + assert len(session["files"]) == 2 + assert session["files"][0]["filename"] == "test.pdf" + + def test_update_file_in_session(self, store): + """Update a specific file entry.""" + sid = store.create_file_session(user_id=1) + store.add_file_to_session(sid, {"filename": "test.pdf", "status": "pending"}) + store.update_file_in_session(sid, 0, {"status": "complete", "metadata": {"title": "T"}}) + + session = store.get_file_session(sid) + assert session["files"][0]["status"] == "complete" + assert session["files"][0]["metadata"]["title"] == "T" + + def test_delete_session(self, store): + """Delete a file session.""" + sid = store.create_file_session(user_id=1) + store.delete_file_session(sid) + assert store.get_file_session(sid) is None + + def test_session_id_is_secure(self, store): + """Session IDs 
should be cryptographically random.""" + ids = [store.create_file_session(user_id=1) for _ in range(5)] + assert len(set(ids)) == 5 # All unique + for sid in ids: + assert len(sid) > 20 # Long enough for security + + +class TestImportSession: + def test_create_import_session(self, store): + """Create and retrieve an import session.""" + sid = store.create_import_session( + user_id=1, + session_type="import", + file_info={"path": "/tmp/test.csv", "filename": "test.csv"}, + ) + session = store.get_import_session(sid) + assert session is not None + assert session["file_info"]["filename"] == "test.csv" + + def test_update_import_metadata_map(self, store): + """Update import session with metadata map.""" + sid = store.create_import_session(user_id=1, session_type="import") + metadata_map = {"test": {"title": "Test Title", "subject": "Test Subject"}} + store.update_import_session(sid, metadata_map=metadata_map) + + session = store.get_import_session(sid) + assert session["metadata_map"]["test"]["title"] == "Test Title" + + +class TestCleanup: + def test_cleanup_expired(self, store): + """Cleanup removes expired sessions.""" + # Create a session with 0 hours expiry (immediately expired) + sid = store.create_file_session(user_id=1, expires_hours=0) + count = store.cleanup_expired() + assert count >= 1 + assert store.get_file_session(sid) is None diff --git a/tests/test_templates.py b/tests/test_templates.py new file mode 100644 index 0000000..e8309da --- /dev/null +++ b/tests/test_templates.py @@ -0,0 +1,93 @@ +"""Tests for template management endpoints.""" + +import json + + +class TestTemplates: + def test_list_templates(self, auth_client): + """GET /templates/list returns template list.""" + response = auth_client.get("/templates/list") + data = response.json() + assert data.get("success") is True + assert "templates" in data + + def test_save_template(self, auth_client): + """POST /templates/save creates a new template.""" + response = auth_client.post( + 
"/templates/save", + content=json.dumps({ + "name": "Test Template", + "title": "{filename} - Test", + "subject": "Test subject for {filename}", + "keywords": "test, {year}", + "description": "A test template", + }), + headers={"Content-Type": "application/json"}, + ) + data = response.json() + assert data.get("success") is True + + def test_load_template(self, auth_client): + """GET /templates/load/{name} loads a template.""" + # First save, then load + auth_client.post( + "/templates/save", + content=json.dumps({ + "name": "LoadTest", + "title": "{filename}", + "subject": "Subject", + "keywords": "kw", + }), + headers={"Content-Type": "application/json"}, + ) + response = auth_client.get("/templates/load/LoadTest") + data = response.json() + assert data.get("success") is True + assert data["template"]["name"] == "LoadTest" + + def test_load_nonexistent_template(self, auth_client): + """GET /templates/load/{name} returns 404 for missing template.""" + response = auth_client.get("/templates/load/NonExistent12345") + assert response.status_code == 404 + + def test_save_template_empty_name(self, auth_client): + """POST /templates/save with empty name returns error.""" + response = auth_client.post( + "/templates/save", + content=json.dumps({"name": "", "title": "t", "subject": "s", "keywords": "k"}), + headers={"Content-Type": "application/json"}, + ) + assert response.status_code == 400 + + def test_delete_template(self, auth_client): + """DELETE /templates/delete/{name} removes a template.""" + # Create first + auth_client.post( + "/templates/save", + content=json.dumps({ + "name": "DeleteMe", + "title": "t", + "subject": "s", + "keywords": "k", + }), + headers={"Content-Type": "application/json"}, + ) + response = auth_client.delete("/templates/delete/DeleteMe") + data = response.json() + assert data.get("success") is True + + def test_preview_template(self, auth_client): + """POST /templates/preview returns preview output.""" + response = auth_client.post( + 
"/templates/preview", + content=json.dumps({ + "title": "{filename} - Preview", + "subject": "Subject for {filename}", + "keywords": "test, {year}", + "sample_filename": "example.pdf", + }), + headers={"Content-Type": "application/json"}, + ) + data = response.json() + assert data.get("success") is True + assert "preview" in data diff --git a/tests/test_upload.py b/tests/test_upload.py new file mode 100644 index 0000000..3a67d47 --- /dev/null +++ b/tests/test_upload.py @@ -0,0 +1,52 @@ +"""Tests for upload endpoints.""" + +import io +from pathlib import Path + + +class TestUpload: + def test_upload_no_files(self, auth_client): + """POST /upload with no files returns error.""" + response = auth_client.post( + "/upload", + data={"metadata_source": "manual"}, + files={"files": ("", b"", "application/octet-stream")}, + ) + assert response.status_code == 400 + + def test_upload_manual_source(self, auth_client, sample_pdf): + """POST /upload with manual source processes file.""" + with open(sample_pdf, "rb") as f: + response = auth_client.post( + "/upload", + data={"metadata_source": "manual"}, + files={"files": ("test.pdf", f, "application/pdf")}, + ) + data = response.json() + assert data.get("success") is True + assert "session_id" in data + assert len(data["files"]) == 1 + + def test_upload_response_no_filepath(self, auth_client, sample_pdf): + """API response should not expose server file paths.""" + with open(sample_pdf, "rb") as f: + response = auth_client.post( + "/upload", + data={"metadata_source": "manual"}, + files={"files": ("test.pdf", f, "application/pdf")}, + ) + data = response.json() + for file_result in data.get("files", []): + assert "filepath" not in file_result + + +class TestUploadExcel: + def test_upload_excel_requires_auth(self, client): + """POST /upload-excel requires authentication.""" + client.cookies.clear() + response = client.post( + "/upload-excel", + files={"excel_file": ("test.xlsx", b"fake", 
def safe_filename(filename):
    """Return a filesystem-safe version of an uploaded file name.

    Unicode letters (Chinese, Japanese, Korean, accented Latin, ...) are kept;
    only characters that are dangerous in a file name are touched.

    Args:
        filename: Client-supplied name.  ``None`` and ``''`` are accepted.

    Returns:
        The NFC-normalised name with path separators replaced by ``_``, all
        control characters (NUL, CR, LF, TAB, ...) removed and leading or
        trailing dots and spaces stripped.  ``'unnamed_file'`` is returned
        when nothing usable remains.
    """
    if not filename:
        return 'unnamed_file'

    normalized = unicodedata.normalize('NFC', str(filename))

    # Category "Cc" covers NUL as well as newlines, tabs and the other
    # C0/C1 control characters, none of which belong in a file name.
    cleaned = ''.join(
        '_' if ch in '/\\' else ch
        for ch in normalized
        if unicodedata.category(ch) != 'Cc'
    )

    # Stripping the dots also neutralises '.' and '..'.
    cleaned = cleaned.strip('. ')
    return cleaned or 'unnamed_file'
def cleanup_session_files(session_id: str):
    """Delete the uploaded files of a processing session and forget the session.

    The session entry is removed from ``sessions`` first, so a second call
    for the same id is a no-op.  File removal is best-effort: a file that is
    already gone is skipped silently, and any other OS error is logged as a
    warning without stopping the remaining deletions.

    Args:
        session_id: Key of the session in the module-level ``sessions`` dict.
    """
    session_data = sessions.pop(session_id, None)
    if session_data is None:
        return

    for file_info in session_data.get('files', []):
        filepath = file_info.get('filepath')
        if not filepath:
            continue
        try:
            # Attempt the removal directly instead of checking existence
            # first; the file may disappear between the check and the call.
            os.remove(filepath)
        except FileNotFoundError:
            continue
        except OSError as e:
            app.logger.warning(f"Failed to cleanup file {filepath}: {e}")
        else:
            app.logger.info(f"Cleaned up file: {filepath}")
def _is_safe_redirect_target(target, host):
    """Return True if *target* is a local path or an http(s) URL on *host*."""
    from urllib.parse import urlsplit

    if not target or '\\' in target or any(ch < ' ' for ch in target):
        # Browsers treat backslashes like slashes and drop control
        # characters, which can turn a harmless-looking value into
        # '//other-host'.
        return False
    try:
        parts = urlsplit(target)
    except ValueError:
        return False
    if parts.scheme or parts.netloc:
        return parts.scheme in ('http', 'https') and parts.netloc == host
    return target.startswith('/')


@app.route('/login', methods=['GET', 'POST'])
def login():
    """Render the login form (GET) or authenticate the submitted credentials (POST).

    On success a user session is created, the Flask session is populated
    with ``user_id``, ``username`` and ``session_id``, and the browser is
    redirected to the ``next`` query parameter.  ``next`` is honoured only
    when it points at this site; anything else falls back to the index page
    so the login form cannot be used as an open redirect.

    Returns:
        The rendered ``login.html`` (with ``error`` set on failure) or a
        redirect response.
    """
    if request.method != 'POST':
        return render_template('login.html', sso_enabled=is_sso_enabled())

    username = request.form.get('username', '').strip()
    password = request.form.get('password', '')
    if not username or not password:
        return render_template(
            'login.html',
            error='Please enter both username and password',
            sso_enabled=is_sso_enabled(),
        )

    result = authenticate_user(username, password)
    if not result['success']:
        return render_template(
            'login.html', error=result.get('error'), sso_enabled=is_sso_enabled()
        )

    user = result['user']
    session_id = create_user_session(
        user=user,
        ip_address=request.remote_addr,
        user_agent=request.headers.get('User-Agent'),
    )
    if not session_id:
        return render_template(
            'login.html', error='Failed to create session', sso_enabled=is_sso_enabled()
        )

    session['user_id'] = user['id']
    session['username'] = user['username']
    session['session_id'] = session_id

    # Only follow "next" when it stays on this site.
    next_url = request.args.get('next')
    if not _is_safe_redirect_target(next_url, request.host):
        next_url = url_for('index')
    return redirect(next_url)
@app.route('/auth/callback')
def auth_callback():
    """Complete the Microsoft SSO flow after the identity provider redirects back.

    The ``state`` query parameter must match the value stored in the Flask
    session by ``login_microsoft``.  The stored value is consumed on first
    use, and a callback with no stored or no received state is rejected
    (previously two missing values compared equal and passed the check).

    Returns:
        A redirect to the index page on success, otherwise the rendered
        ``login.html`` with an ``error`` message.
    """
    sso = get_sso_instance()

    # Same guard as login_microsoft: no callback handling without SSO.
    if not sso.enabled:
        return render_template('login.html', error='Microsoft SSO not configured', sso_enabled=False)

    # pop() makes the state single-use, so a captured callback URL cannot be replayed.
    expected_state = session.pop('oauth_state', None)
    received_state = request.args.get('state')
    state_ok = (
        bool(expected_state)
        and bool(received_state)
        # Compare as bytes: compare_digest rejects non-ASCII str input.
        and secrets.compare_digest(received_state.encode('utf-8'), expected_state.encode('utf-8'))
    )
    if not state_ok:
        return render_template('login.html', error='Invalid state parameter', sso_enabled=is_sso_enabled())

    code = request.args.get('code')
    if not code:
        error_desc = request.args.get('error_description', 'No authorization code')
        return render_template('login.html', error=f'SSO failed: {error_desc}', sso_enabled=is_sso_enabled())

    # Exchange the code for a token, then resolve the local user record.
    user = None
    result = sso.acquire_token(code)
    if result and 'access_token' in result:
        user_info = sso.get_user_info(result['access_token'])
        if user_info:
            user = sso.create_or_update_user(user_info)

    session_id = None
    if user:
        session_id = create_user_session(
            user=user,
            ip_address=request.remote_addr,
            user_agent=request.headers.get('User-Agent'),
        )

    if not session_id:
        return render_template('login.html', error='SSO authentication failed', sso_enabled=is_sso_enabled())

    session['user_id'] = user['id']
    session['username'] = user['username']
    session['session_id'] = session_id
    return redirect(url_for('index'))
render_template('index.html', + username=user['username'] if user else None, + docker_mode=DOCKER_MODE) + +@app.route('/upload', methods=['POST']) +@login_required +def upload_file(): + """Handle multiple file uploads and metadata lookup from Excel.""" + if 'files' not in request.files: + return jsonify({'error': 'No files provided'}), 400 + + files = request.files.getlist('files') + if not files or files[0].filename == '': + return jsonify({'error': 'No files selected'}), 400 + + # Get metadata source choice (excel, manual, ai, import) + metadata_source = request.form.get('metadata_source', 'excel') + import_session_id = request.form.get('import_session_id', '') # For import source + + results = [] + session_id = str(len(sessions) + 1) + sessions[session_id] = { + 'files': [], + 'metadata_source': metadata_source, + 'import_session_id': import_session_id + } + + # Get metadata lookup (only if using Excel source) + excel_session_id = request.form.get('excel_session_id') + lookup = None + + if metadata_source == 'excel': + if excel_session_id and excel_session_id in imported_metadata: + # Use uploaded Excel file + lookup = imported_metadata[excel_session_id] + else: + # Try default Excel file if available + try: + lookup = get_metadata_lookup() + except: + return jsonify({'error': 'Please upload an Excel file first using the Upload Excel File button'}), 400 + + # Get imported metadata (only if using import source) + import_map = None + if metadata_source == 'import' and import_session_id and import_session_id in imported_metadata: + import_map = imported_metadata[import_session_id] + importer = MetadataImporter() + elif metadata_source == 'import': + # Import source selected but no import session available + return jsonify({'error': 'Please import a metadata file first using the Import button'}), 400 + + for file in files: + try: + # Save uploaded file + filename = safe_filename(file.filename) + filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename) + 
file.save(filepath) + + # Detect file type + file_type = FileDetector.detect_file_type(filepath) + + if file_type == FileType.UNSUPPORTED: + results.append({ + 'filename': filename, + 'error': 'Unsupported file type' + }) + continue + + # Get extractor for this file type + extractor = extractors.get(file_type) + if not extractor: + results.append({ + 'filename': filename, + 'error': 'No extractor available' + }) + continue + + # Read current metadata from file + old_metadata = extractor.read_metadata(filepath) + + # Generate metadata based on chosen source + excel_found = False + new_metadata = {'title': '', 'subject': '', 'keywords': ''} + + if metadata_source == 'excel' and lookup: + # Lookup metadata from Excel by filename + excel_data = lookup.lookup_by_filename(filename) + + if excel_data: + new_metadata = { + 'title': excel_data.get('title', ''), + 'subject': excel_data.get('description', ''), + 'keywords': '' + } + excel_found = True + else: + # No Excel data found - use filename as fallback + new_metadata = { + 'title': Path(filename).stem, + 'subject': f'No metadata found in Excel for {filename}', + 'keywords': '' + } + + elif metadata_source == 'manual': + # Return empty metadata for user to fill manually + new_metadata = { + 'title': Path(filename).stem, # Suggest filename + 'subject': '', + 'keywords': '' + } + + elif metadata_source == 'ai': + # AI generation using MetadataAnalyzer + analyzer = get_ai_analyzer() + + if analyzer: + try: + # Extract content from file + content = extractor.extract_content(str(filepath)) + + if not content or len(content.strip()) < 10: + # Not enough content for AI analysis + new_metadata = { + 'title': Path(filename).stem, + 'subject': 'Insufficient content for AI analysis', + 'keywords': '', + '_ai_error': 'Not enough text content extracted' + } + else: + # Generate metadata with AI + new_metadata = analyzer.analyze_content(content, filename, file_type) + + # Log token usage if available + if '_tokens_used' in 
@app.route('/update', methods=['POST'])
@login_required
def update_metadata():
    """Write the session's suggested metadata into an uploaded file.

    Expects a JSON body with ``session_id``, ``file_index`` and, optionally,
    ``output_dir``.  The file to modify is taken from the server-side session
    entry; a ``filepath`` value sent by the client is ignored, so a request
    cannot point the updater at an arbitrary path on the server.

    Returns:
        JSON with ``success``, ``message``, ``verified``, ``output_path`` and
        ``metadata`` on success; otherwise ``{'error': ...}`` with status 400
        (bad session, index, metadata or file type), 404 (file missing on
        disk) or 500 (updater failure or unexpected exception).
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    session_id = data.get('session_id')
    file_index = data.get('file_index')
    output_dir = data.get('output_dir', '')  # User-selected output directory

    # Validate session
    if not isinstance(session_id, str) or session_id not in sessions:
        return jsonify({'error': 'Invalid or expired session'}), 400

    # Validate file index: a real, non-negative integer inside the list.
    # bool is excluded explicitly because it is a subclass of int.
    session_files = sessions[session_id]['files']
    if (
        isinstance(file_index, bool)
        or not isinstance(file_index, int)
        or not 0 <= file_index < len(session_files)
    ):
        return jsonify({'error': 'Invalid file index'}), 400

    file_info = session_files[file_index]

    # Trust only the path recorded at upload time.
    filepath = file_info.get('filepath')
    if not filepath or not os.path.exists(filepath):
        return jsonify({'error': 'File not found'}), 404

    try:
        new_metadata = file_info.get('suggested_metadata', {})
        if not new_metadata or not new_metadata.get('title'):
            return jsonify({'error': 'No metadata available for this file'}), 400

        file_type = FileDetector.detect_file_type(filepath)
        if file_type == FileType.UNSUPPORTED:
            return jsonify({'error': 'Unsupported file type'}), 400

        updater = updaters.get(file_type)
        if not updater:
            return jsonify({'error': 'No updater available for this file type'}), 400

        filename = Path(filepath).name

        # In Docker mode, always update in-place (user will download via browser).
        # In local mode, allow copying to the chosen output directory first.
        if not DOCKER_MODE and output_dir and os.path.isdir(output_dir):
            target_file = os.path.join(output_dir, filename)
            shutil.copy2(filepath, target_file)
        else:
            target_file = filepath

        success = updater.update_metadata(target_file, new_metadata, backup=False)
        if not success:
            return jsonify({'error': 'Failed to update metadata'}), 500

        verified = updater.verify_metadata(target_file, new_metadata)

        return jsonify({
            'success': True,
            'message': 'Metadata updated successfully',
            'verified': verified,
            'output_path': target_file,
            'metadata': new_metadata
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/download-selected', methods=['POST'])
@login_required
def download_selected_files():
    """Send the selected files of a session as a single ZIP archive.

    Expects a JSON body with ``session_id`` and ``file_indices`` (a list of
    positions in the session's file list).  The archive is built in an
    anonymous temporary file that disappears when it is closed, so no ZIP
    is left behind in the upload folder after the response has been sent.

    Returns:
        The ZIP as an attachment, or ``{'error': ...}`` with status 400
        (nothing selected), 404 (unknown session, empty session, or none of
        the selected files exist) or 500 (unexpected failure).
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    session_id = data.get('session_id')
    file_indices = data.get('file_indices', [])
    app.logger.info(f"session_id: {session_id}, file_indices: {file_indices}")

    if not isinstance(session_id, str) or session_id not in sessions:
        app.logger.error(f"Session not found: {session_id}")
        return jsonify({'error': 'Session not found'}), 404

    if not isinstance(file_indices, list) or not file_indices:
        app.logger.error("No files selected")
        return jsonify({'error': 'No files selected'}), 400

    all_files = sessions[session_id].get('files', [])
    if not all_files:
        app.logger.error("No files in session")
        return jsonify({'error': 'No files in session'}), 404

    # Keep only in-range integer indices, once each, in the requested order.
    selected = []
    for index in file_indices:
        if isinstance(index, bool) or not isinstance(index, int):
            continue
        if 0 <= index < len(all_files) and index not in selected:
            selected.append(index)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    zip_filename = f'oliver_metadata_files_{timestamp}.zip'

    archive = tempfile.TemporaryFile()
    try:
        added = 0
        with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for index in selected:
                file_info = all_files[index]
                filepath = file_info['filepath']
                filename = file_info['filename']

                if os.path.exists(filepath):
                    # Store the file under its original name.
                    zipf.write(filepath, filename)
                    added += 1
                    app.logger.info(f"Added {filename} to ZIP")
                else:
                    app.logger.warning(f"File not found: {filepath}")

        if not added:
            archive.close()
            return jsonify({'error': 'Selected files not found'}), 404

        # Rewind so the response streams the archive from its first byte.
        # The temporary file is removed when the file object is closed.
        archive.seek(0)
        return send_file(
            archive,
            as_attachment=True,
            download_name=zip_filename,
            mimetype='application/zip'
        )
    except Exception as e:
        archive.close()
        app.logger.error(f"Error in download_selected_files: {str(e)}", exc_info=True)
        return jsonify({'error': f'Error creating ZIP archive: {str(e)}'}), 500
request.files['excel_file'] + if file.filename == '': + return jsonify({'error': 'No file selected'}), 400 + + try: + import pandas as pd + + # Save temp file + excel_filename = safe_filename(file.filename) + temp_path = Path(app.config['UPLOAD_FOLDER']) / excel_filename + file.save(str(temp_path)) + + # Preview Excel structure instead of loading directly + excel_file = pd.ExcelFile(str(temp_path)) + sheet_names = excel_file.sheet_names + + # Get columns and sample data from first sheet + preview_data = {} + for sheet_name in sheet_names[:5]: # Limit to first 5 sheets + df = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=5) + preview_data[sheet_name] = { + 'columns': df.columns.tolist(), + 'sample_data': df.head(3).fillna('').to_dict('records') + } + + # Store file path temporarily for later configuration + excel_session_id = f"excel_{secrets.token_urlsafe(8)}" + if 'excel_files' not in imported_metadata: + imported_metadata['excel_files'] = {} + imported_metadata['excel_files'][excel_session_id] = { + 'path': str(temp_path), + 'filename': excel_filename, + 'sheet_names': sheet_names + } + + return jsonify({ + 'success': True, + 'excel_session_id': excel_session_id, + 'filename': excel_filename, + 'sheets': sheet_names, + 'preview': preview_data, + 'message': f'Excel file uploaded. Please configure column mapping.' 
@app.route('/preview-excel-sheet', methods=['POST'])
@login_required
def preview_excel_sheet():
    """Return the columns and first rows of one sheet of an uploaded Excel file.

    Expects a JSON body with ``excel_session_id`` (as returned by
    ``/upload-excel``) and ``sheet_name``.  The sheet must be one of the
    names recorded at upload time, or an integer position within that list.
    A missing ``sheet_name`` is rejected: passing ``None`` to
    ``pandas.read_excel`` loads every sheet and returns a dict rather than a
    single frame.

    Returns:
        JSON with ``success``, ``columns`` and up to five ``sample_data``
        rows; ``{'error': ...}`` with status 400 for an unknown session or
        sheet, or 500 if reading the workbook fails.
    """
    try:
        import pandas as pd

        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        excel_session_id = data.get('excel_session_id')
        sheet_name = data.get('sheet_name')

        excel_files = imported_metadata.get('excel_files', {})
        if not isinstance(excel_session_id, str) or excel_session_id not in excel_files:
            return jsonify({'error': 'Invalid session ID'}), 400

        excel_info = excel_files[excel_session_id]
        known_sheets = excel_info.get('sheet_names', [])

        if isinstance(sheet_name, bool) or not isinstance(sheet_name, (str, int)):
            sheet_ok = False
        elif isinstance(sheet_name, int):
            sheet_ok = 0 <= sheet_name < len(known_sheets)
        else:
            sheet_ok = sheet_name in known_sheets
        if not sheet_ok:
            return jsonify({'error': 'Invalid sheet name'}), 400

        # Only a handful of rows are needed for the preview.
        df = pd.read_excel(excel_info['path'], sheet_name=sheet_name, nrows=10)

        return jsonify({
            'success': True,
            'columns': df.columns.tolist(),
            'sample_data': df.head(5).fillna('').to_dict('records')
        })

    except Exception as e:
        import logging
        logging.getLogger(__name__).error(f"Sheet preview failed: {e}")
        return jsonify({'error': f'Sheet preview failed: {str(e)}'}), 500
column_mapping.get('filename') + title_col = column_mapping.get('title') + description_col = column_mapping.get('description') + keywords_col = column_mapping.get('keywords') + + if not filename_col: + return jsonify({'error': 'Filename column is required'}), 400 + + for _, row in df.iterrows(): + filename = row.get(filename_col) + if pd.notna(filename) and str(filename).strip(): + # Get filename without extension for indexing (case-insensitive) + filename_stem = Path(str(filename).strip()).stem.lower() + + metadata = { + 'title': str(row.get(title_col, '')).strip() if title_col and pd.notna(row.get(title_col)) else '', + 'description': str(row.get(description_col, '')).strip() if description_col and pd.notna(row.get(description_col)) else '', + 'keywords': str(row.get(keywords_col, '')).strip() if keywords_col and pd.notna(row.get(keywords_col)) else '', + 'original_filename': str(filename).strip() + } + + metadata_map[filename_stem] = metadata + + # Create a simple lookup object + class ConfiguredExcelLookup: + def __init__(self, metadata_map): + self.metadata_map = metadata_map + self.filename_to_metadata = metadata_map + + def lookup_by_filename(self, filename: str): + filename_stem = Path(filename).stem.lower() + return self.metadata_map.get(filename_stem) + + lookup = ConfiguredExcelLookup(metadata_map) + + # Store configured lookup + imported_metadata[excel_session_id] = lookup + + # Get stats + stats = { + 'total_records': len(metadata_map), + 'with_title': sum(1 for v in metadata_map.values() if v.get('title')), + 'with_description': sum(1 for v in metadata_map.values() if v.get('description')), + 'with_keywords': sum(1 for v in metadata_map.values() if v.get('keywords')) + } + + return jsonify({ + 'success': True, + 'excel_session_id': excel_session_id, + 'stats': stats, + 'message': f'Configured mapping for {stats["total_records"]} records from sheet "{sheet_name}"' + }) + + except Exception as e: + import logging + 
logging.getLogger(__name__).error(f"Excel configuration failed: {e}") + return jsonify({'error': f'Excel configuration failed: {str(e)}'}), 500 + +@app.route('/import-metadata', methods=['POST']) +@login_required +def import_metadata(): + """Upload import file and preview structure for mapping.""" + if 'import_file' not in request.files: + return jsonify({'error': 'No file provided'}), 400 + + file = request.files['import_file'] + if file.filename == '': + return jsonify({'error': 'No file selected'}), 400 + + try: + import pandas as pd + + # Save temp file + import_filename = safe_filename(file.filename) + temp_path = Path(app.config['UPLOAD_FOLDER']) / import_filename + file.save(str(temp_path)) + + file_ext = temp_path.suffix.lower() + + # Read file and get structure + if file_ext == '.csv': + df = pd.read_csv(str(temp_path), nrows=5, encoding='utf-8') + elif file_ext in ['.xlsx', '.xls']: + df = pd.read_excel(str(temp_path), nrows=5) + elif file_ext == '.json': + import json + with open(str(temp_path), 'r', encoding='utf-8') as f: + data = json.load(f) + # Convert to DataFrame + if isinstance(data, list): + df = pd.DataFrame(data[:5]) + elif isinstance(data, dict): + df = pd.DataFrame([data]) + else: + return jsonify({'error': 'Invalid JSON format'}), 400 + else: + return jsonify({'error': f'Unsupported file format: {file_ext}'}), 400 + + columns = df.columns.tolist() + sample_data = df.fillna('').to_dict('records') + + # Store file path for later configuration + import_session_id = f"import_{secrets.token_urlsafe(8)}" + if 'import_files' not in imported_metadata: + imported_metadata['import_files'] = {} + imported_metadata['import_files'][import_session_id] = { + 'path': str(temp_path), + 'filename': import_filename, + 'file_type': file_ext + } + + return jsonify({ + 'success': True, + 'import_session_id': import_session_id, + 'filename': import_filename, + 'columns': columns, + 'sample_data': sample_data, + 'message': f'Import file uploaded. 
Please configure column mapping.' + }) + + except Exception as e: + import logging + logging.getLogger(__name__).error(f"Import upload failed: {e}") + return jsonify({'error': f'Import upload failed: {str(e)}'}), 500 + +@app.route('/configure-import-mapping', methods=['POST']) +@login_required +def configure_import_mapping(): + """Configure import column mapping and load metadata.""" + try: + import pandas as pd + import json + + data = request.json + import_session_id = data.get('import_session_id') + column_mapping = data.get('column_mapping', {}) + + if not import_session_id or import_session_id not in imported_metadata.get('import_files', {}): + return jsonify({'error': 'Invalid session ID'}), 400 + + import_info = imported_metadata['import_files'][import_session_id] + import_path = import_info['path'] + file_ext = import_info['file_type'] + + # Read the full file + if file_ext == '.csv': + df = pd.read_csv(import_path, encoding='utf-8') + elif file_ext in ['.xlsx', '.xls']: + df = pd.read_excel(import_path) + elif file_ext == '.json': + with open(import_path, 'r', encoding='utf-8') as f: + json_data = json.load(f) + if isinstance(json_data, list): + df = pd.DataFrame(json_data) + else: + df = pd.DataFrame([json_data]) + + # Build metadata map using configured columns + metadata_map = {} + filename_col = column_mapping.get('filename') + title_col = column_mapping.get('title') + subject_col = column_mapping.get('subject') + keywords_col = column_mapping.get('keywords') + + if not filename_col: + return jsonify({'error': 'Filename column is required'}), 400 + + for _, row in df.iterrows(): + filename = row.get(filename_col) + if pd.notna(filename) and str(filename).strip(): + filename_stem = Path(str(filename).strip()).stem.lower() + + metadata = { + 'title': str(row.get(title_col, '')).strip() if title_col and pd.notna(row.get(title_col)) else '', + 'subject': str(row.get(subject_col, '')).strip() if subject_col and pd.notna(row.get(subject_col)) else '', + 
'keywords': str(row.get(keywords_col, '')).strip() if keywords_col and pd.notna(row.get(keywords_col)) else '', + 'original_filename': str(filename).strip() + } + + metadata_map[filename_stem] = metadata + + # Store configured metadata map + imported_metadata[import_session_id] = metadata_map + + # Clean up temp file + Path(import_path).unlink(missing_ok=True) + + # Get stats + stats = { + 'total_records': len(metadata_map), + 'with_title': sum(1 for v in metadata_map.values() if v.get('title')), + 'with_subject': sum(1 for v in metadata_map.values() if v.get('subject')), + 'with_keywords': sum(1 for v in metadata_map.values() if v.get('keywords')) + } + + return jsonify({ + 'success': True, + 'import_session_id': import_session_id, + 'stats': stats, + 'message': f'Configured mapping for {stats["total_records"]} records' + }) + + except Exception as e: + import logging + logging.getLogger(__name__).error(f"Import configuration failed: {e}") + return jsonify({'error': f'Import configuration failed: {str(e)}'}), 500 + +@app.route('/preview-import', methods=['POST']) +@login_required +def preview_import(): + """Preview file structure and suggest field mappings.""" + if 'import_file' not in request.files: + return jsonify({'error': 'No file provided'}), 400 + + file = request.files['import_file'] + if file.filename == '': + return jsonify({'error': 'No file selected'}), 400 + + try: + # Save temp file + import_filename = safe_filename(file.filename) + temp_path = Path(app.config['UPLOAD_FOLDER']) / import_filename + file.save(str(temp_path)) + + # Preview file structure + importer = MetadataImporter() + columns, sample_rows, suggestions = importer.preview_file_structure(str(temp_path)) + + # Clean up temp file + temp_path.unlink() + + # Format suggestions for frontend + formatted_suggestions = {} + for source_field, suggestion_data in suggestions.items(): + formatted_suggestions[source_field] = { + 'best_match': suggestion_data['best_match'], + 'confidence': 
round(suggestion_data['confidence'], 2), + 'alternatives': [ + {'field': alt['field'], 'confidence': round(alt['confidence'], 2)} + for alt in suggestion_data.get('alternatives', []) + ] + } + + return jsonify({ + 'success': True, + 'columns': columns, + 'sample_rows': sample_rows[:5], # Limit to 5 rows + 'suggestions': formatted_suggestions, + 'filename': import_filename + }) + + except Exception as e: + import logging + logging.getLogger(__name__).error(f"Preview failed: {e}") + # Try to clean up temp file + try: + if temp_path.exists(): + temp_path.unlink() + except: + pass + return jsonify({'error': f'Preview failed: {str(e)}'}), 500 + +@app.route('/stats') +@login_required +def get_stats(): + """Get Excel metadata statistics.""" + try: + lookup = get_metadata_lookup() + stats = lookup.get_stats() + return jsonify({ + 'success': True, + 'stats': stats + }) + except Exception as e: + return jsonify({'error': str(e)}), 500 + +# Template Management Endpoints +template_manager = TemplateManager() + +@app.route('/templates/list', methods=['GET']) +@login_required +def list_templates(): + """List all available templates.""" + try: + templates = template_manager.list_templates() + return jsonify({ + 'success': True, + 'templates': templates + }) + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@app.route('/templates/save', methods=['POST']) +@login_required +def save_template(): + """Save a new template.""" + try: + data = request.json + name = data.get('name', '').strip() + + if not name: + return jsonify({'error': 'Template name is required'}), 400 + + template = template_manager.create_template( + name=name, + title_template=data.get('title', ''), + subject_template=data.get('subject', ''), + keywords_template=data.get('keywords', ''), + description=data.get('description', '') + ) + + success = template_manager.save_template(template) + + if success: + return jsonify({ + 'success': True, + 'message': f'Template "{name}" saved successfully', + 
'template': template + }) + else: + return jsonify({'error': 'Failed to save template'}), 500 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@app.route('/templates/load/', methods=['GET']) +@login_required +def load_template(name): + """Load a template by name.""" + try: + template = template_manager.load_template(name) + + if template: + return jsonify({ + 'success': True, + 'template': template + }) + else: + return jsonify({'error': f'Template "{name}" not found'}), 404 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@app.route('/templates/delete/', methods=['DELETE']) +@login_required +def delete_template(name): + """Delete a template.""" + try: + success = template_manager.delete_template(name) + + if success: + return jsonify({ + 'success': True, + 'message': f'Template "{name}" deleted successfully' + }) + else: + return jsonify({'error': f'Template "{name}" not found'}), 404 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@app.route('/templates/apply', methods=['POST']) +@login_required +def apply_template(): + """Apply a template to generate metadata for files.""" + try: + data = request.json + template_name = data.get('template_name', '').strip() + file_indices = data.get('file_indices', []) + session_id = data.get('session_id') + custom_vars = data.get('custom_vars', {}) + + if not template_name: + return jsonify({'error': 'Template name is required'}), 400 + + if not session_id or session_id not in sessions: + return jsonify({'error': 'Invalid or expired session'}), 400 + + # Load template + template = template_manager.load_template(template_name) + if not template: + return jsonify({'error': f'Template "{template_name}" not found'}), 404 + + # Apply template to each file + results = [] + for file_index in file_indices: + if file_index >= len(sessions[session_id]['files']): + continue + + file_info = sessions[session_id]['files'][file_index] + filename = file_info.get('filename', 
'unknown') + + # Generate metadata from template + metadata = template_manager.apply_template( + template=template, + filename=filename, + user='web_user', + custom_vars=custom_vars + ) + + # Update file metadata in session + sessions[session_id]['files'][file_index]['suggested_metadata'] = metadata + + results.append({ + 'file_index': file_index, + 'filename': filename, + 'metadata': metadata + }) + + return jsonify({ + 'success': True, + 'message': f'Template applied to {len(results)} file(s)', + 'results': results + }) + + except Exception as e: + return jsonify({'error': str(e)}), 500 + +@app.route('/templates/preview', methods=['POST']) +@login_required +def preview_template(): + """Preview template output with sample data.""" + try: + data = request.json + template = { + 'name': 'preview', + 'title': data.get('title', ''), + 'subject': data.get('subject', ''), + 'keywords': data.get('keywords', '') + } + + sample_filename = data.get('sample_filename', 'example.pdf') + custom_vars = data.get('custom_vars', {}) + + preview = template_manager.preview_template( + template=template, + sample_filename=sample_filename, + user='web_user', + custom_vars=custom_vars + ) + + # Also get available variables + available_vars = template_manager.get_available_variables() + + return jsonify({ + 'success': True, + 'preview': preview, + 'available_variables': available_vars + }) + + except Exception as e: + return jsonify({'error': str(e)}), 500 + +def open_browser(): + """Open browser after short delay.""" + sleep(1.5) + webbrowser.open('http://localhost:5001') + +if __name__ == '__main__': + print("="*60) + print(f"{Config.APP_NAME} v{Config.APP_VERSION} - Web Interface") + print("="*60) + + # Check dependencies + print("\n🔍 Checking dependencies...") + + # Check Excel file + if not EXCEL_PATH.exists(): + print(f"⚠️ Warning: Excel file not found at {EXCEL_PATH}") + print(" Excel metadata lookup will not be available") + print(" Please ensure the Excel file is in the project 
root") + else: + print(f"✓ Excel file found: {EXCEL_PATH.name}") + + # Check OpenAI API key (optional) + if Config.OPENAI_API_KEY: + print("✓ OpenAI API key configured (AI metadata generation available)") + else: + print("ℹ️ OpenAI API key not configured (AI generation disabled)") + + # Check ExifTool (optional) + if Config.check_exiftool(): + print("✓ ExifTool available for enhanced metadata operations") + else: + print("ℹ️ ExifTool not installed (using Python libraries)") + + print("\nMetadata sources available:") + print(" • Excel lookup (Celum ID mapping)") + if Config.OPENAI_API_KEY: + print(" • AI generation (OpenAI)") + print(" • Manual entry") + print(" • File import (CSV/Excel/JSON)") + + print("\nStarting server...") + + # Docker mode configuration + if DOCKER_MODE: + print("Running in Docker mode") + print("Server will be accessible at http://0.0.0.0:5001") + host = '0.0.0.0' + else: + print("Opening browser at http://localhost:5001") + host = '127.0.0.1' + # Open browser in background (only in local mode) + threading.Thread(target=open_browser, daemon=True).start() + + print("\nPress Ctrl+C to stop the server") + print("="*60) + + # Clean up old files on startup + if DOCKER_MODE: + print("\n🧹 Cleaning up old files...") + cleanup_old_files(max_age_hours=24) + + # Run Flask app + app.run(debug=False, port=5001, host=host)