- Create FastAPI application with async I/O
- Implement Redis session storage (fixes session loss on restart)
- Add JWT authentication with refresh tokens
- Add Microsoft SSO support via MSAL
- Copy all processors from src/ (100% reused, no changes)
- Create file upload/download endpoints
- Create metadata update endpoints
- Create template CRUD endpoints
- Add SQLAlchemy async database models
- Add Docker Compose configuration with Redis

Solves critical issues:
- Session management: Redis replaces in-memory dicts
- Scalability: Async FastAPI + microservices architecture
- File handling: Persistent storage with auto-cleanup

Key files:
- backend/app/main.py - FastAPI entry point
- backend/app/core/redis_client.py - Session store
- backend/app/core/auth.py - JWT authentication
- backend/app/api/* - All REST endpoints
- backend/app/processors/ - Reused from src/

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
174 lines
5.8 KiB
Python
174 lines
5.8 KiB
Python
"""Unified metadata extractor using ExifTool for images, video, and PDF files."""
|
|
|
|
from typing import Dict, Optional
|
|
from pathlib import Path
|
|
import logging
|
|
|
|
try:
|
|
from exiftool import ExifToolHelper
|
|
EXIFTOOL_AVAILABLE = True
|
|
except ImportError:
|
|
EXIFTOOL_AVAILABLE = False
|
|
|
|
from ..base_extractor import BaseExtractor
|
|
from ..utils import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
class ExifToolExtractor(BaseExtractor):
    """
    Extract metadata using ExifTool.

    Supports images (JPEG, PNG, GIF, TIFF, HEIC, RAW),
    videos (MP4, MOV, AVI, MKV), and PDF metadata extraction.

    Note: This does NOT extract content (text) from files - only metadata.
    For content extraction, use the regular extractors (PDFExtractor, ImageExtractor with OCR).
    """

    # Map ExifTool tags to our standard metadata fields.
    # Dict order doubles as priority order: for each standard field, the
    # first tag listed here that carries a non-empty value wins.
    TAG_MAPPING = {
        # Images (JPEG/PNG/TIFF)
        'EXIF:ImageDescription': 'title',
        'XMP:Description': 'subject',
        'IPTC:Caption-Abstract': 'subject',
        'IPTC:Headline': 'title',
        'XMP:Title': 'title',
        'EXIF:XPSubject': 'subject',
        'EXIF:XPKeywords': 'keywords',
        'IPTC:Keywords': 'keywords',
        'XMP:Subject': 'keywords',

        # PDF
        'PDF:Title': 'title',
        'PDF:Subject': 'subject',
        'PDF:Keywords': 'keywords',

        # Video (QuickTime/MP4)
        'QuickTime:Title': 'title',
        'QuickTime:Description': 'subject',
        'QuickTime:Keywords': 'keywords',
        'UserData:Title': 'title',
        'UserData:Description': 'subject',
    }

    def __init__(self):
        """
        Initialize ExifTool extractor.

        Raises:
            ImportError: If the PyExifTool package could not be imported.
        """
        # NOTE(review): BaseExtractor.__init__ is not invoked here (same as
        # before this change) - confirm the base class needs no set-up.
        if not EXIFTOOL_AVAILABLE:
            raise ImportError(
                "PyExifTool not installed. Install with: pip install PyExifTool>=0.5.6\n"
                "Also ensure ExifTool is installed on your system."
            )

    @staticmethod
    def _empty_metadata() -> Dict[str, str]:
        """Return a fresh result dict with every standard field blank."""
        return {'title': '', 'subject': '', 'keywords': ''}

    @staticmethod
    def _stringify(value) -> str:
        """
        Turn a raw ExifTool tag value into stripped text.

        Args:
            value: Raw tag value (string, number, list, or None)

        Returns:
            The stripped text, or empty string when the value is absent/empty.
            Lists are joined with ', ' (keywords often come as arrays).
        """
        # A numeric 0 is falsy but is still real content, so it must not be
        # discarded by the emptiness check below.
        # NOTE(review): presumably ExifTool's JSON output can deliver
        # numeric-looking text (e.g. a title of "0") as a number - confirm.
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        if not value and not is_number:
            return ''
        if isinstance(value, list):
            return ', '.join(str(item) for item in value).strip()
        return str(value).strip()

    def _map_tags(self, exif_data: Dict) -> Dict[str, str]:
        """
        Collapse raw ExifTool tags into the standard title/subject/keywords dict.

        Args:
            exif_data: Tag-name -> value dictionary as returned by ExifTool

        Returns:
            Dictionary with metadata (title, subject, keywords)
        """
        result = self._empty_metadata()
        for exif_tag, standard_key in self.TAG_MAPPING.items():
            # First non-empty value wins (priority based on TAG_MAPPING order)
            if result[standard_key]:
                continue
            text = self._stringify(exif_data.get(exif_tag))
            if text:
                result[standard_key] = text
        return result

    def extract_content(self, file_path: str) -> str:
        """
        ExifTool does not extract text content - only metadata.

        This method returns empty string. For content extraction:
        - PDFs: Use PDFExtractor
        - Images: Use ImageExtractor with OCR
        - Office docs: Use OfficeExtractor

        Args:
            file_path: Path to the file

        Returns:
            Empty string (ExifTool doesn't extract content)
        """
        logger.debug(f"ExifToolExtractor.extract_content called for {file_path} - returning empty (metadata only)")
        return ""

    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read metadata using ExifTool.

        Extracts title, subject, and keywords from various metadata fields.
        Supports images, videos, and PDFs.

        Any failure is logged and reported as an all-blank result rather
        than raised, so callers always get the three standard keys.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary with metadata (title, subject, keywords)
        """
        try:
            # Only the fetch needs the ExifTool process; release it before mapping.
            with ExifToolHelper() as et:
                metadata_list = et.get_metadata([file_path])

            if not metadata_list:
                logger.warning(f"No metadata returned by ExifTool for {file_path}")
                return self._empty_metadata()

            result = self._map_tags(metadata_list[0])

            logger.info(f"Extracted metadata from {Path(file_path).name}: "
                        f"title={bool(result['title'])}, "
                        f"subject={bool(result['subject'])}, "
                        f"keywords={bool(result['keywords'])}")

            return result

        except Exception as e:
            # Deliberate best-effort: metadata is optional, so log and fall back.
            logger.error(f"ExifTool extraction failed for {file_path}: {e}")
            return self._empty_metadata()

    def get_all_tags(self, file_path: str) -> Dict:
        """
        Get all available metadata tags from a file.

        Useful for debugging or exploring available metadata fields.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary of all metadata tags (empty dict if none were
            returned or the lookup failed; failures are logged)
        """
        try:
            with ExifToolHelper() as et:
                metadata_list = et.get_metadata([file_path])
            return metadata_list[0] if metadata_list else {}
        except Exception as e:
            logger.error(f"Failed to get all tags for {file_path}: {e}")
            return {}

    def get_specific_tags(self, file_path: str, tags: list) -> Dict:
        """
        Get specific metadata tags from a file.

        More efficient than get_all_tags when you know which tags you need.

        Args:
            file_path: Path to the file
            tags: List of tag names (e.g., ['EXIF:ImageDescription', 'PDF:Title'])

        Returns:
            Dictionary of requested tags (empty dict if none were
            returned or the lookup failed; failures are logged)
        """
        try:
            with ExifToolHelper() as et:
                metadata_list = et.get_tags([file_path], tags=tags)
            return metadata_list[0] if metadata_list else {}
        except Exception as e:
            logger.error(f"Failed to get specific tags for {file_path}: {e}")
            return {}
|