solventum-image-metadata/backend/app/processors/extractors/exiftool_extractor.py
SamoilenkoVadym 563d476a94 feat(backend): migrate from Flask to FastAPI with Redis sessions
- Create FastAPI application with async I/O
- Implement Redis session storage (fixes session loss on restart)
- Add JWT authentication with refresh tokens
- Add Microsoft SSO support via MSAL
- Copy all processors from src/ (100% reused, no changes)
- Create file upload/download endpoints
- Create metadata update endpoints
- Create template CRUD endpoints
- Add SQLAlchemy async database models
- Add Docker Compose configuration with Redis

Solves critical issues:
- Session management: Redis replaces in-memory dicts
- Scalability: Async FastAPI + microservices architecture
- File handling: Persistent storage with auto-cleanup

Key files:
- backend/app/main.py - FastAPI entry point
- backend/app/core/redis_client.py - Session store
- backend/app/core/auth.py - JWT authentication
- backend/app/api/* - All REST endpoints
- backend/app/processors/ - Reused from src/

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
2026-02-09 13:14:37 +00:00

174 lines
5.8 KiB
Python

"""Unified metadata extractor using ExifTool for images, video, and PDF files."""
from typing import Dict, Optional
from pathlib import Path
import logging
try:
from exiftool import ExifToolHelper
EXIFTOOL_AVAILABLE = True
except ImportError:
EXIFTOOL_AVAILABLE = False
from ..base_extractor import BaseExtractor
from ..utils import get_logger
logger = get_logger(__name__)
class ExifToolExtractor(BaseExtractor):
    """
    Extract metadata using ExifTool.

    Supports images (JPEG, PNG, GIF, TIFF, HEIC, RAW),
    videos (MP4, MOV, AVI, MKV), and PDF metadata extraction.

    Note: This does NOT extract content (text) from files - only metadata.
    For content extraction, use the regular extractors (PDFExtractor, ImageExtractor with OCR).
    """

    # Map ExifTool tags to our standard metadata fields.
    # Insertion order is the priority order: for each standard field, the
    # first tag in this mapping that yields a non-empty value wins.
    TAG_MAPPING = {
        # Images (JPEG/PNG/TIFF)
        'EXIF:ImageDescription': 'title',
        'XMP:Description': 'subject',
        'IPTC:Caption-Abstract': 'subject',
        'IPTC:Headline': 'title',
        'XMP:Title': 'title',
        'EXIF:XPSubject': 'subject',
        'EXIF:XPKeywords': 'keywords',
        'IPTC:Keywords': 'keywords',
        'XMP:Subject': 'keywords',
        # PDF
        'PDF:Title': 'title',
        'PDF:Subject': 'subject',
        'PDF:Keywords': 'keywords',
        # Video (QuickTime/MP4)
        'QuickTime:Title': 'title',
        'QuickTime:Description': 'subject',
        'QuickTime:Keywords': 'keywords',
        'UserData:Title': 'title',
        'UserData:Description': 'subject',
    }

    # Keys present in every dictionary returned by read_metadata().
    _STANDARD_FIELDS = ('title', 'subject', 'keywords')

    def __init__(self):
        """
        Initialize ExifTool extractor.

        Raises:
            ImportError: If the PyExifTool package could not be imported.
        """
        if not EXIFTOOL_AVAILABLE:
            raise ImportError(
                "PyExifTool not installed. Install with: pip install PyExifTool>=0.5.6\n"
                "Also ensure ExifTool is installed on your system."
            )

    def extract_content(self, file_path: str) -> str:
        """
        ExifTool does not extract text content - only metadata.

        This method returns empty string. For content extraction:
        - PDFs: Use PDFExtractor
        - Images: Use ImageExtractor with OCR
        - Office docs: Use OfficeExtractor

        Args:
            file_path: Path to the file

        Returns:
            Empty string (ExifTool doesn't extract content)
        """
        logger.debug(f"ExifToolExtractor.extract_content called for {file_path} - returning empty (metadata only)")
        return ""

    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read metadata using ExifTool.

        Extracts title, subject, and keywords from various metadata fields.
        Supports images, videos, and PDFs.

        This method is best-effort: any failure is logged and an all-empty
        result is returned instead of raising.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary with metadata (title, subject, keywords)
        """
        try:
            with ExifToolHelper() as et:
                metadata_list = et.get_metadata([file_path])

            if not metadata_list:
                logger.warning(f"No metadata returned by ExifTool for {file_path}")
                return self._empty_result()

            # A single file was requested, so only the first entry is relevant.
            result = self._map_tags(metadata_list[0])

            logger.info(f"Extracted metadata from {Path(file_path).name}: "
                        f"title={bool(result['title'])}, "
                        f"subject={bool(result['subject'])}, "
                        f"keywords={bool(result['keywords'])}")
            return result
        except Exception as e:
            # Deliberately broad: callers rely on always getting a dict back.
            logger.error(f"ExifTool extraction failed for {file_path}: {e}")
            return self._empty_result()

    def get_all_tags(self, file_path: str) -> Dict:
        """
        Get all available metadata tags from a file.

        Useful for debugging or exploring available metadata fields.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary of all metadata tags (empty on failure or if
            ExifTool returned nothing)
        """
        try:
            with ExifToolHelper() as et:
                metadata_list = et.get_metadata([file_path])
            if metadata_list:
                return metadata_list[0]
            return {}
        except Exception as e:
            logger.error(f"Failed to get all tags for {file_path}: {e}")
            return {}

    def get_specific_tags(self, file_path: str, tags: list) -> Dict:
        """
        Get specific metadata tags from a file.

        More efficient than get_all_tags when you know which tags you need.

        Args:
            file_path: Path to the file
            tags: List of tag names (e.g., ['EXIF:ImageDescription', 'PDF:Title'])

        Returns:
            Dictionary of requested tags (empty on failure or if ExifTool
            returned nothing)
        """
        try:
            with ExifToolHelper() as et:
                metadata_list = et.get_tags([file_path], tags=tags)
            if metadata_list:
                return metadata_list[0]
            return {}
        except Exception as e:
            logger.error(f"Failed to get specific tags for {file_path}: {e}")
            return {}

    def _empty_result(self) -> Dict[str, str]:
        """Return a fresh result dictionary with every standard field empty."""
        return dict.fromkeys(self._STANDARD_FIELDS, '')

    def _map_tags(self, exif_data: Dict) -> Dict[str, str]:
        """
        Translate a raw ExifTool tag dictionary into the standard fields.

        Args:
            exif_data: Tag-name -> value mapping for one file

        Returns:
            Dictionary with metadata (title, subject, keywords)
        """
        result = self._empty_result()
        for exif_tag, standard_key in self.TAG_MAPPING.items():
            # First non-empty value wins (priority based on TAG_MAPPING order)
            if result[standard_key]:
                continue
            value = self._normalize_value(exif_data.get(exif_tag))
            if value:
                result[standard_key] = value
        return result

    @staticmethod
    def _normalize_value(value) -> str:
        """
        Convert a raw tag value into a stripped string.

        Emptiness is decided AFTER conversion to text, so falsy-but-meaningful
        values such as the number 0 are kept as "0" rather than discarded.

        Args:
            value: Raw tag value (scalar, list of scalars, or None)

        Returns:
            Stripped string; list values are joined with ", " after dropping
            None and blank entries. Empty string if there is nothing usable.
        """
        if value is None:
            return ''
        # Handle list values (keywords often come as arrays)
        if isinstance(value, list):
            items = (str(item).strip() for item in value if item is not None)
            return ', '.join(item for item in items if item)
        return str(value).strip()