solventum-image-metadata/backend/app/processors/extractors/image_extractor.py
SamoilenkoVadym 563d476a94 feat(backend): migrate from Flask to FastAPI with Redis sessions
- Create FastAPI application with async I/O
- Implement Redis session storage (fixes session loss on restart)
- Add JWT authentication with refresh tokens
- Add Microsoft SSO support via MSAL
- Copy all processors from src/ (100% reused, no changes)
- Create file upload/download endpoints
- Create metadata update endpoints
- Create template CRUD endpoints
- Add SQLAlchemy async database models
- Add Docker Compose configuration with Redis

Solves critical issues:
- Session management: Redis replaces in-memory dicts
- Scalability: Async FastAPI + microservices architecture
- File handling: Persistent storage with auto-cleanup

Key files:
- backend/app/main.py - FastAPI entry point
- backend/app/core/redis_client.py - Session store
- backend/app/core/auth.py - JWT authentication
- backend/app/api/* - All REST endpoints
- backend/app/processors/ - Reused from src/

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
2026-02-09 13:14:37 +00:00

179 lines
5.8 KiB
Python

"""Image content and metadata extractor."""
import pytesseract
import piexif
from PIL import Image
from typing import Dict
import os
from ..base_extractor import BaseExtractor
from ..config import Config
from ..utils import get_logger
# Module-level logger obtained from the project's get_logger helper, keyed by
# this module's name; used by ImageExtractor for all of its log output.
logger = get_logger(__name__)
class ImageExtractor(BaseExtractor):
    """Extractor for image files (JPEG, PNG, etc.) with OCR and EXIF metadata.

    Text content is obtained by running Tesseract OCR (through pytesseract)
    on the image. Metadata is read from the EXIF "0th" IFD via piexif for
    JPEG files, with a Pillow-based fallback for other formats, and is
    supplemented with the Pillow ``info`` dictionary for PNG files.

    All public methods are best-effort: failures are logged and an empty
    result is returned instead of raising.
    """

    def __init__(self):
        """Initialize image extractor.

        Points pytesseract at the Tesseract binary named by
        ``Config.TESSERACT_PATH`` when that path exists on disk, and stores
        the OCR language string from ``Config.OCR_LANGUAGES``.
        """
        self.tesseract_path = Config.TESSERACT_PATH
        if self.tesseract_path and os.path.exists(self.tesseract_path):
            # pytesseract looks up the executable through ``tesseract_cmd``;
            # assigning any other attribute name is silently ignored and the
            # configured path would never take effect.
            pytesseract.pytesseract.tesseract_cmd = self.tesseract_path
        # Get OCR languages from config (supports Chinese, Japanese, Korean, etc.)
        self.ocr_lang = Config.OCR_LANGUAGES

    @staticmethod
    def _to_text(value) -> str:
        """Return *value* as a stripped string, decoding bytes as UTF-8.

        Undecodable byte sequences are dropped rather than raising.
        """
        if isinstance(value, bytes):
            value = value.decode('utf-8', errors='ignore')
        return str(value).strip()

    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from image using OCR.

        Uses pytesseract to perform optical character recognition on the
        image, with the languages configured in ``self.ocr_lang``.

        Args:
            file_path: Path to the image file

        Returns:
            Cleaned extracted text, or an empty string when OCR yields no
            text or when extraction fails (the failure is logged, not raised).
        """
        try:
            logger.info(f"Starting image OCR extraction from {file_path}")
            # The context manager releases the underlying file handle as soon
            # as OCR has finished, even if pytesseract raises.
            with Image.open(file_path) as image:
                # Apply OCR with multi-language support
                text = pytesseract.image_to_string(image, lang=self.ocr_lang)

            if text and text.strip():
                cleaned_text = self.clean_text(text)
                logger.info(f"Successfully extracted {len(cleaned_text)} characters from {file_path}")
                return cleaned_text

            logger.warning(f"OCR extraction returned empty content for {file_path}")
            return ""
        except Exception as e:
            # Best-effort contract: callers receive "" rather than an exception.
            logger.error(f"Failed to extract content from image {file_path}: {e}", exc_info=True)
            return ""

    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read image metadata from EXIF data, plus PNG ``info`` entries.

        EXIF fields are read for every file. For ``.png`` files the entries
        of Pillow's ``info`` dictionary are merged on top, overriding EXIF
        keys of the same name.

        Args:
            file_path: Path to the image file

        Returns:
            Dictionary of metadata fields (lower-cased keys, string values);
            empty when nothing could be read or when reading fails (the
            failure is logged, not raised).
        """
        try:
            # Try EXIF data
            metadata = self._read_exif_metadata(file_path)

            # For PNG files, also merge the entries Pillow exposes via .info
            if file_path.lower().endswith('.png'):
                metadata.update(self._read_iptc_metadata(file_path))

            logger.info(f"Successfully read metadata from {file_path}")
            return metadata
        except Exception as e:
            logger.error(f"Failed to read image metadata from {file_path}: {e}", exc_info=True)
            return {}

    def _read_exif_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read EXIF metadata from image.

        JPEG files are parsed with piexif ("0th" IFD only). If that fails, or
        for any other file type, Pillow's EXIF reader is used instead. Tags
        without a known name are reported as ``tag_<id>``.

        Args:
            file_path: Path to image file

        Returns:
            Dictionary of EXIF metadata; empty when none is available or
            extraction fails.
        """
        try:
            zeroth_tags = piexif.TAGS["0th"]

            # Try piexif first for JPEG
            if file_path.lower().endswith(('.jpg', '.jpeg')):
                try:
                    exif_dict = piexif.load(file_path)
                    metadata = {}
                    # Extract commonly useful EXIF fields
                    for tag, value in exif_dict.get("0th", {}).items():
                        # Tolerate tags piexif has no name for instead of
                        # letting a KeyError discard the whole piexif result.
                        tag_name = zeroth_tags.get(tag, {}).get("name", f"tag_{tag}")
                        try:
                            metadata[tag_name.lower()] = self._to_text(value)
                        except Exception as e:
                            # Skip only the offending tag, but leave a trace.
                            logger.debug(f"Skipping EXIF tag {tag_name}: {e}")
                    return metadata
                except Exception as e:
                    logger.debug(f"piexif extraction failed: {e}")

            # Fallback to PIL for all image types
            with Image.open(file_path) as image:
                # _getexif is a private Pillow API that only some image
                # plugins provide; call it once and keep the result.
                read_exif = getattr(image, '_getexif', None)
                exif_data = read_exif() if read_exif is not None else None

            metadata = {}
            if exif_data is not None:
                for tag_id, value in exif_data.items():
                    tag_name = zeroth_tags.get(tag_id, {}).get("name", f"tag_{tag_id}")
                    metadata[tag_name.lower()] = self._to_text(value)
            return metadata
        except Exception as e:
            logger.debug(f"EXIF metadata extraction failed: {e}")
            return {}

    def _read_iptc_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read supplementary metadata from the image's Pillow ``info`` mapping.

        NOTE(review): despite the method name, no IPTC-specific parsing is
        done here; every entry of ``image.info`` is returned as text.

        Args:
            file_path: Path to image file

        Returns:
            Dictionary of ``info`` entries (lower-cased keys, string values);
            empty when the file cannot be read.
        """
        try:
            with Image.open(file_path) as image:
                # Copy the entries while the file is still open.
                info_items = list((getattr(image, 'info', None) or {}).items())

            return {str(key).lower(): self._to_text(value) for key, value in info_items}
        except Exception as e:
            logger.debug(f"IPTC metadata extraction failed: {e}")
            return {}