- Create FastAPI application with async I/O
- Implement Redis session storage (fixes session loss on restart)
- Add JWT authentication with refresh tokens
- Add Microsoft SSO support via MSAL
- Copy all processors from src/ (100% reused, no changes)
- Create file upload/download endpoints
- Create metadata update endpoints
- Create template CRUD endpoints
- Add SQLAlchemy async database models
- Add Docker Compose configuration with Redis

Solves critical issues:
- Session management: Redis replaces in-memory dicts
- Scalability: Async FastAPI + microservices architecture
- File handling: Persistent storage with auto-cleanup

Key files:
- backend/app/main.py - FastAPI entry point
- backend/app/core/redis_client.py - Session store
- backend/app/core/auth.py - JWT authentication
- backend/app/api/* - All REST endpoints
- backend/app/processors/ - Reused from src/

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
179 lines
5.8 KiB
Python
179 lines
5.8 KiB
Python
"""Image content and metadata extractor."""
|
|
|
|
import pytesseract
|
|
import piexif
|
|
from PIL import Image
|
|
from typing import Dict
|
|
import os
|
|
|
|
from ..base_extractor import BaseExtractor
|
|
from ..config import Config
|
|
from ..utils import get_logger
|
|
|
|
# Module-level logger, created through the project's get_logger helper and
# used by every ImageExtractor method below.
logger = get_logger(__name__)
|
|
|
|
|
|
class ImageExtractor(BaseExtractor):
    """Extractor for image files (JPEG, PNG, etc.) with OCR and EXIF metadata."""

    def __init__(self):
        """Initialize image extractor.

        Points pytesseract at the Tesseract binary named by
        ``Config.TESSERACT_PATH`` (only when that path exists on disk) and
        stores the OCR language setting from ``Config.OCR_LANGUAGES``.
        """
        # NOTE(review): BaseExtractor.__init__ is not invoked here; confirm the
        # base class needs no initialization of its own.
        self.tesseract_path = Config.TESSERACT_PATH
        if self.tesseract_path and os.path.exists(self.tesseract_path):
            # pytesseract reads the binary location from ``tesseract_cmd``;
            # assigning to any other attribute name has no effect.
            pytesseract.pytesseract.tesseract_cmd = self.tesseract_path
        # Get OCR languages from config (supports Chinese, Japanese, Korean, etc.)
        self.ocr_lang = Config.OCR_LANGUAGES

    @staticmethod
    def _to_text(value) -> str:
        """Return *value* as a stripped string, decoding bytes as UTF-8.

        Bytes that are not valid UTF-8 are dropped rather than raising.
        """
        if isinstance(value, bytes):
            value = value.decode('utf-8', errors='ignore')
        return str(value).strip()

    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from image using OCR.

        Opens the image with Pillow and runs ``pytesseract.image_to_string``
        on it with the language string stored in ``self.ocr_lang``.

        Args:
            file_path: Path to the image file

        Returns:
            The OCR text after it has been passed through ``self.clean_text``,
            or an empty string when OCR yields no non-whitespace text or when
            any error occurs. Errors are logged and never propagated.
        """
        try:
            logger.info(f"Starting image OCR extraction from {file_path}")

            # The context manager releases the file handle as soon as OCR is done.
            with Image.open(file_path) as image:
                # Apply OCR with multi-language support
                text = pytesseract.image_to_string(image, lang=self.ocr_lang)

            if text and text.strip():
                cleaned_text = self.clean_text(text)
                logger.info(f"Successfully extracted {len(cleaned_text)} characters from {file_path}")
                return cleaned_text

            logger.warning(f"OCR extraction returned empty content for {file_path}")
            return ""

        except Exception as e:
            logger.error(f"Failed to extract content from image {file_path}: {e}", exc_info=True)
            return ""

    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read image metadata.

        EXIF fields are read for every file. For ``.png`` files the entries
        returned by ``_read_iptc_metadata`` are merged on top, so they win when
        a key appears in both sources.

        Args:
            file_path: Path to the image file

        Returns:
            Dictionary of metadata fields (lower-cased names mapped to string
            values), or an empty dictionary when reading fails. Errors are
            logged and never propagated.
        """
        try:
            # Try EXIF data
            metadata = self._read_exif_metadata(file_path)

            # splitext only looks at the final path component, so dots in
            # directory names or extensionless files are not mistaken for ".png".
            extension = os.path.splitext(file_path)[1].lower()
            if extension == '.png':
                metadata.update(self._read_iptc_metadata(file_path))

            logger.info(f"Successfully read metadata from {file_path}")
            return metadata

        except Exception as e:
            logger.error(f"Failed to read image metadata from {file_path}: {e}", exc_info=True)
            return {}

    def _read_exif_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read EXIF metadata from image.

        For ``.jpg``/``.jpeg`` paths piexif is tried first and only its
        ``"0th"`` IFD is reported. If piexif fails, or the file has another
        extension, Pillow's ``_getexif()`` is used instead. Tag ids that are
        missing from piexif's ``"0th"`` table are reported as ``tag_<id>``.

        Args:
            file_path: Path to image file

        Returns:
            Dictionary of EXIF metadata (lower-cased tag names mapped to
            string values); empty when no EXIF data is found or on any error.
        """
        try:
            known_tags = piexif.TAGS["0th"]

            # Try piexif first for JPEG
            if file_path.lower().endswith(('.jpg', '.jpeg')):
                try:
                    exif_dict = piexif.load(file_path)
                    metadata = {}

                    # Extract commonly useful EXIF fields
                    for tag, value in (exif_dict.get("0th") or {}).items():
                        try:
                            tag_name = known_tags.get(tag, {}).get("name", f"tag_{tag}")
                            metadata[tag_name.lower()] = self._to_text(value)
                        except Exception as tag_error:
                            # Best effort: one unreadable tag must not discard the rest.
                            logger.debug(f"Skipping EXIF tag {tag}: {tag_error}")

                    return metadata
                except Exception as e:
                    logger.debug(f"piexif extraction failed: {e}")

            # Fallback to PIL for all image types
            with Image.open(file_path) as image:
                # _getexif is not defined on every image class, hence the getattr.
                exif_reader = getattr(image, '_getexif', None)
                exif_data = exif_reader() if exif_reader is not None else None

            metadata = {}
            if exif_data:
                for tag_id, value in exif_data.items():
                    tag_name = known_tags.get(tag_id, {}).get("name", f"tag_{tag_id}")
                    metadata[tag_name.lower()] = self._to_text(value)

            return metadata

        except Exception as e:
            logger.debug(f"EXIF metadata extraction failed: {e}")
            return {}

    def _read_iptc_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read the entries of Pillow's ``image.info`` mapping as metadata.

        NOTE: despite the method name, no IPTC parsing happens here; the
        method only reports whatever Pillow stored in ``image.info`` when it
        opened the file.

        Args:
            file_path: Path to image file

        Returns:
            Dictionary mapping each lower-cased ``image.info`` key to its
            value as a stripped string; empty on any error.
        """
        try:
            with Image.open(file_path) as image:
                # Copy the mapping so it can be read after the file is closed.
                info = dict(getattr(image, 'info', None) or {})

            return {str(key).lower(): self._to_text(value) for key, value in info.items()}

        except Exception as e:
            logger.debug(f"IPTC metadata extraction failed: {e}")
            return {}
|