# solventum-image-metadata/backend/app/processors/extractors/image_extractor.py

"""Image content and metadata extractor."""

import pytesseract
import piexif
from PIL import Image
from typing import Dict
import os

from ..base_extractor import BaseExtractor
from ..config import Config
from ..utils import get_logger

# Module-level logger obtained from the project's get_logger helper, keyed by
# this module's name; every method of ImageExtractor below logs through it.
logger = get_logger(__name__)


class ImageExtractor(BaseExtractor):
    """Extractor for image files (JPEG, PNG, etc.) with OCR and EXIF metadata.

    Text content is obtained by running Tesseract OCR (via ``pytesseract``)
    over the image. Metadata is read from the EXIF "0th" IFD (via ``piexif``
    for JPEG, falling back to Pillow) and, for PNG files, from the
    ``info`` dictionary Pillow populates when opening the image.

    All public methods are best-effort: failures are logged and an empty
    result is returned instead of raising.
    """

    def __init__(self):
        """Initialize image extractor.

        Reads the Tesseract binary location and the OCR language string from
        ``Config``. The binary path is only applied when it is set and exists
        on disk; otherwise pytesseract keeps its default command lookup.
        """
        self.tesseract_path = Config.TESSERACT_PATH
        if self.tesseract_path:
            if os.path.exists(self.tesseract_path):
                # pytesseract reads the executable location from the
                # module-level ``tesseract_cmd`` attribute; assigning any
                # other attribute name is silently ignored.
                pytesseract.pytesseract.tesseract_cmd = self.tesseract_path
            else:
                logger.warning(
                    f"Configured Tesseract path does not exist: {self.tesseract_path}"
                )
        # Get OCR languages from config (supports Chinese, Japanese, Korean, etc.)
        self.ocr_lang = Config.OCR_LANGUAGES

    @staticmethod
    def _to_text(value) -> str:
        """Convert a raw metadata value to a stripped string.

        Args:
            value: Raw value; ``bytes`` are decoded as UTF-8 with undecodable
                bytes dropped, anything else is passed through ``str()``.

        Returns:
            The stripped string form of ``value``.
        """
        if isinstance(value, bytes):
            value = value.decode('utf-8', errors='ignore')
        return str(value).strip()

    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from image using OCR.

        Uses pytesseract to perform optical character recognition on the
        image, with the languages configured in ``self.ocr_lang``.

        Args:
            file_path: Path to the image file

        Returns:
            Cleaned OCR text, or an empty string when OCR yields no text or
            any error occurs (errors are logged, not raised).
        """
        try:
            logger.info(f"Starting image OCR extraction from {file_path}")

            # The context manager releases the underlying file handle as soon
            # as OCR has finished, even if pytesseract raises.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image, lang=self.ocr_lang)

            if text and text.strip():
                cleaned_text = self.clean_text(text)
                logger.info(f"Successfully extracted {len(cleaned_text)} characters from {file_path}")
                return cleaned_text

            logger.warning(f"OCR extraction returned empty content for {file_path}")
            return ""

        except Exception as e:
            # Best-effort contract: log with traceback and report "no content".
            logger.error(f"Failed to extract content from image {file_path}: {e}", exc_info=True)
            return ""

    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read image metadata.

        Always attempts EXIF extraction; for files with a ``.png`` extension
        the entries of Pillow's ``info`` dictionary are merged on top
        (overriding EXIF keys with the same name).

        Args:
            file_path: Path to the image file

        Returns:
            Dictionary of lower-cased metadata field names to string values,
            or an empty dictionary on failure (errors are logged, not raised).
        """
        try:
            # splitext only inspects the final path component, so dots in
            # directory names or extensionless files are not misread.
            file_ext = os.path.splitext(file_path)[1].lower().lstrip('.')

            # Try EXIF data
            metadata = self._read_exif_metadata(file_path)

            # For PNG files, merge the additional Pillow-provided info entries
            if file_ext == 'png':
                metadata.update(self._read_iptc_metadata(file_path))

            logger.info(f"Successfully read metadata from {file_path}")
            return metadata

        except Exception as e:
            logger.error(f"Failed to read image metadata from {file_path}: {e}", exc_info=True)
            return {}

    def _read_exif_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read EXIF metadata from image.

        For ``.jpg``/``.jpeg`` paths the "0th" IFD is read with piexif. If
        piexif fails, or for any other extension, Pillow's ``_getexif()`` is
        used when the opened image provides it. Tag ids are mapped to names
        through piexif's "0th" tag table; ids missing from that table are
        reported as ``tag_<id>``.

        Args:
            file_path: Path to image file

        Returns:
            Dictionary of lower-cased EXIF tag names to string values; empty
            when no EXIF data is available or extraction fails.
        """
        try:
            zeroth_tags = piexif.TAGS["0th"]

            # Try piexif first for JPEG
            if file_path.lower().endswith(('.jpg', '.jpeg')):
                try:
                    exif_dict = piexif.load(file_path)
                    metadata = {}

                    # Extract commonly useful EXIF fields
                    for tag, value in exif_dict.get("0th", {}).items():
                        # Unknown tag ids get a synthetic name instead of
                        # raising KeyError and discarding the whole result.
                        tag_name = zeroth_tags.get(tag, {}).get("name", f"tag_{tag}")
                        try:
                            metadata[tag_name.lower()] = self._to_text(value)
                        except Exception as e:
                            logger.debug(f"Skipping EXIF tag {tag_name}: {e}")

                    return metadata
                except Exception as e:
                    logger.debug(f"piexif extraction failed: {e}")

            # Fallback to PIL for all image types
            metadata = {}
            with Image.open(file_path) as image:
                # _getexif only exists on some Pillow image classes; call it
                # once, since each call re-parses the EXIF payload.
                getexif = getattr(image, '_getexif', None)
                exif_data = getexif() if callable(getexif) else None

            if exif_data is not None:
                for tag_id, value in exif_data.items():
                    tag_name = zeroth_tags.get(tag_id, {}).get("name", f"tag_{tag_id}")
                    try:
                        metadata[tag_name.lower()] = self._to_text(value)
                    except Exception as e:
                        logger.debug(f"Skipping EXIF tag {tag_name}: {e}")

            return metadata

        except Exception as e:
            logger.debug(f"EXIF metadata extraction failed: {e}")
            return {}

    def _read_iptc_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read supplementary metadata from image.

        Despite the name, this does not parse IPTC records: it returns the
        entries of the ``info`` dictionary that Pillow fills in when opening
        the file, converted to strings.

        Args:
            file_path: Path to image file

        Returns:
            Dictionary of lower-cased ``info`` keys to string values; empty
            when the image cannot be opened or read.
        """
        try:
            with Image.open(file_path) as image:
                # Copy so the values stay usable after the image is closed.
                info = dict(getattr(image, 'info', None) or {})

            return {
                str(key).lower(): self._to_text(value)
                for key, value in info.items()
            }

        except Exception as e:
            logger.debug(f"IPTC metadata extraction failed: {e}")
            return {}