# solventum-image-metadata/backend/app/processors/base_extractor.py

"""Base class for all content extractors."""

from abc import ABC, abstractmethod
from typing import Dict, Optional

class BaseExtractor(ABC):
    """Abstract base class for content extractors.

    Subclasses must implement ``extract_content`` and ``read_metadata``.
    The text helpers ``truncate_content`` and ``clean_text`` are concrete
    and do not depend on any instance state.
    """

    @abstractmethod
    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from file.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content
        """

    @abstractmethod
    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read existing metadata from file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary of metadata fields
        """

    def truncate_content(self, content: str, max_length: int = 3000) -> str:
        """
        Truncate content to maximum length for AI processing.

        Content that already fits is returned unchanged. Longer content is
        cut to its first ``max_length`` characters and an ellipsis marker
        ("...") is appended, so a truncated result is up to three
        characters longer than ``max_length``.

        Args:
            content: Text content
            max_length: Maximum number of characters kept from ``content``

        Returns:
            The original content, or its truncated prefix followed by "..."
        """
        if len(content) <= max_length:
            return content
        return content[:max_length] + "..."

    def clean_text(self, text: str) -> str:
        """
        Clean extracted text (remove excessive whitespace, etc.).

        Within each line, runs of whitespace are collapsed to a single
        space and leading/trailing whitespace is removed. Lines that are
        empty after that are dropped, so consecutive line breaks collapse
        into one while the remaining line structure is preserved.

        Args:
            text: Raw text

        Returns:
            Cleaned text, with lines joined by a single newline
        """
        # Normalise whitespace one line at a time. Collapsing the whole text
        # first would also swallow the newlines and flatten it to one line.
        normalised = (' '.join(line.split()) for line in text.splitlines())
        # Skip lines that were blank or whitespace-only.
        return '\n'.join(line for line in normalised if line)