"""Base class for all content extractors."""

from abc import ABC, abstractmethod
from typing import Dict, Optional


class BaseExtractor(ABC):
    """Abstract base class for content extractors.

    Subclasses must implement ``extract_content`` and ``read_metadata``.
    The text helpers ``truncate_content`` and ``clean_text`` are concrete
    and do not read any instance state.
    """

    @abstractmethod
    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from file.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content
        """

    @abstractmethod
    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read existing metadata from file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary of metadata fields
        """

    def truncate_content(self, content: str, max_length: int = 3000) -> str:
        """
        Truncate content to maximum length for AI processing.

        Args:
            content: Text content
            max_length: Maximum number of characters kept from ``content``

        Returns:
            ``content`` unchanged when it is at most ``max_length``
            characters long; otherwise its first ``max_length`` characters
            followed by ``"..."``. The marker is appended on top of
            ``max_length``, so a truncated result is up to three characters
            longer than ``max_length``.
        """
        if len(content) <= max_length:
            return content
        return content[:max_length] + "..."

    def clean_text(self, text: str) -> str:
        """
        Clean extracted text (remove excessive whitespace, etc.).

        Runs of whitespace inside a line are collapsed to a single space,
        leading/trailing whitespace on each line is removed, and lines that
        are empty or whitespace-only are dropped. Remaining lines are joined
        with a single newline, so line structure is preserved.

        Args:
            text: Raw text

        Returns:
            Cleaned text
        """
        # Normalise each line on its own. Collapsing whitespace across the
        # whole string would also swallow the newlines and flatten the text
        # to one line, leaving nothing for the blank-line filter to remove.
        normalised_lines = (' '.join(line.split()) for line in text.split('\n'))
        # A line that was blank or whitespace-only is now '', so filtering on
        # truthiness removes the runs of empty lines.
        return '\n'.join(line for line in normalised_lines if line)