- Added Flask web interface for batch metadata processing - Added Excel-based metadata lookup (Celum ID mapping) - Dual-sheet support: DSB (primary) and Medsurg (fallback) - Unicode/hieroglyph support for CGA region (Chinese, Japanese, Korean) - Multi-format support: PDF, images, Office docs, video - OCR with multi-language support (Tesseract) - Filename matching without extension (case-insensitive) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
64 lines
1.5 KiB
Python
64 lines
1.5 KiB
Python
"""Base class for all content extractors."""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import Dict, Optional
|
|
|
|
class BaseExtractor(ABC):
    """Abstract base class for content extractors.

    Subclasses must implement ``extract_content`` and ``read_metadata``.
    ``truncate_content`` and ``clean_text`` are concrete helpers that do
    not read any instance state.
    """

    @abstractmethod
    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from file.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content
        """

    @abstractmethod
    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read existing metadata from file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary of metadata fields
        """

    def truncate_content(self, content: str, max_length: int = 3000) -> str:
        """
        Truncate content to maximum length for AI processing.

        Content that already fits is returned unchanged. Longer content is
        cut to its first ``max_length`` characters and an ellipsis marker
        ("...") is appended, so a truncated result is ``max_length + 3``
        characters long.

        Args:
            content: Text content
            max_length: Maximum number of characters kept from ``content``

        Returns:
            Truncated content
        """
        if len(content) <= max_length:
            return content
        return content[:max_length] + "..."

    def clean_text(self, text: str) -> str:
        """
        Clean extracted text (remove excessive whitespace, etc.).

        Runs of whitespace inside a line are collapsed to a single space,
        leading/trailing whitespace is removed from every line, and lines
        that end up empty are dropped. Single line breaks between
        non-empty lines are preserved.

        Args:
            text: Raw text

        Returns:
            Cleaned text
        """
        # Normalise whitespace line by line. Calling ``text.split()`` on the
        # whole string would also swallow the newlines and flatten the text
        # to one line, leaving nothing for the blank-line filter to do.
        normalised = (' '.join(line.split()) for line in text.split('\n'))
        # Blank (or whitespace-only) lines are now empty strings; drop them
        # so consecutive newlines collapse into one.
        return '\n'.join(line for line in normalised if line)
|