Complete Flask → FastAPI migration with: - FastAPI app with session auth, Azure AD SSO, rate limiting - SQLite-backed session store (survives restarts) - Bulk AI metadata generation with SSE progress - Admin panel (user management, audit log, AI usage) - Subpath deployment support (ROOT_PATH config) - Docker + deploy.sh for production deployment - Test suite (auth, upload, templates, imports, admin, sessions) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
64 lines
1.5 KiB
Python
64 lines
1.5 KiB
Python
"""Base class for all content extractors."""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import Dict, Optional
|
|
|
|
class BaseExtractor(ABC):
    """Abstract base class for content extractors.

    Subclasses must implement ``extract_content`` and ``read_metadata``.
    The text helpers ``truncate_content`` and ``clean_text`` are concrete,
    do not read any instance state, and are inherited as-is.
    """

    @abstractmethod
    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from file.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content
        """

    @abstractmethod
    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read existing metadata from file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary of metadata fields
        """

    def truncate_content(self, content: str, max_length: int = 3000) -> str:
        """
        Truncate content to maximum length for AI processing.

        Content that already fits is returned unchanged. Otherwise the first
        ``max_length`` characters are kept and ``"..."`` is appended, so a
        truncated result is ``max_length + 3`` characters long.

        Args:
            content: Text content
            max_length: Maximum number of characters of ``content`` to keep.
                Negative values are treated as 0.

        Returns:
            Truncated content
        """
        # A negative limit would make the slice below count from the end of
        # the string and silently drop only the tail; clamp it instead.
        limit = max(max_length, 0)
        if len(content) <= limit:
            return content
        return content[:limit] + "..."

    def clean_text(self, text: str) -> str:
        """
        Clean extracted text (remove excessive whitespace, etc.).

        Runs of whitespace inside a line are collapsed to a single space,
        leading/trailing whitespace is removed from every line, and lines
        that end up empty are dropped. Remaining lines are joined with a
        single newline.

        Args:
            text: Raw text

        Returns:
            Cleaned text
        """
        # Normalise each line on its own: collapsing whitespace across the
        # whole string would also swallow the newlines and flatten the text
        # into a single line.
        normalised_lines = (' '.join(line.split()) for line in text.splitlines())
        # Drop lines that were blank or whitespace-only.
        return '\n'.join(line for line in normalised_lines if line)
|