- Create FastAPI application with async I/O - Implement Redis session storage (fixes session loss on restart) - Add JWT authentication with refresh tokens - Add Microsoft SSO support via MSAL - Copy all processors from src/ (100% reused, no changes) - Create file upload/download endpoints - Create metadata update endpoints - Create template CRUD endpoints - Add SQLAlchemy async database models - Add Docker Compose configuration with Redis Solves critical issues: - Session management: Redis replaces in-memory dicts - Scalability: Async FastAPI + microservices architecture - File handling: Persistent storage with auto-cleanup Key files: - backend/app/main.py - FastAPI entry point - backend/app/core/redis_client.py - Session store - backend/app/core/auth.py - JWT authentication - backend/app/api/* - All REST endpoints - backend/app/processors/ - Reused from src/ Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
64 lines
1.5 KiB
Python
64 lines
1.5 KiB
Python
"""Base class for all content extractors."""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import Dict, Optional
|
|
|
|
class BaseExtractor(ABC):
    """Abstract base class for content extractors.

    Subclasses must implement :meth:`extract_content` and
    :meth:`read_metadata`. The text helpers :meth:`truncate_content` and
    :meth:`clean_text` are concrete and do not read instance state.
    """

    @abstractmethod
    def extract_content(self, file_path: str) -> str:
        """
        Extract text content from file.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content
        """

    @abstractmethod
    def read_metadata(self, file_path: str) -> Dict[str, str]:
        """
        Read existing metadata from file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary of metadata fields
        """

    def truncate_content(self, content: str, max_length: int = 3000) -> str:
        """
        Truncate content to maximum length for AI processing.

        Content that already fits is returned unchanged. Longer content is
        cut to its first ``max_length`` characters and a ``"..."`` marker is
        appended, so a truncated result is ``max_length + 3`` characters long.

        Args:
            content: Text content
            max_length: Maximum number of characters kept from ``content``

        Returns:
            Truncated content
        """
        if len(content) <= max_length:
            return content
        return content[:max_length] + "..."

    def clean_text(self, text: str) -> str:
        """
        Clean extracted text (remove excessive whitespace, etc.).

        Runs of whitespace inside a line are collapsed to a single space,
        leading/trailing whitespace is stripped from every line, and lines
        that are empty after stripping are dropped. The remaining lines are
        joined with a single newline, so line structure is preserved.

        Args:
            text: Raw text

        Returns:
            Cleaned text (empty string when ``text`` is empty or contains
            only whitespace)
        """
        if not text:
            return ""
        # Collapse whitespace per line. Collapsing across the whole string
        # would also swallow the newlines and flatten the text to one line,
        # leaving nothing for the blank-line removal to act on.
        collapsed_lines = (" ".join(line.split()) for line in text.splitlines())
        # Drop lines that were blank or whitespace-only.
        return "\n".join(line for line in collapsed_lines if line)
|