- Create FastAPI application with async I/O
- Implement Redis session storage (fixes session loss on restart)
- Add JWT authentication with refresh tokens
- Add Microsoft SSO support via MSAL
- Copy all processors from src/ (100% reused, no changes)
- Create file upload/download endpoints
- Create metadata update endpoints
- Create template CRUD endpoints
- Add SQLAlchemy async database models
- Add Docker Compose configuration with Redis

Solves critical issues:
- Session management: Redis replaces in-memory dicts
- Scalability: Async FastAPI + microservices architecture
- File handling: Persistent storage with auto-cleanup

Key files:
- backend/app/main.py - FastAPI entry point
- backend/app/core/redis_client.py - Session store
- backend/app/core/auth.py - JWT authentication
- backend/app/api/* - All REST endpoints
- backend/app/processors/ - Reused from src/

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
424 lines
17 KiB
Python
424 lines
17 KiB
Python
"""AI-powered metadata analysis using OpenAI GPT with production-ready features."""
|
|
|
|
import json
|
|
from openai import OpenAI
|
|
from typing import Dict, Optional
|
|
from .config import Config
|
|
from .file_detector import FileType
|
|
from .utils import get_logger, sanitize_metadata_value
|
|
|
|
# Production-ready imports
|
|
try:
|
|
import tiktoken
|
|
TIKTOKEN_AVAILABLE = True
|
|
except ImportError:
|
|
TIKTOKEN_AVAILABLE = False
|
|
|
|
try:
|
|
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
|
TENACITY_AVAILABLE = True
|
|
except ImportError:
|
|
TENACITY_AVAILABLE = False
|
|
|
|
# Module-level logger, obtained from the project's get_logger helper and
# shared by every method of MetadataAnalyzer below.
logger = get_logger(__name__)
|
|
|
|
class MetadataAnalyzer:
    """Analyze content and generate metadata using OpenAI GPT with production-ready error handling.

    The analyzer truncates extracted text to a token budget, asks the chat
    completions endpoint for a JSON object with ``title``, ``subject`` and
    ``keywords``, parses and sanitizes the reply, and falls back to
    filename-derived metadata whenever the API call or the parsing fails.
    """

    # Valid OpenAI models (as of January 2026)
    VALID_MODELS = [
        # GPT-5 models (2026 release)
        'gpt-5', 'gpt-5-mini', 'gpt-5-nano',
        'gpt-5-mini-2025-08-07', 'gpt-5-nano-2025-08-07',
        # GPT-4 models
        'gpt-4o', 'gpt-4o-mini', 'gpt-4o-mini-2024-07-18',
        'gpt-4-turbo', 'gpt-4', 'gpt-3.5-turbo',
        # Reasoning models
        'o1', 'o1-mini', 'o1-preview'
    ]

    # Model substituted when the configured name is not recognised.
    _FALLBACK_MODEL = 'gpt-4o-mini'

    # Fields every metadata dictionary produced by the parsers must contain.
    _REQUIRED_FIELDS = ('title', 'subject', 'keywords')

    def __init__(self):
        """Initialize the analyzer with OpenAI client.

        Reads the API key, model name, token limit and temperature from
        ``Config`` and prepares a tiktoken encoding when tiktoken is installed.

        Raises:
            ValueError: If ``Config.OPENAI_API_KEY`` is empty.
        """
        if not Config.OPENAI_API_KEY:
            raise ValueError("OpenAI API key not configured")

        self.client = OpenAI(api_key=Config.OPENAI_API_KEY)
        self.model = Config.AI_MODEL

        # Validate model name; an unknown (or missing) name is replaced, not fatal
        if not self._is_valid_model(self.model):
            logger.warning(f"⚠️ Model '{self.model}' may not be valid. Valid models: {', '.join(self.VALID_MODELS)}")
            logger.warning(f"⚠️ Using fallback model: {self._FALLBACK_MODEL}")
            self.model = self._FALLBACK_MODEL

        self.max_tokens = Config.MAX_TOKENS
        self.temperature = Config.TEMPERATURE

        logger.info(f"Initialized MetadataAnalyzer with model: {self.model}")

        # Initialize tiktoken encoding for proper token counting
        if TIKTOKEN_AVAILABLE:
            try:
                self.encoding = tiktoken.encoding_for_model(self.model)
            except KeyError:
                # Fallback for models not in tiktoken registry
                self.encoding = tiktoken.get_encoding("cl100k_base")
        else:
            self.encoding = None
            logger.warning("tiktoken not available - using character-based truncation")

    def _count_tokens(self, text: str) -> int:
        """Count tokens in ``text``.

        Uses the tiktoken encoding when one was initialised; otherwise returns
        a rough estimate of one token per four characters.
        """
        if self.encoding:
            return len(self.encoding.encode(text))
        # Fallback: rough estimate (1 token ≈ 4 characters)
        return len(text) // 4

    def _truncate_content(self, content: str, max_tokens: int = 3000) -> str:
        """Truncate ``content`` so it fits within ``max_tokens``.

        Args:
            content: Text to shorten.
            max_tokens: Token budget. A non-positive budget yields ``""``.

        Returns:
            ``content`` unchanged when it already fits, otherwise its leading
            part cut at the token boundary (or at ``max_tokens * 4`` characters
            when no tiktoken encoding is available).
        """
        # A negative budget would otherwise turn into a "drop the tail" slice
        # (content[:-n]) and return almost the whole text.
        if max_tokens <= 0:
            return ""

        if not self.encoding:
            # Character-based fallback; slicing is a no-op for short content
            return content[:max_tokens * 4]

        tokens = self.encoding.encode(content)
        if len(tokens) <= max_tokens:
            return content

        # Truncate and decode back
        return self.encoding.decode(tokens[:max_tokens])

    def _is_valid_model(self, model: str) -> bool:
        """Check if model name is valid.

        A name is accepted when it equals, or starts with, one of
        ``VALID_MODELS`` (the prefix rule admits dated versions). ``None``,
        empty and non-string values are rejected instead of raising.
        """
        if not isinstance(model, str) or not model:
            return False
        # startswith() also covers the exact-match case
        return model.startswith(tuple(self.VALID_MODELS))

    def _is_new_model(self) -> bool:
        """Check if ``self.model`` belongs to the "new generation" prefix list.

        Models matching these prefixes are sent ``max_completion_tokens`` and
        no ``temperature`` by ``_get_api_params``.

        NOTE(review): whether every model in this list really rejects a custom
        temperature should be confirmed against the OpenAI API reference.
        """
        return self.model.startswith(('gpt-5', 'gpt-4o', 'gpt-4-turbo', 'o1'))

    def _get_api_params(self) -> dict:
        """Get the token/temperature keyword arguments for the current model.

        Returns:
            ``{'max_completion_tokens': ...}`` for models matched by
            ``_is_new_model``; otherwise ``{'max_tokens': ..., 'temperature': ...}``.
        """
        if self._is_new_model():
            # No temperature is sent for these models (default value is used)
            logger.debug(f"Using max_completion_tokens for {self.model}")
            return {'max_completion_tokens': self.max_tokens}

        logger.debug(f"Using max_tokens + temperature for {self.model}")
        return {'max_tokens': self.max_tokens, 'temperature': self.temperature}

    def _call_openai_api(self, messages: list):
        """Call OpenAI API with automatic retry on failures.

        Uses tenacity for exponential backoff if available, otherwise a simple
        retry loop with a doubling delay.

        Args:
            messages: Chat messages passed to ``chat.completions.create``.

        Returns:
            The response object returned by ``self.client.chat.completions.create``.

        Raises:
            Exception: The last error raised by the client once all attempts
                are exhausted.
        """
        api_params = self._get_api_params()

        # Always make at least one attempt: with a retry count of 0 the manual
        # loop below would never run and ``raise last_error`` would raise None.
        max_attempts = max(1, int(Config.API_MAX_RETRIES or 1))

        def _api_call():
            return self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                timeout=Config.API_TIMEOUT,
                **api_params
            )

        if TENACITY_AVAILABLE:
            # Wrap the call dynamically so the policy can read Config at call time
            retrying_call = retry(
                stop=stop_after_attempt(max_attempts),
                wait=wait_exponential(multiplier=Config.API_RETRY_DELAY, min=2, max=10),
                retry=retry_if_exception_type((Exception,)),
                reraise=True
            )(_api_call)
            return retrying_call()

        # Fallback: simple retry with a doubling delay
        import time
        last_error = None

        for attempt in range(max_attempts):
            try:
                return _api_call()
            except Exception as e:
                last_error = e
                if attempt < max_attempts - 1:
                    wait_time = Config.API_RETRY_DELAY * (2 ** attempt)
                    logger.warning(f"API call failed (attempt {attempt + 1}/{max_attempts}), retrying in {wait_time}s: {e}")
                    time.sleep(wait_time)

        raise last_error

    def analyze_content(self, content: str, filename: str, file_type: "FileType") -> Dict[str, str]:
        """Analyze content and generate appropriate metadata with production-ready error handling.

        Args:
            content: Extracted text content
            filename: Original filename
            file_type: Type of file

        Returns:
            Dictionary with metadata (title, subject, keywords, _tokens_used,
            _confidence). When the model returns nothing, or any error occurs,
            filename-based fallback metadata is returned instead; it always
            carries ``_tokens_used`` and, on errors, ``_ai_error``.
        """
        try:
            # Truncate content if needed with proper token counting
            content_tokens = self._count_tokens(content)
            if content_tokens > Config.MAX_TEXT_LENGTH:
                content = self._truncate_content(content, Config.MAX_TEXT_LENGTH)
                logger.info(f"Truncated content from {content_tokens} to {self._count_tokens(content)} tokens")

            # Generate prompt based on file type
            prompt = self._create_prompt(content, filename, file_type)

            # Count total tokens before API call
            prompt_tokens = self._count_tokens(prompt)
            logger.info(f"API call for {filename}: {prompt_tokens} prompt tokens")

            # Call API with retry logic
            response = self._call_openai_api([
                {"role": "system", "content": "You are a metadata expert who generates professional, accurate metadata for documents in English."},
                {"role": "user", "content": prompt}
            ])

            # Read usage defensively so a missing usage block cannot discard
            # an otherwise valid answer.
            usage = getattr(response, 'usage', None)
            total_tokens = getattr(usage, 'total_tokens', 0) or 0

            # Parse response with detailed logging
            logger.info(f"API Response for {filename}:")
            logger.info(f" - Model used: {response.model}")
            logger.info(f" - Finish reason: {response.choices[0].finish_reason}")
            logger.info(f" - Tokens: prompt={getattr(usage, 'prompt_tokens', None)}, completion={getattr(usage, 'completion_tokens', None)}, total={getattr(usage, 'total_tokens', None)}")

            metadata_text = response.choices[0].message.content
            logger.info(f" - Content length: {len(metadata_text) if metadata_text else 0} chars")
            logger.info(f" - Content preview: {metadata_text[:200] if metadata_text else '(empty)'}")

            # Check if content is None or empty
            if not metadata_text or not metadata_text.strip():
                logger.error(f"❌ API returned empty content for {filename}!")
                logger.error(f" This usually means:")
                logger.error(f" 1. Invalid model name: {self.model}")
                logger.error(f" 2. Model doesn't support this request type")
                logger.error(f" 3. Content was filtered/refused")
                logger.error(f" Using fallback metadata instead.")
                fallback = self._generate_fallback_metadata(filename, file_type)
                # Keep the result shape consistent with the other return paths
                fallback['_tokens_used'] = total_tokens
                return fallback

            metadata = self._parse_metadata_response(metadata_text)

            # Sanitize metadata values
            metadata = {
                key: sanitize_metadata_value(value)
                for key, value in metadata.items()
            }

            # Add metadata about the generation
            metadata['_tokens_used'] = total_tokens
            metadata['_confidence'] = 0.9  # Could calculate based on response

            logger.info(f"Generated metadata for {filename} (tokens used: {metadata['_tokens_used']})")
            return metadata

        except Exception as e:
            logger.error(f"Error analyzing content for {filename}: {e}")
            # Return fallback metadata with error info
            fallback = self._generate_fallback_metadata(filename, file_type)
            fallback['_ai_error'] = str(e)
            fallback['_tokens_used'] = 0
            return fallback

    def _create_prompt(self, content: str, filename: str, file_type: "FileType") -> str:
        """Create AI prompt based on file type.

        The file type only selects the human-readable description used in the
        first sentence; unknown types are described as "file".
        """
        file_type_descriptions = {
            FileType.PDF: "PDF document",
            FileType.IMAGE: "image file",
            FileType.OFFICE_DOC: "Word document",
            FileType.OFFICE_SHEET: "Excel spreadsheet",
            FileType.OFFICE_PRESENTATION: "PowerPoint presentation",
            FileType.VIDEO: "video file"
        }

        file_desc = file_type_descriptions.get(file_type, "file")

        prompt = f"""Analyze the following {file_desc} content and generate professional metadata in English.

Filename: {filename}
Content: {content}

Generate metadata with these fields:
1. Title: A concise, professional title (50-100 characters) that clearly describes the document/content
2. Subject: A brief description (1-2 sentences) of the document's purpose and content
3. Keywords: 5-10 relevant keywords separated by commas (include product names, categories, topics)

Rules:
- All text MUST be in English
- Title should identify the main product/service and document type (e.g., "guide", "brochure", "manual")
- Subject should explain what the document is about and its purpose
- Keywords should be searchable terms relevant to the content
- Be professional and concise
- Return ONLY a JSON object with fields: title, subject, keywords

Example output format:
{{
"title": "3M Filtek Universal Restorative - Shade Selection Guide",
"subject": "Shade selection guide for 3M Filtek Universal Restorative dental material",
"keywords": "Filtek, Universal Restorative, shade selection, dental, restorative material, 3M, dentistry, composite"
}}

Return only the JSON object, no additional text."""

        return prompt

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Return the body of a leading markdown code fence, or ``text`` itself.

        Only text that starts with a fence is touched. If nothing would remain
        (for example a fence written on a single line) the text is returned
        unchanged so a later brace search can still find the JSON.
        """
        if not text.startswith('```'):
            return text

        body = text.split('\n')[1:]  # drop the opening fence line
        for idx, line in enumerate(body):
            if line.strip().startswith('```'):
                body = body[:idx]
                break

        inner = '\n'.join(body)
        return inner if inner.strip() else text

    @staticmethod
    def _load_json_object(text: str) -> dict:
        """Parse the outermost ``{ ... }`` span of ``text`` as a JSON object.

        Raises:
            ValueError: If the text is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError``) or the parsed value is not an object.
        """
        start = text.find('{')
        end = text.rfind('}')
        candidate = text[start:end + 1] if start != -1 and end > start else text

        parsed = json.loads(candidate)
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
        return parsed

    @staticmethod
    def _coerce_to_text(value) -> str:
        """Convert a parsed JSON value to the plain string the metadata dict holds.

        ``None`` becomes ``""``; lists/tuples (e.g. keywords returned as a JSON
        array) are joined with ``", "``; anything else goes through ``str``.
        """
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple)):
            parts = [str(item).strip() for item in value if item is not None]
            return ", ".join(part for part in parts if part)
        return str(value)

    def _parse_metadata_response(self, response_text: str) -> Dict[str, str]:
        """Parse AI response into metadata dictionary.

        JSON is tried first (with markdown fences and surrounding chatter
        removed). Replies that are not a JSON object, or whose title is missing
        or shorter than three characters, are handed to ``_parse_metadata_text``.

        Returns:
            Dictionary of string values containing at least ``title``,
            ``subject`` and ``keywords``.
        """
        response_text = response_text.strip()
        logger.info(f"Parsing response (length={len(response_text)}): {response_text[:200]}")

        # Remove markdown code blocks if present
        cleaned = self._strip_code_fence(response_text)

        try:
            parsed = self._load_json_object(cleaned)
        except ValueError as e:
            logger.warning(f"Failed to parse JSON response ({str(e)}), using text parsing")
            return self._parse_metadata_text(cleaned)

        # Models sometimes return arrays/numbers; normalise everything to text
        metadata = {key: self._coerce_to_text(value) for key, value in parsed.items()}

        # Ensure all required fields are present
        for field in self._REQUIRED_FIELDS:
            metadata.setdefault(field, "")

        # Validate that we got actual content
        if len(metadata['title'].strip()) < 3:
            logger.warning("JSON parsed but title is empty or too short, using text parsing")
            recovered = self._parse_metadata_text(cleaned)
            # Values that did parse cleanly from JSON beat heuristic extraction
            for field in ('subject', 'keywords'):
                if metadata[field].strip():
                    recovered[field] = metadata[field]
            return recovered

        return metadata

    def _parse_metadata_text(self, text: str) -> Dict[str, str]:
        """Parse metadata from plain text response.

        Scans ``key: value`` lines for keys containing "title", "subject" or
        "keyword" (first match wins for each). If no title is found, the first
        line longer than ten characters that does not start with one of those
        words is used, capped at 200 characters.
        """
        metadata = {
            'title': '',
            'subject': '',
            'keywords': ''
        }

        lines = text.split('\n')

        for line in lines:
            line = line.strip()
            if not line or line.startswith(('#', '//')):
                continue

            # Remove surrounding quotes, then split on the first colon
            key, separator, value = line.strip('"\'').partition(':')
            if not separator:
                continue

            key = key.strip().lower()
            value = value.strip().strip('",\'')

            if 'title' in key and not metadata['title']:
                metadata['title'] = value
            elif 'subject' in key and not metadata['subject']:
                metadata['subject'] = value
            elif 'keyword' in key and not metadata['keywords']:
                metadata['keywords'] = value

        # If still empty, try to extract from unstructured text
        if not metadata['title']:
            # Look for first substantial line as title
            for line in lines:
                line = line.strip().strip('"\'')
                if len(line) > 10 and not line.lower().startswith(('title', 'subject', 'keyword')):
                    metadata['title'] = line[:200]  # Limit length
                    break

        logger.info(f"Text parsing result: title='{metadata['title'][:50]}...', subject='{metadata['subject'][:50]}...'")
        return metadata

    def _generate_fallback_metadata(self, filename: str, file_type: "FileType") -> Dict[str, str]:
        """Generate basic metadata based on filename when AI fails.

        The file stem has ``_`` and ``-`` turned into spaces and repeated
        whitespace collapsed; the words become the keywords. This method is the
        last resort of ``analyze_content`` and therefore avoids raising on an
        unrecognised ``file_type``.
        """
        # Remove extension and clean filename
        from pathlib import Path
        words = Path(filename).stem.replace('_', ' ').replace('-', ' ').split()
        clean_name = ' '.join(words)

        try:
            type_label = FileType(file_type).value
        except (ValueError, TypeError):
            # presumably FileType is an Enum; an unknown value is shown as-is
            type_label = str(file_type)

        return {
            'title': clean_name,
            'subject': f"{clean_name} - {type_label}",
            'keywords': ', '.join(words)
        }

    def generate_metadata_for_pdf(self, text: str) -> Dict[str, str]:
        """Specialized metadata generation for PDF documents."""
        # Wrapper for PDF-specific logic if needed
        return self.analyze_content(text, "document.pdf", FileType.PDF)

    def generate_metadata_for_image(self, text: str) -> Dict[str, str]:
        """Specialized metadata generation for images."""
        return self.analyze_content(text, "image.jpg", FileType.IMAGE)

    def generate_metadata_for_office(self, text: str) -> Dict[str, str]:
        """Specialized metadata generation for Office documents."""
        return self.analyze_content(text, "document.docx", FileType.OFFICE_DOC)

    def generate_metadata_for_video(self, metadata: Dict[str, str]) -> Dict[str, str]:
        """Specialized metadata generation for videos.

        Only the ``title`` entry of the existing metadata is forwarded to the
        model ("N/A" when absent).
        """
        # For videos, we might use existing metadata as input
        text = f"Video title: {metadata.get('title', 'N/A')}"
        return self.analyze_content(text, "video.mp4", FileType.VIDEO)
|