solventum-image-metadata/backend/app/processors/metadata_analyzer.py
SamoilenkoVadym 563d476a94 feat(backend): migrate from Flask to FastAPI with Redis sessions
- Create FastAPI application with async I/O
- Implement Redis session storage (fixes session loss on restart)
- Add JWT authentication with refresh tokens
- Add Microsoft SSO support via MSAL
- Copy all processors from src/ (100% reused, no changes)
- Create file upload/download endpoints
- Create metadata update endpoints
- Create template CRUD endpoints
- Add SQLAlchemy async database models
- Add Docker Compose configuration with Redis

Solves critical issues:
- Session management: Redis replaces in-memory dicts
- Scalability: Async FastAPI + microservices architecture
- File handling: Persistent storage with auto-cleanup

Key files:
- backend/app/main.py - FastAPI entry point
- backend/app/core/redis_client.py - Session store
- backend/app/core/auth.py - JWT authentication
- backend/app/api/* - All REST endpoints
- backend/app/processors/ - Reused from src/

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
2026-02-09 13:14:37 +00:00

424 lines
17 KiB
Python

"""AI-powered metadata analysis using OpenAI GPT with production-ready features."""
import json
import time
from pathlib import Path
from typing import Dict, Optional

from openai import APIConnectionError, InternalServerError, OpenAI, RateLimitError

from .config import Config
from .file_detector import FileType
from .utils import get_logger, sanitize_metadata_value
# Production-ready imports
# Optional dependency: tiktoken provides real token counts. When it is not
# installed the analyzer falls back to a character-based estimate.
try:
    import tiktoken
    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False

# Optional dependency: tenacity provides exponential-backoff retries. When it
# is not installed the analyzer uses its own simple retry loop.
try:
    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
    TENACITY_AVAILABLE = True
except ImportError:
    TENACITY_AVAILABLE = False
# Module-level logger used by every MetadataAnalyzer method in this file.
logger = get_logger(__name__)
class MetadataAnalyzer:
    """Analyze content and generate metadata using OpenAI GPT with production-ready error handling.

    The public entry point is :meth:`analyze_content`; the
    ``generate_metadata_for_*`` methods are thin wrappers around it.
    ``analyze_content`` never raises: any failure is converted into
    filename-based fallback metadata carrying an ``_ai_error`` key.
    """

    # Valid OpenAI models (as of January 2026)
    VALID_MODELS = [
        # GPT-5 models (2026 release)
        'gpt-5', 'gpt-5-mini', 'gpt-5-nano',
        'gpt-5-mini-2025-08-07', 'gpt-5-nano-2025-08-07',
        # GPT-4 models
        'gpt-4o', 'gpt-4o-mini', 'gpt-4o-mini-2024-07-18',
        'gpt-4-turbo', 'gpt-4', 'gpt-3.5-turbo',
        # Reasoning models
        'o1', 'o1-mini', 'o1-preview'
    ]

    # Model used when the configured model name is not recognised.
    FALLBACK_MODEL = 'gpt-4o-mini'

    # Model-name prefixes that are sent ``max_completion_tokens`` instead of
    # the older ``max_tokens`` parameter.
    _COMPLETION_TOKEN_PREFIXES = ('gpt-5', 'gpt-4o', 'gpt-4-turbo', 'o1')

    # Model-name prefixes of reasoning families that only accept the default
    # temperature; GPT-4o / GPT-4-turbo are NOT in this group.
    _FIXED_TEMPERATURE_PREFIXES = ('gpt-5', 'o1')

    def __init__(self):
        """Initialize the analyzer with OpenAI client.

        Raises:
            ValueError: if ``Config.OPENAI_API_KEY`` is not set.
        """
        if not Config.OPENAI_API_KEY:
            raise ValueError("OpenAI API key not configured")

        self.client = OpenAI(api_key=Config.OPENAI_API_KEY)
        self.model = Config.AI_MODEL

        # Validate model name; an unknown name is replaced rather than rejected.
        if not self._is_valid_model(self.model):
            logger.warning(f"⚠️ Model '{self.model}' may not be valid. Valid models: {', '.join(self.VALID_MODELS)}")
            logger.warning(f"⚠️ Using fallback model: {self.FALLBACK_MODEL}")
            self.model = self.FALLBACK_MODEL

        self.max_tokens = Config.MAX_TOKENS
        self.temperature = Config.TEMPERATURE
        logger.info(f"Initialized MetadataAnalyzer with model: {self.model}")

        # Initialize tiktoken encoding for proper token counting
        if TIKTOKEN_AVAILABLE:
            try:
                self.encoding = tiktoken.encoding_for_model(self.model)
            except KeyError:
                # Fallback for models not in tiktoken registry
                self.encoding = tiktoken.get_encoding("cl100k_base")
        else:
            self.encoding = None
            logger.warning("tiktoken not available - using character-based truncation")

    def _count_tokens(self, text: str) -> int:
        """Count tokens in *text* with tiktoken, or estimate them without it."""
        if self.encoding:
            # disallowed_special=() makes tiktoken encode strings such as
            # "<|endoftext|>" as ordinary text instead of raising ValueError;
            # document content is untrusted and may contain them.
            return len(self.encoding.encode(text, disallowed_special=()))
        # Fallback: rough estimate (1 token ≈ 4 characters)
        return len(text) // 4

    def _truncate_content(self, content: str, max_tokens: int = 3000) -> str:
        """Return *content* cut down to at most *max_tokens* tokens.

        Without tiktoken the limit is approximated as ``max_tokens * 4``
        characters.
        """
        if not self.encoding:
            # Character-based fallback
            max_chars = max_tokens * 4
            return content if len(content) <= max_chars else content[:max_chars]

        # See _count_tokens for why special tokens are treated as plain text.
        tokens = self.encoding.encode(content, disallowed_special=())
        if len(tokens) <= max_tokens:
            return content
        # Truncate and decode back
        return self.encoding.decode(tokens[:max_tokens])

    def _is_valid_model(self, model: str) -> bool:
        """Return True if *model* equals or starts with a name in VALID_MODELS.

        The prefix match lets dated versions (e.g. ``gpt-4o-2024-08-06``)
        through. A missing/empty model name is reported as invalid.
        """
        if not model:
            return False
        # An exact match is also a prefix match, so one check covers both.
        return any(model.startswith(known) for known in self.VALID_MODELS)

    def _is_new_model(self) -> bool:
        """Return True if the model is sent ``max_completion_tokens``.

        This covers the GPT-5, GPT-4o, GPT-4-turbo and o1 families. Whether
        the model accepts a custom temperature is a separate question, see
        :meth:`_supports_temperature`.
        """
        return self.model.startswith(self._COMPLETION_TOKEN_PREFIXES)

    def _supports_temperature(self) -> bool:
        """Return True unless the model belongs to a fixed-temperature family."""
        return not self.model.startswith(self._FIXED_TEMPERATURE_PREFIXES)

    def _get_api_params(self) -> dict:
        """Build the model-dependent keyword arguments for the chat completion call.

        Returns:
            A dict holding either ``max_completion_tokens`` or ``max_tokens``,
            plus ``temperature`` for every model that accepts a custom value.
        """
        params = {}
        token_param = 'max_completion_tokens' if self._is_new_model() else 'max_tokens'
        params[token_param] = self.max_tokens
        # Only reasoning families (GPT-5, o1) reject a custom temperature;
        # GPT-4o and GPT-4-turbo accept it, so the configured value is honoured.
        if self._supports_temperature():
            params['temperature'] = self.temperature
        logger.debug(f"Using {' + '.join(params)} for {self.model}")
        return params

    def _call_openai_api(self, messages: list):
        """Call the chat completions endpoint, retrying transient failures.

        Uses tenacity for exponential backoff if available, otherwise a
        simple loop with doubling delays. Only rate-limit, connection/timeout
        and server (5xx) errors are retried; other errors (bad API key,
        unknown model, malformed request) cannot succeed on retry and are
        raised immediately.

        Args:
            messages: Chat messages in the OpenAI ``[{"role", "content"}]`` form.

        Returns:
            The response object returned by ``client.chat.completions.create``.

        Raises:
            Exception: whatever the OpenAI client raised on the final attempt.
        """
        api_params = self._get_api_params()
        # APITimeoutError is a subclass of APIConnectionError, so timeouts are covered.
        transient_errors = (RateLimitError, APIConnectionError, InternalServerError)
        # Always make at least one attempt, even if retries are configured as 0.
        max_attempts = max(1, Config.API_MAX_RETRIES)

        def _api_call():
            return self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                timeout=Config.API_TIMEOUT,
                **api_params
            )

        if TENACITY_AVAILABLE:
            with_retry = retry(
                stop=stop_after_attempt(max_attempts),
                wait=wait_exponential(multiplier=Config.API_RETRY_DELAY, min=2, max=10),
                retry=retry_if_exception_type(transient_errors),
                reraise=True
            )
            return with_retry(_api_call)()

        # Fallback: simple retry loop with doubling delay
        for attempt in range(1, max_attempts + 1):
            try:
                return _api_call()
            except transient_errors as e:
                if attempt == max_attempts:
                    raise
                wait_time = Config.API_RETRY_DELAY * (2 ** (attempt - 1))
                logger.warning(f"API call failed (attempt {attempt}/{max_attempts}), retrying in {wait_time}s: {e}")
                time.sleep(wait_time)

    def analyze_content(self, content: str, filename: str, file_type: "FileType") -> Dict[str, str]:
        """
        Analyze content and generate appropriate metadata with production-ready error handling.

        Args:
            content: Extracted text content
            filename: Original filename
            file_type: Type of file

        Returns:
            Dictionary with metadata (title, subject, keywords, _tokens_used,
            _confidence). When the AI call fails or returns nothing, the
            dictionary holds filename-based fallback values plus ``_ai_error``
            and ``_tokens_used`` instead of ``_confidence``.
        """
        try:
            # Treat missing content as empty so token counting cannot fail on None.
            content = content or ""

            # Truncate content if needed with proper token counting
            content_tokens = self._count_tokens(content)
            if content_tokens > Config.MAX_TEXT_LENGTH:
                content = self._truncate_content(content, Config.MAX_TEXT_LENGTH)
                logger.info(f"Truncated content from {content_tokens} to {self._count_tokens(content)} tokens")

            # Generate prompt based on file type
            prompt = self._create_prompt(content, filename, file_type)

            # Count total tokens before API call
            prompt_tokens = self._count_tokens(prompt)
            logger.info(f"API call for {filename}: {prompt_tokens} prompt tokens")

            # Call API with retry logic
            response = self._call_openai_api([
                {"role": "system", "content": "You are a metadata expert who generates professional, accurate metadata for documents in English."},
                {"role": "user", "content": prompt}
            ])

            usage = response.usage
            tokens_used = usage.total_tokens if usage else 0

            # Parse response with detailed logging
            logger.info(f"API Response for {filename}:")
            logger.info(f" - Model used: {response.model}")
            logger.info(f" - Finish reason: {response.choices[0].finish_reason}")
            if usage:
                logger.info(f" - Tokens: prompt={usage.prompt_tokens}, completion={usage.completion_tokens}, total={usage.total_tokens}")

            metadata_text = response.choices[0].message.content
            logger.info(f" - Content length: {len(metadata_text) if metadata_text else 0} chars")
            logger.info(f" - Content preview: {metadata_text[:200] if metadata_text else '(empty)'}")

            # Check if content is None or empty
            if not metadata_text or not metadata_text.strip():
                logger.error(f"❌ API returned empty content for {filename}!")
                logger.error(" This usually means:")
                logger.error(f" 1. Invalid model name: {self.model}")
                logger.error(" 2. Model doesn't support this request type")
                logger.error(" 3. Content was filtered/refused")
                logger.error(" Using fallback metadata instead.")
                # Same shape as the exception path below, so callers can detect
                # the failure and still account for the tokens that were billed.
                fallback = self._generate_fallback_metadata(filename, file_type)
                fallback['_ai_error'] = "API returned empty content"
                fallback['_tokens_used'] = tokens_used
                return fallback

            metadata = self._parse_metadata_response(metadata_text)

            # Sanitize metadata values
            metadata = {
                key: sanitize_metadata_value(value)
                for key, value in metadata.items()
            }

            # Add metadata about the generation
            metadata['_tokens_used'] = tokens_used
            metadata['_confidence'] = 0.9  # Could calculate based on response

            logger.info(f"Generated metadata for {filename} (tokens used: {metadata['_tokens_used']})")
            return metadata

        except Exception as e:
            logger.error(f"Error analyzing content for {filename}: {e}")
            # Return fallback metadata with error info
            fallback = self._generate_fallback_metadata(filename, file_type)
            fallback['_ai_error'] = str(e)
            fallback['_tokens_used'] = 0
            return fallback

    def _create_prompt(self, content: str, filename: str, file_type: "FileType") -> str:
        """Build the user prompt asking for title/subject/keywords as JSON.

        The file type only changes the short description used in the first
        sentence; unknown types are described as "file".
        """
        file_type_descriptions = {
            FileType.PDF: "PDF document",
            FileType.IMAGE: "image file",
            FileType.OFFICE_DOC: "Word document",
            FileType.OFFICE_SHEET: "Excel spreadsheet",
            FileType.OFFICE_PRESENTATION: "PowerPoint presentation",
            FileType.VIDEO: "video file"
        }
        file_desc = file_type_descriptions.get(file_type, "file")

        prompt = f"""Analyze the following {file_desc} content and generate professional metadata in English.
Filename: {filename}
Content: {content}
Generate metadata with these fields:
1. Title: A concise, professional title (50-100 characters) that clearly describes the document/content
2. Subject: A brief description (1-2 sentences) of the document's purpose and content
3. Keywords: 5-10 relevant keywords separated by commas (include product names, categories, topics)
Rules:
- All text MUST be in English
- Title should identify the main product/service and document type (e.g., "guide", "brochure", "manual")
- Subject should explain what the document is about and its purpose
- Keywords should be searchable terms relevant to the content
- Be professional and concise
- Return ONLY a JSON object with fields: title, subject, keywords
Example output format:
{{
"title": "3M Filtek Universal Restorative - Shade Selection Guide",
"subject": "Shade selection guide for 3M Filtek Universal Restorative dental material",
"keywords": "Filtek, Universal Restorative, shade selection, dental, restorative material, 3M, dentistry, composite"
}}
Return only the JSON object, no additional text."""
        return prompt

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a leading markdown code fence (and its closing fence) from *text*."""
        if not text.startswith('```'):
            return text
        lines = text.split('\n')
        if len(lines) == 1:
            # Inline fence such as ```json {...}``` - keep the payload instead
            # of discarding the only line along with the fence marker.
            return text.strip('`').strip()
        body = lines[1:]
        for idx, line in enumerate(body):
            if line.lstrip().startswith('```'):
                body = body[:idx]
                break
        return '\n'.join(body)

    @staticmethod
    def _coerce_to_text(value) -> str:
        """Convert a decoded JSON value to the string form metadata fields use.

        ``None`` becomes an empty string and lists become a comma-separated
        string (models sometimes return keywords as a JSON array).
        """
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple)):
            parts = (str(item).strip() for item in value if item is not None)
            return ", ".join(part for part in parts if part)
        return str(value)

    def _parse_metadata_response(self, response_text: str) -> Dict[str, str]:
        """Parse the AI response into a metadata dictionary.

        JSON is tried first (after removing a markdown fence and any text
        around the outermost ``{...}``). If the text is not valid JSON, is not
        a JSON object, or yields a title shorter than 3 characters, the
        response is handed to :meth:`_parse_metadata_text` instead.

        Returns:
            Dictionary that always contains ``title``, ``subject`` and
            ``keywords`` with string values.
        """
        response_text = response_text.strip()
        logger.info(f"Parsing response (length={len(response_text)}): {response_text[:200]}")

        # Remove markdown code blocks if present
        response_text = self._strip_code_fence(response_text)

        # Try to find JSON object in text: look for { ... } pattern
        start = response_text.find('{')
        end = response_text.rfind('}')
        candidate = response_text[start:end + 1] if start != -1 and end > start else response_text

        try:
            parsed = json.loads(candidate)
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning(f"Failed to parse JSON response ({str(e)}), using text parsing")
            return self._parse_metadata_text(response_text)

        if not isinstance(parsed, dict):
            # A bare JSON string/number/array cannot hold the named fields.
            logger.warning("JSON parsed but is not an object, using text parsing")
            return self._parse_metadata_text(response_text)

        metadata = {key: self._coerce_to_text(value) for key, value in parsed.items()}

        # Ensure all required fields are present
        for field in ('title', 'subject', 'keywords'):
            metadata.setdefault(field, "")

        # Validate that we got actual content
        if len(metadata['title'].strip()) < 3:
            logger.warning("JSON parsed but title is empty or too short, using text parsing")
            return self._parse_metadata_text(response_text)

        return metadata

    def _parse_metadata_text(self, text: str) -> Dict[str, str]:
        """Parse metadata from a plain-text ("Key: value" lines) response.

        The first line whose key contains "title", "subject" or "keyword"
        fills the corresponding field. If no title is found that way, the
        first line longer than 10 characters that does not start with a field
        name is used as the title (capped at 200 characters).
        """
        metadata = {
            'title': '',
            'subject': '',
            'keywords': ''
        }
        # (substring searched for in the key, metadata field it fills)
        field_markers = (('title', 'title'), ('subject', 'subject'), ('keyword', 'keywords'))

        lines = text.split('\n')
        for raw_line in lines:
            line = raw_line.strip()
            if not line or line.startswith(('#', '//')):
                continue
            # Remove surrounding quotes left over from JSON-like lines
            line_clean = line.strip('"\'')
            if ':' not in line_clean:
                continue
            key, _, value = line_clean.partition(':')
            key = key.strip().lower()
            value = value.strip().strip('",\'')
            # First marker that matches a still-empty field wins.
            for marker, field in field_markers:
                if marker in key and not metadata[field]:
                    metadata[field] = value
                    break

        # If still empty, try to extract from unstructured text
        if not metadata['title']:
            # Look for first substantial line as title
            for raw_line in lines:
                line = raw_line.strip().strip('"\'')
                if len(line) > 10 and not line.lower().startswith(('title', 'subject', 'keyword')):
                    metadata['title'] = line[:200]  # Limit length
                    break

        logger.info(f"Text parsing result: title='{metadata['title'][:50]}...', subject='{metadata['subject'][:50]}...'")
        return metadata

    @staticmethod
    def _clean_stem(filename: str) -> str:
        """Return the filename without extension, separators turned into single spaces."""
        stem = Path(filename).stem
        # split()/join collapses runs of separators ("a_-_b") into one space.
        return ' '.join(stem.replace('_', ' ').replace('-', ' ').split())

    def _generate_fallback_metadata(self, filename: str, file_type: "FileType") -> Dict[str, str]:
        """Generate basic metadata from the filename when the AI is unavailable.

        Returns:
            Dictionary with ``title``, ``subject`` and ``keywords`` derived
            from the cleaned filename stem.
        """
        clean_name = self._clean_stem(filename)
        try:
            type_label = FileType(file_type).value
        except (ValueError, TypeError):
            # This method runs inside analyze_content's error handler, so an
            # unexpected file_type must not raise from here.
            type_label = "file"
        return {
            'title': clean_name,
            'subject': f"{clean_name} - {type_label}",
            'keywords': ', '.join(clean_name.split())
        }

    def generate_metadata_for_pdf(self, text: str) -> Dict[str, str]:
        """Generate metadata for PDF text using the placeholder name "document.pdf"."""
        return self.analyze_content(text, "document.pdf", FileType.PDF)

    def generate_metadata_for_image(self, text: str) -> Dict[str, str]:
        """Generate metadata for image text using the placeholder name "image.jpg"."""
        return self.analyze_content(text, "image.jpg", FileType.IMAGE)

    def generate_metadata_for_office(self, text: str) -> Dict[str, str]:
        """Generate metadata for Office text using the placeholder name "document.docx"."""
        return self.analyze_content(text, "document.docx", FileType.OFFICE_DOC)

    def generate_metadata_for_video(self, metadata: Dict[str, str]) -> Dict[str, str]:
        """Generate metadata for a video from its existing metadata.

        Only the existing ``title`` entry is passed to the model ("N/A" when
        it is missing).
        """
        text = f"Video title: {metadata.get('title', 'N/A')}"
        return self.analyze_content(text, "video.mp4", FileType.VIDEO)