diff --git a/src/metadata_analyzer.py b/src/metadata_analyzer.py
index 78a705c..a4c6539 100644
--- a/src/metadata_analyzer.py
+++ b/src/metadata_analyzer.py
@@ -1,4 +1,4 @@
-"""AI-powered metadata analysis using OpenAI GPT."""
+"""AI-powered metadata analysis using OpenAI GPT with production-ready features."""
 
 import json
 from openai import OpenAI
@@ -7,10 +7,23 @@ from .config import Config
 from .file_detector import FileType
 from .utils import get_logger, sanitize_metadata_value
 
+# Production-ready imports
+try:
+    import tiktoken
+    TIKTOKEN_AVAILABLE = True
+except ImportError:
+    TIKTOKEN_AVAILABLE = False
+
+try:
+    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
+    TENACITY_AVAILABLE = True
+except ImportError:
+    TENACITY_AVAILABLE = False
+
 logger = get_logger(__name__)
 
 class MetadataAnalyzer:
-    """Analyze content and generate metadata using OpenAI GPT."""
+    """Analyze content and generate metadata using OpenAI GPT with production-ready error handling."""
 
     def __init__(self):
         """Initialize the analyzer with OpenAI client."""
@@ -19,10 +32,126 @@ class MetadataAnalyzer:
 
         self.client = OpenAI(api_key=Config.OPENAI_API_KEY)
         self.model = Config.AI_MODEL
+        self.max_tokens = Config.MAX_TOKENS
+        self.temperature = Config.TEMPERATURE
+
+        # Initialize tiktoken encoding for proper token counting
+        if TIKTOKEN_AVAILABLE:
+            try:
+                self.encoding = tiktoken.encoding_for_model(self.model)
+            except KeyError:
+                # Fallback for models not in tiktoken registry
+                self.encoding = tiktoken.get_encoding("cl100k_base")
+        else:
+            self.encoding = None
+            logger.warning("tiktoken not available - using character-based truncation")
+
+    def _count_tokens(self, text: str) -> int:
+        """
+        Count the tokens in text.
+
+        Args:
+            text: Text to measure
+
+        Returns:
+            Token count from tiktoken, or a rough estimate without it
+        """
+        if self.encoding:
+            # disallowed_special=() lets text that looks like a special
+            # token (e.g. "<|endoftext|>") be encoded as ordinary text
+            # instead of raising ValueError
+            return len(self.encoding.encode(text, disallowed_special=()))
+
+        # Fallback: rough estimate (1 token ≈ 4 characters)
+        return len(text) // 4
+
+    def _truncate_content(self, content: str, max_tokens: int = 3000) -> str:
+        """
+        Truncate content to fit a token limit.
+
+        Args:
+            content: Text to truncate
+            max_tokens: Maximum number of tokens to keep
+
+        Returns:
+            The content unchanged if it fits, otherwise its leading part
+        """
+        if not self.encoding:
+            # Character-based fallback (1 token ≈ 4 characters)
+            return content[:max_tokens * 4]
+
+        # disallowed_special=() keeps special-token text in the content
+        # from raising ValueError
+        tokens = self.encoding.encode(content, disallowed_special=())
+        if len(tokens) <= max_tokens:
+            return content
+
+        # Truncate and decode back. The cut can fall inside a multi-byte
+        # character, which decodes to U+FFFD, so strip that residue.
+        return self.encoding.decode(tokens[:max_tokens]).rstrip("\ufffd")
+
+    def _call_openai_api(self, messages: list):
+        """
+        Call the OpenAI chat completions API, retrying transient failures.
+
+        Uses tenacity for exponential backoff if available, otherwise a
+        manual backoff loop.
+
+        Args:
+            messages: Chat messages to send to the model
+
+        Returns:
+            The chat completion response returned by the client
+
+        Raises:
+            Exception: The API error, once attempts are exhausted or
+                immediately if the error is not a transient one
+        """
+        import time
+        import openai
+
+        # Retry only transient failures; authentication or bad-request
+        # errors would fail the same way on every attempt
+        retryable = (
+            openai.APIConnectionError,
+            openai.RateLimitError,
+            openai.InternalServerError,
+        )
+        # Always try at least once, even if the setting is 0 or negative
+        max_attempts = max(1, Config.API_MAX_RETRIES)
+
+        def _api_call():
+            return self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_tokens=self.max_tokens,
+                temperature=self.temperature,
+                timeout=Config.API_TIMEOUT
+            )
+
+        if TENACITY_AVAILABLE:
+            retrying_call = retry(
+                stop=stop_after_attempt(max_attempts),
+                wait=wait_exponential(multiplier=Config.API_RETRY_DELAY, min=2, max=10),
+                retry=retry_if_exception_type(retryable),
+                reraise=True
+            )(_api_call)
+            return retrying_call()
+
+        # Fallback: manual retry loop with exponential backoff
+        for attempt in range(max_attempts):
+            try:
+                return _api_call()
+            except retryable as e:
+                if attempt == max_attempts - 1:
+                    raise
+                wait_time = Config.API_RETRY_DELAY * (2 ** attempt)
+                logger.warning(f"API call failed (attempt {attempt + 1}/{max_attempts}), retrying in {wait_time}s: {e}")
+                time.sleep(wait_time)
 
     def analyze_content(self, content: str, filename: str, file_type: FileType) -> Dict[str, str]:
         """
-        Analyze content and generate appropriate metadata.
+        Analyze content and generate appropriate metadata with production-ready error handling.
 
         Args:
             content: Extracted text content
@@ -30,26 +159,27 @@ class MetadataAnalyzer:
             file_type: Type of file
 
         Returns:
-            Dictionary with metadata (title, subject, keywords)
+            Dictionary with metadata (title, subject, keywords, _tokens_used, _confidence)
         """
         try:
-            # Truncate content if too long
-            if len(content) > Config.MAX_TEXT_LENGTH:
-                content = content[:Config.MAX_TEXT_LENGTH] + "..."
+            # Truncate content if needed with proper token counting
+            content_tokens = self._count_tokens(content)
+            if content_tokens > Config.MAX_TEXT_LENGTH:
+                content = self._truncate_content(content, Config.MAX_TEXT_LENGTH)
+                logger.info(f"Truncated content from {content_tokens} to {self._count_tokens(content)} tokens")
 
             # Generate prompt based on file type
             prompt = self._create_prompt(content, filename, file_type)
 
-            # Call OpenAI API
-            response = self.client.chat.completions.create(
-                model=self.model,
-                messages=[
-                    {"role": "system", "content": "You are a metadata expert who generates professional, accurate metadata for documents in English."},
-                    {"role": "user", "content": prompt}
-                ],
-                temperature=Config.TEMPERATURE,
-                max_tokens=Config.MAX_TOKENS
-            )
+            # Count total tokens before API call
+            prompt_tokens = self._count_tokens(prompt)
+            logger.info(f"API call for {filename}: {prompt_tokens} prompt tokens")
+
+            # Call API with retry logic
+            response = self._call_openai_api([
+                {"role": "system", "content": "You are a metadata expert who generates professional, accurate metadata for documents in English."},
+                {"role": "user", "content": prompt}
+            ])
 
             # Parse response
             metadata_text = response.choices[0].message.content
@@ -61,13 +191,22 @@ class MetadataAnalyzer:
                 for key, value in metadata.items()
             }
 
-            logger.info(f"Generated metadata for {filename}")
+            # Add metadata about the generation. usage can be absent on a
+            # response; that must not discard the metadata parsed above.
+            usage = getattr(response, 'usage', None)
+            metadata['_tokens_used'] = usage.total_tokens if usage else 0
+            metadata['_confidence'] = 0.9  # Could calculate based on response
+
+            logger.info(f"Generated metadata for {filename} (tokens used: {metadata['_tokens_used']})")
             return metadata
 
         except Exception as e:
-            logger.error(f"Error analyzing content: {e}")
-            # Return fallback metadata
-            return self._generate_fallback_metadata(filename, file_type)
+            logger.error(f"Error analyzing content for {filename}: {e}")
+            # Return fallback metadata with error info
+            fallback = self._generate_fallback_metadata(filename, file_type)
+            fallback['_ai_error'] = str(e)
+            fallback['_tokens_used'] = 0
+            return fallback
 
     def _create_prompt(self, content: str, filename: str, file_type: FileType) -> str:
         """Create AI prompt based on file type."""
diff --git a/templates/index.html b/templates/index.html
index 7a54645..5ef7d0f 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -491,7 +491,13 @@
             currentFiles = [];
 
             const metadataSource = document.getElementById('metadataSource').value;
-            showInfo(`Processing ${files.length} file(s) with ${metadataSource} source...`);
+
+            // Show specific message for AI processing
+            if (metadataSource === 'ai') {
+                showInfo(`🤖 Generating AI metadata for ${files.length} file(s)... This may take 10-30 seconds per file.`);
+            } else {
+                showInfo(`Processing ${files.length} file(s) with ${metadataSource} source...`);
+            }
 
             const formData = new FormData();
             formData.append('metadata_source', metadataSource);
@@ -550,6 +556,19 @@
                 fileItem.className = 'file-item';
                 fileItem.id = `file-${index}`;
 
+                // Build AI info section if available
+                let aiInfoHtml = '';
+                if (file.suggested_metadata._tokens_used) {
+                    aiInfoHtml = `