diff --git a/src/metadata_analyzer.py b/src/metadata_analyzer.py
index 78a705c..a4c6539 100644
--- a/src/metadata_analyzer.py
+++ b/src/metadata_analyzer.py
@@ -1,4 +1,4 @@
-"""AI-powered metadata analysis using OpenAI GPT."""
+"""AI-powered metadata analysis using OpenAI GPT with production-ready features."""
 
 import json
 from openai import OpenAI
@@ -7,10 +7,29 @@ from .config import Config
 from .file_detector import FileType
 from .utils import get_logger, sanitize_metadata_value
 
+# Production-ready imports
+try:
+    import tiktoken
+    TIKTOKEN_AVAILABLE = True
+except ImportError:
+    TIKTOKEN_AVAILABLE = False
+
+try:
+    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
+    TENACITY_AVAILABLE = True
+except ImportError:
+    TENACITY_AVAILABLE = False
+
+# Only transient failures are retried; authentication and validation errors
+# fail identically on every attempt, so retrying them only delays the fallback.
+from openai import APIConnectionError, APITimeoutError, InternalServerError, RateLimitError
+
+RETRYABLE_API_ERRORS = (APIConnectionError, APITimeoutError, InternalServerError, RateLimitError)
+
 logger = get_logger(__name__)
 
 class MetadataAnalyzer:
-    """Analyze content and generate metadata using OpenAI GPT."""
+    """Analyze content and generate metadata using OpenAI GPT with production-ready error handling."""
 
     def __init__(self):
         """Initialize the analyzer with OpenAI client."""
@@ -19,10 +38,116 @@ class MetadataAnalyzer:
 
         self.client = OpenAI(api_key=Config.OPENAI_API_KEY)
         self.model = Config.AI_MODEL
+        self.max_tokens = Config.MAX_TOKENS
+        self.temperature = Config.TEMPERATURE
+
+        # Initialize tiktoken encoding for proper token counting
+        if TIKTOKEN_AVAILABLE:
+            try:
+                self.encoding = tiktoken.encoding_for_model(self.model)
+            except KeyError:
+                # Fallback for models not in tiktoken registry
+                self.encoding = tiktoken.get_encoding("cl100k_base")
+        else:
+            self.encoding = None
+            logger.warning("tiktoken not available - using character-based truncation")
+
+    def _count_tokens(self, text: str) -> int:
+        """Count the tokens in ``text``.
+
+        Uses the tiktoken encoding when available; otherwise falls back to a
+        rough estimate of one token per four characters.
+        """
+        if self.encoding:
+            # disallowed_special=() makes special-token text found in documents
+            # (e.g. "<|endoftext|>") encode as ordinary text instead of raising
+            # ValueError, which would otherwise abort the whole analysis.
+            return len(self.encoding.encode(text, disallowed_special=()))
+        # Fallback: rough estimate (1 token ≈ 4 characters)
+        return len(text) // 4
+
+    def _truncate_content(self, content: str, max_tokens: int = 3000) -> str:
+        """Truncate ``content`` so it fits within ``max_tokens`` tokens.
+
+        Args:
+            content: Text to shorten.
+            max_tokens: Token budget; values below zero are treated as zero.
+
+        Returns:
+            ``content`` unchanged when it already fits, otherwise its leading
+            portion. Without tiktoken the budget is approximated as four
+            characters per token.
+        """
+        # A negative budget would slice from the end instead of truncating
+        max_tokens = max(max_tokens, 0)
+
+        if not self.encoding:
+            # Character-based fallback; slicing leaves short text unchanged
+            return content[:max_tokens * 4]
+
+        # Encode special-token text as plain text rather than raising ValueError
+        tokens = self.encoding.encode(content, disallowed_special=())
+        if len(tokens) <= max_tokens:
+            return content
+
+        # Truncate and decode back
+        return self.encoding.decode(tokens[:max_tokens])
+
+    def _call_openai_api(self, messages: list) -> dict:
+        """Call the OpenAI chat completions API, retrying transient failures.
+
+        Uses tenacity for exponential backoff when it is installed, otherwise a
+        simple retry loop with a doubling delay. At least one attempt is always
+        made, even if ``Config.API_MAX_RETRIES`` is zero or negative.
+
+        Args:
+            messages: Chat messages in the OpenAI ``role``/``content`` format.
+
+        Returns:
+            The completion response object returned by the OpenAI client.
+
+        Raises:
+            Exception: The last API error once all attempts are exhausted, or
+                immediately for errors that are not worth retrying.
+        """
+        # Guard against a misconfigured retry count: zero attempts would skip the
+        # call entirely and end in ``raise None`` (a TypeError) in the fallback loop.
+        attempts = max(1, Config.API_MAX_RETRIES)
+
+        def _api_call():
+            return self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_tokens=self.max_tokens,
+                temperature=self.temperature,
+                timeout=Config.API_TIMEOUT
+            )
+
+        if TENACITY_AVAILABLE:
+            retrying = retry(
+                stop=stop_after_attempt(attempts),
+                wait=wait_exponential(multiplier=Config.API_RETRY_DELAY, min=2, max=10),
+                retry=retry_if_exception_type(RETRYABLE_API_ERRORS),
+                reraise=True
+            )
+            return retrying(_api_call)()
+
+        # Fallback: manual retry loop with a doubling delay
+        import time
+
+        for attempt in range(1, attempts + 1):
+            try:
+                return _api_call()
+            except RETRYABLE_API_ERRORS as e:
+                if attempt == attempts:
+                    raise
+                wait_time = Config.API_RETRY_DELAY * (2 ** (attempt - 1))
+                logger.warning(f"API call failed (attempt {attempt}/{attempts}), retrying in {wait_time}s: {e}")
+                time.sleep(wait_time)
 
     def analyze_content(self, content: str, filename: str, file_type: FileType) -> Dict[str, str]:
         """
-        Analyze content and generate appropriate metadata.
+        Analyze content and generate appropriate metadata with production-ready error handling.
 
         Args:
             content: Extracted text content
@@ -30,26 +155,29 @@ class MetadataAnalyzer:
             file_type: Type of file
 
         Returns:
-            Dictionary with metadata (title, subject, keywords)
+            Dictionary with metadata (title, subject, keywords, _tokens_used, _confidence)
         """
         try:
-            # Truncate content if too long
-            if len(content) > Config.MAX_TEXT_LENGTH:
-                content = content[:Config.MAX_TEXT_LENGTH] + "..."
+            # Truncate content if needed with proper token counting
+            # NOTE(review): MAX_TEXT_LENGTH was a character limit before this change and is
+            # now interpreted as a token limit (~4x more text) - confirm the configured value.
+            content_tokens = self._count_tokens(content)
+            if content_tokens > Config.MAX_TEXT_LENGTH:
+                content = self._truncate_content(content, Config.MAX_TEXT_LENGTH)
+                logger.info(f"Truncated content from {content_tokens} to {self._count_tokens(content)} tokens")
 
             # Generate prompt based on file type
             prompt = self._create_prompt(content, filename, file_type)
 
-            # Call OpenAI API
-            response = self.client.chat.completions.create(
-                model=self.model,
-                messages=[
-                    {"role": "system", "content": "You are a metadata expert who generates professional, accurate metadata for documents in English."},
-                    {"role": "user", "content": prompt}
-                ],
-                temperature=Config.TEMPERATURE,
-                max_tokens=Config.MAX_TOKENS
-            )
+            # Count total tokens before API call
+            prompt_tokens = self._count_tokens(prompt)
+            logger.info(f"API call for {filename}: {prompt_tokens} prompt tokens")
+
+            # Call API with retry logic
+            response = self._call_openai_api([
+                {"role": "system", "content": "You are a metadata expert who generates professional, accurate metadata for documents in English."},
+                {"role": "user", "content": prompt}
+            ])
 
             # Parse response
             metadata_text = response.choices[0].message.content
@@ -61,13 +189,22 @@ class MetadataAnalyzer:
                 for key, value in metadata.items()
             }
 
-            logger.info(f"Generated metadata for {filename}")
+            # Add metadata about the generation. ``usage`` is optional in the API
+            # response, so a missing value must not discard an otherwise good result.
+            usage = getattr(response, 'usage', None)
+            metadata['_tokens_used'] = usage.total_tokens if usage else 0
+            metadata['_confidence'] = 0.9  # Could calculate based on response
+
+            logger.info(f"Generated metadata for {filename} (tokens used: {metadata['_tokens_used']})")
             return metadata
 
         except Exception as e:
-            logger.error(f"Error analyzing content: {e}")
-            # Return fallback metadata
-            return self._generate_fallback_metadata(filename, file_type)
+            logger.error(f"Error analyzing content for {filename}: {e}")
+            # Return fallback metadata with error info
+            fallback = self._generate_fallback_metadata(filename, file_type)
+            fallback['_ai_error'] = str(e)
+            fallback['_tokens_used'] = 0
+            return fallback
 
     def _create_prompt(self, content: str, filename: str, file_type: FileType) -> str:
         """Create AI prompt based on file type."""
diff --git a/templates/index.html b/templates/index.html
index 7a54645..5ef7d0f 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -491,7 +491,13 @@
             currentFiles = [];
 
             const metadataSource = document.getElementById('metadataSource').value;
-            showInfo(`Processing ${files.length} file(s) with ${metadataSource} source...`);
+
+            // Show specific message for AI processing
+            if (metadataSource === 'ai') {
+                showInfo(`🤖 Generating AI metadata for ${files.length} file(s)... This may take 10-30 seconds per file.`);
+            } else {
+                showInfo(`Processing ${files.length} file(s) with ${metadataSource} source...`);
+            }
 
             const formData = new FormData();
             formData.append('metadata_source', metadataSource);
@@ -550,6 +556,19 @@
                 fileItem.className = 'file-item';
                 fileItem.id = `file-${index}`;
 
+                // Build AI info section if available
+                let aiInfoHtml = '';
+                if (file.suggested_metadata._tokens_used) {
+                    aiInfoHtml = `
+ ✓ AI generated (${file.suggested_metadata._tokens_used} tokens used) +
`; + } + if (file.suggested_metadata._ai_error) { + aiInfoHtml = `
+ ⚠️ AI Error: ${file.suggested_metadata._ai_error} +
`; + } + fileItem.innerHTML = `
📄 ${file.filename}
@@ -565,6 +584,7 @@

✏️ Edit Metadata

${displayEditableMetadata(file.suggested_metadata, index)} + ${aiInfoHtml}
@@ -603,10 +623,13 @@ } function displayEditableMetadata(metadata, index) { + // Filter out internal fields (starting with _) const title = metadata?.title || ''; const subject = metadata?.subject || ''; const keywords = metadata?.keywords || ''; + // Don't show internal metadata fields in the form + return `
diff --git a/web_app.py b/web_app.py
index 3086117..811f80e 100644
--- a/web_app.py
+++ b/web_app.py
@@ -20,6 +20,7 @@ import unicodedata
 from src.file_detector import FileDetector, FileType
 from src.excel_metadata_lookup import ExcelMetadataLookup
 from src.config import Config
+from src.metadata_analyzer import MetadataAnalyzer
 
 def safe_filename(filename):
     """Sanitize filename while preserving Unicode characters (Chinese, Japanese, Korean)."""
@@ -52,6 +53,9 @@ EXCEL_PATH = Path(__file__).parent / "Celum ID to Adobe Asset Path Mapping Sprea
 # Initialize metadata lookup from Excel
 metadata_lookup = None
 
+# Initialize AI analyzer (lazy initialization)
+ai_analyzer = None
+
 # Initialize extractors and updaters
 extractors = {
     FileType.PDF: PDFExtractor(),
@@ -81,6 +85,30 @@ def get_metadata_lookup():
         metadata_lookup = ExcelMetadataLookup(str(EXCEL_PATH))
     return metadata_lookup
 
+def get_ai_analyzer():
+    """Get or create the shared AI analyzer instance.
+
+    Returns:
+        The cached MetadataAnalyzer, or None when no OpenAI API key is
+        configured or the analyzer could not be constructed. A failed
+        construction is not cached, so the next call tries again.
+    """
+    global ai_analyzer
+    if ai_analyzer is not None:
+        return ai_analyzer
+    if not Config.OPENAI_API_KEY:
+        return None
+
+    import logging
+    log = logging.getLogger(__name__)
+    try:
+        ai_analyzer = MetadataAnalyzer()
+    except Exception as e:
+        log.error(f"Failed to initialize AI analyzer: {e}")
+        return None
+    log.info("AI analyzer initialized successfully")
+    return ai_analyzer
+
 @app.route('/')
 def index():
     """Main page."""
@@ -167,13 +195,50 @@ def upload_file():
                 }
 
             elif metadata_source == 'ai':
-                # AI generation - will be implemented in Phase 2.3
-                # For now, return placeholder
-                new_metadata = {
-                    'title': Path(filename).stem,
-                    'subject': 'AI generation not yet implemented',
-                    'keywords': ''
-                }
+                # AI generation using MetadataAnalyzer
+                import logging
+                ai_log = logging.getLogger(__name__)
+                analyzer = get_ai_analyzer()
+
+                if analyzer:
+                    try:
+                        # Extract content from file
+                        content = extractor.extract_content(str(filepath))
+
+                        if not content or len(content.strip()) < 10:
+                            # Not enough content for AI analysis
+                            new_metadata = {
+                                'title': Path(filename).stem,
+                                'subject': 'Insufficient content for AI analysis',
+                                'keywords': '',
+                                '_ai_error': 'Not enough text content extracted'
+                            }
+                        else:
+                            # Generate metadata with AI
+                            new_metadata = analyzer.analyze_content(content, filename, file_type)
+
+                            # Log token usage if available
+                            if '_tokens_used' in new_metadata:
+                                ai_log.info(
+                                    f"AI tokens used for {filename}: {new_metadata['_tokens_used']}"
+                                )
+
+                    except Exception as e:
+                        ai_log.error(f"AI generation failed for {filename}: {e}")
+                        new_metadata = {
+                            'title': Path(filename).stem,
+                            'subject': f'AI generation error: {e}',
+                            'keywords': '',
+                            '_ai_error': str(e)
+                        }
+                else:
+                    # AI unavailable: get_ai_analyzer() returns None both when no API key
+                    # is configured and when the analyzer failed to initialize
+                    new_metadata = {
+                        'title': Path(filename).stem,
+                        'subject': 'AI generation not available',
+                        'keywords': '',
+                        '_ai_error': 'AI analyzer unavailable: OpenAI API key not configured or initialization failed'
+                    }
 
             elif metadata_source == 'import':
                 # Import from file - will be implemented in Phase 2.4