# solventum-image-metadata/backend/app/processors/metadata_analyzer.py

"""AI-powered metadata analysis using OpenAI GPT with production-ready features."""

import json
from openai import OpenAI
from typing import Dict, Optional
from .config import Config
from .file_detector import FileType
from .utils import get_logger, sanitize_metadata_value

# Optional dependencies: each *_AVAILABLE flag records whether the package could be imported
try:
    import tiktoken
    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False

try:
    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
    TENACITY_AVAILABLE = True
except ImportError:
    TENACITY_AVAILABLE = False

logger = get_logger(__name__)

class MetadataAnalyzer:
    """Analyze content and generate metadata using OpenAI GPT with production-ready error handling."""

    # Valid OpenAI models (as of January 2026)
    VALID_MODELS = [
        # GPT-5 models (dated snapshots below are from 2025-08-07)
        'gpt-5', 'gpt-5-mini', 'gpt-5-nano',
        'gpt-5-mini-2025-08-07', 'gpt-5-nano-2025-08-07',
        # GPT-4 and GPT-3.5 models
        'gpt-4o', 'gpt-4o-mini', 'gpt-4o-mini-2024-07-18',
        'gpt-4-turbo', 'gpt-4', 'gpt-3.5-turbo',
        # Reasoning models
        'o1', 'o1-mini', 'o1-preview'
    ]

    def __init__(self):
        """Initialize the analyzer with OpenAI client."""
        if not Config.OPENAI_API_KEY:
            raise ValueError("OpenAI API key not configured")

        self.client = OpenAI(api_key=Config.OPENAI_API_KEY)
        self.model = Config.AI_MODEL

        # Validate model name
        if not self._is_valid_model(self.model):
            logger.warning(f"⚠️  Model '{self.model}' may not be valid. Valid models: {', '.join(self.VALID_MODELS)}")
            logger.warning(f"⚠️  Using fallback model: gpt-4o-mini")
            self.model = 'gpt-4o-mini'

        self.max_tokens = Config.MAX_TOKENS
        self.temperature = Config.TEMPERATURE

        logger.info(f"Initialized MetadataAnalyzer with model: {self.model}")

        # Initialize tiktoken encoding for proper token counting
        if TIKTOKEN_AVAILABLE:
            try:
                self.encoding = tiktoken.encoding_for_model(self.model)
            except KeyError:
                # Fallback for models not in tiktoken registry
                self.encoding = tiktoken.get_encoding("cl100k_base")
        else:
            self.encoding = None
            logger.warning("tiktoken not available - using character-based truncation")

    def _count_tokens(self, text: str) -> int:
        """Count tokens using tiktoken (proper tokenization)."""
        if self.encoding:
            return len(self.encoding.encode(text))
        else:
            # Fallback: rough estimate (1 token ≈ 4 characters)
            return len(text) // 4

    def _truncate_content(self, content: str, max_tokens: int = 3000) -> str:
        """Intelligently truncate content to fit token limit."""
        if not self.encoding:
            # Character-based fallback
            max_chars = max_tokens * 4
            if len(content) <= max_chars:
                return content
            return content[:max_chars]

        tokens = self.encoding.encode(content)
        if len(tokens) <= max_tokens:
            return content

        # Truncate and decode back
        truncated_tokens = tokens[:max_tokens]
        return self.encoding.decode(truncated_tokens)

    def _is_valid_model(self, model: str) -> bool:
        """Check if model name is valid."""
        # Exact match
        if model in self.VALID_MODELS:
            return True
        # Check if it starts with a valid prefix (for dated versions)
        for valid_model in self.VALID_MODELS:
            if model.startswith(valid_model):
                return True
        return False

    def _is_new_model(self) -> bool:
        """
        Check if model is a new generation model.
        New models (GPT-5, GPT-4o, o1) use max_completion_tokens and don't support custom temperature.
        """
        new_models = ['gpt-5', 'gpt-4o', 'gpt-4-turbo', 'o1']
        return any(self.model.startswith(prefix) for prefix in new_models)

    def _get_api_params(self) -> dict:
        """
        Get the correct API parameters based on model.
        Newer models (GPT-5, GPT-4o, o1) use max_completion_tokens and don't support custom temperature.
        Older models (GPT-3.5-turbo) use max_tokens and support temperature.
        """
        params = {}

        # Token parameter
        if self._is_new_model():
            params['max_completion_tokens'] = self.max_tokens
            # New models (GPT-5, GPT-4o, o1) don't support custom temperature (only default value 1)
            logger.debug(f"Using max_completion_tokens for {self.model}")
        else:
            params['max_tokens'] = self.max_tokens
            params['temperature'] = self.temperature
            logger.debug(f"Using max_tokens + temperature for {self.model}")

        return params

    def _call_openai_api(self, messages: list) -> dict:
        """
        Call OpenAI API with automatic retry on failures.
        Uses tenacity for exponential backoff if available.
        """
        # Get the correct API parameters
        api_params = self._get_api_params()

        if TENACITY_AVAILABLE:
            # Use retry decorator dynamically
            retry_decorator = retry(
                stop=stop_after_attempt(Config.API_MAX_RETRIES),
                wait=wait_exponential(multiplier=Config.API_RETRY_DELAY, min=2, max=10),
                retry=retry_if_exception_type((Exception,)),
                reraise=True
            )

            @retry_decorator
            def _api_call():
                return self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    timeout=Config.API_TIMEOUT,
                    **api_params
                )

            return _api_call()
        else:
            # Fallback: simple retry without exponential backoff
            import time
            last_error = None

            for attempt in range(Config.API_MAX_RETRIES):
                try:
                    return self.client.chat.completions.create(
                        model=self.model,
                        messages=messages,
                        timeout=Config.API_TIMEOUT,
                        **api_params
                    )
                except Exception as e:
                    last_error = e
                    if attempt < Config.API_MAX_RETRIES - 1:
                        wait_time = Config.API_RETRY_DELAY * (2 ** attempt)
                        logger.warning(f"API call failed (attempt {attempt + 1}/{Config.API_MAX_RETRIES}), retrying in {wait_time}s: {e}")
                        time.sleep(wait_time)

            raise last_error

    def analyze_content(self, content: str, filename: str, file_type: FileType) -> Dict[str, str]:
        """
        Analyze content and generate appropriate metadata with production-ready error handling.

        Args:
            content: Extracted text content
            filename: Original filename
            file_type: Type of file

        Returns:
            Dictionary with metadata (title, subject, keywords, _tokens_used, _confidence)
        """
        try:
            # Truncate content if needed with proper token counting
            content_tokens = self._count_tokens(content)
            if content_tokens > Config.MAX_TEXT_LENGTH:
                content = self._truncate_content(content, Config.MAX_TEXT_LENGTH)
                logger.info(f"Truncated content from {content_tokens} to {self._count_tokens(content)} tokens")

            # Generate prompt based on file type
            prompt = self._create_prompt(content, filename, file_type)

            # Count total tokens before API call
            prompt_tokens = self._count_tokens(prompt)
            logger.info(f"API call for {filename}: {prompt_tokens} prompt tokens")

            # Call API with retry logic
            response = self._call_openai_api([
                {"role": "system", "content": "You are a metadata expert who generates professional, accurate metadata for documents in English."},
                {"role": "user", "content": prompt}
            ])

            # Parse response with detailed logging
            logger.info(f"API Response for {filename}:")
            logger.info(f"  - Model used: {response.model}")
            logger.info(f"  - Finish reason: {response.choices[0].finish_reason}")
            logger.info(f"  - Tokens: prompt={response.usage.prompt_tokens}, completion={response.usage.completion_tokens}, total={response.usage.total_tokens}")

            metadata_text = response.choices[0].message.content
            logger.info(f"  - Content length: {len(metadata_text) if metadata_text else 0} chars")
            logger.info(f"  - Content preview: {metadata_text[:200] if metadata_text else '(empty)'}")

            # Check if content is None or empty
            if not metadata_text or len(metadata_text.strip()) == 0:
                logger.error(f"❌ API returned empty content for {filename}!")
                logger.error(f"   This usually means:")
                logger.error(f"   1. Invalid model name: {self.model}")
                logger.error(f"   2. Model doesn't support this request type")
                logger.error(f"   3. Content was filtered/refused")
                logger.error(f"   Using fallback metadata instead.")
                return self._generate_fallback_metadata(filename, file_type)

            metadata = self._parse_metadata_response(metadata_text)

            # Sanitize metadata values
            metadata = {
                key: sanitize_metadata_value(value)
                for key, value in metadata.items()
            }

            # Add metadata about the generation
            metadata['_tokens_used'] = response.usage.total_tokens
            metadata['_confidence'] = 0.9  # Could calculate based on response

            logger.info(f"Generated metadata for {filename} (tokens used: {metadata['_tokens_used']})")
            return metadata

        except Exception as e:
            logger.error(f"Error analyzing content for {filename}: {e}")
            # Return fallback metadata with error info
            fallback = self._generate_fallback_metadata(filename, file_type)
            fallback['_ai_error'] = str(e)
            fallback['_tokens_used'] = 0
            return fallback

    def _create_prompt(self, content: str, filename: str, file_type: FileType) -> str:
        """Create AI prompt based on file type."""
        file_type_descriptions = {
            FileType.PDF: "PDF document",
            FileType.IMAGE: "image file",
            FileType.OFFICE_DOC: "Word document",
            FileType.OFFICE_SHEET: "Excel spreadsheet",
            FileType.OFFICE_PRESENTATION: "PowerPoint presentation",
            FileType.VIDEO: "video file"
        }

        file_desc = file_type_descriptions.get(file_type, "file")

        prompt = f"""Analyze the following {file_desc} content and generate professional metadata in English.

Filename: {filename}
Content: {content}

Generate metadata with these fields:
1. Title: A concise, professional title (50-100 characters) that clearly describes the document/content
2. Subject: A brief description (1-2 sentences) of the document's purpose and content
3. Keywords: 5-10 relevant keywords separated by commas (include product names, categories, topics)

Rules:
- All text MUST be in English
- Title should identify the main product/service and document type (e.g., "guide", "brochure", "manual")
- Subject should explain what the document is about and its purpose
- Keywords should be searchable terms relevant to the content
- Be professional and concise
- Return ONLY a JSON object with fields: title, subject, keywords

Example output format:
{{
  "title": "3M Filtek Universal Restorative - Shade Selection Guide",
  "subject": "Shade selection guide for 3M Filtek Universal Restorative dental material",
  "keywords": "Filtek, Universal Restorative, shade selection, dental, restorative material, 3M, dentistry, composite"
}}

Return only the JSON object, no additional text."""

        return prompt

    def _parse_metadata_response(self, response_text: str) -> Dict[str, str]:
        """Parse AI response into metadata dictionary."""
        try:
            # Try to parse as JSON first
            response_text = response_text.strip()
            logger.info(f"Parsing response (length={len(response_text)}): {response_text[:200]}")

            # Remove markdown code blocks if present
            if response_text.startswith('```'):
                lines = response_text.split('\n')
                # Find first and last code block markers
                start_idx = 0
                end_idx = len(lines)
                for i, line in enumerate(lines):
                    if line.startswith('```'):
                        if start_idx == 0:
                            start_idx = i + 1
                        else:
                            end_idx = i
                            break
                response_text = '\n'.join(lines[start_idx:end_idx])

            # Try to find JSON object in text
            # Look for { ... } pattern
            start = response_text.find('{')
            end = response_text.rfind('}')
            if start != -1 and end != -1:
                json_str = response_text[start:end+1]
                metadata = json.loads(json_str)
            else:
                metadata = json.loads(response_text)

            # Ensure all required fields are present
            required_fields = ['title', 'subject', 'keywords']
            for field in required_fields:
                if field not in metadata:
                    metadata[field] = ""

            # Validate that we got actual content
            if not metadata.get('title') or len(metadata.get('title', '').strip()) < 3:
                logger.warning("JSON parsed but title is empty or too short, using text parsing")
                return self._parse_metadata_text(response_text)

            return metadata

        except (json.JSONDecodeError, ValueError, KeyError) as e:
            logger.warning(f"Failed to parse JSON response ({str(e)}), using text parsing")
            return self._parse_metadata_text(response_text)

    def _parse_metadata_text(self, text: str) -> Dict[str, str]:
        """Parse metadata from plain text response."""
        metadata = {
            'title': '',
            'subject': '',
            'keywords': ''
        }

        # Improved text parsing
        lines = text.split('\n')

        for line in lines:
            line = line.strip()
            if not line or line.startswith('#') or line.startswith('//'):
                continue

            # Remove quotes and extra whitespace
            line_clean = line.strip('"\'')

            # Look for field indicators (case insensitive)
            line_lower = line_clean.lower()

            if ':' in line_clean:
                parts = line_clean.split(':', 1)
                key = parts[0].strip().lower()
                value = parts[1].strip().strip('",\'')

                if 'title' in key and not metadata['title']:
                    metadata['title'] = value
                elif 'subject' in key and not metadata['subject']:
                    metadata['subject'] = value
                elif 'keyword' in key and not metadata['keywords']:
                    metadata['keywords'] = value

        # If still empty, try to extract from unstructured text
        if not metadata['title']:
            # Look for first substantial line as title
            for line in lines:
                line = line.strip().strip('"\'')
                if len(line) > 10 and not line.lower().startswith(('title', 'subject', 'keyword')):
                    metadata['title'] = line[:200]  # Limit length
                    break

        logger.info(f"Text parsing result: title='{metadata['title'][:50]}...', subject='{metadata['subject'][:50]}...'")
        return metadata

    def _generate_fallback_metadata(self, filename: str, file_type: FileType) -> Dict[str, str]:
        """Generate basic metadata based on filename when AI fails."""
        # Remove extension and clean filename
        from pathlib import Path
        clean_name = Path(filename).stem.replace('_', ' ').replace('-', ' ')

        return {
            'title': clean_name,
            'subject': f"{clean_name} - {FileType(file_type).value}",
            'keywords': clean_name.replace(' ', ', ')
        }

    def generate_metadata_for_pdf(self, text: str) -> Dict[str, str]:
        """Specialized metadata generation for PDF documents."""
        # Wrapper for PDF-specific logic if needed
        return self.analyze_content(text, "document.pdf", FileType.PDF)

    def generate_metadata_for_image(self, text: str) -> Dict[str, str]:
        """Specialized metadata generation for images."""
        return self.analyze_content(text, "image.jpg", FileType.IMAGE)

    def generate_metadata_for_office(self, text: str) -> Dict[str, str]:
        """Specialized metadata generation for Office documents."""
        return self.analyze_content(text, "document.docx", FileType.OFFICE_DOC)

    def generate_metadata_for_video(self, metadata: Dict[str, str]) -> Dict[str, str]:
        """Specialized metadata generation for videos."""
        # For videos, we might use existing metadata as input
        text = f"Video title: {metadata.get('title', 'N/A')}"
        return self.analyze_content(text, "video.mp4", FileType.VIDEO)