#!/usr/bin/env python3
"""
Centralized LLM configuration module for Visual AI QC.

This script manages the configuration and interaction with different language models.
"""

import os
import io
import base64
import time
import traceback
from typing import Dict, List, Optional, Tuple, Union, Any
from dataclasses import dataclass
from dotenv import load_dotenv
from PIL import Image

# Load environment variables
config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.env')
if os.path.exists(config_path):
    load_dotenv(config_path)
    print(f"LLM Config: Loaded environment variables from {config_path}")
else:
    # Fallback to .env or environment variables
    load_dotenv()
    print("LLM Config: Trying to load from default .env file")


# Model version configuration - centralized for easy updates
@dataclass
class ModelVersions:
    """Dataclass to store current model versions"""
    # OpenAI models
    openai_vision: str = "gpt-4o"  # Updated from invalid gpt-4.1-2025-04-14

    # Google Gemini models - Stable
    gemini_vision: str = "gemini-2.5-pro"  # Stable production model

    # Google Gemini models - Beta/Experimental
    gemini_beta: str = "gemini-3-pro-preview"  # Beta model for testing


# Global model versions instance
MODEL_VERSIONS = ModelVersions()

# Available model options for UI selection.
# The model ids are taken from MODEL_VERSIONS so the defaults live in one place.
AVAILABLE_MODELS = {
    'openai': {
        'name': 'OpenAI GPT-4o',
        'model_id': MODEL_VERSIONS.openai_vision,
        'provider': 'OpenAI',
        'status': 'stable',
        'description': 'OpenAI GPT-4o with vision capabilities'
    },
    'gemini': {
        'name': 'Gemini 2.5 Pro',
        'model_id': MODEL_VERSIONS.gemini_vision,
        'provider': 'Gemini',
        'status': 'stable',
        'description': 'Google Gemini 2.5 Pro with multimodal capabilities'
    },
    'gemini_beta': {
        'name': 'Gemini 3 Pro',
        'model_id': MODEL_VERSIONS.gemini_beta,
        'provider': 'Gemini',
        'status': 'beta',
        'description': 'Latest Gemini 3 model - experimental, may be unstable',
        'warning': 'This is a beta model and may produce unexpected results'
    }
}

# Configure OpenAI
try:
    import openai
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if openai_api_key:
        openai.api_key = openai_api_key
        print(f"OpenAI API key loaded (length: {len(openai_api_key)})")
        openai_client = openai.OpenAI(api_key=openai_api_key)
    else:
        print("Warning: OPENAI_API_KEY not found in environment variables.")
        openai_client = None
except ImportError:
    print("Warning: openai library not installed. OpenAI functionality disabled.")
    openai_client = None
except Exception as e:
    print(f"Error configuring OpenAI: {e}")
    openai_client = None

# Configure Google Generative AI
try:
    import google.generativeai as genai
    google_api_key = os.getenv("GOOGLE_API_KEY")
    if google_api_key:
        genai.configure(api_key=google_api_key)
        print(f"Google API key loaded (length: {len(google_api_key)})")
    else:
        print("Warning: GOOGLE_API_KEY not found in environment variables. Gemini functionality may be limited.")
        genai = None
except ImportError:
    print("Warning: google-generativeai library not installed. Gemini functionality disabled.")
    genai = None
except Exception as e:
    print(f"Error configuring Google Generative AI: {e}")
    genai = None


# Pillow image modes the JPEG encoder can write as-is. Other modes
# (e.g. RGBA, LA, P) make Image.save(..., format="JPEG") raise OSError.
_JPEG_WRITABLE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})

# Polling settings used while Gemini processes an uploaded video.
_VIDEO_POLL_INTERVAL_S = 5
_VIDEO_MAX_WAIT_S = 120


def _zero_token_usage() -> Dict[str, int]:
    """Return a fresh token-usage dict with every counter set to 0."""
    return {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}


def _print_debug_block(title: str, body: Any) -> None:
    """Print *title* followed by *body* framed by two dashed separator lines."""
    print(title)
    print("-" * 40)
    print(body)
    print("-" * 40)


def _print_image_debug(pil_image_asset: Image.Image, pil_image_ref: Optional[Image.Image]) -> None:
    """Print size/mode of the images about to be sent to a vision model."""
    print(f"DEBUG: Main asset image dimensions: {pil_image_asset.size}")
    print(f"DEBUG: Main asset image mode: {pil_image_asset.mode}")
    if pil_image_ref:
        print(f"DEBUG: Reference image provided - dimensions: {pil_image_ref.size}")
        print(f"DEBUG: Reference image mode: {pil_image_ref.mode}")
        print("DEBUG: Total images being sent to LLM: 2 (main asset + reference)")
    else:
        print("DEBUG: No reference image provided")
        print("DEBUG: Total images being sent to LLM: 1 (main asset only)")


def _build_enhanced_prompt(prompt: str, has_reference: bool) -> str:
    """Return *prompt*, prefixed with image-labelling instructions when a reference image is sent."""
    if not has_reference:
        return prompt
    return f"""You will receive two images for analysis:

1. FIRST IMAGE: This is the main asset/file that needs to be quality checked
2. SECOND IMAGE: This is the reference image/brand guideline to compare against

Please analyze the first image (main asset) against the quality standards shown in the second image (reference/guideline).

Original prompt: {prompt}"""


def _extract_gemini_token_usage(response: Any) -> Dict[str, int]:
    """Read token counters from a Gemini response's ``usage_metadata``; missing counters stay 0."""
    token_usage = _zero_token_usage()
    if hasattr(response, 'usage_metadata'):
        token_usage['prompt_tokens'] = getattr(response.usage_metadata, 'prompt_token_count', 0)
        token_usage['completion_tokens'] = getattr(response.usage_metadata, 'candidates_token_count', 0)
        token_usage['total_tokens'] = getattr(response.usage_metadata, 'total_token_count', 0)
    return token_usage


def _delete_gemini_file(file_name: str) -> None:
    """Best-effort removal of an uploaded file from Gemini; failures are logged, never raised."""
    try:
        genai.delete_file(file_name)
        print("DEBUG: Cleaned up uploaded video file from Gemini")
    except Exception as cleanup_err:
        print(f"DEBUG: Warning - could not clean up video file: {cleanup_err}")


def pil_image_to_base64(pil_image: Image.Image, format: str = "jpeg") -> str:
    """
    Converts a PIL Image to a base64 encoded string.

    Args:
        pil_image: The image to encode
        format: Target image format name (case-insensitive); "jpg" is accepted as an alias of "jpeg"

    Returns:
        str: The encoded image bytes as a base64 ASCII string
    """
    target_format = format.upper()
    if target_format == "JPG":
        target_format = "JPEG"

    # JPEG has no alpha channel or palette support, so images in such modes
    # (typical for PNG assets) are converted to RGB instead of failing on save.
    if target_format == "JPEG" and pil_image.mode not in _JPEG_WRITABLE_MODES:
        pil_image = pil_image.convert("RGB")

    buffered = io.BytesIO()
    pil_image.save(buffered, format=target_format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


def get_model_info() -> Dict[str, Dict[str, Any]]:
    """Return information about available models and their configurations"""
    return {
        "OpenAI": {
            "available": openai_client is not None,
            "current_version": MODEL_VERSIONS.openai_vision,  # gpt-4o
            "max_input_tokens": 128000,
            "max_output_tokens": 16384,  # Updated for gpt-4o
            "notes": "GPT-4o with vision capabilities, supports images and text"
        },
        "Gemini": {
            "available": genai is not None,
            "current_version": MODEL_VERSIONS.gemini_vision,  # gemini-2.5-pro
            "max_input_tokens": 1048576,  # 1M token context
            "max_output_tokens": 8192,
            "notes": "Gemini 2.5 Pro with vision, thinking, and multimodal capabilities"
        },
        "Gemini_Beta": {
            "available": genai is not None,
            "current_version": MODEL_VERSIONS.gemini_beta,  # gemini-3-pro-preview
            "max_input_tokens": 1048576,
            "max_output_tokens": 8192,
            "status": "beta",
            "notes": "Gemini 3 Pro Preview - Beta model, may be unstable"
        }
    }


def get_available_models() -> Dict[str, Dict[str, Any]]:
    """Get all available models for UI selection"""
    return AVAILABLE_MODELS


def update_model_version(provider: str, model_type: str, version: str) -> bool:
    """
    Update the model version for a specific provider and model type

    Args:
        provider: The LLM provider ("OpenAI" or "Gemini")
        model_type: The type of model ("vision")
        version: The new model version to use

    Returns:
        bool: True if update was successful, False otherwise
    """
    # MODEL_VERSIONS is only mutated (never rebound), so no `global` is needed.
    try:
        if provider == "OpenAI":
            if model_type == "vision":
                MODEL_VERSIONS.openai_vision = version
                print(f"Updated OpenAI vision model to: {version}")
                return True
        elif provider == "Gemini":
            if model_type == "vision":
                MODEL_VERSIONS.gemini_vision = version
                print(f"Updated Gemini vision model to: {version}")
                return True

        print(f"Invalid provider or model type: {provider}/{model_type}")
        return False
    except Exception as e:
        print(f"Error updating model version: {e}")
        return False


def call_openai_vision(
    prompt: str,
    pil_image_asset: Image.Image,
    pil_image_ref: Optional[Image.Image] = None,
    max_retries: int = 2
) -> Tuple[str, Dict[str, int]]:
    """
    Call OpenAI's vision model with the provided prompt and images

    Args:
        prompt: The text prompt to send to the model
        pil_image_asset: The main image as a PIL Image
        pil_image_ref: Optional reference image as a PIL Image
        max_retries: Maximum number of attempts when rate limit errors occur
            (at least one attempt is always made)

    Returns:
        tuple: (response_text, token_usage) where token_usage has the keys
        'prompt_tokens', 'completion_tokens' and 'total_tokens'

    Raises:
        ValueError: If the OpenAI client is not configured
        Exception: The last API error, once attempts are exhausted or a
            non-rate-limit error occurs
    """
    if not openai_client:
        raise ValueError("OpenAI API key or library not configured.")

    print("=" * 80)
    print("DEBUG: OpenAI Vision API Call")
    print("=" * 80)

    # Debug information about images being sent
    _print_image_debug(pil_image_asset, pil_image_ref)

    # Debug prompt information
    _print_debug_block("DEBUG: Prompt being sent to LLM:", prompt)

    # Construct the enhanced prompt to help LLM distinguish images
    enhanced_prompt = _build_enhanced_prompt(prompt, bool(pil_image_ref))
    _print_debug_block("DEBUG: Enhanced prompt with image labeling:", enhanced_prompt)

    content = [
        {"type": "text", "text": enhanced_prompt}
    ]

    # Encode main asset
    base64_asset = pil_image_to_base64(pil_image_asset)
    content.append({
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{base64_asset}"}
    })
    print(f"DEBUG: Main asset encoded to base64 (length: {len(base64_asset)} chars)")

    # Encode reference asset if provided
    if pil_image_ref:
        base64_ref = pil_image_to_base64(pil_image_ref)
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_ref}"}
        })
        print(f"DEBUG: Reference image encoded to base64 (length: {len(base64_ref)} chars)")

    print(f"DEBUG: Content structure being sent:")
    for i, item in enumerate(content):
        if item["type"] == "text":
            print(f"  - Item {i}: Text prompt ({len(item['text'])} chars)")
        elif item["type"] == "image_url":
            print(f"  - Item {i}: Image data (base64 encoded)")

    # Always try at least once, even if the caller passes max_retries <= 0.
    attempts = max(1, max_retries)
    last_exception = None

    for attempt in range(1, attempts + 1):
        try:
            # Use the specifically requested model version from the global config
            print(f"DEBUG: Calling OpenAI model: {MODEL_VERSIONS.openai_vision}")

            response = openai_client.chat.completions.create(
                model=MODEL_VERSIONS.openai_vision,
                messages=[
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                max_tokens=1000  # Adjust token limit as needed
            )

            response_content = response.choices[0].message.content

            # Extract token usage; `usage` may be absent or None on the response.
            usage = getattr(response, 'usage', None)
            token_usage = {
                'prompt_tokens': getattr(usage, 'prompt_tokens', 0) or 0,
                'completion_tokens': getattr(usage, 'completion_tokens', 0) or 0,
                'total_tokens': getattr(usage, 'total_tokens', 0) or 0
            }

            _print_debug_block("DEBUG: OpenAI Response received:", response_content)
            print(f"DEBUG: Response length: {len(response_content) if response_content else 0} chars")
            print(f"DEBUG: Token usage - Prompt: {token_usage['prompt_tokens']}, Completion: {token_usage['completion_tokens']}, Total: {token_usage['total_tokens']}")
            print("=" * 80)

            return response_content, token_usage
        except Exception as e:
            last_exception = e
            if "rate limit" not in str(e).lower():
                # Don't retry on other errors
                print(f"OpenAI API error (non-rate-limit): {e}")
                break
            if attempt == attempts:
                # No attempt left: skip the pointless sleep and fail right away.
                print(f"OpenAI Rate Limit Error: {e}. No retries left.")
                break
            print(f"OpenAI Rate Limit Error: {e}. Retrying ({attempt}/{attempts})...")
            time.sleep(5 * attempt)  # Linear backoff: 5s, 10s, ...

    # Reached only after a failure; the success path returns inside the loop.
    print(f"OpenAI API call failed after retries: {last_exception}")
    raise last_exception


def call_gemini_vision(
    prompt: str,
    pil_image_asset: Image.Image,
    pil_image_ref: Optional[Image.Image] = None,
    model_version: Optional[str] = None
) -> Tuple[str, Dict[str, int]]:
    """
    Call Google's Gemini vision model with the provided prompt and images

    Args:
        prompt: The text prompt to send to the model
        pil_image_asset: The main image as a PIL Image
        pil_image_ref: Optional reference image as a PIL Image
        model_version: Optional model version override (e.g., 'gemini-3-pro-preview')

    Returns:
        tuple: (response_text, token_usage). When the response contains no
        parts (e.g. it was blocked), response_text is an "Error: ..." message
        and all token counters are 0; no exception is raised in that case.

    Raises:
        ValueError: If the Gemini library or API key is not configured
        Exception: Any error raised by the Gemini API call
    """
    if not genai:
        raise ValueError("Google Generative AI library not configured or API key missing.")

    print("=" * 80)
    print("DEBUG: Gemini Vision API Call")
    print("=" * 80)

    # Debug information about images being sent
    _print_image_debug(pil_image_asset, pil_image_ref)

    # Debug prompt information
    _print_debug_block("DEBUG: Original prompt being sent to LLM:", prompt)

    # Get API key from environment again to be sure
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("Google API key not found in environment variables")

    # Configure with the API key directly
    genai.configure(api_key=api_key)

    try:
        # Use specified model version or default to stable
        model_to_use = model_version or MODEL_VERSIONS.gemini_vision
        model = genai.GenerativeModel(model_to_use)
        print(f"DEBUG: Using Gemini model: {model_to_use}")

        # Construct the enhanced prompt to help LLM distinguish images
        enhanced_prompt = _build_enhanced_prompt(prompt, bool(pil_image_ref))
        _print_debug_block("DEBUG: Enhanced prompt with image labeling:", enhanced_prompt)

        # Prepare parts list
        parts = [enhanced_prompt]
        parts.append(pil_image_asset)  # Gemini client handles PIL images directly
        print(f"DEBUG: Added main asset image to parts list (dimensions: {pil_image_asset.size})")

        if pil_image_ref:
            parts.append(pil_image_ref)
            print(f"DEBUG: Added reference image to parts list (dimensions: {pil_image_ref.size})")

        print(f"DEBUG: Total parts being sent to Gemini: {len(parts)} (1 text + {len(parts)-1} images)")

        response = model.generate_content(parts)

        # Handle potential safety blocks
        if not response.parts:
            safety_info = response.prompt_feedback if hasattr(response, 'prompt_feedback') else "No specific feedback provided."
            error_msg = f"Error: Gemini response blocked due to safety settings or other reasons. Feedback: {safety_info}"
            _print_debug_block("DEBUG: Gemini Response blocked by safety filters:", error_msg)
            print("=" * 80)
            # Return error with zero tokens
            return error_msg, _zero_token_usage()

        response_text = response.text

        # Extract token usage from Gemini response
        token_usage = _extract_gemini_token_usage(response)

        _print_debug_block("DEBUG: Gemini Response received successfully:", response_text)
        print(f"DEBUG: Response length: {len(response_text) if response_text else 0} chars")
        print(f"DEBUG: Token usage - Prompt: {token_usage['prompt_tokens']}, Completion: {token_usage['completion_tokens']}, Total: {token_usage['total_tokens']}")
        print("=" * 80)

        return response_text, token_usage
    except Exception as e:
        print(f"Gemini API call failed: {e}")
        error_detail = str(e)
        # Check for common error messages
        if "API key not valid" in error_detail:
            print(f"Invalid API key (length: {len(api_key)})")
        elif "Quota exceeded" in error_detail:
            print("API quota exceeded")
        raise


VIDEO_EXTENSIONS = ['.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv', '.webm']

VIDEO_MIME_TYPES = {
    '.mp4': 'video/mp4',
    '.avi': 'video/x-msvideo',
    '.mov': 'video/quicktime',
    '.mkv': 'video/x-matroska',
    '.wmv': 'video/x-ms-wmv',
    '.flv': 'video/x-flv',
    '.webm': 'video/webm',
}


def is_video_file(file_path: str) -> bool:
    """Check if a file is a video based on its extension."""
    ext = os.path.splitext(file_path)[1].lower()
    return ext in VIDEO_EXTENSIONS


def call_gemini_video(
    prompt: str,
    video_path: str,
    model_version: Optional[str] = None
) -> tuple:
    """
    Call Google's Gemini model with a video file using the File Upload API.

    The video is uploaded, polled until Gemini finishes processing it (up to
    120 seconds), analysed together with the prompt, and finally removed from
    Gemini again. The removal is attempted on every exit path, including errors.

    Args:
        prompt: The text prompt to send to the model
        video_path: Path to the video file on disk
        model_version: Optional model version override

    Returns:
        tuple: (response_text, token_usage_dict). When the response contains
        no parts (e.g. it was blocked), response_text is an "Error: ..."
        message and all token counters are 0.

    Raises:
        ValueError: If Gemini is not configured, the API key is missing, or
            Gemini reports that video processing failed
        Exception: Any other error raised while uploading or analysing
    """
    if not genai:
        raise ValueError("Google Generative AI library not configured or API key missing.")

    print("=" * 80)
    print("DEBUG: Gemini Video API Call")
    print("=" * 80)
    print(f"DEBUG: Video file path: {video_path}")

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("Google API key not found in environment variables")

    genai.configure(api_key=api_key)

    try:
        # Upload the video file to Gemini
        file_ext = os.path.splitext(video_path)[1].lower()
        mime_type = VIDEO_MIME_TYPES.get(file_ext, 'video/mp4')
        file_size_mb = os.path.getsize(video_path) / (1024 * 1024)
        print(f"DEBUG: Uploading video ({file_size_mb:.1f} MB, {mime_type})...")

        video_file = genai.upload_file(video_path, mime_type=mime_type)
        print(f"DEBUG: Video uploaded successfully: {video_file.name}")

        # Remember the remote name now: `video_file` is rebound while polling,
        # and the cleanup below must still work if a later step raises.
        uploaded_name = video_file.name

        try:
            # Wait for file to be processed (Gemini needs time for video processing)
            waited = 0
            while video_file.state.name == "PROCESSING" and waited < _VIDEO_MAX_WAIT_S:
                print(f"DEBUG: Video processing... ({waited}s)")
                time.sleep(_VIDEO_POLL_INTERVAL_S)
                video_file = genai.get_file(video_file.name)
                waited += _VIDEO_POLL_INTERVAL_S

            if video_file.state.name == "FAILED":
                raise ValueError(f"Gemini video processing failed: {video_file.state.name}")

            if video_file.state.name != "ACTIVE":
                print(f"DEBUG: Warning - video state is {video_file.state.name} after {waited}s, proceeding anyway")

            print(f"DEBUG: Video ready for analysis (state: {video_file.state.name})")

            # Use specified model version or default to stable
            model_to_use = model_version or MODEL_VERSIONS.gemini_vision
            model = genai.GenerativeModel(model_to_use)
            print(f"DEBUG: Using Gemini model: {model_to_use}")

            # Send video + prompt to Gemini
            parts = [prompt, video_file]
            print(f"DEBUG: Sending prompt ({len(prompt)} chars) + video to Gemini")

            response = model.generate_content(parts)

            # Handle potential safety blocks
            if not response.parts:
                safety_info = response.prompt_feedback if hasattr(response, 'prompt_feedback') else "No feedback"
                response_text = f"Error: Gemini response blocked. Feedback: {safety_info}"
                print(f"DEBUG: {response_text}")
                token_usage = _zero_token_usage()
            else:
                response_text = response.text

                # Extract token usage
                token_usage = _extract_gemini_token_usage(response)

                _print_debug_block(
                    "DEBUG: Gemini Video Response received:",
                    response_text[:500] + ("..." if len(response_text) > 500 else "")
                )
                print(f"DEBUG: Token usage - Prompt: {token_usage['prompt_tokens']}, Completion: {token_usage['completion_tokens']}, Total: {token_usage['total_tokens']}")
        finally:
            # Clean up uploaded file from Gemini servers on success and on error
            _delete_gemini_file(uploaded_name)

        print("=" * 80)
        return response_text, token_usage
    except Exception as e:
        print(f"Gemini Video API call failed: {e}")
        raise


def run_visual_qc(
    prompt: str,
    asset_path: str,
    reference_path: Optional[str] = None,
    model_name: str = "Gemini",
    model_version: Optional[str] = None
) -> Dict[str, Any]:
    """
    Run visual QC analysis on an asset using the specified model.

    Video assets (by file extension) are always analysed with Gemini's video
    API, whatever `model_name` says. Other assets are loaded as images and
    sent to the selected vision model. Errors are reported in the returned
    dict instead of being raised.

    Args:
        prompt (str): The QC prompt to send to the model
        asset_path (str): Path to the asset file
        reference_path (str, optional): Path to reference image if needed
        model_name (str): Which model to use ("Gemini" or "OpenAI")
        model_version (str, optional): Specific model version override

    Returns:
        dict: Result dictionary with "status" ("success" or "error"),
        "message" (error text, empty on success) and "response". On success
        it also carries "token_usage", plus "asset_type" == "video" for videos.
    """
    print("\n" + "=" * 100)
    print("DEBUG: Visual QC Analysis Starting")
    print("=" * 100)
    print(f"DEBUG: Model selected: {model_name}")
    print(f"DEBUG: Asset file path: {asset_path}")
    if reference_path:
        print(f"DEBUG: Reference file path: {reference_path}")
    else:
        print("DEBUG: No reference file provided")
    print(f"DEBUG: Prompt length: {len(prompt)} characters")

    result = {
        "status": "error",
        "message": "",
        "response": ""
    }

    try:
        # Validate inputs
        if not os.path.exists(asset_path):
            print(f"DEBUG: ERROR - Asset file does not exist: {asset_path}")
            result["message"] = f"Asset file not found: {asset_path}"
            return result

        if reference_path and not os.path.exists(reference_path):
            print(f"DEBUG: ERROR - Reference file does not exist: {reference_path}")
            result["message"] = f"Reference file not found: {reference_path}"
            return result

        print("DEBUG: File validation passed - both files exist")

        # Check if this is a video file - use Gemini's native video API
        if is_video_file(asset_path):
            print(f"DEBUG: Video file detected - using Gemini video analysis")
            if model_name != "Gemini":
                print(f"DEBUG: Video requires Gemini - overriding {model_name} to Gemini")
                model_name = "Gemini"

            if not genai:
                result["message"] = "Gemini API required for video analysis but not configured."
                return result

            api_response, token_usage = call_gemini_video(prompt, asset_path, model_version)

            # NOTE: a blocked Gemini response comes back as an "Error: ..." text
            # and is still reported with status "success" here.
            result["status"] = "success"
            result["response"] = api_response
            result["token_usage"] = token_usage
            result["asset_type"] = "video"
            print("DEBUG: Video QC analysis completed successfully")
            print("=" * 100)
            return result

        # Image/PDF processing path
        # Load assets (imported here, so it is only needed on the image path)
        from visual_qc_apps.utils import get_image_from_asset

        print(f"DEBUG: Loading main asset image from: {asset_path}")
        pil_asset = get_image_from_asset(asset_path)
        if not pil_asset:
            print(f"DEBUG: ERROR - Could not load main asset: {asset_path}")
            result["message"] = f"Could not load or process asset file: {asset_path}"
            return result

        print(f"DEBUG: Main asset loaded successfully - dimensions: {pil_asset.size}, mode: {pil_asset.mode}")

        pil_ref = None
        if reference_path:
            print(f"DEBUG: Loading reference image from: {reference_path}")
            pil_ref = get_image_from_asset(reference_path)
            if not pil_ref:
                print(f"DEBUG: WARNING - Could not load reference image: {reference_path}")
                print("DEBUG: Continuing without reference image")
            else:
                print(f"DEBUG: Reference image loaded successfully - dimensions: {pil_ref.size}, mode: {pil_ref.mode}")

        # Call appropriate API
        print(f"DEBUG: Calling {model_name} API for visual analysis")

        if model_name == "OpenAI":
            if not openai_client:
                print("DEBUG: ERROR - OpenAI client not configured")
                result["message"] = "OpenAI API Key or Client not configured."
                return result
            api_response, token_usage = call_openai_vision(prompt, pil_asset, pil_ref)
        elif model_name == "Gemini":
            if not genai:
                print("DEBUG: ERROR - Gemini client not configured")
                result["message"] = "Google Generative AI (Gemini) API Key or Client not configured."
                return result
            api_response, token_usage = call_gemini_vision(prompt, pil_asset, pil_ref, model_version)
        else:
            print(f"DEBUG: ERROR - Invalid model selected: {model_name}")
            result["message"] = "Invalid model selected."
            return result

        # Success
        print("DEBUG: Visual QC analysis completed successfully")
        print("=" * 100)
        result["status"] = "success"
        result["response"] = api_response
        result["token_usage"] = token_usage
        return result

    except Exception as e:
        print(f"DEBUG: ERROR in visual QC analysis: {str(e)}")
        tb_str = traceback.format_exc()
        print(f"DEBUG: Full traceback:\n{tb_str}")
        print("=" * 100)
        result["message"] = f"Error: {str(e)}\n{tb_str}"
        return result