ai_qc/backend/llm_config.py
nickviljoen 20259dcad0 Add Honda client, video QC, session refresh, Amazon check tuning
- Add Honda client with static_general and video_general profiles
- Add video QC capability using Gemini native video analysis (4 checks:
  visual_quality, brand_consistency, text_legibility, pacing_flow)
- Add video_general profile assigned to all 8 clients
- Extend session lifetime with MSAL silent token refresh (proactive
  every 45min + reactive on expiry), switch cache to localStorage
- Re-enable OCR layout measurements for Amazon checks
- Add scope boundary notes to all 6 Amazon checks to prevent cross-
  check penalization (locale errors isolated to logo_country only)
- Relax margins left-alignment tolerance from 1% to 4% to account
  for logo lockup internal padding
- Update brand guidelines DB with Amazon localization matrix and
  processed Dove PDF summary

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 14:53:52 +02:00

707 lines
No EOL
27 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Centralized LLM configuration module for Visual AI QC.
This script manages the configuration and interaction with different language models.
"""
import os
import io
import base64
import time
import traceback
from typing import Dict, List, Optional, Union, Any
from dataclasses import dataclass
from dotenv import load_dotenv
from PIL import Image
# Load environment variables: prefer a config.env sitting beside this module
config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.env')
if not os.path.exists(config_path):
    # No config.env next to this file - fall back to .env or plain environment variables
    load_dotenv()
    print("LLM Config: Trying to load from default .env file")
else:
    load_dotenv(config_path)
    print(f"LLM Config: Loaded environment variables from {config_path}")
# Single place where the model identifiers used by this module are declared
@dataclass
class ModelVersions:
    """Default model identifier for each provider/tier used by this module."""

    # OpenAI vision model (earlier value gpt-4.1-2025-04-14 was invalid)
    openai_vision: str = "gpt-4o"
    # Google Gemini - stable production model
    gemini_vision: str = "gemini-2.5-pro"
    # Google Gemini - beta/experimental model for testing
    gemini_beta: str = "gemini-3-pro-preview"


# Module-wide instance; update_model_version() mutates its attributes in place
MODEL_VERSIONS = ModelVersions()
# Catalogue of selectable models exposed to the UI, keyed by a short selection id
AVAILABLE_MODELS = {
    'openai': dict(
        name='OpenAI GPT-4o',
        model_id='gpt-4o',
        provider='OpenAI',
        status='stable',
        description='OpenAI GPT-4o with vision capabilities',
    ),
    'gemini': dict(
        name='Gemini 2.5 Pro',
        model_id='gemini-2.5-pro',
        provider='Gemini',
        status='stable',
        description='Google Gemini 2.5 Pro with multimodal capabilities',
    ),
    # Only the beta entry carries an extra 'warning' text
    'gemini_beta': dict(
        name='Gemini 3 Pro',
        model_id='gemini-3-pro-preview',
        provider='Gemini',
        status='beta',
        description='Latest Gemini 3 model - experimental, may be unstable',
        warning='This is a beta model and may produce unexpected results',
    ),
}
# Configure OpenAI
# openai_client stays None unless a client object is successfully created below
openai_client = None
try:
    import openai
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        print("Warning: OPENAI_API_KEY not found in environment variables.")
    else:
        openai.api_key = openai_api_key
        print(f"OpenAI API key loaded (length: {len(openai_api_key)})")
        openai_client = openai.OpenAI(api_key=openai_api_key)
except ImportError:
    print("Warning: openai library not installed. OpenAI functionality disabled.")
except Exception as e:
    print(f"Error configuring OpenAI: {e}")
# Configure Google Generative AI
# On any failure (missing key, missing library, configure error) `genai` is set
# to None so the rest of the module can test `if not genai`.
try:
    import google.generativeai as genai
    google_api_key = os.getenv("GOOGLE_API_KEY")
    if not google_api_key:
        print("Warning: GOOGLE_API_KEY not found in environment variables. Gemini functionality may be limited.")
        genai = None
    else:
        genai.configure(api_key=google_api_key)
        print(f"Google API key loaded (length: {len(google_api_key)})")
except ImportError:
    print("Warning: google-generativeai library not installed. Gemini functionality disabled.")
    genai = None
except Exception as e:
    print(f"Error configuring Google Generative AI: {e}")
    genai = None
def pil_image_to_base64(pil_image: Image.Image, format: str = "jpeg") -> str:
    """
    Convert a PIL Image to a base64 encoded string.

    Args:
        pil_image: The image to encode. It is not modified; a converted copy
            is encoded when the mode is not writable as JPEG.
        format: Target image format name, case-insensitive (default "jpeg").
            "jpg" is accepted as an alias for "jpeg".
    Returns:
        str: Base64 text (UTF-8 decoded) of the encoded image bytes
    """
    target_format = format.upper()
    if target_format == "JPG":
        # Pillow registers the format as "JPEG"; "JPG" alone is rejected by save()
        target_format = "JPEG"

    # Modes Pillow's JPEG writer accepts; anything else (RGBA, LA, P, I, F, ...)
    # makes save() raise "cannot write mode ... as JPEG".
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if target_format == "JPEG" and pil_image.mode not in jpeg_writable_modes:
        has_alpha = "A" in pil_image.getbands() or "transparency" in pil_image.info
        if has_alpha:
            # Flatten transparency onto white instead of dropping the alpha
            # band, which would expose arbitrary colour under transparent pixels
            rgba = pil_image.convert("RGBA")
            flattened = Image.new("RGB", rgba.size, (255, 255, 255))
            flattened.paste(rgba, mask=rgba.split()[-1])
            pil_image = flattened
        else:
            pil_image = pil_image.convert("RGB")

    buffered = io.BytesIO()
    pil_image.save(buffered, format=target_format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
def get_model_info() -> Dict[str, Dict[str, Any]]:
    """
    Describe each supported model family and whether it can currently be used.

    Returns:
        dict: One entry per family ("OpenAI", "Gemini", "Gemini_Beta") with
        availability, the currently configured version from MODEL_VERSIONS,
        token limits and free-text notes.
    """
    # Both Gemini entries depend on the same module-level client being configured
    gemini_ready = genai is not None

    # NOTE(review): token limits and notes below are hard-coded and do not follow
    # changes made through update_model_version() - verify against provider docs.
    info: Dict[str, Dict[str, Any]] = {}
    info["OpenAI"] = {
        "available": openai_client is not None,
        "current_version": MODEL_VERSIONS.openai_vision,
        "max_input_tokens": 128000,
        "max_output_tokens": 16384,
        "notes": "GPT-4o with vision capabilities, supports images and text",
    }
    info["Gemini"] = {
        "available": gemini_ready,
        "current_version": MODEL_VERSIONS.gemini_vision,
        "max_input_tokens": 1048576,
        "max_output_tokens": 8192,
        "notes": "Gemini 2.5 Pro with vision, thinking, and multimodal capabilities",
    }
    info["Gemini_Beta"] = {
        "available": gemini_ready,
        "current_version": MODEL_VERSIONS.gemini_beta,
        "max_input_tokens": 1048576,
        "max_output_tokens": 8192,
        "status": "beta",
        "notes": "Gemini 3 Pro Preview - Beta model, may be unstable",
    }
    return info
def get_available_models() -> Dict[str, Dict[str, Any]]:
    """
    Return the catalogue of models offered for UI selection.

    The module-level AVAILABLE_MODELS mapping itself is returned (not a copy).
    """
    model_catalogue = AVAILABLE_MODELS
    return model_catalogue
def update_model_version(provider: str, model_type: str, version: str) -> bool:
    """
    Update the model version for a specific provider and model type.

    Args:
        provider: The LLM provider ("OpenAI" or "Gemini")
        model_type: The type of model ("vision")
        version: The new model version to use; must be a non-blank string
    Returns:
        bool: True if update was successful, False otherwise (unknown
        provider/model type, or a blank/non-string version)
    """
    # (provider, model_type) -> attribute name on the shared MODEL_VERSIONS object
    targets = {
        ("OpenAI", "vision"): "openai_vision",
        ("Gemini", "vision"): "gemini_vision",
    }

    # A blank version would make every later API call fail with an opaque
    # "model not found" style error, so reject it here.
    if not isinstance(version, str) or not version.strip():
        print(f"Invalid model version for {provider}/{model_type}: {version!r}")
        return False

    try:
        attribute = targets.get((provider, model_type))
    except TypeError:
        # Unhashable provider/model_type: treat like any other unknown combination
        attribute = None

    if attribute is None:
        print(f"Invalid provider or model type: {provider}/{model_type}")
        return False

    # Only an attribute is mutated, so no `global` declaration is needed
    setattr(MODEL_VERSIONS, attribute, version)
    print(f"Updated {provider} {model_type} model to: {version}")
    return True
def call_openai_vision(
    prompt: str,
    pil_image_asset: Image.Image,
    pil_image_ref: Optional[Image.Image] = None,
    max_retries: int = 2,
    max_tokens: int = 1000
) -> tuple:
    """
    Call OpenAI's vision model with the provided prompt and images.

    Args:
        prompt: The text prompt to send to the model
        pil_image_asset: The main image as a PIL Image
        pil_image_ref: Optional reference image as a PIL Image
        max_retries: Maximum number of attempts when rate limited. Values
            below 1 are treated as 1 so the API is always called once.
        max_tokens: Completion token limit passed to the API (default 1000)
    Returns:
        tuple: (response_text, token_usage_dict) where token_usage_dict has
        the keys 'prompt_tokens', 'completion_tokens' and 'total_tokens'
    Raises:
        ValueError: If the OpenAI client is not configured
        Exception: The last API error, re-raised once attempts are exhausted
            or immediately for errors that are not rate-limit errors
    """
    if not openai_client:
        raise ValueError("OpenAI API key or library not configured.")

    print("=" * 80)
    print("DEBUG: OpenAI Vision API Call")
    print("=" * 80)

    # Debug information about images being sent
    print(f"DEBUG: Main asset image dimensions: {pil_image_asset.size}")
    print(f"DEBUG: Main asset image mode: {pil_image_asset.mode}")
    if pil_image_ref:
        print(f"DEBUG: Reference image provided - dimensions: {pil_image_ref.size}")
        print(f"DEBUG: Reference image mode: {pil_image_ref.mode}")
        print("DEBUG: Total images being sent to LLM: 2 (main asset + reference)")
    else:
        print("DEBUG: No reference image provided")
        print("DEBUG: Total images being sent to LLM: 1 (main asset only)")

    # Debug prompt information
    print("DEBUG: Prompt being sent to LLM:")
    print("-" * 40)
    print(prompt)
    print("-" * 40)

    # With two images, prepend labels so the model knows which one is under review
    enhanced_prompt = prompt
    if pil_image_ref:
        enhanced_prompt = (
            "You will receive two images for analysis:\n"
            "1. FIRST IMAGE: This is the main asset/file that needs to be quality checked\n"
            "2. SECOND IMAGE: This is the reference image/brand guideline to compare against\n"
            "Please analyze the first image (main asset) against the quality standards "
            "shown in the second image (reference/guideline).\n"
            "Original prompt:\n"
            f"{prompt}"
        )
        print("DEBUG: Enhanced prompt with image labeling:")
        print("-" * 40)
        print(enhanced_prompt)
        print("-" * 40)

    content = [
        {"type": "text", "text": enhanced_prompt}
    ]

    # Encode main asset (pil_image_to_base64 defaults to JPEG, matching the data URL)
    base64_asset = pil_image_to_base64(pil_image_asset)
    content.append({
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{base64_asset}"}
    })
    print(f"DEBUG: Main asset encoded to base64 (length: {len(base64_asset)} chars)")

    # Encode reference asset if provided
    if pil_image_ref:
        base64_ref = pil_image_to_base64(pil_image_ref)
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_ref}"}
        })
        print(f"DEBUG: Reference image encoded to base64 (length: {len(base64_ref)} chars)")

    # Always make at least one call, even if the caller passes max_retries <= 0
    attempts = max(1, max_retries)
    last_exception = None
    for attempt in range(1, attempts + 1):
        try:
            # Use the model version from the global config
            print(f"DEBUG: Calling OpenAI model: {MODEL_VERSIONS.openai_vision}")
            print("DEBUG: Content structure being sent:")
            for i, item in enumerate(content):
                if item["type"] == "text":
                    print(f" - Item {i}: Text prompt ({len(item['text'])} chars)")
                elif item["type"] == "image_url":
                    print(f" - Item {i}: Image data (base64 encoded)")

            response = openai_client.chat.completions.create(
                model=MODEL_VERSIONS.openai_vision,
                messages=[
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                max_tokens=max_tokens
            )
            response_content = response.choices[0].message.content

            # `usage` may be missing or None; fall back to zeros instead of raising
            usage = getattr(response, 'usage', None)
            token_usage = {
                'prompt_tokens': getattr(usage, 'prompt_tokens', 0) or 0,
                'completion_tokens': getattr(usage, 'completion_tokens', 0) or 0,
                'total_tokens': getattr(usage, 'total_tokens', 0) or 0
            }

            print("DEBUG: OpenAI Response received:")
            print("-" * 40)
            print(response_content)
            print("-" * 40)
            print(f"DEBUG: Response length: {len(response_content) if response_content else 0} chars")
            print(f"DEBUG: Token usage - Prompt: {token_usage['prompt_tokens']}, Completion: {token_usage['completion_tokens']}, Total: {token_usage['total_tokens']}")
            print("=" * 80)
            return response_content, token_usage
        except Exception as e:
            last_exception = e
            if "rate limit" not in str(e).lower():
                # Don't retry on other errors
                print(f"OpenAI API error (non-rate-limit): {e}")
                break
            if attempt == attempts:
                # Out of attempts: fail now instead of sleeping before giving up
                break
            print(f"OpenAI Rate Limit Error: {e}. Retrying ({attempt}/{attempts})...")
            time.sleep(5 * attempt)  # Linear backoff: 5s, 10s, ...

    # Reaching this point means every attempt failed, so last_exception is set
    print(f"OpenAI API call failed after retries: {last_exception}")
    raise last_exception
def call_gemini_vision(
    prompt: str,
    pil_image_asset: Image.Image,
    pil_image_ref: Optional[Image.Image] = None,
    model_version: Optional[str] = None
) -> tuple:
    """
    Call Google's Gemini vision model with the provided prompt and images.

    Args:
        prompt: The text prompt to send to the model
        pil_image_asset: The main image as a PIL Image
        pil_image_ref: Optional reference image as a PIL Image
        model_version: Optional model version override (e.g., 'gemini-3-pro-preview');
            defaults to MODEL_VERSIONS.gemini_vision
    Returns:
        tuple: (response_text, token_usage_dict). When the response contains
        no parts (e.g. blocked), response_text is a message starting with
        "Error: Gemini response blocked" and all token counts are 0.
    Raises:
        ValueError: If the Gemini library or GOOGLE_API_KEY is not available
        Exception: Any error raised by the Gemini API call is logged and re-raised
    """
    if not genai:
        raise ValueError("Google Generative AI library not configured or API key missing.")

    print("=" * 80)
    print("DEBUG: Gemini Vision API Call")
    print("=" * 80)

    # Debug information about images being sent
    print(f"DEBUG: Main asset image dimensions: {pil_image_asset.size}")
    print(f"DEBUG: Main asset image mode: {pil_image_asset.mode}")
    if pil_image_ref:
        print(f"DEBUG: Reference image provided - dimensions: {pil_image_ref.size}")
        print(f"DEBUG: Reference image mode: {pil_image_ref.mode}")
        print("DEBUG: Total images being sent to LLM: 2 (main asset + reference)")
    else:
        print("DEBUG: No reference image provided")
        print("DEBUG: Total images being sent to LLM: 1 (main asset only)")

    # Debug prompt information
    print("DEBUG: Original prompt being sent to LLM:")
    print("-" * 40)
    print(prompt)
    print("-" * 40)

    # Get API key from environment again to be sure
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("Google API key not found in environment variables")
    # Configure with the API key directly
    genai.configure(api_key=api_key)

    try:
        # Use specified model version or default to stable
        model_to_use = model_version or MODEL_VERSIONS.gemini_vision
        model = genai.GenerativeModel(model_to_use)
        print(f"DEBUG: Using Gemini model: {model_to_use}")

        # With two images, prepend labels so the model knows which one is under review
        enhanced_prompt = prompt
        if pil_image_ref:
            enhanced_prompt = (
                "You will receive two images for analysis:\n"
                "1. FIRST IMAGE: This is the main asset/file that needs to be quality checked\n"
                "2. SECOND IMAGE: This is the reference image/brand guideline to compare against\n"
                "Please analyze the first image (main asset) against the quality standards "
                "shown in the second image (reference/guideline).\n"
                "Original prompt:\n"
                f"{prompt}"
            )
            print("DEBUG: Enhanced prompt with image labeling:")
            print("-" * 40)
            print(enhanced_prompt)
            print("-" * 40)

        # Prepare parts list: text first, then the images in the order the prompt describes
        parts = [enhanced_prompt]
        parts.append(pil_image_asset)  # Gemini client handles PIL images directly
        print(f"DEBUG: Added main asset image to parts list (dimensions: {pil_image_asset.size})")
        if pil_image_ref:
            parts.append(pil_image_ref)
            print(f"DEBUG: Added reference image to parts list (dimensions: {pil_image_ref.size})")
        print(f"DEBUG: Total parts being sent to Gemini: {len(parts)} (1 text + {len(parts)-1} images)")

        response = model.generate_content(parts)

        # Handle potential safety blocks. The `parts` quick accessor raises when
        # no candidate was returned (blocked prompt), so treat that as "no parts"
        # instead of letting it escape as an unrelated-looking error.
        try:
            response_parts = response.parts
        except (ValueError, IndexError):
            response_parts = None

        if not response_parts:
            safety_info = response.prompt_feedback if hasattr(response, 'prompt_feedback') else "No specific feedback provided."
            error_msg = f"Error: Gemini response blocked due to safety settings or other reasons. Feedback: {safety_info}"
            print("DEBUG: Gemini Response blocked by safety filters:")
            print("-" * 40)
            print(error_msg)
            print("-" * 40)
            print("=" * 80)
            # Return error with zero tokens
            return error_msg, {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

        response_text = response.text

        # Extract token usage from Gemini response
        token_usage = {
            'prompt_tokens': 0,
            'completion_tokens': 0,
            'total_tokens': 0
        }
        if hasattr(response, 'usage_metadata'):
            token_usage['prompt_tokens'] = getattr(response.usage_metadata, 'prompt_token_count', 0)
            token_usage['completion_tokens'] = getattr(response.usage_metadata, 'candidates_token_count', 0)
            token_usage['total_tokens'] = getattr(response.usage_metadata, 'total_token_count', 0)

        print("DEBUG: Gemini Response received successfully:")
        print("-" * 40)
        print(response_text)
        print("-" * 40)
        print(f"DEBUG: Response length: {len(response_text) if response_text else 0} chars")
        print(f"DEBUG: Token usage - Prompt: {token_usage['prompt_tokens']}, Completion: {token_usage['completion_tokens']}, Total: {token_usage['total_tokens']}")
        print("=" * 80)
        return response_text, token_usage
    except Exception as e:
        print(f"Gemini API call failed: {e}")
        error_detail = str(e)
        # Check for common error messages
        if "API key not valid" in error_detail:
            print(f"Invalid API key (length: {len(api_key)})")
        elif "Quota exceeded" in error_detail:
            print("API quota exceeded")
        # Bare raise keeps the original traceback intact
        raise
# Extension -> MIME type for every video container this module recognises
VIDEO_MIME_TYPES = {
    '.mp4': 'video/mp4',
    '.avi': 'video/x-msvideo',
    '.mov': 'video/quicktime',
    '.mkv': 'video/x-matroska',
    '.wmv': 'video/x-ms-wmv',
    '.flv': 'video/x-flv',
    '.webm': 'video/webm',
}

# Recognised extensions, in the same order as the MIME table above
VIDEO_EXTENSIONS = list(VIDEO_MIME_TYPES)


def is_video_file(file_path: str) -> bool:
    """Return True when the path's extension (case-insensitive) is a known video type."""
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() in VIDEO_EXTENSIONS
def call_gemini_video(
    prompt: str,
    video_path: str,
    model_version: Optional[str] = None
) -> tuple:
    """
    Call Google's Gemini model with a video file using the File Upload API.

    Gemini natively supports video analysis - it processes the full video
    including motion, transitions, and temporal flow.

    The uploaded remote file is deleted before this function returns or
    raises, whichever path is taken.

    Args:
        prompt: The text prompt to send to the model
        video_path: Path to the video file on disk
        model_version: Optional model version override; defaults to
            MODEL_VERSIONS.gemini_vision
    Returns:
        tuple: (response_text, token_usage_dict). When the response contains
        no parts (e.g. blocked), response_text is a message starting with
        "Error: Gemini response blocked" and all token counts are 0.
    Raises:
        ValueError: If Gemini or GOOGLE_API_KEY is not available, or the
            uploaded video ends in the FAILED state
        Exception: Any other upload/API error is logged and re-raised
    """
    if not genai:
        raise ValueError("Google Generative AI library not configured or API key missing.")

    print("=" * 80)
    print("DEBUG: Gemini Video API Call")
    print("=" * 80)
    print(f"DEBUG: Video file path: {video_path}")

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("Google API key not found in environment variables")
    genai.configure(api_key=api_key)

    # Handle of the remote upload; stays None until the upload has succeeded so
    # the cleanup in `finally` knows whether there is anything to delete.
    video_file = None
    try:
        # Upload the video file to Gemini (unknown extensions are sent as video/mp4)
        file_ext = os.path.splitext(video_path)[1].lower()
        mime_type = VIDEO_MIME_TYPES.get(file_ext, 'video/mp4')
        file_size_mb = os.path.getsize(video_path) / (1024 * 1024)
        print(f"DEBUG: Uploading video ({file_size_mb:.1f} MB, {mime_type})...")
        video_file = genai.upload_file(video_path, mime_type=mime_type)
        print(f"DEBUG: Video uploaded successfully: {video_file.name}")

        # Wait for file to be processed (Gemini needs time for video processing)
        max_wait = 120  # seconds
        poll_interval = 5  # seconds between state checks
        waited = 0
        while video_file.state.name == "PROCESSING" and waited < max_wait:
            print(f"DEBUG: Video processing... ({waited}s)")
            time.sleep(poll_interval)
            video_file = genai.get_file(video_file.name)
            waited += poll_interval

        if video_file.state.name == "FAILED":
            raise ValueError(f"Gemini video processing failed: {video_file.state.name}")
        if video_file.state.name != "ACTIVE":
            # Deliberate best effort: still try the request after the wait budget
            print(f"DEBUG: Warning - video state is {video_file.state.name} after {waited}s, proceeding anyway")
        print(f"DEBUG: Video ready for analysis (state: {video_file.state.name})")

        # Use specified model version or default to stable
        model_to_use = model_version or MODEL_VERSIONS.gemini_vision
        model = genai.GenerativeModel(model_to_use)
        print(f"DEBUG: Using Gemini model: {model_to_use}")

        # Send video + prompt to Gemini
        parts = [prompt, video_file]
        print(f"DEBUG: Sending prompt ({len(prompt)} chars) + video to Gemini")
        response = model.generate_content(parts)

        # Handle potential safety blocks. The `parts` quick accessor raises when
        # no candidate was returned, so treat that the same as an empty list.
        try:
            response_parts = response.parts
        except (ValueError, IndexError):
            response_parts = None

        if not response_parts:
            safety_info = response.prompt_feedback if hasattr(response, 'prompt_feedback') else "No feedback"
            error_msg = f"Error: Gemini response blocked. Feedback: {safety_info}"
            print(f"DEBUG: {error_msg}")
            return error_msg, {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

        response_text = response.text

        # Extract token usage
        token_usage = {
            'prompt_tokens': 0,
            'completion_tokens': 0,
            'total_tokens': 0
        }
        if hasattr(response, 'usage_metadata'):
            token_usage['prompt_tokens'] = getattr(response.usage_metadata, 'prompt_token_count', 0)
            token_usage['completion_tokens'] = getattr(response.usage_metadata, 'candidates_token_count', 0)
            token_usage['total_tokens'] = getattr(response.usage_metadata, 'total_token_count', 0)

        print("DEBUG: Gemini Video Response received:")
        print("-" * 40)
        print(response_text[:500] + ("..." if len(response_text) > 500 else ""))
        print("-" * 40)
        print(f"DEBUG: Token usage - Prompt: {token_usage['prompt_tokens']}, Completion: {token_usage['completion_tokens']}, Total: {token_usage['total_tokens']}")
        return response_text, token_usage
    except Exception as e:
        print(f"Gemini Video API call failed: {e}")
        # Bare raise keeps the original traceback intact
        raise
    finally:
        # Clean up uploaded file from Gemini servers on every exit path,
        # including failures after the upload succeeded.
        if video_file is not None:
            try:
                genai.delete_file(video_file.name)
                print("DEBUG: Cleaned up uploaded video file from Gemini")
            except Exception as cleanup_err:
                # Cleanup is best effort and must not mask the real outcome
                print(f"DEBUG: Warning - could not clean up video file: {cleanup_err}")
        print("=" * 80)
def _gemini_response_blocked(api_response: Any) -> bool:
    """Return True if the text is the blocked-response message produced by the Gemini helpers."""
    return isinstance(api_response, str) and api_response.startswith("Error: Gemini response blocked")


def run_visual_qc(
    prompt: str,
    asset_path: str,
    reference_path: Optional[str] = None,
    model_name: str = "Gemini",
    model_version: Optional[str] = None
) -> Dict[str, Any]:
    """
    Run visual QC analysis on an asset using the specified model.

    Video assets (by file extension) are always analysed with Gemini's video
    API, whatever model_name says; the reference file is only checked for
    existence on that path. Other assets are loaded as images and sent to the
    selected provider.

    Args:
        prompt (str): The QC prompt to send to the model
        asset_path (str): Path to the asset file
        reference_path (str, optional): Path to reference image if needed
        model_name (str): Which model to use ("Gemini" or "OpenAI")
        model_version (str, optional): Specific model version override
            (only forwarded to the Gemini helpers)
    Returns:
        dict: Always contains "status" ("success" or "error"), "message" and
        "response". Once a model call has returned, "token_usage" is added,
        and "asset_type" ("video") is added on the video path. This function
        does not raise: failures are reported with status "error" and the
        error text plus traceback in "message".
    """
    print("\n" + "=" * 100)
    print("DEBUG: Visual QC Analysis Starting")
    print("=" * 100)
    print(f"DEBUG: Model selected: {model_name}")
    print(f"DEBUG: Asset file path: {asset_path}")
    if reference_path:
        print(f"DEBUG: Reference file path: {reference_path}")
    else:
        print("DEBUG: No reference file provided")
    print(f"DEBUG: Prompt length: {len(prompt)} characters")

    result: Dict[str, Any] = {
        "status": "error",
        "message": "",
        "response": ""
    }
    try:
        # Validate inputs
        if not os.path.exists(asset_path):
            print(f"DEBUG: ERROR - Asset file does not exist: {asset_path}")
            result["message"] = f"Asset file not found: {asset_path}"
            return result
        if reference_path and not os.path.exists(reference_path):
            print(f"DEBUG: ERROR - Reference file does not exist: {reference_path}")
            result["message"] = f"Reference file not found: {reference_path}"
            return result
        print("DEBUG: File validation passed - all provided files exist")

        # Check if this is a video file - use Gemini's native video API
        if is_video_file(asset_path):
            print("DEBUG: Video file detected - using Gemini video analysis")
            if model_name != "Gemini":
                print(f"DEBUG: Video requires Gemini - overriding {model_name} to Gemini")
                model_name = "Gemini"
            if not genai:
                result["message"] = "Gemini API required for video analysis but not configured."
                return result

            api_response, token_usage = call_gemini_video(prompt, asset_path, model_version)
            result["token_usage"] = token_usage
            result["asset_type"] = "video"
            if _gemini_response_blocked(api_response):
                # A blocked response is not a QC verdict; report it as an error
                print("DEBUG: ERROR - Gemini blocked the video response")
                print("=" * 100)
                result["message"] = api_response
                return result
            result["status"] = "success"
            result["response"] = api_response
            print("DEBUG: Video QC analysis completed successfully")
            print("=" * 100)
            return result

        # Image/PDF processing path
        # Load assets (imported lazily, only needed on this path)
        from visual_qc_apps.utils import get_image_from_asset
        print(f"DEBUG: Loading main asset image from: {asset_path}")
        pil_asset = get_image_from_asset(asset_path)
        if not pil_asset:
            print(f"DEBUG: ERROR - Could not load main asset: {asset_path}")
            result["message"] = f"Could not load or process asset file: {asset_path}"
            return result
        print(f"DEBUG: Main asset loaded successfully - dimensions: {pil_asset.size}, mode: {pil_asset.mode}")

        pil_ref = None
        if reference_path:
            print(f"DEBUG: Loading reference image from: {reference_path}")
            pil_ref = get_image_from_asset(reference_path)
            if not pil_ref:
                # A reference that cannot be loaded is not fatal
                print(f"DEBUG: WARNING - Could not load reference image: {reference_path}")
                print("DEBUG: Continuing without reference image")
            else:
                print(f"DEBUG: Reference image loaded successfully - dimensions: {pil_ref.size}, mode: {pil_ref.mode}")

        # Call appropriate API
        print(f"DEBUG: Calling {model_name} API for visual analysis")
        if model_name == "OpenAI":
            if not openai_client:
                print("DEBUG: ERROR - OpenAI client not configured")
                result["message"] = "OpenAI API Key or Client not configured."
                return result
            api_response, token_usage = call_openai_vision(prompt, pil_asset, pil_ref)
        elif model_name == "Gemini":
            if not genai:
                print("DEBUG: ERROR - Gemini client not configured")
                result["message"] = "Google Generative AI (Gemini) API Key or Client not configured."
                return result
            api_response, token_usage = call_gemini_vision(prompt, pil_asset, pil_ref, model_version)
        else:
            print(f"DEBUG: ERROR - Invalid model selected: {model_name}")
            result["message"] = "Invalid model selected."
            return result

        result["token_usage"] = token_usage
        if model_name == "Gemini" and _gemini_response_blocked(api_response):
            # A blocked response is not a QC verdict; report it as an error
            print("DEBUG: ERROR - Gemini blocked the response")
            print("=" * 100)
            result["message"] = api_response
            return result

        # Success
        print("DEBUG: Visual QC analysis completed successfully")
        print("=" * 100)
        result["status"] = "success"
        result["response"] = api_response
        return result
    except Exception as e:
        print(f"DEBUG: ERROR in visual QC analysis: {str(e)}")
        tb_str = traceback.format_exc()
        print(f"DEBUG: Full traceback:\n{tb_str}")
        print("=" * 100)
        result["message"] = f"Error: {str(e)}\n{tb_str}"
        return result