- Add Honda client with static_general and video_general profiles - Add video QC capability using Gemini native video analysis (4 checks: visual_quality, brand_consistency, text_legibility, pacing_flow) - Add video_general profile assigned to all 8 clients - Extend session lifetime with MSAL silent token refresh (proactive every 45min + reactive on expiry), switch cache to localStorage - Re-enable OCR layout measurements for Amazon checks - Add scope boundary notes to all 6 Amazon checks to prevent cross-check penalization (locale errors isolated to logo_country only) - Relax margins left-alignment tolerance from 1% to 4% to account for logo lockup internal padding - Update brand guidelines DB with Amazon localization matrix and processed Dove PDF summary Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
707 lines
No EOL
27 KiB
Python
Executable file
707 lines
No EOL
27 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Centralized LLM configuration module for Visual AI QC.
|
|
This script manages the configuration and interaction with different language models.
|
|
"""
|
|
|
|
import os
|
|
import io
|
|
import base64
|
|
import time
|
|
import traceback
|
|
from typing import Dict, List, Optional, Union, Any
|
|
from dataclasses import dataclass
|
|
from dotenv import load_dotenv
|
|
from PIL import Image
|
|
|
|
# Environment bootstrap: prefer a config.env sitting next to this module,
# otherwise defer to python-dotenv's default .env lookup / the process env.
config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.env')
if not os.path.exists(config_path):
    # No module-local config.env was found.
    load_dotenv()
    print("LLM Config: Trying to load from default .env file")
else:
    load_dotenv(config_path)
    print(f"LLM Config: Loaded environment variables from {config_path}")
|
|
|
|
# Single place to change which model identifiers the module uses.
@dataclass
class ModelVersions:
    """Holds the model identifier currently used for each provider slot."""

    # OpenAI vision-capable model (replaced the invalid gpt-4.1-2025-04-14 id).
    openai_vision: str = "gpt-4o"

    # Google Gemini, stable production model.
    gemini_vision: str = "gemini-2.5-pro"

    # Google Gemini, beta/experimental model used for testing.
    gemini_beta: str = "gemini-3-pro-preview"


# Shared, mutable instance consulted by the API call helpers in this module.
MODEL_VERSIONS = ModelVersions()
|
|
|
|
# Catalogue of selectable models, keyed by the identifier the UI submits.
AVAILABLE_MODELS = {
    "openai": {
        "name": "OpenAI GPT-4o",
        "model_id": "gpt-4o",
        "provider": "OpenAI",
        "status": "stable",
        "description": "OpenAI GPT-4o with vision capabilities",
    },
    "gemini": {
        "name": "Gemini 2.5 Pro",
        "model_id": "gemini-2.5-pro",
        "provider": "Gemini",
        "status": "stable",
        "description": "Google Gemini 2.5 Pro with multimodal capabilities",
    },
    # Only the beta entry carries a 'warning' key.
    "gemini_beta": {
        "name": "Gemini 3 Pro",
        "model_id": "gemini-3-pro-preview",
        "provider": "Gemini",
        "status": "beta",
        "description": "Latest Gemini 3 model - experimental, may be unstable",
        "warning": "This is a beta model and may produce unexpected results",
    },
}
|
|
|
|
# OpenAI client setup. `openai_client` stays None whenever the library is
# missing, the key is absent, or client construction fails.
openai_client = None
try:
    import openai

    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        print("Warning: OPENAI_API_KEY not found in environment variables.")
    else:
        openai.api_key = openai_api_key
        # Only the key length is printed, never the key itself.
        print(f"OpenAI API key loaded (length: {len(openai_api_key)})")
        openai_client = openai.OpenAI(api_key=openai_api_key)
except ImportError:
    print("Warning: openai library not installed. OpenAI functionality disabled.")
except Exception as e:
    print(f"Error configuring OpenAI: {e}")
    openai_client = None
|
|
|
|
# Gemini setup. `genai` is the configured google.generativeai module, or None
# when the library is missing, the key is absent, or configuration fails.
genai = None
try:
    import google.generativeai as genai

    google_api_key = os.getenv("GOOGLE_API_KEY")
    if not google_api_key:
        print("Warning: GOOGLE_API_KEY not found in environment variables. Gemini functionality may be limited.")
        genai = None
    else:
        genai.configure(api_key=google_api_key)
        # Only the key length is printed, never the key itself.
        print(f"Google API key loaded (length: {len(google_api_key)})")
except ImportError:
    print("Warning: google-generativeai library not installed. Gemini functionality disabled.")
    genai = None
except Exception as e:
    print(f"Error configuring Google Generative AI: {e}")
    genai = None
|
|
|
|
def pil_image_to_base64(pil_image: Image.Image, format: str = "jpeg") -> str:
    """
    Convert a PIL Image to a base64 encoded string.

    Args:
        pil_image: The image to encode. The caller's image is not modified.
        format: Target image format name (case-insensitive). "jpg" is
            accepted as an alias for "jpeg".

    Returns:
        str: The encoded image bytes as a base64 UTF-8 string.
    """
    target_format = format.upper()
    if target_format == "JPG":
        # Pillow registers the format as "JPEG"; "JPG" is not a valid name.
        target_format = "JPEG"

    # JPEG cannot store alpha or palette data; Pillow raises OSError for modes
    # such as RGBA/LA/P, so convert a copy to RGB before saving.
    if target_format == "JPEG" and pil_image.mode not in ("RGB", "L"):
        pil_image = pil_image.convert("RGB")

    buffered = io.BytesIO()
    pil_image.save(buffered, format=target_format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
|
|
|
def get_model_info() -> Dict[str, Dict[str, Any]]:
    """Describe each provider slot: availability, current model id and token limits."""
    # Both Gemini entries share the same availability flag and input window.
    gemini_ready = genai is not None
    gemini_input_limit = 1048576  # 1M token context

    openai_entry = {
        "available": openai_client is not None,
        "current_version": MODEL_VERSIONS.openai_vision,  # gpt-4o
        "max_input_tokens": 128000,
        "max_output_tokens": 16384,  # Updated for gpt-4o
        "notes": "GPT-4o with vision capabilities, supports images and text",
    }
    gemini_entry = {
        "available": gemini_ready,
        "current_version": MODEL_VERSIONS.gemini_vision,  # gemini-2.5-pro
        "max_input_tokens": gemini_input_limit,
        "max_output_tokens": 8192,
        "notes": "Gemini 2.5 Pro with vision, thinking, and multimodal capabilities",
    }
    gemini_beta_entry = {
        "available": gemini_ready,
        "current_version": MODEL_VERSIONS.gemini_beta,  # gemini-3-pro-preview
        "max_input_tokens": gemini_input_limit,
        "max_output_tokens": 8192,
        "status": "beta",
        "notes": "Gemini 3 Pro Preview - Beta model, may be unstable",
    }

    return {
        "OpenAI": openai_entry,
        "Gemini": gemini_entry,
        "Gemini_Beta": gemini_beta_entry,
    }
|
|
|
|
def get_available_models() -> Dict[str, Dict[str, Any]]:
    """Return the module-level model catalogue used for UI selection.

    The returned object is AVAILABLE_MODELS itself, not a copy.
    """
    catalogue = AVAILABLE_MODELS
    return catalogue
|
|
|
|
def update_model_version(provider: str, model_type: str, version: str) -> bool:
    """
    Update the model version for a specific provider and model type.

    Args:
        provider: The LLM provider ("OpenAI" or "Gemini")
        model_type: The type of model ("vision")
        version: The new model version to use; must be a non-blank string

    Returns:
        bool: True if update was successful, False otherwise (unknown
        provider/model type, or an empty/non-string version)
    """
    # Maps a (provider, model_type) pair to the MODEL_VERSIONS attribute it controls.
    targets = {
        ("OpenAI", "vision"): "openai_vision",
        ("Gemini", "vision"): "gemini_vision",
    }

    try:
        attr_name = targets.get((provider, model_type))
    except TypeError:
        # Unhashable arguments cannot match any known pair.
        attr_name = None

    if attr_name is None:
        print(f"Invalid provider or model type: {provider}/{model_type}")
        return False

    # A blank model id would only surface later as an opaque API error,
    # so reject it here instead of storing it.
    if not isinstance(version, str) or not version.strip():
        print(f"Error updating model version: invalid version {version!r}")
        return False

    # Mutates the shared instance in place; no rebinding, so no `global` needed.
    setattr(MODEL_VERSIONS, attr_name, version)
    print(f"Updated {provider} vision model to: {version}")
    return True
|
|
|
|
def call_openai_vision(
    prompt: str,
    pil_image_asset: Image.Image,
    pil_image_ref: Optional[Image.Image] = None,
    max_retries: int = 2,
    max_tokens: int = 1000,
) -> tuple:
    """
    Call OpenAI's vision model with the provided prompt and images.

    Args:
        prompt: The text prompt to send to the model
        pil_image_asset: The main image as a PIL Image
        pil_image_ref: Optional reference image as a PIL Image
        max_retries: Maximum number of attempts when rate-limit errors occur
            (at least one attempt is always made)
        max_tokens: Completion token limit passed to the API

    Returns:
        tuple: (response_text, token_usage) where token_usage is a dict with
        'prompt_tokens', 'completion_tokens' and 'total_tokens'

    Raises:
        ValueError: If the OpenAI client is not configured
        Exception: The last API error, re-raised after a non-rate-limit
            failure or once all attempts are exhausted
    """
    if not openai_client:
        raise ValueError("OpenAI API key or library not configured.")

    print("=" * 80)
    print("DEBUG: OpenAI Vision API Call")
    print("=" * 80)

    # Debug information about images being sent
    print(f"DEBUG: Main asset image dimensions: {pil_image_asset.size}")
    print(f"DEBUG: Main asset image mode: {pil_image_asset.mode}")

    if pil_image_ref:
        print(f"DEBUG: Reference image provided - dimensions: {pil_image_ref.size}")
        print(f"DEBUG: Reference image mode: {pil_image_ref.mode}")
        print("DEBUG: Total images being sent to LLM: 2 (main asset + reference)")
    else:
        print("DEBUG: No reference image provided")
        print("DEBUG: Total images being sent to LLM: 1 (main asset only)")

    # Debug prompt information
    print("DEBUG: Prompt being sent to LLM:")
    print("-" * 40)
    print(prompt)
    print("-" * 40)

    # With two images, prefix the prompt so the model knows which is which.
    enhanced_prompt = prompt
    if pil_image_ref:
        enhanced_prompt = (
            "You will receive two images for analysis:\n"
            "1. FIRST IMAGE: This is the main asset/file that needs to be quality checked\n"
            "2. SECOND IMAGE: This is the reference image/brand guideline to compare against\n"
            "\n"
            "Please analyze the first image (main asset) against the quality standards shown in the second image (reference/guideline).\n"
            "\n"
            "Original prompt:\n"
            f"{prompt}"
        )
        print("DEBUG: Enhanced prompt with image labeling:")
        print("-" * 40)
        print(enhanced_prompt)
        print("-" * 40)

    content = [
        {"type": "text", "text": enhanced_prompt}
    ]

    # Encode main asset
    base64_asset = pil_image_to_base64(pil_image_asset)
    content.append({
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{base64_asset}"}
    })
    print(f"DEBUG: Main asset encoded to base64 (length: {len(base64_asset)} chars)")

    # Encode reference asset if provided
    if pil_image_ref:
        base64_ref = pil_image_to_base64(pil_image_ref)
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_ref}"}
        })
        print(f"DEBUG: Reference image encoded to base64 (length: {len(base64_ref)} chars)")

    # Typed rate-limit error when the installed openai package exposes one;
    # the message check below remains as a fallback.
    rate_limit_error = getattr(openai, "RateLimitError", None)

    # Always try at least once, even if the caller passes max_retries <= 0.
    total_attempts = max(1, max_retries)
    last_exception = None

    for attempt in range(1, total_attempts + 1):
        try:
            # Use the specifically requested model version from the global config
            print(f"DEBUG: Calling OpenAI model: {MODEL_VERSIONS.openai_vision}")
            print("DEBUG: Content structure being sent:")
            for i, item in enumerate(content):
                if item["type"] == "text":
                    print(f"  - Item {i}: Text prompt ({len(item['text'])} chars)")
                elif item["type"] == "image_url":
                    print(f"  - Item {i}: Image data (base64 encoded)")

            response = openai_client.chat.completions.create(
                model=MODEL_VERSIONS.openai_vision,
                messages=[
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                max_tokens=max_tokens
            )

            response_content = response.choices[0].message.content

            # `usage` can be missing or None; fall back to zeros in either case.
            usage = getattr(response, 'usage', None)
            token_usage = {
                'prompt_tokens': getattr(usage, 'prompt_tokens', 0) or 0,
                'completion_tokens': getattr(usage, 'completion_tokens', 0) or 0,
                'total_tokens': getattr(usage, 'total_tokens', 0) or 0
            }

            print("DEBUG: OpenAI Response received:")
            print("-" * 40)
            print(response_content)
            print("-" * 40)
            print(f"DEBUG: Response length: {len(response_content) if response_content else 0} chars")
            print(f"DEBUG: Token usage - Prompt: {token_usage['prompt_tokens']}, Completion: {token_usage['completion_tokens']}, Total: {token_usage['total_tokens']}")
            print("=" * 80)

            return response_content, token_usage
        except Exception as e:
            last_exception = e
            is_rate_limited = (
                (rate_limit_error is not None and isinstance(e, rate_limit_error))
                or "rate limit" in str(e).lower()
            )
            if not is_rate_limited:
                # Don't retry on other errors
                print(f"OpenAI API error (non-rate-limit): {e}")
                break

            print(f"OpenAI Rate Limit Error: {e}. Retrying ({attempt}/{total_attempts})...")
            # Linear backoff (5s, 10s, ...); skip the wait when no attempt remains.
            if attempt < total_attempts:
                time.sleep(5 * attempt)

    # Reached only after a failure: the loop returns directly on success.
    print(f"OpenAI API call failed after retries: {last_exception}")
    raise last_exception
|
|
|
|
def call_gemini_vision(
    prompt: str,
    pil_image_asset: Image.Image,
    pil_image_ref: Optional[Image.Image] = None,
    model_version: Optional[str] = None
) -> tuple:
    """
    Call Google's Gemini vision model with the provided prompt and images.

    Args:
        prompt: The text prompt to send to the model
        pil_image_asset: The main image as a PIL Image
        pil_image_ref: Optional reference image as a PIL Image
        model_version: Optional model version override (e.g., 'gemini-3-pro-preview')

    Returns:
        tuple: (response_text, token_usage) where token_usage is a dict with
        'prompt_tokens', 'completion_tokens' and 'total_tokens'. When the
        response contains no parts (e.g. blocked), response_text is an
        "Error: ..." message and all token counts are zero.

    Raises:
        ValueError: If the Gemini library or GOOGLE_API_KEY is not available
        Exception: Any error raised by the Gemini API call is re-raised
    """
    if not genai:
        raise ValueError("Google Generative AI library not configured or API key missing.")

    print("=" * 80)
    print("DEBUG: Gemini Vision API Call")
    print("=" * 80)

    # Debug information about images being sent
    print(f"DEBUG: Main asset image dimensions: {pil_image_asset.size}")
    print(f"DEBUG: Main asset image mode: {pil_image_asset.mode}")

    if pil_image_ref:
        print(f"DEBUG: Reference image provided - dimensions: {pil_image_ref.size}")
        print(f"DEBUG: Reference image mode: {pil_image_ref.mode}")
        print("DEBUG: Total images being sent to LLM: 2 (main asset + reference)")
    else:
        print("DEBUG: No reference image provided")
        print("DEBUG: Total images being sent to LLM: 1 (main asset only)")

    # Debug prompt information
    print("DEBUG: Original prompt being sent to LLM:")
    print("-" * 40)
    print(prompt)
    print("-" * 40)

    # Re-read the key at call time and re-apply it before every request.
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("Google API key not found in environment variables")

    genai.configure(api_key=api_key)

    try:
        # Use specified model version or default to stable
        model_to_use = model_version or MODEL_VERSIONS.gemini_vision
        model = genai.GenerativeModel(model_to_use)
        print(f"DEBUG: Using Gemini model: {model_to_use}")

        # With two images, prefix the prompt so the model knows which is which.
        enhanced_prompt = prompt
        if pil_image_ref:
            enhanced_prompt = (
                "You will receive two images for analysis:\n"
                "1. FIRST IMAGE: This is the main asset/file that needs to be quality checked\n"
                "2. SECOND IMAGE: This is the reference image/brand guideline to compare against\n"
                "\n"
                "Please analyze the first image (main asset) against the quality standards shown in the second image (reference/guideline).\n"
                "\n"
                "Original prompt:\n"
                f"{prompt}"
            )
            print("DEBUG: Enhanced prompt with image labeling:")
            print("-" * 40)
            print(enhanced_prompt)
            print("-" * 40)

        # Parts order: text first, then main asset, then optional reference.
        parts = [enhanced_prompt]
        parts.append(pil_image_asset)  # Gemini client handles PIL images directly
        print(f"DEBUG: Added main asset image to parts list (dimensions: {pil_image_asset.size})")

        if pil_image_ref:
            parts.append(pil_image_ref)
            print(f"DEBUG: Added reference image to parts list (dimensions: {pil_image_ref.size})")

        print(f"DEBUG: Total parts being sent to Gemini: {len(parts)} (1 text + {len(parts)-1} images)")

        response = model.generate_content(parts)

        # An empty parts list is treated as a blocked response.
        if not response.parts:
            safety_info = response.prompt_feedback if hasattr(response, 'prompt_feedback') else "No specific feedback provided."
            error_msg = f"Error: Gemini response blocked due to safety settings or other reasons. Feedback: {safety_info}"
            print("DEBUG: Gemini Response blocked by safety filters:")
            print("-" * 40)
            print(error_msg)
            print("-" * 40)
            print("=" * 80)
            # Return error with zero tokens
            return error_msg, {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

        response_text = response.text

        # Extract token usage from Gemini response
        token_usage = {
            'prompt_tokens': 0,
            'completion_tokens': 0,
            'total_tokens': 0
        }

        if hasattr(response, 'usage_metadata'):
            token_usage['prompt_tokens'] = getattr(response.usage_metadata, 'prompt_token_count', 0)
            token_usage['completion_tokens'] = getattr(response.usage_metadata, 'candidates_token_count', 0)
            token_usage['total_tokens'] = getattr(response.usage_metadata, 'total_token_count', 0)

        print("DEBUG: Gemini Response received successfully:")
        print("-" * 40)
        print(response_text)
        print("-" * 40)
        print(f"DEBUG: Response length: {len(response_text) if response_text else 0} chars")
        print(f"DEBUG: Token usage - Prompt: {token_usage['prompt_tokens']}, Completion: {token_usage['completion_tokens']}, Total: {token_usage['total_tokens']}")
        print("=" * 80)
        return response_text, token_usage
    except Exception as e:
        print(f"Gemini API call failed: {e}")
        error_detail = str(e)
        # Check for common error messages
        if "API key not valid" in error_detail:
            print(f"Invalid API key (length: {len(api_key)})")
        elif "Quota exceeded" in error_detail:
            print("API quota exceeded")
        # Bare raise keeps the original traceback intact.
        raise
|
|
|
|
# Lower-case file extension (with leading dot) -> MIME type sent on upload.
VIDEO_MIME_TYPES = {
    ".mp4": "video/mp4",
    ".avi": "video/x-msvideo",
    ".mov": "video/quicktime",
    ".mkv": "video/x-matroska",
    ".wmv": "video/x-ms-wmv",
    ".flv": "video/x-flv",
    ".webm": "video/webm",
}

# Recognised video extensions, in the same order as the MIME table above.
VIDEO_EXTENSIONS = list(VIDEO_MIME_TYPES)
|
|
|
|
|
|
def is_video_file(file_path: str) -> bool:
    """Return True when the path's extension (case-insensitive) is a known video extension."""
    _, extension = os.path.splitext(file_path)
    return extension.lower() in VIDEO_EXTENSIONS
|
|
|
|
|
|
def call_gemini_video(
    prompt: str,
    video_path: str,
    model_version: Optional[str] = None,
    max_wait: int = 120
) -> tuple:
    """
    Call Google's Gemini model with a video file using the File Upload API.

    The video is uploaded, polled until it leaves the PROCESSING state (or
    max_wait elapses), sent to the model together with the prompt, and then
    deleted from Gemini again. The delete is attempted on every exit path.

    Args:
        prompt: The text prompt to send to the model
        video_path: Path to the video file on disk
        model_version: Optional model version override
        max_wait: Maximum number of seconds to wait for Gemini to finish
            processing the upload

    Returns:
        tuple: (response_text, token_usage_dict). When the response contains
        no parts (e.g. blocked), response_text is an "Error: ..." message and
        all token counts are zero.

    Raises:
        ValueError: If Gemini is not configured, the API key is missing, or
            Gemini reports the video processing as FAILED
        Exception: Any other error from the upload or generation calls
    """
    if not genai:
        raise ValueError("Google Generative AI library not configured or API key missing.")

    print("=" * 80)
    print("DEBUG: Gemini Video API Call")
    print("=" * 80)
    print(f"DEBUG: Video file path: {video_path}")

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("Google API key not found in environment variables")

    genai.configure(api_key=api_key)

    # Set once the upload succeeds so `finally` knows whether to clean up.
    video_file = None
    try:
        # Upload the video file to Gemini; unknown extensions fall back to video/mp4.
        file_ext = os.path.splitext(video_path)[1].lower()
        mime_type = VIDEO_MIME_TYPES.get(file_ext, 'video/mp4')
        file_size_mb = os.path.getsize(video_path) / (1024 * 1024)

        print(f"DEBUG: Uploading video ({file_size_mb:.1f} MB, {mime_type})...")
        video_file = genai.upload_file(video_path, mime_type=mime_type)
        print(f"DEBUG: Video uploaded successfully: {video_file.name}")

        # Wait for file to be processed (Gemini needs time for video processing)
        poll_interval = 5  # seconds between state checks
        waited = 0
        while video_file.state.name == "PROCESSING" and waited < max_wait:
            print(f"DEBUG: Video processing... ({waited}s)")
            time.sleep(poll_interval)
            video_file = genai.get_file(video_file.name)
            waited += poll_interval

        if video_file.state.name == "FAILED":
            raise ValueError(f"Gemini video processing failed: {video_file.state.name}")

        if video_file.state.name != "ACTIVE":
            # Deliberate best effort: try the request even if the wait timed out.
            print(f"DEBUG: Warning - video state is {video_file.state.name} after {waited}s, proceeding anyway")

        print(f"DEBUG: Video ready for analysis (state: {video_file.state.name})")

        # Use specified model version or default to stable
        model_to_use = model_version or MODEL_VERSIONS.gemini_vision
        model = genai.GenerativeModel(model_to_use)
        print(f"DEBUG: Using Gemini model: {model_to_use}")

        # Send video + prompt to Gemini
        parts = [prompt, video_file]
        print(f"DEBUG: Sending prompt ({len(prompt)} chars) + video to Gemini")

        response = model.generate_content(parts)

        # An empty parts list is treated as a blocked response.
        if not response.parts:
            safety_info = response.prompt_feedback if hasattr(response, 'prompt_feedback') else "No feedback"
            error_msg = f"Error: Gemini response blocked. Feedback: {safety_info}"
            print(f"DEBUG: {error_msg}")
            return error_msg, {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

        response_text = response.text

        # Extract token usage
        token_usage = {
            'prompt_tokens': 0,
            'completion_tokens': 0,
            'total_tokens': 0
        }
        if hasattr(response, 'usage_metadata'):
            token_usage['prompt_tokens'] = getattr(response.usage_metadata, 'prompt_token_count', 0)
            token_usage['completion_tokens'] = getattr(response.usage_metadata, 'candidates_token_count', 0)
            token_usage['total_tokens'] = getattr(response.usage_metadata, 'total_token_count', 0)

        print("DEBUG: Gemini Video Response received:")
        print("-" * 40)
        # Only the first 500 characters are logged.
        print(response_text[:500] + ("..." if len(response_text) > 500 else ""))
        print("-" * 40)
        print(f"DEBUG: Token usage - Prompt: {token_usage['prompt_tokens']}, Completion: {token_usage['completion_tokens']}, Total: {token_usage['total_tokens']}")

        print("=" * 80)
        return response_text, token_usage

    except Exception as e:
        print(f"Gemini Video API call failed: {e}")
        # Bare raise keeps the original traceback intact.
        raise

    finally:
        # Delete the upload on success, blocked response and failure alike,
        # so failed runs do not leave files behind on Gemini's servers.
        if video_file is not None:
            try:
                genai.delete_file(video_file.name)
                print("DEBUG: Cleaned up uploaded video file from Gemini")
            except Exception as cleanup_err:
                print(f"DEBUG: Warning - could not clean up video file: {cleanup_err}")
|
|
|
|
|
|
def run_visual_qc(
    prompt: str,
    asset_path: str,
    reference_path: Optional[str] = None,
    model_name: str = "Gemini",
    model_version: Optional[str] = None
) -> Dict[str, Any]:
    """
    Run visual QC analysis on an asset using the specified model.

    Video assets (detected by file extension) are always analysed with
    Gemini, regardless of model_name. Other assets are loaded as images and
    sent to the selected provider. Errors are reported through the returned
    dict rather than raised.

    Args:
        prompt (str): The QC prompt to send to the model
        asset_path (str): Path to the asset file
        reference_path (str, optional): Path to reference image if needed
        model_name (str): Which model to use ("Gemini" or "OpenAI")
        model_version (str, optional): Specific model version override
            (passed to the Gemini calls only)

    Returns:
        dict: Result dictionary with keys:
            "status": "success" or "error"
            "message": error description (empty on success)
            "response": the model's response text (empty on error)
            "token_usage": token count dict (present on success only)
            "asset_type": "video" (present for video assets only)
    """
    print("\n" + "=" * 100)
    print("DEBUG: Visual QC Analysis Starting")
    print("=" * 100)
    print(f"DEBUG: Model selected: {model_name}")
    print(f"DEBUG: Asset file path: {asset_path}")
    if reference_path:
        print(f"DEBUG: Reference file path: {reference_path}")
    else:
        print("DEBUG: No reference file provided")
    print(f"DEBUG: Prompt length: {len(prompt)} characters")

    # Starts as an error result; flipped to success only after an API call returns.
    result = {
        "status": "error",
        "message": "",
        "response": ""
    }

    try:
        # Validate inputs
        if not os.path.exists(asset_path):
            print(f"DEBUG: ERROR - Asset file does not exist: {asset_path}")
            result["message"] = f"Asset file not found: {asset_path}"
            return result

        if reference_path and not os.path.exists(reference_path):
            print(f"DEBUG: ERROR - Reference file does not exist: {reference_path}")
            result["message"] = f"Reference file not found: {reference_path}"
            return result

        print("DEBUG: File validation passed - both files exist")

        # Check if this is a video file - use Gemini's native video API
        if is_video_file(asset_path):
            print("DEBUG: Video file detected - using Gemini video analysis")
            if model_name != "Gemini":
                print(f"DEBUG: Video requires Gemini - overriding {model_name} to Gemini")
                model_name = "Gemini"
            if not genai:
                result["message"] = "Gemini API required for video analysis but not configured."
                return result
            api_response, token_usage = call_gemini_video(prompt, asset_path, model_version)
            result["status"] = "success"
            result["response"] = api_response
            result["token_usage"] = token_usage
            result["asset_type"] = "video"
            print("DEBUG: Video QC analysis completed successfully")
            print("=" * 100)
            return result

        # Image/PDF processing path
        # Imported lazily, only when an image asset is actually processed.
        from visual_qc_apps.utils import get_image_from_asset

        print(f"DEBUG: Loading main asset image from: {asset_path}")
        pil_asset = get_image_from_asset(asset_path)
        if not pil_asset:
            print(f"DEBUG: ERROR - Could not load main asset: {asset_path}")
            result["message"] = f"Could not load or process asset file: {asset_path}"
            return result
        print(f"DEBUG: Main asset loaded successfully - dimensions: {pil_asset.size}, mode: {pil_asset.mode}")

        # A reference that fails to load is not fatal; analysis continues without it.
        pil_ref = None
        if reference_path:
            print(f"DEBUG: Loading reference image from: {reference_path}")
            pil_ref = get_image_from_asset(reference_path)
            if not pil_ref:
                print(f"DEBUG: WARNING - Could not load reference image: {reference_path}")
                print("DEBUG: Continuing without reference image")
            else:
                print(f"DEBUG: Reference image loaded successfully - dimensions: {pil_ref.size}, mode: {pil_ref.mode}")

        # Call appropriate API
        print(f"DEBUG: Calling {model_name} API for visual analysis")

        if model_name == "OpenAI":
            if not openai_client:
                print("DEBUG: ERROR - OpenAI client not configured")
                result["message"] = "OpenAI API Key or Client not configured."
                return result
            api_response, token_usage = call_openai_vision(prompt, pil_asset, pil_ref)
        elif model_name == "Gemini":
            if not genai:
                print("DEBUG: ERROR - Gemini client not configured")
                result["message"] = "Google Generative AI (Gemini) API Key or Client not configured."
                return result
            api_response, token_usage = call_gemini_vision(prompt, pil_asset, pil_ref, model_version)
        else:
            print(f"DEBUG: ERROR - Invalid model selected: {model_name}")
            result["message"] = "Invalid model selected."
            return result

        # Success
        print("DEBUG: Visual QC analysis completed successfully")
        print("=" * 100)
        result["status"] = "success"
        result["response"] = api_response
        result["token_usage"] = token_usage
        return result

    except Exception as e:
        # Top-level boundary: convert any failure into an error result
        # that carries the exception text and full traceback.
        print(f"DEBUG: ERROR in visual QC analysis: {str(e)}")
        tb_str = traceback.format_exc()
        print(f"DEBUG: Full traceback:\n{tb_str}")
        print("=" * 100)
        result["message"] = f"Error: {str(e)}\n{tb_str}"
        return result