Batch 3: Add title legibility check, Google Gemini support, LLM provider selector

- Update image quality prompt to evaluate text/title legibility
- Add Google Gemini (generativeai) as LLM provider in LLMConfig
- Add AI Provider dropdown on configure page (OpenAI GPT-4o / Google Gemini)
- Pass selected provider through execute routes to override profile defaults
- Add google-generativeai to requirements.txt

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
nickviljoen 2026-03-21 16:53:07 +02:00
parent 1c582ffcf4
commit 91dec41e0b
5 changed files with 105 additions and 5 deletions

View file

@ -32,6 +32,10 @@ class LLMConfig:
'api_key_env': 'OPENAI_API_KEY',
'models': ['gpt-4o', 'gpt-4-turbo', 'gpt-4', 'gpt-3.5-turbo']
},
'google': {
'api_key_env': 'GOOGLE_API_KEY',
'models': ['gemini-2.0-flash', 'gemini-2.5-pro', 'gemini-1.5-pro']
},
'anthropic': {
'api_key_env': 'ANTHROPIC_API_KEY',
'models': ['claude-3-opus-20240229', 'claude-3-sonnet-20240229', 'claude-3-haiku-20240307']
@ -104,6 +108,12 @@ class LLMConfig:
api_key = os.getenv('ANTHROPIC_API_KEY')
return Anthropic(api_key=api_key)
elif provider == 'google':
import google.generativeai as genai
api_key = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=api_key)
return genai
elif provider == 'azure_openai':
from openai import AzureOpenAI
api_key = os.getenv('AZURE_OPENAI_API_KEY')
@ -173,6 +183,10 @@ class LLMConfig:
response = cls._call_openai_vision(
prompt, image_asset_b64, image_ref_b64, model, debug
)
elif provider == 'google':
response = cls._call_google_vision(
prompt, image_asset, image_ref, model, debug
)
elif provider == 'anthropic':
response = cls._call_anthropic_vision(
prompt, image_asset_b64, image_ref_b64, model, debug
@ -250,6 +264,55 @@ class LLMConfig:
'tokens_used': response.usage.total_tokens if hasattr(response, 'usage') else None
}
@classmethod
def _call_google_vision(
    cls,
    prompt: str,
    image_asset: Any,
    image_ref: Any,
    model: str,
    debug: bool
) -> Dict[str, Any]:
    """Call the Google Gemini vision API with a prompt and one or two images.

    Args:
        prompt: Instruction text, sent as the first content part.
        image_asset: Image to evaluate. Accepted forms are a PIL image,
            a path to an existing file, or raw encoded image bytes.
        image_ref: Optional reference image in any of the forms accepted
            for ``image_asset``. ``None`` means no reference. A value
            that cannot be loaded is skipped rather than treated as an
            error.
        model: Gemini model name handed to ``GenerativeModel``.
        debug: Accepted so the signature matches the other provider
            calls; not used by this method.

    Returns:
        A dict with the keys ``text``, ``model``, ``provider`` (always
        ``'google'``) and ``tokens_used`` (``None`` when the response
        carries no total token count).

    Raises:
        ConfigurationError: If ``image_asset`` cannot be loaded.
    """

    def _load_image(source: Any) -> Any:
        """Return a PIL image for a supported source, otherwise None."""
        if isinstance(source, Image.Image):
            return source
        if isinstance(source, str) and os.path.exists(source):
            return Image.open(source)
        if isinstance(source, bytes):
            return Image.open(BytesIO(source))
        return None

    genai = cls.get_client('google', model)

    pil_image = _load_image(image_asset)
    if pil_image is None:
        raise ConfigurationError("Could not load image for Google Vision API")

    # Content parts: the prompt first, then the image(s).
    contents = [prompt, pil_image]

    # The reference image goes through the same loader as the primary
    # image, so a bytes reference is no longer silently dropped.
    if image_ref is not None:
        ref_image = _load_image(image_ref)
        if ref_image is not None:
            contents.append(ref_image)

    gen_model = genai.GenerativeModel(model)
    response = gen_model.generate_content(contents)

    # usage_metadata may be absent; chain getattr so a missing attribute
    # at either level yields None instead of raising.
    usage = getattr(response, 'usage_metadata', None)
    tokens_used = getattr(usage, 'total_token_count', None)

    return {
        'text': response.text,
        'model': model,
        'provider': 'google',
        'tokens_used': tokens_used
    }
@classmethod
def _call_anthropic_vision(
cls,

View file

@ -143,12 +143,18 @@ EVALUATION CRITERIA:
b. Proper exposure (not too dark or too bright)
c. Consistent white balance
3. Composition and Framing:
3. Text and Title Legibility:
a. All text and titles must be clear, legible, and properly rendered
b. No cut-off, overlapping, or unreadable text
c. Font sizes must be appropriate and readable at intended display size
d. Text must have sufficient contrast against its background
4. Composition and Framing:
a. Subject properly framed and positioned
b. Appropriate negative space
c. No unwanted elements or distractions
4. Professional Standards:
5. Professional Standards:
a. Meets commercial photography standards
b. Suitable for marketing/advertising use
c. Consistent with H&M brand aesthetics
@ -163,8 +169,9 @@ SCORING GUIDANCE:
STEPS TO EVALUATE:
1. Assess overall image quality and technical aspects
2. Evaluate color accuracy and lighting
3. Check composition and framing
4. Determine if image meets professional standards for H&M marketing
3. Check that all text and titles are clear and legible
4. Check composition and framing
5. Determine if image meets professional standards for H&M marketing
YOUR OUTPUT MUST INCLUDE:
Format your response as JSON (you can include explanatory text before/after the JSON):

View file

@ -186,6 +186,7 @@ def execute():
session_id = data.get('session_id')
profile_name = data.get('profile')
job_number = data.get('job_number')
llm_provider = data.get('llm_provider')
if not session_id or not profile_name:
return jsonify({'error': 'Missing required parameters'}), 400
@ -209,6 +210,14 @@ def execute():
if not profile:
return jsonify({'error': f'Profile "{profile_name}" not found'}), 404
# Override LLM provider if user selected one
if llm_provider:
provider_models = {'openai': 'gpt-4o', 'google': 'gemini-2.0-flash'}
for check in profile.get('checks', []):
if check.get('llm_provider'):
check['llm_provider'] = llm_provider
check['llm_model'] = provider_models.get(llm_provider, check.get('llm_model'))
logger.info(f"Starting QC execution for session {session_id} with profile {profile_name}")
executor = QCExecutor(
@ -262,6 +271,7 @@ def execute_batch():
session_id = data.get('session_id')
profile_name = data.get('profile')
job_number = data.get('job_number')
llm_provider = data.get('llm_provider')
if not session_id or not profile_name:
return jsonify({'error': 'Missing required parameters'}), 400
@ -283,6 +293,14 @@ def execute_batch():
if not profile:
return jsonify({'error': f'Profile "{profile_name}" not found'}), 404
# Override LLM provider if user selected one
if llm_provider:
provider_models = {'openai': 'gpt-4o', 'google': 'gemini-2.0-flash'}
for check in profile.get('checks', []):
if check.get('llm_provider'):
check['llm_provider'] = llm_provider
check['llm_model'] = provider_models.get(llm_provider, check.get('llm_model'))
logger.info(f"Starting batch QC for {len(files)} files (session: {session_id})")
batch_executor = BatchQCExecutor(

View file

@ -39,6 +39,15 @@
{% endfor %}
</div>
<div class="mb-3">
<label for="llmProvider" class="form-label">AI Provider</label>
<select class="form-select" id="llmProvider">
<option value="openai" selected>OpenAI GPT-4o</option>
<option value="google">Google Gemini</option>
</select>
<div class="form-text">Select which AI model to use for image quality analysis</div>
</div>
<div class="mb-3">
<label for="jobNumber" class="form-label">Job Number (Optional)</label>
<input type="text" class="form-control" id="jobNumber"
@ -89,6 +98,7 @@
const profile = document.getElementById('profile').value;
const jobNumber = document.getElementById('jobNumber').value;
const llmProvider = document.getElementById('llmProvider').value;
const startBtn = document.getElementById('startBtn');
if (!profile) {
@ -115,7 +125,8 @@
body: JSON.stringify({
session_id: sessionId,
profile: profile,
job_number: jobNumber || null
job_number: jobNumber || null,
llm_provider: llmProvider
})
});

View file

@ -21,6 +21,7 @@ beautifulsoup4==4.12.2
# LLM Providers
openai>=1.12.0
anthropic>=0.18.0
google-generativeai>=0.5.0
# Video Processing (Video QC + Video Master)
opencv-python>=4.8.0