ai_qc/backend/ocr_measurement.py
nickviljoen 9eed569587 Tone down OCR from authoritative to supplementary to reduce false positives
OCR measurements were causing the LLM to over-rely on bounding box numbers
and fail correct assets on minor measurement inaccuracies. Changes:
- All prompts now say "supplementary data" not "authoritative/primary source"
- LLM instructed to prioritise visual assessment, use OCR to confirm/question
- Alignment tolerance widened from 1.5% to 3% of width
- OCR context footer softened with accuracy caveat (~5-10px margin of error)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 13:54:00 +02:00

429 lines
16 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
OCR Layout Measurement Module
Uses Tesseract OCR to detect text elements and compute precise pixel-level
layout measurements (margins, spacing, alignment) for QC checks.
This supplements LLM visual estimation with hard numbers, enabling detection
of subtle spacing and alignment issues that vision models cannot reliably see.
Requires: pytesseract Python package + Tesseract OCR binary installed on system.
- macOS: brew install tesseract
- Ubuntu: sudo apt install tesseract-ocr
- pip install pytesseract
"""
import re
from PIL import Image
# Identifiers of the QC checks that benefit from OCR measurements.
# NOTE(review): nothing in this module reads this list; it is presumably
# consumed by the caller that decides when to inject the OCR context into a
# check prompt — confirm against the call site.
OCR_RELEVANT_CHECKS = [
'amazon_margins',
'amazon_typography',
'amazon_headline_layout',
'element_alignment',
'text_edge_clearance',
'safety_area',
'visual_hierarchy',
'visual_hierarchy_general',
'text_readability',
'text_readability_general',
]
# Month names in multiple languages, written as a bare regex alternation
# fragment (no enclosing group). _identify_elements wraps it in a
# non-capturing group after a day number and compiles it with re.IGNORECASE,
# so the casing used here does not matter.
# Rows appear to be, in order: English (full names, then abbreviations),
# Italian, German, French, Spanish, Dutch, Swedish and Polish. Some rows list
# only part of the year (e.g. the German row has no April/August/September/
# November); those spellings already occur in an earlier row, and duplicate
# alternatives across rows are harmless.
_MONTH_PATTERN = (
r'January|February|March|April|May|June|July|August|September|October|November|December'
r'|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec'
r'|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|novembre|dicembre'
r'|Januar|Februar|März|Mai|Juni|Juli|Oktober|Dezember'
r'|janvier|f[eé]vrier|mars|avril|mai|juin|juillet|ao[uû]t|septembre|octobre|novembre|d[eé]cembre'
r'|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre'
r'|januari|februari|maart|april|mei|juni|juli|augustus|oktober|december'
r'|januari|februari|mars|april|maj|juni|juli|augusti|september|oktober|november|december'
r'|styczeń|luty|marzec|kwiecień|maj|czerwiec|lipiec|sierpień|wrzesień|październik|listopad|grudzień'
)
def run_ocr_measurement(image_path):
    """
    Measure the text layout of an image using Tesseract OCR.

    On success the result is a dict with four keys:
    - elements: text elements identified in the image, with positions
    - measurements: margins, spacing and alignment derived from them
    - context: the measurements rendered as text for prompt injection
    - image_dimensions: {'width': ..., 'height': ...} in pixels

    None is returned when no text is detected or when any step raises.
    """
    try:
        # Only the dimensions are needed here; the detector opens the file itself.
        with Image.open(image_path) as image:
            width, height = image.size

        text_blocks = _tesseract_detect(image_path, width, height)
        if text_blocks:
            identified = _identify_elements(text_blocks, width, height)
            layout = _calculate_measurements(identified, width, height)
            return {
                'elements': identified,
                'measurements': layout,
                'context': _build_measurement_context(layout),
                'image_dimensions': {'width': width, 'height': height}
            }
        return None
    except Exception as e:
        # Best-effort: OCR is supplementary, so a failure is reported and swallowed.
        print(f"OCR Measurement: Error processing {image_path}: {e}")
        return None
def _tesseract_detect(image_path, img_width, img_height):
"""
Run Tesseract OCR to detect text blocks with bounding boxes.
Groups words into blocks and returns position data.
"""
try:
import pytesseract
except ImportError:
print("OCR Measurement: pytesseract not installed (pip install pytesseract)")
return []
try:
img = Image.open(image_path)
# Get word-level bounding boxes from Tesseract
data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
# Group words by block + line for finer element separation
# (Tesseract often groups date + logo into one block)
block_groups = {}
for i in range(len(data['text'])):
text = data['text'][i].strip()
conf = int(data['conf'][i])
if not text or conf < 30: # Skip low confidence detections
continue
# Use block_num + line_num as key for finer grouping
group_key = (data['block_num'][i], data['line_num'][i])
if group_key not in block_groups:
block_groups[group_key] = {
'words': [],
'lefts': [],
'tops': [],
'rights': [],
'bottoms': [],
'word_heights': [],
}
bg = block_groups[group_key]
bg['words'].append(text)
left = data['left'][i]
top = data['top'][i]
w = data['width'][i]
h = data['height'][i]
bg['lefts'].append(left)
bg['tops'].append(top)
bg['rights'].append(left + w)
bg['bottoms'].append(top + h)
bg['word_heights'].append(h)
blocks = []
for group_key, bg in block_groups.items():
if not bg['words']:
continue
block_text = ' '.join(bg['words'])
left = min(bg['lefts'])
top = min(bg['tops'])
right = max(bg['rights'])
bottom = max(bg['bottoms'])
# Use median word height as character height (more robust than block height)
sorted_wh = sorted(bg['word_heights'])
char_height = sorted_wh[len(sorted_wh) // 2]
blocks.append({
'text': block_text,
'left': left,
'right': right,
'top': top,
'bottom': bottom,
'width': right - left,
'height': bottom - top,
'char_height': char_height,
'left_pct': round(left / img_width * 100, 1) if img_width else 0,
'top_pct': round(top / img_height * 100, 1) if img_height else 0,
'bottom_pct': round(bottom / img_height * 100, 1) if img_height else 0,
})
print(f"OCR Measurement: Tesseract detected {len(blocks)} text blocks")
return blocks
except Exception as e:
print(f"OCR Measurement: Tesseract error: {e}")
return []
def _identify_elements(blocks, img_width, img_height):
"""
Identify which text blocks are headline, date, legal, logo.
Strategy:
- Headline: largest character height text in the top portion
- Date: matches date patterns (digits + month or digit-digit)
- Legal: smallest text, bottom area
- Logo: contains "amazon" or "prime", bottom area
"""
if not blocks:
return {}
elements = {}
remaining = list(blocks)
# 1. Find headline blocks: largest char_height text in top 70%
top_blocks = [b for b in remaining if b['top_pct'] < 70]
if top_blocks:
max_char_height = max(b['char_height'] for b in top_blocks)
# Headline blocks are those with char_height >= 70% of the max
headline_blocks = sorted(
[b for b in top_blocks if b['char_height'] >= max_char_height * 0.7],
key=lambda b: b['top']
)
if headline_blocks:
all_text = ' '.join(b['text'] for b in headline_blocks)
combined_left = min(b['left'] for b in headline_blocks)
combined_right = max(b['right'] for b in headline_blocks)
combined_top = min(b['top'] for b in headline_blocks)
combined_bottom = max(b['bottom'] for b in headline_blocks)
elements['headline'] = {
'text': all_text,
'left': combined_left,
'right': combined_right,
'top': combined_top,
'bottom': combined_bottom,
'char_height': max_char_height,
'line_count': len(headline_blocks),
}
for b in headline_blocks:
if b in remaining:
remaining.remove(b)
# 2. Find date: contains number patterns like "8-11" or "8 11" with month
date_pattern = re.compile(
r'\d+\s*[-–—]\s*\d+|'
r'\d+\s+(?:' + _MONTH_PATTERN + r')',
re.IGNORECASE
)
for block in remaining[:]:
if date_pattern.search(block['text']):
elements['date'] = block
remaining.remove(block)
break
# 3. Find logo/branding: contains "amazon" or "prime day"
for block in remaining[:]:
if re.search(r'amazon|prime\s*day', block['text'], re.IGNORECASE):
elements['logo_text'] = block
remaining.remove(block)
break
# 4. Find legal: smallest text in bottom 25%
bottom_blocks = [b for b in remaining if b['top_pct'] > 75]
if bottom_blocks:
legal_block = min(bottom_blocks, key=lambda b: b['char_height'])
elements['legal'] = legal_block
if legal_block in remaining:
remaining.remove(legal_block)
elements['other'] = remaining
return elements
def _calculate_measurements(elements, img_width, img_height):
    """
    Derive layout measurements from the identified text elements.

    The result always carries image_width, image_height, format and
    shortest_side. Depending on which of 'headline', 'date', 'logo_text' and
    'legal' are present in elements, it also carries a per-element dict of
    margins/sizes, the headline-to-date gap, the date-to-headline character
    height ratio, and a left-alignment comparison (when at least two of
    headline/date/logo_text exist; "aligned" means the left edges differ by
    no more than 3% of the image width).
    """
    shortest_side = min(img_width, img_height)
    m = {
        'image_width': img_width,
        'image_height': img_height,
        'format': _detect_format(img_width, img_height),
        'shortest_side': shortest_side,
    }

    headline = elements.get('headline')
    date = elements.get('date')
    legal = elements.get('legal')
    logo = elements.get('logo_text')

    if headline:
        h_left, h_top = headline['left'], headline['top']
        h_right, h_bottom = headline['right'], headline['bottom']
        right_gap = img_width - h_right
        m['headline'] = {
            'text': headline.get('text', ''),
            'left_margin_px': h_left,
            'left_margin_pct': _pct(h_left, img_width),
            'left_margin_shortest_side_pct': _pct(h_left, shortest_side),
            'right_margin_px': right_gap,
            'right_margin_pct': _pct(right_gap, img_width),
            'top_margin_px': h_top,
            'top_margin_pct': _pct(h_top, img_height),
            'top_margin_shortest_side_pct': _pct(h_top, shortest_side),
            'width_pct': _pct(h_right - h_left, img_width),
            'bottom_edge_px': h_bottom,
            'bottom_edge_pct': _pct(h_bottom, img_height),
            'char_height_px': headline.get('char_height', 0),
        }

    if date:
        d_left, d_top = date['left'], date['top']
        m['date'] = {
            'text': date.get('text', ''),
            'left_margin_px': d_left,
            'left_margin_pct': _pct(d_left, img_width),
            'left_margin_shortest_side_pct': _pct(d_left, shortest_side),
            'top_px': d_top,
            'top_pct': _pct(d_top, img_height),
            'char_height_px': date.get('char_height', 0),
        }

    if logo:
        m['logo_text'] = {
            'text': logo.get('text', ''),
            'left_margin_px': logo['left'],
            'left_margin_pct': _pct(logo['left'], img_width),
        }

    if legal:
        m['legal'] = {
            'text': legal.get('text', ''),
            'left_margin_px': legal['left'],
            'left_margin_pct': _pct(legal['left'], img_width),
            'char_height_px': legal.get('char_height', 0),
        }

    if headline and date:
        # Vertical distance from the headline's bottom edge to the date's top edge.
        spacing = date['top'] - headline['bottom']
        m['headline_to_date_gap'] = {
            'gap_px': spacing,
            'gap_pct': _pct(spacing, img_height),
        }
        # Date size relative to headline size; skipped when the headline
        # height is unknown or zero.
        headline_size = headline.get('char_height', 0)
        date_size = date.get('char_height', 0)
        if headline_size > 0:
            m['date_to_headline_ratio_pct'] = round(date_size / headline_size * 100, 1)

    # Left edges of the elements that are expected to share one margin.
    anchors = {
        name: element['left']
        for name, element in (('headline', headline), ('date', date), ('logo_text', logo))
        if element
    }
    if len(anchors) >= 2:
        spread_px = max(anchors.values()) - min(anchors.values())
        spread_pct = _pct(spread_px, img_width)
        m['left_alignment'] = {
            'elements': {
                name: {'px': left, 'pct': _pct(left, img_width)}
                for name, left in anchors.items()
            },
            'max_difference_px': spread_px,
            'max_difference_pct': spread_pct,
            'aligned': spread_pct <= 3.0,
        }

    return m
def _build_measurement_context(measurements):
"""Build formatted text context for injection into QC check prompts."""
shortest_side = measurements.get('shortest_side', min(measurements['image_width'], measurements['image_height']))
lines = [
"=== OCR LAYOUT MEASUREMENTS (computed from pixel-level analysis — NOT visual estimation) ===",
f"Image: {measurements['image_width']}px x {measurements['image_height']}px ({measurements.get('format', 'unknown')} format)",
f"Shortest side: {shortest_side}px (Amazon guideline: margins should be ~7% of shortest side = ~{round(shortest_side * 0.07)}px)",
"",
]
headline = measurements.get('headline')
if headline:
lines.append(f"HEADLINE: \"{headline['text']}\"")
lines.append(f" Left margin: {headline['left_margin_px']}px ({headline.get('left_margin_shortest_side_pct', headline['left_margin_pct'])}% of shortest side, {headline['left_margin_pct']}% of width)")
lines.append(f" Top margin: {headline['top_margin_px']}px ({headline.get('top_margin_shortest_side_pct', headline['top_margin_pct'])}% of shortest side)")
lines.append(f" Right margin: {headline['right_margin_px']}px ({headline['right_margin_pct']}% of width)")
lines.append(f" Headline width: {headline['width_pct']}% of asset width")
lines.append(f" Character height: {headline['char_height_px']}px")
lines.append("")
date = measurements.get('date')
if date:
lines.append(f"DATE: \"{date['text']}\"")
lines.append(f" Left margin: {date['left_margin_px']}px ({date.get('left_margin_shortest_side_pct', date['left_margin_pct'])}% of shortest side, {date['left_margin_pct']}% of width)")
lines.append(f" Character height: {date['char_height_px']}px")
lines.append("")
logo = measurements.get('logo_text')
if logo:
lines.append(f"LOGO/BRANDING: \"{logo['text']}\"")
lines.append(f" Left margin: {logo['left_margin_px']}px ({logo['left_margin_pct']}% of width)")
lines.append("")
legal = measurements.get('legal')
if legal:
lines.append(f"LEGAL: \"{legal['text']}\"")
lines.append(f" Left margin: {legal['left_margin_px']}px ({legal['left_margin_pct']}% of width)")
lines.append("")
gap = measurements.get('headline_to_date_gap')
if gap:
lines.append(f"HEADLINE-TO-DATE SPACING: {gap['gap_px']}px ({gap['gap_pct']}% of image height)")
ratio = measurements.get('date_to_headline_ratio_pct')
if ratio is not None:
lines.append(f"DATE-TO-HEADLINE SIZE RATIO: {ratio}%")
lines.append("")
alignment = measurements.get('left_alignment')
if alignment:
lines.append("LEFT ALIGNMENT COMPARISON:")
for name, pos in alignment['elements'].items():
lines.append(f" {name}: {pos['px']}px from left edge ({pos['pct']}% of width)")
lines.append(f" Max difference: {alignment['max_difference_px']}px ({alignment['max_difference_pct']}% of width)")
if alignment['aligned']:
lines.append(" Status: ALIGNED (within tolerance)")
else:
lines.append(" Status: MISALIGNED — elements do NOT share the same left margin")
lines.append("")
lines.append("NOTE: These OCR measurements are approximate and should be used as supplementary data")
lines.append("alongside your visual assessment. OCR bounding boxes can have small inaccuracies (~5-10px).")
lines.append("Use them to confirm or question your visual impression, but do NOT fail an asset solely")
lines.append("based on OCR numbers if the layout looks visually correct and well-composed.")
lines.append("=== END OCR MEASUREMENTS ===")
return "\n".join(lines)
def _pct(value, total):
"""Calculate percentage, rounded to 1 decimal."""
if total <= 0:
return 0.0
return round(value / total * 100, 1)
def _detect_format(width, height):
"""Detect image format type."""
if height > width * 1.3:
return 'portrait/tall'
elif width > height * 1.3:
return 'landscape'
else:
return 'square'