OCR measurements were causing the LLM to over-rely on bounding-box numbers and to fail correct assets over minor measurement inaccuracies.

Changes:
- All prompts now say "supplementary data" instead of "authoritative/primary source".
- The LLM is instructed to prioritise visual assessment and to use OCR to confirm or question it.
- Alignment tolerance widened from 1.5% to 3% of width.
- OCR context footer softened with an accuracy caveat (~5-10px margin of error).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
429 lines
16 KiB
Python
429 lines
16 KiB
Python
"""
|
||
OCR Layout Measurement Module
|
||
|
||
Uses Tesseract OCR to detect text elements and compute precise pixel-level
|
||
layout measurements (margins, spacing, alignment) for QC checks.
|
||
|
||
This supplements LLM visual estimation with hard numbers, enabling detection
|
||
of subtle spacing and alignment issues that vision models cannot reliably see.
|
||
|
||
Requires: pytesseract Python package + Tesseract OCR binary installed on system.
|
||
- macOS: brew install tesseract
|
||
- Ubuntu: sudo apt install tesseract-ocr
|
||
- pip install pytesseract
|
||
"""
|
||
|
||
import re
|
||
from PIL import Image
|
||
|
||
|
||
# Checks that benefit from OCR measurements.
# Each entry is a QC check identifier string.
# NOTE(review): this list is not referenced in the visible part of this module;
# presumably callers use it to decide which check prompts receive the OCR
# context -- confirm against the caller.
OCR_RELEVANT_CHECKS = [
    'amazon_margins',
    'amazon_typography',
    'amazon_headline_layout',
    'element_alignment',
    'text_edge_clearance',
    'safety_area',
    'visual_hierarchy',
    'visual_hierarchy_general',
    'text_readability',
    'text_readability_general',
]
|
||
|
||
# Date month patterns in multiple languages.
#
# This is a bare regex alternation with no enclosing group, so it must be
# wrapped in (?:...) when embedded in a larger expression.
# One row per language; by spelling the rows appear to be: English (full
# names), English (abbreviations), Italian, German, French, Spanish, Dutch,
# Swedish and Polish.
# Some rows omit months that an earlier row already spells identically
# (e.g. the German row has no April/August/September/November), and several
# rows are lowercase, so the pattern only covers capitalised text when it is
# compiled with re.IGNORECASE.
_MONTH_PATTERN = (
    r'January|February|March|April|May|June|July|August|September|October|November|December'
    r'|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec'
    r'|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|novembre|dicembre'
    r'|Januar|Februar|März|Mai|Juni|Juli|Oktober|Dezember'
    r'|janvier|f[eé]vrier|mars|avril|mai|juin|juillet|ao[uû]t|septembre|octobre|novembre|d[eé]cembre'
    r'|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre'
    r'|januari|februari|maart|april|mei|juni|juli|augustus|oktober|december'
    r'|januari|februari|mars|april|maj|juni|juli|augusti|september|oktober|november|december'
    r'|styczeń|luty|marzec|kwiecień|maj|czerwiec|lipiec|sierpień|wrzesień|październik|listopad|grudzień'
)
|
||
|
||
|
||
def run_ocr_measurement(image_path):
    """
    Measure the text layout of an image with Tesseract OCR.

    The image is opened once to read its pixel dimensions, then the detected
    text blocks are classified, measured and rendered into a prompt-ready
    text summary.

    Args:
        image_path: Path of the image to analyse.

    Returns:
        A dict with the keys
          - 'elements': the classified text elements with their positions
          - 'measurements': the computed margins, spacing and alignment
          - 'context': the formatted text summary for prompt injection
          - 'image_dimensions': {'width': ..., 'height': ...}
        or None when no text was detected or any step raised.
    """
    try:
        with Image.open(image_path) as picture:
            width, height = picture.size

        detected = _tesseract_detect(image_path, width, height)
        if detected:
            found = _identify_elements(detected, width, height)
            layout = _calculate_measurements(found, width, height)
            return {
                'elements': found,
                'measurements': layout,
                'context': _build_measurement_context(layout),
                'image_dimensions': {'width': width, 'height': height},
            }

        # Nothing usable came back from OCR.
        return None
    except Exception as exc:
        # Best-effort: any failure is reported on stdout and signalled as None.
        print(f"OCR Measurement: Error processing {image_path}: {exc}")
        return None
|
||
|
||
|
||
def _tesseract_detect(image_path, img_width, img_height):
|
||
"""
|
||
Run Tesseract OCR to detect text blocks with bounding boxes.
|
||
Groups words into blocks and returns position data.
|
||
"""
|
||
try:
|
||
import pytesseract
|
||
except ImportError:
|
||
print("OCR Measurement: pytesseract not installed (pip install pytesseract)")
|
||
return []
|
||
|
||
try:
|
||
img = Image.open(image_path)
|
||
# Get word-level bounding boxes from Tesseract
|
||
data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
|
||
|
||
# Group words by block + line for finer element separation
|
||
# (Tesseract often groups date + logo into one block)
|
||
block_groups = {}
|
||
for i in range(len(data['text'])):
|
||
text = data['text'][i].strip()
|
||
conf = int(data['conf'][i])
|
||
if not text or conf < 30: # Skip low confidence detections
|
||
continue
|
||
|
||
# Use block_num + line_num as key for finer grouping
|
||
group_key = (data['block_num'][i], data['line_num'][i])
|
||
if group_key not in block_groups:
|
||
block_groups[group_key] = {
|
||
'words': [],
|
||
'lefts': [],
|
||
'tops': [],
|
||
'rights': [],
|
||
'bottoms': [],
|
||
'word_heights': [],
|
||
}
|
||
|
||
bg = block_groups[group_key]
|
||
bg['words'].append(text)
|
||
left = data['left'][i]
|
||
top = data['top'][i]
|
||
w = data['width'][i]
|
||
h = data['height'][i]
|
||
bg['lefts'].append(left)
|
||
bg['tops'].append(top)
|
||
bg['rights'].append(left + w)
|
||
bg['bottoms'].append(top + h)
|
||
bg['word_heights'].append(h)
|
||
|
||
blocks = []
|
||
for group_key, bg in block_groups.items():
|
||
if not bg['words']:
|
||
continue
|
||
|
||
block_text = ' '.join(bg['words'])
|
||
left = min(bg['lefts'])
|
||
top = min(bg['tops'])
|
||
right = max(bg['rights'])
|
||
bottom = max(bg['bottoms'])
|
||
|
||
# Use median word height as character height (more robust than block height)
|
||
sorted_wh = sorted(bg['word_heights'])
|
||
char_height = sorted_wh[len(sorted_wh) // 2]
|
||
|
||
blocks.append({
|
||
'text': block_text,
|
||
'left': left,
|
||
'right': right,
|
||
'top': top,
|
||
'bottom': bottom,
|
||
'width': right - left,
|
||
'height': bottom - top,
|
||
'char_height': char_height,
|
||
'left_pct': round(left / img_width * 100, 1) if img_width else 0,
|
||
'top_pct': round(top / img_height * 100, 1) if img_height else 0,
|
||
'bottom_pct': round(bottom / img_height * 100, 1) if img_height else 0,
|
||
})
|
||
|
||
print(f"OCR Measurement: Tesseract detected {len(blocks)} text blocks")
|
||
return blocks
|
||
except Exception as e:
|
||
print(f"OCR Measurement: Tesseract error: {e}")
|
||
return []
|
||
|
||
|
||
def _identify_elements(blocks, img_width, img_height):
|
||
"""
|
||
Identify which text blocks are headline, date, legal, logo.
|
||
|
||
Strategy:
|
||
- Headline: largest character height text in the top portion
|
||
- Date: matches date patterns (digits + month or digit-digit)
|
||
- Legal: smallest text, bottom area
|
||
- Logo: contains "amazon" or "prime", bottom area
|
||
"""
|
||
if not blocks:
|
||
return {}
|
||
|
||
elements = {}
|
||
remaining = list(blocks)
|
||
|
||
# 1. Find headline blocks: largest char_height text in top 70%
|
||
top_blocks = [b for b in remaining if b['top_pct'] < 70]
|
||
if top_blocks:
|
||
max_char_height = max(b['char_height'] for b in top_blocks)
|
||
|
||
# Headline blocks are those with char_height >= 70% of the max
|
||
headline_blocks = sorted(
|
||
[b for b in top_blocks if b['char_height'] >= max_char_height * 0.7],
|
||
key=lambda b: b['top']
|
||
)
|
||
|
||
if headline_blocks:
|
||
all_text = ' '.join(b['text'] for b in headline_blocks)
|
||
combined_left = min(b['left'] for b in headline_blocks)
|
||
combined_right = max(b['right'] for b in headline_blocks)
|
||
combined_top = min(b['top'] for b in headline_blocks)
|
||
combined_bottom = max(b['bottom'] for b in headline_blocks)
|
||
|
||
elements['headline'] = {
|
||
'text': all_text,
|
||
'left': combined_left,
|
||
'right': combined_right,
|
||
'top': combined_top,
|
||
'bottom': combined_bottom,
|
||
'char_height': max_char_height,
|
||
'line_count': len(headline_blocks),
|
||
}
|
||
for b in headline_blocks:
|
||
if b in remaining:
|
||
remaining.remove(b)
|
||
|
||
# 2. Find date: contains number patterns like "8-11" or "8 – 11" with month
|
||
date_pattern = re.compile(
|
||
r'\d+\s*[-–—]\s*\d+|'
|
||
r'\d+\s+(?:' + _MONTH_PATTERN + r')',
|
||
re.IGNORECASE
|
||
)
|
||
|
||
for block in remaining[:]:
|
||
if date_pattern.search(block['text']):
|
||
elements['date'] = block
|
||
remaining.remove(block)
|
||
break
|
||
|
||
# 3. Find logo/branding: contains "amazon" or "prime day"
|
||
for block in remaining[:]:
|
||
if re.search(r'amazon|prime\s*day', block['text'], re.IGNORECASE):
|
||
elements['logo_text'] = block
|
||
remaining.remove(block)
|
||
break
|
||
|
||
# 4. Find legal: smallest text in bottom 25%
|
||
bottom_blocks = [b for b in remaining if b['top_pct'] > 75]
|
||
if bottom_blocks:
|
||
legal_block = min(bottom_blocks, key=lambda b: b['char_height'])
|
||
elements['legal'] = legal_block
|
||
if legal_block in remaining:
|
||
remaining.remove(legal_block)
|
||
|
||
elements['other'] = remaining
|
||
return elements
|
||
|
||
|
||
def _calculate_measurements(elements, img_width, img_height, alignment_tolerance_pct=3.0):
    """
    Calculate layout measurements from the identified text elements.

    Args:
        elements: Dict that may contain 'headline', 'date', 'legal' and
            'logo_text'; each present element must provide the pixel
            coordinates read below ('left', and for the headline also
            'right', 'top', 'bottom'; for the date also 'top').
        img_width: Image width in pixels.
        img_height: Image height in pixels.
        alignment_tolerance_pct: Largest spread between left edges, as a
            percentage of the image width, that still counts as aligned.
            Defaults to 3.0.

    Returns:
        A dict that always holds 'image_width', 'image_height', 'format' and
        'shortest_side', plus one entry per present element and, when
        applicable, 'headline_to_date_gap', 'date_to_headline_ratio_pct' and
        'left_alignment'.
    """
    m = {
        'image_width': img_width,
        'image_height': img_height,
        'format': _detect_format(img_width, img_height),
    }

    shortest_side = min(img_width, img_height)
    m['shortest_side'] = shortest_side

    headline = elements.get('headline')
    date = elements.get('date')
    legal = elements.get('legal')
    logo = elements.get('logo_text')

    if headline:
        m['headline'] = {
            'text': headline.get('text', ''),
            'left_margin_px': headline['left'],
            'left_margin_pct': _pct(headline['left'], img_width),
            'left_margin_shortest_side_pct': _pct(headline['left'], shortest_side),
            'right_margin_px': img_width - headline['right'],
            'right_margin_pct': _pct(img_width - headline['right'], img_width),
            'top_margin_px': headline['top'],
            'top_margin_pct': _pct(headline['top'], img_height),
            'top_margin_shortest_side_pct': _pct(headline['top'], shortest_side),
            'width_pct': _pct(headline['right'] - headline['left'], img_width),
            'bottom_edge_px': headline['bottom'],
            'bottom_edge_pct': _pct(headline['bottom'], img_height),
            'char_height_px': headline.get('char_height', 0),
        }

    if date:
        m['date'] = {
            'text': date.get('text', ''),
            'left_margin_px': date['left'],
            'left_margin_pct': _pct(date['left'], img_width),
            'left_margin_shortest_side_pct': _pct(date['left'], shortest_side),
            'top_px': date['top'],
            'top_pct': _pct(date['top'], img_height),
            'char_height_px': date.get('char_height', 0),
        }

    if logo:
        m['logo_text'] = {
            'text': logo.get('text', ''),
            'left_margin_px': logo['left'],
            'left_margin_pct': _pct(logo['left'], img_width),
        }

    if legal:
        m['legal'] = {
            'text': legal.get('text', ''),
            'left_margin_px': legal['left'],
            'left_margin_pct': _pct(legal['left'], img_width),
            'char_height_px': legal.get('char_height', 0),
        }

    if headline and date:
        # Headline-to-date gap; negative when the date's top edge sits above
        # the headline's bottom edge.
        gap_px = date['top'] - headline['bottom']
        m['headline_to_date_gap'] = {
            'gap_px': gap_px,
            'gap_pct': _pct(gap_px, img_height),
        }

        # Date-to-headline size ratio; skipped when the headline height is
        # unknown or zero to avoid dividing by zero.
        h_char = headline.get('char_height', 0)
        d_char = date.get('char_height', 0)
        if h_char > 0:
            m['date_to_headline_ratio_pct'] = round(d_char / h_char * 100, 1)

    # Left alignment consistency across headline, date and logo text
    # (legal copy is not part of this comparison).
    left_margins = {}
    if headline:
        left_margins['headline'] = headline['left']
    if date:
        left_margins['date'] = date['left']
    if logo:
        left_margins['logo_text'] = logo['left']

    # A comparison needs at least two elements.
    if len(left_margins) >= 2:
        values = list(left_margins.values())
        max_diff_px = max(values) - min(values)
        max_diff_pct = _pct(max_diff_px, img_width)

        m['left_alignment'] = {
            'elements': {k: {'px': v, 'pct': _pct(v, img_width)} for k, v in left_margins.items()},
            'max_difference_px': max_diff_px,
            'max_difference_pct': max_diff_pct,
            'aligned': max_diff_pct <= alignment_tolerance_pct,
        }

    return m
|
||
|
||
|
||
def _build_measurement_context(measurements):
|
||
"""Build formatted text context for injection into QC check prompts."""
|
||
shortest_side = measurements.get('shortest_side', min(measurements['image_width'], measurements['image_height']))
|
||
|
||
lines = [
|
||
"=== OCR LAYOUT MEASUREMENTS (computed from pixel-level analysis — NOT visual estimation) ===",
|
||
f"Image: {measurements['image_width']}px x {measurements['image_height']}px ({measurements.get('format', 'unknown')} format)",
|
||
f"Shortest side: {shortest_side}px (Amazon guideline: margins should be ~7% of shortest side = ~{round(shortest_side * 0.07)}px)",
|
||
"",
|
||
]
|
||
|
||
headline = measurements.get('headline')
|
||
if headline:
|
||
lines.append(f"HEADLINE: \"{headline['text']}\"")
|
||
lines.append(f" Left margin: {headline['left_margin_px']}px ({headline.get('left_margin_shortest_side_pct', headline['left_margin_pct'])}% of shortest side, {headline['left_margin_pct']}% of width)")
|
||
lines.append(f" Top margin: {headline['top_margin_px']}px ({headline.get('top_margin_shortest_side_pct', headline['top_margin_pct'])}% of shortest side)")
|
||
lines.append(f" Right margin: {headline['right_margin_px']}px ({headline['right_margin_pct']}% of width)")
|
||
lines.append(f" Headline width: {headline['width_pct']}% of asset width")
|
||
lines.append(f" Character height: {headline['char_height_px']}px")
|
||
lines.append("")
|
||
|
||
date = measurements.get('date')
|
||
if date:
|
||
lines.append(f"DATE: \"{date['text']}\"")
|
||
lines.append(f" Left margin: {date['left_margin_px']}px ({date.get('left_margin_shortest_side_pct', date['left_margin_pct'])}% of shortest side, {date['left_margin_pct']}% of width)")
|
||
lines.append(f" Character height: {date['char_height_px']}px")
|
||
lines.append("")
|
||
|
||
logo = measurements.get('logo_text')
|
||
if logo:
|
||
lines.append(f"LOGO/BRANDING: \"{logo['text']}\"")
|
||
lines.append(f" Left margin: {logo['left_margin_px']}px ({logo['left_margin_pct']}% of width)")
|
||
lines.append("")
|
||
|
||
legal = measurements.get('legal')
|
||
if legal:
|
||
lines.append(f"LEGAL: \"{legal['text']}\"")
|
||
lines.append(f" Left margin: {legal['left_margin_px']}px ({legal['left_margin_pct']}% of width)")
|
||
lines.append("")
|
||
|
||
gap = measurements.get('headline_to_date_gap')
|
||
if gap:
|
||
lines.append(f"HEADLINE-TO-DATE SPACING: {gap['gap_px']}px ({gap['gap_pct']}% of image height)")
|
||
|
||
ratio = measurements.get('date_to_headline_ratio_pct')
|
||
if ratio is not None:
|
||
lines.append(f"DATE-TO-HEADLINE SIZE RATIO: {ratio}%")
|
||
|
||
lines.append("")
|
||
|
||
alignment = measurements.get('left_alignment')
|
||
if alignment:
|
||
lines.append("LEFT ALIGNMENT COMPARISON:")
|
||
for name, pos in alignment['elements'].items():
|
||
lines.append(f" {name}: {pos['px']}px from left edge ({pos['pct']}% of width)")
|
||
lines.append(f" Max difference: {alignment['max_difference_px']}px ({alignment['max_difference_pct']}% of width)")
|
||
if alignment['aligned']:
|
||
lines.append(" Status: ALIGNED (within tolerance)")
|
||
else:
|
||
lines.append(" Status: MISALIGNED — elements do NOT share the same left margin")
|
||
lines.append("")
|
||
|
||
lines.append("NOTE: These OCR measurements are approximate and should be used as supplementary data")
|
||
lines.append("alongside your visual assessment. OCR bounding boxes can have small inaccuracies (~5-10px).")
|
||
lines.append("Use them to confirm or question your visual impression, but do NOT fail an asset solely")
|
||
lines.append("based on OCR numbers if the layout looks visually correct and well-composed.")
|
||
lines.append("=== END OCR MEASUREMENTS ===")
|
||
|
||
return "\n".join(lines)
|
||
|
||
|
||
def _pct(value, total):
|
||
"""Calculate percentage, rounded to 1 decimal."""
|
||
if total <= 0:
|
||
return 0.0
|
||
return round(value / total * 100, 1)
|
||
|
||
|
||
def _detect_format(width, height):
|
||
"""Detect image format type."""
|
||
if height > width * 1.3:
|
||
return 'portrait/tall'
|
||
elif width > height * 1.3:
|
||
return 'landscape'
|
||
else:
|
||
return 'square'
|