""" OCR Layout Measurement Module Uses Tesseract OCR to detect text elements and compute precise pixel-level layout measurements (margins, spacing, alignment) for QC checks. This supplements LLM visual estimation with hard numbers, enabling detection of subtle spacing and alignment issues that vision models cannot reliably see. Requires: pytesseract Python package + Tesseract OCR binary installed on system. - macOS: brew install tesseract - Ubuntu: sudo apt install tesseract-ocr - pip install pytesseract """ import re from PIL import Image # Checks that benefit from OCR measurements OCR_RELEVANT_CHECKS = [ 'amazon_margins', 'amazon_typography', 'amazon_headline_layout', 'element_alignment', 'text_edge_clearance', 'safety_area', 'visual_hierarchy', 'visual_hierarchy_general', 'text_readability', 'text_readability_general', ] # Date month patterns in multiple languages _MONTH_PATTERN = ( r'January|February|March|April|May|June|July|August|September|October|November|December' r'|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec' r'|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|novembre|dicembre' r'|Januar|Februar|März|Mai|Juni|Juli|Oktober|Dezember' r'|janvier|f[eé]vrier|mars|avril|mai|juin|juillet|ao[uû]t|septembre|octobre|novembre|d[eé]cembre' r'|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre' r'|januari|februari|maart|april|mei|juni|juli|augustus|oktober|december' r'|januari|februari|mars|april|maj|juni|juli|augusti|september|oktober|november|december' r'|styczeń|luty|marzec|kwiecień|maj|czerwiec|lipiec|sierpień|wrzesień|październik|listopad|grudzień' ) def run_ocr_measurement(image_path): """ Run Tesseract OCR on an image and return layout measurements. Returns dict with: - elements: identified text elements with positions - measurements: calculated margins, spacing, alignment - context: formatted text string for prompt injection - image_dimensions: width and height Returns None if OCR fails or no text detected. """ try: with Image.open(image_path) as img: img_width, img_height = img.size blocks = _tesseract_detect(image_path, img_width, img_height) if not blocks: return None elements = _identify_elements(blocks, img_width, img_height) measurements = _calculate_measurements(elements, img_width, img_height) context = _build_measurement_context(measurements) return { 'elements': elements, 'measurements': measurements, 'context': context, 'image_dimensions': {'width': img_width, 'height': img_height} } except Exception as e: print(f"OCR Measurement: Error processing {image_path}: {e}") return None def _tesseract_detect(image_path, img_width, img_height): """ Run Tesseract OCR to detect text blocks with bounding boxes. Groups words into blocks and returns position data. """ try: import pytesseract except ImportError: print("OCR Measurement: pytesseract not installed (pip install pytesseract)") return [] try: img = Image.open(image_path) # Get word-level bounding boxes from Tesseract data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT) # Group words by block + line for finer element separation # (Tesseract often groups date + logo into one block) block_groups = {} for i in range(len(data['text'])): text = data['text'][i].strip() conf = int(data['conf'][i]) if not text or conf < 30: # Skip low confidence detections continue # Use block_num + line_num as key for finer grouping group_key = (data['block_num'][i], data['line_num'][i]) if group_key not in block_groups: block_groups[group_key] = { 'words': [], 'lefts': [], 'tops': [], 'rights': [], 'bottoms': [], 'word_heights': [], } bg = block_groups[group_key] bg['words'].append(text) left = data['left'][i] top = data['top'][i] w = data['width'][i] h = data['height'][i] bg['lefts'].append(left) bg['tops'].append(top) bg['rights'].append(left + w) bg['bottoms'].append(top + h) bg['word_heights'].append(h) blocks = [] for group_key, bg in block_groups.items(): if not bg['words']: continue block_text = ' '.join(bg['words']) left = min(bg['lefts']) top = min(bg['tops']) right = max(bg['rights']) bottom = max(bg['bottoms']) # Use median word height as character height (more robust than block height) sorted_wh = sorted(bg['word_heights']) char_height = sorted_wh[len(sorted_wh) // 2] blocks.append({ 'text': block_text, 'left': left, 'right': right, 'top': top, 'bottom': bottom, 'width': right - left, 'height': bottom - top, 'char_height': char_height, 'left_pct': round(left / img_width * 100, 1) if img_width else 0, 'top_pct': round(top / img_height * 100, 1) if img_height else 0, 'bottom_pct': round(bottom / img_height * 100, 1) if img_height else 0, }) print(f"OCR Measurement: Tesseract detected {len(blocks)} text blocks") return blocks except Exception as e: print(f"OCR Measurement: Tesseract error: {e}") return [] def _identify_elements(blocks, img_width, img_height): """ Identify which text blocks are headline, date, legal, logo. Strategy: - Headline: largest character height text in the top portion - Date: matches date patterns (digits + month or digit-digit) - Legal: smallest text, bottom area - Logo: contains "amazon" or "prime", bottom area """ if not blocks: return {} elements = {} remaining = list(blocks) # 1. Find headline blocks: largest char_height text in top 70% top_blocks = [b for b in remaining if b['top_pct'] < 70] if top_blocks: max_char_height = max(b['char_height'] for b in top_blocks) # Headline blocks are those with char_height >= 70% of the max headline_blocks = sorted( [b for b in top_blocks if b['char_height'] >= max_char_height * 0.7], key=lambda b: b['top'] ) if headline_blocks: all_text = ' '.join(b['text'] for b in headline_blocks) combined_left = min(b['left'] for b in headline_blocks) combined_right = max(b['right'] for b in headline_blocks) combined_top = min(b['top'] for b in headline_blocks) combined_bottom = max(b['bottom'] for b in headline_blocks) elements['headline'] = { 'text': all_text, 'left': combined_left, 'right': combined_right, 'top': combined_top, 'bottom': combined_bottom, 'char_height': max_char_height, 'line_count': len(headline_blocks), } for b in headline_blocks: if b in remaining: remaining.remove(b) # 2. Find date: contains number patterns like "8-11" or "8 – 11" with month date_pattern = re.compile( r'\d+\s*[-–—]\s*\d+|' r'\d+\s+(?:' + _MONTH_PATTERN + r')', re.IGNORECASE ) for block in remaining[:]: if date_pattern.search(block['text']): elements['date'] = block remaining.remove(block) break # 3. Find logo/branding: contains "amazon" or "prime day" for block in remaining[:]: if re.search(r'amazon|prime\s*day', block['text'], re.IGNORECASE): elements['logo_text'] = block remaining.remove(block) break # 4. Find legal: smallest text in bottom 25% bottom_blocks = [b for b in remaining if b['top_pct'] > 75] if bottom_blocks: legal_block = min(bottom_blocks, key=lambda b: b['char_height']) elements['legal'] = legal_block if legal_block in remaining: remaining.remove(legal_block) elements['other'] = remaining return elements def _calculate_measurements(elements, img_width, img_height): """Calculate precise layout measurements from identified elements.""" m = { 'image_width': img_width, 'image_height': img_height, 'format': _detect_format(img_width, img_height), } shortest_side = min(img_width, img_height) m['shortest_side'] = shortest_side headline = elements.get('headline') date = elements.get('date') legal = elements.get('legal') logo = elements.get('logo_text') if headline: m['headline'] = { 'text': headline.get('text', ''), 'left_margin_px': headline['left'], 'left_margin_pct': _pct(headline['left'], img_width), 'left_margin_shortest_side_pct': _pct(headline['left'], shortest_side), 'right_margin_px': img_width - headline['right'], 'right_margin_pct': _pct(img_width - headline['right'], img_width), 'top_margin_px': headline['top'], 'top_margin_pct': _pct(headline['top'], img_height), 'top_margin_shortest_side_pct': _pct(headline['top'], shortest_side), 'width_pct': _pct(headline['right'] - headline['left'], img_width), 'bottom_edge_px': headline['bottom'], 'bottom_edge_pct': _pct(headline['bottom'], img_height), 'char_height_px': headline.get('char_height', 0), } if date: m['date'] = { 'text': date.get('text', ''), 'left_margin_px': date['left'], 'left_margin_pct': _pct(date['left'], img_width), 'left_margin_shortest_side_pct': _pct(date['left'], shortest_side), 'top_px': date['top'], 'top_pct': _pct(date['top'], img_height), 'char_height_px': date.get('char_height', 0), } if logo: m['logo_text'] = { 'text': logo.get('text', ''), 'left_margin_px': logo['left'], 'left_margin_pct': _pct(logo['left'], img_width), } if legal: m['legal'] = { 'text': legal.get('text', ''), 'left_margin_px': legal['left'], 'left_margin_pct': _pct(legal['left'], img_width), 'char_height_px': legal.get('char_height', 0), } # Headline-to-date gap if headline and date: gap_px = date['top'] - headline['bottom'] m['headline_to_date_gap'] = { 'gap_px': gap_px, 'gap_pct': _pct(gap_px, img_height), } # Date-to-headline size ratio h_char = headline.get('char_height', 0) d_char = date.get('char_height', 0) if h_char > 0: m['date_to_headline_ratio_pct'] = round(d_char / h_char * 100, 1) # Left alignment consistency left_margins = {} if headline: left_margins['headline'] = headline['left'] if date: left_margins['date'] = date['left'] if logo: left_margins['logo_text'] = logo['left'] if len(left_margins) >= 2: values = list(left_margins.values()) max_diff_px = max(values) - min(values) max_diff_pct = _pct(max_diff_px, img_width) m['left_alignment'] = { 'elements': {k: {'px': v, 'pct': _pct(v, img_width)} for k, v in left_margins.items()}, 'max_difference_px': max_diff_px, 'max_difference_pct': max_diff_pct, 'aligned': max_diff_pct <= 3.0, } return m def _build_measurement_context(measurements): """Build formatted text context for injection into QC check prompts.""" shortest_side = measurements.get('shortest_side', min(measurements['image_width'], measurements['image_height'])) lines = [ "=== OCR LAYOUT MEASUREMENTS (computed from pixel-level analysis — NOT visual estimation) ===", f"Image: {measurements['image_width']}px x {measurements['image_height']}px ({measurements.get('format', 'unknown')} format)", f"Shortest side: {shortest_side}px (Amazon guideline: margins should be ~7% of shortest side = ~{round(shortest_side * 0.07)}px)", "", ] headline = measurements.get('headline') if headline: lines.append(f"HEADLINE: \"{headline['text']}\"") lines.append(f" Left margin: {headline['left_margin_px']}px ({headline.get('left_margin_shortest_side_pct', headline['left_margin_pct'])}% of shortest side, {headline['left_margin_pct']}% of width)") lines.append(f" Top margin: {headline['top_margin_px']}px ({headline.get('top_margin_shortest_side_pct', headline['top_margin_pct'])}% of shortest side)") lines.append(f" Right margin: {headline['right_margin_px']}px ({headline['right_margin_pct']}% of width)") lines.append(f" Headline width: {headline['width_pct']}% of asset width") lines.append(f" Character height: {headline['char_height_px']}px") lines.append("") date = measurements.get('date') if date: lines.append(f"DATE: \"{date['text']}\"") lines.append(f" Left margin: {date['left_margin_px']}px ({date.get('left_margin_shortest_side_pct', date['left_margin_pct'])}% of shortest side, {date['left_margin_pct']}% of width)") lines.append(f" Character height: {date['char_height_px']}px") lines.append("") logo = measurements.get('logo_text') if logo: lines.append(f"LOGO/BRANDING: \"{logo['text']}\"") lines.append(f" Left margin: {logo['left_margin_px']}px ({logo['left_margin_pct']}% of width)") lines.append("") legal = measurements.get('legal') if legal: lines.append(f"LEGAL: \"{legal['text']}\"") lines.append(f" Left margin: {legal['left_margin_px']}px ({legal['left_margin_pct']}% of width)") lines.append("") gap = measurements.get('headline_to_date_gap') if gap: lines.append(f"HEADLINE-TO-DATE SPACING: {gap['gap_px']}px ({gap['gap_pct']}% of image height)") ratio = measurements.get('date_to_headline_ratio_pct') if ratio is not None: lines.append(f"DATE-TO-HEADLINE SIZE RATIO: {ratio}%") lines.append("") alignment = measurements.get('left_alignment') if alignment: lines.append("LEFT ALIGNMENT COMPARISON:") for name, pos in alignment['elements'].items(): lines.append(f" {name}: {pos['px']}px from left edge ({pos['pct']}% of width)") lines.append(f" Max difference: {alignment['max_difference_px']}px ({alignment['max_difference_pct']}% of width)") if alignment['aligned']: lines.append(" Status: ALIGNED (within tolerance)") else: lines.append(" Status: MISALIGNED — elements do NOT share the same left margin") lines.append("") lines.append("NOTE: These OCR measurements are approximate and should be used as supplementary data") lines.append("alongside your visual assessment. OCR bounding boxes can have small inaccuracies (~5-10px).") lines.append("Use them to confirm or question your visual impression, but do NOT fail an asset solely") lines.append("based on OCR numbers if the layout looks visually correct and well-composed.") lines.append("=== END OCR MEASUREMENTS ===") return "\n".join(lines) def _pct(value, total): """Calculate percentage, rounded to 1 decimal.""" if total <= 0: return 0.0 return round(value / total * 100, 1) def _detect_format(width, height): """Detect image format type.""" if height > width * 1.3: return 'portrait/tall' elif width > height * 1.3: return 'landscape' else: return 'square'