# ai_qc/backend/ocr_measurement.py

"""
OCR Layout Measurement Module

Uses Tesseract OCR to detect text elements and compute precise pixel-level
layout measurements (margins, spacing, alignment) for QC checks.

This supplements LLM visual estimation with hard numbers, enabling detection
of subtle spacing and alignment issues that vision models cannot reliably see.

Requires: pytesseract Python package + Tesseract OCR binary installed on system.
  - macOS: brew install tesseract
  - Ubuntu: sudo apt install tesseract-ocr
  - pip install pytesseract
"""

import re
from PIL import Image


# Checks that benefit from OCR measurements
OCR_RELEVANT_CHECKS = [
    'amazon_margins',
    'amazon_typography',
    'amazon_headline_layout',
    'element_alignment',
    'text_edge_clearance',
    'safety_area',
    'visual_hierarchy',
    'visual_hierarchy_general',
    'text_readability',
    'text_readability_general',
]

# Date month patterns in multiple languages
_MONTH_PATTERN = (
    r'January|February|March|April|May|June|July|August|September|October|November|December'
    r'|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec'
    r'|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|novembre|dicembre'
    r'|Januar|Februar|März|Mai|Juni|Juli|Oktober|Dezember'
    r'|janvier|f[eé]vrier|mars|avril|mai|juin|juillet|ao[uû]t|septembre|octobre|novembre|d[eé]cembre'
    r'|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre'
    r'|januari|februari|maart|april|mei|juni|juli|augustus|oktober|december'
    r'|januari|februari|mars|april|maj|juni|juli|augusti|september|oktober|november|december'
    r'|styczeń|luty|marzec|kwiecień|maj|czerwiec|lipiec|sierpień|wrzesień|październik|listopad|grudzień'
)


def run_ocr_measurement(image_path):
    """
    Run Tesseract OCR on an image and return layout measurements.

    Returns dict with:
    - elements: identified text elements with positions
    - measurements: calculated margins, spacing, alignment
    - context: formatted text string for prompt injection
    - image_dimensions: width and height

    Returns None if OCR fails or no text detected.
    """
    try:
        with Image.open(image_path) as img:
            img_width, img_height = img.size

        blocks = _tesseract_detect(image_path, img_width, img_height)
        if not blocks:
            return None

        elements = _identify_elements(blocks, img_width, img_height)
        measurements = _calculate_measurements(elements, img_width, img_height)
        context = _build_measurement_context(measurements)

        return {
            'elements': elements,
            'measurements': measurements,
            'context': context,
            'image_dimensions': {'width': img_width, 'height': img_height}
        }
    except Exception as e:
        print(f"OCR Measurement: Error processing {image_path}: {e}")
        return None


def _tesseract_detect(image_path, img_width, img_height):
    """
    Run Tesseract OCR to detect text blocks with bounding boxes.
    Groups words into blocks and returns position data.
    """
    try:
        import pytesseract
    except ImportError:
        print("OCR Measurement: pytesseract not installed (pip install pytesseract)")
        return []

    try:
        img = Image.open(image_path)
        # Get word-level bounding boxes from Tesseract
        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

        # Group words by block + line for finer element separation
        # (Tesseract often groups date + logo into one block)
        block_groups = {}
        for i in range(len(data['text'])):
            text = data['text'][i].strip()
            conf = int(data['conf'][i])
            if not text or conf < 30:  # Skip low confidence detections
                continue

            # Use block_num + line_num as key for finer grouping
            group_key = (data['block_num'][i], data['line_num'][i])
            if group_key not in block_groups:
                block_groups[group_key] = {
                    'words': [],
                    'lefts': [],
                    'tops': [],
                    'rights': [],
                    'bottoms': [],
                    'word_heights': [],
                }

            bg = block_groups[group_key]
            bg['words'].append(text)
            left = data['left'][i]
            top = data['top'][i]
            w = data['width'][i]
            h = data['height'][i]
            bg['lefts'].append(left)
            bg['tops'].append(top)
            bg['rights'].append(left + w)
            bg['bottoms'].append(top + h)
            bg['word_heights'].append(h)

        blocks = []
        for group_key, bg in block_groups.items():
            if not bg['words']:
                continue

            block_text = ' '.join(bg['words'])
            left = min(bg['lefts'])
            top = min(bg['tops'])
            right = max(bg['rights'])
            bottom = max(bg['bottoms'])

            # Use median word height as character height (more robust than block height)
            sorted_wh = sorted(bg['word_heights'])
            char_height = sorted_wh[len(sorted_wh) // 2]

            blocks.append({
                'text': block_text,
                'left': left,
                'right': right,
                'top': top,
                'bottom': bottom,
                'width': right - left,
                'height': bottom - top,
                'char_height': char_height,
                'left_pct': round(left / img_width * 100, 1) if img_width else 0,
                'top_pct': round(top / img_height * 100, 1) if img_height else 0,
                'bottom_pct': round(bottom / img_height * 100, 1) if img_height else 0,
            })

        print(f"OCR Measurement: Tesseract detected {len(blocks)} text blocks")
        return blocks
    except Exception as e:
        print(f"OCR Measurement: Tesseract error: {e}")
        return []


def _identify_elements(blocks, img_width, img_height):
    """
    Identify which text blocks are headline, date, legal, logo.

    Strategy:
    - Headline: largest character height text in the top portion
    - Date: matches date patterns (digits + month or digit-digit)
    - Legal: smallest text, bottom area
    - Logo: contains "amazon" or "prime", bottom area
    """
    if not blocks:
        return {}

    elements = {}
    remaining = list(blocks)

    # 1. Find headline blocks: largest char_height text in top 70%
    top_blocks = [b for b in remaining if b['top_pct'] < 70]
    if top_blocks:
        max_char_height = max(b['char_height'] for b in top_blocks)

        # Headline blocks are those with char_height >= 70% of the max
        headline_blocks = sorted(
            [b for b in top_blocks if b['char_height'] >= max_char_height * 0.7],
            key=lambda b: b['top']
        )

        if headline_blocks:
            all_text = ' '.join(b['text'] for b in headline_blocks)
            combined_left = min(b['left'] for b in headline_blocks)
            combined_right = max(b['right'] for b in headline_blocks)
            combined_top = min(b['top'] for b in headline_blocks)
            combined_bottom = max(b['bottom'] for b in headline_blocks)

            elements['headline'] = {
                'text': all_text,
                'left': combined_left,
                'right': combined_right,
                'top': combined_top,
                'bottom': combined_bottom,
                'char_height': max_char_height,
                'line_count': len(headline_blocks),
            }
            for b in headline_blocks:
                if b in remaining:
                    remaining.remove(b)

    # 2. Find date: contains number patterns like "8-11" or "8 – 11" with month
    date_pattern = re.compile(
        r'\d+\s*[-–—]\s*\d+|'
        r'\d+\s+(?:' + _MONTH_PATTERN + r')',
        re.IGNORECASE
    )

    for block in remaining[:]:
        if date_pattern.search(block['text']):
            elements['date'] = block
            remaining.remove(block)
            break

    # 3. Find logo/branding: contains "amazon" or "prime day"
    for block in remaining[:]:
        if re.search(r'amazon|prime\s*day', block['text'], re.IGNORECASE):
            elements['logo_text'] = block
            remaining.remove(block)
            break

    # 4. Find legal: smallest text in bottom 25%
    bottom_blocks = [b for b in remaining if b['top_pct'] > 75]
    if bottom_blocks:
        legal_block = min(bottom_blocks, key=lambda b: b['char_height'])
        elements['legal'] = legal_block
        if legal_block in remaining:
            remaining.remove(legal_block)

    elements['other'] = remaining
    return elements


def _calculate_measurements(elements, img_width, img_height):
    """Calculate precise layout measurements from identified elements."""
    m = {
        'image_width': img_width,
        'image_height': img_height,
        'format': _detect_format(img_width, img_height),
    }

    shortest_side = min(img_width, img_height)
    m['shortest_side'] = shortest_side

    headline = elements.get('headline')
    date = elements.get('date')
    legal = elements.get('legal')
    logo = elements.get('logo_text')

    if headline:
        m['headline'] = {
            'text': headline.get('text', ''),
            'left_margin_px': headline['left'],
            'left_margin_pct': _pct(headline['left'], img_width),
            'left_margin_shortest_side_pct': _pct(headline['left'], shortest_side),
            'right_margin_px': img_width - headline['right'],
            'right_margin_pct': _pct(img_width - headline['right'], img_width),
            'top_margin_px': headline['top'],
            'top_margin_pct': _pct(headline['top'], img_height),
            'top_margin_shortest_side_pct': _pct(headline['top'], shortest_side),
            'width_pct': _pct(headline['right'] - headline['left'], img_width),
            'bottom_edge_px': headline['bottom'],
            'bottom_edge_pct': _pct(headline['bottom'], img_height),
            'char_height_px': headline.get('char_height', 0),
        }

    if date:
        m['date'] = {
            'text': date.get('text', ''),
            'left_margin_px': date['left'],
            'left_margin_pct': _pct(date['left'], img_width),
            'left_margin_shortest_side_pct': _pct(date['left'], shortest_side),
            'top_px': date['top'],
            'top_pct': _pct(date['top'], img_height),
            'char_height_px': date.get('char_height', 0),
        }

    if logo:
        m['logo_text'] = {
            'text': logo.get('text', ''),
            'left_margin_px': logo['left'],
            'left_margin_pct': _pct(logo['left'], img_width),
        }

    if legal:
        m['legal'] = {
            'text': legal.get('text', ''),
            'left_margin_px': legal['left'],
            'left_margin_pct': _pct(legal['left'], img_width),
            'char_height_px': legal.get('char_height', 0),
        }

    # Headline-to-date gap
    if headline and date:
        gap_px = date['top'] - headline['bottom']
        m['headline_to_date_gap'] = {
            'gap_px': gap_px,
            'gap_pct': _pct(gap_px, img_height),
        }

        # Date-to-headline size ratio
        h_char = headline.get('char_height', 0)
        d_char = date.get('char_height', 0)
        if h_char > 0:
            m['date_to_headline_ratio_pct'] = round(d_char / h_char * 100, 1)

    # Left alignment consistency
    left_margins = {}
    if headline:
        left_margins['headline'] = headline['left']
    if date:
        left_margins['date'] = date['left']
    if logo:
        left_margins['logo_text'] = logo['left']

    if len(left_margins) >= 2:
        values = list(left_margins.values())
        max_diff_px = max(values) - min(values)
        max_diff_pct = _pct(max_diff_px, img_width)

        m['left_alignment'] = {
            'elements': {k: {'px': v, 'pct': _pct(v, img_width)} for k, v in left_margins.items()},
            'max_difference_px': max_diff_px,
            'max_difference_pct': max_diff_pct,
            'aligned': max_diff_pct <= 3.0,
        }

    return m


def _build_measurement_context(measurements):
    """Build formatted text context for injection into QC check prompts."""
    shortest_side = measurements.get('shortest_side', min(measurements['image_width'], measurements['image_height']))

    lines = [
        "=== OCR LAYOUT MEASUREMENTS (computed from pixel-level analysis — NOT visual estimation) ===",
        f"Image: {measurements['image_width']}px x {measurements['image_height']}px ({measurements.get('format', 'unknown')} format)",
        f"Shortest side: {shortest_side}px (Amazon guideline: margins should be ~7% of shortest side = ~{round(shortest_side * 0.07)}px)",
        "",
    ]

    headline = measurements.get('headline')
    if headline:
        lines.append(f"HEADLINE: \"{headline['text']}\"")
        lines.append(f"  Left margin: {headline['left_margin_px']}px ({headline.get('left_margin_shortest_side_pct', headline['left_margin_pct'])}% of shortest side, {headline['left_margin_pct']}% of width)")
        lines.append(f"  Top margin: {headline['top_margin_px']}px ({headline.get('top_margin_shortest_side_pct', headline['top_margin_pct'])}% of shortest side)")
        lines.append(f"  Right margin: {headline['right_margin_px']}px ({headline['right_margin_pct']}% of width)")
        lines.append(f"  Headline width: {headline['width_pct']}% of asset width")
        lines.append(f"  Character height: {headline['char_height_px']}px")
        lines.append("")

    date = measurements.get('date')
    if date:
        lines.append(f"DATE: \"{date['text']}\"")
        lines.append(f"  Left margin: {date['left_margin_px']}px ({date.get('left_margin_shortest_side_pct', date['left_margin_pct'])}% of shortest side, {date['left_margin_pct']}% of width)")
        lines.append(f"  Character height: {date['char_height_px']}px")
        lines.append("")

    logo = measurements.get('logo_text')
    if logo:
        lines.append(f"LOGO/BRANDING: \"{logo['text']}\"")
        lines.append(f"  Left margin: {logo['left_margin_px']}px ({logo['left_margin_pct']}% of width)")
        lines.append("")

    legal = measurements.get('legal')
    if legal:
        lines.append(f"LEGAL: \"{legal['text']}\"")
        lines.append(f"  Left margin: {legal['left_margin_px']}px ({legal['left_margin_pct']}% of width)")
        lines.append("")

    gap = measurements.get('headline_to_date_gap')
    if gap:
        lines.append(f"HEADLINE-TO-DATE SPACING: {gap['gap_px']}px ({gap['gap_pct']}% of image height)")

    ratio = measurements.get('date_to_headline_ratio_pct')
    if ratio is not None:
        lines.append(f"DATE-TO-HEADLINE SIZE RATIO: {ratio}%")

    lines.append("")

    alignment = measurements.get('left_alignment')
    if alignment:
        lines.append("LEFT ALIGNMENT COMPARISON:")
        for name, pos in alignment['elements'].items():
            lines.append(f"  {name}: {pos['px']}px from left edge ({pos['pct']}% of width)")
        lines.append(f"  Max difference: {alignment['max_difference_px']}px ({alignment['max_difference_pct']}% of width)")
        if alignment['aligned']:
            lines.append("  Status: ALIGNED (within tolerance)")
        else:
            lines.append("  Status: MISALIGNED — elements do NOT share the same left margin")
        lines.append("")

    lines.append("NOTE: These OCR measurements are approximate and should be used as supplementary data")
    lines.append("alongside your visual assessment. OCR bounding boxes can have small inaccuracies (~5-10px).")
    lines.append("Use them to confirm or question your visual impression, but do NOT fail an asset solely")
    lines.append("based on OCR numbers if the layout looks visually correct and well-composed.")
    lines.append("=== END OCR MEASUREMENTS ===")

    return "\n".join(lines)


def _pct(value, total):
    """Calculate percentage, rounded to 1 decimal."""
    if total <= 0:
        return 0.0
    return round(value / total * 100, 1)


def _detect_format(width, height):
    """Detect image format type."""
    if height > width * 1.3:
        return 'portrait/tall'
    elif width > height * 1.3:
        return 'landscape'
    else:
        return 'square'