# ai_qc/backend/document_mode/formatting_diff.py

"""Deterministic span-level formatting diff for one aligned page-pair.

Companion to diff_engine's vision-LLM diff. The LLM compares rendered
page images and is reliable for content/narrative changes but unreliable
for typographic flips (bold, font, size, color) on dense layouts.

This module ignores rendered pixels entirely and instead reads PyMuPDF's
extracted span metadata, matches spans across pages by exact text +
y-position, and reports any span whose typographic attributes flipped.

Public surface: compute_formatting_diff(old_spans, new_spans,
old_page_num, new_page_num) -> dict.
"""

from __future__ import annotations

from collections import defaultdict
from typing import Dict, List, Tuple

# Spans whose text is shorter than this (after .strip()) are ignored when
# matching. "the", "of", "1", "." are too common to match reliably and would
# produce noise.
MIN_TEXT_LEN = 4

# Maximum number of example quotes included in each aggregated finding.
MAX_QUOTES_PER_FINDING = 3

# A finding only qualifies as "page-wide" if the page has at least this many
# matched spans, so that the statement is meaningful. Section-break pages
# with one or two long spans should not be labelled page-wide on a single
# flip.
PAGE_WIDE_MIN_SPANS = 3

# Per-attribute equality tolerances.
# Two span sizes whose absolute difference does not exceed this value are
# treated as unchanged.
SIZE_TOLERANCE_PT = 0.05


def compute_formatting_diff(
    old_spans: List[Dict],
    new_spans: List[Dict],
    old_page_num: int,
    new_page_num: int,
) -> Dict:
    """Diff the typographic attributes of one old/new page pair.

    The spans of both pages are paired up, every pair is checked for
    attribute flips, and identical flips are rolled up into findings.

    Args:
        old_spans: Span dicts extracted from the old page.
        new_spans: Span dicts extracted from the new page.
        old_page_num: Page number of the old page; echoed back unchanged.
        new_page_num: Page number of the new page; echoed back unchanged.

    Returns:
        {
            'formatting_changes': [
                {
                    'attribute': 'bold' | 'italic' | 'font' | 'size' | 'color',
                    'old_value': str | bool | float,
                    'new_value': str | bool | float,
                    'example_quotes': [str, ...],
                    'total_span_count': int,
                    'page_wide': bool,
                },
                ...
            ],
            'finding_count': int,
            'severity': 'medium' | 'none',
            'old_page_num': int,
            'new_page_num': int,
        }
    """
    matched = _match_spans(old_spans, new_spans)
    changes = _aggregate(_collect_flips(matched), len(matched))

    # Any finding at all is reported as 'medium'; an empty list is 'none'.
    result: Dict = {'formatting_changes': changes}
    result['finding_count'] = len(changes)
    result['severity'] = 'none' if not changes else 'medium'
    result['old_page_num'] = old_page_num
    result['new_page_num'] = new_page_num
    return result


def _match_spans(old_spans: List[Dict], new_spans: List[Dict]) -> List[Tuple[Dict, Dict]]:
    """Pair spans across pages by exact-text match, disambiguated by y-position.

    Spans with fewer than MIN_TEXT_LEN chars after stripping are skipped.
    Returns a list of (old_span, new_span) tuples.
    """
    new_by_text: Dict[str, List[Dict]] = defaultdict(list)
    for s in new_spans:
        text = (s.get('text') or '').strip()
        if len(text) < MIN_TEXT_LEN:
            continue
        new_by_text[text].append(s)

    pairs: List[Tuple[Dict, Dict]] = []
    consumed: set = set()
    for old_span in old_spans:
        text = (old_span.get('text') or '').strip()
        if len(text) < MIN_TEXT_LEN:
            continue
        candidates = [c for c in new_by_text.get(text, []) if id(c) not in consumed]
        if not candidates:
            continue
        if len(candidates) == 1:
            chosen = candidates[0]
        else:
            chosen = min(candidates, key=lambda c: abs(_y_mid(c) - _y_mid(old_span)))
        consumed.add(id(chosen))
        pairs.append((old_span, chosen))

    return pairs


def _y_mid(span: Dict) -> float:
    """Vertical midpoint of a span's bbox; 0.0 if bbox is missing."""
    bbox = span.get('bbox') or (0, 0, 0, 0)
    return (bbox[1] + bbox[3]) / 2.0


def _collect_flips(pairs: List[Tuple[Dict, Dict]]) -> List[Dict]:
    """For each paired span, check each attribute and emit a flip record per change."""
    flips: List[Dict] = []
    for old_span, new_span in pairs:
        text = (old_span.get('text') or '').strip()
        for attr in ('bold', 'italic'):
            old_v = bool(old_span.get(attr))
            new_v = bool(new_span.get(attr))
            if old_v != new_v:
                flips.append({
                    'attribute': attr, 'old_value': old_v,
                    'new_value': new_v, 'quote': text,
                })
        for attr in ('font', 'color'):
            old_v = old_span.get(attr) or ''
            new_v = new_span.get(attr) or ''
            if old_v != new_v:
                flips.append({
                    'attribute': attr, 'old_value': old_v,
                    'new_value': new_v, 'quote': text,
                })
        old_size = float(old_span.get('size') or 0.0)
        new_size = float(new_span.get('size') or 0.0)
        if abs(old_size - new_size) > SIZE_TOLERANCE_PT:
            flips.append({
                'attribute': 'size', 'old_value': round(old_size, 2),
                'new_value': round(new_size, 2), 'quote': text,
            })
    return flips


def _aggregate(flips: List[Dict], matched_span_count: int) -> List[Dict]:
    """Group flips by (attribute, old_value, new_value) and emit one finding per group."""
    groups: Dict[Tuple, List[Dict]] = defaultdict(list)
    for f in flips:
        key = (f['attribute'], _hashable(f['old_value']), _hashable(f['new_value']))
        groups[key].append(f)

    findings: List[Dict] = []
    for (attribute, _, _), members in groups.items():
        old_v = members[0]['old_value']
        new_v = members[0]['new_value']
        quotes = [m['quote'] for m in members[:MAX_QUOTES_PER_FINDING]]
        total = len(members)
        page_wide = matched_span_count >= PAGE_WIDE_MIN_SPANS and total == matched_span_count
        findings.append({
            'attribute': attribute,
            'old_value': old_v,
            'new_value': new_v,
            'example_quotes': quotes,
            'total_span_count': total,
            'page_wide': page_wide,
        })

    findings.sort(key=lambda f: -f['total_span_count'])
    return findings


def _hashable(v):
    """Coerce a value to a hashable form for groupby keys (floats already are)."""
    if isinstance(v, (str, int, float, bool)) or v is None:
        return v
    return str(v)