# ai_qc/backend/document_mode/formatting_diff.py

"""Deterministic span-level formatting diff for one aligned page-pair.

Companion to diff_engine's vision-LLM diff. The LLM is reliable for
content/narrative changes (added paragraphs, rewords, layout shifts,
colour-scheme rebrands) but unreliable for bold/italic flips on dense
typeset layouts — which is exactly what AXA flagged was being missed.

Scope intentionally narrow: bold + italic flips only. Font / size /
colour are NOT compared here. A re-export from a different toolchain
(or a brand refresh) routinely flips font names and colour values on
every page; reporting those as per-page deductions drowns out the
bold/italic regressions clients actually need to spot. The LLM
narrates those rebrand changes already.

Public surface: compute_formatting_diff(old_spans, new_spans,
old_page_num, new_page_num) -> dict.
"""

from __future__ import annotations

from collections import defaultdict
from typing import Dict, List, Tuple

# Minimum length (after .strip()) a span's text must have to take part in
# matching; shorter spans are skipped on both the old and the new page.
# "the", "of", "1", "." are too common to match reliably and would
# produce noise.
MIN_TEXT_LEN = 4

# Maximum number of example quotes kept per aggregated finding (the first
# ones encountered are used).
MAX_QUOTES_PER_FINDING = 3

# A finding only qualifies as "page-wide" if the page has at least this
# many matched spans, so that the statement is meaningful. Section-break
# pages with one or two long spans should not be labelled page-wide on a
# single flip.
PAGE_WIDE_MIN_SPANS = 3


def compute_formatting_diff(
    old_spans: List[Dict],
    new_spans: List[Dict],
    old_page_num: int,
    new_page_num: int,
) -> Dict:
    """Diff bold/italic formatting between the spans of an aligned page-pair.

    Only bold and italic flips are reported. Font, size and colour
    differences are deliberately left to the vision-LLM's narrative diff,
    since flagging them here would bury the bold/italic regressions.

    The pipeline is: pair spans across the two pages, record every
    bold/italic flip on the paired spans, then collapse the flips into one
    finding per (attribute, old value, new value) combination.

    Args:
        old_spans: span dicts of the old page.
        new_spans: span dicts of the new page.
        old_page_num: page number of the old page; echoed in the result.
        new_page_num: page number of the new page; echoed in the result.

    Returns:
        {
            'formatting_changes': [
                {
                    'attribute': 'bold' | 'italic',
                    'old_value': bool,
                    'new_value': bool,
                    'example_quotes': [str, ...],
                    'total_span_count': int,
                    'page_wide': bool,
                },
                ...
            ],
            'finding_count': int,
            'severity': 'medium' | 'none',
            'old_page_num': int,
            'new_page_num': int,
        }
    """
    matched_pairs = _match_spans(old_spans, new_spans)
    # The number of matched pairs is the denominator for the page-wide check.
    changes = _aggregate(_collect_flips(matched_pairs), len(matched_pairs))

    report: Dict = {'formatting_changes': changes}
    report['finding_count'] = len(changes)
    # Any finding at all is reported as 'medium'; an empty list means 'none'.
    report['severity'] = 'none' if not changes else 'medium'
    report['old_page_num'] = old_page_num
    report['new_page_num'] = new_page_num
    return report


def _match_spans(old_spans: List[Dict], new_spans: List[Dict]) -> List[Tuple[Dict, Dict]]:
    """Pair old-page spans with new-page spans that carry identical text.

    Text is compared after stripping whitespace; spans whose stripped text
    is shorter than MIN_TEXT_LEN are ignored on both sides. Old spans are
    processed in order, and each new span can be paired at most once. When
    several unused new spans share the text, the one whose vertical
    midpoint is closest to the old span's wins (first one on ties).

    Returns:
        List of (old_span, new_span) tuples, in old-span order. Old spans
        without an available counterpart are omitted.
    """
    # Index the new page's eligible spans by their stripped text.
    index: Dict[str, List[Dict]] = defaultdict(list)
    for span in new_spans:
        key = (span.get('text') or '').strip()
        if len(key) >= MIN_TEXT_LEN:
            index[key].append(span)

    matched: List[Tuple[Dict, Dict]] = []
    # Spans are plain dicts (unhashable), so "already paired" is tracked by id().
    used_ids: set = set()
    for old in old_spans:
        key = (old.get('text') or '').strip()
        if len(key) < MIN_TEXT_LEN:
            continue

        available = [cand for cand in index.get(key, ()) if id(cand) not in used_ids]
        if not available:
            continue

        if len(available) > 1:
            # Repeated text on the page: pick the vertically nearest occurrence.
            best = min(available, key=lambda cand: abs(_y_mid(cand) - _y_mid(old)))
        else:
            best = available[0]

        used_ids.add(id(best))
        matched.append((old, best))

    return matched


def _y_mid(span: Dict) -> float:
    """Vertical midpoint of a span's bbox; 0.0 if bbox is missing."""
    bbox = span.get('bbox') or (0, 0, 0, 0)
    return (bbox[1] + bbox[3]) / 2.0


def _collect_flips(pairs: List[Tuple[Dict, Dict]]) -> List[Dict]:
    """For each paired span, emit a flip record per bold/italic change."""
    flips: List[Dict] = []
    for old_span, new_span in pairs:
        text = (old_span.get('text') or '').strip()
        for attr in ('bold', 'italic'):
            old_v = bool(old_span.get(attr))
            new_v = bool(new_span.get(attr))
            if old_v != new_v:
                flips.append({
                    'attribute': attr, 'old_value': old_v,
                    'new_value': new_v, 'quote': text,
                })
    return flips


def _aggregate(flips: List[Dict], matched_span_count: int) -> List[Dict]:
    """Collapse flip records into one finding per (attribute, old, new) group.

    Each finding carries the group's attribute and old/new values, up to
    MAX_QUOTES_PER_FINDING example quotes (first encountered), the number
    of flips in the group, and a 'page_wide' flag. The flag is set only
    when the page has at least PAGE_WIDE_MIN_SPANS matched spans and every
    one of them flipped this way.

    Returns:
        Findings ordered by descending 'total_span_count'; groups of equal
        size keep the order in which they were first seen.
    """
    buckets: Dict[Tuple, List[Dict]] = defaultdict(list)
    for flip in flips:
        bucket_key = (
            flip['attribute'],
            _hashable(flip['old_value']),
            _hashable(flip['new_value']),
        )
        buckets[bucket_key].append(flip)

    results: List[Dict] = []
    for bucket_key, group in buckets.items():
        first = group[0]
        size = len(group)
        results.append({
            'attribute': bucket_key[0],
            # Take the values from a member rather than from the coerced key.
            'old_value': first['old_value'],
            'new_value': first['new_value'],
            'example_quotes': [item['quote'] for item in group[:MAX_QUOTES_PER_FINDING]],
            'total_span_count': size,
            'page_wide': (
                matched_span_count >= PAGE_WIDE_MIN_SPANS
                and size == matched_span_count
            ),
        })

    # The sort is stable, so ties retain insertion order.
    return sorted(results, key=lambda finding: -finding['total_span_count'])


def _hashable(v):
    """Coerce a value to a hashable form for groupby keys (floats already are)."""
    if isinstance(v, (str, int, float, bool)) or v is None:
        return v
    return str(v)