ai_qc/backend/document_mode/formatting_diff.py
nickviljoen d21a8a276d refactor(formatting_diff): harden page_wide threshold + None-key handling
Three review-driven hardening tweaks:
- page_wide now requires ≥3 matched spans (PAGE_WIDE_MIN_SPANS).
  Avoids labelling section-break pages with a single flipped heading
  as page-wide.
- _collect_flips normalises bold/italic via bool() and font/color
  via "or ''" so callers passing dicts without those keys do not
  produce phantom flips against False/''.
- Adds tests for empty span lists and the missing-bold-key case.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 10:01:23 +02:00

177 lines
6.2 KiB
Python

"""Deterministic span-level formatting diff for one aligned page-pair.
Companion to diff_engine's vision-LLM diff. The LLM compares rendered
page images and is reliable for content/narrative changes but unreliable
for typographic flips (bold, font, size, color) on dense layouts.
This module ignores rendered pixels entirely and instead reads PyMuPDF's
extracted span metadata, matches spans across pages by exact (stripped)
text, using y-position only to disambiguate repeated text, and reports
any span whose typographic attributes flipped.
Public surface: compute_formatting_diff(old_spans, new_spans,
old_page_num, new_page_num) -> dict.
"""
from __future__ import annotations
from collections import defaultdict
from typing import Dict, List, Tuple
# Minimum length, measured after .strip(), that a span's text must have to
# take part in matching. Very short spans such as "the", "of", "1" or "."
# are too common to match reliably and would produce noise.
MIN_TEXT_LEN = 4
# Upper bound on the number of example quotes attached to each aggregated
# finding shown in the report.
MAX_QUOTES_PER_FINDING = 3
# A finding only qualifies as "page-wide" if the page has at least this
# many matched spans. Section-break pages with one or two long spans
# should not be labelled page-wide on a single flip.
PAGE_WIDE_MIN_SPANS = 3
# Size tolerance (points, per the name): two span sizes whose absolute
# difference does not exceed this value are treated as equal, so no size
# flip is reported for them.
SIZE_TOLERANCE_PT = 0.05
def compute_formatting_diff(
    old_spans: List[Dict],
    new_spans: List[Dict],
    old_page_num: int,
    new_page_num: int,
) -> Dict:
    """Diff the typographic attributes of two pages' spans.

    Spans are paired across the two pages, every attribute change on a
    pair is recorded, and identical changes are rolled up into findings.

    Args:
        old_spans: Span dicts extracted from the old page.
        new_spans: Span dicts extracted from the new page.
        old_page_num: Page number of the old page; echoed in the result.
        new_page_num: Page number of the new page; echoed in the result.

    Returns:
        A dict shaped as::

            {
                'formatting_changes': [
                    {
                        'attribute': 'bold' | 'italic' | 'font' | 'size' | 'color',
                        'old_value': str | bool | float,
                        'new_value': str | bool | float,
                        'example_quotes': [str, ...],
                        'total_span_count': int,
                        'page_wide': bool,
                    },
                    ...
                ],
                'finding_count': int,
                'severity': 'medium' | 'none',
                'old_page_num': int,
                'new_page_num': int,
            }

        'severity' is 'medium' as soon as there is at least one finding
        and 'none' otherwise.
    """
    matched = _match_spans(old_spans, new_spans)
    # The number of matched pairs is the denominator for "page-wide".
    changes = _aggregate(_collect_flips(matched), len(matched))

    if changes:
        severity = 'medium'
    else:
        severity = 'none'

    return {
        'formatting_changes': changes,
        'finding_count': len(changes),
        'severity': severity,
        'old_page_num': old_page_num,
        'new_page_num': new_page_num,
    }
def _match_spans(old_spans: List[Dict], new_spans: List[Dict]) -> List[Tuple[Dict, Dict]]:
    """Pair old-page spans with new-page spans that carry the same text.

    Text is compared after stripping; spans whose stripped text is shorter
    than MIN_TEXT_LEN never take part. Old spans are processed in order
    and each new span can be claimed at most once. When several unclaimed
    new spans share the text, the one whose vertical midpoint is closest
    to the old span's wins (earliest in page order on a tie).

    Returns:
        A list of (old_span, new_span) tuples, in old-span order.
    """
    # Still-unclaimed new spans, bucketed by stripped text in page order.
    unclaimed: Dict[str, List[Dict]] = {}
    for candidate in new_spans:
        key = (candidate.get('text') or '').strip()
        if len(key) >= MIN_TEXT_LEN:
            unclaimed.setdefault(key, []).append(candidate)

    matched: List[Tuple[Dict, Dict]] = []
    for source in old_spans:
        key = (source.get('text') or '').strip()
        if len(key) < MIN_TEXT_LEN:
            continue

        pool = unclaimed.get(key)
        if not pool:
            # Text absent from the new page, or every copy already claimed.
            continue

        if len(pool) > 1:
            def vertical_gap(other: Dict, anchor: Dict = source) -> float:
                return abs(_y_mid(other) - _y_mid(anchor))

            partner = min(pool, key=vertical_gap)
        else:
            # A lone candidate is taken as-is, without a position check.
            partner = pool[0]

        # Drop the claimed span (by identity) so it cannot be paired again.
        unclaimed[key] = [c for c in pool if c is not partner]
        matched.append((source, partner))
    return matched
def _y_mid(span: Dict) -> float:
"""Vertical midpoint of a span's bbox; 0.0 if bbox is missing."""
bbox = span.get('bbox') or (0, 0, 0, 0)
return (bbox[1] + bbox[3]) / 2.0
def _collect_flips(pairs: List[Tuple[Dict, Dict]]) -> List[Dict]:
    """Emit one flip record for every attribute that differs within a pair.

    Attributes compared, in this order per pair: 'bold', 'italic', 'font',
    'color', 'size'.

    Args:
        pairs: (old_span, new_span) tuples of span dicts.

    Returns:
        A list of dicts with keys 'attribute', 'old_value', 'new_value'
        and 'quote' (the old span's stripped text), one per changed
        attribute.

    Normalisation applied before comparing:
        * bold / italic go through bool(), so a missing key equals False.
        * font / color map only None (missing key) to ''. Other falsy
          values are kept as they are: when colours are PyMuPDF sRGB
          ints, black is 0, and collapsing it to '' would report a
          black -> red change with an empty old value.
        * size falls back to 0.0 when missing/falsy and counts as changed
          only if the difference exceeds SIZE_TOLERANCE_PT.
    """
    flips: List[Dict] = []
    for old_span, new_span in pairs:
        text = (old_span.get('text') or '').strip()

        for attr in ('bold', 'italic'):
            old_v = bool(old_span.get(attr))
            new_v = bool(new_span.get(attr))
            if old_v != new_v:
                flips.append({
                    'attribute': attr, 'old_value': old_v,
                    'new_value': new_v, 'quote': text,
                })

        for attr in ('font', 'color'):
            old_v = old_span.get(attr)
            new_v = new_span.get(attr)
            # Only a missing value becomes ''; a legitimate 0 (black) must
            # survive so the reported old/new values stay truthful.
            if old_v is None:
                old_v = ''
            if new_v is None:
                new_v = ''
            if old_v != new_v:
                flips.append({
                    'attribute': attr, 'old_value': old_v,
                    'new_value': new_v, 'quote': text,
                })

        old_size = float(old_span.get('size') or 0.0)
        new_size = float(new_span.get('size') or 0.0)
        if abs(old_size - new_size) > SIZE_TOLERANCE_PT:
            flips.append({
                'attribute': 'size', 'old_value': round(old_size, 2),
                'new_value': round(new_size, 2), 'quote': text,
            })
    return flips
def _aggregate(flips: List[Dict], matched_span_count: int) -> List[Dict]:
    """Roll flip records up into one finding per distinct change.

    Flips sharing the same (attribute, old_value, new_value) form a group.
    Each group yields a finding carrying up to MAX_QUOTES_PER_FINDING
    example quotes, the group size, and a 'page_wide' flag that is True
    only when the page has at least PAGE_WIDE_MIN_SPANS matched spans and
    the group size equals that matched-span count.

    Returns:
        Findings ordered by descending 'total_span_count'; groups of equal
        size keep the order in which they were first seen.
    """
    buckets: Dict[Tuple, List[Dict]] = {}
    for flip in flips:
        signature = (
            flip['attribute'],
            _hashable(flip['old_value']),
            _hashable(flip['new_value']),
        )
        buckets.setdefault(signature, []).append(flip)

    report: List[Dict] = []
    for signature, group in buckets.items():
        representative = group[0]
        hits = len(group)
        report.append({
            'attribute': signature[0],
            # Report the raw values, not the hashable stand-ins in the key.
            'old_value': representative['old_value'],
            'new_value': representative['new_value'],
            'example_quotes': [
                entry['quote'] for entry in group[:MAX_QUOTES_PER_FINDING]
            ],
            'total_span_count': hits,
            'page_wide': (
                matched_span_count >= PAGE_WIDE_MIN_SPANS
                and hits == matched_span_count
            ),
        })

    # list.sort is stable even with reverse=True, so ties keep their order.
    report.sort(key=lambda item: item['total_span_count'], reverse=True)
    return report
def _hashable(v):
"""Coerce a value to a hashable form for groupby keys (floats already are)."""
if isinstance(v, (str, int, float, bool)) or v is None:
return v
return str(v)