ai_qc/backend/document_mode/formatting_diff.py
nickviljoen 29ee941037 refactor(formatting_diff): narrow scope to bold + italic only
First real-data test against the AXA car-insurance PDFs surfaced a
noise problem: the new document is a brand refresh — every page flips
font (PublicoBanner-Bold→PublicoHeadline-Bold) and colour
(#893f4a→#2e3092). With every finding scored at medium severity, that
crashed the diff score to 0.0 and drowned the bold-regression signal
AXA actually flagged.

Drop font, size, colour comparators. Keep bold + italic — the
attributes the vision-LLM consistently misses on dense layouts. The
LLM already narrates colour-scheme rebrands and font swaps in its
Modified / Style-changes blocks; running both layers on the same
visual change just double-counts it.

Tests inverted from "X change is flagged" to "X change is NOT
flagged" to lock the scope decision in.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 12:37:19 +02:00

168 lines
5.9 KiB
Python

"""Deterministic span-level formatting diff for one aligned page-pair.
Companion to diff_engine's vision-LLM diff. The LLM is reliable for
content/narrative changes (added paragraphs, rewords, layout shifts,
colour-scheme rebrands) but unreliable for bold/italic flips on dense
typeset layouts — which is exactly what AXA flagged was being missed.
Scope intentionally narrow: bold + italic flips only. Font / size /
colour are NOT compared here. A re-export from a different toolchain
(or a brand refresh) routinely flips font names and colour values on
every page; reporting those as per-page deductions drowns out the
bold/italic regressions clients actually need to spot. The LLM
narrates those rebrand changes already.
Public surface: compute_formatting_diff(old_spans, new_spans,
old_page_num, new_page_num) -> dict.
"""
from __future__ import annotations
from collections import defaultdict
from typing import Dict, List, Tuple
# Minimum length (after .strip()) a span's text must have to take part in
# matching. Shorter spans such as "the", "of", "1", "." are too common to
# match reliably and would produce noise, so they are skipped.
MIN_TEXT_LEN = 4
# Upper bound on the number of example quotes attached to each aggregated
# finding shown in the report.
MAX_QUOTES_PER_FINDING = 3
# A finding only qualifies as "page-wide" if the page has at least this
# many matched spans, so that the statement is meaningful. Section-break
# pages with one or two long spans should not be labelled page-wide on a
# single flip.
PAGE_WIDE_MIN_SPANS = 3
def compute_formatting_diff(
    old_spans: List[Dict],
    new_spans: List[Dict],
    old_page_num: int,
    new_page_num: int,
) -> Dict:
    """Report bold/italic flips between the spans of two aligned pages.

    Only bold and italic are compared. Font, size and colour differences
    (rebrands, re-exports from another toolchain) are left to the
    vision-LLM's narrative diff so they do not drown out the bold/italic
    regressions this layer exists to catch.

    Args:
        old_spans: Span dicts for the old page.
        new_spans: Span dicts for the new page.
        old_page_num: Page number of the old page; echoed in the result.
        new_page_num: Page number of the new page; echoed in the result.

    Returns:
        A dict shaped as:
        {
            'formatting_changes': [
                {
                    'attribute': 'bold' | 'italic',
                    'old_value': bool,
                    'new_value': bool,
                    'example_quotes': [str, ...],
                    'total_span_count': int,
                    'page_wide': bool,
                },
                ...
            ],
            'finding_count': int,
            'severity': 'medium' | 'none',
            'old_page_num': int,
            'new_page_num': int,
        }
        'severity' is 'medium' whenever at least one finding exists and
        'none' otherwise.
    """
    # Pipeline: pair spans -> detect per-span flips -> group into findings.
    matched_pairs = _match_spans(old_spans, new_spans)
    changes = _aggregate(_collect_flips(matched_pairs), len(matched_pairs))

    severity = 'none'
    if changes:
        severity = 'medium'

    return {
        'formatting_changes': changes,
        'finding_count': len(changes),
        'severity': severity,
        'old_page_num': old_page_num,
        'new_page_num': new_page_num,
    }
def _match_spans(old_spans: List[Dict], new_spans: List[Dict]) -> List[Tuple[Dict, Dict]]:
    """Pair old and new spans whose stripped text is identical.

    Spans whose stripped text is shorter than MIN_TEXT_LEN are ignored on
    both sides. Old spans are processed in order and each new span can be
    paired at most once. When several unused new spans share the same
    text, the one whose vertical midpoint is closest to the old span's is
    taken.

    Returns:
        A list of (old_span, new_span) tuples, in old-span order.
    """
    def _clean_text(span: Dict) -> str:
        # A missing or None 'text' is treated as the empty string.
        return (span.get('text') or '').strip()

    # Index the new page's usable spans by their stripped text.
    pool: Dict[str, List[Dict]] = defaultdict(list)
    for candidate in new_spans:
        label = _clean_text(candidate)
        if len(label) >= MIN_TEXT_LEN:
            pool[label].append(candidate)

    # Identity (not equality) tracks which new spans are already paired,
    # so two distinct spans with equal content stay distinguishable.
    taken: set = set()
    matched: List[Tuple[Dict, Dict]] = []
    for source in old_spans:
        label = _clean_text(source)
        if len(label) < MIN_TEXT_LEN:
            continue
        free = [c for c in pool.get(label, []) if id(c) not in taken]
        if not free:
            continue
        if len(free) > 1:
            # Repeated text on the page: prefer the vertically nearest copy.
            best = min(free, key=lambda c: abs(_y_mid(c) - _y_mid(source)))
        else:
            best = free[0]
        taken.add(id(best))
        matched.append((source, best))
    return matched
def _y_mid(span: Dict) -> float:
"""Vertical midpoint of a span's bbox; 0.0 if bbox is missing."""
bbox = span.get('bbox') or (0, 0, 0, 0)
return (bbox[1] + bbox[3]) / 2.0
def _collect_flips(pairs: List[Tuple[Dict, Dict]]) -> List[Dict]:
"""For each paired span, emit a flip record per bold/italic change."""
flips: List[Dict] = []
for old_span, new_span in pairs:
text = (old_span.get('text') or '').strip()
for attr in ('bold', 'italic'):
old_v = bool(old_span.get(attr))
new_v = bool(new_span.get(attr))
if old_v != new_v:
flips.append({
'attribute': attr, 'old_value': old_v,
'new_value': new_v, 'quote': text,
})
return flips
def _aggregate(flips: List[Dict], matched_span_count: int) -> List[Dict]:
    """Collapse flip records into one finding per (attribute, old, new).

    Each finding reports the attribute, the old and new values, up to
    MAX_QUOTES_PER_FINDING example quotes, the number of flips in the
    group, and whether the change is page-wide. A finding is page-wide
    when the group size equals matched_span_count and the page has at
    least PAGE_WIDE_MIN_SPANS matched spans.

    Returns:
        Findings ordered by 'total_span_count', largest first. Groups of
        equal size keep the order in which they were first seen.
    """
    buckets: Dict[Tuple, List[Dict]] = defaultdict(list)
    for flip in flips:
        bucket_key = (
            flip['attribute'],
            _hashable(flip['old_value']),
            _hashable(flip['new_value']),
        )
        buckets[bucket_key].append(flip)

    summaries: List[Dict] = []
    for bucket_key, bucket in buckets.items():
        # Report the values from the first member rather than the key, so
        # they are not the coerced forms produced by _hashable.
        first = bucket[0]
        size = len(bucket)
        covers_page = (
            matched_span_count >= PAGE_WIDE_MIN_SPANS
            and size == matched_span_count
        )
        summaries.append({
            'attribute': bucket_key[0],
            'old_value': first['old_value'],
            'new_value': first['new_value'],
            'example_quotes': [
                entry['quote'] for entry in bucket[:MAX_QUOTES_PER_FINDING]
            ],
            'total_span_count': size,
            'page_wide': covers_page,
        })

    # A reverse sort is stable, so equally sized groups keep their order.
    return sorted(summaries, key=lambda s: s['total_span_count'], reverse=True)
def _hashable(v):
"""Coerce a value to a hashable form for groupby keys (floats already are)."""
if isinstance(v, (str, int, float, bool)) or v is None:
return v
return str(v)