First real-data test against the AXA car-insurance PDFs surfaced a noise problem: the new document is a brand refresh — every page flips font (PublicoBanner-Bold→PublicoHeadline-Bold) and colour (#893f4a→#2e3092). At medium-per-finding that crashed the diff score to 0.0 and drowned the bold-regression signal AXA actually flagged. Drop font, size, colour comparators. Keep bold + italic — the attributes the vision-LLM consistently misses on dense layouts. The LLM already narrates colour-scheme rebrands and font swaps in its Modified / Style-changes blocks; running both layers on the same visual change just double-counts it. Tests inverted from "X change is flagged" to "X change is NOT flagged" to lock the scope decision in. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
168 lines
5.9 KiB
Python
168 lines
5.9 KiB
Python
"""Deterministic span-level formatting diff for one aligned page-pair.

Companion to diff_engine's vision-LLM diff. The LLM is reliable for
content/narrative changes (added paragraphs, rewords, layout shifts,
colour-scheme rebrands) but unreliable for bold/italic flips on dense
typeset layouts — which is exactly what AXA flagged was being missed.

Scope intentionally narrow: bold + italic flips only. Font / size /
colour are NOT compared here. A re-export from a different toolchain
(or a brand refresh) routinely flips font names and colour values on
every page; reporting those as per-page deductions drowns out the
bold/italic regressions clients actually need to spot. The LLM
narrates those rebrand changes already.

Public surface: compute_formatting_diff(old_spans, new_spans,
old_page_num, new_page_num) -> dict.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Tuple
|
|
|
|
# Minimum length (measured after .strip()) a span's text needs in order to
# take part in matching. Very short fragments such as "the", "of", "1" or
# "." recur too often to be paired reliably and would only add noise.
MIN_TEXT_LEN = 4

# Upper bound on the example quotes each aggregated finding carries into
# the report.
MAX_QUOTES_PER_FINDING = 3

# Fewest matched spans a page must have before a finding may be labelled
# "page-wide". A section-break page holding only one or two long spans
# must not earn that label from a single flip.
PAGE_WIDE_MIN_SPANS = 3
|
|
|
|
|
|
def compute_formatting_diff(
    old_spans: List[Dict],
    new_spans: List[Dict],
    old_page_num: int,
    new_page_num: int,
) -> Dict:
    """Report bold/italic flips between the spans of one aligned page-pair.

    Only bold and italic are compared. Font, size and colour differences
    (rebrands, re-exports from another toolchain) are deliberately left to
    the vision-LLM's narrative diff; flagging them here as well would bury
    the bold/italic regressions this module exists to surface.

    Args:
        old_spans: Span dicts taken from the old page.
        new_spans: Span dicts taken from the new page.
        old_page_num: Page number of the old page; echoed back unchanged.
        new_page_num: Page number of the new page; echoed back unchanged.

    Returns:
        A dict with these keys:

        * ``'formatting_changes'`` - list of findings, each a dict with
          ``'attribute'`` (``'bold'`` or ``'italic'``), ``'old_value'``,
          ``'new_value'``, ``'example_quotes'`` (list of str),
          ``'total_span_count'`` (int) and ``'page_wide'`` (bool).
        * ``'finding_count'`` - number of entries in that list.
        * ``'severity'`` - ``'medium'`` when there is at least one
          finding, otherwise ``'none'``.
        * ``'old_page_num'`` / ``'new_page_num'`` - the page numbers
          passed in.
    """
    matched = _match_spans(old_spans, new_spans)
    # The matched-pair count is what _aggregate uses to decide whether a
    # finding covers the whole page.
    changes = _aggregate(_collect_flips(matched), len(matched))

    severity = 'none'
    if changes:
        severity = 'medium'

    report = {
        'formatting_changes': changes,
        'finding_count': len(changes),
        'severity': severity,
        'old_page_num': old_page_num,
        'new_page_num': new_page_num,
    }
    return report
|
|
|
|
|
|
def _match_spans(old_spans: List[Dict], new_spans: List[Dict]) -> List[Tuple[Dict, Dict]]:
    """Pair old and new spans whose stripped text is identical.

    Old spans are processed in order and each one greedily claims a
    not-yet-claimed new span with the same stripped text. When several
    such spans are still available, the one whose vertical midpoint lies
    closest to the old span's midpoint wins (the earliest in ``new_spans``
    on a tie). A new span is claimed at most once. Spans whose stripped
    text is shorter than MIN_TEXT_LEN never take part.

    Returns:
        A list of ``(old_span, new_span)`` tuples, in ``old_spans`` order.
    """
    # Index the eligible new spans by their stripped text.
    pool: Dict[str, List[Dict]] = {}
    for fresh in new_spans:
        key = (fresh.get('text') or '').strip()
        if len(key) >= MIN_TEXT_LEN:
            pool.setdefault(key, []).append(fresh)

    # Claimed spans are tracked by object identity: span dicts are not
    # hashable, and two distinct spans may well compare equal.
    claimed: set = set()
    matched: List[Tuple[Dict, Dict]] = []
    for old_span in old_spans:
        key = (old_span.get('text') or '').strip()
        if len(key) < MIN_TEXT_LEN:
            continue

        available = [c for c in pool.get(key, ()) if id(c) not in claimed]
        if not available:
            continue

        if len(available) > 1:
            # Several identical texts on the page: prefer the vertically
            # nearest one.
            best = min(available, key=lambda c: abs(_y_mid(c) - _y_mid(old_span)))
        else:
            best = available[0]

        claimed.add(id(best))
        matched.append((old_span, best))

    return matched
|
|
|
|
|
|
def _y_mid(span: Dict) -> float:
|
|
"""Vertical midpoint of a span's bbox; 0.0 if bbox is missing."""
|
|
bbox = span.get('bbox') or (0, 0, 0, 0)
|
|
return (bbox[1] + bbox[3]) / 2.0
|
|
|
|
|
|
def _collect_flips(pairs: List[Tuple[Dict, Dict]]) -> List[Dict]:
|
|
"""For each paired span, emit a flip record per bold/italic change."""
|
|
flips: List[Dict] = []
|
|
for old_span, new_span in pairs:
|
|
text = (old_span.get('text') or '').strip()
|
|
for attr in ('bold', 'italic'):
|
|
old_v = bool(old_span.get(attr))
|
|
new_v = bool(new_span.get(attr))
|
|
if old_v != new_v:
|
|
flips.append({
|
|
'attribute': attr, 'old_value': old_v,
|
|
'new_value': new_v, 'quote': text,
|
|
})
|
|
return flips
|
|
|
|
|
|
def _aggregate(flips: List[Dict], matched_span_count: int) -> List[Dict]:
    """Collapse flip records into one finding per (attribute, old, new) group.

    Args:
        flips: Flip records with keys ``'attribute'``, ``'old_value'``,
            ``'new_value'`` and ``'quote'``.
        matched_span_count: Number of span pairs that were matched on the
            page; used only for the page-wide decision.

    Returns:
        One finding dict per group, largest group first (groups of equal
        size keep first-seen order). Each finding carries at most
        MAX_QUOTES_PER_FINDING example quotes. ``'page_wide'`` is true
        only when the page has at least PAGE_WIDE_MIN_SPANS matched spans
        and the group's size equals the matched-span count.
    """
    buckets: Dict[Tuple, List[Dict]] = {}
    for flip in flips:
        bucket_key = (
            flip['attribute'],
            _hashable(flip['old_value']),
            _hashable(flip['new_value']),
        )
        buckets.setdefault(bucket_key, []).append(flip)

    # Identical for every group, so evaluate it once up front.
    enough_spans = matched_span_count >= PAGE_WIDE_MIN_SPANS

    results: List[Dict] = []
    for bucket_key, bucket in buckets.items():
        first = bucket[0]
        size = len(bucket)
        results.append({
            'attribute': bucket_key[0],
            # Report the raw values from the first member, not the
            # hashable stand-ins used for grouping.
            'old_value': first['old_value'],
            'new_value': first['new_value'],
            'example_quotes': [entry['quote'] for entry in bucket[:MAX_QUOTES_PER_FINDING]],
            'total_span_count': size,
            'page_wide': enough_spans and size == matched_span_count,
        })

    # Stable descending sort: ties stay in first-seen order.
    return sorted(results, key=lambda item: item['total_span_count'], reverse=True)
|
|
|
|
|
|
def _hashable(v):
|
|
"""Coerce a value to a hashable form for groupby keys (floats already are)."""
|
|
if isinstance(v, (str, int, float, bool)) or v is None:
|
|
return v
|
|
return str(v)
|