Three review-driven hardening tweaks: - page_wide now requires ≥3 matched spans (PAGE_WIDE_MIN_SPANS). Avoids labelling section-break pages with a single flipped heading as page-wide. - _collect_flips normalises bold/italic via bool() and font/color via "or ''" so callers passing dicts without those keys do not produce phantom flips against False/''. - Adds tests for empty span lists and the missing-bold-key case. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
177 lines
6.2 KiB
Python
177 lines
6.2 KiB
Python
"""Deterministic span-level formatting diff for one aligned page-pair.
|
|
|
|
Companion to diff_engine's vision-LLM diff. The LLM compares rendered
|
|
page images and is reliable for content/narrative changes but unreliable
|
|
for typographic flips (bold, font, size, color) on dense layouts.
|
|
|
|
This module ignores rendered pixels entirely and instead reads PyMuPDF's
|
|
extracted span metadata, matches spans across pages by exact text +
|
|
y-position, and reports any span whose typographic attributes flipped.
|
|
|
|
Public surface: compute_formatting_diff(old_spans, new_spans,
|
|
old_page_num, new_page_num) -> dict.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Tuple
|
|
|
|
# Minimum stripped-text length a span needs before it takes part in
# matching. Very short strings ("the", "of", "1", ".") recur all over a
# page, cannot be paired reliably, and would only add noise.
MIN_TEXT_LEN: int = 4

# Upper bound on the example quotes attached to one aggregated finding.
MAX_QUOTES_PER_FINDING: int = 3

# Minimum number of matched spans a page must have before any finding on
# it may be flagged "page-wide". Keeps section-break pages that carry only
# one or two long spans from being labelled page-wide after a single flip.
PAGE_WIDE_MIN_SPANS: int = 3

# Font sizes whose absolute difference (in points) does not exceed this
# value are treated as equal.
SIZE_TOLERANCE_PT: float = 0.05
|
|
|
|
|
|
def compute_formatting_diff(
    old_spans: List[Dict],
    new_spans: List[Dict],
    old_page_num: int,
    new_page_num: int,
) -> Dict:
    """Diff the typographic attributes of two span lists for one page pair.

    Spans are paired across the two pages, every paired span is checked
    for attribute changes, and identical changes are merged into findings.

    Args:
        old_spans: Span dicts from the old page.
        new_spans: Span dicts from the new page.
        old_page_num: Old page number; copied into the result unchanged.
        new_page_num: New page number; copied into the result unchanged.

    Returns:
        A dict of the form::

            {
                'formatting_changes': [
                    {
                        'attribute': 'bold' | 'italic' | 'font' | 'size' | 'color',
                        'old_value': str | bool | float,
                        'new_value': str | bool | float,
                        'example_quotes': [str, ...],
                        'total_span_count': int,
                        'page_wide': bool,
                    },
                    ...
                ],
                'finding_count': int,
                'severity': 'medium' | 'none',
                'old_page_num': int,
                'new_page_num': int,
            }

        'severity' is 'medium' when at least one finding exists and
        'none' otherwise.
    """
    matched_pairs = _match_spans(old_spans, new_spans)
    pair_total = len(matched_pairs)
    changes = _aggregate(_collect_flips(matched_pairs), pair_total)

    if changes:
        severity = 'medium'
    else:
        severity = 'none'

    return {
        'formatting_changes': changes,
        'finding_count': len(changes),
        'severity': severity,
        'old_page_num': old_page_num,
        'new_page_num': new_page_num,
    }
|
|
|
|
|
|
def _match_spans(old_spans: List[Dict], new_spans: List[Dict]) -> List[Tuple[Dict, Dict]]:
    """Pair old-page spans with new-page spans that carry the same text.

    Text is compared after stripping whitespace; spans whose stripped text
    has fewer than MIN_TEXT_LEN characters are ignored on both sides. Old
    spans are processed in input order and each new span is used at most
    once (tracked by object identity). When several unused new spans share
    the text, the one whose vertical midpoint lies closest to the old
    span's midpoint is taken.

    Returns:
        A list of (old_span, new_span) tuples, in old-span order.
    """
    # Index the new page: stripped text -> spans carrying that text.
    index: Dict[str, List[Dict]] = {}
    for span in new_spans:
        key = (span.get('text') or '').strip()
        if len(key) >= MIN_TEXT_LEN:
            index.setdefault(key, []).append(span)

    used_ids: set = set()
    matched: List[Tuple[Dict, Dict]] = []
    for source in old_spans:
        key = (source.get('text') or '').strip()
        if len(key) < MIN_TEXT_LEN:
            continue

        # Only new spans that have not been paired yet remain eligible.
        available = [cand for cand in index.get(key, ()) if id(cand) not in used_ids]
        if not available:
            continue

        if len(available) > 1:
            # Several identical texts: pick the vertically nearest one.
            source_y = _y_mid(source)
            best = min(available, key=lambda cand: abs(_y_mid(cand) - source_y))
        else:
            best = available[0]

        used_ids.add(id(best))
        matched.append((source, best))

    return matched
|
|
|
|
|
|
def _y_mid(span: Dict) -> float:
    """Return the vertical centre of the span's 'bbox'.

    The centre is the mean of bbox[1] and bbox[3]. A missing or falsy
    'bbox' yields 0.0.
    """
    box = span.get('bbox')
    if not box:
        return 0.0
    y_start, y_end = box[1], box[3]
    return (y_start + y_end) / 2.0
|
|
|
|
|
|
def _collect_flips(pairs: List[Tuple[Dict, Dict]]) -> List[Dict]:
    """Emit one flip record for every attribute that differs within a pair.

    Attributes are normalised before comparison so that an absent key does
    not differ from its "empty" value:

    * 'bold' / 'italic' are coerced with bool().
    * 'font' / 'color' fall back to '' when missing or falsy.
    * 'size' falls back to 0.0, is converted to float, and only counts as
      changed when the difference exceeds SIZE_TOLERANCE_PT; reported
      sizes are rounded to two decimals.

    Each record has the keys 'attribute', 'old_value', 'new_value' and
    'quote' (the old span's stripped text).
    """
    changes: List[Dict] = []

    def note(attribute, before, after, quote):
        # Single place that fixes the shape of a flip record.
        changes.append({
            'attribute': attribute, 'old_value': before,
            'new_value': after, 'quote': quote,
        })

    for before_span, after_span in pairs:
        quote = (before_span.get('text') or '').strip()

        for flag in ('bold', 'italic'):
            was = bool(before_span.get(flag))
            now = bool(after_span.get(flag))
            if was != now:
                note(flag, was, now, quote)

        for name in ('font', 'color'):
            was = before_span.get(name) or ''
            now = after_span.get(name) or ''
            if was != now:
                note(name, was, now, quote)

        size_before = float(before_span.get('size') or 0.0)
        size_after = float(after_span.get('size') or 0.0)
        if abs(size_before - size_after) > SIZE_TOLERANCE_PT:
            note('size', round(size_before, 2), round(size_after, 2), quote)

    return changes
|
|
|
|
|
|
def _aggregate(flips: List[Dict], matched_span_count: int) -> List[Dict]:
    """Merge flips that share (attribute, old_value, new_value) into findings.

    Each finding carries the attribute, the old and new values taken from
    the first flip of its group, up to MAX_QUOTES_PER_FINDING example
    quotes, the number of flips in the group, and a 'page_wide' flag. The
    flag is set only when the group size equals matched_span_count and
    matched_span_count is at least PAGE_WIDE_MIN_SPANS.

    Findings are returned ordered by descending 'total_span_count'; groups
    of equal size keep their first-seen order.
    """
    buckets: Dict[Tuple, List[Dict]] = {}
    for flip in flips:
        bucket_key = (
            flip['attribute'],
            _hashable(flip['old_value']),
            _hashable(flip['new_value']),
        )
        buckets.setdefault(bucket_key, []).append(flip)

    # This half of the page-wide test does not depend on the group.
    enough_spans = matched_span_count >= PAGE_WIDE_MIN_SPANS

    findings: List[Dict] = []
    for bucket_key, bucket in buckets.items():
        first = bucket[0]
        count = len(bucket)
        findings.append({
            'attribute': bucket_key[0],
            'old_value': first['old_value'],
            'new_value': first['new_value'],
            'example_quotes': [flip['quote'] for flip in bucket[:MAX_QUOTES_PER_FINDING]],
            'total_span_count': count,
            'page_wide': enough_spans and count == matched_span_count,
        })

    # Negated key on a stable sort: largest groups first, ties unchanged.
    return sorted(findings, key=lambda finding: -finding['total_span_count'])
|
|
|
|
|
|
def _hashable(v):
    """Return *v* in a form usable inside a grouping key.

    None and instances of str, int, float or bool are returned unchanged;
    any other value is replaced by its str() representation.
    """
    if v is None:
        return None
    return v if isinstance(v, (bool, int, float, str)) else str(v)
|