"""Deterministic span-level formatting diff for one aligned page-pair.

Companion to diff_engine's vision-LLM diff. The LLM is reliable for
content/narrative changes (added paragraphs, rewords, layout shifts,
colour-scheme rebrands) but unreliable for bold/italic flips on dense
typeset layouts — which is exactly what AXA flagged was being missed.

Scope intentionally narrow: bold + italic flips only. Font / size / colour
are NOT compared here. A re-export from a different toolchain (or a brand
refresh) routinely flips font names and colour values on every page;
reporting those as per-page deductions drowns out the bold/italic
regressions clients actually need to spot. The LLM narrates those rebrand
changes already.

Public surface:
    compute_formatting_diff(old_spans, new_spans, old_page_num, new_page_num) -> dict.
"""

from __future__ import annotations

from collections import defaultdict
from typing import Dict, List, Tuple

# Spans shorter than this (after .strip()) are ignored. "the", "of", "1",
# "." are too common to match reliably and would produce noise.
MIN_TEXT_LEN = 4

# Number of example quotes per aggregated finding shown in the report.
MAX_QUOTES_PER_FINDING = 3

# A finding only qualifies as "page-wide" if the page has enough matched
# spans to make that statement meaningful. Section-break pages with one
# or two long spans should not be labelled page-wide on a single flip.
PAGE_WIDE_MIN_SPANS = 3


def compute_formatting_diff(
    old_spans: List[Dict],
    new_spans: List[Dict],
    old_page_num: int,
    new_page_num: int,
) -> Dict:
    """Compare two span lists and return aggregated formatting changes.

    Scope intentionally limited to bold + italic flips. Font, size and
    colour changes (rebrands, re-exports from a different toolchain) are
    handled by the vision-LLM's narrative diff — re-flagging them here
    drowns out the bold/italic regressions clients actually care about.

    Args:
        old_spans: Span dicts from the old page. The keys read here are
            'text', 'bold', 'italic' and (for tie-breaking only) 'bbox'.
        new_spans: Span dicts from the new page, same keys.
        old_page_num: Echoed back unchanged in the result.
        new_page_num: Echoed back unchanged in the result.

    Returns:
        {
            'formatting_changes': [
                {
                    'attribute': 'bold' | 'italic',
                    'old_value': bool,
                    'new_value': bool,
                    'example_quotes': [str, ...],
                    'total_span_count': int,
                    'page_wide': bool,
                },
                ...
            ],
            'finding_count': int,
            'severity': 'medium' | 'none',
            'old_page_num': int,
            'new_page_num': int,
        }
    """
    pairs = _match_spans(old_spans, new_spans)
    flips = _collect_flips(pairs)
    # The number of matched pairs is the denominator for the "page-wide" test.
    findings = _aggregate(flips, len(pairs))
    return {
        'formatting_changes': findings,
        'finding_count': len(findings),
        'severity': 'medium' if findings else 'none',
        'old_page_num': old_page_num,
        'new_page_num': new_page_num,
    }


def _span_text(span: Dict) -> str:
    """Return the span's text with surrounding whitespace removed ('' if absent)."""
    return (span.get('text') or '').strip()


def _match_spans(old_spans: List[Dict], new_spans: List[Dict]) -> List[Tuple[Dict, Dict]]:
    """Pair spans across pages by exact-text match, disambiguated by y-position.

    Spans with fewer than MIN_TEXT_LEN chars after stripping are skipped.
    Each new span is paired at most once. Old spans are processed in input
    order and each takes the closest still-unclaimed candidate, so the
    matching is greedy rather than globally optimal.

    Returns a list of (old_span, new_span) tuples.
    """
    new_by_text: Dict[str, List[Dict]] = defaultdict(list)
    for span in new_spans:
        text = _span_text(span)
        if len(text) >= MIN_TEXT_LEN:
            new_by_text[text].append(span)

    pairs: List[Tuple[Dict, Dict]] = []
    # Span dicts are not hashable, so claimed new spans are tracked by id().
    consumed: set = set()
    for old_span in old_spans:
        text = _span_text(old_span)
        if len(text) < MIN_TEXT_LEN:
            continue
        # .get() rather than [] so a miss does not insert an empty bucket.
        candidates = [c for c in new_by_text.get(text, ()) if id(c) not in consumed]
        if not candidates:
            continue
        if len(candidates) == 1:
            chosen = candidates[0]
        else:
            old_y = _y_mid(old_span)
            # min() keeps the first candidate on ties, i.e. new-page order.
            chosen = min(candidates, key=lambda c: abs(_y_mid(c) - old_y))
        consumed.add(id(chosen))
        pairs.append((old_span, chosen))
    return pairs


def _y_mid(span: Dict) -> float:
    """Vertical midpoint of a span's bbox; 0.0 if bbox is missing or malformed.

    The midpoint is only a tie-breaker between spans with identical text, so
    a truncated or non-numeric bbox degrades to 0.0 instead of aborting the
    whole page diff.
    """
    bbox = span.get('bbox')
    try:
        return (bbox[1] + bbox[3]) / 2.0
    except (TypeError, IndexError, KeyError):
        return 0.0


def _collect_flips(pairs: List[Tuple[Dict, Dict]]) -> List[Dict]:
    """For each paired span, emit a flip record per bold/italic change.

    A missing or falsy attribute counts as False. The quote is the old
    span's stripped text.
    """
    flips: List[Dict] = []
    for old_span, new_span in pairs:
        text = _span_text(old_span)
        for attr in ('bold', 'italic'):
            old_v = bool(old_span.get(attr))
            new_v = bool(new_span.get(attr))
            if old_v != new_v:
                flips.append({
                    'attribute': attr,
                    'old_value': old_v,
                    'new_value': new_v,
                    'quote': text,
                })
    return flips


def _aggregate(flips: List[Dict], matched_span_count: int) -> List[Dict]:
    """Group flips by (attribute, old_value, new_value) and emit one finding per group.

    Findings are ordered by descending span count; groups of equal size keep
    their first-seen order. 'page_wide' is set only when every matched span
    on the page flipped the same way and the page has at least
    PAGE_WIDE_MIN_SPANS matched spans.
    """
    groups: Dict[Tuple, List[Dict]] = defaultdict(list)
    for flip in flips:
        key = (
            flip['attribute'],
            _hashable(flip['old_value']),
            _hashable(flip['new_value']),
        )
        groups[key].append(flip)

    findings: List[Dict] = []
    for (attribute, _, _), members in groups.items():
        total = len(members)
        # Repeated labels would otherwise fill every example slot with the
        # same string; keep distinct quotes in first-seen order instead.
        distinct_quotes = list(dict.fromkeys(m['quote'] for m in members))
        findings.append({
            'attribute': attribute,
            'old_value': members[0]['old_value'],
            'new_value': members[0]['new_value'],
            'example_quotes': distinct_quotes[:MAX_QUOTES_PER_FINDING],
            'total_span_count': total,
            'page_wide': (
                matched_span_count >= PAGE_WIDE_MIN_SPANS
                and total == matched_span_count
            ),
        })
    # list.sort is stable, so equal-sized groups keep insertion order.
    findings.sort(key=lambda f: -f['total_span_count'])
    return findings


def _hashable(v):
    """Coerce a value to a hashable form for groupby keys (floats already are)."""
    if isinstance(v, (str, int, float, bool)) or v is None:
        return v
    return str(v)