"""Deterministic span-level formatting diff for one aligned page-pair. Companion to diff_engine's vision-LLM diff. The LLM compares rendered page images and is reliable for content/narrative changes but unreliable for typographic flips (bold, font, size, color) on dense layouts. This module ignores rendered pixels entirely and instead reads PyMuPDF's extracted span metadata, matches spans across pages by exact text + y-position, and reports any span whose typographic attributes flipped. Public surface: compute_formatting_diff(old_spans, new_spans, old_page_num, new_page_num) -> dict. """ from __future__ import annotations from collections import defaultdict from typing import Dict, List, Tuple # Spans shorter than this (after .strip()) are ignored. "the", "of", "1", # "." are too common to match reliably and would produce noise. MIN_TEXT_LEN = 4 # Number of example quotes per aggregated finding shown in the report. MAX_QUOTES_PER_FINDING = 3 # A finding only qualifies as "page-wide" if the page has enough matched # spans to make that statement meaningful. Section-break pages with one # or two long spans should not be labelled page-wide on a single flip. PAGE_WIDE_MIN_SPANS = 3 # Per-attribute equality tolerances. SIZE_TOLERANCE_PT = 0.05 def compute_formatting_diff( old_spans: List[Dict], new_spans: List[Dict], old_page_num: int, new_page_num: int, ) -> Dict: """Compare two span lists and return aggregated formatting changes. Returns: { 'formatting_changes': [ { 'attribute': 'bold' | 'italic' | 'font' | 'size' | 'color', 'old_value': str | bool | float, 'new_value': str | bool | float, 'example_quotes': [str, ...], 'total_span_count': int, 'page_wide': bool, }, ... ], 'finding_count': int, 'severity': 'medium' | 'none', 'old_page_num': int, 'new_page_num': int, } """ pairs = _match_spans(old_spans, new_spans) matched_count = len(pairs) flips = _collect_flips(pairs) findings = _aggregate(flips, matched_count) return { 'formatting_changes': findings, 'finding_count': len(findings), 'severity': 'medium' if findings else 'none', 'old_page_num': old_page_num, 'new_page_num': new_page_num, } def _match_spans(old_spans: List[Dict], new_spans: List[Dict]) -> List[Tuple[Dict, Dict]]: """Pair spans across pages by exact-text match, disambiguated by y-position. Spans with fewer than MIN_TEXT_LEN chars after stripping are skipped. Returns a list of (old_span, new_span) tuples. """ new_by_text: Dict[str, List[Dict]] = defaultdict(list) for s in new_spans: text = (s.get('text') or '').strip() if len(text) < MIN_TEXT_LEN: continue new_by_text[text].append(s) pairs: List[Tuple[Dict, Dict]] = [] consumed: set = set() for old_span in old_spans: text = (old_span.get('text') or '').strip() if len(text) < MIN_TEXT_LEN: continue candidates = [c for c in new_by_text.get(text, []) if id(c) not in consumed] if not candidates: continue if len(candidates) == 1: chosen = candidates[0] else: chosen = min(candidates, key=lambda c: abs(_y_mid(c) - _y_mid(old_span))) consumed.add(id(chosen)) pairs.append((old_span, chosen)) return pairs def _y_mid(span: Dict) -> float: """Vertical midpoint of a span's bbox; 0.0 if bbox is missing.""" bbox = span.get('bbox') or (0, 0, 0, 0) return (bbox[1] + bbox[3]) / 2.0 def _collect_flips(pairs: List[Tuple[Dict, Dict]]) -> List[Dict]: """For each paired span, check each attribute and emit a flip record per change.""" flips: List[Dict] = [] for old_span, new_span in pairs: text = (old_span.get('text') or '').strip() for attr in ('bold', 'italic'): old_v = bool(old_span.get(attr)) new_v = bool(new_span.get(attr)) if old_v != new_v: flips.append({ 'attribute': attr, 'old_value': old_v, 'new_value': new_v, 'quote': text, }) for attr in ('font', 'color'): old_v = old_span.get(attr) or '' new_v = new_span.get(attr) or '' if old_v != new_v: flips.append({ 'attribute': attr, 'old_value': old_v, 'new_value': new_v, 'quote': text, }) old_size = float(old_span.get('size') or 0.0) new_size = float(new_span.get('size') or 0.0) if abs(old_size - new_size) > SIZE_TOLERANCE_PT: flips.append({ 'attribute': 'size', 'old_value': round(old_size, 2), 'new_value': round(new_size, 2), 'quote': text, }) return flips def _aggregate(flips: List[Dict], matched_span_count: int) -> List[Dict]: """Group flips by (attribute, old_value, new_value) and emit one finding per group.""" groups: Dict[Tuple, List[Dict]] = defaultdict(list) for f in flips: key = (f['attribute'], _hashable(f['old_value']), _hashable(f['new_value'])) groups[key].append(f) findings: List[Dict] = [] for (attribute, _, _), members in groups.items(): old_v = members[0]['old_value'] new_v = members[0]['new_value'] quotes = [m['quote'] for m in members[:MAX_QUOTES_PER_FINDING]] total = len(members) page_wide = matched_span_count >= PAGE_WIDE_MIN_SPANS and total == matched_span_count findings.append({ 'attribute': attribute, 'old_value': old_v, 'new_value': new_v, 'example_quotes': quotes, 'total_span_count': total, 'page_wide': page_wide, }) findings.sort(key=lambda f: -f['total_span_count']) return findings def _hashable(v): """Coerce a value to a hashable form for groupby keys (floats already are).""" if isinstance(v, (str, int, float, bool)) or v is None: return v return str(v)