feat(diff_engine): merge formatting_diff findings into pair_diffs

run_page_pair_diff now invokes compute_formatting_diff alongside the LLM call for each aligned pair. When the deterministic layer finds typographic flips on a page the LLM saw as identical, the pair is re-classified as having differences with medium severity. Each aggregated finding contributes to the global medium-severity tally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 10:03:54 +02:00 · 2026-05-19 10:03:54 +02:00 · 2b1bb9ccf0
commit 2b1bb9ccf0
parent d21a8a276d
2 changed files with 94 additions and 0 deletions
--- a/backend/document_mode/diff_engine.py
+++ b/backend/document_mode/diff_engine.py
@ -26,6 +26,8 @@ from typing import Dict, List, Optional, Tuple

 from PIL import Image

+from document_mode.formatting_diff import compute_formatting_diff
+

 # Similarity threshold for considering two pages "the same page modified"
 # vs "an inserted/removed page". Tuned for policy docs where page-level text
@ -311,6 +313,26 @@ def run_page_pair_diff(
        if not old_p or not new_p or not old_p.get('image_path') or not new_p.get('image_path'):
            return entry, None
        result = _diff_one_pair(old_p, new_p, call_gemini_vision_fn, model_version)
+
+        # Deterministic formatting diff — runs alongside the LLM diff.
+        fmt = compute_formatting_diff(
+            old_p.get('spans') or [],
+            new_p.get('spans') or [],
+            old_p['page_num'],
+            new_p['page_num'],
+        )
+        diff = result.setdefault('diff', {})
+        diff['formatting_changes'] = fmt['formatting_changes']
+        if fmt['finding_count'] > 0:
+            # If the LLM saw the page as identical but the deterministic
+            # layer found typographic flips, we still need the report to
+            # render the pair as "has changes".
+            diff['differences_found'] = True
+            # Each aggregated finding contributes one medium severity entry.
+            # Bump the pair's overall severity to at least 'medium' so the
+            # pair-card pill reflects the finding count.
+            if diff.get('severity') in (None, 'none'):
+                diff['severity'] = 'medium'
        return entry, result

    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_pairs) as pool:
@ -345,6 +367,16 @@ def run_page_pair_diff(
        sev = d['diff'].get('severity') or 'none'
        if sev in severity_counts:
            severity_counts[sev] += 1
+        # Each formatting-change finding counts as an additional medium entry,
+        # so a page with N findings contributes N+1 mediums (the +1 from the
+        # base severity already counted above, N more from the findings).
+        fmt_findings = d['diff'].get('formatting_changes') or []
+        if fmt_findings:
+            # The base severity was already bumped to >= medium in _run when
+            # findings exist; here we add the additional findings minus the
+            # one already accounted for.
+            extra = max(0, len(fmt_findings) - 1)
+            severity_counts['medium'] += extra

    return {
        'alignment': alignment,
--- a/backend/tests/test_diff_engine_formatting_integration.py
+++ b/backend/tests/test_diff_engine_formatting_integration.py
@ -0,0 +1,62 @@
+"""Smoke test: run_page_pair_diff merges formatting findings into pair_diffs."""
+
+import pytest
+
+from document_mode.diff_engine import run_page_pair_diff
+
+
+def _page(page_num, raw_text, spans, image_path='/tmp/dummy.png'):
+    return {
+        'page_num': page_num,
+        'raw_text': raw_text,
+        'spans': spans,
+        'image_path': image_path,
+        'fonts_used': [],
+    }
+
+
+def _span(text, bold=False):
+    return {'text': text, 'bold': bold, 'italic': False, 'font': 'Helvetica',
+            'size': 10.0, 'color': '#000000', 'bbox': (0, 10, 100, 22)}
+
+
+def test_formatting_findings_surface_when_llm_returns_identical(tmp_path):
+    # Create real dummy PNGs since _diff_one_pair tries to open them via PIL.
+    from PIL import Image as PILImage
+    img_path = tmp_path / "dummy.png"
+    PILImage.new('RGB', (10, 10)).save(img_path)
+
+    old_pages = [_page(
+        1,
+        "Theft of personal belongings if your car is left unattended unless windows are closed.",
+        [_span("Theft of personal belongings if your car is left unattended", bold=True)],
+        image_path=str(img_path),
+    )]
+    new_pages = [_page(
+        1,
+        "Theft of personal belongings if your car is left unattended unless windows are closed.",
+        [_span("Theft of personal belongings if your car is left unattended", bold=False)],
+        image_path=str(img_path),
+    )]
+
+    # LLM says: no differences. We expect the deterministic layer to override.
+    def fake_llm(prompt, old_img, new_img, model_version=None):
+        return (
+            '{"differences_found": false, "added": [], "removed": [], '
+            '"modified": [], "moved": [], "style_changes": [], '
+            '"severity": "none", "summary": "Identical."}',
+            {'prompt_tokens': 100, 'completion_tokens': 20, 'total_tokens': 120},
+        )
+
+    result = run_page_pair_diff(
+        old_ingest={'pages': old_pages},
+        new_ingest={'pages': new_pages},
+        call_gemini_vision_fn=fake_llm,
+    )
+
+    pair_diff = result['pair_diffs']['1->1']['diff']
+    assert pair_diff['differences_found'] is True
+    assert pair_diff['severity'] == 'medium'
+    assert len(pair_diff['formatting_changes']) == 1
+    assert pair_diff['formatting_changes'][0]['attribute'] == 'bold'
+    assert result['totals']['severity_counts']['medium'] >= 1