ai_qc/backend/tests/test_formatting_diff.py
nickviljoen 29ee941037 refactor(formatting_diff): narrow scope to bold + italic only
First real-data test against the AXA car-insurance PDFs surfaced a
noise problem: the new document is a brand refresh — every page flips
font (PublicoBanner-Bold→PublicoHeadline-Bold) and colour
(#893f4a→#2e3092). At medium severity per finding, that crashed the
diff score to 0.0 and drowned out the bold-regression signal AXA
actually flagged.

Drop font, size, colour comparators. Keep bold + italic — the
attributes the vision-LLM consistently misses on dense layouts. The
LLM already narrates colour-scheme rebrands and font swaps in its
Modified / Style-changes blocks; running both layers on the same
visual change just double-counts it.

Tests inverted from "X change is flagged" to "X change is NOT
flagged" to lock the scope decision in.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 12:37:19 +02:00

210 lines
7.1 KiB
Python

"""Unit tests for deterministic per-page-pair formatting diff."""
import pytest
from document_mode.formatting_diff import compute_formatting_diff
def _span(text, bold=False, italic=False, font='Helvetica', size=10.0,
color='#000000', bbox=(0, 10, 100, 22)):
return {
'text': text, 'bold': bold, 'italic': italic, 'font': font,
'size': size, 'color': color, 'bbox': bbox,
}
def test_identical_spans_produce_no_findings():
    """Pages whose spans match in text and formatting yield an empty result."""
    texts = ("Hello world", "Second paragraph")
    # Build the two sides independently so no dict is shared between them.
    before = [_span(t) for t in texts]
    after = [_span(t) for t in texts]

    outcome = compute_formatting_diff(before, after, 1, 1)

    assert outcome['finding_count'] == 0
    assert outcome['formatting_changes'] == []
    assert outcome['severity'] == 'none'
def test_bold_flip_is_detected():
    """A span that loses bold is reported as one medium-severity finding."""
    quote = "Theft of personal belongings"

    outcome = compute_formatting_diff(
        [_span(quote, bold=True)],
        [_span(quote, bold=False)],
        18,
        18,
    )

    assert outcome['finding_count'] == 1
    change = outcome['formatting_changes'][0]
    assert change['attribute'] == 'bold'
    assert change['old_value'] is True
    assert change['new_value'] is False
    assert change['total_span_count'] == 1
    assert quote in change['example_quotes']
    assert outcome['severity'] == 'medium'
def test_aggregates_identical_flips_into_one_finding():
    """Three spans with the same bold->regular flip collapse into one finding."""
    sentences = (
        "First sentence that lost bold",
        "Second sentence that lost bold",
        "Third sentence that lost bold",
    )
    before = [_span(s, bold=True) for s in sentences]
    after = [_span(s, bold=False) for s in sentences]

    outcome = compute_formatting_diff(before, after, 22, 22)

    assert outcome['finding_count'] == 1
    change = outcome['formatting_changes'][0]
    assert change['total_span_count'] == 3
    assert len(change['example_quotes']) == 3
    assert change['page_wide'] is True
def test_page_wide_flag_false_when_only_subset_flips():
    """When one matched span flips and another does not, page_wide is False."""
    flipped = "Lost its bold"
    unchanged = "Stays regular and matches text"
    before = [_span(flipped, bold=True), _span(unchanged, bold=False)]
    after = [_span(flipped, bold=False), _span(unchanged, bold=False)]

    outcome = compute_formatting_diff(before, after, 5, 5)

    assert outcome['finding_count'] == 1
    assert outcome['formatting_changes'][0]['page_wide'] is False
def test_short_text_spans_are_ignored():
    """Bold flips on very short spans ("of", "the") produce no findings."""
    words = ("of", "the")
    before = [_span(w, bold=True) for w in words]
    after = [_span(w, bold=False) for w in words]

    outcome = compute_formatting_diff(before, after, 1, 1)

    assert outcome['finding_count'] == 0
def test_unmatched_text_is_ignored_not_flagged():
    """Spans whose text differs between pages are not compared at all."""
    before = [_span("Original sentence that was bold", bold=True)]
    after = [_span("Completely different replacement copy", bold=False)]

    outcome = compute_formatting_diff(before, after, 7, 7)

    assert outcome['finding_count'] == 0
def test_size_change_not_flagged():
    """A point-size change on otherwise identical text is not reported."""
    # Size is deliberately out of scope: rebrand re-exports often shift
    # body-text point sizes, which would only add noise here.
    text = "Body text resized"

    outcome = compute_formatting_diff(
        [_span(text, size=10.00)],
        [_span(text, size=12.50)],
        1,
        1,
    )

    assert outcome['finding_count'] == 0
def test_font_change_not_flagged():
    """A font-face swap on otherwise identical text is not reported."""
    # Font swaps are deliberately out of scope: the LLM narrative diff
    # already covers them, and reporting them here would bury bold/italic
    # regressions on re-branded documents.
    text = "Body text in original font face"

    outcome = compute_formatting_diff(
        [_span(text, font='AXASans-Regular')],
        [_span(text, font='Helvetica')],
        1,
        1,
    )

    assert outcome['finding_count'] == 0
def test_color_change_not_flagged():
    """A colour change on otherwise identical text is not reported."""
    # Colour is deliberately out of scope, for the same rebrand-noise
    # reason as font and size.
    text = "Hyperlink-style text in blue"

    outcome = compute_formatting_diff(
        [_span(text, color='#0066cc')],
        [_span(text, color='#000000')],
        1,
        1,
    )

    assert outcome['finding_count'] == 0
def test_italic_flip_detected():
    """A span that loses italic is reported with attribute 'italic'."""
    text = "Block quote that was italicised"

    outcome = compute_formatting_diff(
        [_span(text, italic=True)],
        [_span(text, italic=False)],
        1,
        1,
    )

    assert outcome['finding_count'] == 1
    assert outcome['formatting_changes'][0]['attribute'] == 'italic'
def test_duplicate_text_disambiguated_by_y_position():
    """Two spans with identical text are paired by position, not just text.

    Only the first occurrence loses bold, so exactly one flipped span
    should be counted.
    """
    text = "Important note"
    upper_box = (72, 100, 200, 115)
    lower_box = (72, 700, 200, 715)

    before = [
        _span(text, bold=True, bbox=upper_box),
        _span(text, bold=True, bbox=lower_box),
    ]
    after = [
        _span(text, bold=False, bbox=upper_box),  # this one flipped
        _span(text, bold=True, bbox=lower_box),   # this one kept bold
    ]

    outcome = compute_formatting_diff(before, after, 1, 1)

    assert outcome['finding_count'] == 1
    assert outcome['formatting_changes'][0]['total_span_count'] == 1
def test_single_span_page_not_labelled_page_wide():
    """A lone flipped span is a finding but must not be marked page-wide."""
    # Every matched span flipped, but with only one span on the page the
    # count is too small to justify the page-wide label.
    heading = "Sole heading on this section-break page"

    outcome = compute_formatting_diff(
        [_span(heading, bold=True)],
        [_span(heading, bold=False)],
        1,
        1,
    )

    assert outcome['finding_count'] == 1
    assert outcome['formatting_changes'][0]['page_wide'] is False
def test_two_span_page_not_labelled_page_wide():
    """Two flipped spans fall below the page-wide threshold of three."""
    headings = ("First short heading", "Second short heading")
    before = [_span(h, bold=True) for h in headings]
    after = [_span(h, bold=False) for h in headings]

    outcome = compute_formatting_diff(before, after, 1, 1)

    assert outcome['finding_count'] == 1
    assert outcome['formatting_changes'][0]['page_wide'] is False
def test_missing_bold_key_treated_as_false_no_phantom_flip():
    """A span without a 'bold' key compares equal to one with bold=False."""
    # An absent key must be read as False rather than None; otherwise
    # None-vs-False would register as a phantom flip.
    text = "Body text from older ingest path"
    common = {
        'italic': False,
        'font': 'Helvetica',
        'size': 10.0,
        'color': '#000000',
        'bbox': (0, 10, 100, 22),
    }
    without_bold = {'text': text, **common}
    with_bold = {'text': text, 'bold': False, **common}

    outcome = compute_formatting_diff([without_bold], [with_bold], 1, 1)

    assert outcome['finding_count'] == 0
def test_empty_old_spans_returns_no_findings():
    """With no spans on the old page there is nothing to compare."""
    before = []
    after = [_span("Some new text")]

    outcome = compute_formatting_diff(before, after, 1, 1)

    assert outcome['finding_count'] == 0
    assert outcome['severity'] == 'none'
def test_empty_new_spans_returns_no_findings():
    """With no spans on the new page there is nothing to compare."""
    before = [_span("Some old text")]
    after = []

    outcome = compute_formatting_diff(before, after, 1, 1)

    assert outcome['finding_count'] == 0
    assert outcome['severity'] == 'none'