First real-data test against the AXA car-insurance PDFs surfaced a noise problem: the new document is a brand refresh — every page flips font (PublicoBanner-Bold → PublicoHeadline-Bold) and colour (#893f4a → #2e3092). With every such finding rated at medium severity, the diff score crashed to 0.0 and the bold-regression signal AXA actually flagged was drowned out. Drop the font, size, and colour comparators. Keep bold and italic — the attributes the vision-LLM consistently misses on dense layouts. The LLM already narrates colour-scheme rebrands and font swaps in its Modified / Style-changes blocks; running both layers on the same visual change just double-counts it. Tests are inverted from "X change is flagged" to "X change is NOT flagged" to lock the scope decision in. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
210 lines
7.1 KiB
Python
210 lines
7.1 KiB
Python
"""Unit tests for deterministic per-page-pair formatting diff."""
|
|
|
|
import pytest
|
|
|
|
from document_mode.formatting_diff import compute_formatting_diff
|
|
|
|
|
|
def _span(text, bold=False, italic=False, font='Helvetica', size=10.0,
|
|
color='#000000', bbox=(0, 10, 100, 22)):
|
|
return {
|
|
'text': text, 'bold': bold, 'italic': italic, 'font': font,
|
|
'size': size, 'color': color, 'bbox': bbox,
|
|
}
|
|
|
|
|
|
def test_identical_spans_produce_no_findings():
    """Identical span lists on both sides give an empty, 'none'-severity result."""
    texts = ("Hello world", "Second paragraph")
    before = [_span(t) for t in texts]
    after = [_span(t) for t in texts]

    diff = compute_formatting_diff(before, after, 1, 1)

    assert diff['finding_count'] == 0
    assert diff['formatting_changes'] == []
    assert diff['severity'] == 'none'
|
|
|
|
|
|
def test_bold_flip_is_detected():
    """A matched span that loses bold is reported as one medium 'bold' finding."""
    quote = "Theft of personal belongings"

    diff = compute_formatting_diff(
        [_span(quote, bold=True)],
        [_span(quote, bold=False)],
        18,
        18,
    )

    assert diff['finding_count'] == 1
    change = diff['formatting_changes'][0]
    assert change['attribute'] == 'bold'
    assert change['old_value'] is True
    assert change['new_value'] is False
    assert change['total_span_count'] == 1
    assert quote in change['example_quotes']
    assert diff['severity'] == 'medium'
|
|
|
|
|
|
def test_aggregates_identical_flips_into_one_finding():
    """Three spans losing bold collapse into a single page-wide finding."""
    sentences = (
        "First sentence that lost bold",
        "Second sentence that lost bold",
        "Third sentence that lost bold",
    )
    before = [_span(s, bold=True) for s in sentences]
    after = [_span(s, bold=False) for s in sentences]

    diff = compute_formatting_diff(before, after, 22, 22)

    assert diff['finding_count'] == 1
    change = diff['formatting_changes'][0]
    assert change['total_span_count'] == 3
    assert len(change['example_quotes']) == 3
    assert change['page_wide'] is True
|
|
|
|
|
|
def test_page_wide_flag_false_when_only_subset_flips():
    """One flipped span next to an unchanged one must not be marked page-wide."""
    flipped_text = "Lost its bold"
    stable_text = "Stays regular and matches text"
    before = [
        _span(flipped_text, bold=True),
        _span(stable_text, bold=False),
    ]
    after = [
        _span(flipped_text, bold=False),
        _span(stable_text, bold=False),
    ]

    diff = compute_formatting_diff(before, after, 5, 5)

    assert diff['finding_count'] == 1
    first_change = diff['formatting_changes'][0]
    assert first_change['page_wide'] is False
|
|
|
|
|
|
def test_short_text_spans_are_ignored():
    """Bold flips on very short spans ("of", "the") produce no findings."""
    words = ("of", "the")
    before = [_span(w, bold=True) for w in words]
    after = [_span(w, bold=False) for w in words]

    diff = compute_formatting_diff(before, after, 1, 1)

    assert diff['finding_count'] == 0
|
|
|
|
|
|
def test_unmatched_text_is_ignored_not_flagged():
    """Spans whose text differs between the two sides yield no findings."""
    before = [_span("Original sentence that was bold", bold=True)]
    after = [_span("Completely different replacement copy", bold=False)]

    diff = compute_formatting_diff(before, after, 7, 7)

    assert diff['finding_count'] == 0
|
|
|
|
|
|
def test_size_change_not_flagged():
    """A point-size change on matched text must not create a finding."""
    # Size is intentionally out of scope — rebrand re-exports often change
    # body-text point sizes by fractions of a point.
    body = "Body text resized"

    diff = compute_formatting_diff(
        [_span(body, size=10.00)],
        [_span(body, size=12.50)],
        1,
        1,
    )

    assert diff['finding_count'] == 0
|
|
|
|
|
|
def test_font_change_not_flagged():
    """A font-face swap on matched text must not create a finding."""
    # Font swap is intentionally out of scope — caught by the LLM narrative
    # diff. Reporting it here would drown out bold/italic regressions on
    # re-branded documents.
    body = "Body text in original font face"

    diff = compute_formatting_diff(
        [_span(body, font='AXASans-Regular')],
        [_span(body, font='Helvetica')],
        1,
        1,
    )

    assert diff['finding_count'] == 0
|
|
|
|
|
|
def test_color_change_not_flagged():
    """A colour change on matched text must not create a finding."""
    # Colour is intentionally out of scope for the same rebrand-noise reason.
    body = "Hyperlink-style text in blue"

    diff = compute_formatting_diff(
        [_span(body, color='#0066cc')],
        [_span(body, color='#000000')],
        1,
        1,
    )

    assert diff['finding_count'] == 0
|
|
|
|
|
|
def test_italic_flip_detected():
    """A matched span that loses italic is reported as one 'italic' finding."""
    quote = "Block quote that was italicised"

    diff = compute_formatting_diff(
        [_span(quote, italic=True)],
        [_span(quote, italic=False)],
        1,
        1,
    )

    assert diff['finding_count'] == 1
    first_change = diff['formatting_changes'][0]
    assert first_change['attribute'] == 'italic'
|
|
|
|
|
|
def test_duplicate_text_disambiguated_by_y_position():
    """Same text at two bboxes: only the occurrence that flipped is counted."""
    note = "Important note"
    first_box = (72, 100, 200, 115)
    second_box = (72, 700, 200, 715)

    before = [
        _span(note, bold=True, bbox=first_box),
        _span(note, bold=True, bbox=second_box),
    ]
    # Only the occurrence at first_box loses bold; the other stays bold.
    after = [
        _span(note, bold=False, bbox=first_box),
        _span(note, bold=True, bbox=second_box),
    ]

    diff = compute_formatting_diff(before, after, 1, 1)

    assert diff['finding_count'] == 1
    first_change = diff['formatting_changes'][0]
    assert first_change['total_span_count'] == 1
|
|
|
|
|
|
def test_single_span_page_not_labelled_page_wide():
    """A lone flipped span is a finding but is not labelled page-wide."""
    # A page with only one matched span that flipped should NOT be page-wide,
    # even though "all" matched spans flipped — the count is too small.
    heading = "Sole heading on this section-break page"

    diff = compute_formatting_diff(
        [_span(heading, bold=True)],
        [_span(heading, bold=False)],
        1,
        1,
    )

    assert diff['finding_count'] == 1
    first_change = diff['formatting_changes'][0]
    assert first_change['page_wide'] is False
|
|
|
|
|
|
def test_two_span_page_not_labelled_page_wide():
    """Two flipped spans form one finding that is not labelled page-wide."""
    # Threshold is 3 — 2 spans flipping is not enough to call page-wide.
    headings = ("First short heading", "Second short heading")
    before = [_span(h, bold=True) for h in headings]
    after = [_span(h, bold=False) for h in headings]

    diff = compute_formatting_diff(before, after, 1, 1)

    assert diff['finding_count'] == 1
    first_change = diff['formatting_changes'][0]
    assert first_change['page_wide'] is False
|
|
|
|
|
|
def test_missing_bold_key_treated_as_false_no_phantom_flip():
    """A span lacking the 'bold' key must not flip against an explicit False."""
    # A span dict that omits 'bold' entirely should be treated as bold=False
    # for comparison purposes — not as None, which would falsely flip vs False.
    body = "Body text from older ingest path"
    without_bold_key = {
        'text': body,
        'italic': False,
        'font': 'Helvetica',
        'size': 10.0,
        'color': '#000000',
        'bbox': (0, 10, 100, 22),
    }
    with_bold_false = {
        'text': body,
        'bold': False,
        'italic': False,
        'font': 'Helvetica',
        'size': 10.0,
        'color': '#000000',
        'bbox': (0, 10, 100, 22),
    }

    diff = compute_formatting_diff([without_bold_key], [with_bold_false], 1, 1)

    assert diff['finding_count'] == 0
|
|
|
|
|
|
def test_empty_old_spans_returns_no_findings():
    """An empty old-side span list yields zero findings and 'none' severity."""
    before = []
    after = [_span("Some new text")]

    diff = compute_formatting_diff(before, after, 1, 1)

    assert diff['finding_count'] == 0
    assert diff['severity'] == 'none'
|
|
|
|
|
|
def test_empty_new_spans_returns_no_findings():
    """An empty new-side span list yields zero findings and 'none' severity."""
    before = [_span("Some old text")]
    after = []

    diff = compute_formatting_diff(before, after, 1, 1)

    assert diff['finding_count'] == 0
    assert diff['severity'] == 'none'
|