ai_qc/backend/document_mode/diff_report_writer.py
nickviljoen 0fd6a35562 fix(diff_report): _fmt_value labels italic flips correctly
Previously every boolean attribute rendered as "Bold → Regular",
producing "Italic: Bold → Regular" for italic flips. Now the helper
takes the attribute name and emits "Italic → Regular" or
"Bold → Regular" depending on which boolean attribute is being shown.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 10:22:39 +02:00

419 lines
18 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Diff report writer for AXA Old-vs-New document mode.
Distinct from result_writer.py — the diff report's shape is fundamentally
different (alignment table, page-pair cards, severity breakdowns) so it
gets its own module rather than overloading the single-doc writer.
Outputs:
<output_dir>/<session_id>_<old_basename>_vs_<new_basename>_diff_data.json
<output_dir>/<session_id>_<old_basename>_vs_<new_basename>_diff_report.html
"""
from __future__ import annotations
import html
import json
import os
from typing import Dict, List, Optional
def _slug(name: str) -> str:
base = os.path.splitext(os.path.basename(name))[0]
return base.replace(' ', '_').replace('/', '_')[:60]
def _severity_class(sev: str) -> str:
return {
'high': 'sev-high',
'medium': 'sev-medium',
'low': 'sev-low',
'none': 'sev-none',
}.get(sev, 'sev-none')
def _status_class(status: str) -> str:
return {
'matched': 'status-matched',
'added': 'status-added',
'removed': 'status-removed',
}.get(status, '')
def _render_alignment_table(alignment: List[Dict]) -> str:
rows = []
for entry in alignment:
old = entry.get('old_page')
new = entry.get('new_page')
sim = entry.get('similarity')
sim_str = f"{sim:.2f}" if isinstance(sim, (int, float)) else ''
status = entry.get('status', '')
rows.append(f"""
<tr class='{_status_class(status)}'>
<td>{old if old is not None else ''}</td>
<td>{new if new is not None else ''}</td>
<td>{sim_str}</td>
<td><span class='status-pill {_status_class(status)}'>{html.escape(status)}</span></td>
</tr>
""")
return f"""
<table class='alignment-table'>
<thead><tr><th>Old page</th><th>New page</th><th>Similarity</th><th>Status</th></tr></thead>
<tbody>{''.join(rows)}</tbody>
</table>
"""
def _render_diff_list(items: List[str], css_class: str, label: str, icon: str) -> str:
if not items:
return ''
bullets = ''.join(f"<li>{html.escape(it)}</li>" for it in items)
return f"""
<div class='diff-block {css_class}'>
<div class='diff-label'>{icon} {label}</div>
<ul>{bullets}</ul>
</div>
"""
def _render_formatting_block(findings: List[Dict]) -> str:
if not findings:
return ''
def _fmt_value(v, attribute):
if isinstance(v, bool):
if attribute == 'italic':
return 'Italic' if v else 'Regular'
return 'Bold' if v else 'Regular'
return str(v)
items = []
for f in findings:
attr = f.get('attribute', '')
old_v = _fmt_value(f.get('old_value'), attr)
new_v = _fmt_value(f.get('new_value'), attr)
total = f.get('total_span_count', 0)
page_wide = f.get('page_wide', False)
quotes = f.get('example_quotes', []) or []
if page_wide:
prefix = f"<strong>Page-wide {html.escape(attr)} change</strong>: {html.escape(old_v)}{html.escape(new_v)}"
else:
prefix = f"<strong>{html.escape(attr).capitalize()}: {html.escape(old_v)}{html.escape(new_v)}</strong>"
quote_html = ''
if quotes:
quoted = ', '.join(f'&ldquo;{html.escape(q)}&rdquo;' for q in quotes[:3])
extra = total - len(quotes[:3])
extra_html = f" <span class='muted'>…and {extra} more</span>" if extra > 0 else ''
quote_html = f" ({total} span{'s' if total != 1 else ''}): {quoted}{extra_html}"
items.append(f"<li>{prefix}{quote_html}</li>")
return f"""
<div class='diff-block block-style'>
<div class='diff-label'>🎨 Formatting changes</div>
<ul>{''.join(items)}</ul>
</div>
"""
def _render_pair_card(entry: Dict, pair_diffs: Dict) -> str:
old = entry['old_page']
new = entry['new_page']
status = entry['status']
sim = entry.get('similarity')
# Added or removed entire pages — different shape
if status == 'added':
return f"""
<div class='pair-card status-added'>
<div class='pair-header'>
<span class='page-label'>+ Page added in new version</span>
<span class='page-coords'>new page <strong>{new}</strong></span>
</div>
<div class='pair-body'>
<em class='muted'>This page exists in the new version but had no counterpart in the old version.</em>
</div>
</div>
"""
if status == 'removed':
return f"""
<div class='pair-card status-removed'>
<div class='pair-header'>
<span class='page-label'> Page removed in new version</span>
<span class='page-coords'>old page <strong>{old}</strong></span>
</div>
<div class='pair-body'>
<em class='muted'>This page was in the old version but is not in the new version.</em>
</div>
</div>
"""
# Matched pair — render diff result
key = f"{old}->{new}"
pair = pair_diffs.get(key, {}).get('diff') or {}
sev = pair.get('severity', 'none')
summary = pair.get('summary', '')
differences_found = pair.get('differences_found', False)
if not differences_found and not pair.get('error'):
return f"""
<details class='pair-card status-matched-clean'>
<summary>
<span class='page-label'>= No differences detected</span>
<span class='page-coords'>old <strong>{old}</strong> ↔ new <strong>{new}</strong> · sim {sim:.2f}</span>
<span class='sev-pill sev-none'>identical</span>
</summary>
<div class='pair-body'><em class='muted'>{html.escape(summary or "Pages compared as visually identical.")}</em></div>
</details>
"""
blocks = []
blocks.append(_render_diff_list(pair.get('added') or [], 'block-added', 'Added', ''))
blocks.append(_render_diff_list(pair.get('removed') or [], 'block-removed', 'Removed', ''))
blocks.append(_render_diff_list(pair.get('modified') or [], 'block-modified', 'Modified', ''))
blocks.append(_render_diff_list(pair.get('moved') or [], 'block-moved', 'Moved', ''))
blocks.append(_render_diff_list(pair.get('style_changes') or [], 'block-style', 'Style changes', '🎨'))
blocks.append(_render_formatting_block(pair.get('formatting_changes') or []))
error_block = ''
if pair.get('error'):
error_block = f"<div class='diff-error'>⚠️ {html.escape(pair['error'])}</div>"
return f"""
<details class='pair-card' open>
<summary>
<span class='page-label'>old <strong>{old}</strong> ↔ new <strong>{new}</strong></span>
<span class='page-coords'>sim {sim:.2f}</span>
<span class='sev-pill {_severity_class(sev)}'>{html.escape(sev)}</span>
</summary>
<div class='pair-body'>
<p class='pair-summary'>{html.escape(summary or '')}</p>
{error_block}
{''.join(blocks)}
</div>
</details>
"""
def _render_at_a_glance(totals: Dict, doc_summary: Dict) -> str:
sev = totals.get('severity_counts', {})
return f"""
<div class='glance-grid'>
<div class='glance-card'>
<div class='glance-num'>{totals.get('old_page_count', 0)}{totals.get('new_page_count', 0)}</div>
<div class='glance-label'>Page count</div>
</div>
<div class='glance-card status-added'>
<div class='glance-num'>{totals.get('pages_added', 0)}</div>
<div class='glance-label'>Pages added</div>
</div>
<div class='glance-card status-removed'>
<div class='glance-num'>{totals.get('pages_removed', 0)}</div>
<div class='glance-label'>Pages removed</div>
</div>
<div class='glance-card'>
<div class='glance-num'>{totals.get('pages_modified', 0)}</div>
<div class='glance-label'>Pages modified</div>
</div>
<div class='glance-card'>
<div class='glance-num'>{totals.get('pages_unchanged', 0)}</div>
<div class='glance-label'>Pages unchanged</div>
</div>
<div class='glance-card sev-high'>
<div class='glance-num'>{sev.get('high', 0)}</div>
<div class='glance-label'>High severity</div>
</div>
<div class='glance-card sev-medium'>
<div class='glance-num'>{sev.get('medium', 0)}</div>
<div class='glance-label'>Medium severity</div>
</div>
<div class='glance-card sev-low'>
<div class='glance-num'>{sev.get('low', 0)}</div>
<div class='glance-label'>Low severity</div>
</div>
</div>
"""
def _render_html(result: Dict) -> str:
    """Render the complete standalone HTML diff report for *result*.

    Reads ``old_pdf``, ``new_pdf``, ``totals``, ``document_summary``,
    ``alignment``, ``pair_diffs``, ``profile_name``, ``timestamp`` and
    ``token_usage`` from *result*. Every key is optional; missing or
    ``None`` values render as empty text, 0 or an empty section.

    The page consists of a header, the old/new file card, the overall
    score and grade, the "At a glance" grid, the page alignment table and
    one card per alignment entry. All styling is inlined.
    """

    def _esc(value) -> str:
        # html.escape() only accepts str; report fields may be None or
        # other objects (e.g. a datetime timestamp), so coerce first.
        return html.escape('' if value is None else str(value))

    old_pdf = result.get('old_pdf') or {}
    new_pdf = result.get('new_pdf') or {}
    totals = result.get('totals') or {}
    doc_summary = result.get('document_summary') or {}
    alignment = result.get('alignment') or []
    pair_diffs = result.get('pair_diffs') or {}

    score = _esc(doc_summary.get('overall_score', 0))
    grade = _esc(doc_summary.get('grade', ''))
    profile_name = _esc(result.get('profile_name', ''))
    timestamp = _esc(result.get('timestamp', ''))
    old_name = _esc(old_pdf.get('filename', ''))
    new_name = _esc(new_pdf.get('filename', ''))

    # The thousands-separator format only works on numbers; fall back to
    # plain text for anything else instead of raising.
    total_tokens = (result.get('token_usage') or {}).get('total_tokens') or 0
    if isinstance(total_tokens, (int, float)):
        tokens_text = f"{total_tokens:,}"
    else:
        tokens_text = _esc(total_tokens)

    glance = _render_at_a_glance(totals, doc_summary)
    alignment_table = _render_alignment_table(alignment)
    pair_cards = '\n'.join(_render_pair_card(entry, pair_diffs) for entry in alignment)
    title = f"Diff Report — {old_pdf.get('filename', 'old')} vs {new_pdf.get('filename', 'new')}"

    # NOTE: literal CSS braces are doubled because this is an f-string.
    # The native <details> marker is hidden below, so summary::before
    # supplies the visible disclosure triangle (rotated when open).
    return f"""<!DOCTYPE html>
<html lang='en'>
<head>
<meta charset='utf-8'>
<title>{html.escape(title)}</title>
<style>
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 0; background: #f5f6f8; color: #222; }}
.wrap {{ max-width: 1200px; margin: 0 auto; padding: 24px; }}
h1 {{ margin: 0 0 4px; font-size: 22px; }}
h2 {{ margin: 24px 0 8px; font-size: 16px; color: #111; }}
.meta {{ color: #666; font-size: 13px; margin-bottom: 20px; }}
.muted {{ color: #888; }}
.versions-card {{ background: white; border-radius: 8px; padding: 18px 20px; box-shadow: 0 1px 3px rgba(0,0,0,0.06); margin-bottom: 16px; }}
.versions-card .vs-row {{ display: flex; align-items: center; gap: 16px; font-size: 14px; }}
.versions-card .vs-old, .versions-card .vs-new {{ flex: 1; padding: 10px 14px; border-radius: 6px; }}
.versions-card .vs-old {{ background: #fceac0; }}
.versions-card .vs-new {{ background: #d6f0d8; }}
.versions-card .vs-arrow {{ font-size: 24px; color: #888; }}
.overall-card {{ background: white; border-radius: 8px; padding: 20px; box-shadow: 0 1px 3px rgba(0,0,0,0.06); margin-bottom: 16px; display: flex; align-items: center; gap: 24px; }}
.overall-score {{ font-size: 48px; font-weight: 600; color: #111; }}
.grade-badge {{ padding: 4px 12px; border-radius: 999px; font-size: 12px; font-weight: 700; letter-spacing: 0.5px; background: #eef2f7; color: #2a4060; }}
.glance-grid {{ display: grid; grid-template-columns: repeat(4, 1fr); gap: 8px; margin-bottom: 16px; }}
.glance-card {{ background: white; border-radius: 8px; padding: 14px; box-shadow: 0 1px 3px rgba(0,0,0,0.06); text-align: center; }}
.glance-card.status-added {{ background: #d6f0d8; }}
.glance-card.status-removed {{ background: #f4d4d4; }}
.glance-card.sev-high {{ background: #f4d4d4; }}
.glance-card.sev-medium {{ background: #fceac0; }}
.glance-card.sev-low {{ background: #eef2f7; }}
.glance-num {{ font-size: 26px; font-weight: 700; color: #222; }}
.glance-label {{ font-size: 12px; color: #555; margin-top: 4px; }}
.alignment-table {{ width: 100%; border-collapse: collapse; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.06); margin-bottom: 12px; font-size: 13px; }}
.alignment-table th, .alignment-table td {{ text-align: left; padding: 6px 12px; border-bottom: 1px solid #eee; }}
.alignment-table th {{ background: #fafafa; font-weight: 600; color: #555; }}
.alignment-table tr:last-child td {{ border-bottom: none; }}
.alignment-table tr.status-added td {{ background: #f4fcf5; }}
.alignment-table tr.status-removed td {{ background: #fdf3f3; }}
.status-pill {{ font-size: 11px; padding: 2px 8px; border-radius: 999px; font-weight: 600; background: #eef2f7; color: #4a5a72; }}
.status-pill.status-matched {{ background: #eef5ff; color: #2c4f8c; }}
.status-pill.status-added {{ background: #d6f0d8; color: #1f6a2a; }}
.status-pill.status-removed {{ background: #f4d4d4; color: #8a1f1f; }}
.pair-card {{ background: white; border-radius: 8px; padding: 14px 18px; margin-bottom: 8px; box-shadow: 0 1px 3px rgba(0,0,0,0.06); border-left: 3px solid transparent; }}
.pair-card[open] {{ padding-bottom: 18px; }}
.pair-card.status-matched-clean {{ border-left-color: #ccc; opacity: 0.85; }}
.pair-card.status-added {{ border-left-color: #2a8a3a; background: #f4fcf5; }}
.pair-card.status-removed {{ border-left-color: #b53030; background: #fdf3f3; }}
.pair-card summary {{ cursor: pointer; display: flex; align-items: center; justify-content: space-between; gap: 12px; font-size: 14px; list-style: none; }}
.pair-card summary::-webkit-details-marker {{ display: none; }}
.pair-card summary::before {{ content: '▸'; display: inline-block; transition: transform .15s; margin-right: 6px; color: #888; }}
.pair-card[open] summary::before {{ transform: rotate(90deg); }}
.pair-header {{ display: flex; align-items: center; gap: 12px; flex: 1; }}
.page-label {{ font-weight: 600; }}
.page-coords {{ color: #888; font-size: 12px; font-family: ui-monospace, SFMono-Regular, Menlo, monospace; }}
.pair-body {{ padding-left: 18px; padding-top: 8px; }}
.pair-summary {{ color: #444; font-size: 13px; margin: 4px 0 12px; }}
.sev-pill {{ font-size: 11px; padding: 2px 10px; border-radius: 999px; font-weight: 600; }}
.sev-pill.sev-high {{ background: #f4d4d4; color: #8a1f1f; }}
.sev-pill.sev-medium {{ background: #fceac0; color: #7a5a00; }}
.sev-pill.sev-low {{ background: #eef2f7; color: #4a5a72; }}
.sev-pill.sev-none {{ background: #e8efe8; color: #4a6a4a; }}
.diff-block {{ background: #fafbfc; border-left: 3px solid #ccc; padding: 8px 14px; margin: 8px 0; border-radius: 4px; }}
.diff-block.block-added {{ border-left-color: #2a8a3a; }}
.diff-block.block-removed {{ border-left-color: #b53030; }}
.diff-block.block-modified {{ border-left-color: #b58a00; }}
.diff-block.block-moved {{ border-left-color: #2c4f8c; }}
.diff-block.block-style {{ border-left-color: #8a4ab8; }}
.diff-label {{ font-weight: 600; font-size: 13px; margin-bottom: 4px; }}
.diff-block ul {{ margin: 4px 0; padding-left: 22px; }}
.diff-block li {{ font-size: 13px; line-height: 1.45; margin: 2px 0; }}
.diff-error {{ background: #fdf3f3; color: #8a1f1f; padding: 8px 12px; border-radius: 4px; font-size: 13px; margin: 8px 0; }}
.filter-bar {{ background: white; border-radius: 6px; padding: 10px 14px; margin-bottom: 8px; box-shadow: 0 1px 2px rgba(0,0,0,0.04); font-size: 13px; }}
.filter-bar label {{ cursor: pointer; margin-right: 12px; }}
.cost-line {{ font-size: 12px; color: #666; margin-top: 4px; }}
</style>
</head>
<body>
<div class='wrap'>
<h1>Old vs New Diff — {profile_name}</h1>
<div class='meta'>{timestamp}</div>
<div class='versions-card'>
<div class='vs-row'>
<div class='vs-old'>
<strong>OLD:</strong> {old_name}<br>
<span class='muted'>{old_pdf.get('pages_processed', 0)} pages</span>
</div>
<div class='vs-arrow'>→</div>
<div class='vs-new'>
<strong>NEW:</strong> {new_name}<br>
<span class='muted'>{new_pdf.get('pages_processed', 0)} pages</span>
</div>
</div>
</div>
<div class='overall-card'>
<div>
<div class='overall-score'>{score}</div>
<div style='font-size:12px;color:#666;'>Diff score (100 = identical)</div>
</div>
<div>
<span class='grade-badge'>{grade}</span>
</div>
<div class='cost-line muted'>
Tokens: {tokens_text}
</div>
</div>
<h2>At a glance</h2>
{glance}
<h2>Page alignment map</h2>
{alignment_table}
<h2>Page-by-page differences</h2>
{pair_cards}
</div>
</body>
</html>
"""
# ─────────────────────────────────────────────────────────────────────────────
# Public entrypoint
# ─────────────────────────────────────────────────────────────────────────────
def write_diff_report(
    result: Dict,
    old_filename: str,
    new_filename: str,
    session_id: str,
    output_dir: str,
    output_mode: str = 'both',
) -> Dict[str, Optional[str]]:
    """Write the JSON and/or HTML diff report into *output_dir*.

    Files are named
    ``<session_id>_<old_slug>_vs_<new_slug>_diff_data.json`` and
    ``<session_id>_<old_slug>_vs_<new_slug>_diff_report.html``, where the
    slugs are derived from *old_filename* and *new_filename*. The
    directory is created if it does not exist.

    Args:
        result: The diff result to serialise and render.
        old_filename: Name or path of the old document.
        new_filename: Name or path of the new document.
        session_id: Prefix for both output file names.
        output_dir: Directory to write into.
        output_mode: ``'json'``, ``'html'`` or ``'both'``. Any other value
            writes nothing.

    Returns:
        ``{'json': path or None, 'html': path or None}``, with a path for
        each file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)
    base = f"{session_id}_{_slug(old_filename)}_vs_{_slug(new_filename)}_diff"
    paths: Dict[str, Optional[str]] = {'json': None, 'html': None}

    if output_mode in ('json', 'both'):
        # Serialise before opening the file, so a serialisation error
        # does not leave a truncated JSON file behind.
        json_text = json.dumps(result, indent=2, default=str)
        json_path = os.path.join(output_dir, f"{base}_data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(json_text)
        paths['json'] = json_path

    if output_mode in ('html', 'both'):
        # Render first for the same reason: a rendering error must not
        # create or overwrite the report with an empty file.
        html_text = _render_html(result)
        html_path = os.path.join(output_dir, f"{base}_report.html")
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(html_text)
        paths['html'] = html_path

    return paths