"""Diff report writer for AXA Old-vs-New document mode.

Distinct from result_writer.py — the diff report's shape is fundamentally
different (alignment table, page-pair cards, severity breakdowns) so it gets
its own module rather than overloading the single-doc writer.

Outputs:
    {output_dir}/{session_id}_{old}_vs_{new}_diff_data.json
    {output_dir}/{session_id}_{old}_vs_{new}_diff_report.html
"""
from __future__ import annotations

import html
import json
import os
from typing import Dict, List, Optional

# Placeholder shown wherever a page number or similarity score is unavailable.
_MISSING = '—'

# Minimal stylesheet for the class names emitted by the renderers below.
# Kept as a plain (non-f) string so the CSS braces need no escaping.
_REPORT_CSS = """
body { font-family: system-ui, sans-serif; margin: 2rem; color: #1f2933; }
table { border-collapse: collapse; }
th, td { border: 1px solid #d0d7de; padding: 4px 10px; text-align: left; }
.glance { display: flex; flex-wrap: wrap; gap: 12px; }
.stat, .pair-card { border: 1px solid #d0d7de; border-radius: 6px; padding: 10px 14px; }
.pair-card { margin: 12px 0; }
.stat-value { font-size: 1.4rem; font-weight: 600; }
.badge { border-radius: 10px; padding: 1px 8px; font-size: 0.85em; }
.sev-high { background: #fde2e1; }
.sev-medium { background: #fff1cc; }
.sev-low { background: #e3f1ff; }
.sev-none { background: #e6f4ea; }
.status-added { border-left: 4px solid #2da44e; }
.status-removed { border-left: 4px solid #cf222e; }
.status-matched { border-left: 4px solid #8c959f; }
.error { color: #cf222e; }
"""


def _slug(name: str) -> str:
    """Return a filename-safe stem for *name*: no directory, no extension, max 60 chars."""
    base = os.path.splitext(os.path.basename(name))[0]
    return base.replace(' ', '_').replace('/', '_')[:60]


def _esc(value: object) -> str:
    """HTML-escape *value* after coercing it to text; ``None`` becomes ''.

    The result dict may carry non-string values (the JSON writer relies on
    ``default=str``), and ``html.escape`` only accepts ``str``.
    """
    return '' if value is None else html.escape(str(value))


def _page_label(page: object) -> str:
    """Return the page number as escaped text, or a dash when there is none."""
    return _MISSING if page is None else _esc(page)


def _fmt_similarity(sim: object) -> str:
    """Format a similarity score to two decimals, or a dash when it is not numeric."""
    return f"{sim:.2f}" if isinstance(sim, (int, float)) else _MISSING


def _severity_class(sev: str) -> str:
    """Map a severity name to its CSS class; unknown values map to 'sev-none'."""
    return {
        'high': 'sev-high',
        'medium': 'sev-medium',
        'low': 'sev-low',
        'none': 'sev-none',
    }.get(sev, 'sev-none')


def _status_class(status: str) -> str:
    """Map an alignment status to its CSS class; unknown values map to ''."""
    return {
        'matched': 'status-matched',
        'added': 'status-added',
        'removed': 'status-removed',
    }.get(status, '')


def _render_alignment_table(alignment: List[Dict]) -> str:
    """Render the old-page/new-page alignment map as an HTML table.

    Each entry may provide 'old_page', 'new_page', 'similarity' and 'status';
    missing pages or non-numeric similarities are shown as a dash.
    """
    rows = []
    for entry in alignment:
        status = entry.get('status', '')
        old_cell = _page_label(entry.get('old_page'))
        new_cell = _page_label(entry.get('new_page'))
        sim_cell = _fmt_similarity(entry.get('similarity'))
        rows.append(
            f'<tr class="{_status_class(status)}">'
            f'<td>{old_cell}</td>'
            f'<td>{new_cell}</td>'
            f'<td>{sim_cell}</td>'
            f'<td>{_esc(status)}</td>'
            f'</tr>\n'
        )
    return f"""
<table class="alignment">
  <thead>
    <tr><th>Old page</th><th>New page</th><th>Similarity</th><th>Status</th></tr>
  </thead>
  <tbody>
{''.join(rows)}  </tbody>
</table>
"""


def _render_diff_list(items: List[str], css_class: str, label: str, icon: str) -> str:
    """Render one category of differences as a titled bullet list.

    Returns '' when *items* is empty so that empty categories take no space.
    """
    if not items:
        return ''
    bullets = ''.join(f"<li>{_esc(it)}</li>\n" for it in items)
    return f"""
<div class="diff-block {css_class}">
  <h4>{icon} {label}</h4>
  <ul>
{bullets}  </ul>
</div>
"""


def _render_formatting_block(findings: List[Dict]) -> str:
    """Render formatting-change findings (attribute old → new, with example quotes).

    Each finding may provide 'attribute', 'old_value', 'new_value',
    'total_span_count', 'page_wide' and 'example_quotes'. At most three
    quotes are shown; the remainder is summarised as "…and N more".
    Returns '' when there are no findings.
    """
    if not findings:
        return ''

    def _fmt_value(v, attribute):
        # Boolean attributes are shown as a style name instead of True/False.
        if isinstance(v, bool):
            if attribute == 'italic':
                return 'Italic' if v else 'Regular'
            return 'Bold' if v else 'Regular'
        return str(v)

    items = []
    for finding in findings:
        attr = finding.get('attribute', '')
        attr_html = _esc(attr)
        old_v = _esc(_fmt_value(finding.get('old_value'), attr))
        new_v = _esc(_fmt_value(finding.get('new_value'), attr))
        total = finding.get('total_span_count', 0) or 0
        quotes = finding.get('example_quotes', []) or []

        if finding.get('page_wide', False):
            prefix = f"Page-wide {attr_html} change: {old_v} → {new_v}"
        else:
            prefix = f"{attr_html.capitalize()}: {old_v} → {new_v}"

        quote_html = ''
        shown = quotes[:3]
        if shown:
            quoted = ', '.join(f'“{_esc(q)}”' for q in shown)
            extra = total - len(shown)
            extra_html = f" …and {extra} more" if extra > 0 else ''
            plural = 's' if total != 1 else ''
            quote_html = f" ({total} span{plural}): {quoted}{extra_html}"

        items.append(f"<li>{prefix}{quote_html}</li>\n")

    return f"""
<div class="diff-block block-formatting">
  <h4>🎨 Formatting changes</h4>
  <ul>
{''.join(items)}  </ul>
</div>
"""


def _render_pair_card(entry: Dict, pair_diffs: Dict) -> str:
    """Render the card for one alignment entry.

    Added and removed pages get a short notice. Matched pairs look up their
    diff under the key ``"{old}->{new}"`` in *pair_diffs* (value shape:
    ``{'diff': {...}}``) and render either an "identical" card or the full
    list of differences, including any error reported for the pair.
    """
    old = entry.get('old_page')
    new = entry.get('new_page')
    status = entry.get('status')
    old_label = _page_label(old)
    new_label = _page_label(new)

    # Added or removed entire pages — different shape
    if status == 'added':
        return f"""
<div class="pair-card status-added">
  <div class="pair-header">
    <span class="pair-title">+ Page added in new version</span>
    <span class="pair-meta">new page {new_label}</span>
  </div>
  <p>This page exists in the new version but had no counterpart in the old version.</p>
</div>
"""
    if status == 'removed':
        return f"""
<div class="pair-card status-removed">
  <div class="pair-header">
    <span class="pair-title">− Page removed in new version</span>
    <span class="pair-meta">old page {old_label}</span>
  </div>
  <p>This page was in the old version but is not in the new version.</p>
</div>
"""

    # Matched pair — render diff result. The lookup key uses the raw page
    # values, not the escaped labels.
    key = f"{old}->{new}"
    pair = (pair_diffs.get(key) or {}).get('diff') or {}
    sev = pair.get('severity') or 'none'
    summary = pair.get('summary', '')
    error = pair.get('error')
    # Similarity may be absent even for matched pairs; never feed None to ':.2f'.
    sim_str = _fmt_similarity(entry.get('similarity'))

    if not pair.get('differences_found', False) and not error:
        return f"""
<div class="pair-card status-matched sev-none">
  <div class="pair-header">
    <span class="pair-title">= No differences detected</span>
    <span class="pair-meta">old {old_label} ↔ new {new_label} · sim {sim_str}</span>
    <span class="badge sev-none">identical</span>
  </div>
  <p>{_esc(summary or "Pages compared as visually identical.")}</p>
</div>
"""

    categories = (
        ('added', 'block-added', 'Added', '➕'),
        ('removed', 'block-removed', 'Removed', '➖'),
        ('modified', 'block-modified', 'Modified', '✎'),
        ('moved', 'block-moved', 'Moved', '↔'),
        ('style_changes', 'block-style', 'Style changes', '🎨'),
    )
    blocks = [
        _render_diff_list(pair.get(field) or [], css_class, label, icon)
        for field, css_class, label, icon in categories
    ]
    blocks.append(_render_formatting_block(pair.get('formatting_changes') or []))

    error_block = ''
    if error:
        error_block = f'<div class="error">⚠️ {_esc(error)}</div>'

    return f"""
<div class="pair-card status-matched">
  <div class="pair-header">
    <span class="pair-title">old {old_label} ↔ new {new_label}</span>
    <span class="pair-sim">sim {sim_str}</span>
    <span class="badge {_severity_class(sev)}">{_esc(sev)}</span>
  </div>
  <p class="summary">{_esc(summary or '')}</p>
  {error_block}
  {''.join(blocks)}
</div>
"""


def _render_at_a_glance(totals: Dict, doc_summary: Dict) -> str:
    """Render the summary tiles (page counts and severity counts) from *totals*.

    *doc_summary* is accepted for interface compatibility and is not read.
    """
    sev = totals.get('severity_counts') or {}
    tiles = [
        (f"{totals.get('old_page_count', 0)} → {totals.get('new_page_count', 0)}", 'Page count'),
        (totals.get('pages_added', 0), 'Pages added'),
        (totals.get('pages_removed', 0), 'Pages removed'),
        (totals.get('pages_modified', 0), 'Pages modified'),
        (totals.get('pages_unchanged', 0), 'Pages unchanged'),
        (sev.get('high', 0), 'High severity'),
        (sev.get('medium', 0), 'Medium severity'),
        (sev.get('low', 0), 'Low severity'),
    ]
    tiles_html = ''.join(
        f'  <div class="stat"><div class="stat-value">{_esc(value)}</div>'
        f'<div class="stat-label">{label}</div></div>\n'
        for value, label in tiles
    )
    return f"""
<div class="glance">
{tiles_html}</div>
"""


def _render_html(result: Dict) -> str:
    """Build the complete HTML report document from the diff *result* dict.

    Reads 'old_pdf', 'new_pdf', 'totals', 'document_summary', 'alignment',
    'pair_diffs', 'profile_name', 'timestamp' and 'token_usage'; every key is
    optional and falls back to an empty value.
    """
    old_pdf = result.get('old_pdf') or {}
    new_pdf = result.get('new_pdf') or {}
    totals = result.get('totals') or {}
    doc_summary = result.get('document_summary') or {}
    alignment = result.get('alignment') or []
    pair_diffs = result.get('pair_diffs') or {}

    score = doc_summary.get('overall_score', 0)
    grade = doc_summary.get('grade', '')
    tokens = (result.get('token_usage') or {}).get('total_tokens') or 0
    # The ',' grouping format only applies to numbers.
    tokens_str = f"{tokens:,}" if isinstance(tokens, (int, float)) else _esc(tokens)

    glance = _render_at_a_glance(totals, doc_summary)
    alignment_table = _render_alignment_table(alignment)
    pair_cards = '\n'.join(_render_pair_card(entry, pair_diffs) for entry in alignment)

    title = f"Diff Report — {old_pdf.get('filename', 'old')} vs {new_pdf.get('filename', 'new')}"

    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{_esc(title)}</title>
<style>{_REPORT_CSS}</style>
</head>
<body>
<header>
  <h1>Old vs New Diff — {_esc(result.get('profile_name', ''))}</h1>
  <div class="meta">{_esc(result.get('timestamp', ''))}</div>
  <div class="doc"><strong>OLD:</strong> {_esc(old_pdf.get('filename', ''))}
    <span class="pages">{_esc(old_pdf.get('pages_processed', 0))} pages</span></div>
  <div class="doc"><strong>NEW:</strong> {_esc(new_pdf.get('filename', ''))}
    <span class="pages">{_esc(new_pdf.get('pages_processed', 0))} pages</span></div>
  <div class="score">
    <div class="score-value">{_esc(score)}</div>
    <div class="score-label">Diff score (100 = identical)</div>
    <div class="grade">{_esc(grade)}</div>
  </div>
  <div class="tokens">Tokens: {tokens_str}</div>
</header>

<h2>At a glance</h2>
{glance}

<h2>Page alignment map</h2>
{alignment_table}

<h2>Page-by-page differences</h2>
{pair_cards}
</body>
</html>
"""


# ─────────────────────────────────────────────────────────────────────────────
# Public entrypoint
# ─────────────────────────────────────────────────────────────────────────────

def write_diff_report(
    result: Dict,
    old_filename: str,
    new_filename: str,
    session_id: str,
    output_dir: str,
    output_mode: str = 'both',
) -> Dict[str, Optional[str]]:
    """Write JSON + HTML diff reports.

    Args:
        result: Diff result dict; dumped as-is to JSON and rendered to HTML.
        old_filename: Name of the old document, used (slugged) in the file names.
        new_filename: Name of the new document, used (slugged) in the file names.
        session_id: Prefix for the output file names.
        output_dir: Destination directory; created if it does not exist.
        output_mode: 'json', 'html' or 'both'. Any other value writes nothing.

    Returns:
        {'json': path or None, 'html': path or None}
    """
    os.makedirs(output_dir, exist_ok=True)
    base = f"{session_id}_{_slug(old_filename)}_vs_{_slug(new_filename)}_diff"
    paths: Dict[str, Optional[str]] = {'json': None, 'html': None}

    if output_mode in ('json', 'both'):
        json_path = os.path.join(output_dir, f"{base}_data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            # default=str: values that are not JSON-native are stringified.
            json.dump(result, f, indent=2, default=str)
        paths['json'] = json_path

    if output_mode in ('html', 'both'):
        html_path = os.path.join(output_dir, f"{base}_report.html")
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(_render_html(result))
        paths['html'] = html_path

    return paths