From 617c1e3ca38d8a59dde9d9b0c40df9880608ebb8 Mon Sep 17 00:00:00 2001
From: DJP <DJP>
Date: Sun, 12 Apr 2026 16:43:38 -0400
Subject: [PATCH] Debug deep extraction Pass 2: better logging, truncate
 analysis, force tool use

- Log structure analysis length and data length before Pass 2
- Log stop_reason from Claude response
- If no assets are returned, log stop_reason and the first 500 chars of the text response for debugging
- Truncate structure analysis to 4k chars (plus an "[Analysis truncated]" marker) if longer, to leave room for data
- Reduce data to 40k chars (was 45k, which combined with the analysis was too large)
- Add instruction: "You MUST call extract_assets with at least one asset"

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 backend/app/services/doc_parser.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/backend/app/services/doc_parser.py b/backend/app/services/doc_parser.py
index 0c280d6..be40525 100644
--- a/backend/app/services/doc_parser.py
+++ b/backend/app/services/doc_parser.py
@@ -269,7 +269,12 @@ def deep_pass2_guided_extraction(text: str, structure_analysis: str) -> tuple[li
 
     Returns (assets, usage_info).
     """
-    logger.info("Deep extraction Pass 2: Extracting assets with structural guidance...")
+    # Truncate structure analysis if very long to leave room for data
+    if len(structure_analysis) > 4000:
+        structure_analysis = structure_analysis[:4000] + "\n[Analysis truncated]"
+
+    logger.info(f"Deep extraction Pass 2: structure={len(structure_analysis)} chars, data={len(text)} chars")
+
     guided_prompt = f"""You have been given a structural analysis of a complex client spreadsheet.
 Use this understanding to extract every deliverable asset accurately.
 
@@ -282,10 +287,11 @@ IMPORTANT GUIDELINES:
 - Skip rows that are questions, metadata, or caveats — those are not deliverables
 - If volume is 0 or "No", still extract the asset but set volume to 0
 - Carry forward category names from merged cells (the analysis explains the hierarchy)
+- You MUST call the extract_assets tool with at least one asset. If you cannot find structured assets, extract the best candidates you can identify.
 
 Now extract all deliverable assets from this data:
 
-{text[:45000]}"""
+{text[:40000]}"""
 
     response = call_claude(
         system=SYSTEM_PROMPT,
@@ -297,9 +303,15 @@ Now extract all deliverable assets from this data:
 
     usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0})
 
+    # Log response details for debugging
+    stop_reason = getattr(response, 'stop_reason', 'unknown')
+    logger.info(f"Deep extraction Pass 2 response: stop_reason={stop_reason}")
+
     result = extract_tool_result(response)
     if not result or "assets" not in result:
-        logger.warning("Deep extraction Pass 2 returned no assets")
+        # Log what we got instead
+        response_text = extract_text(response)
+        logger.warning(f"Deep extraction Pass 2 returned no assets. stop_reason={stop_reason}, text_response={response_text[:500] if response_text else 'none'}")
         return [], usage
 
     logger.info(f"Deep extraction complete: {len(result['assets'])} assets found")