diff --git a/backend/app/services/doc_parser.py b/backend/app/services/doc_parser.py index 0c280d6..be40525 100644 --- a/backend/app/services/doc_parser.py +++ b/backend/app/services/doc_parser.py @@ -269,7 +269,12 @@ def deep_pass2_guided_extraction(text: str, structure_analysis: str) -> tuple[li Returns (assets, usage_info). """ - logger.info("Deep extraction Pass 2: Extracting assets with structural guidance...") + # Truncate structure analysis if very long to leave room for data + if len(structure_analysis) > 4000: + structure_analysis = structure_analysis[:4000] + "\n[Analysis truncated]" + + logger.info(f"Deep extraction Pass 2: structure={len(structure_analysis)} chars, data={len(text)} chars") + guided_prompt = f"""You have been given a structural analysis of a complex client spreadsheet. Use this understanding to extract every deliverable asset accurately. @@ -282,10 +287,11 @@ IMPORTANT GUIDELINES: - Skip rows that are questions, metadata, or caveats — those are not deliverables - If volume is 0 or "No", still extract the asset but set volume to 0 - Carry forward category names from merged cells (the analysis explains the hierarchy) +- You MUST call the extract_assets tool with at least one asset. If you cannot find structured assets, extract the best candidates you can identify. Now extract all deliverable assets from this data: -{text[:45000]}""" +{text[:40000]}""" response = call_claude( system=SYSTEM_PROMPT, @@ -297,9 +303,15 @@ Now extract all deliverable assets from this data: usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}) + # Log response details for debugging + stop_reason = getattr(response, 'stop_reason', 'unknown') + logger.info(f"Deep extraction Pass 2 response: stop_reason={stop_reason}") + result = extract_tool_result(response) if not result or "assets" not in result: - logger.warning("Deep extraction Pass 2 returned no assets") + # Log what we got instead + response_text = extract_text(response) + logger.warning(f"Deep extraction Pass 2 returned no assets. stop_reason={stop_reason}, text_response={response_text[:500] if response_text else 'none'}") return [], usage logger.info(f"Deep extraction complete: {len(result['assets'])} assets found")