From 8a2b45ae314c67970ec2299ed3f41f75c9c12a20 Mon Sep 17 00:00:00 2001 From: DJP Date: Fri, 10 Apr 2026 10:03:04 -0400 Subject: [PATCH] Deep extraction: live progress between passes + elapsed timer - Split deep extraction into two separate functions (pass1 + pass2) so the background task can update DB between them - Progress now shows: "Pass 1/2: Analyzing structure... (this takes 20-40 seconds)" "Pass 1 complete (23s). Pass 2/2: Extracting assets..." "Deep extraction complete (52s total). Found 45 assets." - Live elapsed timer (seconds) shown in the upload spinner - Timer ticks every second so user knows it's not hung Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/app/api/matching.py | 30 ++++++++++++++++++++--- backend/app/services/doc_parser.py | 39 ++++++++++++------------------ frontend/src/pages/ProjectView.css | 7 ++++++ frontend/src/pages/ProjectView.tsx | 6 +++++ 4 files changed, 56 insertions(+), 26 deletions(-) diff --git a/backend/app/api/matching.py b/backend/app/api/matching.py index 0a0738d..7e6f858 100644 --- a/backend/app/api/matching.py +++ b/backend/app/api/matching.py @@ -10,7 +10,7 @@ from app.database import get_db, async_session from app.models.gmal import GmalAsset from app.models.project import Project, ClientAsset, Match, ProjectStatus, MatchConfidence from app.schemas.project import ClientAssetOut, ClientAssetUpdate, MatchOut, MatchSelectRequest, ManualMatchRequest -from app.services.doc_parser import extract_text_from_file, parse_text_with_ai, deep_parse_text_with_ai +from app.services.doc_parser import extract_text_from_file, parse_text_with_ai, deep_pass1_structure_analysis, deep_pass2_guided_extraction, SYSTEM_PROMPT, EXTRACT_TOOLS from app.services.ai_matching import match_client_assets router = APIRouter() @@ -29,9 +29,33 @@ async def _background_parse(project_id: int, filename: str, text: str, metadata: # Stage 3: AI parsing (normal or deep) try: if mode == "deep": - project.parse_stage = "Deep extraction: analyzing 
spreadsheet structure (Pass 1 of 2)..." + # Pass 1: Structure analysis + import time + start = time.time() + project.parse_stage = "Deep extraction Pass 1/2: Analyzing spreadsheet structure... (this takes 20-40 seconds)" + await db.commit() + + structure_analysis, usage1 = deep_pass1_structure_analysis(text) + + elapsed1 = int(time.time() - start) + project.ai_input_tokens = (project.ai_input_tokens or 0) + usage1.get("input_tokens", 0) + project.ai_output_tokens = (project.ai_output_tokens or 0) + usage1.get("output_tokens", 0) + project.ai_cost_usd = float(project.ai_cost_usd or 0) + usage1.get("cost_usd", 0) + project.ai_call_count = (project.ai_call_count or 0) + 1 + project.parse_stage = f"Pass 1 complete ({elapsed1}s). Pass 2/2: Extracting assets using structure analysis..." + await db.commit() + + # Pass 2: Guided extraction + extracted, usage2 = deep_pass2_guided_extraction(text, structure_analysis) + + elapsed2 = int(time.time() - start) + usage_info = { + "input_tokens": usage1.get("input_tokens", 0) + usage2.get("input_tokens", 0), + "output_tokens": usage1.get("output_tokens", 0) + usage2.get("output_tokens", 0), + "cost_usd": usage1.get("cost_usd", 0) + usage2.get("cost_usd", 0), + } + project.parse_stage = f"Deep extraction complete ({elapsed2}s total). Found {len(extracted)} assets." await db.commit() - extracted, usage_info = deep_parse_text_with_ai(text) else: extracted, usage_info = parse_text_with_ai(text) except Exception as e: diff --git a/backend/app/services/doc_parser.py b/backend/app/services/doc_parser.py index 1cbbce8..503bb9e 100644 --- a/backend/app/services/doc_parser.py +++ b/backend/app/services/doc_parser.py @@ -227,33 +227,29 @@ For each sheet with meaningful data, describe: Be specific — reference actual column names and row numbers.""" -def deep_parse_text_with_ai(text: str) -> tuple[list[dict], dict]: - """Two-pass AI extraction for complex documents. 
+def deep_pass1_structure_analysis(text: str) -> tuple[str, dict]: + """Pass 1 of deep extraction: analyze spreadsheet structure. - Pass 1: Analyze the spreadsheet structure - Pass 2: Extract assets using the structural understanding - - Returns (assets, usage_info). + Returns (structure_analysis_text, usage_info). """ - total_usage = {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0} - - # Pass 1: Structure analysis logger.info("Deep extraction Pass 1: Analyzing spreadsheet structure...") - analysis_response = call_claude( + response = call_claude( system=STRUCTURE_ANALYSIS_PROMPT, user_message=f"Analyze the structure of this spreadsheet data:\n\n{text[:40000]}", max_tokens=4096, ) - usage1 = getattr(analysis_response, '_usage_info', {}) - total_usage["input_tokens"] += usage1.get("input_tokens", 0) - total_usage["output_tokens"] += usage1.get("output_tokens", 0) - total_usage["cost_usd"] += usage1.get("cost_usd", 0) + usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}) + analysis = extract_text(response) + logger.info(f"Deep extraction Pass 1 complete: {len(analysis)} chars of analysis") + return analysis, usage - structure_analysis = extract_text(analysis_response) - logger.info(f"Deep extraction Pass 1 complete: {len(structure_analysis)} chars of analysis") - # Pass 2: Guided extraction using the structure analysis +def deep_pass2_guided_extraction(text: str, structure_analysis: str) -> tuple[list[dict], dict]: + """Pass 2 of deep extraction: extract assets using structural understanding. + + Returns (assets, usage_info). + """ logger.info("Deep extraction Pass 2: Extracting assets with structural guidance...") guided_prompt = f"""You have been given a structural analysis of a complex client spreadsheet. Use this understanding to extract every deliverable asset accurately. 
@@ -280,15 +276,12 @@ Now extract all deliverable assets from this data: max_tokens=16000, ) - usage2 = getattr(response, '_usage_info', {}) - total_usage["input_tokens"] += usage2.get("input_tokens", 0) - total_usage["output_tokens"] += usage2.get("output_tokens", 0) - total_usage["cost_usd"] += usage2.get("cost_usd", 0) + usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}) result = extract_tool_result(response) if not result or "assets" not in result: logger.warning("Deep extraction Pass 2 returned no assets") - return [], total_usage + return [], usage logger.info(f"Deep extraction complete: {len(result['assets'])} assets found") - return result["assets"], total_usage + return result["assets"], usage diff --git a/frontend/src/pages/ProjectView.css b/frontend/src/pages/ProjectView.css index eb55a85..32e075a 100644 --- a/frontend/src/pages/ProjectView.css +++ b/frontend/src/pages/ProjectView.css @@ -145,6 +145,13 @@ cursor: pointer; } +.upload-timer { + font-size: 12px; + color: var(--color-text-muted); + margin-top: 8px; + font-variant-numeric: tabular-nums; +} + .extraction-mode-toggle { display: flex; gap: 4px; diff --git a/frontend/src/pages/ProjectView.tsx b/frontend/src/pages/ProjectView.tsx index fbae811..1afe8d9 100644 --- a/frontend/src/pages/ProjectView.tsx +++ b/frontend/src/pages/ProjectView.tsx @@ -58,6 +58,7 @@ export default function ProjectView() { const [uploading, setUploading] = useState(false); const [uploadStage, setUploadStage] = useState(''); const [extractionMode, setExtractionMode] = useState<'normal' | 'deep'>('normal'); + const [uploadTimer, setUploadTimer] = useState(0); const [refineInput, setRefineInput] = useState(''); const [refining, setRefining] = useState(false); const [refineLog, setRefineLog] = useState([]); @@ -147,6 +148,8 @@ export default function ProjectView() { const files = e.target.files; if (!files || files.length === 0) return; setUploading(true); + setUploadTimer(0); + 
const timerInterval = setInterval(() => setUploadTimer(t => t + 1), 1000); const names = Array.from(files).map(f => f.name).join(', '); setUploadStage(`Uploading ${files.length} file(s): ${names}...`); @@ -158,6 +161,7 @@ export default function ProjectView() { await api.post(`/projects/${id}/upload?mode=${extractionMode}`, form); } catch (err: any) { alert(`Upload failed: ${err.response?.data?.detail || err.message}`); + clearInterval(timerInterval); setUploading(false); setUploadStage(''); return; @@ -172,6 +176,7 @@ export default function ProjectView() { } if (res.data.status !== 'parsing') { clearInterval(pollInterval); + clearInterval(timerInterval); setUploading(false); setUploadStage(''); await loadProject(); @@ -498,6 +503,7 @@ export default function ProjectView() { <>

{uploadStage}

+              <div className="upload-timer">{uploadTimer}s elapsed</div>

) : ( <>