Deep extraction: live progress between passes + elapsed timer

- Split deep extraction into two separate functions (pass 1 + pass 2)
  so the background task can update the DB between them
- Progress now shows (example timings):
  "Deep extraction Pass 1/2: Analyzing spreadsheet structure... (this takes 20-40 seconds)"
  "Pass 1 complete (23s). Pass 2/2: Extracting assets using structure analysis..."
  "Deep extraction complete (52s total). Found 45 assets."
- Live elapsed timer (in seconds) shown below the upload spinner's stage text
- Timer ticks every second so the user knows the upload is not hung

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
DJP 2026-04-10 10:03:04 -04:00
parent f01774e6f3
commit 8a2b45ae31
4 changed files with 56 additions and 26 deletions

View file

@ -10,7 +10,7 @@ from app.database import get_db, async_session
from app.models.gmal import GmalAsset
from app.models.project import Project, ClientAsset, Match, ProjectStatus, MatchConfidence
from app.schemas.project import ClientAssetOut, ClientAssetUpdate, MatchOut, MatchSelectRequest, ManualMatchRequest
from app.services.doc_parser import extract_text_from_file, parse_text_with_ai, deep_parse_text_with_ai
from app.services.doc_parser import extract_text_from_file, parse_text_with_ai, deep_pass1_structure_analysis, deep_pass2_guided_extraction, SYSTEM_PROMPT, EXTRACT_TOOLS
from app.services.ai_matching import match_client_assets
router = APIRouter()
@ -29,9 +29,33 @@ async def _background_parse(project_id: int, filename: str, text: str, metadata:
# Stage 3: AI parsing (normal or deep)
try:
if mode == "deep":
project.parse_stage = "Deep extraction: analyzing spreadsheet structure (Pass 1 of 2)..."
# Pass 1: Structure analysis
import time
start = time.time()
project.parse_stage = "Deep extraction Pass 1/2: Analyzing spreadsheet structure... (this takes 20-40 seconds)"
await db.commit()
structure_analysis, usage1 = deep_pass1_structure_analysis(text)
elapsed1 = int(time.time() - start)
project.ai_input_tokens = (project.ai_input_tokens or 0) + usage1.get("input_tokens", 0)
project.ai_output_tokens = (project.ai_output_tokens or 0) + usage1.get("output_tokens", 0)
project.ai_cost_usd = float(project.ai_cost_usd or 0) + usage1.get("cost_usd", 0)
project.ai_call_count = (project.ai_call_count or 0) + 1
project.parse_stage = f"Pass 1 complete ({elapsed1}s). Pass 2/2: Extracting assets using structure analysis..."
await db.commit()
# Pass 2: Guided extraction
extracted, usage2 = deep_pass2_guided_extraction(text, structure_analysis)
elapsed2 = int(time.time() - start)
usage_info = {
"input_tokens": usage1.get("input_tokens", 0) + usage2.get("input_tokens", 0),
"output_tokens": usage1.get("output_tokens", 0) + usage2.get("output_tokens", 0),
"cost_usd": usage1.get("cost_usd", 0) + usage2.get("cost_usd", 0),
}
project.parse_stage = f"Deep extraction complete ({elapsed2}s total). Found {len(extracted)} assets."
await db.commit()
extracted, usage_info = deep_parse_text_with_ai(text)
else:
extracted, usage_info = parse_text_with_ai(text)
except Exception as e:

View file

@ -227,33 +227,29 @@ For each sheet with meaningful data, describe:
Be specific reference actual column names and row numbers."""
def deep_parse_text_with_ai(text: str) -> tuple[list[dict], dict]:
"""Two-pass AI extraction for complex documents.
def deep_pass1_structure_analysis(text: str) -> tuple[str, dict]:
"""Pass 1 of deep extraction: analyze spreadsheet structure.
Pass 1: Analyze the spreadsheet structure
Pass 2: Extract assets using the structural understanding
Returns (assets, usage_info).
Returns (structure_analysis_text, usage_info).
"""
total_usage = {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}
# Pass 1: Structure analysis
logger.info("Deep extraction Pass 1: Analyzing spreadsheet structure...")
analysis_response = call_claude(
response = call_claude(
system=STRUCTURE_ANALYSIS_PROMPT,
user_message=f"Analyze the structure of this spreadsheet data:\n\n{text[:40000]}",
max_tokens=4096,
)
usage1 = getattr(analysis_response, '_usage_info', {})
total_usage["input_tokens"] += usage1.get("input_tokens", 0)
total_usage["output_tokens"] += usage1.get("output_tokens", 0)
total_usage["cost_usd"] += usage1.get("cost_usd", 0)
usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0})
analysis = extract_text(response)
logger.info(f"Deep extraction Pass 1 complete: {len(analysis)} chars of analysis")
return analysis, usage
structure_analysis = extract_text(analysis_response)
logger.info(f"Deep extraction Pass 1 complete: {len(structure_analysis)} chars of analysis")
# Pass 2: Guided extraction using the structure analysis
def deep_pass2_guided_extraction(text: str, structure_analysis: str) -> tuple[list[dict], dict]:
"""Pass 2 of deep extraction: extract assets using structural understanding.
Returns (assets, usage_info).
"""
logger.info("Deep extraction Pass 2: Extracting assets with structural guidance...")
guided_prompt = f"""You have been given a structural analysis of a complex client spreadsheet.
Use this understanding to extract every deliverable asset accurately.
@ -280,15 +276,12 @@ Now extract all deliverable assets from this data:
max_tokens=16000,
)
usage2 = getattr(response, '_usage_info', {})
total_usage["input_tokens"] += usage2.get("input_tokens", 0)
total_usage["output_tokens"] += usage2.get("output_tokens", 0)
total_usage["cost_usd"] += usage2.get("cost_usd", 0)
usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0})
result = extract_tool_result(response)
if not result or "assets" not in result:
logger.warning("Deep extraction Pass 2 returned no assets")
return [], total_usage
return [], usage
logger.info(f"Deep extraction complete: {len(result['assets'])} assets found")
return result["assets"], total_usage
return result["assets"], usage

View file

@ -145,6 +145,13 @@
cursor: pointer;
}
.upload-timer {
font-size: 12px;
color: var(--color-text-muted);
margin-top: 8px;
font-variant-numeric: tabular-nums;
}
.extraction-mode-toggle {
display: flex;
gap: 4px;

View file

@ -58,6 +58,7 @@ export default function ProjectView() {
const [uploading, setUploading] = useState(false);
const [uploadStage, setUploadStage] = useState('');
const [extractionMode, setExtractionMode] = useState<'normal' | 'deep'>('normal');
const [uploadTimer, setUploadTimer] = useState(0);
const [refineInput, setRefineInput] = useState('');
const [refining, setRefining] = useState(false);
const [refineLog, setRefineLog] = useState<string[]>([]);
@ -147,6 +148,8 @@ export default function ProjectView() {
const files = e.target.files;
if (!files || files.length === 0) return;
setUploading(true);
setUploadTimer(0);
const timerInterval = setInterval(() => setUploadTimer(t => t + 1), 1000);
const names = Array.from(files).map(f => f.name).join(', ');
setUploadStage(`Uploading ${files.length} file(s): ${names}...`);
@ -158,6 +161,7 @@ export default function ProjectView() {
await api.post(`/projects/${id}/upload?mode=${extractionMode}`, form);
} catch (err: any) {
alert(`Upload failed: ${err.response?.data?.detail || err.message}`);
clearInterval(timerInterval);
setUploading(false);
setUploadStage('');
return;
@ -172,6 +176,7 @@ export default function ProjectView() {
}
if (res.data.status !== 'parsing') {
clearInterval(pollInterval);
clearInterval(timerInterval);
setUploading(false);
setUploadStage('');
await loadProject();
@ -498,6 +503,7 @@ export default function ProjectView() {
<>
<div className="upload-spinner" />
<p className="upload-stage">{uploadStage}</p>
<p className="upload-timer">{uploadTimer}s elapsed</p>
</>
) : (
<>