Deep extraction: live progress between passes + elapsed timer
- Split deep extraction into two separate functions (pass1 + pass2) so the background task can update the DB between them
- Progress now shows:
  "Pass 1/2: Analyzing structure... (this takes 20-40 seconds)"
  "Pass 1 complete (23s). Pass 2/2: Extracting assets..."
  "Deep extraction complete (52s total). Found 45 assets."
- Live elapsed timer (seconds) shown in the upload spinner
- Timer ticks every second so the user knows it's not hung

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f01774e6f3
commit
8a2b45ae31
4 changed files with 56 additions and 26 deletions
|
|
@ -10,7 +10,7 @@ from app.database import get_db, async_session
|
|||
from app.models.gmal import GmalAsset
|
||||
from app.models.project import Project, ClientAsset, Match, ProjectStatus, MatchConfidence
|
||||
from app.schemas.project import ClientAssetOut, ClientAssetUpdate, MatchOut, MatchSelectRequest, ManualMatchRequest
|
||||
from app.services.doc_parser import extract_text_from_file, parse_text_with_ai, deep_parse_text_with_ai
|
||||
from app.services.doc_parser import extract_text_from_file, parse_text_with_ai, deep_pass1_structure_analysis, deep_pass2_guided_extraction, SYSTEM_PROMPT, EXTRACT_TOOLS
|
||||
from app.services.ai_matching import match_client_assets
|
||||
|
||||
router = APIRouter()
|
||||
|
|
@ -29,9 +29,33 @@ async def _background_parse(project_id: int, filename: str, text: str, metadata:
|
|||
# Stage 3: AI parsing (normal or deep)
|
||||
try:
|
||||
if mode == "deep":
|
||||
project.parse_stage = "Deep extraction: analyzing spreadsheet structure (Pass 1 of 2)..."
|
||||
# Pass 1: Structure analysis
|
||||
import time
|
||||
start = time.time()
|
||||
project.parse_stage = "Deep extraction Pass 1/2: Analyzing spreadsheet structure... (this takes 20-40 seconds)"
|
||||
await db.commit()
|
||||
|
||||
structure_analysis, usage1 = deep_pass1_structure_analysis(text)
|
||||
|
||||
elapsed1 = int(time.time() - start)
|
||||
project.ai_input_tokens = (project.ai_input_tokens or 0) + usage1.get("input_tokens", 0)
|
||||
project.ai_output_tokens = (project.ai_output_tokens or 0) + usage1.get("output_tokens", 0)
|
||||
project.ai_cost_usd = float(project.ai_cost_usd or 0) + usage1.get("cost_usd", 0)
|
||||
project.ai_call_count = (project.ai_call_count or 0) + 1
|
||||
project.parse_stage = f"Pass 1 complete ({elapsed1}s). Pass 2/2: Extracting assets using structure analysis..."
|
||||
await db.commit()
|
||||
|
||||
# Pass 2: Guided extraction
|
||||
extracted, usage2 = deep_pass2_guided_extraction(text, structure_analysis)
|
||||
|
||||
elapsed2 = int(time.time() - start)
|
||||
usage_info = {
|
||||
"input_tokens": usage1.get("input_tokens", 0) + usage2.get("input_tokens", 0),
|
||||
"output_tokens": usage1.get("output_tokens", 0) + usage2.get("output_tokens", 0),
|
||||
"cost_usd": usage1.get("cost_usd", 0) + usage2.get("cost_usd", 0),
|
||||
}
|
||||
project.parse_stage = f"Deep extraction complete ({elapsed2}s total). Found {len(extracted)} assets."
|
||||
await db.commit()
|
||||
extracted, usage_info = deep_parse_text_with_ai(text)
|
||||
else:
|
||||
extracted, usage_info = parse_text_with_ai(text)
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@ -227,33 +227,29 @@ For each sheet with meaningful data, describe:
|
|||
Be specific — reference actual column names and row numbers."""
|
||||
|
||||
|
||||
def deep_parse_text_with_ai(text: str) -> tuple[list[dict], dict]:
|
||||
"""Two-pass AI extraction for complex documents.
|
||||
def deep_pass1_structure_analysis(text: str) -> tuple[str, dict]:
|
||||
"""Pass 1 of deep extraction: analyze spreadsheet structure.
|
||||
|
||||
Pass 1: Analyze the spreadsheet structure
|
||||
Pass 2: Extract assets using the structural understanding
|
||||
|
||||
Returns (assets, usage_info).
|
||||
Returns (structure_analysis_text, usage_info).
|
||||
"""
|
||||
total_usage = {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}
|
||||
|
||||
# Pass 1: Structure analysis
|
||||
logger.info("Deep extraction Pass 1: Analyzing spreadsheet structure...")
|
||||
analysis_response = call_claude(
|
||||
response = call_claude(
|
||||
system=STRUCTURE_ANALYSIS_PROMPT,
|
||||
user_message=f"Analyze the structure of this spreadsheet data:\n\n{text[:40000]}",
|
||||
max_tokens=4096,
|
||||
)
|
||||
|
||||
usage1 = getattr(analysis_response, '_usage_info', {})
|
||||
total_usage["input_tokens"] += usage1.get("input_tokens", 0)
|
||||
total_usage["output_tokens"] += usage1.get("output_tokens", 0)
|
||||
total_usage["cost_usd"] += usage1.get("cost_usd", 0)
|
||||
usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0})
|
||||
analysis = extract_text(response)
|
||||
logger.info(f"Deep extraction Pass 1 complete: {len(analysis)} chars of analysis")
|
||||
return analysis, usage
|
||||
|
||||
structure_analysis = extract_text(analysis_response)
|
||||
logger.info(f"Deep extraction Pass 1 complete: {len(structure_analysis)} chars of analysis")
|
||||
|
||||
# Pass 2: Guided extraction using the structure analysis
|
||||
def deep_pass2_guided_extraction(text: str, structure_analysis: str) -> tuple[list[dict], dict]:
|
||||
"""Pass 2 of deep extraction: extract assets using structural understanding.
|
||||
|
||||
Returns (assets, usage_info).
|
||||
"""
|
||||
logger.info("Deep extraction Pass 2: Extracting assets with structural guidance...")
|
||||
guided_prompt = f"""You have been given a structural analysis of a complex client spreadsheet.
|
||||
Use this understanding to extract every deliverable asset accurately.
|
||||
|
|
@ -280,15 +276,12 @@ Now extract all deliverable assets from this data:
|
|||
max_tokens=16000,
|
||||
)
|
||||
|
||||
usage2 = getattr(response, '_usage_info', {})
|
||||
total_usage["input_tokens"] += usage2.get("input_tokens", 0)
|
||||
total_usage["output_tokens"] += usage2.get("output_tokens", 0)
|
||||
total_usage["cost_usd"] += usage2.get("cost_usd", 0)
|
||||
usage = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0})
|
||||
|
||||
result = extract_tool_result(response)
|
||||
if not result or "assets" not in result:
|
||||
logger.warning("Deep extraction Pass 2 returned no assets")
|
||||
return [], total_usage
|
||||
return [], usage
|
||||
|
||||
logger.info(f"Deep extraction complete: {len(result['assets'])} assets found")
|
||||
return result["assets"], total_usage
|
||||
return result["assets"], usage
|
||||
|
|
|
|||
|
|
@ -145,6 +145,13 @@
|
|||
cursor: pointer;
|
||||
}
|
||||
|
||||
.upload-timer {
|
||||
font-size: 12px;
|
||||
color: var(--color-text-muted);
|
||||
margin-top: 8px;
|
||||
font-variant-numeric: tabular-nums;
|
||||
}
|
||||
|
||||
.extraction-mode-toggle {
|
||||
display: flex;
|
||||
gap: 4px;
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ export default function ProjectView() {
|
|||
const [uploading, setUploading] = useState(false);
|
||||
const [uploadStage, setUploadStage] = useState('');
|
||||
const [extractionMode, setExtractionMode] = useState<'normal' | 'deep'>('normal');
|
||||
const [uploadTimer, setUploadTimer] = useState(0);
|
||||
const [refineInput, setRefineInput] = useState('');
|
||||
const [refining, setRefining] = useState(false);
|
||||
const [refineLog, setRefineLog] = useState<string[]>([]);
|
||||
|
|
@ -147,6 +148,8 @@ export default function ProjectView() {
|
|||
const files = e.target.files;
|
||||
if (!files || files.length === 0) return;
|
||||
setUploading(true);
|
||||
setUploadTimer(0);
|
||||
const timerInterval = setInterval(() => setUploadTimer(t => t + 1), 1000);
|
||||
const names = Array.from(files).map(f => f.name).join(', ');
|
||||
setUploadStage(`Uploading ${files.length} file(s): ${names}...`);
|
||||
|
||||
|
|
@ -158,6 +161,7 @@ export default function ProjectView() {
|
|||
await api.post(`/projects/${id}/upload?mode=${extractionMode}`, form);
|
||||
} catch (err: any) {
|
||||
alert(`Upload failed: ${err.response?.data?.detail || err.message}`);
|
||||
clearInterval(timerInterval);
|
||||
setUploading(false);
|
||||
setUploadStage('');
|
||||
return;
|
||||
|
|
@ -172,6 +176,7 @@ export default function ProjectView() {
|
|||
}
|
||||
if (res.data.status !== 'parsing') {
|
||||
clearInterval(pollInterval);
|
||||
clearInterval(timerInterval);
|
||||
setUploading(false);
|
||||
setUploadStage('');
|
||||
await loadProject();
|
||||
|
|
@ -498,6 +503,7 @@ export default function ProjectView() {
|
|||
<>
|
||||
<div className="upload-spinner" />
|
||||
<p className="upload-stage">{uploadStage}</p>
|
||||
<p className="upload-timer">{uploadTimer}s elapsed</p>
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue