Fix: Use a single test query for the readiness check; detects when docs are actually searchable

This commit is contained in:
DJP 2025-10-02 13:32:05 -04:00
parent 32d0548475
commit ffbd7e2924
2 changed files with 35 additions and 33 deletions

View file

@ -101,7 +101,7 @@ if not documents:
# Check if pipeline is actually ready
from background_tasks import get_notebook_processing_tasks, TaskStatus
from document_manager import get_latest_document_summary
from pipeline_manager import check_pipeline_status_direct
from pipeline_manager import check_pipeline_ready_with_test
import asyncio as aio
import time
@ -138,8 +138,8 @@ if f'pipeline_confirmed_ready_{notebook.id}' not in st.session_state:
st.stop()
with st.spinner("🔍 Checking pipeline status..."):
readiness = asyncio.run(check_pipeline_status_direct(notebook.pipeline_id))
with st.spinner("🔍 Checking if documents are searchable..."):
readiness = asyncio.run(check_pipeline_ready_with_test(notebook.pipeline_id, notebook.model_type))
# If ready, mark as confirmed and don't test again
if readiness.get('ready', False):

View file

@ -85,53 +85,55 @@ async def add_document_to_pipeline(pipeline_id: str, file_path: str) -> bool:
return False
# Heuristics for deciding whether the readiness test query produced a real
# answer rather than a placeholder or an error string.
_MIN_REAL_RESPONSE_CHARS = 50
_NOT_READY_MARKERS = ("Empty Response", "Error:")


def _is_real_response(response) -> bool:
    """Return True when a test-query response looks like a genuine answer."""
    if not response:
        return False
    if any(marker in response for marker in _NOT_READY_MARKERS):
        return False
    return len(response) > _MIN_REAL_RESPONSE_CHARS


async def check_pipeline_ready_with_test(pipeline_id: str, model_type: str) -> dict:
    """
    Check if a pipeline is actually ready by running ONE simple test query.

    Intended to be called only during the readiness check, not during chat.

    The pipeline is fetched first; if it reports no data sources the test
    query is skipped entirely. Otherwise a single summarisation query is
    issued through ``query_notebook_pipeline`` and the pipeline counts as
    ready only when the reply looks like real content (non-empty, no
    "Empty Response" / "Error:" marker, longer than 50 characters).

    Args:
        pipeline_id: Identifier of the pipeline to inspect and query.
        model_type: Passed through unchanged to ``query_notebook_pipeline``.

    Returns:
        dict with 'ready' (bool), 'status' (str), 'message' (str).
        'status' is one of 'Ready', 'No documents', 'Still indexing', 'Error'.
        Exceptions are not propagated; any failure is reported as
        ``{'ready': False, 'status': 'Error', ...}``.
    """
    try:
        # First check if the pipeline has documents at all
        client = AsyncLlamaCloud(token=os.getenv("LLAMACLOUD_API_KEY"))
        pipeline = await client.pipelines.get_pipeline(pipeline_id=pipeline_id)

        # Tolerate a missing attribute as well as an explicit None so that an
        # empty pipeline is reported as "No documents" rather than as an error.
        transform = getattr(pipeline, 'configured_transformation', None)
        doc_count = len(getattr(transform, 'data_sources', None) or [])

        if doc_count == 0:
            return {
                'ready': False,
                'status': 'No documents',
                'message': 'No documents in pipeline yet'
            }

        # Documents exist - now test if they're actually searchable
        print(f"Testing pipeline {pipeline_id} with simple query...")
        # NOTE(review): assumes query_notebook_pipeline returns a string
        # (or None) - confirm against its definition.
        test_response = await query_notebook_pipeline(
            pipeline_id, "Summarize this document briefly", model_type
        )

        # Check if we got a real response
        if _is_real_response(test_response):
            print("✓ Pipeline ready - got real response")
            return {
                'ready': True,
                'status': 'Ready',
                'message': f'{doc_count} document(s) indexed and searchable'
            }

        print(f"⏳ Pipeline not ready yet - got: {test_response[:100] if test_response else 'None'}")
        return {
            'ready': False,
            'status': 'Still indexing',
            'message': f'{doc_count} document(s) added but not searchable yet'
        }

    except Exception as e:
        # Deliberate catch-all boundary: the caller polls this function and
        # expects a status dict rather than an exception.
        print(f"Error checking readiness: {e}")
        return {
            'ready': False,
            'status': 'Error',
            'message': f'Error: {str(e)[:100]}'
        }