From ffbd7e29248c7153299d3f6d199171aaa52bfc7e Mon Sep 17 00:00:00 2001 From: DJP Date: Thu, 2 Oct 2025 13:32:05 -0400 Subject: [PATCH] Fix: Use test query ONCE for readiness, detects when docs actually searchable --- src/notebookllama/pages/3_Notebook_Chat.py | 6 +-- src/notebookllama/pipeline_manager.py | 62 +++++++++++----------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/src/notebookllama/pages/3_Notebook_Chat.py b/src/notebookllama/pages/3_Notebook_Chat.py index 65f55e4..d780bcc 100644 --- a/src/notebookllama/pages/3_Notebook_Chat.py +++ b/src/notebookllama/pages/3_Notebook_Chat.py @@ -101,7 +101,7 @@ if not documents: # Check if pipeline is actually ready from background_tasks import get_notebook_processing_tasks, TaskStatus from document_manager import get_latest_document_summary -from pipeline_manager import check_pipeline_status_direct +from pipeline_manager import check_pipeline_ready_with_test import asyncio as aio import time @@ -138,8 +138,8 @@ if f'pipeline_confirmed_ready_{notebook.id}' not in st.session_state: st.stop() - with st.spinner("🔍 Checking pipeline status..."): - readiness = asyncio.run(check_pipeline_status_direct(notebook.pipeline_id)) + with st.spinner("🔍 Checking if documents are searchable..."): + readiness = asyncio.run(check_pipeline_ready_with_test(notebook.pipeline_id, notebook.model_type)) # If ready, mark as confirmed and don't test again if readiness.get('ready', False): diff --git a/src/notebookllama/pipeline_manager.py b/src/notebookllama/pipeline_manager.py index ee8488d..c05e2fd 100644 --- a/src/notebookllama/pipeline_manager.py +++ b/src/notebookllama/pipeline_manager.py @@ -85,53 +85,55 @@ async def add_document_to_pipeline(pipeline_id: str, file_path: str) -> bool: return False -async def check_pipeline_status_direct(pipeline_id: str) -> dict: +async def check_pipeline_ready_with_test(pipeline_id: str, model_type: str) -> dict: """ - Check pipeline status directly from LlamaCloud API (no test queries) + Check if pipeline is actually ready by doing ONE simple test query + Only called during readiness check, not during chat Returns: - dict with 'ready' (bool), 'status' (str), 'doc_count' (int), 'indexed_count' (int) + dict with 'ready' (bool), 'status' (str), 'message' (str) """ try: + # First check if pipeline has documents client = AsyncLlamaCloud(token=os.getenv("LLAMACLOUD_API_KEY")) - - # Get pipeline details pipeline = await client.pipelines.get_pipeline(pipeline_id=pipeline_id) - # Count documents in pipeline doc_count = 0 - indexed_count = 0 + if hasattr(pipeline, 'configured_transformation') and hasattr(pipeline.configured_transformation, 'data_sources'): + doc_count = len(pipeline.configured_transformation.data_sources) - if hasattr(pipeline, 'configured_transformation'): - transform = pipeline.configured_transformation - if hasattr(transform, 'data_sources'): - doc_count = len(transform.data_sources) + if doc_count == 0: + return { + 'ready': False, + 'status': 'No documents', + 'message': 'No documents in pipeline yet' + } - # Check each document's status - for source in transform.data_sources: - # If source has been processed, it's indexed - # LlamaCloud doesn't expose explicit "indexed" status, - # so we assume if it's in data_sources, it's processed - indexed_count += 1 + # Documents exist - now test if they're actually searchable + print(f"Testing pipeline {pipeline_id} with simple query...") + test_response = await query_notebook_pipeline(pipeline_id, "Summarize this document briefly", model_type) - # Pipeline is ready if it has documents - is_ready = doc_count > 0 and indexed_count > 0 - - return { - 'ready': is_ready, - 'status': 'Ready' if is_ready else 'Indexing', - 'doc_count': doc_count, - 'indexed_count': indexed_count, - 'message': f'{indexed_count}/{doc_count} documents indexed' if doc_count > 0 else 'No documents in pipeline' - } + # Check if we got a real response + if test_response and "Empty Response" not in test_response and "Error:" not in test_response and len(test_response) > 50: + print(f"✓ Pipeline ready - got real response") + return { + 'ready': True, + 'status': 'Ready', + 'message': f'{doc_count} document(s) indexed and searchable' + } + else: + print(f"⏳ Pipeline not ready yet - got: {test_response[:100] if test_response else 'None'}") + return { + 'ready': False, + 'status': 'Still indexing', + 'message': f'{doc_count} document(s) added but not searchable yet' + } except Exception as e: - print(f"Error checking pipeline status: {e}") + print(f"Error checking readiness: {e}") return { 'ready': False, 'status': 'Error', - 'doc_count': 0, - 'indexed_count': 0, 'message': f'Error: {str(e)[:100]}' }