From de62fa1f874f99346824b5d9addc544d18849344 Mon Sep 17 00:00:00 2001 From: michael Date: Thu, 12 Feb 2026 17:51:52 -0600 Subject: [PATCH] Show partial parse status in UI when some pages fail - LlamaParse service now returns a ParseResult dataclass with markdown, total page count, and a list of failed pages (page number + error) - Knowledge base service sets status to "partial" (instead of "parsed") when some pages failed, with a descriptive error listing which pages failed and why - Frontend StatusBadge shows "partial parse" in orange for partial status - Error details are shown inline below the document row for both partial and error statuses Co-Authored-By: Claude Opus 4.6 --- .../app/services/knowledge_base_service.py | 32 ++++++++++-- backend/app/services/llamaparse_service.py | 33 +++++++++---- frontend/components/KnowledgeBase.tsx | 49 ++++++++++++------- 3 files changed, 83 insertions(+), 31 deletions(-) diff --git a/backend/app/services/knowledge_base_service.py b/backend/app/services/knowledge_base_service.py index cf4a859..a782895 100644 --- a/backend/app/services/knowledge_base_service.py +++ b/backend/app/services/knowledge_base_service.py @@ -145,12 +145,36 @@ class KnowledgeBaseService: await doc_repo.update_source_document_parse_status(doc.id, "parsing") await doc_session.commit() - markdown = await self.llamaparse.parse_document(file_data, doc.filename) + result = await self.llamaparse.parse_document(file_data, doc.filename) + + # Determine status based on failed pages + if result.failed_pages: + if not result.markdown.strip(): + # All pages failed + failed_desc = "; ".join( + f"Page {fp['page']}: {fp['error']}" for fp in result.failed_pages + ) + status = "error" + error_msg = f"All {result.total_pages} pages failed to parse. {failed_desc}" + else: + failed_desc = "; ".join( + f"Page {fp['page']}: {fp['error']}" for fp in result.failed_pages + ) + status = "partial" + error_msg = ( + f"{len(result.failed_pages)} of {result.total_pages} pages " + f"failed to parse. {failed_desc}" + ) + else: + status = "parsed" + error_msg = None async with async_session_factory() as doc_session: doc_repo = KnowledgeBaseRepository(doc_session) await doc_repo.update_source_document_parse_status( - doc.id, "parsed", parsed_markdown=markdown + doc.id, status, + parsed_markdown=result.markdown if result.markdown.strip() else None, + parse_error=error_msg, ) await doc_session.commit() @@ -159,8 +183,8 @@ class KnowledgeBaseService: await repo.update_processing_job(job_id, parsed_documents=parsed_count) await session.commit() - if markdown.strip(): - return (doc.filename, str(doc.id), markdown) + if result.markdown.strip(): + return (doc.filename, str(doc.id), result.markdown) return None except Exception as e: diff --git a/backend/app/services/llamaparse_service.py b/backend/app/services/llamaparse_service.py index e5119da..a378667 100644 --- a/backend/app/services/llamaparse_service.py +++ b/backend/app/services/llamaparse_service.py @@ -1,17 +1,26 @@ import logging import tempfile +from dataclasses import dataclass, field from pathlib import Path logger = logging.getLogger(__name__) +@dataclass +class ParseResult: + """Result of a LlamaParse document parse.""" + markdown: str + total_pages: int = 0 + failed_pages: list[dict] = field(default_factory=list) + + class LlamaParseService: """Service for parsing documents using LlamaParse with the llama-cloud SDK.""" def __init__(self, api_key: str): self.api_key = api_key - async def parse_document(self, file_data: bytes, filename: str) -> str: + async def parse_document(self, file_data: bytes, filename: str) -> ParseResult: """ Parse a document using LlamaParse Agentic Plus tier and return markdown. @@ -23,7 +32,7 @@ class LlamaParseService: filename: Original filename (used for format detection) Returns: - Parsed markdown text + ParseResult with markdown text and any failed page info """ from llama_cloud import AsyncLlamaCloud @@ -57,25 +66,29 @@ class LlamaParseService: # Extract markdown from all pages if result.markdown and result.markdown.pages: + total_pages = len(result.markdown.pages) pages = [] + failed_pages = [] for i, page in enumerate(result.markdown.pages): if hasattr(page, "markdown") and page.markdown: pages.append(page.markdown) else: - # Log full details of failed pages for troubleshooting + page_num = getattr(page, "page_number", i + 1) + error_msg = getattr(page, "error", "Unknown error") logger.error( - f"[LLAMAPARSE] Page {i} failed for '{filename}': " - f"type={type(page).__name__}, attrs={vars(page) if hasattr(page, '__dict__') else repr(page)}" + f"[LLAMAPARSE] Page {page_num} failed for '{filename}': " + f"type={type(page).__name__}, error={error_msg}" ) + failed_pages.append({"page": page_num, "error": error_msg}) if not pages: - logger.warning(f"[LLAMAPARSE] All {len(result.markdown.pages)} pages failed for '{filename}'") - return "" + logger.warning(f"[LLAMAPARSE] All {total_pages} pages failed for '{filename}'") + return ParseResult(markdown="", total_pages=total_pages, failed_pages=failed_pages) combined = "\n\n".join(pages) - logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(pages)}/{len(result.markdown.pages)} pages") - return combined + logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(pages)}/{total_pages} pages") + return ParseResult(markdown=combined, total_pages=total_pages, failed_pages=failed_pages) logger.warning(f"[LLAMAPARSE] No markdown content returned for '{filename}'") - return "" + return ParseResult(markdown="", total_pages=0, failed_pages=[]) finally: # Clean up temp file diff --git a/frontend/components/KnowledgeBase.tsx b/frontend/components/KnowledgeBase.tsx index b933388..347453a 100644 --- a/frontend/components/KnowledgeBase.tsx +++ b/frontend/components/KnowledgeBase.tsx @@ -16,15 +16,19 @@ const StatusBadge: React.FC<{ status: string }> = ({ status }) => { pending: 'bg-yellow-100 text-yellow-800', parsing: 'bg-blue-100 text-blue-800', parsed: 'bg-green-100 text-green-800', + partial: 'bg-orange-100 text-orange-800', error: 'bg-red-100 text-red-800', parsing_documents: 'bg-blue-100 text-blue-800', distilling: 'bg-purple-100 text-purple-800', completed: 'bg-green-100 text-green-800', failed: 'bg-red-100 text-red-800', }; + const labels: Record = { + partial: 'partial parse', + }; return ( - {status} + {labels[status] || status} ); }; @@ -475,22 +479,33 @@ export const KnowledgeBase: React.FC = () => { {selectedKb.source_documents.map(doc => ( - - {doc.filename} - {formatBytes(doc.file_size_bytes)} - {formatDate(doc.created_at)} - {doc.uploaded_by_name || '-'} - - - - - + + + {doc.filename} + {formatBytes(doc.file_size_bytes)} + {formatDate(doc.created_at)} + {doc.uploaded_by_name || '-'} + + + + + + {doc.parse_error && (doc.parse_status === 'partial' || doc.parse_status === 'error') && ( + + +
+ {doc.parse_error} +
+ + + )} +
))}