- LlamaParse service now returns a ParseResult dataclass with markdown, total page count, and a list of failed pages (page number + error)
- Knowledge base service sets status to "partial" (instead of "parsed") when some pages failed, with a descriptive error listing which pages failed and why
- Frontend StatusBadge shows "partial parse" in orange for partial status
- Error details are shown inline below the document row for both partial and error statuses

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
98 lines
3.8 KiB
Python
98 lines
3.8 KiB
Python
import logging
|
|
import tempfile
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
# Module-level logger named after this module; messages in this file are prefixed "[LLAMAPARSE]".
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ParseResult:
    """Result of a LlamaParse document parse.

    Attributes:
        markdown: Combined markdown of every page that parsed successfully
            (empty string when nothing could be extracted).
        total_pages: Number of pages reported by the parser, including pages
            that failed.
        failed_pages: One dict per failed page, with a ``"page"`` key (page
            number) and an ``"error"`` key (failure description). Empty when
            every page parsed.
    """

    markdown: str
    total_pages: int = 0
    # default_factory gives each instance its own list instead of a shared one.
    failed_pages: list[dict] = field(default_factory=list)
|
|
|
|
|
|
class LlamaParseService:
    """Service for parsing documents using LlamaParse with the llama-cloud SDK."""

    def __init__(self, api_key: str):
        """Store the LlamaCloud API key used for every parse request."""
        self.api_key = api_key

    @staticmethod
    def _failure_record(page, index: int) -> dict:
        """Build the ``{"page", "error"}`` record for a page that yielded no markdown.

        Args:
            page: Page object returned by the parser.
            index: Zero-based position of the page in the result list; used to
                derive a 1-based page number when the page does not carry one.

        Returns:
            Dict with ``"page"`` (page number) and ``"error"`` (string message).
        """
        page_num = getattr(page, "page_number", None)
        if page_num is None:
            page_num = index + 1
        # The attribute may exist but be None/empty; fall back in that case too,
        # and stringify so the record always carries a plain message.
        raw_error = getattr(page, "error", None)
        error_msg = str(raw_error) if raw_error else "Unknown error"
        return {"page": page_num, "error": error_msg}

    async def parse_document(self, file_data: bytes, filename: str) -> ParseResult:
        """
        Parse a document using LlamaParse Agentic Plus tier and return markdown.

        Uses the llama-cloud SDK (v1.0+) with API v2 for maximum accuracy
        on complex layouts, tables, and visual structure.

        The bytes are written to a temporary file, uploaded, parsed, and the
        temporary file is removed afterwards regardless of the outcome.

        Args:
            file_data: Raw bytes of the document
            filename: Original filename (used for format detection)

        Returns:
            ParseResult with markdown text and any failed page info. When no
            page yields markdown the result has an empty ``markdown`` string.

        Raises:
            Any exception raised by the SDK upload/parse calls or by writing
            the temporary file is not caught here and propagates to the caller.
        """
        from llama_cloud import AsyncLlamaCloud

        logger.info(f"[LLAMAPARSE] Starting agentic_plus parse for '{filename}' ({len(file_data)} bytes)")

        # NOTE(review): the client is never explicitly closed here; confirm
        # whether AsyncLlamaCloud needs an explicit close / async context manager.
        client = AsyncLlamaCloud(api_key=self.api_key)

        # Keep the original extension so the service can detect the format.
        suffix = Path(filename).suffix or ".pdf"
        tmp_path = None

        try:
            # Write bytes to a temp file for upload. Created inside the try so
            # the file is still removed if the write itself fails.
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                tmp_path = tmp.name
                tmp.write(file_data)

            # Upload the file
            logger.info(f"[LLAMAPARSE] Uploading '{filename}' to LlamaCloud...")
            file_obj = await client.files.create(
                file=tmp_path,
                purpose="parse",
            )
            logger.info(f"[LLAMAPARSE] File uploaded, id: {file_obj.id}")

            # Parse with agentic_plus tier for maximum accuracy
            logger.info("[LLAMAPARSE] Parsing with agentic_plus tier...")
            result = await client.parsing.parse(
                file_id=file_obj.id,
                tier="agentic_plus",
                version="latest",
                expand=["markdown"],
            )

            page_results = result.markdown.pages if result.markdown else None
            if not page_results:
                logger.warning(f"[LLAMAPARSE] No markdown content returned for '{filename}'")
                return ParseResult(markdown="", total_pages=0, failed_pages=[])

            # Extract markdown from all pages, recording the ones without any.
            total_pages = len(page_results)
            pages = []
            failed_pages = []
            for i, page in enumerate(page_results):
                text = getattr(page, "markdown", None)
                if text:
                    pages.append(text)
                    continue
                record = self._failure_record(page, i)
                logger.error(
                    f"[LLAMAPARSE] Page {record['page']} failed for '{filename}': "
                    f"type={type(page).__name__}, error={record['error']}"
                )
                failed_pages.append(record)

            if not pages:
                logger.warning(f"[LLAMAPARSE] All {total_pages} pages failed for '{filename}'")
                return ParseResult(markdown="", total_pages=total_pages, failed_pages=failed_pages)

            combined = "\n\n".join(pages)
            logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(pages)}/{total_pages} pages")
            return ParseResult(markdown=combined, total_pages=total_pages, failed_pages=failed_pages)

        finally:
            # Clean up temp file (best effort: a failed delete must not mask
            # the parse result or the original exception).
            if tmp_path is not None:
                try:
                    Path(tmp_path).unlink(missing_ok=True)
                except OSError as exc:
                    logger.warning(f"[LLAMAPARSE] Could not remove temp file '{tmp_path}': {exc}")
|