diff --git a/backend/app/services/llamaparse_service.py b/backend/app/services/llamaparse_service.py index 9828d47..465b6fd 100644 --- a/backend/app/services/llamaparse_service.py +++ b/backend/app/services/llamaparse_service.py @@ -6,14 +6,17 @@ logger = logging.getLogger(__name__) class LlamaParseService: - """Service for parsing documents using LlamaParse.""" + """Service for parsing documents using LlamaParse with the llama-cloud SDK.""" def __init__(self, api_key: str): self.api_key = api_key async def parse_document(self, file_data: bytes, filename: str) -> str: """ - Parse a document and return its content as markdown. + Parse a document using LlamaParse Agentic Plus tier and return markdown. + + Uses the llama-cloud SDK (v1.0+) with API v2 for maximum accuracy + on complex layouts, tables, and visual structure. Args: file_data: Raw bytes of the document @@ -22,33 +25,45 @@ class LlamaParseService: Returns: Parsed markdown text """ - from llama_cloud_services import LlamaParse + from llama_cloud import AsyncLlamaCloud - logger.info(f"[LLAMAPARSE] Starting parse for '{filename}' ({len(file_data)} bytes)") + logger.info(f"[LLAMAPARSE] Starting agentic_plus parse for '{filename}' ({len(file_data)} bytes)") - parser = LlamaParse( - api_key=self.api_key, - num_workers=1, - verbose=True, - language="en", - ) + client = AsyncLlamaCloud(api_key=self.api_key) - # Write bytes to a temp file since LlamaParse needs a file path + # Write bytes to a temp file for upload suffix = Path(filename).suffix or ".pdf" with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: tmp.write(file_data) tmp_path = tmp.name try: - documents = await parser.aload_data(tmp_path) - if not documents: - logger.warning(f"[LLAMAPARSE] No documents returned for '{filename}'") - return "" + # Upload the file + logger.info(f"[LLAMAPARSE] Uploading '{filename}' to LlamaCloud...") + file_obj = await client.files.create( + file=tmp_path, + purpose="parse", + ) + logger.info(f"[LLAMAPARSE] File uploaded, id: {file_obj.id}") - # Combine all document pages into a single markdown string - combined = "\n\n".join(doc.text for doc in documents if doc.text) - logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(documents)} pages") - return combined + # Parse with agentic_plus tier for maximum accuracy + logger.info(f"[LLAMAPARSE] Parsing with agentic_plus tier...") + result = await client.parsing.parse( + file_id=file_obj.id, + tier="agentic_plus", + version="latest", + expand=["markdown"], + ) + + # Extract markdown from all pages + if result.markdown and result.markdown.pages: + pages = [page.markdown for page in result.markdown.pages if page.markdown] + combined = "\n\n".join(pages) + logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(pages)} pages") + return combined + + logger.warning(f"[LLAMAPARSE] No markdown content returned for '{filename}'") + return "" finally: # Clean up temp file diff --git a/backend/requirements.txt b/backend/requirements.txt index 92bc384..aca3e3f 100755 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -12,4 +12,4 @@ sqlalchemy[asyncio]>=2.0.0 asyncpg>=0.29.0 alembic>=1.13.0 PyMuPDF>=1.23.0 -llama-cloud-services>=0.6.0 +llama-cloud>=1.0