Switch LlamaParse to llama-cloud SDK with agentic_plus tier

Replace the deprecated llama-cloud-services package with llama-cloud>=1.0 (API v2). Use the AsyncLlamaCloud client with tier="agentic_plus" for maximum parsing accuracy on complex layouts, tables, and visual structure.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-12 15:29:23 -06:00 · 2026-02-12 15:29:23 -06:00 · 2c6f3d0686
commit 2c6f3d0686
parent 9e2473c3e9
2 changed files with 35 additions and 20 deletions
--- a/backend/app/services/llamaparse_service.py
+++ b/backend/app/services/llamaparse_service.py
@@ -6,14 +6,17 @@ logger = logging.getLogger(__name__)


 class LlamaParseService:
-    """Service for parsing documents using LlamaParse."""
+    """Service for parsing documents using LlamaParse with the llama-cloud SDK."""

    def __init__(self, api_key: str):
        self.api_key = api_key

    async def parse_document(self, file_data: bytes, filename: str) -> str:
        """
-        Parse a document and return its content as markdown.
+        Parse a document using LlamaParse Agentic Plus tier and return markdown.
+
+        Uses the llama-cloud SDK (v1.0+) with API v2 for maximum accuracy
+        on complex layouts, tables, and visual structure.

        Args:
            file_data: Raw bytes of the document
@@ -22,33 +25,45 @@ class LlamaParseService:
        Returns:
            Parsed markdown text
        """
-        from llama_cloud_services import LlamaParse
+        from llama_cloud import AsyncLlamaCloud

-        logger.info(f"[LLAMAPARSE] Starting parse for '{filename}' ({len(file_data)} bytes)")
+        logger.info(f"[LLAMAPARSE] Starting agentic_plus parse for '{filename}' ({len(file_data)} bytes)")

-        parser = LlamaParse(
-            api_key=self.api_key,
-            num_workers=1,
-            verbose=True,
-            language="en",
-        )
+        client = AsyncLlamaCloud(api_key=self.api_key)

-        # Write bytes to a temp file since LlamaParse needs a file path
+        # Write bytes to a temp file for upload
        suffix = Path(filename).suffix or ".pdf"
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(file_data)
            tmp_path = tmp.name

        try:
-            documents = await parser.aload_data(tmp_path)
-            if not documents:
-                logger.warning(f"[LLAMAPARSE] No documents returned for '{filename}'")
-                return ""
+            # Upload the file
+            logger.info(f"[LLAMAPARSE] Uploading '{filename}' to LlamaCloud...")
+            file_obj = await client.files.create(
+                file=tmp_path,
+                purpose="parse",
+            )
+            logger.info(f"[LLAMAPARSE] File uploaded, id: {file_obj.id}")

-            # Combine all document pages into a single markdown string
-            combined = "\n\n".join(doc.text for doc in documents if doc.text)
-            logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(documents)} pages")
-            return combined
+            # Parse with agentic_plus tier for maximum accuracy
+            logger.info(f"[LLAMAPARSE] Parsing with agentic_plus tier...")
+            result = await client.parsing.parse(
+                file_id=file_obj.id,
+                tier="agentic_plus",
+                version="latest",
+                expand=["markdown"],
+            )
+
+            # Extract markdown from all pages
+            if result.markdown and result.markdown.pages:
+                pages = [page.markdown for page in result.markdown.pages if page.markdown]
+                combined = "\n\n".join(pages)
+                logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(pages)} pages")
+                return combined
+
+            logger.warning(f"[LLAMAPARSE] No markdown content returned for '{filename}'")
+            return ""

        finally:
            # Clean up temp file
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -12,4 +12,4 @@ sqlalchemy[asyncio]>=2.0.0
 asyncpg>=0.29.0
 alembic>=1.13.0
 PyMuPDF>=1.23.0
-llama-cloud-services>=0.6.0
+llama-cloud>=1.0