Switch LlamaParse to llama-cloud SDK with agentic_plus tier

Replace the deprecated llama-cloud-services package with the llama-cloud SDK (>=1.0, which targets API v2).
Use the AsyncLlamaCloud client with tier="agentic_plus" for maximum parsing accuracy
on complex layouts, tables, and visual structure.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
michael 2026-02-12 15:29:23 -06:00
parent 9e2473c3e9
commit 2c6f3d0686
2 changed files with 35 additions and 20 deletions

View file

@ -6,14 +6,17 @@ logger = logging.getLogger(__name__)
class LlamaParseService:
"""Service for parsing documents using LlamaParse."""
"""Service for parsing documents using LlamaParse with the llama-cloud SDK."""
def __init__(self, api_key: str):
# Keep the API key on the instance; parse_document passes it to the parsing client it constructs.
self.api_key = api_key
async def parse_document(self, file_data: bytes, filename: str) -> str:
"""
Parse a document and return its content as markdown.
Parse a document using LlamaParse Agentic Plus tier and return markdown.
Uses the llama-cloud SDK (v1.0+) with API v2 for maximum accuracy
on complex layouts, tables, and visual structure.
Args:
file_data: Raw bytes of the document
@ -22,33 +25,45 @@ class LlamaParseService:
Returns:
Parsed markdown text
"""
from llama_cloud_services import LlamaParse
from llama_cloud import AsyncLlamaCloud
logger.info(f"[LLAMAPARSE] Starting parse for '{filename}' ({len(file_data)} bytes)")
logger.info(f"[LLAMAPARSE] Starting agentic_plus parse for '{filename}' ({len(file_data)} bytes)")
parser = LlamaParse(
api_key=self.api_key,
num_workers=1,
verbose=True,
language="en",
)
client = AsyncLlamaCloud(api_key=self.api_key)
# Write bytes to a temp file since LlamaParse needs a file path
# Write bytes to a temp file for upload
suffix = Path(filename).suffix or ".pdf"
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
tmp.write(file_data)
tmp_path = tmp.name
try:
documents = await parser.aload_data(tmp_path)
if not documents:
logger.warning(f"[LLAMAPARSE] No documents returned for '{filename}'")
return ""
# Upload the file
logger.info(f"[LLAMAPARSE] Uploading '{filename}' to LlamaCloud...")
file_obj = await client.files.create(
file=tmp_path,
purpose="parse",
)
logger.info(f"[LLAMAPARSE] File uploaded, id: {file_obj.id}")
# Combine all document pages into a single markdown string
combined = "\n\n".join(doc.text for doc in documents if doc.text)
logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(documents)} pages")
return combined
# Parse with agentic_plus tier for maximum accuracy
logger.info(f"[LLAMAPARSE] Parsing with agentic_plus tier...")
result = await client.parsing.parse(
file_id=file_obj.id,
tier="agentic_plus",
version="latest",
expand=["markdown"],
)
# Extract markdown from all pages
if result.markdown and result.markdown.pages:
pages = [page.markdown for page in result.markdown.pages if page.markdown]
combined = "\n\n".join(pages)
logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(pages)} pages")
return combined
logger.warning(f"[LLAMAPARSE] No markdown content returned for '{filename}'")
return ""
finally:
# Clean up temp file

View file

@ -12,4 +12,4 @@ sqlalchemy[asyncio]>=2.0.0
asyncpg>=0.29.0
alembic>=1.13.0
PyMuPDF>=1.23.0
llama-cloud-services>=0.6.0
llama-cloud>=1.0