Switch LlamaParse to llama-cloud SDK with agentic_plus tier
Replace deprecated llama-cloud-services package with llama-cloud>=1.0 (API v2). Use AsyncLlamaCloud client with tier="agentic_plus" for maximum parsing accuracy on complex layouts, tables, and visual structure.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
9e2473c3e9
commit
2c6f3d0686
2 changed files with 35 additions and 20 deletions
|
|
@@ -6,14 +6,17 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class LlamaParseService:
|
||||
"""Service for parsing documents using LlamaParse."""
|
||||
"""Service for parsing documents using LlamaParse with the llama-cloud SDK."""
|
||||
|
||||
def __init__(self, api_key: str):
|
||||
self.api_key = api_key
|
||||
|
||||
async def parse_document(self, file_data: bytes, filename: str) -> str:
|
||||
"""
|
||||
Parse a document and return its content as markdown.
|
||||
Parse a document using LlamaParse Agentic Plus tier and return markdown.
|
||||
|
||||
Uses the llama-cloud SDK (v1.0+) with API v2 for maximum accuracy
|
||||
on complex layouts, tables, and visual structure.
|
||||
|
||||
Args:
|
||||
file_data: Raw bytes of the document
|
||||
|
|
@@ -22,33 +25,45 @@ class LlamaParseService:
|
|||
Returns:
|
||||
Parsed markdown text
|
||||
"""
|
||||
from llama_cloud_services import LlamaParse
|
||||
from llama_cloud import AsyncLlamaCloud
|
||||
|
||||
logger.info(f"[LLAMAPARSE] Starting parse for '{filename}' ({len(file_data)} bytes)")
|
||||
logger.info(f"[LLAMAPARSE] Starting agentic_plus parse for '{filename}' ({len(file_data)} bytes)")
|
||||
|
||||
parser = LlamaParse(
|
||||
api_key=self.api_key,
|
||||
num_workers=1,
|
||||
verbose=True,
|
||||
language="en",
|
||||
)
|
||||
client = AsyncLlamaCloud(api_key=self.api_key)
|
||||
|
||||
# Write bytes to a temp file since LlamaParse needs a file path
|
||||
# Write bytes to a temp file for upload
|
||||
suffix = Path(filename).suffix or ".pdf"
|
||||
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
||||
tmp.write(file_data)
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
documents = await parser.aload_data(tmp_path)
|
||||
if not documents:
|
||||
logger.warning(f"[LLAMAPARSE] No documents returned for '{filename}'")
|
||||
return ""
|
||||
# Upload the file
|
||||
logger.info(f"[LLAMAPARSE] Uploading '{filename}' to LlamaCloud...")
|
||||
file_obj = await client.files.create(
|
||||
file=tmp_path,
|
||||
purpose="parse",
|
||||
)
|
||||
logger.info(f"[LLAMAPARSE] File uploaded, id: {file_obj.id}")
|
||||
|
||||
# Combine all document pages into a single markdown string
|
||||
combined = "\n\n".join(doc.text for doc in documents if doc.text)
|
||||
logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(documents)} pages")
|
||||
return combined
|
||||
# Parse with agentic_plus tier for maximum accuracy
|
||||
logger.info(f"[LLAMAPARSE] Parsing with agentic_plus tier...")
|
||||
result = await client.parsing.parse(
|
||||
file_id=file_obj.id,
|
||||
tier="agentic_plus",
|
||||
version="latest",
|
||||
expand=["markdown"],
|
||||
)
|
||||
|
||||
# Extract markdown from all pages
|
||||
if result.markdown and result.markdown.pages:
|
||||
pages = [page.markdown for page in result.markdown.pages if page.markdown]
|
||||
combined = "\n\n".join(pages)
|
||||
logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(pages)} pages")
|
||||
return combined
|
||||
|
||||
logger.warning(f"[LLAMAPARSE] No markdown content returned for '{filename}'")
|
||||
return ""
|
||||
|
||||
finally:
|
||||
# Clean up temp file
|
||||
|
|
|
|||
|
|
@@ -12,4 +12,4 @@ sqlalchemy[asyncio]>=2.0.0
|
|||
asyncpg>=0.29.0
|
||||
alembic>=1.13.0
|
||||
PyMuPDF>=1.23.0
|
||||
llama-cloud-services>=0.6.0
|
||||
llama-cloud>=1.0
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue