- Add LLAMA_CLOUD_BASE_URL config option so the LlamaCloud regional endpoint can be set without code changes (fixes 401/region errors on production); pass it through to AsyncLlamaCloud client init - Document LLAMA_CLOUD_BASE_URL in .env.deploy.example with EU endpoint - Copy BAR-ModComms-logo-v5.png to frontend/public - Sidebar: update logo reference v4 → v5 - PDF header: update logo v4 → v5, wrap in black (#000) band for legibility, remove duplicate "Oliver" wordmark Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
102 lines
3.9 KiB
Python
102 lines
3.9 KiB
Python
import logging
|
|
import tempfile
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ParseResult:
    """Outcome of parsing one document with LlamaParse.

    Attributes:
        markdown: Markdown text of the document; an empty string when no
            page yielded any content.
        total_pages: Number of pages reported in the parse result
            (0 when the result carried no markdown at all).
        failed_pages: One dict per page that produced no markdown, with
            the keys ``"page"`` and ``"error"``.
    """

    markdown: str
    total_pages: int = 0
    # default_factory gives every instance its own list instead of a shared one
    failed_pages: list[dict] = field(default_factory=list)
|
|
|
|
|
|
class LlamaParseService:
    """Service for parsing documents using LlamaParse with the llama-cloud SDK.

    Attributes:
        api_key: API key passed to the ``AsyncLlamaCloud`` client.
        base_url: Optional endpoint override. When empty, the client is
            constructed without a ``base_url`` argument.
    """

    def __init__(self, api_key: str, base_url: str = ""):
        self.api_key = api_key
        self.base_url = base_url

    @staticmethod
    def _page_failure(page, index: int) -> dict:
        """Build the failure record for a page that produced no markdown.

        Args:
            page: Page object taken from the parse result.
            index: Zero-based position of the page in the result list.

        Returns:
            ``{"page": <number>, "error": <message>}``. A missing or ``None``
            ``page_number`` falls back to ``index + 1``; a missing, ``None``
            or empty ``error`` falls back to ``"Unknown error"``.
        """
        page_num = getattr(page, "page_number", None)
        if page_num is None:
            page_num = index + 1
        # `or` (not a getattr default) so an attribute that exists but is
        # None/empty still yields the placeholder text.
        error_msg = getattr(page, "error", None) or "Unknown error"
        return {"page": page_num, "error": error_msg}

    async def parse_document(self, file_data: bytes, filename: str) -> "ParseResult":
        """
        Parse a document using LlamaParse Agentic Plus tier and return markdown.

        Uses the llama-cloud SDK (v1.0+) with API v2 for maximum accuracy
        on complex layouts, tables, and visual structure.

        The bytes are written to a temporary file, uploaded, parsed, and the
        temporary file is removed again whether or not any step fails.

        Args:
            file_data: Raw bytes of the document
            filename: Original filename (its suffix is reused for the
                temporary file; ".pdf" is used when there is none)

        Returns:
            ParseResult with markdown text and any failed page info
        """
        # Imported lazily so the module can be imported without the SDK.
        from llama_cloud import AsyncLlamaCloud

        logger.info(f"[LLAMAPARSE] Starting agentic_plus parse for '{filename}' ({len(file_data)} bytes)")

        client = AsyncLlamaCloud(
            api_key=self.api_key,
            **({"base_url": self.base_url} if self.base_url else {})
        )

        # Create the temp file first, then do everything else (including the
        # write) inside the try so a failed write cannot leak the file.
        suffix = Path(filename).suffix or ".pdf"
        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        tmp_path = tmp.name

        try:
            # Closing the handle before upload lets the SDK reopen the path.
            with tmp:
                tmp.write(file_data)

            # Upload the file
            logger.info(f"[LLAMAPARSE] Uploading '{filename}' to LlamaCloud...")
            file_obj = await client.files.create(
                file=tmp_path,
                purpose="parse",
            )
            logger.info(f"[LLAMAPARSE] File uploaded, id: {file_obj.id}")

            # Parse with agentic_plus tier for maximum accuracy
            logger.info("[LLAMAPARSE] Parsing with agentic_plus tier...")
            result = await client.parsing.parse(
                file_id=file_obj.id,
                tier="agentic_plus",
                version="latest",
                expand=["markdown"],
            )

            if not (result.markdown and result.markdown.pages):
                logger.warning(f"[LLAMAPARSE] No markdown content returned for '{filename}'")
                return ParseResult(markdown="", total_pages=0, failed_pages=[])

            # Split pages into those with markdown and those without
            total_pages = len(result.markdown.pages)
            pages = []
            failed_pages = []
            for i, page in enumerate(result.markdown.pages):
                page_markdown = getattr(page, "markdown", None)
                if page_markdown:
                    pages.append(page_markdown)
                    continue
                failure = self._page_failure(page, i)
                logger.error(
                    f"[LLAMAPARSE] Page {failure['page']} failed for '{filename}': "
                    f"type={type(page).__name__}, error={failure['error']}"
                )
                failed_pages.append(failure)

            if not pages:
                logger.warning(f"[LLAMAPARSE] All {total_pages} pages failed for '{filename}'")
                return ParseResult(markdown="", total_pages=total_pages, failed_pages=failed_pages)

            combined = "\n\n".join(pages)
            logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(pages)}/{total_pages} pages")
            return ParseResult(markdown=combined, total_pages=total_pages, failed_pages=failed_pages)

        finally:
            # Best-effort cleanup: never mask the real outcome, but leave a
            # trace when the temp file could not be removed.
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except OSError as exc:
                logger.warning(f"[LLAMAPARSE] Could not remove temp file '{tmp_path}': {exc}")
|