modcomms/backend/app/services/llamaparse_service.py
Vadym Samoilenko 538a32505e Fix LlamaParse 401 + update logo to v5
- Add LLAMA_CLOUD_BASE_URL config option so the LlamaCloud regional
  endpoint can be set without code changes (fixes 401/region errors
  on production); pass it through to AsyncLlamaCloud client init
- Document LLAMA_CLOUD_BASE_URL in .env.deploy.example with EU endpoint
- Copy BAR-ModComms-logo-v5.png to frontend/public
- Sidebar: update logo reference v4 → v5
- PDF header: update logo v4 → v5, wrap in black (#000) band for
  legibility, remove duplicate "Oliver" wordmark

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-16 12:22:31 +00:00

102 lines
3.9 KiB
Python

import logging
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass
class ParseResult:
    """Outcome of parsing one document with LlamaParse.

    Attributes:
        markdown: Markdown text produced for the document.
        total_pages: Number of pages reported for the document; 0 by default.
        failed_pages: One dict per page that could not be parsed; defaults
            to a new empty list for each instance.
    """

    markdown: str
    total_pages: int = field(default=0)
    # default_factory gives every instance its own list instead of a shared one.
    failed_pages: list[dict] = field(default_factory=list)
class LlamaParseService:
    """Parse documents to markdown with LlamaParse via the llama-cloud SDK.

    Attributes:
        api_key: API key handed to the ``AsyncLlamaCloud`` client.
        base_url: Optional API base URL. When empty, no ``base_url`` is
            passed to the client, so the SDK's own default applies.
    """

    def __init__(self, api_key: str, base_url: str = ""):
        """Store the settings used to build a client for each parse.

        Args:
            api_key: LlamaCloud API key.
            base_url: Optional base URL override; an empty string means
                "do not pass one to the SDK".
        """
        self.api_key = api_key
        self.base_url = base_url

    @staticmethod
    def _failure_record(page, position: int) -> dict:
        """Build the ``failed_pages`` entry for a page that yielded no markdown.

        Args:
            page: Page object from the parse result; only its optional
                ``page_number`` and ``error`` attributes are read.
            position: 1-based position of the page in the result, used when
                the page carries no usable ``page_number``.

        Returns:
            ``{"page": <number>, "error": <message>}``. A missing or ``None``
            ``page_number`` falls back to ``position``; a missing or empty
            ``error`` falls back to ``"Unknown error"``.
        """
        page_num = getattr(page, "page_number", None)
        if page_num is None:
            page_num = position
        # `or` (not a getattr default) so an attribute that exists but is
        # None/empty still produces a readable message.
        error_msg = getattr(page, "error", None) or "Unknown error"
        return {"page": page_num, "error": error_msg}

    async def parse_document(self, file_data: bytes, filename: str) -> "ParseResult":
        """Parse a document with the LlamaParse ``agentic_plus`` tier.

        The bytes are written to a temporary file, uploaded through the
        llama-cloud SDK, parsed, and the per-page markdown is joined with
        blank lines. The temporary file is removed before returning, whether
        or not the parse succeeds.

        Args:
            file_data: Raw bytes of the document.
            filename: Original filename; used in log messages and to pick the
                temporary file's suffix (``.pdf`` when it has none).

        Returns:
            ParseResult with the combined markdown, the number of pages in
            the parse result, and one ``{"page", "error"}`` entry per page
            that yielded no markdown. ``markdown`` is empty when no page
            produced content.

        Raises:
            Exceptions raised by the SDK calls or by writing the temporary
            file are not caught here and propagate to the caller.
        """
        # Imported here rather than at module level, so importing this module
        # does not require the SDK to be installed.
        from llama_cloud import AsyncLlamaCloud

        logger.info(f"[LLAMAPARSE] Starting agentic_plus parse for '{filename}' ({len(file_data)} bytes)")

        # Only pass base_url when one is configured.
        client_kwargs = {"api_key": self.api_key}
        if self.base_url:
            client_kwargs["base_url"] = self.base_url
        client = AsyncLlamaCloud(**client_kwargs)

        # The upload call is given a path, so the bytes go through a temp file.
        # delete=False keeps it on disk after closing; the `finally` below
        # removes it. The write happens inside the `try` so a failed write
        # cannot leave the file behind.
        suffix = Path(filename).suffix or ".pdf"
        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        tmp_path = tmp.name
        try:
            with tmp:
                tmp.write(file_data)

            # Upload the file
            logger.info(f"[LLAMAPARSE] Uploading '{filename}' to LlamaCloud...")
            file_obj = await client.files.create(
                file=tmp_path,
                purpose="parse",
            )
            logger.info(f"[LLAMAPARSE] File uploaded, id: {file_obj.id}")

            # Parse with agentic_plus tier for maximum accuracy
            logger.info("[LLAMAPARSE] Parsing with agentic_plus tier...")
            result = await client.parsing.parse(
                file_id=file_obj.id,
                tier="agentic_plus",
                version="latest",
                expand=["markdown"],
            )

            markdown_result = result.markdown
            page_items = markdown_result.pages if markdown_result else None
            if not page_items:
                logger.warning(f"[LLAMAPARSE] No markdown content returned for '{filename}'")
                return ParseResult(markdown="", total_pages=0, failed_pages=[])

            total_pages = len(page_items)
            pages = []
            failed_pages = []
            for position, page in enumerate(page_items, start=1):
                text = getattr(page, "markdown", None)
                if text:
                    pages.append(text)
                    continue
                # NOTE(review): a page whose markdown is an empty string is
                # also counted as failed here (unchanged from before) --
                # confirm whether blank pages should be reported as errors.
                record = self._failure_record(page, position)
                logger.error(
                    f"[LLAMAPARSE] Page {record['page']} failed for '{filename}': "
                    f"type={type(page).__name__}, error={record['error']}"
                )
                failed_pages.append(record)

            if not pages:
                logger.warning(f"[LLAMAPARSE] All {total_pages} pages failed for '{filename}'")
                return ParseResult(markdown="", total_pages=total_pages, failed_pages=failed_pages)

            combined = "\n\n".join(pages)
            logger.info(f"[LLAMAPARSE] Parsed '{filename}' -> {len(combined)} chars from {len(pages)}/{total_pages} pages")
            return ParseResult(markdown=combined, total_pages=total_pages, failed_pages=failed_pages)
        finally:
            # Best-effort cleanup: a failure to delete must not mask the parse
            # outcome, but it is logged instead of being silently dropped.
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except OSError as exc:
                logger.warning(f"[LLAMAPARSE] Could not remove temp file '{tmp_path}': {exc}")