fix: add local document extraction fallback when LLAMACLOUD_API_KEY is absent

When LLAMACLOUD_API_KEY is empty the LlamaParse client constructed a Bearer
token with an empty secret, causing Python's HTTP stack to raise
"Illegal header value b'Bearer '" and fail every upload job.

Changes:
- _extract_document_content_local(): new method using PyMuPDF (PDF),
  python-pptx (PPTX), python-docx (DOCX), openpyxl (XLSX) — all already
  in requirements.txt
- _extract_document_content(): skip LlamaParser entirely if key is not set;
  on LlamaParser exception, fall back to local extraction instead of raising

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-03-23 15:08:23 +00:00
parent f85d6a6b51
commit fc430cc10a

View file

@ -540,54 +540,93 @@ class DocumentAnalyzer:
logging.error(f"Error encoding file for OpenAI: {e}")
return None
def _extract_document_content_local(self, filepath: str) -> str:
    """Local fallback extraction using PyMuPDF / python-pptx / python-docx / openpyxl.

    The parser is chosen from the file extension (case-insensitive):

    * ``.pdf``           -> text of every non-empty page, each prefixed with
      ``--- Page N ---``.
    * ``.pptx`` / ``.ppt`` -> text of every shape exposing a non-empty ``text``
      attribute, grouped per slide under ``--- Slide N ---``.
    * ``.docx`` / ``.doc`` -> non-empty paragraphs, followed by table rows
      rendered as ``cell | cell | ...``.
    * ``.xlsx`` / ``.xls`` -> non-empty cells of every row, grouped per sheet
      under ``--- Sheet: NAME ---``.

    Args:
        filepath: Path of the document to read.

    Returns:
        The extracted text, or a short "No ... found" placeholder string when
        the document yields no text at all.

    Raises:
        Exception: If the extension is not one of the supported types. Errors
            raised by the underlying parser libraries propagate unchanged.
    """
    ext = os.path.splitext(filepath)[1].lower()
    logging.info(f"Local extraction for {os.path.basename(filepath)} (ext={ext})")

    # NOTE(review): the legacy binary extensions (.ppt / .doc / .xls) are routed
    # to the same parsers as their OOXML counterparts; confirm those libraries
    # can actually read them, otherwise these files fail with a library error.

    if ext == '.pdf':
        doc = fitz.open(filepath)
        try:
            pages = []
            for i, page in enumerate(doc, 1):
                text = page.get_text("text")
                if text.strip():
                    pages.append(f"--- Page {i} ---\n{text}")
        finally:
            # Release the file handle even if a page fails to parse.
            doc.close()
        return "\n\n".join(pages) or "No text content found in PDF."

    elif ext in ('.pptx', '.ppt'):
        prs = pptx.Presentation(filepath)
        slides = []
        for i, slide in enumerate(prs.slides, 1):
            # Not every shape carries text (pictures, connectors, ...), hence hasattr.
            texts = [
                shape.text.strip()
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text.strip()
            ]
            if texts:
                slides.append(f"--- Slide {i} ---\n" + "\n".join(texts))
        return "\n\n".join(slides) or "No text content found in presentation."

    elif ext in ('.docx', '.doc'):
        document = docx.Document(filepath)
        paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
        # Table rows are appended after all body paragraphs, one line per row.
        for table in document.tables:
            for row in table.rows:
                row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
                if row_text:
                    paragraphs.append(row_text)
        return "\n".join(paragraphs) or "No text content found in document."

    elif ext in ('.xlsx', '.xls'):
        wb = load_workbook(filepath, read_only=True, data_only=True)
        try:
            sheets_text = []
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                rows = []
                for row in ws.iter_rows(values_only=True):
                    cells = [str(c) for c in row if c is not None and str(c).strip()]
                    if cells:
                        rows.append(" | ".join(cells))
                if rows:
                    sheets_text.append(f"--- Sheet: {sheet_name} ---\n" + "\n".join(rows))
        finally:
            # Close the workbook even if reading a sheet raises.
            wb.close()
        return "\n\n".join(sheets_text) or "No content found in spreadsheet."

    else:
        raise Exception(f"Unsupported file type for local extraction: {ext}")
async def _extract_document_content(self, filepath: str) -> str:
    """Extract content from document — LlamaParser if key is configured, else local fallback.

    When ``config.LLAMACLOUD_API_KEY`` is empty the cloud parser is skipped
    entirely and the document is read with ``_extract_document_content_local``.
    Otherwise the file is sent to LlamaParse; if anything in that path raises
    (including a failed import of ``llama_cloud_services``), the error is
    logged and local extraction is used instead of propagating the failure.

    Args:
        filepath: Path of the document to extract.

    Returns:
        The combined markdown returned by LlamaParse (pages joined with blank
        lines), or the text produced by the local fallback.

    Raises:
        Exception: Only what ``_extract_document_content_local`` raises, e.g.
            for an unsupported file extension.
    """
    if not config.LLAMACLOUD_API_KEY:
        logging.warning("LLAMACLOUD_API_KEY not set — using local document extraction")
        return self._extract_document_content_local(filepath)

    try:
        # Imported lazily so a missing package is handled by the fallback below.
        from llama_cloud_services import LlamaParse

        logging.info(f"Using LlamaParser to extract content from: {os.path.basename(filepath)}")
        parser = LlamaParse(
            # API key for LlamaParser
            api_key=config.LLAMACLOUD_API_KEY,
            # The parsing mode - use agent-based parsing for better accuracy
            parse_mode="parse_page_with_agent",
            # The model to use - GPT-5 for best results
            model="openai-gpt-5",
            # Whether to use high resolution OCR (slower but more accurate)
            high_res_ocr=True,
            # Adaptive long table detection and output adaptation
            adaptive_long_table=True,
            # Whether to try to extract outlined tables
            outlined_table_extraction=True,
            # Whether to output tables as HTML in the markdown output
            output_tables_as_HTML=True,
            # The page separator
            page_separator="\n\n---\n\n",
        )
        # Use the official async method
        result = await parser.aparse(filepath)
        # Get the markdown documents with page separation
        markdown_documents = result.get_markdown_documents(split_by_page=True)
        # Combine all markdown documents into a single string
        combined_content = "\n\n".join([doc.text for doc in markdown_documents])
        logging.info(f"LlamaParser extraction completed. Content length: {len(combined_content)} characters")
        return combined_content
    except Exception as e:
        # Do not re-raise: a cloud failure must not fail the whole upload job.
        logging.error(f"LlamaParser failed: {e} — falling back to local extraction")
        return self._extract_document_content_local(filepath)