fix: add local document extraction fallback when LLAMACLOUD_API_KEY is absent
When LLAMACLOUD_API_KEY is empty, the LlamaParse client constructed a Bearer token with an empty secret, causing Python's HTTP stack to raise "Illegal header value b'Bearer '" and fail every upload job.

Changes:
- _extract_document_content_local(): new method using PyMuPDF (PDF), python-pptx (PPTX), python-docx (DOCX), and openpyxl (XLSX) — all already in requirements.txt
- _extract_document_content(): skip LlamaParser entirely if the key is not set; on a LlamaParser exception, fall back to local extraction instead of raising

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f85d6a6b51
commit
fc430cc10a
1 changed file with 62 additions and 23 deletions
|
|
@ -540,54 +540,93 @@ class DocumentAnalyzer:
|
|||
logging.error(f"Error encoding file for OpenAI: {e}")
|
||||
return None
|
||||
|
||||
def _extract_document_content_local(self, filepath: str) -> str:
    """Extract plain text from a document using only locally installed libraries.

    The parser is chosen from the lower-cased file extension:

    * ``.pdf`` -- PyMuPDF (``fitz``); one ``--- Page N ---`` section per page
      that contains text.
    * ``.pptx`` / ``.ppt`` -- python-pptx; one ``--- Slide N ---`` section per
      slide that contains text.
    * ``.docx`` / ``.doc`` -- python-docx; body paragraphs followed by table
      rows, with the non-empty cells of a row joined by ``" | "``.
    * ``.xlsx`` / ``.xls`` -- openpyxl; one ``--- Sheet: <name> ---`` section
      per sheet that contains at least one non-empty cell.

    Args:
        filepath: Path of the document on disk.

    Returns:
        The extracted text, or a short "No ... found" placeholder string when
        the document yields no text at all.

    Raises:
        Exception: If the extension is not one of those listed above. Errors
            raised by the underlying parsing libraries propagate unchanged.
    """
    ext = os.path.splitext(filepath)[1].lower()
    logging.info(f"Local extraction for {os.path.basename(filepath)} (ext={ext})")

    # NOTE(review): the legacy binary formats (.ppt, .doc, .xls) are routed to
    # python-pptx / python-docx / openpyxl below; those libraries document
    # support for the OOXML formats only, so these inputs presumably fail
    # inside the library -- confirm and consider rejecting them explicitly.

    if ext == '.pdf':
        doc = fitz.open(filepath)
        try:
            pages = []
            # Page numbers follow the document order, so pages without text
            # leave gaps in the numbering instead of shifting later pages.
            for i, page in enumerate(doc, 1):
                text = page.get_text("text")
                if text.strip():
                    pages.append(f"--- Page {i} ---\n{text}")
        finally:
            # Release the file handle even when a page fails to parse.
            doc.close()
        return "\n\n".join(pages) or "No text content found in PDF."

    elif ext in ('.pptx', '.ppt'):
        prs = pptx.Presentation(filepath)
        slides = []
        for i, slide in enumerate(prs.slides, 1):
            # Not every shape exposes a ``text`` attribute, hence the hasattr guard.
            texts = [
                shape.text.strip()
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text.strip()
            ]
            if texts:
                slides.append(f"--- Slide {i} ---\n" + "\n".join(texts))
        return "\n\n".join(slides) or "No text content found in presentation."

    elif ext in ('.docx', '.doc'):
        document = docx.Document(filepath)
        paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
        # Table rows are appended after all body paragraphs, not interleaved
        # at their position in the document.
        for table in document.tables:
            for row in table.rows:
                row_text = " | ".join(
                    cell.text.strip() for cell in row.cells if cell.text.strip()
                )
                if row_text:
                    paragraphs.append(row_text)
        return "\n".join(paragraphs) or "No text content found in document."

    elif ext in ('.xlsx', '.xls'):
        wb = load_workbook(filepath, read_only=True, data_only=True)
        try:
            sheets_text = []
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                rows = []
                for row in ws.iter_rows(values_only=True):
                    # Skip empty cells so sparse rows do not produce runs of separators.
                    cells = [str(c) for c in row if c is not None and str(c).strip()]
                    if cells:
                        rows.append(" | ".join(cells))
                if rows:
                    sheets_text.append(f"--- Sheet: {sheet_name} ---\n" + "\n".join(rows))
        finally:
            # Close the workbook even when reading a sheet raises.
            wb.close()
        return "\n\n".join(sheets_text) or "No content found in spreadsheet."

    else:
        raise Exception(f"Unsupported file type for local extraction: {ext}")
|
||||
|
||||
async def _extract_document_content(self, filepath: str) -> str:
    """Extract content from a document, preferring LlamaParser over local parsing.

    LlamaParser is used when ``config.LLAMACLOUD_API_KEY`` is set. When the key
    is missing or blank, or when the LlamaParser path raises for any reason
    (including a failed import of the client library), extraction falls back
    to ``self._extract_document_content_local``.

    Args:
        filepath: Path of the document on disk.

    Returns:
        The per-page markdown produced by LlamaParser joined with blank lines,
        or the text produced by the local fallback.

    Raises:
        Exception: Whatever the local fallback raises; LlamaParser failures
            themselves are logged and not propagated.
    """
    api_key = config.LLAMACLOUD_API_KEY
    if isinstance(api_key, str):
        # An empty key makes the client send a malformed "Bearer " header;
        # a whitespace-only key is treated the same way, as unset.
        api_key = api_key.strip()

    if not api_key:
        logging.warning("LLAMACLOUD_API_KEY not set — using local document extraction")
        return self._extract_document_content_local(filepath)

    try:
        # Imported lazily so a missing client library also triggers the fallback.
        from llama_cloud_services import LlamaParse

        logging.info(f"Using LlamaParser to extract content from: {os.path.basename(filepath)}")

        parser = LlamaParse(
            # API key for LlamaParser
            api_key=api_key,

            # The parsing mode - use agent-based parsing for better accuracy
            parse_mode="parse_page_with_agent",

            # The model to use - GPT-5 for best results
            model="openai-gpt-5",

            # Whether to use high resolution OCR (slower but more accurate)
            high_res_ocr=True,

            # Adaptive long table detection and output adaptation
            adaptive_long_table=True,

            # Whether to try to extract outlined tables
            outlined_table_extraction=True,

            # Whether to output tables as HTML in the markdown output
            output_tables_as_HTML=True,

            # The page separator
            page_separator="\n\n---\n\n",
        )

        # Use the official async method
        result = await parser.aparse(filepath)

        # Get the markdown documents with page separation
        markdown_documents = result.get_markdown_documents(split_by_page=True)

        # Combine all markdown documents into a single string
        combined_content = "\n\n".join([doc.text for doc in markdown_documents])

        logging.info(f"LlamaParser extraction completed. Content length: {len(combined_content)} characters")
        return combined_content

    except Exception as e:
        # Deliberately broad: any LlamaParser failure should degrade to local
        # extraction instead of failing the whole job.
        logging.error(f"LlamaParser failed: {e} — falling back to local extraction")
        return self._extract_document_content_local(filepath)
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue