fix: add local document extraction fallback when LLAMACLOUD_API_KEY is absent

When LLAMACLOUD_API_KEY is empty, the LlamaParse client constructed a Bearer token with an empty secret, causing Python's HTTP stack to raise "Illegal header value b'Bearer '" and fail every upload job.

Changes:
- _extract_document_content_local(): new method using PyMuPDF (PDF), python-pptx (PPTX), python-docx (DOCX), openpyxl (XLSX) — all already in requirements.txt
- _extract_document_content(): skip LlamaParser entirely if the key is not set; on a LlamaParser exception, fall back to local extraction instead of raising

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-23 15:08:23 +00:00 · 2026-03-23 15:08:23 +00:00 · fc430cc10a
commit fc430cc10a
parent f85d6a6b51
1 changed files with 62 additions and 23 deletions
--- a/backend/core/process_brief_enhanced.py
+++ b/backend/core/process_brief_enhanced.py
@@ -540,54 +540,93 @@ class DocumentAnalyzer:
            logging.error(f"Error encoding file for OpenAI: {e}")
            return None
    
+    def _extract_document_content_local(self, filepath: str) -> str:
+        """Local fallback extraction using PyMuPDF / python-pptx / python-docx / openpyxl."""
+        ext = os.path.splitext(filepath)[1].lower()
+        logging.info(f"Local extraction for {os.path.basename(filepath)} (ext={ext})")
+
+        if ext == '.pdf':
+            doc = fitz.open(filepath)
+            pages = []
+            for i, page in enumerate(doc, 1):
+                text = page.get_text("text")
+                if text.strip():
+                    pages.append(f"--- Page {i} ---\n{text}")
+            doc.close()
+            return "\n\n".join(pages) or "No text content found in PDF."
+
+        elif ext in ('.pptx', '.ppt'):
+            prs = pptx.Presentation(filepath)
+            slides = []
+            for i, slide in enumerate(prs.slides, 1):
+                texts = []
+                for shape in slide.shapes:
+                    if hasattr(shape, "text") and shape.text.strip():
+                        texts.append(shape.text.strip())
+                if texts:
+                    slides.append(f"--- Slide {i} ---\n" + "\n".join(texts))
+            return "\n\n".join(slides) or "No text content found in presentation."
+
+        elif ext in ('.docx', '.doc'):
+            document = docx.Document(filepath)
+            paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
+            for table in document.tables:
+                for row in table.rows:
+                    row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
+                    if row_text:
+                        paragraphs.append(row_text)
+            return "\n".join(paragraphs) or "No text content found in document."
+
+        elif ext in ('.xlsx', '.xls'):
+            wb = load_workbook(filepath, read_only=True, data_only=True)
+            sheets_text = []
+            for sheet_name in wb.sheetnames:
+                ws = wb[sheet_name]
+                rows = []
+                for row in ws.iter_rows(values_only=True):
+                    cells = [str(c) for c in row if c is not None and str(c).strip()]
+                    if cells:
+                        rows.append(" | ".join(cells))
+                if rows:
+                    sheets_text.append(f"--- Sheet: {sheet_name} ---\n" + "\n".join(rows))
+            wb.close()
+            return "\n\n".join(sheets_text) or "No content found in spreadsheet."
+
+        else:
+            raise Exception(f"Unsupported file type for local extraction: {ext}")
+
    async def _extract_document_content(self, filepath: str) -> str:
-        """Extract markdown content from document using LlamaParser cloud service."""
+        """Extract content from document — LlamaParser if key is configured, else local fallback."""
+        if not config.LLAMACLOUD_API_KEY:
+            logging.warning("LLAMACLOUD_API_KEY not set — using local document extraction")
+            return self._extract_document_content_local(filepath)
+
        try:
            from llama_cloud_services import LlamaParse

            logging.info(f"Using LlamaParser to extract content from: {os.path.basename(filepath)}")

            parser = LlamaParse(
-                # API key for LlamaParser
                api_key=config.LLAMACLOUD_API_KEY,
-
-                # The parsing mode - use agent-based parsing for better accuracy
                parse_mode="parse_page_with_agent",
-
-                # The model to use - GPT-5 for best results
                model="openai-gpt-5",
-
-                # Whether to use high resolution OCR (slower but more accurate)
                high_res_ocr=True,
-
-                # Adaptive long table detection and output adaptation
                adaptive_long_table=True,
-
-                # Whether to try to extract outlined tables
                outlined_table_extraction=True,
-
-                # Whether to output tables as HTML in the markdown output
                output_tables_as_HTML=True,
-
-                # The page separator
                page_separator="\n\n---\n\n",
            )

-            # Use the official async method
            result = await parser.aparse(filepath)
-
-            # Get the markdown documents with page separation
            markdown_documents = result.get_markdown_documents(split_by_page=True)
-
-            # Combine all markdown documents into a single string
            combined_content = "\n\n".join([doc.text for doc in markdown_documents])

            logging.info(f"LlamaParser extraction completed. Content length: {len(combined_content)} characters")
            return combined_content

        except Exception as e:
-            logging.error(f"Error extracting document content with LlamaParser: {e}")
-            raise Exception(f"LlamaParser extraction failed: {e}")
+            logging.error(f"LlamaParser failed: {e} — falling back to local extraction")
+            return self._extract_document_content_local(filepath)