# Methods of ``DocumentAnalyzer`` (backend/core/process_brief_enhanced.py),
# written at column 0 here; indent one level when placed in the class body.


def _extract_document_content_local(self, filepath: str) -> str:
    """Extract plain text from a document with locally installed parsers.

    The parser is chosen from the file extension (case-insensitive):
    ``.pdf`` via PyMuPDF, ``.pptx``/``.ppt`` via python-pptx,
    ``.docx``/``.doc`` via python-docx and ``.xlsx``/``.xls`` via openpyxl.
    Pages, slides and sheets are emitted under ``--- Page N ---`` /
    ``--- Slide N ---`` / ``--- Sheet: name ---`` headers; empty ones are
    skipped.

    Args:
        filepath: Path of the document to read.

    Returns:
        The extracted text, or a fixed "No ... found ..." placeholder string
        when the document yields no text at all.

    Raises:
        ValueError: If the extension is not handled, or if a legacy binary
            Office file (``.ppt``, ``.doc``, ``.xls``) could not be parsed.
        Exception: Parser errors for the other formats propagate unchanged.
    """
    ext = os.path.splitext(filepath)[1].lower()
    logging.info(f"Local extraction for {os.path.basename(filepath)} (ext={ext})")

    # python-pptx, python-docx and openpyxl read OOXML packages only. The
    # legacy binary formats are still attempted (a mis-named OOXML file can
    # parse), but a failure is reported with an actionable message instead
    # of the parser's low-level error.
    legacy_to_modern = {'.ppt': '.pptx', '.doc': '.docx', '.xls': '.xlsx'}

    try:
        if ext == '.pdf':
            doc = fitz.open(filepath)
            try:
                pages = []
                for i, page in enumerate(doc, 1):
                    text = page.get_text("text")
                    if text.strip():
                        pages.append(f"--- Page {i} ---\n{text}")
            finally:
                # Release the file handle even when a page fails to parse.
                doc.close()
            return "\n\n".join(pages) or "No text content found in PDF."

        if ext in ('.pptx', '.ppt'):
            prs = pptx.Presentation(filepath)
            slides = []
            for i, slide in enumerate(prs.slides, 1):
                texts = [
                    shape.text.strip()
                    for shape in slide.shapes
                    # Not every shape type exposes a ``text`` attribute.
                    if hasattr(shape, "text") and shape.text.strip()
                ]
                if texts:
                    slides.append(f"--- Slide {i} ---\n" + "\n".join(texts))
            return "\n\n".join(slides) or "No text content found in presentation."

        if ext in ('.docx', '.doc'):
            document = docx.Document(filepath)
            paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
            # Table text is appended after the body paragraphs, one line per
            # row, with non-empty cells joined by " | ".
            for table in document.tables:
                for row in table.rows:
                    cell_texts = [cell.text.strip() for cell in row.cells]
                    row_text = " | ".join(t for t in cell_texts if t)
                    if row_text:
                        paragraphs.append(row_text)
            return "\n".join(paragraphs) or "No text content found in document."

        if ext in ('.xlsx', '.xls'):
            # data_only=True reads cached formula results rather than formulas.
            wb = load_workbook(filepath, read_only=True, data_only=True)
            try:
                sheets_text = []
                for sheet_name in wb.sheetnames:
                    ws = wb[sheet_name]
                    rows = []
                    for row in ws.iter_rows(values_only=True):
                        cells = [str(c) for c in row if c is not None and str(c).strip()]
                        if cells:
                            rows.append(" | ".join(cells))
                    if rows:
                        sheets_text.append(f"--- Sheet: {sheet_name} ---\n" + "\n".join(rows))
            finally:
                # Read-only workbooks keep the file open until closed.
                wb.close()
            return "\n\n".join(sheets_text) or "No content found in spreadsheet."
    except Exception as err:
        if ext in legacy_to_modern:
            raise ValueError(
                f"Local extraction cannot read legacy {ext} file "
                f"{os.path.basename(filepath)}; convert it to "
                f"{legacy_to_modern[ext]} first: {err}"
            ) from err
        raise

    raise ValueError(f"Unsupported file type for local extraction: {ext}")


async def _extract_document_content(self, filepath: str) -> str:
    """Extract document content, preferring LlamaParser over local parsing.

    When ``config.LLAMACLOUD_API_KEY`` is unset, or when the LlamaParser
    import or call fails for any reason, the document is parsed locally with
    ``_extract_document_content_local`` instead.

    Args:
        filepath: Path of the document to read.

    Returns:
        The per-page markdown from LlamaParser joined by blank lines, or the
        text produced by the local fallback.

    Raises:
        RuntimeError: If LlamaParser failed and the local fallback failed as
            well; the message carries both errors.
        Exception: Errors from the local extractor when no API key is set.
    """
    # NOTE: the local extractor is synchronous, so both fallback calls below
    # block the event loop for the duration of the parse.
    if not config.LLAMACLOUD_API_KEY:
        logging.warning("LLAMACLOUD_API_KEY not set — using local document extraction")
        return self._extract_document_content_local(filepath)

    try:
        # Imported lazily so a missing package triggers the fallback too.
        from llama_cloud_services import LlamaParse

        logging.info(f"Using LlamaParser to extract content from: {os.path.basename(filepath)}")

        parser = LlamaParse(
            api_key=config.LLAMACLOUD_API_KEY,
            parse_mode="parse_page_with_agent",
            model="openai-gpt-5",
            high_res_ocr=True,
            adaptive_long_table=True,
            outlined_table_extraction=True,
            output_tables_as_HTML=True,
            page_separator="\n\n---\n\n",
        )

        result = await parser.aparse(filepath)
        markdown_documents = result.get_markdown_documents(split_by_page=True)
        combined_content = "\n\n".join([doc.text for doc in markdown_documents])

        logging.info(f"LlamaParser extraction completed. Content length: {len(combined_content)} characters")
        return combined_content

    except Exception as e:
        logging.error(f"LlamaParser failed: {e} — falling back to local extraction")
        try:
            return self._extract_document_content_local(filepath)
        except Exception as local_err:
            # Report both failures so the caller can see why the cloud path
            # was abandoned as well as why the fallback could not help.
            raise RuntimeError(
                f"LlamaParser extraction failed: {e}; "
                f"local fallback also failed: {local_err}"
            ) from local_err