added llm based fallback extraction method for file types llamaextract doesn't support

2025-10-17 15:36:55 -05:00 · 2025-10-17 15:36:55 -05:00 · aff4e7154d
commit aff4e7154d
parent c70dc59bfc
3 changed files with 212 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -143,6 +143,9 @@ npm run dev
  - Images: JPG, PNG, GIF, BMP, TIFF, SVG (with OCR)
  - Audio: MP3, WAV, M4A (transcription, 20MB limit)
  - Web: HTML
+- ✅ **Intelligent Processing** - Automatic format detection
+  - PDF/DOCX/PPTX: LlamaExtract for structured extraction
+  - Other formats: Direct LLM analysis (Excel, CSV, images, audio, etc.)
 - ✅ **Background Processing** - Non-blocking document analysis
 - ✅ **Real-time Status** - Watch processing progress

@ -325,6 +328,8 @@ UPDATE users SET is_admin = true WHERE email = 'your@email.com';
   - Web: HTML
 4. Click "Upload X File(s)"
 5. Wait ~1 minute per file for processing
+   - PDF/DOCX/PPTX: Processed with LlamaExtract
+   - Other formats: Processed with direct LLM analysis
 6. Expand files to see summaries, highlights, Q&A

 **Generate Cross-Document Analysis:**
@ -386,6 +391,9 @@ UPDATE users SET is_admin = true WHERE email = 'your@email.com';
 **AI/ML:**
 - LlamaCloud (document indexing)
 - LlamaIndex (RAG)
+- LlamaParse (document parsing - supports 30+ formats)
+- LlamaExtract (structured extraction for PDF/DOCX/PPTX)
+- Direct LLM extraction (fallback for Excel, CSV, images, audio, etc.)
 - OpenAI (GPT-4, GPT-5)
 - Anthropic (Claude 3.5, 4.5)
 - Google (Gemini 2.0, 2.5 Pro)
--- a/backend/src/notebookllama/background_tasks.py
+++ b/backend/src/notebookllama/background_tasks.py
@ -206,30 +206,47 @@ async def execute_document_processing_task(task_id: int):

        text = "\n\n---\n\n".join([md.text for md in md_content])

-        extraction_output = await EXTRACT_AGENT.aextract(
-            files=SourceText(text_content=text, filename=original_filename)
-        )
+        # Determine extraction method based on file type
+        # LlamaExtract only supports: PDF, DOCX, PPTX (and their variants)
+        llamaextract_supported = ('.pdf', '.docx', '.doc', '.pptx', '.ppt')
+        file_ext = os.path.splitext(original_filename.lower())[1]
+        use_llamaextract = file_ext in llamaextract_supported

-        if extraction_output:
-            notebook_data = extraction_output.data
-
-            # Save summary
-            create_document_summary(
-                user_id, document.id,
-                notebook_data.get('summary', ''),
-                notebook_data.get('highlights', []),
-                notebook_data.get('questions', []),
-                notebook_data.get('answers', []),
-                text or ''
+        if use_llamaextract:
+            # Use LlamaExtract for supported formats (PDF, DOCX, PPTX)
+            print(f"Using LlamaExtract for {file_ext}")
+            extraction_output = await EXTRACT_AGENT.aextract(
+                files=SourceText(text_content=text, filename=original_filename)
            )

-            # Clean up temp file
-            if os.path.exists(file_path):
-                os.remove(file_path)
-
-            update_task_status(task_id, TaskStatus.COMPLETED, result={'document_id': document.id})
+            if extraction_output:
+                notebook_data = extraction_output.data
+            else:
+                update_task_status(task_id, TaskStatus.FAILED, error="LlamaExtract failed")
+                return
        else:
-            update_task_status(task_id, TaskStatus.FAILED, error="Extraction failed")
+            # Use LLM-based extraction for unsupported formats (Excel, CSV, images, audio, etc.)
+            print(f"Using LLM extraction fallback for {file_ext}")
+            from llm_extraction import extract_with_llm
+
+            # Use the notebook's configured model type
+            notebook_data = await extract_with_llm(text, original_filename, notebook.model_type)
+
+        # Save summary (same format from both extraction methods)
+        create_document_summary(
+            user_id, document.id,
+            notebook_data.get('summary', ''),
+            notebook_data.get('highlights', []),
+            notebook_data.get('questions', []),
+            notebook_data.get('answers', []),
+            text or ''
+        )
+
+        # Clean up temp file
+        if os.path.exists(file_path):
+            os.remove(file_path)
+
+        update_task_status(task_id, TaskStatus.COMPLETED, result={'document_id': document.id})

    except Exception as e:
        error_msg = f"Processing failed: {str(e)[:300]}"
--- a/backend/src/notebookllama/llm_extraction.py
+++ b/backend/src/notebookllama/llm_extraction.py
@ -0,0 +1,167 @@
+"""
+LLM-based document extraction fallback for file types not supported by LlamaExtract.
+Generates the same structured output: summary, highlights, questions, answers.
+"""
+
+from typing import Dict, List
+from llm_factory import get_llm_by_type
+
+
+async def extract_with_llm(text_content: str, filename: str, model_type: str = 'gpt4o') -> Dict:
+    """
+    Extract structured information from document text using LLM.
+    This is a fallback for when LlamaExtract doesn't support the file format.
+
+    Args:
+        text_content: Markdown text extracted by LlamaParse
+        filename: Original filename (for context)
+        model_type: Which LLM to use
+
+    Returns:
+        Dict with same structure as LlamaExtract:
+        {
+            'summary': str,
+            'highlights': List[str],
+            'questions': List[str],
+            'answers': List[str]
+        }
+    """
+
+    # Get appropriate LLM
+    llm = get_llm_by_type(model_type)
+
+    # Create comprehensive prompt for extraction
+    prompt = f"""You are analyzing a document titled "{filename}".
+
+Below is the document content in markdown format:
+
+{text_content[:15000]}
+
+Please analyze this document and provide:
+
+1. A comprehensive summary (2-3 paragraphs) covering the main points and key takeaways.
+
+2. A list of 5-8 key highlights or important points from the document.
+
+3. A list of 3-5 important questions that this document answers.
+
+4. The corresponding answers to those questions based on the document content.
+
+Format your response EXACTLY as follows (use this exact structure):
+
+SUMMARY:
+[Your 2-3 paragraph summary here]
+
+HIGHLIGHTS:
+- [Highlight 1]
+- [Highlight 2]
+- [Highlight 3]
+(continue for 5-8 highlights)
+
+QUESTIONS:
+1. [Question 1]
+2. [Question 2]
+3. [Question 3]
+(continue for 3-5 questions)
+
+ANSWERS:
+1. [Answer to question 1]
+2. [Answer to question 2]
+3. [Answer to question 3]
+(continue matching the questions)
+"""
+
+    # Get response from LLM
+    try:
+        response = await llm.acomplete(prompt)
+        response_text = response.text
+
+        # Parse the structured response
+        result = parse_llm_extraction_response(response_text)
+
+        return result
+
+    except Exception as e:
+        print(f"Error in LLM extraction: {e}")
+        # Return minimal structure if extraction fails
+        return {
+            'summary': f"Document: {filename}\n\n{text_content[:500]}...",
+            'highlights': ["Content extraction in progress"],
+            'questions': ["What is this document about?"],
+            'answers': ["This document is being processed."]
+        }
+
+
+def parse_llm_extraction_response(response_text: str) -> Dict:
+    """
+    Parse the LLM response into structured format matching LlamaExtract output.
+
+    Args:
+        response_text: Raw LLM response
+
+    Returns:
+        Dict with summary, highlights, questions, answers
+    """
+    result = {
+        'summary': '',
+        'highlights': [],
+        'questions': [],
+        'answers': []
+    }
+
+    # Split by sections
+    sections = response_text.split('\n')
+    current_section = None
+
+    summary_lines = []
+    highlights = []
+    questions = []
+    answers = []
+
+    for line in sections:
+        line = line.strip()
+
+        # Detect section headers
+        if line.upper().startswith('SUMMARY:'):
+            current_section = 'summary'
+            continue
+        elif line.upper().startswith('HIGHLIGHTS:'):
+            current_section = 'highlights'
+            continue
+        elif line.upper().startswith('QUESTIONS:'):
+            current_section = 'questions'
+            continue
+        elif line.upper().startswith('ANSWERS:'):
+            current_section = 'answers'
+            continue
+
+        # Skip empty lines
+        if not line:
+            continue
+
+        # Collect content by section
+        if current_section == 'summary':
+            summary_lines.append(line)
+        elif current_section == 'highlights':
+            # Remove bullet points/dashes
+            clean_line = line.lstrip('- •*123456789.')
+            if clean_line:
+                highlights.append(clean_line.strip())
+        elif current_section == 'questions':
+            # Remove numbering
+            clean_line = line.lstrip('123456789.')
+            if clean_line:
+                questions.append(clean_line.strip())
+        elif current_section == 'answers':
+            # Remove numbering
+            clean_line = line.lstrip('123456789.')
+            if clean_line:
+                answers.append(clean_line.strip())
+
+    # Assemble results
+    result['summary'] = ' '.join(summary_lines) if summary_lines else "Summary not available"
+    result['highlights'] = highlights[:10] if highlights else ["No highlights extracted"]
+    result['questions'] = questions[:10] if questions else ["What is this document about?"]
+    result['answers'] = answers[:10] if answers else ["Content being processed"]
+
+    return result