diff --git a/README.md b/README.md index db7e9e6..40eba89 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,9 @@ npm run dev - Images: JPG, PNG, GIF, BMP, TIFF, SVG (with OCR) - Audio: MP3, WAV, M4A (transcription, 20MB limit) - Web: HTML +- ✅ **Intelligent Processing** - Automatic format detection + - PDF/DOCX/PPTX: LlamaExtract for structured extraction + - Other formats: Direct LLM analysis (Excel, CSV, images, audio, etc.) - ✅ **Background Processing** - Non-blocking document analysis - ✅ **Real-time Status** - Watch processing progress @@ -325,6 +328,8 @@ UPDATE users SET is_admin = true WHERE email = 'your@email.com'; - Web: HTML 4. Click "Upload X File(s)" 5. Wait ~1 minute per file for processing + - PDF/DOCX/PPTX: Processed with LlamaExtract + - Other formats: Processed with direct LLM analysis 6. Expand files to see summaries, highlights, Q&A **Generate Cross-Document Analysis:** @@ -386,6 +391,9 @@ UPDATE users SET is_admin = true WHERE email = 'your@email.com'; **AI/ML:** - LlamaCloud (document indexing) - LlamaIndex (RAG) +- LlamaParse (document parsing - supports 30+ formats) +- LlamaExtract (structured extraction for PDF/DOCX/PPTX) +- Direct LLM extraction (fallback for Excel, CSV, images, audio, etc.) 
- OpenAI (GPT-4, GPT-5) - Anthropic (Claude 3.5, 4.5) - Google (Gemini 2.0, 2.5 Pro) diff --git a/backend/src/notebookllama/background_tasks.py b/backend/src/notebookllama/background_tasks.py index 2f7ec5a..4ebbd0d 100644 --- a/backend/src/notebookllama/background_tasks.py +++ b/backend/src/notebookllama/background_tasks.py @@ -206,30 +206,47 @@ async def execute_document_processing_task(task_id: int): text = "\n\n---\n\n".join([md.text for md in md_content]) - extraction_output = await EXTRACT_AGENT.aextract( - files=SourceText(text_content=text, filename=original_filename) - ) + # Determine extraction method based on file type + # LlamaExtract only supports: PDF, DOCX, PPTX (and their variants) + llamaextract_supported = ('.pdf', '.docx', '.doc', '.pptx', '.ppt') + file_ext = os.path.splitext(original_filename.lower())[1] + use_llamaextract = file_ext in llamaextract_supported - if extraction_output: - notebook_data = extraction_output.data - - # Save summary - create_document_summary( - user_id, document.id, - notebook_data.get('summary', ''), - notebook_data.get('highlights', []), - notebook_data.get('questions', []), - notebook_data.get('answers', []), - text or '' + if use_llamaextract: + # Use LlamaExtract for supported formats (PDF, DOCX, PPTX) + print(f"Using LlamaExtract for {file_ext}") + extraction_output = await EXTRACT_AGENT.aextract( + files=SourceText(text_content=text, filename=original_filename) ) - # Clean up temp file - if os.path.exists(file_path): - os.remove(file_path) - - update_task_status(task_id, TaskStatus.COMPLETED, result={'document_id': document.id}) + if extraction_output: + notebook_data = extraction_output.data + else: + update_task_status(task_id, TaskStatus.FAILED, error="LlamaExtract failed") + return else: - update_task_status(task_id, TaskStatus.FAILED, error="Extraction failed") + # Use LLM-based extraction for unsupported formats (Excel, CSV, images, audio, etc.) 
"""
LLM-based document extraction fallback for file types not supported by LlamaExtract.
Generates the same structured output: summary, highlights, questions, answers.
"""

import re
from typing import Dict, List

# Only the first part of the document is sent to the model to bound prompt size.
_MAX_PROMPT_CHARS = 15000
# Length of the raw-text preview used in the best-effort fallback summary.
_FALLBACK_PREVIEW_CHARS = 500
# Upper bound on the number of items kept per list section.
_MAX_ITEMS = 10

# A section header line: the section name, optionally wrapped in markdown
# decoration ("## Summary", "**SUMMARY:**"), optionally followed by a colon and
# inline content.  Without a colon the name must be alone on the line, so a
# sentence such as "Summary of the findings ..." is not mistaken for a header.
_SECTION_HEADER = re.compile(
    r'^[#*_\s]*(?P<name>summary|highlights|questions|answers)[*_\s]*'
    r'(?::[*_\s]*(?P<rest>.*))?$',
    re.IGNORECASE,
)

# One leading list marker: an optional bullet ("-", "*", "•") and/or an optional
# item number ("3.", "12)").  Each must be followed by whitespace or end of line
# so that real content such as "2023 revenue" or "3.5 million" is left intact.
_LIST_MARKER = re.compile(r'^(?:[-•*](?:\s+|$))?(?:\d+[.)](?:\s+|$))?')


async def extract_with_llm(text_content: str, filename: str, model_type: str = 'gpt4o') -> Dict:
    """
    Extract structured information from document text using LLM.
    This is a fallback for when LlamaExtract doesn't support the file format.

    Args:
        text_content: Markdown text extracted by LlamaParse
        filename: Original filename (for context)
        model_type: Which LLM to use

    Returns:
        Dict with same structure as LlamaExtract:
        {
            'summary': str,
            'highlights': List[str],
            'questions': List[str],
            'answers': List[str]
        }
        If the LLM call or the parsing raises, a minimal placeholder dict with
        the same keys is returned instead of propagating the error.
    """
    # Imported here so the parsing helpers in this module stay importable
    # without the LLM stack being installed/configured.
    from llm_factory import get_llm_by_type

    # Tolerate a missing document body; slicing None would raise TypeError.
    text_content = text_content or ''

    # Get appropriate LLM
    llm = get_llm_by_type(model_type)

    # Create comprehensive prompt for extraction
    prompt = f"""You are analyzing a document titled "{filename}".

Below is the document content in markdown format:

{text_content[:_MAX_PROMPT_CHARS]}

Please analyze this document and provide:

1. A comprehensive summary (2-3 paragraphs) covering the main points and key takeaways.

2. A list of 5-8 key highlights or important points from the document.

3. A list of 3-5 important questions that this document answers.

4. The corresponding answers to those questions based on the document content.

Format your response EXACTLY as follows (use this exact structure):

SUMMARY:
[Your 2-3 paragraph summary here]

HIGHLIGHTS:
- [Highlight 1]
- [Highlight 2]
- [Highlight 3]
(continue for 5-8 highlights)

QUESTIONS:
1. [Question 1]
2. [Question 2]
3. [Question 3]
(continue for 3-5 questions)

ANSWERS:
1. [Answer to question 1]
2. [Answer to question 2]
3. [Answer to question 3]
(continue matching the questions)
"""

    # Get response from LLM
    try:
        response = await llm.acomplete(prompt)
        return parse_llm_extraction_response(response.text)
    except Exception as e:
        # Deliberate best-effort: any failure yields a placeholder result.
        # NOTE(review): the caller cannot distinguish this placeholder from a
        # real extraction - confirm that is the intended behaviour.
        print(f"Error in LLM extraction: {e}")
        # Return minimal structure if extraction fails
        return {
            'summary': f"Document: {filename}\n\n{text_content[:_FALLBACK_PREVIEW_CHARS]}...",
            'highlights': ["Content extraction in progress"],
            'questions': ["What is this document about?"],
            'answers': ["This document is being processed."]
        }


def _strip_list_marker(line: str) -> str:
    """Remove one leading bullet and/or item number from *line* and trim it."""
    return _LIST_MARKER.sub('', line, count=1).strip()


def parse_llm_extraction_response(response_text: str) -> Dict:
    """
    Parse the LLM response into structured format matching LlamaExtract output.

    The response is scanned line by line. A header line (SUMMARY, HIGHLIGHTS,
    QUESTIONS, ANSWERS; case-insensitive, markdown decoration tolerated)
    switches the current section; text following the colon on the header line
    is kept as content of that section. Lines before the first header are
    ignored. List items have a single leading bullet/number marker removed.

    Args:
        response_text: Raw LLM response (None or empty yields the defaults)

    Returns:
        Dict with summary, highlights, questions, answers. Summary lines are
        joined with single spaces; each list is capped at 10 items; a section
        with no content gets a placeholder value.
    """
    collected = {'summary': [], 'highlights': [], 'questions': [], 'answers': []}
    current_section = None

    for raw_line in (response_text or '').split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        header = _SECTION_HEADER.match(line)
        if header:
            current_section = header.group('name').lower()
            # Keep any content that shares the line with the header.
            line = (header.group('rest') or '').strip()
            if not line:
                continue

        if current_section is None:
            # Preamble before the first recognised header.
            continue

        if current_section == 'summary':
            collected['summary'].append(line)
            continue

        item = _strip_list_marker(line)
        if item:
            collected[current_section].append(item)

    summary_lines = collected['summary']
    highlights = collected['highlights']
    questions = collected['questions']
    answers = collected['answers']

    return {
        'summary': ' '.join(summary_lines) if summary_lines else "Summary not available",
        'highlights': highlights[:_MAX_ITEMS] if highlights else ["No highlights extracted"],
        'questions': questions[:_MAX_ITEMS] if questions else ["What is this document about?"],
        'answers': answers[:_MAX_ITEMS] if answers else ["Content being processed"],
    }