Added LLM-based fallback extraction method for file types LlamaExtract doesn't support
This commit is contained in:
parent
c70dc59bfc
commit
aff4e7154d
3 changed files with 212 additions and 20 deletions
|
|
@ -143,6 +143,9 @@ npm run dev
|
|||
- Images: JPG, PNG, GIF, BMP, TIFF, SVG (with OCR)
|
||||
- Audio: MP3, WAV, M4A (transcription, 20MB limit)
|
||||
- Web: HTML
|
||||
- ✅ **Intelligent Processing** - Automatic format detection
|
||||
- PDF/DOCX/PPTX: LlamaExtract for structured extraction
|
||||
- Other formats: Direct LLM analysis (Excel, CSV, images, audio, etc.)
|
||||
- ✅ **Background Processing** - Non-blocking document analysis
|
||||
- ✅ **Real-time Status** - Watch processing progress
|
||||
|
||||
|
|
@ -325,6 +328,8 @@ UPDATE users SET is_admin = true WHERE email = 'your@email.com';
|
|||
- Web: HTML
|
||||
4. Click "Upload X File(s)"
|
||||
5. Wait ~1 minute per file for processing
|
||||
- PDF/DOCX/PPTX: Processed with LlamaExtract
|
||||
- Other formats: Processed with direct LLM analysis
|
||||
6. Expand files to see summaries, highlights, Q&A
|
||||
|
||||
**Generate Cross-Document Analysis:**
|
||||
|
|
@ -386,6 +391,9 @@ UPDATE users SET is_admin = true WHERE email = 'your@email.com';
|
|||
**AI/ML:**
|
||||
- LlamaCloud (document indexing)
|
||||
- LlamaIndex (RAG)
|
||||
- LlamaParse (document parsing - supports 30+ formats)
|
||||
- LlamaExtract (structured extraction for PDF/DOCX/PPTX)
|
||||
- Direct LLM extraction (fallback for Excel, CSV, images, audio, etc.)
|
||||
- OpenAI (GPT-4, GPT-5)
|
||||
- Anthropic (Claude 3.5, 4.5)
|
||||
- Google (Gemini 2.0, 2.5 Pro)
|
||||
|
|
|
|||
|
|
@ -206,30 +206,47 @@ async def execute_document_processing_task(task_id: int):
|
|||
|
||||
text = "\n\n---\n\n".join([md.text for md in md_content])
|
||||
|
||||
extraction_output = await EXTRACT_AGENT.aextract(
|
||||
files=SourceText(text_content=text, filename=original_filename)
|
||||
)
|
||||
# Determine extraction method based on file type
|
||||
# LlamaExtract only supports: PDF, DOCX, PPTX (and their variants)
|
||||
llamaextract_supported = ('.pdf', '.docx', '.doc', '.pptx', '.ppt')
|
||||
file_ext = os.path.splitext(original_filename.lower())[1]
|
||||
use_llamaextract = file_ext in llamaextract_supported
|
||||
|
||||
if extraction_output:
|
||||
notebook_data = extraction_output.data
|
||||
|
||||
# Save summary
|
||||
create_document_summary(
|
||||
user_id, document.id,
|
||||
notebook_data.get('summary', ''),
|
||||
notebook_data.get('highlights', []),
|
||||
notebook_data.get('questions', []),
|
||||
notebook_data.get('answers', []),
|
||||
text or ''
|
||||
if use_llamaextract:
|
||||
# Use LlamaExtract for supported formats (PDF, DOCX, PPTX)
|
||||
print(f"Using LlamaExtract for {file_ext}")
|
||||
extraction_output = await EXTRACT_AGENT.aextract(
|
||||
files=SourceText(text_content=text, filename=original_filename)
|
||||
)
|
||||
|
||||
# Clean up temp file
|
||||
if os.path.exists(file_path):
|
||||
os.remove(file_path)
|
||||
|
||||
update_task_status(task_id, TaskStatus.COMPLETED, result={'document_id': document.id})
|
||||
if extraction_output:
|
||||
notebook_data = extraction_output.data
|
||||
else:
|
||||
update_task_status(task_id, TaskStatus.FAILED, error="LlamaExtract failed")
|
||||
return
|
||||
else:
|
||||
update_task_status(task_id, TaskStatus.FAILED, error="Extraction failed")
|
||||
# Use LLM-based extraction for unsupported formats (Excel, CSV, images, audio, etc.)
|
||||
print(f"Using LLM extraction fallback for {file_ext}")
|
||||
from llm_extraction import extract_with_llm
|
||||
|
||||
# Use the notebook's configured model type
|
||||
notebook_data = await extract_with_llm(text, original_filename, notebook.model_type)
|
||||
|
||||
# Save summary (same format from both extraction methods)
|
||||
create_document_summary(
|
||||
user_id, document.id,
|
||||
notebook_data.get('summary', ''),
|
||||
notebook_data.get('highlights', []),
|
||||
notebook_data.get('questions', []),
|
||||
notebook_data.get('answers', []),
|
||||
text or ''
|
||||
)
|
||||
|
||||
# Clean up temp file
|
||||
if os.path.exists(file_path):
|
||||
os.remove(file_path)
|
||||
|
||||
update_task_status(task_id, TaskStatus.COMPLETED, result={'document_id': document.id})
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Processing failed: {str(e)[:300]}"
|
||||
|
|
|
|||
167
backend/src/notebookllama/llm_extraction.py
Normal file
167
backend/src/notebookllama/llm_extraction.py
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
"""
|
||||
LLM-based document extraction fallback for file types not supported by LlamaExtract.
|
||||
Generates the same structured output: summary, highlights, questions, answers.
|
||||
"""
|
||||
|
||||
from typing import Dict, List
|
||||
from llm_factory import get_llm_by_type
|
||||
|
||||
|
||||
async def extract_with_llm(text_content: str, filename: str, model_type: str = 'gpt4o') -> Dict:
    """
    Produce a structured overview of a document by prompting an LLM directly.

    Fallback path for file formats that LlamaExtract does not handle. Only the
    first 15000 characters of ``text_content`` are embedded in the prompt.

    Args:
        text_content: Document text in markdown form (presumably the
            LlamaParse output; confirm against the caller).
        filename: Original filename, quoted in the prompt as the title.
        model_type: Identifier handed to ``get_llm_by_type`` to pick the LLM.

    Returns:
        Dict with the keys 'summary' (str), 'highlights', 'questions' and
        'answers' (lists of str). If the completion call or the parsing step
        raises, the error is printed and a placeholder dict with the same
        keys is returned instead of propagating the exception.
    """
    # The model is resolved outside the try block, so a failure here
    # propagates to the caller rather than triggering the placeholder result.
    model = get_llm_by_type(model_type)

    # Cap the amount of document text that goes into the prompt.
    excerpt = text_content[:15000]

    prompt = f"""You are analyzing a document titled "{filename}".

Below is the document content in markdown format:

{excerpt}

Please analyze this document and provide:

1. A comprehensive summary (2-3 paragraphs) covering the main points and key takeaways.

2. A list of 5-8 key highlights or important points from the document.

3. A list of 3-5 important questions that this document answers.

4. The corresponding answers to those questions based on the document content.

Format your response EXACTLY as follows (use this exact structure):

SUMMARY:
[Your 2-3 paragraph summary here]

HIGHLIGHTS:
- [Highlight 1]
- [Highlight 2]
- [Highlight 3]
(continue for 5-8 highlights)

QUESTIONS:
1. [Question 1]
2. [Question 2]
3. [Question 3]
(continue for 3-5 questions)

ANSWERS:
1. [Answer to question 1]
2. [Answer to question 2]
3. [Answer to question 3]
(continue matching the questions)
"""

    try:
        completion = await model.acomplete(prompt)
        # Turn the sectioned free-text reply into the structured dict.
        return parse_llm_extraction_response(completion.text)
    except Exception as exc:
        print(f"Error in LLM extraction: {exc}")
        # Best effort: hand back a minimal structure with the expected keys.
        return {
            'summary': f"Document: {filename}\n\n{text_content[:500]}...",
            'highlights': ["Content extraction in progress"],
            'questions': ["What is this document about?"],
            'answers': ["This document is being processed."]
        }
|
||||
|
||||
|
||||
def parse_llm_extraction_response(response_text: str) -> Dict:
    """
    Parse the sectioned LLM reply into the LlamaExtract-style result dict.

    The reply is scanned line by line for the headers ``SUMMARY:``,
    ``HIGHLIGHTS:``, ``QUESTIONS:`` and ``ANSWERS:`` (case-insensitive, at the
    start of a line). Text that follows a header on the same line is kept as
    the first content line of that section. Lines before the first header and
    blank lines are ignored.

    For list sections a single leading list marker ("-", "•", "*", "1.",
    "12)") is removed, but only when it is followed by whitespace, so digits
    that belong to the content ("3.5 million users", "2023 was ...") survive.

    Args:
        response_text: Raw LLM response. ``None`` or an empty string is
            treated as a reply without any sections.

    Returns:
        Dict with 'summary' (summary lines joined by single spaces) and
        'highlights', 'questions', 'answers' (lists of str, at most 10 entries
        each). A section that is missing or empty gets its placeholder value.
    """
    import re  # function-scope import keeps this block self-contained

    # A real list marker is a bullet or "<digits>." / "<digits>)" followed by
    # whitespace (or nothing else on the line). str.lstrip() with a character
    # set is not usable here: it would also eat leading digits of the content.
    list_marker = re.compile(r'^(?:[-•*]|\d+[.)])(?:\s+|$)')

    headers = (
        ('SUMMARY:', 'summary'),
        ('HIGHLIGHTS:', 'highlights'),
        ('QUESTIONS:', 'questions'),
        ('ANSWERS:', 'answers'),
    )
    collected = {key: [] for _, key in headers}
    current_section = None

    for raw_line in (response_text or '').split('\n'):
        line = raw_line.strip()

        # Switch section on a header; keep whatever follows it on the line.
        for header, key in headers:
            if line[:len(header)].upper() == header:
                current_section = key
                line = line[len(header):].strip()
                break

        if not line or current_section is None:
            continue

        if current_section != 'summary':
            line = list_marker.sub('', line, count=1).strip()
            if not line:
                # The line held nothing but a marker.
                continue

        collected[current_section].append(line)

    return {
        'summary': ' '.join(collected['summary']) if collected['summary'] else "Summary not available",
        'highlights': collected['highlights'][:10] if collected['highlights'] else ["No highlights extracted"],
        'questions': collected['questions'][:10] if collected['questions'] else ["What is this document about?"],
        'answers': collected['answers'][:10] if collected['answers'] else ["Content being processed"],
    }
|
||||
Loading…
Add table
Reference in a new issue