Added LLM-based fallback extraction method for file types LlamaExtract doesn't support

This commit is contained in:
michael 2025-10-17 15:36:55 -05:00
parent c70dc59bfc
commit aff4e7154d
3 changed files with 212 additions and 20 deletions

View file

@ -143,6 +143,9 @@ npm run dev
- Images: JPG, PNG, GIF, BMP, TIFF, SVG (with OCR)
- Audio: MP3, WAV, M4A (transcription, 20MB limit)
- Web: HTML
- ✅ **Intelligent Processing** - Automatic format detection
- PDF/DOCX/PPTX: LlamaExtract for structured extraction
- Other formats: Direct LLM analysis (Excel, CSV, images, audio, etc.)
- ✅ **Background Processing** - Non-blocking document analysis
- ✅ **Real-time Status** - Watch processing progress
@ -325,6 +328,8 @@ UPDATE users SET is_admin = true WHERE email = 'your@email.com';
- Web: HTML
4. Click "Upload X File(s)"
5. Wait ~1 minute per file for processing
- PDF/DOCX/PPTX: Processed with LlamaExtract
- Other formats: Processed with direct LLM analysis
6. Expand files to see summaries, highlights, Q&A
**Generate Cross-Document Analysis:**
@ -386,6 +391,9 @@ UPDATE users SET is_admin = true WHERE email = 'your@email.com';
**AI/ML:**
- LlamaCloud (document indexing)
- LlamaIndex (RAG)
- LlamaParse (document parsing - supports 30+ formats)
- LlamaExtract (structured extraction for PDF/DOCX/PPTX)
- Direct LLM extraction (fallback for Excel, CSV, images, audio, etc.)
- OpenAI (GPT-4, GPT-5)
- Anthropic (Claude 3.5, 4.5)
- Google (Gemini 2.0, 2.5 Pro)

View file

@ -206,30 +206,47 @@ async def execute_document_processing_task(task_id: int):
text = "\n\n---\n\n".join([md.text for md in md_content])
extraction_output = await EXTRACT_AGENT.aextract(
files=SourceText(text_content=text, filename=original_filename)
)
# Determine extraction method based on file type
# LlamaExtract only supports: PDF, DOCX, PPTX (and their variants)
llamaextract_supported = ('.pdf', '.docx', '.doc', '.pptx', '.ppt')
file_ext = os.path.splitext(original_filename.lower())[1]
use_llamaextract = file_ext in llamaextract_supported
if extraction_output:
notebook_data = extraction_output.data
# Save summary
create_document_summary(
user_id, document.id,
notebook_data.get('summary', ''),
notebook_data.get('highlights', []),
notebook_data.get('questions', []),
notebook_data.get('answers', []),
text or ''
if use_llamaextract:
# Use LlamaExtract for supported formats (PDF, DOCX, PPTX)
print(f"Using LlamaExtract for {file_ext}")
extraction_output = await EXTRACT_AGENT.aextract(
files=SourceText(text_content=text, filename=original_filename)
)
# Clean up temp file
if os.path.exists(file_path):
os.remove(file_path)
update_task_status(task_id, TaskStatus.COMPLETED, result={'document_id': document.id})
if extraction_output:
notebook_data = extraction_output.data
else:
update_task_status(task_id, TaskStatus.FAILED, error="LlamaExtract failed")
return
else:
update_task_status(task_id, TaskStatus.FAILED, error="Extraction failed")
# Use LLM-based extraction for unsupported formats (Excel, CSV, images, audio, etc.)
print(f"Using LLM extraction fallback for {file_ext}")
from llm_extraction import extract_with_llm
# Use the notebook's configured model type
notebook_data = await extract_with_llm(text, original_filename, notebook.model_type)
# Save summary (same format from both extraction methods)
create_document_summary(
user_id, document.id,
notebook_data.get('summary', ''),
notebook_data.get('highlights', []),
notebook_data.get('questions', []),
notebook_data.get('answers', []),
text or ''
)
# Clean up temp file
if os.path.exists(file_path):
os.remove(file_path)
update_task_status(task_id, TaskStatus.COMPLETED, result={'document_id': document.id})
except Exception as e:
error_msg = f"Processing failed: {str(e)[:300]}"

View file

@ -0,0 +1,167 @@
"""
LLM-based document extraction fallback for file types not supported by LlamaExtract.
Generates the same structured output: summary, highlights, questions, answers.
"""
from typing import Dict, List
from llm_factory import get_llm_by_type
async def extract_with_llm(text_content: str, filename: str, model_type: str = 'gpt4o') -> Dict:
    """
    Extract structured information from document text using an LLM.

    This is the fallback path for file formats LlamaExtract does not
    support. It builds a single prompt asking for a summary, highlights,
    questions and answers, sends it to the configured LLM, and parses the
    plain-text reply with ``parse_llm_extraction_response``.

    Args:
        text_content: Document text (the prompt presents it to the model as
            markdown). ``None`` or an empty string is treated as an empty
            document instead of raising. Only the first 15000 characters
            are included in the prompt; anything beyond that is not seen
            by the model.
        filename: Original filename, used as the document title in the prompt.
        model_type: Key passed to ``get_llm_by_type`` to select the LLM.

    Returns:
        Dict with the same keys the caller reads from LlamaExtract output:
        {
            'summary': str,
            'highlights': List[str],
            'questions': List[str],
            'answers': List[str]
        }
        If the LLM call or the parsing raises, the error is printed and a
        placeholder dict (first 500 characters of the text as the summary)
        is returned instead of propagating the exception.
    """
    # Normalise a missing body so the slices below cannot raise TypeError
    # before we even reach the guarded LLM call.
    text_content = text_content or ''

    # Get appropriate LLM. This is intentionally outside the try block: an
    # unknown model type surfaces to the caller rather than being masked by
    # the placeholder result.
    llm = get_llm_by_type(model_type)

    # Create comprehensive prompt for extraction. The document body is capped
    # at 15000 characters to bound the prompt size.
    prompt = f"""You are analyzing a document titled "{filename}".
Below is the document content in markdown format:
{text_content[:15000]}
Please analyze this document and provide:
1. A comprehensive summary (2-3 paragraphs) covering the main points and key takeaways.
2. A list of 5-8 key highlights or important points from the document.
3. A list of 3-5 important questions that this document answers.
4. The corresponding answers to those questions based on the document content.
Format your response EXACTLY as follows (use this exact structure):
SUMMARY:
[Your 2-3 paragraph summary here]
HIGHLIGHTS:
- [Highlight 1]
- [Highlight 2]
- [Highlight 3]
(continue for 5-8 highlights)
QUESTIONS:
1. [Question 1]
2. [Question 2]
3. [Question 3]
(continue for 3-5 questions)
ANSWERS:
1. [Answer to question 1]
2. [Answer to question 2]
3. [Answer to question 3]
(continue matching the questions)
"""

    try:
        # Get response from LLM and parse the structured reply
        response = await llm.acomplete(prompt)
        return parse_llm_extraction_response(response.text)
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back a minimal
        # structure so the caller still has something to store.
        print(f"Error in LLM extraction: {e}")
        return {
            'summary': f"Document: {filename}\n\n{text_content[:500]}...",
            'highlights': ["Content extraction in progress"],
            'questions': ["What is this document about?"],
            'answers': ["This document is being processed."]
        }
def parse_llm_extraction_response(response_text: str) -> Dict:
    """
    Parse the LLM response into structured format matching LlamaExtract output.

    The response is scanned line by line. A line that starts with one of the
    section names SUMMARY, HIGHLIGHTS, QUESTIONS or ANSWERS followed by a
    colon (case-insensitive, optionally wrapped in markdown ``#`` / ``**`` /
    ``__`` decoration) switches the current section; any text after the colon
    on that same line is kept as content of the new section. Lines that
    appear before the first header are ignored.

    Summary lines are joined with single spaces. For the three list sections
    a single leading list marker is removed from each line: a bullet
    (``-``, ``•``, ``*``) and/or a number followed by ``.`` or ``)``. Only a
    real marker is removed, so content that starts with digits
    (``2023 revenue``, ``3.5 million``, ``-5%``) is preserved and two-digit
    numbering (``10.``) is stripped completely.

    Args:
        response_text: Raw LLM response. ``None`` or an empty string yields
            the default placeholders.

    Returns:
        Dict with keys 'summary' (str) and 'highlights', 'questions',
        'answers' (lists of str, each capped at 10 entries). Sections with
        no content fall back to fixed placeholder values.
    """
    # Function-scope import keeps this block self-contained; ``re`` caches
    # compiled patterns, so repeated calls stay cheap.
    import re

    # Section header: optional markdown decoration, the section name, a
    # mandatory colon, then whatever content shares the header line.
    header_pattern = re.compile(
        r'^(?:#{1,6}\s*)?(?:\*\*|__)?\s*'
        r'(SUMMARY|HIGHLIGHTS|QUESTIONS|ANSWERS)'
        r'\s*(?:\*\*|__)?\s*:\s*(?:\*\*|__)?\s*(.*)$',
        re.IGNORECASE,
    )
    # Leading list marker: an optional bullet run and/or an optional
    # "<digits>." / "<digits>)" prefix. The (?!\d) look-aheads stop the
    # pattern from eating a negative number ("-5%") or a decimal ("3.5").
    marker_pattern = re.compile(r'^(?:[-•*]+(?!\d)\s*)?(?:\d+[.)](?!\d)\s*)?')

    collected = {
        'summary': [],
        'highlights': [],
        'questions': [],
        'answers': []
    }
    current_section = None

    for raw_line in (response_text or '').splitlines():
        line = raw_line.strip()

        # Detect section headers; keep any content that follows the colon
        header = header_pattern.match(line)
        if header:
            current_section = header.group(1).lower()
            line = header.group(2).strip()

        # Skip empty lines and anything before the first header
        if not line or current_section is None:
            continue

        if current_section == 'summary':
            collected['summary'].append(line)
            continue

        # List sections: drop the bullet/number marker, keep the content
        item = marker_pattern.sub('', line, count=1).strip()
        if item:
            collected[current_section].append(item)

    # Assemble results, substituting placeholders for empty sections
    summary_lines = collected['summary']
    return {
        'summary': ' '.join(summary_lines) if summary_lines else "Summary not available",
        'highlights': collected['highlights'][:10] or ["No highlights extracted"],
        'questions': collected['questions'][:10] or ["What is this document about?"],
        'answers': collected['answers'][:10] or ["Content being processed"]
    }