Fix document processing: fall back to LLM when LlamaExtract returns data=None

For certain PDFs, LlamaExtract can return a non-None response object whose
data is None, which caused "'NoneType' object has no attribute 'get'" when
notebook_data was accessed. The task now falls back to LLM extraction instead
of failing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-05-05 10:46:23 +01:00
parent f39ddb269f
commit 974a66288e

View file

@@ -621,13 +621,13 @@ async def execute_document_processing_task(task_id: int):
)
extract_duration = time.time() - extract_start
if extraction_output:
if extraction_output and extraction_output.data:
notebook_data = extraction_output.data
logger.info(f" ✓ [LLAMAEXTRACT] aextract → Success ({extract_duration:.1f}s)")
else:
logger.error(f" [LLAMAEXTRACT] aextract → No data returned ({extract_duration:.1f}s)")
update_task_status(task_id, TaskStatus.FAILED, error="LlamaExtract failed")
return
logger.warning(f" [LLAMAEXTRACT] aextract → No data returned ({extract_duration:.1f}s), falling back to LLM extraction")
from llm_extraction import extract_with_llm
notebook_data = await extract_with_llm(text, original_filename, notebook.model_type)
except (httpx.RemoteProtocolError, httpx.ReadTimeout, httpx.ConnectError) as e:
# Network errors during extraction - provide helpful error message
logger.error(f"✗ Network error during extraction: {e}")