ppt-tool/backend/services/documents_loader.py
Vadym Samoilenko e8295d6e71 Phase 4: Fix critical bugs, improve document parsing, add vision OCR
- Fix SSE stream 500: use async_session_maker inside StreamingResponse generator
  (Depends session closes when endpoint returns, before streaming starts)
- Fix template application: store template_name in prepare endpoint so worker
  uses the selected custom template instead of defaulting to "general"
- Fix OverlayLoader: replace loading.gif with HamsterLoader component
- Fix parse_mode default: change from "slides" to "layouts" to avoid 70+ layouts
- Update Gemini Flash model to gemini-3.1-flash-image-preview
- Improve DOCX parsing: python-docx for structured table extraction, OCR enabled
- Add vision-based image text extraction via Gemini for uploaded images
- Add LayoutParser integration for slide layout structure analysis
- Add Phase 4 MVP features: transfer ownership, URL input, follow-up questions,
  attachment-to-slide mapping, content router

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 14:07:00 +00:00

116 lines
3.5 KiB
Python

import mimetypes
from fastapi import HTTPException
import os, asyncio
from typing import List, Optional, Tuple
import pdfplumber
from constants.documents import (
PDF_MIME_TYPES,
POWERPOINT_TYPES,
TEXT_MIME_TYPES,
WORD_TYPES,
)
from services.docling_service import DoclingService
class DocumentsLoader:
def __init__(self, file_paths: List[str]):
self._file_paths = file_paths
self.docling_service = DoclingService()
self._documents: List[str] = []
self._images: List[List[str]] = []
@property
def documents(self):
return self._documents
@property
def images(self):
return self._images
async def load_documents(
self,
temp_dir: Optional[str] = None,
load_text: bool = True,
load_images: bool = False,
):
"""If load_images is True, temp_dir must be provided"""
documents: List[str] = []
images: List[str] = []
for file_path in self._file_paths:
if not os.path.exists(file_path):
raise HTTPException(
status_code=404, detail=f"File {file_path} not found"
)
document = ""
imgs = []
mime_type = mimetypes.guess_type(file_path)[0]
if mime_type in PDF_MIME_TYPES:
document, imgs = await self.load_pdf(
file_path, load_text, load_images, temp_dir
)
elif mime_type in TEXT_MIME_TYPES:
document = await self.load_text(file_path)
elif mime_type in POWERPOINT_TYPES:
document = self.load_powerpoint(file_path)
elif mime_type in WORD_TYPES:
document = self.load_msword(file_path)
documents.append(document)
images.append(imgs)
self._documents = documents
self._images = images
async def load_pdf(
self,
file_path: str,
load_text: bool,
load_images: bool,
temp_dir: Optional[str] = None,
) -> Tuple[str, List[str]]:
image_paths = []
document: str = ""
if load_text:
document = self.docling_service.parse_to_markdown(file_path)
if load_images:
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
return document, image_paths
async def load_text(self, file_path: str) -> str:
with open(file_path, "r") as file:
return await asyncio.to_thread(file.read)
def load_msword(self, file_path: str) -> str:
"""Parse DOCX with python-docx for better table/structure handling."""
return self.docling_service.parse_docx_structured(file_path)
def load_powerpoint(self, file_path: str) -> str:
return self.docling_service.parse_to_markdown(file_path)
@classmethod
def get_page_images_from_pdf(cls, file_path: str, temp_dir: str) -> List[str]:
with pdfplumber.open(file_path) as pdf:
images = []
for page in pdf.pages:
img = page.to_image(resolution=150)
image_path = os.path.join(temp_dir, f"page_{page.page_number}.png")
img.save(image_path)
images.append(image_path)
return images
@classmethod
async def get_page_images_from_pdf_async(cls, file_path: str, temp_dir: str):
return await asyncio.to_thread(
cls.get_page_images_from_pdf, file_path, temp_dir
)