157 lines
5.1 KiB
Python
157 lines
5.1 KiB
Python
import mimetypes
|
|
import sys
|
|
from fastapi import HTTPException
|
|
import os, asyncio
|
|
from typing import List, Optional, Tuple
|
|
|
|
import pdfplumber
|
|
from constants.documents import (
|
|
PDF_MIME_TYPES,
|
|
POWERPOINT_TYPES,
|
|
TEXT_MIME_TYPES,
|
|
WORD_TYPES,
|
|
)
|
|
|
|
# Platform-specific document service imports
|
|
is_windows = sys.platform == 'win32'
|
|
if not is_windows:
|
|
from services.docling_service import DoclingService
|
|
DocumentService = None
|
|
else:
|
|
DoclingService = None
|
|
from services.lightweight_document_service import DocumentService
|
|
|
|
|
|
class DocumentsLoader:
|
|
|
|
def __init__(self, file_paths: List[str]):
|
|
self._file_paths = file_paths
|
|
|
|
# Initialize document service based on platform
|
|
if not is_windows and DoclingService is not None:
|
|
# Use DoclingService on Linux/macOS
|
|
self.docling_service = DoclingService()
|
|
self.document_service = None
|
|
elif is_windows and DocumentService is not None:
|
|
# Use lightweight DocumentService on Windows
|
|
self.docling_service = None
|
|
self.document_service = DocumentService()
|
|
else:
|
|
# Fallback if neither is available
|
|
self.docling_service = None
|
|
self.document_service = None
|
|
|
|
self._documents: List[str] = []
|
|
self._images: List[List[str]] = []
|
|
|
|
@property
|
|
def documents(self):
|
|
return self._documents
|
|
|
|
@property
|
|
def images(self):
|
|
return self._images
|
|
|
|
async def load_documents(
|
|
self,
|
|
temp_dir: Optional[str] = None,
|
|
load_text: bool = True,
|
|
load_images: bool = False,
|
|
):
|
|
"""If load_images is True, temp_dir must be provided"""
|
|
|
|
documents: List[str] = []
|
|
images: List[str] = []
|
|
|
|
for file_path in self._file_paths:
|
|
if not os.path.exists(file_path):
|
|
raise HTTPException(
|
|
status_code=404, detail=f"File {file_path} not found"
|
|
)
|
|
|
|
document = ""
|
|
imgs = []
|
|
|
|
mime_type = mimetypes.guess_type(file_path)[0]
|
|
if mime_type in PDF_MIME_TYPES:
|
|
document, imgs = await self.load_pdf(
|
|
file_path, load_text, load_images, temp_dir
|
|
)
|
|
elif mime_type in TEXT_MIME_TYPES:
|
|
document = await self.load_text(file_path)
|
|
elif mime_type in POWERPOINT_TYPES:
|
|
document = self.load_powerpoint(file_path)
|
|
elif mime_type in WORD_TYPES:
|
|
document = self.load_msword(file_path)
|
|
|
|
documents.append(document)
|
|
images.append(imgs)
|
|
|
|
self._documents = documents
|
|
self._images = images
|
|
|
|
async def load_pdf(
|
|
self,
|
|
file_path: str,
|
|
load_text: bool,
|
|
load_images: bool,
|
|
temp_dir: Optional[str] = None,
|
|
) -> Tuple[str, List[str]]:
|
|
image_paths = []
|
|
document: str = ""
|
|
|
|
if load_text:
|
|
document = await self.load_text_from_pdf_locally(file_path)
|
|
|
|
if load_images:
|
|
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
|
|
|
|
return document, image_paths
|
|
|
|
async def load_text_from_pdf_locally(self, file_path: str) -> str:
|
|
return await asyncio.to_thread(self._extract_text_from_pdf, file_path)
|
|
|
|
@staticmethod
|
|
def _extract_text_from_pdf(file_path: str) -> str:
|
|
texts: List[str] = []
|
|
with pdfplumber.open(file_path) as pdf:
|
|
for idx, page in enumerate(pdf.pages):
|
|
page_text = f"## Page {idx + 1}\n"
|
|
page_text += page.extract_text() or ""
|
|
texts.append(page_text)
|
|
return "\n\n".join(texts)
|
|
|
|
async def load_text(self, file_path: str) -> str:
|
|
with open(file_path, "r", encoding="utf-8") as file:
|
|
return await asyncio.to_thread(file.read)
|
|
|
|
def load_msword(self, file_path: str) -> str:
|
|
if self.docling_service is not None:
|
|
return self.docling_service.parse_to_markdown(file_path)
|
|
elif self.document_service is not None:
|
|
return self.document_service.parse_to_markdown(file_path)
|
|
return "" # Document service not available
|
|
|
|
def load_powerpoint(self, file_path: str) -> str:
|
|
if self.docling_service is not None:
|
|
return self.docling_service.parse_to_markdown(file_path)
|
|
elif self.document_service is not None:
|
|
return self.document_service.parse_to_markdown(file_path)
|
|
return "" # Document service not available
|
|
|
|
@classmethod
|
|
def get_page_images_from_pdf(cls, file_path: str, temp_dir: str) -> List[str]:
|
|
with pdfplumber.open(file_path) as pdf:
|
|
images = []
|
|
for page in pdf.pages:
|
|
img = page.to_image(resolution=150)
|
|
image_path = os.path.join(temp_dir, f"page_{page.page_number}.png")
|
|
img.save(image_path)
|
|
images.append(image_path)
|
|
return images
|
|
|
|
@classmethod
|
|
async def get_page_images_from_pdf_async(cls, file_path: str, temp_dir: str):
|
|
return await asyncio.to_thread(
|
|
cls.get_page_images_from_pdf, file_path, temp_dir
|
|
)
|