feat: adds docling
This commit is contained in:
parent
f299cad078
commit
f2e410639a
6 changed files with 112 additions and 66 deletions
1
servers/fastapi/.python-version
Normal file
1
servers/fastapi/.python-version
Normal file
|
|
@ -0,0 +1 @@
|
|||
3.11
|
||||
6
servers/fastapi/main.py
Normal file
6
servers/fastapi/main.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
def main():
    """Print the placeholder greeting for the FastAPI server package."""
    greeting = "Hello from fastapi!"
    print(greeting)


# Only run the greeting when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,143 +1,168 @@
|
|||
accelerate==1.9.0
|
||||
aiohappyeyeballs==2.6.1
|
||||
aiohttp==3.12.14
|
||||
aiohttp==3.12.15
|
||||
aiomysql==0.2.0
|
||||
aiosignal==1.4.0
|
||||
aiosqlite==0.21.0
|
||||
annotated-types==0.7.0
|
||||
anthropic==0.60.0
|
||||
anyio==4.9.0
|
||||
argcomplete==3.6.2
|
||||
async-timeout==5.0.1
|
||||
asyncpg==0.30.0
|
||||
attrs==25.3.0
|
||||
backoff==2.2.1
|
||||
bcrypt==4.3.0
|
||||
black==25.1.0
|
||||
build==1.2.2.post1
|
||||
beautifulsoup4==4.13.4
|
||||
build==1.3.0
|
||||
cachetools==5.5.2
|
||||
certifi==2025.7.14
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.4.2
|
||||
chromadb==1.0.15
|
||||
click==8.2.1
|
||||
click==8.2.2
|
||||
coloredlogs==15.0.1
|
||||
cryptography==45.0.5
|
||||
dill==0.4.0
|
||||
distro==1.9.0
|
||||
dnspython==2.7.0
|
||||
docling==2.43.0
|
||||
docling-core==2.44.1
|
||||
docling-ibm-models==3.9.0
|
||||
docling-parse==4.1.0
|
||||
durationpy==0.10
|
||||
easyocr==1.7.2
|
||||
email_validator==2.2.0
|
||||
et_xmlfile==2.0.0
|
||||
fastapi==0.116.1
|
||||
fastapi-cli==0.0.8
|
||||
fastapi-cloud-cli==0.1.4
|
||||
fastembed==0.7.1
|
||||
fastapi-cloud-cli==0.1.5
|
||||
filelock==3.18.0
|
||||
filetype==1.2.0
|
||||
flatbuffers==25.2.10
|
||||
frozenlist==1.7.0
|
||||
fsspec==2025.7.0
|
||||
genson==1.3.0
|
||||
google-auth==2.40.3
|
||||
google-genai==1.25.0
|
||||
google-genai==1.28.0
|
||||
googleapis-common-protos==1.70.0
|
||||
greenlet==3.2.3
|
||||
grpcio==1.74.0
|
||||
h11==0.16.0
|
||||
h2==4.2.0
|
||||
hf-xet==1.1.5
|
||||
hpack==4.1.0
|
||||
httpcore==1.0.9
|
||||
httptools==0.6.4
|
||||
httpx==0.28.1
|
||||
huggingface-hub==0.34.1
|
||||
huggingface-hub==0.34.3
|
||||
humanfriendly==10.0
|
||||
hyperframe==6.1.0
|
||||
idna==3.10
|
||||
imageio==2.37.0
|
||||
importlib_metadata==8.7.0
|
||||
importlib_resources==6.5.2
|
||||
inflect==7.5.0
|
||||
iniconfig==2.1.0
|
||||
isort==6.0.1
|
||||
Jinja2==3.1.6
|
||||
jiter==0.10.0
|
||||
jsonlines==3.1.0
|
||||
jsonref==1.1.0
|
||||
jsonschema==4.25.0
|
||||
jsonschema-specifications==2025.4.1
|
||||
kubernetes==33.1.0
|
||||
loguru==0.7.3
|
||||
lxml==6.0.0
|
||||
latex2mathml==3.78.0
|
||||
lazy_loader==0.4
|
||||
lxml==5.4.0
|
||||
markdown-it-py==3.0.0
|
||||
marko==2.1.4
|
||||
MarkupSafe==3.0.2
|
||||
mdurl==0.1.2
|
||||
mmh3==5.1.0
|
||||
more-itertools==10.7.0
|
||||
mmh3==5.2.0
|
||||
mpire==2.10.2
|
||||
mpmath==1.3.0
|
||||
multidict==6.6.3
|
||||
mypy_extensions==1.1.0
|
||||
numpy==2.3.2
|
||||
multiprocess==0.70.18
|
||||
networkx==3.5
|
||||
ninja==1.11.1.4
|
||||
numpy==2.2.6
|
||||
oauthlib==3.3.1
|
||||
onnxruntime==1.22.1
|
||||
openai==1.95.1
|
||||
opentelemetry-api==1.35.0
|
||||
opentelemetry-exporter-otlp-proto-common==1.35.0
|
||||
opentelemetry-exporter-otlp-proto-grpc==1.35.0
|
||||
opentelemetry-proto==1.35.0
|
||||
opentelemetry-sdk==1.35.0
|
||||
opentelemetry-semantic-conventions==0.56b0
|
||||
openai==1.98.0
|
||||
opencv-python-headless==4.12.0.88
|
||||
openpyxl==3.1.5
|
||||
opentelemetry-api==1.36.0
|
||||
opentelemetry-exporter-otlp-proto-common==1.36.0
|
||||
opentelemetry-exporter-otlp-proto-grpc==1.36.0
|
||||
opentelemetry-proto==1.36.0
|
||||
opentelemetry-sdk==1.36.0
|
||||
opentelemetry-semantic-conventions==0.57b0
|
||||
orjson==3.11.1
|
||||
overrides==7.7.0
|
||||
packaging==25.0
|
||||
pathspec==0.12.1
|
||||
pandas==2.3.1
|
||||
pathvalidate==3.3.1
|
||||
pdfminer.six==20250506
|
||||
pdfplumber==0.11.7
|
||||
pillow==11.3.0
|
||||
platformdirs==4.3.8
|
||||
pluggy==1.6.0
|
||||
portalocker==3.2.0
|
||||
posthog==5.4.0
|
||||
propcache==0.3.2
|
||||
protobuf==6.31.1
|
||||
py_rust_stemmers==0.1.5
|
||||
psutil==7.0.0
|
||||
pyasn1==0.6.1
|
||||
pyasn1_modules==0.4.2
|
||||
pybase64==1.4.2
|
||||
pyclipper==1.3.0.post6
|
||||
pycparser==2.22
|
||||
pydantic==2.11.7
|
||||
pydantic-settings==2.10.1
|
||||
pydantic_core==2.33.2
|
||||
Pygments==2.19.2
|
||||
pypdfium2==4.30.1
|
||||
pylatexenc==2.10
|
||||
PyMySQL==1.1.1
|
||||
pypdfium2==4.30.0
|
||||
PyPika==0.48.9
|
||||
pyproject_hooks==1.2.0
|
||||
pytest==8.4.1
|
||||
python-bidi==0.6.6
|
||||
python-dateutil==2.9.0.post0
|
||||
python-docx==1.2.0
|
||||
python-dotenv==1.1.1
|
||||
python-multipart==0.0.20
|
||||
python-pptx==1.0.2
|
||||
pytz==2025.2
|
||||
PyYAML==6.0.2
|
||||
redis==6.2.0
|
||||
referencing==0.36.2
|
||||
regex==2025.7.34
|
||||
requests==2.32.4
|
||||
requests-oauthlib==2.0.0
|
||||
rich==14.0.0
|
||||
rich-toolkit==0.14.8
|
||||
rignore==0.6.2
|
||||
rich==14.1.0
|
||||
rich-toolkit==0.14.9
|
||||
rignore==0.6.4
|
||||
rpds-py==0.26.0
|
||||
rsa==4.9.1
|
||||
sentry-sdk==2.32.0
|
||||
rtree==1.4.0
|
||||
safetensors==0.5.3
|
||||
scikit-image==0.25.2
|
||||
scipy==1.16.1
|
||||
semchunk==2.2.2
|
||||
sentry-sdk==2.34.1
|
||||
shapely==2.1.1
|
||||
shellingham==1.5.4
|
||||
six==1.17.0
|
||||
sniffio==1.3.1
|
||||
SQLAlchemy==2.0.41
|
||||
soupsieve==2.7
|
||||
SQLAlchemy==2.0.42
|
||||
sqlmodel==0.0.24
|
||||
starlette==0.47.1
|
||||
starlette==0.47.2
|
||||
sympy==1.14.0
|
||||
tabulate==0.9.0
|
||||
tenacity==8.5.0
|
||||
tokenizers==0.21.2
|
||||
tomli==2.2.1
|
||||
tifffile==2025.6.11
|
||||
tokenizers==0.21.4
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.7.1+cpu
|
||||
torchvision==0.22.1+cpu
|
||||
tqdm==4.67.1
|
||||
typeguard==4.4.4
|
||||
transformers==4.54.1
|
||||
typer==0.16.0
|
||||
typing-inspection==0.4.1
|
||||
typing_extensions==4.14.1
|
||||
tzdata==2025.2
|
||||
urllib3==2.5.0
|
||||
uvicorn==0.35.0
|
||||
uvloop==0.21.0
|
||||
|
|
|
|||
27
servers/fastapi/services/docling_service.py
Normal file
27
servers/fastapi/services/docling_service.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
|
||||
|
||||
class DoclingService:
    """Convert documents to Markdown through a docling ``DocumentConverter``.

    The converter is built once in ``__init__`` and reused by every call to
    ``parse_to_markdown``.
    """

    def __init__(self):
        # PDF pipeline settings: OCR is switched off.
        self.pipeline_options = PdfPipelineOptions()
        self.pipeline_options.do_ocr = False

        # Only PDF receives an explicit format option. ``PdfFormatOption``
        # selects the PDF pipeline and PDF backend, which cannot open Word or
        # PowerPoint files, so registering it for DOCX/PPTX breaks those
        # conversions. Formats not listed here fall back to the converter's
        # own per-format defaults.
        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
            }
        )

    def parse_to_markdown(self, file_path: str) -> str:
        """Convert the document at ``file_path`` and return it as Markdown.

        Args:
            file_path: Path of the document to convert.

        Returns:
            The Markdown export of the converted document.

        Note:
            The conversion runs synchronously; any exception raised by the
            converter propagates to the caller.
        """
        result = self.converter.convert(file_path)
        return result.document.export_to_markdown()
|
||||
|
|
@ -1,9 +1,8 @@
|
|||
import mimetypes
|
||||
from fastapi import HTTPException
|
||||
import os, pdfplumber, asyncio
|
||||
import os, asyncio
|
||||
from typing import List, Tuple
|
||||
from docx import Document
|
||||
from pptx import Presentation
|
||||
import pdfplumber
|
||||
|
||||
from constants.documents import (
|
||||
PDF_MIME_TYPES,
|
||||
|
|
@ -11,6 +10,7 @@ from constants.documents import (
|
|||
TEXT_MIME_TYPES,
|
||||
WORD_TYPES,
|
||||
)
|
||||
from services.docling_service import DoclingService
|
||||
|
||||
|
||||
class DocumentsLoader:
|
||||
|
|
@ -18,6 +18,8 @@ class DocumentsLoader:
|
|||
def __init__(self, file_paths: List[str]):
|
||||
self._file_paths = file_paths
|
||||
|
||||
self.docling_service = DoclingService()
|
||||
|
||||
self._documents: List[str] = []
|
||||
self._images: List[List[str]] = []
|
||||
|
||||
|
|
@ -76,9 +78,7 @@ class DocumentsLoader:
|
|||
document: str = ""
|
||||
|
||||
if load_text:
|
||||
with pdfplumber.open(file_path) as pdf:
|
||||
for page in pdf.pages:
|
||||
document += await asyncio.to_thread(page.extract_text)
|
||||
document = self.docling_service.parse_to_markdown(file_path)
|
||||
|
||||
if load_images:
|
||||
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
|
||||
|
|
@ -90,23 +90,10 @@ class DocumentsLoader:
|
|||
return await asyncio.to_thread(file.read)
|
||||
|
||||
def load_msword(self, file_path: str) -> str:
|
||||
document = Document(file_path)
|
||||
text = "\n".join([paragraph.text for paragraph in document.paragraphs])
|
||||
return text
|
||||
return self.docling_service.parse_to_markdown(file_path)
|
||||
|
||||
def load_powerpoint(self, file_path: str) -> str:
|
||||
presentation = Presentation(file_path)
|
||||
|
||||
extracted_text = ""
|
||||
for index, slide in enumerate(presentation.slides):
|
||||
extracted_text += f"# Slide {index + 1}\n"
|
||||
for shape in slide.shapes:
|
||||
if shape.has_text_frame:
|
||||
for paragraph in shape.text_frame.paragraphs:
|
||||
extracted_text += f"{paragraph.text}\n"
|
||||
extracted_text += "\n"
|
||||
extracted_text += "\n\n"
|
||||
return extracted_text
|
||||
return self.docling_service.parse_to_markdown(file_path)
|
||||
|
||||
def get_page_images_from_pdf(self, file_path: str, temp_dir: str):
|
||||
with pdfplumber.open(file_path) as pdf:
|
||||
|
|
|
|||
|
|
@ -207,7 +207,7 @@ const DocumentsPreviewPage: React.FC = () => {
|
|||
|
||||
return (
|
||||
<div className={`border-r border-gray-200 fixed xl:relative w-full z-50 xl:z-auto
|
||||
transition-all duration-300 ease-in-out max-w-[200px] md:max-w-[300px] h-[85vh] rounded-md p-5`}>
|
||||
transition-all duration-300 bg-white ease-in-out max-w-[200px] md:max-w-[300px] h-[85vh] rounded-md p-5`}>
|
||||
<X
|
||||
onClick={() => setIsOpen(false)}
|
||||
className="text-black mb-4 ml-auto mr-0 cursor-pointer hover:text-gray-600"
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue