feat: adds docling

2025-08-02 23:32:48 +05:45 · 2025-08-02 23:32:48 +05:45 · f2e410639a
commit f2e410639a
parent f299cad078
6 changed files with 112 additions and 66 deletions
--- a/servers/fastapi/.python-version
+++ b/servers/fastapi/.python-version
@ -0,0 +1 @@
+3.11
--- a/servers/fastapi/main.py
+++ b/servers/fastapi/main.py
@ -0,0 +1,6 @@
+def main():  # Placeholder entry point: only prints a greeting to stdout.
+    print("Hello from fastapi!")
+
+
+if __name__ == "__main__":  # Call main() only when run as a script, not on import.
+    main()
--- a/servers/fastapi/requirements.txt
+++ b/servers/fastapi/requirements.txt
@ -1,143 +1,168 @@
+accelerate==1.9.0
 aiohappyeyeballs==2.6.1
-aiohttp==3.12.14
+aiohttp==3.12.15
 aiomysql==0.2.0
 aiosignal==1.4.0
 aiosqlite==0.21.0
 annotated-types==0.7.0
 anthropic==0.60.0
 anyio==4.9.0
-argcomplete==3.6.2
 async-timeout==5.0.1
 asyncpg==0.30.0
 attrs==25.3.0
 backoff==2.2.1
 bcrypt==4.3.0
-black==25.1.0
-build==1.2.2.post1
+beautifulsoup4==4.13.4
+build==1.3.0
 cachetools==5.5.2
 certifi==2025.7.14
 cffi==1.17.1
 charset-normalizer==3.4.2
 chromadb==1.0.15
-click==8.2.1
+click==8.2.2
 coloredlogs==15.0.1
 cryptography==45.0.5
+dill==0.4.0
 distro==1.9.0
 dnspython==2.7.0
+docling==2.43.0
+docling-core==2.44.1
+docling-ibm-models==3.9.0
+docling-parse==4.1.0
 durationpy==0.10
+easyocr==1.7.2
 email_validator==2.2.0
+et_xmlfile==2.0.0
 fastapi==0.116.1
 fastapi-cli==0.0.8
-fastapi-cloud-cli==0.1.4
-fastembed==0.7.1
+fastapi-cloud-cli==0.1.5
 filelock==3.18.0
+filetype==1.2.0
 flatbuffers==25.2.10
 frozenlist==1.7.0
 fsspec==2025.7.0
-genson==1.3.0
 google-auth==2.40.3
-google-genai==1.25.0
+google-genai==1.28.0
 googleapis-common-protos==1.70.0
 greenlet==3.2.3
 grpcio==1.74.0
 h11==0.16.0
-h2==4.2.0
 hf-xet==1.1.5
-hpack==4.1.0
 httpcore==1.0.9
 httptools==0.6.4
 httpx==0.28.1
-huggingface-hub==0.34.1
+huggingface-hub==0.34.3
 humanfriendly==10.0
-hyperframe==6.1.0
 idna==3.10
+imageio==2.37.0
 importlib_metadata==8.7.0
 importlib_resources==6.5.2
-inflect==7.5.0
-iniconfig==2.1.0
-isort==6.0.1
 Jinja2==3.1.6
 jiter==0.10.0
+jsonlines==3.1.0
+jsonref==1.1.0
 jsonschema==4.25.0
 jsonschema-specifications==2025.4.1
 kubernetes==33.1.0
-loguru==0.7.3
-lxml==6.0.0
+latex2mathml==3.78.0
+lazy_loader==0.4
+lxml==5.4.0
 markdown-it-py==3.0.0
+marko==2.1.4
 MarkupSafe==3.0.2
 mdurl==0.1.2
-mmh3==5.1.0
-more-itertools==10.7.0
+mmh3==5.2.0
+mpire==2.10.2
 mpmath==1.3.0
 multidict==6.6.3
-mypy_extensions==1.1.0
-numpy==2.3.2
+multiprocess==0.70.18
+networkx==3.5
+ninja==1.11.1.4
+numpy==2.2.6
 oauthlib==3.3.1
 onnxruntime==1.22.1
-openai==1.95.1
-opentelemetry-api==1.35.0
-opentelemetry-exporter-otlp-proto-common==1.35.0
-opentelemetry-exporter-otlp-proto-grpc==1.35.0
-opentelemetry-proto==1.35.0
-opentelemetry-sdk==1.35.0
-opentelemetry-semantic-conventions==0.56b0
+openai==1.98.0
+opencv-python-headless==4.12.0.88
+openpyxl==3.1.5
+opentelemetry-api==1.36.0
+opentelemetry-exporter-otlp-proto-common==1.36.0
+opentelemetry-exporter-otlp-proto-grpc==1.36.0
+opentelemetry-proto==1.36.0
+opentelemetry-sdk==1.36.0
+opentelemetry-semantic-conventions==0.57b0
 orjson==3.11.1
 overrides==7.7.0
 packaging==25.0
-pathspec==0.12.1
+pandas==2.3.1
 pathvalidate==3.3.1
 pdfminer.six==20250506
 pdfplumber==0.11.7
 pillow==11.3.0
-platformdirs==4.3.8
 pluggy==1.6.0
-portalocker==3.2.0
 posthog==5.4.0
 propcache==0.3.2
 protobuf==6.31.1
-py_rust_stemmers==0.1.5
+psutil==7.0.0
 pyasn1==0.6.1
 pyasn1_modules==0.4.2
 pybase64==1.4.2
+pyclipper==1.3.0.post6
 pycparser==2.22
 pydantic==2.11.7
+pydantic-settings==2.10.1
 pydantic_core==2.33.2
 Pygments==2.19.2
-pypdfium2==4.30.1
+pylatexenc==2.10
+PyMySQL==1.1.1
+pypdfium2==4.30.0
 PyPika==0.48.9
 pyproject_hooks==1.2.0
-pytest==8.4.1
+python-bidi==0.6.6
 python-dateutil==2.9.0.post0
 python-docx==1.2.0
 python-dotenv==1.1.1
 python-multipart==0.0.20
 python-pptx==1.0.2
+pytz==2025.2
 PyYAML==6.0.2
 redis==6.2.0
 referencing==0.36.2
+regex==2025.7.34
 requests==2.32.4
 requests-oauthlib==2.0.0
-rich==14.0.0
-rich-toolkit==0.14.8
-rignore==0.6.2
+rich==14.1.0
+rich-toolkit==0.14.9
+rignore==0.6.4
 rpds-py==0.26.0
 rsa==4.9.1
-sentry-sdk==2.32.0
+rtree==1.4.0
+safetensors==0.5.3
+scikit-image==0.25.2
+scipy==1.16.1
+semchunk==2.2.2
+sentry-sdk==2.34.1
+shapely==2.1.1
 shellingham==1.5.4
 six==1.17.0
 sniffio==1.3.1
-SQLAlchemy==2.0.41
+soupsieve==2.7
+SQLAlchemy==2.0.42
 sqlmodel==0.0.24
-starlette==0.47.1
+starlette==0.47.2
 sympy==1.14.0
+tabulate==0.9.0
 tenacity==8.5.0
-tokenizers==0.21.2
-tomli==2.2.1
+tifffile==2025.6.11
+tokenizers==0.21.4
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.7.1+cpu
+torchvision==0.22.1+cpu
 tqdm==4.67.1
-typeguard==4.4.4
+transformers==4.54.1
 typer==0.16.0
 typing-inspection==0.4.1
 typing_extensions==4.14.1
+tzdata==2025.2
 urllib3==2.5.0
 uvicorn==0.35.0
 uvloop==0.21.0
--- a/servers/fastapi/services/docling_service.py
+++ b/servers/fastapi/services/docling_service.py
@ -0,0 +1,27 @@
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.base_models import InputFormat
+
+
+class DoclingService:
+    """Convert documents to Markdown with a shared docling DocumentConverter."""
+
+    def __init__(self):
+        # PDF pipeline options with OCR turned off.
+        self.pipeline_options = PdfPipelineOptions()
+        self.pipeline_options.do_ocr = False
+
+        # Only PDF is overridden. PdfFormatOption selects the PDF pipeline and
+        # backend, so registering it for DOCX/PPTX would parse Office files as
+        # PDFs; omitted formats fall back to docling's per-format defaults.
+        self.converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=self.pipeline_options,
+                ),
+            }
+        )
+
+    def parse_to_markdown(self, file_path: str) -> str:
+        """Convert the document at ``file_path`` and return it as Markdown."""
+        return self.converter.convert(file_path).document.export_to_markdown()
--- a/servers/fastapi/services/documents_loader.py
+++ b/servers/fastapi/services/documents_loader.py
@ -1,9 +1,8 @@
 import mimetypes
 from fastapi import HTTPException
-import os, pdfplumber, asyncio
+import os, asyncio
 from typing import List, Tuple
-from docx import Document
-from pptx import Presentation
+import pdfplumber

 from constants.documents import (
    PDF_MIME_TYPES,
@ -11,6 +10,7 @@ from constants.documents import (
    TEXT_MIME_TYPES,
    WORD_TYPES,
 )
+from services.docling_service import DoclingService


 class DocumentsLoader:
@ -18,6 +18,8 @@ class DocumentsLoader:
    def __init__(self, file_paths: List[str]):
        self._file_paths = file_paths

+        self.docling_service = DoclingService()
+
        self._documents: List[str] = []
        self._images: List[List[str]] = []

@ -76,9 +78,7 @@ class DocumentsLoader:
        document: str = ""

        if load_text:
-            with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages:
-                    document += await asyncio.to_thread(page.extract_text)
+            document = self.docling_service.parse_to_markdown(file_path)

        if load_images:
            image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
@ -90,23 +90,10 @@ class DocumentsLoader:
            return await asyncio.to_thread(file.read)

    def load_msword(self, file_path: str) -> str:
-        document = Document(file_path)
-        text = "\n".join([paragraph.text for paragraph in document.paragraphs])
-        return text
+        return self.docling_service.parse_to_markdown(file_path)

    def load_powerpoint(self, file_path: str) -> str:
-        presentation = Presentation(file_path)
-
-        extracted_text = ""
-        for index, slide in enumerate(presentation.slides):
-            extracted_text += f"# Slide {index + 1}\n"
-            for shape in slide.shapes:
-                if shape.has_text_frame:
-                    for paragraph in shape.text_frame.paragraphs:
-                        extracted_text += f"{paragraph.text}\n"
-                    extracted_text += "\n"
-            extracted_text += "\n\n"
-        return extracted_text
+        return self.docling_service.parse_to_markdown(file_path)

    def get_page_images_from_pdf(self, file_path: str, temp_dir: str):
        with pdfplumber.open(file_path) as pdf:
--- a/servers/nextjs/app/(presentation-generator)/documents-preview/components/DocumentPreviewPage.tsx
+++ b/servers/nextjs/app/(presentation-generator)/documents-preview/components/DocumentPreviewPage.tsx
@ -207,7 +207,7 @@ const DocumentsPreviewPage: React.FC = () => {

    return (
      <div className={`border-r border-gray-200 fixed xl:relative w-full z-50 xl:z-auto
-        transition-all duration-300 ease-in-out max-w-[200px] md:max-w-[300px] h-[85vh] rounded-md p-5`}>
+        transition-all duration-300 bg-white ease-in-out max-w-[200px] md:max-w-[300px] h-[85vh] rounded-md p-5`}>
        <X
          onClick={() => setIsOpen(false)}
          className="text-black mb-4 ml-auto mr-0 cursor-pointer hover:text-gray-600"