feat: adds docling

This commit is contained in:
sauravniraula 2025-08-02 23:32:48 +05:45
parent f299cad078
commit f2e410639a
No known key found for this signature in database
GPG key ID: 60FCC1B5A5E83326
6 changed files with 112 additions and 66 deletions

View file

@ -0,0 +1 @@
3.11

6
servers/fastapi/main.py Normal file
View file

@ -0,0 +1,6 @@
def main():
    """Print the placeholder greeting for the FastAPI server package."""
    greeting = "Hello from fastapi!"
    print(greeting)


# Run the greeting only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View file

@ -1,143 +1,168 @@
accelerate==1.9.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.14
aiohttp==3.12.15
aiomysql==0.2.0
aiosignal==1.4.0
aiosqlite==0.21.0
annotated-types==0.7.0
anthropic==0.60.0
anyio==4.9.0
argcomplete==3.6.2
async-timeout==5.0.1
asyncpg==0.30.0
attrs==25.3.0
backoff==2.2.1
bcrypt==4.3.0
black==25.1.0
build==1.2.2.post1
beautifulsoup4==4.13.4
build==1.3.0
cachetools==5.5.2
certifi==2025.7.14
cffi==1.17.1
charset-normalizer==3.4.2
chromadb==1.0.15
click==8.2.1
click==8.2.2
coloredlogs==15.0.1
cryptography==45.0.5
dill==0.4.0
distro==1.9.0
dnspython==2.7.0
docling==2.43.0
docling-core==2.44.1
docling-ibm-models==3.9.0
docling-parse==4.1.0
durationpy==0.10
easyocr==1.7.2
email_validator==2.2.0
et_xmlfile==2.0.0
fastapi==0.116.1
fastapi-cli==0.0.8
fastapi-cloud-cli==0.1.4
fastembed==0.7.1
fastapi-cloud-cli==0.1.5
filelock==3.18.0
filetype==1.2.0
flatbuffers==25.2.10
frozenlist==1.7.0
fsspec==2025.7.0
genson==1.3.0
google-auth==2.40.3
google-genai==1.25.0
google-genai==1.28.0
googleapis-common-protos==1.70.0
greenlet==3.2.3
grpcio==1.74.0
h11==0.16.0
h2==4.2.0
hf-xet==1.1.5
hpack==4.1.0
httpcore==1.0.9
httptools==0.6.4
httpx==0.28.1
huggingface-hub==0.34.1
huggingface-hub==0.34.3
humanfriendly==10.0
hyperframe==6.1.0
idna==3.10
imageio==2.37.0
importlib_metadata==8.7.0
importlib_resources==6.5.2
inflect==7.5.0
iniconfig==2.1.0
isort==6.0.1
Jinja2==3.1.6
jiter==0.10.0
jsonlines==3.1.0
jsonref==1.1.0
jsonschema==4.25.0
jsonschema-specifications==2025.4.1
kubernetes==33.1.0
loguru==0.7.3
lxml==6.0.0
latex2mathml==3.78.0
lazy_loader==0.4
lxml==5.4.0
markdown-it-py==3.0.0
marko==2.1.4
MarkupSafe==3.0.2
mdurl==0.1.2
mmh3==5.1.0
more-itertools==10.7.0
mmh3==5.2.0
mpire==2.10.2
mpmath==1.3.0
multidict==6.6.3
mypy_extensions==1.1.0
numpy==2.3.2
multiprocess==0.70.18
networkx==3.5
ninja==1.11.1.4
numpy==2.2.6
oauthlib==3.3.1
onnxruntime==1.22.1
openai==1.95.1
opentelemetry-api==1.35.0
opentelemetry-exporter-otlp-proto-common==1.35.0
opentelemetry-exporter-otlp-proto-grpc==1.35.0
opentelemetry-proto==1.35.0
opentelemetry-sdk==1.35.0
opentelemetry-semantic-conventions==0.56b0
openai==1.98.0
opencv-python-headless==4.12.0.88
openpyxl==3.1.5
opentelemetry-api==1.36.0
opentelemetry-exporter-otlp-proto-common==1.36.0
opentelemetry-exporter-otlp-proto-grpc==1.36.0
opentelemetry-proto==1.36.0
opentelemetry-sdk==1.36.0
opentelemetry-semantic-conventions==0.57b0
orjson==3.11.1
overrides==7.7.0
packaging==25.0
pathspec==0.12.1
pandas==2.3.1
pathvalidate==3.3.1
pdfminer.six==20250506
pdfplumber==0.11.7
pillow==11.3.0
platformdirs==4.3.8
pluggy==1.6.0
portalocker==3.2.0
posthog==5.4.0
propcache==0.3.2
protobuf==6.31.1
py_rust_stemmers==0.1.5
psutil==7.0.0
pyasn1==0.6.1
pyasn1_modules==0.4.2
pybase64==1.4.2
pyclipper==1.3.0.post6
pycparser==2.22
pydantic==2.11.7
pydantic-settings==2.10.1
pydantic_core==2.33.2
Pygments==2.19.2
pypdfium2==4.30.1
pylatexenc==2.10
PyMySQL==1.1.1
pypdfium2==4.30.0
PyPika==0.48.9
pyproject_hooks==1.2.0
pytest==8.4.1
python-bidi==0.6.6
python-dateutil==2.9.0.post0
python-docx==1.2.0
python-dotenv==1.1.1
python-multipart==0.0.20
python-pptx==1.0.2
pytz==2025.2
PyYAML==6.0.2
redis==6.2.0
referencing==0.36.2
regex==2025.7.34
requests==2.32.4
requests-oauthlib==2.0.0
rich==14.0.0
rich-toolkit==0.14.8
rignore==0.6.2
rich==14.1.0
rich-toolkit==0.14.9
rignore==0.6.4
rpds-py==0.26.0
rsa==4.9.1
sentry-sdk==2.32.0
rtree==1.4.0
safetensors==0.5.3
scikit-image==0.25.2
scipy==1.16.1
semchunk==2.2.2
sentry-sdk==2.34.1
shapely==2.1.1
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
SQLAlchemy==2.0.41
soupsieve==2.7
SQLAlchemy==2.0.42
sqlmodel==0.0.24
starlette==0.47.1
starlette==0.47.2
sympy==1.14.0
tabulate==0.9.0
tenacity==8.5.0
tokenizers==0.21.2
tomli==2.2.1
tifffile==2025.6.11
tokenizers==0.21.4
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.7.1+cpu
torchvision==0.22.1+cpu
tqdm==4.67.1
typeguard==4.4.4
transformers==4.54.1
typer==0.16.0
typing-inspection==0.4.1
typing_extensions==4.14.1
tzdata==2025.2
urllib3==2.5.0
uvicorn==0.35.0
uvloop==0.21.0

View file

@ -0,0 +1,27 @@
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
class DoclingService:
    """Convert documents to Markdown through a shared docling converter.

    The converter is built once in ``__init__`` and reused by every
    ``parse_to_markdown`` call.
    """

    def __init__(self):
        # PDF pipeline settings; OCR is switched off explicitly.
        self.pipeline_options = PdfPipelineOptions()
        self.pipeline_options.do_ocr = False

        # PdfFormatOption bundles the PDF pipeline and PDF backend, so it is
        # registered for InputFormat.PDF only. Registering it for DOCX/PPTX
        # would route those files through the PDF backend; leaving them out
        # lets DocumentConverter fall back to its own per-format defaults.
        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
            }
        )

    def parse_to_markdown(self, file_path: str) -> str:
        """Convert the file at ``file_path`` and return it as Markdown.

        Args:
            file_path: Path of the document handed to the converter.

        Returns:
            The Markdown export of the converted document.
        """
        result = self.converter.convert(file_path)
        return result.document.export_to_markdown()

View file

@ -1,9 +1,8 @@
import mimetypes
from fastapi import HTTPException
import os, pdfplumber, asyncio
import os, asyncio
from typing import List, Tuple
from docx import Document
from pptx import Presentation
import pdfplumber
from constants.documents import (
PDF_MIME_TYPES,
@ -11,6 +10,7 @@ from constants.documents import (
TEXT_MIME_TYPES,
WORD_TYPES,
)
from services.docling_service import DoclingService
class DocumentsLoader:
@ -18,6 +18,8 @@ class DocumentsLoader:
def __init__(self, file_paths: List[str]):
self._file_paths = file_paths
self.docling_service = DoclingService()
self._documents: List[str] = []
self._images: List[List[str]] = []
@ -76,9 +78,7 @@ class DocumentsLoader:
document: str = ""
if load_text:
with pdfplumber.open(file_path) as pdf:
for page in pdf.pages:
document += await asyncio.to_thread(page.extract_text)
document = self.docling_service.parse_to_markdown(file_path)
if load_images:
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
@ -90,23 +90,10 @@ class DocumentsLoader:
return await asyncio.to_thread(file.read)
def load_msword(self, file_path: str) -> str:
    """Return the Word document at ``file_path`` converted to Markdown.

    The conversion is delegated to ``self.docling_service``. The earlier
    python-docx paragraph join is dropped: it returned first, which left
    the docling call unreachable.

    Args:
        file_path: Path of the Word document to convert.

    Returns:
        Whatever ``self.docling_service.parse_to_markdown`` returns for
        the file.
    """
    return self.docling_service.parse_to_markdown(file_path)
def load_powerpoint(self, file_path: str) -> str:
    """Return the presentation at ``file_path`` converted to Markdown.

    The conversion is delegated to ``self.docling_service``. The earlier
    hand-rolled python-pptx slide walk is dropped: it returned first,
    which left the docling call unreachable.

    Args:
        file_path: Path of the PowerPoint file to convert.

    Returns:
        Whatever ``self.docling_service.parse_to_markdown`` returns for
        the file.
    """
    return self.docling_service.parse_to_markdown(file_path)
def get_page_images_from_pdf(self, file_path: str, temp_dir: str):
with pdfplumber.open(file_path) as pdf:

View file

@ -207,7 +207,7 @@ const DocumentsPreviewPage: React.FC = () => {
return (
<div className={`border-r border-gray-200 fixed xl:relative w-full z-50 xl:z-auto
transition-all duration-300 ease-in-out max-w-[200px] md:max-w-[300px] h-[85vh] rounded-md p-5`}>
transition-all duration-300 bg-white ease-in-out max-w-[200px] md:max-w-[300px] h-[85vh] rounded-md p-5`}>
<X
onClick={() => setIsOpen(false)}
className="text-black mb-4 ml-auto mr-0 cursor-pointer hover:text-gray-600"