From 8715fa8bd254f3c2d9d8325eb80f1ba1b7b316bd Mon Sep 17 00:00:00 2001
From: Vadym Samoilenko <vadymsamoilenko@oliver.agency>
Date: Thu, 19 Mar 2026 20:06:46 +0000
Subject: [PATCH] =?UTF-8?q?Replace=20docling+layoutparser+torch=20with=20P?=
 =?UTF-8?q?yMuPDF=20(~3.5GB=20=E2=86=92=20~80MB)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- docling removed: PDF now parsed by PyMuPDF (fitz), PPTX by python-pptx
  (Docling's do_ocr=True pipeline is removed with it; PDFs are read via page.get_text() only)
- layoutparser removed: already optional with graceful fallback (returns [])
- torch/pytorch index removed: no longer needed by any dependency
- pymupdf added: ~20MB wheel, no ML deps, faster than docling for text extraction
- All existing DOCX parsing kept (python-docx, already working); the Docling
  fallback in parse_docx_structured() is removed, so python-docx errors now propagate
- extract_text_from_image_via_vision() unchanged (Gemini API)

Result: api/worker Docker image ~3-4GB lighter, no NVIDIA libs on CPU server

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/pyproject.toml              |  12 +--
 backend/services/docling_service.py | 128 +++++++++++++++-------------
 2 files changed, 68 insertions(+), 72 deletions(-)

diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index d66017c..aaad03b 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -11,7 +11,6 @@ dependencies = [
     "asyncpg>=0.30.0",
     "chromadb>=1.0.15",
     "dirtyjson>=1.0.8",
-    "docling>=2.43.0",
     "fastapi[standard]>=0.116.1",
     "fastmcp>=2.11.0",
     "google-genai>=1.28.0",
@@ -22,7 +21,7 @@ dependencies = [
     "pytest>=8.4.1",
     "python-docx>=1.1",
     "python-pptx>=1.0.2",
-    "layoutparser>=0.3",
+    "pymupdf>=1.24",
     "opencv-python-headless>=4.8",
     "redis>=5.0,<6",
     "sqlmodel>=0.0.24",
@@ -42,15 +41,6 @@ asyncio_mode = "auto"
 testpaths = ["tests"]
 pythonpath = ["."]
 
-[[tool.uv.index]]
-name = "pytorch-cpu"
-url = "https://download.pytorch.org/whl/cpu"
-explicit = true
-
-[tool.uv.sources]
-torch = { index = "pytorch-cpu" }
-torchvision = { index = "pytorch-cpu" }
-torchaudio = { index = "pytorch-cpu" }
 
 [tool.setuptools.packages.find]
 where = ["."]
diff --git a/backend/services/docling_service.py b/backend/services/docling_service.py
index 1d3b22b..7a63bf3 100644
--- a/backend/services/docling_service.py
+++ b/backend/services/docling_service.py
@@ -1,6 +1,6 @@
 """Document parsing service.
 
-Uses Docling for PDF/PPTX and python-docx for DOCX (better table handling).
+Uses PyMuPDF for PDF, python-pptx for PPTX, python-docx for DOCX.
 Optionally extracts text from embedded images via Gemini vision.
 """
 import asyncio
@@ -8,51 +8,20 @@ import base64
 import os
 from typing import List, Optional
 
-from docling.document_converter import (
-    DocumentConverter,
-    PdfFormatOption,
-    PowerpointFormatOption,
-    WordFormatOption,
-)
-from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.datamodel.base_models import InputFormat
-
 
 class DoclingService:
-    def __init__(self):
-        self.pipeline_options = PdfPipelineOptions()
-        self.pipeline_options.do_ocr = True
-
-        self.converter = DocumentConverter(
-            allowed_formats=[InputFormat.PPTX, InputFormat.PDF, InputFormat.DOCX],
-            format_options={
-                InputFormat.DOCX: WordFormatOption(
-                    pipeline_options=self.pipeline_options,
-                ),
-                InputFormat.PPTX: PowerpointFormatOption(
-                    pipeline_options=self.pipeline_options,
-                ),
-                InputFormat.PDF: PdfFormatOption(
-                    pipeline_options=self.pipeline_options,
-                ),
-            },
-        )
-
     def parse_to_markdown(self, file_path: str) -> str:
-        """Parse any supported document to markdown via Docling."""
-        result = self.converter.convert(file_path)
-        return result.document.export_to_markdown()
+        """Parse a PDF or PPTX file to markdown, dispatching on the file extension."""
+        ext = os.path.splitext(file_path)[1].lower()  # lower-cased so ".PDF" matches too
+        if ext == ".pdf":
+            return _parse_pdf_with_pymupdf(file_path)
+        elif ext in (".pptx", ".ppt"):  # NOTE(review): python-pptx documents .pptx only — confirm .ppt input is expected to work
+            return _parse_pptx_text(file_path)
+        return ""  # any other extension yields an empty string rather than an error
 
     def parse_docx_structured(self, file_path: str) -> str:
-        """Parse DOCX with python-docx for better table/structure handling.
-
-        Falls back to Docling if python-docx is not available.
-        """
-        try:
-            return self._parse_docx_with_python_docx(file_path)
-        except Exception as e:
-            print(f"[DoclingService] python-docx parsing failed ({e}), falling back to Docling")
-            return self.parse_to_markdown(file_path)
+        """Parse DOCX with python-docx for better table/structure handling."""
+        return self._parse_docx_with_python_docx(file_path)  # no fallback parser: any exception reaches the caller
 
     def _parse_docx_with_python_docx(self, file_path: str) -> str:
         """Extract text from DOCX using python-docx with proper table handling."""
@@ -65,12 +34,10 @@ class DoclingService:
             tag = element.tag.split("}")[-1] if "}" in element.tag else element.tag
 
             if tag == "p":
-                # Paragraph
                 para = _find_paragraph_by_element(doc, element)
                 if para is not None:
                     text = para.text.strip()
                     if text:
-                        # Check heading style
                         style_name = (para.style.name or "").lower() if para.style else ""
                         if "heading" in style_name:
                             level = 1
@@ -83,14 +50,12 @@ class DoclingService:
                             parts.append(text)
 
             elif tag == "tbl":
-                # Table — extract as markdown table
                 tbl = _find_table_by_element(doc, element)
                 if tbl is not None:
                     md_table = _table_to_markdown(tbl)
                     if md_table:
                         parts.append(md_table)
 
-        # Also extract images descriptions if possible
         embedded_images = self._extract_docx_images(doc)
         if embedded_images:
             parts.append("\n## Embedded Images\n")
@@ -100,10 +65,6 @@ class DoclingService:
         return "\n\n".join(parts)
 
     def _extract_docx_images(self, doc) -> List[str]:
-        """Extract image descriptions from DOCX.
-
-        Returns alt text for images, or placeholder if no alt text.
-        """
         descriptions = []
         try:
             for rel in doc.part.rels.values():
@@ -114,8 +75,61 @@ class DoclingService:
         return descriptions
 
 
+def _parse_pdf_with_pymupdf(file_path: str) -> str:
+    """Extract the text of each PDF page with PyMuPDF; non-empty pages are joined under "## Page N" headings."""
+    import fitz  # PyMuPDF; function-scope import, only needed when a PDF is parsed
+
+    parts = []
+    with fitz.open(file_path) as doc:
+        for page_num, page in enumerate(doc, 1):  # headings use 1-based page numbers
+            text = page.get_text("text").strip()  # "markdown" is not one of get_text()'s documented formats; request plain text explicitly
+            if text:
+                parts.append(f"## Page {page_num}\n\n{text}")
+
+    return "\n\n".join(parts)
+
+
+def _parse_pptx_text(file_path: str) -> str:
+    """Render each slide's text frames and tables as a "## Slide N" markdown section."""
+    from pptx import Presentation
+
+    sections = []
+    deck = Presentation(file_path)
+
+    for index, slide in enumerate(deck.slides, start=1):
+        chunks = []
+        for shape in slide.shapes:
+            # A text frame takes precedence; a table is only considered otherwise.
+            if shape.has_text_frame:
+                chunk = shape.text_frame.text.strip()
+            elif shape.has_table:
+                chunk = _pptx_table_to_markdown(shape.table)
+            else:
+                chunk = ""
+            if chunk:
+                chunks.append(chunk)
+        # Slides that produced no text are left out of the result entirely.
+        if chunks:
+            sections.append(f"## Slide {index}\n\n" + "\n\n".join(chunks))
+    return "\n\n".join(sections)
+
+
+def _pptx_table_to_markdown(table) -> str:
+    """Render a python-pptx table as a markdown table; the first row becomes the header. Returns "" for no rows."""
+    # Cell text may contain paragraph/line-break characters; a raw newline would split the markdown row, so collapse whitespace.
+    rows = [[" ".join(cell.text.split()).replace("|", "\\|") for cell in row.cells] for row in table.rows]
+    if not rows:
+        return ""
+    header = rows[0]
+    divider = ["---"] * len(header)
+    lines = ["| " + " | ".join(header) + " |", "| " + " | ".join(divider) + " |"]
+    for row in rows[1:]:
+        padded = row + [""] * (len(header) - len(row))  # pad short rows; the slice below trims long ones
+        lines.append("| " + " | ".join(padded[: len(header)]) + " |")
+    return "\n".join(lines)
+
+
 def _find_paragraph_by_element(doc, element):
-    """Find a Paragraph object matching the given XML element."""
     for para in doc.paragraphs:
         if para._element is element:
             return para
@@ -123,7 +137,6 @@ def _find_paragraph_by_element(doc, element):
 
 
 def _find_table_by_element(doc, element):
-    """Find a Table object matching the given XML element."""
     for table in doc.tables:
         if table._element is element:
             return table
@@ -131,7 +144,6 @@ def _find_table_by_element(doc, element):
 
 
 def _table_to_markdown(table) -> str:
-    """Convert a python-docx Table to a markdown table string."""
     rows = []
     for row in table.rows:
         cells = [cell.text.strip().replace("|", "\\|") for cell in row.cells]
@@ -140,36 +152,30 @@ def _table_to_markdown(table) -> str:
     if not rows:
         return ""
 
-    # Deduplicate merged cells (python-docx repeats merged cell text)
     clean_rows = []
     for row_cells in rows:
         clean = []
         for i, cell_text in enumerate(row_cells):
             if i > 0 and cell_text == row_cells[i - 1]:
-                clean.append("")  # merged cell
+                clean.append("")
             else:
                 clean.append(cell_text)
         clean_rows.append(clean)
 
-    # Build markdown table
     lines = []
     if clean_rows:
         header = clean_rows[0]
         lines.append("| " + " | ".join(header) + " |")
         lines.append("| " + " | ".join(["---"] * len(header)) + " |")
         for row in clean_rows[1:]:
-            # Pad row to match header length
             padded = row + [""] * (len(header) - len(row))
-            lines.append("| " + " | ".join(padded[:len(header)]) + " |")
+            lines.append("| " + " | ".join(padded[: len(header)]) + " |")
 
     return "\n".join(lines)
 
 
 async def extract_text_from_image_via_vision(image_bytes: bytes, mime_type: str = "image/png") -> Optional[str]:
-    """Use Gemini vision to extract text from an image.
-
-    Returns extracted text or None if unavailable.
-    """
+    """Use Gemini vision to extract text from an image."""
     try:
         import google.genai as genai