From 8715fa8bd254f3c2d9d8325eb80f1ba1b7b316bd Mon Sep 17 00:00:00 2001 From: Vadym Samoilenko Date: Thu, 19 Mar 2026 20:06:46 +0000 Subject: [PATCH] =?UTF-8?q?Replace=20docling+layoutparser+torch=20with=20P?= =?UTF-8?q?yMuPDF=20(~3.5GB=20=E2=86=92=20~80MB)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - docling removed: PDF now parsed by PyMuPDF (fitz), PPTX by python-pptx - layoutparser removed: already optional with graceful fallback (returns []) - torch/pytorch index removed: no longer needed by any dependency - pymupdf added: ~20MB wheel, no ML deps, faster than docling for text extraction - All existing DOCX parsing kept (python-docx, already working) - extract_text_from_image_via_vision() unchanged (Gemini API) Result: api/worker Docker image ~3-4GB lighter, no NVIDIA libs on CPU server Co-Authored-By: Claude Sonnet 4.6 --- backend/pyproject.toml | 12 +-- backend/services/docling_service.py | 128 +++++++++++++++------------- 2 files changed, 68 insertions(+), 72 deletions(-) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index d66017c..aaad03b 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -11,7 +11,6 @@ dependencies = [ "asyncpg>=0.30.0", "chromadb>=1.0.15", "dirtyjson>=1.0.8", - "docling>=2.43.0", "fastapi[standard]>=0.116.1", "fastmcp>=2.11.0", "google-genai>=1.28.0", @@ -22,7 +21,7 @@ dependencies = [ "pytest>=8.4.1", "python-docx>=1.1", "python-pptx>=1.0.2", - "layoutparser>=0.3", + "pymupdf>=1.24", "opencv-python-headless>=4.8", "redis>=5.0,<6", "sqlmodel>=0.0.24", @@ -42,15 +41,6 @@ asyncio_mode = "auto" testpaths = ["tests"] pythonpath = ["."] -[[tool.uv.index]] -name = "pytorch-cpu" -url = "https://download.pytorch.org/whl/cpu" -explicit = true - -[tool.uv.sources] -torch = { index = "pytorch-cpu" } -torchvision = { index = "pytorch-cpu" } -torchaudio = { index = "pytorch-cpu" } [tool.setuptools.packages.find] where = ["."] diff --git 
a/backend/services/docling_service.py b/backend/services/docling_service.py index 1d3b22b..7a63bf3 100644 --- a/backend/services/docling_service.py +++ b/backend/services/docling_service.py @@ -1,6 +1,6 @@ """Document parsing service. -Uses Docling for PDF/PPTX and python-docx for DOCX (better table handling). +Uses PyMuPDF for PDF, python-pptx for PPTX, python-docx for DOCX. Optionally extracts text from embedded images via Gemini vision. """ import asyncio @@ -8,51 +8,20 @@ import base64 import os from typing import List, Optional -from docling.document_converter import ( - DocumentConverter, - PdfFormatOption, - PowerpointFormatOption, - WordFormatOption, -) -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.datamodel.base_models import InputFormat - class DoclingService: - def __init__(self): - self.pipeline_options = PdfPipelineOptions() - self.pipeline_options.do_ocr = True - - self.converter = DocumentConverter( - allowed_formats=[InputFormat.PPTX, InputFormat.PDF, InputFormat.DOCX], - format_options={ - InputFormat.DOCX: WordFormatOption( - pipeline_options=self.pipeline_options, - ), - InputFormat.PPTX: PowerpointFormatOption( - pipeline_options=self.pipeline_options, - ), - InputFormat.PDF: PdfFormatOption( - pipeline_options=self.pipeline_options, - ), - }, - ) - def parse_to_markdown(self, file_path: str) -> str: - """Parse any supported document to markdown via Docling.""" - result = self.converter.convert(file_path) - return result.document.export_to_markdown() + """Parse PDF or PPTX to markdown.""" + ext = os.path.splitext(file_path)[1].lower() + if ext == ".pdf": + return _parse_pdf_with_pymupdf(file_path) + elif ext in (".pptx", ".ppt"): + return _parse_pptx_text(file_path) + return "" def parse_docx_structured(self, file_path: str) -> str: - """Parse DOCX with python-docx for better table/structure handling. - - Falls back to Docling if python-docx is not available. 
- """ - try: - return self._parse_docx_with_python_docx(file_path) - except Exception as e: - print(f"[DoclingService] python-docx parsing failed ({e}), falling back to Docling") - return self.parse_to_markdown(file_path) + """Parse DOCX with python-docx for better table/structure handling.""" + return self._parse_docx_with_python_docx(file_path) def _parse_docx_with_python_docx(self, file_path: str) -> str: """Extract text from DOCX using python-docx with proper table handling.""" @@ -65,12 +34,10 @@ class DoclingService: tag = element.tag.split("}")[-1] if "}" in element.tag else element.tag if tag == "p": - # Paragraph para = _find_paragraph_by_element(doc, element) if para is not None: text = para.text.strip() if text: - # Check heading style style_name = (para.style.name or "").lower() if para.style else "" if "heading" in style_name: level = 1 @@ -83,14 +50,12 @@ class DoclingService: parts.append(text) elif tag == "tbl": - # Table — extract as markdown table tbl = _find_table_by_element(doc, element) if tbl is not None: md_table = _table_to_markdown(tbl) if md_table: parts.append(md_table) - # Also extract images descriptions if possible embedded_images = self._extract_docx_images(doc) if embedded_images: parts.append("\n## Embedded Images\n") @@ -100,10 +65,6 @@ class DoclingService: return "\n\n".join(parts) def _extract_docx_images(self, doc) -> List[str]: - """Extract image descriptions from DOCX. - - Returns alt text for images, or placeholder if no alt text. 
- """ descriptions = [] try: for rel in doc.part.rels.values(): @@ -114,8 +75,61 @@ class DoclingService: return descriptions +def _parse_pdf_with_pymupdf(file_path: str) -> str: + """Extract plain text from PDF, one "## Page N" section per non-empty page, using PyMuPDF (no ML, no torch).""" + import fitz # PyMuPDF + + parts = [] + with fitz.open(file_path) as doc: + for page_num, page in enumerate(doc, 1): + text = page.get_text("text").strip() + if text: + parts.append(f"## Page {page_num}\n\n{text}") + + return "\n\n".join(parts) + + +def _parse_pptx_text(file_path: str) -> str: + """Extract text from PPTX using python-pptx (no ML, no torch).""" + from pptx import Presentation + + prs = Presentation(file_path) + parts = [] + + for slide_num, slide in enumerate(prs.slides, 1): + slide_texts = [] + for shape in slide.shapes: + if shape.has_text_frame: + text = shape.text_frame.text.strip() + if text: + slide_texts.append(text) + elif shape.has_table: + md = _pptx_table_to_markdown(shape.table) + if md: + slide_texts.append(md) + + if slide_texts: + parts.append(f"## Slide {slide_num}\n\n" + "\n\n".join(slide_texts)) + + return "\n\n".join(parts) + + +def _pptx_table_to_markdown(table) -> str: + rows = [[cell.text.strip().replace("|", "\\|") for cell in row.cells] for row in table.rows] + if not rows: + return "" + header = rows[0] + lines = [ + "| " + " | ".join(header) + " |", + "| " + " | ".join(["---"] * len(header)) + " |", + ] + for row in rows[1:]: + padded = row + [""] * (len(header) - len(row)) + lines.append("| " + " | ".join(padded[: len(header)]) + " |") + return "\n".join(lines) + + def _find_paragraph_by_element(doc, element): - """Find a Paragraph object matching the given XML element.""" for para in doc.paragraphs: if para._element is element: return para @@ -123,7 +137,6 @@ def _find_paragraph_by_element(doc, element): def _find_table_by_element(doc, element): - """Find a Table object matching the given XML element.""" for table in doc.tables: if table._element is element: return table @@
-131,7 +144,6 @@ def _find_table_by_element(doc, element): def _table_to_markdown(table) -> str: - """Convert a python-docx Table to a markdown table string.""" rows = [] for row in table.rows: cells = [cell.text.strip().replace("|", "\\|") for cell in row.cells] @@ -140,36 +152,30 @@ def _table_to_markdown(table) -> str: if not rows: return "" - # Deduplicate merged cells (python-docx repeats merged cell text) clean_rows = [] for row_cells in rows: clean = [] for i, cell_text in enumerate(row_cells): if i > 0 and cell_text == row_cells[i - 1]: - clean.append("") # merged cell + clean.append("") else: clean.append(cell_text) clean_rows.append(clean) - # Build markdown table lines = [] if clean_rows: header = clean_rows[0] lines.append("| " + " | ".join(header) + " |") lines.append("| " + " | ".join(["---"] * len(header)) + " |") for row in clean_rows[1:]: - # Pad row to match header length padded = row + [""] * (len(header) - len(row)) - lines.append("| " + " | ".join(padded[:len(header)]) + " |") + lines.append("| " + " | ".join(padded[: len(header)]) + " |") return "\n".join(lines) async def extract_text_from_image_via_vision(image_bytes: bytes, mime_type: str = "image/png") -> Optional[str]: - """Use Gemini vision to extract text from an image. - - Returns extracted text or None if unavailable. - """ + """Use Gemini vision to extract text from an image.""" try: import google.genai as genai