Replace docling+layoutparser+torch with PyMuPDF (~3.5GB → ~80MB)
- docling removed: PDF is now parsed by PyMuPDF (fitz), PPTX by python-pptx
- layoutparser removed: it was already optional, with a graceful fallback (returns [])
- torch/pytorch index removed: no longer needed by any dependency
- pymupdf added: ~20MB wheel, no ML deps, faster than docling for text extraction
- All existing DOCX parsing kept (python-docx, already working)
- extract_text_from_image_via_vision() unchanged (Gemini API)

Result: api/worker Docker image ~3-4GB lighter, no NVIDIA libs on CPU server

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f9ae1c9b3a
commit
8715fa8bd2
2 changed files with 68 additions and 72 deletions
|
|
@ -11,7 +11,6 @@ dependencies = [
|
|||
"asyncpg>=0.30.0",
|
||||
"chromadb>=1.0.15",
|
||||
"dirtyjson>=1.0.8",
|
||||
"docling>=2.43.0",
|
||||
"fastapi[standard]>=0.116.1",
|
||||
"fastmcp>=2.11.0",
|
||||
"google-genai>=1.28.0",
|
||||
|
|
@ -22,7 +21,7 @@ dependencies = [
|
|||
"pytest>=8.4.1",
|
||||
"python-docx>=1.1",
|
||||
"python-pptx>=1.0.2",
|
||||
"layoutparser>=0.3",
|
||||
"pymupdf>=1.24",
|
||||
"opencv-python-headless>=4.8",
|
||||
"redis>=5.0,<6",
|
||||
"sqlmodel>=0.0.24",
|
||||
|
|
@ -42,15 +41,6 @@ asyncio_mode = "auto"
|
|||
testpaths = ["tests"]
|
||||
pythonpath = ["."]
|
||||
|
||||
[[tool.uv.index]]
|
||||
name = "pytorch-cpu"
|
||||
url = "https://download.pytorch.org/whl/cpu"
|
||||
explicit = true
|
||||
|
||||
[tool.uv.sources]
|
||||
torch = { index = "pytorch-cpu" }
|
||||
torchvision = { index = "pytorch-cpu" }
|
||||
torchaudio = { index = "pytorch-cpu" }
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["."]
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""Document parsing service.
|
||||
|
||||
Uses Docling for PDF/PPTX and python-docx for DOCX (better table handling).
|
||||
Uses PyMuPDF for PDF, python-pptx for PPTX, python-docx for DOCX.
|
||||
Optionally extracts text from embedded images via Gemini vision.
|
||||
"""
|
||||
import asyncio
|
||||
|
|
@ -8,51 +8,20 @@ import base64
|
|||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
PdfFormatOption,
|
||||
PowerpointFormatOption,
|
||||
WordFormatOption,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
|
||||
|
||||
class DoclingService:
|
||||
def __init__(self):
|
||||
self.pipeline_options = PdfPipelineOptions()
|
||||
self.pipeline_options.do_ocr = True
|
||||
|
||||
self.converter = DocumentConverter(
|
||||
allowed_formats=[InputFormat.PPTX, InputFormat.PDF, InputFormat.DOCX],
|
||||
format_options={
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_options=self.pipeline_options,
|
||||
),
|
||||
InputFormat.PPTX: PowerpointFormatOption(
|
||||
pipeline_options=self.pipeline_options,
|
||||
),
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=self.pipeline_options,
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
def parse_to_markdown(self, file_path: str) -> str:
|
||||
"""Parse any supported document to markdown via Docling."""
|
||||
result = self.converter.convert(file_path)
|
||||
return result.document.export_to_markdown()
|
||||
"""Parse PDF or PPTX to markdown."""
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
if ext == ".pdf":
|
||||
return _parse_pdf_with_pymupdf(file_path)
|
||||
elif ext in (".pptx", ".ppt"):
|
||||
return _parse_pptx_text(file_path)
|
||||
return ""
|
||||
|
||||
def parse_docx_structured(self, file_path: str) -> str:
|
||||
"""Parse DOCX with python-docx for better table/structure handling.
|
||||
|
||||
Falls back to Docling if python-docx is not available.
|
||||
"""
|
||||
try:
|
||||
return self._parse_docx_with_python_docx(file_path)
|
||||
except Exception as e:
|
||||
print(f"[DoclingService] python-docx parsing failed ({e}), falling back to Docling")
|
||||
return self.parse_to_markdown(file_path)
|
||||
"""Parse DOCX with python-docx for better table/structure handling."""
|
||||
return self._parse_docx_with_python_docx(file_path)
|
||||
|
||||
def _parse_docx_with_python_docx(self, file_path: str) -> str:
|
||||
"""Extract text from DOCX using python-docx with proper table handling."""
|
||||
|
|
@ -65,12 +34,10 @@ class DoclingService:
|
|||
tag = element.tag.split("}")[-1] if "}" in element.tag else element.tag
|
||||
|
||||
if tag == "p":
|
||||
# Paragraph
|
||||
para = _find_paragraph_by_element(doc, element)
|
||||
if para is not None:
|
||||
text = para.text.strip()
|
||||
if text:
|
||||
# Check heading style
|
||||
style_name = (para.style.name or "").lower() if para.style else ""
|
||||
if "heading" in style_name:
|
||||
level = 1
|
||||
|
|
@ -83,14 +50,12 @@ class DoclingService:
|
|||
parts.append(text)
|
||||
|
||||
elif tag == "tbl":
|
||||
# Table — extract as markdown table
|
||||
tbl = _find_table_by_element(doc, element)
|
||||
if tbl is not None:
|
||||
md_table = _table_to_markdown(tbl)
|
||||
if md_table:
|
||||
parts.append(md_table)
|
||||
|
||||
# Also extract images descriptions if possible
|
||||
embedded_images = self._extract_docx_images(doc)
|
||||
if embedded_images:
|
||||
parts.append("\n## Embedded Images\n")
|
||||
|
|
@ -100,10 +65,6 @@ class DoclingService:
|
|||
return "\n\n".join(parts)
|
||||
|
||||
def _extract_docx_images(self, doc) -> List[str]:
|
||||
"""Extract image descriptions from DOCX.
|
||||
|
||||
Returns alt text for images, or placeholder if no alt text.
|
||||
"""
|
||||
descriptions = []
|
||||
try:
|
||||
for rel in doc.part.rels.values():
|
||||
|
|
@ -114,8 +75,61 @@ class DoclingService:
|
|||
return descriptions
|
||||
|
||||
|
||||
def _parse_pdf_with_pymupdf(file_path: str) -> str:
    """Extract the text of a PDF with PyMuPDF, one section per page.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The non-empty pages joined by blank lines, each introduced by a
        ``## Page N`` heading (N is 1-based). Pages without extractable
        text are omitted, so a PDF with no text layer yields ``""``.
    """
    import fitz  # PyMuPDF

    parts = []
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc, 1):
            # "markdown" is not one of the output formats Page.get_text()
            # documents (text, blocks, words, html, dict, json, rawdict,
            # rawjson, xhtml, xml). Request plain text explicitly instead of
            # depending on how PyMuPDF treats an unknown option.
            text = page.get_text("text").strip()
            if text:
                parts.append(f"## Page {page_num}\n\n{text}")

    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _parse_pptx_text(file_path: str) -> str:
    """Extract text and tables from a PPTX file, one section per slide.

    Shapes nested inside group shapes are visited too; iterating only
    ``slide.shapes`` would silently skip everything that was grouped.

    Args:
        file_path: Path of the ``.pptx`` file to read.

    Returns:
        The non-empty slides joined by blank lines, each introduced by a
        ``## Slide N`` heading (N is 1-based). Text frames contribute their
        stripped text, tables are rendered as markdown tables. Slides with
        no text are omitted.
    """
    from pptx import Presentation

    prs = Presentation(file_path)
    parts = []

    for slide_num, slide in enumerate(prs.slides, 1):
        slide_texts = []
        for shape in _iter_pptx_leaf_shapes(slide.shapes):
            if shape.has_text_frame:
                text = shape.text_frame.text.strip()
                if text:
                    slide_texts.append(text)
            elif shape.has_table:
                md = _pptx_table_to_markdown(shape.table)
                if md:
                    slide_texts.append(md)

        if slide_texts:
            parts.append(f"## Slide {slide_num}\n\n" + "\n\n".join(slide_texts))

    return "\n\n".join(parts)


def _iter_pptx_leaf_shapes(shapes):
    """Yield every non-group shape in *shapes*, descending into groups in order."""
    from pptx.shapes.group import GroupShape

    for shape in shapes:
        if isinstance(shape, GroupShape):
            # A group has no text frame of its own; its members carry the text.
            yield from _iter_pptx_leaf_shapes(shape.shapes)
        else:
            yield shape
|
||||
|
||||
|
||||
def _pptx_table_to_markdown(table) -> str:
|
||||
rows = [[cell.text.strip().replace("|", "\\|") for cell in row.cells] for row in table.rows]
|
||||
if not rows:
|
||||
return ""
|
||||
header = rows[0]
|
||||
lines = [
|
||||
"| " + " | ".join(header) + " |",
|
||||
"| " + " | ".join(["---"] * len(header)) + " |",
|
||||
]
|
||||
for row in rows[1:]:
|
||||
padded = row + [""] * (len(header) - len(row))
|
||||
lines.append("| " + " | ".join(padded[: len(header)]) + " |")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _find_paragraph_by_element(doc, element):
|
||||
"""Find a Paragraph object matching the given XML element."""
|
||||
for para in doc.paragraphs:
|
||||
if para._element is element:
|
||||
return para
|
||||
|
|
@ -123,7 +137,6 @@ def _find_paragraph_by_element(doc, element):
|
|||
|
||||
|
||||
def _find_table_by_element(doc, element):
|
||||
"""Find a Table object matching the given XML element."""
|
||||
for table in doc.tables:
|
||||
if table._element is element:
|
||||
return table
|
||||
|
|
@ -131,7 +144,6 @@ def _find_table_by_element(doc, element):
|
|||
|
||||
|
||||
def _table_to_markdown(table) -> str:
|
||||
"""Convert a python-docx Table to a markdown table string."""
|
||||
rows = []
|
||||
for row in table.rows:
|
||||
cells = [cell.text.strip().replace("|", "\\|") for cell in row.cells]
|
||||
|
|
@ -140,36 +152,30 @@ def _table_to_markdown(table) -> str:
|
|||
if not rows:
|
||||
return ""
|
||||
|
||||
# Deduplicate merged cells (python-docx repeats merged cell text)
|
||||
clean_rows = []
|
||||
for row_cells in rows:
|
||||
clean = []
|
||||
for i, cell_text in enumerate(row_cells):
|
||||
if i > 0 and cell_text == row_cells[i - 1]:
|
||||
clean.append("") # merged cell
|
||||
clean.append("")
|
||||
else:
|
||||
clean.append(cell_text)
|
||||
clean_rows.append(clean)
|
||||
|
||||
# Build markdown table
|
||||
lines = []
|
||||
if clean_rows:
|
||||
header = clean_rows[0]
|
||||
lines.append("| " + " | ".join(header) + " |")
|
||||
lines.append("| " + " | ".join(["---"] * len(header)) + " |")
|
||||
for row in clean_rows[1:]:
|
||||
# Pad row to match header length
|
||||
padded = row + [""] * (len(header) - len(row))
|
||||
lines.append("| " + " | ".join(padded[:len(header)]) + " |")
|
||||
lines.append("| " + " | ".join(padded[: len(header)]) + " |")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
async def extract_text_from_image_via_vision(image_bytes: bytes, mime_type: str = "image/png") -> Optional[str]:
|
||||
"""Use Gemini vision to extract text from an image.
|
||||
|
||||
Returns extracted text or None if unavailable.
|
||||
"""
|
||||
"""Use Gemini vision to extract text from an image."""
|
||||
try:
|
||||
import google.genai as genai
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue