# ppt-tool/backend/services/docling_service.py

"""Document parsing service.

Uses Docling for PDF/PPTX and python-docx for DOCX (better table handling).
Optionally extracts text from embedded images via Gemini vision.
"""
import asyncio
import base64
import os
from typing import List, Optional

from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    PowerpointFormatOption,
    WordFormatOption,
)
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat


class DoclingService:
    """Parse PDF, PPTX and DOCX documents into markdown text.

    A single Docling ``DocumentConverter`` restricted to PPTX, PDF and DOCX is
    built at construction time with OCR switched on. DOCX files can
    alternatively be read with python-docx via :meth:`parse_docx_structured`.
    """

    # Markdown only defines heading levels 1-6, while Word ships styles up to
    # "Heading 9"; deeper levels are clamped to this value.
    _MAX_MARKDOWN_HEADING_LEVEL = 6

    def __init__(self):
        self.pipeline_options = PdfPipelineOptions()
        self.pipeline_options.do_ocr = True

        # The same pipeline options object is shared by all three formats.
        self.converter = DocumentConverter(
            allowed_formats=[InputFormat.PPTX, InputFormat.PDF, InputFormat.DOCX],
            format_options={
                InputFormat.DOCX: WordFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
                InputFormat.PPTX: PowerpointFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
            },
        )

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse any supported document to markdown via Docling.

        Args:
            file_path: Path of the document handed to the Docling converter.

        Returns:
            The converted document exported as a markdown string.
        """
        result = self.converter.convert(file_path)
        return result.document.export_to_markdown()

    def parse_docx_structured(self, file_path: str) -> str:
        """Parse DOCX with python-docx for better table/structure handling.

        Falls back to Docling whenever the python-docx path raises, which
        includes python-docx not being installed as well as any parsing error.

        Args:
            file_path: Path of the DOCX file.

        Returns:
            Markdown text for the document.
        """
        try:
            return self._parse_docx_with_python_docx(file_path)
        except Exception as e:
            # Deliberate best-effort boundary: any failure routes to Docling.
            print(f"[DoclingService] python-docx parsing failed ({e}), falling back to Docling")
            return self.parse_to_markdown(file_path)

    @classmethod
    def _heading_level(cls, style_name: str) -> int:
        """Map a lowercased paragraph style name to a markdown heading level.

        Returns 0 when the style is not a heading style. Otherwise the first
        decimal digit in the name is used (defaulting to 1 when there is
        none), clamped to the 1..6 range that markdown supports.
        """
        if "heading" not in style_name:
            return 0
        for ch in style_name:
            # isdecimal() (not isdigit()) so that int() can never fail on
            # characters such as superscript digits.
            if ch.isdecimal():
                return min(max(int(ch), 1), cls._MAX_MARKDOWN_HEADING_LEVEL)
        return 1

    def _parse_docx_with_python_docx(self, file_path: str) -> str:
        """Extract text from DOCX using python-docx with proper table handling.

        Body children are visited in document order so paragraphs and tables
        keep their relative position. Headings become ``#`` lines, tables
        become markdown tables, and a trailing "Embedded Images" section lists
        one placeholder per image relationship.

        Args:
            file_path: Path of the DOCX file.

        Returns:
            The extracted blocks joined by blank lines.
        """
        from docx import Document

        doc = Document(file_path)

        # Build the element -> wrapper lookups once instead of rescanning
        # doc.paragraphs / doc.tables for every body child (which is O(n^2)).
        # Keys are id() of the XML element because the match must be by object
        # identity; the wrappers stored as values keep those elements alive,
        # so the ids stay valid for the duration of the loop.
        paragraphs = {id(p._element): p for p in doc.paragraphs}
        tables = {id(t._element): t for t in doc.tables}

        parts: List[str] = []
        for element in doc.element.body:
            raw_tag = element.tag
            if not isinstance(raw_tag, str):
                # Comments / processing instructions carry a non-string tag.
                continue
            # Drop the "{namespace}" prefix, leaving the local tag name.
            tag = raw_tag.split("}")[-1]

            if tag == "p":
                para = paragraphs.get(id(element))
                if para is None:
                    continue
                text = para.text.strip()
                if not text:
                    continue
                style_name = (para.style.name or "").lower() if para.style else ""
                level = self._heading_level(style_name)
                parts.append(f"{'#' * level} {text}" if level else text)

            elif tag == "tbl":
                tbl = tables.get(id(element))
                if tbl is not None:
                    md_table = _table_to_markdown(tbl)
                    if md_table:
                        parts.append(md_table)

        # Append a placeholder listing for embedded images, if any.
        embedded_images = self._extract_docx_images(doc)
        if embedded_images:
            parts.append("\n## Embedded Images\n")
            for desc in embedded_images:
                parts.append(f"- {desc}")

        return "\n\n".join(parts)

    def _extract_docx_images(self, doc) -> List[str]:
        """Return one placeholder description per image relationship in *doc*.

        Every relationship of the document part whose type contains "image"
        yields the string ``"[Embedded image]"``; alt text is not read.
        The scan is best-effort: if it raises, the error is printed and the
        descriptions collected so far are returned.
        """
        descriptions: List[str] = []
        try:
            for rel in doc.part.rels.values():
                if "image" in rel.reltype:
                    descriptions.append("[Embedded image]")
        except Exception as e:
            # Image listing is optional; report instead of silently swallowing.
            print(f"[DoclingService] Image relationship scan failed: {e}")
        return descriptions


def _find_paragraph_by_element(doc, element):
    """Return the paragraph of *doc* that wraps *element*, or None.

    Matching is by object identity on the paragraph's ``_element``, not by
    equality; the first match in ``doc.paragraphs`` wins.
    """
    candidates = (p for p in doc.paragraphs if p._element is element)
    return next(candidates, None)


def _find_table_by_element(doc, element):
    """Return the table of *doc* that wraps *element*, or None.

    Matching is by object identity on the table's ``_element``, not by
    equality; the first match in ``doc.tables`` wins.
    """
    candidates = (t for t in doc.tables if t._element is element)
    return next(candidates, None)


def _table_to_markdown(table) -> str:
    """Convert a python-docx Table to a markdown table string.

    The first row is used as the header. Within a row, a cell that is backed
    by the same underlying ``_tc`` element as the cell before it is treated as
    the continuation of a horizontal merge and rendered empty; adjacent cells
    that merely share the same text are kept. Pipes are escaped and line
    breaks inside a cell become ``<br>`` so every table row stays on one line.
    Short rows are padded to the widest row.

    Args:
        table: Object exposing ``rows``, each with ``cells`` that have a
            ``text`` attribute (and, for merge detection, ``_tc``).

    Returns:
        The markdown table, or "" when the table has no rows or no columns.
    """
    rows = []
    for row in table.rows:
        texts = []
        previous_tc = None
        for cell in row.cells:
            # python-docx returns a merged cell once per grid column it spans,
            # each time wrapping the same element; compare by identity so
            # genuinely repeated values (e.g. "0", "0") are not blanked.
            tc = getattr(cell, "_tc", None)
            if tc is not None and tc is previous_tc:
                texts.append("")
            else:
                escaped = cell.text.strip().replace("|", "\\|")
                # A raw newline would terminate the markdown row early.
                texts.append("<br>".join(escaped.splitlines()))
            previous_tc = tc
        rows.append(texts)

    if not rows:
        return ""

    # Size the table by its widest row so no cell is truncated.
    width = max(len(cells) for cells in rows)
    if width == 0:
        return ""

    def render(cells):
        padded = cells + [""] * (width - len(cells))
        return "| " + " | ".join(padded) + " |"

    header, *body = rows
    lines = [render(header), "| " + " | ".join(["---"] * width) + " |"]
    lines.extend(render(cells) for cells in body)
    return "\n".join(lines)


async def extract_text_from_image_via_vision(image_bytes: bytes, mime_type: str = "image/png") -> Optional[str]:
    """Use Gemini vision to extract text from an image.

    The blocking SDK call runs in a worker thread via ``asyncio.to_thread``.

    Args:
        image_bytes: Raw bytes of the image.
        mime_type: MIME type sent alongside the image data.

    Returns:
        The extracted text, or None when the image is empty, the
        ``GOOGLE_API_KEY`` environment variable is unset, the google-genai
        import or the request fails, or the model reports no text.
    """
    if not image_bytes:
        # Nothing to analyse; skip the model call entirely.
        return None

    try:
        import google.genai as genai
        from google.genai import types

        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            return None

        client = genai.Client()
        prompt = "Extract all text from this image. Return only the extracted text, nothing else. If no text is found, return 'No text found'."
        # Hand the raw bytes to the SDK's typed helper so it owns the wire
        # encoding, rather than relying on how a base64 string placed in a
        # plain dict gets coerced into the bytes field.
        image_part = types.Part.from_bytes(data=image_bytes, mime_type=mime_type)

        response = await asyncio.to_thread(
            client.models.generate_content,
            model="gemini-2.5-flash",
            contents=[prompt, image_part],
        )

        raw = response.text
        text = raw.strip() if raw else ""
        if not text:
            return None
        # The sentinel may come back quoted or with trailing punctuation
        # (the prompt itself quotes it), so normalise before comparing.
        if text.strip("'\"`.! \t\r\n").casefold() == "no text found":
            return None
        return text
    except Exception as e:
        # Best-effort feature: any failure degrades to "no text".
        print(f"[DoclingService] Vision text extraction failed: {e}")
        return None