"""Document parsing service.

Uses Docling for PDF/PPTX and python-docx for DOCX (better table handling).
Optionally extracts text from embedded images via Gemini vision.
"""

import asyncio
import base64
import os
from typing import List, Optional

from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    PowerpointFormatOption,
    WordFormatOption,
)
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

# Markdown ATX headings only exist for levels 1-6; deeper "#" runs render as
# plain paragraphs, so Word heading levels beyond 6 are clamped.
_MAX_MARKDOWN_HEADING_LEVEL = 6


class DoclingService:
    """Convert PDF, PPTX and DOCX files to markdown text."""

    def __init__(self):
        """Build a Docling converter that accepts PDF, PPTX and DOCX input.

        The same pipeline options object (with OCR switched on) is handed to
        every format option.
        """
        self.pipeline_options = PdfPipelineOptions()
        self.pipeline_options.do_ocr = True

        self.converter = DocumentConverter(
            allowed_formats=[InputFormat.PPTX, InputFormat.PDF, InputFormat.DOCX],
            format_options={
                InputFormat.DOCX: WordFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
                InputFormat.PPTX: PowerpointFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
            },
        )

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse any supported document to markdown via Docling.

        Args:
            file_path: Path of the document to convert.

        Returns:
            The markdown export of the converted document.
        """
        result = self.converter.convert(file_path)
        return result.document.export_to_markdown()

    def parse_docx_structured(self, file_path: str) -> str:
        """Parse DOCX with python-docx for better table/structure handling.

        Falls back to Docling when the python-docx path fails for any reason
        (python-docx not importable, unreadable file, unexpected structure).

        Args:
            file_path: Path of the DOCX file.

        Returns:
            Markdown text for the document.
        """
        try:
            return self._parse_docx_with_python_docx(file_path)
        except Exception as e:
            # Deliberately broad: any failure here must still yield a result
            # through the Docling converter.
            print(f"[DoclingService] python-docx parsing failed ({e}), falling back to Docling")
            return self.parse_to_markdown(file_path)

    def _parse_docx_with_python_docx(self, file_path: str) -> str:
        """Extract text from DOCX using python-docx with proper table handling.

        Walks the direct children of the document body in order so that
        paragraphs and tables keep their relative position. Paragraphs whose
        style name contains "heading" become markdown headings, tables become
        markdown tables, and a trailing "Embedded Images" section lists one
        placeholder per image relationship.

        Args:
            file_path: Path of the DOCX file.

        Returns:
            The extracted blocks joined by blank lines.
        """
        from docx import Document

        doc = Document(file_path)

        # Index the wrapper objects once by the identity of their XML element.
        # The wrappers stored as values keep their elements alive, so the ids
        # stay valid for the whole walk. This replaces a full rescan of
        # doc.paragraphs / doc.tables for every body child.
        paragraphs_by_element = {id(p._element): p for p in doc.paragraphs}
        tables_by_element = {id(t._element): t for t in doc.tables}

        parts: List[str] = []
        for element in doc.element.body:
            raw_tag = element.tag
            if not isinstance(raw_tag, str):
                # Comments and processing instructions carry a non-string tag.
                continue
            # Drop the "{namespace}" prefix, if any.
            tag = raw_tag.rsplit("}", 1)[-1]

            if tag == "p":
                para = paragraphs_by_element.get(id(element))
                if para is None:
                    continue
                text = para.text.strip()
                if not text:
                    continue
                style_name = (para.style.name or "").lower() if para.style else ""
                if "heading" in style_name:
                    parts.append(f"{'#' * _heading_level(style_name)} {text}")
                else:
                    parts.append(text)
            elif tag == "tbl":
                tbl = tables_by_element.get(id(element))
                if tbl is None:
                    continue
                md_table = _table_to_markdown(tbl)
                if md_table:
                    parts.append(md_table)

        embedded_images = self._extract_docx_images(doc)
        if embedded_images:
            parts.append("\n## Embedded Images\n")
            for desc in embedded_images:
                parts.append(f"- {desc}")

        return "\n\n".join(parts)

    def _extract_docx_images(self, doc) -> List[str]:
        """List a placeholder for every image relationship in the document.

        No alt text is read: each relationship whose type mentions "image"
        contributes the fixed string "[Embedded image]".

        Args:
            doc: A python-docx document object.

        Returns:
            One placeholder per image relationship. Best effort: if walking
            the relationships fails, whatever was collected so far is
            returned.
        """
        descriptions: List[str] = []
        try:
            for rel in doc.part.rels.values():
                if "image" in rel.reltype:
                    descriptions.append("[Embedded image]")
        except Exception as e:
            # Image listing is optional; report the problem instead of hiding
            # it, but never fail the whole parse because of it.
            print(f"[DoclingService] Embedded image scan failed: {e}")
        return descriptions


def _heading_level(style_name: str) -> int:
    """Return the markdown heading level implied by a heading style name.

    The first ASCII digit in the name is used (for example "heading 2" gives
    2) and clamped to the 1-6 range that markdown supports. Names without a
    digit map to level 1.
    """
    for ch in style_name:
        if ch in "0123456789":
            return min(max(int(ch), 1), _MAX_MARKDOWN_HEADING_LEVEL)
    return 1


def _find_paragraph_by_element(doc, element):
    """Find a Paragraph object matching the given XML element.

    Returns None when no paragraph of ``doc`` wraps ``element``. Each call
    scans ``doc.paragraphs``, so avoid calling it once per element in a loop.
    """
    for para in doc.paragraphs:
        if para._element is element:
            return para
    return None


def _find_table_by_element(doc, element):
    """Find a Table object matching the given XML element.

    Returns None when no table of ``doc`` wraps ``element``. Each call scans
    ``doc.tables``, so avoid calling it once per element in a loop.
    """
    for table in doc.tables:
        if table._element is element:
            return table
    return None


def _escape_table_cell(text: str) -> str:
    """Make cell text safe to embed in a single markdown table row."""
    # A raw newline would end the table row, so keep line breaks as <br>.
    single_line = "<br>".join(line.strip() for line in text.strip().splitlines())
    return single_line.replace("|", "\\|")


def _table_to_markdown(table) -> str:
    """Convert a python-docx Table to a markdown table string.

    The first row is used as the header. A horizontally merged cell is
    written once and its remaining grid positions are left empty. Short rows
    are padded so every row has as many columns as the widest one.

    Args:
        table: A python-docx table (anything exposing ``rows`` whose items
            expose ``cells`` with a ``text`` attribute).

    Returns:
        The markdown table, or an empty string when there is nothing to show.
    """
    rows: List[List[str]] = []
    for row in table.rows:
        cells: List[str] = []
        previous_tc = None
        for cell in row.cells:
            # python-docx repeats a merged cell for every grid column it
            # spans, and the repeats share one underlying element. Comparing
            # that element instead of the text keeps neighbouring cells that
            # merely happen to hold equal values.
            tc = getattr(cell, "_tc", None)
            if tc is not None and tc is previous_tc:
                cells.append("")
            else:
                cells.append(_escape_table_cell(cell.text))
            previous_tc = tc
        rows.append(cells)

    if not rows:
        return ""

    # Size the table to the widest row so that no cell is dropped.
    width = max(len(cells) for cells in rows)
    if width == 0:
        return ""

    def render(cells: List[str]) -> str:
        padded = cells + [""] * (width - len(cells))
        return "| " + " | ".join(padded) + " |"

    lines = [render(rows[0]), "| " + " | ".join(["---"] * width) + " |"]
    lines.extend(render(cells) for cells in rows[1:])
    return "\n".join(lines)


async def extract_text_from_image_via_vision(image_bytes: bytes, mime_type: str = "image/png") -> Optional[str]:
    """Use Gemini vision to extract text from an image.

    Args:
        image_bytes: Raw image content.
        mime_type: MIME type sent along with the image.

    Returns:
        The extracted text, or None when the image is empty, the
        GOOGLE_API_KEY environment variable is not set, the model reports
        that it found no text, or the request fails.
    """
    if not image_bytes:
        # Nothing to analyse; skip the remote call.
        return None

    try:
        import google.genai as genai

        # The key is only checked here as an on/off switch; the client is
        # created without arguments and resolves its own credentials.
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            return None

        client = genai.Client()

        # NOTE(review): the image is passed as base64 text. Confirm that the
        # SDK expects base64 text rather than raw bytes in inline_data.data.
        b64 = base64.b64encode(image_bytes).decode("utf-8")

        # generate_content is a blocking call, so run it in a worker thread.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model="gemini-2.5-flash",
            contents=[
                {
                    "parts": [
                        {"text": "Extract all text from this image. Return only the extracted text, nothing else. If no text is found, return 'No text found'."},
                        {"inline_data": {"mime_type": mime_type, "data": b64}},
                    ]
                }
            ],
        )

        raw_text = response.text
        text = raw_text.strip() if raw_text else None
        if not text:
            return None
        # Tolerate quotes or a trailing period around the sentinel reply.
        if text.strip(" .'\"").lower() == "no text found":
            return None
        return text
    except Exception as e:
        # Vision extraction is optional, so every failure degrades to None.
        print(f"[DoclingService] Vision text extraction failed: {e}")
        return None