- Fix SSE stream 500: use async_session_maker inside StreamingResponse generator (Depends session closes when endpoint returns, before streaming starts) - Fix template application: store template_name in prepare endpoint so worker uses the selected custom template instead of defaulting to "general" - Fix OverlayLoader: replace loading.gif with HamsterLoader component - Fix parse_mode default: change from "slides" to "layouts" to avoid 70+ layouts - Update Gemini Flash model to gemini-3.1-flash-image-preview - Improve DOCX parsing: python-docx for structured table extraction, OCR enabled - Add vision-based image text extraction via Gemini for uploaded images - Add LayoutParser integration for slide layout structure analysis - Add Phase 4 MVP features: transfer ownership, URL input, follow-up questions, attachment-to-slide mapping, content router Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
201 lines
6.8 KiB
Python
201 lines
6.8 KiB
Python
"""Document parsing service.
|
|
|
|
Uses Docling for PDF/PPTX and python-docx for DOCX (better table handling).
|
|
Optionally extracts text from embedded images via Gemini vision.
|
|
"""
|
|
import asyncio
|
|
import base64
|
|
import os
|
|
from typing import List, Optional
|
|
|
|
from docling.document_converter import (
|
|
DocumentConverter,
|
|
PdfFormatOption,
|
|
PowerpointFormatOption,
|
|
WordFormatOption,
|
|
)
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
from docling.datamodel.base_models import InputFormat
|
|
|
|
|
|
class DoclingService:
    """Convert PPTX, PDF and DOCX documents to markdown text.

    ``parse_to_markdown`` runs any supported file through Docling.
    ``parse_docx_structured`` parses DOCX with python-docx first and only
    falls back to Docling when that path raises.
    """

    # Markdown ATX headings only go up to six '#', while Word style names
    # can carry larger numbers (e.g. "Heading 7").
    _MAX_MARKDOWN_HEADING_LEVEL = 6

    def __init__(self):
        """Build a Docling converter for PPTX, PDF and DOCX with OCR enabled."""
        self.pipeline_options = PdfPipelineOptions()
        self.pipeline_options.do_ocr = True

        # The same pipeline options object is shared by all three formats.
        self.converter = DocumentConverter(
            allowed_formats=[InputFormat.PPTX, InputFormat.PDF, InputFormat.DOCX],
            format_options={
                InputFormat.DOCX: WordFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
                InputFormat.PPTX: PowerpointFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=self.pipeline_options,
                ),
            },
        )

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse any supported document to markdown via Docling."""
        result = self.converter.convert(file_path)
        return result.document.export_to_markdown()

    def parse_docx_structured(self, file_path: str) -> str:
        """Parse DOCX with python-docx for better table/structure handling.

        Falls back to Docling when the python-docx path raises for any
        reason; that includes python-docx not being installed, because the
        import happens inside the python-docx parser.
        """
        try:
            return self._parse_docx_with_python_docx(file_path)
        except Exception as e:
            print(f"[DoclingService] python-docx parsing failed ({e}), falling back to Docling")
            return self.parse_to_markdown(file_path)

    @staticmethod
    def _heading_level(style_name: str) -> int:
        """Return the markdown heading level for a lower-cased style name.

        The first decimal digit in the name is used ("heading 2" -> 2).
        Names without a digit map to level 1, and the result is clamped to
        the 1..6 range so the emitted heading is always valid markdown.
        """
        for ch in style_name:
            # isdecimal() (not isdigit()) so characters such as superscript
            # digits, which int() rejects, are skipped instead of raising.
            if ch.isdecimal():
                level = int(ch)
                return max(1, min(level, DoclingService._MAX_MARKDOWN_HEADING_LEVEL))
        return 1

    def _parse_docx_with_python_docx(self, file_path: str) -> str:
        """Extract text from DOCX using python-docx with proper table handling.

        Walks the document body in order, emitting headings as ``#`` lines,
        other non-empty paragraphs as plain text and tables as markdown
        tables, then appends an "Embedded Images" section when image
        relationships are present. Parts are joined with blank lines.
        """
        from docx import Document

        doc = Document(file_path)
        parts: List[str] = []

        # Index the wrapper objects once by XML element identity so each
        # body element is resolved in O(1) instead of rescanning
        # doc.paragraphs / doc.tables for every element.
        paragraphs_by_element = {id(p._element): p for p in doc.paragraphs}
        tables_by_element = {id(t._element): t for t in doc.tables}

        for element in doc.element.body:
            raw_tag = element.tag
            if not isinstance(raw_tag, str):
                # Non-element nodes (e.g. XML comments) have no string tag.
                continue
            # Drop the "{namespace}" prefix, leaving the local tag name.
            tag = raw_tag.split("}")[-1]

            if tag == "p":
                para = paragraphs_by_element.get(id(element))
                if para is None:
                    continue
                text = para.text.strip()
                if not text:
                    continue
                style_name = (para.style.name or "").lower() if para.style else ""
                if "heading" in style_name:
                    level = self._heading_level(style_name)
                    parts.append(f"{'#' * level} {text}")
                else:
                    parts.append(text)

            elif tag == "tbl":
                tbl = tables_by_element.get(id(element))
                if tbl is None:
                    continue
                md_table = _table_to_markdown(tbl)
                if md_table:
                    parts.append(md_table)

        # Also list embedded images, if any were found.
        embedded_images = self._extract_docx_images(doc)
        if embedded_images:
            parts.append("\n## Embedded Images\n")
            for desc in embedded_images:
                parts.append(f"- {desc}")

        return "\n\n".join(parts)

    def _extract_docx_images(self, doc) -> List[str]:
        """Return one placeholder per image relationship in the DOCX.

        Every relationship of ``doc.part`` whose type contains "image"
        contributes the string "[Embedded image]". This is best effort: if
        the relationships cannot be read, the failure is logged and whatever
        was collected so far is returned.
        """
        descriptions: List[str] = []
        try:
            for rel in doc.part.rels.values():
                if "image" in rel.reltype:
                    descriptions.append("[Embedded image]")
        except Exception as e:
            # Image listing is optional; never fail the whole parse over it.
            print(f"[DoclingService] Embedded image scan failed: {e}")
        return descriptions
|
|
|
|
|
|
def _find_paragraph_by_element(doc, element):
    """Return the paragraph in ``doc.paragraphs`` that wraps ``element``.

    Paragraphs are matched by object identity of their ``_element``
    attribute; the first match wins. Returns None when nothing matches.
    """
    candidates = (p for p in doc.paragraphs if p._element is element)
    return next(candidates, None)
|
|
|
|
|
|
def _find_table_by_element(doc, element):
    """Return the table in ``doc.tables`` that wraps ``element``.

    Tables are matched by object identity of their ``_element`` attribute;
    the first match wins. Returns None when nothing matches.
    """
    found = None
    for candidate in doc.tables:
        if candidate._element is not element:
            continue
        found = candidate
        break
    return found
|
|
|
|
|
|
def _table_to_markdown(table) -> str:
    """Convert a python-docx Table to a markdown table string.

    The first row becomes the header, followed by a ``---`` separator row
    and one line per remaining row. Cell text is stripped, ``|`` is escaped
    and line breaks inside a cell become ``<br>`` so each table row stays on
    one markdown line. Every row is padded to the widest row.

    Horizontally merged cells are emitted once and followed by empty cells.
    A merge is detected by cell identity (the same cell object / ``_tc``
    appearing in consecutive positions), not by comparing text, so distinct
    neighbouring cells that happen to hold the same value are both kept.

    Returns an empty string when the table has no rows or no cells.
    """
    rows = []
    for row in table.rows:
        rendered = []
        previous = None
        for cell in row.cells:
            # NOTE(review): relies on python-docx repeating the same cell
            # (same underlying ``_tc``) for every grid column a merged cell
            # spans — confirm against the installed python-docx version.
            identity = getattr(cell, "_tc", cell)
            if previous is not None and identity is previous:
                rendered.append("")  # continuation of a merged cell
            else:
                pieces = cell.text.strip().splitlines()
                text = "<br>".join(piece.strip() for piece in pieces)
                rendered.append(text.replace("|", "\\|"))
            previous = identity
        rows.append(rendered)

    if not rows:
        return ""

    # Size the table by its widest row so no cell is ever truncated.
    width = max(len(cells) for cells in rows)
    if width == 0:
        return ""

    def render(cells):
        padded = cells + [""] * (width - len(cells))
        return "| " + " | ".join(padded) + " |"

    header, *body = rows
    lines = [render(header), render(["---"] * width)]
    lines.extend(render(cells) for cells in body)
    return "\n".join(lines)
|
|
|
|
|
|
async def extract_text_from_image_via_vision(image_bytes: bytes, mime_type: str = "image/png") -> Optional[str]:
    """Use Gemini vision to extract text from an image.

    Args:
        image_bytes: Raw bytes of the image to read.
        mime_type: MIME type sent alongside the image data.

    Returns:
        The extracted text, or None when the image is empty, the
        GOOGLE_API_KEY environment variable is not set, the model reports
        that no text was found, or the request fails for any reason
        (failures are logged, never raised).
    """
    # Cheap guards first: nothing to send, or no credentials configured.
    if not image_bytes:
        return None
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        return None

    try:
        import google.genai as genai

        # NOTE(review): the client is created without an explicit key;
        # presumably the SDK reads GOOGLE_API_KEY from the environment — confirm.
        client = genai.Client()
        b64 = base64.b64encode(image_bytes).decode("utf-8")

        # generate_content is a blocking call, so run it in a worker thread
        # to keep the event loop responsive.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model="gemini-2.5-flash",
            contents=[
                {
                    "parts": [
                        {"text": "Extract all text from this image. Return only the extracted text, nothing else. If no text is found, return 'No text found'."},
                        {"inline_data": {"mime_type": mime_type, "data": b64}},
                    ]
                }
            ],
        )
        text = (response.text or "").strip()
        if not text:
            return None
        # The model does not always echo the sentinel verbatim (it may add
        # quotes or a trailing period), so compare without that decoration.
        if text.strip(" \t\r\n.!'\"`*").lower() == "no text found":
            return None
        return text
    except Exception as e:
        print(f"[DoclingService] Vision text extraction failed: {e}")
        return None
|