presenton/electron/servers/fastapi/services/lightweight_document_service.py
sudipnext 3207422651 feat: add language parameter to decomposeDocuments API call
- Updated the decomposeDocuments method in PresentationGenerationApi to accept an optional language parameter.
- Modified the UploadPage component to pass the selected language from the config when calling the decomposeDocuments method.
2026-03-28 15:34:53 +05:45

177 lines
6.1 KiB
Python

"""
Lightweight document converter for Windows/MSIX compatibility.
Uses pure-Python libraries: pdfplumber for PDF, docx2everything for DOCX, python-pptx for PPTX.
No subprocess, no external runtimes, MSIX/Appx safe.
"""
import os
from typing import List, Optional
import docx2everything
import pdfplumber
from pptx import Presentation
class LightweightDocumentConverter:
    """Lightweight document converter supporting PDF, DOCX, and PPTX.

    The public entry point is :meth:`convert`, which dispatches on the file
    extension to one of the format-specific ``_convert_*`` methods and returns
    the extracted content as a markdown string.
    """

    # Numeric values of python-pptx's PP_PLACEHOLDER.TITLE (1) and
    # PP_PLACEHOLDER.CENTER_TITLE (3); the latter is the title placeholder
    # used by "Title Slide" layouts.
    _TITLE_PLACEHOLDER_TYPES = (1, 3)

    # Fallback heuristic: a text shape shorter than this may act as the title.
    _MAX_HEURISTIC_TITLE_LENGTH = 200

    # Two spaces per bullet level so nested items render as nested lists.
    _BULLET_INDENT = "  "

    def convert(self, file_path: str) -> str:
        """
        Convert document to markdown text.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text in markdown format

        Raises:
            ValueError: If file format is not supported
            FileNotFoundError: If file does not exist
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Extension matching is case-insensitive (".PDF" == ".pdf").
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == ".pdf":
            return self._convert_pdf(file_path)
        if file_ext == ".docx":
            return self._convert_docx(file_path)
        if file_ext == ".pptx":
            return self._convert_pptx(file_path)
        raise ValueError(f"Unsupported file format: {file_ext}")

    def _convert_pdf(self, path: str) -> str:
        """
        Convert PDF to markdown using pdfplumber.

        Every page becomes a ``## Page N`` section (1-based) followed by the
        text extracted from that page; sections are separated by blank lines.

        Args:
            path: Path to PDF file

        Returns:
            Extracted text in markdown format
        """
        sections: List[str] = []
        with pdfplumber.open(path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                # Keep the page heading even when no text could be extracted.
                page_body = page.extract_text() or ""
                sections.append(f"## Page {page_number}\n{page_body}")
        return "\n\n".join(sections)

    def _convert_docx(self, path: str) -> str:
        """
        Extract markdown from DOCX using docx2everything (no images).

        Args:
            path: Path to DOCX file

        Returns:
            Extracted markdown (no images); empty string if nothing was
            extracted
        """
        # No image directory is passed, so only markdown text is produced.
        markdown = docx2everything.process_to_markdown(path)
        return markdown if markdown else ""

    def _convert_pptx(self, path: str) -> str:
        """
        Convert PPTX to markdown using python-pptx.

        Each slide is rendered as a ``# <title>`` heading followed by its
        text content; slides are separated by a ``---`` rule.

        Args:
            path: Path to PPTX file

        Returns:
            Extracted text in markdown format
        """
        prs = Presentation(path)
        markdown_parts: List[str] = []
        for slide_num, slide in enumerate(prs.slides, start=1):
            # Walk the shape collection once and reuse the resulting list.
            shapes = list(slide.shapes)

            title_text = self._find_slide_title(shapes)
            if title_text:
                # A heading must stay on one line, so collapse line breaks
                # (including the "\v" soft breaks) into single spaces.
                heading = " ".join(title_text.split())
            else:
                heading = f"Slide {slide_num}"

            slide_parts: List[str] = [f"# {heading}"]
            for shape in shapes:
                slide_parts.extend(self._shape_to_markdown(shape, title_text))
            markdown_parts.append("\n".join(slide_parts))
        return "\n\n---\n\n".join(markdown_parts)

    @staticmethod
    def _shape_text(shape) -> str:
        """Return the stripped text of *shape*, or "" if it carries no text."""
        text = getattr(shape, "text", None)
        return text.strip() if isinstance(text, str) else ""

    def _find_slide_title(self, shapes) -> Optional[str]:
        """Return the slide title text, or None if no candidate is found.

        A non-empty title placeholder wins; otherwise the first text shape
        shorter than ``_MAX_HEURISTIC_TITLE_LENGTH`` characters is used.
        """
        for shape in shapes:
            # placeholder_format is only valid on placeholder shapes, so
            # check is_placeholder before touching it.
            if not getattr(shape, "is_placeholder", False):
                continue
            if shape.placeholder_format.type not in self._TITLE_PLACEHOLDER_TYPES:
                continue
            text = self._shape_text(shape)
            if text:
                return text

        # Heuristic fallback: titles are usually short.
        for shape in shapes:
            text = self._shape_text(shape)
            if text and len(text) < self._MAX_HEURISTIC_TITLE_LENGTH:
                return text
        return None

    def _shape_to_markdown(self, shape, title_text: Optional[str]) -> List[str]:
        """Return the markdown lines contributed by a single shape.

        Shapes without text, and shapes whose text equals the slide title
        (already emitted as the heading), contribute nothing. A shape with
        several paragraphs is rendered as a bullet list indented by paragraph
        level; anything else is emitted as its plain text.
        """
        text = self._shape_text(shape)
        if not text or text == title_text:
            return []

        text_frame = getattr(shape, "text_frame", None)
        paragraphs = list(text_frame.paragraphs) if text_frame is not None else []
        if len(paragraphs) <= 1:
            return [text]

        lines: List[str] = []
        for para in paragraphs:
            para_text = para.text.strip()
            if para_text:
                indent = self._BULLET_INDENT * para.level
                lines.append(f"{indent}- {para_text}")
        return lines
class DocumentService:
    """
    Thin facade over LightweightDocumentConverter.

    Exposes the same ``parse_to_markdown`` entry point as LiteParseService so
    it can be used as an optional Windows fallback.
    """

    def __init__(self):
        # All conversion work is delegated to this converter instance.
        self.converter = LightweightDocumentConverter()

    def parse_to_markdown(self, file_path: str) -> str:
        """
        Convert the document at ``file_path`` into markdown.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text in markdown format
        """
        markdown = self.converter.convert(file_path)
        return markdown