39 lines
1.1 KiB
Python
39 lines
1.1 KiB
Python
import io
|
|
import os
|
|
import subprocess
|
|
from typing import List
|
|
|
|
from image_processor.utils import get_page_images_from_pdf
|
|
|
|
|
|
def _libreoffice_command() -> List[str]:
    """Return the launcher configured in ``$LIBREOFFICE`` as an argv prefix.

    The value is tokenised rather than handed to a shell, so a setting such
    as ``soffice --headless`` keeps working while nothing in it is
    shell-evaluated.

    Raises:
        RuntimeError: If the variable is unset, blank, or has unbalanced
            quotes.
    """
    import shlex  # local import: only this helper needs it

    configured = os.getenv("LIBREOFFICE")
    if not configured or not configured.strip():
        raise RuntimeError(
            "Failed to convert PPTX to PDF: "
            "the LIBREOFFICE environment variable is not set"
        )
    try:
        if os.name == "nt":
            # Non-POSIX splitting keeps the backslashes of Windows paths but
            # leaves the surrounding double quotes on tokens; drop them.
            return [
                token.strip('"')
                for token in shlex.split(configured, posix=False)
            ]
        return shlex.split(configured)
    except ValueError as err:
        raise RuntimeError(
            "Failed to convert PPTX to PDF: "
            f"cannot parse the LIBREOFFICE environment variable ({err})"
        ) from err


def get_pdf_from_pptx(pptx_path: str, temp_dir: str) -> str:
    """Convert a PPTX file to PDF with LibreOffice and return the PDF path.

    The expected output is ``<temp_dir>/<pptx base name>.pdf``. If that file
    already exists it is returned as-is and LibreOffice is not invoked.
    Otherwise the launcher named by the ``LIBREOFFICE`` environment variable
    is run with ``--convert-to pdf`` up to four times, stopping as soon as the
    PDF appears. The captured stdout/stderr of every attempt is printed.

    Args:
        pptx_path: Path of the presentation to convert.
        temp_dir: Directory passed to LibreOffice as ``--outdir``; an empty
            string is sent as the current directory.

    Returns:
        The path of the PDF, built with ``os.path.join(temp_dir, ...)``.

    Raises:
        RuntimeError: If ``LIBREOFFICE`` is unset or unparsable, the launcher
            cannot be started, or no PDF exists after the last attempt.
    """
    base_name = os.path.splitext(os.path.basename(pptx_path))[0]
    pdf_path = os.path.join(temp_dir, f"{base_name}.pdf")
    if os.path.exists(pdf_path):
        return pdf_path

    # An argument list (no shell) keeps quotes, spaces, "$" and backticks in
    # the paths from being interpreted or from breaking the command line.
    command = [
        *_libreoffice_command(),
        "--convert-to",
        "pdf",
        pptx_path,
        "--outdir",
        temp_dir or os.curdir,
    ]

    max_attempts = 4
    for _ in range(max_attempts):
        try:
            result = subprocess.run(
                command,
                capture_output=True,
                text=True,
                check=False,
            )
        except OSError as err:
            # A launcher that cannot be started will not recover on retry.
            raise RuntimeError(
                f"Failed to convert PPTX to PDF: cannot run {command[0]!r}"
            ) from err
        print("LibreOffice stdout:", result.stdout)
        print("LibreOffice stderr:", result.stderr)

        # Check right after each run so a success on the final attempt is
        # honoured instead of being reported as a failure.
        if os.path.exists(pdf_path):
            return pdf_path

    raise RuntimeError("Failed to convert PPTX to PDF")
|
|
|
|
|
|
def get_images_from_pptx(pptx_path: str) -> List[str]:
    """Turn a PPTX file into page images via an intermediate PDF.

    The directory part of ``pptx_path`` is used as the working directory: the
    presentation is converted to PDF there with ``get_pdf_from_pptx``, and the
    resulting PDF path plus that same directory are handed to
    ``get_page_images_from_pdf``.

    Args:
        pptx_path: Path of the presentation to process.

    Returns:
        Whatever ``get_page_images_from_pdf`` returns (annotated here as a
        list of strings; presumably image paths, confirm against that helper).
    """
    work_dir = os.path.dirname(pptx_path)
    return get_page_images_from_pdf(
        get_pdf_from_pptx(pptx_path, work_dir),
        work_dir,
    )
|