diff --git a/electron/app/ipc/setup_install_handlers.ts b/electron/app/ipc/setup_install_handlers.ts index 3ffa345b..a59cf8aa 100644 --- a/electron/app/ipc/setup_install_handlers.ts +++ b/electron/app/ipc/setup_install_handlers.ts @@ -4,11 +4,14 @@ * - setup:install-chrome — download Chromium (browser-snapshots) with progress */ -import { ipcMain, WebContents, shell } from "electron"; +import { ipcMain, WebContents } from "electron"; import fs from "fs"; import path from "path"; import os from "os"; import { spawn, spawnSync } from "child_process"; +import * as https from "https"; +import * as http from "http"; +import { IncomingMessage } from "http"; import puppeteer from "puppeteer"; import { Browser, @@ -19,8 +22,10 @@ import { } from "@puppeteer/browsers"; import { getSetupStatus } from "../utils/setup-dependencies"; import { + getImageMagickBinaryPath, getImageMagickDownloadUrl, getImageMagickManualInstallCommands, + getWindowsImageMagickInstallDir, isImageMagickInstalled, } from "../utils/imagemagick-check"; @@ -50,7 +55,7 @@ function sendChromeLog(wc: WebContents, level: string, text: string) { function sendImageMagickProgress( wc: WebContents, - phase: "installing" | "done" | "error", + phase: "downloading" | "installing" | "done" | "error", percent?: number, message?: string ) { @@ -100,6 +105,156 @@ function logManualImageMagickCommands(wc: WebContents) { } } +const MAX_DOWNLOAD_REDIRECTS = 5; +const MIN_IMAGEMAGICK_INSTALLER_SIZE_BYTES = 5 * 1024 * 1024; + +function formatBytes(bytes: number): string { + if (bytes <= 0) return "0 B"; + const mb = bytes / 1024 / 1024; + if (mb >= 1) return `${mb.toFixed(1)} MB`; + const kb = bytes / 1024; + if (kb >= 1) return `${kb.toFixed(0)} KB`; + return `${bytes} B`; +} + +function escapePowerShellSingleQuoted(value: string): string { + return value.replace(/'/g, "''"); +} + +function getFilenameFromUrl(url: string, fallback: string): string { + try { + const parsed = new URL(url); + const name = path.basename(parsed.pathname); + return name || fallback; + } catch { + return fallback; + } +} + +function downloadFileWithProgress( + wc: WebContents, + url: string, + destinationPath: string +): Promise { + return new Promise((resolve, reject) => { + const requestDownload = (requestUrl: string, redirects: number) => { + const requester = requestUrl.startsWith("https") ? https.get : http.get; + sendImageMagickLog(wc, "cmd", `GET ${requestUrl}`); + + requester(requestUrl, (res: IncomingMessage) => { + const statusCode = res.statusCode ?? 0; + if ( + [301, 302, 303, 307, 308].includes(statusCode) && + res.headers.location + ) { + if (redirects >= MAX_DOWNLOAD_REDIRECTS) { + reject(new Error("Too many redirects while downloading installer.")); + return; + } + const redirectUrl = new URL(res.headers.location, requestUrl).toString(); + sendImageMagickLog(wc, "info", `Redirecting to ${redirectUrl}`); + requestDownload(redirectUrl, redirects + 1); + return; + } + + if (statusCode !== 200) { + reject(new Error(`Download failed with HTTP ${statusCode}.`)); + return; + } + + const totalBytes = Number.parseInt( + String(res.headers["content-length"] ?? "0"), + 10 + ); + let downloadedBytes = 0; + + const file = fs.createWriteStream(destinationPath); + + res.on("data", (chunk: Buffer) => { + downloadedBytes += chunk.length; + const percent = + totalBytes > 0 + ? Math.min(99, Math.floor((downloadedBytes / totalBytes) * 100)) + : undefined; + const sizeLabel = + totalBytes > 0 + ? `${formatBytes(downloadedBytes)} / ${formatBytes(totalBytes)}` + : `${formatBytes(downloadedBytes)} downloaded`; + sendImageMagickProgress(wc, "downloading", percent, sizeLabel); + }); + + res.pipe(file); + + file.on("finish", () => { + file.close(() => { + if (downloadedBytes < MIN_IMAGEMAGICK_INSTALLER_SIZE_BYTES) { + fs.unlink(destinationPath, () => {}); + reject( + new Error( + `Downloaded file is too small (${formatBytes(downloadedBytes)}).` + ) + ); + return; + } + + sendImageMagickLog( + wc, + "ok", + `Download complete (${formatBytes(downloadedBytes)}).` + ); + resolve(); + }); + }); + + file.on("error", (err) => { + fs.unlink(destinationPath, () => {}); + reject(err); + }); + }).on("error", (err) => { + fs.unlink(destinationPath, () => {}); + reject(err); + }); + }; + + requestDownload(url, 0); + }); +} + +async function runWindowsExecutableInstaller( + wc: WebContents, + installerPath: string, + installerArgs: string[] +): Promise { + const escapedInstallerPath = escapePowerShellSingleQuoted(installerPath); + const argList = installerArgs + .map((arg) => `'${escapePowerShellSingleQuoted(arg)}'`) + .join(", "); + + const runViaPowerShell = async (runAsAdmin: boolean) => { + const verb = runAsAdmin ? " -Verb RunAs" : ""; + const script = `$p = Start-Process -FilePath '${escapedInstallerPath}' -ArgumentList ${argList}${verb} -Wait -PassThru; if ($p) { exit $p.ExitCode } else { exit 1 }`; + await runInstallCommand(wc, "powershell", [ + "-NoProfile", + "-ExecutionPolicy", + "Bypass", + "-Command", + script, + ]); + }; + + try { + sendImageMagickLog(wc, "info", "Running installer in user mode..."); + await runViaPowerShell(false); + } catch { + sendImageMagickLog( + wc, + "warn", + "User-mode install failed. Retrying with administrator rights..." + ); + await runViaPowerShell(true); + } +} + function runInstallCommand( wc: WebContents, command: string, @@ -282,23 +437,61 @@ export function setupSetupInstallHandlers() { await runInstallCommand(wc, brewCommand, ["install", "imagemagick"]); } else if (process.platform === "win32") { - if (commandExists("choco", ["-v"])) { - await runInstallCommand(wc, "choco", [ - "install", - "imagemagick.app", - "-y", - ]); - } else { - throw new Error( - "Chocolatey is not installed. Falling back to direct installer download." - ); - } + const installerUrl = getImageMagickDownloadUrl(); + const installerFilename = getFilenameFromUrl( + installerUrl, + "ImageMagick-installer.exe" + ); + const installerPath = path.join(os.tmpdir(), installerFilename); + const installDir = getWindowsImageMagickInstallDir(); + + fs.mkdirSync(installDir, { recursive: true }); + + sendImageMagickLog( + wc, + "info", + `Downloading ImageMagick installer (${installerFilename})...` + ); + sendImageMagickLog(wc, "cmd", `Install directory: ${installDir}`); + sendImageMagickProgress(wc, "downloading", 0, "Connecting..."); + + await downloadFileWithProgress(wc, installerUrl, installerPath); + + sendImageMagickProgress( + wc, + "installing", + undefined, + "Running installer..." + ); + + await runWindowsExecutableInstaller(wc, installerPath, [ + "/SP-", + "/VERYSILENT", + "/SUPPRESSMSGBOXES", + "/NORESTART", + `/DIR=${installDir}`, + ]); + + fs.unlink(installerPath, () => {}); + sendImageMagickLog(wc, "ok", "ImageMagick installer completed."); } else { throw new Error( "Unsupported platform for automatic install. Use manual install from the official download page." ); } + if (!isImageMagickInstalled()) { + throw new Error( + "ImageMagick installation command finished, but the binary was not detected." + ); + } + + sendImageMagickLog( + wc, + "ok", + `ImageMagick detected at ${getImageMagickBinaryPath()}` + ); + sendImageMagickProgress(wc, "done", 100, "ImageMagick install finished"); return { ok: true }; } catch (error) { @@ -310,9 +503,8 @@ export function setupSetupInstallHandlers() { sendImageMagickLog( wc, "info", - `Opening manual install link: ${downloadUrl}` + `Manual install URL: ${downloadUrl}` ); - await shell.openExternal(downloadUrl); sendImageMagickProgress( wc, "error", @@ -331,7 +523,11 @@ export function setupSetupInstallHandlers() { const installed = isImageMagickInstalled(); if (installed) { sendImageMagickProgress(wc, "done", 100, "ImageMagick detected"); - sendImageMagickLog(wc, "ok", "ImageMagick is installed and ready."); + sendImageMagickLog( + wc, + "ok", + `ImageMagick is installed and ready (${getImageMagickBinaryPath()}).` + ); return { ok: true }; } const message = diff --git a/electron/app/main.ts b/electron/app/main.ts index b478b7ca..420efdb8 100644 --- a/electron/app/main.ts +++ b/electron/app/main.ts @@ -14,7 +14,7 @@ import { checkDependenciesBeforeWindow } from "./utils/setup-dependencies"; import { getSofficePath, isLibreOfficeInstalled } from "./utils/libreoffice-check"; import { getPuppeteerExecutablePath, isChromeInstalled } from "./utils/puppeteer-check"; import { getLiteParseRunnerPath } from "./utils/liteparse-check"; -import { isImageMagickInstalled } from "./utils/imagemagick-check"; +import { getImageMagickBinaryPath, isImageMagickInstalled } from "./utils/imagemagick-check"; import { startUpdateChecker, stopUpdateChecker } from "./utils/update-checker"; @@ -125,7 +125,12 @@ async function startServers(fastApiPort: number, nextjsPort: number) { // Resolved by libreoffice-check.ts at startup; lets Python invoke the // exact binary path instead of relying on the system PATH. SOFFICE_PATH: getSofficePath(), + IMAGEMAGICK_BINARY: getImageMagickBinaryPath(), LITEPARSE_RUNNER_PATH: getLiteParseRunnerPath(), + // Use Electron's embedded runtime for LiteParse so parsing does not + // depend on a system-wide Node installation. + LITEPARSE_NODE_BINARY: process.execPath, + ELECTRON_RUN_AS_NODE: "1", }, isDev, ); diff --git a/electron/app/types/index.d.ts b/electron/app/types/index.d.ts index 10807ecf..0d066a2d 100644 --- a/electron/app/types/index.d.ts +++ b/electron/app/types/index.d.ts @@ -33,8 +33,14 @@ interface FastApiEnv { MIGRATE_DATABASE_ON_STARTUP?: string, /** Absolute path to the soffice binary resolved at startup by libreoffice-check.ts. */ SOFFICE_PATH?: string, + /** Absolute path to the ImageMagick binary resolved at startup by imagemagick-check.ts. */ + IMAGEMAGICK_BINARY?: string, /** Absolute path to the bundled LiteParse runner script. */ LITEPARSE_RUNNER_PATH?: string, + /** Binary path used by LiteParseService to execute liteparse_runner.mjs. */ + LITEPARSE_NODE_BINARY?: string, + /** Set to "1" when using the Electron binary as a Node runtime. */ + ELECTRON_RUN_AS_NODE?: string, } interface NextJsEnv { diff --git a/electron/app/utils/imagemagick-check.ts b/electron/app/utils/imagemagick-check.ts index 38d01be5..34985a74 100644 --- a/electron/app/utils/imagemagick-check.ts +++ b/electron/app/utils/imagemagick-check.ts @@ -1,5 +1,10 @@ +import fs from "fs"; +import os from "os"; +import path from "path"; import { spawnSync } from "child_process"; +let resolvedImageMagickBinaryPath = process.platform === "win32" ? "magick" : "convert"; + function canExecute(command: string, args: string[]): boolean { const result = spawnSync(command, args, { stdio: "pipe", @@ -8,12 +13,162 @@ function canExecute(command: string, args: string[]): boolean { return result.status === 0; } +function runCommand(command: string, args: string[]): string | null { + const result = spawnSync(command, args, { + stdio: ["ignore", "pipe", "pipe"], + encoding: "utf8", + windowsHide: true, + }); + if (result.status !== 0) return null; + + const stdout = (result.stdout ?? "").trim(); + return stdout.length > 0 ? stdout : null; +} + +function getWindowsInstallRootCandidates(): string[] { + const roots = new Set(); + + if (process.env.LOCALAPPDATA) roots.add(process.env.LOCALAPPDATA); + if (process.env.ProgramFiles) roots.add(process.env.ProgramFiles); + if (process.env["ProgramFiles(x86)"]) { + roots.add(process.env["ProgramFiles(x86)"] as string); + } + roots.add(path.join(os.homedir(), "AppData", "Local")); + + return Array.from(roots); +} + +export function getWindowsImageMagickInstallDir(): string { + const localAppData = + process.env.LOCALAPPDATA ?? path.join(os.homedir(), "AppData", "Local"); + return path.join(localAppData, "Presenton", "runtime", "imagemagick"); +} + +function collectWindowsImageMagickBinaryCandidates(): string[] { + const candidates: string[] = [ + path.join(getWindowsImageMagickInstallDir(), "magick.exe"), + ]; + + for (const root of getWindowsInstallRootCandidates()) { + try { + const entries = fs.readdirSync(root, { withFileTypes: true }); + for (const entry of entries) { + if (!entry.isDirectory() || !/^ImageMagick/i.test(entry.name)) { + continue; + } + candidates.push(path.join(root, entry.name, "magick.exe")); + } + } catch { + continue; + } + } + + return candidates; +} + +function resolveBrewCommandPath(): string | null { + const candidates = ["brew", "/opt/homebrew/bin/brew", "/usr/local/bin/brew"]; + for (const candidate of candidates) { + if (canExecute(candidate, ["--version"])) { + return candidate; + } + } + return null; +} + +function collectDarwinBrewImageMagickCandidates(): string[] { + const candidates: string[] = [ + "/opt/homebrew/bin/magick", + "/usr/local/bin/magick", + "/opt/homebrew/opt/imagemagick/bin/magick", + "/usr/local/opt/imagemagick/bin/magick", + ]; + + const brewCommand = resolveBrewCommandPath(); + if (!brewCommand) { + return candidates; + } + + const brewPrefix = runCommand(brewCommand, ["--prefix", "imagemagick"]); + if (brewPrefix) { + candidates.push(path.join(brewPrefix, "bin", "magick")); + } + + const brewCellar = runCommand(brewCommand, ["--cellar", "imagemagick"]); + if (brewCellar && fs.existsSync(brewCellar)) { + try { + const versions = fs + .readdirSync(brewCellar, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name) + .sort((a, b) => + b.localeCompare(a, undefined, { numeric: true, sensitivity: "base" }) + ); + + for (const version of versions) { + candidates.push(path.join(brewCellar, version, "bin", "magick")); + } + } catch { + // Ignore cellar enumeration errors and continue with other candidates. + } + } + + return candidates; +} + +function resolveImageMagickBinaryPath(): string | null { + const commandCandidates = process.platform === "win32" ? ["magick"] : ["magick", "convert"]; + for (const candidate of commandCandidates) { + if (canExecute(candidate, ["-version"])) { + return candidate; + } + } + + if (process.platform === "win32") { + for (const candidate of collectWindowsImageMagickBinaryCandidates()) { + if (fs.existsSync(candidate) && canExecute(candidate, ["-version"])) { + return candidate; + } + } + return null; + } + + if (process.platform === "darwin") { + for (const candidate of collectDarwinBrewImageMagickCandidates()) { + if (fs.existsSync(candidate) && canExecute(candidate, ["-version"])) { + return candidate; + } + } + } + + const unixCandidates = [ + "/opt/homebrew/bin/magick", + "/usr/local/bin/magick", + "/opt/local/bin/magick", + "/usr/bin/magick", + "/usr/local/bin/convert", + "/usr/bin/convert", + ]; + + for (const candidate of unixCandidates) { + if (fs.existsSync(candidate) && canExecute(candidate, ["-version"])) { + return candidate; + } + } + + return null; +} + export function isImageMagickInstalled(): boolean { - // ImageMagick 7+ command - if (canExecute("magick", ["-version"])) return true; - // Legacy command on Linux/macOS packages - if (canExecute("convert", ["-version"])) return true; - return false; + const resolved = resolveImageMagickBinaryPath(); + if (!resolved) return false; + + resolvedImageMagickBinaryPath = resolved; + return true; +} + +export function getImageMagickBinaryPath(): string { + return resolvedImageMagickBinaryPath; } export function getImageMagickDownloadUrl(): string { @@ -31,6 +186,8 @@ export function getImageMagickManualInstallCommands(): string[] { return [ "Download and run the installer:", getImageMagickDownloadUrl(), + "Recommended install path:", + getWindowsImageMagickInstallDir(), ]; } @@ -40,6 +197,8 @@ export function getImageMagickManualInstallCommands(): string[] { '/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"', "Install ImageMagick:", "brew install imagemagick", + "Verify detected binary path:", + "brew --prefix imagemagick", ]; } diff --git a/electron/resources/ui/setup-installer/index.html b/electron/resources/ui/setup-installer/index.html index 2d871226..aa398fe1 100644 --- a/electron/resources/ui/setup-installer/index.html +++ b/electron/resources/ui/setup-installer/index.html @@ -288,7 +288,7 @@ ? 'Presenton uses LibreOffice to generate custom templates from PPTX files.' : step === 'chrome' ? 'Presenton uses Chromium for export and slide rendering. Download it now (~150 MB).' - : 'Presenton uses ImageMagick for OCR/document conversion support. Linux uses apt, macOS installs Homebrew first (if needed) and then runs brew install imagemagick, and Windows uses Chocolatey with a direct installer fallback.'; + : 'Presenton uses ImageMagick for OCR/document conversion support. Linux uses apt, macOS installs Homebrew first (if needed) and then runs brew install imagemagick, and Windows downloads and installs it directly into the Presenton runtime.'; document.getElementById('btn-install').onclick = () => startInstall(step); document.getElementById('btn-skip').onclick = () => handleSkip(); showState('prompt'); @@ -315,7 +315,7 @@ }); } else { document.getElementById('dl-heading').textContent = 'Installing ImageMagick'; - document.getElementById('dl-phase').textContent = 'Linux: apt-get | macOS: Homebrew + brew install | Windows: choco or direct installer'; + document.getElementById('dl-phase').textContent = 'Linux: apt-get | macOS: Homebrew + brew install | Windows: direct installer (Presenton runtime)'; window.setupInstaller.installImageMagick().then((installResult) => { if (!installResult || !installResult.ok) { if (currentStep !== 'imagemagick') return; diff --git a/electron/servers/fastapi/services/document_conversion_service.py b/electron/servers/fastapi/services/document_conversion_service.py index 497f12ec..8de7ec7f 100644 --- a/electron/servers/fastapi/services/document_conversion_service.py +++ b/electron/servers/fastapi/services/document_conversion_service.py @@ -1,5 +1,6 @@ import os import subprocess +import logging from pathlib import Path from typing import Dict, List @@ -8,6 +9,23 @@ class DocumentConversionError(Exception): pass +LOGGER = logging.getLogger(__name__) +_LOG_SNIPPET_LIMIT = 600 + + +def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str: + text = (value or "").strip() + if not text: + return "" + if len(text) <= limit: + return text + return f"{text[:limit]}... [truncated {len(text) - limit} chars]" + + +def _command_str(parts: list[str]) -> str: + return " ".join(repr(part) for part in parts) + + def _windows_hidden_subprocess_kwargs() -> Dict[str, object]: if os.name != "nt": return {} @@ -39,6 +57,8 @@ class DocumentConversionService: [command, *args], capture_output=True, text=True, + encoding="utf-8", + errors="replace", timeout=10, check=False, **_windows_hidden_subprocess_kwargs(), @@ -71,23 +91,39 @@ class DocumentConversionService: } try: + command = [ + self.soffice_binary, + "--headless", + "--convert-to", + "pdf", + "--outdir", + output_dir, + file_path, + ] + LOGGER.info( + "[DocumentConversion] LibreOffice conversion start input=%s output_dir=%s", + file_path, + output_dir, + ) subprocess.run( - [ - self.soffice_binary, - "--headless", - "--convert-to", - "pdf", - "--outdir", - output_dir, - file_path, - ], + command, check=True, capture_output=True, text=True, + encoding="utf-8", + errors="replace", timeout=timeout_seconds, **_windows_hidden_subprocess_kwargs(), ) + LOGGER.info( + "[DocumentConversion] LibreOffice conversion complete input=%s", + file_path, + ) except subprocess.TimeoutExpired as exc: + LOGGER.error( + "[DocumentConversion] LibreOffice timed out command=%s", + _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]), + ) raise DocumentConversionError( f"LibreOffice conversion timed out for {os.path.basename(file_path)}" ) from exc @@ -95,10 +131,19 @@ class DocumentConversionService: stderr = (exc.stderr or "").strip() stdout = (exc.stdout or "").strip() details = stderr or stdout or str(exc) + LOGGER.error( + "[DocumentConversion] LibreOffice failed code=%s command=%s stderr=%s stdout=%s", + exc.returncode, + _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]), + _snippet(stderr), + _snippet(stdout), + ) raise DocumentConversionError( - f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details}" + f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details} " + f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})" ) from exc except Exception as exc: + LOGGER.exception("[DocumentConversion] LibreOffice conversion unexpected error") raise DocumentConversionError( f"LibreOffice conversion failed for {os.path.basename(file_path)}: {exc}" ) from exc @@ -133,15 +178,31 @@ class DocumentConversionService: command = [self.imagemagick_binary, file_path, str(output_path)] try: + LOGGER.info( + "[DocumentConversion] ImageMagick conversion start input=%s output=%s command=%s", + file_path, + output_path, + _command_str(command), + ) subprocess.run( command, check=True, capture_output=True, text=True, + encoding="utf-8", + errors="replace", timeout=timeout_seconds, **_windows_hidden_subprocess_kwargs(), ) + LOGGER.info( + "[DocumentConversion] ImageMagick conversion complete output=%s", + output_path, + ) except subprocess.TimeoutExpired as exc: + LOGGER.error( + "[DocumentConversion] ImageMagick timed out command=%s", + _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]), + ) raise DocumentConversionError( f"ImageMagick conversion timed out for {os.path.basename(file_path)}" ) from exc @@ -149,10 +210,19 @@ class DocumentConversionService: stderr = (exc.stderr or "").strip() stdout = (exc.stdout or "").strip() details = stderr or stdout or str(exc) + LOGGER.error( + "[DocumentConversion] ImageMagick failed code=%s command=%s stderr=%s stdout=%s", + exc.returncode, + _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]), + _snippet(stderr), + _snippet(stdout), + ) raise DocumentConversionError( - f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details}" + f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details} " + f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})" ) from exc except Exception as exc: + LOGGER.exception("[DocumentConversion] ImageMagick conversion unexpected error") raise DocumentConversionError( f"ImageMagick conversion failed for {os.path.basename(file_path)}: {exc}" ) from exc diff --git a/electron/servers/fastapi/services/documents_loader.py b/electron/servers/fastapi/services/documents_loader.py index 6dbc3f5c..e65a659a 100644 --- a/electron/servers/fastapi/services/documents_loader.py +++ b/electron/servers/fastapi/services/documents_loader.py @@ -1,8 +1,9 @@ import asyncio +import logging import os import tempfile from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, List, Optional, Tuple import pdfplumber from fastapi import HTTPException @@ -22,9 +23,11 @@ from utils.ocr_language import presentation_language_to_ocr_code # Optional fallback converter (primarily useful on Windows) try: - from services.lightweight_document_service import DocumentService + from services.lightweight_document_service import DocumentService as DocumentServiceCls except Exception: - DocumentService = None + DocumentServiceCls = None + +LOGGER = logging.getLogger(__name__) class DocumentsLoader: @@ -38,7 +41,9 @@ class DocumentsLoader: self._ocr_language = presentation_language_to_ocr_code(presentation_language) self.liteparse_service = LiteParseService() self.document_conversion_service = DocumentConversionService() - self.document_service = DocumentService() if DocumentService is not None else None + self.document_service: Any = ( + DocumentServiceCls() if DocumentServiceCls is not None else None + ) self._documents: List[str] = [] self._images: List[List[str]] = [] @@ -69,9 +74,14 @@ class DocumentsLoader: ) document = "" - imgs = [] + imgs: List[str] = [] extension = Path(file_path).suffix.lower() + LOGGER.info( + "[DocumentsLoader] Processing file=%s extension=%s", + file_path, + extension, + ) if extension in PDF_EXTENSIONS: document, imgs = await self.load_pdf( @@ -107,13 +117,18 @@ class DocumentsLoader: load_images: bool, temp_dir: Optional[str] = None, ) -> Tuple[str, List[str]]: - image_paths = [] + image_paths: List[str] = [] document: str = "" if load_text: document = await asyncio.to_thread(self._parse_with_liteparse, file_path) if load_images: + if temp_dir is None: + raise HTTPException( + status_code=400, + detail="temp_dir is required when load_images is true", + ) image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir) return document, image_paths @@ -154,16 +169,27 @@ class DocumentsLoader: def _parse_with_liteparse(self, file_path: str) -> str: try: + LOGGER.info("[DocumentsLoader] LiteParse start file=%s", file_path) return self.liteparse_service.parse_to_markdown( file_path, ocr_enabled=True, ocr_language=self._ocr_language, ) except (LiteParseError, DocumentConversionError) as exc: + LOGGER.warning( + "[DocumentsLoader] Primary parse failed file=%s error=%s", + file_path, + exc, + ) if self.document_service is not None: try: + LOGGER.info("[DocumentsLoader] Trying fallback parser file=%s", file_path) return self.document_service.parse_to_markdown(file_path) except Exception: + LOGGER.exception( + "[DocumentsLoader] Fallback parser failed file=%s", + file_path, + ) pass raise HTTPException( status_code=500, diff --git a/electron/servers/fastapi/services/liteparse_service.py b/electron/servers/fastapi/services/liteparse_service.py index eacaaca6..dca0835d 100644 --- a/electron/servers/fastapi/services/liteparse_service.py +++ b/electron/servers/fastapi/services/liteparse_service.py @@ -1,13 +1,41 @@ import json +import logging import os import subprocess -from typing import Any, Dict, Tuple +from typing import Any, Dict, Mapping, Tuple class LiteParseError(Exception): pass +LOGGER = logging.getLogger(__name__) +_LOG_SNIPPET_LIMIT = 600 + + +def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str: + text = (value or "").strip() + if not text: + return "" + if len(text) <= limit: + return text + return f"{text[:limit]}... [truncated {len(text) - limit} chars]" + + +def _command_str(parts: list[str]) -> str: + return " ".join(json.dumps(part) for part in parts) + + +def _subprocess_text_kwargs() -> Mapping[str, object]: + """Decode subprocess output consistently across platforms. + + Windows defaults to a locale-dependent code page (often cp1252), which can + crash while decoding UTF-8 output from Node tools. Use UTF-8 and replace + undecodable bytes to keep parsing resilient. + """ + return {"text": True, "encoding": "utf-8", "errors": "replace"} + + class LiteParseService: def __init__(self, timeout_seconds: int = 180): self.timeout_seconds = timeout_seconds @@ -16,6 +44,58 @@ class LiteParseService: self.runner_dir = os.path.dirname(self.runner_path) self._npm_project_root = self._resolve_npm_project_root() + def _build_node_env(self) -> Dict[str, str]: + """Build environment for Node subprocesses. + + When the configured runtime binary is not the canonical `node` executable + (for example Electron's app binary), force Node-compatible mode. + """ + env = os.environ.copy() + binary_name = os.path.basename(self.node_binary).lower() + if binary_name not in {"node", "node.exe"}: + env.setdefault("ELECTRON_RUN_AS_NODE", "1") + + # LiteParse checks ImageMagick availability with `which magick`. + # On macOS app launches, PATH often excludes Homebrew bins, even when + # IMAGEMAGICK_BINARY is configured to an absolute executable path. + path_entries = [p for p in (env.get("PATH") or "").split(os.pathsep) if p] + additional_entries = [] + + imagemagick_binary = (env.get("IMAGEMAGICK_BINARY") or "").strip() + if imagemagick_binary: + magick_dir = os.path.dirname(imagemagick_binary) + if magick_dir: + additional_entries.append(magick_dir) + + soffice_binary = (env.get("SOFFICE_PATH") or "").strip() + if soffice_binary: + soffice_dir = os.path.dirname(soffice_binary) + if soffice_dir: + additional_entries.append(soffice_dir) + + if os.name != "nt": + additional_entries.extend([ + "/opt/homebrew/bin", + "/usr/local/bin", + "/opt/local/bin", + "/usr/bin", + "/bin", + ]) + + deduped_additional_entries = [] + for entry in additional_entries: + normalized = entry.strip() + if not normalized or not os.path.isdir(normalized): + continue + if normalized in path_entries or normalized in deduped_additional_entries: + continue + deduped_additional_entries.append(normalized) + + if deduped_additional_entries: + env["PATH"] = os.pathsep.join(deduped_additional_entries + path_entries) + + return env + def _resolve_npm_project_root(self) -> str: """Directory whose node_modules contains @llamaindex/liteparse (runner dir or Electron app root).""" local_nm = os.path.join( @@ -76,8 +156,9 @@ class LiteParseService: cwd=self.runner_dir, check=True, capture_output=True, - text=True, timeout=10, + env=self._build_node_env(), + **_subprocess_text_kwargs(), ) except Exception as exc: return False, f"Node.js runtime is unavailable: {exc}" @@ -103,8 +184,9 @@ class LiteParseService: cwd=self._npm_project_root, check=True, capture_output=True, - text=True, timeout=20, + env=self._build_node_env(), + **_subprocess_text_kwargs(), ) except Exception as exc: return False, f"LiteParse dependency is unavailable: {exc}" @@ -151,21 +233,51 @@ class LiteParseService: if tessdata: command.extend(["--tessdata-path", tessdata]) + LOGGER.info( + "[LiteParse] Parsing file=%s ocr_enabled=%s ocr_language=%s", + file_path, + ocr_enabled, + ocr_language, + ) + process = subprocess.run( command, cwd=self._npm_project_root, capture_output=True, - text=True, timeout=self.timeout_seconds, - env=os.environ.copy(), + env=self._build_node_env(), + **_subprocess_text_kwargs(), ) - payload = self._decode_runner_output(process.stdout) + LOGGER.info( + "[LiteParse] Command finished returncode=%s command=%s", + process.returncode, + _command_str(command), + ) + + payload: Dict[str, Any] + try: + payload = self._decode_runner_output(process.stdout) + except LiteParseError as exc: + raise LiteParseError( + f"{exc}; returncode={process.returncode}; " + f"stderr={_snippet(process.stderr)}; stdout={_snippet(process.stdout)}" + ) from exc if process.returncode != 0: message = payload.get("error") or process.stderr.strip() or "Unknown error" + LOGGER.error( + "[LiteParse] Parse failed returncode=%s stderr=%s stdout=%s", + process.returncode, + _snippet(process.stderr), + _snippet(process.stdout), + ) raise LiteParseError(message) if not payload.get("ok"): + LOGGER.error( + "[LiteParse] Runner returned not-ok payload=%s", + _snippet(json.dumps(payload)), + ) raise LiteParseError(payload.get("error") or "LiteParse parse failed") return payload