Merge pull request #484 from presenton/fix/imagemagick-issues

Fix/imagemagick issues
This commit is contained in:
Sudip Parajuli 2026-03-31 14:42:45 +05:45 committed by GitHub
commit 08b0726f80
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 621 additions and 47 deletions

View file

@ -4,11 +4,14 @@
* - setup:install-chrome download Chromium (browser-snapshots) with progress
*/
import { ipcMain, WebContents, shell } from "electron";
import { ipcMain, WebContents } from "electron";
import fs from "fs";
import path from "path";
import os from "os";
import { spawn, spawnSync } from "child_process";
import * as https from "https";
import * as http from "http";
import { IncomingMessage } from "http";
import puppeteer from "puppeteer";
import {
Browser,
@ -19,8 +22,10 @@ import {
} from "@puppeteer/browsers";
import { getSetupStatus } from "../utils/setup-dependencies";
import {
getImageMagickBinaryPath,
getImageMagickDownloadUrl,
getImageMagickManualInstallCommands,
getWindowsImageMagickInstallDir,
isImageMagickInstalled,
} from "../utils/imagemagick-check";
@ -50,7 +55,7 @@ function sendChromeLog(wc: WebContents, level: string, text: string) {
function sendImageMagickProgress(
wc: WebContents,
phase: "installing" | "done" | "error",
phase: "downloading" | "installing" | "done" | "error",
percent?: number,
message?: string
) {
@ -100,6 +105,156 @@ function logManualImageMagickCommands(wc: WebContents) {
}
}
// Maximum number of HTTP redirects followed before the installer download is rejected.
const MAX_DOWNLOAD_REDIRECTS = 5;
// Downloads smaller than this many bytes (5 * 1024 * 1024) are rejected as "too small".
const MIN_IMAGEMAGICK_INSTALLER_SIZE_BYTES = 5 * 1024 * 1024;
/**
 * Renders a byte count as a short label for progress and log messages.
 *
 * Values of one mebibyte or more are shown with one decimal and an "MB" suffix,
 * values of one kibibyte or more are rounded to a whole number with a "KB"
 * suffix, and anything smaller is shown as raw bytes. Zero and negative counts
 * are reported as "0 B".
 *
 * @param bytes Number of bytes to describe.
 * @returns The formatted label.
 */
function formatBytes(bytes: number): string {
  if (bytes <= 0) {
    return "0 B";
  }

  const kibibytes = bytes / 1024;
  const mebibytes = kibibytes / 1024;

  if (mebibytes >= 1) {
    return `${mebibytes.toFixed(1)} MB`;
  }
  return kibibytes >= 1 ? `${kibibytes.toFixed(0)} KB` : `${bytes} B`;
}
/**
 * Escapes a value so it can be embedded between single quotes in a PowerShell
 * script.
 *
 * PowerShell terminates a single-quoted string not only on the ASCII apostrophe
 * but also on the typographic single quotes U+2018, U+2019, U+201A and U+201B,
 * so each of those characters is doubled as well. Paths that contain such a
 * character (for example a user profile directory) would otherwise end the
 * literal early.
 *
 * @param value Raw text to place inside a single-quoted PowerShell literal.
 * @returns The text with every single-quote character doubled.
 */
function escapePowerShellSingleQuoted(value: string): string {
  // "$&" is the matched character, so each quote variant is doubled with itself.
  return value.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}
/**
 * Extracts the last path segment of a URL to use as a local file name.
 *
 * @param url Absolute URL of the resource.
 * @param fallback Name returned when the URL cannot be parsed or its path has
 *   no final segment.
 * @returns The final path segment (query string and fragment excluded), or
 *   `fallback`.
 */
function getFilenameFromUrl(url: string, fallback: string): string {
  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not a parseable absolute URL.
    return fallback;
  }

  const lastSegment = path.basename(pathname);
  return lastSegment.length > 0 ? lastSegment : fallback;
}
/**
 * Downloads `url` to `destinationPath`, following redirects and reporting
 * progress to the renderer.
 *
 * Progress is sent through `sendImageMagickProgress` with phase "downloading";
 * the percentage is capped at 99 while streaming and is `undefined` when the
 * server sends no usable Content-Length.
 *
 * The promise rejects (after a best-effort removal of any partial file) when:
 * - more than `MAX_DOWNLOAD_REDIRECTS` redirects are encountered,
 * - a redirect carries an unparseable `Location` header,
 * - the final response status is not 200,
 * - the connection or response stream fails or is closed before the full body
 *   has been received,
 * - writing the file fails, or
 * - the downloaded file is smaller than `MIN_IMAGEMAGICK_INSTALLER_SIZE_BYTES`.
 *
 * @param wc Renderer that receives log and progress messages.
 * @param url Initial download URL (http or https).
 * @param destinationPath Local path the response body is written to.
 * @returns A promise that resolves once the file is fully written and closed.
 */
function downloadFileWithProgress(
  wc: WebContents,
  url: string,
  destinationPath: string
): Promise<void> {
  return new Promise((resolve, reject) => {
    // Several failure signals can fire for one broken transfer (stream error,
    // premature close, write error); only the first one settles the promise.
    let settled = false;
    const fail = (err: Error) => {
      if (settled) return;
      settled = true;
      // Best effort: the file may not exist yet, or may still be closing.
      fs.unlink(destinationPath, () => {});
      reject(err);
    };

    const requestDownload = (requestUrl: string, redirects: number) => {
      const requester = requestUrl.startsWith("https") ? https.get : http.get;
      sendImageMagickLog(wc, "cmd", `GET ${requestUrl}`);

      requester(requestUrl, (res: IncomingMessage) => {
        const statusCode = res.statusCode ?? 0;

        if (
          [301, 302, 303, 307, 308].includes(statusCode) &&
          res.headers.location
        ) {
          // Drain the redirect body so the underlying socket is released.
          res.resume();
          if (redirects >= MAX_DOWNLOAD_REDIRECTS) {
            fail(new Error("Too many redirects while downloading installer."));
            return;
          }
          let redirectUrl: string;
          try {
            redirectUrl = new URL(res.headers.location, requestUrl).toString();
          } catch {
            fail(new Error("Download failed: invalid redirect location."));
            return;
          }
          sendImageMagickLog(wc, "info", `Redirecting to ${redirectUrl}`);
          requestDownload(redirectUrl, redirects + 1);
          return;
        }

        if (statusCode !== 200) {
          // Discard the error body; nothing reads it.
          res.resume();
          fail(new Error(`Download failed with HTTP ${statusCode}.`));
          return;
        }

        const totalBytes = Number.parseInt(
          String(res.headers["content-length"] ?? "0"),
          10
        );
        let downloadedBytes = 0;
        const file = fs.createWriteStream(destinationPath);

        // Stops writing and rejects; used when the response side breaks.
        const abortTransfer = (err: Error) => {
          file.destroy();
          fail(err);
        };

        res.on("data", (chunk: Buffer) => {
          downloadedBytes += chunk.length;
          const percent =
            totalBytes > 0
              ? Math.min(99, Math.floor((downloadedBytes / totalBytes) * 100))
              : undefined;
          const sizeLabel =
            totalBytes > 0
              ? `${formatBytes(downloadedBytes)} / ${formatBytes(totalBytes)}`
              : `${formatBytes(downloadedBytes)} downloaded`;
          sendImageMagickProgress(wc, "downloading", percent, sizeLabel);
        });
        res.on("error", abortTransfer);
        res.on("close", () => {
          // A response closed before the whole message arrived never emits
          // "end", so the write stream would never finish and the promise
          // would hang without this check.
          if (!res.complete) {
            abortTransfer(
              new Error("Download was interrupted before it completed.")
            );
          }
        });

        res.pipe(file);

        file.on("finish", () => {
          file.close(() => {
            if (settled) return;
            if (downloadedBytes < MIN_IMAGEMAGICK_INSTALLER_SIZE_BYTES) {
              fail(
                new Error(
                  `Downloaded file is too small (${formatBytes(downloadedBytes)}).`
                )
              );
              return;
            }
            settled = true;
            sendImageMagickLog(
              wc,
              "ok",
              `Download complete (${formatBytes(downloadedBytes)}).`
            );
            resolve();
          });
        });
        file.on("error", (err) => {
          // Stop pulling data from the network once the disk write has failed.
          res.destroy();
          fail(err);
        });
      }).on("error", fail);
    };

    requestDownload(url, 0);
  });
}
/**
 * Runs a Windows installer executable through PowerShell `Start-Process`,
 * first as the current user and, if that attempt throws, again with
 * `-Verb RunAs` (elevation prompt).
 *
 * The PowerShell script exits with the installer's exit code, or 1 when
 * `Start-Process` returns no process object. The retry relies on
 * `runInstallCommand` rejecting when the command fails — NOTE(review): that
 * helper is defined elsewhere in this file; confirm it rejects on a non-zero
 * exit code.
 *
 * @param wc Renderer that receives log messages.
 * @param installerPath Path of the installer executable.
 * @param installerArgs Arguments passed to the installer.
 * @returns A promise that resolves when one of the two attempts succeeds and
 *   rejects with the error of the elevated attempt otherwise.
 */
async function runWindowsExecutableInstaller(
  wc: WebContents,
  installerPath: string,
  installerArgs: string[]
): Promise<void> {
  const escapedInstallerPath = escapePowerShellSingleQuoted(installerPath);

  // Start-Process joins -ArgumentList elements with spaces and does not quote
  // them, so an element containing whitespace (e.g. "/DIR=C:\Users\Jane Doe\...")
  // must carry its own double quotes to reach the installer as one argument.
  const quoteForStartProcess = (arg: string): string =>
    /\s/.test(arg) && !arg.includes('"') ? `"${arg}"` : arg;

  const argList = installerArgs
    .map((arg) => `'${escapePowerShellSingleQuoted(quoteForStartProcess(arg))}'`)
    .join(", ");
  // "-ArgumentList" with nothing after it is a PowerShell syntax error, so the
  // parameter is omitted entirely when there are no arguments.
  const argumentListClause = argList.length > 0 ? ` -ArgumentList ${argList}` : "";

  const runViaPowerShell = async (runAsAdmin: boolean) => {
    const verb = runAsAdmin ? " -Verb RunAs" : "";
    const script = `$p = Start-Process -FilePath '${escapedInstallerPath}'${argumentListClause}${verb} -Wait -PassThru; if ($p) { exit $p.ExitCode } else { exit 1 }`;
    await runInstallCommand(wc, "powershell", [
      "-NoProfile",
      "-ExecutionPolicy",
      "Bypass",
      "-Command",
      script,
    ]);
  };

  try {
    sendImageMagickLog(wc, "info", "Running installer in user mode...");
    await runViaPowerShell(false);
  } catch {
    // Any failure of the unelevated attempt triggers one elevated retry.
    sendImageMagickLog(
      wc,
      "warn",
      "User-mode install failed. Retrying with administrator rights..."
    );
    await runViaPowerShell(true);
  }
}
function runInstallCommand(
wc: WebContents,
command: string,
@ -282,23 +437,61 @@ export function setupSetupInstallHandlers() {
await runInstallCommand(wc, brewCommand, ["install", "imagemagick"]);
} else if (process.platform === "win32") {
if (commandExists("choco", ["-v"])) {
await runInstallCommand(wc, "choco", [
"install",
"imagemagick.app",
"-y",
]);
} else {
throw new Error(
"Chocolatey is not installed. Falling back to direct installer download."
);
}
const installerUrl = getImageMagickDownloadUrl();
const installerFilename = getFilenameFromUrl(
installerUrl,
"ImageMagick-installer.exe"
);
const installerPath = path.join(os.tmpdir(), installerFilename);
const installDir = getWindowsImageMagickInstallDir();
fs.mkdirSync(installDir, { recursive: true });
sendImageMagickLog(
wc,
"info",
`Downloading ImageMagick installer (${installerFilename})...`
);
sendImageMagickLog(wc, "cmd", `Install directory: ${installDir}`);
sendImageMagickProgress(wc, "downloading", 0, "Connecting...");
await downloadFileWithProgress(wc, installerUrl, installerPath);
sendImageMagickProgress(
wc,
"installing",
undefined,
"Running installer..."
);
await runWindowsExecutableInstaller(wc, installerPath, [
"/SP-",
"/VERYSILENT",
"/SUPPRESSMSGBOXES",
"/NORESTART",
`/DIR=${installDir}`,
]);
fs.unlink(installerPath, () => {});
sendImageMagickLog(wc, "ok", "ImageMagick installer completed.");
} else {
throw new Error(
"Unsupported platform for automatic install. Use manual install from the official download page."
);
}
if (!isImageMagickInstalled()) {
throw new Error(
"ImageMagick installation command finished, but the binary was not detected."
);
}
sendImageMagickLog(
wc,
"ok",
`ImageMagick detected at ${getImageMagickBinaryPath()}`
);
sendImageMagickProgress(wc, "done", 100, "ImageMagick install finished");
return { ok: true };
} catch (error) {
@ -310,9 +503,8 @@ export function setupSetupInstallHandlers() {
sendImageMagickLog(
wc,
"info",
`Opening manual install link: ${downloadUrl}`
`Manual install URL: ${downloadUrl}`
);
await shell.openExternal(downloadUrl);
sendImageMagickProgress(
wc,
"error",
@ -331,7 +523,11 @@ export function setupSetupInstallHandlers() {
const installed = isImageMagickInstalled();
if (installed) {
sendImageMagickProgress(wc, "done", 100, "ImageMagick detected");
sendImageMagickLog(wc, "ok", "ImageMagick is installed and ready.");
sendImageMagickLog(
wc,
"ok",
`ImageMagick is installed and ready (${getImageMagickBinaryPath()}).`
);
return { ok: true };
}
const message =

View file

@ -14,7 +14,7 @@ import { checkDependenciesBeforeWindow } from "./utils/setup-dependencies";
import { getSofficePath, isLibreOfficeInstalled } from "./utils/libreoffice-check";
import { getPuppeteerExecutablePath, isChromeInstalled } from "./utils/puppeteer-check";
import { getLiteParseRunnerPath } from "./utils/liteparse-check";
import { isImageMagickInstalled } from "./utils/imagemagick-check";
import { getImageMagickBinaryPath, isImageMagickInstalled } from "./utils/imagemagick-check";
import { startUpdateChecker, stopUpdateChecker } from "./utils/update-checker";
@ -125,7 +125,12 @@ async function startServers(fastApiPort: number, nextjsPort: number) {
// Resolved by libreoffice-check.ts at startup; lets Python invoke the
// exact binary path instead of relying on the system PATH.
SOFFICE_PATH: getSofficePath(),
IMAGEMAGICK_BINARY: getImageMagickBinaryPath(),
LITEPARSE_RUNNER_PATH: getLiteParseRunnerPath(),
// Use Electron's embedded runtime for LiteParse so parsing does not
// depend on a system-wide Node installation.
LITEPARSE_NODE_BINARY: process.execPath,
ELECTRON_RUN_AS_NODE: "1",
},
isDev,
);

View file

@ -33,8 +33,14 @@ interface FastApiEnv {
MIGRATE_DATABASE_ON_STARTUP?: string,
/** Absolute path to the soffice binary resolved at startup by libreoffice-check.ts. */
SOFFICE_PATH?: string,
/** Absolute path to the ImageMagick binary resolved at startup by imagemagick-check.ts. */
IMAGEMAGICK_BINARY?: string,
/** Absolute path to the bundled LiteParse runner script. */
LITEPARSE_RUNNER_PATH?: string,
/** Binary path used by LiteParseService to execute liteparse_runner.mjs. */
LITEPARSE_NODE_BINARY?: string,
/** Set to "1" when using the Electron binary as a Node runtime. */
ELECTRON_RUN_AS_NODE?: string,
}
interface NextJsEnv {

View file

@ -1,5 +1,10 @@
import fs from "fs";
import os from "os";
import path from "path";
import { spawnSync } from "child_process";
// Command name or path returned by getImageMagickBinaryPath(). Starts as the
// platform default ("magick" on Windows, "convert" elsewhere); the only other
// assignment in this file is inside isImageMagickInstalled().
let resolvedImageMagickBinaryPath = process.platform === "win32" ? "magick" : "convert";
function canExecute(command: string, args: string[]): boolean {
const result = spawnSync(command, args, {
stdio: "pipe",
@ -8,12 +13,162 @@ function canExecute(command: string, args: string[]): boolean {
return result.status === 0;
}
/**
 * Runs a command synchronously and returns its trimmed standard output.
 *
 * stdin is ignored, stdout and stderr are captured as UTF-8 text, and no
 * console window is shown on Windows.
 *
 * @param command Executable name or path.
 * @param args Arguments passed to the executable.
 * @returns The trimmed stdout, or `null` when the exit status is not 0 or the
 *   command printed nothing but whitespace.
 */
function runCommand(command: string, args: string[]): string | null {
  const { status, stdout } = spawnSync(command, args, {
    stdio: ["ignore", "pipe", "pipe"],
    encoding: "utf8",
    windowsHide: true,
  });

  if (status !== 0) {
    return null;
  }

  const output = (stdout ?? "").trim();
  return output.length === 0 ? null : output;
}
/**
 * Lists the directories that are scanned for ImageMagick installations on
 * Windows.
 *
 * The list contains, in order and without duplicates, the non-empty values of
 * the LOCALAPPDATA, ProgramFiles and ProgramFiles(x86) environment variables,
 * followed by `<home>/AppData/Local`.
 *
 * @returns The candidate root directories.
 */
function getWindowsInstallRootCandidates(): string[] {
  const fromEnvironment = [
    process.env.LOCALAPPDATA,
    process.env.ProgramFiles,
    process.env["ProgramFiles(x86)"],
  ].filter((root): root is string => Boolean(root));

  const homeLocalAppData = path.join(os.homedir(), "AppData", "Local");

  // A Set keeps first-insertion order, so duplicates collapse onto the
  // earliest occurrence.
  return [...new Set([...fromEnvironment, homeLocalAppData])];
}
/**
 * Returns the directory used for the Presenton-managed ImageMagick install on
 * Windows: `<LOCALAPPDATA>/Presenton/runtime/imagemagick`.
 *
 * When the LOCALAPPDATA environment variable is not set, `<home>/AppData/Local`
 * is used as the base instead.
 *
 * @returns The install directory path.
 */
export function getWindowsImageMagickInstallDir(): string {
  let baseDir = process.env.LOCALAPPDATA;
  if (baseDir == null) {
    baseDir = path.join(os.homedir(), "AppData", "Local");
  }
  return path.join(baseDir, "Presenton", "runtime", "imagemagick");
}
/**
 * Builds the list of `magick.exe` paths worth probing on Windows.
 *
 * The Presenton-managed install directory comes first, followed by
 * `<root>/<dir>/magick.exe` for every directory whose name starts with
 * "ImageMagick" (case-insensitive) directly under each install root. Roots
 * that cannot be listed are skipped. The paths are not checked for existence
 * here.
 *
 * @returns Candidate executable paths in probing order.
 */
function collectWindowsImageMagickBinaryCandidates(): string[] {
  const found = [path.join(getWindowsImageMagickInstallDir(), "magick.exe")];

  for (const root of getWindowsInstallRootCandidates()) {
    try {
      for (const dirent of fs.readdirSync(root, { withFileTypes: true })) {
        const looksLikeInstall =
          dirent.isDirectory() && /^ImageMagick/i.test(dirent.name);
        if (looksLikeInstall) {
          found.push(path.join(root, dirent.name, "magick.exe"));
        }
      }
    } catch {
      // Unreadable or missing root: move on to the next one.
    }
  }

  return found;
}
/**
 * Finds a Homebrew executable that can actually be run.
 *
 * Tries `brew` from PATH first, then the two fixed locations
 * `/opt/homebrew/bin/brew` and `/usr/local/bin/brew`, and accepts the first
 * one for which `--version` runs successfully.
 *
 * @returns The working command or path, or `null` when none can be executed.
 */
function resolveBrewCommandPath(): string | null {
  const brewLocations = ["brew", "/opt/homebrew/bin/brew", "/usr/local/bin/brew"];
  const working = brewLocations.find((location) =>
    canExecute(location, ["--version"])
  );
  return working ?? null;
}
/**
 * Builds the list of `magick` paths worth probing for a Homebrew install on
 * macOS.
 *
 * Four fixed Homebrew locations come first. When a working `brew` command is
 * found, the list is extended with `<brew --prefix imagemagick>/bin/magick`
 * and with `<cellar>/<version>/bin/magick` for each version directory in
 * `brew --cellar imagemagick`, ordered by descending numeric-aware name. The
 * paths are not checked for existence here.
 *
 * @returns Candidate executable paths in probing order.
 */
function collectDarwinBrewImageMagickCandidates(): string[] {
  const paths: string[] = [
    "/opt/homebrew/bin/magick",
    "/usr/local/bin/magick",
    "/opt/homebrew/opt/imagemagick/bin/magick",
    "/usr/local/opt/imagemagick/bin/magick",
  ];

  const brew = resolveBrewCommandPath();
  if (!brew) {
    return paths;
  }

  const prefix = runCommand(brew, ["--prefix", "imagemagick"]);
  if (prefix) {
    paths.push(path.join(prefix, "bin", "magick"));
  }

  const cellar = runCommand(brew, ["--cellar", "imagemagick"]);
  if (!cellar || !fs.existsSync(cellar)) {
    return paths;
  }

  try {
    const versionDirs = fs
      .readdirSync(cellar, { withFileTypes: true })
      .filter((dirent) => dirent.isDirectory())
      .map((dirent) => dirent.name);
    // Highest version name first.
    versionDirs.sort((left, right) =>
      right.localeCompare(left, undefined, { numeric: true, sensitivity: "base" })
    );
    versionDirs.forEach((versionDir) => {
      paths.push(path.join(cellar, versionDir, "bin", "magick"));
    });
  } catch {
    // Ignore cellar enumeration errors and continue with other candidates.
  }

  return paths;
}
/**
 * Locates a runnable ImageMagick executable.
 *
 * Probing order:
 * 1. Commands on PATH: `magick` on Windows; `magick`, then `convert` elsewhere.
 * 2. Windows only: the paths from collectWindowsImageMagickBinaryCandidates();
 *    the search ends there.
 * 3. macOS only: the paths from collectDarwinBrewImageMagickCandidates().
 * 4. A fixed list of common Unix locations.
 *
 * A PATH command is accepted when `-version` runs successfully; a file path
 * must additionally exist.
 *
 * @returns The first working command or path, or `null` when none is found.
 */
function resolveImageMagickBinaryPath(): string | null {
  const isUsableFile = (candidate: string): boolean =>
    fs.existsSync(candidate) && canExecute(candidate, ["-version"]);

  const onWindows = process.platform === "win32";

  const pathCommands = onWindows ? ["magick"] : ["magick", "convert"];
  const fromPath = pathCommands.find((name) => canExecute(name, ["-version"]));
  if (fromPath !== undefined) {
    return fromPath;
  }

  if (onWindows) {
    return collectWindowsImageMagickBinaryCandidates().find(isUsableFile) ?? null;
  }

  if (process.platform === "darwin") {
    const fromBrew = collectDarwinBrewImageMagickCandidates().find(isUsableFile);
    if (fromBrew !== undefined) {
      return fromBrew;
    }
  }

  const wellKnownLocations = [
    "/opt/homebrew/bin/magick",
    "/usr/local/bin/magick",
    "/opt/local/bin/magick",
    "/usr/bin/magick",
    "/usr/local/bin/convert",
    "/usr/bin/convert",
  ];
  return wellKnownLocations.find(isUsableFile) ?? null;
}
/**
 * Reports whether a runnable ImageMagick executable can be found and, when it
 * can, records its command or path for getImageMagickBinaryPath().
 *
 * Detection is delegated entirely to resolveImageMagickBinaryPath(). The
 * earlier PATH-only checks are removed: they returned before the resolver ran,
 * and their unconditional `return false` made the resolver and the path
 * caching below unreachable.
 *
 * @returns `true` when an executable was resolved, `false` otherwise.
 */
export function isImageMagickInstalled(): boolean {
  const resolved = resolveImageMagickBinaryPath();
  if (!resolved) return false;
  resolvedImageMagickBinaryPath = resolved;
  return true;
}
/**
 * Returns the current value of the module-level `resolvedImageMagickBinaryPath`
 * variable. No detection is performed by this call.
 */
export function getImageMagickBinaryPath(): string {
return resolvedImageMagickBinaryPath;
}
export function getImageMagickDownloadUrl(): string {
@ -31,6 +186,8 @@ export function getImageMagickManualInstallCommands(): string[] {
return [
"Download and run the installer:",
getImageMagickDownloadUrl(),
"Recommended install path:",
getWindowsImageMagickInstallDir(),
];
}
@ -40,6 +197,8 @@ export function getImageMagickManualInstallCommands(): string[] {
'/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"',
"Install ImageMagick:",
"brew install imagemagick",
"Verify detected binary path:",
"brew --prefix imagemagick",
];
}

View file

@ -288,7 +288,7 @@
? '<strong>Presenton</strong> uses LibreOffice to generate custom templates from PPTX files.'
: step === 'chrome'
? '<strong>Presenton</strong> uses Chromium for export and slide rendering. Download it now (~150 MB).'
: '<strong>Presenton</strong> uses ImageMagick for OCR/document conversion support. Linux uses apt, macOS installs Homebrew first (if needed) and then runs brew install imagemagick, and Windows uses Chocolatey with a direct installer fallback.';
: '<strong>Presenton</strong> uses ImageMagick for OCR/document conversion support. Linux uses apt, macOS installs Homebrew first (if needed) and then runs brew install imagemagick, and Windows downloads and installs it directly into the Presenton runtime.';
document.getElementById('btn-install').onclick = () => startInstall(step);
document.getElementById('btn-skip').onclick = () => handleSkip();
showState('prompt');
@ -315,7 +315,7 @@
});
} else {
document.getElementById('dl-heading').textContent = 'Installing ImageMagick';
document.getElementById('dl-phase').textContent = 'Linux: apt-get | macOS: Homebrew + brew install | Windows: choco or direct installer';
document.getElementById('dl-phase').textContent = 'Linux: apt-get | macOS: Homebrew + brew install | Windows: direct installer (Presenton runtime)';
window.setupInstaller.installImageMagick().then((installResult) => {
if (!installResult || !installResult.ok) {
if (currentStep !== 'imagemagick') return;

View file

@ -1,5 +1,6 @@
import os
import subprocess
import logging
from pathlib import Path
from typing import Dict, List
@ -8,6 +9,23 @@ class DocumentConversionError(Exception):
pass
LOGGER = logging.getLogger(__name__)
_LOG_SNIPPET_LIMIT = 600
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
text = (value or "").strip()
if not text:
return "<empty>"
if len(text) <= limit:
return text
return f"{text[:limit]}... [truncated {len(text) - limit} chars]"
def _command_str(parts: list[str]) -> str:
return " ".join(repr(part) for part in parts)
def _windows_hidden_subprocess_kwargs() -> Dict[str, object]:
if os.name != "nt":
return {}
@ -39,6 +57,8 @@ class DocumentConversionService:
[command, *args],
capture_output=True,
text=True,
encoding="utf-8",
errors="replace",
timeout=10,
check=False,
**_windows_hidden_subprocess_kwargs(),
@ -71,23 +91,39 @@ class DocumentConversionService:
}
try:
command = [
self.soffice_binary,
"--headless",
"--convert-to",
"pdf",
"--outdir",
output_dir,
file_path,
]
LOGGER.info(
"[DocumentConversion] LibreOffice conversion start input=%s output_dir=%s",
file_path,
output_dir,
)
subprocess.run(
[
self.soffice_binary,
"--headless",
"--convert-to",
"pdf",
"--outdir",
output_dir,
file_path,
],
command,
check=True,
capture_output=True,
text=True,
encoding="utf-8",
errors="replace",
timeout=timeout_seconds,
**_windows_hidden_subprocess_kwargs(),
)
LOGGER.info(
"[DocumentConversion] LibreOffice conversion complete input=%s",
file_path,
)
except subprocess.TimeoutExpired as exc:
LOGGER.error(
"[DocumentConversion] LibreOffice timed out command=%s",
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
)
raise DocumentConversionError(
f"LibreOffice conversion timed out for {os.path.basename(file_path)}"
) from exc
@ -95,10 +131,19 @@ class DocumentConversionService:
stderr = (exc.stderr or "").strip()
stdout = (exc.stdout or "").strip()
details = stderr or stdout or str(exc)
LOGGER.error(
"[DocumentConversion] LibreOffice failed code=%s command=%s stderr=%s stdout=%s",
exc.returncode,
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
_snippet(stderr),
_snippet(stdout),
)
raise DocumentConversionError(
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details}"
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details} "
f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
) from exc
except Exception as exc:
LOGGER.exception("[DocumentConversion] LibreOffice conversion unexpected error")
raise DocumentConversionError(
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {exc}"
) from exc
@ -133,15 +178,31 @@ class DocumentConversionService:
command = [self.imagemagick_binary, file_path, str(output_path)]
try:
LOGGER.info(
"[DocumentConversion] ImageMagick conversion start input=%s output=%s command=%s",
file_path,
output_path,
_command_str(command),
)
subprocess.run(
command,
check=True,
capture_output=True,
text=True,
encoding="utf-8",
errors="replace",
timeout=timeout_seconds,
**_windows_hidden_subprocess_kwargs(),
)
LOGGER.info(
"[DocumentConversion] ImageMagick conversion complete output=%s",
output_path,
)
except subprocess.TimeoutExpired as exc:
LOGGER.error(
"[DocumentConversion] ImageMagick timed out command=%s",
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
)
raise DocumentConversionError(
f"ImageMagick conversion timed out for {os.path.basename(file_path)}"
) from exc
@ -149,10 +210,19 @@ class DocumentConversionService:
stderr = (exc.stderr or "").strip()
stdout = (exc.stdout or "").strip()
details = stderr or stdout or str(exc)
LOGGER.error(
"[DocumentConversion] ImageMagick failed code=%s command=%s stderr=%s stdout=%s",
exc.returncode,
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
_snippet(stderr),
_snippet(stdout),
)
raise DocumentConversionError(
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details}"
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details} "
f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
) from exc
except Exception as exc:
LOGGER.exception("[DocumentConversion] ImageMagick conversion unexpected error")
raise DocumentConversionError(
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {exc}"
) from exc

View file

@ -1,8 +1,9 @@
import asyncio
import logging
import os
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple
from typing import Any, List, Optional, Tuple
import pdfplumber
from fastapi import HTTPException
@ -22,9 +23,11 @@ from utils.ocr_language import presentation_language_to_ocr_code
# Optional fallback converter (primarily useful on Windows)
try:
from services.lightweight_document_service import DocumentService
from services.lightweight_document_service import DocumentService as DocumentServiceCls
except Exception:
DocumentService = None
DocumentServiceCls = None
LOGGER = logging.getLogger(__name__)
class DocumentsLoader:
@ -38,7 +41,9 @@ class DocumentsLoader:
self._ocr_language = presentation_language_to_ocr_code(presentation_language)
self.liteparse_service = LiteParseService()
self.document_conversion_service = DocumentConversionService()
self.document_service = DocumentService() if DocumentService is not None else None
self.document_service: Any = (
DocumentServiceCls() if DocumentServiceCls is not None else None
)
self._documents: List[str] = []
self._images: List[List[str]] = []
@ -69,9 +74,14 @@ class DocumentsLoader:
)
document = ""
imgs = []
imgs: List[str] = []
extension = Path(file_path).suffix.lower()
LOGGER.info(
"[DocumentsLoader] Processing file=%s extension=%s",
file_path,
extension,
)
if extension in PDF_EXTENSIONS:
document, imgs = await self.load_pdf(
@ -107,13 +117,18 @@ class DocumentsLoader:
load_images: bool,
temp_dir: Optional[str] = None,
) -> Tuple[str, List[str]]:
image_paths = []
image_paths: List[str] = []
document: str = ""
if load_text:
document = await asyncio.to_thread(self._parse_with_liteparse, file_path)
if load_images:
if temp_dir is None:
raise HTTPException(
status_code=400,
detail="temp_dir is required when load_images is true",
)
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
return document, image_paths
@ -154,16 +169,27 @@ class DocumentsLoader:
def _parse_with_liteparse(self, file_path: str) -> str:
try:
LOGGER.info("[DocumentsLoader] LiteParse start file=%s", file_path)
return self.liteparse_service.parse_to_markdown(
file_path,
ocr_enabled=True,
ocr_language=self._ocr_language,
)
except (LiteParseError, DocumentConversionError) as exc:
LOGGER.warning(
"[DocumentsLoader] Primary parse failed file=%s error=%s",
file_path,
exc,
)
if self.document_service is not None:
try:
LOGGER.info("[DocumentsLoader] Trying fallback parser file=%s", file_path)
return self.document_service.parse_to_markdown(file_path)
except Exception:
LOGGER.exception(
"[DocumentsLoader] Fallback parser failed file=%s",
file_path,
)
pass
raise HTTPException(
status_code=500,

View file

@ -1,13 +1,41 @@
import json
import logging
import os
import subprocess
from typing import Any, Dict, Tuple
from typing import Any, Dict, Mapping, Tuple
class LiteParseError(Exception):
    """Exception type raised by the LiteParse service code in this module."""
LOGGER = logging.getLogger(__name__)
_LOG_SNIPPET_LIMIT = 600
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
text = (value or "").strip()
if not text:
return "<empty>"
if len(text) <= limit:
return text
return f"{text[:limit]}... [truncated {len(text) - limit} chars]"
def _command_str(parts: list[str]) -> str:
return " ".join(json.dumps(part) for part in parts)
def _subprocess_text_kwargs() -> Mapping[str, object]:
"""Decode subprocess output consistently across platforms.
Windows defaults to a locale-dependent code page (often cp1252), which can
crash while decoding UTF-8 output from Node tools. Use UTF-8 and replace
undecodable bytes to keep parsing resilient.
"""
return {"text": True, "encoding": "utf-8", "errors": "replace"}
class LiteParseService:
def __init__(self, timeout_seconds: int = 180):
self.timeout_seconds = timeout_seconds
@ -16,6 +44,58 @@ class LiteParseService:
self.runner_dir = os.path.dirname(self.runner_path)
self._npm_project_root = self._resolve_npm_project_root()
def _build_node_env(self) -> Dict[str, str]:
"""Build environment for Node subprocesses.
When the configured runtime binary is not the canonical `node` executable
(for example Electron's app binary), force Node-compatible mode.
"""
env = os.environ.copy()
binary_name = os.path.basename(self.node_binary).lower()
if binary_name not in {"node", "node.exe"}:
env.setdefault("ELECTRON_RUN_AS_NODE", "1")
# LiteParse checks ImageMagick availability with `which magick`.
# On macOS app launches, PATH often excludes Homebrew bins, even when
# IMAGEMAGICK_BINARY is configured to an absolute executable path.
path_entries = [p for p in (env.get("PATH") or "").split(os.pathsep) if p]
additional_entries = []
imagemagick_binary = (env.get("IMAGEMAGICK_BINARY") or "").strip()
if imagemagick_binary:
magick_dir = os.path.dirname(imagemagick_binary)
if magick_dir:
additional_entries.append(magick_dir)
soffice_binary = (env.get("SOFFICE_PATH") or "").strip()
if soffice_binary:
soffice_dir = os.path.dirname(soffice_binary)
if soffice_dir:
additional_entries.append(soffice_dir)
if os.name != "nt":
additional_entries.extend([
"/opt/homebrew/bin",
"/usr/local/bin",
"/opt/local/bin",
"/usr/bin",
"/bin",
])
deduped_additional_entries = []
for entry in additional_entries:
normalized = entry.strip()
if not normalized or not os.path.isdir(normalized):
continue
if normalized in path_entries or normalized in deduped_additional_entries:
continue
deduped_additional_entries.append(normalized)
if deduped_additional_entries:
env["PATH"] = os.pathsep.join(deduped_additional_entries + path_entries)
return env
def _resolve_npm_project_root(self) -> str:
"""Directory whose node_modules contains @llamaindex/liteparse (runner dir or Electron app root)."""
local_nm = os.path.join(
@ -76,8 +156,9 @@ class LiteParseService:
cwd=self.runner_dir,
check=True,
capture_output=True,
text=True,
timeout=10,
env=self._build_node_env(),
**_subprocess_text_kwargs(),
)
except Exception as exc:
return False, f"Node.js runtime is unavailable: {exc}"
@ -103,8 +184,9 @@ class LiteParseService:
cwd=self._npm_project_root,
check=True,
capture_output=True,
text=True,
timeout=20,
env=self._build_node_env(),
**_subprocess_text_kwargs(),
)
except Exception as exc:
return False, f"LiteParse dependency is unavailable: {exc}"
@ -151,21 +233,51 @@ class LiteParseService:
if tessdata:
command.extend(["--tessdata-path", tessdata])
LOGGER.info(
"[LiteParse] Parsing file=%s ocr_enabled=%s ocr_language=%s",
file_path,
ocr_enabled,
ocr_language,
)
process = subprocess.run(
command,
cwd=self._npm_project_root,
capture_output=True,
text=True,
timeout=self.timeout_seconds,
env=os.environ.copy(),
env=self._build_node_env(),
**_subprocess_text_kwargs(),
)
payload = self._decode_runner_output(process.stdout)
LOGGER.info(
"[LiteParse] Command finished returncode=%s command=%s",
process.returncode,
_command_str(command),
)
payload: Dict[str, Any]
try:
payload = self._decode_runner_output(process.stdout)
except LiteParseError as exc:
raise LiteParseError(
f"{exc}; returncode={process.returncode}; "
f"stderr={_snippet(process.stderr)}; stdout={_snippet(process.stdout)}"
) from exc
if process.returncode != 0:
message = payload.get("error") or process.stderr.strip() or "Unknown error"
LOGGER.error(
"[LiteParse] Parse failed returncode=%s stderr=%s stdout=%s",
process.returncode,
_snippet(process.stderr),
_snippet(process.stdout),
)
raise LiteParseError(message)
if not payload.get("ok"):
LOGGER.error(
"[LiteParse] Runner returned not-ok payload=%s",
_snippet(json.dumps(payload)),
)
raise LiteParseError(payload.get("error") or "LiteParse parse failed")
return payload