Merge pull request #484 from presenton/fix/imagemagick-issues
Fix/imagemagick issues
This commit is contained in:
commit
08b0726f80
8 changed files with 621 additions and 47 deletions
|
|
@ -4,11 +4,14 @@
|
|||
* - setup:install-chrome — download Chromium (browser-snapshots) with progress
|
||||
*/
|
||||
|
||||
import { ipcMain, WebContents, shell } from "electron";
|
||||
import { ipcMain, WebContents } from "electron";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import { spawn, spawnSync } from "child_process";
|
||||
import * as https from "https";
|
||||
import * as http from "http";
|
||||
import { IncomingMessage } from "http";
|
||||
import puppeteer from "puppeteer";
|
||||
import {
|
||||
Browser,
|
||||
|
|
@ -19,8 +22,10 @@ import {
|
|||
} from "@puppeteer/browsers";
|
||||
import { getSetupStatus } from "../utils/setup-dependencies";
|
||||
import {
|
||||
getImageMagickBinaryPath,
|
||||
getImageMagickDownloadUrl,
|
||||
getImageMagickManualInstallCommands,
|
||||
getWindowsImageMagickInstallDir,
|
||||
isImageMagickInstalled,
|
||||
} from "../utils/imagemagick-check";
|
||||
|
||||
|
|
@ -50,7 +55,7 @@ function sendChromeLog(wc: WebContents, level: string, text: string) {
|
|||
|
||||
function sendImageMagickProgress(
|
||||
wc: WebContents,
|
||||
phase: "installing" | "done" | "error",
|
||||
phase: "downloading" | "installing" | "done" | "error",
|
||||
percent?: number,
|
||||
message?: string
|
||||
) {
|
||||
|
|
@ -100,6 +105,156 @@ function logManualImageMagickCommands(wc: WebContents) {
|
|||
}
|
||||
}
|
||||
|
||||
const MAX_DOWNLOAD_REDIRECTS = 5;
|
||||
const MIN_IMAGEMAGICK_INSTALLER_SIZE_BYTES = 5 * 1024 * 1024;
|
||||
|
||||
/**
 * Format a byte count as a short human-readable label for progress messages.
 *
 * Values are rendered in MB (one decimal), KB (no decimals) or plain bytes.
 * Non-finite or non-positive inputs yield "0 B" so a bad header value never
 * leaks "NaN"/"Infinity" into the UI.
 *
 * @param bytes - Number of bytes to format.
 * @returns A label such as "12.3 MB", "512 KB" or "42 B".
 */
function formatBytes(bytes: number): string {
  if (!Number.isFinite(bytes) || bytes <= 0) return "0 B";

  const kb = bytes / 1024;
  const mb = kb / 1024;

  // Promote to MB when the KB figure would otherwise round up to "1024 KB".
  if (mb >= 1 || Math.round(kb) >= 1024) return `${mb.toFixed(1)} MB`;
  if (kb >= 1) return `${kb.toFixed(0)} KB`;
  return `${bytes} B`;
}
|
||||
|
||||
/**
 * Escape a value for embedding inside a PowerShell single-quoted string.
 *
 * PowerShell accepts the ASCII apostrophe and the typographic single quotes
 * (U+2018, U+2019, U+201A, U+201B) as single-quote delimiters, so each of
 * them is doubled to keep the value from terminating the literal early.
 *
 * @param value - Raw text to place between single quotes.
 * @returns The text with every single-quote character doubled.
 */
function escapePowerShellSingleQuoted(value: string): string {
  // "$&" is the matched character; emitting it twice doubles the quote.
  return value.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}
|
||||
|
||||
/**
 * Derive a local file name from the last path segment of a URL.
 *
 * The segment is percent-decoded and then reduced to a bare file name, so an
 * encoded path separator cannot make the result point outside the directory
 * it is later joined with.
 *
 * @param url - Absolute URL of the resource being downloaded.
 * @param fallback - Name to use when the URL is invalid or has no usable segment.
 * @returns The decoded file name, or `fallback`.
 */
function getFilenameFromUrl(url: string, fallback: string): string {
  let segment: string;
  try {
    // URL paths always use "/", regardless of the host platform.
    segment = path.posix.basename(new URL(url).pathname);
  } catch {
    return fallback;
  }

  let decoded = segment;
  try {
    decoded = decodeURIComponent(segment);
  } catch {
    // Malformed percent-encoding: keep the raw segment.
  }

  // win32.basename strips both "/" and "\" that decoding may have introduced.
  const name = path.win32.basename(decoded);
  if (name === "" || name === "." || name === "..") {
    return fallback;
  }
  return name;
}
|
||||
|
||||
/**
 * Download `url` to `destinationPath`, streaming progress to the renderer.
 *
 * Follows up to MAX_DOWNLOAD_REDIRECTS redirects, emits "downloading"
 * progress events for every received chunk, and rejects when the server
 * answers with a non-200 status, the connection stalls or drops before the
 * body is complete, the file cannot be written, or the result is smaller
 * than MIN_IMAGEMAGICK_INSTALLER_SIZE_BYTES.
 *
 * @param wc - Renderer that receives log and progress messages.
 * @param url - Initial URL to fetch (http or https).
 * @param destinationPath - File path the response body is written to.
 * @returns A promise that resolves once the file is fully written and closed.
 */
function downloadFileWithProgress(
  wc: WebContents,
  url: string,
  destinationPath: string
): Promise<void> {
  // Abort when the socket stays idle this long so a stalled server cannot
  // leave the setup flow waiting forever.
  const idleTimeoutMs = 60 * 1000;
  const redirectStatusCodes = [301, 302, 303, 307, 308];

  return new Promise((resolve, reject) => {
    // Several failure events can fire for one broken transfer; settle once.
    let settled = false;

    const fail = (err: Error, removePartialFile: boolean) => {
      if (settled) return;
      settled = true;
      if (removePartialFile) {
        fs.unlink(destinationPath, () => {});
      }
      reject(err);
    };

    const requestDownload = (requestUrl: string, redirects: number) => {
      const requester = requestUrl.startsWith("https") ? https.get : http.get;
      sendImageMagickLog(wc, "cmd", `GET ${requestUrl}`);

      const request = requester(requestUrl, (res: IncomingMessage) => {
        const statusCode = res.statusCode ?? 0;

        if (redirectStatusCodes.includes(statusCode) && res.headers.location) {
          // Drain the redirect body so its socket is released.
          res.resume();

          if (redirects >= MAX_DOWNLOAD_REDIRECTS) {
            fail(new Error("Too many redirects while downloading installer."), false);
            return;
          }

          let redirectUrl: string;
          try {
            redirectUrl = new URL(res.headers.location, requestUrl).toString();
          } catch {
            fail(
              new Error(`Invalid redirect location: ${res.headers.location}`),
              false
            );
            return;
          }

          sendImageMagickLog(wc, "info", `Redirecting to ${redirectUrl}`);
          requestDownload(redirectUrl, redirects + 1);
          return;
        }

        if (statusCode !== 200) {
          res.resume();
          fail(new Error(`Download failed with HTTP ${statusCode}.`), false);
          return;
        }

        const totalBytes = Number.parseInt(
          String(res.headers["content-length"] ?? "0"),
          10
        );
        let downloadedBytes = 0;

        const file = fs.createWriteStream(destinationPath);

        // Tear down both streams and reject; safe to call more than once.
        const abortTransfer = (err: Error) => {
          res.unpipe(file);
          file.destroy();
          res.destroy();
          fail(err, true);
        };

        res.on("data", (chunk: Buffer) => {
          downloadedBytes += chunk.length;
          const percent =
            totalBytes > 0
              ? Math.min(99, Math.floor((downloadedBytes / totalBytes) * 100))
              : undefined;
          const sizeLabel =
            totalBytes > 0
              ? `${formatBytes(downloadedBytes)} / ${formatBytes(totalBytes)}`
              : `${formatBytes(downloadedBytes)} downloaded`;
          sendImageMagickProgress(wc, "downloading", percent, sizeLabel);
        });

        // pipe() does not forward source errors, and a dropped connection
        // never emits "end"; without these handlers the promise would hang.
        res.on("error", abortTransfer);
        res.on("close", () => {
          if (!res.complete) {
            abortTransfer(
              new Error("Connection closed before the download completed.")
            );
          }
        });

        res.pipe(file);

        file.on("finish", () => {
          file.close(() => {
            if (settled) return;

            if (downloadedBytes < MIN_IMAGEMAGICK_INSTALLER_SIZE_BYTES) {
              fail(
                new Error(
                  `Downloaded file is too small (${formatBytes(downloadedBytes)}).`
                ),
                true
              );
              return;
            }

            sendImageMagickLog(
              wc,
              "ok",
              `Download complete (${formatBytes(downloadedBytes)}).`
            );
            settled = true;
            resolve();
          });
        });

        file.on("error", abortTransfer);
      });

      request.setTimeout(idleTimeoutMs, () => {
        request.destroy(new Error("Download timed out waiting for data."));
      });

      request.on("error", (err) => {
        fail(err, true);
      });
    };

    requestDownload(url, 0);
  });
}
|
||||
|
||||
/**
 * Run a Windows installer executable through PowerShell `Start-Process`,
 * first as the current user and, if that fails, again with `-Verb RunAs`.
 *
 * `Start-Process -ArgumentList` joins its elements with spaces and adds no
 * quoting of its own, so any argument containing whitespace (for example
 * `/DIR=C:\Users\Jane Doe\...`) is wrapped in double quotes here before it
 * is handed to PowerShell.
 *
 * @param wc - Renderer that receives log messages.
 * @param installerPath - Path of the installer executable.
 * @param installerArgs - Command-line switches passed to the installer.
 * @returns A promise that resolves when one of the attempts succeeds; it
 *   rejects with the error of the elevated attempt when both fail.
 */
async function runWindowsExecutableInstaller(
  wc: WebContents,
  installerPath: string,
  installerArgs: string[]
): Promise<void> {
  // Quote arguments with whitespace: `/KEY=value` becomes `/KEY="value"`,
  // anything else is quoted as a whole. Already-quoted input is left alone.
  const quoteForCommandLine = (arg: string): string => {
    if (!/\s/.test(arg) || arg.includes('"')) return arg;
    const keyValue = /^([/-][^=\s]*=)([\s\S]*)$/.exec(arg);
    return keyValue ? `${keyValue[1]}"${keyValue[2]}"` : `"${arg}"`;
  };

  const escapedInstallerPath = escapePowerShellSingleQuoted(installerPath);
  const argList = installerArgs
    .map((arg) => `'${escapePowerShellSingleQuoted(quoteForCommandLine(arg))}'`)
    .join(", ");
  // `-ArgumentList` with no value is a PowerShell error, so omit it entirely
  // when there is nothing to pass.
  const argumentListClause = argList.length > 0 ? ` -ArgumentList ${argList}` : "";

  const runViaPowerShell = async (runAsAdmin: boolean) => {
    const verb = runAsAdmin ? " -Verb RunAs" : "";
    const script = `$p = Start-Process -FilePath '${escapedInstallerPath}'${argumentListClause}${verb} -Wait -PassThru; if ($p) { exit $p.ExitCode } else { exit 1 }`;
    await runInstallCommand(wc, "powershell", [
      "-NoProfile",
      "-ExecutionPolicy",
      "Bypass",
      "-Command",
      script,
    ]);
  };

  try {
    sendImageMagickLog(wc, "info", "Running installer in user mode...");
    await runViaPowerShell(false);
  } catch {
    sendImageMagickLog(
      wc,
      "warn",
      "User-mode install failed. Retrying with administrator rights..."
    );
    await runViaPowerShell(true);
  }
}
|
||||
|
||||
function runInstallCommand(
|
||||
wc: WebContents,
|
||||
command: string,
|
||||
|
|
@ -282,23 +437,61 @@ export function setupSetupInstallHandlers() {
|
|||
|
||||
await runInstallCommand(wc, brewCommand, ["install", "imagemagick"]);
|
||||
} else if (process.platform === "win32") {
|
||||
if (commandExists("choco", ["-v"])) {
|
||||
await runInstallCommand(wc, "choco", [
|
||||
"install",
|
||||
"imagemagick.app",
|
||||
"-y",
|
||||
]);
|
||||
} else {
|
||||
throw new Error(
|
||||
"Chocolatey is not installed. Falling back to direct installer download."
|
||||
);
|
||||
}
|
||||
const installerUrl = getImageMagickDownloadUrl();
|
||||
const installerFilename = getFilenameFromUrl(
|
||||
installerUrl,
|
||||
"ImageMagick-installer.exe"
|
||||
);
|
||||
const installerPath = path.join(os.tmpdir(), installerFilename);
|
||||
const installDir = getWindowsImageMagickInstallDir();
|
||||
|
||||
fs.mkdirSync(installDir, { recursive: true });
|
||||
|
||||
sendImageMagickLog(
|
||||
wc,
|
||||
"info",
|
||||
`Downloading ImageMagick installer (${installerFilename})...`
|
||||
);
|
||||
sendImageMagickLog(wc, "cmd", `Install directory: ${installDir}`);
|
||||
sendImageMagickProgress(wc, "downloading", 0, "Connecting...");
|
||||
|
||||
await downloadFileWithProgress(wc, installerUrl, installerPath);
|
||||
|
||||
sendImageMagickProgress(
|
||||
wc,
|
||||
"installing",
|
||||
undefined,
|
||||
"Running installer..."
|
||||
);
|
||||
|
||||
await runWindowsExecutableInstaller(wc, installerPath, [
|
||||
"/SP-",
|
||||
"/VERYSILENT",
|
||||
"/SUPPRESSMSGBOXES",
|
||||
"/NORESTART",
|
||||
`/DIR=${installDir}`,
|
||||
]);
|
||||
|
||||
fs.unlink(installerPath, () => {});
|
||||
sendImageMagickLog(wc, "ok", "ImageMagick installer completed.");
|
||||
} else {
|
||||
throw new Error(
|
||||
"Unsupported platform for automatic install. Use manual install from the official download page."
|
||||
);
|
||||
}
|
||||
|
||||
if (!isImageMagickInstalled()) {
|
||||
throw new Error(
|
||||
"ImageMagick installation command finished, but the binary was not detected."
|
||||
);
|
||||
}
|
||||
|
||||
sendImageMagickLog(
|
||||
wc,
|
||||
"ok",
|
||||
`ImageMagick detected at ${getImageMagickBinaryPath()}`
|
||||
);
|
||||
|
||||
sendImageMagickProgress(wc, "done", 100, "ImageMagick install finished");
|
||||
return { ok: true };
|
||||
} catch (error) {
|
||||
|
|
@ -310,9 +503,8 @@ export function setupSetupInstallHandlers() {
|
|||
sendImageMagickLog(
|
||||
wc,
|
||||
"info",
|
||||
`Opening manual install link: ${downloadUrl}`
|
||||
`Manual install URL: ${downloadUrl}`
|
||||
);
|
||||
await shell.openExternal(downloadUrl);
|
||||
sendImageMagickProgress(
|
||||
wc,
|
||||
"error",
|
||||
|
|
@ -331,7 +523,11 @@ export function setupSetupInstallHandlers() {
|
|||
const installed = isImageMagickInstalled();
|
||||
if (installed) {
|
||||
sendImageMagickProgress(wc, "done", 100, "ImageMagick detected");
|
||||
sendImageMagickLog(wc, "ok", "ImageMagick is installed and ready.");
|
||||
sendImageMagickLog(
|
||||
wc,
|
||||
"ok",
|
||||
`ImageMagick is installed and ready (${getImageMagickBinaryPath()}).`
|
||||
);
|
||||
return { ok: true };
|
||||
}
|
||||
const message =
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ import { checkDependenciesBeforeWindow } from "./utils/setup-dependencies";
|
|||
import { getSofficePath, isLibreOfficeInstalled } from "./utils/libreoffice-check";
|
||||
import { getPuppeteerExecutablePath, isChromeInstalled } from "./utils/puppeteer-check";
|
||||
import { getLiteParseRunnerPath } from "./utils/liteparse-check";
|
||||
import { isImageMagickInstalled } from "./utils/imagemagick-check";
|
||||
import { getImageMagickBinaryPath, isImageMagickInstalled } from "./utils/imagemagick-check";
|
||||
import { startUpdateChecker, stopUpdateChecker } from "./utils/update-checker";
|
||||
|
||||
|
||||
|
|
@ -125,7 +125,12 @@ async function startServers(fastApiPort: number, nextjsPort: number) {
|
|||
// Resolved by libreoffice-check.ts at startup; lets Python invoke the
|
||||
// exact binary path instead of relying on the system PATH.
|
||||
SOFFICE_PATH: getSofficePath(),
|
||||
IMAGEMAGICK_BINARY: getImageMagickBinaryPath(),
|
||||
LITEPARSE_RUNNER_PATH: getLiteParseRunnerPath(),
|
||||
// Use Electron's embedded runtime for LiteParse so parsing does not
|
||||
// depend on a system-wide Node installation.
|
||||
LITEPARSE_NODE_BINARY: process.execPath,
|
||||
ELECTRON_RUN_AS_NODE: "1",
|
||||
},
|
||||
isDev,
|
||||
);
|
||||
|
|
|
|||
6
electron/app/types/index.d.ts
vendored
6
electron/app/types/index.d.ts
vendored
|
|
@ -33,8 +33,14 @@ interface FastApiEnv {
|
|||
MIGRATE_DATABASE_ON_STARTUP?: string,
|
||||
/** Absolute path to the soffice binary resolved at startup by libreoffice-check.ts. */
|
||||
SOFFICE_PATH?: string,
|
||||
/** Absolute path to the ImageMagick binary resolved at startup by imagemagick-check.ts. */
|
||||
IMAGEMAGICK_BINARY?: string,
|
||||
/** Absolute path to the bundled LiteParse runner script. */
|
||||
LITEPARSE_RUNNER_PATH?: string,
|
||||
/** Binary path used by LiteParseService to execute liteparse_runner.mjs. */
|
||||
LITEPARSE_NODE_BINARY?: string,
|
||||
/** Set to "1" when using the Electron binary as a Node runtime. */
|
||||
ELECTRON_RUN_AS_NODE?: string,
|
||||
}
|
||||
|
||||
interface NextJsEnv {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
import fs from "fs";
|
||||
import os from "os";
|
||||
import path from "path";
|
||||
import { spawnSync } from "child_process";
|
||||
|
||||
let resolvedImageMagickBinaryPath = process.platform === "win32" ? "magick" : "convert";
|
||||
|
||||
function canExecute(command: string, args: string[]): boolean {
|
||||
const result = spawnSync(command, args, {
|
||||
stdio: "pipe",
|
||||
|
|
@ -8,12 +13,162 @@ function canExecute(command: string, args: string[]): boolean {
|
|||
return result.status === 0;
|
||||
}
|
||||
|
||||
/**
 * Run a command synchronously and return its trimmed standard output.
 *
 * @param command - Executable to run.
 * @param args - Arguments passed to the executable.
 * @returns The trimmed stdout, or `null` when the exit status is not 0 or
 *   the command printed nothing.
 */
function runCommand(command: string, args: string[]): string | null {
  const { status, stdout } = spawnSync(command, args, {
    stdio: ["ignore", "pipe", "pipe"],
    encoding: "utf8",
    windowsHide: true,
  });

  if (status !== 0) {
    return null;
  }

  const output = (stdout ?? "").trim();
  return output.length === 0 ? null : output;
}
|
||||
|
||||
/**
 * List the directories that may contain an ImageMagick install folder.
 *
 * Reads LOCALAPPDATA, ProgramFiles and ProgramFiles(x86) from the environment
 * (skipping unset or empty values) and always adds `<home>/AppData/Local`.
 *
 * @returns The distinct root directories, in the order listed above.
 */
function getWindowsInstallRootCandidates(): string[] {
  const fromEnvironment = ["LOCALAPPDATA", "ProgramFiles", "ProgramFiles(x86)"]
    .map((variable) => process.env[variable])
    .filter((value): value is string => Boolean(value));

  const homeLocalAppData = path.join(os.homedir(), "AppData", "Local");

  // A Set keeps first-insertion order while dropping duplicates.
  return [...new Set([...fromEnvironment, homeLocalAppData])];
}
|
||||
|
||||
/**
 * Directory into which the bundled installer places ImageMagick on Windows:
 * `<LOCALAPPDATA>/Presenton/runtime/imagemagick`.
 *
 * When LOCALAPPDATA is unset or empty, `<home>/AppData/Local` is used as the
 * base instead.
 *
 * @returns The install directory path.
 */
export function getWindowsImageMagickInstallDir(): string {
  // `||` rather than `??`: an empty LOCALAPPDATA must also fall back,
  // otherwise the result would be a relative path.
  const localAppData =
    process.env.LOCALAPPDATA || path.join(os.homedir(), "AppData", "Local");
  return path.join(localAppData, "Presenton", "runtime", "imagemagick");
}
|
||||
|
||||
/**
 * Build the list of `magick.exe` locations to probe on Windows.
 *
 * The Presenton-managed install directory comes first, followed by every
 * `ImageMagick*` directory found directly under the known install roots.
 * Within each root the directories are ordered newest-first by a numeric-aware
 * name comparison, matching how the Homebrew Cellar versions are ordered.
 *
 * @returns Candidate paths; the files are not checked for existence here.
 */
function collectWindowsImageMagickBinaryCandidates(): string[] {
  const candidates: string[] = [
    path.join(getWindowsImageMagickInstallDir(), "magick.exe"),
  ];

  for (const root of getWindowsInstallRootCandidates()) {
    let installDirs: string[];
    try {
      installDirs = fs
        .readdirSync(root, { withFileTypes: true })
        .filter((entry) => entry.isDirectory() && /^ImageMagick/i.test(entry.name))
        .map((entry) => entry.name);
    } catch {
      // Root is missing or unreadable: nothing to add from it.
      continue;
    }

    // readdir order is unspecified; prefer the highest version name.
    installDirs.sort((a, b) =>
      b.localeCompare(a, undefined, { numeric: true, sensitivity: "base" })
    );

    for (const dirName of installDirs) {
      candidates.push(path.join(root, dirName, "magick.exe"));
    }
  }

  return candidates;
}
|
||||
|
||||
/**
 * Find a usable Homebrew executable.
 *
 * Tries `brew` on PATH, then the two standard install locations, and picks
 * the first one for which `--version` runs successfully.
 *
 * @returns The working command or path, or `null` when none responds.
 */
function resolveBrewCommandPath(): string | null {
  const brewLocations = ["brew", "/opt/homebrew/bin/brew", "/usr/local/bin/brew"];
  const working = brewLocations.find((location) =>
    canExecute(location, ["--version"])
  );
  return working ?? null;
}
|
||||
|
||||
/**
 * Build the list of Homebrew `magick` locations to probe on macOS.
 *
 * Starts with the four well-known Homebrew paths. When a brew executable is
 * available it also adds `<brew --prefix imagemagick>/bin/magick` and one
 * entry per version directory under `brew --cellar imagemagick`, highest
 * version name first.
 *
 * @returns Candidate paths; the files are not checked for existence here.
 */
function collectDarwinBrewImageMagickCandidates(): string[] {
  const found: string[] = [
    "/opt/homebrew/bin/magick",
    "/usr/local/bin/magick",
    "/opt/homebrew/opt/imagemagick/bin/magick",
    "/usr/local/opt/imagemagick/bin/magick",
  ];

  const brew = resolveBrewCommandPath();
  if (!brew) {
    return found;
  }

  const prefix = runCommand(brew, ["--prefix", "imagemagick"]);
  if (prefix) {
    found.push(path.join(prefix, "bin", "magick"));
  }

  const cellar = runCommand(brew, ["--cellar", "imagemagick"]);
  if (!cellar || !fs.existsSync(cellar)) {
    return found;
  }

  try {
    const versionDirs = fs
      .readdirSync(cellar, { withFileTypes: true })
      .filter((entry) => entry.isDirectory())
      .map((entry) => entry.name);

    // Numeric-aware descending order so e.g. "7.1.10" precedes "7.1.9".
    versionDirs.sort((a, b) =>
      b.localeCompare(a, undefined, { numeric: true, sensitivity: "base" })
    );

    found.push(
      ...versionDirs.map((version) => path.join(cellar, version, "bin", "magick"))
    );
  } catch {
    // Ignore cellar enumeration errors and continue with other candidates.
  }

  return found;
}
|
||||
|
||||
/**
 * Locate a working ImageMagick executable.
 *
 * Probe order:
 *  1. Commands on PATH (`magick`; also `convert` on non-Windows platforms).
 *  2. Windows only: the collected `magick.exe` candidates, then give up.
 *  3. macOS only: the collected Homebrew candidates.
 *  4. A fixed list of common Unix install paths.
 *
 * A candidate counts as working when `<candidate> -version` succeeds; file
 * path candidates must also exist on disk.
 *
 * @returns The first working command name or path, or `null` if none works.
 */
function resolveImageMagickBinaryPath(): string | null {
  const responds = (binary: string): boolean => canExecute(binary, ["-version"]);
  const firstInstalled = (paths: string[]): string | null =>
    paths.find((candidate) => fs.existsSync(candidate) && responds(candidate)) ??
    null;

  const isWindows = process.platform === "win32";

  const pathCommands = isWindows ? ["magick"] : ["magick", "convert"];
  const onPath = pathCommands.find((name) => responds(name));
  if (onPath !== undefined) {
    return onPath;
  }

  if (isWindows) {
    return firstInstalled(collectWindowsImageMagickBinaryCandidates());
  }

  if (process.platform === "darwin") {
    const brewBinary = firstInstalled(collectDarwinBrewImageMagickCandidates());
    if (brewBinary !== null) {
      return brewBinary;
    }
  }

  return firstInstalled([
    "/opt/homebrew/bin/magick",
    "/usr/local/bin/magick",
    "/opt/local/bin/magick",
    "/usr/bin/magick",
    "/usr/local/bin/convert",
    "/usr/bin/convert",
  ]);
}
|
||||
|
||||
/**
 * Report whether a working ImageMagick executable can be found.
 *
 * On success the resolved command or path is stored in the module-level
 * `resolvedImageMagickBinaryPath`, which `getImageMagickBinaryPath()` returns.
 * On failure the previously stored value is left untouched.
 *
 * @returns `true` when an executable was resolved, otherwise `false`.
 */
export function isImageMagickInstalled(): boolean {
  // The earlier PATH-only checks ended in an unconditional `return false`,
  // which made the resolver below unreachable; it already covers PATH lookups.
  const resolved = resolveImageMagickBinaryPath();
  if (!resolved) return false;

  resolvedImageMagickBinaryPath = resolved;
  return true;
}
|
||||
|
||||
/**
 * Return the ImageMagick command or path currently held in the module-level
 * `resolvedImageMagickBinaryPath` variable.
 *
 * @returns The stored command name or executable path.
 */
export function getImageMagickBinaryPath(): string {
  const binaryPath = resolvedImageMagickBinaryPath;
  return binaryPath;
}
|
||||
|
||||
export function getImageMagickDownloadUrl(): string {
|
||||
|
|
@ -31,6 +186,8 @@ export function getImageMagickManualInstallCommands(): string[] {
|
|||
return [
|
||||
"Download and run the installer:",
|
||||
getImageMagickDownloadUrl(),
|
||||
"Recommended install path:",
|
||||
getWindowsImageMagickInstallDir(),
|
||||
];
|
||||
}
|
||||
|
||||
|
|
@ -40,6 +197,8 @@ export function getImageMagickManualInstallCommands(): string[] {
|
|||
'/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"',
|
||||
"Install ImageMagick:",
|
||||
"brew install imagemagick",
|
||||
"Verify detected binary path:",
|
||||
"brew --prefix imagemagick",
|
||||
];
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -288,7 +288,7 @@
|
|||
? '<strong>Presenton</strong> uses LibreOffice to generate custom templates from PPTX files.'
|
||||
: step === 'chrome'
|
||||
? '<strong>Presenton</strong> uses Chromium for export and slide rendering. Download it now (~150 MB).'
|
||||
: '<strong>Presenton</strong> uses ImageMagick for OCR/document conversion support. Linux uses apt, macOS installs Homebrew first (if needed) and then runs brew install imagemagick, and Windows uses Chocolatey with a direct installer fallback.';
|
||||
: '<strong>Presenton</strong> uses ImageMagick for OCR/document conversion support. Linux uses apt, macOS installs Homebrew first (if needed) and then runs brew install imagemagick, and Windows downloads and installs it directly into the Presenton runtime.';
|
||||
document.getElementById('btn-install').onclick = () => startInstall(step);
|
||||
document.getElementById('btn-skip').onclick = () => handleSkip();
|
||||
showState('prompt');
|
||||
|
|
@ -315,7 +315,7 @@
|
|||
});
|
||||
} else {
|
||||
document.getElementById('dl-heading').textContent = 'Installing ImageMagick';
|
||||
document.getElementById('dl-phase').textContent = 'Linux: apt-get | macOS: Homebrew + brew install | Windows: choco or direct installer';
|
||||
document.getElementById('dl-phase').textContent = 'Linux: apt-get | macOS: Homebrew + brew install | Windows: direct installer (Presenton runtime)';
|
||||
window.setupInstaller.installImageMagick().then((installResult) => {
|
||||
if (!installResult || !installResult.ok) {
|
||||
if (currentStep !== 'imagemagick') return;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
import subprocess
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
|
@ -8,6 +9,23 @@ class DocumentConversionError(Exception):
|
|||
pass
|
||||
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
_LOG_SNIPPET_LIMIT = 600
|
||||
|
||||
|
||||
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
    """Return a trimmed, length-capped copy of ``value`` for log output.

    Empty or whitespace-only input becomes ``"<empty>"``. Text longer than
    ``limit`` is cut at ``limit`` characters and suffixed with the number of
    characters that were dropped.
    """
    cleaned = value.strip() if value else ""
    if cleaned == "":
        return "<empty>"

    overflow = len(cleaned) - limit
    if overflow > 0:
        return f"{cleaned[:limit]}... [truncated {overflow} chars]"
    return cleaned
|
||||
|
||||
|
||||
def _command_str(parts: list[str]) -> str:
|
||||
return " ".join(repr(part) for part in parts)
|
||||
|
||||
|
||||
def _windows_hidden_subprocess_kwargs() -> Dict[str, object]:
|
||||
if os.name != "nt":
|
||||
return {}
|
||||
|
|
@ -39,6 +57,8 @@ class DocumentConversionService:
|
|||
[command, *args],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
encoding="utf-8",
|
||||
errors="replace",
|
||||
timeout=10,
|
||||
check=False,
|
||||
**_windows_hidden_subprocess_kwargs(),
|
||||
|
|
@ -71,23 +91,39 @@ class DocumentConversionService:
|
|||
}
|
||||
|
||||
try:
|
||||
command = [
|
||||
self.soffice_binary,
|
||||
"--headless",
|
||||
"--convert-to",
|
||||
"pdf",
|
||||
"--outdir",
|
||||
output_dir,
|
||||
file_path,
|
||||
]
|
||||
LOGGER.info(
|
||||
"[DocumentConversion] LibreOffice conversion start input=%s output_dir=%s",
|
||||
file_path,
|
||||
output_dir,
|
||||
)
|
||||
subprocess.run(
|
||||
[
|
||||
self.soffice_binary,
|
||||
"--headless",
|
||||
"--convert-to",
|
||||
"pdf",
|
||||
"--outdir",
|
||||
output_dir,
|
||||
file_path,
|
||||
],
|
||||
command,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
encoding="utf-8",
|
||||
errors="replace",
|
||||
timeout=timeout_seconds,
|
||||
**_windows_hidden_subprocess_kwargs(),
|
||||
)
|
||||
LOGGER.info(
|
||||
"[DocumentConversion] LibreOffice conversion complete input=%s",
|
||||
file_path,
|
||||
)
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
LOGGER.error(
|
||||
"[DocumentConversion] LibreOffice timed out command=%s",
|
||||
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
|
||||
)
|
||||
raise DocumentConversionError(
|
||||
f"LibreOffice conversion timed out for {os.path.basename(file_path)}"
|
||||
) from exc
|
||||
|
|
@ -95,10 +131,19 @@ class DocumentConversionService:
|
|||
stderr = (exc.stderr or "").strip()
|
||||
stdout = (exc.stdout or "").strip()
|
||||
details = stderr or stdout or str(exc)
|
||||
LOGGER.error(
|
||||
"[DocumentConversion] LibreOffice failed code=%s command=%s stderr=%s stdout=%s",
|
||||
exc.returncode,
|
||||
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
|
||||
_snippet(stderr),
|
||||
_snippet(stdout),
|
||||
)
|
||||
raise DocumentConversionError(
|
||||
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details}"
|
||||
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details} "
|
||||
f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
|
||||
) from exc
|
||||
except Exception as exc:
|
||||
LOGGER.exception("[DocumentConversion] LibreOffice conversion unexpected error")
|
||||
raise DocumentConversionError(
|
||||
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {exc}"
|
||||
) from exc
|
||||
|
|
@ -133,15 +178,31 @@ class DocumentConversionService:
|
|||
command = [self.imagemagick_binary, file_path, str(output_path)]
|
||||
|
||||
try:
|
||||
LOGGER.info(
|
||||
"[DocumentConversion] ImageMagick conversion start input=%s output=%s command=%s",
|
||||
file_path,
|
||||
output_path,
|
||||
_command_str(command),
|
||||
)
|
||||
subprocess.run(
|
||||
command,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
encoding="utf-8",
|
||||
errors="replace",
|
||||
timeout=timeout_seconds,
|
||||
**_windows_hidden_subprocess_kwargs(),
|
||||
)
|
||||
LOGGER.info(
|
||||
"[DocumentConversion] ImageMagick conversion complete output=%s",
|
||||
output_path,
|
||||
)
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
LOGGER.error(
|
||||
"[DocumentConversion] ImageMagick timed out command=%s",
|
||||
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
|
||||
)
|
||||
raise DocumentConversionError(
|
||||
f"ImageMagick conversion timed out for {os.path.basename(file_path)}"
|
||||
) from exc
|
||||
|
|
@ -149,10 +210,19 @@ class DocumentConversionService:
|
|||
stderr = (exc.stderr or "").strip()
|
||||
stdout = (exc.stdout or "").strip()
|
||||
details = stderr or stdout or str(exc)
|
||||
LOGGER.error(
|
||||
"[DocumentConversion] ImageMagick failed code=%s command=%s stderr=%s stdout=%s",
|
||||
exc.returncode,
|
||||
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
|
||||
_snippet(stderr),
|
||||
_snippet(stdout),
|
||||
)
|
||||
raise DocumentConversionError(
|
||||
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details}"
|
||||
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details} "
|
||||
f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
|
||||
) from exc
|
||||
except Exception as exc:
|
||||
LOGGER.exception("[DocumentConversion] ImageMagick conversion unexpected error")
|
||||
raise DocumentConversionError(
|
||||
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {exc}"
|
||||
) from exc
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
import pdfplumber
|
||||
from fastapi import HTTPException
|
||||
|
|
@ -22,9 +23,11 @@ from utils.ocr_language import presentation_language_to_ocr_code
|
|||
|
||||
# Optional fallback converter (primarily useful on Windows)
|
||||
try:
|
||||
from services.lightweight_document_service import DocumentService
|
||||
from services.lightweight_document_service import DocumentService as DocumentServiceCls
|
||||
except Exception:
|
||||
DocumentService = None
|
||||
DocumentServiceCls = None
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentsLoader:
|
||||
|
|
@ -38,7 +41,9 @@ class DocumentsLoader:
|
|||
self._ocr_language = presentation_language_to_ocr_code(presentation_language)
|
||||
self.liteparse_service = LiteParseService()
|
||||
self.document_conversion_service = DocumentConversionService()
|
||||
self.document_service = DocumentService() if DocumentService is not None else None
|
||||
self.document_service: Any = (
|
||||
DocumentServiceCls() if DocumentServiceCls is not None else None
|
||||
)
|
||||
|
||||
self._documents: List[str] = []
|
||||
self._images: List[List[str]] = []
|
||||
|
|
@ -69,9 +74,14 @@ class DocumentsLoader:
|
|||
)
|
||||
|
||||
document = ""
|
||||
imgs = []
|
||||
imgs: List[str] = []
|
||||
|
||||
extension = Path(file_path).suffix.lower()
|
||||
LOGGER.info(
|
||||
"[DocumentsLoader] Processing file=%s extension=%s",
|
||||
file_path,
|
||||
extension,
|
||||
)
|
||||
|
||||
if extension in PDF_EXTENSIONS:
|
||||
document, imgs = await self.load_pdf(
|
||||
|
|
@ -107,13 +117,18 @@ class DocumentsLoader:
|
|||
load_images: bool,
|
||||
temp_dir: Optional[str] = None,
|
||||
) -> Tuple[str, List[str]]:
|
||||
image_paths = []
|
||||
image_paths: List[str] = []
|
||||
document: str = ""
|
||||
|
||||
if load_text:
|
||||
document = await asyncio.to_thread(self._parse_with_liteparse, file_path)
|
||||
|
||||
if load_images:
|
||||
if temp_dir is None:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="temp_dir is required when load_images is true",
|
||||
)
|
||||
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
|
||||
|
||||
return document, image_paths
|
||||
|
|
@ -154,16 +169,27 @@ class DocumentsLoader:
|
|||
|
||||
def _parse_with_liteparse(self, file_path: str) -> str:
|
||||
try:
|
||||
LOGGER.info("[DocumentsLoader] LiteParse start file=%s", file_path)
|
||||
return self.liteparse_service.parse_to_markdown(
|
||||
file_path,
|
||||
ocr_enabled=True,
|
||||
ocr_language=self._ocr_language,
|
||||
)
|
||||
except (LiteParseError, DocumentConversionError) as exc:
|
||||
LOGGER.warning(
|
||||
"[DocumentsLoader] Primary parse failed file=%s error=%s",
|
||||
file_path,
|
||||
exc,
|
||||
)
|
||||
if self.document_service is not None:
|
||||
try:
|
||||
LOGGER.info("[DocumentsLoader] Trying fallback parser file=%s", file_path)
|
||||
return self.document_service.parse_to_markdown(file_path)
|
||||
except Exception:
|
||||
LOGGER.exception(
|
||||
"[DocumentsLoader] Fallback parser failed file=%s",
|
||||
file_path,
|
||||
)
|
||||
pass
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
|
|
|
|||
|
|
@ -1,13 +1,41 @@
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from typing import Any, Dict, Tuple
|
||||
from typing import Any, Dict, Mapping, Tuple
|
||||
|
||||
|
||||
class LiteParseError(Exception):
    """Error type raised by this module for LiteParse failures."""
|
||||
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
_LOG_SNIPPET_LIMIT = 600
|
||||
|
||||
|
||||
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
    """Return a trimmed, length-capped copy of ``value`` for log output.

    Empty or whitespace-only input becomes ``"<empty>"``. Text longer than
    ``limit`` is cut at ``limit`` characters and suffixed with the number of
    characters that were dropped.
    """
    cleaned = value.strip() if value else ""
    if cleaned == "":
        return "<empty>"

    overflow = len(cleaned) - limit
    if overflow > 0:
        return f"{cleaned[:limit]}... [truncated {overflow} chars]"
    return cleaned
|
||||
|
||||
|
||||
def _command_str(parts: list[str]) -> str:
|
||||
return " ".join(json.dumps(part) for part in parts)
|
||||
|
||||
|
||||
def _subprocess_text_kwargs() -> Mapping[str, object]:
|
||||
"""Decode subprocess output consistently across platforms.
|
||||
|
||||
Windows defaults to a locale-dependent code page (often cp1252), which can
|
||||
crash while decoding UTF-8 output from Node tools. Use UTF-8 and replace
|
||||
undecodable bytes to keep parsing resilient.
|
||||
"""
|
||||
return {"text": True, "encoding": "utf-8", "errors": "replace"}
|
||||
|
||||
|
||||
class LiteParseService:
|
||||
def __init__(self, timeout_seconds: int = 180):
|
||||
self.timeout_seconds = timeout_seconds
|
||||
|
|
@ -16,6 +44,58 @@ class LiteParseService:
|
|||
self.runner_dir = os.path.dirname(self.runner_path)
|
||||
self._npm_project_root = self._resolve_npm_project_root()
|
||||
|
||||
def _build_node_env(self) -> Dict[str, str]:
|
||||
"""Build environment for Node subprocesses.
|
||||
|
||||
When the configured runtime binary is not the canonical `node` executable
|
||||
(for example Electron's app binary), force Node-compatible mode.
|
||||
"""
|
||||
env = os.environ.copy()
|
||||
binary_name = os.path.basename(self.node_binary).lower()
|
||||
if binary_name not in {"node", "node.exe"}:
|
||||
env.setdefault("ELECTRON_RUN_AS_NODE", "1")
|
||||
|
||||
# LiteParse checks ImageMagick availability with `which magick`.
|
||||
# On macOS app launches, PATH often excludes Homebrew bins, even when
|
||||
# IMAGEMAGICK_BINARY is configured to an absolute executable path.
|
||||
path_entries = [p for p in (env.get("PATH") or "").split(os.pathsep) if p]
|
||||
additional_entries = []
|
||||
|
||||
imagemagick_binary = (env.get("IMAGEMAGICK_BINARY") or "").strip()
|
||||
if imagemagick_binary:
|
||||
magick_dir = os.path.dirname(imagemagick_binary)
|
||||
if magick_dir:
|
||||
additional_entries.append(magick_dir)
|
||||
|
||||
soffice_binary = (env.get("SOFFICE_PATH") or "").strip()
|
||||
if soffice_binary:
|
||||
soffice_dir = os.path.dirname(soffice_binary)
|
||||
if soffice_dir:
|
||||
additional_entries.append(soffice_dir)
|
||||
|
||||
if os.name != "nt":
|
||||
additional_entries.extend([
|
||||
"/opt/homebrew/bin",
|
||||
"/usr/local/bin",
|
||||
"/opt/local/bin",
|
||||
"/usr/bin",
|
||||
"/bin",
|
||||
])
|
||||
|
||||
deduped_additional_entries = []
|
||||
for entry in additional_entries:
|
||||
normalized = entry.strip()
|
||||
if not normalized or not os.path.isdir(normalized):
|
||||
continue
|
||||
if normalized in path_entries or normalized in deduped_additional_entries:
|
||||
continue
|
||||
deduped_additional_entries.append(normalized)
|
||||
|
||||
if deduped_additional_entries:
|
||||
env["PATH"] = os.pathsep.join(deduped_additional_entries + path_entries)
|
||||
|
||||
return env
|
||||
|
||||
def _resolve_npm_project_root(self) -> str:
|
||||
"""Directory whose node_modules contains @llamaindex/liteparse (runner dir or Electron app root)."""
|
||||
local_nm = os.path.join(
|
||||
|
|
@ -76,8 +156,9 @@ class LiteParseService:
|
|||
cwd=self.runner_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
env=self._build_node_env(),
|
||||
**_subprocess_text_kwargs(),
|
||||
)
|
||||
except Exception as exc:
|
||||
return False, f"Node.js runtime is unavailable: {exc}"
|
||||
|
|
@ -103,8 +184,9 @@ class LiteParseService:
|
|||
cwd=self._npm_project_root,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=20,
|
||||
env=self._build_node_env(),
|
||||
**_subprocess_text_kwargs(),
|
||||
)
|
||||
except Exception as exc:
|
||||
return False, f"LiteParse dependency is unavailable: {exc}"
|
||||
|
|
@ -151,21 +233,51 @@ class LiteParseService:
|
|||
if tessdata:
|
||||
command.extend(["--tessdata-path", tessdata])
|
||||
|
||||
LOGGER.info(
|
||||
"[LiteParse] Parsing file=%s ocr_enabled=%s ocr_language=%s",
|
||||
file_path,
|
||||
ocr_enabled,
|
||||
ocr_language,
|
||||
)
|
||||
|
||||
process = subprocess.run(
|
||||
command,
|
||||
cwd=self._npm_project_root,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=self.timeout_seconds,
|
||||
env=os.environ.copy(),
|
||||
env=self._build_node_env(),
|
||||
**_subprocess_text_kwargs(),
|
||||
)
|
||||
payload = self._decode_runner_output(process.stdout)
|
||||
LOGGER.info(
|
||||
"[LiteParse] Command finished returncode=%s command=%s",
|
||||
process.returncode,
|
||||
_command_str(command),
|
||||
)
|
||||
|
||||
payload: Dict[str, Any]
|
||||
try:
|
||||
payload = self._decode_runner_output(process.stdout)
|
||||
except LiteParseError as exc:
|
||||
raise LiteParseError(
|
||||
f"{exc}; returncode={process.returncode}; "
|
||||
f"stderr={_snippet(process.stderr)}; stdout={_snippet(process.stdout)}"
|
||||
) from exc
|
||||
|
||||
if process.returncode != 0:
|
||||
message = payload.get("error") or process.stderr.strip() or "Unknown error"
|
||||
LOGGER.error(
|
||||
"[LiteParse] Parse failed returncode=%s stderr=%s stdout=%s",
|
||||
process.returncode,
|
||||
_snippet(process.stderr),
|
||||
_snippet(process.stdout),
|
||||
)
|
||||
raise LiteParseError(message)
|
||||
|
||||
if not payload.get("ok"):
|
||||
LOGGER.error(
|
||||
"[LiteParse] Runner returned not-ok payload=%s",
|
||||
_snippet(json.dumps(payload)),
|
||||
)
|
||||
raise LiteParseError(payload.get("error") or "LiteParse parse failed")
|
||||
|
||||
return payload
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue