feat: add language parameter to decomposeDocuments API call

- Updated the decomposeDocuments method in PresentationGenerationApi to accept an optional language parameter.
- Modified the UploadPage component to pass the selected language from the config when calling the decomposeDocuments method.
This commit is contained in:
sudipnext 2026-03-28 15:34:53 +05:45
parent c6ad9de46b
commit 3207422651
31 changed files with 1554 additions and 1295 deletions

View file

@ -1,13 +1,14 @@
/**
* IPC handlers for the unified setup installer (LibreOffice + Chromium).
* IPC handlers for the unified setup installer (LibreOffice + Chromium + ImageMagick).
* - setup:get-status which dependencies are missing
* - setup:install-chrome download Chromium (browser-snapshots) with progress
*/
import { ipcMain, WebContents } from "electron";
import { ipcMain, WebContents, shell } from "electron";
import fs from "fs";
import path from "path";
import os from "os";
import { spawn, spawnSync } from "child_process";
import puppeteer from "puppeteer";
import {
Browser,
@ -17,6 +18,10 @@ import {
resolveBuildId,
} from "@puppeteer/browsers";
import { getSetupStatus } from "../utils/setup-dependencies";
import {
getImageMagickDownloadUrl,
isImageMagickInstalled,
} from "../utils/imagemagick-check";
function getPuppeteerCacheDir(): string {
const configCache =
@ -42,9 +47,78 @@ function sendChromeLog(wc: WebContents, level: string, text: string) {
}
}
/**
 * Sends an ImageMagick install progress update to the renderer on the
 * "setup:imagemagick-progress" channel.
 *
 * Nothing is sent when the target WebContents has already been destroyed.
 *
 * @param wc - WebContents that should receive the update.
 * @param phase - Current phase of the install flow.
 * @param percent - Optional completion percentage.
 * @param message - Optional status text to display.
 */
function sendImageMagickProgress(
  wc: WebContents,
  phase: "installing" | "done" | "error",
  percent?: number,
  message?: string
) {
  if (wc.isDestroyed()) {
    return;
  }
  const payload = { phase, percent, message };
  wc.send("setup:imagemagick-progress", payload);
}
/**
 * Sends one ImageMagick installer log line to the renderer on the
 * "setup:imagemagick-log" channel.
 *
 * Nothing is sent when the target WebContents has already been destroyed.
 *
 * @param wc - WebContents that should receive the log line.
 * @param level - Log level label forwarded as-is (this file uses "info", "error" and "ok").
 * @param text - Log text.
 */
function sendImageMagickLog(wc: WebContents, level: string, text: string) {
  if (wc.isDestroyed()) {
    return;
  }
  const entry = { level, text };
  wc.send("setup:imagemagick-log", entry);
}
/**
 * Reports whether a command can be run, by executing it synchronously with a
 * harmless argument list and checking for a zero exit status.
 *
 * @param command - Executable name or path to probe.
 * @param versionArgs - Arguments used for the probe; defaults to `["--version"]`.
 * @returns `true` only when the process exited with status 0.
 */
function commandExists(command: string, versionArgs: string[] = ["--version"]): boolean {
  const { status } = spawnSync(command, versionArgs, {
    stdio: "pipe",
    windowsHide: true,
  });
  // Any non-zero status (including the null status of a command that could
  // not be launched) counts as "not available".
  return status === 0;
}
/**
 * Runs an install command as a child process and streams its output to the
 * renderer as ImageMagick log lines.
 *
 * stdout chunks are logged at "info"; stderr chunks are logged at "error"
 * when their text contains "error" (case-insensitive) and at "info" otherwise.
 *
 * @param wc - WebContents that receives the log lines.
 * @param command - Executable to run.
 * @param args - Arguments passed to the executable.
 * @returns A promise that resolves when the process closes with exit code 0
 *   and rejects if it fails to start or closes with any other code.
 */
function runInstallCommand(
  wc: WebContents,
  command: string,
  args: string[]
): Promise<void> {
  sendImageMagickLog(wc, "info", `Running: ${command} ${args.join(" ")}`);

  // Trim one output chunk and forward it, skipping chunks that are blank.
  const forwardChunk = (chunk: unknown, levelFor: (text: string) => string) => {
    const text = String(chunk).trim();
    if (!text) return;
    sendImageMagickLog(wc, levelFor(text), text);
  };

  return new Promise<void>((resolve, reject) => {
    const child = spawn(command, args, {
      stdio: ["ignore", "pipe", "pipe"],
      windowsHide: process.platform === "win32",
    });

    child.stdout.on("data", (data) => {
      forwardChunk(data, () => "info");
    });
    child.stderr.on("data", (data) => {
      forwardChunk(data, (text) =>
        text.toLowerCase().includes("error") ? "error" : "info"
      );
    });

    child.on("error", reject);
    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error(`${command} exited with code ${code}`));
        return;
      }
      resolve();
    });
  });
}
export function setupSetupInstallHandlers() {
ipcMain.handle("setup:get-status", () => {
return getSetupStatus() ?? { needsLibreOffice: false, needsChrome: false };
return (
getSetupStatus() ?? {
needsLibreOffice: false,
needsChrome: false,
needsImageMagick: false,
}
);
});
ipcMain.handle(
@ -121,4 +195,91 @@ export function setupSetupInstallHandlers() {
return { ok: true };
}
);
// "setup:install-imagemagick": tries to install ImageMagick with the
// platform's package manager (apt-get run through pkexec on Linux, Homebrew on
// macOS, Chocolatey on Windows). When the package manager is missing, the
// platform is unsupported, or the install command fails, the official
// download page is opened so the user can install manually.
ipcMain.handle(
  "setup:install-imagemagick",
  async (event): Promise<{ ok: boolean; error?: string }> => {
    const wc = event.sender;
    try {
      sendImageMagickProgress(
        wc,
        "installing",
        undefined,
        "Installing ImageMagick..."
      );

      if (process.platform === "linux") {
        if (commandExists("apt-get")) {
          await runInstallCommand(wc, "pkexec", [
            "apt-get",
            "install",
            "-y",
            "imagemagick",
          ]);
        } else {
          throw new Error(
            "apt-get is unavailable. Install ImageMagick manually from the official download page."
          );
        }
      } else if (process.platform === "darwin") {
        if (commandExists("brew")) {
          await runInstallCommand(wc, "brew", ["install", "imagemagick"]);
        } else {
          throw new Error(
            "Homebrew is not installed. Install ImageMagick manually from the official download page."
          );
        }
      } else if (process.platform === "win32") {
        // choco is probed with "-v" instead of the default "--version".
        if (commandExists("choco", ["-v"])) {
          await runInstallCommand(wc, "choco", [
            "install",
            "imagemagick.app",
            "-y",
          ]);
        } else {
          throw new Error(
            "Chocolatey is not installed. Install ImageMagick manually from the official download page."
          );
        }
      } else {
        throw new Error(
          "Unsupported platform for automatic install. Use manual install from the official download page."
        );
      }

      sendImageMagickProgress(wc, "done", 100, "ImageMagick install finished");
      return { ok: true };
    } catch (error) {
      const message =
        error instanceof Error ? error.message : "ImageMagick install failed";
      sendImageMagickLog(wc, "error", message);

      const downloadUrl = getImageMagickDownloadUrl();
      sendImageMagickLog(
        wc,
        "info",
        `Falling back to manual install page: ${downloadUrl}`
      );

      try {
        await shell.openExternal(downloadUrl);
      } catch (openError) {
        // Opening the fallback page can itself fail. Report both failures
        // through the declared result shape instead of letting the IPC call
        // reject and lose the original install error.
        const openMessage =
          openError instanceof Error ? openError.message : String(openError);
        const combined = `${message} (could not open ${downloadUrl}: ${openMessage})`;
        sendImageMagickProgress(wc, "error", undefined, combined);
        sendImageMagickLog(wc, "error", combined);
        return { ok: false, error: combined };
      }

      // The automatic install failed but the manual download page was opened;
      // this path keeps returning ok: true as it did before.
      // NOTE(review): confirm the renderer relies on ok: true here rather than
      // expecting ok: false with the install error.
      return { ok: true };
    }
  }
);
// "setup:check-imagemagick": re-runs ImageMagick detection and reports the
// outcome to the renderer through both the progress and log channels.
ipcMain.handle(
  "setup:check-imagemagick",
  async (event): Promise<{ ok: boolean; error?: string }> => {
    const wc = event.sender;

    if (!isImageMagickInstalled()) {
      const message =
        "ImageMagick is not detected yet. Install it, then click Retry.";
      sendImageMagickProgress(wc, "error", undefined, message);
      sendImageMagickLog(wc, "error", message);
      return { ok: false, error: message };
    }

    sendImageMagickProgress(wc, "done", 100, "ImageMagick detected");
    sendImageMagickLog(wc, "ok", "ImageMagick is installed and ready.");
    return { ok: true };
  }
);
}

View file

@ -13,6 +13,8 @@ import { setupSetupInstallHandlers } from "./ipc/setup_install_handlers";
import { checkDependenciesBeforeWindow } from "./utils/setup-dependencies";
import { getSofficePath, isLibreOfficeInstalled } from "./utils/libreoffice-check";
import { getPuppeteerExecutablePath, isChromeInstalled } from "./utils/puppeteer-check";
import { getLiteParseRunnerPath } from "./utils/liteparse-check";
import { isImageMagickInstalled } from "./utils/imagemagick-check";
import { startUpdateChecker, stopUpdateChecker } from "./utils/update-checker";
@ -23,6 +25,7 @@ let isStopping = false;
// Startup state per dependency, keyed by dependency name. Every entry starts
// as "checking" and is later overwritten with "installed" or "missing" once
// the dependency checks have run.
const startupStatus: Record<string, string> = {
libreoffice: "checking",
puppeteer: "checking",
imagemagick: "checking",
};
// Allow renderer to query initial startup status as soon as it loads.
@ -122,6 +125,7 @@ async function startServers(fastApiPort: number, nextjsPort: number) {
// Resolved by libreoffice-check.ts at startup; lets Python invoke the
// exact binary path instead of relying on the system PATH.
SOFFICE_PATH: getSofficePath(),
LITEPARSE_RUNNER_PATH: getLiteParseRunnerPath(),
},
isDev,
);
@ -188,7 +192,7 @@ app.whenReady().then(async () => {
createWindow();
win?.loadFile(path.join(baseDir, "resources/ui/homepage/index.html"));
// Single installer: checks LibreOffice and Chrome; if either is missing, shows one
// Single installer: checks LibreOffice, Chrome, and ImageMagick; if any are missing, shows one
// window that installs them one after another. Resolves when the window closes.
const setupCompleted = await checkDependenciesBeforeWindow();
if (!setupCompleted) {
@ -199,12 +203,14 @@ app.whenReady().then(async () => {
}
// Update startup status after setup (user may have installed one or both)
const [loResult, chromeOk] = await Promise.all([
const [loResult, chromeOk, imageMagickOk] = await Promise.all([
isLibreOfficeInstalled(),
isChromeInstalled(),
Promise.resolve(isImageMagickInstalled()),
]);
startupStatus.libreoffice = loResult.installed ? "installed" : "missing";
startupStatus.puppeteer = chromeOk ? "installed" : "missing";
startupStatus.imagemagick = imageMagickOk ? "installed" : "missing";
// Show and focus main window
win?.show();
@ -218,6 +224,7 @@ app.whenReady().then(async () => {
win?.webContents.once("did-finish-load", () => {
sendStartupStatus("libreoffice", startupStatus.libreoffice);
sendStartupStatus("puppeteer", startupStatus.puppeteer);
sendStartupStatus("imagemagick", startupStatus.imagemagick);
});
setUserConfig({

View file

@ -5,6 +5,8 @@ contextBridge.exposeInMainWorld("setupInstaller", {
installLibreOffice: () => ipcRenderer.invoke("lo:start-install"),
installChrome: () => ipcRenderer.invoke("setup:install-chrome"),
installImageMagick: () => ipcRenderer.invoke("setup:install-imagemagick"),
checkImageMagick: () => ipcRenderer.invoke("setup:check-imagemagick"),
done: () => ipcRenderer.send("setup:done"),
@ -25,4 +27,13 @@ contextBridge.exposeInMainWorld("setupInstaller", {
onChromeLog: (cb: (data: { level: string; text: string }) => void) => {
ipcRenderer.on("setup:chrome-log", (_event, data) => cb(data));
},
onImageMagickProgress: (
cb: (data: { phase: string; percent?: number; message?: string }) => void
) => {
ipcRenderer.on("setup:imagemagick-progress", (_event, data) => cb(data));
},
onImageMagickLog: (cb: (data: { level: string; text: string }) => void) => {
ipcRenderer.on("setup:imagemagick-log", (_event, data) => cb(data));
},
});

View file

@ -33,6 +33,8 @@ interface FastApiEnv {
MIGRATE_DATABASE_ON_STARTUP?: string,
/** Absolute path to the soffice binary resolved at startup by libreoffice-check.ts. */
SOFFICE_PATH?: string,
/** Absolute path to the bundled LiteParse runner script. */
LITEPARSE_RUNNER_PATH?: string,
}
interface NextJsEnv {

View file

@ -0,0 +1,27 @@
import { spawnSync } from "child_process";
/**
 * Runs a command synchronously and reports whether it exited successfully.
 *
 * @param command - Executable name or path.
 * @param args - Arguments passed to the executable.
 * @returns `true` only when the process exited with status 0.
 */
function canExecute(command: string, args: string[]): boolean {
  const { status } = spawnSync(command, args, {
    stdio: "pipe",
    windowsHide: true,
  });
  return status === 0;
}
/**
 * Detects ImageMagick by probing its command-line entry points.
 *
 * `magick` (the ImageMagick 7+ command) is tried first; `convert` (the legacy
 * command shipped by Linux/macOS packages) is tried only if that fails.
 *
 * @returns `true` when either probe exits successfully.
 */
export function isImageMagickInstalled(): boolean {
  return (
    canExecute("magick", ["-version"]) || canExecute("convert", ["-version"])
  );
}
/**
 * Returns the ImageMagick download page URL for the current platform.
 *
 * @returns The Windows anchor on win32, the macOS anchor on darwin, and the
 *   Linux anchor on every other platform.
 */
export function getImageMagickDownloadUrl(): string {
  switch (process.platform) {
    case "win32":
      return "https://imagemagick.org/script/download.php#windows";
    case "darwin":
      return "https://imagemagick.org/script/download.php#macosx";
    default:
      return "https://imagemagick.org/script/download.php#linux";
  }
}

View file

@ -0,0 +1,28 @@
import fs from "fs";
import path from "path";
import { spawnSync } from "child_process";
import { baseDir, isDev } from "./constants";
/**
 * Returns the path of the LiteParse runner script under `baseDir`.
 *
 * Development and packaged builds resolve to the same location, so no
 * environment switch is needed here.
 *
 * @returns Path to `resources/document-extraction/liteparse_runner.mjs` under `baseDir`.
 */
export function getLiteParseRunnerPath(): string {
  return path.join(
    baseDir,
    "resources",
    "document-extraction",
    "liteparse_runner.mjs"
  );
}
/**
 * Returns the directory where the `@llamaindex/liteparse` package is expected
 * to be installed under `baseDir`.
 *
 * @returns Path to `node_modules/@llamaindex/liteparse` under `baseDir`.
 */
export function getLiteParseDependencyPath(): string {
  const packageSegments = ["node_modules", "@llamaindex", "liteparse"];
  return path.join(baseDir, ...packageSegments);
}
/**
 * Checks whether everything LiteParse needs is present: the runner script,
 * the installed `@llamaindex/liteparse` package directory, and a `node`
 * executable that can be launched.
 *
 * @returns `true` only when both paths exist and `node --version` exits with status 0.
 */
export function isLiteParseInstalled(): boolean {
  const requiredPaths = [getLiteParseRunnerPath(), getLiteParseDependencyPath()];
  // Stops at the first missing path, so `node` is only probed when both exist.
  if (!requiredPaths.every((candidate) => fs.existsSync(candidate))) {
    return false;
  }

  const { status } = spawnSync("node", ["--version"], {
    stdio: "pipe",
    windowsHide: true,
  });
  return status === 0;
}

View file

@ -29,6 +29,7 @@ function shouldSkipDownload(): boolean {
/** Flags describing which setup dependencies still need to be installed. */
export interface SetupStatus {
/** True when LibreOffice still needs to be installed. */
needsLibreOffice: boolean;
/** True when Chrome (used by Puppeteer) still needs to be installed. */
needsChrome: boolean;
/** True when ImageMagick still needs to be installed. */
needsImageMagick: boolean;
}
/**

View file

@ -1,9 +1,10 @@
/**
* setup-dependencies.ts
*
* Single installer window that ensures LibreOffice and Chrome (Puppeteer) are
* Single installer window that ensures LibreOffice, Chrome (Puppeteer), and
* ImageMagick are
* available before the user starts creating presentations. Runs checks, then
* if either is missing shows one installer that runs LibreOffice then Chrome
* if any are missing shows one installer that runs dependency setup steps
* in sequence (each with Install / Skip).
*/
@ -15,6 +16,7 @@ import {
isChromeInstalled,
type SetupStatus,
} from "./puppeteer-check";
import { isImageMagickInstalled } from "./imagemagick-check";
export type { SetupStatus };
@ -26,40 +28,44 @@ export function getSetupStatus(): SetupStatus | null {
}
/**
* Checks LibreOffice and Chrome. If both are present, returns immediately.
* If either is missing, opens one installer window that runs LibreOffice
* then Chrome in sequence. Returns true only when all required dependencies
* Checks LibreOffice, Chrome and ImageMagick. If all are present, returns
* immediately. If any are missing, opens one installer window that runs each
* missing setup step in sequence. Returns true only when all required dependencies
* are installed; false when the installer is closed/skipped before completion.
*/
export async function checkDependenciesBeforeWindow(): Promise<boolean> {
const [loResult, chromeInstalled] = await Promise.all([
const [loResult, chromeInstalled, imageMagickInstalled] = await Promise.all([
isLibreOfficeInstalled(),
isChromeInstalled(),
Promise.resolve(isImageMagickInstalled()),
]);
const needsLibreOffice = !loResult.installed;
const needsChrome = !chromeInstalled;
const needsImageMagick = !imageMagickInstalled;
if (!needsLibreOffice && !needsChrome) {
if (!needsLibreOffice && !needsChrome && !needsImageMagick) {
return true;
}
currentSetupStatus = {
needsLibreOffice,
needsChrome,
needsImageMagick,
};
await showSetupInstallerWindow();
// Re-check after installer closes; setup can only proceed when all
// required dependencies are actually installed.
const [postLoResult, postChromeInstalled] = await Promise.all([
const [postLoResult, postChromeInstalled, postImageMagickInstalled] = await Promise.all([
isLibreOfficeInstalled(),
isChromeInstalled(),
Promise.resolve(isImageMagickInstalled()),
]);
currentSetupStatus = null;
return postLoResult.installed && postChromeInstalled;
return postLoResult.installed && postChromeInstalled && postImageMagickInstalled;
}
/**

View file

@ -9,6 +9,7 @@
"version": "0.6.3-beta",
"hasInstallScript": true,
"dependencies": {
"@llamaindex/liteparse": "^1.4.0",
"@puppeteer/browsers": "^1.9.1",
"@tailwindcss/cli": "^4.1.5",
"@types/uuid": "^10.0.0",
@ -54,6 +55,16 @@
"node": ">=6.9.0"
}
},
"node_modules/@borewit/text-codec": {
"version": "0.2.2",
"resolved": "https://registry.npmjs.org/@borewit/text-codec/-/text-codec-0.2.2.tgz",
"integrity": "sha512-DDaRehssg1aNrH4+2hnj1B7vnUGEjU6OIlyRdkMd0aUdIUvKXrJfXsy8LVtXAy7DRvYVluWbMspsRhz2lcW0mQ==",
"license": "MIT",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/Borewit"
}
},
"node_modules/@develar/schema-utils": {
"version": "2.6.5",
"resolved": "https://registry.npmjs.org/@develar/schema-utils/-/schema-utils-2.6.5.tgz",
@ -507,6 +518,12 @@
"tslib": "^2.4.0"
}
},
"node_modules/@hyzyla/pdfium": {
"version": "2.1.12",
"resolved": "https://registry.npmjs.org/@hyzyla/pdfium/-/pdfium-2.1.12.tgz",
"integrity": "sha512-2ezbrJk9V4foB3+U+eQ7234spsHmrufPU+9EV2cVZCnhTLLfelPz7wWshO0HjUNtcECNBaAfEzrdaQZOigkW+A==",
"license": "MIT"
},
"node_modules/@img/colour": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/@img/colour/-/colour-1.0.0.tgz",
@ -1156,6 +1173,67 @@
"@jridgewell/sourcemap-codec": "^1.4.14"
}
},
"node_modules/@llamaindex/liteparse": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/@llamaindex/liteparse/-/liteparse-1.4.0.tgz",
"integrity": "sha512-58Tr4vAutcaf0Cxe7GK4cknpzcpN3tTzUhIAwWioWuSDqVPS3jpNhVVfqE5tV5PE4za07l07QFhGscCoVm/hRw==",
"license": "Apache-2.0",
"dependencies": {
"@hyzyla/pdfium": "^2.1.9",
"axios": "^1.7.0",
"commander": "^12.0.0",
"file-type": "^21.3.3",
"form-data": "^4.0.0",
"p-limit": "^7.3.0",
"sharp": "^0.34.5",
"tesseract.js": "^7.0.0",
"unified": "^11.0.0",
"zod": "^3.23.0"
},
"bin": {
"lit": "dist/src/index.js",
"liteparse": "dist/src/index.js"
},
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/@llamaindex/liteparse/node_modules/commander": {
"version": "12.1.0",
"resolved": "https://registry.npmjs.org/commander/-/commander-12.1.0.tgz",
"integrity": "sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA==",
"license": "MIT",
"engines": {
"node": ">=18"
}
},
"node_modules/@llamaindex/liteparse/node_modules/p-limit": {
"version": "7.3.0",
"resolved": "https://registry.npmjs.org/p-limit/-/p-limit-7.3.0.tgz",
"integrity": "sha512-7cIXg/Z0M5WZRblrsOla88S4wAK+zOQQWeBYfV3qJuJXMr+LnbYjaadrFaS0JILfEDPVqHyKnZ1Z/1d6J9VVUw==",
"license": "MIT",
"dependencies": {
"yocto-queue": "^1.2.1"
},
"engines": {
"node": ">=20"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/@llamaindex/liteparse/node_modules/yocto-queue": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-1.2.2.tgz",
"integrity": "sha512-4LCcse/U2MHZ63HAJVE+v71o7yOdIe4cZ70Wpf8D/IyjDKYQLV5GD46B+hSTjJsvV5PztjvHoU580EftxjDZFQ==",
"license": "MIT",
"engines": {
"node": ">=12.20"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/@malept/cross-spawn-promise": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/@malept/cross-spawn-promise/-/cross-spawn-promise-2.0.0.tgz",
@ -1975,6 +2053,29 @@
"node": ">= 10"
}
},
"node_modules/@tokenizer/inflate": {
"version": "0.4.1",
"resolved": "https://registry.npmjs.org/@tokenizer/inflate/-/inflate-0.4.1.tgz",
"integrity": "sha512-2mAv+8pkG6GIZiF1kNg1jAjh27IDxEPKwdGul3snfztFerfPGI1LjDezZp3i7BElXompqEtPmoPx6c2wgtWsOA==",
"license": "MIT",
"dependencies": {
"debug": "^4.4.3",
"token-types": "^6.1.1"
},
"engines": {
"node": ">=18"
},
"funding": {
"type": "github",
"url": "https://github.com/sponsors/Borewit"
}
},
"node_modules/@tokenizer/token": {
"version": "0.3.0",
"resolved": "https://registry.npmjs.org/@tokenizer/token/-/token-0.3.0.tgz",
"integrity": "sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A==",
"license": "MIT"
},
"node_modules/@tootallnate/quickjs-emscripten": {
"version": "0.23.0",
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
@ -2070,6 +2171,12 @@
"@types/node": "*"
}
},
"node_modules/@types/unist": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz",
"integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==",
"license": "MIT"
},
"node_modules/@types/uuid": {
"version": "10.0.0",
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz",
@ -2366,7 +2473,6 @@
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
"dev": true,
"license": "MIT"
},
"node_modules/at-least-node": {
@ -2379,6 +2485,26 @@
"node": ">= 4.0.0"
}
},
"node_modules/axios": {
"version": "1.14.0",
"resolved": "https://registry.npmjs.org/axios/-/axios-1.14.0.tgz",
"integrity": "sha512-3Y8yrqLSwjuzpXuZ0oIYZ/XGgLwUIBU3uLvbcpb0pidD9ctpShJd43KSlEEkVQg6DS0G9NKyzOvBfUtDKEyHvQ==",
"license": "MIT",
"dependencies": {
"follow-redirects": "^1.15.11",
"form-data": "^4.0.5",
"proxy-from-env": "^2.1.0"
}
},
"node_modules/axios/node_modules/proxy-from-env": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-2.1.0.tgz",
"integrity": "sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==",
"license": "MIT",
"engines": {
"node": ">=10"
}
},
"node_modules/b4a": {
"version": "1.8.0",
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.8.0.tgz",
@ -2393,6 +2519,16 @@
}
}
},
"node_modules/bail": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz",
"integrity": "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==",
"license": "MIT",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/balanced-match": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
@ -2527,6 +2663,12 @@
"readable-stream": "^3.4.0"
}
},
"node_modules/bmp-js": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/bmp-js/-/bmp-js-0.1.0.tgz",
"integrity": "sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==",
"license": "MIT"
},
"node_modules/boolean": {
"version": "3.2.0",
"resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz",
@ -2820,7 +2962,6 @@
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
"integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"es-errors": "^1.3.0",
@ -3005,7 +3146,6 @@
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
"dev": true,
"license": "MIT",
"dependencies": {
"delayed-stream": "~1.0.0"
@ -3294,12 +3434,20 @@
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
"integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=0.4.0"
}
},
"node_modules/dequal": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz",
"integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==",
"license": "MIT",
"engines": {
"node": ">=6"
}
},
"node_modules/detect-libc": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz",
@ -3317,6 +3465,19 @@
"license": "MIT",
"optional": true
},
"node_modules/devlop": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/devlop/-/devlop-1.1.0.tgz",
"integrity": "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==",
"license": "MIT",
"dependencies": {
"dequal": "^2.0.0"
},
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/devtools-protocol": {
"version": "0.0.1581282",
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1581282.tgz",
@ -3461,7 +3622,6 @@
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
"integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
"dev": true,
"license": "MIT",
"dependencies": {
"call-bind-apply-helpers": "^1.0.1",
@ -3718,7 +3878,6 @@
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/encoding/-/encoding-0.1.13.tgz",
"integrity": "sha512-ETBauow1T35Y/WZMkio9jiM0Z5xjHHmJ4XmjZOq1l/dXz3lr2sRn87nJy20RupqSh1F2m3HHPSp8ShIPQJrJ3A==",
"dev": true,
"license": "MIT",
"optional": true,
"dependencies": {
@ -3776,7 +3935,6 @@
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
"integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">= 0.4"
@ -3786,7 +3944,6 @@
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
"integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">= 0.4"
@ -3796,7 +3953,6 @@
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
"integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
"dev": true,
"license": "MIT",
"dependencies": {
"es-errors": "^1.3.0"
@ -3809,7 +3965,6 @@
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz",
"integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==",
"dev": true,
"license": "MIT",
"dependencies": {
"es-errors": "^1.3.0",
@ -3920,6 +4075,12 @@
"dev": true,
"license": "Apache-2.0"
},
"node_modules/extend": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
"integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
"license": "MIT"
},
"node_modules/extract-zip": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz",
@ -3998,6 +4159,24 @@
}
}
},
"node_modules/file-type": {
"version": "21.3.4",
"resolved": "https://registry.npmjs.org/file-type/-/file-type-21.3.4.tgz",
"integrity": "sha512-Ievi/yy8DS3ygGvT47PjSfdFoX+2isQueoYP1cntFW1JLYAuS4GD7NUPGg4zv2iZfV52uDyk5w5Z0TdpRS6Q1g==",
"license": "MIT",
"dependencies": {
"@tokenizer/inflate": "^0.4.1",
"strtok3": "^10.3.4",
"token-types": "^6.1.1",
"uint8array-extras": "^1.4.0"
},
"engines": {
"node": ">=20"
},
"funding": {
"url": "https://github.com/sindresorhus/file-type?sponsor=1"
}
},
"node_modules/filelist": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/filelist/-/filelist-1.0.4.tgz",
@ -4031,6 +4210,26 @@
"node": ">=10"
}
},
"node_modules/follow-redirects": {
"version": "1.15.11",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz",
"integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==",
"funding": [
{
"type": "individual",
"url": "https://github.com/sponsors/RubenVerborgh"
}
],
"license": "MIT",
"engines": {
"node": ">=4.0"
},
"peerDependenciesMeta": {
"debug": {
"optional": true
}
}
},
"node_modules/foreground-child": {
"version": "3.3.1",
"resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz",
@ -4065,7 +4264,6 @@
"version": "4.0.5",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz",
"integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==",
"dev": true,
"license": "MIT",
"dependencies": {
"asynckit": "^0.4.0",
@ -4117,7 +4315,6 @@
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
"integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
"dev": true,
"license": "MIT",
"funding": {
"url": "https://github.com/sponsors/ljharb"
@ -4136,7 +4333,6 @@
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
"integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"call-bind-apply-helpers": "^1.0.2",
@ -4161,7 +4357,6 @@
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
"integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
"dev": true,
"license": "MIT",
"dependencies": {
"dunder-proto": "^1.0.1",
@ -4290,7 +4485,6 @@
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
"integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">= 0.4"
@ -4359,7 +4553,6 @@
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
"integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">= 0.4"
@ -4372,7 +4565,6 @@
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz",
"integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==",
"dev": true,
"license": "MIT",
"dependencies": {
"has-symbols": "^1.0.3"
@ -4388,7 +4580,6 @@
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
"integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"function-bind": "^1.1.2"
@ -4487,7 +4678,7 @@
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
"dev": true,
"devOptional": true,
"license": "MIT",
"dependencies": {
"safer-buffer": ">= 2.1.2 < 3.0.0"
@ -4496,6 +4687,12 @@
"node": ">=0.10.0"
}
},
"node_modules/idb-keyval": {
"version": "6.2.2",
"resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-6.2.2.tgz",
"integrity": "sha512-yjD9nARJ/jb1g+CvD0tlhUHOrJ9Sy0P8T9MF3YaLlHnSRpwPfpTX0XIvpmw3gAJUmEu3FiICLBDPXVwyEvrleg==",
"license": "Apache-2.0"
},
"node_modules/ieee754": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
@ -4616,6 +4813,18 @@
"node": ">=8"
}
},
"node_modules/is-plain-obj": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz",
"integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==",
"license": "MIT",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/is-unicode-supported": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/is-unicode-supported/-/is-unicode-supported-0.1.0.tgz",
@ -4629,6 +4838,12 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/is-url": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",
"integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==",
"license": "MIT"
},
"node_modules/isbinaryfile": {
"version": "5.0.7",
"resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-5.0.7.tgz",
@ -5133,7 +5348,6 @@
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
"integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">= 0.4"
@ -5156,7 +5370,6 @@
"version": "1.52.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">= 0.6"
@ -5166,7 +5379,6 @@
"version": "2.1.35",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
"dev": true,
"license": "MIT",
"dependencies": {
"mime-db": "1.52.0"
@ -5467,6 +5679,26 @@
"node": ">=10"
}
},
"node_modules/node-fetch": {
"version": "2.7.0",
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
"integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
"license": "MIT",
"dependencies": {
"whatwg-url": "^5.0.0"
},
"engines": {
"node": "4.x || >=6.0.0"
},
"peerDependencies": {
"encoding": "^0.1.0"
},
"peerDependenciesMeta": {
"encoding": {
"optional": true
}
}
},
"node_modules/node-gyp": {
"version": "11.5.0",
"resolved": "https://registry.npmjs.org/node-gyp/-/node-gyp-11.5.0.tgz",
@ -5607,6 +5839,15 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/opencollective-postinstall": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/opencollective-postinstall/-/opencollective-postinstall-2.0.3.tgz",
"integrity": "sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==",
"license": "MIT",
"bin": {
"opencollective-postinstall": "index.js"
}
},
"node_modules/ora": {
"version": "5.4.1",
"resolved": "https://registry.npmjs.org/ora/-/ora-5.4.1.tgz",
@ -6205,6 +6446,12 @@
"node": ">= 6"
}
},
"node_modules/regenerator-runtime": {
"version": "0.13.11",
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz",
"integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==",
"license": "MIT"
},
"node_modules/require-directory": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
@ -6344,7 +6591,7 @@
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
"dev": true,
"devOptional": true,
"license": "MIT"
},
"node_modules/sanitize-filename": {
@ -6754,6 +7001,22 @@
"node": ">=8"
}
},
"node_modules/strtok3": {
"version": "10.3.5",
"resolved": "https://registry.npmjs.org/strtok3/-/strtok3-10.3.5.tgz",
"integrity": "sha512-ki4hZQfh5rX0QDLLkOCj+h+CVNkqmp/CMf8v8kZpkNVK6jGQooMytqzLZYUVYIZcFZ6yDB70EfD8POcFXiF5oA==",
"license": "MIT",
"dependencies": {
"@tokenizer/token": "^0.3.0"
},
"engines": {
"node": ">=18"
},
"funding": {
"type": "github",
"url": "https://github.com/sponsors/Borewit"
}
},
"node_modules/sumchecker": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/sumchecker/-/sumchecker-3.0.1.tgz",
@ -6991,6 +7254,30 @@
"mkdirp": "bin/cmd.js"
}
},
"node_modules/tesseract.js": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/tesseract.js/-/tesseract.js-7.0.0.tgz",
"integrity": "sha512-exPBkd+z+wM1BuMkx/Bjv43OeLBxhL5kKWsz/9JY+DXcXdiBjiAch0V49QR3oAJqCaL5qURE0vx9Eo+G5YE7mA==",
"hasInstallScript": true,
"license": "Apache-2.0",
"dependencies": {
"bmp-js": "^0.1.0",
"idb-keyval": "^6.2.0",
"is-url": "^1.2.4",
"node-fetch": "^2.6.9",
"opencollective-postinstall": "^2.0.3",
"regenerator-runtime": "^0.13.3",
"tesseract.js-core": "^7.0.0",
"wasm-feature-detect": "^1.8.0",
"zlibjs": "^0.3.1"
}
},
"node_modules/tesseract.js-core": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-7.0.0.tgz",
"integrity": "sha512-WnNH518NzmbSq9zgTPeoF8c+xmilS8rFIl1YKbk/ptuuc7p6cLNELNuPAzcmsYw450ca6bLa8j3t0VAtq435Vw==",
"license": "Apache-2.0"
},
"node_modules/text-decoder": {
"version": "1.2.7",
"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.7.tgz",
@ -7063,6 +7350,30 @@
"tmp": "^0.2.0"
}
},
"node_modules/token-types": {
"version": "6.1.2",
"resolved": "https://registry.npmjs.org/token-types/-/token-types-6.1.2.tgz",
"integrity": "sha512-dRXchy+C0IgK8WPC6xvCHFRIWYUbqqdEIKPaKo/AcTUNzwLTK6AH7RjdLWsEZcAN/TBdtfUw3PYEgPr5VPr6ww==",
"license": "MIT",
"dependencies": {
"@borewit/text-codec": "^0.2.1",
"@tokenizer/token": "^0.3.0",
"ieee754": "^1.2.1"
},
"engines": {
"node": ">=14.16"
},
"funding": {
"type": "github",
"url": "https://github.com/sponsors/Borewit"
}
},
"node_modules/tr46": {
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
"license": "MIT"
},
"node_modules/tree-kill": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz",
@ -7072,6 +7383,16 @@
"tree-kill": "cli.js"
}
},
"node_modules/trough": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/trough/-/trough-2.2.0.tgz",
"integrity": "sha512-tmMpK00BjZiUyVyvrBK7knerNgmgvcV/KLVyuma/SC+TQN167GrMRciANTz09+k3zW8L8t60jWO1GpfkZdjTaw==",
"license": "MIT",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/truncate-utf8-bytes": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/truncate-utf8-bytes/-/truncate-utf8-bytes-1.0.2.tgz",
@ -7122,6 +7443,18 @@
"node": ">=14.17"
}
},
"node_modules/uint8array-extras": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/uint8array-extras/-/uint8array-extras-1.5.0.tgz",
"integrity": "sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==",
"license": "MIT",
"engines": {
"node": ">=18"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/unbzip2-stream": {
"version": "1.4.3",
"resolved": "https://registry.npmjs.org/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz",
@ -7139,6 +7472,25 @@
"devOptional": true,
"license": "MIT"
},
"node_modules/unified": {
"version": "11.0.5",
"resolved": "https://registry.npmjs.org/unified/-/unified-11.0.5.tgz",
"integrity": "sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0",
"bail": "^2.0.0",
"devlop": "^1.0.0",
"extend": "^3.0.0",
"is-plain-obj": "^4.0.0",
"trough": "^2.0.0",
"vfile": "^6.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/unique-filename": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/unique-filename/-/unique-filename-4.0.0.tgz",
@ -7165,6 +7517,19 @@
"node": "^18.17.0 || >=20.5.0"
}
},
"node_modules/unist-util-stringify-position": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz",
"integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/universalify": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz",
@ -7228,6 +7593,40 @@
"node": ">=0.6.0"
}
},
"node_modules/vfile": {
"version": "6.0.3",
"resolved": "https://registry.npmjs.org/vfile/-/vfile-6.0.3.tgz",
"integrity": "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0",
"vfile-message": "^4.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/vfile-message": {
"version": "4.0.3",
"resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz",
"integrity": "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw==",
"license": "MIT",
"dependencies": {
"@types/unist": "^3.0.0",
"unist-util-stringify-position": "^4.0.0"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/unified"
}
},
"node_modules/wasm-feature-detect": {
"version": "1.8.0",
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz",
"integrity": "sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==",
"license": "Apache-2.0"
},
"node_modules/wcwidth": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/wcwidth/-/wcwidth-1.0.1.tgz",
@ -7244,6 +7643,22 @@
"integrity": "sha512-ARrjNjtWRRs2w4Tk7nqrf2gBI0QXWuOmMCx2hU+1jUt6d00MjMxURrhxhGbrsoiZKJrhTSTzbIrc554iKI10qw==",
"license": "Apache-2.0"
},
"node_modules/webidl-conversions": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
"license": "BSD-2-Clause"
},
"node_modules/whatwg-url": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
"integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
"license": "MIT",
"dependencies": {
"tr46": "~0.0.3",
"webidl-conversions": "^3.0.0"
}
},
"node_modules/which": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/which/-/which-5.0.0.tgz",
@ -7399,6 +7814,15 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/zlibjs": {
"version": "0.3.1",
"resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz",
"integrity": "sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==",
"license": "MIT",
"engines": {
"node": "*"
}
},
"node_modules/zod": {
"version": "3.25.76",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",

View file

@ -26,10 +26,10 @@
"automation"
],
"scripts": {
"start": "electron .",
"start": "electron . --no-sandbox",
"dist": "electron-builder",
"postinstall": "electron-builder install-app-deps",
"dev": "rm -rf app_dist && tsc && electron .",
"dev": "rm -rf app_dist && tsc && electron . --no-sandbox",
"setup:env": "npm install && cd servers/fastapi && uv sync && cd ../../servers/nextjs && npm install && cd ../.. && npm run setup:export-runtime",
"install:pyinstaller": "cd servers/fastapi && echo 'pyinstaller already in dependencies'",
"build:ts": "rm -rf app_dist && tsc",
@ -51,6 +51,7 @@
"email": "suraj@presenton.ai"
},
"dependencies": {
"@llamaindex/liteparse": "^1.4.0",
"@puppeteer/browsers": "^1.9.1",
"@tailwindcss/cli": "^4.1.5",
"@types/uuid": "^10.0.0",

View file

@ -0,0 +1,147 @@
#!/usr/bin/env node
/**
* CLI bridge for Python: one JSON line on stdout for LiteParse extraction.
*
* OCR follows LlamaIndex LiteParse guidance (built-in Tesseract by default):
* https://developers.llamaindex.ai/liteparse/guides/ocr/
*
* - ISO 639-3 for Tesseract (eng, fra, deu, jpn, …); multi-lang as "deu+eng" or "deu,eng".
* - Parallel workers ≈ CPU cores − 1 (override --num-workers).
* - Optional HTTP OCR: --ocr-server-url or LITEPARSE_OCR_SERVER_URL.
* - Optional local models: --tessdata-path or LITEPARSE_TESSDATA_PATH (else TESSDATA_PREFIX / CDN).
*/
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { LiteParse } from "@llamaindex/liteparse";
/**
 * Look up the value that follows a command-line flag.
 *
 * @param name Flag to search for in process.argv (e.g. "--file").
 * @returns The argv entry immediately after the flag, or null when the flag
 *          is absent or is the last argv entry.
 */
function readArg(name) {
  const position = process.argv.indexOf(name);
  const value = position >= 0 ? process.argv[position + 1] : undefined;
  // `undefined` covers both "flag not present" and "flag was the final argument".
  return value === undefined || value === null ? null : value;
}
/**
 * Interpret a loosely formatted boolean flag value.
 *
 * @param value Raw value; null, undefined and "" mean "not provided".
 * @param fallback Result used when the value is not provided or not recognised.
 * @returns true for 1/true/yes/on, false for 0/false/no/off (case-insensitive,
 *          surrounding whitespace ignored), otherwise `fallback`.
 */
function parseBool(value, fallback) {
  if (value === null || value === undefined || value === "") {
    return fallback;
  }
  switch (String(value).trim().toLowerCase()) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    case "0":
    case "false":
    case "no":
    case "off":
      return false;
    default:
      return fallback;
  }
}
/**
 * Parse a numeric option and clamp it to the inclusive range [min, max].
 *
 * @param value Raw value (typically a CLI string); null, undefined, empty and
 *              whitespace-only values count as "not provided".
 * @param fallback Returned as-is when the value is not provided or not numeric.
 * @param min Lower bound applied to a successfully parsed value.
 * @param max Upper bound applied to a successfully parsed value.
 * @returns The clamped number, or `fallback`.
 */
function toNumber(value, fallback, min, max) {
  if (value == null) return fallback;
  // Trim first: Number("   ") is 0, which would otherwise be clamped to `min`
  // instead of falling back to the default.
  const text = String(value).trim();
  if (text === "") return fallback;
  const parsed = Number(text);
  if (Number.isNaN(parsed)) return fallback;
  return Math.min(Math.max(parsed, min), max);
}
/**
 * Normalise an OCR language spec to the "+"-joined form (e.g. "deu+eng").
 *
 * Accepts comma-separated ("deu,eng") or plus-separated ("deu+eng") lists for
 * convenience. Whitespace around codes and empty segments are dropped.
 *
 * @param raw Raw language value from the CLI or environment; may be null/undefined.
 * @returns The normalised spec, or "eng" when no language code remains
 *          (empty input, or input made only of separators such as ",").
 */
function normalizeOcrLanguage(raw) {
  const codes = String(raw ?? "")
    .split(/[,+]/)
    .map((code) => code.trim())
    .filter(Boolean);
  // Never hand an empty language string to the OCR engine.
  return codes.length > 0 ? codes.join("+") : "eng";
}
/**
 * Print `result` as a single JSON line on stdout, then terminate the process
 * with `exitCode`.
 *
 * The process is terminated from the write callback rather than immediately
 * after write(): calling process.exit() right after process.stdout.write() can
 * discard output that has not been flushed yet when stdout is a pipe, which
 * would leave the parent process with a truncated, unparseable JSON line for
 * large documents.
 *
 * This function returns to its caller (the exit happens asynchronously), so
 * callers must `return` right after invoking it.
 *
 * @param result JSON-serialisable payload.
 * @param exitCode Process exit status (default 0).
 */
function emit(result, exitCode = 0) {
  // Record the status up front so it is still reported if the process ends
  // through some other path before the callback runs.
  process.exitCode = exitCode;
  process.stdout.write(`${JSON.stringify(result)}\n`, () => {
    process.exit(exitCode);
  });
}

/**
 * Entry point: read CLI flags / environment, run LiteParse on the requested
 * file and emit exactly one JSON result line.
 *
 * Exit status: 0 on success, 1 when parsing throws, 2 for a missing --file
 * argument or a path that does not exist.
 */
async function main() {
  const filePath = readArg("--file");
  if (!filePath) {
    emit({ ok: false, error: "Missing required --file argument" }, 2);
    return;
  }

  const resolvedPath = path.resolve(filePath);
  if (!fs.existsSync(resolvedPath)) {
    emit({ ok: false, error: `File not found: ${resolvedPath}` }, 2);
    return;
  }

  const ocrEnabled = parseBool(readArg("--ocr-enabled"), true);
  const dpi = toNumber(readArg("--dpi"), 150, 72, 600);
  // Default worker count leaves one CPU core free, with a floor of one worker.
  const numWorkers = toNumber(
    readArg("--num-workers"),
    Math.max(os.cpus().length - 1, 1),
    1,
    64
  );

  // NOTE(review): the environment variable wins over --ocr-language here,
  // whereas for the OCR server URL and tessdata path below the CLI flag wins
  // over the environment — confirm this precedence is intended.
  const cliOcrLanguage = readArg("--ocr-language");
  const ocrLanguageRaw =
    (process.env.LITEPARSE_OCR_LANGUAGE && String(process.env.LITEPARSE_OCR_LANGUAGE).trim()) ||
    (cliOcrLanguage && String(cliOcrLanguage).trim()) ||
    "";
  const ocrLanguage = normalizeOcrLanguage(ocrLanguageRaw || "eng");

  // Anything other than "json" is treated as "text".
  const outputFormatRaw = (readArg("--output-format") || "text").trim().toLowerCase();
  const outputFormat = outputFormatRaw === "json" ? "json" : "text";

  const ocrServerUrlArg = readArg("--ocr-server-url");
  const ocrServerUrl =
    (ocrServerUrlArg && String(ocrServerUrlArg).trim()) ||
    (process.env.LITEPARSE_OCR_SERVER_URL && String(process.env.LITEPARSE_OCR_SERVER_URL).trim()) ||
    undefined;

  const tessdataArg = readArg("--tessdata-path");
  const tessdataPath =
    (tessdataArg && String(tessdataArg).trim()) ||
    (process.env.LITEPARSE_TESSDATA_PATH && String(process.env.LITEPARSE_TESSDATA_PATH).trim()) ||
    (process.env.TESSDATA_PREFIX && String(process.env.TESSDATA_PREFIX).trim()) ||
    undefined;

  try {
    const config = {
      ocrEnabled,
      ocrLanguage,
      outputFormat,
      dpi,
      numWorkers,
    };
    // Optional settings are only added when a value was actually supplied.
    if (ocrServerUrl) {
      config.ocrServerUrl = ocrServerUrl;
    }
    if (tessdataPath) {
      config.tessdataPath = tessdataPath;
    }

    const parser = new LiteParse(config);
    // The meaning of the second argument is defined by LiteParse and is not
    // visible from this file.
    const result = await parser.parse(resolvedPath, true);
    const text = result?.text ?? "";
    emit({
      ok: true,
      filePath: resolvedPath,
      text,
      pageCount: Array.isArray(result?.pages) ? result.pages.length : 0,
      ocr: {
        engine: ocrServerUrl ? "http" : "tesseract",
        ocrLanguage,
        ocrEnabled,
        dpi,
        numWorkers,
      },
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    const stack = error instanceof Error ? error.stack : undefined;
    // The stack goes to stderr so stdout still carries exactly one JSON line.
    if (stack) {
      process.stderr.write(`${stack}\n`);
    }
    emit(
      {
        ok: false,
        filePath: resolvedPath,
        error: message,
      },
      1
    );
  }
}

await main();

View file

@ -0,0 +1,8 @@
{
"name": "presenton-document-extraction",
"private": true,
"type": "module",
"dependencies": {
"@llamaindex/liteparse": "^1.4.0"
}
}

View file

@ -12,6 +12,7 @@ window.addEventListener("DOMContentLoaded", () => {
const labelMap = {
libreoffice: "LibreOffice",
puppeteer: "Chromium",
imagemagick: "ImageMagick",
};
const dependenciesEl = document.getElementById("status-dependencies");
@ -24,6 +25,7 @@ window.addEventListener("DOMContentLoaded", () => {
const currentStatus = {
libreoffice: "checking",
puppeteer: "checking",
imagemagick: "checking",
};
function setStatus(name, status) {
@ -83,6 +85,7 @@ window.addEventListener("DOMContentLoaded", () => {
if (!statusMap) return;
if (statusMap.libreoffice) setStatus("libreoffice", statusMap.libreoffice);
if (statusMap.puppeteer) setStatus("puppeteer", statusMap.puppeteer);
if (statusMap.imagemagick) setStatus("imagemagick", statusMap.imagemagick);
});
}
});

View file

@ -141,7 +141,7 @@
<div id="state-prompt" class="state active">
<div class="icon-wrap purple">📦</div>
<p class="heading" id="prompt-heading">Dependencies required</p>
<p class="sub" id="prompt-sub">Presenton needs LibreOffice and Chrome to create and export presentations. Install them now so everything works.</p>
<p class="sub" id="prompt-sub">Presenton needs LibreOffice, Chrome, and ImageMagick to create and export presentations reliably. Install them now so everything works.</p>
<div class="btn-row">
<button class="btn-primary" id="btn-install">Install</button>
<button class="btn-ghost" id="btn-skip">Skip for now</button>
@ -212,8 +212,9 @@
<script>
const STATES = ['prompt','downloading','installing','success','error'];
let logLines = 0;
let currentStep = null; // 'libreoffice' | 'chrome'
let status = { needsLibreOffice: false, needsChrome: false };
let currentStep = null; // 'libreoffice' | 'chrome' | 'imagemagick'
let status = { needsLibreOffice: false, needsChrome: false, needsImageMagick: false };
let steps = [];
let logOpen = false;
function showState(name) {
@ -264,15 +265,30 @@
document.getElementById('log-toggle-label').textContent = logOpen ? 'Hide details' : 'Show details';
}
// Build the ordered list of install steps from the current `status` flags:
// LibreOffice first, then Chromium, then ImageMagick. Steps whose flag is
// falsy are left out.
function getStepsFromStatus() {
  const candidates = [
    ['libreoffice', status.needsLibreOffice],
    ['chrome', status.needsChrome],
    ['imagemagick', status.needsImageMagick],
  ];
  return candidates
    .filter(([, needed]) => needed)
    .map(([step]) => step);
}
function showPromptForStep(step) {
currentStep = step;
const total = (status.needsLibreOffice ? 1 : 0) + (status.needsChrome ? 1 : 0);
const stepNum = step === 'libreoffice' ? 1 : 2;
setStepBadge(stepNum, total, step === 'libreoffice' ? 'LibreOffice' : 'Chromium');
document.getElementById('prompt-heading').textContent = step === 'libreoffice' ? 'LibreOffice required' : 'Chromium required';
document.getElementById('prompt-sub').innerHTML = step === 'libreoffice'
? '<strong>Presenton</strong> uses LibreOffice to generate custom templates from PPTX files.'
: '<strong>Presenton</strong> uses Chromium for export and slide rendering. Download it now (~150 MB).';
const total = steps.length || 1;
const stepNum = Math.max(1, steps.indexOf(step) + 1);
const stepLabel = step === 'libreoffice' ? 'LibreOffice' : step === 'chrome' ? 'Chromium' : 'ImageMagick';
setStepBadge(stepNum, total, stepLabel);
document.getElementById('prompt-heading').textContent =
step === 'libreoffice' ? 'LibreOffice required' :
step === 'chrome' ? 'Chromium required' :
'ImageMagick required';
document.getElementById('prompt-sub').innerHTML =
step === 'libreoffice'
? '<strong>Presenton</strong> uses LibreOffice to generate custom templates from PPTX files.'
: step === 'chrome'
? '<strong>Presenton</strong> uses Chromium for export and slide rendering. Download it now (~150 MB).'
: '<strong>Presenton</strong> uses ImageMagick for OCR/document conversion support. We will try automatic installation first, then open the download page if package manager tools are unavailable.';
document.getElementById('btn-install').onclick = () => startInstall(step);
document.getElementById('btn-skip').onclick = () => handleSkip();
showState('prompt');
@ -286,7 +302,7 @@
document.getElementById('dl-heading').textContent = 'Downloading LibreOffice';
document.getElementById('dl-phase').textContent = 'This may take a few minutes (~300 MB)';
window.setupInstaller.installLibreOffice();
} else {
} else if (step === 'chrome') {
document.getElementById('dl-heading').textContent = 'Downloading Chromium';
document.getElementById('dl-phase').textContent = 'This may take a few minutes (~150 MB)';
window.setupInstaller.installChrome().then(res => {
@ -297,12 +313,27 @@
document.getElementById('btn-skip-error').onclick = () => nextOrDone();
}
});
} else {
document.getElementById('dl-heading').textContent = 'Installing ImageMagick';
document.getElementById('dl-phase').textContent = 'Automatic install (apt/brew/choco) with fallback to manual download';
window.setupInstaller.installImageMagick().then(() => {
window.setupInstaller.checkImageMagick().then(res => {
if (!res.ok && currentStep === 'imagemagick') {
document.getElementById('err-msg').textContent = res.error || 'ImageMagick is not installed yet.';
showState('error');
document.getElementById('btn-retry').onclick = () => startInstall('imagemagick');
document.getElementById('btn-skip-error').onclick = () => nextOrDone();
}
});
});
}
}
function nextOrDone() {
if (currentStep === 'libreoffice' && status.needsChrome) {
showPromptForStep('chrome');
const idx = steps.indexOf(currentStep);
const nextStep = idx >= 0 ? steps[idx + 1] : null;
if (nextStep) {
showPromptForStep(nextStep);
} else {
window.setupInstaller.done();
}
@ -338,13 +369,17 @@
}
if (phase === 'done') {
showState('success');
document.getElementById('success-heading').textContent = currentStep === 'libreoffice' ? 'LibreOffice installed' : 'Chromium installed';
document.getElementById('success-sub').textContent = status.needsChrome && currentStep === 'libreoffice' ? 'Next: Chrome.' : 'Continuing in a moment…';
document.getElementById('success-heading').textContent =
currentStep === 'libreoffice' ? 'LibreOffice installed' :
currentStep === 'chrome' ? 'Chromium installed' :
'ImageMagick ready';
const idx = steps.indexOf(currentStep);
const nextStep = idx >= 0 ? steps[idx + 1] : null;
document.getElementById('success-sub').textContent = nextStep ? 'Continuing with next step…' : 'Continuing in a moment…';
const bar = document.getElementById('success-bar');
if (bar) bar.style.width = '100%';
setTimeout(() => {
if (currentStep === 'libreoffice' && status.needsChrome) showPromptForStep('chrome');
else window.setupInstaller.done();
nextOrDone();
}, 2200);
return;
}
@ -366,18 +401,20 @@
window.setupInstaller.onLibreOfficeLog((data) => onLog('libreoffice', data));
window.setupInstaller.onChromeProgress((data) => onProgress('chrome', data));
window.setupInstaller.onChromeLog((data) => onLog('chrome', data));
window.setupInstaller.onImageMagickProgress((data) => onProgress('imagemagick', data));
window.setupInstaller.onImageMagickLog((data) => onLog('imagemagick', data));
document.getElementById('btn-retry').onclick = () => startInstall(currentStep);
document.getElementById('btn-skip-error').onclick = () => nextOrDone();
window.setupInstaller.getStatus().then(s => {
status = s;
if (!status.needsLibreOffice && !status.needsChrome) {
steps = getStepsFromStatus();
if (steps.length === 0) {
window.setupInstaller.done();
return;
}
if (status.needsLibreOffice) showPromptForStep('libreoffice');
else showPromptForStep('chrome');
showPromptForStep(steps[0]);
});
</script>
</body>

View file

@ -4,6 +4,7 @@ from typing import Annotated, List, Optional
from fastapi import APIRouter, Body, File, UploadFile
from constants.documents import UPLOAD_ACCEPTED_FILE_TYPES
from models.decompose_files_body import DecomposeFilesBody
from models.decomposed_file_info import DecomposedFileInfo
from services.temp_file_service import TEMP_FILE_SERVICE
from services.documents_loader import DocumentsLoader
@ -38,18 +39,21 @@ async def upload_files(files: Optional[List[UploadFile]]):
@FILES_ROUTER.post("/decompose", response_model=List[DecomposedFileInfo])
async def decompose_files(file_paths: Annotated[List[str], Body(embed=True)]):
async def decompose_files(body: DecomposeFilesBody):
temp_dir = TEMP_FILE_SERVICE.create_temp_dir(str(uuid.uuid4()))
txt_files = []
other_files = []
for file_path in file_paths:
for file_path in body.file_paths:
if file_path.endswith(".txt"):
txt_files.append(file_path)
else:
other_files.append(file_path)
documents_loader = DocumentsLoader(file_paths=other_files)
documents_loader = DocumentsLoader(
file_paths=other_files,
presentation_language=body.language,
)
await documents_loader.load_documents(temp_dir)
parsed_documents = documents_loader.documents

View file

@ -43,7 +43,10 @@ async def stream_outlines(
additional_context = ""
if presentation.file_paths:
documents_loader = DocumentsLoader(file_paths=presentation.file_paths)
documents_loader = DocumentsLoader(
file_paths=presentation.file_paths,
presentation_language=presentation.language,
)
await documents_loader.load_documents(temp_dir)
documents = documents_loader.documents
if documents:

View file

@ -518,7 +518,10 @@ async def generate_presentation_handler(
await sql_session.commit()
if request.files:
documents_loader = DocumentsLoader(file_paths=request.files)
documents_loader = DocumentsLoader(
file_paths=request.files,
presentation_language=request.language,
)
await documents_loader.load_documents()
documents = documents_loader.documents
if documents:

View file

@ -13,8 +13,9 @@ SPREADSHEET_TYPES = ["text/csv", "application/csv"]
PNG_MIME_TYPES = ["image/png"]
JPEG_MIME_TYPES = ["image/jpeg"]
WEBP_MIME_TYPES = ["image/webp"]
IMAGE_MIME_TYPES = PNG_MIME_TYPES + JPEG_MIME_TYPES + WEBP_MIME_TYPES
UPLOAD_ACCEPTED_FILE_TYPES = (
PDF_MIME_TYPES + TEXT_MIME_TYPES + POWERPOINT_TYPES + WORD_TYPES
PDF_MIME_TYPES + TEXT_MIME_TYPES + POWERPOINT_TYPES + WORD_TYPES + IMAGE_MIME_TYPES
)

View file

@ -0,0 +1,11 @@
from typing import List, Optional
from pydantic import BaseModel, Field
class DecomposeFilesBody(BaseModel):
    """Request body model carrying the file paths to decompose and an optional language."""

    # Paths of the files to process.
    file_paths: List[str]
    # Optional language; None when the client does not send one.
    language: Optional[str] = Field(
        default=None,
        description="Presentation language from the UI; used as LiteParse/Tesseract OCR language hint.",
    )

View file

@ -1,47 +1,45 @@
[project]
name = "presenton-backend"
version = "0.1.0"
description = "Add your description here"
requires-python = ">=3.11,<3.12"
dependencies = [
"alembic>=1.14.0",
"aiohttp>=3.12.15",
"aiomysql>=0.2.0",
"aiosqlite>=0.21.0",
"anthropic>=0.60.0",
"asyncpg>=0.30.0",
"dirtyjson>=1.0.8",
# Platform-specific: docling for Linux/macOS only
"docling>=2.43.0; sys_platform != 'win32'",
"fastapi[standard]>=0.116.1",
"fastembed-vectorstore>=0.5.2",
"fastmcp>=2.11.0",
"google-genai>=1.28.0",
# Platform-specific: greenlet for macOS only (critical for SQLAlchemy async)
"greenlet>=3.0.0; sys_platform == 'darwin'",
"nltk>=3.9.1",
"openai>=1.98.0",
"pathvalidate>=3.3.1",
"pdfplumber>=0.11.7",
# Platform-specific: docx2everything for DOCX/Markdown extraction on Windows
"docx2everything>=1.0.0; sys_platform == 'win32'",
"pyinstaller>=6.18.0",
"pytest>=8.4.1",
"python-pptx>=1.0.2; sys_platform == 'win32'",
"redis>=6.2.0",
"sqlmodel>=0.0.24",
]
[tool.uv]
index-strategy = "unsafe-best-match"
[[tool.uv.index]]
url = "https://download.pytorch.org/whl/cpu"
[dependency-groups]
dev = [
]
[tool.setuptools.packages.find]
where = ["."]
include = ["api*", "enums*", "models*", "services*", "constants*", "utils*"]
[project]
name = "presenton-backend"
version = "0.1.0"
description = "Add your description here"
requires-python = ">=3.11,<3.12"
dependencies = [
"alembic>=1.14.0",
"aiohttp>=3.12.15",
"aiomysql>=0.2.0",
"aiosqlite>=0.21.0",
"anthropic>=0.60.0",
"asyncpg>=0.30.0",
"dirtyjson>=1.0.8",
"fastapi[standard]>=0.116.1",
"fastembed-vectorstore>=0.5.2",
"fastmcp>=2.11.0",
"google-genai>=1.28.0",
# Platform-specific: greenlet for macOS only (critical for SQLAlchemy async)
"greenlet>=3.0.0; sys_platform == 'darwin'",
"nltk>=3.9.1",
"openai>=1.98.0",
"pathvalidate>=3.3.1",
"pdfplumber>=0.11.7",
# Platform-specific: docx2everything for DOCX/Markdown extraction on Windows
"docx2everything>=1.0.0; sys_platform == 'win32'",
"pyinstaller>=6.18.0",
"pytest>=8.4.1",
"python-pptx>=1.0.2",
"redis>=6.2.0",
"sqlmodel>=0.0.24",
]
[tool.uv]
index-strategy = "unsafe-best-match"
[[tool.uv.index]]
url = "https://download.pytorch.org/whl/cpu"
[dependency-groups]
dev = [
]
[tool.setuptools.packages.find]
where = ["."]
include = ["api*", "enums*", "models*", "services*", "constants*", "utils*"]

View file

@ -1,78 +0,0 @@
"""
Runtime hook to fix docling metadata lookup and python-docx template path resolution in PyInstaller builds.
PyInstaller doesn't always preserve package metadata (dist-info) in a way that
importlib.metadata can find it. This hook patches the version lookup to return
a default version if metadata isn't found, allowing docling to import successfully.
Additionally, python-docx uses __file__ to locate template files, which doesn't work
correctly in PyInstaller bundles. This hook patches the path resolution to use
sys._MEIPASS to find the templates.
"""
import sys
import os
# Only apply this fix when running in PyInstaller bundle
if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'):
try:
import importlib.metadata
# Store original version function
_original_version = importlib.metadata.version
def _patched_version(package_name):
"""Patched version that handles missing metadata gracefully."""
try:
return _original_version(package_name)
except importlib.metadata.PackageNotFoundError:
# For docling packages, return a default version if metadata not found
if package_name in ('docling', 'docling-core', 'docling-parse', 'docling-ibm-models'):
# Return a reasonable default version to allow import to proceed
return '2.43.0'
raise
# Patch the version function
importlib.metadata.version = _patched_version
except Exception:
# If patching fails, continue anyway
pass
# Fix python-docx template path resolution
try:
import docx.parts.hdrftr as hdrftr_module
# Store the original _default_header_xml function
if hasattr(hdrftr_module, '_default_header_xml'):
_original_default_header_xml = hdrftr_module._default_header_xml
def _patched_default_header_xml():
"""Patched function that resolves template path correctly in PyInstaller bundle."""
# Try to find the template file in the bundle
template_path = os.path.join(sys._MEIPASS, 'docx', 'templates', 'default-header.xml')
if os.path.exists(template_path):
with open(template_path, 'rb') as f:
return f.read()
# Fallback to original implementation
return _original_default_header_xml()
# Patch the function
hdrftr_module._default_header_xml = _patched_default_header_xml
# Also patch _default_footer_xml if it exists
if hasattr(hdrftr_module, '_default_footer_xml'):
_original_default_footer_xml = hdrftr_module._default_footer_xml
def _patched_default_footer_xml():
"""Patched function that resolves template path correctly in PyInstaller bundle."""
template_path = os.path.join(sys._MEIPASS, 'docx', 'templates', 'default-footer.xml')
if os.path.exists(template_path):
with open(template_path, 'rb') as f:
return f.read()
return _original_default_footer_xml()
hdrftr_module._default_footer_xml = _patched_default_footer_xml
except Exception:
# If patching fails, continue anyway
pass

View file

@ -17,18 +17,6 @@ datas_docx2everything, binaries_docx2everything, hiddenimports_docx2everything =
# collect_all returns empty lists if package not installed, so safe to call always
datas_greenlet, binaries_greenlet, hiddenimports_greenlet = collect_all('greenlet')
# Collect docling - only installed on Linux/macOS (via pyproject.toml)
# collect_all returns empty lists if package not installed, so safe to call always
datas_docling, binaries_docling, hiddenimports_docling = collect_all('docling')
# Also collect docling dependencies which are needed for metadata lookup
datas_docling_core, binaries_docling_core, hiddenimports_docling_core = collect_all('docling-core')
datas_docling_parse, binaries_docling_parse, hiddenimports_docling_parse = collect_all('docling-parse')
datas_docling_ibm, binaries_docling_ibm, hiddenimports_docling_ibm = collect_all('docling-ibm-models')
# Collect python-docx (dependency of docling) - needed for Word document processing on Linux/macOS
# collect_all returns empty lists if package not installed, so safe to call conditionally
datas_docx, binaries_docx, hiddenimports_docx = collect_all('docx')
# fastembed_cache is created at runtime when models are first used; include only if present (e.g. local dev)
datas_fastembed_cache = [('fastembed_cache', 'fastembed_cache')] if os.path.isdir('fastembed_cache') else []
@ -37,12 +25,12 @@ excludes = []
a = Analysis(
['server.py'],
pathex=[],
binaries=binaries_fastembed + binaries_fastembed_vs + binaries_onnx + binaries_pptx + binaries_docx2everything + binaries_greenlet + binaries_docling + binaries_docling_core + binaries_docling_parse + binaries_docling_ibm + binaries_docx,
binaries=binaries_fastembed + binaries_fastembed_vs + binaries_onnx + binaries_pptx + binaries_docx2everything + binaries_greenlet,
datas=[
('assets', 'assets'),
('static', 'static'),
('alembic', 'alembic'),
] + datas_fastembed_cache + datas_fastembed + datas_fastembed_vs + datas_onnx + datas_pptx + datas_docx2everything + datas_greenlet + datas_docling + datas_docling_core + datas_docling_parse + datas_docling_ibm + datas_docx,
] + datas_fastembed_cache + datas_fastembed + datas_fastembed_vs + datas_onnx + datas_pptx + datas_docx2everything + datas_greenlet,
hiddenimports=[
'aiosqlite',
'alembic',
@ -52,10 +40,10 @@ a = Analysis(
'greenlet',
'greenlet._greenlet',
'importlib.metadata',
] + hiddenimports_fastembed + hiddenimports_fastembed_vs + hiddenimports_onnx + hiddenimports_pptx + hiddenimports_docx2everything + hiddenimports_greenlet + hiddenimports_docling + hiddenimports_docling_core + hiddenimports_docling_parse + hiddenimports_docling_ibm + hiddenimports_docx,
] + hiddenimports_fastembed + hiddenimports_fastembed_vs + hiddenimports_onnx + hiddenimports_pptx + hiddenimports_docx2everything + hiddenimports_greenlet,
hookspath=[],
hooksconfig={},
runtime_hooks=['runtime_hook_docling.py'],
runtime_hooks=[],
excludes=excludes,
noarchive=False,
optimize=0,

View file

@ -1,38 +0,0 @@
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
PowerpointFormatOption,
WordFormatOption,
)
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from utils.path_helpers import patch_python_docx_templates
class DoclingService:
    """Markdown extraction for PDF, DOCX and PPTX files via a docling converter.

    One pipeline-options object, with OCR switched off, is shared by the
    per-format options handed to the converter.
    """

    def __init__(self):
        # Template path resolution in python-docx has to be patched before the
        # converter is built. The helper is safe to call in any environment
        # (Docker, development, PyInstaller).
        patch_python_docx_templates()

        options = PdfPipelineOptions()
        options.do_ocr = False
        self.pipeline_options = options

        per_format_options = {
            InputFormat.DOCX: WordFormatOption(pipeline_options=options),
            InputFormat.PPTX: PowerpointFormatOption(pipeline_options=options),
            InputFormat.PDF: PdfFormatOption(pipeline_options=options),
        }
        self.converter = DocumentConverter(
            allowed_formats=[InputFormat.PPTX, InputFormat.PDF, InputFormat.DOCX],
            format_options=per_format_options,
        )

    def parse_to_markdown(self, file_path: str) -> str:
        """Convert the document at ``file_path`` and return it exported as markdown."""
        return self.converter.convert(file_path).document.export_to_markdown()

View file

@ -1,45 +1,37 @@
import mimetypes
import sys
from fastapi import HTTPException
import os, asyncio
from typing import List, Optional, Tuple
import pdfplumber
from constants.documents import (
IMAGE_MIME_TYPES,
PDF_MIME_TYPES,
POWERPOINT_TYPES,
TEXT_MIME_TYPES,
WORD_TYPES,
)
from services.liteparse_service import LiteParseError, LiteParseService
from utils.ocr_language import presentation_language_to_ocr_code
# Platform-specific document service imports
is_windows = sys.platform == 'win32'
if not is_windows:
from services.docling_service import DoclingService
DocumentService = None
else:
DoclingService = None
# Optional fallback converter (primarily useful on Windows)
try:
from services.lightweight_document_service import DocumentService
except Exception:
DocumentService = None
class DocumentsLoader:
def __init__(self, file_paths: List[str]):
def __init__(
self,
file_paths: List[str],
presentation_language: Optional[str] = None,
):
self._file_paths = file_paths
# Initialize document service based on platform
if not is_windows and DoclingService is not None:
# Use DoclingService on Linux/macOS
self.docling_service = DoclingService()
self.document_service = None
elif is_windows and DocumentService is not None:
# Use lightweight DocumentService on Windows
self.docling_service = None
self.document_service = DocumentService()
else:
# Fallback if neither is available
self.docling_service = None
self.document_service = None
self._ocr_language = presentation_language_to_ocr_code(presentation_language)
self.liteparse_service = LiteParseService()
self.document_service = DocumentService() if DocumentService is not None else None
self._documents: List[str] = []
self._images: List[List[str]] = []
@ -83,6 +75,8 @@ class DocumentsLoader:
document = self.load_powerpoint(file_path)
elif mime_type in WORD_TYPES:
document = self.load_msword(file_path)
elif mime_type in IMAGE_MIME_TYPES:
document = self.load_image(file_path)
documents.append(document)
images.append(imgs)
@ -101,43 +95,43 @@ class DocumentsLoader:
document: str = ""
if load_text:
document = await self.load_text_from_pdf_locally(file_path)
document = await asyncio.to_thread(self._parse_with_liteparse, file_path)
if load_images:
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
return document, image_paths
async def load_text_from_pdf_locally(self, file_path: str) -> str:
return await asyncio.to_thread(self._extract_text_from_pdf, file_path)
@staticmethod
def _extract_text_from_pdf(file_path: str) -> str:
texts: List[str] = []
with pdfplumber.open(file_path) as pdf:
for idx, page in enumerate(pdf.pages):
page_text = f"## Page {idx + 1}\n"
page_text += page.extract_text() or ""
texts.append(page_text)
return "\n\n".join(texts)
async def load_text(self, file_path: str) -> str:
    """Read a UTF-8 text file without blocking the event loop.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The full decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """

    def _read_all() -> str:
        # Opening and closing the file are blocking calls too, so the whole
        # open/read/close sequence runs in the worker thread, not just read().
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()

    return await asyncio.to_thread(_read_all)
def load_msword(self, file_path: str) -> str:
if self.docling_service is not None:
return self.docling_service.parse_to_markdown(file_path)
elif self.document_service is not None:
return self.document_service.parse_to_markdown(file_path)
return "" # Document service not available
return self._parse_with_liteparse(file_path)
def load_powerpoint(self, file_path: str) -> str:
if self.docling_service is not None:
return self.docling_service.parse_to_markdown(file_path)
elif self.document_service is not None:
return self.document_service.parse_to_markdown(file_path)
return "" # Document service not available
return self._parse_with_liteparse(file_path)
def load_image(self, file_path: str) -> str:
return self._parse_with_liteparse(file_path)
def _parse_with_liteparse(self, file_path: str) -> str:
    """Extract markdown with LiteParse, falling back to the lightweight converter.

    LiteParse is always invoked with OCR enabled and the OCR language resolved
    in the constructor. When it raises ``LiteParseError`` and a fallback
    document service exists, that service is tried once.

    Raises:
        HTTPException: Status 500 when LiteParse fails and the fallback is
            missing or fails as well; chained from the LiteParse error.
    """
    try:
        return self.liteparse_service.parse_to_markdown(
            file_path,
            ocr_enabled=True,
            ocr_language=self._ocr_language,
        )
    except LiteParseError as exc:
        fallback_service = self.document_service
        if fallback_service is not None:
            try:
                return fallback_service.parse_to_markdown(file_path)
            except Exception:
                # Best effort only: the LiteParse error below is what gets reported.
                pass
        failed_name = os.path.basename(file_path)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to parse document {failed_name}: {exc}",
        ) from exc
@classmethod
def get_page_images_from_pdf(cls, file_path: str, temp_dir: str) -> List[str]:

View file

@ -1,177 +1,177 @@
"""
Lightweight document converter for Windows/MSIX compatibility.
Uses pure-Python libraries: pdfplumber for PDF, docx2everything for DOCX, python-pptx for PPTX.
No subprocess, no external runtimes, MSIX/Appx safe.
"""
import os
from typing import List, Optional
import docx2everything
import pdfplumber
from pptx import Presentation
class LightweightDocumentConverter:
    """Lightweight document converter supporting PDF, DOCX, and PPTX.

    The converter dispatches on the file extension and returns the extracted
    text as markdown.
    """

    # Fallback heuristic: when a slide has no usable title placeholder, the
    # first text shape shorter than this many characters is used as the title.
    _MAX_TITLE_LENGTH = 200

    def convert(self, file_path: str) -> str:
        """
        Convert document to markdown text.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text in markdown format

        Raises:
            ValueError: If file format is not supported
            FileNotFoundError: If file does not exist
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == ".pdf":
            return self._convert_pdf(file_path)
        if file_ext == ".docx":
            return self._convert_docx(file_path)
        if file_ext == ".pptx":
            return self._convert_pptx(file_path)
        raise ValueError(f"Unsupported file format: {file_ext}")

    def _convert_pdf(self, path: str) -> str:
        """
        Convert PDF to markdown using pdfplumber.

        Args:
            path: Path to PDF file

        Returns:
            One "## Page N" section per page, separated by blank lines
        """
        with pdfplumber.open(path) as pdf:
            # extract_text() may yield nothing for a page; keep the heading anyway.
            sections = [
                f"## Page {page_number}\n" + (page.extract_text() or "")
                for page_number, page in enumerate(pdf.pages, start=1)
            ]
        return "\n\n".join(sections)

    def _convert_docx(self, path: str) -> str:
        """
        Extract markdown from DOCX using docx2everything (no images).

        Args:
            path: Path to DOCX file

        Returns:
            Extracted markdown, or an empty string when nothing was extracted
        """
        # Calling process_to_markdown without an image directory yields text only.
        return docx2everything.process_to_markdown(path) or ""

    def _convert_pptx(self, path: str) -> str:
        """
        Convert PPTX to markdown using python-pptx.

        Each slide becomes a "# title" heading followed by its text content;
        slides are separated by a horizontal rule.

        Args:
            path: Path to PPTX file

        Returns:
            Extracted text in markdown format
        """
        prs = Presentation(path)
        markdown_parts: List[str] = []
        for slide_num, slide in enumerate(prs.slides, start=1):
            title_text = self._find_slide_title(slide)
            heading = f"# {title_text}" if title_text else f"# Slide {slide_num}"
            slide_parts = [heading, *self._collect_slide_body(slide, title_text)]
            markdown_parts.append("\n".join(slide_parts))
        return "\n\n---\n\n".join(markdown_parts)

    def _find_slide_title(self, slide) -> Optional[str]:
        """Return the slide's title text, or None when no candidate exists."""
        # The previous check, hasattr(shape, "placeholder"), never matched a
        # python-pptx shape, so the real title placeholder was ignored.
        # `shapes.title` is the documented accessor for that placeholder.
        title_shape = getattr(slide.shapes, "title", None)
        if title_shape is not None:
            placeholder_text = (getattr(title_shape, "text", "") or "").strip()
            if placeholder_text:
                return placeholder_text

        # No usable title placeholder: fall back to the first short text shape.
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            candidate = shape.text.strip()
            if candidate and len(candidate) < self._MAX_TITLE_LENGTH:
                return candidate
        return None

    @staticmethod
    def _collect_slide_body(slide, title_text: Optional[str]) -> List[str]:
        """Return markdown lines for every text shape except the title."""
        lines: List[str] = []
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            text = shape.text.strip()
            if not text:
                continue
            # Shape objects are not compared by identity; the title is
            # recognised by its text so it is not emitted twice.
            if title_text and text == title_text:
                continue

            paragraphs = shape.text_frame.paragraphs if hasattr(shape, "text_frame") else []
            if len(paragraphs) > 1:
                # Several paragraphs are rendered as a bullet list, indented
                # by each paragraph's outline level.
                for para in paragraphs:
                    para_text = para.text.strip()
                    if para_text:
                        indent = " " * para.level
                        lines.append(f"{indent}- {para_text}")
            else:
                lines.append(text)
        return lines
class DocumentService:
    """
    Thin facade over LightweightDocumentConverter.

    Exposes the same ``parse_to_markdown(file_path)`` method as DoclingService
    so either service can be used in its place.
    """

    def __init__(self):
        self.converter = LightweightDocumentConverter()

    def parse_to_markdown(self, file_path: str) -> str:
        """
        Convert a document to markdown.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text in markdown format
        """
        markdown = self.converter.convert(file_path)
        return markdown
"""
Lightweight document converter for Windows/MSIX compatibility.
Uses pure-Python libraries: pdfplumber for PDF, docx2everything for DOCX, python-pptx for PPTX.
No subprocess, no external runtimes, MSIX/Appx safe.
"""
import os
from typing import List, Optional
import docx2everything
import pdfplumber
from pptx import Presentation
class LightweightDocumentConverter:
    """Lightweight document converter supporting PDF, DOCX, and PPTX.

    The converter dispatches on the file extension and returns the extracted
    text as markdown.
    """

    # Fallback heuristic: when a slide has no usable title placeholder, the
    # first text shape shorter than this many characters is used as the title.
    _MAX_TITLE_LENGTH = 200

    def convert(self, file_path: str) -> str:
        """
        Convert document to markdown text.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text in markdown format

        Raises:
            ValueError: If file format is not supported
            FileNotFoundError: If file does not exist
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == ".pdf":
            return self._convert_pdf(file_path)
        if file_ext == ".docx":
            return self._convert_docx(file_path)
        if file_ext == ".pptx":
            return self._convert_pptx(file_path)
        raise ValueError(f"Unsupported file format: {file_ext}")

    def _convert_pdf(self, path: str) -> str:
        """
        Convert PDF to markdown using pdfplumber.

        Args:
            path: Path to PDF file

        Returns:
            One "## Page N" section per page, separated by blank lines
        """
        with pdfplumber.open(path) as pdf:
            # extract_text() may yield nothing for a page; keep the heading anyway.
            sections = [
                f"## Page {page_number}\n" + (page.extract_text() or "")
                for page_number, page in enumerate(pdf.pages, start=1)
            ]
        return "\n\n".join(sections)

    def _convert_docx(self, path: str) -> str:
        """
        Extract markdown from DOCX using docx2everything (no images).

        Args:
            path: Path to DOCX file

        Returns:
            Extracted markdown, or an empty string when nothing was extracted
        """
        # Calling process_to_markdown without an image directory yields text only.
        return docx2everything.process_to_markdown(path) or ""

    def _convert_pptx(self, path: str) -> str:
        """
        Convert PPTX to markdown using python-pptx.

        Each slide becomes a "# title" heading followed by its text content;
        slides are separated by a horizontal rule.

        Args:
            path: Path to PPTX file

        Returns:
            Extracted text in markdown format
        """
        prs = Presentation(path)
        markdown_parts: List[str] = []
        for slide_num, slide in enumerate(prs.slides, start=1):
            title_text = self._find_slide_title(slide)
            heading = f"# {title_text}" if title_text else f"# Slide {slide_num}"
            slide_parts = [heading, *self._collect_slide_body(slide, title_text)]
            markdown_parts.append("\n".join(slide_parts))
        return "\n\n---\n\n".join(markdown_parts)

    def _find_slide_title(self, slide) -> Optional[str]:
        """Return the slide's title text, or None when no candidate exists."""
        # The previous check, hasattr(shape, "placeholder"), never matched a
        # python-pptx shape, so the real title placeholder was ignored.
        # `shapes.title` is the documented accessor for that placeholder.
        title_shape = getattr(slide.shapes, "title", None)
        if title_shape is not None:
            placeholder_text = (getattr(title_shape, "text", "") or "").strip()
            if placeholder_text:
                return placeholder_text

        # No usable title placeholder: fall back to the first short text shape.
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            candidate = shape.text.strip()
            if candidate and len(candidate) < self._MAX_TITLE_LENGTH:
                return candidate
        return None

    @staticmethod
    def _collect_slide_body(slide, title_text: Optional[str]) -> List[str]:
        """Return markdown lines for every text shape except the title."""
        lines: List[str] = []
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            text = shape.text.strip()
            if not text:
                continue
            # Shape objects are not compared by identity; the title is
            # recognised by its text so it is not emitted twice.
            if title_text and text == title_text:
                continue

            paragraphs = shape.text_frame.paragraphs if hasattr(shape, "text_frame") else []
            if len(paragraphs) > 1:
                # Several paragraphs are rendered as a bullet list, indented
                # by each paragraph's outline level.
                for para in paragraphs:
                    para_text = para.text.strip()
                    if para_text:
                        indent = " " * para.level
                        lines.append(f"{indent}- {para_text}")
            else:
                lines.append(text)
        return lines
class DocumentService:
    """
    Thin facade over LightweightDocumentConverter.

    Offers a ``parse_to_markdown(file_path)`` entry point like LiteParseService,
    for use as the optional Windows fallback.
    """

    def __init__(self):
        self.converter = LightweightDocumentConverter()

    def parse_to_markdown(self, file_path: str) -> str:
        """
        Convert a document to markdown.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text in markdown format
        """
        markdown = self.converter.convert(file_path)
        return markdown

View file

@ -0,0 +1,197 @@
import json
import os
import subprocess
from typing import Any, Dict, Tuple
class LiteParseError(Exception):
    """Raised when the LiteParse runtime is unavailable or a document cannot be parsed."""


class LiteParseService:
    """Parses documents by running the Node.js LiteParse runner in a subprocess.

    The runner script is started with the configured Node binary and is
    expected to print a JSON object on stdout. Every failure of that pipeline
    (missing runtime, spawn error, timeout, non-zero exit, malformed output)
    is reported as ``LiteParseError``.

    Environment overrides:
        LITEPARSE_NODE_BINARY: Node executable (default ``node``).
        LITEPARSE_RUNNER_PATH: Path to ``liteparse_runner.mjs``.
        LITEPARSE_OCR_SERVER_URL: Forwarded as ``--ocr-server-url`` when set.
        LITEPARSE_TESSDATA_PATH: Forwarded as ``--tessdata-path`` when set.
    """

    def __init__(self, timeout_seconds: int = 180):
        """
        Args:
            timeout_seconds: Upper bound for a single runner invocation.
        """
        self.timeout_seconds = timeout_seconds
        # `or` (instead of a getenv default) also ignores overrides that are
        # set but empty, which would otherwise yield an unusable empty path.
        self.node_binary = os.getenv("LITEPARSE_NODE_BINARY") or "node"
        self.runner_path = os.getenv("LITEPARSE_RUNNER_PATH") or self._resolve_runner_path()
        self.runner_dir = os.path.dirname(self.runner_path)
        self._npm_project_root = self._resolve_npm_project_root()
        # Set once the runtime probe has succeeded; see parse().
        self._runtime_verified = False

    def _resolve_npm_project_root(self) -> str:
        """Directory whose node_modules contains @llamaindex/liteparse (runner dir or Electron app root)."""
        local_package = os.path.join(
            self.runner_dir, "node_modules", "@llamaindex", "liteparse"
        )
        if os.path.isdir(local_package):
            return self.runner_dir
        # Otherwise use the directory two levels above the runner, whether or
        # not the package is present there; check_runtime_ready() reports a
        # missing package with a precise message.
        return os.path.abspath(os.path.join(self.runner_dir, "..", ".."))

    @staticmethod
    def _resolve_runner_path() -> str:
        """Return the first existing runner script among the known layouts.

        Candidates are resolved relative to the current working directory.
        When none exists the first candidate is returned so that later error
        messages name a concrete path.
        """
        cwd = os.path.abspath(".")
        runner_name = "liteparse_runner.mjs"
        candidates = [
            # electron/servers/fastapi → electron/resources/...
            os.path.abspath(
                os.path.join(cwd, "..", "..", "resources", "document-extraction", runner_name)
            ),
            # servers/fastapi (repo root layout) → electron/resources/...
            os.path.abspath(
                os.path.join(
                    cwd, "..", "..", "electron", "resources", "document-extraction", runner_name
                )
            ),
            # PyInstaller bundle layout
            os.path.abspath(
                os.path.join(cwd, "..", "..", "app", "resources", "document-extraction", runner_name)
            ),
            # Docker / explicit layout
            "/app/document-extraction-liteparse/liteparse_runner.mjs",
        ]
        for candidate in candidates:
            if os.path.isfile(candidate):
                return candidate
        return candidates[0]

    def check_runtime_ready(self) -> Tuple[bool, str]:
        """Probe the runner script, the Node binary and the LiteParse npm package.

        Returns:
            ``(True, "ok")`` when everything is usable, otherwise ``(False,
            reason)``. This method never raises.
        """
        if not os.path.isfile(self.runner_path):
            return False, f"LiteParse runner not found at: {self.runner_path}"

        try:
            subprocess.run(
                [self.node_binary, "--version"],
                cwd=self.runner_dir,
                check=True,
                capture_output=True,
                text=True,
                timeout=10,
            )
        except Exception as exc:
            return False, f"Node.js runtime is unavailable: {exc}"

        liteparse_dir = os.path.join(
            self._npm_project_root, "node_modules", "@llamaindex", "liteparse"
        )
        if not os.path.isdir(liteparse_dir):
            return (
                False,
                f"LiteParse npm package missing at {liteparse_dir}. Run npm install in the Electron app directory.",
            )

        # @llamaindex/liteparse is ESM-only; require.resolve() fails. Use dynamic import.
        try:
            subprocess.run(
                [
                    self.node_binary,
                    "--input-type=module",
                    "-e",
                    "import '@llamaindex/liteparse'",
                ],
                cwd=self._npm_project_root,
                check=True,
                capture_output=True,
                text=True,
                timeout=20,
            )
        except Exception as exc:
            return False, f"LiteParse dependency is unavailable: {exc}"

        return True, "ok"

    def parse_to_markdown(
        self,
        file_path: str,
        ocr_enabled: bool = True,
        ocr_language: str = "eng",
    ) -> str:
        """Parse a document and return the ``text`` field of the runner payload.

        Returns:
            The extracted text, or an empty string when the payload has none.

        Raises:
            LiteParseError: See :meth:`parse`.
        """
        result = self.parse(
            file_path=file_path,
            ocr_enabled=ocr_enabled,
            ocr_language=ocr_language,
        )
        return str(result.get("text") or "")

    def parse(
        self,
        file_path: str,
        ocr_enabled: bool = True,
        ocr_language: str = "eng",
    ) -> Dict[str, Any]:
        """Run the LiteParse runner on one file and return its JSON payload.

        Args:
            file_path: Document to parse.
            ocr_enabled: Forwarded to the runner as ``--ocr-enabled true|false``.
            ocr_language: Forwarded to the runner as ``--ocr-language``.

        Returns:
            The decoded payload; its ``ok`` field is truthy.

        Raises:
            LiteParseError: If the runtime is not ready, the runner cannot be
                started, times out, exits non-zero, or reports a failure.
        """
        # The probe spawns two Node processes, so it is only repeated until it
        # has succeeded once for this instance. A runtime that breaks later
        # still surfaces as LiteParseError from the spawn handling below.
        if not getattr(self, "_runtime_verified", False):
            is_ready, reason = self.check_runtime_ready()
            if not is_ready:
                raise LiteParseError(reason)
            self._runtime_verified = True

        command = [
            self.node_binary,
            self.runner_path,
            "--file",
            file_path,
            "--ocr-enabled",
            "true" if ocr_enabled else "false",
            "--ocr-language",
            ocr_language,
        ]
        ocr_server = (os.getenv("LITEPARSE_OCR_SERVER_URL") or "").strip()
        if ocr_server:
            command.extend(["--ocr-server-url", ocr_server])
        tessdata = (os.getenv("LITEPARSE_TESSDATA_PATH") or "").strip()
        if tessdata:
            command.extend(["--tessdata-path", tessdata])

        try:
            process = subprocess.run(
                command,
                cwd=self._npm_project_root,
                capture_output=True,
                text=True,
                # Decode as UTF-8 rather than the locale encoding so that
                # non-ASCII document text survives on every platform.
                # NOTE(review): assumes the runner writes UTF-8 — confirm.
                encoding="utf-8",
                errors="replace",
                timeout=self.timeout_seconds,
                env=os.environ.copy(),
            )
        except subprocess.TimeoutExpired as exc:
            raise LiteParseError(
                f"LiteParse runner timed out after {self.timeout_seconds} seconds"
            ) from exc
        except (OSError, ValueError) as exc:
            # Callers catch LiteParseError only, so spawn failures are wrapped
            # instead of escaping as a different exception type.
            raise LiteParseError(f"Failed to start LiteParse runner: {exc}") from exc

        if process.returncode != 0:
            # A crashed runner often prints nothing usable on stdout; do not
            # let a decode failure hide the stderr diagnostic.
            try:
                failure_payload = self._decode_runner_output(process.stdout)
            except LiteParseError:
                failure_payload = {}
            message = (
                failure_payload.get("error")
                or (process.stderr or "").strip()
                or "Unknown error"
            )
            raise LiteParseError(message)

        payload = self._decode_runner_output(process.stdout)
        if not payload.get("ok"):
            raise LiteParseError(payload.get("error") or "LiteParse parse failed")
        return payload

    @staticmethod
    def _decode_runner_output(stdout: str) -> Dict[str, Any]:
        """Extract the runner's JSON object from its stdout.

        Raises:
            LiteParseError: If stdout is empty or holds no JSON object.
        """
        raw = (stdout or "").lstrip("\ufeff").strip()
        if not raw:
            raise LiteParseError("LiteParse runner returned empty output")

        # The whole output is tried first so that a pretty-printed payload is
        # never mistaken for one of its own inner lines.
        candidates = [raw]
        # Then the last line that parses as JSON (handles stray log lines
        # printed before the payload).
        candidates.extend(
            line.strip() for line in reversed(raw.splitlines()) if line.strip()
        )
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

        raise LiteParseError("LiteParse runner returned invalid JSON output")

View file

@ -0,0 +1,126 @@
"""
Map presentation UI language strings (LanguageType enum values from Next.js) to
Tesseract / LiteParse OCR language codes (ISO 639-3 where applicable).
Keep keys in sync with:
electron/servers/nextjs/app/(presentation-generator)/upload/type.ts LanguageType
"""
from __future__ import annotations
import re
from typing import Optional
# Values must match `LanguageType` string literals in the upload UI.
PRESENTATION_LANGUAGE_TO_TESSERACT: dict[str, str] = {
"English": "eng",
"Spanish (Español)": "spa",
"French (Français)": "fra",
"German (Deutsch)": "deu",
"Portuguese (Português)": "por",
"Italian (Italiano)": "ita",
"Dutch (Nederlands)": "nld",
"Russian (Русский)": "rus",
"Chinese (Simplified - 中文, 汉语)": "chi_sim",
"Chinese (Traditional - 中文, 漢語)": "chi_tra",
"Japanese (日本語)": "jpn",
"Korean (한국어)": "kor",
"Arabic (العربية)": "ara",
"Hindi (हिन्दी)": "hin",
"Bengali (বাংলা)": "ben",
"Polish (Polski)": "pol",
"Czech (Čeština)": "ces",
"Slovak (Slovenčina)": "slk",
"Hungarian (Magyar)": "hun",
"Romanian (Română)": "ron",
"Bulgarian (Български)": "bul",
"Greek (Ελληνικά)": "ell",
"Serbian (Српски / Srpski)": "srp",
"Croatian (Hrvatski)": "hrv",
"Bosnian (Bosanski)": "bos",
"Slovenian (Slovenščina)": "slv",
"Finnish (Suomi)": "fin",
"Swedish (Svenska)": "swe",
"Danish (Dansk)": "dan",
"Norwegian (Norsk)": "nor",
"Icelandic (Íslenska)": "isl",
"Lithuanian (Lietuvių)": "lit",
"Latvian (Latviešu)": "lav",
"Estonian (Eesti)": "est",
"Maltese (Malti)": "mlt",
"Welsh (Cymraeg)": "cym",
"Irish (Gaeilge)": "gle",
"Scottish Gaelic (Gàidhlig)": "gla",
"Ukrainian (Українська)": "ukr",
"Hebrew (עברית)": "heb",
"Persian/Farsi (فارسی)": "fas",
"Turkish (Türkçe)": "tur",
"Kurdish (Kurdî / کوردی)": "kmr",
"Pashto (پښتو)": "pus",
"Dari (دری)": "prs",
"Uzbek (Oʻzbek)": "uzb",
"Kazakh (Қазақша)": "kaz",
"Tajik (Тоҷикӣ)": "tgk",
"Turkmen (Türkmençe)": "tuk",
"Azerbaijani (Azərbaycan dili)": "aze",
"Urdu (اردو)": "urd",
"Tamil (தமிழ்)": "tam",
"Telugu (తెలుగు)": "tel",
"Marathi (मराठी)": "mar",
"Punjabi (ਪੰਜਾਬੀ / پنجابی)": "pan",
"Gujarati (ગુજરાતી)": "guj",
"Malayalam (മലയാളം)": "mal",
"Kannada (ಕನ್ನಡ)": "kan",
"Odia (ଓଡ଼ିଆ)": "ori",
"Sinhala (සිංහල)": "sin",
"Nepali (नेपाली)": "nep",
"Thai (ไทย)": "tha",
"Vietnamese (Tiếng Việt)": "vie",
"Lao (ລາວ)": "lao",
"Khmer (ភាសាខ្មែរ)": "khm",
"Burmese (မြန်မာစာ)": "mya",
"Tagalog/Filipino (Tagalog/Filipino)": "tgl",
"Javanese (Basa Jawa)": "jav",
"Sundanese (Basa Sunda)": "sun",
"Malay (Bahasa Melayu)": "msa",
"Mongolian (Монгол)": "mon",
"Swahili (Kiswahili)": "swa",
"Hausa (Hausa)": "hau",
"Yoruba (Yorùbá)": "yor",
"Igbo (Igbo)": "ibo",
"Amharic (አማርኛ)": "amh",
"Zulu (isiZulu)": "zul",
"Xhosa (isiXhosa)": "xho",
"Shona (ChiShona)": "sna",
"Somali (Soomaaliga)": "som",
"Basque (Euskara)": "eus",
"Catalan (Català)": "cat",
"Galician (Galego)": "glg",
"Quechua (Runasimi)": "que",
"Nahuatl (Nāhuatl)": "nah",
"Hawaiian (ʻŌlelo Hawaiʻi)": "haw",
"Maori (Te Reo Māori)": "mri",
# No dedicated Tahitian traineddata in default Tesseract bundles.
"Tahitian (Reo Tahiti)": "eng",
"Samoan (Gagana Samoa)": "smo",
}
# Lower-cased copy of the label table, used as a case-insensitive fallback
# when a label does not match a key exactly.
_LOWER_MAP = {k.lower(): v for k, v in PRESENTATION_LANGUAGE_TO_TESSERACT.items()}
# Allow-list for resolved OCR codes: ASCII letters, digits, "_", "," and "+" only.
# NOTE(review): presumably a guard because the code is forwarded to the
# LiteParse runner as a command-line argument — confirm.
_OCR_CODE_RE = re.compile(r"^[a-zA-Z0-9_,+]+$")
def presentation_language_to_ocr_code(language: Optional[str]) -> str:
    """Translate a presentation-language UI label into an OCR language code.

    The label is matched exactly first, then case-insensitively. ``None``,
    blank input, unknown labels and codes that fail the allow-list pattern
    all resolve to ``"eng"``.
    """
    default_code = "eng"

    label = "" if language is None else str(language).strip()
    if not label:
        return default_code

    code = PRESENTATION_LANGUAGE_TO_TESSERACT.get(label)
    if code is None:
        code = _LOWER_MAP.get(label.lower(), default_code)

    return code if _OCR_CODE_RE.fullmatch(code) else default_code

View file

@ -156,7 +156,7 @@ def patch_python_docx_templates():
- Docker/Development: Returns immediately without patching (no-op)
- PyInstaller: Patches the template loading functions
Note: This should be called before using docling service in PyInstaller bundles.
Note: Call before any code path that uses python-docx inside a PyInstaller bundle.
"""
# Only patch if running in PyInstaller bundle
# This check ensures Docker and development environments are unaffected

File diff suppressed because it is too large Load diff

View file

@ -29,7 +29,10 @@ export class PresentationGenerationApi {
}
}
static async decomposeDocuments(documentKeys: string[]) {
static async decomposeDocuments(
documentKeys: string[],
language?: string | null
) {
try {
const response = await fetch(
getApiUrl(`/api/v1/ppt/files/decompose`),
@ -38,6 +41,7 @@ export class PresentationGenerationApi {
headers: getHeader(),
body: JSON.stringify({
file_paths: documentKeys,
language: language ?? null,
}),
cache: "no-cache",
}

View file

@ -132,7 +132,12 @@ const UploadPage = () => {
if (documents.length > 0) {
trackEvent(MixpanelEvent.Upload_Decompose_Documents_API_Call);
promises.push(PresentationGenerationApi.decomposeDocuments(documents));
promises.push(
PresentationGenerationApi.decomposeDocuments(
documents,
config?.language ?? null
)
);
}
const responses = await Promise.all(promises);
dispatch(setPptGenUploadState({