Merge pull request #481 from presenton/feat/liteparseVsDocling
Feat/liteparse vs docling
This commit is contained in:
commit
28ff86c19b
35 changed files with 1982 additions and 1352 deletions
9
electron/.gitignore
vendored
9
electron/.gitignore
vendored
|
|
@ -21,6 +21,13 @@ app_dist
|
|||
resources/fastapi
|
||||
resources/nextjs
|
||||
dist
|
||||
eng.traineddata
|
||||
servers/fastapi/build/
|
||||
servers/fastapi/dist/
|
||||
servers/fastapi/fastembed_cache/
|
||||
electron/.cache/
|
||||
electron/.cache/export-runtime/
|
||||
electron/.cache/export-runtime/
|
||||
*.pkg
|
||||
*.toc
|
||||
*.zip
|
||||
*.pyc
|
||||
|
|
@ -1,13 +1,14 @@
|
|||
/**
 * IPC handlers for the unified setup installer (LibreOffice + Chromium + ImageMagick).
 * - setup:get-status — which dependencies are missing
 * - setup:install-chrome — download Chromium (browser-snapshots) with progress
 * - setup:install-imagemagick — install ImageMagick with the platform package manager
 * - setup:check-imagemagick — re-check whether ImageMagick is now detected
 */
|
||||
|
||||
import { ipcMain, WebContents } from "electron";
|
||||
import { ipcMain, WebContents, shell } from "electron";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import { spawn, spawnSync } from "child_process";
|
||||
import puppeteer from "puppeteer";
|
||||
import {
|
||||
Browser,
|
||||
|
|
@ -17,6 +18,11 @@ import {
|
|||
resolveBuildId,
|
||||
} from "@puppeteer/browsers";
|
||||
import { getSetupStatus } from "../utils/setup-dependencies";
|
||||
import {
|
||||
getImageMagickDownloadUrl,
|
||||
getImageMagickManualInstallCommands,
|
||||
isImageMagickInstalled,
|
||||
} from "../utils/imagemagick-check";
|
||||
|
||||
function getPuppeteerCacheDir(): string {
|
||||
const configCache =
|
||||
|
|
@ -42,9 +48,105 @@ function sendChromeLog(wc: WebContents, level: string, text: string) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Pushes an ImageMagick install progress update to the renderer on the
 * "setup:imagemagick-progress" channel.
 *
 * Nothing is sent once the target WebContents has been destroyed.
 *
 * @param wc - Renderer that receives the update.
 * @param phase - Current phase of the install flow.
 * @param percent - Optional completion percentage.
 * @param message - Optional human-readable status text.
 */
function sendImageMagickProgress(
  wc: WebContents,
  phase: "installing" | "done" | "error",
  percent?: number,
  message?: string
) {
  if (wc.isDestroyed()) {
    return;
  }
  const payload = { phase, percent, message };
  wc.send("setup:imagemagick-progress", payload);
}
|
||||
|
||||
/**
 * Forwards one ImageMagick installer log entry to the renderer on the
 * "setup:imagemagick-log" channel.
 *
 * Nothing is sent once the target WebContents has been destroyed.
 *
 * @param wc - Renderer that receives the log entry.
 * @param level - Log level label (this file uses "info", "error", "cmd" and "ok").
 * @param text - Log text to display.
 */
function sendImageMagickLog(wc: WebContents, level: string, text: string) {
  if (wc.isDestroyed()) {
    return;
  }
  const entry = { level, text };
  wc.send("setup:imagemagick-log", entry);
}
|
||||
|
||||
/**
 * Probes whether a command can be run by invoking it synchronously with
 * version-style arguments.
 *
 * @param command - Executable name or path to probe.
 * @param versionArgs - Arguments passed to the probe; defaults to `["--version"]`.
 * @returns `true` only when the process exits with status 0. A command that
 *          cannot be spawned reports no exit status and therefore yields `false`.
 */
function commandExists(command: string, versionArgs: string[] = ["--version"]): boolean {
  const { status } = spawnSync(command, versionArgs, {
    stdio: "pipe",
    windowsHide: true,
  });
  return status === 0;
}
|
||||
|
||||
/**
 * Locates the Homebrew executable.
 *
 * Prefers `brew` when it can be run by name; otherwise falls back to the
 * first of two fixed install locations that exists on disk.
 *
 * @returns The command or absolute path to use, or `null` when none is found.
 */
function resolveBrewCommand(): string | null {
  if (commandExists("brew")) {
    return "brew";
  }

  const knownLocations = ["/opt/homebrew/bin/brew", "/usr/local/bin/brew"];
  const found = knownLocations.find((location) => fs.existsSync(location));
  return found ?? null;
}
|
||||
|
||||
/**
 * Picks the privilege-escalation tool used to run package-manager commands
 * on Linux.
 *
 * `pkexec` is probed first, then `sudo`; each is probed with its own
 * version flag.
 *
 * @returns "pkexec", "sudo", or `null` when neither probe succeeds.
 */
function resolveLinuxEscalationCommand(): string | null {
  const probes: Array<[string, string[]]> = [
    ["pkexec", ["--version"]],
    ["sudo", ["-V"]],
  ];
  for (const [tool, probeArgs] of probes) {
    if (commandExists(tool, probeArgs)) {
      return tool;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Sends the manual ImageMagick install instructions to the renderer log.
 *
 * Lines ending in ":" are treated as headings and logged at "info" level;
 * every other line is logged at "cmd" level.
 *
 * @param wc - Renderer that receives the log entries.
 */
function logManualImageMagickCommands(wc: WebContents) {
  getImageMagickManualInstallCommands().forEach((entry) => {
    const isHeading = entry.endsWith(":");
    sendImageMagickLog(wc, isHeading ? "info" : "cmd", entry);
  });
}
|
||||
|
||||
/**
 * Runs an install command as a child process and streams its output to the
 * renderer's ImageMagick log.
 *
 * stdin is not connected ("ignore"), so the command cannot read interactive
 * input from this process. stdout chunks are logged at "info" level; stderr
 * chunks are logged at "error" level only when they contain the word "error"
 * (case-insensitive), otherwise at "info".
 *
 * @param wc - Renderer that receives log entries.
 * @param command - Executable to spawn.
 * @param args - Arguments passed to the executable.
 * @returns A promise that resolves when the process closes with exit code 0.
 *          It rejects if the process cannot be spawned, exits with a non-zero
 *          code, or is terminated by a signal.
 */
function runInstallCommand(
  wc: WebContents,
  command: string,
  args: string[]
): Promise<void> {
  sendImageMagickLog(wc, "info", `Running: ${command} ${args.join(" ")}`);

  return new Promise((resolve, reject) => {
    const child = spawn(command, args, {
      stdio: ["ignore", "pipe", "pipe"],
      windowsHide: process.platform === "win32",
    });

    child.stdout.on("data", (data) => {
      const text = String(data).trim();
      if (text) sendImageMagickLog(wc, "info", text);
    });

    child.stderr.on("data", (data) => {
      const text = String(data).trim();
      if (text) {
        sendImageMagickLog(
          wc,
          text.toLowerCase().includes("error") ? "error" : "info",
          text
        );
      }
    });

    child.on("error", reject);
    child.on("close", (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      // A null exit code means the process was killed by a signal; report the
      // signal instead of the misleading "exited with code null".
      const reason =
        code === null
          ? `was terminated by signal ${signal ?? "unknown"}`
          : `exited with code ${code}`;
      reject(new Error(`${command} ${reason}`));
    });
  });
}
|
||||
|
||||
export function setupSetupInstallHandlers() {
|
||||
// Reports which dependencies still need installing. When no setup status is
// currently recorded, every dependency (including ImageMagick) is reported as
// not needed. The stale pre-ImageMagick return that made this one unreachable
// has been removed.
ipcMain.handle("setup:get-status", () => {
  return (
    getSetupStatus() ?? {
      needsLibreOffice: false,
      needsChrome: false,
      needsImageMagick: false,
    }
  );
});
|
||||
|
||||
ipcMain.handle(
|
||||
|
|
@ -121,4 +223,122 @@ export function setupSetupInstallHandlers() {
|
|||
return { ok: true };
|
||||
}
|
||||
);
|
||||
|
||||
// Installs ImageMagick with the platform's package manager:
//   - linux:  apt-get (update + install) run through pkexec or sudo
//   - darwin: Homebrew, installing Homebrew itself first when it is missing
//   - win32:  Chocolatey
// Progress and log lines are streamed to the requesting renderer. On any
// failure the manual install commands are logged, the download page is opened
// in the user's browser, and `{ ok: false, error }` is returned; the handler
// itself never rejects.
// NOTE(review): runInstallCommand spawns with stdin ignored, so `sudo`
// presumably only succeeds when it needs no password prompt — confirm.
ipcMain.handle(
  "setup:install-imagemagick",
  async (event): Promise<{ ok: boolean; error?: string }> => {
    const wc = event.sender;
    try {
      sendImageMagickProgress(
        wc,
        "installing",
        undefined,
        "Installing ImageMagick..."
      );

      if (process.platform === "linux") {
        if (!commandExists("apt-get")) {
          throw new Error(
            "apt-get is unavailable. Install ImageMagick manually using your package manager."
          );
        }

        const escalator = resolveLinuxEscalationCommand();
        if (!escalator) {
          throw new Error(
            "Neither pkexec nor sudo is available to run apt-get install."
          );
        }

        await runInstallCommand(wc, escalator, ["apt-get", "update"]);
        await runInstallCommand(wc, escalator, [
          "apt-get",
          "install",
          "-y",
          "imagemagick",
        ]);
      } else if (process.platform === "darwin") {
        let brewCommand = resolveBrewCommand();
        if (!brewCommand) {
          sendImageMagickLog(
            wc,
            "info",
            "Homebrew not found. Installing Homebrew first..."
          );
          const installHomebrewCommand =
            'NONINTERACTIVE=1 /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"';
          await runInstallCommand(wc, "/bin/bash", ["-c", installHomebrewCommand]);
          // Look again: a fresh install may only be reachable through one of
          // the fixed paths checked by resolveBrewCommand.
          brewCommand = resolveBrewCommand();
        }

        if (!brewCommand) {
          throw new Error(
            "Homebrew installation completed, but brew was not found on PATH."
          );
        }

        await runInstallCommand(wc, brewCommand, ["install", "imagemagick"]);
      } else if (process.platform === "win32") {
        if (!commandExists("choco", ["-v"])) {
          // The catch block below performs the fallback by opening the
          // download link.
          throw new Error(
            "Chocolatey is not installed. Falling back to direct installer download."
          );
        }
        await runInstallCommand(wc, "choco", [
          "install",
          "imagemagick.app",
          "-y",
        ]);
      } else {
        throw new Error(
          "Unsupported platform for automatic install. Use manual install from the official download page."
        );
      }

      sendImageMagickProgress(wc, "done", 100, "ImageMagick install finished");
      return { ok: true };
    } catch (error) {
      const message =
        error instanceof Error ? error.message : "ImageMagick install failed";
      sendImageMagickLog(wc, "error", message);
      logManualImageMagickCommands(wc);

      const downloadUrl = getImageMagickDownloadUrl();
      sendImageMagickLog(
        wc,
        "info",
        `Opening manual install link: ${downloadUrl}`
      );
      try {
        await shell.openExternal(downloadUrl);
      } catch (openError) {
        // Failing to open the browser must not turn the structured failure
        // result into a rejected IPC call or skip the final progress event.
        const detail =
          openError instanceof Error ? openError.message : String(openError);
        sendImageMagickLog(
          wc,
          "error",
          `Could not open ${downloadUrl}: ${detail}`
        );
      }

      sendImageMagickProgress(
        wc,
        "error",
        undefined,
        "Finish manual installation, then click Retry."
      );
      return { ok: false, error: message };
    }
  }
);
|
||||
|
||||
// Re-checks whether ImageMagick is detected (used by the installer's Retry
// action, per the messages below) and reports the outcome to the requesting
// renderer through both the progress and log channels.
ipcMain.handle(
  "setup:check-imagemagick",
  async (event): Promise<{ ok: boolean; error?: string }> => {
    const wc = event.sender;

    if (!isImageMagickInstalled()) {
      const message =
        "ImageMagick is not detected yet. Install it, then click Retry.";
      sendImageMagickProgress(wc, "error", undefined, message);
      sendImageMagickLog(wc, "error", message);
      return { ok: false, error: message };
    }

    sendImageMagickProgress(wc, "done", 100, "ImageMagick detected");
    sendImageMagickLog(wc, "ok", "ImageMagick is installed and ready.");
    return { ok: true };
  }
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ import { setupSetupInstallHandlers } from "./ipc/setup_install_handlers";
|
|||
import { checkDependenciesBeforeWindow } from "./utils/setup-dependencies";
|
||||
import { getSofficePath, isLibreOfficeInstalled } from "./utils/libreoffice-check";
|
||||
import { getPuppeteerExecutablePath, isChromeInstalled } from "./utils/puppeteer-check";
|
||||
import { getLiteParseRunnerPath } from "./utils/liteparse-check";
|
||||
import { isImageMagickInstalled } from "./utils/imagemagick-check";
|
||||
import { startUpdateChecker, stopUpdateChecker } from "./utils/update-checker";
|
||||
|
||||
|
||||
|
|
@ -23,6 +25,7 @@ let isStopping = false;
|
|||
const startupStatus: Record<string, string> = {
|
||||
libreoffice: "checking",
|
||||
puppeteer: "checking",
|
||||
imagemagick: "checking",
|
||||
};
|
||||
|
||||
// Allow renderer to query initial startup status as soon as it loads.
|
||||
|
|
@ -122,6 +125,7 @@ async function startServers(fastApiPort: number, nextjsPort: number) {
|
|||
// Resolved by libreoffice-check.ts at startup; lets Python invoke the
|
||||
// exact binary path instead of relying on the system PATH.
|
||||
SOFFICE_PATH: getSofficePath(),
|
||||
LITEPARSE_RUNNER_PATH: getLiteParseRunnerPath(),
|
||||
},
|
||||
isDev,
|
||||
);
|
||||
|
|
@ -188,7 +192,7 @@ app.whenReady().then(async () => {
|
|||
createWindow();
|
||||
win?.loadFile(path.join(baseDir, "resources/ui/homepage/index.html"));
|
||||
|
||||
// Single installer: checks LibreOffice and Chrome; if either is missing, shows one
|
||||
// Single installer: checks LibreOffice, Chrome, and ImageMagick; if any are missing, shows one
|
||||
// window that installs them one after another. Resolves when the window closes.
|
||||
const setupCompleted = await checkDependenciesBeforeWindow();
|
||||
if (!setupCompleted) {
|
||||
|
|
@ -199,12 +203,14 @@ app.whenReady().then(async () => {
|
|||
}
|
||||
|
||||
// Update startup status after setup (user may have installed one or both)
|
||||
const [loResult, chromeOk] = await Promise.all([
|
||||
const [loResult, chromeOk, imageMagickOk] = await Promise.all([
|
||||
isLibreOfficeInstalled(),
|
||||
isChromeInstalled(),
|
||||
Promise.resolve(isImageMagickInstalled()),
|
||||
]);
|
||||
startupStatus.libreoffice = loResult.installed ? "installed" : "missing";
|
||||
startupStatus.puppeteer = chromeOk ? "installed" : "missing";
|
||||
startupStatus.imagemagick = imageMagickOk ? "installed" : "missing";
|
||||
|
||||
// Show and focus main window
|
||||
win?.show();
|
||||
|
|
@ -218,6 +224,7 @@ app.whenReady().then(async () => {
|
|||
win?.webContents.once("did-finish-load", () => {
|
||||
sendStartupStatus("libreoffice", startupStatus.libreoffice);
|
||||
sendStartupStatus("puppeteer", startupStatus.puppeteer);
|
||||
sendStartupStatus("imagemagick", startupStatus.imagemagick);
|
||||
});
|
||||
|
||||
setUserConfig({
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ contextBridge.exposeInMainWorld("setupInstaller", {
|
|||
|
||||
installLibreOffice: () => ipcRenderer.invoke("lo:start-install"),
|
||||
installChrome: () => ipcRenderer.invoke("setup:install-chrome"),
|
||||
installImageMagick: () => ipcRenderer.invoke("setup:install-imagemagick"),
|
||||
checkImageMagick: () => ipcRenderer.invoke("setup:check-imagemagick"),
|
||||
|
||||
done: () => ipcRenderer.send("setup:done"),
|
||||
|
||||
|
|
@ -25,4 +27,13 @@ contextBridge.exposeInMainWorld("setupInstaller", {
|
|||
onChromeLog: (cb: (data: { level: string; text: string }) => void) => {
|
||||
ipcRenderer.on("setup:chrome-log", (_event, data) => cb(data));
|
||||
},
|
||||
|
||||
onImageMagickProgress: (
|
||||
cb: (data: { phase: string; percent?: number; message?: string }) => void
|
||||
) => {
|
||||
ipcRenderer.on("setup:imagemagick-progress", (_event, data) => cb(data));
|
||||
},
|
||||
onImageMagickLog: (cb: (data: { level: string; text: string }) => void) => {
|
||||
ipcRenderer.on("setup:imagemagick-log", (_event, data) => cb(data));
|
||||
},
|
||||
});
|
||||
|
|
|
|||
2
electron/app/types/index.d.ts
vendored
2
electron/app/types/index.d.ts
vendored
|
|
@ -33,6 +33,8 @@ interface FastApiEnv {
|
|||
MIGRATE_DATABASE_ON_STARTUP?: string,
|
||||
/** Absolute path to the soffice binary resolved at startup by libreoffice-check.ts. */
|
||||
SOFFICE_PATH?: string,
|
||||
/** Absolute path to the bundled LiteParse runner script. */
|
||||
LITEPARSE_RUNNER_PATH?: string,
|
||||
}
|
||||
|
||||
interface NextJsEnv {
|
||||
|
|
|
|||
51
electron/app/utils/imagemagick-check.ts
Normal file
51
electron/app/utils/imagemagick-check.ts
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import { spawnSync } from "child_process";
|
||||
|
||||
/**
 * Runs a command synchronously and reports whether it exited successfully.
 *
 * @param command - Executable name or path.
 * @param args - Arguments passed to the executable.
 * @returns `true` only when the exit status is 0. A command that cannot be
 *          spawned reports no exit status and therefore yields `false`.
 */
function canExecute(command: string, args: string[]): boolean {
  const { status } = spawnSync(command, args, {
    stdio: "pipe",
    windowsHide: true,
  });
  return status === 0;
}
|
||||
|
||||
/**
 * Detects whether ImageMagick can be run from this process.
 *
 * The ImageMagick 7+ `magick` command is probed first. The legacy `convert`
 * command is accepted only when its `-version` output names ImageMagick:
 * `convert` is a generic command name (Windows, for example, ships an
 * unrelated disk utility called convert.exe), so a zero exit status alone
 * does not prove ImageMagick is present.
 *
 * @returns `true` when either probe identifies a working ImageMagick.
 */
export function isImageMagickInstalled(): boolean {
  // ImageMagick 7+ command
  if (canExecute("magick", ["-version"])) return true;

  // Legacy command on Linux/macOS packages
  const legacy = spawnSync("convert", ["-version"], {
    stdio: "pipe",
    windowsHide: true,
    encoding: "utf8",
  });
  // stdout is null at runtime when the command could not be spawned.
  return legacy.status === 0 && /ImageMagick/i.test(legacy.stdout ?? "");
}
|
||||
|
||||
/**
 * Returns the page to open when the user has to install ImageMagick manually.
 *
 * Windows points at the Windows section of the official download page rather
 * than a single versioned installer file: a pinned build URL goes stale once
 * that build is superseded and hard-codes one CPU architecture.
 *
 * @returns An https URL chosen by `process.platform`: the ImageMagick
 *          download page for Windows, the Homebrew site for macOS, and the
 *          ImageMagick download page's Linux section for everything else.
 */
export function getImageMagickDownloadUrl(): string {
  switch (process.platform) {
    case "win32":
      return "https://imagemagick.org/script/download.php#windows";
    case "darwin":
      return "https://brew.sh/";
    default:
      return "https://imagemagick.org/script/download.php#linux";
  }
}
|
||||
|
||||
/**
 * Builds the manual ImageMagick install instructions for the current
 * platform.
 *
 * The result alternates heading lines (ending in ":") with the command or
 * URL that follows each heading.
 *
 * @returns Instruction lines for Windows, macOS, or (by default) an
 *          apt-based Linux system.
 */
export function getImageMagickManualInstallCommands(): string[] {
  switch (process.platform) {
    case "win32":
      return ["Download and run the installer:", getImageMagickDownloadUrl()];
    case "darwin":
      return [
        "Install Homebrew:",
        '/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"',
        "Install ImageMagick:",
        "brew install imagemagick",
      ];
    default:
      return [
        "Install ImageMagick:",
        "sudo apt-get update",
        "sudo apt-get install -y imagemagick",
      ];
  }
}
|
||||
6
electron/app/utils/liteparse-check.ts
Normal file
6
electron/app/utils/liteparse-check.ts
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
import path from "path";
|
||||
import { baseDir } from "./constants";
|
||||
|
||||
/**
 * Builds the path to the LiteParse runner script under the app's base
 * directory.
 *
 * @returns `baseDir` joined with resources/document-extraction/liteparse_runner.mjs.
 */
export function getLiteParseRunnerPath(): string {
  const runnerSegments = ["resources", "document-extraction", "liteparse_runner.mjs"];
  return path.join(baseDir, ...runnerSegments);
}
|
||||
|
|
@ -29,6 +29,7 @@ function shouldSkipDownload(): boolean {
|
|||
export interface SetupStatus {
|
||||
needsLibreOffice: boolean;
|
||||
needsChrome: boolean;
|
||||
needsImageMagick: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,9 +1,10 @@
|
|||
/**
 * setup-dependencies.ts
 *
 * Single installer window that ensures LibreOffice, Chrome (Puppeteer), and
 * ImageMagick are available before the user starts creating presentations.
 * Runs checks, then if any are missing shows one installer that runs the
 * dependency setup steps in sequence (each with Install / Skip).
 */
|
||||
|
||||
|
|
@ -15,6 +16,7 @@ import {
|
|||
isChromeInstalled,
|
||||
type SetupStatus,
|
||||
} from "./puppeteer-check";
|
||||
import { isImageMagickInstalled } from "./imagemagick-check";
|
||||
|
||||
export type { SetupStatus };
|
||||
|
||||
|
|
@ -26,40 +28,44 @@ export function getSetupStatus(): SetupStatus | null {
|
|||
}
|
||||
|
||||
/**
 * Checks LibreOffice, Chrome and ImageMagick. If all are present, returns
 * immediately. If any are missing, opens one installer window that runs each
 * missing setup step in sequence. Returns true only when all required dependencies
 * are installed; false when the installer is closed/skipped before completion.
 */
|
||||
/**
 * Verifies that LibreOffice, Chrome and ImageMagick are available, showing
 * the setup installer window when any of them is missing.
 *
 * While the installer is open, the missing-dependency flags are stored in
 * `currentSetupStatus`. The status is cleared again when this function
 * finishes, including when the installer window or the re-check throws.
 *
 * @returns `true` when every dependency is installed, either up front or
 *          after the installer window has closed; `false` otherwise.
 */
export async function checkDependenciesBeforeWindow(): Promise<boolean> {
  const [loResult, chromeInstalled, imageMagickInstalled] = await Promise.all([
    isLibreOfficeInstalled(),
    isChromeInstalled(),
    // Synchronous check; Promise.all accepts plain values alongside promises.
    isImageMagickInstalled(),
  ]);

  const needsLibreOffice = !loResult.installed;
  const needsChrome = !chromeInstalled;
  const needsImageMagick = !imageMagickInstalled;

  if (!needsLibreOffice && !needsChrome && !needsImageMagick) {
    return true;
  }

  currentSetupStatus = {
    needsLibreOffice,
    needsChrome,
    needsImageMagick,
  };

  try {
    await showSetupInstallerWindow();

    // Re-check after installer closes; setup can only proceed when all
    // required dependencies are actually installed.
    const [postLoResult, postChromeInstalled, postImageMagickInstalled] =
      await Promise.all([
        isLibreOfficeInstalled(),
        isChromeInstalled(),
        isImageMagickInstalled(),
      ]);

    return postLoResult.installed && postChromeInstalled && postImageMagickInstalled;
  } finally {
    // Always drop the stale status, even if the installer or re-check failed.
    currentSetupStatus = null;
  }
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
470
electron/package-lock.json
generated
470
electron/package-lock.json
generated
|
|
@ -9,6 +9,7 @@
|
|||
"version": "0.6.3-beta",
|
||||
"hasInstallScript": true,
|
||||
"dependencies": {
|
||||
"@llamaindex/liteparse": "^1.4.0",
|
||||
"@puppeteer/browsers": "^1.9.1",
|
||||
"@tailwindcss/cli": "^4.1.5",
|
||||
"@types/uuid": "^10.0.0",
|
||||
|
|
@ -54,6 +55,16 @@
|
|||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@borewit/text-codec": {
|
||||
"version": "0.2.2",
|
||||
"resolved": "https://registry.npmjs.org/@borewit/text-codec/-/text-codec-0.2.2.tgz",
|
||||
"integrity": "sha512-DDaRehssg1aNrH4+2hnj1B7vnUGEjU6OIlyRdkMd0aUdIUvKXrJfXsy8LVtXAy7DRvYVluWbMspsRhz2lcW0mQ==",
|
||||
"license": "MIT",
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/Borewit"
|
||||
}
|
||||
},
|
||||
"node_modules/@develar/schema-utils": {
|
||||
"version": "2.6.5",
|
||||
"resolved": "https://registry.npmjs.org/@develar/schema-utils/-/schema-utils-2.6.5.tgz",
|
||||
|
|
@ -507,6 +518,12 @@
|
|||
"tslib": "^2.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@hyzyla/pdfium": {
|
||||
"version": "2.1.12",
|
||||
"resolved": "https://registry.npmjs.org/@hyzyla/pdfium/-/pdfium-2.1.12.tgz",
|
||||
"integrity": "sha512-2ezbrJk9V4foB3+U+eQ7234spsHmrufPU+9EV2cVZCnhTLLfelPz7wWshO0HjUNtcECNBaAfEzrdaQZOigkW+A==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@img/colour": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/@img/colour/-/colour-1.0.0.tgz",
|
||||
|
|
@ -1156,6 +1173,67 @@
|
|||
"@jridgewell/sourcemap-codec": "^1.4.14"
|
||||
}
|
||||
},
|
||||
"node_modules/@llamaindex/liteparse": {
|
||||
"version": "1.4.0",
|
||||
"resolved": "https://registry.npmjs.org/@llamaindex/liteparse/-/liteparse-1.4.0.tgz",
|
||||
"integrity": "sha512-58Tr4vAutcaf0Cxe7GK4cknpzcpN3tTzUhIAwWioWuSDqVPS3jpNhVVfqE5tV5PE4za07l07QFhGscCoVm/hRw==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@hyzyla/pdfium": "^2.1.9",
|
||||
"axios": "^1.7.0",
|
||||
"commander": "^12.0.0",
|
||||
"file-type": "^21.3.3",
|
||||
"form-data": "^4.0.0",
|
||||
"p-limit": "^7.3.0",
|
||||
"sharp": "^0.34.5",
|
||||
"tesseract.js": "^7.0.0",
|
||||
"unified": "^11.0.0",
|
||||
"zod": "^3.23.0"
|
||||
},
|
||||
"bin": {
|
||||
"lit": "dist/src/index.js",
|
||||
"liteparse": "dist/src/index.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@llamaindex/liteparse/node_modules/commander": {
|
||||
"version": "12.1.0",
|
||||
"resolved": "https://registry.npmjs.org/commander/-/commander-12.1.0.tgz",
|
||||
"integrity": "sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@llamaindex/liteparse/node_modules/p-limit": {
|
||||
"version": "7.3.0",
|
||||
"resolved": "https://registry.npmjs.org/p-limit/-/p-limit-7.3.0.tgz",
|
||||
"integrity": "sha512-7cIXg/Z0M5WZRblrsOla88S4wAK+zOQQWeBYfV3qJuJXMr+LnbYjaadrFaS0JILfEDPVqHyKnZ1Z/1d6J9VVUw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"yocto-queue": "^1.2.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/@llamaindex/liteparse/node_modules/yocto-queue": {
|
||||
"version": "1.2.2",
|
||||
"resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-1.2.2.tgz",
|
||||
"integrity": "sha512-4LCcse/U2MHZ63HAJVE+v71o7yOdIe4cZ70Wpf8D/IyjDKYQLV5GD46B+hSTjJsvV5PztjvHoU580EftxjDZFQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=12.20"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/@malept/cross-spawn-promise": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/@malept/cross-spawn-promise/-/cross-spawn-promise-2.0.0.tgz",
|
||||
|
|
@ -1975,6 +2053,29 @@
|
|||
"node": ">= 10"
|
||||
}
|
||||
},
|
||||
"node_modules/@tokenizer/inflate": {
|
||||
"version": "0.4.1",
|
||||
"resolved": "https://registry.npmjs.org/@tokenizer/inflate/-/inflate-0.4.1.tgz",
|
||||
"integrity": "sha512-2mAv+8pkG6GIZiF1kNg1jAjh27IDxEPKwdGul3snfztFerfPGI1LjDezZp3i7BElXompqEtPmoPx6c2wgtWsOA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"debug": "^4.4.3",
|
||||
"token-types": "^6.1.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/Borewit"
|
||||
}
|
||||
},
|
||||
"node_modules/@tokenizer/token": {
|
||||
"version": "0.3.0",
|
||||
"resolved": "https://registry.npmjs.org/@tokenizer/token/-/token-0.3.0.tgz",
|
||||
"integrity": "sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@tootallnate/quickjs-emscripten": {
|
||||
"version": "0.23.0",
|
||||
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
|
||||
|
|
@ -2070,6 +2171,12 @@
|
|||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/unist": {
|
||||
"version": "3.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz",
|
||||
"integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/uuid": {
|
||||
"version": "10.0.0",
|
||||
"resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz",
|
||||
|
|
@ -2366,7 +2473,6 @@
|
|||
"version": "0.4.0",
|
||||
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
|
||||
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/at-least-node": {
|
||||
|
|
@ -2379,6 +2485,26 @@
|
|||
"node": ">= 4.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/axios": {
|
||||
"version": "1.14.0",
|
||||
"resolved": "https://registry.npmjs.org/axios/-/axios-1.14.0.tgz",
|
||||
"integrity": "sha512-3Y8yrqLSwjuzpXuZ0oIYZ/XGgLwUIBU3uLvbcpb0pidD9ctpShJd43KSlEEkVQg6DS0G9NKyzOvBfUtDKEyHvQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"follow-redirects": "^1.15.11",
|
||||
"form-data": "^4.0.5",
|
||||
"proxy-from-env": "^2.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/axios/node_modules/proxy-from-env": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-2.1.0.tgz",
|
||||
"integrity": "sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/b4a": {
|
||||
"version": "1.8.0",
|
||||
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.8.0.tgz",
|
||||
|
|
@ -2393,6 +2519,16 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bail": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz",
|
||||
"integrity": "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==",
|
||||
"license": "MIT",
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/wooorm"
|
||||
}
|
||||
},
|
||||
"node_modules/balanced-match": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
|
||||
|
|
@ -2527,6 +2663,12 @@
|
|||
"readable-stream": "^3.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/bmp-js": {
|
||||
"version": "0.1.0",
|
||||
"resolved": "https://registry.npmjs.org/bmp-js/-/bmp-js-0.1.0.tgz",
|
||||
"integrity": "sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/boolean": {
|
||||
"version": "3.2.0",
|
||||
"resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz",
|
||||
|
|
@ -2820,7 +2962,6 @@
|
|||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
|
||||
"integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"es-errors": "^1.3.0",
|
||||
|
|
@ -3005,7 +3146,6 @@
|
|||
"version": "1.0.8",
|
||||
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
|
||||
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"delayed-stream": "~1.0.0"
|
||||
|
|
@ -3294,12 +3434,20 @@
|
|||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
|
||||
"integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/dequal": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz",
|
||||
"integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/detect-libc": {
|
||||
"version": "2.1.2",
|
||||
"resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz",
|
||||
|
|
@ -3317,6 +3465,19 @@
|
|||
"license": "MIT",
|
||||
"optional": true
|
||||
},
|
||||
"node_modules/devlop": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/devlop/-/devlop-1.1.0.tgz",
|
||||
"integrity": "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"dequal": "^2.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/wooorm"
|
||||
}
|
||||
},
|
||||
"node_modules/devtools-protocol": {
|
||||
"version": "0.0.1581282",
|
||||
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1581282.tgz",
|
||||
|
|
@ -3461,7 +3622,6 @@
|
|||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
|
||||
"integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"call-bind-apply-helpers": "^1.0.1",
|
||||
|
|
@ -3718,7 +3878,6 @@
|
|||
"version": "0.1.13",
|
||||
"resolved": "https://registry.npmjs.org/encoding/-/encoding-0.1.13.tgz",
|
||||
"integrity": "sha512-ETBauow1T35Y/WZMkio9jiM0Z5xjHHmJ4XmjZOq1l/dXz3lr2sRn87nJy20RupqSh1F2m3HHPSp8ShIPQJrJ3A==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
|
|
@ -3776,7 +3935,6 @@
|
|||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
|
||||
"integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
|
|
@ -3786,7 +3944,6 @@
|
|||
"version": "1.3.0",
|
||||
"resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
|
||||
"integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
|
|
@ -3796,7 +3953,6 @@
|
|||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
|
||||
"integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"es-errors": "^1.3.0"
|
||||
|
|
@ -3809,7 +3965,6 @@
|
|||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz",
|
||||
"integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"es-errors": "^1.3.0",
|
||||
|
|
@ -3920,6 +4075,12 @@
|
|||
"dev": true,
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/extend": {
|
||||
"version": "3.0.2",
|
||||
"resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
|
||||
"integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/extract-zip": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz",
|
||||
|
|
@ -3998,6 +4159,24 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"node_modules/file-type": {
|
||||
"version": "21.3.4",
|
||||
"resolved": "https://registry.npmjs.org/file-type/-/file-type-21.3.4.tgz",
|
||||
"integrity": "sha512-Ievi/yy8DS3ygGvT47PjSfdFoX+2isQueoYP1cntFW1JLYAuS4GD7NUPGg4zv2iZfV52uDyk5w5Z0TdpRS6Q1g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@tokenizer/inflate": "^0.4.1",
|
||||
"strtok3": "^10.3.4",
|
||||
"token-types": "^6.1.1",
|
||||
"uint8array-extras": "^1.4.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sindresorhus/file-type?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/filelist": {
|
||||
"version": "1.0.4",
|
||||
"resolved": "https://registry.npmjs.org/filelist/-/filelist-1.0.4.tgz",
|
||||
|
|
@ -4031,6 +4210,26 @@
|
|||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/follow-redirects": {
|
||||
"version": "1.15.11",
|
||||
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz",
|
||||
"integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "individual",
|
||||
"url": "https://github.com/sponsors/RubenVerborgh"
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=4.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"debug": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/foreground-child": {
|
||||
"version": "3.3.1",
|
||||
"resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz",
|
||||
|
|
@ -4065,7 +4264,6 @@
|
|||
"version": "4.0.5",
|
||||
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz",
|
||||
"integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"asynckit": "^0.4.0",
|
||||
|
|
@ -4117,7 +4315,6 @@
|
|||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
|
||||
"integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
|
|
@ -4136,7 +4333,6 @@
|
|||
"version": "1.3.0",
|
||||
"resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
|
||||
"integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"call-bind-apply-helpers": "^1.0.2",
|
||||
|
|
@ -4161,7 +4357,6 @@
|
|||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
|
||||
"integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"dunder-proto": "^1.0.1",
|
||||
|
|
@ -4290,7 +4485,6 @@
|
|||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
|
||||
"integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
|
|
@ -4359,7 +4553,6 @@
|
|||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
|
||||
"integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
|
|
@ -4372,7 +4565,6 @@
|
|||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz",
|
||||
"integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"has-symbols": "^1.0.3"
|
||||
|
|
@ -4388,7 +4580,6 @@
|
|||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
|
||||
"integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"function-bind": "^1.1.2"
|
||||
|
|
@ -4487,7 +4678,7 @@
|
|||
"version": "0.6.3",
|
||||
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
||||
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
||||
"dev": true,
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
||||
|
|
@ -4496,6 +4687,12 @@
|
|||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/idb-keyval": {
|
||||
"version": "6.2.2",
|
||||
"resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-6.2.2.tgz",
|
||||
"integrity": "sha512-yjD9nARJ/jb1g+CvD0tlhUHOrJ9Sy0P8T9MF3YaLlHnSRpwPfpTX0XIvpmw3gAJUmEu3FiICLBDPXVwyEvrleg==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/ieee754": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
|
||||
|
|
@ -4616,6 +4813,18 @@
|
|||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/is-plain-obj": {
|
||||
"version": "4.1.0",
|
||||
"resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz",
|
||||
"integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/is-unicode-supported": {
|
||||
"version": "0.1.0",
|
||||
"resolved": "https://registry.npmjs.org/is-unicode-supported/-/is-unicode-supported-0.1.0.tgz",
|
||||
|
|
@ -4629,6 +4838,12 @@
|
|||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/is-url": {
|
||||
"version": "1.2.4",
|
||||
"resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",
|
||||
"integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/isbinaryfile": {
|
||||
"version": "5.0.7",
|
||||
"resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-5.0.7.tgz",
|
||||
|
|
@ -5133,7 +5348,6 @@
|
|||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
|
||||
"integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4"
|
||||
|
|
@ -5156,7 +5370,6 @@
|
|||
"version": "1.52.0",
|
||||
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
|
||||
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
|
|
@ -5166,7 +5379,6 @@
|
|||
"version": "2.1.35",
|
||||
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
|
||||
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"mime-db": "1.52.0"
|
||||
|
|
@ -5467,6 +5679,26 @@
|
|||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/node-fetch": {
|
||||
"version": "2.7.0",
|
||||
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
|
||||
"integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"whatwg-url": "^5.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": "4.x || >=6.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"encoding": "^0.1.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"encoding": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/node-gyp": {
|
||||
"version": "11.5.0",
|
||||
"resolved": "https://registry.npmjs.org/node-gyp/-/node-gyp-11.5.0.tgz",
|
||||
|
|
@ -5607,6 +5839,15 @@
|
|||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/opencollective-postinstall": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/opencollective-postinstall/-/opencollective-postinstall-2.0.3.tgz",
|
||||
"integrity": "sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==",
|
||||
"license": "MIT",
|
||||
"bin": {
|
||||
"opencollective-postinstall": "index.js"
|
||||
}
|
||||
},
|
||||
"node_modules/ora": {
|
||||
"version": "5.4.1",
|
||||
"resolved": "https://registry.npmjs.org/ora/-/ora-5.4.1.tgz",
|
||||
|
|
@ -6205,6 +6446,12 @@
|
|||
"node": ">= 6"
|
||||
}
|
||||
},
|
||||
"node_modules/regenerator-runtime": {
|
||||
"version": "0.13.11",
|
||||
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz",
|
||||
"integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/require-directory": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
|
||||
|
|
@ -6344,7 +6591,7 @@
|
|||
"version": "2.1.2",
|
||||
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
|
||||
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
|
||||
"dev": true,
|
||||
"devOptional": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/sanitize-filename": {
|
||||
|
|
@ -6754,6 +7001,22 @@
|
|||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/strtok3": {
|
||||
"version": "10.3.5",
|
||||
"resolved": "https://registry.npmjs.org/strtok3/-/strtok3-10.3.5.tgz",
|
||||
"integrity": "sha512-ki4hZQfh5rX0QDLLkOCj+h+CVNkqmp/CMf8v8kZpkNVK6jGQooMytqzLZYUVYIZcFZ6yDB70EfD8POcFXiF5oA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@tokenizer/token": "^0.3.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/Borewit"
|
||||
}
|
||||
},
|
||||
"node_modules/sumchecker": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/sumchecker/-/sumchecker-3.0.1.tgz",
|
||||
|
|
@ -6991,6 +7254,30 @@
|
|||
"mkdirp": "bin/cmd.js"
|
||||
}
|
||||
},
|
||||
"node_modules/tesseract.js": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/tesseract.js/-/tesseract.js-7.0.0.tgz",
|
||||
"integrity": "sha512-exPBkd+z+wM1BuMkx/Bjv43OeLBxhL5kKWsz/9JY+DXcXdiBjiAch0V49QR3oAJqCaL5qURE0vx9Eo+G5YE7mA==",
|
||||
"hasInstallScript": true,
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"bmp-js": "^0.1.0",
|
||||
"idb-keyval": "^6.2.0",
|
||||
"is-url": "^1.2.4",
|
||||
"node-fetch": "^2.6.9",
|
||||
"opencollective-postinstall": "^2.0.3",
|
||||
"regenerator-runtime": "^0.13.3",
|
||||
"tesseract.js-core": "^7.0.0",
|
||||
"wasm-feature-detect": "^1.8.0",
|
||||
"zlibjs": "^0.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/tesseract.js-core": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-7.0.0.tgz",
|
||||
"integrity": "sha512-WnNH518NzmbSq9zgTPeoF8c+xmilS8rFIl1YKbk/ptuuc7p6cLNELNuPAzcmsYw450ca6bLa8j3t0VAtq435Vw==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/text-decoder": {
|
||||
"version": "1.2.7",
|
||||
"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.7.tgz",
|
||||
|
|
@ -7063,6 +7350,30 @@
|
|||
"tmp": "^0.2.0"
|
||||
}
|
||||
},
|
||||
"node_modules/token-types": {
|
||||
"version": "6.1.2",
|
||||
"resolved": "https://registry.npmjs.org/token-types/-/token-types-6.1.2.tgz",
|
||||
"integrity": "sha512-dRXchy+C0IgK8WPC6xvCHFRIWYUbqqdEIKPaKo/AcTUNzwLTK6AH7RjdLWsEZcAN/TBdtfUw3PYEgPr5VPr6ww==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@borewit/text-codec": "^0.2.1",
|
||||
"@tokenizer/token": "^0.3.0",
|
||||
"ieee754": "^1.2.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14.16"
|
||||
},
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/Borewit"
|
||||
}
|
||||
},
|
||||
"node_modules/tr46": {
|
||||
"version": "0.0.3",
|
||||
"resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
|
||||
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tree-kill": {
|
||||
"version": "1.2.2",
|
||||
"resolved": "https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz",
|
||||
|
|
@ -7072,6 +7383,16 @@
|
|||
"tree-kill": "cli.js"
|
||||
}
|
||||
},
|
||||
"node_modules/trough": {
|
||||
"version": "2.2.0",
|
||||
"resolved": "https://registry.npmjs.org/trough/-/trough-2.2.0.tgz",
|
||||
"integrity": "sha512-tmMpK00BjZiUyVyvrBK7knerNgmgvcV/KLVyuma/SC+TQN167GrMRciANTz09+k3zW8L8t60jWO1GpfkZdjTaw==",
|
||||
"license": "MIT",
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/wooorm"
|
||||
}
|
||||
},
|
||||
"node_modules/truncate-utf8-bytes": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/truncate-utf8-bytes/-/truncate-utf8-bytes-1.0.2.tgz",
|
||||
|
|
@ -7122,6 +7443,18 @@
|
|||
"node": ">=14.17"
|
||||
}
|
||||
},
|
||||
"node_modules/uint8array-extras": {
|
||||
"version": "1.5.0",
|
||||
"resolved": "https://registry.npmjs.org/uint8array-extras/-/uint8array-extras-1.5.0.tgz",
|
||||
"integrity": "sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/unbzip2-stream": {
|
||||
"version": "1.4.3",
|
||||
"resolved": "https://registry.npmjs.org/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz",
|
||||
|
|
@ -7139,6 +7472,25 @@
|
|||
"devOptional": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/unified": {
|
||||
"version": "11.0.5",
|
||||
"resolved": "https://registry.npmjs.org/unified/-/unified-11.0.5.tgz",
|
||||
"integrity": "sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/unist": "^3.0.0",
|
||||
"bail": "^2.0.0",
|
||||
"devlop": "^1.0.0",
|
||||
"extend": "^3.0.0",
|
||||
"is-plain-obj": "^4.0.0",
|
||||
"trough": "^2.0.0",
|
||||
"vfile": "^6.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/unified"
|
||||
}
|
||||
},
|
||||
"node_modules/unique-filename": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/unique-filename/-/unique-filename-4.0.0.tgz",
|
||||
|
|
@ -7165,6 +7517,19 @@
|
|||
"node": "^18.17.0 || >=20.5.0"
|
||||
}
|
||||
},
|
||||
"node_modules/unist-util-stringify-position": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz",
|
||||
"integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/unist": "^3.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/unified"
|
||||
}
|
||||
},
|
||||
"node_modules/universalify": {
|
||||
"version": "0.1.2",
|
||||
"resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz",
|
||||
|
|
@ -7228,6 +7593,40 @@
|
|||
"node": ">=0.6.0"
|
||||
}
|
||||
},
|
||||
"node_modules/vfile": {
|
||||
"version": "6.0.3",
|
||||
"resolved": "https://registry.npmjs.org/vfile/-/vfile-6.0.3.tgz",
|
||||
"integrity": "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/unist": "^3.0.0",
|
||||
"vfile-message": "^4.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/unified"
|
||||
}
|
||||
},
|
||||
"node_modules/vfile-message": {
|
||||
"version": "4.0.3",
|
||||
"resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz",
|
||||
"integrity": "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/unist": "^3.0.0",
|
||||
"unist-util-stringify-position": "^4.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/unified"
|
||||
}
|
||||
},
|
||||
"node_modules/wasm-feature-detect": {
|
||||
"version": "1.8.0",
|
||||
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz",
|
||||
"integrity": "sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/wcwidth": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/wcwidth/-/wcwidth-1.0.1.tgz",
|
||||
|
|
@ -7244,6 +7643,22 @@
|
|||
"integrity": "sha512-ARrjNjtWRRs2w4Tk7nqrf2gBI0QXWuOmMCx2hU+1jUt6d00MjMxURrhxhGbrsoiZKJrhTSTzbIrc554iKI10qw==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/webidl-conversions": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
|
||||
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
|
||||
"license": "BSD-2-Clause"
|
||||
},
|
||||
"node_modules/whatwg-url": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
|
||||
"integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"tr46": "~0.0.3",
|
||||
"webidl-conversions": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/which": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/which/-/which-5.0.0.tgz",
|
||||
|
|
@ -7399,6 +7814,15 @@
|
|||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/zlibjs": {
|
||||
"version": "0.3.1",
|
||||
"resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz",
|
||||
"integrity": "sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/zod": {
|
||||
"version": "3.25.76",
|
||||
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
|
||||
|
|
|
|||
|
|
@ -26,10 +26,10 @@
|
|||
"automation"
|
||||
],
|
||||
"scripts": {
|
||||
"start": "electron .",
|
||||
"start": "electron . --no-sandbox",
|
||||
"dist": "electron-builder",
|
||||
"postinstall": "electron-builder install-app-deps",
|
||||
"dev": "rm -rf app_dist && tsc && electron .",
|
||||
"dev": "rm -rf app_dist && tsc && electron . --no-sandbox",
|
||||
"setup:env": "npm install && cd servers/fastapi && uv sync && cd ../../servers/nextjs && npm install && cd ../.. && npm run setup:export-runtime",
|
||||
"install:pyinstaller": "cd servers/fastapi && echo 'pyinstaller already in dependencies'",
|
||||
"build:ts": "rm -rf app_dist && tsc",
|
||||
|
|
@ -51,6 +51,7 @@
|
|||
"email": "suraj@presenton.ai"
|
||||
},
|
||||
"dependencies": {
|
||||
"@llamaindex/liteparse": "^1.4.0",
|
||||
"@puppeteer/browsers": "^1.9.1",
|
||||
"@tailwindcss/cli": "^4.1.5",
|
||||
"@types/uuid": "^10.0.0",
|
||||
|
|
|
|||
147
electron/resources/document-extraction/liteparse_runner.mjs
Normal file
147
electron/resources/document-extraction/liteparse_runner.mjs
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
#!/usr/bin/env node
|
||||
/**
|
||||
* CLI bridge for Python: one JSON line on stdout for LiteParse extraction.
|
||||
*
|
||||
* OCR follows LlamaIndex LiteParse guidance (built-in Tesseract by default):
|
||||
* https://developers.llamaindex.ai/liteparse/guides/ocr/
|
||||
*
|
||||
* - ISO 639-3 for Tesseract (eng, fra, deu, jpn, …); multi-lang as "deu+eng" or "deu,eng".
|
||||
* - Parallel workers default to CPU cores − 2, with a minimum of 1 (override --num-workers).
|
||||
* - Optional HTTP OCR: --ocr-server-url or LITEPARSE_OCR_SERVER_URL.
|
||||
* - Optional local models: --tessdata-path or LITEPARSE_TESSDATA_PATH (else TESSDATA_PREFIX / CDN).
|
||||
*/
|
||||
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { LiteParse } from "@llamaindex/liteparse";
|
||||
|
||||
/**
 * Look up the value of a command-line option in `process.argv`.
 *
 * Two spellings are accepted:
 *   - `--name value`  (the value is the argument that follows the flag)
 *   - `--name=value`  (the value is everything after the first `=`)
 *
 * The first matching occurrence wins.
 *
 * @param {string} name Option name including its leading dashes, e.g. "--file".
 * @returns {string | null} The option value, or `null` when the option is
 *   absent or is the last argument with nothing after it.
 */
function readArg(name) {
  const args = process.argv;
  const inlinePrefix = `${name}=`;
  for (let i = 0; i < args.length; i += 1) {
    const arg = args[i];
    if (arg === name) {
      // Flag given as a separate token: its value is the next argument, if any.
      return args[i + 1] ?? null;
    }
    if (typeof arg === "string" && arg.startsWith(inlinePrefix)) {
      return arg.slice(inlinePrefix.length);
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Interpret a textual flag value as a boolean.
 *
 * Matching is case-insensitive and ignores surrounding whitespace.
 * "1", "true", "yes", "on" map to `true`; "0", "false", "no", "off" map to
 * `false`. Anything else (including `null`, `undefined` and "") yields
 * `fallback`.
 *
 * @param {unknown} value Raw value, typically a CLI argument string or null.
 * @param {boolean} fallback Result to use when `value` is missing or unrecognized.
 * @returns {boolean}
 */
function parseBool(value, fallback) {
  if (value == null || value === "") return fallback;

  const TRUTHY = ["1", "true", "yes", "on"];
  const FALSY = ["0", "false", "no", "off"];
  const normalized = String(value).trim().toLowerCase();

  if (TRUTHY.includes(normalized)) return true;
  if (FALSY.includes(normalized)) return false;
  return fallback;
}
|
||||
|
||||
/**
 * Parse a numeric option and clamp it into the inclusive range [min, max].
 *
 * `fallback` is returned unchanged (not clamped) when `value` is missing,
 * empty, whitespace-only, or not parseable as a number.
 *
 * @param {unknown} value Raw value, typically a CLI argument string or null.
 * @param {number} fallback Result to use when `value` is unusable.
 * @param {number} min Lower bound applied to a successfully parsed value.
 * @param {number} max Upper bound applied to a successfully parsed value.
 * @returns {number}
 */
function toNumber(value, fallback, min, max) {
  if (value == null) return fallback;

  // Number("   ") is 0, so a blank string must be rejected before conversion;
  // otherwise it would silently clamp to `min` instead of using the fallback.
  const raw = typeof value === "string" ? value.trim() : value;
  if (raw === "") return fallback;

  const parsed = Number(raw);
  if (Number.isNaN(parsed)) return fallback;

  return Math.min(Math.max(parsed, min), max);
}
|
||||
|
||||
/**
 * Normalize an OCR language specification into Tesseract's "a+b+c" form.
 *
 * Tesseract accepts "deu+eng"; comma-separated input ("deu,eng") is allowed
 * from the CLI/env for convenience. Codes are trimmed, empty segments are
 * dropped, and the result falls back to "eng" when nothing usable remains
 * (missing value, blank string, or separators only).
 *
 * @param {unknown} raw Language spec such as "eng", "deu+eng" or "deu, eng".
 * @returns {string} One or more language codes joined with "+".
 */
function normalizeOcrLanguage(raw) {
  const codes = String(raw ?? "")
    .split(/[,+]/)
    .map((code) => code.trim())
    .filter(Boolean);
  return codes.length > 0 ? codes.join("+") : "eng";
}
|
||||
|
||||
/**
 * Write the final result as a single JSON line on stdout and terminate.
 *
 * This function does not return: it calls `process.exit`, so callers may
 * rely on it to stop execution.
 *
 * @param {object} result JSON-serializable payload for the consumer.
 * @param {number} [exitCode=0] Process exit status.
 */
function emit(result, exitCode = 0) {
  const line = `${JSON.stringify(result)}\n`;
  // NOTE(review): Node documents that process.exit() can cut off pending
  // asynchronous stdout writes — confirm large payloads arrive intact when
  // stdout is a pipe.
  process.stdout.write(line);
  process.exit(exitCode);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Script body: validate input, resolve options, run LiteParse, emit one JSON
// line. emit() exits the process, so each failed guard below is terminal.
// ---------------------------------------------------------------------------

const filePath = readArg("--file");
if (!filePath) {
  emit({ ok: false, error: "Missing required --file argument" }, 2);
}

const resolvedPath = path.resolve(filePath);
if (!fs.existsSync(resolvedPath)) {
  emit({ ok: false, error: `File not found: ${resolvedPath}` }, 2);
}

// OCR on by default; DPI defaults to 150 and is clamped to 72–600.
const ocrEnabled = parseBool(readArg("--ocr-enabled"), true);
const dpi = toNumber(readArg("--dpi"), 150, 72, 600);

// Default worker count is CPU cores − 2 (at least 1); explicit values are
// clamped to 1–64.
const numWorkers = toNumber(
  readArg("--num-workers"),
  Math.max(os.cpus().length - 2, 1),
  1,
  64
);

// OCR language: the LITEPARSE_OCR_LANGUAGE environment variable is consulted
// first, then --ocr-language, then the "eng" default. Note this is the
// opposite precedence from the OCR server URL and tessdata path below, where
// the CLI flag wins over the environment.
const cliOcrLanguage = readArg("--ocr-language");
const ocrLanguageRaw =
  (process.env.LITEPARSE_OCR_LANGUAGE && String(process.env.LITEPARSE_OCR_LANGUAGE).trim()) ||
  (cliOcrLanguage && String(cliOcrLanguage).trim()) ||
  "";
const ocrLanguage = normalizeOcrLanguage(ocrLanguageRaw || "eng");

// Any --output-format other than "json" (case-insensitive) is treated as "text".
const outputFormatRaw = (readArg("--output-format") || "text").trim().toLowerCase();
const outputFormat = outputFormatRaw === "json" ? "json" : "text";

// Optional HTTP OCR endpoint: CLI flag first, then LITEPARSE_OCR_SERVER_URL.
const ocrServerUrlArg = readArg("--ocr-server-url");
const ocrServerUrl =
  (ocrServerUrlArg && String(ocrServerUrlArg).trim()) ||
  (process.env.LITEPARSE_OCR_SERVER_URL && String(process.env.LITEPARSE_OCR_SERVER_URL).trim()) ||
  undefined;

// Optional local Tesseract model directory: CLI flag, then
// LITEPARSE_TESSDATA_PATH, then TESSDATA_PREFIX.
const tessdataArg = readArg("--tessdata-path");
const tessdataPath =
  (tessdataArg && String(tessdataArg).trim()) ||
  (process.env.LITEPARSE_TESSDATA_PATH && String(process.env.LITEPARSE_TESSDATA_PATH).trim()) ||
  (process.env.TESSDATA_PREFIX && String(process.env.TESSDATA_PREFIX).trim()) ||
  undefined;

try {
  const config = {
    ocrEnabled,
    ocrLanguage,
    outputFormat,
    dpi,
    numWorkers,
  };
  // Only set the optional keys when a value was actually supplied.
  if (ocrServerUrl) {
    config.ocrServerUrl = ocrServerUrl;
  }
  if (tessdataPath) {
    config.tessdataPath = tessdataPath;
  }

  const parser = new LiteParse(config);

  // NOTE(review): the meaning of the second argument (`true`) is defined by
  // LiteParse's parse() API — confirm against its documentation.
  const result = await parser.parse(resolvedPath, true);
  const text = result?.text ?? "";

  emit({
    ok: true,
    filePath: resolvedPath,
    text,
    pageCount: Array.isArray(result?.pages) ? result.pages.length : 0,
    // Echo the effective OCR settings so the caller can see what was used.
    ocr: {
      engine: ocrServerUrl ? "http" : "tesseract",
      ocrLanguage,
      ocrEnabled,
      dpi,
      numWorkers,
    },
  });
} catch (error) {
  const message = error instanceof Error ? error.message : String(error);
  const stack = error instanceof Error ? error.stack : undefined;
  // The stack goes to stderr so stdout stays a single JSON line.
  if (stack) {
    process.stderr.write(`${stack}\n`);
  }
  emit(
    {
      ok: false,
      filePath: resolvedPath,
      error: message,
    },
    1
  );
}
|
||||
|
|
@ -12,6 +12,7 @@ window.addEventListener("DOMContentLoaded", () => {
|
|||
const labelMap = {
|
||||
libreoffice: "LibreOffice",
|
||||
puppeteer: "Chromium",
|
||||
imagemagick: "ImageMagick",
|
||||
};
|
||||
|
||||
const dependenciesEl = document.getElementById("status-dependencies");
|
||||
|
|
@ -24,6 +25,7 @@ window.addEventListener("DOMContentLoaded", () => {
|
|||
const currentStatus = {
|
||||
libreoffice: "checking",
|
||||
puppeteer: "checking",
|
||||
imagemagick: "checking",
|
||||
};
|
||||
|
||||
function setStatus(name, status) {
|
||||
|
|
@ -83,6 +85,7 @@ window.addEventListener("DOMContentLoaded", () => {
|
|||
if (!statusMap) return;
|
||||
if (statusMap.libreoffice) setStatus("libreoffice", statusMap.libreoffice);
|
||||
if (statusMap.puppeteer) setStatus("puppeteer", statusMap.puppeteer);
|
||||
if (statusMap.imagemagick) setStatus("imagemagick", statusMap.imagemagick);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
|
@ -141,7 +141,7 @@
|
|||
<div id="state-prompt" class="state active">
|
||||
<div class="icon-wrap purple">📦</div>
|
||||
<p class="heading" id="prompt-heading">Dependencies required</p>
|
||||
<p class="sub" id="prompt-sub">Presenton needs LibreOffice and Chrome to create and export presentations. Install them now so everything works.</p>
|
||||
<p class="sub" id="prompt-sub">Presenton needs LibreOffice, Chrome, and ImageMagick to create and export presentations reliably. Install them now so everything works.</p>
|
||||
<div class="btn-row">
|
||||
<button class="btn-primary" id="btn-install">Install</button>
|
||||
<button class="btn-ghost" id="btn-skip">Skip for now</button>
|
||||
|
|
@ -212,8 +212,9 @@
|
|||
<script>
|
||||
const STATES = ['prompt','downloading','installing','success','error'];
|
||||
let logLines = 0;
|
||||
let currentStep = null; // 'libreoffice' | 'chrome'
|
||||
let status = { needsLibreOffice: false, needsChrome: false };
|
||||
let currentStep = null; // 'libreoffice' | 'chrome' | 'imagemagick'
|
||||
let status = { needsLibreOffice: false, needsChrome: false, needsImageMagick: false };
|
||||
let steps = [];
|
||||
let logOpen = false;
|
||||
|
||||
function showState(name) {
|
||||
|
|
@ -264,15 +265,30 @@
|
|||
document.getElementById('log-toggle-label').textContent = logOpen ? 'Hide details' : 'Show details';
|
||||
}
|
||||
|
||||
/**
 * Build the ordered list of install steps from the current `status` flags.
 *
 * Order is fixed: 'libreoffice', then 'chrome', then 'imagemagick'. A step is
 * included only when its corresponding `needs*` flag on `status` is truthy.
 *
 * @returns {string[]} Step identifiers still to be run, possibly empty.
 */
function getStepsFromStatus() {
  const candidates = [
    ['libreoffice', status.needsLibreOffice],
    ['chrome', status.needsChrome],
    ['imagemagick', status.needsImageMagick],
  ];
  return candidates
    .filter(([, needed]) => needed)
    .map(([step]) => step);
}
|
||||
|
||||
function showPromptForStep(step) {
|
||||
currentStep = step;
|
||||
const total = (status.needsLibreOffice ? 1 : 0) + (status.needsChrome ? 1 : 0);
|
||||
const stepNum = step === 'libreoffice' ? 1 : 2;
|
||||
setStepBadge(stepNum, total, step === 'libreoffice' ? 'LibreOffice' : 'Chromium');
|
||||
document.getElementById('prompt-heading').textContent = step === 'libreoffice' ? 'LibreOffice required' : 'Chromium required';
|
||||
document.getElementById('prompt-sub').innerHTML = step === 'libreoffice'
|
||||
? '<strong>Presenton</strong> uses LibreOffice to generate custom templates from PPTX files.'
|
||||
: '<strong>Presenton</strong> uses Chromium for export and slide rendering. Download it now (~150 MB).';
|
||||
const total = steps.length || 1;
|
||||
const stepNum = Math.max(1, steps.indexOf(step) + 1);
|
||||
const stepLabel = step === 'libreoffice' ? 'LibreOffice' : step === 'chrome' ? 'Chromium' : 'ImageMagick';
|
||||
setStepBadge(stepNum, total, stepLabel);
|
||||
document.getElementById('prompt-heading').textContent =
|
||||
step === 'libreoffice' ? 'LibreOffice required' :
|
||||
step === 'chrome' ? 'Chromium required' :
|
||||
'ImageMagick required';
|
||||
document.getElementById('prompt-sub').innerHTML =
|
||||
step === 'libreoffice'
|
||||
? '<strong>Presenton</strong> uses LibreOffice to generate custom templates from PPTX files.'
|
||||
: step === 'chrome'
|
||||
? '<strong>Presenton</strong> uses Chromium for export and slide rendering. Download it now (~150 MB).'
|
||||
: '<strong>Presenton</strong> uses ImageMagick for OCR/document conversion support. Linux uses apt, macOS installs Homebrew first (if needed) and then runs brew install imagemagick, and Windows uses Chocolatey with a direct installer fallback.';
|
||||
document.getElementById('btn-install').onclick = () => startInstall(step);
|
||||
document.getElementById('btn-skip').onclick = () => handleSkip();
|
||||
showState('prompt');
|
||||
|
|
@ -286,7 +302,7 @@
|
|||
document.getElementById('dl-heading').textContent = 'Downloading LibreOffice';
|
||||
document.getElementById('dl-phase').textContent = 'This may take a few minutes (~300 MB)';
|
||||
window.setupInstaller.installLibreOffice();
|
||||
} else {
|
||||
} else if (step === 'chrome') {
|
||||
document.getElementById('dl-heading').textContent = 'Downloading Chromium';
|
||||
document.getElementById('dl-phase').textContent = 'This may take a few minutes (~150 MB)';
|
||||
window.setupInstaller.installChrome().then(res => {
|
||||
|
|
@ -297,12 +313,36 @@
|
|||
document.getElementById('btn-skip-error').onclick = () => nextOrDone();
|
||||
}
|
||||
});
|
||||
} else {
|
||||
document.getElementById('dl-heading').textContent = 'Installing ImageMagick';
|
||||
document.getElementById('dl-phase').textContent = 'Linux: apt-get | macOS: Homebrew + brew install | Windows: choco or direct installer';
|
||||
window.setupInstaller.installImageMagick().then((installResult) => {
|
||||
if (!installResult || !installResult.ok) {
|
||||
if (currentStep !== 'imagemagick') return;
|
||||
document.getElementById('err-msg').textContent = installResult?.error || 'ImageMagick installation needs manual completion. Follow the shown commands and then click Retry.';
|
||||
showState('error');
|
||||
document.getElementById('btn-retry').onclick = () => startInstall('imagemagick');
|
||||
document.getElementById('btn-skip-error').onclick = () => nextOrDone();
|
||||
return;
|
||||
}
|
||||
|
||||
window.setupInstaller.checkImageMagick().then(res => {
|
||||
if (!res.ok && currentStep === 'imagemagick') {
|
||||
document.getElementById('err-msg').textContent = res.error || 'ImageMagick is not installed yet.';
|
||||
showState('error');
|
||||
document.getElementById('btn-retry').onclick = () => startInstall('imagemagick');
|
||||
document.getElementById('btn-skip-error').onclick = () => nextOrDone();
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function nextOrDone() {
|
||||
if (currentStep === 'libreoffice' && status.needsChrome) {
|
||||
showPromptForStep('chrome');
|
||||
const idx = steps.indexOf(currentStep);
|
||||
const nextStep = idx >= 0 ? steps[idx + 1] : null;
|
||||
if (nextStep) {
|
||||
showPromptForStep(nextStep);
|
||||
} else {
|
||||
window.setupInstaller.done();
|
||||
}
|
||||
|
|
@ -338,13 +378,17 @@
|
|||
}
|
||||
if (phase === 'done') {
|
||||
showState('success');
|
||||
document.getElementById('success-heading').textContent = currentStep === 'libreoffice' ? 'LibreOffice installed' : 'Chromium installed';
|
||||
document.getElementById('success-sub').textContent = status.needsChrome && currentStep === 'libreoffice' ? 'Next: Chrome.' : 'Continuing in a moment…';
|
||||
document.getElementById('success-heading').textContent =
|
||||
currentStep === 'libreoffice' ? 'LibreOffice installed' :
|
||||
currentStep === 'chrome' ? 'Chromium installed' :
|
||||
'ImageMagick ready';
|
||||
const idx = steps.indexOf(currentStep);
|
||||
const nextStep = idx >= 0 ? steps[idx + 1] : null;
|
||||
document.getElementById('success-sub').textContent = nextStep ? 'Continuing with next step…' : 'Continuing in a moment…';
|
||||
const bar = document.getElementById('success-bar');
|
||||
if (bar) bar.style.width = '100%';
|
||||
setTimeout(() => {
|
||||
if (currentStep === 'libreoffice' && status.needsChrome) showPromptForStep('chrome');
|
||||
else window.setupInstaller.done();
|
||||
nextOrDone();
|
||||
}, 2200);
|
||||
return;
|
||||
}
|
||||
|
|
@ -366,18 +410,20 @@
|
|||
window.setupInstaller.onLibreOfficeLog((data) => onLog('libreoffice', data));
|
||||
window.setupInstaller.onChromeProgress((data) => onProgress('chrome', data));
|
||||
window.setupInstaller.onChromeLog((data) => onLog('chrome', data));
|
||||
window.setupInstaller.onImageMagickProgress((data) => onProgress('imagemagick', data));
|
||||
window.setupInstaller.onImageMagickLog((data) => onLog('imagemagick', data));
|
||||
|
||||
document.getElementById('btn-retry').onclick = () => startInstall(currentStep);
|
||||
document.getElementById('btn-skip-error').onclick = () => nextOrDone();
|
||||
|
||||
window.setupInstaller.getStatus().then(s => {
|
||||
status = s;
|
||||
if (!status.needsLibreOffice && !status.needsChrome) {
|
||||
steps = getStepsFromStatus();
|
||||
if (steps.length === 0) {
|
||||
window.setupInstaller.done();
|
||||
return;
|
||||
}
|
||||
if (status.needsLibreOffice) showPromptForStep('libreoffice');
|
||||
else showPromptForStep('chrome');
|
||||
showPromptForStep(steps[0]);
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ from typing import Annotated, List, Optional
|
|||
from fastapi import APIRouter, Body, File, UploadFile
|
||||
|
||||
from constants.documents import UPLOAD_ACCEPTED_FILE_TYPES
|
||||
from models.decompose_files_body import DecomposeFilesBody
|
||||
from models.decomposed_file_info import DecomposedFileInfo
|
||||
from services.temp_file_service import TEMP_FILE_SERVICE
|
||||
from services.documents_loader import DocumentsLoader
|
||||
|
|
@ -38,18 +39,21 @@ async def upload_files(files: Optional[List[UploadFile]]):
|
|||
|
||||
|
||||
@FILES_ROUTER.post("/decompose", response_model=List[DecomposedFileInfo])
|
||||
async def decompose_files(file_paths: Annotated[List[str], Body(embed=True)]):
|
||||
async def decompose_files(body: DecomposeFilesBody):
|
||||
temp_dir = TEMP_FILE_SERVICE.create_temp_dir(str(uuid.uuid4()))
|
||||
|
||||
txt_files = []
|
||||
other_files = []
|
||||
for file_path in file_paths:
|
||||
for file_path in body.file_paths:
|
||||
if file_path.endswith(".txt"):
|
||||
txt_files.append(file_path)
|
||||
else:
|
||||
other_files.append(file_path)
|
||||
|
||||
documents_loader = DocumentsLoader(file_paths=other_files)
|
||||
documents_loader = DocumentsLoader(
|
||||
file_paths=other_files,
|
||||
presentation_language=body.language,
|
||||
)
|
||||
await documents_loader.load_documents(temp_dir)
|
||||
parsed_documents = documents_loader.documents
|
||||
|
||||
|
|
|
|||
|
|
@ -43,7 +43,10 @@ async def stream_outlines(
|
|||
|
||||
additional_context = ""
|
||||
if presentation.file_paths:
|
||||
documents_loader = DocumentsLoader(file_paths=presentation.file_paths)
|
||||
documents_loader = DocumentsLoader(
|
||||
file_paths=presentation.file_paths,
|
||||
presentation_language=presentation.language,
|
||||
)
|
||||
await documents_loader.load_documents(temp_dir)
|
||||
documents = documents_loader.documents
|
||||
if documents:
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ import re
|
|||
from services.documents_loader import DocumentsLoader
|
||||
from utils.asset_directory_utils import get_images_directory
|
||||
import uuid
|
||||
from constants.documents import POWERPOINT_TYPES
|
||||
from constants.documents import PPTX_MIME_TYPES
|
||||
|
||||
|
||||
def _get_soffice_binary() -> str:
|
||||
|
|
@ -330,7 +330,7 @@ async def process_pptx_slides(
|
|||
"""
|
||||
|
||||
# Validate PPTX file
|
||||
if pptx_file.content_type not in POWERPOINT_TYPES:
|
||||
if pptx_file.content_type not in PPTX_MIME_TYPES:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Invalid file type. Expected PPTX file, got {pptx_file.content_type}",
|
||||
|
|
@ -441,7 +441,7 @@ async def process_pptx_fonts(
|
|||
Uses the exact same font extraction and analysis utilities as the /pptx-slides endpoint.
|
||||
"""
|
||||
# Validate PPTX file
|
||||
if pptx_file.content_type not in POWERPOINT_TYPES:
|
||||
if pptx_file.content_type not in PPTX_MIME_TYPES:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Invalid file type. Expected PPTX file, got {pptx_file.content_type}",
|
||||
|
|
|
|||
|
|
@ -518,7 +518,10 @@ async def generate_presentation_handler(
|
|||
await sql_session.commit()
|
||||
|
||||
if request.files:
|
||||
documents_loader = DocumentsLoader(file_paths=request.files)
|
||||
documents_loader = DocumentsLoader(
|
||||
file_paths=request.files,
|
||||
presentation_language=request.language,
|
||||
)
|
||||
await documents_loader.load_documents()
|
||||
documents = documents_loader.documents
|
||||
if documents:
|
||||
|
|
|
|||
|
|
@ -1,20 +1,90 @@
|
|||
PDF_EXTENSIONS = [".pdf"]
|
||||
TEXT_EXTENSIONS = [".txt"]
|
||||
|
||||
WORD_EXTENSIONS = [".doc", ".docx", ".docm", ".odt", ".rtf"]
|
||||
POWERPOINT_EXTENSIONS = [".ppt", ".pptx", ".pptm", ".odp"]
|
||||
SPREADSHEET_EXTENSIONS = [".xls", ".xlsx", ".xlsm", ".ods", ".csv", ".tsv"]
|
||||
|
||||
JPEG_EXTENSIONS = [".jpg", ".jpeg"]
|
||||
PNG_EXTENSIONS = [".png"]
|
||||
GIF_EXTENSIONS = [".gif"]
|
||||
BMP_EXTENSIONS = [".bmp"]
|
||||
TIFF_EXTENSIONS = [".tiff", ".tif"]
|
||||
WEBP_EXTENSIONS = [".webp"]
|
||||
SVG_EXTENSIONS = [".svg"]
|
||||
IMAGE_EXTENSIONS = (
|
||||
JPEG_EXTENSIONS
|
||||
+ PNG_EXTENSIONS
|
||||
+ GIF_EXTENSIONS
|
||||
+ BMP_EXTENSIONS
|
||||
+ TIFF_EXTENSIONS
|
||||
+ WEBP_EXTENSIONS
|
||||
+ SVG_EXTENSIONS
|
||||
)
|
||||
|
||||
OFFICE_EXTENSIONS = WORD_EXTENSIONS + POWERPOINT_EXTENSIONS + SPREADSHEET_EXTENSIONS
|
||||
|
||||
PDF_MIME_TYPES = ["application/pdf"]
|
||||
TEXT_MIME_TYPES = ["text/plain"]
|
||||
POWERPOINT_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
]
|
||||
WORD_TYPES = [
|
||||
TEXT_MIME_TYPES = ["text/plain", "text/markdown"]
|
||||
|
||||
WORD_MIME_TYPES = [
|
||||
"application/msword",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.ms-word.document.macroenabled.12",
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
"application/rtf",
|
||||
"text/rtf",
|
||||
]
|
||||
SPREADSHEET_TYPES = ["text/csv", "application/csv"]
|
||||
|
||||
POWERPOINT_MIME_TYPES = [
|
||||
"application/vnd.ms-powerpoint",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
"application/vnd.ms-powerpoint.presentation.macroenabled.12",
|
||||
"application/vnd.oasis.opendocument.presentation",
|
||||
]
|
||||
|
||||
PNG_MIME_TYPES = ["image/png"]
|
||||
JPEG_MIME_TYPES = ["image/jpeg"]
|
||||
WEBP_MIME_TYPES = ["image/webp"]
|
||||
SPREADSHEET_MIME_TYPES = [
|
||||
"application/vnd.ms-excel",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"application/vnd.ms-excel.sheet.macroenabled.12",
|
||||
"application/vnd.oasis.opendocument.spreadsheet",
|
||||
"text/csv",
|
||||
"application/csv",
|
||||
"text/tab-separated-values",
|
||||
"text/tsv",
|
||||
]
|
||||
|
||||
IMAGE_MIME_TYPES = [
|
||||
"image/jpeg",
|
||||
"image/png",
|
||||
"image/gif",
|
||||
"image/bmp",
|
||||
"image/tiff",
|
||||
"image/webp",
|
||||
"image/svg+xml",
|
||||
]
|
||||
|
||||
UPLOAD_ACCEPTED_FILE_TYPES = (
|
||||
PDF_MIME_TYPES + TEXT_MIME_TYPES + POWERPOINT_TYPES + WORD_TYPES
|
||||
UPLOAD_ACCEPTED_MIME_TYPES = (
|
||||
PDF_MIME_TYPES
|
||||
+ TEXT_MIME_TYPES
|
||||
+ WORD_MIME_TYPES
|
||||
+ POWERPOINT_MIME_TYPES
|
||||
+ SPREADSHEET_MIME_TYPES
|
||||
+ IMAGE_MIME_TYPES
|
||||
)
|
||||
|
||||
UPLOAD_ACCEPTED_EXTENSIONS = (
|
||||
PDF_EXTENSIONS + TEXT_EXTENSIONS + OFFICE_EXTENSIONS + IMAGE_EXTENSIONS
|
||||
)
|
||||
|
||||
# Includes both MIME types and extensions because some clients upload legacy
|
||||
# office files with generic content-type values.
|
||||
UPLOAD_ACCEPTED_FILE_TYPES = UPLOAD_ACCEPTED_MIME_TYPES + UPLOAD_ACCEPTED_EXTENSIONS
|
||||
|
||||
# Kept for endpoints that strictly require modern .pptx files.
|
||||
PPTX_MIME_TYPES = ["application/vnd.openxmlformats-officedocument.presentationml.presentation"]
|
||||
|
||||
# Backward compatibility aliases used across existing modules.
|
||||
POWERPOINT_TYPES = PPTX_MIME_TYPES
|
||||
WORD_TYPES = WORD_MIME_TYPES
|
||||
SPREADSHEET_TYPES = SPREADSHEET_MIME_TYPES
|
||||
|
|
|
|||
11
electron/servers/fastapi/models/decompose_files_body.py
Normal file
11
electron/servers/fastapi/models/decompose_files_body.py
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class DecomposeFilesBody(BaseModel):
    """Request body for the file decomposition endpoint.

    Attributes are documented inline; ``language`` is optional so a body that
    only carries ``file_paths`` still validates.
    """

    # Paths of the previously uploaded files to decompose.
    file_paths: List[str]
    # Optional presentation language forwarded as an OCR language hint.
    language: Optional[str] = Field(
        default=None,
        description="Presentation language from the UI; used as LiteParse/Tesseract OCR language hint.",
    )
|
||||
|
|
@ -1,47 +1,45 @@
|
|||
[project]
|
||||
name = "presenton-backend"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
requires-python = ">=3.11,<3.12"
|
||||
dependencies = [
|
||||
"alembic>=1.14.0",
|
||||
"aiohttp>=3.12.15",
|
||||
"aiomysql>=0.2.0",
|
||||
"aiosqlite>=0.21.0",
|
||||
"anthropic>=0.60.0",
|
||||
"asyncpg>=0.30.0",
|
||||
"dirtyjson>=1.0.8",
|
||||
# Platform-specific: docling for Linux/macOS only
|
||||
"docling>=2.43.0; sys_platform != 'win32'",
|
||||
"fastapi[standard]>=0.116.1",
|
||||
"fastembed-vectorstore>=0.5.2",
|
||||
"fastmcp>=2.11.0",
|
||||
"google-genai>=1.28.0",
|
||||
# Platform-specific: greenlet for macOS only (critical for SQLAlchemy async)
|
||||
"greenlet>=3.0.0; sys_platform == 'darwin'",
|
||||
"nltk>=3.9.1",
|
||||
"openai>=1.98.0",
|
||||
"pathvalidate>=3.3.1",
|
||||
"pdfplumber>=0.11.7",
|
||||
# Platform-specific: docx2everything for DOCX/Markdown extraction on Windows
|
||||
"docx2everything>=1.0.0; sys_platform == 'win32'",
|
||||
"pyinstaller>=6.18.0",
|
||||
"pytest>=8.4.1",
|
||||
"python-pptx>=1.0.2; sys_platform == 'win32'",
|
||||
"redis>=6.2.0",
|
||||
"sqlmodel>=0.0.24",
|
||||
]
|
||||
|
||||
[tool.uv]
|
||||
index-strategy = "unsafe-best-match"
|
||||
|
||||
[[tool.uv.index]]
|
||||
url = "https://download.pytorch.org/whl/cpu"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["."]
|
||||
include = ["api*", "enums*", "models*", "services*", "constants*", "utils*"]
|
||||
[project]
|
||||
name = "presenton-backend"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
requires-python = ">=3.11,<3.12"
|
||||
dependencies = [
|
||||
"alembic>=1.14.0",
|
||||
"aiohttp>=3.12.15",
|
||||
"aiomysql>=0.2.0",
|
||||
"aiosqlite>=0.21.0",
|
||||
"anthropic>=0.60.0",
|
||||
"asyncpg>=0.30.0",
|
||||
"dirtyjson>=1.0.8",
|
||||
"fastapi[standard]>=0.116.1",
|
||||
"fastembed-vectorstore>=0.5.2",
|
||||
"fastmcp>=2.11.0",
|
||||
"google-genai>=1.28.0",
|
||||
# Platform-specific: greenlet for macOS only (critical for SQLAlchemy async)
|
||||
"greenlet>=3.0.0; sys_platform == 'darwin'",
|
||||
"nltk>=3.9.1",
|
||||
"openai>=1.98.0",
|
||||
"pathvalidate>=3.3.1",
|
||||
"pdfplumber>=0.11.7",
|
||||
# Platform-specific: docx2everything for DOCX/Markdown extraction on Windows
|
||||
"docx2everything>=1.0.0; sys_platform == 'win32'",
|
||||
"pyinstaller>=6.18.0",
|
||||
"pytest>=8.4.1",
|
||||
"python-pptx>=1.0.2",
|
||||
"redis>=6.2.0",
|
||||
"sqlmodel>=0.0.24",
|
||||
]
|
||||
|
||||
[tool.uv]
|
||||
index-strategy = "unsafe-best-match"
|
||||
|
||||
[[tool.uv.index]]
|
||||
url = "https://download.pytorch.org/whl/cpu"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["."]
|
||||
include = ["api*", "enums*", "models*", "services*", "constants*", "utils*"]
|
||||
|
|
|
|||
|
|
@ -1,78 +0,0 @@
|
|||
"""
|
||||
Runtime hook to fix docling metadata lookup and python-docx template path resolution in PyInstaller builds.
|
||||
|
||||
PyInstaller doesn't always preserve package metadata (dist-info) in a way that
|
||||
importlib.metadata can find it. This hook patches the version lookup to return
|
||||
a default version if metadata isn't found, allowing docling to import successfully.
|
||||
|
||||
Additionally, python-docx uses __file__ to locate template files, which doesn't work
|
||||
correctly in PyInstaller bundles. This hook patches the path resolution to use
|
||||
sys._MEIPASS to find the templates.
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Only apply this fix when running in PyInstaller bundle
|
||||
if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'):
|
||||
try:
|
||||
import importlib.metadata
|
||||
|
||||
# Store original version function
|
||||
_original_version = importlib.metadata.version
|
||||
|
||||
def _patched_version(package_name):
|
||||
"""Patched version that handles missing metadata gracefully."""
|
||||
try:
|
||||
return _original_version(package_name)
|
||||
except importlib.metadata.PackageNotFoundError:
|
||||
# For docling packages, return a default version if metadata not found
|
||||
if package_name in ('docling', 'docling-core', 'docling-parse', 'docling-ibm-models'):
|
||||
# Return a reasonable default version to allow import to proceed
|
||||
return '2.43.0'
|
||||
raise
|
||||
|
||||
# Patch the version function
|
||||
importlib.metadata.version = _patched_version
|
||||
|
||||
except Exception:
|
||||
# If patching fails, continue anyway
|
||||
pass
|
||||
|
||||
# Fix python-docx template path resolution
|
||||
try:
|
||||
import docx.parts.hdrftr as hdrftr_module
|
||||
|
||||
# Store the original _default_header_xml function
|
||||
if hasattr(hdrftr_module, '_default_header_xml'):
|
||||
_original_default_header_xml = hdrftr_module._default_header_xml
|
||||
|
||||
def _patched_default_header_xml():
|
||||
"""Patched function that resolves template path correctly in PyInstaller bundle."""
|
||||
# Try to find the template file in the bundle
|
||||
template_path = os.path.join(sys._MEIPASS, 'docx', 'templates', 'default-header.xml')
|
||||
if os.path.exists(template_path):
|
||||
with open(template_path, 'rb') as f:
|
||||
return f.read()
|
||||
# Fallback to original implementation
|
||||
return _original_default_header_xml()
|
||||
|
||||
# Patch the function
|
||||
hdrftr_module._default_header_xml = _patched_default_header_xml
|
||||
|
||||
# Also patch _default_footer_xml if it exists
|
||||
if hasattr(hdrftr_module, '_default_footer_xml'):
|
||||
_original_default_footer_xml = hdrftr_module._default_footer_xml
|
||||
|
||||
def _patched_default_footer_xml():
|
||||
"""Patched function that resolves template path correctly in PyInstaller bundle."""
|
||||
template_path = os.path.join(sys._MEIPASS, 'docx', 'templates', 'default-footer.xml')
|
||||
if os.path.exists(template_path):
|
||||
with open(template_path, 'rb') as f:
|
||||
return f.read()
|
||||
return _original_default_footer_xml()
|
||||
|
||||
hdrftr_module._default_footer_xml = _patched_default_footer_xml
|
||||
|
||||
except Exception:
|
||||
# If patching fails, continue anyway
|
||||
pass
|
||||
|
|
@ -17,18 +17,6 @@ datas_docx2everything, binaries_docx2everything, hiddenimports_docx2everything =
|
|||
# collect_all returns empty lists if package not installed, so safe to call always
|
||||
datas_greenlet, binaries_greenlet, hiddenimports_greenlet = collect_all('greenlet')
|
||||
|
||||
# Collect docling - only installed on Linux/macOS (via pyproject.toml)
|
||||
# collect_all returns empty lists if package not installed, so safe to call always
|
||||
datas_docling, binaries_docling, hiddenimports_docling = collect_all('docling')
|
||||
# Also collect docling dependencies which are needed for metadata lookup
|
||||
datas_docling_core, binaries_docling_core, hiddenimports_docling_core = collect_all('docling-core')
|
||||
datas_docling_parse, binaries_docling_parse, hiddenimports_docling_parse = collect_all('docling-parse')
|
||||
datas_docling_ibm, binaries_docling_ibm, hiddenimports_docling_ibm = collect_all('docling-ibm-models')
|
||||
|
||||
# Collect python-docx (dependency of docling) - needed for Word document processing on Linux/macOS
|
||||
# collect_all returns empty lists if package not installed, so safe to call conditionally
|
||||
datas_docx, binaries_docx, hiddenimports_docx = collect_all('docx')
|
||||
|
||||
# fastembed_cache is created at runtime when models are first used; include only if present (e.g. local dev)
|
||||
datas_fastembed_cache = [('fastembed_cache', 'fastembed_cache')] if os.path.isdir('fastembed_cache') else []
|
||||
|
||||
|
|
@ -37,12 +25,12 @@ excludes = []
|
|||
a = Analysis(
|
||||
['server.py'],
|
||||
pathex=[],
|
||||
binaries=binaries_fastembed + binaries_fastembed_vs + binaries_onnx + binaries_pptx + binaries_docx2everything + binaries_greenlet + binaries_docling + binaries_docling_core + binaries_docling_parse + binaries_docling_ibm + binaries_docx,
|
||||
binaries=binaries_fastembed + binaries_fastembed_vs + binaries_onnx + binaries_pptx + binaries_docx2everything + binaries_greenlet,
|
||||
datas=[
|
||||
('assets', 'assets'),
|
||||
('static', 'static'),
|
||||
('alembic', 'alembic'),
|
||||
] + datas_fastembed_cache + datas_fastembed + datas_fastembed_vs + datas_onnx + datas_pptx + datas_docx2everything + datas_greenlet + datas_docling + datas_docling_core + datas_docling_parse + datas_docling_ibm + datas_docx,
|
||||
] + datas_fastembed_cache + datas_fastembed + datas_fastembed_vs + datas_onnx + datas_pptx + datas_docx2everything + datas_greenlet,
|
||||
hiddenimports=[
|
||||
'aiosqlite',
|
||||
'alembic',
|
||||
|
|
@ -52,10 +40,10 @@ a = Analysis(
|
|||
'greenlet',
|
||||
'greenlet._greenlet',
|
||||
'importlib.metadata',
|
||||
] + hiddenimports_fastembed + hiddenimports_fastembed_vs + hiddenimports_onnx + hiddenimports_pptx + hiddenimports_docx2everything + hiddenimports_greenlet + hiddenimports_docling + hiddenimports_docling_core + hiddenimports_docling_parse + hiddenimports_docling_ibm + hiddenimports_docx,
|
||||
] + hiddenimports_fastembed + hiddenimports_fastembed_vs + hiddenimports_onnx + hiddenimports_pptx + hiddenimports_docx2everything + hiddenimports_greenlet,
|
||||
hookspath=[],
|
||||
hooksconfig={},
|
||||
runtime_hooks=['runtime_hook_docling.py'],
|
||||
runtime_hooks=[],
|
||||
excludes=excludes,
|
||||
noarchive=False,
|
||||
optimize=0,
|
||||
|
|
|
|||
|
|
@ -1,38 +0,0 @@
|
|||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
PdfFormatOption,
|
||||
PowerpointFormatOption,
|
||||
WordFormatOption,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from utils.path_helpers import patch_python_docx_templates
|
||||
|
||||
|
||||
class DoclingService:
|
||||
def __init__(self):
|
||||
# Patch python-docx template path resolution before initializing converter
|
||||
# This is safe to call in any environment (Docker, development, PyInstaller)
|
||||
patch_python_docx_templates()
|
||||
|
||||
self.pipeline_options = PdfPipelineOptions()
|
||||
self.pipeline_options.do_ocr = False
|
||||
|
||||
self.converter = DocumentConverter(
|
||||
allowed_formats=[InputFormat.PPTX, InputFormat.PDF, InputFormat.DOCX],
|
||||
format_options={
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_options=self.pipeline_options,
|
||||
),
|
||||
InputFormat.PPTX: PowerpointFormatOption(
|
||||
pipeline_options=self.pipeline_options,
|
||||
),
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=self.pipeline_options,
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
def parse_to_markdown(self, file_path: str) -> str:
|
||||
result = self.converter.convert(file_path)
|
||||
return result.document.export_to_markdown()
|
||||
165
electron/servers/fastapi/services/document_conversion_service.py
Normal file
165
electron/servers/fastapi/services/document_conversion_service.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
class DocumentConversionError(Exception):
    """Signals that an external document/image conversion did not succeed."""
|
||||
|
||||
|
||||
def _windows_hidden_subprocess_kwargs() -> Dict[str, object]:
|
||||
if os.name != "nt":
|
||||
return {}
|
||||
|
||||
startupinfo = subprocess.STARTUPINFO()
|
||||
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
|
||||
return {
|
||||
"creationflags": getattr(subprocess, "CREATE_NO_WINDOW", 0),
|
||||
"startupinfo": startupinfo,
|
||||
}
|
||||
|
||||
|
||||
class DocumentConversionService:
    """Converts uploads by shelling out to LibreOffice and ImageMagick.

    Office documents are turned into PDF and images into PNG. Every failure of
    an external tool (timeout, non-zero exit, launch error, missing output) is
    reported as ``DocumentConversionError``.
    """

    def __init__(self):
        # Resolve both executables once; the ImageMagick lookup may spawn
        # short "-version" probes.
        self.soffice_binary = self._resolve_soffice_binary()
        self.imagemagick_binary = self._resolve_imagemagick_binary()

    @staticmethod
    def _resolve_soffice_binary() -> str:
        """Return the LibreOffice executable to use.

        A non-blank ``SOFFICE_PATH`` environment variable wins; otherwise the
        bare command name for the current platform is returned.
        """
        configured = (os.getenv("SOFFICE_PATH") or "").strip()
        if configured:
            return configured
        return "soffice.exe" if os.name == "nt" else "soffice"

    @staticmethod
    def _can_execute(command: str, args: List[str]) -> bool:
        """Return True when ``command args...`` runs and exits with status 0.

        Any error (missing binary, timeout after 10 seconds, ...) is treated
        as "not executable" on purpose: this is only a best-effort probe.
        """
        try:
            result = subprocess.run(
                [command, *args],
                capture_output=True,
                text=True,
                timeout=10,
                check=False,
                **_windows_hidden_subprocess_kwargs(),
            )
            return result.returncode == 0
        except Exception:
            return False

    def _resolve_imagemagick_binary(self) -> str:
        """Return the ImageMagick executable to use.

        Order of preference: a non-blank ``IMAGEMAGICK_BINARY`` environment
        variable, then the first of ``magick`` / ``convert`` that answers
        ``-version`` successfully, then a per-platform default name.
        """
        configured = (os.getenv("IMAGEMAGICK_BINARY") or "").strip()
        if configured:
            return configured

        for candidate in ["magick", "convert"]:
            if self._can_execute(candidate, ["-version"]):
                return candidate

        return "magick" if os.name == "nt" else "convert"

    @staticmethod
    def _run_conversion(
        command: List[str],
        tool_name: str,
        file_path: str,
        timeout_seconds: int,
    ) -> None:
        """Run one converter command and translate failures.

        Args:
            command: Full argument vector to execute.
            tool_name: Human-readable tool name used in error messages.
            file_path: Source file; only its base name appears in messages.
            timeout_seconds: Upper bound for the subprocess run.

        Raises:
            DocumentConversionError: On timeout, non-zero exit status, or any
                other error while launching/running the command.
        """
        source_name = os.path.basename(file_path)
        try:
            subprocess.run(
                command,
                check=True,
                capture_output=True,
                text=True,
                timeout=timeout_seconds,
                **_windows_hidden_subprocess_kwargs(),
            )
        except subprocess.TimeoutExpired as exc:
            raise DocumentConversionError(
                f"{tool_name} conversion timed out for {source_name}"
            ) from exc
        except subprocess.CalledProcessError as exc:
            # Prefer stderr, then stdout, then the exception text itself.
            stderr = (exc.stderr or "").strip()
            stdout = (exc.stdout or "").strip()
            details = stderr or stdout or str(exc)
            raise DocumentConversionError(
                f"{tool_name} conversion failed for {source_name}: {details}"
            ) from exc
        except Exception as exc:
            raise DocumentConversionError(
                f"{tool_name} conversion failed for {source_name}: {exc}"
            ) from exc

    def convert_office_to_pdf(
        self,
        file_path: str,
        output_dir: str,
        timeout_seconds: int = 180,
    ) -> str:
        """Convert an office document to PDF with headless LibreOffice.

        Args:
            file_path: Document to convert.
            output_dir: Directory that receives the PDF (created if missing).
            timeout_seconds: Maximum time allowed for the LibreOffice run.

        Returns:
            Path of the produced PDF: ``<stem>.pdf`` inside ``output_dir`` when
            present, otherwise the most recently modified PDF that did not
            exist in ``output_dir`` before the run.

        Raises:
            DocumentConversionError: If LibreOffice fails, times out, or no
                PDF can be found afterwards.
        """
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Snapshot existing PDFs so a differently named output can still be
        # recognised as new after the conversion.
        existing_pdfs = {
            p.name for p in Path(output_dir).glob("*.pdf") if p.is_file()
        }

        self._run_conversion(
            [
                self.soffice_binary,
                "--headless",
                "--convert-to",
                "pdf",
                "--outdir",
                output_dir,
                file_path,
            ],
            "LibreOffice",
            file_path,
            timeout_seconds,
        )

        expected_pdf = Path(output_dir) / f"{Path(file_path).stem}.pdf"
        if expected_pdf.is_file():
            return str(expected_pdf)

        generated_pdfs = [
            p
            for p in Path(output_dir).glob("*.pdf")
            if p.is_file() and p.name not in existing_pdfs
        ]
        if generated_pdfs:
            newest = max(generated_pdfs, key=lambda p: p.stat().st_mtime)
            return str(newest)

        raise DocumentConversionError(
            f"LibreOffice did not create a PDF for {os.path.basename(file_path)}"
        )

    def convert_image_to_png(
        self,
        file_path: str,
        output_dir: str,
        timeout_seconds: int = 120,
    ) -> str:
        """Convert an image to PNG with ImageMagick.

        Args:
            file_path: Image to convert.
            output_dir: Directory that receives the PNG (created if missing).
            timeout_seconds: Maximum time allowed for the ImageMagick run.

        Returns:
            Path of ``<stem>_converted.png`` inside ``output_dir``. For
            multi-frame inputs, where ImageMagick emits numbered files instead,
            the path of the first frame (``<stem>_converted-0.png``).

        Raises:
            DocumentConversionError: If ImageMagick fails, times out, or no
                PNG can be found afterwards.
        """
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        output_path = Path(output_dir) / f"{Path(file_path).stem}_converted.png"

        self._run_conversion(
            [self.imagemagick_binary, file_path, str(output_path)],
            "ImageMagick",
            file_path,
            timeout_seconds,
        )

        if output_path.is_file():
            return str(output_path)

        # Multi-frame inputs (animated GIF, multi-page TIFF) make ImageMagick
        # write "<name>-0.png", "<name>-1.png", ... instead of the requested
        # file. Use the first frame rather than reporting a failed conversion.
        first_frame = output_path.with_name(
            f"{output_path.stem}-0{output_path.suffix}"
        )
        if first_frame.is_file():
            return str(first_frame)

        raise DocumentConversionError(
            f"ImageMagick did not create a PNG for {os.path.basename(file_path)}"
        )
|
||||
|
|
@ -1,45 +1,44 @@
|
|||
import mimetypes
|
||||
import sys
|
||||
from fastapi import HTTPException
|
||||
import os, asyncio
|
||||
import asyncio
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import pdfplumber
|
||||
from constants.documents import (
|
||||
PDF_MIME_TYPES,
|
||||
POWERPOINT_TYPES,
|
||||
TEXT_MIME_TYPES,
|
||||
WORD_TYPES,
|
||||
)
|
||||
from fastapi import HTTPException
|
||||
|
||||
# Platform-specific document service imports
|
||||
is_windows = sys.platform == 'win32'
|
||||
if not is_windows:
|
||||
from services.docling_service import DoclingService
|
||||
DocumentService = None
|
||||
else:
|
||||
DoclingService = None
|
||||
from constants.documents import (
|
||||
IMAGE_EXTENSIONS,
|
||||
OFFICE_EXTENSIONS,
|
||||
PDF_EXTENSIONS,
|
||||
TEXT_EXTENSIONS,
|
||||
)
|
||||
from services.document_conversion_service import (
|
||||
DocumentConversionError,
|
||||
DocumentConversionService,
|
||||
)
|
||||
from services.liteparse_service import LiteParseError, LiteParseService
|
||||
from utils.ocr_language import presentation_language_to_ocr_code
|
||||
|
||||
# Optional fallback converter (primarily useful on Windows)
|
||||
try:
|
||||
from services.lightweight_document_service import DocumentService
|
||||
except Exception:
|
||||
DocumentService = None
|
||||
|
||||
|
||||
class DocumentsLoader:
|
||||
|
||||
def __init__(self, file_paths: List[str]):
|
||||
def __init__(
|
||||
self,
|
||||
file_paths: List[str],
|
||||
presentation_language: Optional[str] = None,
|
||||
):
|
||||
self._file_paths = file_paths
|
||||
|
||||
# Initialize document service based on platform
|
||||
if not is_windows and DoclingService is not None:
|
||||
# Use DoclingService on Linux/macOS
|
||||
self.docling_service = DoclingService()
|
||||
self.document_service = None
|
||||
elif is_windows and DocumentService is not None:
|
||||
# Use lightweight DocumentService on Windows
|
||||
self.docling_service = None
|
||||
self.document_service = DocumentService()
|
||||
else:
|
||||
# Fallback if neither is available
|
||||
self.docling_service = None
|
||||
self.document_service = None
|
||||
self._ocr_language = presentation_language_to_ocr_code(presentation_language)
|
||||
self.liteparse_service = LiteParseService()
|
||||
self.document_conversion_service = DocumentConversionService()
|
||||
self.document_service = DocumentService() if DocumentService is not None else None
|
||||
|
||||
self._documents: List[str] = []
|
||||
self._images: List[List[str]] = []
|
||||
|
|
@ -61,7 +60,7 @@ class DocumentsLoader:
|
|||
"""If load_images is True, temp_dir must be provided"""
|
||||
|
||||
documents: List[str] = []
|
||||
images: List[str] = []
|
||||
images: List[List[str]] = []
|
||||
|
||||
for file_path in self._file_paths:
|
||||
if not os.path.exists(file_path):
|
||||
|
|
@ -72,17 +71,28 @@ class DocumentsLoader:
|
|||
document = ""
|
||||
imgs = []
|
||||
|
||||
mime_type = mimetypes.guess_type(file_path)[0]
|
||||
if mime_type in PDF_MIME_TYPES:
|
||||
extension = Path(file_path).suffix.lower()
|
||||
|
||||
if extension in PDF_EXTENSIONS:
|
||||
document, imgs = await self.load_pdf(
|
||||
file_path, load_text, load_images, temp_dir
|
||||
)
|
||||
elif mime_type in TEXT_MIME_TYPES:
|
||||
elif extension in TEXT_EXTENSIONS:
|
||||
document = await self.load_text(file_path)
|
||||
elif mime_type in POWERPOINT_TYPES:
|
||||
document = self.load_powerpoint(file_path)
|
||||
elif mime_type in WORD_TYPES:
|
||||
document = self.load_msword(file_path)
|
||||
elif extension in OFFICE_EXTENSIONS:
|
||||
document = await asyncio.to_thread(
|
||||
self.load_office_document,
|
||||
file_path,
|
||||
temp_dir,
|
||||
)
|
||||
elif extension in IMAGE_EXTENSIONS:
|
||||
document = await asyncio.to_thread(
|
||||
self.load_image,
|
||||
file_path,
|
||||
temp_dir,
|
||||
)
|
||||
else:
|
||||
document = await asyncio.to_thread(self._parse_with_liteparse, file_path)
|
||||
|
||||
documents.append(document)
|
||||
images.append(imgs)
|
||||
|
|
@ -101,43 +111,64 @@ class DocumentsLoader:
|
|||
document: str = ""
|
||||
|
||||
if load_text:
|
||||
document = await self.load_text_from_pdf_locally(file_path)
|
||||
document = await asyncio.to_thread(self._parse_with_liteparse, file_path)
|
||||
|
||||
if load_images:
|
||||
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
|
||||
|
||||
return document, image_paths
|
||||
|
||||
async def load_text_from_pdf_locally(self, file_path: str) -> str:
|
||||
return await asyncio.to_thread(self._extract_text_from_pdf, file_path)
|
||||
|
||||
@staticmethod
|
||||
def _extract_text_from_pdf(file_path: str) -> str:
|
||||
texts: List[str] = []
|
||||
with pdfplumber.open(file_path) as pdf:
|
||||
for idx, page in enumerate(pdf.pages):
|
||||
page_text = f"## Page {idx + 1}\n"
|
||||
page_text += page.extract_text() or ""
|
||||
texts.append(page_text)
|
||||
return "\n\n".join(texts)
|
||||
|
||||
async def load_text(self, file_path: str) -> str:
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
return await asyncio.to_thread(file.read)
|
||||
|
||||
def load_msword(self, file_path: str) -> str:
|
||||
if self.docling_service is not None:
|
||||
return self.docling_service.parse_to_markdown(file_path)
|
||||
elif self.document_service is not None:
|
||||
return self.document_service.parse_to_markdown(file_path)
|
||||
return "" # Document service not available
|
||||
def load_office_document(self, file_path: str, temp_dir: Optional[str] = None) -> str:
|
||||
if temp_dir:
|
||||
converted_path = self.document_conversion_service.convert_office_to_pdf(
|
||||
file_path,
|
||||
temp_dir,
|
||||
)
|
||||
return self._parse_with_liteparse(converted_path)
|
||||
|
||||
def load_powerpoint(self, file_path: str) -> str:
|
||||
if self.docling_service is not None:
|
||||
return self.docling_service.parse_to_markdown(file_path)
|
||||
elif self.document_service is not None:
|
||||
return self.document_service.parse_to_markdown(file_path)
|
||||
return "" # Document service not available
|
||||
with tempfile.TemporaryDirectory(prefix="office-convert-") as conversion_dir:
|
||||
converted_path = self.document_conversion_service.convert_office_to_pdf(
|
||||
file_path,
|
||||
conversion_dir,
|
||||
)
|
||||
return self._parse_with_liteparse(converted_path)
|
||||
|
||||
def load_image(self, file_path: str, temp_dir: Optional[str] = None) -> str:
    """Convert an image to PNG and parse the converted file with LiteParse.

    Args:
        file_path: Path of the source image.
        temp_dir: Directory to write the converted PNG into. When omitted
            (or empty), a temporary directory with the ``image-convert-``
            prefix is created for the duration of the call.

    Returns:
        The markdown text returned by ``_parse_with_liteparse`` for the
        converted image.
    """

    def _convert_then_parse(target_dir: str) -> str:
        png_path = self.document_conversion_service.convert_image_to_png(
            file_path,
            target_dir,
        )
        return self._parse_with_liteparse(png_path)

    if not temp_dir:
        # No caller-supplied directory: parse while the scratch directory
        # still exists, then let the context manager clean it up.
        with tempfile.TemporaryDirectory(prefix="image-convert-") as scratch_dir:
            return _convert_then_parse(scratch_dir)

    return _convert_then_parse(temp_dir)
|
||||
|
||||
def _parse_with_liteparse(self, file_path: str) -> str:
|
||||
try:
|
||||
return self.liteparse_service.parse_to_markdown(
|
||||
file_path,
|
||||
ocr_enabled=True,
|
||||
ocr_language=self._ocr_language,
|
||||
)
|
||||
except (LiteParseError, DocumentConversionError) as exc:
|
||||
if self.document_service is not None:
|
||||
try:
|
||||
return self.document_service.parse_to_markdown(file_path)
|
||||
except Exception:
|
||||
pass
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Failed to parse document {os.path.basename(file_path)}: {exc}",
|
||||
) from exc
|
||||
|
||||
@classmethod
|
||||
def get_page_images_from_pdf(cls, file_path: str, temp_dir: str) -> List[str]:
|
||||
|
|
|
|||
|
|
@ -1,177 +1,177 @@
|
|||
"""
|
||||
Lightweight document converter for Windows/MSIX compatibility.
|
||||
Uses pure-Python libraries: pdfplumber for PDF, docx2txt for DOCX, python-pptx for PPTX.
|
||||
No subprocess, no external runtimes, MSIX/Appx safe.
|
||||
"""
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
import docx2everything
|
||||
import pdfplumber
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class LightweightDocumentConverter:
|
||||
"""Lightweight document converter supporting PDF, DOCX, and PPTX."""
|
||||
|
||||
def convert(self, file_path: str) -> str:
|
||||
"""
|
||||
Convert document to markdown text.
|
||||
|
||||
Args:
|
||||
file_path: Path to the document file
|
||||
|
||||
Returns:
|
||||
Extracted text in markdown format
|
||||
|
||||
Raises:
|
||||
ValueError: If file format is not supported
|
||||
FileNotFoundError: If file does not exist
|
||||
"""
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
file_ext = os.path.splitext(file_path)[1].lower()
|
||||
|
||||
if file_ext == '.pdf':
|
||||
return self._convert_pdf(file_path)
|
||||
elif file_ext == '.docx':
|
||||
return self._convert_docx(file_path)
|
||||
elif file_ext == '.pptx':
|
||||
return self._convert_pptx(file_path)
|
||||
else:
|
||||
raise ValueError(f"Unsupported file format: {file_ext}")
|
||||
|
||||
def _convert_pdf(self, path: str) -> str:
|
||||
"""
|
||||
Convert PDF to markdown using pdfplumber.
|
||||
|
||||
Args:
|
||||
path: Path to PDF file
|
||||
|
||||
Returns:
|
||||
Extracted text in markdown format
|
||||
"""
|
||||
texts: List[str] = []
|
||||
with pdfplumber.open(path) as pdf:
|
||||
for idx, page in enumerate(pdf.pages):
|
||||
page_text = f"## Page {idx + 1}\n"
|
||||
page_text += page.extract_text() or ""
|
||||
texts.append(page_text)
|
||||
return "\n\n".join(texts)
|
||||
|
||||
def _convert_docx(self, path: str) -> str:
|
||||
"""
|
||||
Extract markdown from DOCX using docx2everything (no images).
|
||||
|
||||
Args:
|
||||
path: Path to DOCX file
|
||||
|
||||
Returns:
|
||||
Extracted markdown (no images)
|
||||
"""
|
||||
# Use the correct API: process_to_markdown(path) without img_dir extracts markdown without images
|
||||
markdown = docx2everything.process_to_markdown(path)
|
||||
return markdown if markdown else ""
|
||||
|
||||
def _convert_pptx(self, path: str) -> str:
|
||||
"""
|
||||
Convert PPTX to markdown using python-pptx.
|
||||
|
||||
Args:
|
||||
path: Path to PPTX file
|
||||
|
||||
Returns:
|
||||
Extracted text in markdown format
|
||||
"""
|
||||
prs = Presentation(path)
|
||||
markdown_parts = []
|
||||
|
||||
for slide_num, slide in enumerate(prs.slides, start=1):
|
||||
slide_parts = []
|
||||
|
||||
# Extract slide title (usually first shape with title placeholder)
|
||||
title_text = None
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "placeholder"):
|
||||
if shape.placeholder.placeholder_format.type == 1: # Title placeholder
|
||||
if hasattr(shape, "text") and shape.text.strip():
|
||||
title_text = shape.text.strip()
|
||||
break
|
||||
|
||||
# If no title placeholder found, try to find text box at top
|
||||
if not title_text:
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text") and shape.text.strip():
|
||||
# Check if it's likely a title (first text shape, short text)
|
||||
text = shape.text.strip()
|
||||
if len(text) < 200: # Heuristic: titles are usually short
|
||||
title_text = text
|
||||
break
|
||||
|
||||
# Add slide title
|
||||
if title_text:
|
||||
slide_parts.append(f"# {title_text}")
|
||||
else:
|
||||
slide_parts.append(f"# Slide {slide_num}")
|
||||
|
||||
# Extract content (bullet points and text)
|
||||
for shape in slide.shapes:
|
||||
if not hasattr(shape, "text"):
|
||||
continue
|
||||
|
||||
text = shape.text.strip()
|
||||
if not text:
|
||||
continue
|
||||
|
||||
# Skip if this is the title we already added
|
||||
if title_text and text == title_text:
|
||||
continue
|
||||
|
||||
# Check if it's a text frame with paragraphs (bullet points)
|
||||
if hasattr(shape, "text_frame"):
|
||||
paragraphs = shape.text_frame.paragraphs
|
||||
if len(paragraphs) > 1:
|
||||
# Multiple paragraphs - likely bullet points
|
||||
for para in paragraphs:
|
||||
para_text = para.text.strip()
|
||||
if para_text:
|
||||
# Check bullet level
|
||||
level = para.level
|
||||
indent = " " * level
|
||||
slide_parts.append(f"{indent}- {para_text}")
|
||||
else:
|
||||
# Single paragraph
|
||||
if text and text != title_text:
|
||||
slide_parts.append(text)
|
||||
else:
|
||||
# Plain text shape
|
||||
if text and text != title_text:
|
||||
slide_parts.append(text)
|
||||
|
||||
if slide_parts:
|
||||
markdown_parts.append("\n".join(slide_parts))
|
||||
|
||||
return "\n\n---\n\n".join(markdown_parts)
|
||||
|
||||
|
||||
class DocumentService:
|
||||
"""
|
||||
Document service wrapper providing parse_to_markdown interface.
|
||||
Compatible with DoclingService interface for easy swapping.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.converter = LightweightDocumentConverter()
|
||||
|
||||
def parse_to_markdown(self, file_path: str) -> str:
|
||||
"""
|
||||
Parse document to markdown format.
|
||||
|
||||
Args:
|
||||
file_path: Path to the document file
|
||||
|
||||
Returns:
|
||||
Extracted text in markdown format
|
||||
"""
|
||||
return self.converter.convert(file_path)
|
||||
"""
|
||||
Lightweight document converter for Windows/MSIX compatibility.
|
||||
Uses pure-Python libraries: pdfplumber for PDF, docx2everything for DOCX, python-pptx for PPTX.
|
||||
No subprocess, no external runtimes, MSIX/Appx safe.
|
||||
"""
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
import docx2everything
|
||||
import pdfplumber
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class LightweightDocumentConverter:
    """Lightweight document converter supporting PDF, DOCX, and PPTX."""

    def convert(self, file_path: str) -> str:
        """
        Convert document to markdown text.

        The converter is chosen from the lower-cased file extension.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text in markdown format

        Raises:
            ValueError: If file format is not supported
            FileNotFoundError: If file does not exist
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.pdf':
            return self._convert_pdf(file_path)
        if file_ext == '.docx':
            return self._convert_docx(file_path)
        if file_ext == '.pptx':
            return self._convert_pptx(file_path)
        raise ValueError(f"Unsupported file format: {file_ext}")

    def _convert_pdf(self, path: str) -> str:
        """
        Convert PDF to markdown using pdfplumber.

        Each page becomes a ``## Page N`` heading followed by the page text
        (empty when pdfplumber extracts nothing); pages are separated by a
        blank line.

        Args:
            path: Path to PDF file

        Returns:
            Extracted text in markdown format
        """
        texts: List[str] = []
        with pdfplumber.open(path) as pdf:
            for idx, page in enumerate(pdf.pages):
                page_text = f"## Page {idx + 1}\n"
                page_text += page.extract_text() or ""
                texts.append(page_text)
        return "\n\n".join(texts)

    def _convert_docx(self, path: str) -> str:
        """
        Extract markdown from DOCX using docx2everything (no images).

        Args:
            path: Path to DOCX file

        Returns:
            Extracted markdown (no images); empty string when nothing is extracted
        """
        # process_to_markdown(path) is called without an image directory, so
        # only markdown text is produced.
        markdown = docx2everything.process_to_markdown(path)
        return markdown if markdown else ""

    def _convert_pptx(self, path: str) -> str:
        """
        Convert PPTX to markdown using python-pptx.

        Every slide is rendered by ``_slide_to_markdown``; slides are joined
        with a ``---`` separator.

        Args:
            path: Path to PPTX file

        Returns:
            Extracted text in markdown format
        """
        prs = Presentation(path)
        markdown_parts = [
            self._slide_to_markdown(slide, slide_num)
            for slide_num, slide in enumerate(prs.slides, start=1)
        ]
        return "\n\n---\n\n".join(markdown_parts)

    @staticmethod
    def _find_slide_title(slide) -> Optional[str]:
        """Return the slide title text, or None when no title can be determined.

        The slide's title placeholder (``slide.shapes.title``) is preferred.
        The previous ``hasattr(shape, "placeholder")`` probe is not an
        attribute python-pptx documents on shapes, so it did not reliably
        find the placeholder. If the placeholder is missing or empty, the
        first shape whose text is shorter than 200 characters is used.
        """
        title_shape = slide.shapes.title
        if title_shape is not None:
            placeholder_text = (getattr(title_shape, "text", "") or "").strip()
            if placeholder_text:
                return placeholder_text

        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            text = shape.text.strip()
            # Heuristic: titles are usually short.
            if text and len(text) < 200:
                return text
        return None

    def _slide_to_markdown(self, slide, slide_num: int) -> str:
        """Render one slide as a ``# heading`` followed by its text content.

        Shapes with several paragraphs are emitted as a bullet list indented
        by paragraph level; all other text shapes are emitted verbatim.
        Shapes whose text equals the chosen title are skipped so the title
        is not repeated.
        """
        title_text = self._find_slide_title(slide)
        # A heading is always emitted, so every slide contributes a section.
        slide_parts = [f"# {title_text}" if title_text else f"# Slide {slide_num}"]

        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue

            text = shape.text.strip()
            if not text:
                continue

            # Skip if this is the title we already added
            if title_text and text == title_text:
                continue

            paragraphs = shape.text_frame.paragraphs if hasattr(shape, "text_frame") else []
            if len(paragraphs) > 1:
                # Multiple paragraphs - likely bullet points
                for para in paragraphs:
                    para_text = para.text.strip()
                    if para_text:
                        indent = " " * para.level
                        slide_parts.append(f"{indent}- {para_text}")
            else:
                # Single paragraph or plain text shape
                slide_parts.append(text)

        return "\n".join(slide_parts)
|
||||
|
||||
|
||||
class DocumentService:
    """
    Facade that exposes ``parse_to_markdown`` on top of LightweightDocumentConverter.

    Offers the same parse_to_markdown entry point as LiteParseService so it can
    serve as the optional Windows fallback parser.
    """

    def __init__(self):
        # One converter instance is created per service and reused for every call.
        self.converter = LightweightDocumentConverter()

    def parse_to_markdown(self, file_path: str) -> str:
        """
        Convert the document at ``file_path`` to markdown.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text in markdown format, as produced by the converter
        """
        markdown = self.converter.convert(file_path)
        return markdown
|
||||
|
|
|
|||
197
electron/servers/fastapi/services/liteparse_service.py
Normal file
197
electron/servers/fastapi/services/liteparse_service.py
Normal file
|
|
@ -0,0 +1,197 @@
|
|||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
|
||||
class LiteParseError(Exception):
    """Raised when the LiteParse runtime is unavailable or a parse run fails."""


class LiteParseService:
    """Parse documents by running the LiteParse Node.js runner as a subprocess.

    Configuration is read from environment variables:

    * ``LITEPARSE_NODE_BINARY``: Node.js executable (default ``node``).
    * ``LITEPARSE_RUNNER_PATH``: path to ``liteparse_runner.mjs``; auto-detected
      from a list of known layouts when unset or empty.
    * ``LITEPARSE_OCR_SERVER_URL`` / ``LITEPARSE_TESSDATA_PATH``: optional values
      forwarded to the runner on every parse.

    Every failure of the runtime or the runner is reported as ``LiteParseError``.
    """

    def __init__(self, timeout_seconds: int = 180):
        """Resolve the Node binary, runner script and npm project root.

        Args:
            timeout_seconds: Maximum wall-clock time allowed for one parse run.
        """
        self.timeout_seconds = timeout_seconds
        # `or` (not a getenv default) so an empty variable falls back too.
        self.node_binary = os.getenv("LITEPARSE_NODE_BINARY") or "node"
        self.runner_path = os.getenv("LITEPARSE_RUNNER_PATH") or self._resolve_runner_path()
        self.runner_dir = os.path.dirname(self.runner_path)
        self._npm_project_root = self._resolve_npm_project_root()
        # Set once the runtime probe has succeeded; see _ensure_runtime_ready.
        self._runtime_verified = False

    def _resolve_npm_project_root(self) -> str:
        """Directory whose node_modules contains @llamaindex/liteparse (runner dir or Electron app root)."""
        local_package = os.path.join(
            self.runner_dir, "node_modules", "@llamaindex", "liteparse"
        )
        if os.path.isdir(local_package):
            return self.runner_dir
        # Otherwise use the directory two levels above the runner. Whether the
        # package really exists there is verified later by check_runtime_ready().
        return os.path.abspath(os.path.join(self.runner_dir, "..", ".."))

    @staticmethod
    def _resolve_runner_path() -> str:
        """Return the first existing runner script among the known layouts.

        Candidates are resolved relative to the current working directory.
        When none exists, the first candidate is returned so that later error
        messages name a concrete path.
        """
        cwd = os.path.abspath(".")
        candidates = [
            # electron/servers/fastapi → electron/resources/...
            os.path.abspath(
                os.path.join(
                    cwd, "..", "..", "resources", "document-extraction", "liteparse_runner.mjs"
                )
            ),
            # servers/fastapi (repo root layout) → electron/resources/...
            os.path.abspath(
                os.path.join(
                    cwd,
                    "..",
                    "..",
                    "electron",
                    "resources",
                    "document-extraction",
                    "liteparse_runner.mjs",
                )
            ),
            # PyInstaller bundle layout
            os.path.abspath(
                os.path.join(
                    cwd, "..", "..", "app", "resources", "document-extraction", "liteparse_runner.mjs"
                )
            ),
            # Docker / explicit layout
            "/app/document-extraction-liteparse/liteparse_runner.mjs",
        ]
        for path in candidates:
            if os.path.isfile(path):
                return path
        return candidates[0]

    def check_runtime_ready(self) -> Tuple[bool, str]:
        """Probe the runner script, the Node binary and the LiteParse package.

        Returns:
            ``(True, "ok")`` when everything is available, otherwise
            ``(False, reason)`` with a human-readable explanation.
        """
        if not os.path.isfile(self.runner_path):
            return False, f"LiteParse runner not found at: {self.runner_path}"

        try:
            subprocess.run(
                [self.node_binary, "--version"],
                cwd=self.runner_dir,
                check=True,
                capture_output=True,
                text=True,
                timeout=10,
            )
        except Exception as exc:
            return False, f"Node.js runtime is unavailable: {exc}"

        liteparse_dir = os.path.join(
            self._npm_project_root, "node_modules", "@llamaindex", "liteparse"
        )
        if not os.path.isdir(liteparse_dir):
            return (
                False,
                f"LiteParse npm package missing at {liteparse_dir}. Run npm install in the Electron app directory.",
            )

        # @llamaindex/liteparse is ESM-only; require.resolve() fails. Use dynamic import.
        try:
            subprocess.run(
                [
                    self.node_binary,
                    "--input-type=module",
                    "-e",
                    "import '@llamaindex/liteparse'",
                ],
                cwd=self._npm_project_root,
                check=True,
                capture_output=True,
                text=True,
                timeout=20,
            )
        except Exception as exc:
            return False, f"LiteParse dependency is unavailable: {exc}"

        return True, "ok"

    def _ensure_runtime_ready(self) -> None:
        """Raise LiteParseError unless the runtime probe has succeeded.

        Only a successful probe is remembered, so the two probe subprocesses
        are not re-spawned for every document while a broken setup is still
        re-checked on each call.
        """
        if getattr(self, "_runtime_verified", False):
            return
        is_ready, reason = self.check_runtime_ready()
        if not is_ready:
            raise LiteParseError(reason)
        self._runtime_verified = True

    def parse_to_markdown(
        self,
        file_path: str,
        ocr_enabled: bool = True,
        ocr_language: str = "eng",
    ) -> str:
        """Parse a file and return only the ``text`` field of the runner payload.

        Returns an empty string when the payload has no (or an empty) ``text``.

        Raises:
            LiteParseError: See ``parse``.
        """
        result = self.parse(
            file_path=file_path,
            ocr_enabled=ocr_enabled,
            ocr_language=ocr_language,
        )
        return str(result.get("text") or "")

    def parse(
        self,
        file_path: str,
        ocr_enabled: bool = True,
        ocr_language: str = "eng",
    ) -> Dict[str, Any]:
        """Run the LiteParse runner on a file and return its decoded JSON payload.

        Args:
            file_path: Path of the document to parse.
            ocr_enabled: Passed to the runner as ``--ocr-enabled true|false``.
            ocr_language: Passed to the runner as ``--ocr-language``.

        Returns:
            The JSON object printed by the runner (its ``ok`` field is truthy).

        Raises:
            LiteParseError: If the runtime is not ready, the runner cannot be
                started, times out, exits non-zero, reports ``ok`` as falsy,
                or prints no valid JSON object.
        """
        self._ensure_runtime_ready()

        command = [
            self.node_binary,
            self.runner_path,
            "--file",
            file_path,
            "--ocr-enabled",
            "true" if ocr_enabled else "false",
            "--ocr-language",
            ocr_language,
        ]
        ocr_server = (os.getenv("LITEPARSE_OCR_SERVER_URL") or "").strip()
        if ocr_server:
            command.extend(["--ocr-server-url", ocr_server])
        tessdata = (os.getenv("LITEPARSE_TESSDATA_PATH") or "").strip()
        if tessdata:
            command.extend(["--tessdata-path", tessdata])

        try:
            process = subprocess.run(
                command,
                cwd=self._npm_project_root,
                capture_output=True,
                text=True,
                timeout=self.timeout_seconds,
                env=os.environ.copy(),
            )
        except subprocess.TimeoutExpired as exc:
            # Callers only handle LiteParseError, so translate instead of
            # leaking a subprocess exception.
            raise LiteParseError(
                f"LiteParse runner timed out after {self.timeout_seconds} seconds"
            ) from exc
        except OSError as exc:
            raise LiteParseError(f"Failed to start LiteParse runner: {exc}") from exc

        if process.returncode != 0:
            # A crashed runner may print nothing usable on stdout; fall back
            # to stderr rather than masking it with a decode error.
            try:
                failure = self._decode_runner_output(process.stdout)
            except LiteParseError:
                failure = {}
            message = failure.get("error") or (process.stderr or "").strip() or "Unknown error"
            raise LiteParseError(message)

        payload = self._decode_runner_output(process.stdout)
        if not payload.get("ok"):
            raise LiteParseError(payload.get("error") or "LiteParse parse failed")

        return payload

    @staticmethod
    def _decode_runner_output(stdout: str) -> Dict[str, Any]:
        """Extract the runner's JSON object from its stdout.

        Raises:
            LiteParseError: If stdout is empty or contains no JSON object.
        """
        raw = (stdout or "").lstrip("\ufeff").strip()
        if not raw:
            raise LiteParseError("LiteParse runner returned empty output")

        # Prefer the last line that parses as JSON (handles stray log lines before our payload).
        lines = [line.strip() for line in raw.splitlines() if line.strip()]
        for line in reversed(lines):
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

        # Fall back to decoding the whole of stdout as a single JSON document
        # (covers an object that spans several lines).
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

        raise LiteParseError("LiteParse runner returned invalid JSON output")
|
||||
126
electron/servers/fastapi/utils/ocr_language.py
Normal file
126
electron/servers/fastapi/utils/ocr_language.py
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
"""
|
||||
Map presentation UI language strings (LanguageType enum values from Next.js) to
|
||||
Tesseract / LiteParse OCR language codes (ISO 639-3 where applicable).
|
||||
|
||||
Keep keys in sync with:
|
||||
electron/servers/nextjs/app/(presentation-generator)/upload/type.ts → LanguageType
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
# Values must match `LanguageType` string literals in the upload UI.
|
||||
PRESENTATION_LANGUAGE_TO_TESSERACT: dict[str, str] = {
|
||||
"English": "eng",
|
||||
"Spanish (Español)": "spa",
|
||||
"French (Français)": "fra",
|
||||
"German (Deutsch)": "deu",
|
||||
"Portuguese (Português)": "por",
|
||||
"Italian (Italiano)": "ita",
|
||||
"Dutch (Nederlands)": "nld",
|
||||
"Russian (Русский)": "rus",
|
||||
"Chinese (Simplified - 中文, 汉语)": "chi_sim",
|
||||
"Chinese (Traditional - 中文, 漢語)": "chi_tra",
|
||||
"Japanese (日本語)": "jpn",
|
||||
"Korean (한국어)": "kor",
|
||||
"Arabic (العربية)": "ara",
|
||||
"Hindi (हिन्दी)": "hin",
|
||||
"Bengali (বাংলা)": "ben",
|
||||
"Polish (Polski)": "pol",
|
||||
"Czech (Čeština)": "ces",
|
||||
"Slovak (Slovenčina)": "slk",
|
||||
"Hungarian (Magyar)": "hun",
|
||||
"Romanian (Română)": "ron",
|
||||
"Bulgarian (Български)": "bul",
|
||||
"Greek (Ελληνικά)": "ell",
|
||||
"Serbian (Српски / Srpski)": "srp",
|
||||
"Croatian (Hrvatski)": "hrv",
|
||||
"Bosnian (Bosanski)": "bos",
|
||||
"Slovenian (Slovenščina)": "slv",
|
||||
"Finnish (Suomi)": "fin",
|
||||
"Swedish (Svenska)": "swe",
|
||||
"Danish (Dansk)": "dan",
|
||||
"Norwegian (Norsk)": "nor",
|
||||
"Icelandic (Íslenska)": "isl",
|
||||
"Lithuanian (Lietuvių)": "lit",
|
||||
"Latvian (Latviešu)": "lav",
|
||||
"Estonian (Eesti)": "est",
|
||||
"Maltese (Malti)": "mlt",
|
||||
"Welsh (Cymraeg)": "cym",
|
||||
"Irish (Gaeilge)": "gle",
|
||||
"Scottish Gaelic (Gàidhlig)": "gla",
|
||||
"Ukrainian (Українська)": "ukr",
|
||||
"Hebrew (עברית)": "heb",
|
||||
"Persian/Farsi (فارسی)": "fas",
|
||||
"Turkish (Türkçe)": "tur",
|
||||
"Kurdish (Kurdî / کوردی)": "kmr",
|
||||
"Pashto (پښتو)": "pus",
|
||||
"Dari (دری)": "prs",
|
||||
"Uzbek (Oʻzbek)": "uzb",
|
||||
"Kazakh (Қазақша)": "kaz",
|
||||
"Tajik (Тоҷикӣ)": "tgk",
|
||||
"Turkmen (Türkmençe)": "tuk",
|
||||
"Azerbaijani (Azərbaycan dili)": "aze",
|
||||
"Urdu (اردو)": "urd",
|
||||
"Tamil (தமிழ்)": "tam",
|
||||
"Telugu (తెలుగు)": "tel",
|
||||
"Marathi (मराठी)": "mar",
|
||||
"Punjabi (ਪੰਜਾਬੀ / پنجابی)": "pan",
|
||||
"Gujarati (ગુજરાતી)": "guj",
|
||||
"Malayalam (മലയാളം)": "mal",
|
||||
"Kannada (ಕನ್ನಡ)": "kan",
|
||||
"Odia (ଓଡ଼ିଆ)": "ori",
|
||||
"Sinhala (සිංහල)": "sin",
|
||||
"Nepali (नेपाली)": "nep",
|
||||
"Thai (ไทย)": "tha",
|
||||
"Vietnamese (Tiếng Việt)": "vie",
|
||||
"Lao (ລາວ)": "lao",
|
||||
"Khmer (ភាសាខ្មែរ)": "khm",
|
||||
"Burmese (မြန်မာစာ)": "mya",
|
||||
"Tagalog/Filipino (Tagalog/Filipino)": "tgl",
|
||||
"Javanese (Basa Jawa)": "jav",
|
||||
"Sundanese (Basa Sunda)": "sun",
|
||||
"Malay (Bahasa Melayu)": "msa",
|
||||
"Mongolian (Монгол)": "mon",
|
||||
"Swahili (Kiswahili)": "swa",
|
||||
"Hausa (Hausa)": "hau",
|
||||
"Yoruba (Yorùbá)": "yor",
|
||||
"Igbo (Igbo)": "ibo",
|
||||
"Amharic (አማርኛ)": "amh",
|
||||
"Zulu (isiZulu)": "zul",
|
||||
"Xhosa (isiXhosa)": "xho",
|
||||
"Shona (ChiShona)": "sna",
|
||||
"Somali (Soomaaliga)": "som",
|
||||
"Basque (Euskara)": "eus",
|
||||
"Catalan (Català)": "cat",
|
||||
"Galician (Galego)": "glg",
|
||||
"Quechua (Runasimi)": "que",
|
||||
"Nahuatl (Nāhuatl)": "nah",
|
||||
"Hawaiian (ʻŌlelo Hawaiʻi)": "haw",
|
||||
"Maori (Te Reo Māori)": "mri",
|
||||
# No dedicated Tahitian traineddata in default Tesseract bundles.
|
||||
"Tahitian (Reo Tahiti)": "eng",
|
||||
"Samoan (Gagana Samoa)": "smo",
|
||||
}
|
||||
|
||||
_LOWER_MAP = {k.lower(): v for k, v in PRESENTATION_LANGUAGE_TO_TESSERACT.items()}
|
||||
|
||||
_OCR_CODE_RE = re.compile(r"^[a-zA-Z0-9_,+]+$")
|
||||
|
||||
|
||||
def presentation_language_to_ocr_code(language: Optional[str]) -> str:
    """Translate a presentation UI language label into a Tesseract language code.

    The label is looked up exactly first and then case-insensitively.
    ``None``, blank labels, unknown labels and any mapped value that does not
    match the OCR-code pattern all resolve to ``"eng"``.
    """
    label = "" if language is None else str(language).strip()
    if not label:
        return "eng"

    try:
        candidate = PRESENTATION_LANGUAGE_TO_TESSERACT[label]
    except KeyError:
        # No exact match: retry against the lower-cased table.
        candidate = _LOWER_MAP.get(label.lower(), "eng")

    return candidate if _OCR_CODE_RE.fullmatch(candidate) else "eng"
|
||||
|
|
@ -156,7 +156,7 @@ def patch_python_docx_templates():
|
|||
- Docker/Development: Returns immediately without patching (no-op)
|
||||
- PyInstaller: Patches the template loading functions
|
||||
|
||||
Note: This should be called before using docling service in PyInstaller bundles.
|
||||
Note: Call before any code path that uses python-docx inside a PyInstaller bundle.
|
||||
"""
|
||||
# Only patch if running in PyInstaller bundle
|
||||
# This check ensures Docker and development environments are unaffected
|
||||
|
|
|
|||
|
|
@ -1,9 +1,25 @@
|
|||
from pathlib import Path
|
||||
from typing import List
|
||||
from fastapi import HTTPException
|
||||
|
||||
from fastapi import UploadFile
|
||||
|
||||
|
||||
def _is_accepted_file_type(file: UploadFile, accepted_types: List[str]) -> bool:
|
||||
accepted_mime_types = {t.lower() for t in accepted_types if not t.startswith(".")}
|
||||
accepted_extensions = {t.lower() for t in accepted_types if t.startswith(".")}
|
||||
|
||||
content_type = (file.content_type or "").strip().lower()
|
||||
if content_type in accepted_mime_types:
|
||||
return True
|
||||
|
||||
extension = Path(file.filename or "").suffix.lower()
|
||||
if extension in accepted_extensions:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def validate_files(
|
||||
field,
|
||||
nullable: bool,
|
||||
|
|
@ -15,12 +31,14 @@ def validate_files(
|
|||
if field:
|
||||
files: List[UploadFile] = field if multiple else [field]
|
||||
for each_file in files:
|
||||
if (max_size * 1024 * 1024) < each_file.size:
|
||||
file_size = each_file.size or 0
|
||||
|
||||
if (max_size * 1024 * 1024) < file_size:
|
||||
raise HTTPException(
|
||||
400,
|
||||
detail=f"File '{each_file.filename}' exceeded max upload size of {max_size} MB",
|
||||
)
|
||||
elif each_file.content_type not in accepted_types:
|
||||
elif not _is_accepted_file_type(each_file, accepted_types):
|
||||
raise HTTPException(
|
||||
400,
|
||||
detail=f"File '{each_file.filename}' not accepted. Accepted types: {accepted_types}",
|
||||
|
|
|
|||
826
electron/servers/fastapi/uv.lock
generated
826
electron/servers/fastapi/uv.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -29,7 +29,10 @@ export class PresentationGenerationApi {
|
|||
}
|
||||
}
|
||||
|
||||
static async decomposeDocuments(documentKeys: string[]) {
|
||||
static async decomposeDocuments(
|
||||
documentKeys: string[],
|
||||
language?: string | null
|
||||
) {
|
||||
try {
|
||||
const response = await fetch(
|
||||
getApiUrl(`/api/v1/ppt/files/decompose`),
|
||||
|
|
@ -38,6 +41,7 @@ export class PresentationGenerationApi {
|
|||
headers: getHeader(),
|
||||
body: JSON.stringify({
|
||||
file_paths: documentKeys,
|
||||
language: language ?? null,
|
||||
}),
|
||||
cache: "no-cache",
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,37 +13,50 @@ interface SupportingDocProps {
|
|||
|
||||
const PDF_TYPES = ['.pdf']
|
||||
const TEXT_TYPES = ['.txt']
|
||||
const POWERPOINT_TYPES = ['.pptx']
|
||||
const WORD_TYPES = ['.docx']
|
||||
const WORD_TYPES = ['.doc', '.docx', '.docm', '.odt', '.rtf']
|
||||
const POWERPOINT_TYPES = ['.ppt', '.pptx', '.pptm', '.odp']
|
||||
const SPREADSHEET_TYPES = ['.xls', '.xlsx', '.xlsm', '.ods', '.csv', '.tsv']
|
||||
const IMAGE_TYPES = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.svg']
|
||||
|
||||
const ACCEPT_DEFAULT = [
|
||||
'application/pdf',
|
||||
'text/plain',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||
...PDF_TYPES,
|
||||
...TEXT_TYPES,
|
||||
...POWERPOINT_TYPES,
|
||||
...WORD_TYPES,
|
||||
].join(',')
|
||||
const ALLOWED_MIME_PREFIXES: string[] = []
|
||||
const ALLOWED_MIME_PREFIXES: string[] = ['image/']
|
||||
const ALLOWED_MIME_TYPES = [
|
||||
'application/pdf',
|
||||
'application/x-pdf',
|
||||
'application/acrobat',
|
||||
'applications/pdf',
|
||||
'text/pdf',
|
||||
'application/vnd.pdf',
|
||||
'text/plain',
|
||||
'text/csv',
|
||||
'application/csv',
|
||||
'text/tab-separated-values',
|
||||
'text/tsv',
|
||||
'application/msword',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'application/vnd.ms-word.document.macroenabled.12',
|
||||
'application/vnd.oasis.opendocument.text',
|
||||
'application/rtf',
|
||||
'text/rtf',
|
||||
'application/vnd.ms-powerpoint',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||
'application/vnd.ms-powerpoint.presentation.macroenabled.12',
|
||||
'application/vnd.oasis.opendocument.presentation',
|
||||
'application/vnd.ms-excel',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'application/vnd.ms-excel.sheet.macroenabled.12',
|
||||
'application/vnd.oasis.opendocument.spreadsheet',
|
||||
'image/jpeg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/webp',
|
||||
'image/svg+xml',
|
||||
]
|
||||
const ALLOWED_EXTENSIONS = [
|
||||
...PDF_TYPES,
|
||||
...TEXT_TYPES,
|
||||
...POWERPOINT_TYPES,
|
||||
...WORD_TYPES,
|
||||
...POWERPOINT_TYPES,
|
||||
...SPREADSHEET_TYPES,
|
||||
...IMAGE_TYPES,
|
||||
]
|
||||
const ACCEPT_DEFAULT = [...ALLOWED_MIME_TYPES, ...ALLOWED_EXTENSIONS].join(',')
|
||||
|
||||
const SupportingDoc = ({
|
||||
files,
|
||||
|
|
@ -75,7 +88,7 @@ const SupportingDoc = ({
|
|||
const disallowed = filesToReview.filter((file) => !isAllowedFile(file))
|
||||
if (disallowed.length > 0) {
|
||||
toast.error('Some files are not supported', {
|
||||
description: 'Only PDF, TXT, PPTX, and DOCX files are allowed.',
|
||||
description: 'Supported: Word, PowerPoint, spreadsheets, PDF/TXT, and image files.',
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -171,7 +184,7 @@ const SupportingDoc = ({
|
|||
<div className="flex flex-col items-center gap-2">
|
||||
<Paperclip className="h-6 w-6 text-[#5146E5]" />
|
||||
<p className="text-sm font-medium text-gray-800 font-syne">
|
||||
Drag and drop PDF, TXT, PPTX, DOCX, or <span className="text-[#5146E5]">click to browse</span>
|
||||
Drag and drop Office docs, spreadsheets, images, PDF/TXT, or <span className="text-[#5146E5]">click to browse</span>
|
||||
</p>
|
||||
</div>
|
||||
</label>
|
||||
|
|
@ -214,7 +227,7 @@ const SupportingDoc = ({
|
|||
</ul>
|
||||
{filteredFiles.length !== files.length && (
|
||||
<p className="mt-2 text-xs text-amber-600 font-syne">
|
||||
Some files were skipped. Only PDF, TXT, PPTX, and DOCX files are supported.
|
||||
Some files were skipped. Supported: Word, PowerPoint, spreadsheets, PDF/TXT, and image files.
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -132,7 +132,12 @@ const UploadPage = () => {
|
|||
|
||||
if (documents.length > 0) {
|
||||
trackEvent(MixpanelEvent.Upload_Decompose_Documents_API_Call);
|
||||
promises.push(PresentationGenerationApi.decomposeDocuments(documents));
|
||||
promises.push(
|
||||
PresentationGenerationApi.decomposeDocuments(
|
||||
documents,
|
||||
config?.language ?? null
|
||||
)
|
||||
);
|
||||
}
|
||||
const responses = await Promise.all(promises);
|
||||
dispatch(setPptGenUploadState({
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue