presenton/electron/resources/document-extraction/liteparse_runner.mjs
sudipnext 691d0f62e8 feat: enhance ImageMagick installation process and update documentation
- Added functions to resolve Homebrew and Linux escalation commands for ImageMagick installation.
- Improved error handling and logging for manual installation steps.
- Updated download URLs for ImageMagick based on the platform.
- Enhanced user interface messages to clarify installation steps for different operating systems.
- Adjusted CPU worker count in document extraction for better performance.
2026-03-30 20:21:15 +05:45

147 lines
4.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* CLI bridge for Python: one JSON line on stdout for LiteParse extraction.
*
* OCR follows LlamaIndex LiteParse guidance (built-in Tesseract by default):
* https://developers.llamaindex.ai/liteparse/guides/ocr/
*
* - ISO 639-3 for Tesseract (eng, fra, deu, jpn, …); multi-lang as "deu+eng" or "deu,eng".
* - Parallel workers default to CPU cores - 2, minimum 1 (override with --num-workers).
* - Optional HTTP OCR: --ocr-server-url or LITEPARSE_OCR_SERVER_URL.
* - Optional local models: --tessdata-path or LITEPARSE_TESSDATA_PATH (else TESSDATA_PREFIX / CDN).
*/
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { LiteParse } from "@llamaindex/liteparse";
/**
 * Look up the value that follows a command-line flag.
 *
 * @param {string} name - Flag to search for in process.argv (e.g. "--file").
 * @returns {string|null} The token right after the first occurrence of the
 *   flag, or null when the flag is absent or is the last token.
 */
function readArg(name) {
  const flagPosition = process.argv.indexOf(name);
  if (flagPosition < 0) {
    return null;
  }
  const following = process.argv[flagPosition + 1];
  return following ?? null;
}
/**
 * Interpret a loosely-typed flag value as a boolean.
 *
 * Matching is case-insensitive and ignores surrounding whitespace.
 *
 * @param {*} value - Raw value (typically a CLI string); null, undefined and
 *   "" mean "not provided".
 * @param {boolean} fallback - Returned when the value is not provided or is
 *   not one of the recognised spellings.
 * @returns {boolean} true for 1/true/yes/on, false for 0/false/no/off,
 *   otherwise the fallback.
 */
function parseBool(value, fallback) {
  if (value == null || value === "") {
    return fallback;
  }
  switch (String(value).trim().toLowerCase()) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    case "0":
    case "false":
    case "no":
    case "off":
      return false;
    default:
      return fallback;
  }
}
/**
 * Parse a numeric option and clamp it into [min, max].
 *
 * @param {*} value - Raw value (typically a CLI string).
 * @param {number} fallback - Returned as-is (not clamped) when the value is
 *   missing, blank, or not a number.
 * @param {number} min - Lower bound applied to a parsed value.
 * @param {number} max - Upper bound applied to a parsed value.
 * @returns {number} The clamped parsed value, or the fallback.
 */
function toNumber(value, fallback, min, max) {
  if (value == null) return fallback;
  // Number("   ") is 0, so a whitespace-only string must be treated as
  // "not provided" explicitly; otherwise it would silently clamp to `min`.
  if (typeof value === "string" && value.trim() === "") return fallback;
  const parsed = Number(value);
  if (Number.isNaN(parsed)) return fallback;
  return Math.min(Math.max(parsed, min), max);
}
/**
 * Normalise an OCR language spec into Tesseract's "deu+eng" form.
 *
 * Accepts either "+" or "," as the separator (comma is a CLI/env
 * convenience), trims whitespace around each code and drops empty entries.
 *
 * @param {*} raw - Raw language spec; null/undefined are treated as empty.
 * @returns {string} Codes joined with "+", or "eng" when no code remains
 *   (empty input or separators only, e.g. ",").
 */
function normalizeOcrLanguage(raw) {
  const codes = String(raw ?? "")
    .split(/[,+]/)
    .map((code) => code.trim())
    .filter(Boolean);
  // Falling back here (rather than only on an empty input string) prevents a
  // separator-only spec from producing an empty language.
  return codes.length > 0 ? codes.join("+") : "eng";
}
/**
 * Write the single JSON result line to stdout and exit once it is flushed.
 *
 * process.exit() does not wait for pending asynchronous stdout writes, so
 * exiting right after write() can truncate a large payload when stdout is a
 * pipe. The exit is therefore deferred to the write callback. As a result
 * this function returns to its caller, which must stop doing further work.
 *
 * @param {object} result - JSON-serialisable payload.
 * @param {number} [exitCode=0] - Process exit status.
 */
function emit(result, exitCode = 0) {
  // Set the code up front so it is still reported if the process ends
  // before the callback runs.
  process.exitCode = exitCode;
  process.stdout.write(`${JSON.stringify(result)}\n`, () => {
    process.exit(exitCode);
  });
}

/**
 * Parse CLI/env options, run LiteParse on the requested file and emit exactly
 * one JSON line describing the outcome.
 *
 * Exit status: 0 on success, 1 when parsing throws, 2 for a missing --file
 * argument or a path that does not exist.
 */
async function main() {
  const filePath = readArg("--file");
  if (!filePath) {
    emit({ ok: false, error: "Missing required --file argument" }, 2);
    return;
  }

  const resolvedPath = path.resolve(filePath);
  if (!fs.existsSync(resolvedPath)) {
    emit({ ok: false, error: `File not found: ${resolvedPath}` }, 2);
    return;
  }

  const ocrEnabled = parseBool(readArg("--ocr-enabled"), true);
  const dpi = toNumber(readArg("--dpi"), 150, 72, 600);
  // Default leaves two cores free; a worker count must be a whole number, so
  // a fractional --num-workers value is rounded down (never below 1).
  const numWorkers = Math.floor(
    toNumber(
      readArg("--num-workers"),
      Math.max(os.cpus().length - 2, 1),
      1,
      64
    )
  );

  // NOTE(review): the environment variable wins over --ocr-language here,
  // whereas the OCR server URL and tessdata path below prefer the CLI flag.
  // Kept as-is; confirm this precedence is intentional.
  const cliOcrLanguage = readArg("--ocr-language");
  const ocrLanguageRaw =
    (process.env.LITEPARSE_OCR_LANGUAGE && String(process.env.LITEPARSE_OCR_LANGUAGE).trim()) ||
    (cliOcrLanguage && String(cliOcrLanguage).trim()) ||
    "";
  const ocrLanguage = normalizeOcrLanguage(ocrLanguageRaw || "eng");

  // Anything other than "json" falls back to "text".
  const outputFormatRaw = (readArg("--output-format") || "text").trim().toLowerCase();
  const outputFormat = outputFormatRaw === "json" ? "json" : "text";

  const ocrServerUrlArg = readArg("--ocr-server-url");
  const ocrServerUrl =
    (ocrServerUrlArg && String(ocrServerUrlArg).trim()) ||
    (process.env.LITEPARSE_OCR_SERVER_URL && String(process.env.LITEPARSE_OCR_SERVER_URL).trim()) ||
    undefined;

  const tessdataArg = readArg("--tessdata-path");
  const tessdataPath =
    (tessdataArg && String(tessdataArg).trim()) ||
    (process.env.LITEPARSE_TESSDATA_PATH && String(process.env.LITEPARSE_TESSDATA_PATH).trim()) ||
    (process.env.TESSDATA_PREFIX && String(process.env.TESSDATA_PREFIX).trim()) ||
    undefined;

  try {
    const config = {
      ocrEnabled,
      ocrLanguage,
      outputFormat,
      dpi,
      numWorkers,
    };
    // Optional settings are only attached when a value was supplied.
    if (ocrServerUrl) {
      config.ocrServerUrl = ocrServerUrl;
    }
    if (tessdataPath) {
      config.tessdataPath = tessdataPath;
    }

    const parser = new LiteParse(config);
    const result = await parser.parse(resolvedPath, true);
    const text = result?.text ?? "";

    emit({
      ok: true,
      filePath: resolvedPath,
      text,
      pageCount: Array.isArray(result?.pages) ? result.pages.length : 0,
      ocr: {
        engine: ocrServerUrl ? "http" : "tesseract",
        ocrLanguage,
        ocrEnabled,
        dpi,
        numWorkers,
      },
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    const stack = error instanceof Error ? error.stack : undefined;
    if (stack) {
      // Wait for the stack trace to be written so the exit triggered by
      // emit() cannot cut it short.
      await new Promise((resolve) => {
        process.stderr.write(`${stack}\n`, resolve);
      });
    }
    emit(
      {
        ok: false,
        filePath: resolvedPath,
        error: message,
      },
      1
    );
  }
}

await main();