- Added functions to resolve Homebrew and Linux escalation commands for ImageMagick installation. - Improved error handling and logging for manual installation steps. - Updated download URLs for ImageMagick based on the platform. - Enhanced user interface messages to clarify installation steps for different operating systems. - Adjusted CPU worker count in document extraction for better performance.
147 lines
4.2 KiB
JavaScript
147 lines
4.2 KiB
JavaScript
#!/usr/bin/env node
|
||
/**
|
||
* CLI bridge for Python: one JSON line on stdout for LiteParse extraction.
|
||
*
|
||
* OCR follows LlamaIndex LiteParse guidance (built-in Tesseract by default):
|
||
* https://developers.llamaindex.ai/liteparse/guides/ocr/
|
||
*
|
||
* - ISO 639-3 for Tesseract (eng, fra, deu, jpn, …); multi-lang as "deu+eng" or "deu,eng".
|
||
* - Parallel workers default to CPU cores − 2, never fewer than 1 (override --num-workers).
|
||
* - Optional HTTP OCR: --ocr-server-url or LITEPARSE_OCR_SERVER_URL.
|
||
* - Optional local models: --tessdata-path or LITEPARSE_TESSDATA_PATH (else TESSDATA_PREFIX / CDN).
|
||
*/
|
||
|
||
import fs from "node:fs";
|
||
import os from "node:os";
|
||
import path from "node:path";
|
||
import { LiteParse } from "@llamaindex/liteparse";
|
||
|
||
/**
 * Look up the value that follows a flag on the command line.
 *
 * @param {string} name - Exact flag token to search for in process.argv (e.g. "--file").
 * @returns {string | null} The token right after the first occurrence of the flag,
 *   or null when the flag is absent or is the last token.
 */
function readArg(name) {
  const { argv } = process;
  const flagAt = argv.indexOf(name);
  // A flag in last position has nothing after it; `??` maps that undefined to null.
  const following = flagAt === -1 ? undefined : argv[flagAt + 1];
  return following ?? null;
}
|
||
|
||
/**
 * Interpret a flag-style value as a boolean.
 *
 * The value is stringified, trimmed and lower-cased before matching.
 *
 * @param {unknown} value - Raw value; null, undefined and "" mean "not provided".
 * @param {boolean} fallback - Returned when the value is not provided or not recognised.
 * @returns {boolean} true for "1"/"true"/"yes"/"on", false for "0"/"false"/"no"/"off",
 *   otherwise `fallback`.
 */
function parseBool(value, fallback) {
  if (value == null || value === "") {
    return fallback;
  }

  switch (String(value).trim().toLowerCase()) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    case "0":
    case "false":
    case "no":
    case "off":
      return false;
    default:
      return fallback;
  }
}
|
||
|
||
/**
 * Parse a numeric option and clamp it into [min, max].
 *
 * @param {unknown} value - Raw value (typically a CLI string). null, undefined, "" and
 *   whitespace-only strings all count as "not provided".
 * @param {number} fallback - Returned as-is (not clamped) when the value is not provided
 *   or does not parse to a number.
 * @param {number} min - Lower bound applied to a parsed value.
 * @param {number} max - Upper bound applied to a parsed value.
 * @returns {number} The clamped parsed value, or `fallback`.
 */
function toNumber(value, fallback, min, max) {
  if (value == null) return fallback;

  // Number("   ") is 0, so a blank string must be rejected before conversion;
  // otherwise it would silently clamp to `min` instead of using the fallback.
  const candidate = typeof value === "string" ? value.trim() : value;
  if (candidate === "") return fallback;

  const parsed = Number(candidate);
  if (Number.isNaN(parsed)) return fallback;

  return Math.min(Math.max(parsed, min), max);
}
|
||
|
||
/**
 * Normalise an OCR language spec into the "+"-joined form (e.g. "deu+eng").
 *
 * Accepts codes separated by "," or "+", with optional surrounding whitespace, so
 * "deu, eng", "deu + eng" and "deu+eng" all yield "deu+eng". Empty segments are dropped.
 *
 * @param {unknown} raw - Raw language spec; null/undefined are treated as empty.
 * @returns {string} The normalised spec, or "eng" when no language code remains
 *   (empty input, or input consisting only of separators such as ",").
 */
function normalizeOcrLanguage(raw) {
  const codes = String(raw ?? "")
    .split(/[,+]/)
    .map((code) => code.trim())
    .filter(Boolean);

  // Separator-only input would otherwise produce "", which is not a usable language.
  return codes.length > 0 ? codes.join("+") : "eng";
}
|
||
|
||
/**
 * Write the result as a single JSON line on stdout, then terminate the process.
 *
 * @param {unknown} result - Payload to serialise with JSON.stringify.
 * @param {number} [exitCode=0] - Process exit code.
 *
 * NOTE(review): process.exit() is called right after the write without waiting for it
 * to flush. Where stdout writes are asynchronous this could truncate a large payload;
 * confirm against the platforms this bridge runs on.
 */
function emit(result, exitCode = 0) {
  const line = `${JSON.stringify(result)}\n`;
  process.stdout.write(line);
  process.exit(exitCode);
}
|
||
|
||
// Resolve and validate the required --file argument before any other work.
// emit() ends the process, so nothing after a failed check is executed.
const filePath = readArg("--file");
if (filePath == null || filePath === "") {
  emit({ ok: false, error: "Missing required --file argument" }, 2);
}

// Report the absolute path so the caller sees exactly which location was checked.
const resolvedPath = path.resolve(filePath);
const inputExists = fs.existsSync(resolvedPath);
if (!inputExists) {
  emit({ ok: false, error: `File not found: ${resolvedPath}` }, 2);
}
|
||
|
||
// ---- Runtime options: CLI flags with environment-variable fallbacks ----

/**
 * Return the first candidate that is non-empty after trimming.
 *
 * @param {...unknown} candidates - Values to try in priority order; null/undefined are skipped.
 * @returns {string | undefined} The trimmed winner, or undefined when every candidate is blank.
 */
function firstNonBlank(...candidates) {
  for (const candidate of candidates) {
    const trimmed = candidate == null ? "" : String(candidate).trim();
    if (trimmed) return trimmed;
  }
  return undefined;
}

const ocrEnabled = parseBool(readArg("--ocr-enabled"), true);
const dpi = toNumber(readArg("--dpi"), 150, 72, 600);

// Default is the CPU count minus two, never below one worker; explicit values are
// clamped to 1..64. Math.trunc keeps the count a whole number: a fractional
// --num-workers (e.g. "2.5") would otherwise be forwarded as-is. Because the clamp
// floor is 1, truncation can never yield 0.
const numWorkers = Math.trunc(
  toNumber(readArg("--num-workers"), Math.max(os.cpus().length - 2, 1), 1, 64),
);

// NOTE(review): for the OCR language the environment variable wins over the CLI flag,
// whereas --ocr-server-url and --tessdata-path below let the CLI flag win. The existing
// precedence is preserved here; confirm whether the difference is intentional.
const cliOcrLanguage = readArg("--ocr-language");
const ocrLanguageRaw = firstNonBlank(process.env.LITEPARSE_OCR_LANGUAGE, cliOcrLanguage) ?? "";
const ocrLanguage = normalizeOcrLanguage(ocrLanguageRaw || "eng");

// Anything other than "json" (case-insensitive) falls back to "text".
const outputFormatRaw = (readArg("--output-format") || "text").trim().toLowerCase();
const outputFormat = outputFormatRaw === "json" ? "json" : "text";

// Optional HTTP OCR endpoint: CLI flag first, then LITEPARSE_OCR_SERVER_URL.
const ocrServerUrlArg = readArg("--ocr-server-url");
const ocrServerUrl = firstNonBlank(ocrServerUrlArg, process.env.LITEPARSE_OCR_SERVER_URL);

// Optional tessdata location: CLI flag, then LITEPARSE_TESSDATA_PATH, then TESSDATA_PREFIX.
const tessdataArg = readArg("--tessdata-path");
const tessdataPath = firstNonBlank(
  tessdataArg,
  process.env.LITEPARSE_TESSDATA_PATH,
  process.env.TESSDATA_PREFIX,
);
|
||
|
||
// Run the extraction and report exactly one JSON line: a success payload, or an
// error payload with exit code 1.
try {
  // Optional settings are spread in only when a value was resolved, so their keys
  // are absent (rather than undefined) otherwise.
  const config = {
    ocrEnabled,
    ocrLanguage,
    outputFormat,
    dpi,
    numWorkers,
    ...(ocrServerUrl ? { ocrServerUrl } : {}),
    ...(tessdataPath ? { tessdataPath } : {}),
  };

  const parser = new LiteParse(config);

  // NOTE(review): the meaning of the second argument (`true`) is not visible in this
  // file — confirm against the LiteParse API.
  const result = await parser.parse(resolvedPath, true);
  const pages = result?.pages;

  emit({
    ok: true,
    filePath: resolvedPath,
    text: result?.text ?? "",
    pageCount: Array.isArray(pages) ? pages.length : 0,
    // Echo the effective OCR settings so the caller can see what was actually used.
    ocr: {
      engine: ocrServerUrl ? "http" : "tesseract",
      ocrLanguage,
      ocrEnabled,
      dpi,
      numWorkers,
    },
  });
} catch (error) {
  const isError = error instanceof Error;
  const message = isError ? error.message : String(error);

  // The stack goes to stderr so stdout still carries only the single JSON line.
  if (isError && error.stack) {
    process.stderr.write(`${error.stack}\n`);
  }

  emit({ ok: false, filePath: resolvedPath, error: message }, 1);
}
|