- Added logging to track the start and completion of document conversions in DocumentConversionService. - Implemented detailed error logging for subprocess failures in both DocumentConversionService and LiteParseService. - Introduced helper functions for logging command strings and output snippets to improve debuggability. - Updated DocumentsLoader to log file processing details and fallback parsing attempts.
309 lines
11 KiB
Python
309 lines
11 KiB
Python
import json
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
from typing import Any, Dict, Mapping, Tuple
|
|
|
|
|
|
class LiteParseError(Exception):
|
|
pass
|
|
|
|
|
|
LOGGER = logging.getLogger(__name__)
|
|
_LOG_SNIPPET_LIMIT = 600
|
|
|
|
|
|
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
|
|
text = (value or "").strip()
|
|
if not text:
|
|
return "<empty>"
|
|
if len(text) <= limit:
|
|
return text
|
|
return f"{text[:limit]}... [truncated {len(text) - limit} chars]"
|
|
|
|
|
|
def _command_str(parts: list[str]) -> str:
|
|
return " ".join(json.dumps(part) for part in parts)
|
|
|
|
|
|
def _subprocess_text_kwargs() -> Mapping[str, object]:
|
|
"""Decode subprocess output consistently across platforms.
|
|
|
|
Windows defaults to a locale-dependent code page (often cp1252), which can
|
|
crash while decoding UTF-8 output from Node tools. Use UTF-8 and replace
|
|
undecodable bytes to keep parsing resilient.
|
|
"""
|
|
return {"text": True, "encoding": "utf-8", "errors": "replace"}
|
|
|
|
|
|
class LiteParseService:
|
|
def __init__(self, timeout_seconds: int = 180):
|
|
self.timeout_seconds = timeout_seconds
|
|
self.node_binary = os.getenv("LITEPARSE_NODE_BINARY", "node")
|
|
self.runner_path = os.getenv("LITEPARSE_RUNNER_PATH", self._resolve_runner_path())
|
|
self.runner_dir = os.path.dirname(self.runner_path)
|
|
self._npm_project_root = self._resolve_npm_project_root()
|
|
|
|
def _build_node_env(self) -> Dict[str, str]:
|
|
"""Build environment for Node subprocesses.
|
|
|
|
When the configured runtime binary is not the canonical `node` executable
|
|
(for example Electron's app binary), force Node-compatible mode.
|
|
"""
|
|
env = os.environ.copy()
|
|
binary_name = os.path.basename(self.node_binary).lower()
|
|
if binary_name not in {"node", "node.exe"}:
|
|
env.setdefault("ELECTRON_RUN_AS_NODE", "1")
|
|
|
|
# LiteParse checks ImageMagick availability with `which magick`.
|
|
# On macOS app launches, PATH often excludes Homebrew bins, even when
|
|
# IMAGEMAGICK_BINARY is configured to an absolute executable path.
|
|
path_entries = [p for p in (env.get("PATH") or "").split(os.pathsep) if p]
|
|
additional_entries = []
|
|
|
|
imagemagick_binary = (env.get("IMAGEMAGICK_BINARY") or "").strip()
|
|
if imagemagick_binary:
|
|
magick_dir = os.path.dirname(imagemagick_binary)
|
|
if magick_dir:
|
|
additional_entries.append(magick_dir)
|
|
|
|
soffice_binary = (env.get("SOFFICE_PATH") or "").strip()
|
|
if soffice_binary:
|
|
soffice_dir = os.path.dirname(soffice_binary)
|
|
if soffice_dir:
|
|
additional_entries.append(soffice_dir)
|
|
|
|
if os.name != "nt":
|
|
additional_entries.extend([
|
|
"/opt/homebrew/bin",
|
|
"/usr/local/bin",
|
|
"/opt/local/bin",
|
|
"/usr/bin",
|
|
"/bin",
|
|
])
|
|
|
|
deduped_additional_entries = []
|
|
for entry in additional_entries:
|
|
normalized = entry.strip()
|
|
if not normalized or not os.path.isdir(normalized):
|
|
continue
|
|
if normalized in path_entries or normalized in deduped_additional_entries:
|
|
continue
|
|
deduped_additional_entries.append(normalized)
|
|
|
|
if deduped_additional_entries:
|
|
env["PATH"] = os.pathsep.join(deduped_additional_entries + path_entries)
|
|
|
|
return env
|
|
|
|
def _resolve_npm_project_root(self) -> str:
|
|
"""Directory whose node_modules contains @llamaindex/liteparse (runner dir or Electron app root)."""
|
|
local_nm = os.path.join(
|
|
self.runner_dir, "node_modules", "@llamaindex", "liteparse"
|
|
)
|
|
if os.path.isdir(local_nm):
|
|
return self.runner_dir
|
|
electron_nm = os.path.abspath(
|
|
os.path.join(self.runner_dir, "..", "..", "node_modules", "@llamaindex", "liteparse")
|
|
)
|
|
if os.path.isdir(electron_nm):
|
|
return os.path.abspath(os.path.join(self.runner_dir, "..", ".."))
|
|
return os.path.abspath(os.path.join(self.runner_dir, "..", ".."))
|
|
|
|
@staticmethod
|
|
def _resolve_runner_path() -> str:
|
|
cwd = os.path.abspath(".")
|
|
candidates = [
|
|
# electron/servers/fastapi → electron/resources/...
|
|
os.path.abspath(
|
|
os.path.join(
|
|
cwd, "..", "..", "resources", "document-extraction", "liteparse_runner.mjs"
|
|
)
|
|
),
|
|
# servers/fastapi (repo root layout) → electron/resources/...
|
|
os.path.abspath(
|
|
os.path.join(
|
|
cwd,
|
|
"..",
|
|
"..",
|
|
"electron",
|
|
"resources",
|
|
"document-extraction",
|
|
"liteparse_runner.mjs",
|
|
)
|
|
),
|
|
# PyInstaller bundle layout
|
|
os.path.abspath(
|
|
os.path.join(
|
|
cwd, "..", "..", "app", "resources", "document-extraction", "liteparse_runner.mjs"
|
|
)
|
|
),
|
|
# Docker / explicit layout
|
|
"/app/document-extraction-liteparse/liteparse_runner.mjs",
|
|
]
|
|
for path in candidates:
|
|
if os.path.isfile(path):
|
|
return path
|
|
return candidates[0]
|
|
|
|
def check_runtime_ready(self) -> Tuple[bool, str]:
|
|
if not os.path.isfile(self.runner_path):
|
|
return False, f"LiteParse runner not found at: {self.runner_path}"
|
|
|
|
try:
|
|
subprocess.run(
|
|
[self.node_binary, "--version"],
|
|
cwd=self.runner_dir,
|
|
check=True,
|
|
capture_output=True,
|
|
timeout=10,
|
|
env=self._build_node_env(),
|
|
**_subprocess_text_kwargs(),
|
|
)
|
|
except Exception as exc:
|
|
return False, f"Node.js runtime is unavailable: {exc}"
|
|
|
|
liteparse_dir = os.path.join(
|
|
self._npm_project_root, "node_modules", "@llamaindex", "liteparse"
|
|
)
|
|
if not os.path.isdir(liteparse_dir):
|
|
return (
|
|
False,
|
|
f"LiteParse npm package missing at {liteparse_dir}. Run npm install in the Electron app directory.",
|
|
)
|
|
|
|
# @llamaindex/liteparse is ESM-only; require.resolve() fails. Use dynamic import.
|
|
try:
|
|
subprocess.run(
|
|
[
|
|
self.node_binary,
|
|
"--input-type=module",
|
|
"-e",
|
|
"import '@llamaindex/liteparse'",
|
|
],
|
|
cwd=self._npm_project_root,
|
|
check=True,
|
|
capture_output=True,
|
|
timeout=20,
|
|
env=self._build_node_env(),
|
|
**_subprocess_text_kwargs(),
|
|
)
|
|
except Exception as exc:
|
|
return False, f"LiteParse dependency is unavailable: {exc}"
|
|
|
|
return True, "ok"
|
|
|
|
def parse_to_markdown(
|
|
self,
|
|
file_path: str,
|
|
ocr_enabled: bool = True,
|
|
ocr_language: str = "eng",
|
|
) -> str:
|
|
result = self.parse(
|
|
file_path=file_path,
|
|
ocr_enabled=ocr_enabled,
|
|
ocr_language=ocr_language,
|
|
)
|
|
return str(result.get("text") or "")
|
|
|
|
def parse(
|
|
self,
|
|
file_path: str,
|
|
ocr_enabled: bool = True,
|
|
ocr_language: str = "eng",
|
|
) -> Dict[str, Any]:
|
|
is_ready, reason = self.check_runtime_ready()
|
|
if not is_ready:
|
|
raise LiteParseError(reason)
|
|
|
|
command = [
|
|
self.node_binary,
|
|
self.runner_path,
|
|
"--file",
|
|
file_path,
|
|
"--ocr-enabled",
|
|
"true" if ocr_enabled else "false",
|
|
"--ocr-language",
|
|
ocr_language,
|
|
]
|
|
ocr_server = (os.getenv("LITEPARSE_OCR_SERVER_URL") or "").strip()
|
|
if ocr_server:
|
|
command.extend(["--ocr-server-url", ocr_server])
|
|
tessdata = (os.getenv("LITEPARSE_TESSDATA_PATH") or "").strip()
|
|
if tessdata:
|
|
command.extend(["--tessdata-path", tessdata])
|
|
|
|
LOGGER.info(
|
|
"[LiteParse] Parsing file=%s ocr_enabled=%s ocr_language=%s",
|
|
file_path,
|
|
ocr_enabled,
|
|
ocr_language,
|
|
)
|
|
|
|
process = subprocess.run(
|
|
command,
|
|
cwd=self._npm_project_root,
|
|
capture_output=True,
|
|
timeout=self.timeout_seconds,
|
|
env=self._build_node_env(),
|
|
**_subprocess_text_kwargs(),
|
|
)
|
|
LOGGER.info(
|
|
"[LiteParse] Command finished returncode=%s command=%s",
|
|
process.returncode,
|
|
_command_str(command),
|
|
)
|
|
|
|
payload: Dict[str, Any]
|
|
try:
|
|
payload = self._decode_runner_output(process.stdout)
|
|
except LiteParseError as exc:
|
|
raise LiteParseError(
|
|
f"{exc}; returncode={process.returncode}; "
|
|
f"stderr={_snippet(process.stderr)}; stdout={_snippet(process.stdout)}"
|
|
) from exc
|
|
|
|
if process.returncode != 0:
|
|
message = payload.get("error") or process.stderr.strip() or "Unknown error"
|
|
LOGGER.error(
|
|
"[LiteParse] Parse failed returncode=%s stderr=%s stdout=%s",
|
|
process.returncode,
|
|
_snippet(process.stderr),
|
|
_snippet(process.stdout),
|
|
)
|
|
raise LiteParseError(message)
|
|
|
|
if not payload.get("ok"):
|
|
LOGGER.error(
|
|
"[LiteParse] Runner returned not-ok payload=%s",
|
|
_snippet(json.dumps(payload)),
|
|
)
|
|
raise LiteParseError(payload.get("error") or "LiteParse parse failed")
|
|
|
|
return payload
|
|
|
|
@staticmethod
|
|
def _decode_runner_output(stdout: str) -> Dict[str, Any]:
|
|
raw = (stdout or "").lstrip("\ufeff").strip()
|
|
if not raw:
|
|
raise LiteParseError("LiteParse runner returned empty output")
|
|
|
|
# Prefer the last line that parses as JSON (handles stray log lines before our payload).
|
|
lines = [line.strip() for line in raw.splitlines() if line.strip()]
|
|
for line in reversed(lines):
|
|
try:
|
|
parsed = json.loads(line)
|
|
if isinstance(parsed, dict):
|
|
return parsed
|
|
except json.JSONDecodeError:
|
|
continue
|
|
|
|
# Single blob without newlines (entire stdout is one JSON object).
|
|
try:
|
|
parsed = json.loads(raw)
|
|
if isinstance(parsed, dict):
|
|
return parsed
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
raise LiteParseError("LiteParse runner returned invalid JSON output")
|