presenton/electron/servers/fastapi/services/liteparse_service.py
sudipnext c7e9a6872b feat: enhance document conversion services with improved logging and error handling
- Added logging to track the start and completion of document conversions in DocumentConversionService.
- Implemented detailed error logging for subprocess failures in both DocumentConversionService and LiteParseService.
- Introduced helper functions for logging command strings and output snippets to improve debuggability.
- Updated DocumentsLoader to log file processing details and fallback parsing attempts.
2026-03-31 14:06:57 +05:45

309 lines
11 KiB
Python

import json
import logging
import os
import subprocess
from typing import Any, Dict, Mapping, Tuple
class LiteParseError(Exception):
    """Error raised when the LiteParse runtime is unavailable or a parse fails."""
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# Default maximum number of characters `_snippet` keeps before truncating.
_LOG_SNIPPET_LIMIT = 600
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
    """Return a whitespace-stripped, length-capped copy of *value* for log output.

    Falsy or whitespace-only input yields the placeholder ``"<empty>"``. Text
    longer than *limit* characters is cut to *limit* characters and suffixed
    with a note stating how many characters were dropped.
    """
    cleaned = (value or "").strip()
    if not cleaned:
        return "<empty>"
    overflow = len(cleaned) - limit
    if overflow > 0:
        return f"{cleaned[:limit]}... [truncated {overflow} chars]"
    return cleaned
def _command_str(parts: list[str]) -> str:
    """Render a command's argument list as one space-separated string.

    Each argument is JSON-encoded, so arguments containing spaces or quotes
    stay unambiguous when the command is written to a log.
    """
    quoted = map(json.dumps, parts)
    return " ".join(quoted)
def _subprocess_text_kwargs() -> Mapping[str, object]:
    """Return the keyword arguments that put subprocess output in UTF-8 text mode.

    Windows defaults to a locale-dependent code page (often cp1252), which can
    crash while decoding UTF-8 output from Node tools. Forcing UTF-8 and
    replacing undecodable bytes keeps output parsing resilient on every
    platform.
    """
    return dict(text=True, encoding="utf-8", errors="replace")
class LiteParseService:
    """Run the LiteParse Node.js runner in a subprocess and decode its JSON output.

    Configuration is read from environment variables when the instance is
    created:

    * ``LITEPARSE_NODE_BINARY`` - executable used to run the runner
      (default ``"node"``).
    * ``LITEPARSE_RUNNER_PATH`` - path of ``liteparse_runner.mjs``
      (default: result of ``_resolve_runner_path``).

    ``parse`` additionally reads ``LITEPARSE_OCR_SERVER_URL`` and
    ``LITEPARSE_TESSDATA_PATH`` on every call.
    """

    def __init__(self, timeout_seconds: int = 180):
        """Resolve the Node binary, runner script and npm project root.

        Args:
            timeout_seconds: Maximum time, in seconds, a single ``parse``
                subprocess may run.
        """
        self.timeout_seconds = timeout_seconds
        self.node_binary = os.getenv("LITEPARSE_NODE_BINARY", "node")
        self.runner_path = os.getenv("LITEPARSE_RUNNER_PATH", self._resolve_runner_path())
        self.runner_dir = os.path.dirname(self.runner_path)
        self._npm_project_root = self._resolve_npm_project_root()

    def _build_node_env(self) -> Dict[str, str]:
        """Build environment for Node subprocesses.

        When the configured runtime binary is not the canonical `node` executable
        (for example Electron's app binary), force Node-compatible mode.

        Returns:
            A copy of ``os.environ``, possibly with ``ELECTRON_RUN_AS_NODE``
            defaulted to ``"1"`` and extra existing directories prepended to
            ``PATH``.
        """
        env = os.environ.copy()

        binary_name = os.path.basename(self.node_binary).lower()
        if binary_name not in {"node", "node.exe"}:
            # setdefault: an explicit value already present in the environment wins.
            env.setdefault("ELECTRON_RUN_AS_NODE", "1")

        # LiteParse checks ImageMagick availability with `which magick`.
        # On macOS app launches, PATH often excludes Homebrew bins, even when
        # IMAGEMAGICK_BINARY is configured to an absolute executable path.
        path_entries = [p for p in (env.get("PATH") or "").split(os.pathsep) if p]

        additional_entries = []
        # Directories of explicitly configured tool binaries come first.
        for variable in ("IMAGEMAGICK_BINARY", "SOFFICE_PATH"):
            binary = (env.get(variable) or "").strip()
            if not binary:
                continue
            binary_dir = os.path.dirname(binary)
            if binary_dir:
                additional_entries.append(binary_dir)

        if os.name != "nt":
            additional_entries.extend([
                "/opt/homebrew/bin",
                "/usr/local/bin",
                "/opt/local/bin",
                "/usr/bin",
                "/bin",
            ])

        # Keep only existing directories that are not already on PATH,
        # preserving the order in which they were collected.
        deduped_additional_entries = []
        for entry in additional_entries:
            normalized = entry.strip()
            if not normalized or not os.path.isdir(normalized):
                continue
            if normalized in path_entries or normalized in deduped_additional_entries:
                continue
            deduped_additional_entries.append(normalized)

        if deduped_additional_entries:
            env["PATH"] = os.pathsep.join(deduped_additional_entries + path_entries)
        return env

    def _resolve_npm_project_root(self) -> str:
        """Directory whose node_modules contains @llamaindex/liteparse (runner dir or Electron app root).

        Returns ``self.runner_dir`` when the package is installed next to the
        runner. Otherwise returns the directory two levels above the runner,
        whether or not the package exists there; ``check_runtime_ready``
        reports a missing package.
        """
        local_nm = os.path.join(
            self.runner_dir, "node_modules", "@llamaindex", "liteparse"
        )
        if os.path.isdir(local_nm):
            return self.runner_dir
        return os.path.abspath(os.path.join(self.runner_dir, "..", ".."))

    @staticmethod
    def _resolve_runner_path() -> str:
        """Locate ``liteparse_runner.mjs`` relative to the current working directory.

        Returns:
            The first candidate path that exists as a file, or the first
            candidate when none exists.
        """
        cwd = os.path.abspath(".")
        candidates = [
            # electron/servers/fastapi → electron/resources/...
            os.path.abspath(
                os.path.join(
                    cwd, "..", "..", "resources", "document-extraction", "liteparse_runner.mjs"
                )
            ),
            # servers/fastapi (repo root layout) → electron/resources/...
            os.path.abspath(
                os.path.join(
                    cwd,
                    "..",
                    "..",
                    "electron",
                    "resources",
                    "document-extraction",
                    "liteparse_runner.mjs",
                )
            ),
            # PyInstaller bundle layout
            os.path.abspath(
                os.path.join(
                    cwd, "..", "..", "app", "resources", "document-extraction", "liteparse_runner.mjs"
                )
            ),
            # Docker / explicit layout
            "/app/document-extraction-liteparse/liteparse_runner.mjs",
        ]
        for path in candidates:
            if os.path.isfile(path):
                return path
        return candidates[0]

    def check_runtime_ready(self) -> Tuple[bool, str]:
        """Check that the runner script, Node runtime and LiteParse package are usable.

        Returns:
            ``(True, "ok")`` when every check passes, otherwise ``(False, reason)``
            where ``reason`` describes the first check that failed.
        """
        if not os.path.isfile(self.runner_path):
            return False, f"LiteParse runner not found at: {self.runner_path}"

        try:
            subprocess.run(
                [self.node_binary, "--version"],
                cwd=self.runner_dir,
                check=True,
                capture_output=True,
                timeout=10,
                env=self._build_node_env(),
                **_subprocess_text_kwargs(),
            )
        except Exception as exc:
            # Any failure (missing binary, non-zero exit, timeout) is reported, not raised.
            return False, f"Node.js runtime is unavailable: {exc}"

        liteparse_dir = os.path.join(
            self._npm_project_root, "node_modules", "@llamaindex", "liteparse"
        )
        if not os.path.isdir(liteparse_dir):
            return (
                False,
                f"LiteParse npm package missing at {liteparse_dir}. Run npm install in the Electron app directory.",
            )

        # @llamaindex/liteparse is ESM-only; require.resolve() fails. Evaluate an
        # ESM `import` statement instead (hence --input-type=module).
        try:
            subprocess.run(
                [
                    self.node_binary,
                    "--input-type=module",
                    "-e",
                    "import '@llamaindex/liteparse'",
                ],
                cwd=self._npm_project_root,
                check=True,
                capture_output=True,
                timeout=20,
                env=self._build_node_env(),
                **_subprocess_text_kwargs(),
            )
        except Exception as exc:
            return False, f"LiteParse dependency is unavailable: {exc}"

        return True, "ok"

    def parse_to_markdown(
        self,
        file_path: str,
        ocr_enabled: bool = True,
        ocr_language: str = "eng",
    ) -> str:
        """Parse *file_path* and return the payload's ``"text"`` value as a string.

        Returns an empty string when the payload has no (or a falsy) ``"text"``.
        Raises whatever ``parse`` raises.
        """
        result = self.parse(
            file_path=file_path,
            ocr_enabled=ocr_enabled,
            ocr_language=ocr_language,
        )
        return str(result.get("text") or "")

    def parse(
        self,
        file_path: str,
        ocr_enabled: bool = True,
        ocr_language: str = "eng",
    ) -> Dict[str, Any]:
        """Run the LiteParse runner on *file_path* and return its decoded JSON payload.

        Args:
            file_path: Passed to the runner as ``--file``.
            ocr_enabled: Passed to the runner as ``--ocr-enabled true|false``.
            ocr_language: Passed to the runner as ``--ocr-language``.

        Returns:
            The runner's JSON object; its ``"ok"`` entry is truthy.

        Raises:
            LiteParseError: If the runtime is not ready, the runner output is
                not decodable JSON, the runner exits non-zero, or the payload's
                ``"ok"`` entry is missing or falsy.
            subprocess.TimeoutExpired: Propagated unchanged from
                ``subprocess.run`` when the runner exceeds ``timeout_seconds``.
        """
        is_ready, reason = self.check_runtime_ready()
        if not is_ready:
            raise LiteParseError(reason)

        command = [
            self.node_binary,
            self.runner_path,
            "--file",
            file_path,
            "--ocr-enabled",
            "true" if ocr_enabled else "false",
            "--ocr-language",
            ocr_language,
        ]

        # Optional runner flags, only added when the variables are non-blank.
        ocr_server = (os.getenv("LITEPARSE_OCR_SERVER_URL") or "").strip()
        if ocr_server:
            command.extend(["--ocr-server-url", ocr_server])
        tessdata = (os.getenv("LITEPARSE_TESSDATA_PATH") or "").strip()
        if tessdata:
            command.extend(["--tessdata-path", tessdata])

        LOGGER.info(
            "[LiteParse] Parsing file=%s ocr_enabled=%s ocr_language=%s",
            file_path,
            ocr_enabled,
            ocr_language,
        )
        # No check=True: the return code is inspected below so the runner's own
        # JSON error message can be surfaced.
        process = subprocess.run(
            command,
            cwd=self._npm_project_root,
            capture_output=True,
            timeout=self.timeout_seconds,
            env=self._build_node_env(),
            **_subprocess_text_kwargs(),
        )
        LOGGER.info(
            "[LiteParse] Command finished returncode=%s command=%s",
            process.returncode,
            _command_str(command),
        )

        payload: Dict[str, Any]
        try:
            payload = self._decode_runner_output(process.stdout)
        except LiteParseError as exc:
            # Attach process diagnostics so an undecodable output is debuggable.
            raise LiteParseError(
                f"{exc}; returncode={process.returncode}; "
                f"stderr={_snippet(process.stderr)}; stdout={_snippet(process.stdout)}"
            ) from exc

        if process.returncode != 0:
            message = payload.get("error") or process.stderr.strip() or "Unknown error"
            LOGGER.error(
                "[LiteParse] Parse failed returncode=%s stderr=%s stdout=%s",
                process.returncode,
                _snippet(process.stderr),
                _snippet(process.stdout),
            )
            raise LiteParseError(message)

        if not payload.get("ok"):
            LOGGER.error(
                "[LiteParse] Runner returned not-ok payload=%s",
                _snippet(json.dumps(payload)),
            )
            raise LiteParseError(payload.get("error") or "LiteParse parse failed")

        return payload

    @staticmethod
    def _decode_runner_output(stdout: str) -> Dict[str, Any]:
        """Extract the runner's JSON object from its captured stdout.

        The whole output is tried first, so a payload spanning several lines is
        returned intact even when one of its lines is itself a JSON object.
        Otherwise the last individual line that parses as a JSON object is
        used, which tolerates stray log lines printed around the payload.

        Raises:
            LiteParseError: If the output is empty or contains no JSON object.
        """
        raw = (stdout or "").lstrip("\ufeff").strip()
        if not raw:
            raise LiteParseError("LiteParse runner returned empty output")

        # Entire stdout is one JSON object (single- or multi-line).
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict):
                return parsed

        # Otherwise take the last line that parses as a JSON object
        # (handles stray log lines around the payload).
        lines = [line.strip() for line in raw.splitlines() if line.strip()]
        for line in reversed(lines):
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

        raise LiteParseError("LiteParse runner returned invalid JSON output")