presenton/servers/fastapi/tests/test_liteparse_service.py
sudipnext 35f784379b feat: Enhance LiteParse runner and document processing
- Updated the LiteParse runner to support two output formats: raw text and JSON, improving compatibility and flexibility.
- Introduced error handling for missing file arguments and file existence checks, enhancing robustness.
- Added functions to clean and extract text from LiteParse JSON outputs, handling malformed JSON gracefully.
- Updated the DocumentsLoader to utilize the new text cleaning functionality, ensuring cleaner document outputs.
- Implemented tests for the new text extraction and cleaning features, ensuring reliability and correctness.
2026-04-26 18:10:49 +05:45

108 lines
3.7 KiB
Python

from types import SimpleNamespace
from unittest.mock import patch
from services.liteparse_service import LiteParseService
def _ok_process(
    stdout: str = "ok",
    returncode: int = 0,
    stderr: str = "",
) -> SimpleNamespace:
    """Build a lightweight stand-in for a finished subprocess result.

    The returned namespace exposes only the three attributes set here
    (``returncode``, ``stdout``, ``stderr``); it is used as the
    ``return_value`` of the mocked ``subprocess.run`` in the tests below.

    Args:
        stdout: Text to expose as the process's standard output.
        returncode: Exit status to expose; defaults to ``0``.
        stderr: Text to expose as the process's standard error.

    Returns:
        A ``SimpleNamespace`` carrying the three values unchanged.
    """
    return SimpleNamespace(returncode=returncode, stdout=stdout, stderr=stderr)
class TestLiteParseService:
    """Check the runner command ``LiteParseService.parse`` passes to ``subprocess.run``.

    Every test stubs ``LiteParseService.check_runtime_ready`` to report
    ``(True, "ok")`` and replaces ``services.liteparse_service.subprocess.run``
    with a mock, so no external process is started. Assertions inspect the
    first positional argument of the mocked call (the command list) and, where
    relevant, the dict returned by ``parse``.
    """

    @staticmethod
    def _run_parse(env, stdout="ok", unset=()):
        """Run ``parse`` on a sample path under a patched environment.

        Args:
            env: Environment variables to set for the duration of the call.
            stdout: Standard output the mocked ``subprocess.run`` reports.
            unset: Names of environment variables to remove for the duration
                of the call, so values inherited from the developer's shell
                or CI cannot leak into the test.

        Returns:
            A ``(result, command)`` tuple: the value returned by ``parse`` and
            the first positional argument given to the mocked
            ``subprocess.run``.
        """
        with patch.dict("os.environ", env, clear=False) as environ, patch.object(
            LiteParseService,
            "check_runtime_ready",
            return_value=(True, "ok"),
        ), patch(
            "services.liteparse_service.subprocess.run",
            return_value=_ok_process(stdout=stdout),
        ) as mock_run:
            # patch.dict restores the whole mapping on exit, so keys removed
            # here reappear once the ``with`` block finishes.
            for key in unset:
                environ.pop(key, None)
            service = LiteParseService(timeout_seconds=30)
            result = service.parse(
                "/tmp/sample.pdf", ocr_enabled=True, ocr_language="eng"
            )
        return result, mock_run.call_args.args[0]

    @staticmethod
    def _option_value(command, flag):
        """Return the element that immediately follows ``flag`` in ``command``."""
        return command[command.index(flag) + 1]

    def test_parse_uses_safe_defaults(self):
        """Empty DPI/worker variables yield ``120`` / ``1`` and the plain bridge."""
        result, command = self._run_parse(
            {
                "LITEPARSE_DPI": "",
                "LITEPARSE_NUM_WORKERS": "",
            },
            # The bridge-mode assertion below only holds when this variable is
            # not inherited from the surrounding environment.
            unset=("LITEPARSE_RUNNER_OUTPUT",),
        )
        assert result["ok"] is True
        assert result["text"] == "ok"
        assert "--dpi" in command
        assert self._option_value(command, "--dpi") == "120"
        assert "--num-workers" in command
        assert self._option_value(command, "--num-workers") == "1"
        assert self._option_value(command, "--python-bridge") == "plain"

    def test_parse_uses_env_overrides(self):
        """Valid DPI/worker variables are forwarded verbatim to the command."""
        _, command = self._run_parse(
            {
                "LITEPARSE_DPI": "96",
                "LITEPARSE_NUM_WORKERS": "2",
            },
        )
        assert self._option_value(command, "--dpi") == "96"
        assert self._option_value(command, "--num-workers") == "2"

    def test_parse_clamps_invalid_env_values(self):
        """A DPI of ``-1`` becomes ``72`` and a worker count of ``0`` becomes ``1``."""
        _, command = self._run_parse(
            {
                "LITEPARSE_DPI": "-1",
                "LITEPARSE_NUM_WORKERS": "0",
            },
        )
        assert self._option_value(command, "--dpi") == "72"
        assert self._option_value(command, "--num-workers") == "1"

    def test_parse_json_bridge_env(self):
        """``LITEPARSE_RUNNER_OUTPUT=json`` selects the JSON bridge and its text."""
        result, command = self._run_parse(
            {"LITEPARSE_RUNNER_OUTPUT": "json"},
            stdout='{"ok": true, "text": "legacy"}\n',
        )
        assert result["text"] == "legacy"
        assert self._option_value(command, "--python-bridge") == "json"