presenton/servers/fastapi/tests/test_documents_loader_unwrap.py
sudipnext 35f784379b feat: Enhance LiteParse runner and document processing
- Updated the LiteParse runner to support two output formats: raw text and JSON, improving compatibility and flexibility.
- Introduced error handling for missing file arguments and file existence checks, enhancing robustness.
- Added functions to clean and extract text from LiteParse JSON outputs, handling malformed JSON gracefully.
- Updated the DocumentsLoader to utilize the new text cleaning functionality, ensuring cleaner document outputs.
- Implemented tests for the new text extraction and cleaning features, ensuring reliability and correctness.
2026-04-26 18:10:49 +05:45

60 lines
1.8 KiB
Python

import json
from services.documents_loader import (
_unwrap_liteparse_json_line_if_stored,
clean_extracted_document_text,
)
def test_unwrap_strips_liteparse_json_line():
    """A serialized LiteParse envelope is expected to unwrap to its "text" value."""
    expected = "Title\n\nBody with \"quotes\" and\nnewlines."
    envelope = {"ok": True, "filePath": "/tmp/x.pdf", "text": expected}
    stored = json.dumps(envelope, ensure_ascii=False)

    # The bare JSON line.
    assert _unwrap_liteparse_json_line_if_stored(stored) == expected
    # The same line preceded by whitespace and a newline.
    assert _unwrap_liteparse_json_line_if_stored(" \n" + stored) == expected
def test_unwrap_leaves_plain_text():
    """Prose that merely contains braces is expected back as the very same object."""
    prose = "Not JSON. {Braces} in prose."
    result = _unwrap_liteparse_json_line_if_stored(prose)
    # Identity rather than equality: the input object itself should be returned.
    assert result is prose
def test_unwrap_rejects_malformed_json():
    """Input that starts like JSON but does not parse is expected back unchanged."""
    broken = "{not valid json"
    result = _unwrap_liteparse_json_line_if_stored(broken)
    # Identity rather than equality: the input object itself should be returned.
    assert result is broken
def test_clean_extracts_text_when_json_truncated():
    """The body should survive and the envelope keys vanish when the JSON object is never closed."""
    # The literal ends after the "text" value with no closing brace, so it is
    # not valid JSON; the cleaner is expected to recover the body regardless.
    truncated = (
        '{"ok": true, "filePath": "/tmp/x.pdf", "text": " similarweb | HypeAuditor\\n\\n2024" '
    )
    cleaned = clean_extracted_document_text(truncated)

    assert "similarweb" in cleaned
    # Neither envelope key name may leak into the cleaned output.
    for envelope_key in ("ok", "filePath"):
        assert envelope_key not in cleaned
def test_clean_same_as_unwrap_for_valid_line():
    """For a well-formed envelope line, cleaning is expected to yield just the "text" value."""
    body = "Prose only."
    envelope = {"ok": True, "filePath": "/tmp/x.pdf", "text": body}
    serialized = json.dumps(envelope, ensure_ascii=False)
    assert clean_extracted_document_text(serialized) == body
def test_clean_double_json_embedded_in_text_field():
    """An envelope whose "text" is itself a serialized envelope should clean down to the innermost body."""
    innermost = "Final body."

    def wrap(path, text):
        # Serialize one envelope layer around `text`.
        return json.dumps(
            {"ok": True, "filePath": path, "text": text},
            ensure_ascii=False,
        )

    # Two layers: the /b.pdf envelope carries the /a.pdf envelope as its text.
    nested = wrap("/b.pdf", wrap("/a.pdf", innermost))
    assert clean_extracted_document_text(nested) == innermost