feat: enhance document conversion services with improved logging and error handling

- Added logging to track the start and completion of document conversions in DocumentConversionService.
- Implemented detailed error logging for subprocess failures in both DocumentConversionService and LiteParseService.
- Introduced helper functions for logging command strings and output snippets to improve debuggability.
- Updated DocumentsLoader to log file processing details and fallback parsing attempts.
This commit is contained in:
sudipnext 2026-03-31 14:06:57 +05:45
parent 429cb6c09b
commit c7e9a6872b
3 changed files with 176 additions and 22 deletions

View file

@ -1,5 +1,6 @@
import os
import subprocess
import logging
from pathlib import Path
from typing import Dict, List
@ -8,6 +9,23 @@ class DocumentConversionError(Exception):
pass
LOGGER = logging.getLogger(__name__)
_LOG_SNIPPET_LIMIT = 600
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
text = (value or "").strip()
if not text:
return "<empty>"
if len(text) <= limit:
return text
return f"{text[:limit]}... [truncated {len(text) - limit} chars]"
def _command_str(parts: list[str]) -> str:
return " ".join(repr(part) for part in parts)
def _windows_hidden_subprocess_kwargs() -> Dict[str, object]:
if os.name != "nt":
return {}
@ -39,6 +57,8 @@ class DocumentConversionService:
[command, *args],
capture_output=True,
text=True,
encoding="utf-8",
errors="replace",
timeout=10,
check=False,
**_windows_hidden_subprocess_kwargs(),
@ -71,23 +91,39 @@ class DocumentConversionService:
}
try:
command = [
self.soffice_binary,
"--headless",
"--convert-to",
"pdf",
"--outdir",
output_dir,
file_path,
]
LOGGER.info(
"[DocumentConversion] LibreOffice conversion start input=%s output_dir=%s",
file_path,
output_dir,
)
subprocess.run(
[
self.soffice_binary,
"--headless",
"--convert-to",
"pdf",
"--outdir",
output_dir,
file_path,
],
command,
check=True,
capture_output=True,
text=True,
encoding="utf-8",
errors="replace",
timeout=timeout_seconds,
**_windows_hidden_subprocess_kwargs(),
)
LOGGER.info(
"[DocumentConversion] LibreOffice conversion complete input=%s",
file_path,
)
except subprocess.TimeoutExpired as exc:
LOGGER.error(
"[DocumentConversion] LibreOffice timed out command=%s",
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
)
raise DocumentConversionError(
f"LibreOffice conversion timed out for {os.path.basename(file_path)}"
) from exc
@ -95,10 +131,19 @@ class DocumentConversionService:
stderr = (exc.stderr or "").strip()
stdout = (exc.stdout or "").strip()
details = stderr or stdout or str(exc)
LOGGER.error(
"[DocumentConversion] LibreOffice failed code=%s command=%s stderr=%s stdout=%s",
exc.returncode,
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
_snippet(stderr),
_snippet(stdout),
)
raise DocumentConversionError(
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details}"
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details} "
f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
) from exc
except Exception as exc:
LOGGER.exception("[DocumentConversion] LibreOffice conversion unexpected error")
raise DocumentConversionError(
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {exc}"
) from exc
@ -133,15 +178,31 @@ class DocumentConversionService:
command = [self.imagemagick_binary, file_path, str(output_path)]
try:
LOGGER.info(
"[DocumentConversion] ImageMagick conversion start input=%s output=%s command=%s",
file_path,
output_path,
_command_str(command),
)
subprocess.run(
command,
check=True,
capture_output=True,
text=True,
encoding="utf-8",
errors="replace",
timeout=timeout_seconds,
**_windows_hidden_subprocess_kwargs(),
)
LOGGER.info(
"[DocumentConversion] ImageMagick conversion complete output=%s",
output_path,
)
except subprocess.TimeoutExpired as exc:
LOGGER.error(
"[DocumentConversion] ImageMagick timed out command=%s",
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
)
raise DocumentConversionError(
f"ImageMagick conversion timed out for {os.path.basename(file_path)}"
) from exc
@ -149,10 +210,19 @@ class DocumentConversionService:
stderr = (exc.stderr or "").strip()
stdout = (exc.stdout or "").strip()
details = stderr or stdout or str(exc)
LOGGER.error(
"[DocumentConversion] ImageMagick failed code=%s command=%s stderr=%s stdout=%s",
exc.returncode,
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
_snippet(stderr),
_snippet(stdout),
)
raise DocumentConversionError(
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details}"
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details} "
f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
) from exc
except Exception as exc:
LOGGER.exception("[DocumentConversion] ImageMagick conversion unexpected error")
raise DocumentConversionError(
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {exc}"
) from exc

View file

@ -1,8 +1,9 @@
import asyncio
import logging
import os
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple
from typing import Any, List, Optional, Tuple
import pdfplumber
from fastapi import HTTPException
@ -22,9 +23,11 @@ from utils.ocr_language import presentation_language_to_ocr_code
# Optional fallback converter (primarily useful on Windows)
try:
from services.lightweight_document_service import DocumentService
from services.lightweight_document_service import DocumentService as DocumentServiceCls
except Exception:
DocumentService = None
DocumentServiceCls = None
LOGGER = logging.getLogger(__name__)
class DocumentsLoader:
@ -38,7 +41,9 @@ class DocumentsLoader:
self._ocr_language = presentation_language_to_ocr_code(presentation_language)
self.liteparse_service = LiteParseService()
self.document_conversion_service = DocumentConversionService()
self.document_service = DocumentService() if DocumentService is not None else None
self.document_service: Any = (
DocumentServiceCls() if DocumentServiceCls is not None else None
)
self._documents: List[str] = []
self._images: List[List[str]] = []
@ -69,9 +74,14 @@ class DocumentsLoader:
)
document = ""
imgs = []
imgs: List[str] = []
extension = Path(file_path).suffix.lower()
LOGGER.info(
"[DocumentsLoader] Processing file=%s extension=%s",
file_path,
extension,
)
if extension in PDF_EXTENSIONS:
document, imgs = await self.load_pdf(
@ -107,13 +117,18 @@ class DocumentsLoader:
load_images: bool,
temp_dir: Optional[str] = None,
) -> Tuple[str, List[str]]:
image_paths = []
image_paths: List[str] = []
document: str = ""
if load_text:
document = await asyncio.to_thread(self._parse_with_liteparse, file_path)
if load_images:
if temp_dir is None:
raise HTTPException(
status_code=400,
detail="temp_dir is required when load_images is true",
)
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
return document, image_paths
@ -154,16 +169,27 @@ class DocumentsLoader:
def _parse_with_liteparse(self, file_path: str) -> str:
try:
LOGGER.info("[DocumentsLoader] LiteParse start file=%s", file_path)
return self.liteparse_service.parse_to_markdown(
file_path,
ocr_enabled=True,
ocr_language=self._ocr_language,
)
except (LiteParseError, DocumentConversionError) as exc:
LOGGER.warning(
"[DocumentsLoader] Primary parse failed file=%s error=%s",
file_path,
exc,
)
if self.document_service is not None:
try:
LOGGER.info("[DocumentsLoader] Trying fallback parser file=%s", file_path)
return self.document_service.parse_to_markdown(file_path)
except Exception:
LOGGER.exception(
"[DocumentsLoader] Fallback parser failed file=%s",
file_path,
)
pass
raise HTTPException(
status_code=500,

View file

@ -1,13 +1,41 @@
import json
import logging
import os
import subprocess
from typing import Any, Dict, Tuple
from typing import Any, Dict, Mapping, Tuple
class LiteParseError(Exception):
pass
LOGGER = logging.getLogger(__name__)
_LOG_SNIPPET_LIMIT = 600
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
text = (value or "").strip()
if not text:
return "<empty>"
if len(text) <= limit:
return text
return f"{text[:limit]}... [truncated {len(text) - limit} chars]"
def _command_str(parts: list[str]) -> str:
return " ".join(json.dumps(part) for part in parts)
def _subprocess_text_kwargs() -> Mapping[str, object]:
"""Decode subprocess output consistently across platforms.
Windows defaults to a locale-dependent code page (often cp1252), which can
crash while decoding UTF-8 output from Node tools. Use UTF-8 and replace
undecodable bytes to keep parsing resilient.
"""
return {"text": True, "encoding": "utf-8", "errors": "replace"}
class LiteParseService:
def __init__(self, timeout_seconds: int = 180):
self.timeout_seconds = timeout_seconds
@ -128,9 +156,9 @@ class LiteParseService:
cwd=self.runner_dir,
check=True,
capture_output=True,
text=True,
timeout=10,
env=self._build_node_env(),
**_subprocess_text_kwargs(),
)
except Exception as exc:
return False, f"Node.js runtime is unavailable: {exc}"
@ -156,9 +184,9 @@ class LiteParseService:
cwd=self._npm_project_root,
check=True,
capture_output=True,
text=True,
timeout=20,
env=self._build_node_env(),
**_subprocess_text_kwargs(),
)
except Exception as exc:
return False, f"LiteParse dependency is unavailable: {exc}"
@ -205,21 +233,51 @@ class LiteParseService:
if tessdata:
command.extend(["--tessdata-path", tessdata])
LOGGER.info(
"[LiteParse] Parsing file=%s ocr_enabled=%s ocr_language=%s",
file_path,
ocr_enabled,
ocr_language,
)
process = subprocess.run(
command,
cwd=self._npm_project_root,
capture_output=True,
text=True,
timeout=self.timeout_seconds,
env=self._build_node_env(),
**_subprocess_text_kwargs(),
)
payload = self._decode_runner_output(process.stdout)
LOGGER.info(
"[LiteParse] Command finished returncode=%s command=%s",
process.returncode,
_command_str(command),
)
payload: Dict[str, Any]
try:
payload = self._decode_runner_output(process.stdout)
except LiteParseError as exc:
raise LiteParseError(
f"{exc}; returncode={process.returncode}; "
f"stderr={_snippet(process.stderr)}; stdout={_snippet(process.stdout)}"
) from exc
if process.returncode != 0:
message = payload.get("error") or process.stderr.strip() or "Unknown error"
LOGGER.error(
"[LiteParse] Parse failed returncode=%s stderr=%s stdout=%s",
process.returncode,
_snippet(process.stderr),
_snippet(process.stdout),
)
raise LiteParseError(message)
if not payload.get("ok"):
LOGGER.error(
"[LiteParse] Runner returned not-ok payload=%s",
_snippet(json.dumps(payload)),
)
raise LiteParseError(payload.get("error") or "LiteParse parse failed")
return payload