feat: enhance document conversion services with improved logging and error handling

- Added logging to track the start and completion of document conversions in DocumentConversionService. - Implemented detailed error logging for subprocess failures in both DocumentConversionService and LiteParseService. - Introduced helper functions for logging command strings and output snippets to improve debuggability. - Updated DocumentsLoader to log file processing details and fallback parsing attempts.
2026-03-31 14:06:57 +05:45 · 2026-03-31 14:06:57 +05:45 · c7e9a6872b
commit c7e9a6872b
parent 429cb6c09b
3 changed files with 176 additions and 22 deletions
--- a/electron/servers/fastapi/services/document_conversion_service.py
+++ b/electron/servers/fastapi/services/document_conversion_service.py
@ -1,5 +1,6 @@
 import os
 import subprocess
+import logging
 from pathlib import Path
 from typing import Dict, List

@ -8,6 +9,23 @@ class DocumentConversionError(Exception):
    pass


+LOGGER = logging.getLogger(__name__)
+_LOG_SNIPPET_LIMIT = 600
+
+
+def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
+    text = (value or "").strip()
+    if not text:
+        return "<empty>"
+    if len(text) <= limit:
+        return text
+    return f"{text[:limit]}... [truncated {len(text) - limit} chars]"
+
+
+def _command_str(parts: list[str]) -> str:
+    return " ".join(repr(part) for part in parts)
+
+
 def _windows_hidden_subprocess_kwargs() -> Dict[str, object]:
    if os.name != "nt":
        return {}
@ -39,6 +57,8 @@ class DocumentConversionService:
                [command, *args],
                capture_output=True,
                text=True,
+                encoding="utf-8",
+                errors="replace",
                timeout=10,
                check=False,
                **_windows_hidden_subprocess_kwargs(),
@ -71,23 +91,39 @@ class DocumentConversionService:
        }

        try:
+            command = [
+                self.soffice_binary,
+                "--headless",
+                "--convert-to",
+                "pdf",
+                "--outdir",
+                output_dir,
+                file_path,
+            ]
+            LOGGER.info(
+                "[DocumentConversion] LibreOffice conversion start input=%s output_dir=%s",
+                file_path,
+                output_dir,
+            )
            subprocess.run(
-                [
-                    self.soffice_binary,
-                    "--headless",
-                    "--convert-to",
-                    "pdf",
-                    "--outdir",
-                    output_dir,
-                    file_path,
-                ],
+                command,
                check=True,
                capture_output=True,
                text=True,
+                encoding="utf-8",
+                errors="replace",
                timeout=timeout_seconds,
                **_windows_hidden_subprocess_kwargs(),
            )
+            LOGGER.info(
+                "[DocumentConversion] LibreOffice conversion complete input=%s",
+                file_path,
+            )
        except subprocess.TimeoutExpired as exc:
+            LOGGER.error(
+                "[DocumentConversion] LibreOffice timed out command=%s",
+                _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
+            )
            raise DocumentConversionError(
                f"LibreOffice conversion timed out for {os.path.basename(file_path)}"
            ) from exc
@ -95,10 +131,19 @@ class DocumentConversionService:
            stderr = (exc.stderr or "").strip()
            stdout = (exc.stdout or "").strip()
            details = stderr or stdout or str(exc)
+            LOGGER.error(
+                "[DocumentConversion] LibreOffice failed code=%s command=%s stderr=%s stdout=%s",
+                exc.returncode,
+                _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
+                _snippet(stderr),
+                _snippet(stdout),
+            )
            raise DocumentConversionError(
-                f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details}"
+                f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details} "
+                f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
            ) from exc
        except Exception as exc:
+            LOGGER.exception("[DocumentConversion] LibreOffice conversion unexpected error")
            raise DocumentConversionError(
                f"LibreOffice conversion failed for {os.path.basename(file_path)}: {exc}"
            ) from exc
@ -133,15 +178,31 @@ class DocumentConversionService:
        command = [self.imagemagick_binary, file_path, str(output_path)]

        try:
+            LOGGER.info(
+                "[DocumentConversion] ImageMagick conversion start input=%s output=%s command=%s",
+                file_path,
+                output_path,
+                _command_str(command),
+            )
            subprocess.run(
                command,
                check=True,
                capture_output=True,
                text=True,
+                encoding="utf-8",
+                errors="replace",
                timeout=timeout_seconds,
                **_windows_hidden_subprocess_kwargs(),
            )
+            LOGGER.info(
+                "[DocumentConversion] ImageMagick conversion complete output=%s",
+                output_path,
+            )
        except subprocess.TimeoutExpired as exc:
+            LOGGER.error(
+                "[DocumentConversion] ImageMagick timed out command=%s",
+                _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
+            )
            raise DocumentConversionError(
                f"ImageMagick conversion timed out for {os.path.basename(file_path)}"
            ) from exc
@ -149,10 +210,19 @@ class DocumentConversionService:
            stderr = (exc.stderr or "").strip()
            stdout = (exc.stdout or "").strip()
            details = stderr or stdout or str(exc)
+            LOGGER.error(
+                "[DocumentConversion] ImageMagick failed code=%s command=%s stderr=%s stdout=%s",
+                exc.returncode,
+                _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
+                _snippet(stderr),
+                _snippet(stdout),
+            )
            raise DocumentConversionError(
-                f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details}"
+                f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details} "
+                f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
            ) from exc
        except Exception as exc:
+            LOGGER.exception("[DocumentConversion] ImageMagick conversion unexpected error")
            raise DocumentConversionError(
                f"ImageMagick conversion failed for {os.path.basename(file_path)}: {exc}"
            ) from exc
--- a/electron/servers/fastapi/services/documents_loader.py
+++ b/electron/servers/fastapi/services/documents_loader.py
@ -1,8 +1,9 @@
 import asyncio
+import logging
 import os
 import tempfile
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import Any, List, Optional, Tuple

 import pdfplumber
 from fastapi import HTTPException
@ -22,9 +23,11 @@ from utils.ocr_language import presentation_language_to_ocr_code

 # Optional fallback converter (primarily useful on Windows)
 try:
-    from services.lightweight_document_service import DocumentService
+    from services.lightweight_document_service import DocumentService as DocumentServiceCls
 except Exception:
-    DocumentService = None
+    DocumentServiceCls = None
+
+LOGGER = logging.getLogger(__name__)


 class DocumentsLoader:
@ -38,7 +41,9 @@ class DocumentsLoader:
        self._ocr_language = presentation_language_to_ocr_code(presentation_language)
        self.liteparse_service = LiteParseService()
        self.document_conversion_service = DocumentConversionService()
-        self.document_service = DocumentService() if DocumentService is not None else None
+        self.document_service: Any = (
+            DocumentServiceCls() if DocumentServiceCls is not None else None
+        )

        self._documents: List[str] = []
        self._images: List[List[str]] = []
@ -69,9 +74,14 @@ class DocumentsLoader:
                )

            document = ""
-            imgs = []
+            imgs: List[str] = []

            extension = Path(file_path).suffix.lower()
+            LOGGER.info(
+                "[DocumentsLoader] Processing file=%s extension=%s",
+                file_path,
+                extension,
+            )

            if extension in PDF_EXTENSIONS:
                document, imgs = await self.load_pdf(
@ -107,13 +117,18 @@ class DocumentsLoader:
        load_images: bool,
        temp_dir: Optional[str] = None,
    ) -> Tuple[str, List[str]]:
-        image_paths = []
+        image_paths: List[str] = []
        document: str = ""

        if load_text:
            document = await asyncio.to_thread(self._parse_with_liteparse, file_path)

        if load_images:
+            if temp_dir is None:
+                raise HTTPException(
+                    status_code=400,
+                    detail="temp_dir is required when load_images is true",
+                )
            image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)

        return document, image_paths
@ -154,16 +169,27 @@ class DocumentsLoader:

    def _parse_with_liteparse(self, file_path: str) -> str:
        try:
+            LOGGER.info("[DocumentsLoader] LiteParse start file=%s", file_path)
            return self.liteparse_service.parse_to_markdown(
                file_path,
                ocr_enabled=True,
                ocr_language=self._ocr_language,
            )
        except (LiteParseError, DocumentConversionError) as exc:
+            LOGGER.warning(
+                "[DocumentsLoader] Primary parse failed file=%s error=%s",
+                file_path,
+                exc,
+            )
            if self.document_service is not None:
                try:
+                    LOGGER.info("[DocumentsLoader] Trying fallback parser file=%s", file_path)
                    return self.document_service.parse_to_markdown(file_path)
                except Exception:
+                    LOGGER.exception(
+                        "[DocumentsLoader] Fallback parser failed file=%s",
+                        file_path,
+                    )
                    pass
            raise HTTPException(
                status_code=500,
--- a/electron/servers/fastapi/services/liteparse_service.py
+++ b/electron/servers/fastapi/services/liteparse_service.py
@ -1,13 +1,41 @@
 import json
+import logging
 import os
 import subprocess
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Mapping, Tuple


 class LiteParseError(Exception):
    pass


+LOGGER = logging.getLogger(__name__)
+_LOG_SNIPPET_LIMIT = 600
+
+
+def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
+    text = (value or "").strip()
+    if not text:
+        return "<empty>"
+    if len(text) <= limit:
+        return text
+    return f"{text[:limit]}... [truncated {len(text) - limit} chars]"
+
+
+def _command_str(parts: list[str]) -> str:
+    return " ".join(json.dumps(part) for part in parts)
+
+
+def _subprocess_text_kwargs() -> Mapping[str, object]:
+    """Decode subprocess output consistently across platforms.
+
+    Windows defaults to a locale-dependent code page (often cp1252), which can
+    crash while decoding UTF-8 output from Node tools. Use UTF-8 and replace
+    undecodable bytes to keep parsing resilient.
+    """
+    return {"text": True, "encoding": "utf-8", "errors": "replace"}
+
+
 class LiteParseService:
    def __init__(self, timeout_seconds: int = 180):
        self.timeout_seconds = timeout_seconds
@ -128,9 +156,9 @@ class LiteParseService:
                cwd=self.runner_dir,
                check=True,
                capture_output=True,
-                text=True,
                timeout=10,
                env=self._build_node_env(),
+                **_subprocess_text_kwargs(),
            )
        except Exception as exc:
            return False, f"Node.js runtime is unavailable: {exc}"
@ -156,9 +184,9 @@ class LiteParseService:
                cwd=self._npm_project_root,
                check=True,
                capture_output=True,
-                text=True,
                timeout=20,
                env=self._build_node_env(),
+                **_subprocess_text_kwargs(),
            )
        except Exception as exc:
            return False, f"LiteParse dependency is unavailable: {exc}"
@ -205,21 +233,51 @@ class LiteParseService:
        if tessdata:
            command.extend(["--tessdata-path", tessdata])

+        LOGGER.info(
+            "[LiteParse] Parsing file=%s ocr_enabled=%s ocr_language=%s",
+            file_path,
+            ocr_enabled,
+            ocr_language,
+        )
+
        process = subprocess.run(
            command,
            cwd=self._npm_project_root,
            capture_output=True,
-            text=True,
            timeout=self.timeout_seconds,
            env=self._build_node_env(),
+            **_subprocess_text_kwargs(),
        )
-        payload = self._decode_runner_output(process.stdout)
+        LOGGER.info(
+            "[LiteParse] Command finished returncode=%s command=%s",
+            process.returncode,
+            _command_str(command),
+        )
+
+        payload: Dict[str, Any]
+        try:
+            payload = self._decode_runner_output(process.stdout)
+        except LiteParseError as exc:
+            raise LiteParseError(
+                f"{exc}; returncode={process.returncode}; "
+                f"stderr={_snippet(process.stderr)}; stdout={_snippet(process.stdout)}"
+            ) from exc

        if process.returncode != 0:
            message = payload.get("error") or process.stderr.strip() or "Unknown error"
+            LOGGER.error(
+                "[LiteParse] Parse failed returncode=%s stderr=%s stdout=%s",
+                process.returncode,
+                _snippet(process.stderr),
+                _snippet(process.stdout),
+            )
            raise LiteParseError(message)

        if not payload.get("ok"):
+            LOGGER.error(
+                "[LiteParse] Runner returned not-ok payload=%s",
+                _snippet(json.dumps(payload)),
+            )
            raise LiteParseError(payload.get("error") or "LiteParse parse failed")

        return payload