diff --git a/electron/servers/fastapi/services/document_conversion_service.py b/electron/servers/fastapi/services/document_conversion_service.py index 497f12ec..8de7ec7f 100644 --- a/electron/servers/fastapi/services/document_conversion_service.py +++ b/electron/servers/fastapi/services/document_conversion_service.py @@ -1,5 +1,6 @@ import os import subprocess +import logging from pathlib import Path from typing import Dict, List @@ -8,6 +9,23 @@ class DocumentConversionError(Exception): pass +LOGGER = logging.getLogger(__name__) +_LOG_SNIPPET_LIMIT = 600 + + +def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str: + text = (value or "").strip() + if not text: + return "" + if len(text) <= limit: + return text + return f"{text[:limit]}... [truncated {len(text) - limit} chars]" + + +def _command_str(parts: list[str]) -> str: + return " ".join(repr(part) for part in parts) + + def _windows_hidden_subprocess_kwargs() -> Dict[str, object]: if os.name != "nt": return {} @@ -39,6 +57,8 @@ class DocumentConversionService: [command, *args], capture_output=True, text=True, + encoding="utf-8", + errors="replace", timeout=10, check=False, **_windows_hidden_subprocess_kwargs(), @@ -71,23 +91,39 @@ class DocumentConversionService: } try: + command = [ + self.soffice_binary, + "--headless", + "--convert-to", + "pdf", + "--outdir", + output_dir, + file_path, + ] + LOGGER.info( + "[DocumentConversion] LibreOffice conversion start input=%s output_dir=%s", + file_path, + output_dir, + ) subprocess.run( - [ - self.soffice_binary, - "--headless", - "--convert-to", - "pdf", - "--outdir", - output_dir, - file_path, - ], + command, check=True, capture_output=True, text=True, + encoding="utf-8", + errors="replace", timeout=timeout_seconds, **_windows_hidden_subprocess_kwargs(), ) + LOGGER.info( + "[DocumentConversion] LibreOffice conversion complete input=%s", + file_path, + ) except subprocess.TimeoutExpired as exc: + LOGGER.error( + "[DocumentConversion] LibreOffice timed out command=%s", + _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]), + ) raise DocumentConversionError( f"LibreOffice conversion timed out for {os.path.basename(file_path)}" ) from exc @@ -95,10 +131,19 @@ class DocumentConversionService: stderr = (exc.stderr or "").strip() stdout = (exc.stdout or "").strip() details = stderr or stdout or str(exc) + LOGGER.error( + "[DocumentConversion] LibreOffice failed code=%s command=%s stderr=%s stdout=%s", + exc.returncode, + _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]), + _snippet(stderr), + _snippet(stdout), + ) raise DocumentConversionError( - f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details}" + f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details} " + f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})" ) from exc except Exception as exc: + LOGGER.exception("[DocumentConversion] LibreOffice conversion unexpected error") raise DocumentConversionError( f"LibreOffice conversion failed for {os.path.basename(file_path)}: {exc}" ) from exc @@ -133,15 +178,31 @@ class DocumentConversionService: command = [self.imagemagick_binary, file_path, str(output_path)] try: + LOGGER.info( + "[DocumentConversion] ImageMagick conversion start input=%s output=%s command=%s", + file_path, + output_path, + _command_str(command), + ) subprocess.run( command, check=True, capture_output=True, text=True, + encoding="utf-8", + errors="replace", timeout=timeout_seconds, **_windows_hidden_subprocess_kwargs(), ) + LOGGER.info( + "[DocumentConversion] ImageMagick conversion complete output=%s", + output_path, + ) except subprocess.TimeoutExpired as exc: + LOGGER.error( + "[DocumentConversion] ImageMagick timed out command=%s", + _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]), + ) raise DocumentConversionError( f"ImageMagick conversion timed out for {os.path.basename(file_path)}" ) from exc @@ -149,10 +210,19 @@ class DocumentConversionService: stderr = (exc.stderr or "").strip() stdout = (exc.stdout or "").strip() details = stderr or stdout or str(exc) + LOGGER.error( + "[DocumentConversion] ImageMagick failed code=%s command=%s stderr=%s stdout=%s", + exc.returncode, + _command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]), + _snippet(stderr), + _snippet(stdout), + ) raise DocumentConversionError( - f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details}" + f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details} " + f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})" ) from exc except Exception as exc: + LOGGER.exception("[DocumentConversion] ImageMagick conversion unexpected error") raise DocumentConversionError( f"ImageMagick conversion failed for {os.path.basename(file_path)}: {exc}" ) from exc diff --git a/electron/servers/fastapi/services/documents_loader.py b/electron/servers/fastapi/services/documents_loader.py index 6dbc3f5c..e65a659a 100644 --- a/electron/servers/fastapi/services/documents_loader.py +++ b/electron/servers/fastapi/services/documents_loader.py @@ -1,8 +1,9 @@ import asyncio +import logging import os import tempfile from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, List, Optional, Tuple import pdfplumber from fastapi import HTTPException @@ -22,9 +23,11 @@ from utils.ocr_language import presentation_language_to_ocr_code # Optional fallback converter (primarily useful on Windows) try: - from services.lightweight_document_service import DocumentService + from services.lightweight_document_service import DocumentService as DocumentServiceCls except Exception: - DocumentService = None + DocumentServiceCls = None + +LOGGER = logging.getLogger(__name__) class DocumentsLoader: @@ -38,7 +41,9 @@ class DocumentsLoader: self._ocr_language = presentation_language_to_ocr_code(presentation_language) self.liteparse_service = LiteParseService() self.document_conversion_service = DocumentConversionService() - self.document_service = DocumentService() if DocumentService is not None else None + self.document_service: Any = ( + DocumentServiceCls() if DocumentServiceCls is not None else None + ) self._documents: List[str] = [] self._images: List[List[str]] = [] @@ -69,9 +74,14 @@ class DocumentsLoader: ) document = "" - imgs = [] + imgs: List[str] = [] extension = Path(file_path).suffix.lower() + LOGGER.info( + "[DocumentsLoader] Processing file=%s extension=%s", + file_path, + extension, + ) if extension in PDF_EXTENSIONS: document, imgs = await self.load_pdf( @@ -107,13 +117,18 @@ class DocumentsLoader: load_images: bool, temp_dir: Optional[str] = None, ) -> Tuple[str, List[str]]: - image_paths = [] + image_paths: List[str] = [] document: str = "" if load_text: document = await asyncio.to_thread(self._parse_with_liteparse, file_path) if load_images: + if temp_dir is None: + raise HTTPException( + status_code=400, + detail="temp_dir is required when load_images is true", + ) image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir) return document, image_paths @@ -154,16 +169,27 @@ class DocumentsLoader: def _parse_with_liteparse(self, file_path: str) -> str: try: + LOGGER.info("[DocumentsLoader] LiteParse start file=%s", file_path) return self.liteparse_service.parse_to_markdown( file_path, ocr_enabled=True, ocr_language=self._ocr_language, ) except (LiteParseError, DocumentConversionError) as exc: + LOGGER.warning( + "[DocumentsLoader] Primary parse failed file=%s error=%s", + file_path, + exc, + ) if self.document_service is not None: try: + LOGGER.info("[DocumentsLoader] Trying fallback parser file=%s", file_path) return self.document_service.parse_to_markdown(file_path) except Exception: + LOGGER.exception( + "[DocumentsLoader] Fallback parser failed file=%s", + file_path, + ) pass raise HTTPException( status_code=500, diff --git a/electron/servers/fastapi/services/liteparse_service.py b/electron/servers/fastapi/services/liteparse_service.py index bb538f50..dca0835d 100644 --- a/electron/servers/fastapi/services/liteparse_service.py +++ b/electron/servers/fastapi/services/liteparse_service.py @@ -1,13 +1,41 @@ import json +import logging import os import subprocess -from typing import Any, Dict, Tuple +from typing import Any, Dict, Mapping, Tuple class LiteParseError(Exception): pass +LOGGER = logging.getLogger(__name__) +_LOG_SNIPPET_LIMIT = 600 + + +def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str: + text = (value or "").strip() + if not text: + return "" + if len(text) <= limit: + return text + return f"{text[:limit]}... [truncated {len(text) - limit} chars]" + + +def _command_str(parts: list[str]) -> str: + return " ".join(json.dumps(part) for part in parts) + + +def _subprocess_text_kwargs() -> Mapping[str, object]: + """Decode subprocess output consistently across platforms. + + Windows defaults to a locale-dependent code page (often cp1252), which can + crash while decoding UTF-8 output from Node tools. Use UTF-8 and replace + undecodable bytes to keep parsing resilient. + """ + return {"text": True, "encoding": "utf-8", "errors": "replace"} + + class LiteParseService: def __init__(self, timeout_seconds: int = 180): self.timeout_seconds = timeout_seconds @@ -128,9 +156,9 @@ class LiteParseService: cwd=self.runner_dir, check=True, capture_output=True, - text=True, timeout=10, env=self._build_node_env(), + **_subprocess_text_kwargs(), ) except Exception as exc: return False, f"Node.js runtime is unavailable: {exc}" @@ -156,9 +184,9 @@ class LiteParseService: cwd=self._npm_project_root, check=True, capture_output=True, - text=True, timeout=20, env=self._build_node_env(), + **_subprocess_text_kwargs(), ) except Exception as exc: return False, f"LiteParse dependency is unavailable: {exc}" @@ -205,21 +233,51 @@ class LiteParseService: if tessdata: command.extend(["--tessdata-path", tessdata]) + LOGGER.info( + "[LiteParse] Parsing file=%s ocr_enabled=%s ocr_language=%s", + file_path, + ocr_enabled, + ocr_language, + ) + process = subprocess.run( command, cwd=self._npm_project_root, capture_output=True, - text=True, timeout=self.timeout_seconds, env=self._build_node_env(), + **_subprocess_text_kwargs(), ) - payload = self._decode_runner_output(process.stdout) + LOGGER.info( + "[LiteParse] Command finished returncode=%s command=%s", + process.returncode, + _command_str(command), + ) + + payload: Dict[str, Any] + try: + payload = self._decode_runner_output(process.stdout) + except LiteParseError as exc: + raise LiteParseError( + f"{exc}; returncode={process.returncode}; " + f"stderr={_snippet(process.stderr)}; stdout={_snippet(process.stdout)}" + ) from exc if process.returncode != 0: message = payload.get("error") or process.stderr.strip() or "Unknown error" + LOGGER.error( + "[LiteParse] Parse failed returncode=%s stderr=%s stdout=%s", + process.returncode, + _snippet(process.stderr), + _snippet(process.stdout), + ) raise LiteParseError(message) if not payload.get("ok"): + LOGGER.error( + "[LiteParse] Runner returned not-ok payload=%s", + _snippet(json.dumps(payload)), + ) raise LiteParseError(payload.get("error") or "LiteParse parse failed") return payload