feat: enhance document conversion services with improved logging and error handling
- Added logging to track the start and completion of document conversions in DocumentConversionService. - Implemented detailed error logging for subprocess failures in both DocumentConversionService and LiteParseService. - Introduced helper functions for logging command strings and output snippets to improve debuggability. - Updated DocumentsLoader to log file processing details and fallback parsing attempts.
This commit is contained in:
parent
429cb6c09b
commit
c7e9a6872b
3 changed files with 176 additions and 22 deletions
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
import subprocess
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
|
@ -8,6 +9,23 @@ class DocumentConversionError(Exception):
|
|||
pass
|
||||
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
_LOG_SNIPPET_LIMIT = 600
|
||||
|
||||
|
||||
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
|
||||
text = (value or "").strip()
|
||||
if not text:
|
||||
return "<empty>"
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
return f"{text[:limit]}... [truncated {len(text) - limit} chars]"
|
||||
|
||||
|
||||
def _command_str(parts: list[str]) -> str:
|
||||
return " ".join(repr(part) for part in parts)
|
||||
|
||||
|
||||
def _windows_hidden_subprocess_kwargs() -> Dict[str, object]:
|
||||
if os.name != "nt":
|
||||
return {}
|
||||
|
|
@ -39,6 +57,8 @@ class DocumentConversionService:
|
|||
[command, *args],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
encoding="utf-8",
|
||||
errors="replace",
|
||||
timeout=10,
|
||||
check=False,
|
||||
**_windows_hidden_subprocess_kwargs(),
|
||||
|
|
@ -71,23 +91,39 @@ class DocumentConversionService:
|
|||
}
|
||||
|
||||
try:
|
||||
command = [
|
||||
self.soffice_binary,
|
||||
"--headless",
|
||||
"--convert-to",
|
||||
"pdf",
|
||||
"--outdir",
|
||||
output_dir,
|
||||
file_path,
|
||||
]
|
||||
LOGGER.info(
|
||||
"[DocumentConversion] LibreOffice conversion start input=%s output_dir=%s",
|
||||
file_path,
|
||||
output_dir,
|
||||
)
|
||||
subprocess.run(
|
||||
[
|
||||
self.soffice_binary,
|
||||
"--headless",
|
||||
"--convert-to",
|
||||
"pdf",
|
||||
"--outdir",
|
||||
output_dir,
|
||||
file_path,
|
||||
],
|
||||
command,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
encoding="utf-8",
|
||||
errors="replace",
|
||||
timeout=timeout_seconds,
|
||||
**_windows_hidden_subprocess_kwargs(),
|
||||
)
|
||||
LOGGER.info(
|
||||
"[DocumentConversion] LibreOffice conversion complete input=%s",
|
||||
file_path,
|
||||
)
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
LOGGER.error(
|
||||
"[DocumentConversion] LibreOffice timed out command=%s",
|
||||
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
|
||||
)
|
||||
raise DocumentConversionError(
|
||||
f"LibreOffice conversion timed out for {os.path.basename(file_path)}"
|
||||
) from exc
|
||||
|
|
@ -95,10 +131,19 @@ class DocumentConversionService:
|
|||
stderr = (exc.stderr or "").strip()
|
||||
stdout = (exc.stdout or "").strip()
|
||||
details = stderr or stdout or str(exc)
|
||||
LOGGER.error(
|
||||
"[DocumentConversion] LibreOffice failed code=%s command=%s stderr=%s stdout=%s",
|
||||
exc.returncode,
|
||||
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
|
||||
_snippet(stderr),
|
||||
_snippet(stdout),
|
||||
)
|
||||
raise DocumentConversionError(
|
||||
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details}"
|
||||
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {details} "
|
||||
f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
|
||||
) from exc
|
||||
except Exception as exc:
|
||||
LOGGER.exception("[DocumentConversion] LibreOffice conversion unexpected error")
|
||||
raise DocumentConversionError(
|
||||
f"LibreOffice conversion failed for {os.path.basename(file_path)}: {exc}"
|
||||
) from exc
|
||||
|
|
@ -133,15 +178,31 @@ class DocumentConversionService:
|
|||
command = [self.imagemagick_binary, file_path, str(output_path)]
|
||||
|
||||
try:
|
||||
LOGGER.info(
|
||||
"[DocumentConversion] ImageMagick conversion start input=%s output=%s command=%s",
|
||||
file_path,
|
||||
output_path,
|
||||
_command_str(command),
|
||||
)
|
||||
subprocess.run(
|
||||
command,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
encoding="utf-8",
|
||||
errors="replace",
|
||||
timeout=timeout_seconds,
|
||||
**_windows_hidden_subprocess_kwargs(),
|
||||
)
|
||||
LOGGER.info(
|
||||
"[DocumentConversion] ImageMagick conversion complete output=%s",
|
||||
output_path,
|
||||
)
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
LOGGER.error(
|
||||
"[DocumentConversion] ImageMagick timed out command=%s",
|
||||
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
|
||||
)
|
||||
raise DocumentConversionError(
|
||||
f"ImageMagick conversion timed out for {os.path.basename(file_path)}"
|
||||
) from exc
|
||||
|
|
@ -149,10 +210,19 @@ class DocumentConversionService:
|
|||
stderr = (exc.stderr or "").strip()
|
||||
stdout = (exc.stdout or "").strip()
|
||||
details = stderr or stdout or str(exc)
|
||||
LOGGER.error(
|
||||
"[DocumentConversion] ImageMagick failed code=%s command=%s stderr=%s stdout=%s",
|
||||
exc.returncode,
|
||||
_command_str(exc.cmd if isinstance(exc.cmd, list) else [str(exc.cmd)]),
|
||||
_snippet(stderr),
|
||||
_snippet(stdout),
|
||||
)
|
||||
raise DocumentConversionError(
|
||||
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details}"
|
||||
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {details} "
|
||||
f"(stderr={_snippet(stderr)}; stdout={_snippet(stdout)})"
|
||||
) from exc
|
||||
except Exception as exc:
|
||||
LOGGER.exception("[DocumentConversion] ImageMagick conversion unexpected error")
|
||||
raise DocumentConversionError(
|
||||
f"ImageMagick conversion failed for {os.path.basename(file_path)}: {exc}"
|
||||
) from exc
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
import pdfplumber
|
||||
from fastapi import HTTPException
|
||||
|
|
@ -22,9 +23,11 @@ from utils.ocr_language import presentation_language_to_ocr_code
|
|||
|
||||
# Optional fallback converter (primarily useful on Windows)
|
||||
try:
|
||||
from services.lightweight_document_service import DocumentService
|
||||
from services.lightweight_document_service import DocumentService as DocumentServiceCls
|
||||
except Exception:
|
||||
DocumentService = None
|
||||
DocumentServiceCls = None
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentsLoader:
|
||||
|
|
@ -38,7 +41,9 @@ class DocumentsLoader:
|
|||
self._ocr_language = presentation_language_to_ocr_code(presentation_language)
|
||||
self.liteparse_service = LiteParseService()
|
||||
self.document_conversion_service = DocumentConversionService()
|
||||
self.document_service = DocumentService() if DocumentService is not None else None
|
||||
self.document_service: Any = (
|
||||
DocumentServiceCls() if DocumentServiceCls is not None else None
|
||||
)
|
||||
|
||||
self._documents: List[str] = []
|
||||
self._images: List[List[str]] = []
|
||||
|
|
@ -69,9 +74,14 @@ class DocumentsLoader:
|
|||
)
|
||||
|
||||
document = ""
|
||||
imgs = []
|
||||
imgs: List[str] = []
|
||||
|
||||
extension = Path(file_path).suffix.lower()
|
||||
LOGGER.info(
|
||||
"[DocumentsLoader] Processing file=%s extension=%s",
|
||||
file_path,
|
||||
extension,
|
||||
)
|
||||
|
||||
if extension in PDF_EXTENSIONS:
|
||||
document, imgs = await self.load_pdf(
|
||||
|
|
@ -107,13 +117,18 @@ class DocumentsLoader:
|
|||
load_images: bool,
|
||||
temp_dir: Optional[str] = None,
|
||||
) -> Tuple[str, List[str]]:
|
||||
image_paths = []
|
||||
image_paths: List[str] = []
|
||||
document: str = ""
|
||||
|
||||
if load_text:
|
||||
document = await asyncio.to_thread(self._parse_with_liteparse, file_path)
|
||||
|
||||
if load_images:
|
||||
if temp_dir is None:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="temp_dir is required when load_images is true",
|
||||
)
|
||||
image_paths = await self.get_page_images_from_pdf_async(file_path, temp_dir)
|
||||
|
||||
return document, image_paths
|
||||
|
|
@ -154,16 +169,27 @@ class DocumentsLoader:
|
|||
|
||||
def _parse_with_liteparse(self, file_path: str) -> str:
|
||||
try:
|
||||
LOGGER.info("[DocumentsLoader] LiteParse start file=%s", file_path)
|
||||
return self.liteparse_service.parse_to_markdown(
|
||||
file_path,
|
||||
ocr_enabled=True,
|
||||
ocr_language=self._ocr_language,
|
||||
)
|
||||
except (LiteParseError, DocumentConversionError) as exc:
|
||||
LOGGER.warning(
|
||||
"[DocumentsLoader] Primary parse failed file=%s error=%s",
|
||||
file_path,
|
||||
exc,
|
||||
)
|
||||
if self.document_service is not None:
|
||||
try:
|
||||
LOGGER.info("[DocumentsLoader] Trying fallback parser file=%s", file_path)
|
||||
return self.document_service.parse_to_markdown(file_path)
|
||||
except Exception:
|
||||
LOGGER.exception(
|
||||
"[DocumentsLoader] Fallback parser failed file=%s",
|
||||
file_path,
|
||||
)
|
||||
pass
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
|
|
|
|||
|
|
@ -1,13 +1,41 @@
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from typing import Any, Dict, Tuple
|
||||
from typing import Any, Dict, Mapping, Tuple
|
||||
|
||||
|
||||
class LiteParseError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
_LOG_SNIPPET_LIMIT = 600
|
||||
|
||||
|
||||
def _snippet(value: str, limit: int = _LOG_SNIPPET_LIMIT) -> str:
|
||||
text = (value or "").strip()
|
||||
if not text:
|
||||
return "<empty>"
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
return f"{text[:limit]}... [truncated {len(text) - limit} chars]"
|
||||
|
||||
|
||||
def _command_str(parts: list[str]) -> str:
|
||||
return " ".join(json.dumps(part) for part in parts)
|
||||
|
||||
|
||||
def _subprocess_text_kwargs() -> Mapping[str, object]:
|
||||
"""Decode subprocess output consistently across platforms.
|
||||
|
||||
Windows defaults to a locale-dependent code page (often cp1252), which can
|
||||
crash while decoding UTF-8 output from Node tools. Use UTF-8 and replace
|
||||
undecodable bytes to keep parsing resilient.
|
||||
"""
|
||||
return {"text": True, "encoding": "utf-8", "errors": "replace"}
|
||||
|
||||
|
||||
class LiteParseService:
|
||||
def __init__(self, timeout_seconds: int = 180):
|
||||
self.timeout_seconds = timeout_seconds
|
||||
|
|
@ -128,9 +156,9 @@ class LiteParseService:
|
|||
cwd=self.runner_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
env=self._build_node_env(),
|
||||
**_subprocess_text_kwargs(),
|
||||
)
|
||||
except Exception as exc:
|
||||
return False, f"Node.js runtime is unavailable: {exc}"
|
||||
|
|
@ -156,9 +184,9 @@ class LiteParseService:
|
|||
cwd=self._npm_project_root,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=20,
|
||||
env=self._build_node_env(),
|
||||
**_subprocess_text_kwargs(),
|
||||
)
|
||||
except Exception as exc:
|
||||
return False, f"LiteParse dependency is unavailable: {exc}"
|
||||
|
|
@ -205,21 +233,51 @@ class LiteParseService:
|
|||
if tessdata:
|
||||
command.extend(["--tessdata-path", tessdata])
|
||||
|
||||
LOGGER.info(
|
||||
"[LiteParse] Parsing file=%s ocr_enabled=%s ocr_language=%s",
|
||||
file_path,
|
||||
ocr_enabled,
|
||||
ocr_language,
|
||||
)
|
||||
|
||||
process = subprocess.run(
|
||||
command,
|
||||
cwd=self._npm_project_root,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=self.timeout_seconds,
|
||||
env=self._build_node_env(),
|
||||
**_subprocess_text_kwargs(),
|
||||
)
|
||||
payload = self._decode_runner_output(process.stdout)
|
||||
LOGGER.info(
|
||||
"[LiteParse] Command finished returncode=%s command=%s",
|
||||
process.returncode,
|
||||
_command_str(command),
|
||||
)
|
||||
|
||||
payload: Dict[str, Any]
|
||||
try:
|
||||
payload = self._decode_runner_output(process.stdout)
|
||||
except LiteParseError as exc:
|
||||
raise LiteParseError(
|
||||
f"{exc}; returncode={process.returncode}; "
|
||||
f"stderr={_snippet(process.stderr)}; stdout={_snippet(process.stdout)}"
|
||||
) from exc
|
||||
|
||||
if process.returncode != 0:
|
||||
message = payload.get("error") or process.stderr.strip() or "Unknown error"
|
||||
LOGGER.error(
|
||||
"[LiteParse] Parse failed returncode=%s stderr=%s stdout=%s",
|
||||
process.returncode,
|
||||
_snippet(process.stderr),
|
||||
_snippet(process.stdout),
|
||||
)
|
||||
raise LiteParseError(message)
|
||||
|
||||
if not payload.get("ok"):
|
||||
LOGGER.error(
|
||||
"[LiteParse] Runner returned not-ok payload=%s",
|
||||
_snippet(json.dumps(payload)),
|
||||
)
|
||||
raise LiteParseError(payload.get("error") or "LiteParse parse failed")
|
||||
|
||||
return payload
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue