hm_qc/checks/HM_image_parse.py
2025-11-13 13:41:31 +02:00

108 lines
3.7 KiB
Python

import os
import shutil
import logging
from PIL import Image
# Modes Pillow's JPEG encoder can write directly; anything else (RGBA, LA, P,
# I, F, ...) must be converted before the debug copy is saved.
_JPEG_WRITABLE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")


def _reset_working_dir(working_dir: str, keep: str) -> None:
    """Create ``working_dir``, or empty it if it already exists.

    The entry that is ``keep`` (or a sub-directory that contains ``keep``) is
    left in place so the cleanup never deletes the file about to be parsed.
    """
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
        return

    protected = os.path.normcase(os.path.realpath(keep))
    for item in os.listdir(working_dir):
        item_path = os.path.join(working_dir, item)
        item_real = os.path.normcase(os.path.realpath(item_path))
        if protected == item_real or protected.startswith(item_real + os.sep):
            # The input file lives here; removing it would make parsing fail.
            continue
        if os.path.isfile(item_path) or os.path.islink(item_path):
            os.remove(item_path)
        else:
            shutil.rmtree(item_path)


def _save_debug_copy(image, path: str) -> None:
    """Write ``image`` to ``path`` as a JPEG, converting to RGB when required.

    Only the copy written to disk is converted; the caller's image object is
    not modified. Alpha/palette information is dropped by the conversion.
    """
    if image.mode not in _JPEG_WRITABLE_MODES:
        image = image.convert('RGB')
    image.save(path, 'JPEG')


def run_check(config: dict, context: dict, check_id: str):
    """
    QC check that handles static image file parsing (JPG, PNG, PSD).

    Loads the image as a PIL object and stores it in ``context[check_id]`` for
    downstream checks. Unlike PDF parsing, no text is extracted;
    ``extracted_text`` is always an empty string.

    Args:
        config: Check configuration. Reads ``input_file`` (path to the image)
            and ``working_dir`` (defaults to ``"working"``).
        context: Shared dict; the parse result (or error info) is written
            under ``check_id``.
        check_id: Key under which this check stores its data in ``context``.

    Returns:
        A dict with ``status`` set to ``"passed"`` (plus ``details``) or
        ``"error"`` (plus ``error_message``). Failures are reported through
        the returned dict rather than raised.

    Side effects:
        Empties (or creates) ``working_dir`` and writes ``parsed_image.jpg``
        into it. The input file itself is never removed by the cleanup.
    """
    input_file = config.get("input_file")
    working_dir = config.get("working_dir", "working")

    if not input_file or not os.path.isfile(input_file):
        return {
            "status": "error",
            "error_message": f"Image file '{input_file}' not provided or does not exist."
        }

    # Validate file extension
    valid_extensions = ['.jpg', '.jpeg', '.png', '.psd']
    file_ext = os.path.splitext(input_file)[1].lower()
    if file_ext not in valid_extensions:
        return {
            "status": "error",
            "error_message": f"Input file '{input_file}' is not a supported image format. Supported: {', '.join(valid_extensions)}"
        }

    # Prepare working directory
    try:
        _reset_working_dir(working_dir, keep=input_file)
    except Exception as e:
        return {
            "status": "error",
            "error_message": f"Failed to prepare working directory '{working_dir}': {e}"
        }

    filename = os.path.basename(input_file)

    try:
        # Load image as PIL object; copy() detaches the pixel data from the
        # file handle so the image stays usable after the file is closed.
        with Image.open(input_file) as img:
            # For PSD files, PIL will load the composite image
            parsed_image = img.copy()
            # copy() does not carry the format over, so read metadata from img
            image_format = img.format
            image_mode = img.mode
            image_size = img.size

        # PSD composites in exotic modes (e.g. CMYK) are normalised to RGB for
        # downstream checks; RGB/RGBA composites are passed through untouched.
        if image_format == 'PSD' and parsed_image.mode not in ('RGB', 'RGBA'):
            parsed_image = parsed_image.convert('RGB')

        # Save a copy to the working directory for debugging. The copy is
        # always a JPEG, so modes JPEG cannot encode (RGBA, P, LA, ...) are
        # converted for the file only.
        saved_image_path = os.path.join(working_dir, "parsed_image.jpg")
        _save_debug_copy(parsed_image, saved_image_path)

        # Store in context
        context[check_id] = {
            "filename": filename,
            "input_file_path": input_file,
            "parsed_image": parsed_image,
            "image_format": image_format,
            "image_mode": image_mode,
            "image_size": image_size,
            "saved_image_path": saved_image_path,
            "extracted_text": ""  # No text extraction for static images
        }
        return {
            "status": "passed",
            "details": {
                "message": "Image parsed successfully.",
                "working_dir": working_dir,
                "filename": filename,
                "format": image_format,
                "size": f"{image_size[0]}x{image_size[1]}",
                "mode": image_mode,
                "saved_path": saved_image_path
            }
        }
    except Exception as e:
        # Report the failure through the result dict so the QC run can continue.
        logging.error("Failed to parse image '%s': %s", input_file, e)
        context[check_id] = {
            "filename": filename,
            "error": str(e),
            "partial_extraction": True
        }
        return {
            "status": "error",
            "error_message": f"Failed to parse image '{input_file}': {e}"
        }