# hm_qc/checks/HM_image_parse.py

import os
import shutil
import logging
from PIL import Image

# Image modes the debug JPEG copy is written in without conversion; every
# other mode (RGBA, LA, P, ...) is converted to RGB before saving.
_JPEG_SAFE_MODES = ("L", "RGB", "CMYK")


def _is_inside_dir(path: str, directory: str) -> bool:
    """Return True when *path* is located at or below *directory*.

    Both the lexical (abspath) and the symlink-resolved (realpath) forms are
    compared so that a symlinked input inside the directory is detected too.
    """
    for resolve in (os.path.abspath, os.path.realpath):
        candidate = resolve(path)
        root = resolve(directory)
        try:
            common = os.path.commonpath([candidate, root])
        except ValueError:
            # Raised e.g. for paths on different Windows drives: not nested.
            continue
        if os.path.normcase(common) == os.path.normcase(root):
            return True
    return False


def _reset_working_dir(working_dir: str) -> None:
    """Empty *working_dir* if it exists, otherwise create it."""
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
        return
    for item in os.listdir(working_dir):
        item_path = os.path.join(working_dir, item)
        if os.path.isfile(item_path) or os.path.islink(item_path):
            os.remove(item_path)
        else:
            shutil.rmtree(item_path)


def run_check(config: dict, context: dict, check_id: str) -> dict:
    """
    QC check that handles static image file parsing (JPG, PNG, PSD).

    Loads the image as a PIL object and stores it in context for downstream
    checks. Unlike PDF parsing, this does not extract text content; the
    "extracted_text" entry is always an empty string.

    Args:
        config: Check configuration. Reads "input_file" (path of the image)
            and "working_dir" (defaults to "working"). The working directory
            is emptied (or created) before the image is parsed.
        context: Shared dict; the result is stored under ``context[check_id]``.
            On a parsing failure an entry with "filename", "error" and
            "partial_extraction" is stored instead.
        check_id: Key under which this check writes into ``context``.

    Returns:
        A dict with "status" set to "passed" (plus a "details" dict) or
        "error" (plus an "error_message" string). No exception is propagated
        for missing files, unsupported extensions, working-directory
        failures or image decoding/saving failures.
    """
    input_file = config.get("input_file")
    working_dir = config.get("working_dir", "working")

    if not input_file or not os.path.isfile(input_file):
        return {
            "status": "error",
            "error_message": f"Image file '{input_file}' not provided or does not exist."
        }

    # Validate file extension
    valid_extensions = ['.jpg', '.jpeg', '.png', '.psd']
    file_ext = os.path.splitext(input_file)[1].lower()

    if file_ext not in valid_extensions:
        return {
            "status": "error",
            "error_message": f"Input file '{input_file}' is not a supported image format. Supported: {', '.join(valid_extensions)}"
        }

    # The working directory is wiped below; refuse to continue if that would
    # delete the very file we are about to parse.
    if _is_inside_dir(input_file, working_dir):
        return {
            "status": "error",
            "error_message": (
                f"Input file '{input_file}' is located inside working directory "
                f"'{working_dir}', which is cleared before parsing. "
                "Move the input file or choose a different working_dir."
            )
        }

    # Prepare working directory
    try:
        _reset_working_dir(working_dir)
    except Exception as e:
        return {
            "status": "error",
            "error_message": f"Failed to prepare working directory '{working_dir}': {e}"
        }

    try:
        # Load image as PIL object
        with Image.open(input_file) as img:
            # copy() forces the pixel data to load so the image stays usable
            # after the file handle is closed. For PSD files, PIL loads the
            # composite image.
            parsed_image = img.copy()

            # Metadata of the file as opened (before any conversion below)
            image_format = img.format
            image_mode = img.mode
            image_size = img.size

        # PSD composites in other modes are normalised to RGB for downstream use
        if image_format == 'PSD' and parsed_image.mode not in ('RGB', 'RGBA'):
            parsed_image = parsed_image.convert('RGB')

        # Save a JPEG copy to the working directory for debugging. JPEG cannot
        # store alpha or palette data, so convert only the copy that is
        # written; the image placed in context keeps its mode.
        saved_image_path = os.path.join(working_dir, "parsed_image.jpg")
        debug_image = parsed_image
        if debug_image.mode not in _JPEG_SAFE_MODES:
            debug_image = debug_image.convert('RGB')
        debug_image.save(saved_image_path, 'JPEG')

        filename = os.path.basename(input_file)

        # Store in context
        context[check_id] = {
            "filename": filename,
            "input_file_path": input_file,
            "parsed_image": parsed_image,
            "image_format": image_format,
            "image_mode": image_mode,
            "image_size": image_size,
            "saved_image_path": saved_image_path,
            "extracted_text": ""  # No text extraction for static images
        }

        return {
            "status": "passed",
            "details": {
                "message": "Image parsed successfully.",
                "working_dir": working_dir,
                "filename": filename,
                "format": image_format,
                "size": f"{image_size[0]}x{image_size[1]}",
                "mode": image_mode,
                "saved_path": saved_image_path
            }
        }

    except Exception as e:
        # Boundary handler: any decode/convert/save failure is reported as an
        # error result rather than propagated to the check runner.
        logging.error("Failed to parse image '%s': %s", input_file, e)
        context[check_id] = {
            "filename": os.path.basename(input_file),
            "error": str(e),
            "partial_extraction": True
        }
        return {
            "status": "error",
            "error_message": f"Failed to parse image '{input_file}': {e}"
        }