# hm_qc/checks/HM_language_validate.py

from checks.analyze_with_gpt import analyze_with_gpt

def _normalize_language_code(raw_code: str) -> str:
    """Return *raw_code* upper-cased in 'XX' or 'XX-YY' form.

    Surrounding whitespace, quotes, backticks and periods are removed so a
    reply such as "'en-US'." is still recognised. Underscores become hyphens.
    If either of the first two hyphen-separated parts is not exactly two
    characters long, only the first part is kept.
    """
    code = raw_code.strip(" \t\r\n'\"`.").upper().replace("_", "-")
    if "-" in code:
        parts = code.split("-")
        if len(parts[0]) != 2 or len(parts[1]) != 2:
            code = parts[0]  # Fall back to primary language
    return code


def run_check(config: dict, context: dict, check_id: str) -> dict:
    """
    Validate the document language against the language code from its filename.

    Reads the parse results stored in ``context`` under ``HM_parse`` /
    ``HM_filename_parse`` (PDF workflow) or ``HM_image_parse`` /
    ``HM_image_filename_parse`` (static image workflow). Expected codes
    containing "cen" or "gen" are accepted automatically; any other code is
    compared with the language reported by ``analyze_with_gpt`` for the
    extracted text and/or the parsed image (expected to be a PIL image).

    Args:
        config: Check configuration. Not used by this check.
        context: Shared QC context. The result dict is written to
            ``context[check_id]`` whenever a detection result is produced
            (success, mismatch, or detection failure).
        check_id: Key under which the result is stored in ``context``.

    Returns:
        A dict with ``"status"`` (``"passed"`` or ``"error"``), ``"details"``,
        and, for errors, ``"error_message"``.
    """
    # --- Validate presence of required context data (support both PDF and image workflows) ---
    # `or {}` / `or ""` guard against keys that are present but set to None,
    # which `.get(key, default)` alone does not cover.
    hm_parse_data = context.get("HM_parse") or context.get("HM_image_parse") or {}
    hm_filename_data = (
        context.get("HM_filename_parse")
        or context.get("HM_image_filename_parse")
        or {}
    )

    filename = hm_parse_data.get("filename") or ""
    text_content = hm_parse_data.get("extracted_text") or ""
    parsed_image = hm_parse_data.get("parsed_image")  # Get PIL image
    parsed_fields = hm_filename_data.get("parsed") or {}
    expected_language = (parsed_fields.get("language") or "").lower()

    error_messages = []
    if not filename:
        error_messages.append("Filename missing from HM_parse context")
    if not text_content.strip() and not parsed_image:
        error_messages.append("Both text content and image data missing from HM_parse context")
    if not expected_language:
        error_messages.append("Language code missing from HM_filename_parse context")

    if error_messages:
        return {
            "status": "error",
            "error_message": "Missing critical context data: " + ", ".join(error_messages),
            "details": {
                "available_context": {
                    "HM_parse_keys": list(hm_parse_data.keys()),
                    "HM_filename_parse_keys": list(hm_filename_data.keys()),
                }
            },
        }

    # --- Prepare context result structure ---
    context_result = {
        "filename": filename,
        "expected_language": expected_language.upper(),
        "detected_language": "UNKNOWN",
        "matches": False,
        "isCensorshipRequired": False,
        "validation_method": None,
        "gpt_response": None,
        "image_used": False,
        "image_format": None,
    }

    # --- Handle special CEN/GEN cases first (expected_language is already lower-case) ---
    if "cen" in expected_language:
        context_result.update({
            "detected_language": "CEN",
            "matches": True,
            "isCensorshipRequired": True,
            "validation_method": "auto",
        })
    elif "gen" in expected_language:
        context_result.update({
            "detected_language": "GEN",
            "matches": True,
            "isCensorshipRequired": False,
            "validation_method": "auto",
        })
    else:
        # --- Prepare multimodal analysis with PIL image ---
        images = []
        if parsed_image:
            images = [parsed_image]
            # getattr: tolerate image objects that carry no `format` attribute.
            image_format = getattr(parsed_image, "format", None)
            context_result.update({
                "image_used": True,
                "image_format": image_format.lower() if image_format else None,
            })

        # --- GPT-based multimodal language detection ---
        try:
            prompt = f"""
            Analyze both the text content (if present) and document image (if provided) to identify:
            1. Primary language (ISO 639-1 code)
            2. Regional variation if detectable (ISO 3166-1 alpha-2 country code)

            Use format 'xx' or 'xx-YY' (e.g., 'en-EG' for Egyptian English).
            Consider:
            - Textual content (when available)
            - Writing direction
            - Typographic conventions
            - Cultural references
            - Visual layout characteristics
            - Any visible text in the image

            If uncertain, respond with primary language code only.
            If completely unsure, respond 'UNKNOWN'.

            Text Content (may be empty):
            {text_content}

            Respond ONLY with the language code or 'UNKNOWN'.
            """

            detected_lang_raw = analyze_with_gpt(
                prompt=prompt,
                content=text_content,
                images=images,  # Passing PIL image directly
            ).strip()

            # Handle empty/unexpected responses (including replies that are
            # nothing but quotes or punctuation).
            detected_lang = _normalize_language_code(detected_lang_raw)
            if not detected_lang:
                raise ValueError("LLM returned an empty or invalid language code.")

            # Standardize expected language to use hyphen
            normalized_expected = expected_language.replace("_", "-")

            # Only the primary language has to agree; a regional difference
            # (e.g. expected 'en-us', detected 'EN-GB') still counts as a match.
            # Full-code equality implies primary equality, so one test suffices.
            expected_primary = normalized_expected.split("-")[0].lower()
            detected_primary = detected_lang.split("-")[0].lower()
            matches = detected_primary == expected_primary

            context_result.update({
                "detected_language": detected_lang,
                "matches": matches,
                "validation_method": "Multimodal LLM analysis",
                "gpt_response": detected_lang_raw,
            })

        except Exception as e:
            # Boundary handler: any failure in the detection step is reported
            # as a check error rather than propagated to the QC runner.
            context_result.update({
                "error": str(e),
                "validation_method": "failed",
            })
            context[check_id] = context_result
            return {
                "status": "error",
                "error_message": f"Language detection error: {str(e)}",
                "details": context_result,
            }

    # Record the outcome for later checks on both the mismatch and the
    # success path, consistent with the detection-failure path above.
    context[check_id] = context_result

    # --- Final validation check ---
    # CEN/GEN codes already set matches=True above, so this flag is sufficient.
    if not context_result["matches"]:
        return {
            "status": "error",
            "error_message": (
                f"Language mismatch: Expected {expected_language.upper()}, "
                f"detected {context_result['detected_language']}"
            ),
            "details": context_result,
        }

    # --- Successful validation ---
    return {
        "status": "passed",
        "details": context_result,
    }