from checks.analyze_with_gpt import analyze_with_gpt


def _normalize_language_tag(raw_tag: str) -> str:
    """Return an upper-cased 'XX' or 'XX-YY' tag derived from an LLM reply.

    Surrounding quotes, backticks and a trailing period are removed, and
    underscores are treated as hyphens.  The region is kept only when both
    the primary and region segments are exactly two characters long;
    otherwise only the primary segment is returned.  Any segments after the
    region are discarded.
    """
    # Models frequently answer with "'en'" or "en-US." despite instructions.
    tag = raw_tag.strip().strip("'\"`.").strip().upper().replace("_", "-")
    primary, _, remainder = tag.partition("-")
    region = remainder.split("-")[0]
    if len(primary) == 2 and len(region) == 2:
        return f"{primary}-{region}"
    return primary


def run_check(config: dict, context: dict, check_id: str) -> dict:
    """
    QC check that validates document language against the filename code,
    using the PIL image stored in the context instead of base64 or
    filesystem paths. Supports both PDF and static image workflows.

    Args:
        config: Check configuration (not read by this check).
        context: Shared pipeline context. Reads "HM_parse" (falling back to
            "HM_image_parse") for "filename", "extracted_text" and
            "parsed_image", and "HM_filename_parse" (falling back to
            "HM_image_filename_parse") for ["parsed"]["language"].
        check_id: Key under which the result dict is stored in ``context``.

    Returns:
        ``{"status": "passed", "details": ...}`` when the detected primary
        language matches the expected one (or the expected code contains
        "cen"/"gen"), otherwise ``{"status": "error", "error_message": ...,
        "details": ...}``.

    Side effects:
        Writes the result dict to ``context[check_id]`` on every path that
        gets past the initial context validation (success, mismatch and
        detection failure).
    """
    # --- Validate presence of required context data (PDF and image workflows) ---
    # `or {}` / `or ""` guard against keys that are present but set to None,
    # which would otherwise raise before the structured error can be built.
    hm_parse_data = context.get("HM_parse") or context.get("HM_image_parse") or {}
    hm_filename_data = (
        context.get("HM_filename_parse")
        or context.get("HM_image_filename_parse")
        or {}
    )

    filename = hm_parse_data.get("filename") or ""
    text_content = hm_parse_data.get("extracted_text") or ""
    parsed_image = hm_parse_data.get("parsed_image")  # PIL image, if any
    parsed_filename = hm_filename_data.get("parsed") or {}
    expected_language = (parsed_filename.get("language") or "").lower()

    error_messages = []
    if not filename:
        error_messages.append("Filename missing from HM_parse context")
    if not text_content.strip() and not parsed_image:
        error_messages.append(
            "Both text content and image data missing from HM_parse context"
        )
    if not expected_language:
        error_messages.append("Language code missing from HM_filename_parse context")

    if error_messages:
        return {
            "status": "error",
            "error_message": "Missing critical context data: "
            + ", ".join(error_messages),
            "details": {
                "available_context": {
                    "HM_parse_keys": list(hm_parse_data.keys()),
                    "HM_filename_parse_keys": list(hm_filename_data.keys()),
                }
            },
        }

    # --- Prepare context result structure ---
    context_result = {
        "filename": filename,
        "expected_language": expected_language.upper(),
        "detected_language": "UNKNOWN",
        "matches": False,
        "isCensorshipRequired": False,
        "validation_method": None,
        "gpt_response": None,
        "image_used": False,
        "image_format": None,
    }

    # --- Handle special CEN/GEN cases first (expected_language is lower-case) ---
    if "cen" in expected_language:
        context_result.update({
            "detected_language": "CEN",
            "matches": True,
            "isCensorshipRequired": True,
            "validation_method": "auto",
        })
    elif "gen" in expected_language:
        context_result.update({
            "detected_language": "GEN",
            "matches": True,
            "isCensorshipRequired": False,
            "validation_method": "auto",
        })
    else:
        # --- Prepare multimodal analysis with PIL image ---
        images = []
        if parsed_image:
            images = [parsed_image]
            # In-memory images may carry no format; tolerate a missing attribute.
            image_format = getattr(parsed_image, "format", None)
            context_result.update({
                "image_used": True,
                "image_format": image_format.lower() if image_format else None,
            })

        # --- GPT-based multimodal language detection ---
        try:
            prompt = f"""
            Analyze both the text content (if present) and document image (if provided) to identify:
            1. Primary language (ISO 639-1 code)
            2. Regional variation if detectable (ISO 3166-1 alpha-2 country code)

            Use format 'xx' or 'xx-YY' (e.g., 'en-EG' for Egyptian English).

            Consider:
            - Textual content (when available)
            - Writing direction
            - Typographic conventions
            - Cultural references
            - Visual layout characteristics
            - Any visible text in the image

            If uncertain, respond with primary language code only.
            If completely unsure, respond 'UNKNOWN'.

            Text Content (may be empty):
            {text_content}

            Respond ONLY with the language code or 'UNKNOWN'.
            """

            gpt_reply = analyze_with_gpt(
                prompt=prompt,
                content=text_content,
                images=images,  # Passing PIL image directly
            )
            # A None reply is treated like an empty one so the error is explicit.
            detected_lang_raw = (gpt_reply or "").strip()
            detected_lang = _normalize_language_tag(detected_lang_raw)
            if not detected_lang:
                raise ValueError("LLM returned an empty or invalid language code.")

            # Only the primary language decides the outcome: an exact match
            # implies a primary match, and a regional difference alone is
            # not treated as a mismatch.
            expected_primary = expected_language.replace("_", "-").split("-")[0]
            detected_primary = detected_lang.split("-")[0].lower()

            context_result.update({
                "detected_language": detected_lang,
                "matches": detected_primary == expected_primary,
                "validation_method": "Multimodal LLM analysis",
                "gpt_response": detected_lang_raw,
            })

        except Exception as e:
            # Boundary handler: any failure in the LLM call or in parsing its
            # reply is reported as a check error rather than propagated.
            context_result.update({
                "error": str(e),
                "validation_method": "failed",
            })
            context[check_id] = context_result
            return {
                "status": "error",
                "error_message": f"Language detection error: {str(e)}",
                "details": context_result,
            }

    # Persist the result for later checks on both outcomes, consistent with
    # the detection-failure path above.
    context[check_id] = context_result

    # --- Final validation check (CEN/GEN branches always set matches=True) ---
    if not context_result["matches"]:
        return {
            "status": "error",
            "error_message": (
                f"Language mismatch: Expected {expected_language.upper()}, "
                f"detected {context_result['detected_language']}"
            ),
            "details": context_result,
        }

    # --- Successful validation ---
    return {
        "status": "passed",
        "details": context_result,
    }