# hm_qc/checks/HM_language_validate.py
# 2025-11-13 13:41:31 +02:00
#
# 180 lines
# 6.8 KiB
# Python
# Executable file

from checks.analyze_with_gpt import analyze_with_gpt
def run_check(config: dict, context: dict, check_id: str) -> dict:
    """Validate a document's language against the code parsed from its filename.

    The check reads the parse results of earlier checks from ``context``
    (``HM_parse`` / ``HM_filename_parse`` for the PDF workflow, or
    ``HM_image_parse`` / ``HM_image_filename_parse`` for the static-image
    workflow). The parsed image is passed to the LLM helper as an in-memory
    object, not as base64 or a filesystem path.

    Expected codes containing ``cen`` or ``gen`` are accepted automatically
    without calling the LLM. Every other code is compared with the language
    code returned by ``analyze_with_gpt``; the comparison is on the primary
    language subtag only, so ``en`` matches ``EN-EG``.

    Args:
        config: Check configuration. Not used by this check.
        context: Shared dictionary of results from earlier checks. The result
            of this check is stored under ``context[check_id]`` on every path
            except the early "missing context data" return.
        check_id: Key under which the result is stored in ``context``.

    Returns:
        A dict with ``"status"`` (``"passed"`` or ``"error"``), ``"details"``
        and, for errors, ``"error_message"``.
    """
    # ``.get(key, default)`` only covers absent keys; the ``or {}`` / ``or ""``
    # fallbacks also cover keys that are present but hold None.
    hm_parse_data = context.get("HM_parse") or context.get("HM_image_parse") or {}
    hm_filename_data = (
        context.get("HM_filename_parse")
        or context.get("HM_image_filename_parse")
        or {}
    )

    filename = hm_parse_data.get("filename", "")
    text_content = hm_parse_data.get("extracted_text") or ""
    parsed_image = hm_parse_data.get("parsed_image")  # in-memory image object
    parsed_filename = hm_filename_data.get("parsed") or {}
    expected_language = (parsed_filename.get("language") or "").lower()

    # --- Validate presence of required context data ---
    error_messages = []
    if not filename:
        error_messages.append("Filename missing from HM_parse context")
    if not text_content.strip() and not parsed_image:
        error_messages.append(
            "Both text content and image data missing from HM_parse context"
        )
    if not expected_language:
        error_messages.append("Language code missing from HM_filename_parse context")

    if error_messages:
        return {
            "status": "error",
            "error_message": "Missing critical context data: "
            + ", ".join(error_messages),
            "details": {
                "available_context": {
                    "HM_parse_keys": list(hm_parse_data.keys()),
                    "HM_filename_parse_keys": list(hm_filename_data.keys()),
                }
            },
        }

    # --- Prepare context result structure ---
    context_result = {
        "filename": filename,
        "expected_language": expected_language.upper(),
        "detected_language": "UNKNOWN",
        "matches": False,
        "isCensorshipRequired": False,
        "validation_method": None,
        "gpt_response": None,
        "image_used": False,
        "image_format": None,
    }

    # --- Special CEN/GEN codes are accepted without language detection ---
    # NOTE: this is a substring test on the lowercased code, so any code that
    # merely contains "cen"/"gen" takes this branch as well.
    if "cen" in expected_language:
        context_result.update({
            "detected_language": "CEN",
            "matches": True,
            "isCensorshipRequired": True,
            "validation_method": "auto",
        })
    elif "gen" in expected_language:
        context_result.update({
            "detected_language": "GEN",
            "matches": True,
            "isCensorshipRequired": False,
            "validation_method": "auto",
        })
    else:
        # --- Prepare multimodal analysis input ---
        images = []
        if parsed_image:
            images = [parsed_image]
            # ``format`` may be None or absent on the image object, so read it
            # defensively instead of letting an AttributeError abort the check.
            image_format = getattr(parsed_image, "format", None)
            context_result.update({
                "image_used": True,
                "image_format": image_format.lower() if image_format else None,
            })

        # --- LLM-based multimodal language detection ---
        try:
            detected_lang_raw = analyze_with_gpt(
                prompt=_build_language_prompt(text_content),
                content=text_content,
                images=images,  # image object is passed through as-is
            ).strip()

            if not detected_lang_raw:
                raise ValueError("LLM returned an empty or invalid language code.")

            detected_lang = _normalize_detected_language(detected_lang_raw)
            context_result.update({
                "detected_language": detected_lang,
                "matches": _primary_languages_match(expected_language, detected_lang),
                "validation_method": "Multimodal LLM analysis",
                "gpt_response": detected_lang_raw,
            })
        except Exception as e:
            # Boundary handler: any failure of the LLM call or of response
            # handling is reported as a check error instead of propagating.
            context_result.update({
                "error": str(e),
                "validation_method": "failed",
            })
            context[check_id] = context_result
            return {
                "status": "error",
                "error_message": f"Language detection error: {str(e)}",
                "details": context_result,
            }

    # Record the outcome for later checks on both the pass and the mismatch
    # path, consistent with the detection-error path above.
    context[check_id] = context_result

    # CEN/GEN branches set ``matches`` to True themselves, so this single flag
    # covers every case.
    if not context_result["matches"]:
        return {
            "status": "error",
            "error_message": (
                f"Language mismatch: Expected {expected_language.upper()}, "
                f"detected {context_result['detected_language']}"
            ),
            "details": context_result,
        }

    # --- Successful validation ---
    return {
        "status": "passed",
        "details": context_result,
    }


def _build_language_prompt(text_content: str) -> str:
    """Return the language-detection prompt with ``text_content`` embedded."""
    return (
        "\n"
        "Analyze both the text content (if present) and document image (if provided) to identify:\n"
        "1. Primary language (ISO 639-1 code)\n"
        "2. Regional variation if detectable (ISO 3166-1 alpha-2 country code)\n"
        "Use format 'xx' or 'xx-YY' (e.g., 'en-EG' for Egyptian English).\n"
        "Consider:\n"
        "- Textual content (when available)\n"
        "- Writing direction\n"
        "- Typographic conventions\n"
        "- Cultural references\n"
        "- Visual layout characteristics\n"
        "- Any visible text in the image\n"
        "If uncertain, respond with primary language code only.\n"
        "If completely unsure, respond 'UNKNOWN'.\n"
        "Text Content (may be empty):\n"
        f"{text_content}\n"
        "Respond ONLY with the language code or 'UNKNOWN'.\n"
    )


def _normalize_detected_language(raw_code: str) -> str:
    """Upper-case an LLM language code, use '-' as separator, drop bad regions."""
    code = raw_code.upper().replace("_", "-")
    if "-" in code:
        parts = code.split("-")
        # Keep the region only when both subtags are two characters long;
        # otherwise fall back to the primary language subtag.
        if len(parts[0]) != 2 or len(parts[1]) != 2:
            code = parts[0]
    return code


def _primary_languages_match(expected: str, detected: str) -> bool:
    """Return True when both codes share the same primary language subtag."""
    # Identical full codes necessarily share their primary subtag, so a
    # separate full-code equality test would be redundant.
    expected_primary = expected.replace("_", "-").split("-")[0].lower()
    detected_primary = detected.split("-")[0].lower()
    return expected_primary == detected_primary