# Executable Python file (180 lines, 6.8 KiB)
from checks.analyze_with_gpt import analyze_with_gpt
|
|
|
|
def run_check(config: dict, context: dict, check_id: str) -> dict:
    """
    Validate a document's language against the language code parsed from its filename.

    Works on context-stored PIL images instead of base64 or filesystem paths and
    supports both the PDF workflow (``HM_parse`` / ``HM_filename_parse``) and the
    static image workflow (``HM_image_parse`` / ``HM_image_filename_parse``).

    Expected codes containing ``cen`` or ``gen`` pass automatically. Any other
    code is verified by sending the extracted text and the parsed image to
    ``analyze_with_gpt`` and comparing the primary language of its reply with
    the primary language of the expected code.

    Args:
        config: Check configuration; not read by this check.
        context: Shared context dict. Read for the parse results listed above.
            Once the required inputs are present, the result structure is
            written to ``context[check_id]`` (on pass, mismatch and detection
            failure alike).
        check_id: Key under which the result structure is stored in ``context``.

    Returns:
        ``{"status": "passed", "details": ...}`` when the language matches,
        otherwise ``{"status": "error", "error_message": ..., "details": ...}``
        for missing context data, a detection failure or a language mismatch.
    """
    # --- Validate presence of required context data (PDF and image workflows) ---
    # `or {}` (rather than a .get default) also covers keys that are present
    # but hold None, which would otherwise raise AttributeError below.
    hm_parse_data = context.get("HM_parse") or context.get("HM_image_parse") or {}
    hm_filename_data = (
        context.get("HM_filename_parse")
        or context.get("HM_image_filename_parse")
        or {}
    )

    filename = hm_parse_data.get("filename") or ""
    text_content = hm_parse_data.get("extracted_text") or ""
    parsed_image = hm_parse_data.get("parsed_image")  # PIL image, if any
    parsed_filename = hm_filename_data.get("parsed") or {}
    expected_language = (parsed_filename.get("language") or "").lower()

    error_messages = []
    if not filename:
        error_messages.append("Filename missing from HM_parse context")
    if not text_content.strip() and not parsed_image:
        error_messages.append(
            "Both text content and image data missing from HM_parse context"
        )
    if not expected_language:
        error_messages.append("Language code missing from HM_filename_parse context")

    if error_messages:
        return {
            "status": "error",
            "error_message": "Missing critical context data: " + ", ".join(error_messages),
            "details": {
                "available_context": {
                    "HM_parse_keys": list(hm_parse_data.keys()),
                    "HM_filename_parse_keys": list(hm_filename_data.keys()),
                }
            },
        }

    # --- Prepare context result structure ---
    context_result = {
        "filename": filename,
        "expected_language": expected_language.upper(),
        "detected_language": "UNKNOWN",
        "matches": False,
        "isCensorshipRequired": False,
        "validation_method": None,
        "gpt_response": None,
        "image_used": False,
        "image_format": None,
    }

    # --- Handle special CEN/GEN cases first (expected_language is already lower-case) ---
    if "cen" in expected_language:
        context_result.update({
            "detected_language": "CEN",
            "matches": True,
            "isCensorshipRequired": True,
            "validation_method": "auto",
        })
    elif "gen" in expected_language:
        context_result.update({
            "detected_language": "GEN",
            "matches": True,
            "isCensorshipRequired": False,
            "validation_method": "auto",
        })
    else:
        # --- Prepare multimodal analysis with PIL image ---
        images = []
        if parsed_image:
            images = [parsed_image]
            image_format = getattr(parsed_image, "format", None)
            context_result.update({
                "image_used": True,
                "image_format": image_format.lower() if image_format else None,
            })

        prompt = f"""
            Analyze both the text content (if present) and document image (if provided) to identify:
            1. Primary language (ISO 639-1 code)
            2. Regional variation if detectable (ISO 3166-1 alpha-2 country code)

            Use format 'xx' or 'xx-YY' (e.g., 'en-EG' for Egyptian English).
            Consider:
            - Textual content (when available)
            - Writing direction
            - Typographic conventions
            - Cultural references
            - Visual layout characteristics
            - Any visible text in the image

            If uncertain, respond with primary language code only.
            If completely unsure, respond 'UNKNOWN'.

            Text Content (may be empty):
            {text_content}

            Respond ONLY with the language code or 'UNKNOWN'.
            """

        # --- GPT-based multimodal language detection ---
        try:
            detected_lang_raw = analyze_with_gpt(
                prompt=prompt,
                content=text_content,
                images=images,  # Passing PIL image directly
            ).strip()

            detected_lang = _normalize_language_code(detected_lang_raw)
            if not detected_lang:
                raise ValueError("LLM returned an empty or invalid language code.")

            # Only the primary language is compared: a regional difference
            # (e.g. EN-US vs EN-GB) is still treated as a match.
            normalized_expected = expected_language.replace("_", "-")
            expected_primary = normalized_expected.split("-")[0].lower()
            detected_primary = detected_lang.split("-")[0].lower()

            context_result.update({
                "detected_language": detected_lang,
                "matches": detected_primary == expected_primary,
                "validation_method": "Multimodal LLM analysis",
                "gpt_response": detected_lang_raw,
            })

        except Exception as e:
            # Boundary of the check: any failure of the detection step is
            # reported as a structured error instead of propagating.
            context_result.update({
                "error": str(e),
                "validation_method": "failed",
            })
            context[check_id] = context_result
            return {
                "status": "error",
                "error_message": f"Language detection error: {str(e)}",
                "details": context_result,
            }

    # Store the result for both outcomes so the mismatch path leaves the same
    # context entry behind as the success and detection-failure paths.
    context[check_id] = context_result

    # --- Final validation check ---
    if not context_result["matches"]:
        return {
            "status": "error",
            "error_message": (
                f"Language mismatch: Expected {expected_language.upper()}, "
                f"detected {context_result['detected_language']}"
            ),
            "details": context_result,
        }

    # --- Successful validation ---
    return {
        "status": "passed",
        "details": context_result,
    }


def _normalize_language_code(raw_code: str) -> str:
    """Return a model reply as an upper-case ``XX`` or ``XX-YY`` style code."""
    # The prompt shows codes in quotes, so tolerate a reply that echoes the
    # quotes or ends with a full stop.
    code = raw_code.strip().strip("'\"`.").strip().upper().replace("_", "-")
    if "-" in code:
        parts = code.split("-")
        if len(parts[0]) != 2 or len(parts[1]) != 2:
            code = parts[0]  # Fall back to primary language
    return code
|