Add multilingual PDF support: language detection + language-aware checks

- Import langdetect with graceful fallback if not installed
- _check_language(): detect actual document language via langdetect on first
  3 pages of text; store in self._detected_lang; warn when declared /Lang tag
  doesn't match detected language; suggest correct BCP-47 tag when missing
- _check_readability(): skip Flesch Reading Ease / Flesch-Kincaid (English-only
  formulas) for non-English documents; long-sentence check remains language-agnostic
- _check_links(): extend unclear-link patterns to Ukrainian, Russian, German,
  French, Spanish, and Polish
- requirements-cloudrun.txt: add langdetect>=1.0.9

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-03-13 14:52:05 +00:00
parent 350f5de56e
commit 7fe26e7dc4
2 changed files with 129 additions and 51 deletions

View file

@ -97,6 +97,14 @@ except ImportError:
logger.info("Install: pip install anthropic")
anthropic = None
# Language detection
try:
from langdetect import detect as langdetect_detect, LangDetectException
except ImportError:
logger.warning("langdetect not available — language detection disabled")
langdetect_detect = None
LangDetectException = Exception
# WCAG 2.1 criterion → conformance level
WCAG_LEVELS: Dict[str, str] = {
@ -376,6 +384,7 @@ class EnterprisePDFChecker:
self.page_images: Dict[int, str] = {} # page_num -> image_path
self.verapdf_results: Optional[Dict] = None
self.remediation_suggestions: Optional[Dict] = None
self._detected_lang: str = 'en' # detected language of the document
# API clients
self.vision_client = None
@ -627,25 +636,71 @@ class EnterprisePDFChecker:
)
def _check_language(self):
    """Check the /Lang declaration (WCAG 3.1.1) against the detected content language.

    Samples text from up to the first three pages, runs langdetect on it when
    the library was imported and at least 50 non-blank characters were
    extracted, and stores the result in ``self._detected_lang``. Exactly one
    issue is then recorded:

    * ERROR   - the document catalog has no /Lang entry;
    * WARNING - /Lang is present but its primary subtag disagrees with a
      non-English detection result;
    * SUCCESS - /Lang is present and no mismatch was found.
    """
    # langdetect code -> suggested BCP-47 tag. langdetect reports Chinese as
    # 'zh-cn' / 'zh-tw' (never bare 'zh'), so those keys are listed explicitly;
    # 'zh' is kept for callers that may have stored a bare code.
    bcp47_by_code = {
        'uk': 'uk-UA', 'ru': 'ru-RU', 'de': 'de-DE', 'fr': 'fr-FR',
        'es': 'es-ES', 'pl': 'pl-PL', 'it': 'it-IT', 'pt': 'pt-PT',
        'nl': 'nl-NL', 'cs': 'cs-CZ', 'sk': 'sk-SK', 'ro': 'ro-RO',
        'hu': 'hu-HU', 'bg': 'bg-BG', 'hr': 'hr-HR', 'ar': 'ar-SA',
        'zh': 'zh-CN', 'zh-cn': 'zh-CN', 'zh-tw': 'zh-TW',
        'ja': 'ja-JP', 'ko': 'ko-KR', 'en': 'en-US',
    }

    catalog = self.pdf_reader.trailer.get("/Root", {})

    # --- Detect actual language from content ---
    # Collect page texts until more than 500 characters have been gathered
    # (each page counts its text plus one separator, as before).
    chunks = []
    sampled_chars = 0
    for page in self.pdf_plumber.pages[:3]:
        page_text = page.extract_text()
        if page_text:
            chunks.append(page_text)
            sampled_chars += len(page_text) + 1
        if sampled_chars > 500:
            break
    sample_text = " ".join(chunks)

    # True only when langdetect actually produced a result; a missing
    # library, too little text, or a detection error all leave it False so
    # that no message below claims a detection that never happened.
    detection_ran = False
    if langdetect_detect and len(sample_text.strip()) >= 50:
        try:
            self._detected_lang = langdetect_detect(sample_text)
            detection_ran = True
        except LangDetectException:
            self._detected_lang = 'en'

    detected_code = str(self._detected_lang).lower()
    # Compare primary subtags only, so 'zh-cn' (detected) matches 'zh-CN'.
    detected_primary = detected_code.replace('_', '-').split('-')[0]

    # --- Check declared /Lang ---
    if "/Lang" not in catalog:
        if detection_ran:
            bcp47 = bcp47_by_code.get(detected_code, self._detected_lang)
            recommendation = (
                f"Set document language (detected content language: '{bcp47}')"
            )
        else:
            recommendation = "Set document language (e.g., 'en-US')"
        self.add_issue(
            Severity.ERROR,
            "Language",
            "Document language not specified",
            wcag_criterion="3.1.1",
            recommendation=recommendation,
            details={'detected_language': self._detected_lang,
                     'detection_performed': detection_ran}
        )
        return

    declared_raw = str(catalog["/Lang"])
    declared_prefix = declared_raw.lower().split('-')[0].split('_')[0]
    details = {'declared_language': declared_raw,
               'detected_language': self._detected_lang}

    mismatch = (
        detection_ran
        # An 'en' result is not trusted as evidence of a mismatch (the
        # original author notes English is a common false positive).
        and detected_primary != 'en'
        and declared_prefix != detected_primary
        # Substring test kept from the original: tolerates declared codes
        # that merely contain the detected one (e.g. 'ukr' vs 'uk').
        and detected_primary not in declared_prefix
    )
    if mismatch:
        self.add_issue(
            Severity.WARNING,
            "Language",
            f"Declared language '{declared_raw}' may not match content "
            f"(detected: '{self._detected_lang}')",
            wcag_criterion="3.1.1",
            recommendation="Verify the /Lang entry matches the document's actual language",
            details=details
        )
    else:
        self.add_issue(
            Severity.SUCCESS,
            "Language",
            f"Document language set to: {declared_raw}",
            wcag_criterion="3.1.1",
            details=details
        )
def _check_text_extractability(self):
"""Check if text can be extracted"""
@ -1063,64 +1118,86 @@ Respond in JSON format:
logger.warning(f"Contrast check skipped: {str(e)}")
def _check_readability(self):
    """Check content readability (WCAG 3.1.5), language-aware.

    Concatenates the extracted text of every page and returns without
    recording anything when fewer than 100 characters were found. The
    Flesch Reading Ease and Flesch-Kincaid grade checks run only when
    ``self._detected_lang`` has the primary subtag ``en``, because those
    formulas are English-only. The long-sentence check (more than five
    sentences over 25 whitespace-separated words) runs for every language.
    """
    page_texts = (page.extract_text() for page in self.pdf_plumber.pages)
    all_text = "".join(f"{text}\n" for text in page_texts if text)

    if len(all_text) < 100:
        return

    # Accept any English variant ('en', 'en-US', 'en_GB', ...), case-insensitively.
    primary_lang = str(self._detected_lang or '').lower().replace('_', '-').split('-')[0]
    if primary_lang == 'en':
        analysis = ReadabilityAnalyzer.analyze(all_text)

        # An analyzer error only disables the Flesch-based checks; the
        # long-sentence check below does not depend on the analyzer.
        if 'error' not in analysis:
            # Readability is advisory, so both findings are capped at WARNING.
            if analysis['flesch_reading_ease'] < 60:
                self.add_issue(
                    Severity.WARNING,
                    "Readability",
                    f"Content is difficult to read (Flesch score: {analysis['flesch_reading_ease']}/100)",
                    wcag_criterion="3.1.5",
                    recommendation="Simplify language to reach 8th-9th grade level (target score: 60+)",
                    details=analysis
                )

            if analysis['flesch_kincaid_grade'] > 10:
                self.add_issue(
                    Severity.WARNING,
                    "Readability",
                    f"Content requires grade {analysis['flesch_kincaid_grade']} reading level",
                    wcag_criterion="3.1.5",
                    recommendation="Target grade 8-10 for general audiences",
                    details=analysis
                )

    # Long-sentence check. Besides ASCII . ! ? the splitter also recognises
    # the ellipsis, the Arabic question mark and the CJK full-width
    # terminators. Word counts rely on whitespace, so scripts written
    # without spaces will rarely trigger this check.
    sentences = [s.strip() for s in re.split(r'[.!?…؟。！？]+', all_text) if s.strip()]
    long_sentences = [s for s in sentences if len(s.split()) > 25]
    if len(long_sentences) > 5:
        self.add_issue(
            Severity.INFO,
            "Readability",
            f"{len(long_sentences)} sentences exceed 25 words",
            wcag_criterion="3.1.5",
            recommendation="Break long sentences for better comprehension",
            details={'long_sentences_count': len(long_sentences),
                     'detected_language': self._detected_lang}
        )
def _check_links(self):
"""Check link quality (WCAG 2.4.4) — only checks actual hyperlink label text."""
unclear_patterns = [
r'\bclick here\b',
r'\bhere\b',
r'\bread more\b',
r'\bmore\b',
r'\bthis\b',
r'\blink\b',
# English
r'\bclick here\b', r'\bhere\b', r'\bread more\b',
r'\bmore\b', r'\bthis\b', r'\blink\b',
# Ukrainian
r'\bнатисніть тут\b', r'\bтут\b', r'\bдокладніше\b',
r'\bбільше\b', r'\bце\b', r'\bпосилання\b',
# Russian
r'\bнажмите здесь\b', r'\bздесь\b', r'\bподробнее\b',
r'\bбольше\b', r'\bэто\b', r'\bссылка\b',
# German
r'\bhier klicken\b', r'\bhier\b', r'\bmehr lesen\b',
r'\bmehr\b', r'\bdies\b', r'\blink\b',
# French
r'\bcliquez ici\b', r'\bici\b', r'\blire la suite\b',
r'\bplus\b', r'\bceci\b', r'\blien\b',
# Spanish
r'\bhaz clic aquí\b', r'\baquí\b', r'\bleer más\b',
r'\bmás\b', r'\besto\b', r'\benlace\b',
# Polish
r'\bkliknij tutaj\b', r'\btutaj\b', r'\bczytaj więcej\b',
r'\bwięcej\b', r'\bto\b', r'\blink\b',
]
for i, (page_plumber, page_pypdf) in enumerate(

View file

@ -31,3 +31,4 @@ python-dotenv>=1.0.0
flask>=3.0.0
gunicorn>=21.2.0
google-cloud-storage>=2.14.0
langdetect>=1.0.9