Add multilingual PDF support: language detection + language-aware checks
- Import langdetect with a graceful fallback if it is not installed
- _check_language(): detect the actual document language via langdetect on the first 3 pages of text; store it in self._detected_lang; warn when the declared /Lang tag doesn't match the detected language; suggest the correct BCP-47 tag when /Lang is missing
- _check_readability(): skip Flesch Reading Ease / Flesch-Kincaid (English-only formulas) for non-English documents; the long-sentence check remains language-agnostic
- _check_links(): extend unclear-link patterns to Ukrainian, Russian, German, French, Spanish, and Polish
- requirements-cloudrun.txt: add langdetect>=1.0.9

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
350f5de56e
commit
7fe26e7dc4
2 changed files with 129 additions and 51 deletions
|
|
@ -97,6 +97,14 @@ except ImportError:
|
|||
logger.info("Install: pip install anthropic")
|
||||
anthropic = None
|
||||
|
||||
# Language detection
|
||||
try:
|
||||
from langdetect import detect as langdetect_detect, LangDetectException
|
||||
except ImportError:
|
||||
logger.warning("langdetect not available — language detection disabled")
|
||||
langdetect_detect = None
|
||||
LangDetectException = Exception
|
||||
|
||||
|
||||
# WCAG 2.1 criterion → conformance level
|
||||
WCAG_LEVELS: Dict[str, str] = {
|
||||
|
|
@ -376,6 +384,7 @@ class EnterprisePDFChecker:
|
|||
self.page_images: Dict[int, str] = {} # page_num -> image_path
|
||||
self.verapdf_results: Optional[Dict] = None
|
||||
self.remediation_suggestions: Optional[Dict] = None
|
||||
self._detected_lang: str = 'en' # detected language of the document
|
||||
|
||||
# API clients
|
||||
self.vision_client = None
|
||||
|
|
@ -627,25 +636,71 @@ class EnterprisePDFChecker:
|
|||
)
|
||||
|
||||
def _check_language(self):
|
||||
"""Check language declaration"""
|
||||
"""Check language declaration (WCAG 3.1.1) and detect actual content language."""
|
||||
catalog = self.pdf_reader.trailer.get("/Root", {})
|
||||
|
||||
|
||||
# --- Detect actual language from content ---
|
||||
sample_text = ""
|
||||
for page in self.pdf_plumber.pages[:3]:
|
||||
t = page.extract_text()
|
||||
if t:
|
||||
sample_text += t + " "
|
||||
if len(sample_text) > 500:
|
||||
break
|
||||
|
||||
if langdetect_detect and len(sample_text.strip()) >= 50:
|
||||
try:
|
||||
self._detected_lang = langdetect_detect(sample_text)
|
||||
except LangDetectException:
|
||||
self._detected_lang = 'en'
|
||||
|
||||
# --- Check declared /Lang ---
|
||||
if "/Lang" not in catalog:
|
||||
suggestion = self._detected_lang if self._detected_lang else 'en-US'
|
||||
# Map ISO 639-1 codes to BCP-47 tags
|
||||
lang_map = {
|
||||
'uk': 'uk-UA', 'ru': 'ru-RU', 'de': 'de-DE', 'fr': 'fr-FR',
|
||||
'es': 'es-ES', 'pl': 'pl-PL', 'it': 'it-IT', 'pt': 'pt-PT',
|
||||
'nl': 'nl-NL', 'cs': 'cs-CZ', 'sk': 'sk-SK', 'ro': 'ro-RO',
|
||||
'hu': 'hu-HU', 'bg': 'bg-BG', 'hr': 'hr-HR', 'ar': 'ar-SA',
|
||||
'zh': 'zh-CN', 'ja': 'ja-JP', 'ko': 'ko-KR', 'en': 'en-US',
|
||||
}
|
||||
bcp47 = lang_map.get(self._detected_lang, self._detected_lang)
|
||||
self.add_issue(
|
||||
Severity.ERROR,
|
||||
"Language",
|
||||
"Document language not specified",
|
||||
wcag_criterion="3.1.1",
|
||||
recommendation="Set document language (e.g., 'en-US')"
|
||||
recommendation=f"Set document language (detected content language: '{bcp47}')",
|
||||
details={'detected_language': self._detected_lang}
|
||||
)
|
||||
else:
|
||||
lang = catalog["/Lang"]
|
||||
self.add_issue(
|
||||
Severity.SUCCESS,
|
||||
"Language",
|
||||
f"Document language set to: {lang}",
|
||||
wcag_criterion="3.1.1"
|
||||
)
|
||||
declared_lang = str(catalog["/Lang"]).lower()
|
||||
# Compare declared lang prefix with detected lang
|
||||
declared_prefix = declared_lang.split('-')[0].split('_')[0]
|
||||
if (langdetect_detect and len(sample_text.strip()) >= 50
|
||||
and self._detected_lang != 'en' # English is common false-positive
|
||||
and declared_prefix != self._detected_lang
|
||||
and self._detected_lang not in declared_prefix):
|
||||
self.add_issue(
|
||||
Severity.WARNING,
|
||||
"Language",
|
||||
f"Declared language '{catalog['/Lang']}' may not match content "
|
||||
f"(detected: '{self._detected_lang}')",
|
||||
wcag_criterion="3.1.1",
|
||||
recommendation="Verify the /Lang entry matches the document's actual language",
|
||||
details={'declared_language': str(catalog["/Lang"]),
|
||||
'detected_language': self._detected_lang}
|
||||
)
|
||||
else:
|
||||
self.add_issue(
|
||||
Severity.SUCCESS,
|
||||
"Language",
|
||||
f"Document language set to: {catalog['/Lang']}",
|
||||
wcag_criterion="3.1.1",
|
||||
details={'declared_language': str(catalog["/Lang"]),
|
||||
'detected_language': self._detected_lang}
|
||||
)
|
||||
|
||||
def _check_text_extractability(self):
|
||||
"""Check if text can be extracted"""
|
||||
|
|
@ -1063,64 +1118,86 @@ Respond in JSON format:
|
|||
logger.warning(f"Contrast check skipped: {str(e)}")
|
||||
|
||||
def _check_readability(self):
|
||||
"""Check content readability"""
|
||||
"""Check content readability (language-aware: Flesch only for English)."""
|
||||
# Extract all text
|
||||
all_text = ""
|
||||
for page in self.pdf_plumber.pages:
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
all_text += text + "\n"
|
||||
|
||||
|
||||
if len(all_text) < 100:
|
||||
return
|
||||
|
||||
analysis = ReadabilityAnalyzer.analyze(all_text)
|
||||
|
||||
if 'error' in analysis:
|
||||
return
|
||||
|
||||
# Check Flesch Reading Ease — readability is advisory, cap at WARNING
|
||||
if analysis['flesch_reading_ease'] < 60:
|
||||
severity = Severity.WARNING # never ERROR: readability is not a hard accessibility failure
|
||||
self.add_issue(
|
||||
severity,
|
||||
"Readability",
|
||||
f"Content is difficult to read (Flesch score: {analysis['flesch_reading_ease']}/100)",
|
||||
wcag_criterion="3.1.5",
|
||||
recommendation="Simplify language to reach 8th-9th grade level (target score: 60+)",
|
||||
details=analysis
|
||||
)
|
||||
|
||||
# Check grade level
|
||||
if analysis['flesch_kincaid_grade'] > 10:
|
||||
self.add_issue(
|
||||
Severity.WARNING,
|
||||
"Readability",
|
||||
f"Content requires grade {analysis['flesch_kincaid_grade']} reading level",
|
||||
wcag_criterion="3.1.5",
|
||||
recommendation="Target grade 8-10 for general audiences",
|
||||
details=analysis
|
||||
)
|
||||
|
||||
# Check long sentences
|
||||
if analysis['long_sentences_count'] > 5:
|
||||
|
||||
# Flesch Reading Ease is an English-only formula — skip for other languages
|
||||
is_english = self._detected_lang in ('en', 'en-us', 'en-gb')
|
||||
|
||||
if is_english:
|
||||
analysis = ReadabilityAnalyzer.analyze(all_text)
|
||||
|
||||
if 'error' in analysis:
|
||||
return
|
||||
|
||||
# Check Flesch Reading Ease — readability is advisory, cap at WARNING
|
||||
if analysis['flesch_reading_ease'] < 60:
|
||||
self.add_issue(
|
||||
Severity.WARNING,
|
||||
"Readability",
|
||||
f"Content is difficult to read (Flesch score: {analysis['flesch_reading_ease']}/100)",
|
||||
wcag_criterion="3.1.5",
|
||||
recommendation="Simplify language to reach 8th-9th grade level (target score: 60+)",
|
||||
details=analysis
|
||||
)
|
||||
|
||||
# Check grade level
|
||||
if analysis['flesch_kincaid_grade'] > 10:
|
||||
self.add_issue(
|
||||
Severity.WARNING,
|
||||
"Readability",
|
||||
f"Content requires grade {analysis['flesch_kincaid_grade']} reading level",
|
||||
wcag_criterion="3.1.5",
|
||||
recommendation="Target grade 8-10 for general audiences",
|
||||
details=analysis
|
||||
)
|
||||
|
||||
# Long-sentence check is language-agnostic
|
||||
sentences = [s.strip() for s in re.split(r'[.!?]+', all_text) if s.strip()]
|
||||
long_sentences = [s for s in sentences if len(s.split()) > 25]
|
||||
if len(long_sentences) > 5:
|
||||
self.add_issue(
|
||||
Severity.INFO,
|
||||
"Readability",
|
||||
f"{analysis['long_sentences_count']} sentences exceed 25 words",
|
||||
f"{len(long_sentences)} sentences exceed 25 words",
|
||||
wcag_criterion="3.1.5",
|
||||
recommendation="Break long sentences for better comprehension"
|
||||
recommendation="Break long sentences for better comprehension",
|
||||
details={'long_sentences_count': len(long_sentences),
|
||||
'detected_language': self._detected_lang}
|
||||
)
|
||||
|
||||
def _check_links(self):
|
||||
"""Check link quality (WCAG 2.4.4) — only checks actual hyperlink label text."""
|
||||
unclear_patterns = [
|
||||
r'\bclick here\b',
|
||||
r'\bhere\b',
|
||||
r'\bread more\b',
|
||||
r'\bmore\b',
|
||||
r'\bthis\b',
|
||||
r'\blink\b',
|
||||
# English
|
||||
r'\bclick here\b', r'\bhere\b', r'\bread more\b',
|
||||
r'\bmore\b', r'\bthis\b', r'\blink\b',
|
||||
# Ukrainian
|
||||
r'\bнатисніть тут\b', r'\bтут\b', r'\bдокладніше\b',
|
||||
r'\bбільше\b', r'\bцe\b', r'\bпосилання\b',
|
||||
# Russian
|
||||
r'\bнажмите здесь\b', r'\bздесь\b', r'\bподробнее\b',
|
||||
r'\bбольше\b', r'\bэто\b', r'\bссылка\b',
|
||||
# German
|
||||
r'\bhier klicken\b', r'\bhier\b', r'\bmehr lesen\b',
|
||||
r'\bmehr\b', r'\bdies\b', r'\blink\b',
|
||||
# French
|
||||
r'\bcliquez ici\b', r'\bici\b', r'\blire la suite\b',
|
||||
r'\bplus\b', r'\bceci\b', r'\blien\b',
|
||||
# Spanish
|
||||
r'\bhaz clic aquí\b', r'\baquí\b', r'\beer más\b',
|
||||
r'\bmás\b', r'\besto\b', r'\benlace\b',
|
||||
# Polish
|
||||
r'\bkliknij tutaj\b', r'\btutaj\b', r'\bczytaj więcej\b',
|
||||
r'\bwięcej\b', r'\bto\b', r'\blink\b',
|
||||
]
|
||||
|
||||
for i, (page_plumber, page_pypdf) in enumerate(
|
||||
|
|
|
|||
|
|
@ -31,3 +31,4 @@ python-dotenv>=1.0.0
|
|||
flask>=3.0.0
|
||||
gunicorn>=21.2.0
|
||||
google-cloud-storage>=2.14.0
|
||||
langdetect>=1.0.9
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue