def _check_language(self):
    """Check the declared document language (WCAG 3.1.1) against the content.

    Text is sampled from at most the first three pages. When ``langdetect``
    is available and at least 50 characters were extracted, the detected
    language is stored in ``self._detected_lang``; if detection raises
    ``LangDetectException`` the attribute is set to ``'en'``.

    Exactly one issue is then reported through ``self.add_issue``:

    * ERROR   -- ``/Lang`` is missing from the catalog or is blank;
    * WARNING -- ``/Lang`` is set, a non-English language was detected, and
      the detected primary subtag does not occur in the declared one;
    * SUCCESS -- otherwise.
    """
    catalog = self.pdf_reader.trailer.get("/Root", {})

    # --- Detect actual language from content ---
    parts = []
    sampled_chars = 0
    for page in self.pdf_plumber.pages[:3]:
        page_text = page.extract_text()
        if page_text:
            parts.append(page_text)
            sampled_chars += len(page_text) + 1  # +1 for the joining space
        if sampled_chars > 500:
            break
    sample_text = " ".join(parts)

    # `detected` stays None unless the detector actually produced a result,
    # so the report never presents the 'en' default as a detection.
    # NOTE(review): langdetect results can vary between runs unless
    # DetectorFactory.seed is set where the library is imported -- confirm.
    detected = None
    if langdetect_detect and len(sample_text.strip()) >= 50:
        try:
            detected = langdetect_detect(sample_text)
        except LangDetectException:
            self._detected_lang = 'en'
        else:
            self._detected_lang = detected

    # The detector may return region-qualified codes (e.g. 'zh-cn'); compare
    # on the primary subtag only, as is done for the declared value below.
    detected_primary = None
    if detected:
        detected_primary = detected.lower().replace('_', '-').split('-')[0]

    # --- Check declared /Lang ---
    declared_raw = str(catalog["/Lang"]) if "/Lang" in catalog else ""

    if not declared_raw.strip():
        # A blank /Lang identifies no language, so treat it like a missing one.
        if detected:
            # Map detector codes to BCP-47 tags
            lang_map = {
                'uk': 'uk-UA', 'ru': 'ru-RU', 'de': 'de-DE', 'fr': 'fr-FR',
                'es': 'es-ES', 'pl': 'pl-PL', 'it': 'it-IT', 'pt': 'pt-PT',
                'nl': 'nl-NL', 'cs': 'cs-CZ', 'sk': 'sk-SK', 'ro': 'ro-RO',
                'hu': 'hu-HU', 'bg': 'bg-BG', 'hr': 'hr-HR', 'ar': 'ar-SA',
                'zh': 'zh-CN', 'zh-cn': 'zh-CN', 'zh-tw': 'zh-TW',
                'ja': 'ja-JP', 'ko': 'ko-KR', 'en': 'en-US',
            }
            # Prefer an exact match so 'zh-tw' is not collapsed to 'zh-CN'.
            bcp47 = lang_map.get(
                detected.lower(),
                lang_map.get(detected_primary, detected),
            )
            recommendation = (
                f"Set document language (detected content language: '{bcp47}')"
            )
        else:
            recommendation = "Set document language (e.g., 'en-US')"
        self.add_issue(
            Severity.ERROR,
            "Language",
            "Document language not specified",
            wcag_criterion="3.1.1",
            recommendation=recommendation,
            details={'detected_language': self._detected_lang}
        )
        return

    declared_prefix = declared_raw.strip().lower().split('-')[0].split('_')[0]
    mismatch = (
        detected_primary is not None
        and detected_primary != 'en'  # English is common false-positive
        # Substring test: a declared 'deu' is still accepted for detected 'de'.
        and detected_primary not in declared_prefix
    )
    details = {'declared_language': declared_raw,
               'detected_language': self._detected_lang}

    if mismatch:
        self.add_issue(
            Severity.WARNING,
            "Language",
            f"Declared language '{declared_raw}' may not match content "
            f"(detected: '{self._detected_lang}')",
            wcag_criterion="3.1.1",
            recommendation="Verify the /Lang entry matches the document's actual language",
            details=details
        )
    else:
        self.add_issue(
            Severity.SUCCESS,
            "Language",
            f"Document language set to: {declared_raw}",
            wcag_criterion="3.1.1",
            details=details
        )
def _check_readability(self):
    """Check content readability (WCAG 3.1.5), language-aware.

    Concatenates the extracted text of every page and returns without
    reporting anything when fewer than 100 characters were found.

    The Flesch Reading Ease and Flesch-Kincaid grade checks use English-only
    formulas, so they run only when ``self._detected_lang`` is an English tag
    (``'en'`` or any ``'en-*'`` variant, case-insensitive). The long-sentence
    check runs for every language. All findings go through
    ``self.add_issue`` and are capped at WARNING severity.
    """
    # Extract all text
    all_text = "".join(
        f"{page_text}\n"
        for page_text in (page.extract_text() for page in self.pdf_plumber.pages)
        if page_text
    )

    if len(all_text) < 100:
        return

    # Flesch Reading Ease is an English-only formula -- skip for other languages
    lang_tag = (self._detected_lang or '').lower().replace('_', '-')
    is_english = lang_tag.split('-')[0] == 'en'

    if is_english:
        analysis = ReadabilityAnalyzer.analyze(all_text)

        # An analyzer error disables only the Flesch-based checks; the
        # long-sentence check below does not depend on `analysis`, so do not
        # return early here.
        if 'error' not in analysis:
            # Check Flesch Reading Ease -- readability is advisory, cap at WARNING
            if analysis['flesch_reading_ease'] < 60:
                self.add_issue(
                    Severity.WARNING,
                    "Readability",
                    f"Content is difficult to read (Flesch score: {analysis['flesch_reading_ease']}/100)",
                    wcag_criterion="3.1.5",
                    recommendation="Simplify language to reach 8th-9th grade level (target score: 60+)",
                    details=analysis
                )

            # Check grade level
            if analysis['flesch_kincaid_grade'] > 10:
                self.add_issue(
                    Severity.WARNING,
                    "Readability",
                    f"Content requires grade {analysis['flesch_kincaid_grade']} reading level",
                    wcag_criterion="3.1.5",
                    recommendation="Target grade 8-10 for general audiences",
                    details=analysis
                )

    # Long-sentence check is language-agnostic: split on runs of . ! ? and
    # count whitespace-separated words per sentence.
    sentences = [s.strip() for s in re.split(r'[.!?]+', all_text) if s.strip()]
    long_sentences = [s for s in sentences if len(s.split()) > 25]
    if len(long_sentences) > 5:
        self.add_issue(
            Severity.INFO,
            "Readability",
            f"{len(long_sentences)} sentences exceed 25 words",
            wcag_criterion="3.1.5",
            recommendation="Break long sentences for better comprehension",
            details={'long_sentences_count': len(long_sentences),
                     'detected_language': self._detected_lang}
        )