Fix link text false positives: check annotation bbox text only (WCAG 2.4.4)

Replace the full-page text scan with annotation-based extraction: the check now inspects only the text inside the bounding boxes of actual URI hyperlink annotations, eliminating false positives from vague words in body prose.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-13 11:18:11 +00:00 · 2026-03-13 11:18:11 +00:00 · dca86fb81e
commit dca86fb81e
parent a5cd1af982
1 changed file with 65 additions and 22 deletions
--- a/enterprise_pdf_checker.py
+++ b/enterprise_pdf_checker.py
@@ -1086,36 +1086,79 @@ Respond in JSON format:
            )
    
    def _check_links(self):
-        """Check link quality"""
+        """Check link quality (WCAG 2.4.4) — only checks actual hyperlink label text."""
        unclear_patterns = [
            r'\bclick here\b',
            r'\bhere\b',
-            r'\blink\b',
            r'\bread more\b',
            r'\bmore\b',
            r'\bthis\b',
+            r'\blink\b',
        ]
-        
-        for i, page in enumerate(self.pdf_plumber.pages):
-            text = page.extract_text()
-            if not text:
+
+        for i, (page_plumber, page_pypdf) in enumerate(
+            zip(self.pdf_plumber.pages, self.pdf_reader.pages)
+        ):
+            annots_raw = page_pypdf.get("/Annots")
+            if not annots_raw:
                continue
-            
-            # Find URLs
-            urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
-            
-            # Check for unclear link text
-            for pattern in unclear_patterns:
-                if re.search(pattern, text, re.IGNORECASE):
-                    self.add_issue(
-                        Severity.WARNING,
-                        "Link Text",
-                        f"Page {i+1}: Potentially unclear link text detected",
-                        wcag_criterion="2.4.4",
-                        recommendation="Use descriptive link text that makes sense out of context",
-                        page_number=i+1
-                    )
-                    break
+
+            page_height = float(page_plumber.height)
+            page_flagged = False
+
+            for annot_ref in annots_raw:
+                try:
+                    annot = annot_ref.get_object()
+                except Exception:
+                    continue
+
+                # Only process URI hyperlinks
+                if annot.get("/Subtype") != "/Link":
+                    continue
+                action = annot.get("/A")
+                if not action or action.get("/S") != "/URI":
+                    continue
+
+                # Get annotation bounding box (PDF coords: bottom-left origin)
+                rect = annot.get("/Rect")
+                if not rect or len(rect) < 4:
+                    continue
+                x0, y0, x1, y1 = (float(rect[0]), float(rect[1]),
+                                   float(rect[2]), float(rect[3]))
+
+                # Convert to pdfplumber coords (top-left origin)
+                top    = page_height - y1
+                bottom = page_height - y0
+                if x0 >= x1 or top >= bottom:
+                    continue
+
+                # Extract only the text inside the hyperlink rectangle
+                try:
+                    link_text = (
+                        page_plumber.within_bbox((x0, top, x1, bottom))
+                        .extract_text() or ""
+                    ).strip()
+                except Exception:
+                    continue
+
+                if not link_text:
+                    continue  # image-only link — skip
+
+                for pattern in unclear_patterns:
+                    if re.search(pattern, link_text, re.IGNORECASE):
+                        self.add_issue(
+                            Severity.WARNING,
+                            "Link Text",
+                            f"Page {i+1}: Unclear link text \"{link_text}\" — should describe the destination",
+                            wcag_criterion="2.4.4",
+                            recommendation="Use descriptive link text that makes sense out of context",
+                            page_number=i+1
+                        )
+                        page_flagged = True
+                        break  # one issue per link is enough
+
+                if page_flagged:
+                    break  # one issue per page
    
    def _check_headings(self):
        """Check heading structure and hierarchy"""