Fix link text false positives: check annotation bbox text only (WCAG 2.4.4)

Replaced full-page text scan with annotation-based extraction — now only
checks the text inside actual URI hyperlink bounding boxes, eliminating
false positives from vague words in body prose.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-03-13 11:18:11 +00:00
parent a5cd1af982
commit dca86fb81e

View file

@ -1086,36 +1086,79 @@ Respond in JSON format:
)
def _check_links(self):
    """Check link quality (WCAG 2.4.4) — only checks actual hyperlink label text.

    For every page, walks the page's ``/Annots`` entries, keeps only
    ``/Link`` annotations whose action is a ``/URI`` action, extracts the
    text lying inside each annotation rectangle, and reports a warning via
    ``self.add_issue`` when that text matches one of the vague-wording
    patterns below.  At most one issue is reported per page.

    Reads ``self.pdf_plumber.pages`` (text extraction) and
    ``self.pdf_reader.pages`` (annotation dictionaries) in lockstep.
    Returns nothing.  Annotations that cannot be resolved, have a malformed
    rectangle, or whose text cannot be extracted are skipped rather than
    aborting the whole check.
    """
    unclear_patterns = [
        r'\bclick here\b',
        r'\bhere\b',
        r'\bread more\b',
        r'\bmore\b',
        r'\bthis\b',
        r'\blink\b',
    ]
    # One compiled alternation: a link is flagged if ANY pattern occurs in it,
    # which is exactly what scanning the patterns one by one decides.
    # NOTE(review): this is a substring search, so descriptive labels that
    # merely contain one of these words are flagged too — confirm intended.
    unclear_re = re.compile("|".join(unclear_patterns), re.IGNORECASE)

    def resolve(obj):
        """Follow an indirect reference if *obj* exposes ``get_object``."""
        get_object = getattr(obj, "get_object", None)
        return get_object() if callable(get_object) else obj

    for i, (page_plumber, page_pypdf) in enumerate(
        zip(self.pdf_plumber.pages, self.pdf_reader.pages)
    ):
        # /Annots may itself be stored as an indirect reference.
        try:
            annots = resolve(page_pypdf.get("/Annots"))
        except Exception:
            continue
        if not annots:
            continue

        page_height = float(page_plumber.height)

        for annot_ref in annots:
            # Best-effort: a broken annotation must not abort the page scan.
            try:
                annot = resolve(annot_ref)

                # Only process URI hyperlinks
                if annot.get("/Subtype") != "/Link":
                    continue
                action = resolve(annot.get("/A"))
                if not action or action.get("/S") != "/URI":
                    continue

                # Get annotation bounding box (PDF coords: bottom-left origin)
                rect = resolve(annot.get("/Rect"))
                if not rect or len(rect) < 4:
                    continue
                corners = [resolve(rect[k]) for k in range(4)]
            except Exception:
                continue

            try:
                xa, ya, xb, yb = (float(value) for value in corners)
            except (TypeError, ValueError):
                continue  # non-numeric rectangle entry

            # A rectangle may be given by either pair of opposite corners,
            # so order the coordinates instead of discarding "inverted" ones.
            x0, x1 = min(xa, xb), max(xa, xb)
            y0, y1 = min(ya, yb), max(ya, yb)

            # Convert to pdfplumber coords (top-left origin)
            top = page_height - y1
            bottom = page_height - y0
            if x0 >= x1 or top >= bottom:
                continue  # zero-area rectangle: nothing to extract

            # Extract only the text inside the hyperlink rectangle
            try:
                link_text = (
                    page_plumber.within_bbox((x0, top, x1, bottom))
                    .extract_text() or ""
                ).strip()
            except Exception:
                continue

            if not link_text:
                continue  # image-only link — skip

            if unclear_re.search(link_text):
                self.add_issue(
                    Severity.WARNING,
                    "Link Text",
                    f"Page {i+1}: Unclear link text \"{link_text}\" — should describe the destination",
                    wcag_criterion="2.4.4",
                    recommendation="Use descriptive link text that makes sense out of context",
                    page_number=i+1
                )
                break  # one issue per page
def _check_headings(self):
"""Check heading structure and hierarchy"""