Fix link text false positives: check annotation bbox text only (WCAG 2.4.4)
Replaced the full-page text scan with annotation-based extraction: the check now reads only the text inside actual URI hyperlink bounding boxes, eliminating false positives from vague words in body prose.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a5cd1af982
commit
dca86fb81e
1 changed file with 65 additions and 22 deletions
|
|
@@ -1086,36 +1086,79 @@ Respond in JSON format:
|
|||
)
|
||||
|
||||
def _check_links(self):
|
||||
"""Check link quality"""
|
||||
"""Check link quality (WCAG 2.4.4) — only checks actual hyperlink label text."""
|
||||
unclear_patterns = [
|
||||
r'\bclick here\b',
|
||||
r'\bhere\b',
|
||||
r'\blink\b',
|
||||
r'\bread more\b',
|
||||
r'\bmore\b',
|
||||
r'\bthis\b',
|
||||
r'\blink\b',
|
||||
]
|
||||
|
||||
for i, page in enumerate(self.pdf_plumber.pages):
|
||||
text = page.extract_text()
|
||||
if not text:
|
||||
|
||||
for i, (page_plumber, page_pypdf) in enumerate(
|
||||
zip(self.pdf_plumber.pages, self.pdf_reader.pages)
|
||||
):
|
||||
annots_raw = page_pypdf.get("/Annots")
|
||||
if not annots_raw:
|
||||
continue
|
||||
|
||||
# Find URLs
|
||||
urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
|
||||
|
||||
# Check for unclear link text
|
||||
for pattern in unclear_patterns:
|
||||
if re.search(pattern, text, re.IGNORECASE):
|
||||
self.add_issue(
|
||||
Severity.WARNING,
|
||||
"Link Text",
|
||||
f"Page {i+1}: Potentially unclear link text detected",
|
||||
wcag_criterion="2.4.4",
|
||||
recommendation="Use descriptive link text that makes sense out of context",
|
||||
page_number=i+1
|
||||
)
|
||||
break
|
||||
|
||||
page_height = float(page_plumber.height)
|
||||
page_flagged = False
|
||||
|
||||
for annot_ref in annots_raw:
|
||||
try:
|
||||
annot = annot_ref.get_object()
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Only process URI hyperlinks
|
||||
if annot.get("/Subtype") != "/Link":
|
||||
continue
|
||||
action = annot.get("/A")
|
||||
if not action or action.get("/S") != "/URI":
|
||||
continue
|
||||
|
||||
# Get annotation bounding box (PDF coords: bottom-left origin)
|
||||
rect = annot.get("/Rect")
|
||||
if not rect or len(rect) < 4:
|
||||
continue
|
||||
x0, y0, x1, y1 = (float(rect[0]), float(rect[1]),
|
||||
float(rect[2]), float(rect[3]))
|
||||
|
||||
# Convert to pdfplumber coords (top-left origin)
|
||||
top = page_height - y1
|
||||
bottom = page_height - y0
|
||||
if x0 >= x1 or top >= bottom:
|
||||
continue
|
||||
|
||||
# Extract only the text inside the hyperlink rectangle
|
||||
try:
|
||||
link_text = (
|
||||
page_plumber.within_bbox((x0, top, x1, bottom))
|
||||
.extract_text() or ""
|
||||
).strip()
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not link_text:
|
||||
continue # image-only link — skip
|
||||
|
||||
for pattern in unclear_patterns:
|
||||
if re.search(pattern, link_text, re.IGNORECASE):
|
||||
self.add_issue(
|
||||
Severity.WARNING,
|
||||
"Link Text",
|
||||
f"Page {i+1}: Unclear link text \"{link_text}\" — should describe the destination",
|
||||
wcag_criterion="2.4.4",
|
||||
recommendation="Use descriptive link text that makes sense out of context",
|
||||
page_number=i+1
|
||||
)
|
||||
page_flagged = True
|
||||
break # one issue per link is enough
|
||||
|
||||
if page_flagged:
|
||||
break # one issue per page
|
||||
|
||||
def _check_headings(self):
|
||||
"""Check heading structure and hierarchy"""
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue