diff --git a/enterprise_pdf_checker.py b/enterprise_pdf_checker.py
index 60a598d..fde6009 100644
--- a/enterprise_pdf_checker.py
+++ b/enterprise_pdf_checker.py
@@ -1086,36 +1086,97 @@ Respond in JSON format:
         )
 
     def _check_links(self):
-        """Check link quality"""
+        """Check hyperlink label quality (WCAG 2.4.4).
+
+        For every page, inspects the ``/Link`` annotations that carry a
+        ``/URI`` action, extracts the text lying inside each annotation
+        rectangle and reports a warning through ``self.add_issue`` when that
+        label matches a generic phrase such as "click here".
+
+        Links with no extractable text (e.g. image-only links) and
+        annotations that cannot be parsed are skipped.  At most one issue
+        is reported per page.
+        """
         unclear_patterns = [
             r'\bclick here\b',
             r'\bhere\b',
-            r'\blink\b',
             r'\bread more\b',
             r'\bmore\b',
             r'\bthis\b',
+            r'\blink\b',
         ]
-
-        for i, page in enumerate(self.pdf_plumber.pages):
-            text = page.extract_text()
-            if not text:
+        # NOTE(review): the patterns match anywhere in the label, so a
+        # descriptive label that merely contains "this" or "more" is flagged.
+        unclear_re = re.compile("|".join(unclear_patterns), re.IGNORECASE)
+
+        def resolve(obj):
+            """Dereference an indirect PDF object; return other values as-is."""
+            getter = getattr(obj, "get_object", None)
+            return getter() if callable(getter) else obj
+
+        for i, (page_plumber, page_pypdf) in enumerate(
+            zip(self.pdf_plumber.pages, self.pdf_reader.pages)
+        ):
+            # .get() may hand back an unresolved indirect reference.
+            annots = resolve(page_pypdf.get("/Annots"))
+            if not annots:
                 continue
-
-            # Find URLs
-            urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
-
-            # Check for unclear link text
-            for pattern in unclear_patterns:
-                if re.search(pattern, text, re.IGNORECASE):
-                    self.add_issue(
-                        Severity.WARNING,
-                        "Link Text",
-                        f"Page {i+1}: Potentially unclear link text detected",
-                        wcag_criterion="2.4.4",
-                        recommendation="Use descriptive link text that makes sense out of context",
-                        page_number=i+1
-                    )
-                    break
+
+            page_height = float(page_plumber.height)
+            page_x0, page_top, page_x1, page_bottom = page_plumber.bbox
+
+            for annot_ref in annots:
+                # Best effort: a malformed annotation is skipped, not fatal.
+                try:
+                    annot = resolve(annot_ref)
+                    if annot.get("/Subtype") != "/Link":
+                        continue
+                    # Only links whose action is a URI are checked.
+                    action = resolve(annot.get("/A"))
+                    if not action or action.get("/S") != "/URI":
+                        continue
+                    rect = resolve(annot.get("/Rect"))
+                    if not rect or len(rect) < 4:
+                        continue
+                    # /Rect may give any two diagonally opposite corners.
+                    x0, x1 = sorted((float(rect[0]), float(rect[2])))
+                    y0, y1 = sorted((float(rect[1]), float(rect[3])))
+                except Exception:
+                    continue
+
+                # PDF coords (bottom-left origin) -> pdfplumber coords (top-left
+                # origin), clamped to the page because within_bbox() rejects
+                # boxes that extend past the page.
+                left = max(x0, page_x0)
+                right = min(x1, page_x1)
+                top = max(page_height - y1, page_top)
+                bottom = min(page_height - y0, page_bottom)
+                if left >= right or top >= bottom:
+                    continue
+
+                # Extract only the text inside the hyperlink rectangle.
+                try:
+                    raw_text = page_plumber.within_bbox(
+                        (left, top, right, bottom)
+                    ).extract_text()
+                except Exception:
+                    continue
+
+                # Collapse whitespace so a label wrapped over lines still matches.
+                link_text = " ".join((raw_text or "").split())
+                if not link_text:
+                    continue  # image-only link — skip
+
+                if unclear_re.search(link_text):
+                    self.add_issue(
+                        Severity.WARNING,
+                        "Link Text",
+                        f"Page {i+1}: Unclear link text \"{link_text}\" — should describe the destination",
+                        wcag_criterion="2.4.4",
+                        recommendation="Use descriptive link text that makes sense out of context",
+                        page_number=i+1
+                    )
+                    break  # one issue per page
 
     def _check_headings(self):
         """Check heading structure and hierarchy"""