volt-newsroom-scraper-report/extract_hyperlinks.py

#!/usr/bin/env python3
"""
Helper module to extract hyperlinks from Google Sheets rich text formatting.
"""

from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials


def extract_hyperlinks_from_sheet(sheet_id, tab_name, start_row, end_row, service_account_file):
    """
    Extract hyperlinks from Google Sheets cells using Sheets API v4.

    Args:
        sheet_id: The Google Sheets ID
        tab_name: The worksheet tab name
        start_row: Starting row number (1-indexed)
        end_row: Ending row number (1-indexed)
        service_account_file: Path to service account JSON file

    Returns:
        dict: Mapping of cell addresses to their hyperlinks
            Example: {'B20': 'https://example.com', 'C21': 'https://...'}
    """
    # Set up credentials
    scopes = [
        'https://www.googleapis.com/auth/spreadsheets.readonly',
        'https://www.googleapis.com/auth/drive.readonly'
    ]

    creds = Credentials.from_service_account_file(service_account_file, scopes=scopes)

    # Build the Sheets API service
    service = build('sheets', 'v4', credentials=creds)

    # Define the range (A-G, 7 columns total)
    range_name = f"'{tab_name}'!A{start_row}:G{end_row}"

    # Request the sheet data with textFormatRuns (contains hyperlinks)
    request = service.spreadsheets().get(
        spreadsheetId=sheet_id,
        ranges=[range_name],
        fields='sheets(data(rowData(values(hyperlink,textFormatRuns,formattedValue))))'
    )

    response = request.execute()

    # Extract hyperlinks
    hyperlinks = {}

    try:
        sheets = response.get('sheets', [])
        if not sheets:
            return hyperlinks

        data = sheets[0].get('data', [])
        if not data:
            return hyperlinks

        row_data = data[0].get('rowData', [])

        # Column letters mapping (0-indexed to column letters) - only A-G exist
        col_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

        for row_idx, row in enumerate(row_data):
            actual_row = start_row + row_idx
            values = row.get('values', [])

            for col_idx, cell in enumerate(values):
                if col_idx >= len(col_letters):
                    continue

                col_letter = col_letters[col_idx]
                cell_address = f"{col_letter}{actual_row}"

                # Check for direct hyperlink
                if 'hyperlink' in cell:
                    hyperlinks[cell_address] = cell['hyperlink']
                    continue

                # Check for hyperlinks in textFormatRuns (rich text)
                text_format_runs = cell.get('textFormatRuns', [])
                for run in text_format_runs:
                    text_format = run.get('format', {})
                    if 'link' in text_format:
                        link_url = text_format['link'].get('uri')
                        if link_url:
                            hyperlinks[cell_address] = link_url
                            break  # Take the first hyperlink in the cell

    except Exception as e:
        print(f"Error extracting hyperlinks: {e}")

    return hyperlinks


if __name__ == '__main__':
    # Test the function
    from config import Config

    print("Testing hyperlink extraction...\n")

    hyperlinks = extract_hyperlinks_from_sheet(
        sheet_id=Config.GOOGLE_SHEET_ID,
        tab_name=Config.GOOGLE_SHEET_TAB,
        start_row=20,
        end_row=25,
        service_account_file=Config.GOOGLE_SERVICE_ACCOUNT_FILE
    )

    print(f"Found {len(hyperlinks)} hyperlinks:\n")
    for cell_addr, url in hyperlinks.items():
        print(f"{cell_addr}: {url}")