volt-newsroom-scraper-report/extract_hyperlinks.py
DJP ff1893d26b Fix column mapping and add MAMP/Apache deployment guides
- FIXED: Column mapping now correct (A-G, not B-H)
- FIXED: INSPIRATION category now included (column G)
- Updated extract_hyperlinks to use A-G range (7 columns total)
- Verified headers in Row 8: columns A-G match all 7 categories
- Added MAMP_SETUP.md for local MAMP development
- Added APACHE_DEPLOY.md for production Apache deployment
- Added .htaccess with security headers and PHP settings
- Updated extract range from B-H to A-G throughout codebase
- All 7 categories now properly extracted: HARD NEWS, POP CULTURE,
  PRODUCT SPOTTING, INTERNET CULTURE, INDUSTRY NEWS, SOCIAL UPDATES, INSPIRATION
2026-01-06 13:38:01 -05:00

113 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""
Helper module to extract hyperlinks from Google Sheets rich text formatting.
"""
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
def extract_hyperlinks_from_sheet(sheet_id, tab_name, start_row, end_row, service_account_file):
"""
Extract hyperlinks from Google Sheets cells using Sheets API v4.
Args:
sheet_id: The Google Sheets ID
tab_name: The worksheet tab name
start_row: Starting row number (1-indexed)
end_row: Ending row number (1-indexed)
service_account_file: Path to service account JSON file
Returns:
dict: Mapping of cell addresses to their hyperlinks
Example: {'B20': 'https://example.com', 'C21': 'https://...'}
"""
# Set up credentials
scopes = [
'https://www.googleapis.com/auth/spreadsheets.readonly',
'https://www.googleapis.com/auth/drive.readonly'
]
creds = Credentials.from_service_account_file(service_account_file, scopes=scopes)
# Build the Sheets API service
service = build('sheets', 'v4', credentials=creds)
# Define the range (A-G, 7 columns total)
range_name = f"'{tab_name}'!A{start_row}:G{end_row}"
# Request the sheet data with textFormatRuns (contains hyperlinks)
request = service.spreadsheets().get(
spreadsheetId=sheet_id,
ranges=[range_name],
fields='sheets(data(rowData(values(hyperlink,textFormatRuns,formattedValue))))'
)
response = request.execute()
# Extract hyperlinks
hyperlinks = {}
try:
sheets = response.get('sheets', [])
if not sheets:
return hyperlinks
data = sheets[0].get('data', [])
if not data:
return hyperlinks
row_data = data[0].get('rowData', [])
# Column letters mapping (0-indexed to column letters) - only A-G exist
col_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
for row_idx, row in enumerate(row_data):
actual_row = start_row + row_idx
values = row.get('values', [])
for col_idx, cell in enumerate(values):
if col_idx >= len(col_letters):
continue
col_letter = col_letters[col_idx]
cell_address = f"{col_letter}{actual_row}"
# Check for direct hyperlink
if 'hyperlink' in cell:
hyperlinks[cell_address] = cell['hyperlink']
continue
# Check for hyperlinks in textFormatRuns (rich text)
text_format_runs = cell.get('textFormatRuns', [])
for run in text_format_runs:
text_format = run.get('format', {})
if 'link' in text_format:
link_url = text_format['link'].get('uri')
if link_url:
hyperlinks[cell_address] = link_url
break # Take the first hyperlink in the cell
except Exception as e:
print(f"Error extracting hyperlinks: {e}")
return hyperlinks
if __name__ == '__main__':
# Test the function
from config import Config
print("Testing hyperlink extraction...\n")
hyperlinks = extract_hyperlinks_from_sheet(
sheet_id=Config.GOOGLE_SHEET_ID,
tab_name=Config.GOOGLE_SHEET_TAB,
start_row=20,
end_row=25,
service_account_file=Config.GOOGLE_SERVICE_ACCOUNT_FILE
)
print(f"Found {len(hyperlinks)} hyperlinks:\n")
for cell_addr, url in hyperlinks.items():
print(f"{cell_addr}: {url}")