- FIXED: Column mapping now correct (A-G, not B-H) - FIXED: INSPIRATION category now included (column G) - Updated extract_hyperlinks to use A-G range (7 columns total) - Verified headers in Row 8: columns A-G match all 7 categories - Added MAMP_SETUP.md for local MAMP development - Added APACHE_DEPLOY.md for production Apache deployment - Added .htaccess with security headers and PHP settings - Updated extract range from B-H to A-G throughout codebase - All 7 categories now properly extracted: HARD NEWS, POP CULTURE, PRODUCT SPOTTING, INTERNET CULTURE, INDUSTRY NEWS, SOCIAL UPDATES, INSPIRATION
113 lines
3.6 KiB
Python
113 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Helper module to extract hyperlinks from Google Sheets rich text formatting.
|
|
"""
|
|
|
|
from googleapiclient.discovery import build
|
|
from google.oauth2.service_account import Credentials
|
|
|
|
|
|
def extract_hyperlinks_from_sheet(sheet_id, tab_name, start_row, end_row, service_account_file):
|
|
"""
|
|
Extract hyperlinks from Google Sheets cells using Sheets API v4.
|
|
|
|
Args:
|
|
sheet_id: The Google Sheets ID
|
|
tab_name: The worksheet tab name
|
|
start_row: Starting row number (1-indexed)
|
|
end_row: Ending row number (1-indexed)
|
|
service_account_file: Path to service account JSON file
|
|
|
|
Returns:
|
|
dict: Mapping of cell addresses to their hyperlinks
|
|
Example: {'B20': 'https://example.com', 'C21': 'https://...'}
|
|
"""
|
|
# Set up credentials
|
|
scopes = [
|
|
'https://www.googleapis.com/auth/spreadsheets.readonly',
|
|
'https://www.googleapis.com/auth/drive.readonly'
|
|
]
|
|
|
|
creds = Credentials.from_service_account_file(service_account_file, scopes=scopes)
|
|
|
|
# Build the Sheets API service
|
|
service = build('sheets', 'v4', credentials=creds)
|
|
|
|
# Define the range (A-G, 7 columns total)
|
|
range_name = f"'{tab_name}'!A{start_row}:G{end_row}"
|
|
|
|
# Request the sheet data with textFormatRuns (contains hyperlinks)
|
|
request = service.spreadsheets().get(
|
|
spreadsheetId=sheet_id,
|
|
ranges=[range_name],
|
|
fields='sheets(data(rowData(values(hyperlink,textFormatRuns,formattedValue))))'
|
|
)
|
|
|
|
response = request.execute()
|
|
|
|
# Extract hyperlinks
|
|
hyperlinks = {}
|
|
|
|
try:
|
|
sheets = response.get('sheets', [])
|
|
if not sheets:
|
|
return hyperlinks
|
|
|
|
data = sheets[0].get('data', [])
|
|
if not data:
|
|
return hyperlinks
|
|
|
|
row_data = data[0].get('rowData', [])
|
|
|
|
# Column letters mapping (0-indexed to column letters) - only A-G exist
|
|
col_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
|
|
|
|
for row_idx, row in enumerate(row_data):
|
|
actual_row = start_row + row_idx
|
|
values = row.get('values', [])
|
|
|
|
for col_idx, cell in enumerate(values):
|
|
if col_idx >= len(col_letters):
|
|
continue
|
|
|
|
col_letter = col_letters[col_idx]
|
|
cell_address = f"{col_letter}{actual_row}"
|
|
|
|
# Check for direct hyperlink
|
|
if 'hyperlink' in cell:
|
|
hyperlinks[cell_address] = cell['hyperlink']
|
|
continue
|
|
|
|
# Check for hyperlinks in textFormatRuns (rich text)
|
|
text_format_runs = cell.get('textFormatRuns', [])
|
|
for run in text_format_runs:
|
|
text_format = run.get('format', {})
|
|
if 'link' in text_format:
|
|
link_url = text_format['link'].get('uri')
|
|
if link_url:
|
|
hyperlinks[cell_address] = link_url
|
|
break # Take the first hyperlink in the cell
|
|
|
|
except Exception as e:
|
|
print(f"Error extracting hyperlinks: {e}")
|
|
|
|
return hyperlinks
|
|
|
|
|
|
if __name__ == '__main__':
|
|
# Test the function
|
|
from config import Config
|
|
|
|
print("Testing hyperlink extraction...\n")
|
|
|
|
hyperlinks = extract_hyperlinks_from_sheet(
|
|
sheet_id=Config.GOOGLE_SHEET_ID,
|
|
tab_name=Config.GOOGLE_SHEET_TAB,
|
|
start_row=20,
|
|
end_row=25,
|
|
service_account_file=Config.GOOGLE_SERVICE_ACCOUNT_FILE
|
|
)
|
|
|
|
print(f"Found {len(hyperlinks)} hyperlinks:\n")
|
|
for cell_addr, url in hyperlinks.items():
|
|
print(f"{cell_addr}: {url}")
|