volt-newsroom-scraper-report/content_processor.py
DJP 2ee799ace6 Initial commit: Newsroom Daily Report Generator
- Google Sheets integration with service account
- Dual-track scraping (Firecrawl + Apify)
- Claude API summarization
- Newsletter-style PDF generation with Montserrat font
- Complete documentation and setup instructions
2026-01-06 12:26:53 -05:00

148 lines
3.3 KiB
Python

"""
Content processor for URL detection and classification.
Identifies social media vs regular URLs and extracts URLs from text.
"""
import re
from urllib.parse import urlparse
from config import Config
def is_social_media_url(url):
    """
    Determine if a URL is from a social media platform.

    The URL's host is compared against each entry of
    ``Config.SOCIAL_MEDIA_DOMAINS`` on domain-label boundaries: the host
    matches when it equals the entry or is a subdomain of it. A plain
    substring test is deliberately avoided because an entry such as
    ``x.com`` would otherwise match unrelated hosts like ``vox.com``.

    Args:
        url (str): The URL to check

    Returns:
        bool: True if URL is from a social media platform, False otherwise
            (including when the URL has no host or cannot be parsed)
    """
    if not isinstance(url, str):
        return False

    try:
        # hostname (unlike netloc) is lowercased and excludes any
        # "user:pass@" prefix and ":port" suffix.
        host = (urlparse(url).hostname or '').rstrip('.')

        # Remove 'www.' prefix if present
        if host.startswith('www.'):
            host = host[4:]
        if not host:
            return False

        # NOTE(review): assumes the configured entries are bare domain names
        # such as 'twitter.com' -- confirm against config.py.
        for social_domain in Config.SOCIAL_MEDIA_DOMAINS:
            entry = social_domain.lower().strip('.')
            if entry.startswith('www.'):
                entry = entry[4:]
            if entry and (host == entry or host.endswith('.' + entry)):
                return True
        return False
    except Exception as e:
        # Best-effort classification: a malformed URL or a malformed config
        # entry is reported and treated as "not social" rather than raised.
        print(f"Error parsing URL {url}: {e}")
        return False
def get_social_platform(url):
    """
    Identify which social media platform a URL belongs to.

    The host is matched on domain-label boundaries (exact domain or any
    subdomain of it). A substring test is not used because ``'x.com'`` is a
    substring of unrelated hosts such as ``vox.com`` or ``netflix.com``.

    Args:
        url (str): The social media URL

    Returns:
        str: Platform name ('twitter', 'instagram', 'tiktok', 'linkedin') or None
            when the host is not one of the known platforms or the URL
            cannot be parsed
    """
    if not isinstance(url, str):
        return None

    # (registered domain, platform name); both twitter.com and x.com map
    # to 'twitter'.
    platform_domains = (
        ('twitter.com', 'twitter'),
        ('x.com', 'twitter'),
        ('instagram.com', 'instagram'),
        ('tiktok.com', 'tiktok'),
        ('linkedin.com', 'linkedin'),
    )

    try:
        # hostname is already lowercased and excludes userinfo and port.
        host = urlparse(url).hostname or ''
    except ValueError as e:
        # urlparse raises ValueError for malformed hosts (e.g. bad IPv6).
        print(f"Error identifying platform for {url}: {e}")
        return None

    host = host.rstrip('.')
    # Remove 'www.' prefix
    if host.startswith('www.'):
        host = host[4:]

    for domain, platform in platform_domains:
        if host == domain or host.endswith('.' + domain):
            return platform
    return None
def extract_urls_from_text(text):
    """
    Extract all http/https URLs from a text string.

    URLs are matched using the ASCII characters that RFC 3986 allows in a
    URI (including '#' fragments and '~'), then trailing sentence
    punctuation and unbalanced closing brackets are trimmed so that text
    like "see https://example.com/page." yields "https://example.com/page".

    Args:
        text (str): Text potentially containing URLs

    Returns:
        list: List of extracted URLs, in order of appearance
    """
    if not text:
        return []

    # Unreserved + reserved URI characters plus '%' for percent-escapes.
    url_pattern = r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+"
    # Closing brackets are only part of the URL when they have an opener
    # inside it (e.g. Wikipedia-style ".../Rust_(programming_language)").
    openers = {')': '(', ']': '['}

    urls = []
    for candidate in re.findall(url_pattern, text):
        while candidate:
            last = candidate[-1]
            if last in ".,;:!?'":
                candidate = candidate[:-1]
            elif last in openers and candidate.count(last) > candidate.count(openers[last]):
                candidate = candidate[:-1]
            else:
                break

        # Skip matches that were nothing but a scheme plus punctuation.
        _, _, remainder = candidate.partition('://')
        if remainder:
            urls.append(candidate)
    return urls
def classify_urls(urls):
    """
    Split a list of URLs into social media links and everything else.

    Args:
        urls (list): List of URLs to classify

    Returns:
        dict: Dictionary with 'social' and 'regular' lists
            {
                'social': [{'url': str, 'platform': str}, ...],
                'regular': [str, ...]
            }
    """
    social_entries = []
    regular_urls = []

    for candidate in urls:
        # Anything not recognised as social media is kept as a plain URL.
        if not is_social_media_url(candidate):
            regular_urls.append(candidate)
            continue

        # Social links carry the platform name alongside the URL.
        social_entries.append({
            'url': candidate,
            'platform': get_social_platform(candidate),
        })

    return {
        'social': social_entries,
        'regular': regular_urls,
    }
def extract_domain(url):
    """
    Extract clean domain name from URL for display.

    Only the host is returned: any "user:password@" prefix and ":port"
    suffix present in the URL's network location are dropped, the host is
    lowercased, and a leading 'www.' is removed.

    Args:
        url (str): The URL

    Returns:
        str: Clean domain name; an empty string when the URL has no host
            (e.g. no scheme); the input unchanged when it is not a string
            or cannot be parsed
    """
    if not isinstance(url, str):
        return url

    try:
        # hostname is lowercased and excludes credentials and port, which
        # netloc would include.
        host = urlparse(url).hostname or ''
    except ValueError:
        # Malformed host (e.g. unbalanced IPv6 brackets): fall back to the
        # raw input so the caller still has something to display.
        return url

    # Remove 'www.' prefix
    if host.startswith('www.'):
        host = host[4:]
    return host