- Google Sheets integration with service account - Dual-track scraping (Firecrawl + Apify) - Claude API summarization - Newsletter-style PDF generation with Montserrat font - Complete documentation and setup instructions
148 lines
3.3 KiB
Python
148 lines
3.3 KiB
Python
"""
Content processor for URL detection and classification.

Identifies social media vs regular URLs and extracts URLs from text.
"""
|
|
|
|
import re
|
|
from urllib.parse import urlparse
|
|
from config import Config
|
|
|
|
|
|
def is_social_media_url(url):
    """
    Determine if a URL is from a social media platform.

    The URL's hostname is compared against every entry of
    ``Config.SOCIAL_MEDIA_DOMAINS``. An entry matches when the hostname is
    exactly that domain or one of its subdomains (``www.``, ``mobile.``,
    ...). A plain substring test is deliberately avoided: ``'x.com'`` is a
    substring of ``'netflix.com'``, so substring matching misclassifies
    unrelated sites as social media.

    Args:
        url (str): The URL to check

    Returns:
        bool: True if URL is from a social media platform
    """
    try:
        # ``hostname`` is lower-cased and excludes any port or credentials;
        # a trailing dot (fully-qualified form) is dropped before comparing.
        host = (urlparse(url).hostname or '').rstrip('.')
        if not host:
            return False

        # NOTE(review): assumes the configured entries are bare domains such
        # as 'twitter.com' -- confirm against config.Config.
        for social_domain in Config.SOCIAL_MEDIA_DOMAINS:
            candidate = social_domain.lower().strip('.')
            if candidate.startswith('www.'):
                candidate = candidate[4:]
            if not candidate:
                continue
            if host == candidate or host.endswith('.' + candidate):
                return True

        return False

    except Exception as e:
        # Best-effort classification: a malformed URL is reported and
        # treated as "not social media" rather than aborting the caller.
        print(f"Error parsing URL {url}: {e}")
        return False
|
|
|
|
|
|
def get_social_platform(url):
    """
    Identify which social media platform a URL belongs to.

    The URL's hostname must equal one of the known platform domains or be a
    subdomain of it (``www.twitter.com``, ``mobile.twitter.com``). Substring
    matching is deliberately avoided because ``'x.com'`` occurs inside
    unrelated hosts such as ``netflix.com`` or ``dropbox.com``.

    Args:
        url (str): The social media URL

    Returns:
        str: Platform name ('twitter', 'instagram', 'tiktok', 'linkedin') or None
    """
    # Ordered (platform, domains) pairs; x.com is reported as 'twitter'.
    platform_domains = (
        ('twitter', ('twitter.com', 'x.com')),
        ('instagram', ('instagram.com',)),
        ('tiktok', ('tiktok.com',)),
        ('linkedin', ('linkedin.com',)),
    )

    try:
        # ``hostname`` is lower-cased and excludes any port or credentials;
        # a trailing dot (fully-qualified form) is dropped before comparing.
        host = (urlparse(url).hostname or '').rstrip('.')
        if not host:
            return None

        for platform, domains in platform_domains:
            for domain in domains:
                if host == domain or host.endswith('.' + domain):
                    return platform

        return None

    except Exception as e:
        # Best-effort lookup: a malformed URL is reported and yields None.
        print(f"Error identifying platform for {url}: {e}")
        return None
|
|
|
|
|
|
# Matches http/https URLs. The character class ``[$-_@.&+]`` is a range that
# also covers sentence punctuation ('.', ',', ';', ':', ')', '>' ...), so raw
# matches can carry trailing punctuation that is not part of the URL.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Characters that are dropped whenever they end a matched URL.
_TRAILING_PUNCTUATION = ".,;:!?'<>"

# Closing brackets are dropped only when they have no matching opener in the URL.
_BRACKET_PAIRS = {')': '(', ']': '['}


def _trim_trailing_punctuation(url):
    """Strip sentence punctuation and unbalanced closing brackets from the end of *url*."""
    while url:
        last = url[-1]
        if last in _TRAILING_PUNCTUATION:
            url = url[:-1]
        elif last in _BRACKET_PAIRS and url.count(last) > url.count(_BRACKET_PAIRS[last]):
            # e.g. "(see http://a.org/b)" -- the ')' belongs to the sentence,
            # whereas ".../wiki/Foo_(bar)" keeps its balanced ')'.
            url = url[:-1]
        else:
            break
    return url


def extract_urls_from_text(text):
    """
    Extract all URLs from a text string.

    URLs are returned in order of appearance, duplicates included. Trailing
    sentence punctuation (for example the full stop in
    ``"See https://example.com."``) and unbalanced closing brackets are
    removed from each match, so the returned values are usable as-is.

    Args:
        text (str): Text potentially containing URLs

    Returns:
        list: List of extracted URLs
    """
    if not text:
        return []

    urls = []
    for match in _URL_PATTERN.findall(text):
        cleaned = _trim_trailing_punctuation(match)
        # Skip matches that were nothing but a scheme plus punctuation.
        if cleaned.endswith('://'):
            continue
        urls.append(cleaned)
    return urls
|
|
|
|
|
|
def classify_urls(urls):
    """
    Split URLs into social media links and everything else.

    Each URL is tested with ``is_social_media_url``; social URLs are stored
    together with the platform reported by ``get_social_platform``, all
    others are stored unchanged. Input order is preserved within each list.

    Args:
        urls (list): List of URLs to classify

    Returns:
        dict: Dictionary with 'social' and 'regular' lists
            {
                'social': [{'url': str, 'platform': str}, ...],
                'regular': [str, ...]
            }
    """
    social_entries = []
    regular_urls = []

    for candidate in urls:
        if not is_social_media_url(candidate):
            regular_urls.append(candidate)
            continue

        social_entries.append({
            'url': candidate,
            'platform': get_social_platform(candidate),
        })

    return {'social': social_entries, 'regular': regular_urls}
|
|
|
|
|
|
def extract_domain(url):
    """
    Return the lower-cased network location of a URL for display.

    A leading ``www.`` is removed. If the URL cannot be processed, the
    input is handed back unchanged.

    Args:
        url (str): The URL

    Returns:
        str: Clean domain name
    """
    try:
        host = urlparse(url).netloc.lower()
        cleaned = host[4:] if host.startswith('www.') else host
    except Exception:
        # Fall back to the raw input so callers always have something to show.
        return url
    return cleaned
|