""" Content processor for URL detection and classification. Identifies social media vs regular URLs and extracts URLs from text. """ import re from urllib.parse import urlparse from config import Config def is_social_media_url(url): """ Determine if a URL is from a social media platform. Args: url (str): The URL to check Returns: bool: True if URL is from a social media platform """ try: parsed = urlparse(url) domain = parsed.netloc.lower() # Remove 'www.' prefix if present if domain.startswith('www.'): domain = domain[4:] # Check against social media domains for social_domain in Config.SOCIAL_MEDIA_DOMAINS: if social_domain in domain: return True return False except Exception as e: print(f"Error parsing URL {url}: {e}") return False def get_social_platform(url): """ Identify which social media platform a URL belongs to. Args: url (str): The social media URL Returns: str: Platform name ('twitter', 'instagram', 'tiktok', 'linkedin') or None """ try: parsed = urlparse(url) domain = parsed.netloc.lower() # Remove 'www.' prefix if domain.startswith('www.'): domain = domain[4:] if 'twitter.com' in domain or 'x.com' in domain: return 'twitter' elif 'instagram.com' in domain: return 'instagram' elif 'tiktok.com' in domain: return 'tiktok' elif 'linkedin.com' in domain: return 'linkedin' return None except Exception as e: print(f"Error identifying platform for {url}: {e}") return None def extract_urls_from_text(text): """ Extract all URLs from a text string. Args: text (str): Text potentially containing URLs Returns: list: List of extracted URLs """ if not text: return [] # URL regex pattern url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' urls = re.findall(url_pattern, text) return urls def classify_urls(urls): """ Classify a list of URLs into social media and regular categories. Args: urls (list): List of URLs to classify Returns: dict: Dictionary with 'social' and 'regular' lists { 'social': [{'url': str, 'platform': str}, ...], 'regular': [str, ...] } """ classified = { 'social': [], 'regular': [] } for url in urls: if is_social_media_url(url): platform = get_social_platform(url) classified['social'].append({ 'url': url, 'platform': platform }) else: classified['regular'].append(url) return classified def extract_domain(url): """ Extract clean domain name from URL for display. Args: url (str): The URL Returns: str: Clean domain name """ try: parsed = urlparse(url) domain = parsed.netloc.lower() # Remove 'www.' prefix if domain.startswith('www.'): domain = domain[4:] return domain except Exception: return url