# volt-newsroom-scraper-report/content_processor.py

"""
Content processor for URL detection and classification.
Identifies social media vs regular URLs and extracts URLs from text.
"""

import re
from urllib.parse import urlparse
from config import Config


def is_social_media_url(url):
    """
    Determine if a URL is from a social media platform.

    The URL's host is compared against every entry of
    ``Config.SOCIAL_MEDIA_DOMAINS``. A host matches an entry when it is equal
    to it or is a subdomain of it (``mobile.twitter.com`` matches
    ``twitter.com``). Plain substring matching is deliberately avoided:
    it would classify ``netflix.com`` as ``x.com``.

    Args:
        url (str): The URL to check

    Returns:
        bool: True if URL is from a social media platform, False otherwise
            (including when the URL cannot be parsed)
    """
    try:
        # ``hostname`` is already lower-cased and excludes any port or
        # ``user:password@`` prefix, which would otherwise break the
        # suffix comparison below. A trailing root dot is dropped as well.
        host = (urlparse(url).hostname or '').rstrip('.')
        if not host:
            return False

        # NOTE(review): assumes the configured entries are bare domains such
        # as 'twitter.com' - confirm against Config.
        for social_domain in Config.SOCIAL_MEDIA_DOMAINS:
            known = social_domain.lower().strip('.')
            if known.startswith('www.'):
                known = known[4:]
            if not known:
                # An empty entry would otherwise match every host.
                continue

            # A leading 'www.' on the host is covered by the subdomain test.
            if host == known or host.endswith('.' + known):
                return True

        return False

    except Exception as e:
        # Best effort: a malformed URL is reported and treated as non-social.
        print(f"Error parsing URL {url}: {e}")
        return False


def get_social_platform(url):
    """
    Identify which social media platform a URL belongs to.

    The URL's host must be equal to a known platform domain or be a
    subdomain of one (``www.instagram.com``, ``vm.tiktok.com``). Substring
    matching is deliberately avoided so that hosts such as ``netflix.com``
    are not mistaken for ``x.com``.

    Args:
        url (str): The social media URL

    Returns:
        str: Platform name ('twitter', 'instagram', 'tiktok', 'linkedin') or None
            when the host belongs to none of them or the URL cannot be parsed
    """
    # Platform name -> domains that belong to it, checked in this order.
    platform_domains = (
        ('twitter', ('twitter.com', 'x.com')),
        ('instagram', ('instagram.com',)),
        ('tiktok', ('tiktok.com',)),
        ('linkedin', ('linkedin.com',)),
    )

    try:
        # ``hostname`` is lower-cased and free of port / userinfo; the
        # trailing root dot of a fully-qualified name is dropped too.
        host = (urlparse(url).hostname or '').rstrip('.')

        for platform, domains in platform_domains:
            for known in domains:
                # Exact host or a dot-delimited subdomain (covers 'www.').
                if host == known or host.endswith('.' + known):
                    return platform

        return None

    except Exception as e:
        # Best effort: report the problem and fall back to "unknown".
        print(f"Error identifying platform for {url}: {e}")
        return None


# Characters permitted in a URI by RFC 3986 (unreserved, reserved and '%').
# Unlike a '$-_' range, this excludes '<', '>', '\\' and '^' and includes
# '#' and '~', so fragments and '~user' paths are kept intact.
_URL_PATTERN = re.compile(r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+")

# Punctuation that is legal inside a URL but, at the very end of a match,
# almost always belongs to the surrounding sentence.
_TRAILING_PUNCTUATION = ".,;:!?'"


def _strip_trailing_punctuation(url):
    """Remove sentence punctuation and unbalanced closing brackets from the end of *url*."""
    while url:
        last = url[-1]
        if last in _TRAILING_PUNCTUATION:
            url = url[:-1]
        elif last == ')' and url.count('(') < url.count(')'):
            # Keeps balanced pairs such as '/wiki/Foo_(bar)'.
            url = url[:-1]
        elif last == ']' and url.count('[') < url.count(']'):
            url = url[:-1]
        else:
            break
    return url


def extract_urls_from_text(text):
    """
    Extract all URLs from a text string.

    Only ``http://`` and ``https://`` URLs are recognised. Punctuation that
    trails a URL in running text (a full stop, comma, closing parenthesis of
    an enclosing "(see ...)" and similar) is not part of the result.

    Args:
        text (str): Text potentially containing URLs

    Returns:
        list: List of extracted URLs, in order of appearance
    """
    if not text:
        return []

    urls = []
    for candidate in _URL_PATTERN.findall(text):
        url = _strip_trailing_punctuation(candidate)
        # Skip matches that were nothing but a scheme followed by punctuation.
        if url and not url.endswith('://'):
            urls.append(url)
    return urls


def classify_urls(urls):
    """
    Split URLs into social media links and everything else.

    Each URL is tested with ``is_social_media_url``; social links are
    additionally tagged with the result of ``get_social_platform``.
    Input order is preserved within each group.

    Args:
        urls (list): List of URLs to classify

    Returns:
        dict: Dictionary with 'social' and 'regular' lists
            {
                'social': [{'url': str, 'platform': str}, ...],
                'regular': [str, ...]
            }
    """
    social_entries = []
    regular_urls = []

    for candidate in urls:
        # Anything not recognised as social media goes to the regular bucket.
        if not is_social_media_url(candidate):
            regular_urls.append(candidate)
            continue

        entry = {'url': candidate, 'platform': get_social_platform(candidate)}
        social_entries.append(entry)

    return {'social': social_entries, 'regular': regular_urls}


def extract_domain(url):
    """
    Return the lower-cased network location of a URL for display.

    A leading 'www.' is dropped. If the URL cannot be processed, the
    input is handed back unchanged.

    Args:
        url (str): The URL

    Returns:
        str: Clean domain name
    """
    try:
        host = urlparse(url).netloc.lower()
        # Slice off the four characters of a leading 'www.'.
        return host[4:] if host.startswith('www.') else host
    except Exception:
        return url