Oliver-ai-bot_2.0/backend/app/core/web_scraper.py

"""
Web Scraper — extract clean text from URLs using trafilatura
"""
import logging
from typing import Optional
from urllib.parse import urlparse

import trafilatura

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Upper bound (in characters) on the extracted text returned by scrape_url;
# longer results are truncated to this length.
MAX_CONTENT_LENGTH = 500_000  # ~500KB of text
# NOTE(review): not referenced in the visible code — presumably meant as the
# download timeout; confirm whether it is actually wired into the fetch.
TIMEOUT = 30  # seconds


class WebScraperError(Exception):
    """Signal that a URL could not be validated, fetched, or extracted."""


def validate_url(url: str) -> str:
    """
    Validate and normalize a URL before it is fetched.

    Args:
        url: Candidate URL; surrounding whitespace is ignored.

    Returns:
        The URL with leading/trailing whitespace removed.

    Raises:
        WebScraperError: If the value is not a string, cannot be parsed,
            does not use the http/https scheme, or has no host.
    """
    if not isinstance(url, str):
        raise WebScraperError("Invalid URL: expected a string.")

    # Normalize: pasted URLs frequently carry stray spaces or newlines.
    url = url.strip()

    try:
        parsed = urlparse(url)
    except ValueError as exc:
        # urlparse raises ValueError for malformed authorities
        # (e.g. unbalanced IPv6 brackets); surface it as our own error type.
        raise WebScraperError(f"Invalid URL: {exc}") from exc

    if parsed.scheme not in ("http", "https"):
        raise WebScraperError(f"Invalid URL scheme: {parsed.scheme}. Only http/https allowed.")
    # Check hostname rather than netloc: a netloc such as ":80" or "user@"
    # is non-empty yet names no host.
    if not parsed.hostname:
        raise WebScraperError("Invalid URL: no host found.")
    return url


def scrape_url(url: str, output_format: str = "markdown") -> str:
    """
    Fetch and extract main content from a URL.

    Args:
        url: The URL to scrape (http/https only)
        output_format: "markdown" or "text". "text" is passed to trafilatura
            under its own name for plain text, "txt"; any other value is
            forwarded to trafilatura unchanged.

    Returns:
        Extracted content as markdown or plain text, truncated to
        MAX_CONTENT_LENGTH characters

    Raises:
        WebScraperError: If the URL is invalid, or fetching or extraction fails
    """
    url = validate_url(url)

    # trafilatura calls its plain-text format "txt"; accept the documented
    # "text" spelling as an alias so callers following the docstring work.
    extract_format = "txt" if output_format == "text" else output_format

    logger.info("Scraping URL: %s", url)

    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        raise WebScraperError(f"Failed to fetch URL: {url}")

    try:
        result = trafilatura.extract(
            downloaded,
            output_format=extract_format,
            include_links=True,
            include_tables=True,
            include_images=False,
        )
    except Exception as exc:
        # Keep the documented contract: extraction failures (for instance an
        # output format the library rejects) reach callers as WebScraperError.
        raise WebScraperError(f"Failed to extract content from URL: {url}") from exc

    if not result:
        raise WebScraperError(f"No content extracted from URL: {url}")

    if len(result) > MAX_CONTENT_LENGTH:
        logger.warning("Content truncated from %d to %d chars for URL: %s", len(result), MAX_CONTENT_LENGTH, url)
        result = result[:MAX_CONTENT_LENGTH]

    logger.info("Extracted %d chars from %s", len(result), url)
    return result