Oliver-ai-bot_2.0/backend/app/core/web_scraper.py
Vadym Samoilenko 44a512c41f Phase 1 Complete: Dual-bot architecture, knowledge base, access control
- Remove notebook mode, add RAG + Personal Assistant dual-bot setup
- Add knowledge base management (upload, URL scraping, document processing)
- Add user feature access control (allowed_features, features_override)
- Update admin dashboard with knowledge base tab
- Redesign login page, sidebar, and profile
- Add Celery tasks for async document processing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 21:26:40 +00:00

67 lines
1.8 KiB
Python

"""
Web Scraper — extract clean text from URLs using trafilatura
"""
import logging
from typing import Optional
from urllib.parse import urlparse
import trafilatura
# Cap on the extracted text returned by scrape_url, measured in characters
# (about 500 KB for mostly single-byte content). Longer results are truncated.
MAX_CONTENT_LENGTH = 500_000

# Intended time limit for a single page download, in seconds.
TIMEOUT = 30

# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)
class WebScraperError(Exception):
    """Signal that a URL could not be validated, fetched, or turned into text."""
def validate_url(url: str) -> str:
    """Validate an absolute http(s) URL and return it in trimmed form.

    Args:
        url: Candidate URL to check.

    Returns:
        The URL with leading/trailing whitespace removed; otherwise unchanged.

    Raises:
        WebScraperError: If ``url`` is not a string, cannot be parsed, uses a
            scheme other than http/https, or contains no host.
    """
    if not isinstance(url, str):
        raise WebScraperError("Invalid URL: expected a string.")

    # Stray whitespace from copy/paste would otherwise end up in the host or path.
    url = url.strip()

    try:
        parsed = urlparse(url)
    except ValueError as err:
        # urlparse raises ValueError for malformed bracketed hosts such as
        # "http://[::1"; surface it as the module's own error type.
        raise WebScraperError(f"Invalid URL: {err}") from err

    if parsed.scheme not in ("http", "https"):
        raise WebScraperError(f"Invalid URL scheme: {parsed.scheme}. Only http/https allowed.")

    # Check hostname rather than netloc: a netloc such as ":80" or "user@" is
    # non-empty yet names no host at all.
    if not parsed.hostname:
        raise WebScraperError("Invalid URL: no host found.")

    # NOTE(review): hosts that resolve to loopback/private addresses are not
    # rejected here; if URLs come from untrusted users, confirm whether
    # server-side request forgery protection is needed at this point.
    return url
def scrape_url(url: str, output_format: str = "markdown") -> str:
    """
    Fetch a web page and extract its main content.

    Args:
        url: The URL to scrape. It is checked with validate_url first.
        output_format: "markdown" or "text". "text" is translated to
            trafilatura's "txt"; any other value is handed to
            trafilatura.extract unchanged.

    Returns:
        Extracted content as markdown or plain text, truncated to at most
        MAX_CONTENT_LENGTH characters.

    Raises:
        WebScraperError: If the URL is invalid, the download returns nothing,
            or no content could be extracted.
    """
    # Local import: the settings helper is only needed to build the download config.
    from trafilatura.settings import use_config

    url = validate_url(url)
    logger.info("Scraping URL: %s", url)

    # Make the module-level TIMEOUT the effective download timeout instead of
    # relying on whatever default trafilatura ships with.
    download_config = use_config()
    download_config.set("DEFAULT", "DOWNLOAD_TIMEOUT", str(TIMEOUT))

    downloaded = trafilatura.fetch_url(url, config=download_config)
    if not downloaded:
        raise WebScraperError(f"Failed to fetch URL: {url}")

    # trafilatura names its plain-text format "txt"; map the "text" alias that
    # this function advertises so the library does not reject it.
    extract_format = "txt" if output_format == "text" else output_format

    result = trafilatura.extract(
        downloaded,
        output_format=extract_format,
        include_links=True,
        include_tables=True,
        include_images=False,
    )
    if not result:
        raise WebScraperError(f"No content extracted from URL: {url}")

    # Cap the returned text length; the cut is a plain character slice.
    if len(result) > MAX_CONTENT_LENGTH:
        logger.warning(
            "Content truncated from %d to %d chars for URL: %s",
            len(result),
            MAX_CONTENT_LENGTH,
            url,
        )
        result = result[:MAX_CONTENT_LENGTH]

    logger.info("Extracted %d chars from %s", len(result), url)
    return result