- Remove notebook mode, add RAG + Personal Assistant dual-bot setup - Add knowledge base management (upload, URL scraping, document processing) - Add user feature access control (allowed_features, features_override) - Update admin dashboard with knowledge base tab - Redesign login page, sidebar, and profile - Add Celery tasks for async document processing Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
67 lines
1.8 KiB
Python
67 lines
1.8 KiB
Python
"""
|
|
Web Scraper — extract clean text from URLs using trafilatura
|
|
"""
|
|
import logging
|
|
from typing import Optional
|
|
from urllib.parse import urlparse
|
|
|
|
import trafilatura
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Upper bound, in characters, on the extracted content returned by scrape_url;
# longer results are truncated to this length and a warning is logged.
MAX_CONTENT_LENGTH = 500_000 # ~500KB of text
# NOTE(review): TIMEOUT is not referenced by any code visible in this module —
# confirm whether it is meant to be passed to the download step.
TIMEOUT = 30 # seconds
|
|
|
|
|
|
class WebScraperError(Exception):
    """Error raised when a URL is rejected or its content cannot be scraped."""
|
|
|
|
|
|
def validate_url(url: str) -> str:
    """Validate a URL and return it with surrounding whitespace removed.

    Only absolute ``http``/``https`` URLs that name a host are accepted.

    Args:
        url: The candidate URL.

    Returns:
        The URL stripped of leading and trailing whitespace.

    Raises:
        WebScraperError: If ``url`` is not a string, cannot be parsed, uses a
            scheme other than http/https, or does not contain a host.
    """
    if not isinstance(url, str):
        raise WebScraperError(
            f"Invalid URL: expected a string, got {type(url).__name__}."
        )

    # Validate and return the same, trimmed string so callers never fetch a
    # value that differs from the one that was checked.
    url = url.strip()

    try:
        parsed = urlparse(url)
    except ValueError as err:
        # urlparse raises ValueError for malformed authorities (for example an
        # unterminated IPv6 literal); surface it as the module's own error.
        raise WebScraperError(f"Invalid URL: {err}") from err

    if parsed.scheme not in ("http", "https"):
        raise WebScraperError(f"Invalid URL scheme: {parsed.scheme}. Only http/https allowed.")

    # Check hostname rather than netloc: "http://:80" and "http://user@" have a
    # non-empty netloc but no actual host.
    if not parsed.hostname:
        raise WebScraperError("Invalid URL: no host found.")

    # NOTE(review): this does not reject loopback/private/link-local hosts. If
    # URLs come from untrusted users, confirm SSRF protection exists elsewhere.
    return url
|
|
|
|
|
|
def scrape_url(url: str, output_format: str = "markdown") -> str:
    """Fetch a URL and extract its main content.

    Args:
        url: The URL to scrape. It is checked with ``validate_url`` first.
        output_format: ``"markdown"`` or ``"text"``. ``"text"`` is translated
            to trafilatura's own name for plain text, ``"txt"``; any other
            value is passed through to trafilatura unchanged.

    Returns:
        The extracted content, truncated to ``MAX_CONTENT_LENGTH`` characters.

    Raises:
        WebScraperError: If the URL is invalid, the download fails, extraction
            raises, or no content is extracted.
    """
    url = validate_url(url)

    # trafilatura calls its plain-text output "txt"; accept the documented
    # "text" spelling as an alias so it is not rejected by the extractor.
    extract_format = "txt" if output_format == "text" else output_format

    logger.info("Scraping URL: %s", url)

    # NOTE(review): the module-level TIMEOUT constant is not applied here; the
    # download uses trafilatura's own configuration — confirm this is intended.
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        raise WebScraperError(f"Failed to fetch URL: {url}")

    try:
        result = trafilatura.extract(
            downloaded,
            output_format=extract_format,
            include_links=True,
            include_tables=True,
            include_images=False,
        )
    except Exception as err:
        # Boundary with a third-party library: keep the documented contract
        # that extraction failures surface as WebScraperError, with the
        # original exception chained for debugging.
        raise WebScraperError(f"Failed to extract content from URL: {url}") from err

    # Treat whitespace-only output the same as no output.
    if not result or not result.strip():
        raise WebScraperError(f"No content extracted from URL: {url}")

    if len(result) > MAX_CONTENT_LENGTH:
        logger.warning("Content truncated from %d to %d chars for URL: %s", len(result), MAX_CONTENT_LENGTH, url)
        result = result[:MAX_CONTENT_LENGTH]

    logger.info("Extracted %d chars from %s", len(result), url)
    return result
|