# volt-newsroom-scraper-report/scraper.py

"""
Firecrawl integration for scraping regular websites (news articles, blogs).
URLs are currently scraped one at a time via the single-URL scrape endpoint;
batch scraping is a possible future optimization.
Supports screenshot capture for visual previews.
"""

import requests
import base64
import hashlib
from pathlib import Path
from config import Config


class FirecrawlScraper:
    """Scraper for regular websites using the Firecrawl API.

    Every result returned by this class is a dict with the same keys:
    ``url``, ``title``, ``content``, ``success``, ``error`` and
    ``screenshot_path`` (``None`` unless a screenshot was saved).
    """

    def __init__(self):
        """Initialize Firecrawl scraper with API key."""
        self.api_key = Config.FIRECRAWL_API_KEY
        self.base_url = "https://api.firecrawl.dev/v1"

        # Create temp screenshots directory if enabled. parents=True so a
        # missing parent directory does not make construction fail.
        if Config.ENABLE_SCREENSHOTS:
            Config.TEMP_SCREENSHOTS_DIR.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _failure_result(url, error):
        """Build the result dict used for every failed scrape.

        Args:
            url (str): URL that could not be scraped (also used as the title)
            error (str): Human-readable failure reason

        Returns:
            dict: Result with ``success`` False, empty content and no screenshot
        """
        return {
            'url': url,
            'title': url,
            'content': '',
            'success': False,
            'error': error,
            'screenshot_path': None,
        }

    def scrape_urls(self, urls):
        """
        Scrape multiple URLs, one Firecrawl scrape request per URL.

        Args:
            urls (list): List of URLs to scrape

        Returns:
            list: One result dictionary per input URL, in input order
                [
                    {
                        'url': str,
                        'title': str,
                        'content': str (markdown),
                        'success': bool,
                        'error': str (None on success),
                        'screenshot_path': str (None if no screenshot saved)
                    },
                    ...
                ]
        """
        if not urls:
            return []

        print(f"Scraping {len(urls)} URLs with Firecrawl...")

        results = []

        # Individual scrape calls for now (can be optimized to batch later).
        for url in urls:
            try:
                results.append(self._scrape_single_url(url))
            except Exception as e:
                # Last-resort guard so one bad URL never aborts the whole run.
                print(f"Error scraping {url}: {e}")
                results.append(self._failure_result(url, str(e)))

        successful = sum(1 for r in results if r['success'])
        print(f"Successfully scraped {successful}/{len(urls)} URLs")

        return results

    def _scrape_single_url(self, url):
        """
        Scrape a single URL using the Firecrawl scrape endpoint.

        Network, HTTP and parsing errors are reported through the returned
        dict rather than raised.

        Args:
            url (str): URL to scrape

        Returns:
            dict: Scraped content (same keys as documented on scrape_urls)
        """
        endpoint = f"{self.base_url}/scrape"

        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json'
        }

        # Build payload - keep it simple for Firecrawl v1
        payload = {
            'url': url,
            'formats': ['markdown']
        }

        # Add screenshot if enabled (as separate format)
        if Config.ENABLE_SCREENSHOTS:
            payload['formats'].append('screenshot')
            # Wait 3 seconds for JavaScript/images to load before the screenshot
            payload['waitFor'] = 3000

        try:
            response = requests.post(endpoint, json=payload, headers=headers, timeout=60)
            response.raise_for_status()
            return self._parse_response(url, response.json())

        except requests.exceptions.Timeout:
            print(f"Timeout scraping {url}")
            return self._failure_result(url, 'Request timeout')

        except requests.exceptions.RequestException as e:
            print(f"Request error scraping {url}: {e}")
            return self._failure_result(url, str(e))

        except Exception as e:
            # e.g. a malformed / non-dict JSON body
            print(f"Unexpected error scraping {url}: {e}")
            return self._failure_result(url, str(e))

    def _parse_response(self, url, data):
        """
        Convert a decoded Firecrawl scrape response into a result dict.

        Explicit ``null`` values for ``data``, ``metadata``, ``title``,
        ``markdown`` or ``error`` are treated the same as missing keys.

        Args:
            url (str): URL that was scraped
            data (dict): Decoded JSON body of the scrape response

        Returns:
            dict: Result dictionary (same keys as documented on scrape_urls)
        """
        if not data.get('success'):
            error_msg = data.get('error') or 'Unknown error'
            print(f"Firecrawl error for {url}: {error_msg}")
            return self._failure_result(url, error_msg)

        page = data.get('data') or {}
        metadata = page.get('metadata') or {}

        result = {
            'url': url,
            # Fall back to the URL when the page has no usable title.
            'title': metadata.get('title') or url,
            'content': page.get('markdown') or '',
            'success': True,
            'error': None,
            'screenshot_path': None,
        }

        # Handle screenshot if present and screenshots are enabled
        screenshot = page.get('screenshot')
        if screenshot and Config.ENABLE_SCREENSHOTS:
            result['screenshot_path'] = self._save_screenshot(url, screenshot)

        return result

    @staticmethod
    def _decode_inline_screenshot(screenshot_data):
        """
        Decode an inline (non-URL) screenshot payload to raw bytes.

        Args:
            screenshot_data (str): Base64 text, optionally wrapped in a
                ``data:<mime>;base64,<payload>`` URI

        Returns:
            bytes: Decoded image bytes
        """
        if screenshot_data.startswith('data:'):
            # Drop the "data:<mime>;base64," header; it is not base64 itself
            # and would otherwise corrupt the decoded bytes.
            _, _, screenshot_data = screenshot_data.partition(',')
        return base64.b64decode(screenshot_data)

    def _save_screenshot(self, url, screenshot_data):
        """
        Save screenshot to temp file.

        The payload may be an http(s) URL to download (per the original
        implementation notes, what Firecrawl v1 returns) or inline base64.

        Args:
            url (str): URL being screenshotted (for filename)
            screenshot_data (str): Screenshot URL or base64 data

        Returns:
            str: Path to saved screenshot file, or None if failed
        """
        try:
            # Short hash of the URL gives a stable, filesystem-safe filename
            url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
            filename = f"screenshot_{url_hash}.png"
            filepath = Config.TEMP_SCREENSHOTS_DIR / filename

            # Require the full scheme: a bare 'http' prefix could also be the
            # start of a base64 payload, whereas ':' never appears in base64.
            if screenshot_data.startswith(('http://', 'https://')):
                # Download the screenshot from the URL
                response = requests.get(screenshot_data, timeout=30)
                response.raise_for_status()
                image_data = response.content
            else:
                image_data = self._decode_inline_screenshot(screenshot_data)

            # Re-create the directory in case it was removed (or screenshots
            # were enabled) after this scraper was constructed.
            Config.TEMP_SCREENSHOTS_DIR.mkdir(parents=True, exist_ok=True)
            filepath.write_bytes(image_data)

            print(f"  Saved screenshot: {filename} ({len(image_data)} bytes)")
            return str(filepath)

        except Exception as e:
            # Screenshots are best-effort: never fail the scrape over one.
            print(f"  Error saving screenshot for {url}: {e}")
            return None