volt-newsroom-scraper-report/scraper.py
DJP 8faf6b98f1 Add wait time for Firecrawl screenshots
- Added 3 second wait before capturing screenshot
- Ensures JavaScript and images fully load
- Prevents 'too early' screenshot captures
- Configurable via SCREENSHOT_WAIT_TIME in config.py
2026-01-06 14:55:46 -05:00

207 lines
6.6 KiB
Python

"""
Firecrawl integration for scraping regular websites (news articles, blogs).
URLs are currently scraped one at a time through the single-URL scrape endpoint; batch scraping is a possible later optimization.
Supports screenshot capture for visual previews.
"""
import requests
import base64
import hashlib
from pathlib import Path
from config import Config
class FirecrawlScraper:
    """Scraper for regular websites using the Firecrawl API.

    Every scrape outcome, successful or not, is reported as a dict with the
    same keys: ``url``, ``title``, ``content``, ``success``, ``error`` and
    ``screenshot_path``.
    """

    # Milliseconds Firecrawl is asked to wait before capturing a screenshot
    # (sent as the ``waitFor`` request field) so JavaScript/images can load.
    SCREENSHOT_WAIT_MS = 3000
    # Timeout, in seconds, for the scrape request itself.
    REQUEST_TIMEOUT = 60
    # Timeout, in seconds, for downloading a screenshot from its URL.
    SCREENSHOT_DOWNLOAD_TIMEOUT = 30

    def __init__(self):
        """Initialize the scraper from ``Config``.

        Reads the API key and, when screenshots are enabled, makes sure the
        temporary screenshot directory exists.
        """
        self.api_key = Config.FIRECRAWL_API_KEY
        self.base_url = "https://api.firecrawl.dev/v1"

        if Config.ENABLE_SCREENSHOTS:
            # parents=True so a nested, not-yet-created location also works.
            Config.TEMP_SCREENSHOTS_DIR.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _failure_result(url, error):
        """Build the result dict for a URL that could not be scraped.

        Args:
            url (str): URL that failed
            error (str): Description of the failure

        Returns:
            dict: Result with the same keys as a successful scrape, using
                the URL as the title and no content or screenshot.
        """
        return {
            'url': url,
            'title': url,
            'content': '',
            'success': False,
            'error': error,
            'screenshot_path': None,
        }

    @staticmethod
    def _decode_inline_screenshot(payload):
        """Decode an inline base64 screenshot, tolerating a data-URI header.

        Args:
            payload (str): Base64 text, optionally prefixed with a
                ``data:<mime>;base64,`` header

        Returns:
            bytes: Decoded image bytes
        """
        if payload.startswith('data:'):
            # ':' is not a base64 character, so a 'data:' prefix can only be
            # a data-URI header; keep just the part after the first comma.
            payload = payload.partition(',')[2]
        return base64.b64decode(payload)

    def scrape_urls(self, urls):
        """
        Scrape multiple URLs with Firecrawl, one request per URL.

        A failure on one URL never aborts the run; it is recorded as a
        failed result and the remaining URLs are still processed.

        Args:
            urls (list): List of URLs to scrape

        Returns:
            list: One result dict per URL, in input order
                [
                    {
                        'url': str,
                        'title': str,
                        'content': str (markdown),
                        'success': bool,
                        'error': str (None on success),
                        'screenshot_path': str (None if no screenshot)
                    },
                    ...
                ]
        """
        if not urls:
            return []

        print(f"Scraping {len(urls)} URLs with Firecrawl...")

        results = []
        # Individual scrape calls for now (can optimize to batch later).
        for url in urls:
            try:
                results.append(self._scrape_single_url(url))
            except Exception as e:
                # Boundary handler: keep going with the remaining URLs.
                print(f"Error scraping {url}: {e}")
                results.append(self._failure_result(url, str(e)))

        successful = sum(1 for r in results if r['success'])
        print(f"Successfully scraped {successful}/{len(urls)} URLs")
        return results

    def _scrape_single_url(self, url):
        """
        Scrape a single URL using the Firecrawl scrape endpoint.

        Request and parsing errors are reported through the returned dict
        rather than raised.

        Args:
            url (str): URL to scrape

        Returns:
            dict: Scraped content (same keys as documented on scrape_urls)
        """
        endpoint = f"{self.base_url}/scrape"
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json'
        }

        # Build payload - keep it simple for Firecrawl v1
        payload = {
            'url': url,
            'formats': ['markdown']
        }

        # Add screenshot if enabled (as separate format)
        if Config.ENABLE_SCREENSHOTS:
            payload['formats'].append('screenshot')
            # Let the page finish loading before the screenshot is taken.
            payload['waitFor'] = self.SCREENSHOT_WAIT_MS

        try:
            response = requests.post(
                endpoint,
                json=payload,
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            data = response.json()

            if not data.get('success'):
                error_msg = data.get('error', 'Unknown error')
                print(f"Firecrawl error for {url}: {error_msg}")
                return self._failure_result(url, error_msg)

            # `or {}` guards against keys that are present but null.
            page = data.get('data') or {}
            metadata = page.get('metadata') or {}
            result = {
                'url': url,
                # Fall back to the URL when the title is missing or empty.
                'title': metadata.get('title') or url,
                'content': page.get('markdown') or '',
                'success': True,
                'error': None,
                'screenshot_path': None
            }

            # Handle screenshot if present
            if Config.ENABLE_SCREENSHOTS:
                screenshot = page.get('screenshot')
                if screenshot:
                    result['screenshot_path'] = self._save_screenshot(url, screenshot)

            return result

        except requests.exceptions.Timeout:
            print(f"Timeout scraping {url}")
            return self._failure_result(url, 'Request timeout')
        except requests.exceptions.RequestException as e:
            print(f"Request error scraping {url}: {e}")
            return self._failure_result(url, str(e))
        except Exception as e:
            # Anything else (e.g. an unexpectedly shaped response body).
            print(f"Unexpected error scraping {url}: {e}")
            return self._failure_result(url, str(e))

    def _save_screenshot(self, url, screenshot_data):
        """
        Save screenshot to temp file.

        The payload may be a URL to download (Firecrawl v1 returns URLs) or
        base64-encoded image data. Saving is best-effort: any failure is
        printed and reported as None instead of being raised.

        Args:
            url (str): URL being screenshotted (for filename)
            screenshot_data (str): Screenshot URL or base64 data

        Returns:
            str: Path to saved screenshot file, or None if failed
        """
        try:
            # Short hash of the page URL gives a stable, filesystem-safe name.
            url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
            filename = f"screenshot_{url_hash}.png"
            filepath = Config.TEMP_SCREENSHOTS_DIR / filename

            if screenshot_data.startswith(('http://', 'https://')):
                # Download the screenshot from the URL
                response = requests.get(
                    screenshot_data,
                    timeout=self.SCREENSHOT_DOWNLOAD_TIMEOUT,
                )
                response.raise_for_status()
                image_data = response.content
            else:
                # Assume it's base64 encoded
                image_data = self._decode_inline_screenshot(screenshot_data)

            # Save image
            with open(filepath, 'wb') as f:
                f.write(image_data)

            print(f" Saved screenshot: {filename} ({len(image_data)} bytes)")
            return str(filepath)

        except Exception as e:
            # A missing screenshot must not fail the scrape itself.
            print(f" Error saving screenshot for {url}: {e}")
            return None