salary-benchmark/app/services/firecrawl_client.py
DJP da3f5faa91 Initial commit: Salary Benchmark Tool
FastAPI + React + PostgreSQL salary benchmarking tool with AI research pipeline.
- Seed data for 25+ New York roles (junior/mid/senior levels)
- Single + bulk lookup with location alias mapping (NYC -> New York, etc.)
- Research pipeline: Serper -> Firecrawl -> Cohere Rerank -> Claude analysis
- Editable validation UI for AI-proposed benchmarks
- CSV export, Montserrat font, black/white/#FFC407 design
- Fully Dockerized (app + db + frontend)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-02 22:47:32 -04:00

31 lines
1.1 KiB
Python

import asyncio
import logging

import httpx

from app.config import settings
# Module-level cap on concurrent Firecrawl requests: `_scrape_one` holds this
# semaphore around each HTTP call, so at most 3 scrapes are in flight at once
# across every `scrape_urls` invocation that shares this module.
# NOTE(review): an asyncio.Semaphore is used from within a running event loop;
# presumably the app drives all scrapes from a single loop — confirm.
SEMAPHORE = asyncio.Semaphore(3)
async def _scrape_one(client: httpx.AsyncClient, url: str) -> dict:
    """Scrape a single URL through the Firecrawl ``/v1/scrape`` endpoint.

    Ordinary failures never propagate: any ``Exception`` raised while sending
    the request, checking the status, or reading the payload is converted
    into a failure record so that one bad URL cannot abort a whole batch.

    Args:
        client: Shared ``httpx.AsyncClient`` used to send the request.
        url: Address of the page to scrape.

    Returns:
        On success, ``{"url", "content", "success": True}`` where ``content``
        is the returned markdown truncated to 3000 characters (it may be
        empty). On failure, ``{"url", "content": "", "success": False,
        "error"}`` where ``error`` describes the exception.
    """
    # Hold the module-level semaphore for the duration of the HTTP call so
    # the number of in-flight scrape requests stays bounded.
    async with SEMAPHORE:
        try:
            resp = await client.post(
                "https://api.firecrawl.dev/v1/scrape",
                headers={"Authorization": f"Bearer {settings.firecrawl_api_key}"},
                json={"url": url, "formats": ["markdown"]},
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            # Treat an explicit JSON null like a missing key; otherwise a
            # null "data"/"markdown" would only be handled by accident via
            # an AttributeError/TypeError in the except clause below.
            markdown = (data.get("data") or {}).get("markdown") or ""
            # Truncate to avoid sending huge content downstream
            return {"url": url, "content": markdown[:3000], "success": True}
        except Exception as e:
            # Deliberately broad: scraping is best-effort and the caller
            # filters on "success" instead of handling exceptions.
            # Some exceptions stringify to "" (e.g. a bare TimeoutError), so
            # fall back to the class name to keep the record diagnosable.
            return {
                "url": url,
                "content": "",
                "success": False,
                "error": str(e) or type(e).__name__,
            }
async def scrape_urls(urls: list[str]) -> dict:
    """Scrape several URLs concurrently and keep only the usable results.

    Each URL is handed to ``_scrape_one``; the calls run concurrently through
    ``asyncio.gather`` and share a single ``httpx.AsyncClient``.

    Args:
        urls: URLs to scrape. May be empty.

    Returns:
        ``{"scraped": [...]}`` holding, in input order, the per-URL records
        that reported success and have non-empty content. Failed or empty
        scrapes are omitted.
    """
    if not urls:
        # Nothing to fetch: avoid constructing and tearing down an HTTP client.
        return {"scraped": []}

    async with httpx.AsyncClient() as client:
        results = await asyncio.gather(
            *(_scrape_one(client, url) for url in urls)
        )

    log = logging.getLogger(__name__)
    scraped = []
    for result in results:
        if not result["success"]:
            # Failures are dropped from the return value, so record why here;
            # otherwise the error text is lost entirely.
            log.warning(
                "Firecrawl scrape failed for %s: %s",
                result["url"],
                result.get("error", ""),
            )
        elif result["content"]:
            scraped.append(result)
    return {"scraped": scraped}