FastAPI + React + PostgreSQL salary benchmarking tool with AI research pipeline.

- Seed data for 25+ New York roles (junior/mid/senior levels)
- Single + bulk lookup with location alias mapping (NYC -> New York, etc.)
- Research pipeline: Serper -> Firecrawl -> Cohere Rerank -> Claude analysis
- Editable validation UI for AI-proposed benchmarks
- CSV export, Montserrat font, black/white/#FFC407 design
- Fully Dockerized (app + db + frontend)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
31 lines
1.1 KiB
Python
31 lines
1.1 KiB
Python
import asyncio
|
|
import httpx
|
|
|
|
from app.config import settings
|
|
|
|
# Caps how many Firecrawl scrape requests are in flight at once: each
# _scrape_one call holds this semaphore for the duration of its request.
SEMAPHORE = asyncio.Semaphore(3)
|
|
|
|
|
|
async def _scrape_one(client: httpx.AsyncClient, url: str) -> dict:
    """Scrape a single URL through the Firecrawl API and return its markdown.

    Args:
        client: Shared HTTP client used to send the request.
        url: Address of the page to scrape.

    Returns:
        On success, ``{"url": ..., "content": ..., "success": True}`` where
        ``content`` is at most the first 3000 characters of the markdown in
        the response (an empty string when the response carries none).
        On failure, ``{"url": ..., "content": "", "success": False,
        "error": ...}``. Request, HTTP-status and parsing errors are reported
        in the result rather than raised.
    """
    # The module-level semaphore bounds how many scrapes run concurrently.
    async with SEMAPHORE:
        try:
            resp = await client.post(
                "https://api.firecrawl.dev/v1/scrape",
                headers={"Authorization": f"Bearer {settings.firecrawl_api_key}"},
                json={"url": url, "formats": ["markdown"]},
                timeout=30,
            )
            resp.raise_for_status()
            payload = resp.json()
            # "data" or "markdown" may be present but null; treat that the
            # same as a missing key instead of failing on None.
            markdown = (payload.get("data") or {}).get("markdown") or ""
            # Truncate to avoid sending huge content downstream
            return {"url": url, "content": markdown[:3000], "success": True}
        except Exception as e:
            # Deliberately best-effort: one failing URL must not abort the
            # whole batch, so the failure is returned as data.
            # Some exceptions stringify to "" (httpx timeouts can carry no
            # message), so fall back to the class name to keep the error
            # field informative.
            return {
                "url": url,
                "content": "",
                "success": False,
                "error": str(e) or type(e).__name__,
            }
|
|
|
|
|
|
async def scrape_urls(urls: list[str]) -> dict:
    """Scrape all given URLs concurrently and keep only the usable results.

    Args:
        urls: Addresses of the pages to scrape.

    Returns:
        A dict with a single key ``"scraped"`` holding, in input order, the
        per-URL result dicts that are marked successful and have non-empty
        content. Failed or empty scrapes are dropped.
    """
    # One client is shared by every request in the batch.
    async with httpx.AsyncClient() as session:
        outcomes = await asyncio.gather(
            *(_scrape_one(session, target) for target in urls)
        )

    usable = []
    for outcome in outcomes:
        if outcome["success"] and outcome["content"]:
            usable.append(outcome)
    return {"scraped": usable}
|