# salary-benchmark/app/services/firecrawl_client.py

import asyncio
import logging

import httpx

from app.config import settings

# Caps how many Firecrawl scrape requests may be in flight at the same time (3).
# NOTE(review): created at import time and shared by all callers; asyncio
# synchronization primitives are tied to a single event loop, so confirm this
# module is only ever driven from one loop.
SEMAPHORE = asyncio.Semaphore(3)


async def _scrape_one(
    client: httpx.AsyncClient,
    url: str,
    *,
    max_chars: int = 3000,
    timeout: float = 30,
) -> dict:
    """Scrape a single URL through the Firecrawl API and return its markdown.

    Request and parsing problems never propagate: any ``Exception`` is turned
    into a failure record so one bad URL cannot abort a whole batch.

    Args:
        client: HTTP client used to issue the request.
        url: Page to scrape.
        max_chars: Maximum number of markdown characters kept in ``content``.
        timeout: Timeout value handed to ``client.post``.

    Returns:
        On success, ``{"url": ..., "content": ..., "success": True}`` where
        ``content`` is the markdown truncated to ``max_chars`` (an empty
        string when the response carries no markdown key).
        On failure, ``{"url": ..., "content": "", "success": False,
        "error": ...}`` where ``error`` describes the exception.
    """
    # The shared semaphore bounds how many requests run at the same time.
    async with SEMAPHORE:
        try:
            resp = await client.post(
                "https://api.firecrawl.dev/v1/scrape",
                headers={"Authorization": f"Bearer {settings.firecrawl_api_key}"},
                json={"url": url, "formats": ["markdown"]},
                timeout=timeout,
            )
            resp.raise_for_status()
            data = resp.json()
            # A null "data" or "markdown" value raises here and is reported
            # as a failure by the handler below.
            markdown = data.get("data", {}).get("markdown", "")
            # Truncate to avoid sending huge content downstream
            return {"url": url, "content": markdown[:max_chars], "success": True}
        except Exception as e:
            # Deliberately broad: scraping is best-effort per URL.
            # Some exceptions carry no message (e.g. a bare TimeoutError()
            # stringifies to ""), so fall back to the exception class name
            # to keep the failure diagnosable.
            return {
                "url": url,
                "content": "",
                "success": False,
                "error": str(e) or type(e).__name__,
            }


async def scrape_urls(urls: list[str]) -> dict:
    """Scrape several URLs concurrently and keep only the usable results.

    Args:
        urls: Pages to scrape.

    Returns:
        ``{"scraped": [...]}`` containing, in input order, the per-URL result
        dicts that succeeded and have non-empty content. Failed scrapes are
        omitted from the result and logged as warnings; successful scrapes
        with empty content are omitted silently.
    """
    # Nothing to do: skip building an HTTP client for an empty batch.
    if not urls:
        return {"scraped": []}

    async with httpx.AsyncClient() as client:
        results = await asyncio.gather(
            *(_scrape_one(client, url) for url in urls)
        )

    log = logging.getLogger(__name__)
    scraped = []
    for result in results:
        if not result["success"]:
            # Surface the reason instead of dropping it silently; otherwise a
            # systematic failure (e.g. bad credentials) looks like "no data".
            log.warning(
                "Firecrawl scrape failed for %s: %s",
                result["url"],
                result.get("error", ""),
            )
        elif result["content"]:
            scraped.append(result)
    return {"scraped": scraped}