salary-benchmark/app/services/benchmark_service.py

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.models import Benchmark, Location, Role
from app.schemas import BenchmarkOut

LOCATION_ALIASES = {
    "nyc": "new york",
    "new york city": "new york",
    "manhattan": "new york",
    "sf": "san francisco",
    "san fran": "san francisco",
    "la": "los angeles",
    "chi": "chicago",
    "dc": "washington dc",
    "washington": "washington dc",
    "philly": "philadelphia",
    "atl": "atlanta",
    "bos": "boston",
    "dal": "dallas",
    "hou": "houston",
    "sea": "seattle",
    "pdx": "portland",
    "den": "denver",
    "mia": "miami",
    "det": "detroit",
    "mpls": "minneapolis",
    "nola": "new orleans",
    "london": "london",
    "ldn": "london",
}


def normalize(text: str) -> str:
    cleaned = text.strip().lower()
    return LOCATION_ALIASES.get(cleaned, cleaned)


def normalize_title(text: str) -> str:
    return text.strip().lower()


async def get_or_create_role(db: AsyncSession, title: str) -> Role:
    norm = normalize_title(title)
    result = await db.execute(select(Role).where(Role.normalized_title == norm))
    role = result.scalar_one_or_none()
    if not role:
        role = Role(title=title.strip(), normalized_title=norm)
        db.add(role)
        await db.flush()
    return role


async def get_or_create_location(db: AsyncSession, location: str) -> Location:
    norm = normalize(location)
    result = await db.execute(
        select(Location).where(Location.normalized_name == norm)
    )
    loc = result.scalar_one_or_none()
    if not loc:
        # Use the canonical name as the display city name
        loc = Location(city=norm.title(), normalized_name=norm)
        db.add(loc)
        await db.flush()
    return loc


async def lookup_benchmarks(
    db: AsyncSession, title: str, location: str
) -> list[BenchmarkOut]:
    norm_title = normalize_title(title)
    norm_location = normalize(location)

    result = await db.execute(
        select(Benchmark, Role, Location)
        .join(Role, Benchmark.role_id == Role.id)
        .join(Location, Benchmark.location_id == Location.id)
        .where(Role.normalized_title == norm_title)
        .where(Location.normalized_name == norm_location)
        .order_by(
            Benchmark.level.desc()
        )
    )
    rows = result.all()
    return [
        BenchmarkOut(
            role=role.title,
            location=loc.city,
            level=bench.level,
            salary=bench.salary,
            source=bench.source,
            validated=bench.validated,
            confidence_score=bench.confidence_score,
        )
        for bench, role, loc in rows
    ]