Multi-token + fuzzy search; admin-only Run Now / Backfill

Search:
- Previously /api/events did one ILIKE %q% across the columns, so
  "female city" required the literal substring "female city" to
  appear somewhere. Now the query is tokenised on whitespace; every
  token must match somewhere (AND), and each token matches either
  by substring (ILIKE) across the searched columns OR by trigram
  similarity (pg_trgm) against a concatenated text blob with a 0.3
  threshold — handles typos like "femalle" → "female".
- Results ranked by summed similarity score across all tokens, then
  recency. Empty query falls back to "newest 100".
- schema.sql: CREATE EXTENSION IF NOT EXISTS pg_trgm (idempotent;
  applied by ensure_schema on api startup).

Admin gating:
- auth.py: User now carries `is_admin`. Computed from a
  comma-separated ADMIN_EMAILS env var (case-insensitive match
  against `preferred_username`/`upn`/`email` claim). New
  `require_admin` FastAPI dependency 403s non-admins.
- In DEV_AUTH_BYPASS mode the dev user is admin by default; flip
  DEV_AUTH_IS_ADMIN=false to test the read-only UX without enabling
  SSO.
- POST /api/runs and POST /api/backfill now gated by require_admin.
- /api/me carries is_admin so the SPA can hide the destructive
  buttons for non-admins.

Frontend:
- App.tsx fetches /api/me on mount and hides Run Now + Backfill
  unless `is_admin` is true. Non-admins still see search + results +
  recent-runs table.

docker-compose / .env.example: thread ADMIN_EMAILS +
DEV_AUTH_IS_ADMIN into the api container.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
DJP 2026-05-11 15:51:50 -04:00
parent 9e6a75feb6
commit 1f2c2ff8e1
7 changed files with 179 additions and 49 deletions

View file

@ -32,6 +32,13 @@ DEV_AUTH_EMAIL=dev@oliver.agency
# goes in both the backend (token validation) and the frontend (MSAL login).
AZURE_TENANT_ID=
AZURE_CLIENT_ID=
# Comma-separated list of admin emails. Only these accounts can click
# "Run now" or "Backfill from Box". Everyone else gets a read-only UI.
# Example: ADMIN_EMAILS=alice@oliver.agency,bob@oliver.agency
ADMIN_EMAILS=
# When DEV_AUTH_BYPASS=true the dev user is admin by default. Flip to false
# to test the read-only UI without enabling SSO.
DEV_AUTH_IS_ADMIN=true
# Frontend mirrors — Vite reads VITE_* at build time and bakes them into dist.
# Keep these in sync with the values above.
VITE_DEV_AUTH_BYPASS=true

107
api.py
View file

@ -23,7 +23,7 @@ from fastapi.middleware.cors import CORSMiddleware
from psycopg.rows import dict_row
import db
from auth import User, maybe_auth_info, require_auth
from auth import User, maybe_auth_info, require_admin, require_auth
BOX_FILE_URL = "https://app.box.com/file/{file_id}"
@ -86,26 +86,83 @@ def me(user: User = Depends(require_auth)):
# ── Search ──────────────────────────────────────────────────────────────────
SEARCH_SQL = """
SELECT id, run_id, created_at, file_id, file_name, folder_path, media_type,
gemini_model, description, scenes, validated_metadata, raw_response,
metadata_write_success, description_write_success, scene_comment_write_success,
status, error_message, duration_ms
FROM tagging_events
WHERE
(%(q)s = '' OR (
file_name ILIKE %(like)s
OR folder_path ILIKE %(like)s
OR description ILIKE %(like)s
OR status ILIKE %(like)s
OR file_id ILIKE %(like)s
OR coalesce(validated_metadata::text, '') ILIKE %(like)s
OR coalesce(raw_response::text, '') ILIKE %(like)s
OR coalesce(scenes::text, '') ILIKE %(like)s
))
ORDER BY created_at DESC
LIMIT %(limit)s
"""
# Columns the search ILIKE walks over (substring match, case-insensitive).
_SEARCH_COLS = [
    "file_name",
    "folder_path",
    "description",
    "status",
    "file_id",
    "coalesce(validated_metadata::text, '')",
    "coalesce(raw_response::text, '')",
    "coalesce(scenes::text, '')",
]

# Single concatenated text used for trigram matching (fuzzy / typo tolerance).
_SEARCH_BLOB = (
    "coalesce(file_name,'')||' '||coalesce(folder_path,'')||' '||"
    "coalesce(description,'')||' '||coalesce(validated_metadata::text,'')||' '||"
    "coalesce(scenes::text,'')"
)

# Fuzzy threshold for trigram matching. 0.3 catches typos like
# "femalle" → "female".
# NOTE(review): this value was picked for similarity(); pg_trgm's own default
# for word_similarity is 0.6 — re-tune if fuzzy hits turn out too noisy.
_SIM_THRESHOLD = 0.3

# Short tokens (1-2 chars) are too noisy for trigrams — fall back to substring
# match only for those.
_MIN_FUZZY_TOKEN_LEN = 3


def _escape_like(token: str) -> str:
    """Escape LIKE/ILIKE metacharacters so *token* is matched literally.

    Backslash is PostgreSQL's default LIKE escape character, so it has to be
    doubled first; `%` and `_` are then prefixed with it.
    """
    return token.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def _build_search_sql(q: str, limit: int) -> tuple[str, dict]:
    """
    Build the parameterised search statement for a free-text query.

    The query is tokenised on whitespace and every token must match (AND).
    A token matches by literal substring (ILIKE) in any of `_SEARCH_COLS`, or,
    when it is at least `_MIN_FUZZY_TOKEN_LEN` characters long, by trigram
    word-similarity against `_SEARCH_BLOB` above `_SIM_THRESHOLD`. Rows are
    ranked by the summed word-similarity of the fuzzy-eligible tokens, then by
    recency.

    Args:
        q: Raw user query; empty or whitespace-only means "no filter".
        limit: Maximum number of rows, bound as the `limit` parameter.

    Returns:
        A `(sql, params)` pair ready for `cursor.execute(sql, params)`. User
        text only ever travels through `params`; the SQL text is assembled
        from module constants and generated parameter names.
    """
    tokens = (q or "").split()
    common_cols = (
        "id, run_id, created_at, file_id, file_name, folder_path, media_type, "
        "gemini_model, description, scenes, validated_metadata, raw_response, "
        "metadata_write_success, description_write_success, scene_comment_write_success, "
        "status, error_message, duration_ms"
    )
    if not tokens:
        # No search terms: plain "newest first" listing.
        return (
            f"SELECT {common_cols} FROM tagging_events "
            f"ORDER BY created_at DESC LIMIT %(limit)s",
            {"limit": limit},
        )

    params: dict = {"limit": limit}
    clauses: list[str] = []
    score_terms: list[str] = []
    for i, tok in enumerate(tokens):
        like_key = f"like_{i}"
        # Escape % and _ so e.g. "100%" or "file_name" match literally instead
        # of acting as wildcards.
        params[like_key] = f"%{_escape_like(tok)}%"
        col_ors = " OR ".join(f"{c} ILIKE %({like_key})s" for c in _SEARCH_COLS)

        if len(tok) < _MIN_FUZZY_TOKEN_LEN:
            clauses.append(f"({col_ors})")
            continue

        # word_similarity(needle, haystack) scores the token against the best
        # matching stretch of the blob. Plain similarity() compares the token
        # with the *whole* blob, so its score shrinks as the blob grows and
        # practically never reaches the threshold for real rows.
        sim_key = f"sim_{i}"
        params[sim_key] = tok
        fuzzy = f"word_similarity(%({sim_key})s, {_SEARCH_BLOB})"
        clauses.append(f"(({col_ors}) OR {fuzzy} > {_SIM_THRESHOLD})")
        score_terms.append(fuzzy)

    where = " AND ".join(clauses)
    score_sql = " + ".join(score_terms) if score_terms else "0"
    sql = (
        f"SELECT {common_cols}, ({score_sql}) AS _score "
        f"FROM tagging_events "
        f"WHERE {where} "
        f"ORDER BY _score DESC, created_at DESC "
        f"LIMIT %(limit)s"
    )
    return sql, params
def _event_to_dict(row):
@ -125,10 +182,10 @@ def search_events(
limit: int = Query(100, ge=1, le=500),
user: User = Depends(require_auth),
):
like = f"%{q}%"
sql, params = _build_search_sql(q, limit)
with _conn() as c:
with c.cursor(row_factory=dict_row) as cur:
cur.execute(SEARCH_SQL, {"q": q, "like": like, "limit": limit})
cur.execute(sql, params)
rows = cur.fetchall()
return {"q": q, "count": len(rows), "results": [_event_to_dict(r) for r in rows]}
@ -168,7 +225,7 @@ def _run_pass_in_thread(run_id: uuid.UUID):
@app.post("/api/runs")
def start_run(user: User = Depends(require_auth)):
def start_run(user: User = Depends(require_admin)):
run_id = uuid.uuid4()
t = threading.Thread(target=_run_pass_in_thread, args=(run_id,), daemon=True)
t.start()
@ -205,7 +262,7 @@ def _run_backfill_in_thread(run_id: uuid.UUID):
@app.post("/api/backfill")
def start_backfill(user: User = Depends(require_auth)):
def start_backfill(user: User = Depends(require_admin)):
"""
Walk the Box folder and mirror any existing marriottUsa metadata into the
local DB as `status='backfilled'` rows. Use this after first deploy (or

45
auth.py
View file

@ -19,6 +19,17 @@ AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID", "").strip()
AZURE_CLIENT_ID = os.getenv("AZURE_CLIENT_ID", "").strip()
DEV_AUTH_BYPASS = os.getenv("DEV_AUTH_BYPASS", "").strip().lower() in ("1", "true", "yes")
# Admin allowlist, read once at import time from the comma-separated
# ADMIN_EMAILS env var. Entries are trimmed and lower-cased so lookups can be
# case-insensitive; blank entries (stray commas, unset variable) are dropped.
_ADMIN_EMAILS = set(
    filter(
        None,
        (entry.strip().lower() for entry in os.getenv("ADMIN_EMAILS", "").split(",")),
    )
)
# Whether the bypass-mode dev user counts as an admin. Defaults to true; set
# DEV_AUTH_IS_ADMIN=false to exercise the non-admin UX without switching to MSAL.
_DEV_AUTH_IS_ADMIN = os.getenv("DEV_AUTH_IS_ADMIN", "true").strip().lower() in {"1", "true", "yes"}
JWKS_URL = f"https://login.microsoftonline.com/{AZURE_TENANT_ID}/discovery/v2.0/keys" if AZURE_TENANT_ID else None
ISSUERS = (
f"https://login.microsoftonline.com/{AZURE_TENANT_ID}/v2.0",
@ -41,22 +52,35 @@ def _get_jwks_client() -> PyJWKClient:
class User:
def __init__(self, *, oid: str, name: str, email: str, dev: bool = False):
def __init__(self, *, oid: str, name: str, email: str, dev: bool = False, is_admin: bool = False):
self.oid = oid
self.name = name
self.email = email
self.dev = dev
self.is_admin = is_admin
def to_dict(self):
return {"oid": self.oid, "name": self.name, "email": self.email, "dev": self.dev}
return {
"oid": self.oid,
"name": self.name,
"email": self.email,
"dev": self.dev,
"is_admin": self.is_admin,
}
def _is_admin_email(email: str) -> bool:
    """Return True when *email*, trimmed and lower-cased, is in `_ADMIN_EMAILS`.

    A missing or empty email is never an admin.
    """
    if not email:
        return False
    normalised = email.strip().lower()
    return normalised in _ADMIN_EMAILS
def _bypass_user() -> User:
email = os.getenv("DEV_AUTH_EMAIL", "dev@oliver.agency")
return User(
oid="dev-bypass",
name=os.getenv("DEV_AUTH_NAME", "Dev User"),
email=os.getenv("DEV_AUTH_EMAIL", "dev@oliver.agency"),
email=email,
dev=True,
is_admin=_DEV_AUTH_IS_ADMIN or _is_admin_email(email),
)
@ -94,17 +118,30 @@ def require_auth(authorization: Optional[str] = Header(default=None)) -> User:
except httpx.HTTPError as e:
raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"JWKS fetch failed: {e}")
email = claims.get("preferred_username") or claims.get("upn") or claims.get("email", "")
return User(
oid=claims.get("oid") or claims.get("sub", "unknown"),
name=claims.get("name", ""),
email=claims.get("preferred_username") or claims.get("upn") or claims.get("email", ""),
email=email,
is_admin=_is_admin_email(email),
)
def require_admin(user: User = Depends(require_auth)) -> User:
    """Dependency that lets only admin users through.

    Resolves the caller via `require_auth` first, then returns that user
    unchanged when `user.is_admin` is set.

    Raises:
        HTTPException: 403 with detail "Admin-only endpoint" for non-admins.
    """
    if user.is_admin:
        return user
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Admin-only endpoint",
    )
def maybe_auth_info():
    """Summarise how auth is configured (diagnostic helper for /api/health).

    Reports only booleans and a count — never the tenant/client IDs or the
    admin addresses themselves.
    """
    info = dict(
        dev_bypass=DEV_AUTH_BYPASS,
        tenant_configured=bool(AZURE_TENANT_ID),
        client_configured=bool(AZURE_CLIENT_ID),
    )
    # Number of distinct entries parsed from ADMIN_EMAILS.
    info["admin_emails_configured"] = len(_ADMIN_EMAILS)
    return info

View file

@ -47,8 +47,14 @@ services:
DEV_AUTH_BYPASS: ${DEV_AUTH_BYPASS:-true}
AZURE_TENANT_ID: ${AZURE_TENANT_ID:-}
AZURE_CLIENT_ID: ${AZURE_CLIENT_ID:-}
# Comma-separated list of admin emails — only these accounts can hit
# POST /api/runs and POST /api/backfill. Everyone else is read-only.
ADMIN_EMAILS: ${ADMIN_EMAILS:-}
DEV_AUTH_EMAIL: ${DEV_AUTH_EMAIL:-dev@oliver.agency}
DEV_AUTH_NAME: ${DEV_AUTH_NAME:-Dev User}
# In DEV_AUTH_BYPASS mode the dev user is admin by default; flip to
# false here if you want to test the non-admin UX without enabling SSO.
DEV_AUTH_IS_ADMIN: ${DEV_AUTH_IS_ADMIN:-true}
# CORS for local dev: when Vite is on :5173 and FastAPI on host:8004.
# Empty in prod — Apache serves SPA and API under the same origin.
CORS_ORIGINS: ${CORS_ORIGINS:-}

View file

@ -4,6 +4,7 @@ import {
Event,
Run,
listRuns,
me as fetchMe,
runEvents,
searchEvents,
startBackfill,
@ -17,6 +18,8 @@ export function App() {
const [searching, setSearching] = useState(false);
const [error, setError] = useState<string | null>(null);
const [isAdmin, setIsAdmin] = useState(false);
const [activeRun, setActiveRun] = useState<string | null>(null);
const [activeRunKind, setActiveRunKind] = useState<"tag" | "backfill" | null>(null);
const [activeRunEvents, setActiveRunEvents] = useState<Event[]>([]);
@ -52,11 +55,14 @@ export function App() {
}
}, [auth.getToken]);
// Initial load: recent rows + recent runs.
// Initial load: who am I + recent rows + recent runs.
useEffect(() => {
fetchMe(auth.getToken)
.then((u) => setIsAdmin(Boolean(u.is_admin)))
.catch(() => setIsAdmin(false));
doSearch("");
refreshRuns();
}, [doSearch, refreshRuns]);
}, [auth.getToken, doSearch, refreshRuns]);
// Poll active run.
useEffect(() => {
@ -180,24 +186,28 @@ export function App() {
<button className="primary" type="submit" disabled={searching}>
{searching ? "Searching…" : "Search"}
</button>
<button
type="button"
className="run-now"
disabled={starting !== null}
title="Trigger a tagging pass against the Box folder right now"
onClick={onRunNow}
>
{starting === "tag" ? "Starting…" : "Run now"}
</button>
<button
type="button"
className="ghost"
disabled={starting !== null}
title="Walk Box and mirror existing marriottUsa metadata into the local DB. No Gemini, no Box writes."
onClick={onBackfill}
>
{starting === "backfill" ? "Starting…" : "Backfill from Box"}
</button>
{isAdmin && (
<>
<button
type="button"
className="run-now"
disabled={starting !== null}
title="Trigger a tagging pass against the Box folder right now"
onClick={onRunNow}
>
{starting === "tag" ? "Starting…" : "Run now"}
</button>
<button
type="button"
className="ghost"
disabled={starting !== null}
title="Walk Box and mirror existing marriottUsa metadata into the local DB. No Gemini, no Box writes."
onClick={onBackfill}
>
{starting === "backfill" ? "Starting…" : "Backfill from Box"}
</button>
</>
)}
</form>
{error && <p className="error">{error}</p>}
</section>

View file

@ -104,3 +104,13 @@ export function runEvents(getToken: GetTokenFn, runId: string) {
/**
 * Fetch `${API_BASE}/health` and resolve with the parsed JSON body.
 * Calls `fetch` directly and takes no token getter.
 */
export function health() {
return fetch(`${API_BASE}/health`).then((r) => r.json());
}
/**
 * Request `/me` through `req`, passing the caller's token getter.
 * The response is typed as the fields below; `is_admin` is what the UI uses
 * to decide whether to show the admin-only actions.
 */
export function me(getToken: GetTokenFn) {
return req<{
oid: string;
name: string;
email: string;
dev: boolean;
is_admin: boolean;
}>(`/me`, getToken);
}

View file

@ -2,6 +2,9 @@
-- One row per file the tagger sent to Gemini (success or error).
-- Skipped-as-already-tagged files do not produce rows.
-- pg_trgm powers the fuzzy `similarity()` call in /api/events. Idempotent.
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE TABLE IF NOT EXISTS tagging_events (
id BIGSERIAL PRIMARY KEY,
run_id UUID NOT NULL,