Multi-token + fuzzy search; admin-only Run Now / Backfill
Search: - Previously /api/events did one ILIKE %q% across the columns, so "female city" required the literal substring "female city" to appear somewhere. Now the query is tokenised on whitespace; every token must match somewhere (AND), and each token matches either by substring (ILIKE) across the searched columns OR by trigram similarity (pg_trgm) against a concatenated text blob with a 0.3 threshold — handles typos like "femalle" → "female". - Results ranked by summed similarity score across all tokens, then recency. Empty query falls back to "newest 100". - schema.sql: CREATE EXTENSION IF NOT EXISTS pg_trgm (idempotent; applied by ensure_schema on api startup). Admin gating: - auth.py: User now carries `is_admin`. Computed from a comma-separated ADMIN_EMAILS env var (case-insensitive match against `preferred_username`/`upn`/`email` claim). New `require_admin` FastAPI dependency 403s non-admins. - In DEV_AUTH_BYPASS mode the dev user is admin by default; flip DEV_AUTH_IS_ADMIN=false to test the read-only UX without enabling SSO. - POST /api/runs and POST /api/backfill now gated by require_admin. - /api/me carries is_admin so the SPA can hide the destructive buttons for non-admins. Frontend: - App.tsx fetches /api/me on mount and hides Run Now + Backfill unless `is_admin` is true. Non-admins still see search + results + recent-runs table. docker-compose / .env.example: thread ADMIN_EMAILS + DEV_AUTH_IS_ADMIN into the api container. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
9e6a75feb6
commit
1f2c2ff8e1
7 changed files with 179 additions and 49 deletions
|
|
@ -32,6 +32,13 @@ DEV_AUTH_EMAIL=dev@oliver.agency
|
|||
# goes in both the backend (token validation) and the frontend (MSAL login).
|
||||
AZURE_TENANT_ID=
|
||||
AZURE_CLIENT_ID=
|
||||
# Comma-separated list of admin emails. Only these accounts can click
|
||||
# "Run now" or "Backfill from Box". Everyone else gets a read-only UI.
|
||||
# Example: ADMIN_EMAILS=alice@oliver.agency,bob@oliver.agency
|
||||
ADMIN_EMAILS=
|
||||
# When DEV_AUTH_BYPASS=true the dev user is admin by default. Flip to false
|
||||
# to test the read-only UI without enabling SSO.
|
||||
DEV_AUTH_IS_ADMIN=true
|
||||
# Frontend mirrors — Vite reads VITE_* at build time and bakes them into dist.
|
||||
# Keep these in sync with the values above.
|
||||
VITE_DEV_AUTH_BYPASS=true
|
||||
|
|
|
|||
107
api.py
107
api.py
|
|
@ -23,7 +23,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
|||
from psycopg.rows import dict_row
|
||||
|
||||
import db
|
||||
from auth import User, maybe_auth_info, require_auth
|
||||
from auth import User, maybe_auth_info, require_admin, require_auth
|
||||
|
||||
BOX_FILE_URL = "https://app.box.com/file/{file_id}"
|
||||
|
||||
|
|
@ -86,26 +86,83 @@ def me(user: User = Depends(require_auth)):
|
|||
# ── Search ──────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
SEARCH_SQL = """
|
||||
SELECT id, run_id, created_at, file_id, file_name, folder_path, media_type,
|
||||
gemini_model, description, scenes, validated_metadata, raw_response,
|
||||
metadata_write_success, description_write_success, scene_comment_write_success,
|
||||
status, error_message, duration_ms
|
||||
FROM tagging_events
|
||||
WHERE
|
||||
(%(q)s = '' OR (
|
||||
file_name ILIKE %(like)s
|
||||
OR folder_path ILIKE %(like)s
|
||||
OR description ILIKE %(like)s
|
||||
OR status ILIKE %(like)s
|
||||
OR file_id ILIKE %(like)s
|
||||
OR coalesce(validated_metadata::text, '') ILIKE %(like)s
|
||||
OR coalesce(raw_response::text, '') ILIKE %(like)s
|
||||
OR coalesce(scenes::text, '') ILIKE %(like)s
|
||||
))
|
||||
ORDER BY created_at DESC
|
||||
LIMIT %(limit)s
|
||||
"""
|
||||
# Columns the search ILIKE walks over (substring match, case-insensitive).
_SEARCH_COLS = [
    "file_name",
    "folder_path",
    "description",
    "status",
    "file_id",
    "coalesce(validated_metadata::text, '')",
    "coalesce(raw_response::text, '')",
    "coalesce(scenes::text, '')",
]

# Single concatenated text used for trigram matching (fuzzy / typo tolerance).
_SEARCH_BLOB = (
    "coalesce(file_name,'')||' '||coalesce(folder_path,'')||' '||"
    "coalesce(description,'')||' '||coalesce(validated_metadata::text,'')||' '||"
    "coalesce(scenes::text,'')"
)

# Minimum trigram score for a token to count as a fuzzy hit. Compared against
# word_similarity(token, blob), i.e. the best-matching stretch of the blob.
_SIM_THRESHOLD = 0.3

# Short tokens (1-2 chars) are too noisy for trigrams — fall back to substring
# match only for those.
_MIN_FUZZY_TOKEN_LEN = 3


def _escape_like(token: str) -> str:
    """Escape LIKE metacharacters so *token* is matched literally.

    Backslash is PostgreSQL's default LIKE escape character, so it is doubled
    first; `%` and `_` are then prefixed with it.
    """
    return token.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def _build_search_sql(q: str, limit: int):
    """
    Build the parameterised search statement for a free-text query.

    The query is tokenised on whitespace and every token must match (AND).
    A token matches by literal, case-insensitive substring (ILIKE) in any of
    `_SEARCH_COLS`; tokens of at least `_MIN_FUZZY_TOKEN_LEN` characters also
    match when their trigram word-similarity against `_SEARCH_BLOB` exceeds
    `_SIM_THRESHOLD`. Rows are ordered by the summed fuzzy score, then by
    recency. An empty / whitespace-only query returns the newest rows.

    Args:
        q: Raw user query. Only ever passed to the database as bound
            parameters, never interpolated into the SQL text.
        limit: Maximum number of rows to return.

    Returns:
        A `(sql, params)` tuple suitable for `cursor.execute(sql, params)`
        with named (`%(name)s`) placeholders.
    """
    # str.split() with no argument already trims and drops empty pieces.
    tokens = q.split()
    common_cols = (
        "id, run_id, created_at, file_id, file_name, folder_path, media_type, "
        "gemini_model, description, scenes, validated_metadata, raw_response, "
        "metadata_write_success, description_write_success, scene_comment_write_success, "
        "status, error_message, duration_ms"
    )

    if not tokens:
        return (
            f"SELECT {common_cols} FROM tagging_events "
            f"ORDER BY created_at DESC LIMIT %(limit)s",
            {"limit": limit},
        )

    params: dict = {"limit": limit}
    clauses: list[str] = []
    score_terms: list[str] = []
    for i, tok in enumerate(tokens):
        like_key = f"like_{i}"
        # Escape %, _ and \ so user input cannot act as a LIKE wildcard.
        params[like_key] = f"%{_escape_like(tok)}%"
        col_ors = " OR ".join(f"{c} ILIKE %({like_key})s" for c in _SEARCH_COLS)

        if len(tok) < _MIN_FUZZY_TOKEN_LEN:
            clauses.append(f"({col_ors})")
            continue

        sim_key = f"sim_{i}"
        params[sim_key] = tok
        # word_similarity(needle, haystack) scores the token against the
        # closest stretch of the blob. Plain similarity() would compare the
        # token with the *entire* row text, which is near zero for any row of
        # realistic length, so the fuzzy branch would never fire.
        fuzzy = f"word_similarity(%({sim_key})s, {_SEARCH_BLOB})"
        clauses.append(f"(({col_ors}) OR {fuzzy} > {_SIM_THRESHOLD})")
        score_terms.append(fuzzy)

    where = " AND ".join(clauses)
    # With only short tokens there is nothing to score; rank purely by recency.
    score_sql = " + ".join(score_terms) if score_terms else "0"
    sql = (
        f"SELECT {common_cols}, ({score_sql}) AS _score "
        f"FROM tagging_events "
        f"WHERE {where} "
        f"ORDER BY _score DESC, created_at DESC "
        f"LIMIT %(limit)s"
    )
    return sql, params
|
||||
|
||||
|
||||
def _event_to_dict(row):
|
||||
|
|
@ -125,10 +182,10 @@ def search_events(
|
|||
limit: int = Query(100, ge=1, le=500),
|
||||
user: User = Depends(require_auth),
|
||||
):
|
||||
like = f"%{q}%"
|
||||
sql, params = _build_search_sql(q, limit)
|
||||
with _conn() as c:
|
||||
with c.cursor(row_factory=dict_row) as cur:
|
||||
cur.execute(SEARCH_SQL, {"q": q, "like": like, "limit": limit})
|
||||
cur.execute(sql, params)
|
||||
rows = cur.fetchall()
|
||||
return {"q": q, "count": len(rows), "results": [_event_to_dict(r) for r in rows]}
|
||||
|
||||
|
|
@ -168,7 +225,7 @@ def _run_pass_in_thread(run_id: uuid.UUID):
|
|||
|
||||
|
||||
@app.post("/api/runs")
|
||||
def start_run(user: User = Depends(require_auth)):
|
||||
def start_run(user: User = Depends(require_admin)):
|
||||
run_id = uuid.uuid4()
|
||||
t = threading.Thread(target=_run_pass_in_thread, args=(run_id,), daemon=True)
|
||||
t.start()
|
||||
|
|
@ -205,7 +262,7 @@ def _run_backfill_in_thread(run_id: uuid.UUID):
|
|||
|
||||
|
||||
@app.post("/api/backfill")
|
||||
def start_backfill(user: User = Depends(require_auth)):
|
||||
def start_backfill(user: User = Depends(require_admin)):
|
||||
"""
|
||||
Walk the Box folder and mirror any existing marriottUsa metadata into the
|
||||
local DB as `status='backfilled'` rows. Use this after first deploy (or
|
||||
|
|
|
|||
45
auth.py
45
auth.py
|
|
@ -19,6 +19,17 @@ AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID", "").strip()
|
|||
AZURE_CLIENT_ID = os.getenv("AZURE_CLIENT_ID", "").strip()
|
||||
DEV_AUTH_BYPASS = os.getenv("DEV_AUTH_BYPASS", "").strip().lower() in ("1", "true", "yes")
|
||||
|
||||
# Admin allowlist, parsed once at import time from the comma-separated
# ADMIN_EMAILS env var. Entries are trimmed and lower-cased; blanks are dropped.
# Accounts on the list may call the admin-gated endpoints (Run now, Backfill);
# everyone else is read-only.
_ADMIN_EMAILS = {
    addr
    for addr in (raw.strip().lower() for raw in os.getenv("ADMIN_EMAILS", "").split(","))
    if addr
}

# The bypass-mode dev user is an admin unless DEV_AUTH_IS_ADMIN is set to
# something other than 1/true/yes — handy for exercising the non-admin UX
# without switching to MSAL.
_DEV_AUTH_IS_ADMIN = os.getenv("DEV_AUTH_IS_ADMIN", "true").strip().lower() in {"1", "true", "yes"}
|
||||
|
||||
JWKS_URL = f"https://login.microsoftonline.com/{AZURE_TENANT_ID}/discovery/v2.0/keys" if AZURE_TENANT_ID else None
|
||||
ISSUERS = (
|
||||
f"https://login.microsoftonline.com/{AZURE_TENANT_ID}/v2.0",
|
||||
|
|
@ -41,22 +52,35 @@ def _get_jwks_client() -> PyJWKClient:
|
|||
|
||||
|
||||
class User:
    """Authenticated caller as seen by the API layer.

    All constructor arguments are keyword-only.

    Attributes:
        oid: Stable identifier of the account.
        name: Display name.
        email: Address used for the admin allowlist check.
        dev: True for the synthetic dev-bypass user.
        is_admin: True when the caller may use admin-gated endpoints.
    """

    def __init__(self, *, oid: str, name: str, email: str, dev: bool = False, is_admin: bool = False):
        self.oid = oid
        self.name = name
        self.email = email
        self.dev = dev
        self.is_admin = is_admin

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(oid={self.oid!r}, name={self.name!r}, "
            f"email={self.email!r}, dev={self.dev!r}, is_admin={self.is_admin!r})"
        )

    def to_dict(self):
        """Return the JSON-serialisable view of the user, including `is_admin`."""
        return {
            "oid": self.oid,
            "name": self.name,
            "email": self.email,
            "dev": self.dev,
            "is_admin": self.is_admin,
        }
|
||||
|
||||
|
||||
def _is_admin_email(email: str) -> bool:
    """Return True when *email*, trimmed and lower-cased, is on the admin allowlist.

    An empty (or otherwise falsy) address is never an admin.
    """
    if not email:
        return False
    return email.strip().lower() in _ADMIN_EMAILS
|
||||
|
||||
|
||||
def _bypass_user() -> User:
    """Build the synthetic dev user (presumably served when DEV_AUTH_BYPASS is on — confirm in require_auth).

    Name and email come from DEV_AUTH_NAME / DEV_AUTH_EMAIL. The user is an
    admin when DEV_AUTH_IS_ADMIN is truthy or when the dev email is itself
    listed in ADMIN_EMAILS.
    """
    # Read the address once so the User and the allowlist check agree on it.
    email = os.getenv("DEV_AUTH_EMAIL", "dev@oliver.agency")
    return User(
        oid="dev-bypass",
        name=os.getenv("DEV_AUTH_NAME", "Dev User"),
        email=email,
        dev=True,
        is_admin=_DEV_AUTH_IS_ADMIN or _is_admin_email(email),
    )
|
||||
|
||||
|
||||
|
|
@ -94,17 +118,30 @@ def require_auth(authorization: Optional[str] = Header(default=None)) -> User:
|
|||
except httpx.HTTPError as e:
|
||||
raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"JWKS fetch failed: {e}")
|
||||
|
||||
email = claims.get("preferred_username") or claims.get("upn") or claims.get("email", "")
|
||||
return User(
|
||||
oid=claims.get("oid") or claims.get("sub", "unknown"),
|
||||
name=claims.get("name", ""),
|
||||
email=claims.get("preferred_username") or claims.get("upn") or claims.get("email", ""),
|
||||
email=email,
|
||||
is_admin=_is_admin_email(email),
|
||||
)
|
||||
|
||||
|
||||
def require_admin(user: User = Depends(require_auth)) -> User:
    """FastAPI dependency: pass the authenticated user through only if they are an admin.

    Raises:
        HTTPException: 403 when `user.is_admin` is false.
    """
    if user.is_admin:
        return user
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Admin-only endpoint",
    )
|
||||
|
||||
|
||||
def maybe_auth_info():
    """Summarise the auth configuration for diagnostics (intended for /api/health).

    Reports whether dev bypass is on, whether the Azure tenant and client ids
    are set, and how many admin emails are configured (a count, not a flag).
    """
    return dict(
        dev_bypass=DEV_AUTH_BYPASS,
        tenant_configured=bool(AZURE_TENANT_ID),
        client_configured=bool(AZURE_CLIENT_ID),
        admin_emails_configured=len(_ADMIN_EMAILS),
    )
|
||||
|
|
|
|||
|
|
@ -47,8 +47,14 @@ services:
|
|||
DEV_AUTH_BYPASS: ${DEV_AUTH_BYPASS:-true}
|
||||
AZURE_TENANT_ID: ${AZURE_TENANT_ID:-}
|
||||
AZURE_CLIENT_ID: ${AZURE_CLIENT_ID:-}
|
||||
# Comma-separated list of admin emails — only these accounts can hit
|
||||
# POST /api/runs and POST /api/backfill. Everyone else is read-only.
|
||||
ADMIN_EMAILS: ${ADMIN_EMAILS:-}
|
||||
DEV_AUTH_EMAIL: ${DEV_AUTH_EMAIL:-dev@oliver.agency}
|
||||
DEV_AUTH_NAME: ${DEV_AUTH_NAME:-Dev User}
|
||||
# In DEV_AUTH_BYPASS mode the dev user is admin by default; flip to
|
||||
# false here if you want to test the non-admin UX without enabling SSO.
|
||||
DEV_AUTH_IS_ADMIN: ${DEV_AUTH_IS_ADMIN:-true}
|
||||
# CORS for local dev: when Vite is on :5173 and FastAPI on host:8004.
|
||||
# Empty in prod — Apache serves SPA and API under the same origin.
|
||||
CORS_ORIGINS: ${CORS_ORIGINS:-}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import {
|
|||
Event,
|
||||
Run,
|
||||
listRuns,
|
||||
me as fetchMe,
|
||||
runEvents,
|
||||
searchEvents,
|
||||
startBackfill,
|
||||
|
|
@ -17,6 +18,8 @@ export function App() {
|
|||
const [searching, setSearching] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
const [isAdmin, setIsAdmin] = useState(false);
|
||||
|
||||
const [activeRun, setActiveRun] = useState<string | null>(null);
|
||||
const [activeRunKind, setActiveRunKind] = useState<"tag" | "backfill" | null>(null);
|
||||
const [activeRunEvents, setActiveRunEvents] = useState<Event[]>([]);
|
||||
|
|
@ -52,11 +55,14 @@ export function App() {
|
|||
}
|
||||
}, [auth.getToken]);
|
||||
|
||||
// Initial load: recent rows + recent runs.
|
||||
// Initial load: who am I + recent rows + recent runs.
|
||||
useEffect(() => {
|
||||
fetchMe(auth.getToken)
|
||||
.then((u) => setIsAdmin(Boolean(u.is_admin)))
|
||||
.catch(() => setIsAdmin(false));
|
||||
doSearch("");
|
||||
refreshRuns();
|
||||
}, [doSearch, refreshRuns]);
|
||||
}, [auth.getToken, doSearch, refreshRuns]);
|
||||
|
||||
// Poll active run.
|
||||
useEffect(() => {
|
||||
|
|
@ -180,24 +186,28 @@ export function App() {
|
|||
<button className="primary" type="submit" disabled={searching}>
|
||||
{searching ? "Searching…" : "Search"}
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
className="run-now"
|
||||
disabled={starting !== null}
|
||||
title="Trigger a tagging pass against the Box folder right now"
|
||||
onClick={onRunNow}
|
||||
>
|
||||
{starting === "tag" ? "Starting…" : "Run now"}
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
className="ghost"
|
||||
disabled={starting !== null}
|
||||
title="Walk Box and mirror existing marriottUsa metadata into the local DB. No Gemini, no Box writes."
|
||||
onClick={onBackfill}
|
||||
>
|
||||
{starting === "backfill" ? "Starting…" : "Backfill from Box"}
|
||||
</button>
|
||||
{isAdmin && (
|
||||
<>
|
||||
<button
|
||||
type="button"
|
||||
className="run-now"
|
||||
disabled={starting !== null}
|
||||
title="Trigger a tagging pass against the Box folder right now"
|
||||
onClick={onRunNow}
|
||||
>
|
||||
{starting === "tag" ? "Starting…" : "Run now"}
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
className="ghost"
|
||||
disabled={starting !== null}
|
||||
title="Walk Box and mirror existing marriottUsa metadata into the local DB. No Gemini, no Box writes."
|
||||
onClick={onBackfill}
|
||||
>
|
||||
{starting === "backfill" ? "Starting…" : "Backfill from Box"}
|
||||
</button>
|
||||
</>
|
||||
)}
|
||||
</form>
|
||||
{error && <p className="error">{error}</p>}
|
||||
</section>
|
||||
|
|
|
|||
|
|
@ -104,3 +104,13 @@ export function runEvents(getToken: GetTokenFn, runId: string) {
|
|||
export function health() {
|
||||
return fetch(`${API_BASE}/health`).then((r) => r.json());
|
||||
}
|
||||
|
||||
/** Fields of the `/me` response that the SPA reads. */
type MeResponse = {
  oid: string;
  name: string;
  email: string;
  dev: boolean;
  is_admin: boolean;
};

/** Fetch the current user's profile, including the `is_admin` flag, from `/me`. */
export function me(getToken: GetTokenFn) {
  return req<MeResponse>(`/me`, getToken);
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,9 @@
|
|||
-- One row per file the tagger sent to Gemini (success or error).
|
||||
-- Skipped-as-already-tagged files do not produce rows.
|
||||
|
||||
-- pg_trgm powers the fuzzy `similarity()` call in /api/events. Idempotent.
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS tagging_events (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
run_id UUID NOT NULL,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue