From 04440d661de44acda77fff62112249bfc337b2f8 Mon Sep 17 00:00:00 2001 From: DJP Date: Mon, 11 May 2026 16:20:13 -0400 Subject: [PATCH] Cache Box thumbnails + search blob; render in UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Search results were text-only — hard to scan visually with thousands of assets coming. Now every file that Gemini tags or that backfill mirrors also gets its Box-generated 160x160 JPG thumbnail (~10-20 KB) pulled and stashed in Postgres, plus a consolidated `search_terms` blob (file_name + folder + description + flattened metadata values). Search results render the thumbnail inline; rows missing one show a striped placeholder. Search SQL now LEFT JOINs file_assets and hits search_terms too, so backfilled rows are properly searchable. - schema.sql: new `file_assets` table (file_id PK, thumbnail_bytes bytea, thumbnail_content_type, thumbnail_size, search_terms text, updated_at). Idempotent. - db.py: `upsert_file_asset` (INSERT … ON CONFLICT that keeps the existing thumbnail bytes, content type, size and search_terms whenever the new value is NULL, e.g. because today's fetch failed) and `get_thumbnail`. Both log a WARN to stderr and swallow exceptions per the established defensive pattern. - main.py: `fetch_thumbnail` (Box SDK get_file_thumbnail_by_id, JPG at 160 px, handles BoxAPIError 202/404 as soft misses) and `build_search_terms` (lowercase, whitespace-collapsed text blob). `_persist_file_asset` wires both into the image+video success paths of `_run_pass` and into every iteration of `_run_backfill`. - Backfill skip logic refined: always upsert file_assets (idempotent PK), only skip the tagging_events insert if a good row already exists. Re-running Backfill from Box populates thumbnails for rows backfilled before this feature shipped. - api.py: `GET /api/files/{file_id}/thumbnail` serves the bytea with `Cache-Control: private, max-age=86400` and returns 404 when no thumbnail is stored. Search SQL gains the LEFT JOIN and emits `has_thumbnail` per row. Search also matches against fa.search_terms so backfilled rows surface for free-text queries that hit their metadata. 
- frontend: Event type adds `has_thumbnail`; `thumbnailUrl(fileId)` helper builds the prefix-aware URL via Vite's BASE. EventList renders the thumbnail (lazy, with onError fallback) or a striped placeholder. .thumb styling + .event-head layout in styles.css. Verified locally: schema applies via lifespan; upsert + get_thumbnail roundtrip; /api/files/999/thumbnail returns 200 with bytes; /api/events returns has_thumbnail per row; multi-token "female city" search finds a row whose validated_metadata contains both tokens. Co-Authored-By: Claude Opus 4.7 (1M context) --- api.py | 88 ++++++++++++++++++++++------- db.py | 73 ++++++++++++++++++++++++ frontend/src/App.tsx | 14 +++++ frontend/src/api.ts | 6 ++ frontend/src/styles.css | 19 +++++++ main.py | 122 +++++++++++++++++++++++++++++++++++----- schema.sql | 13 +++++ 7 files changed, 302 insertions(+), 33 deletions(-) diff --git a/api.py b/api.py index 44eb63d..29e54fe 100644 --- a/api.py +++ b/api.py @@ -18,7 +18,7 @@ import uuid from contextlib import asynccontextmanager, contextmanager from typing import Optional -from fastapi import Depends, FastAPI, HTTPException, Query +from fastapi import Depends, FastAPI, HTTPException, Query, Response from fastapi.middleware.cors import CORSMiddleware from psycopg.rows import dict_row @@ -26,6 +26,8 @@ import db from auth import User, maybe_auth_info, require_admin, require_auth BOX_FILE_URL = "https://app.box.com/file/{file_id}" +# Frontend builds the actual URL by joining its own API base with file_id — +# we only signal *whether* a thumbnail exists so the JSON stays prefix-agnostic. @asynccontextmanager @@ -138,23 +140,46 @@ def _build_search_sql(q: str, limit: int): """ Tokenise the query on whitespace, AND-match every token across the columns, where each token may match by substring OR by trigram similarity. Results - ranked by summed similarity score, then recency. + ranked by summed similarity score, then recency. 
LEFT JOINs file_assets + so the response can flag whether a thumbnail is available. """ tokens = [t for t in q.strip().split() if t] common_cols = ( - "id, run_id, created_at, file_id, file_name, folder_path, media_type, " - "gemini_model, description, scenes, validated_metadata, raw_response, " - "metadata_write_success, description_write_success, scene_comment_write_success, " - "status, error_message, duration_ms" + "e.id, e.run_id, e.created_at, e.file_id, e.file_name, e.folder_path, e.media_type, " + "e.gemini_model, e.description, e.scenes, e.validated_metadata, e.raw_response, " + "e.metadata_write_success, e.description_write_success, e.scene_comment_write_success, " + "e.status, e.error_message, e.duration_ms, " + "(fa.thumbnail_bytes IS NOT NULL) AS _has_thumbnail" ) + join_clause = "FROM tagging_events e LEFT JOIN file_assets fa ON fa.file_id = e.file_id" if not tokens: return ( - f"SELECT {common_cols} FROM tagging_events " - f"ORDER BY created_at DESC LIMIT %(limit)s", + f"SELECT {common_cols} {join_clause} " + f"ORDER BY e.created_at DESC LIMIT %(limit)s", {"limit": limit}, ) + # Per-token search runs across event columns + the file_assets.search_terms + # blob — gives backfilled rows a richer search surface than just the event + # row's fields. 
+ all_search_cols = [ + "e.file_name", + "e.folder_path", + "e.description", + "e.status", + "e.file_id", + "coalesce(e.validated_metadata::text, '')", + "coalesce(e.raw_response::text, '')", + "coalesce(e.scenes::text, '')", + "coalesce(fa.search_terms, '')", + ] + fuzzy_blob = ( + "coalesce(e.file_name,'')||' '||coalesce(e.folder_path,'')||' '||" + "coalesce(e.description,'')||' '||coalesce(e.validated_metadata::text,'')||' '||" + "coalesce(e.scenes::text,'')||' '||coalesce(fa.search_terms,'')" + ) + params: dict = {"limit": limit} clauses: list[str] = [] score_terms: list[str] = [] @@ -163,13 +188,13 @@ def _build_search_sql(q: str, limit: int): sim_key = f"sim_{i}" params[like_key] = f"%{tok}%" params[sim_key] = tok - col_ors = " OR ".join(f"{c} ILIKE %({like_key})s" for c in _SEARCH_COLS) + col_ors = " OR ".join(f"{c} ILIKE %({like_key})s" for c in all_search_cols) if len(tok) >= _MIN_FUZZY_TOKEN_LEN: clauses.append( f"(({col_ors}) " - f"OR similarity({_SEARCH_BLOB}, %({sim_key})s) > {_SIM_THRESHOLD})" + f"OR similarity({fuzzy_blob}, %({sim_key})s) > {_SIM_THRESHOLD})" ) - score_terms.append(f"similarity({_SEARCH_BLOB}, %({sim_key})s)") + score_terms.append(f"similarity({fuzzy_blob}, %({sim_key})s)") else: clauses.append(f"({col_ors})") @@ -177,9 +202,9 @@ def _build_search_sql(q: str, limit: int): score_sql = " + ".join(score_terms) if score_terms else "0" sql = ( f"SELECT {common_cols}, ({score_sql}) AS _score " - f"FROM tagging_events " + f"{join_clause} " f"WHERE {where} " - f"ORDER BY _score DESC, created_at DESC " + f"ORDER BY _score DESC, e.created_at DESC " f"LIMIT %(limit)s" ) return sql, params @@ -189,6 +214,7 @@ def _event_to_dict(row): out = dict(row) fid = out.get("file_id") out["box_url"] = BOX_FILE_URL.format(file_id=fid) if fid else None + out["has_thumbnail"] = bool(out.pop("_has_thumbnail", False)) if out.get("run_id") is not None: out["run_id"] = str(out["run_id"]) if out.get("created_at") is not None: @@ -196,6 +222,26 @@ def 
_event_to_dict(row): return out +@app.get("/api/files/{file_id}/thumbnail") +def file_thumbnail(file_id: str, user: User = Depends(require_auth)): + """ + Serve a previously-cached Box thumbnail (160x160 JPG by default) for a file. + Bytes live in Postgres bytea on file_assets — see `_persist_file_asset` + in main.py. Browser is told to cache aggressively; the asset is stable + until a re-fetch overwrites the row. + """ + with _conn() as c: + row = db.get_thumbnail(c, file_id) + if not row: + raise HTTPException(status_code=404, detail="no thumbnail") + data, content_type = row + return Response( + content=data, + media_type=content_type or "image/jpeg", + headers={"Cache-Control": "private, max-age=86400"}, + ) + + @app.get("/api/events") def search_events( q: str = Query("", description="Free-text search across all fields"), @@ -343,13 +389,15 @@ def run_events(run_id: str, user: User = Depends(require_auth), limit: int = Que with c.cursor(row_factory=dict_row) as cur: cur.execute( """ - SELECT id, run_id, created_at, file_id, file_name, folder_path, media_type, - gemini_model, description, scenes, validated_metadata, - metadata_write_success, description_write_success, - scene_comment_write_success, status, error_message, duration_ms - FROM tagging_events - WHERE run_id = %s - ORDER BY created_at DESC + SELECT e.id, e.run_id, e.created_at, e.file_id, e.file_name, e.folder_path, e.media_type, + e.gemini_model, e.description, e.scenes, e.validated_metadata, + e.metadata_write_success, e.description_write_success, + e.scene_comment_write_success, e.status, e.error_message, e.duration_ms, + (fa.thumbnail_bytes IS NOT NULL) AS _has_thumbnail + FROM tagging_events e + LEFT JOIN file_assets fa ON fa.file_id = e.file_id + WHERE e.run_id = %s + ORDER BY e.created_at DESC LIMIT %s """, (run_id, limit), diff --git a/db.py b/db.py index bb2e55f..560788b 100644 --- a/db.py +++ b/db.py @@ -50,6 +50,79 @@ def ensure_schema(conn): cur.execute(sql) +_UPSERT_FILE_ASSET_SQL = """ 
+INSERT INTO file_assets ( + file_id, thumbnail_bytes, thumbnail_content_type, thumbnail_size, + search_terms, updated_at +) VALUES ( + %(file_id)s, %(thumbnail_bytes)s, %(thumbnail_content_type)s, %(thumbnail_size)s, + %(search_terms)s, now() +) +ON CONFLICT (file_id) DO UPDATE SET + -- only overwrite the thumbnail when we have new bytes; preserves + -- previously-captured thumbs across runs where the fetch failed. + thumbnail_bytes = COALESCE(EXCLUDED.thumbnail_bytes, file_assets.thumbnail_bytes), + thumbnail_content_type = COALESCE(EXCLUDED.thumbnail_content_type, file_assets.thumbnail_content_type), + thumbnail_size = COALESCE(EXCLUDED.thumbnail_size, file_assets.thumbnail_size), + search_terms = COALESCE(EXCLUDED.search_terms, file_assets.search_terms), + updated_at = now() +""" + + +def upsert_file_asset( + conn, + *, + file_id, + thumbnail_bytes=None, + thumbnail_content_type=None, + thumbnail_size=None, + search_terms=None, +): + """ + Idempotently insert/update the per-file row. Failures are swallowed — + a thumbnail or search-blob hiccup must never stop a tagging pass. 
+ """ + if conn is None or not file_id: + return + try: + with conn.cursor() as cur: + cur.execute(_UPSERT_FILE_ASSET_SQL, { + "file_id": str(file_id), + "thumbnail_bytes": thumbnail_bytes, + "thumbnail_content_type": thumbnail_content_type, + "thumbnail_size": thumbnail_size, + "search_terms": search_terms, + }) + except Exception as e: + print( + f" WARN: DB upsert_file_asset failed ({type(e).__name__}: {e}) — continuing", + file=sys.stderr, + ) + + +def get_thumbnail(conn, file_id): + """Return (bytes, content_type) for the file's stored thumbnail, or None.""" + if conn is None or not file_id: + return None + try: + with conn.cursor() as cur: + cur.execute( + "SELECT thumbnail_bytes, thumbnail_content_type " + "FROM file_assets WHERE file_id = %s AND thumbnail_bytes IS NOT NULL", + (str(file_id),), + ) + row = cur.fetchone() + if not row: + return None + return bytes(row[0]), row[1] + except Exception as e: + print( + f" WARN: DB get_thumbnail failed ({type(e).__name__}: {e})", + file=sys.stderr, + ) + return None + + def is_file_already_tagged(conn, file_id) -> bool: """ Skip-check oracle. A file counts as "already tagged" if we have any row diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 69aef9d..df0c80c 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -9,6 +9,7 @@ import { searchEvents, startBackfill, startRun, + thumbnailUrl, } from "./api"; export function App() { @@ -281,6 +282,19 @@ function EventList({ events }: { events: Event[] }) { {events.map((e) => (
  • + {e.has_thumbnail ? ( + { + (ev.currentTarget as HTMLImageElement).style.display = "none"; + }} + /> + ) : ( +