Cache Box thumbnails + search blob; render in UI

Search results were text-only — hard to scan visually with thousands
of assets coming. Now every file that Gemini tags or that backfill
mirrors also gets its Box-generated 160x160 JPG thumbnail (~10-20 KB)
pulled and
stashed in Postgres, plus a consolidated `search_terms` blob
(file_name + folder + description + flattened metadata values).
Search results render the thumbnail inline; rows missing one show a
striped placeholder. Search SQL now LEFT JOINs file_assets and hits
search_terms too, so backfilled rows are properly searchable.

- schema.sql: new `file_assets` table (file_id PK, thumbnail_bytes
  bytea, thumbnail_content_type, thumbnail_size, search_terms text,
  updated_at). Idempotent (CREATE TABLE IF NOT EXISTS).
- db.py: `upsert_file_asset` (INSERT … ON CONFLICT preserving
  existing thumbnail bytes if today's fetch failed) and
  `get_thumbnail`. Both swallow exceptions per the established
  defensive pattern.
- main.py: `fetch_thumbnail` (Box SDK get_file_thumbnail_by_id, JPG
  at 160 px, handles BoxAPIError 202/404 as soft misses) and
  `build_search_terms` (lowercase, whitespace-collapsed text blob).
  `_persist_file_asset` wires both into the image+video success
  paths of `_run_pass` and into every iteration of `_run_backfill`.
- Backfill skip logic refined: always upsert file_assets (idempotent
  PK), only skip the tagging_events insert if a good row already
  exists. Re-running Backfill from Box populates thumbnails for
  rows backfilled before this feature shipped.
- api.py: `GET /api/files/{file_id}/thumbnail` streams the bytea
  with Cache-Control max-age=86400. Search SQL gains the LEFT JOIN
  and emits `has_thumbnail` per row. Search also matches against
  fa.search_terms so backfilled rows surface for free-text queries
  that hit their metadata.
- frontend: Event type adds `has_thumbnail`; `thumbnailUrl(fileId)`
  helper builds the prefix-aware URL via Vite's BASE. EventList
  renders the thumbnail (lazy, with onError fallback) or a striped
  placeholder. .thumb styling + .event-head layout in styles.css.

Verified locally: schema applies via lifespan; upsert + get_thumbnail
roundtrip; /api/files/999/thumbnail returns 200 with bytes; /api/events
returns has_thumbnail per row; multi-token "female city" search finds
a row whose validated_metadata contains both tokens.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
DJP 2026-05-11 16:20:13 -04:00
parent 6f367d5b29
commit 04440d661d
7 changed files with 302 additions and 33 deletions

88
api.py
View file

@ -18,7 +18,7 @@ import uuid
from contextlib import asynccontextmanager, contextmanager
from typing import Optional
from fastapi import Depends, FastAPI, HTTPException, Query
from fastapi import Depends, FastAPI, HTTPException, Query, Response
from fastapi.middleware.cors import CORSMiddleware
from psycopg.rows import dict_row
@ -26,6 +26,8 @@ import db
from auth import User, maybe_auth_info, require_admin, require_auth
BOX_FILE_URL = "https://app.box.com/file/{file_id}"
# Frontend builds the actual URL by joining its own API base with file_id —
# we only signal *whether* a thumbnail exists so the JSON stays prefix-agnostic.
@asynccontextmanager
@ -138,23 +140,46 @@ def _build_search_sql(q: str, limit: int):
"""
Tokenise the query on whitespace, AND-match every token across the columns,
where each token may match by substring OR by trigram similarity. Results
ranked by summed similarity score, then recency.
ranked by summed similarity score, then recency. LEFT JOINs file_assets
so the response can flag whether a thumbnail is available.
"""
tokens = [t for t in q.strip().split() if t]
common_cols = (
"id, run_id, created_at, file_id, file_name, folder_path, media_type, "
"gemini_model, description, scenes, validated_metadata, raw_response, "
"metadata_write_success, description_write_success, scene_comment_write_success, "
"status, error_message, duration_ms"
"e.id, e.run_id, e.created_at, e.file_id, e.file_name, e.folder_path, e.media_type, "
"e.gemini_model, e.description, e.scenes, e.validated_metadata, e.raw_response, "
"e.metadata_write_success, e.description_write_success, e.scene_comment_write_success, "
"e.status, e.error_message, e.duration_ms, "
"(fa.thumbnail_bytes IS NOT NULL) AS _has_thumbnail"
)
join_clause = "FROM tagging_events e LEFT JOIN file_assets fa ON fa.file_id = e.file_id"
if not tokens:
return (
f"SELECT {common_cols} FROM tagging_events "
f"ORDER BY created_at DESC LIMIT %(limit)s",
f"SELECT {common_cols} {join_clause} "
f"ORDER BY e.created_at DESC LIMIT %(limit)s",
{"limit": limit},
)
# Per-token search runs across event columns + the file_assets.search_terms
# blob — gives backfilled rows a richer search surface than just the event
# row's fields.
all_search_cols = [
"e.file_name",
"e.folder_path",
"e.description",
"e.status",
"e.file_id",
"coalesce(e.validated_metadata::text, '')",
"coalesce(e.raw_response::text, '')",
"coalesce(e.scenes::text, '')",
"coalesce(fa.search_terms, '')",
]
fuzzy_blob = (
"coalesce(e.file_name,'')||' '||coalesce(e.folder_path,'')||' '||"
"coalesce(e.description,'')||' '||coalesce(e.validated_metadata::text,'')||' '||"
"coalesce(e.scenes::text,'')||' '||coalesce(fa.search_terms,'')"
)
params: dict = {"limit": limit}
clauses: list[str] = []
score_terms: list[str] = []
@ -163,13 +188,13 @@ def _build_search_sql(q: str, limit: int):
sim_key = f"sim_{i}"
params[like_key] = f"%{tok}%"
params[sim_key] = tok
col_ors = " OR ".join(f"{c} ILIKE %({like_key})s" for c in _SEARCH_COLS)
col_ors = " OR ".join(f"{c} ILIKE %({like_key})s" for c in all_search_cols)
if len(tok) >= _MIN_FUZZY_TOKEN_LEN:
clauses.append(
f"(({col_ors}) "
f"OR similarity({_SEARCH_BLOB}, %({sim_key})s) > {_SIM_THRESHOLD})"
f"OR similarity({fuzzy_blob}, %({sim_key})s) > {_SIM_THRESHOLD})"
)
score_terms.append(f"similarity({_SEARCH_BLOB}, %({sim_key})s)")
score_terms.append(f"similarity({fuzzy_blob}, %({sim_key})s)")
else:
clauses.append(f"({col_ors})")
@ -177,9 +202,9 @@ def _build_search_sql(q: str, limit: int):
score_sql = " + ".join(score_terms) if score_terms else "0"
sql = (
f"SELECT {common_cols}, ({score_sql}) AS _score "
f"FROM tagging_events "
f"{join_clause} "
f"WHERE {where} "
f"ORDER BY _score DESC, created_at DESC "
f"ORDER BY _score DESC, e.created_at DESC "
f"LIMIT %(limit)s"
)
return sql, params
@ -189,6 +214,7 @@ def _event_to_dict(row):
out = dict(row)
fid = out.get("file_id")
out["box_url"] = BOX_FILE_URL.format(file_id=fid) if fid else None
out["has_thumbnail"] = bool(out.pop("_has_thumbnail", False))
if out.get("run_id") is not None:
out["run_id"] = str(out["run_id"])
if out.get("created_at") is not None:
@ -196,6 +222,26 @@ def _event_to_dict(row):
return out
@app.get("/api/files/{file_id}/thumbnail")
def file_thumbnail(file_id: str, user: User = Depends(require_auth)):
    """
    Serve a previously-cached Box thumbnail (160x160 JPG by default) for a file.

    The bytes are read from the `file_assets` table through
    `db.get_thumbnail`; they are written there by `_persist_file_asset` in
    main.py. Responds 404 when no thumbnail is stored for `file_id`. The
    response carries a one-day private Cache-Control header, and falls back to
    "image/jpeg" when no content type was stored alongside the bytes.
    """
    with _conn() as connection:
        cached = db.get_thumbnail(connection, file_id)
    if not cached:
        raise HTTPException(status_code=404, detail="no thumbnail")

    image_bytes, stored_type = cached
    cache_headers = {"Cache-Control": "private, max-age=86400"}
    return Response(
        content=image_bytes,
        media_type=stored_type or "image/jpeg",
        headers=cache_headers,
    )
@app.get("/api/events")
def search_events(
q: str = Query("", description="Free-text search across all fields"),
@ -343,13 +389,15 @@ def run_events(run_id: str, user: User = Depends(require_auth), limit: int = Que
with c.cursor(row_factory=dict_row) as cur:
cur.execute(
"""
SELECT id, run_id, created_at, file_id, file_name, folder_path, media_type,
gemini_model, description, scenes, validated_metadata,
metadata_write_success, description_write_success,
scene_comment_write_success, status, error_message, duration_ms
FROM tagging_events
WHERE run_id = %s
ORDER BY created_at DESC
SELECT e.id, e.run_id, e.created_at, e.file_id, e.file_name, e.folder_path, e.media_type,
e.gemini_model, e.description, e.scenes, e.validated_metadata,
e.metadata_write_success, e.description_write_success,
e.scene_comment_write_success, e.status, e.error_message, e.duration_ms,
(fa.thumbnail_bytes IS NOT NULL) AS _has_thumbnail
FROM tagging_events e
LEFT JOIN file_assets fa ON fa.file_id = e.file_id
WHERE e.run_id = %s
ORDER BY e.created_at DESC
LIMIT %s
""",
(run_id, limit),

73
db.py
View file

@ -50,6 +50,79 @@ def ensure_schema(conn):
cur.execute(sql)
# Upsert statement for a single file_assets row, keyed on file_id. The named
# placeholders are bound by upsert_file_asset(). On conflict, every data column
# is COALESCEd against the stored row, so binding NULL for a column keeps the
# previously stored value; updated_at is refreshed on every call.
_UPSERT_FILE_ASSET_SQL = """
INSERT INTO file_assets (
file_id, thumbnail_bytes, thumbnail_content_type, thumbnail_size,
search_terms, updated_at
) VALUES (
%(file_id)s, %(thumbnail_bytes)s, %(thumbnail_content_type)s, %(thumbnail_size)s,
%(search_terms)s, now()
)
ON CONFLICT (file_id) DO UPDATE SET
-- only overwrite the thumbnail when we have new bytes; preserves
-- previously-captured thumbs across runs where the fetch failed.
thumbnail_bytes = COALESCE(EXCLUDED.thumbnail_bytes, file_assets.thumbnail_bytes),
thumbnail_content_type = COALESCE(EXCLUDED.thumbnail_content_type, file_assets.thumbnail_content_type),
thumbnail_size = COALESCE(EXCLUDED.thumbnail_size, file_assets.thumbnail_size),
search_terms = COALESCE(EXCLUDED.search_terms, file_assets.search_terms),
updated_at = now()
"""
def upsert_file_asset(
    conn,
    *,
    file_id,
    thumbnail_bytes=None,
    thumbnail_content_type=None,
    thumbnail_size=None,
    search_terms=None,
):
    """
    Insert or update the per-file `file_assets` row for `file_id`.

    Does nothing when `conn` is None or `file_id` is falsy. Any exception
    raised while running the statement is reported on stderr and swallowed —
    a thumbnail or search-blob hiccup must never stop a tagging pass.
    Always returns None.
    """
    if conn is None:
        return
    if not file_id:
        return

    try:
        with conn.cursor() as cursor:
            bound = {
                "file_id": str(file_id),
                "thumbnail_bytes": thumbnail_bytes,
                "thumbnail_content_type": thumbnail_content_type,
                "thumbnail_size": thumbnail_size,
                "search_terms": search_terms,
            }
            cursor.execute(_UPSERT_FILE_ASSET_SQL, bound)
    except Exception as exc:
        # NOTE(review): no rollback happens here. If the connection is not in
        # autocommit mode, a failed statement may leave the transaction aborted
        # for later statements — confirm how callers configure `conn`.
        print(
            f" WARN: DB upsert_file_asset failed ({type(exc).__name__}: {exc}) — continuing",
            file=sys.stderr,
        )
def get_thumbnail(conn, file_id):
    """
    Look up the stored thumbnail for `file_id`.

    Returns a `(bytes, content_type)` tuple when a row with non-NULL
    thumbnail bytes exists. Returns None when `conn` is None, `file_id` is
    falsy, no such row exists, or the lookup raises (the error is reported on
    stderr rather than propagated).
    """
    if conn is None:
        return None
    if not file_id:
        return None

    query = (
        "SELECT thumbnail_bytes, thumbnail_content_type "
        "FROM file_assets WHERE file_id = %s AND thumbnail_bytes IS NOT NULL"
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (str(file_id),))
            record = cursor.fetchone()
        if record:
            # Normalise whatever buffer type the driver hands back to plain bytes.
            return bytes(record[0]), record[1]
        return None
    except Exception as exc:
        print(
            f" WARN: DB get_thumbnail failed ({type(exc).__name__}: {exc})",
            file=sys.stderr,
        )
        return None
def is_file_already_tagged(conn, file_id) -> bool:
"""
Skip-check oracle. A file counts as "already tagged" if we have any row

View file

@ -9,6 +9,7 @@ import {
searchEvents,
startBackfill,
startRun,
thumbnailUrl,
} from "./api";
export function App() {
@ -281,6 +282,19 @@ function EventList({ events }: { events: Event[] }) {
{events.map((e) => (
<li key={e.id} className={`event status-${e.status}`}>
<div className="event-head">
{e.has_thumbnail ? (
<img
className="thumb"
loading="lazy"
alt=""
src={thumbnailUrl(e.file_id)}
onError={(ev) => {
(ev.currentTarget as HTMLImageElement).style.display = "none";
}}
/>
) : (
<span className="thumb thumb-placeholder" aria-hidden="true" />
)}
<span className={`badge type-${e.media_type}`}>{e.media_type}</span>
<span className="event-name">{e.file_name}</span>
<span className={`pill status-${e.status}`}>{e.status}</span>

View file

@ -28,8 +28,14 @@ export type Event = {
error_message: string | null;
duration_ms: number | null;
box_url: string | null;
has_thumbnail: boolean;
};
/**
 * Thumbnail endpoint URL for a Box file, rooted at the SPA's own API base so it
 * works whether the app is served from /marriott-tagging/ or from /.
 * The file id is percent-encoded before it is placed in the path.
 */
export function thumbnailUrl(fileId: string): string {
  const encodedId = encodeURIComponent(fileId);
  return `${API_BASE}/files/${encodedId}/thumbnail`;
}
export type Run = {
run_id: string;
started_at: string | null;

View file

@ -231,6 +231,25 @@ input:focus {
gap: 0.5rem;
flex-wrap: wrap;
}
/* Fixed 64x64 thumbnail box. Applied to the cached <img> and, together with
   .thumb-placeholder, to the <span> shown for rows without a thumbnail. */
.thumb {
display: block;
width: 64px;
height: 64px;
/* Fill the square and crop overflow instead of distorting the image. */
object-fit: cover;
border-radius: 4px;
border: 1px solid var(--line);
background: var(--surface);
/* Keep the box at full width — presumably .event-head is a flex row. */
flex-shrink: 0;
}
/* Diagonal 6px stripes alternating --surface / --surface-alt; overrides the
   flat background set by .thumb. */
.thumb-placeholder {
background: repeating-linear-gradient(
45deg,
var(--surface),
var(--surface) 6px,
var(--surface-alt) 6px,
var(--surface-alt) 12px
);
}
.event-name {
font-weight: 600;
flex: 1;

122
main.py
View file

@ -26,6 +26,7 @@ from box_sdk_gen.managers.file_metadata import (
UpdateFileMetadataByIdRequestBody,
UpdateFileMetadataByIdRequestBodyOpField,
)
from box_sdk_gen.managers.files import GetFileThumbnailByIdExtension
from box_sdk_gen.managers.comments import CreateCommentItem, CreateCommentItemTypeField
from google import genai
@ -660,6 +661,67 @@ def fetch_file_description(box_client, file_id):
return None
# Box's pre-rendered JPG thumbnails come in 32, 160, or 320 (px square).
THUMBNAIL_DIM = 160


def fetch_thumbnail(box_client, file_id, dim=THUMBNAIL_DIM):
    """
    Pull a small square JPG thumbnail (160x160 by default) from Box.

    `dim` is passed as both the minimum and maximum width and height.
    Returns `(bytes, "image/jpeg")` on success. Returns None when the SDK
    hands back no stream, the payload is empty, or any exception is raised.
    Never raises — the caller treats "no thumb" as a soft miss and the row's
    thumbnail column stays NULL.
    """
    try:
        rendition = box_client.files.get_file_thumbnail_by_id(
            file_id=file_id,
            extension=GetFileThumbnailByIdExtension.JPG,
            min_height=dim,
            min_width=dim,
            max_height=dim,
            max_width=dim,
        )
        if rendition is None:
            return None
        # The SDK may return a file-like stream or a raw buffer; accept both.
        if hasattr(rendition, "read"):
            payload = rendition.read()
        else:
            payload = bytes(rendition)
        return (payload, "image/jpeg") if payload else None
    except BoxAPIError as box_err:
        # Author's note: 202 = rendition not yet generated, 404/415 = no
        # thumbnail for this file type. All non-fatal — a later backfill
        # re-attempts. (TODO confirm which statuses the SDK actually raises.)
        print(f" Thumbnail unavailable for {file_id}: {box_err}")
        return None
    except Exception as exc:
        print(f" Thumbnail fetch errored for {file_id}: {type(exc).__name__}: {exc}")
        return None
def build_search_terms(file_name, folder_path, description, validated_metadata):
    """
    Flatten everything searchable about a file into one normalized text blob.

    The blob is what `_persist_file_asset` stores in `file_assets.search_terms`,
    which the search SQL matches against.

    Args:
        file_name: File name; skipped when falsy.
        folder_path: Folder path; skipped when falsy.
        description: Free-text description; skipped when falsy.
        validated_metadata: Mapping of metadata keys to values. Anything that
            is not a dict is ignored. Keys are always included. Values may be
            scalars, lists/tuples or nested dicts; they are flattened
            recursively and `None` entries contribute nothing.

    Returns:
        A lowercase string with all runs of whitespace collapsed to single
        spaces; the empty string when there is nothing to index.
    """
    parts: list[str] = []

    def _collect(value):
        # Recursively append the text of `value`. None is skipped so it never
        # shows up in the blob as the literal word "none", and nested
        # containers are walked rather than stored as their Python repr.
        if value is None:
            return
        if isinstance(value, dict):
            for key, inner in value.items():
                parts.append(str(key))
                _collect(inner)
        elif isinstance(value, (list, tuple)):
            for item in value:
                _collect(item)
        else:
            parts.append(str(value))

    for field in (file_name, folder_path, description):
        if field:
            parts.append(str(field))
    if isinstance(validated_metadata, dict):
        _collect(validated_metadata)

    # Collapse whitespace and lowercase for stable substring/trigram matching.
    return " ".join(" ".join(parts).lower().split())
# ── 10. Write Metadata to Box ─────────────────────────────────────────────────
def write_metadata_to_box(box_client, file_id, metadata, file_name):
@ -924,6 +986,11 @@ def _run_pass(run_id, db_conn):
status="success",
duration_ms=gemini_elapsed_ms,
)
_persist_file_asset(
box_client, db_conn, file_id, file_name, folder_path,
description if isinstance(description, str) else None,
cleaned_metadata,
)
# Rate limit delay (skip after last file)
if i < img_total or video_files:
@ -1063,6 +1130,11 @@ def _run_pass(run_id, db_conn):
status="success",
duration_ms=gemini_elapsed_ms,
)
_persist_file_asset(
box_client, db_conn, file_id, file_name, folder_path,
description if isinstance(description, str) else None,
cleaned_metadata,
)
# Rate limit delay (skip after last video)
if i < vid_total:
@ -1083,6 +1155,20 @@ def _run_pass(run_id, db_conn):
print("=" * 60)
def _persist_file_asset(box_client, db_conn, file_id, file_name, folder_path, description, validated_metadata):
    """
    Fetch the Box thumbnail (best-effort) and upsert the file_assets row.

    When no thumbnail could be fetched, the three thumbnail columns are passed
    as None; the search-terms blob is passed either way. `thumbnail_size`
    records the module's THUMBNAIL_DIM pixel dimension, not a byte count.
    """
    fetched = fetch_thumbnail(box_client, file_id)
    blob = build_search_terms(file_name, folder_path, description, validated_metadata)

    if fetched:
        image_bytes = fetched[0]
        content_type = fetched[1]
        pixel_dim = THUMBNAIL_DIM
    else:
        image_bytes = content_type = pixel_dim = None

    db.upsert_file_asset(
        db_conn,
        file_id=file_id,
        thumbnail_bytes=image_bytes,
        thumbnail_content_type=content_type,
        thumbnail_size=pixel_dim,
        search_terms=blob,
    )
# ── 14. Backfill from Box ─────────────────────────────────────────────────────
@ -1110,9 +1196,10 @@ def _run_backfill(run_id, db_conn):
print("No media files found. Exiting.")
return
inserted = 0
events_inserted = 0
events_existing = 0
assets_upserted = 0
no_metadata = 0
already_in_db = 0
errored = 0
combined = [("image", f) for f in image_files] + [("video", f) for f in video_files]
@ -1125,11 +1212,6 @@ def _run_backfill(run_id, db_conn):
if folder_path:
print(f" Folder: {folder_path}")
if db.is_file_already_tagged(db_conn, file_id):
print(" Already in DB — skipping.")
already_in_db += 1
continue
try:
existing = fetch_existing_metadata(box_client, file_id)
except BoxAPIError as e:
@ -1147,6 +1229,19 @@ def _run_backfill(run_id, db_conn):
if description:
print(f" Description: {description[:80]}{'' if len(description) > 80 else ''}")
# Always upsert the file_assets row (thumbnail + search blob). Idempotent
# on file_id PK; preserves previously-captured thumbnails if today's
# fetch fails (see ON CONFLICT in db._UPSERT_FILE_ASSET_SQL).
_persist_file_asset(box_client, db_conn, file_id, file_name, folder_path, description, existing)
assets_upserted += 1
# Only insert a tagging_events row if there isn't already a good one
# for this file. Re-running backfill should not duplicate event rows.
if db.is_file_already_tagged(db_conn, file_id):
print(" Event row already present — leaving tagging_events untouched.")
events_existing += 1
continue
db.log_event(
db_conn,
run_id=run_id,
@ -1167,16 +1262,17 @@ def _run_backfill(run_id, db_conn):
error_message=None,
duration_ms=None,
)
inserted += 1
events_inserted += 1
print("\n" + "=" * 60)
print("BACKFILL SUMMARY")
print("=" * 60)
print(f" Total files seen: {total}")
print(f" Inserted into DB: {inserted}")
print(f" Already in DB (skipped): {already_in_db}")
print(f" No metadata in Box: {no_metadata}")
print(f" Errors reading Box: {errored}")
print(f" Total files seen: {total}")
print(f" file_assets upserted: {assets_upserted}")
print(f" New tagging_events rows: {events_inserted}")
print(f" Event rows already present: {events_existing}")
print(f" No metadata in Box: {no_metadata}")
print(f" Errors reading Box: {errored}")
print("=" * 60)

View file

@ -30,3 +30,16 @@ CREATE TABLE IF NOT EXISTS tagging_events (
CREATE INDEX IF NOT EXISTS tagging_events_run_id_idx ON tagging_events (run_id);
CREATE INDEX IF NOT EXISTS tagging_events_file_id_idx ON tagging_events (file_id);
CREATE INDEX IF NOT EXISTS tagging_events_created_idx ON tagging_events (created_at DESC);
-- Per-file state: thumbnails + consolidated search blob. One row per Box file_id,
-- upserted by the tagger and backfill flows. Independent of the append-only
-- tagging_events log (which can have many rows per file_id over time).
CREATE TABLE IF NOT EXISTS file_assets (
-- Box file id; the search API LEFT JOINs this against tagging_events.file_id.
file_id TEXT PRIMARY KEY,
-- Cached thumbnail image bytes; NULL until a thumbnail fetch has succeeded.
thumbnail_bytes BYTEA,
-- MIME type stored with the bytes (the tagger writes "image/jpeg").
thumbnail_content_type TEXT,
-- Requested square pixel dimension of the thumbnail (e.g. 160), not a byte count.
thumbnail_size INTEGER,
-- Lowercased, whitespace-collapsed text blob built from file name, folder,
-- description and metadata; matched by free-text search.
search_terms TEXT,
-- Set to now() on insert and refreshed on every upsert.
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Presumably supports "most recently updated" lookups — no query in view uses it yet.
CREATE INDEX IF NOT EXISTS file_assets_updated_idx ON file_assets (updated_at DESC);