Search results were text-only — hard to scan visually with thousands
of assets coming. Now every file Gemini-tags or backfill mirrors also
gets its Box-generated 160x160 JPG thumbnail (~10-20 KB) pulled and
stashed in Postgres, plus a consolidated `search_terms` blob
(file_name + folder + description + flattened metadata values).
Search results render the thumbnail inline; rows missing one show a
striped placeholder. Search SQL now LEFT JOINs file_assets and hits
search_terms too, so backfilled rows are properly searchable.
- schema.sql: new `file_assets` table (file_id PK, thumbnail_bytes
bytea, search_terms text, updated_at). idempotent.
- db.py: `upsert_file_asset` (INSERT … ON CONFLICT preserving
existing thumbnail bytes if today's fetch failed) and
`get_thumbnail`. Both swallow exceptions per the established
defensive pattern.
- main.py: `fetch_thumbnail` (Box SDK get_file_thumbnail_by_id, JPG
at 160 px, handles BoxAPIError 202/404 as soft misses) and
`build_search_terms` (lowercase, whitespace-collapsed text blob).
`_persist_file_asset` wires both into the image+video success
paths of `_run_pass` and into every iteration of `_run_backfill`.
- Backfill skip logic refined: always upsert file_assets (idempotent
PK), only skip the tagging_events insert if a good row already
exists. Re-running Backfill from Box populates thumbnails for
rows backfilled before this feature shipped.
- api.py: `GET /api/files/{file_id}/thumbnail` streams the bytea
with Cache-Control max-age=86400. Search SQL gains the LEFT JOIN
and emits `has_thumbnail` per row. Search also matches against
fa.search_terms so backfilled rows surface for free-text queries
that hit their metadata.
- frontend: Event type adds `has_thumbnail`; `thumbnailUrl(fileId)`
helper builds the prefix-aware URL via Vite's BASE. EventList
renders the thumbnail (lazy, with onError fallback) or a striped
placeholder. .thumb styling + .event-head layout in styles.css.
Verified locally: schema applies via lifespan; upsert + get_thumbnail
roundtrip; /api/files/999/thumbnail returns 200 with bytes; /api/events
returns has_thumbnail per row; multi-token "female city" search finds
a row whose validated_metadata contains both tokens.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
211 lines
6.6 KiB
Python
211 lines
6.6 KiB
Python
"""
|
|
Postgres logging for the Marriott Box tagger.
|
|
|
|
One row per file Gemini was called on (success or error). The DB is auxiliary —
|
|
all functions swallow exceptions and print to stderr so a Postgres outage cannot
|
|
stop the tagging pass. Box remains the source of truth.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import psycopg
|
|
from psycopg.types.json import Jsonb
|
|
|
|
SCHEMA_PATH = Path(__file__).parent / "schema.sql"
|
|
|
|
INSERT_SQL = """
|
|
INSERT INTO tagging_events (
|
|
run_id, file_id, file_name, folder_path, media_type, gemini_model,
|
|
prompt, raw_response, description, scenes, validated_metadata,
|
|
metadata_write_success, description_write_success, scene_comment_write_success,
|
|
status, error_message, duration_ms
|
|
) VALUES (
|
|
%(run_id)s, %(file_id)s, %(file_name)s, %(folder_path)s, %(media_type)s, %(gemini_model)s,
|
|
%(prompt)s, %(raw_response)s, %(description)s, %(scenes)s, %(validated_metadata)s,
|
|
%(metadata_write_success)s, %(description_write_success)s, %(scene_comment_write_success)s,
|
|
%(status)s, %(error_message)s, %(duration_ms)s
|
|
)
|
|
"""
|
|
|
|
|
|
def _dsn():
|
|
dsn = os.getenv("DATABASE_URL")
|
|
if not dsn:
|
|
raise RuntimeError("DATABASE_URL not set")
|
|
return dsn
|
|
|
|
|
|
def get_conn():
|
|
"""Open a Postgres connection. Caller owns close()."""
|
|
return psycopg.connect(_dsn(), autocommit=True)
|
|
|
|
|
|
def ensure_schema(conn):
|
|
"""Apply schema.sql idempotently."""
|
|
sql = SCHEMA_PATH.read_text()
|
|
with conn.cursor() as cur:
|
|
cur.execute(sql)
|
|
|
|
|
|
_UPSERT_FILE_ASSET_SQL = """
|
|
INSERT INTO file_assets (
|
|
file_id, thumbnail_bytes, thumbnail_content_type, thumbnail_size,
|
|
search_terms, updated_at
|
|
) VALUES (
|
|
%(file_id)s, %(thumbnail_bytes)s, %(thumbnail_content_type)s, %(thumbnail_size)s,
|
|
%(search_terms)s, now()
|
|
)
|
|
ON CONFLICT (file_id) DO UPDATE SET
|
|
-- only overwrite the thumbnail when we have new bytes; preserves
|
|
-- previously-captured thumbs across runs where the fetch failed.
|
|
thumbnail_bytes = COALESCE(EXCLUDED.thumbnail_bytes, file_assets.thumbnail_bytes),
|
|
thumbnail_content_type = COALESCE(EXCLUDED.thumbnail_content_type, file_assets.thumbnail_content_type),
|
|
thumbnail_size = COALESCE(EXCLUDED.thumbnail_size, file_assets.thumbnail_size),
|
|
search_terms = COALESCE(EXCLUDED.search_terms, file_assets.search_terms),
|
|
updated_at = now()
|
|
"""
|
|
|
|
|
|
def upsert_file_asset(
|
|
conn,
|
|
*,
|
|
file_id,
|
|
thumbnail_bytes=None,
|
|
thumbnail_content_type=None,
|
|
thumbnail_size=None,
|
|
search_terms=None,
|
|
):
|
|
"""
|
|
Idempotently insert/update the per-file row. Failures are swallowed —
|
|
a thumbnail or search-blob hiccup must never stop a tagging pass.
|
|
"""
|
|
if conn is None or not file_id:
|
|
return
|
|
try:
|
|
with conn.cursor() as cur:
|
|
cur.execute(_UPSERT_FILE_ASSET_SQL, {
|
|
"file_id": str(file_id),
|
|
"thumbnail_bytes": thumbnail_bytes,
|
|
"thumbnail_content_type": thumbnail_content_type,
|
|
"thumbnail_size": thumbnail_size,
|
|
"search_terms": search_terms,
|
|
})
|
|
except Exception as e:
|
|
print(
|
|
f" WARN: DB upsert_file_asset failed ({type(e).__name__}: {e}) — continuing",
|
|
file=sys.stderr,
|
|
)
|
|
|
|
|
|
def get_thumbnail(conn, file_id):
|
|
"""Return (bytes, content_type) for the file's stored thumbnail, or None."""
|
|
if conn is None or not file_id:
|
|
return None
|
|
try:
|
|
with conn.cursor() as cur:
|
|
cur.execute(
|
|
"SELECT thumbnail_bytes, thumbnail_content_type "
|
|
"FROM file_assets WHERE file_id = %s AND thumbnail_bytes IS NOT NULL",
|
|
(str(file_id),),
|
|
)
|
|
row = cur.fetchone()
|
|
if not row:
|
|
return None
|
|
return bytes(row[0]), row[1]
|
|
except Exception as e:
|
|
print(
|
|
f" WARN: DB get_thumbnail failed ({type(e).__name__}: {e})",
|
|
file=sys.stderr,
|
|
)
|
|
return None
|
|
|
|
|
|
def is_file_already_tagged(conn, file_id) -> bool:
|
|
"""
|
|
Skip-check oracle. A file counts as "already tagged" if we have any row
|
|
in tagging_events for it with a terminal-good status — either a real
|
|
Gemini-driven success or a backfilled row that mirrors Box's existing
|
|
metadata. Error/validation rows do NOT count, so a previously failed
|
|
file gets retried on the next pass.
|
|
"""
|
|
if conn is None or not file_id:
|
|
return False
|
|
try:
|
|
with conn.cursor() as cur:
|
|
cur.execute(
|
|
"SELECT 1 FROM tagging_events "
|
|
"WHERE file_id = %s AND status IN ('success','backfilled') LIMIT 1",
|
|
(str(file_id),),
|
|
)
|
|
return cur.fetchone() is not None
|
|
except Exception as e:
|
|
print(
|
|
f" WARN: DB is_file_already_tagged failed ({type(e).__name__}: {e}) — assuming NOT tagged",
|
|
file=sys.stderr,
|
|
)
|
|
return False
|
|
|
|
|
|
def _jsonable(value):
|
|
if value is None:
|
|
return None
|
|
return Jsonb(value)
|
|
|
|
|
|
def log_event(
|
|
conn,
|
|
*,
|
|
run_id,
|
|
file_id,
|
|
file_name,
|
|
folder_path,
|
|
media_type,
|
|
gemini_model,
|
|
status,
|
|
prompt=None,
|
|
raw_response=None,
|
|
description=None,
|
|
scenes=None,
|
|
validated_metadata=None,
|
|
metadata_write_success=None,
|
|
description_write_success=None,
|
|
scene_comment_write_success=None,
|
|
error_message=None,
|
|
duration_ms=None,
|
|
):
|
|
"""
|
|
Insert one tagging_events row. Never raises — DB problems are reported to stderr
|
|
and the tagger continues.
|
|
"""
|
|
if conn is None:
|
|
return
|
|
params = {
|
|
"run_id": str(run_id),
|
|
"file_id": str(file_id) if file_id is not None else None,
|
|
"file_name": file_name,
|
|
"folder_path": folder_path,
|
|
"media_type": media_type,
|
|
"gemini_model": gemini_model,
|
|
"prompt": prompt,
|
|
"raw_response": _jsonable(raw_response),
|
|
"description": description,
|
|
"scenes": _jsonable(scenes),
|
|
"validated_metadata": _jsonable(validated_metadata),
|
|
"metadata_write_success": metadata_write_success,
|
|
"description_write_success": description_write_success,
|
|
"scene_comment_write_success": scene_comment_write_success,
|
|
"status": status,
|
|
"error_message": error_message,
|
|
"duration_ms": duration_ms,
|
|
}
|
|
try:
|
|
with conn.cursor() as cur:
|
|
cur.execute(INSERT_SQL, params)
|
|
except Exception as e:
|
|
print(
|
|
f" WARN: DB log_event failed ({type(e).__name__}: {e}) — continuing",
|
|
file=sys.stderr,
|
|
)
|