marriott-box-image-video-ta.../db.py
DJP 04440d661d Cache Box thumbnails + search blob; render in UI
Search results were text-only — hard to scan visually with thousands
of assets coming. Now every file Gemini-tags or backfill mirrors also
gets its Box-generated 160x160 JPG thumbnail (~10-20 KB) pulled and
stashed in Postgres, plus a consolidated `search_terms` blob
(file_name + folder + description + flattened metadata values).
Search results render the thumbnail inline; rows missing one show a
striped placeholder. Search SQL now LEFT JOINs file_assets and hits
search_terms too, so backfilled rows are properly searchable.

- schema.sql: new `file_assets` table (file_id PK, thumbnail_bytes
  bytea, search_terms text, updated_at). idempotent.
- db.py: `upsert_file_asset` (INSERT … ON CONFLICT preserving
  existing thumbnail bytes if today's fetch failed) and
  `get_thumbnail`. Both swallow exceptions per the established
  defensive pattern.
- main.py: `fetch_thumbnail` (Box SDK get_file_thumbnail_by_id, JPG
  at 160 px, handles BoxAPIError 202/404 as soft misses) and
  `build_search_terms` (lowercase, whitespace-collapsed text blob).
  `_persist_file_asset` wires both into the image+video success
  paths of `_run_pass` and into every iteration of `_run_backfill`.
- Backfill skip logic refined: always upsert file_assets (idempotent
  PK), only skip the tagging_events insert if a good row already
  exists. Re-running Backfill from Box populates thumbnails for
  rows backfilled before this feature shipped.
- api.py: `GET /api/files/{file_id}/thumbnail` streams the bytea
  with Cache-Control max-age=86400. Search SQL gains the LEFT JOIN
  and emits `has_thumbnail` per row. Search also matches against
  fa.search_terms so backfilled rows surface for free-text queries
  that hit their metadata.
- frontend: Event type adds `has_thumbnail`; `thumbnailUrl(fileId)`
  helper builds the prefix-aware URL via Vite's BASE. EventList
  renders the thumbnail (lazy, with onError fallback) or a striped
  placeholder. .thumb styling + .event-head layout in styles.css.

Verified locally: schema applies via lifespan; upsert + get_thumbnail
roundtrip; /api/files/999/thumbnail returns 200 with bytes; /api/events
returns has_thumbnail per row; multi-token "female city" search finds
a row whose validated_metadata contains both tokens.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 16:20:13 -04:00

211 lines
6.6 KiB
Python

"""
Postgres logging for the Marriott Box tagger.
One row per file Gemini was called on (success or error). The DB is auxiliary —
all functions swallow exceptions and print to stderr so a Postgres outage cannot
stop the tagging pass. Box remains the source of truth.
"""
import json
import os
import sys
from pathlib import Path
import psycopg
from psycopg.types.json import Jsonb
SCHEMA_PATH = Path(__file__).parent / "schema.sql"
INSERT_SQL = """
INSERT INTO tagging_events (
run_id, file_id, file_name, folder_path, media_type, gemini_model,
prompt, raw_response, description, scenes, validated_metadata,
metadata_write_success, description_write_success, scene_comment_write_success,
status, error_message, duration_ms
) VALUES (
%(run_id)s, %(file_id)s, %(file_name)s, %(folder_path)s, %(media_type)s, %(gemini_model)s,
%(prompt)s, %(raw_response)s, %(description)s, %(scenes)s, %(validated_metadata)s,
%(metadata_write_success)s, %(description_write_success)s, %(scene_comment_write_success)s,
%(status)s, %(error_message)s, %(duration_ms)s
)
"""
def _dsn():
dsn = os.getenv("DATABASE_URL")
if not dsn:
raise RuntimeError("DATABASE_URL not set")
return dsn
def get_conn():
"""Open a Postgres connection. Caller owns close()."""
return psycopg.connect(_dsn(), autocommit=True)
def ensure_schema(conn):
"""Apply schema.sql idempotently."""
sql = SCHEMA_PATH.read_text()
with conn.cursor() as cur:
cur.execute(sql)
_UPSERT_FILE_ASSET_SQL = """
INSERT INTO file_assets (
file_id, thumbnail_bytes, thumbnail_content_type, thumbnail_size,
search_terms, updated_at
) VALUES (
%(file_id)s, %(thumbnail_bytes)s, %(thumbnail_content_type)s, %(thumbnail_size)s,
%(search_terms)s, now()
)
ON CONFLICT (file_id) DO UPDATE SET
-- only overwrite the thumbnail when we have new bytes; preserves
-- previously-captured thumbs across runs where the fetch failed.
thumbnail_bytes = COALESCE(EXCLUDED.thumbnail_bytes, file_assets.thumbnail_bytes),
thumbnail_content_type = COALESCE(EXCLUDED.thumbnail_content_type, file_assets.thumbnail_content_type),
thumbnail_size = COALESCE(EXCLUDED.thumbnail_size, file_assets.thumbnail_size),
search_terms = COALESCE(EXCLUDED.search_terms, file_assets.search_terms),
updated_at = now()
"""
def upsert_file_asset(
conn,
*,
file_id,
thumbnail_bytes=None,
thumbnail_content_type=None,
thumbnail_size=None,
search_terms=None,
):
"""
Idempotently insert/update the per-file row. Failures are swallowed —
a thumbnail or search-blob hiccup must never stop a tagging pass.
"""
if conn is None or not file_id:
return
try:
with conn.cursor() as cur:
cur.execute(_UPSERT_FILE_ASSET_SQL, {
"file_id": str(file_id),
"thumbnail_bytes": thumbnail_bytes,
"thumbnail_content_type": thumbnail_content_type,
"thumbnail_size": thumbnail_size,
"search_terms": search_terms,
})
except Exception as e:
print(
f" WARN: DB upsert_file_asset failed ({type(e).__name__}: {e}) — continuing",
file=sys.stderr,
)
def get_thumbnail(conn, file_id):
"""Return (bytes, content_type) for the file's stored thumbnail, or None."""
if conn is None or not file_id:
return None
try:
with conn.cursor() as cur:
cur.execute(
"SELECT thumbnail_bytes, thumbnail_content_type "
"FROM file_assets WHERE file_id = %s AND thumbnail_bytes IS NOT NULL",
(str(file_id),),
)
row = cur.fetchone()
if not row:
return None
return bytes(row[0]), row[1]
except Exception as e:
print(
f" WARN: DB get_thumbnail failed ({type(e).__name__}: {e})",
file=sys.stderr,
)
return None
def is_file_already_tagged(conn, file_id) -> bool:
"""
Skip-check oracle. A file counts as "already tagged" if we have any row
in tagging_events for it with a terminal-good status — either a real
Gemini-driven success or a backfilled row that mirrors Box's existing
metadata. Error/validation rows do NOT count, so a previously failed
file gets retried on the next pass.
"""
if conn is None or not file_id:
return False
try:
with conn.cursor() as cur:
cur.execute(
"SELECT 1 FROM tagging_events "
"WHERE file_id = %s AND status IN ('success','backfilled') LIMIT 1",
(str(file_id),),
)
return cur.fetchone() is not None
except Exception as e:
print(
f" WARN: DB is_file_already_tagged failed ({type(e).__name__}: {e}) — assuming NOT tagged",
file=sys.stderr,
)
return False
def _jsonable(value):
if value is None:
return None
return Jsonb(value)
def log_event(
conn,
*,
run_id,
file_id,
file_name,
folder_path,
media_type,
gemini_model,
status,
prompt=None,
raw_response=None,
description=None,
scenes=None,
validated_metadata=None,
metadata_write_success=None,
description_write_success=None,
scene_comment_write_success=None,
error_message=None,
duration_ms=None,
):
"""
Insert one tagging_events row. Never raises — DB problems are reported to stderr
and the tagger continues.
"""
if conn is None:
return
params = {
"run_id": str(run_id),
"file_id": str(file_id) if file_id is not None else None,
"file_name": file_name,
"folder_path": folder_path,
"media_type": media_type,
"gemini_model": gemini_model,
"prompt": prompt,
"raw_response": _jsonable(raw_response),
"description": description,
"scenes": _jsonable(scenes),
"validated_metadata": _jsonable(validated_metadata),
"metadata_write_success": metadata_write_success,
"description_write_success": description_write_success,
"scene_comment_write_success": scene_comment_write_success,
"status": status,
"error_message": error_message,
"duration_ms": duration_ms,
}
try:
with conn.cursor() as cur:
cur.execute(INSERT_SQL, params)
except Exception as e:
print(
f" WARN: DB log_event failed ({type(e).__name__}: {e}) — continuing",
file=sys.stderr,
)