Cache Box thumbnails + search blob; render in UI
Search results were text-only — hard to scan visually with thousands
of assets coming. Now every file that Gemini tags or that backfill mirrors also
gets its Box-generated 160x160 JPG thumbnail (~10-20 KB) pulled and
stashed in Postgres, plus a consolidated `search_terms` blob
(file_name + folder + description + flattened metadata values).
Search results render the thumbnail inline; rows missing one show a
striped placeholder. Search SQL now LEFT JOINs file_assets and hits
search_terms too, so backfilled rows are properly searchable.
- schema.sql: new `file_assets` table (file_id PK, thumbnail_bytes
bytea, thumbnail_content_type text, thumbnail_size integer,
search_terms text, updated_at). Idempotent (CREATE TABLE IF NOT EXISTS).
- db.py: `upsert_file_asset` (INSERT … ON CONFLICT preserving
existing thumbnail bytes if today's fetch failed) and
`get_thumbnail`. Both swallow exceptions per the established
defensive pattern.
- main.py: `fetch_thumbnail` (Box SDK get_file_thumbnail_by_id, JPG
at 160 px; treats any BoxAPIError, e.g. 202/404/415, as a soft miss) and
`build_search_terms` (lowercase, whitespace-collapsed text blob).
`_persist_file_asset` wires both into the image+video success
paths of `_run_pass` and into every iteration of `_run_backfill`.
- Backfill skip logic refined: always upsert file_assets (idempotent
PK), only skip the tagging_events insert if a good row already
exists. Re-running Backfill from Box populates thumbnails for
rows backfilled before this feature shipped.
- api.py: `GET /api/files/{file_id}/thumbnail` streams the bytea
with Cache-Control max-age=86400. Search SQL gains the LEFT JOIN
and emits `has_thumbnail` per row. Search also matches against
fa.search_terms so backfilled rows surface for free-text queries
that hit their metadata.
- frontend: Event type adds `has_thumbnail`; `thumbnailUrl(fileId)`
helper builds the prefix-aware URL via Vite's BASE. EventList
renders the thumbnail (lazy, with onError fallback) or a striped
placeholder. .thumb styling + .event-head layout in styles.css.
Verified locally: schema applies via lifespan; upsert + get_thumbnail
roundtrip; /api/files/999/thumbnail returns 200 with bytes; /api/events
returns has_thumbnail per row; multi-token "female city" search finds
a row whose validated_metadata contains both tokens.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6f367d5b29
commit
04440d661d
7 changed files with 302 additions and 33 deletions
88
api.py
88
api.py
|
|
@ -18,7 +18,7 @@ import uuid
|
|||
from contextlib import asynccontextmanager, contextmanager
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import Depends, FastAPI, HTTPException, Query
|
||||
from fastapi import Depends, FastAPI, HTTPException, Query, Response
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from psycopg.rows import dict_row
|
||||
|
||||
|
|
@ -26,6 +26,8 @@ import db
|
|||
from auth import User, maybe_auth_info, require_admin, require_auth
|
||||
|
||||
BOX_FILE_URL = "https://app.box.com/file/{file_id}"
|
||||
# Frontend builds the actual URL by joining its own API base with file_id —
|
||||
# we only signal *whether* a thumbnail exists so the JSON stays prefix-agnostic.
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
|
|
@ -138,23 +140,46 @@ def _build_search_sql(q: str, limit: int):
|
|||
"""
|
||||
Tokenise the query on whitespace, AND-match every token across the columns,
|
||||
where each token may match by substring OR by trigram similarity. Results
|
||||
ranked by summed similarity score, then recency.
|
||||
ranked by summed similarity score, then recency. LEFT JOINs file_assets
|
||||
so the response can flag whether a thumbnail is available.
|
||||
"""
|
||||
tokens = [t for t in q.strip().split() if t]
|
||||
common_cols = (
|
||||
"id, run_id, created_at, file_id, file_name, folder_path, media_type, "
|
||||
"gemini_model, description, scenes, validated_metadata, raw_response, "
|
||||
"metadata_write_success, description_write_success, scene_comment_write_success, "
|
||||
"status, error_message, duration_ms"
|
||||
"e.id, e.run_id, e.created_at, e.file_id, e.file_name, e.folder_path, e.media_type, "
|
||||
"e.gemini_model, e.description, e.scenes, e.validated_metadata, e.raw_response, "
|
||||
"e.metadata_write_success, e.description_write_success, e.scene_comment_write_success, "
|
||||
"e.status, e.error_message, e.duration_ms, "
|
||||
"(fa.thumbnail_bytes IS NOT NULL) AS _has_thumbnail"
|
||||
)
|
||||
join_clause = "FROM tagging_events e LEFT JOIN file_assets fa ON fa.file_id = e.file_id"
|
||||
|
||||
if not tokens:
|
||||
return (
|
||||
f"SELECT {common_cols} FROM tagging_events "
|
||||
f"ORDER BY created_at DESC LIMIT %(limit)s",
|
||||
f"SELECT {common_cols} {join_clause} "
|
||||
f"ORDER BY e.created_at DESC LIMIT %(limit)s",
|
||||
{"limit": limit},
|
||||
)
|
||||
|
||||
# Per-token search runs across event columns + the file_assets.search_terms
|
||||
# blob — gives backfilled rows a richer search surface than just the event
|
||||
# row's fields.
|
||||
all_search_cols = [
|
||||
"e.file_name",
|
||||
"e.folder_path",
|
||||
"e.description",
|
||||
"e.status",
|
||||
"e.file_id",
|
||||
"coalesce(e.validated_metadata::text, '')",
|
||||
"coalesce(e.raw_response::text, '')",
|
||||
"coalesce(e.scenes::text, '')",
|
||||
"coalesce(fa.search_terms, '')",
|
||||
]
|
||||
fuzzy_blob = (
|
||||
"coalesce(e.file_name,'')||' '||coalesce(e.folder_path,'')||' '||"
|
||||
"coalesce(e.description,'')||' '||coalesce(e.validated_metadata::text,'')||' '||"
|
||||
"coalesce(e.scenes::text,'')||' '||coalesce(fa.search_terms,'')"
|
||||
)
|
||||
|
||||
params: dict = {"limit": limit}
|
||||
clauses: list[str] = []
|
||||
score_terms: list[str] = []
|
||||
|
|
@ -163,13 +188,13 @@ def _build_search_sql(q: str, limit: int):
|
|||
sim_key = f"sim_{i}"
|
||||
params[like_key] = f"%{tok}%"
|
||||
params[sim_key] = tok
|
||||
col_ors = " OR ".join(f"{c} ILIKE %({like_key})s" for c in _SEARCH_COLS)
|
||||
col_ors = " OR ".join(f"{c} ILIKE %({like_key})s" for c in all_search_cols)
|
||||
if len(tok) >= _MIN_FUZZY_TOKEN_LEN:
|
||||
clauses.append(
|
||||
f"(({col_ors}) "
|
||||
f"OR similarity({_SEARCH_BLOB}, %({sim_key})s) > {_SIM_THRESHOLD})"
|
||||
f"OR similarity({fuzzy_blob}, %({sim_key})s) > {_SIM_THRESHOLD})"
|
||||
)
|
||||
score_terms.append(f"similarity({_SEARCH_BLOB}, %({sim_key})s)")
|
||||
score_terms.append(f"similarity({fuzzy_blob}, %({sim_key})s)")
|
||||
else:
|
||||
clauses.append(f"({col_ors})")
|
||||
|
||||
|
|
@ -177,9 +202,9 @@ def _build_search_sql(q: str, limit: int):
|
|||
score_sql = " + ".join(score_terms) if score_terms else "0"
|
||||
sql = (
|
||||
f"SELECT {common_cols}, ({score_sql}) AS _score "
|
||||
f"FROM tagging_events "
|
||||
f"{join_clause} "
|
||||
f"WHERE {where} "
|
||||
f"ORDER BY _score DESC, created_at DESC "
|
||||
f"ORDER BY _score DESC, e.created_at DESC "
|
||||
f"LIMIT %(limit)s"
|
||||
)
|
||||
return sql, params
|
||||
|
|
@ -189,6 +214,7 @@ def _event_to_dict(row):
|
|||
out = dict(row)
|
||||
fid = out.get("file_id")
|
||||
out["box_url"] = BOX_FILE_URL.format(file_id=fid) if fid else None
|
||||
out["has_thumbnail"] = bool(out.pop("_has_thumbnail", False))
|
||||
if out.get("run_id") is not None:
|
||||
out["run_id"] = str(out["run_id"])
|
||||
if out.get("created_at") is not None:
|
||||
|
|
@ -196,6 +222,26 @@ def _event_to_dict(row):
|
|||
return out
|
||||
|
||||
|
||||
@app.get("/api/files/{file_id}/thumbnail")
def file_thumbnail(file_id: str, user: User = Depends(require_auth)):
    """
    Serve a previously-cached Box thumbnail (160x160 JPG by default) for a file.

    Bytes live in Postgres bytea on file_assets — see `_persist_file_asset`
    in main.py. Browser is told to cache aggressively; the asset is stable
    until a re-fetch overwrites the row.

    Args:
        file_id: Box file id taken from the URL path.
        user: Resolved by the `require_auth` dependency; not otherwise used
            in the body.

    Returns:
        A `Response` carrying the stored bytes, the stored content type
        (falling back to "image/jpeg" when none was recorded) and a
        `Cache-Control: private, max-age=86400` header.

    Raises:
        HTTPException: 404 when `db.get_thumbnail` returns nothing for the id.
    """
    with _conn() as c:
        row = db.get_thumbnail(c, file_id)
    # `db.get_thumbnail` yields None both for "no row" and for a DB error,
    # so either case surfaces to the client as a plain 404.
    if not row:
        raise HTTPException(status_code=404, detail="no thumbnail")
    data, content_type = row
    return Response(
        content=data,
        media_type=content_type or "image/jpeg",
        # "private": the response sits behind auth, so only the end user's
        # browser cache should keep it.
        headers={"Cache-Control": "private, max-age=86400"},
    )
|
||||
|
||||
|
||||
@app.get("/api/events")
|
||||
def search_events(
|
||||
q: str = Query("", description="Free-text search across all fields"),
|
||||
|
|
@ -343,13 +389,15 @@ def run_events(run_id: str, user: User = Depends(require_auth), limit: int = Que
|
|||
with c.cursor(row_factory=dict_row) as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT id, run_id, created_at, file_id, file_name, folder_path, media_type,
|
||||
gemini_model, description, scenes, validated_metadata,
|
||||
metadata_write_success, description_write_success,
|
||||
scene_comment_write_success, status, error_message, duration_ms
|
||||
FROM tagging_events
|
||||
WHERE run_id = %s
|
||||
ORDER BY created_at DESC
|
||||
SELECT e.id, e.run_id, e.created_at, e.file_id, e.file_name, e.folder_path, e.media_type,
|
||||
e.gemini_model, e.description, e.scenes, e.validated_metadata,
|
||||
e.metadata_write_success, e.description_write_success,
|
||||
e.scene_comment_write_success, e.status, e.error_message, e.duration_ms,
|
||||
(fa.thumbnail_bytes IS NOT NULL) AS _has_thumbnail
|
||||
FROM tagging_events e
|
||||
LEFT JOIN file_assets fa ON fa.file_id = e.file_id
|
||||
WHERE e.run_id = %s
|
||||
ORDER BY e.created_at DESC
|
||||
LIMIT %s
|
||||
""",
|
||||
(run_id, limit),
|
||||
|
|
|
|||
73
db.py
73
db.py
|
|
@ -50,6 +50,79 @@ def ensure_schema(conn):
|
|||
cur.execute(sql)
|
||||
|
||||
|
||||
_UPSERT_FILE_ASSET_SQL = """
|
||||
INSERT INTO file_assets (
|
||||
file_id, thumbnail_bytes, thumbnail_content_type, thumbnail_size,
|
||||
search_terms, updated_at
|
||||
) VALUES (
|
||||
%(file_id)s, %(thumbnail_bytes)s, %(thumbnail_content_type)s, %(thumbnail_size)s,
|
||||
%(search_terms)s, now()
|
||||
)
|
||||
ON CONFLICT (file_id) DO UPDATE SET
|
||||
-- only overwrite the thumbnail when we have new bytes; preserves
|
||||
-- previously-captured thumbs across runs where the fetch failed.
|
||||
thumbnail_bytes = COALESCE(EXCLUDED.thumbnail_bytes, file_assets.thumbnail_bytes),
|
||||
thumbnail_content_type = COALESCE(EXCLUDED.thumbnail_content_type, file_assets.thumbnail_content_type),
|
||||
thumbnail_size = COALESCE(EXCLUDED.thumbnail_size, file_assets.thumbnail_size),
|
||||
search_terms = COALESCE(EXCLUDED.search_terms, file_assets.search_terms),
|
||||
updated_at = now()
|
||||
"""
|
||||
|
||||
|
||||
def upsert_file_asset(
    conn,
    *,
    file_id,
    thumbnail_bytes=None,
    thumbnail_content_type=None,
    thumbnail_size=None,
    search_terms=None,
):
    """
    Insert or update the file_assets row keyed by ``file_id``.

    Does nothing when ``conn`` is None or ``file_id`` is falsy. Any exception
    raised while running the upsert is reported on stderr and suppressed, so
    a thumbnail or search-blob problem never interrupts the calling pass.
    Always returns None.
    """
    if not (conn is not None and file_id):
        return
    try:
        with conn.cursor() as cur:
            # file_id is stringified before binding; the remaining values
            # are passed through untouched (None included).
            bound_values = {
                "file_id": str(file_id),
                "thumbnail_bytes": thumbnail_bytes,
                "thumbnail_content_type": thumbnail_content_type,
                "thumbnail_size": thumbnail_size,
                "search_terms": search_terms,
            }
            cur.execute(_UPSERT_FILE_ASSET_SQL, bound_values)
    except Exception as e:
        # Deliberate best-effort: warn and carry on instead of propagating.
        warning = f" WARN: DB upsert_file_asset failed ({type(e).__name__}: {e}) — continuing"
        print(warning, file=sys.stderr)
|
||||
|
||||
|
||||
def get_thumbnail(conn, file_id):
    """
    Look up the stored thumbnail for ``file_id``.

    Returns a ``(bytes, content_type)`` tuple when a row with non-NULL
    thumbnail bytes exists. Returns None when ``conn`` is None, ``file_id``
    is falsy, no such row exists, or the query raises (the error is written
    to stderr and suppressed).
    """
    if not (conn is not None and file_id):
        return None
    lookup_sql = (
        "SELECT thumbnail_bytes, thumbnail_content_type "
        "FROM file_assets WHERE file_id = %s AND thumbnail_bytes IS NOT NULL"
    )
    try:
        with conn.cursor() as cur:
            cur.execute(lookup_sql, (str(file_id),))
            hit = cur.fetchone()
        if hit:
            # Normalise whatever buffer type the driver hands back to bytes.
            return bytes(hit[0]), hit[1]
        return None
    except Exception as e:
        warning = f" WARN: DB get_thumbnail failed ({type(e).__name__}: {e})"
        print(warning, file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def is_file_already_tagged(conn, file_id) -> bool:
|
||||
"""
|
||||
Skip-check oracle. A file counts as "already tagged" if we have any row
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import {
|
|||
searchEvents,
|
||||
startBackfill,
|
||||
startRun,
|
||||
thumbnailUrl,
|
||||
} from "./api";
|
||||
|
||||
export function App() {
|
||||
|
|
@ -281,6 +282,19 @@ function EventList({ events }: { events: Event[] }) {
|
|||
{events.map((e) => (
|
||||
<li key={e.id} className={`event status-${e.status}`}>
|
||||
<div className="event-head">
|
||||
{e.has_thumbnail ? (
|
||||
<img
|
||||
className="thumb"
|
||||
loading="lazy"
|
||||
alt=""
|
||||
src={thumbnailUrl(e.file_id)}
|
||||
onError={(ev) => {
|
||||
(ev.currentTarget as HTMLImageElement).style.display = "none";
|
||||
}}
|
||||
/>
|
||||
) : (
|
||||
<span className="thumb thumb-placeholder" aria-hidden="true" />
|
||||
)}
|
||||
<span className={`badge type-${e.media_type}`}>{e.media_type}</span>
|
||||
<span className="event-name">{e.file_name}</span>
|
||||
<span className={`pill status-${e.status}`}>{e.status}</span>
|
||||
|
|
|
|||
|
|
@ -28,8 +28,14 @@ export type Event = {
|
|||
error_message: string | null;
|
||||
duration_ms: number | null;
|
||||
box_url: string | null;
|
||||
has_thumbnail: boolean;
|
||||
};
|
||||
|
||||
/** Build the thumbnail URL using the SPA's own API base — works at /marriott-tagging/ and /. */
|
||||
export function thumbnailUrl(fileId: string): string {
|
||||
return `${API_BASE}/files/${encodeURIComponent(fileId)}/thumbnail`;
|
||||
}
|
||||
|
||||
export type Run = {
|
||||
run_id: string;
|
||||
started_at: string | null;
|
||||
|
|
|
|||
|
|
@ -231,6 +231,25 @@ input:focus {
|
|||
gap: 0.5rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
.thumb {
|
||||
display: block;
|
||||
width: 64px;
|
||||
height: 64px;
|
||||
object-fit: cover;
|
||||
border-radius: 4px;
|
||||
border: 1px solid var(--line);
|
||||
background: var(--surface);
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.thumb-placeholder {
|
||||
background: repeating-linear-gradient(
|
||||
45deg,
|
||||
var(--surface),
|
||||
var(--surface) 6px,
|
||||
var(--surface-alt) 6px,
|
||||
var(--surface-alt) 12px
|
||||
);
|
||||
}
|
||||
.event-name {
|
||||
font-weight: 600;
|
||||
flex: 1;
|
||||
|
|
|
|||
122
main.py
122
main.py
|
|
@ -26,6 +26,7 @@ from box_sdk_gen.managers.file_metadata import (
|
|||
UpdateFileMetadataByIdRequestBody,
|
||||
UpdateFileMetadataByIdRequestBodyOpField,
|
||||
)
|
||||
from box_sdk_gen.managers.files import GetFileThumbnailByIdExtension
|
||||
from box_sdk_gen.managers.comments import CreateCommentItem, CreateCommentItemTypeField
|
||||
|
||||
from google import genai
|
||||
|
|
@ -660,6 +661,67 @@ def fetch_file_description(box_client, file_id):
|
|||
return None
|
||||
|
||||
|
||||
# Box's pre-rendered JPG thumbnails come in 32, 160, or 320 (px square).
|
||||
THUMBNAIL_DIM = 160
|
||||
|
||||
|
||||
def fetch_thumbnail(box_client, file_id, dim=THUMBNAIL_DIM):
    """
    Ask Box for a square JPG thumbnail of ``file_id`` (``dim`` px per side).

    Returns ``(bytes, "image/jpeg")`` when Box hands back non-empty data.
    Returns None when the SDK returns nothing, the payload is empty, or any
    exception occurs. Never raises: callers treat a missing thumbnail as a
    soft miss.
    """
    try:
        rendition = box_client.files.get_file_thumbnail_by_id(
            file_id=file_id,
            extension=GetFileThumbnailByIdExtension.JPG,
            min_height=dim,
            min_width=dim,
            max_height=dim,
            max_width=dim,
        )
        if rendition is None:
            return None
        # The SDK result may be a readable stream or a bytes-like object.
        if hasattr(rendition, "read"):
            payload = rendition.read()
        else:
            payload = bytes(rendition)
        return (payload, "image/jpeg") if payload else None
    except BoxAPIError as e:
        # 202: rendition not yet generated. 404/415: no thumbnail for this type.
        # All non-fatal — a later backfill re-attempts.
        print(f" Thumbnail unavailable for {file_id}: {e}")
        return None
    except Exception as e:
        print(f" Thumbnail fetch errored for {file_id}: {type(e).__name__}: {e}")
        return None
|
||||
|
||||
|
||||
def build_search_terms(file_name, folder_path, description, validated_metadata):
    """
    Build one lowercase, whitespace-collapsed text blob describing a file.

    The blob joins, in order: file name, folder path, description, then for
    each metadata entry its key followed by its value(s). List/tuple values
    contribute one fragment per item; None values contribute only the key.
    ``validated_metadata`` is ignored unless it is a dict. Falsy name, path
    or description fields are skipped.

    The result is stored on file_assets.search_terms.
    """
    fragments = [
        str(field)
        for field in (file_name, folder_path, description)
        if field
    ]
    if isinstance(validated_metadata, dict):
        for key, value in validated_metadata.items():
            fragments.append(str(key))
            if value is None:
                continue
            if isinstance(value, (list, tuple)):
                fragments += [str(item) for item in value]
            else:
                fragments.append(str(value))
    # Lowercase and collapse whitespace runs for stable substring/trigram matching.
    return " ".join(" ".join(fragments).lower().split())
|
||||
|
||||
|
||||
# ── 10. Write Metadata to Box ─────────────────────────────────────────────────
|
||||
|
||||
def write_metadata_to_box(box_client, file_id, metadata, file_name):
|
||||
|
|
@ -924,6 +986,11 @@ def _run_pass(run_id, db_conn):
|
|||
status="success",
|
||||
duration_ms=gemini_elapsed_ms,
|
||||
)
|
||||
_persist_file_asset(
|
||||
box_client, db_conn, file_id, file_name, folder_path,
|
||||
description if isinstance(description, str) else None,
|
||||
cleaned_metadata,
|
||||
)
|
||||
|
||||
# Rate limit delay (skip after last file)
|
||||
if i < img_total or video_files:
|
||||
|
|
@ -1063,6 +1130,11 @@ def _run_pass(run_id, db_conn):
|
|||
status="success",
|
||||
duration_ms=gemini_elapsed_ms,
|
||||
)
|
||||
_persist_file_asset(
|
||||
box_client, db_conn, file_id, file_name, folder_path,
|
||||
description if isinstance(description, str) else None,
|
||||
cleaned_metadata,
|
||||
)
|
||||
|
||||
# Rate limit delay (skip after last video)
|
||||
if i < vid_total:
|
||||
|
|
@ -1083,6 +1155,20 @@ def _run_pass(run_id, db_conn):
|
|||
print("=" * 60)
|
||||
|
||||
|
||||
def _persist_file_asset(box_client, db_conn, file_id, file_name, folder_path, description, validated_metadata):
    """
    Best-effort: fetch the Box thumbnail, build the search blob, upsert both.

    When the thumbnail fetch yields nothing, the three thumbnail fields are
    passed as None and only the search blob carries new data.
    """
    thumb = fetch_thumbnail(box_client, file_id)
    if thumb:
        image_bytes, image_type = thumb[0], thumb[1]
        image_dim = THUMBNAIL_DIM
    else:
        image_bytes = image_type = image_dim = None
    blob = build_search_terms(file_name, folder_path, description, validated_metadata)
    db.upsert_file_asset(
        db_conn,
        file_id=file_id,
        thumbnail_bytes=image_bytes,
        thumbnail_content_type=image_type,
        thumbnail_size=image_dim,
        search_terms=blob,
    )
|
||||
|
||||
|
||||
# ── 14. Backfill from Box ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
|
@ -1110,9 +1196,10 @@ def _run_backfill(run_id, db_conn):
|
|||
print("No media files found. Exiting.")
|
||||
return
|
||||
|
||||
inserted = 0
|
||||
events_inserted = 0
|
||||
events_existing = 0
|
||||
assets_upserted = 0
|
||||
no_metadata = 0
|
||||
already_in_db = 0
|
||||
errored = 0
|
||||
|
||||
combined = [("image", f) for f in image_files] + [("video", f) for f in video_files]
|
||||
|
|
@ -1125,11 +1212,6 @@ def _run_backfill(run_id, db_conn):
|
|||
if folder_path:
|
||||
print(f" Folder: {folder_path}")
|
||||
|
||||
if db.is_file_already_tagged(db_conn, file_id):
|
||||
print(" Already in DB — skipping.")
|
||||
already_in_db += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
existing = fetch_existing_metadata(box_client, file_id)
|
||||
except BoxAPIError as e:
|
||||
|
|
@ -1147,6 +1229,19 @@ def _run_backfill(run_id, db_conn):
|
|||
if description:
|
||||
print(f" Description: {description[:80]}{'…' if len(description) > 80 else ''}")
|
||||
|
||||
# Always upsert the file_assets row (thumbnail + search blob). Idempotent
|
||||
# on file_id PK; preserves previously-captured thumbnails if today's
|
||||
# fetch fails (see ON CONFLICT in db._UPSERT_FILE_ASSET_SQL).
|
||||
_persist_file_asset(box_client, db_conn, file_id, file_name, folder_path, description, existing)
|
||||
assets_upserted += 1
|
||||
|
||||
# Only insert a tagging_events row if there isn't already a good one
|
||||
# for this file. Re-running backfill should not duplicate event rows.
|
||||
if db.is_file_already_tagged(db_conn, file_id):
|
||||
print(" Event row already present — leaving tagging_events untouched.")
|
||||
events_existing += 1
|
||||
continue
|
||||
|
||||
db.log_event(
|
||||
db_conn,
|
||||
run_id=run_id,
|
||||
|
|
@ -1167,16 +1262,17 @@ def _run_backfill(run_id, db_conn):
|
|||
error_message=None,
|
||||
duration_ms=None,
|
||||
)
|
||||
inserted += 1
|
||||
events_inserted += 1
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("BACKFILL SUMMARY")
|
||||
print("=" * 60)
|
||||
print(f" Total files seen: {total}")
|
||||
print(f" Inserted into DB: {inserted}")
|
||||
print(f" Already in DB (skipped): {already_in_db}")
|
||||
print(f" No metadata in Box: {no_metadata}")
|
||||
print(f" Errors reading Box: {errored}")
|
||||
print(f" Total files seen: {total}")
|
||||
print(f" file_assets upserted: {assets_upserted}")
|
||||
print(f" New tagging_events rows: {events_inserted}")
|
||||
print(f" Event rows already present: {events_existing}")
|
||||
print(f" No metadata in Box: {no_metadata}")
|
||||
print(f" Errors reading Box: {errored}")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
|
|
|
|||
13
schema.sql
13
schema.sql
|
|
@ -30,3 +30,16 @@ CREATE TABLE IF NOT EXISTS tagging_events (
|
|||
CREATE INDEX IF NOT EXISTS tagging_events_run_id_idx ON tagging_events (run_id);
|
||||
CREATE INDEX IF NOT EXISTS tagging_events_file_id_idx ON tagging_events (file_id);
|
||||
CREATE INDEX IF NOT EXISTS tagging_events_created_idx ON tagging_events (created_at DESC);
|
||||
|
||||
-- Per-file state: thumbnails + consolidated search blob. One row per Box file_id,
|
||||
-- upserted by the tagger and backfill flows. Independent of the append-only
|
||||
-- tagging_events log (which can have many rows per file_id over time).
|
||||
CREATE TABLE IF NOT EXISTS file_assets (
|
||||
file_id TEXT PRIMARY KEY,
|
||||
thumbnail_bytes BYTEA,
|
||||
thumbnail_content_type TEXT,
|
||||
thumbnail_size INTEGER,
|
||||
search_terms TEXT,
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS file_assets_updated_idx ON file_assets (updated_at DESC);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue