From 9e6a75feb6e2cc0a1c7a1fbefbf026de53f2e837 Mon Sep 17 00:00:00 2001 From: DJP Date: Mon, 11 May 2026 15:41:10 -0400 Subject: [PATCH] Manual-only runs, DB-based skip check, backfill-from-Box MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously a nightly APScheduler container fired the tagger on every file in the configured Box folder. With ~5000 files coming, that's ~5000 Box HTTP calls every night just to ask "is this tagged?". Move to manual-only mode and source the skip decision from the local DB. - `db.is_file_already_tagged(conn, file_id)` — returns True iff the DB has a row with status IN ('success','backfilled'). Used by both image and video loops in main.py instead of the previous `check_existing_metadata(box_client, file_id)` Box round-trip. - `fetch_existing_metadata(box_client, file_id)` (main.py) — returns the user-defined template fields as a flat dict by stripping the Box `$id`/`$type`/etc. attrs from the SDK response. - `_run_backfill(run_id, db_conn)` (main.py) — walks the Box folder and inserts a `status='backfilled'` row for every file Box already has marriottUsa metadata for. Read-only against Box; safe to re-run. Use this after first deploy, or to repopulate the DB from Box. - `POST /api/backfill` mirrors `POST /api/runs` (background thread, same live-state record). - SPA: new "Backfill from Box" button next to "Run now" (with a confirm dialog and a yellow `.status-backfilled` event treatment). - docker-compose.yml: removed the `tagger` (scheduler) service. Manual triggers via the SPA / `POST /api/runs` only. scheduler.py stays in the repo for archival / opt-back-in. - deploy.sh: readiness now checks the `api` container instead of `tagger`; `--logs` tails api logs. Co-Authored-By: Claude Opus 4.7 (1M context) --- api.py | 43 ++++++++++ db.py | 26 ++++++ deploy/deploy.sh | 16 ++-- docker-compose.yml | 22 ++--- frontend/src/App.tsx | 47 +++++++++-- frontend/src/api.ts | 8 ++ frontend/src/styles.css | 16 ++++ main.py | 172 +++++++++++++++++++++++++++++++++++++--- 8 files changed, 307 insertions(+), 43 deletions(-) diff --git a/api.py b/api.py index 42a13db..08ca8d6 100644 --- a/api.py +++ b/api.py @@ -175,6 +175,49 @@ def start_run(user: User = Depends(require_auth)): return {"run_id": str(run_id), "state": "running", "started_by": user.email or user.oid} +def _run_backfill_in_thread(run_id: uuid.UUID): + import main as tagger + + with _runs_lock: + _runs[str(run_id)] = {"run_id": str(run_id), "state": "running", "error": None, "kind": "backfill"} + + db_conn = None + try: + db_conn = db.get_conn() + db.ensure_schema(db_conn) + tagger._run_backfill(run_id, db_conn) + with _runs_lock: + _runs[str(run_id)]["state"] = "completed" + except SystemExit as e: + with _runs_lock: + _runs[str(run_id)]["state"] = "failed" + _runs[str(run_id)]["error"] = f"SystemExit({e.code})" + except Exception as e: + with _runs_lock: + _runs[str(run_id)]["state"] = "failed" + _runs[str(run_id)]["error"] = f"{type(e).__name__}: {e}" + finally: + if db_conn is not None: + try: + db_conn.close() + except Exception: + pass + + +@app.post("/api/backfill") +def start_backfill(user: User = Depends(require_auth)): + """ + Walk the Box folder and mirror any existing marriottUsa metadata into the + local DB as `status='backfilled'` rows. Use this after first deploy (or + after restoring an empty DB) so the per-file skip check doesn't re-tag + files Box already has metadata for. + """ + run_id = uuid.uuid4() + t = threading.Thread(target=_run_backfill_in_thread, args=(run_id,), daemon=True) + t.start() + return {"run_id": str(run_id), "state": "running", "kind": "backfill", "started_by": user.email or user.oid} + + @app.get("/api/runs") def list_runs(user: User = Depends(require_auth), limit: int = Query(20, ge=1, le=100)): """Recent runs in the DB, plus the in-memory state if the run is still active.""" diff --git a/db.py b/db.py index 11fc8ca..bb2e55f 100644 --- a/db.py +++ b/db.py @@ -50,6 +50,32 @@ def ensure_schema(conn): cur.execute(sql) +def is_file_already_tagged(conn, file_id) -> bool: + """ + Skip-check oracle. A file counts as "already tagged" if we have any row + in tagging_events for it with a terminal-good status — either a real + Gemini-driven success or a backfilled row that mirrors Box's existing + metadata. Error/validation rows do NOT count, so a previously failed + file gets retried on the next pass. + """ + if conn is None or not file_id: + return False + try: + with conn.cursor() as cur: + cur.execute( + "SELECT 1 FROM tagging_events " + "WHERE file_id = %s AND status IN ('success','backfilled') LIMIT 1", + (str(file_id),), + ) + return cur.fetchone() is not None + except Exception as e: + print( + f" WARN: DB is_file_already_tagged failed ({type(e).__name__}: {e}) — assuming NOT tagged", + file=sys.stderr, + ) + return False + + def _jsonable(value): if value is None: return None diff --git a/deploy/deploy.sh b/deploy/deploy.sh index e16c2e9..a2c1dfb 100755 --- a/deploy/deploy.sh +++ b/deploy/deploy.sh @@ -183,7 +183,7 @@ if (( DO_BUILD )); then docker compose build fi -log "docker compose up -d (db + tagger + api)" +log "docker compose up -d (db + api)" docker compose up -d # ---------- 6. Frontend build + sync ---------- @@ -257,13 +257,13 @@ for i in $(seq 1 30); do fi done -TAGGER_STATE=$(docker compose ps tagger --format '{{.State}}' 2>/dev/null | head -1) -if [[ "$TAGGER_STATE" != "running" ]]; then - err "Tagger (scheduler) container is not running (state=${TAGGER_STATE:-unknown}). Recent logs:" - docker compose logs tagger --tail 60 || true +API_STATE=$(docker compose ps api --format '{{.State}}' 2>/dev/null | head -1) +if [[ "$API_STATE" != "running" ]]; then + err "API container is not running (state=${API_STATE:-unknown}). Recent logs:" + docker compose logs api --tail 60 || true exit 1 fi -ok "Tagger scheduler running" +ok "API container running (manual-only mode — no scheduler container)" # ---------- 8. Optional: trigger an immediate pass via the API ---------- @@ -315,6 +315,6 @@ fi echo if (( TAIL_LOGS )); then - log "Tailing tagger logs (Ctrl-C to stop)…" - docker compose logs -f tagger + log "Tailing api logs (Ctrl-C to stop)…" + docker compose logs -f api fi diff --git a/docker-compose.yml b/docker-compose.yml index a4fee99..a639d92 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,22 +26,12 @@ services: timeout: 5s retries: 10 - tagger: - build: . - restart: unless-stopped - depends_on: - db: - condition: service_healthy - environment: - DATABASE_URL: postgresql://${POSTGRES_USER:-marriott}:${POSTGRES_PASSWORD:-marriott}@db:5432/${POSTGRES_DB:-marriott_tagging} - GEMINI_API_KEY: ${GEMINI_API_KEY} - SCHEDULE_CRON: ${SCHEDULE_CRON:-0 2 * * *} - RUN_AT_STARTUP: ${RUN_AT_STARTUP:-0} - TZ: ${TZ:-UTC} - command: ["python", "-u", "scheduler.py"] - volumes: - # Box JWT config is bind-mounted read-only — never baked into the image. - - ./box_config.json:/app/box_config.json:ro + # ── Manual-only mode ───────────────────────────────────────────────────────── + # The nightly APScheduler container (`tagger`) was intentionally removed so the + # app only runs Gemini against new files when a human clicks "Run now" in the + # SPA (or `curl -X POST .../api/runs`). The pipeline still lives in main.py + # and scheduler.py is kept in the repo — re-add a `tagger` service here using + # the previous block in git history if you want cron-driven passes back. api: build: . diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index e26004f..3bfa22e 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -6,6 +6,7 @@ import { listRuns, runEvents, searchEvents, + startBackfill, startRun, } from "./api"; @@ -17,10 +18,11 @@ export function App() { const [error, setError] = useState(null); const [activeRun, setActiveRun] = useState(null); + const [activeRunKind, setActiveRunKind] = useState<"tag" | "backfill" | null>(null); const [activeRunEvents, setActiveRunEvents] = useState([]); const [activeRunState, setActiveRunState] = useState(null); const [recentRuns, setRecentRuns] = useState([]); - const [starting, setStarting] = useState(false); + const [starting, setStarting] = useState(null); const runPollTimer = useRef(null); @@ -97,17 +99,41 @@ export function App() { }; const onRunNow = async () => { - setStarting(true); + setStarting("tag"); setError(null); try { const r = await startRun(auth.getToken); setActiveRun(r.run_id); + setActiveRunKind("tag"); setActiveRunEvents([]); setActiveRunState("running"); } catch (e) { setError((e as Error).message); } finally { - setStarting(false); + setStarting(null); + } + }; + + const onBackfill = async () => { + if (!window.confirm( + "Backfill the local DB from Box?\n\n" + + "Walks every file in the configured Box folder and inserts a " + + "'backfilled' row for each one that already has marriottUsa metadata. " + + "No Gemini calls, no Box writes — Box is read-only here.\n\n" + + "Safe to re-run; existing DB rows are kept." + )) return; + setStarting("backfill"); + setError(null); + try { + const r = await startBackfill(auth.getToken); + setActiveRun(r.run_id); + setActiveRunKind("backfill"); + setActiveRunEvents([]); + setActiveRunState("running"); + } catch (e) { + setError((e as Error).message); + } finally { + setStarting(null); } }; @@ -157,11 +183,20 @@ export function App() { + {error &&

{error}

} @@ -170,7 +205,7 @@ export function App() { {activeRun && (

- Active run + {activeRunKind === "backfill" ? "Active backfill" : "Active run"} {activeRun.slice(0, 8)}… {activeRunState && ( {activeRunState} diff --git a/frontend/src/api.ts b/frontend/src/api.ts index 1280f9c..8997902 100644 --- a/frontend/src/api.ts +++ b/frontend/src/api.ts @@ -79,6 +79,14 @@ export function startRun(getToken: GetTokenFn) { ); } +export function startBackfill(getToken: GetTokenFn) { + return req<{ run_id: string; state: string; kind: string; started_by: string }>( + `/backfill`, + getToken, + { method: "POST", body: JSON.stringify({}) } + ); +} + export function listRuns(getToken: GetTokenFn, limit = 20) { return req<{ runs: Run[] }>(`/runs?limit=${limit}`, getToken); } diff --git a/frontend/src/styles.css b/frontend/src/styles.css index 0d1880a..f2b0f44 100644 --- a/frontend/src/styles.css +++ b/frontend/src/styles.css @@ -66,6 +66,15 @@ button.run-now:hover:not(:disabled) { background: var(--accent); color: #000; } +button.ghost { + background: transparent; + color: var(--text); + border: 1px solid var(--line); +} +button.ghost:hover:not(:disabled) { + border-color: var(--accent); + color: var(--accent); +} button.link { background: transparent; color: var(--accent); @@ -208,6 +217,9 @@ input:focus { .event.status-success { border-left-color: var(--ok); } +.event.status-backfilled { + border-left-color: var(--accent); +} .event.status-gemini_error, .event.status-validation_error, .event.status-metadata_write_error { @@ -292,6 +304,10 @@ input:focus { background: var(--ok); color: #000; } +.pill.status-backfilled { + background: var(--accent); + color: #000; +} .pill.status-gemini_error, .pill.status-validation_error, .pill.status-metadata_write_error, diff --git a/main.py b/main.py index abdaafe..8ceaf9f 100644 --- a/main.py +++ b/main.py @@ -596,8 +596,20 @@ def validate_and_clean_metadata(raw_metadata, template_schema): # ── 9. Check Existing Metadata ──────────────────────────────────────────────── +# Box-managed keys returned alongside template fields in a metadata response. +# After `from_dict`, these become regular attribute names on the response object. +_BOX_META_SYSTEM_ATTRS = { + "id", "type", "type_version", "parent", "template", "scope", "version", + "can_edit", +} + + def check_existing_metadata(box_client, file_id): - """Check if file already has marriottUsa metadata. Returns True/False.""" + """Check if file already has marriottUsa metadata. Returns True/False. + + Kept for the backfill code path, which still asks Box. The per-pass skip + check in the main loops now uses the local DB instead (see db.is_file_already_tagged). + """ try: box_client.file_metadata.get_file_metadata_by_id( file_id=file_id, @@ -611,6 +623,43 @@ def check_existing_metadata(box_client, file_id): raise +def fetch_existing_metadata(box_client, file_id): + """ + Fetch the file's marriottUsa metadata from Box and return a flat dict of + user-defined fields (no `$id`/`$scope`/etc.). Returns {} if metadata exists + but has no user fields, None if Box returns 404, raises on other errors. + """ + try: + resp = box_client.file_metadata.get_file_metadata_by_id( + file_id=file_id, + scope=CreateFileMetadataByIdScope.ENTERPRISE, + template_key=METADATA_TEMPLATE_KEY, + ) + except BoxAPIError as e: + if e.response_info.status_code == 404: + return None + raise + + out = {} + for k, v in vars(resp).items(): + if k.startswith("_"): + continue + if k in _BOX_META_SYSTEM_ATTRS: + continue + out[k] = v + return out + + +def fetch_file_description(box_client, file_id): + """Return the file's Box description (string) or None if unavailable / empty.""" + try: + f = box_client.files.get_file_by_id(file_id=file_id, fields=["description"]) + desc = getattr(f, "description", None) + return desc if isinstance(desc, str) and desc else None + except BoxAPIError: + return None + + # ── 10. Write Metadata to Box ───────────────────────────────────────────────── def write_metadata_to_box(box_client, file_id, metadata, file_name): @@ -783,12 +832,13 @@ def _run_pass(run_id, db_conn): if folder_path: print(f" Folder: {folder_path}") - # Check if already tagged - if SKIP_ALREADY_TAGGED: - if check_existing_metadata(box_client, file_id): - print(f" Already tagged — skipping") - img_skipped += 1 - continue + # Skip if the DB already has a success/backfilled row for this file. + # If the DB is empty/lost, run the backfill flow first to repopulate + # from Box — that avoids re-tagging files Box already has metadata for. + if SKIP_ALREADY_TAGGED and db.is_file_already_tagged(db_conn, file_id): + print(f" Already in DB — skipping") + img_skipped += 1 + continue # Download and resize result = download_and_resize_image(box_client, file_id, file_name) @@ -907,12 +957,11 @@ def _run_pass(run_id, db_conn): if folder_path: print(f" Folder: {folder_path}") - # Check if already tagged - if SKIP_ALREADY_TAGGED: - if check_existing_metadata(box_client, file_id): - print(f" Already tagged — skipping") - vid_skipped += 1 - continue + # Skip if the DB already has a success/backfilled row for this file. + if SKIP_ALREADY_TAGGED and db.is_file_already_tagged(db_conn, file_id): + print(f" Already in DB — skipping") + vid_skipped += 1 + continue # Download video proxy (480p MP4) result = download_video_proxy(box_client, file_id, file_name) @@ -1034,5 +1083,102 @@ def _run_pass(run_id, db_conn): print("=" * 60) +# ── 14. Backfill from Box ───────────────────────────────────────────────────── + + +def _run_backfill(run_id, db_conn): + """ + Walk the Box folder and, for every file that ALREADY has marriottUsa + metadata, insert a `status='backfilled'` row into tagging_events. No + Gemini calls, no Box writes — purely reads from Box and mirrors into + the local DB so the per-file skip check (which is now DB-based) won't + re-tag files Box has already tagged. + + Idempotent: a file that already has a success/backfilled row is left + alone. Safe to re-run after a partial backfill. + """ + print("=" * 60) + print("Marriott Box Asset Tagger — BACKFILL") + print("=" * 60) + print(f"Run ID: {run_id}") + + box_client = init_box_client() + + image_files, video_files = list_all_media(box_client) + total = len(image_files) + len(video_files) + if not total: + print("No media files found. Exiting.") + return + + inserted = 0 + no_metadata = 0 + already_in_db = 0 + errored = 0 + + combined = [("image", f) for f in image_files] + [("video", f) for f in video_files] + for i, (media_type, file_info) in enumerate(combined, 1): + file_id = file_info["id"] + file_name = file_info["name"] + folder_path = file_info.get("folder_path", "") + + print(f"\n[{i}/{total}] {media_type}: {file_name} (ID: {file_id})") + if folder_path: + print(f" Folder: {folder_path}") + + if db.is_file_already_tagged(db_conn, file_id): + print(" Already in DB — skipping.") + already_in_db += 1 + continue + + try: + existing = fetch_existing_metadata(box_client, file_id) + except BoxAPIError as e: + print(f" ERROR reading metadata from Box: {e}") + errored += 1 + continue + + if existing is None: + print(" No marriottUsa metadata in Box — leaving for normal tagging pass.") + no_metadata += 1 + continue + + description = fetch_file_description(box_client, file_id) + print(f" Fields from Box: {list(existing.keys()) or '(none)'}") + if description: + print(f" Description: {description[:80]}{'…' if len(description) > 80 else ''}") + + db.log_event( + db_conn, + run_id=run_id, + file_id=file_id, + file_name=file_name, + folder_path=folder_path, + media_type=media_type, + gemini_model=GEMINI_MODEL, + status="backfilled", + prompt=None, + raw_response=None, + description=description, + scenes=None, + validated_metadata=existing or {}, + metadata_write_success=True, + description_write_success=bool(description), + scene_comment_write_success=None, + error_message=None, + duration_ms=None, + ) + inserted += 1 + + print("\n" + "=" * 60) + print("BACKFILL SUMMARY") + print("=" * 60) + print(f" Total files seen: {total}") + print(f" Inserted into DB: {inserted}") + print(f" Already in DB (skipped): {already_in_db}") + print(f" No metadata in Box: {no_metadata}") + print(f" Errors reading Box: {errored}") + print("=" * 60) + + if __name__ == "__main__": main()