Manual-only runs, DB-based skip check, backfill-from-Box

Previously a nightly APScheduler container fired the tagger on every
file in the configured Box folder. With ~5000 files coming, that's
~5000 Box HTTP calls every night just to ask "is this tagged?". Move
to manual-only mode and source the skip decision from the local DB.

- `db.is_file_already_tagged(conn, file_id)` — returns True iff the
  DB has a row with status IN ('success','backfilled'). Used by both
  image and video loops in main.py instead of the previous
  `check_existing_metadata(box_client, file_id)` Box round-trip.
- `fetch_existing_metadata(box_client, file_id)` (main.py) — returns
  the user-defined template fields as a flat dict by stripping the
  Box `$id`/`$type`/etc. attrs from the SDK response.
- `_run_backfill(run_id, db_conn)` (main.py) — walks the Box folder
  and inserts a `status='backfilled'` row for every file Box already
  has marriottUsa metadata for. Read-only against Box; safe to re-run.
  Use this after first deploy, or to repopulate the DB from Box.
- `POST /api/backfill` mirrors `POST /api/runs` (background thread,
  same live-state record).
- SPA: new "Backfill from Box" button next to "Run now" (with a
  confirm dialog and a yellow `.status-backfilled` event treatment).
- docker-compose.yml: removed the `tagger` (scheduler) service.
  Manual triggers via the SPA / `POST /api/runs` only. scheduler.py
  stays in the repo for archival / opt-back-in.
- deploy.sh: readiness now checks the `api` container instead of
  `tagger`; `--logs` tails api logs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
DJP 2026-05-11 15:41:10 -04:00
parent dafd097d24
commit 9e6a75feb6
8 changed files with 307 additions and 43 deletions

43
api.py
View file

@ -175,6 +175,49 @@ def start_run(user: User = Depends(require_auth)):
return {"run_id": str(run_id), "state": "running", "started_by": user.email or user.oid}
def _run_backfill_in_thread(run_id: uuid.UUID):
import main as tagger
with _runs_lock:
_runs[str(run_id)] = {"run_id": str(run_id), "state": "running", "error": None, "kind": "backfill"}
db_conn = None
try:
db_conn = db.get_conn()
db.ensure_schema(db_conn)
tagger._run_backfill(run_id, db_conn)
with _runs_lock:
_runs[str(run_id)]["state"] = "completed"
except SystemExit as e:
with _runs_lock:
_runs[str(run_id)]["state"] = "failed"
_runs[str(run_id)]["error"] = f"SystemExit({e.code})"
except Exception as e:
with _runs_lock:
_runs[str(run_id)]["state"] = "failed"
_runs[str(run_id)]["error"] = f"{type(e).__name__}: {e}"
finally:
if db_conn is not None:
try:
db_conn.close()
except Exception:
pass
@app.post("/api/backfill")
def start_backfill(user: User = Depends(require_auth)):
"""
Walk the Box folder and mirror any existing marriottUsa metadata into the
local DB as `status='backfilled'` rows. Use this after first deploy (or
after restoring an empty DB) so the per-file skip check doesn't re-tag
files Box already has metadata for.
"""
run_id = uuid.uuid4()
t = threading.Thread(target=_run_backfill_in_thread, args=(run_id,), daemon=True)
t.start()
return {"run_id": str(run_id), "state": "running", "kind": "backfill", "started_by": user.email or user.oid}
@app.get("/api/runs")
def list_runs(user: User = Depends(require_auth), limit: int = Query(20, ge=1, le=100)):
"""Recent runs in the DB, plus the in-memory state if the run is still active."""

26
db.py
View file

@ -50,6 +50,32 @@ def ensure_schema(conn):
cur.execute(sql)
def is_file_already_tagged(conn, file_id) -> bool:
"""
Skip-check oracle. A file counts as "already tagged" if we have any row
in tagging_events for it with a terminal-good status either a real
Gemini-driven success or a backfilled row that mirrors Box's existing
metadata. Error/validation rows do NOT count, so a previously failed
file gets retried on the next pass.
"""
if conn is None or not file_id:
return False
try:
with conn.cursor() as cur:
cur.execute(
"SELECT 1 FROM tagging_events "
"WHERE file_id = %s AND status IN ('success','backfilled') LIMIT 1",
(str(file_id),),
)
return cur.fetchone() is not None
except Exception as e:
print(
f" WARN: DB is_file_already_tagged failed ({type(e).__name__}: {e}) — assuming NOT tagged",
file=sys.stderr,
)
return False
def _jsonable(value):
if value is None:
return None

View file

@ -183,7 +183,7 @@ if (( DO_BUILD )); then
docker compose build
fi
log "docker compose up -d (db + tagger + api)"
log "docker compose up -d (db + api)"
docker compose up -d
# ---------- 6. Frontend build + sync ----------
@ -257,13 +257,13 @@ for i in $(seq 1 30); do
fi
done
TAGGER_STATE=$(docker compose ps tagger --format '{{.State}}' 2>/dev/null | head -1)
if [[ "$TAGGER_STATE" != "running" ]]; then
err "Tagger (scheduler) container is not running (state=${TAGGER_STATE:-unknown}). Recent logs:"
docker compose logs tagger --tail 60 || true
API_STATE=$(docker compose ps api --format '{{.State}}' 2>/dev/null | head -1)
if [[ "$API_STATE" != "running" ]]; then
err "API container is not running (state=${API_STATE:-unknown}). Recent logs:"
docker compose logs api --tail 60 || true
exit 1
fi
ok "Tagger scheduler running"
ok "API container running (manual-only mode — no scheduler container)"
# ---------- 8. Optional: trigger an immediate pass via the API ----------
@ -315,6 +315,6 @@ fi
echo
if (( TAIL_LOGS )); then
log "Tailing tagger logs (Ctrl-C to stop)…"
docker compose logs -f tagger
log "Tailing api logs (Ctrl-C to stop)…"
docker compose logs -f api
fi

View file

@ -26,22 +26,12 @@ services:
timeout: 5s
retries: 10
tagger:
build: .
restart: unless-stopped
depends_on:
db:
condition: service_healthy
environment:
DATABASE_URL: postgresql://${POSTGRES_USER:-marriott}:${POSTGRES_PASSWORD:-marriott}@db:5432/${POSTGRES_DB:-marriott_tagging}
GEMINI_API_KEY: ${GEMINI_API_KEY}
SCHEDULE_CRON: ${SCHEDULE_CRON:-0 2 * * *}
RUN_AT_STARTUP: ${RUN_AT_STARTUP:-0}
TZ: ${TZ:-UTC}
command: ["python", "-u", "scheduler.py"]
volumes:
# Box JWT config is bind-mounted read-only — never baked into the image.
- ./box_config.json:/app/box_config.json:ro
# ── Manual-only mode ─────────────────────────────────────────────────────────
# The nightly APScheduler container (`tagger`) was intentionally removed so the
# app only runs Gemini against new files when a human clicks "Run now" in the
# SPA (or `curl -X POST .../api/runs`). The pipeline still lives in main.py
# and scheduler.py is kept in the repo — re-add a `tagger` service here using
# the previous block in git history if you want cron-driven passes back.
api:
build: .

View file

@ -6,6 +6,7 @@ import {
listRuns,
runEvents,
searchEvents,
startBackfill,
startRun,
} from "./api";
@ -17,10 +18,11 @@ export function App() {
const [error, setError] = useState<string | null>(null);
const [activeRun, setActiveRun] = useState<string | null>(null);
const [activeRunKind, setActiveRunKind] = useState<"tag" | "backfill" | null>(null);
const [activeRunEvents, setActiveRunEvents] = useState<Event[]>([]);
const [activeRunState, setActiveRunState] = useState<string | null>(null);
const [recentRuns, setRecentRuns] = useState<Run[]>([]);
const [starting, setStarting] = useState(false);
const [starting, setStarting] = useState<null | "tag" | "backfill">(null);
const runPollTimer = useRef<number | null>(null);
@ -97,17 +99,41 @@ export function App() {
};
const onRunNow = async () => {
setStarting(true);
setStarting("tag");
setError(null);
try {
const r = await startRun(auth.getToken);
setActiveRun(r.run_id);
setActiveRunKind("tag");
setActiveRunEvents([]);
setActiveRunState("running");
} catch (e) {
setError((e as Error).message);
} finally {
setStarting(false);
setStarting(null);
}
};
const onBackfill = async () => {
if (!window.confirm(
"Backfill the local DB from Box?\n\n" +
"Walks every file in the configured Box folder and inserts a " +
"'backfilled' row for each one that already has marriottUsa metadata. " +
"No Gemini calls, no Box writes — Box is read-only here.\n\n" +
"Safe to re-run; existing DB rows are kept."
)) return;
setStarting("backfill");
setError(null);
try {
const r = await startBackfill(auth.getToken);
setActiveRun(r.run_id);
setActiveRunKind("backfill");
setActiveRunEvents([]);
setActiveRunState("running");
} catch (e) {
setError((e as Error).message);
} finally {
setStarting(null);
}
};
@ -157,11 +183,20 @@ export function App() {
<button
type="button"
className="run-now"
disabled={starting}
disabled={starting !== null}
title="Trigger a tagging pass against the Box folder right now"
onClick={onRunNow}
>
{starting ? "Starting…" : "Run now"}
{starting === "tag" ? "Starting…" : "Run now"}
</button>
<button
type="button"
className="ghost"
disabled={starting !== null}
title="Walk Box and mirror existing marriottUsa metadata into the local DB. No Gemini, no Box writes."
onClick={onBackfill}
>
{starting === "backfill" ? "Starting…" : "Backfill from Box"}
</button>
</form>
{error && <p className="error">{error}</p>}
@ -170,7 +205,7 @@ export function App() {
{activeRun && (
<section className="card">
<h2>
Active run
{activeRunKind === "backfill" ? "Active backfill" : "Active run"}
<span className="run-id"> {activeRun.slice(0, 8)}</span>
{activeRunState && (
<span className={`pill state-${activeRunState}`}>{activeRunState}</span>

View file

@ -79,6 +79,14 @@ export function startRun(getToken: GetTokenFn) {
);
}
export function startBackfill(getToken: GetTokenFn) {
return req<{ run_id: string; state: string; kind: string; started_by: string }>(
`/backfill`,
getToken,
{ method: "POST", body: JSON.stringify({}) }
);
}
export function listRuns(getToken: GetTokenFn, limit = 20) {
return req<{ runs: Run[] }>(`/runs?limit=${limit}`, getToken);
}

View file

@ -66,6 +66,15 @@ button.run-now:hover:not(:disabled) {
background: var(--accent);
color: #000;
}
button.ghost {
background: transparent;
color: var(--text);
border: 1px solid var(--line);
}
button.ghost:hover:not(:disabled) {
border-color: var(--accent);
color: var(--accent);
}
button.link {
background: transparent;
color: var(--accent);
@ -208,6 +217,9 @@ input:focus {
.event.status-success {
border-left-color: var(--ok);
}
.event.status-backfilled {
border-left-color: var(--accent);
}
.event.status-gemini_error,
.event.status-validation_error,
.event.status-metadata_write_error {
@ -292,6 +304,10 @@ input:focus {
background: var(--ok);
color: #000;
}
.pill.status-backfilled {
background: var(--accent);
color: #000;
}
.pill.status-gemini_error,
.pill.status-validation_error,
.pill.status-metadata_write_error,

172
main.py
View file

@ -596,8 +596,20 @@ def validate_and_clean_metadata(raw_metadata, template_schema):
# ── 9. Check Existing Metadata ────────────────────────────────────────────────
# Box-managed keys returned alongside template fields in a metadata response.
# After `from_dict`, these become regular attribute names on the response object.
_BOX_META_SYSTEM_ATTRS = {
"id", "type", "type_version", "parent", "template", "scope", "version",
"can_edit",
}
def check_existing_metadata(box_client, file_id):
"""Check if file already has marriottUsa metadata. Returns True/False."""
"""Check if file already has marriottUsa metadata. Returns True/False.
Kept for the backfill code path, which still asks Box. The per-pass skip
check in the main loops now uses the local DB instead (see db.is_file_already_tagged).
"""
try:
box_client.file_metadata.get_file_metadata_by_id(
file_id=file_id,
@ -611,6 +623,43 @@ def check_existing_metadata(box_client, file_id):
raise
def fetch_existing_metadata(box_client, file_id):
"""
Fetch the file's marriottUsa metadata from Box and return a flat dict of
user-defined fields (no `$id`/`$scope`/etc.). Returns {} if metadata exists
but has no user fields, None if Box returns 404, raises on other errors.
"""
try:
resp = box_client.file_metadata.get_file_metadata_by_id(
file_id=file_id,
scope=CreateFileMetadataByIdScope.ENTERPRISE,
template_key=METADATA_TEMPLATE_KEY,
)
except BoxAPIError as e:
if e.response_info.status_code == 404:
return None
raise
out = {}
for k, v in vars(resp).items():
if k.startswith("_"):
continue
if k in _BOX_META_SYSTEM_ATTRS:
continue
out[k] = v
return out
def fetch_file_description(box_client, file_id):
"""Return the file's Box description (string) or None if unavailable / empty."""
try:
f = box_client.files.get_file_by_id(file_id=file_id, fields=["description"])
desc = getattr(f, "description", None)
return desc if isinstance(desc, str) and desc else None
except BoxAPIError:
return None
# ── 10. Write Metadata to Box ─────────────────────────────────────────────────
def write_metadata_to_box(box_client, file_id, metadata, file_name):
@ -783,12 +832,13 @@ def _run_pass(run_id, db_conn):
if folder_path:
print(f" Folder: {folder_path}")
# Check if already tagged
if SKIP_ALREADY_TAGGED:
if check_existing_metadata(box_client, file_id):
print(f" Already tagged — skipping")
img_skipped += 1
continue
# Skip if the DB already has a success/backfilled row for this file.
# If the DB is empty/lost, run the backfill flow first to repopulate
# from Box — that avoids re-tagging files Box already has metadata for.
if SKIP_ALREADY_TAGGED and db.is_file_already_tagged(db_conn, file_id):
print(f" Already in DB — skipping")
img_skipped += 1
continue
# Download and resize
result = download_and_resize_image(box_client, file_id, file_name)
@ -907,12 +957,11 @@ def _run_pass(run_id, db_conn):
if folder_path:
print(f" Folder: {folder_path}")
# Check if already tagged
if SKIP_ALREADY_TAGGED:
if check_existing_metadata(box_client, file_id):
print(f" Already tagged — skipping")
vid_skipped += 1
continue
# Skip if the DB already has a success/backfilled row for this file.
if SKIP_ALREADY_TAGGED and db.is_file_already_tagged(db_conn, file_id):
print(f" Already in DB — skipping")
vid_skipped += 1
continue
# Download video proxy (480p MP4)
result = download_video_proxy(box_client, file_id, file_name)
@ -1034,5 +1083,102 @@ def _run_pass(run_id, db_conn):
print("=" * 60)
# ── 14. Backfill from Box ─────────────────────────────────────────────────────
def _run_backfill(run_id, db_conn):
"""
Walk the Box folder and, for every file that ALREADY has marriottUsa
metadata, insert a `status='backfilled'` row into tagging_events. No
Gemini calls, no Box writes purely reads from Box and mirrors into
the local DB so the per-file skip check (which is now DB-based) won't
re-tag files Box has already tagged.
Idempotent: a file that already has a success/backfilled row is left
alone. Safe to re-run after a partial backfill.
"""
print("=" * 60)
print("Marriott Box Asset Tagger — BACKFILL")
print("=" * 60)
print(f"Run ID: {run_id}")
box_client = init_box_client()
image_files, video_files = list_all_media(box_client)
total = len(image_files) + len(video_files)
if not total:
print("No media files found. Exiting.")
return
inserted = 0
no_metadata = 0
already_in_db = 0
errored = 0
combined = [("image", f) for f in image_files] + [("video", f) for f in video_files]
for i, (media_type, file_info) in enumerate(combined, 1):
file_id = file_info["id"]
file_name = file_info["name"]
folder_path = file_info.get("folder_path", "")
print(f"\n[{i}/{total}] {media_type}: {file_name} (ID: {file_id})")
if folder_path:
print(f" Folder: {folder_path}")
if db.is_file_already_tagged(db_conn, file_id):
print(" Already in DB — skipping.")
already_in_db += 1
continue
try:
existing = fetch_existing_metadata(box_client, file_id)
except BoxAPIError as e:
print(f" ERROR reading metadata from Box: {e}")
errored += 1
continue
if existing is None:
print(" No marriottUsa metadata in Box — leaving for normal tagging pass.")
no_metadata += 1
continue
description = fetch_file_description(box_client, file_id)
print(f" Fields from Box: {list(existing.keys()) or '(none)'}")
if description:
print(f" Description: {description[:80]}{'' if len(description) > 80 else ''}")
db.log_event(
db_conn,
run_id=run_id,
file_id=file_id,
file_name=file_name,
folder_path=folder_path,
media_type=media_type,
gemini_model=GEMINI_MODEL,
status="backfilled",
prompt=None,
raw_response=None,
description=description,
scenes=None,
validated_metadata=existing or {},
metadata_write_success=True,
description_write_success=bool(description),
scene_comment_write_success=None,
error_message=None,
duration_ms=None,
)
inserted += 1
print("\n" + "=" * 60)
print("BACKFILL SUMMARY")
print("=" * 60)
print(f" Total files seen: {total}")
print(f" Inserted into DB: {inserted}")
print(f" Already in DB (skipped): {already_in_db}")
print(f" No metadata in Box: {no_metadata}")
print(f" Errors reading Box: {errored}")
print("=" * 60)
if __name__ == "__main__":
main()