Manual-only runs, DB-based skip check, backfill-from-Box
Previously a nightly APScheduler container fired the tagger on every
file in the configured Box folder. With ~5000 files coming, that's
~5000 Box HTTP calls every night just to ask "is this tagged?". Move
to manual-only mode and source the skip decision from the local DB.
- `db.is_file_already_tagged(conn, file_id)` — returns True iff the
DB has a row with status IN ('success','backfilled'). Used by both
image and video loops in main.py instead of the previous
`check_existing_metadata(box_client, file_id)` Box round-trip.
- `fetch_existing_metadata(box_client, file_id)` (main.py) — returns
the user-defined template fields as a flat dict by stripping the
Box `$id`/`$type`/etc. attrs from the SDK response.
- `_run_backfill(run_id, db_conn)` (main.py) — walks the Box folder
and inserts a `status='backfilled'` row for every file that already
has marriottUsa metadata in Box and no success/backfilled row in the
DB yet. Read-only against Box; safe to re-run. Use this after first
deploy, or to repopulate the DB from Box.
- `POST /api/backfill` mirrors `POST /api/runs` (background thread,
same live-state record).
- SPA: new "Backfill from Box" button next to "Run now" (with a
confirm dialog and a yellow `.status-backfilled` event treatment).
- docker-compose.yml: removed the `tagger` (scheduler) service.
Manual triggers via the SPA / `POST /api/runs` only. scheduler.py
stays in the repo for archival / opt-back-in.
- deploy.sh: readiness now checks the `api` container instead of
`tagger`; `--logs` tails api logs.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
dafd097d24
commit
9e6a75feb6
8 changed files with 307 additions and 43 deletions
43
api.py
43
api.py
|
|
@ -175,6 +175,49 @@ def start_run(user: User = Depends(require_auth)):
|
|||
return {"run_id": str(run_id), "state": "running", "started_by": user.email or user.oid}
|
||||
|
||||
|
||||
def _run_backfill_in_thread(run_id: uuid.UUID):
    """Thread target: run a Box -> DB backfill and track it in ``_runs``.

    The live-state record for ``run_id`` is created with state ``"running"``
    and kind ``"backfill"``, then moved to ``"completed"`` or ``"failed"``
    (with ``error`` filled in) depending on how the backfill ends.

    Args:
        run_id: Identifier of this backfill run; its string form is the key
            used in ``_runs``.
    """
    key = str(run_id)

    # Register the record before doing anything that can be slow or can fail,
    # so the run is visible as soon as the thread starts.
    with _runs_lock:
        _runs[key] = {"run_id": key, "state": "running", "error": None, "kind": "backfill"}

    db_conn = None
    try:
        # Imported lazily and inside the try: a failing import (or a
        # SystemExit raised while importing) is recorded as a failed run
        # instead of silently killing the thread.
        import main as tagger

        db_conn = db.get_conn()
        db.ensure_schema(db_conn)
        tagger._run_backfill(run_id, db_conn)
        with _runs_lock:
            _runs[key]["state"] = "completed"
    except SystemExit as e:
        # SystemExit is not an Exception subclass, so it needs its own clause.
        with _runs_lock:
            _runs[key]["state"] = "failed"
            _runs[key]["error"] = f"SystemExit({e.code})"
    except Exception as e:
        with _runs_lock:
            _runs[key]["state"] = "failed"
            _runs[key]["error"] = f"{type(e).__name__}: {e}"
    finally:
        if db_conn is not None:
            try:
                db_conn.close()
            except Exception:
                # Best-effort cleanup: the run outcome is already recorded and
                # a close failure must not replace it.
                pass
|
||||
|
||||
|
||||
@app.post("/api/backfill")
def start_backfill(user: User = Depends(require_auth)):
    """
    Walk the Box folder and mirror any existing marriottUsa metadata into the
    local DB as `status='backfilled'` rows. Use this after first deploy (or
    after restoring an empty DB) so the per-file skip check doesn't re-tag
    files Box already has metadata for.
    """
    # The docstring above is kept verbatim: FastAPI publishes it as the
    # endpoint description.
    run_id = uuid.uuid4()

    # Daemon thread: the request returns immediately while the backfill runs
    # in the background.
    worker = threading.Thread(
        target=_run_backfill_in_thread,
        args=(run_id,),
        daemon=True,
    )
    worker.start()

    started_by = user.email or user.oid
    return {
        "run_id": str(run_id),
        "state": "running",
        "kind": "backfill",
        "started_by": started_by,
    }
|
||||
|
||||
|
||||
@app.get("/api/runs")
|
||||
def list_runs(user: User = Depends(require_auth), limit: int = Query(20, ge=1, le=100)):
|
||||
"""Recent runs in the DB, plus the in-memory state if the run is still active."""
|
||||
|
|
|
|||
26
db.py
26
db.py
|
|
@ -50,6 +50,32 @@ def ensure_schema(conn):
|
|||
cur.execute(sql)
|
||||
|
||||
|
||||
def is_file_already_tagged(conn, file_id) -> bool:
    """Return True iff ``tagging_events`` has a terminal-good row for the file.

    A file counts as "already tagged" when at least one row exists for it
    with status ``'success'`` or ``'backfilled'``. Rows with any other status
    do not count, so a previously failed file is retried on the next pass.

    Args:
        conn: Open DB connection, or None.
        file_id: Box file id; it is converted with ``str()`` for the query.

    Returns:
        False when ``conn`` is None, ``file_id`` is falsy, no matching row
        exists, or the lookup itself fails (a warning goes to stderr).
        True otherwise.
    """
    if conn is None or not file_id:
        return False
    try:
        with conn.cursor() as cur:
            cur.execute(
                "SELECT 1 FROM tagging_events "
                "WHERE file_id = %s AND status IN ('success','backfilled') LIMIT 1",
                (str(file_id),),
            )
            return cur.fetchone() is not None
    except Exception as e:
        print(
            f" WARN: DB is_file_already_tagged failed ({type(e).__name__}: {e}) — assuming NOT tagged",
            file=sys.stderr,
        )
        # A failed statement leaves a non-autocommit connection in an aborted
        # transaction; roll back so later statements on the same connection
        # are not rejected as well. Best-effort: the connection may be dead.
        try:
            conn.rollback()
        except Exception as rollback_err:
            print(
                f" WARN: DB rollback after failed skip check also failed "
                f"({type(rollback_err).__name__}: {rollback_err})",
                file=sys.stderr,
            )
        return False
|
||||
|
||||
|
||||
def _jsonable(value):
|
||||
if value is None:
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -183,7 +183,7 @@ if (( DO_BUILD )); then
|
|||
docker compose build
|
||||
fi
|
||||
|
||||
log "docker compose up -d (db + tagger + api)"
|
||||
log "docker compose up -d (db + api)"
|
||||
docker compose up -d
|
||||
|
||||
# ---------- 6. Frontend build + sync ----------
|
||||
|
|
@ -257,13 +257,13 @@ for i in $(seq 1 30); do
|
|||
fi
|
||||
done
|
||||
|
||||
TAGGER_STATE=$(docker compose ps tagger --format '{{.State}}' 2>/dev/null | head -1)
|
||||
if [[ "$TAGGER_STATE" != "running" ]]; then
|
||||
err "Tagger (scheduler) container is not running (state=${TAGGER_STATE:-unknown}). Recent logs:"
|
||||
docker compose logs tagger --tail 60 || true
|
||||
API_STATE=$(docker compose ps api --format '{{.State}}' 2>/dev/null | head -1)
|
||||
if [[ "$API_STATE" != "running" ]]; then
|
||||
err "API container is not running (state=${API_STATE:-unknown}). Recent logs:"
|
||||
docker compose logs api --tail 60 || true
|
||||
exit 1
|
||||
fi
|
||||
ok "Tagger scheduler running"
|
||||
ok "API container running (manual-only mode — no scheduler container)"
|
||||
|
||||
# ---------- 8. Optional: trigger an immediate pass via the API ----------
|
||||
|
||||
|
|
@ -315,6 +315,6 @@ fi
|
|||
echo
|
||||
|
||||
if (( TAIL_LOGS )); then
|
||||
log "Tailing tagger logs (Ctrl-C to stop)…"
|
||||
docker compose logs -f tagger
|
||||
log "Tailing api logs (Ctrl-C to stop)…"
|
||||
docker compose logs -f api
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -26,22 +26,12 @@ services:
|
|||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
tagger:
|
||||
build: .
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
DATABASE_URL: postgresql://${POSTGRES_USER:-marriott}:${POSTGRES_PASSWORD:-marriott}@db:5432/${POSTGRES_DB:-marriott_tagging}
|
||||
GEMINI_API_KEY: ${GEMINI_API_KEY}
|
||||
SCHEDULE_CRON: ${SCHEDULE_CRON:-0 2 * * *}
|
||||
RUN_AT_STARTUP: ${RUN_AT_STARTUP:-0}
|
||||
TZ: ${TZ:-UTC}
|
||||
command: ["python", "-u", "scheduler.py"]
|
||||
volumes:
|
||||
# Box JWT config is bind-mounted read-only — never baked into the image.
|
||||
- ./box_config.json:/app/box_config.json:ro
|
||||
# ── Manual-only mode ─────────────────────────────────────────────────────────
|
||||
# The nightly APScheduler container (`tagger`) was intentionally removed so the
|
||||
# app only runs Gemini against new files when a human clicks "Run now" in the
|
||||
# SPA (or `curl -X POST .../api/runs`). The pipeline still lives in main.py
|
||||
# and scheduler.py is kept in the repo — re-add a `tagger` service here using
|
||||
# the previous block in git history if you want cron-driven passes back.
|
||||
|
||||
api:
|
||||
build: .
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import {
|
|||
listRuns,
|
||||
runEvents,
|
||||
searchEvents,
|
||||
startBackfill,
|
||||
startRun,
|
||||
} from "./api";
|
||||
|
||||
|
|
@ -17,10 +18,11 @@ export function App() {
|
|||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
const [activeRun, setActiveRun] = useState<string | null>(null);
|
||||
const [activeRunKind, setActiveRunKind] = useState<"tag" | "backfill" | null>(null);
|
||||
const [activeRunEvents, setActiveRunEvents] = useState<Event[]>([]);
|
||||
const [activeRunState, setActiveRunState] = useState<string | null>(null);
|
||||
const [recentRuns, setRecentRuns] = useState<Run[]>([]);
|
||||
const [starting, setStarting] = useState(false);
|
||||
const [starting, setStarting] = useState<null | "tag" | "backfill">(null);
|
||||
|
||||
const runPollTimer = useRef<number | null>(null);
|
||||
|
||||
|
|
@ -97,17 +99,41 @@ export function App() {
|
|||
};
|
||||
|
||||
const onRunNow = async () => {
|
||||
setStarting(true);
|
||||
setStarting("tag");
|
||||
setError(null);
|
||||
try {
|
||||
const r = await startRun(auth.getToken);
|
||||
setActiveRun(r.run_id);
|
||||
setActiveRunKind("tag");
|
||||
setActiveRunEvents([]);
|
||||
setActiveRunState("running");
|
||||
} catch (e) {
|
||||
setError((e as Error).message);
|
||||
} finally {
|
||||
setStarting(false);
|
||||
setStarting(null);
|
||||
}
|
||||
};
|
||||
|
||||
// Ask for confirmation, then start a backfill run and make it the active run.
const onBackfill = async () => {
  const confirmText =
    "Backfill the local DB from Box?\n\n" +
    "Walks every file in the configured Box folder and inserts a " +
    "'backfilled' row for each one that already has marriottUsa metadata. " +
    "No Gemini calls, no Box writes — Box is read-only here.\n\n" +
    "Safe to re-run; existing DB rows are kept.";
  const confirmed = window.confirm(confirmText);
  if (!confirmed) {
    return;
  }

  setStarting("backfill");
  setError(null);
  try {
    const { run_id: backfillRunId } = await startBackfill(auth.getToken);
    setActiveRun(backfillRunId);
    setActiveRunKind("backfill");
    setActiveRunEvents([]);
    setActiveRunState("running");
  } catch (err) {
    setError((err as Error).message);
  } finally {
    // Re-enable both buttons whether or not the request succeeded.
    setStarting(null);
  }
};
|
||||
|
||||
|
|
@ -157,11 +183,20 @@ export function App() {
|
|||
<button
|
||||
type="button"
|
||||
className="run-now"
|
||||
disabled={starting}
|
||||
disabled={starting !== null}
|
||||
title="Trigger a tagging pass against the Box folder right now"
|
||||
onClick={onRunNow}
|
||||
>
|
||||
{starting ? "Starting…" : "Run now"}
|
||||
{starting === "tag" ? "Starting…" : "Run now"}
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
className="ghost"
|
||||
disabled={starting !== null}
|
||||
title="Walk Box and mirror existing marriottUsa metadata into the local DB. No Gemini, no Box writes."
|
||||
onClick={onBackfill}
|
||||
>
|
||||
{starting === "backfill" ? "Starting…" : "Backfill from Box"}
|
||||
</button>
|
||||
</form>
|
||||
{error && <p className="error">{error}</p>}
|
||||
|
|
@ -170,7 +205,7 @@ export function App() {
|
|||
{activeRun && (
|
||||
<section className="card">
|
||||
<h2>
|
||||
Active run
|
||||
{activeRunKind === "backfill" ? "Active backfill" : "Active run"}
|
||||
<span className="run-id"> {activeRun.slice(0, 8)}…</span>
|
||||
{activeRunState && (
|
||||
<span className={`pill state-${activeRunState}`}>{activeRunState}</span>
|
||||
|
|
|
|||
|
|
@ -79,6 +79,14 @@ export function startRun(getToken: GetTokenFn) {
|
|||
);
|
||||
}
|
||||
|
||||
/** POST `/backfill` to start a backfill run; resolves with the new run's descriptor. */
export function startBackfill(getToken: GetTokenFn) {
  type BackfillStarted = {
    run_id: string;
    state: string;
    kind: string;
    started_by: string;
  };
  const init = { method: "POST", body: JSON.stringify({}) };
  return req<BackfillStarted>(`/backfill`, getToken, init);
}
|
||||
|
||||
export function listRuns(getToken: GetTokenFn, limit = 20) {
|
||||
return req<{ runs: Run[] }>(`/runs?limit=${limit}`, getToken);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -66,6 +66,15 @@ button.run-now:hover:not(:disabled) {
|
|||
background: var(--accent);
|
||||
color: #000;
|
||||
}
|
||||
button.ghost {
|
||||
background: transparent;
|
||||
color: var(--text);
|
||||
border: 1px solid var(--line);
|
||||
}
|
||||
button.ghost:hover:not(:disabled) {
|
||||
border-color: var(--accent);
|
||||
color: var(--accent);
|
||||
}
|
||||
button.link {
|
||||
background: transparent;
|
||||
color: var(--accent);
|
||||
|
|
@ -208,6 +217,9 @@ input:focus {
|
|||
.event.status-success {
|
||||
border-left-color: var(--ok);
|
||||
}
|
||||
.event.status-backfilled {
|
||||
border-left-color: var(--accent);
|
||||
}
|
||||
.event.status-gemini_error,
|
||||
.event.status-validation_error,
|
||||
.event.status-metadata_write_error {
|
||||
|
|
@ -292,6 +304,10 @@ input:focus {
|
|||
background: var(--ok);
|
||||
color: #000;
|
||||
}
|
||||
.pill.status-backfilled {
|
||||
background: var(--accent);
|
||||
color: #000;
|
||||
}
|
||||
.pill.status-gemini_error,
|
||||
.pill.status-validation_error,
|
||||
.pill.status-metadata_write_error,
|
||||
|
|
|
|||
172
main.py
172
main.py
|
|
@ -596,8 +596,20 @@ def validate_and_clean_metadata(raw_metadata, template_schema):
|
|||
|
||||
# ── 9. Check Existing Metadata ────────────────────────────────────────────────
|
||||
|
||||
# Attribute names treated as Box-managed (not template fields) when a metadata
# response object is flattened into a dict.
# NOTE(review): assumes these are the attribute names the SDK gives the
# `$`-prefixed keys after `from_dict`; confirm against the installed SDK.
_BOX_META_SYSTEM_ATTRS = {
    "can_edit",
    "id",
    "parent",
    "scope",
    "template",
    "type",
    "type_version",
    "version",
}
|
||||
|
||||
|
||||
def check_existing_metadata(box_client, file_id):
|
||||
"""Check if file already has marriottUsa metadata. Returns True/False."""
|
||||
"""Check if file already has marriottUsa metadata. Returns True/False.
|
||||
|
||||
Kept for the backfill code path, which still asks Box. The per-pass skip
|
||||
check in the main loops now uses the local DB instead (see db.is_file_already_tagged).
|
||||
"""
|
||||
try:
|
||||
box_client.file_metadata.get_file_metadata_by_id(
|
||||
file_id=file_id,
|
||||
|
|
@ -611,6 +623,43 @@ def check_existing_metadata(box_client, file_id):
|
|||
raise
|
||||
|
||||
|
||||
def fetch_existing_metadata(box_client, file_id):
    """Read the file's marriottUsa metadata instance from Box as a flat dict.

    Returns:
        None when Box answers 404 for the metadata lookup. Otherwise a dict
        of the response object's instance attributes, excluding names that
        start with an underscore and names in ``_BOX_META_SYSTEM_ATTRS``.
        The dict may be empty.

    Raises:
        BoxAPIError: for any Box API failure other than a 404.
    """
    try:
        instance = box_client.file_metadata.get_file_metadata_by_id(
            file_id=file_id,
            scope=CreateFileMetadataByIdScope.ENTERPRISE,
            template_key=METADATA_TEMPLATE_KEY,
        )
    except BoxAPIError as err:
        if err.response_info.status_code != 404:
            raise
        return None

    # NOTE(review): relies on the SDK exposing template fields as plain
    # instance attributes of the response object; confirm for the SDK
    # version in use.
    return {
        attr: value
        for attr, value in vars(instance).items()
        if not attr.startswith("_") and attr not in _BOX_META_SYSTEM_ATTRS
    }
|
||||
|
||||
|
||||
def fetch_file_description(box_client, file_id):
    """Look up the Box description of a file.

    Returns:
        The description when it is a non-empty string; None when it is
        missing, empty, not a string, or when Box raises ``BoxAPIError``.
    """
    try:
        box_file = box_client.files.get_file_by_id(file_id=file_id, fields=["description"])
        text = getattr(box_file, "description", None)
        if isinstance(text, str) and text:
            return text
        return None
    except BoxAPIError:
        # Best-effort lookup: any Box API failure is reported as "no description".
        return None
|
||||
|
||||
|
||||
# ── 10. Write Metadata to Box ─────────────────────────────────────────────────
|
||||
|
||||
def write_metadata_to_box(box_client, file_id, metadata, file_name):
|
||||
|
|
@ -783,12 +832,13 @@ def _run_pass(run_id, db_conn):
|
|||
if folder_path:
|
||||
print(f" Folder: {folder_path}")
|
||||
|
||||
# Check if already tagged
|
||||
if SKIP_ALREADY_TAGGED:
|
||||
if check_existing_metadata(box_client, file_id):
|
||||
print(f" Already tagged — skipping")
|
||||
img_skipped += 1
|
||||
continue
|
||||
# Skip if the DB already has a success/backfilled row for this file.
|
||||
# If the DB is empty/lost, run the backfill flow first to repopulate
|
||||
# from Box — that avoids re-tagging files Box already has metadata for.
|
||||
if SKIP_ALREADY_TAGGED and db.is_file_already_tagged(db_conn, file_id):
|
||||
print(f" Already in DB — skipping")
|
||||
img_skipped += 1
|
||||
continue
|
||||
|
||||
# Download and resize
|
||||
result = download_and_resize_image(box_client, file_id, file_name)
|
||||
|
|
@ -907,12 +957,11 @@ def _run_pass(run_id, db_conn):
|
|||
if folder_path:
|
||||
print(f" Folder: {folder_path}")
|
||||
|
||||
# Check if already tagged
|
||||
if SKIP_ALREADY_TAGGED:
|
||||
if check_existing_metadata(box_client, file_id):
|
||||
print(f" Already tagged — skipping")
|
||||
vid_skipped += 1
|
||||
continue
|
||||
# Skip if the DB already has a success/backfilled row for this file.
|
||||
if SKIP_ALREADY_TAGGED and db.is_file_already_tagged(db_conn, file_id):
|
||||
print(f" Already in DB — skipping")
|
||||
vid_skipped += 1
|
||||
continue
|
||||
|
||||
# Download video proxy (480p MP4)
|
||||
result = download_video_proxy(box_client, file_id, file_name)
|
||||
|
|
@ -1034,5 +1083,102 @@ def _run_pass(run_id, db_conn):
|
|||
print("=" * 60)
|
||||
|
||||
|
||||
# ── 14. Backfill from Box ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _run_backfill(run_id, db_conn):
    """Mirror existing Box marriottUsa metadata into ``tagging_events``.

    Lists all media in the Box folder and, for each file that has no
    success/backfilled row yet, reads its metadata from Box. Files with
    metadata get one ``status='backfilled'`` row logged under ``run_id``;
    files without metadata are left for the normal tagging pass. This
    function makes no Gemini calls and no Box writes.

    Files that already have a success/backfilled row are skipped, so the
    backfill can be re-run after a partial pass.

    Args:
        run_id: Identifier stored on every logged row and printed in the banner.
        db_conn: DB connection used for the skip check and for logging rows.
    """
    rule = "=" * 60
    print(rule)
    print("Marriott Box Asset Tagger — BACKFILL")
    print(rule)
    print(f"Run ID: {run_id}")

    box_client = init_box_client()

    image_files, video_files = list_all_media(box_client)
    total = len(image_files) + len(video_files)
    if not total:
        print("No media files found. Exiting.")
        return

    inserted = no_metadata = already_in_db = errored = 0

    # Images first, then videos, each tagged with its media type.
    work_items = [
        (kind, info)
        for kind, files in (("image", image_files), ("video", video_files))
        for info in files
    ]

    for position, (media_type, file_info) in enumerate(work_items, start=1):
        file_id = file_info["id"]
        file_name = file_info["name"]
        folder_path = file_info.get("folder_path", "")

        print(f"\n[{position}/{total}] {media_type}: {file_name} (ID: {file_id})")
        if folder_path:
            print(f" Folder: {folder_path}")

        # Cheap local check first, so re-runs do not hit Box for known files.
        if db.is_file_already_tagged(db_conn, file_id):
            print(" Already in DB — skipping.")
            already_in_db += 1
            continue

        try:
            existing = fetch_existing_metadata(box_client, file_id)
        except BoxAPIError as e:
            # One unreadable file must not abort the whole backfill.
            print(f" ERROR reading metadata from Box: {e}")
            errored += 1
            continue

        if existing is None:
            print(" No marriottUsa metadata in Box — leaving for normal tagging pass.")
            no_metadata += 1
            continue

        description = fetch_file_description(box_client, file_id)
        print(f" Fields from Box: {list(existing) or '(none)'}")
        if description:
            ellipsis = "…" if len(description) > 80 else ""
            print(f" Description: {description[:80]}{ellipsis}")

        db.log_event(
            db_conn,
            run_id=run_id,
            file_id=file_id,
            file_name=file_name,
            folder_path=folder_path,
            media_type=media_type,
            gemini_model=GEMINI_MODEL,
            status="backfilled",
            prompt=None,
            raw_response=None,
            description=description,
            scenes=None,
            validated_metadata=existing or {},
            metadata_write_success=True,
            description_write_success=bool(description),
            scene_comment_write_success=None,
            error_message=None,
            duration_ms=None,
        )
        inserted += 1

    print("\n" + rule)
    print("BACKFILL SUMMARY")
    print(rule)
    print(f" Total files seen: {total}")
    print(f" Inserted into DB: {inserted}")
    print(f" Already in DB (skipped): {already_in_db}")
    print(f" No metadata in Box: {no_metadata}")
    print(f" Errors reading Box: {errored}")
    print(rule)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue