Add V2: multi-team social-reporting platform with manifest-gated linking

V2 lives entirely under v2/ and is built around three asks the team raised
about V1: per-video assets sometimes drifted onto the wrong trend, hashtag
scrapes returned junk that wasn't filterable per-client, and there was no
multi-user model behind Microsoft SSO.

Highlights:
- Stable TikTok numeric-id key for every per-video asset; URL form drift is
  logged loudly to drift_log.jsonl and never silently nulls assets. Stage 5
  manifest hard-gates Stage 6 if any selected video is missing any required
  asset; --drop-failing auto-backfills from the next-best recipe candidates.
- Per-brief engagement floor (min_likes / min_plays / min_stl_pct), applied
  at Apify scrape time and re-validated locally; spend_log.json records
  raw_returned vs kept_after_floor per scrape.
- Users + teams + memberships with owner/admin/editor/viewer roles; SSO
  upserts a user keyed on Azure oid, auto-creates a personal team, and a
  super-admin is bootstrapped via BOOTSTRAP_SUPER_ADMIN_EMAIL on first
  sign-in. Phase A integration test: 16/16 pass.
- 10-stage TS pipeline after brief intake (stage 0 brief → 1 seed → 2 scrape1 →
  3 select → 4 scrape2 → 5 validate → 6 analyse → 7 insights → 8 trends →
  9 qa → 10 build) wired through one CLI; each stage is idempotent and
  resumable from disk via .state sentinels.
  §4.5 rubrics shipped under prompts/ and loaded into Claude calls.
- React 18 + Vite + TS + Tailwind operator SPA: brief intake form,
  team management, super-admin user list, help/FAQ ported from V1.
- Separate Docker Compose project (name: social-reporting-v2, port 3457,
  Postgres 5437) with deploy/setup-v2.sh, deploy-v2.sh, rollback-to-v1.sh
  scripts that take over V1's /social-reports URL and let us roll back.

Verification: 62 unit tests pass (auth/session, ids extractor with full URL
fixture, engagement floor, recipes, manifest, linking-fix, MoM compare).
Live smoke run on a Dove brief: 1400 raw → 253 kept (82% culled) → 21
fully-bundled videos → 25 editorial trends across 8 brief-driven categories,
with drift=0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
DJP 2026-04-29 17:39:07 -04:00
parent 7a70283e5b
commit b89e8b511e
113 changed files with 12725 additions and 0 deletions

10
.gitignore vendored
View file

@ -1,9 +1,19 @@
node_modules/
dist/
.env
.env.bak
.env.local
*.bak
agents/social-listening/outputs/*.html
agents/social-listening/outputs/*.json
agents/social-listening/outputs/*.md
# V2 per-report on-disk artefacts (large; raw Apify dumps may carry API tokens)
briefs/
# V2 build outputs
v2/operator-app/dist/
v2/templates/dashboard_template/dist/
# Claude Code per-project state (memory + plans)
.claude/
*.log
.DS_Store
.idea/

1274
DEVELOPER_BRIEF_V2.md Normal file

File diff suppressed because it is too large Load diff

30
v2/.env.example Normal file
View file

@ -0,0 +1,30 @@
# ─── Anthropic & Apify ───
ANTHROPIC_API_KEY=
APIFY_TOKEN=
# ─── V2 Database (separate from V1) ───
# DB_V2_PORT is the host-side port published by docker-compose.v2.yml;
# inside the compose network Postgres still listens on 5432.
DB_V2_PORT=5437
DB_V2_PASSWORD=change-me-please
# Keep the password embedded here in sync with DB_V2_PASSWORD. docker-compose.v2.yml
# also sets DATABASE_URL for the app container, built from DB_V2_PASSWORD.
DATABASE_URL=postgresql://srv2_user:change-me-please@db-v2:5432/social_reporting_v2
# ─── V2 App ───
APP_V2_PORT=3457
NODE_ENV=development
# deploy/setup-v2.sh fills SESSION_SECRET with `openssl rand -hex 32` when it is left empty.
SESSION_SECRET=
ALLOWED_ORIGIN=
# ─── Auth ───
# Azure AD SSO (lifted from V1)
AZURE_TENANT_ID=
AZURE_CLIENT_ID=
# Emergency password fallback (off by default in prod)
ALLOW_PASSWORD_FALLBACK=false
DASH_USER=admin
DASH_PASS=
# Bootstrap: first SSO user with this email becomes super-admin
BOOTSTRAP_SUPER_ADMIN_EMAIL=
# ─── Compose-name guard (CLAUDE.md policy) ───
# Must match the top-level `name:` pinned in the V2 compose files.
COMPOSE_PROJECT_NAME=social-reporting-v2

7
v2/.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
node_modules/
dist/
*.log
.env
.env.local
operator-app/dist/
templates/dashboard_template/dist/

41
v2/Dockerfile.v2 Normal file
View file

@ -0,0 +1,41 @@
# V2 image: builds operator-app SPA, copies server + pipeline + templates, runs Node.
# Build context is the repository root (compose sets `context: ..`), hence the v2/ prefixes.

# ─── Stage 1: build the operator-app SPA ───
FROM node:20-slim AS ui-build
WORKDIR /build
# Manifests first so the dependency layer is cached until a package.json changes.
# The trailing * makes the lockfile optional.
# NOTE(review): with no lockfile, `npm install` resolves fresh versions on every build — confirm a lockfile is committed.
COPY v2/package.json v2/package-lock.json* ./
COPY v2/operator-app/package.json operator-app/package.json
COPY v2/templates/dashboard_template/package.json templates/dashboard_template/package.json
RUN npm install --include=dev --no-audit --no-fund
COPY v2/operator-app ./operator-app
COPY v2/tsconfig.base.json ./tsconfig.base.json
RUN npm run build --workspace operator-app

# ─── Stage 2: runtime image ───
FROM node:20-slim AS runtime
# ffmpeg for Stage 4 frame extraction
RUN apt-get update \
&& apt-get install -y --no-install-recommends ffmpeg ca-certificates \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
ENV NODE_ENV=production
COPY v2/package.json v2/package-lock.json* ./
# Production dependencies only.
# NOTE(review): the CMD below runs `npx tsx`, so tsx presumably has to be a regular
# dependency (not a devDependency) — confirm in v2/package.json.
RUN npm install --omit=dev --no-audit --no-fund
COPY v2/tsconfig.base.json v2/tsconfig.json ./
COPY v2/server ./server
COPY v2/pipeline ./pipeline
COPY v2/templates ./templates
COPY v2/db ./db
# UI build artifacts
COPY --from=ui-build /build/operator-app/dist ./operator-app/dist
# briefs/ is the mount point for pipeline outputs (compose binds ../briefs here).
# `|| true` tolerates useradd failing.
# NOTE(review): the base image presumably already has a uid-1000 user, in which case
# useradd is a no-op and USER 1000 resolves to that existing user — confirm.
RUN mkdir -p briefs && useradd -u 1000 -m -s /bin/bash node-v2 || true
RUN chown -R 1000:1000 /app
USER 1000
EXPOSE 3457
# TypeScript sources are executed directly through tsx; there is no separate server compile step.
CMD ["npx", "tsx", "server/index.ts"]

213
v2/db/init.sql Normal file
View file

@ -0,0 +1,213 @@
-- Social Reporting V2 — fresh schema
-- Coexists with V1 in a separate database (`social_reporting_v2`).
-- Forward-only migrations under v2/db/migrations/.
CREATE EXTENSION IF NOT EXISTS "pgcrypto"; -- gen_random_uuid()
CREATE EXTENSION IF NOT EXISTS "citext"; -- case-insensitive email
-- ─── Identity ───────────────────────────────────────────────────────────
-- One row per person who can sign in. SSO users are keyed on azure_oid;
-- email is CITEXT, so uniqueness and lookups ignore case.
CREATE TABLE IF NOT EXISTS users (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
azure_oid TEXT UNIQUE NOT NULL,
email CITEXT UNIQUE NOT NULL,
display_name TEXT NOT NULL,
is_super_admin BOOLEAN NOT NULL DEFAULT FALSE,
-- Nullable; presumably only populated for the emergency password fallback — TODO confirm.
password_hash TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
last_login_at TIMESTAMPTZ
);
-- NOTE(review): PostgreSQL already creates a unique index for each UNIQUE column above,
-- so these two explicit indexes look redundant — confirm before dropping them.
CREATE INDEX IF NOT EXISTS idx_users_azure_oid ON users(azure_oid);
CREATE INDEX IF NOT EXISTS idx_users_email ON users(email);
-- Teams own briefs and reports (both carry a team_id). is_personal flags a personal team.
CREATE TABLE IF NOT EXISTS teams (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
slug TEXT UNIQUE NOT NULL,
name TEXT NOT NULL,
is_personal BOOLEAN NOT NULL DEFAULT FALSE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- CREATE TYPE has no IF NOT EXISTS form, so the duplicate_object handler makes a re-run a no-op.
DO $$ BEGIN
CREATE TYPE team_role AS ENUM ('owner','admin','editor','viewer');
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
-- Membership of a user in a team, at most one role per (team, user) pair.
-- Rows disappear with either the team or the user (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS team_memberships (
team_id UUID NOT NULL REFERENCES teams(id) ON DELETE CASCADE,
user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
role team_role NOT NULL,
-- No ON DELETE action: a user who added others cannot be deleted while those rows exist.
added_by UUID REFERENCES users(id),
added_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (team_id, user_id)
);
-- Reverse lookup ("which teams is this user in"); the primary key only covers team_id-first access.
CREATE INDEX IF NOT EXISTS idx_memberships_user ON team_memberships(user_id);
-- ─── Briefs / Reports ───────────────────────────────────────────────────
-- A brief is the client request that reports are generated from.
-- Slugs are unique per team, not globally. A team or owner that still has
-- briefs cannot be deleted (ON DELETE RESTRICT).
CREATE TABLE IF NOT EXISTS briefs (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
team_id UUID NOT NULL REFERENCES teams(id) ON DELETE RESTRICT,
owner_id UUID NOT NULL REFERENCES users(id) ON DELETE RESTRICT,
slug TEXT NOT NULL,
client_name TEXT NOT NULL,
category TEXT NOT NULL,
business_question TEXT NOT NULL,
date_window_days INTEGER NOT NULL DEFAULT 30,
budget_usd NUMERIC(10,2) NOT NULL,
platforms TEXT[] NOT NULL DEFAULT ARRAY['tiktok'],
positioning JSONB,
kpis JSONB,
context_vision TEXT,
-- Hashtag engagement floor (the V2 quality knob)
min_likes INTEGER NOT NULL DEFAULT 1000,
min_plays INTEGER NOT NULL DEFAULT 10000,
min_stl_pct NUMERIC(5,2) NOT NULL DEFAULT 0,
-- Plain column here; its foreign key to reports(id) is added further down, once reports exists.
prior_report_id UUID,
-- Stored as JSONB despite the name.
brief_yaml JSONB NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (team_id, slug)
);
CREATE INDEX IF NOT EXISTS idx_briefs_team ON briefs(team_id);
CREATE INDEX IF NOT EXISTS idx_briefs_owner ON briefs(owner_id);
-- Lifecycle of a report run: 'pending', one value per pipeline stage, then a terminal
-- 'completed' or 'failed'.
DO $$ BEGIN
CREATE TYPE report_status AS ENUM (
'pending','seeds','pass1','select','pass2','validate',
'analyse','insights','trends','qa','build','completed','failed'
);
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
-- One pipeline run for a brief. Deleting the brief deletes its reports.
-- NOTE(review): team_id duplicates briefs.team_id and nothing here forces the two to agree — confirm the writer keeps them in sync.
CREATE TABLE IF NOT EXISTS reports (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
brief_id UUID NOT NULL REFERENCES briefs(id) ON DELETE CASCADE,
team_id UUID NOT NULL REFERENCES teams(id),
triggered_by UUID NOT NULL REFERENCES users(id),
status report_status NOT NULL DEFAULT 'pending',
current_stage INTEGER NOT NULL DEFAULT 0,
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
finished_at TIMESTAMPTZ,
apify_cost_usd NUMERIC(10,4) NOT NULL DEFAULT 0,
claude_cost_usd NUMERIC(10,4) NOT NULL DEFAULT 0,
total_cost_usd NUMERIC(10,4) NOT NULL DEFAULT 0,
-- Filesystem root of this run's artefacts (presumably under briefs/ — TODO confirm).
fs_root TEXT NOT NULL,
manifest_passed_at TIMESTAMPTZ,
error_message TEXT
);
-- Both indexes serve "latest runs first" listings, per team and per brief.
CREATE INDEX IF NOT EXISTS idx_reports_team ON reports(team_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_reports_brief ON reports(brief_id, started_at DESC);
-- briefs.prior_report_id → reports.id. Added after both tables exist because the two
-- tables reference each other (reports.brief_id → briefs.id).
-- ADD CONSTRAINT has no IF NOT EXISTS form, so the statement is wrapped in a DO block
-- that ignores duplicate_object. This keeps the script re-runnable, in line with the
-- IF NOT EXISTS / duplicate_object guards used by every other statement in this file.
DO $$ BEGIN
  ALTER TABLE briefs
    ADD CONSTRAINT briefs_prior_report_fk
    FOREIGN KEY (prior_report_id) REFERENCES reports(id) ON DELETE SET NULL
    DEFERRABLE INITIALLY DEFERRED;
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
-- Append-only ledger of individual spend events for a report; removed with the report.
-- source is restricted to 'claude' or 'apify' by the CHECK constraint.
CREATE TABLE IF NOT EXISTS cost_events (
id BIGSERIAL PRIMARY KEY,
report_id UUID NOT NULL REFERENCES reports(id) ON DELETE CASCADE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
stage INTEGER NOT NULL,
stage_name TEXT NOT NULL,
source TEXT NOT NULL CHECK (source IN ('claude','apify')),
label TEXT NOT NULL,
model TEXT,
input_tokens INTEGER NOT NULL DEFAULT 0,
output_tokens INTEGER NOT NULL DEFAULT 0,
-- Six decimal places here versus four on the reports.*_cost_usd totals.
cost_usd NUMERIC(10,6) NOT NULL DEFAULT 0,
metadata JSONB
);
CREATE INDEX IF NOT EXISTS idx_cost_report ON cost_events(report_id, created_at);
-- ─── Videos / Assets / Manifest (THE LINKING FIX) ───────────────────────
-- TikTok numeric id is the canonical key. URL is presentation, not key.
-- Global video catalogue: the primary key is the platform's own id as TEXT, with no
-- report_id, so one row is shared by every report that references the video.
-- NOTE(review): the engagement columns therefore hold a single snapshot per video — confirm how re-scrapes update them.
CREATE TABLE IF NOT EXISTS videos (
id TEXT PRIMARY KEY,
platform TEXT NOT NULL DEFAULT 'tiktok',
handle TEXT NOT NULL,
url_canonical TEXT NOT NULL,
caption TEXT,
hashtags TEXT[],
plays BIGINT,
likes BIGINT,
saves BIGINT,
comments_count INTEGER,
shares BIGINT,
stl_pct NUMERIC(5,2),
duration_sec INTEGER,
posted_at TIMESTAMPTZ,
cover_url TEXT,
first_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_videos_handle ON videos(handle);
CREATE INDEX IF NOT EXISTS idx_videos_posted ON videos(posted_at DESC);
-- Kinds of per-video artefact the pipeline tracks.
DO $$ BEGIN
CREATE TYPE asset_kind AS ENUM ('metadata','cover','transcript','comments','frames','bundle');
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
-- Fetch state of a single artefact.
DO $$ BEGIN
CREATE TYPE asset_status AS ENUM ('pending','ok','failed','dropped');
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
-- One row per (report, video, asset kind): the state of that artefact for that run.
-- Rows are removed with the report. video_id has no ON DELETE action, so a video
-- cannot be deleted while any asset row still references it.
CREATE TABLE IF NOT EXISTS video_assets (
report_id UUID NOT NULL REFERENCES reports(id) ON DELETE CASCADE,
video_id TEXT NOT NULL REFERENCES videos(id),
asset_kind asset_kind NOT NULL,
status asset_status NOT NULL DEFAULT 'pending',
fs_path TEXT,
byte_size BIGINT,
error TEXT,
source_url TEXT,
attempt_count INTEGER NOT NULL DEFAULT 0,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (report_id, video_id, asset_kind)
);
-- Supports "all assets of this report in status X" lookups.
CREATE INDEX IF NOT EXISTS idx_assets_status ON video_assets(report_id, status);
-- Result of the asset-completeness check for a report. report_id is the primary key,
-- so there is at most one row per report.
CREATE TABLE IF NOT EXISTS manifest_checks (
report_id UUID PRIMARY KEY REFERENCES reports(id) ON DELETE CASCADE,
selected_count INTEGER NOT NULL,
-- Per-kind counters; presumably the number of selected videos whose asset of that kind is 'ok' — TODO confirm against the writer.
metadata_ok INTEGER NOT NULL DEFAULT 0,
transcript_ok INTEGER NOT NULL DEFAULT 0,
comments_ok INTEGER NOT NULL DEFAULT 0,
frames_ok INTEGER NOT NULL DEFAULT 0,
cover_ok INTEGER NOT NULL DEFAULT 0,
all_ok_count INTEGER NOT NULL DEFAULT 0,
coverage_pct NUMERIC(5,2) NOT NULL DEFAULT 0,
passed BOOLEAN NOT NULL DEFAULT FALSE,
-- JSON array, empty by default; element shape is not defined in this schema.
missing JSONB NOT NULL DEFAULT '[]'::jsonb,
built_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Videos chosen for a report, with the recipe that picked each one.
-- is_backfill flags rows added as backfill.
CREATE TABLE IF NOT EXISTS selected_videos (
report_id UUID NOT NULL REFERENCES reports(id) ON DELETE CASCADE,
video_id TEXT NOT NULL REFERENCES videos(id),
rank_score NUMERIC(10,4),
recipe_label TEXT NOT NULL,
is_backfill BOOLEAN NOT NULL DEFAULT FALSE,
PRIMARY KEY (report_id, video_id)
);
-- ─── Trends (junction table — the only place trend↔video lives) ─────────
-- Editorial trends produced for a report; slugs are unique within a report.
-- relevance_tier is restricted to 'core' or 'peripheral' by the CHECK constraint.
CREATE TABLE IF NOT EXISTS trends (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
report_id UUID NOT NULL REFERENCES reports(id) ON DELETE CASCADE,
slug TEXT NOT NULL,
name TEXT NOT NULL,
category TEXT NOT NULL,
relevance_tier TEXT NOT NULL CHECK (relevance_tier IN ('core','peripheral')),
velocity NUMERIC(6,3),
description TEXT,
body_jsonb JSONB NOT NULL,
UNIQUE (report_id, slug)
);
CREATE INDEX IF NOT EXISTS idx_trends_report ON trends(report_id);
-- Junction rows linking a trend to its videos by video id; removed with the trend.
-- NOTE(review): nothing here requires the video to be in selected_videos for the trend's report — confirm that is enforced in application code.
CREATE TABLE IF NOT EXISTS trend_videos (
trend_id UUID NOT NULL REFERENCES trends(id) ON DELETE CASCADE,
video_id TEXT NOT NULL REFERENCES videos(id),
rank INTEGER,
PRIMARY KEY (trend_id, video_id)
);
-- Reverse lookup ("which trends contain this video").
CREATE INDEX IF NOT EXISTS idx_trend_videos_video ON trend_videos(video_id);

View file

@ -0,0 +1,36 @@
# Social Reporting V2 — Apache config
# DROP-IN REPLACEMENT for V1's deploy/apache-social-reports.conf.
# Same external path (/social-reports) — different upstream port (3457 instead of 3456).
# Cutover: replace the V1 conf with this one, reload Apache.
#
# Required modules: sudo a2enmod proxy proxy_http headers rewrite
#
# This file defines no static-file directives: every request under /social-reports/
# is proxied to the Node container, which serves the operator-app SPA build itself.
#
# Rule order matters: the specific prefixes (api, events, per-report dashboards) are
# declared before the /social-reports/ catch-all at the bottom so that they win.
# ─── Proxy SPA + API + SSE to Node backend at :3457 ───
ProxyPreserveHost On
# 600 s timeout for proxied requests.
ProxyTimeout 600
ProxyPass /social-reports/api/ http://127.0.0.1:3457/api/
ProxyPassReverse /social-reports/api/ http://127.0.0.1:3457/api/
# SSE endpoint: live progress feed during pipeline runs
ProxyPass /social-reports/events http://127.0.0.1:3457/events
ProxyPassReverse /social-reports/events http://127.0.0.1:3457/events
# NOTE(review): proxy-sendchunked / proxy-sendcl presumably affect how request bodies are
# forwarded rather than response streaming, X-Accel-Buffering looks like an nginx
# convention, and NONE is not obviously a registered output filter — confirm which of
# these lines are actually needed for SSE.
<Location /social-reports/events>
SetEnv proxy-initial-not-pooled 1
SetEnv proxy-sendchunked 1
SetEnv proxy-sendcl 0
Header set Cache-Control "no-cache"
Header set X-Accel-Buffering "no"
SetOutputFilter NONE
</Location>
# Per-report dashboard static assets (built per brief at Stage 10)
# $1 is the report path segment, $2 the asset path below dashboard/.
ProxyPassMatch ^/social-reports/reports/([^/]+)/dashboard/(.*)$ http://127.0.0.1:3457/api/reports/$1/dashboard/$2
ProxyPassReverse /social-reports/reports/ http://127.0.0.1:3457/api/reports/
# Catch-all for the SPA (everything else falls through to Node, which serves the operator-app)
ProxyPass /social-reports/ http://127.0.0.1:3457/
ProxyPassReverse /social-reports/ http://127.0.0.1:3457/

29
v2/deploy/deploy-v2.sh Executable file
View file

@ -0,0 +1,29 @@
#!/bin/bash
set -euo pipefail
# Routine V2 redeploy: pulls main, rebuilds the compose stack, waits for /api/health
# to answer, then reloads Apache. Run from anywhere on the server.
BACKEND_DIR_V2="/opt/social-reporting-v2"
GREEN='\033[0;32m'; RED='\033[0;31m'; NC='\033[0m'
# log prints a green-prefixed progress line; err prints a red-prefixed line and exits 1.
log() { echo -e "${GREEN}[+]${NC} $1"; }
err() { echo -e "${RED}[x]${NC} $1"; exit 1; }
cd "$BACKEND_DIR_V2" || err "V2 dir not found: $BACKEND_DIR_V2"
log "Pulling latest..."
git pull origin main
log "Rebuilding V2 stack..."
# Base file plus production overrides; --build rebuilds the app image from the pulled source.
docker compose -f v2/docker-compose.v2.yml -f v2/docker-compose.v2.prod.yml up -d --build
log "Waiting for backend..."
# Poll the health endpoint up to 30 times, 2 s apart; the 30th failure aborts the deploy.
for i in {1..30}; do
curl -sf http://127.0.0.1:3457/api/health >/dev/null 2>&1 && { log "Healthy"; break; }
[ "$i" -eq 30 ] && err "Backend not responding — docker compose -p social-reporting-v2 logs app-v2"
sleep 2
done
log "Reloading Apache..."
sudo systemctl reload apache2
echo -e "${GREEN}Deploy complete.${NC}"

43
v2/deploy/rollback-to-v1.sh Executable file
View file

@ -0,0 +1,43 @@
#!/bin/bash
set -euo pipefail
# Roll back from V2 → V1 at the /social-reports URL.
# V1 source must still be present at /opt/social-reporting (we don't delete it during cutover).
#
# Order of operations: pre-flight checks → confirm → stop V2 → restore the V1 Apache
# conf → start V1 → wait for V1 health → reload Apache.

BACKEND_DIR_V1="/opt/social-reporting"
BACKEND_DIR_V2="/opt/social-reporting-v2"
APACHE_CONF="/etc/apache2/conf-available/social-reports.conf"
V1_APACHE_CONF="$BACKEND_DIR_V1/deploy/apache-social-reports.conf"

GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'
# log/warn print a coloured progress line; err prints in red and exits 1.
log() { echo -e "${GREEN}[+]${NC} $1"; }
warn() { echo -e "${YELLOW}[!]${NC} $1"; }
err() { echo -e "${RED}[x]${NC} $1"; exit 1; }

# Pre-flight: verify everything the rollback needs BEFORE stopping V2, so a missing
# file cannot leave the URL with neither stack running.
[[ -d "$BACKEND_DIR_V1/.git" ]] || err "V1 source not found at $BACKEND_DIR_V1 — cannot roll back"
[[ -f "$V1_APACHE_CONF" ]] || err "V1 Apache conf not found at $V1_APACHE_CONF — cannot roll back"

warn "About to roll back /social-reports from V2 → V1."
read -r -p "Proceed? [y/N] " ans
[[ "$ans" != "y" && "$ans" != "Y" ]] && err "Aborted"

log "Stopping V2 stack..."
# A missing V2 checkout must not block the rollback: under `set -e` a bare `cd` into a
# non-existent directory would abort the script here without any message.
if [[ -d "$BACKEND_DIR_V2" ]]; then
  cd "$BACKEND_DIR_V2"
  docker compose -p social-reporting-v2 down || warn "V2 was not running"
else
  warn "V2 dir not found at $BACKEND_DIR_V2 — skipping V2 shutdown"
fi

log "Restoring V1 Apache conf..."
sudo cp "$V1_APACHE_CONF" "$APACHE_CONF"
sudo apache2ctl configtest || err "Apache config test failed"

log "Starting V1 stack..."
cd "$BACKEND_DIR_V1"
docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d

log "Waiting for V1..."
# Poll V1's status endpoint up to 20 times, 2 s apart; the 20th failure aborts.
for i in {1..20}; do
  curl -sf http://127.0.0.1:3456/status >/dev/null 2>&1 && { log "V1 healthy"; break; }
  [ "$i" -eq 20 ] && err "V1 not responding — docker compose logs social-listening"
  sleep 2
done

# Apache only starts routing to V1 once V1 has answered its health check.
sudo systemctl reload apache2
echo -e "${GREEN}Rolled back to V1.${NC}"

91
v2/deploy/setup-v2.sh Executable file
View file

@ -0,0 +1,91 @@
#!/bin/bash
set -euo pipefail
# ═══════════════════════════════════════════════════════
# Social Reporting V2 — Server Setup (one-time)
# Target: optical-dev.oliver.solutions
# Cuts over from V1 at the same URL. V1 source kept on disk for rollback.
#
# Steps: clone/update the repo → create v2/.env from the template if missing →
# confirm → stop V1 → install the V2 Apache conf → build & start V2 →
# wait for V2 health → reload Apache.
# Requires REPO_URL in the environment.
# ═══════════════════════════════════════════════════════

REPO_URL="${REPO_URL:-}"
BACKEND_DIR_V2="/opt/social-reporting-v2"
BACKEND_DIR_V1="/opt/social-reporting"
APACHE_CONF="/etc/apache2/conf-available/social-reports.conf"

GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'
# log/warn print a coloured progress line; err prints in red and exits 1.
log() { echo -e "${GREEN}[+]${NC} $1"; }
warn() { echo -e "${YELLOW}[!]${NC} $1"; }
err() { echo -e "${RED}[x]${NC} $1"; exit 1; }

[[ -z "$REPO_URL" ]] && err "REPO_URL not set"
command -v docker >/dev/null || err "Docker not installed"
command -v apache2ctl >/dev/null || err "Apache not installed"

# ─── Clone or pull V2 source ───
if [[ -d "$BACKEND_DIR_V2/.git" ]]; then
  log "Updating V2 repo at $BACKEND_DIR_V2..."
  cd "$BACKEND_DIR_V2" && git remote set-url origin "$REPO_URL" && git pull origin main
else
  log "Cloning V2 repo to $BACKEND_DIR_V2..."
  sudo mkdir -p "$BACKEND_DIR_V2"
  # Use the caller's real primary group: a group named after the user does not exist
  # on every host, and chown would then fail and abort the script under `set -e`.
  sudo chown "$(id -un):$(id -gn)" "$BACKEND_DIR_V2"
  git clone "$REPO_URL" "$BACKEND_DIR_V2"
fi
cd "$BACKEND_DIR_V2"

# ─── .env template ───
if [[ ! -f "$BACKEND_DIR_V2/v2/.env" ]]; then
  warn "v2/.env not found — copying template"
  cp v2/.env.example v2/.env
  # Only the still-empty SESSION_SECRET= line is filled in; the value is hex, so it is
  # safe inside the sed replacement.
  SS=$(openssl rand -hex 32)
  sed -i "s/^SESSION_SECRET=$/SESSION_SECRET=${SS}/" v2/.env
  warn "Edit $BACKEND_DIR_V2/v2/.env: APIFY_TOKEN, ANTHROPIC_API_KEY, AZURE_*, BOOTSTRAP_SUPER_ADMIN_EMAIL"
fi

# ─── Cutover from V1 (graceful) ───
warn "About to cut over the /social-reports URL from V1 → V2."
# The rollback script lives under v2/deploy/, not deploy/; print the path that exists.
warn "V1 source remains at $BACKEND_DIR_V1 (untouched). Rollback: $BACKEND_DIR_V2/v2/deploy/rollback-to-v1.sh"
read -r -p "Proceed? [y/N] " ans
[[ "$ans" != "y" && "$ans" != "Y" ]] && err "Aborted"

if [[ -d "$BACKEND_DIR_V1" ]]; then
  log "Stopping V1 stack..."
  cd "$BACKEND_DIR_V1"
  docker compose -p social-listening down || warn "V1 was not running"
  cd "$BACKEND_DIR_V2"
fi

# ─── Apache: swap conf to V2 ───
log "Backing up old Apache conf and installing V2..."
# Timestamped backup so repeated runs never overwrite an earlier copy.
[[ -f "$APACHE_CONF" ]] && sudo cp "$APACHE_CONF" "${APACHE_CONF}.v1.bak.$(date +%s)"
sudo cp "$BACKEND_DIR_V2/v2/deploy/apache-social-reports-v2.conf" "$APACHE_CONF"
# Enable only the modules that are not loaded yet.
for mod in proxy proxy_http headers rewrite; do
  apache2ctl -M 2>/dev/null | grep -q "${mod}_module" || sudo a2enmod "$mod"
done
sudo a2enconf social-reports >/dev/null 2>&1 || true
sudo apache2ctl configtest || err "Apache config test failed"

# ─── Build & start V2 ───
log "Building & starting V2 stack..."
cd "$BACKEND_DIR_V2"
docker compose -f v2/docker-compose.v2.yml -f v2/docker-compose.v2.prod.yml up -d --build

log "Waiting for V2 backend (port 3457)..."
# Poll the health endpoint up to 30 times, 2 s apart; the 30th failure aborts.
for i in {1..30}; do
  curl -sf http://127.0.0.1:3457/api/health >/dev/null 2>&1 && { log "V2 healthy"; break; }
  [ "$i" -eq 30 ] && err "V2 not responding — check: docker compose -p social-reporting-v2 logs app-v2"
  sleep 2
done

log "Reloading Apache..."
sudo systemctl reload apache2

echo ""
echo "════════════════════════════════════════════════════"
echo -e " ${GREEN}V2 deployed!${NC}"
echo " URL: https://optical-dev.oliver.solutions/social-reports/"
echo " Backend: http://127.0.0.1:3457 (Docker)"
echo " V2 dir: $BACKEND_DIR_V2"
echo " V1 dir: $BACKEND_DIR_V1 (kept for rollback)"
echo " Rollback: bash $BACKEND_DIR_V2/v2/deploy/rollback-to-v1.sh"
echo "════════════════════════════════════════════════════"

View file

@ -0,0 +1,14 @@
# Production overrides for V2.
# Use: docker compose -f docker-compose.v2.yml -f docker-compose.v2.prod.yml up -d
# Layered on top of docker-compose.v2.yml: adds restart policies and pins the app to
# production settings.
name: social-reporting-v2

services:
  db-v2:
    restart: unless-stopped

  app-v2:
    restart: unless-stopped
    environment:
      - NODE_ENV=production
      # Required in production: compose refuses to start the stack when SESSION_SECRET
      # is unset or empty, instead of silently substituting a blank secret.
      - SESSION_SECRET=${SESSION_SECRET:?SESSION_SECRET must be set in production}
      - ALLOWED_ORIGIN=${ALLOWED_ORIGIN}

55
v2/docker-compose.v2.yml Normal file
View file

@ -0,0 +1,55 @@
# Per CLAUDE.md compose-name policy: every compose file MUST pin a unique top-level `name:`.
# This keeps V2 from colliding with V1 (project name `social-listening`) on shared hosts.
name: social-reporting-v2

services:
  # Postgres for V2 only. init.sql is mounted into the image's initdb directory.
  db-v2:
    image: postgres:16-alpine
    ports:
      # Published on loopback only, like the app port below: the database ships with a
      # default password and should not be reachable from other hosts.
      - "127.0.0.1:${DB_V2_PORT:-5437}:5432"
    environment:
      POSTGRES_DB: social_reporting_v2
      POSTGRES_USER: srv2_user
      POSTGRES_PASSWORD: ${DB_V2_PASSWORD:-change-me-please}
    volumes:
      - pgdata-v2:/var/lib/postgresql/data
      - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U srv2_user -d social_reporting_v2"]
      interval: 3s
      timeout: 3s
      retries: 10

  # Node server (API + operator-app SPA + pipeline). Built from the repository root so
  # the Dockerfile can COPY v2/... paths.
  app-v2:
    build:
      context: ..
      dockerfile: v2/Dockerfile.v2
    ports:
      # Loopback only; external traffic arrives through the Apache reverse proxy.
      - "127.0.0.1:${APP_V2_PORT:-3457}:3457"
    env_file:
      - .env
    depends_on:
      db-v2:
        condition: service_healthy
    volumes:
      # Pipeline outputs land here; shared with the host so we can inspect/back up.
      - ../briefs:/app/briefs
    environment:
      - PORT=3457
      - NODE_ENV=${NODE_ENV:-development}
      # Built from DB_V2_PASSWORD so the app and db-v2 always agree on the password.
      - DATABASE_URL=postgresql://srv2_user:${DB_V2_PASSWORD:-change-me-please}@db-v2:5432/social_reporting_v2
      - SESSION_SECRET=${SESSION_SECRET:-}
      - ALLOWED_ORIGIN=${ALLOWED_ORIGIN:-}
      - AZURE_TENANT_ID=${AZURE_TENANT_ID:-}
      - AZURE_CLIENT_ID=${AZURE_CLIENT_ID:-}
      - ALLOW_PASSWORD_FALLBACK=${ALLOW_PASSWORD_FALLBACK:-false}
      - DASH_USER=${DASH_USER:-admin}
      - DASH_PASS=${DASH_PASS:-}
      - BOOTSTRAP_SUPER_ADMIN_EMAIL=${BOOTSTRAP_SUPER_ADMIN_EMAIL:-}
      - APIFY_TOKEN=${APIFY_TOKEN:-}
      - APIFY_LIVE_APPROVED=${APIFY_LIVE_APPROVED:-false}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - COMPOSE_PROJECT_NAME=social-reporting-v2

volumes:
  pgdata-v2:

View file

@ -0,0 +1,2 @@
VITE_AZURE_TENANT_ID=
VITE_AZURE_CLIENT_ID=

6
v2/operator-app/.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
node_modules
dist
.env
.env.local
*.log
.vite

View file

@ -0,0 +1,18 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Social Listening V2</title>
<link rel="preconnect" href="https://fonts.googleapis.com" />
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
<link
href="https://fonts.googleapis.com/css2?family=Montserrat:wght@300;400;500;600;700&display=swap"
rel="stylesheet"
/>
</head>
<body>
<div id="root"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>

View file

@ -0,0 +1,29 @@
{
"name": "operator-app",
"version": "0.1.0",
"private": true,
"type": "module",
"scripts": {
"dev": "vite",
"build": "vite build",
"preview": "vite preview",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-router-dom": "^6.26.0",
"@tanstack/react-query": "^5.51.0",
"zustand": "^4.5.0"
},
"devDependencies": {
"vite": "^5.4.0",
"@vitejs/plugin-react": "^4.3.0",
"typescript": "^5.4.0",
"tailwindcss": "^3.4.0",
"postcss": "^8.4.0",
"autoprefixer": "^10.4.0",
"@types/react": "^18.3.0",
"@types/react-dom": "^18.3.0"
}
}

View file

@ -0,0 +1,6 @@
// PostCSS configuration: enables the tailwindcss and autoprefixer plugins, each with
// its default (empty) options.
export default {
plugins: {
tailwindcss: {},
autoprefixer: {},
},
};

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,38 @@
import { Routes, Route } from 'react-router-dom';
import ProtectedRoute from './auth/ProtectedRoute';
import Shell from './components/Shell';
import Login from './routes/login';
import Home from './routes/home';
import BriefsList from './routes/briefs/list';
import BriefNew from './routes/briefs/new';
import BriefDetail from './routes/briefs/detail';
import ReportDetail from './routes/reports/detail';
import TeamsList from './routes/teams/list';
import TeamDetail from './routes/teams/detail';
import AdminUsers from './routes/admin/users';
import Help from './routes/help';
/** Pages rendered inside the authenticated shell, keyed by their route path. */
const shellPages = [
  { path: '/', Page: Home },
  { path: '/briefs', Page: BriefsList },
  { path: '/briefs/new', Page: BriefNew },
  { path: '/briefs/:id', Page: BriefDetail },
  { path: '/reports/:id', Page: ReportDetail },
  { path: '/teams', Page: TeamsList },
  { path: '/teams/:id', Page: TeamDetail },
  { path: '/admin/users', Page: AdminUsers },
  { path: '/help', Page: Help },
];

/**
 * Top-level route table. `/login` is public; every other page is nested under a
 * pathless layout route that wraps the Shell in ProtectedRoute.
 */
export default function App() {
  const guardedShell = (
    <ProtectedRoute>
      <Shell />
    </ProtectedRoute>
  );

  return (
    <Routes>
      <Route path="/login" element={<Login />} />
      <Route element={guardedShell}>
        {shellPages.map(({ path, Page }) => (
          <Route key={path} path={path} element={<Page />} />
        ))}
      </Route>
    </Routes>
  );
}

View file

@ -0,0 +1,94 @@
import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
import { fetcher } from './client';
/**
 * A brief as returned by the briefs API. Field names are snake_case and match the
 * columns of the `briefs` table.
 */
export type BriefSummary = {
id: string;
team_id: string;
owner_id: string;
slug: string;
client_name: string;
category: string;
business_question: string;
date_window_days: number;
// NOTE(review): budget_usd and min_stl_pct are NUMERIC columns in the database; confirm
// the API serialises them as JSON numbers rather than strings.
budget_usd: number;
platforms: string[];
positioning: { positioning?: string; brand?: BriefBrand } | null;
kpis: string[] | null;
context_vision: string | null;
min_likes: number;
min_plays: number;
min_stl_pct: number;
prior_report_id: string | null;
created_at: string;
};
/** The client's own brand. */
export type BriefBrand = { name: string; handle: string; positioning?: string };
/** A competitor brand listed on the brief. */
export type BriefCompetitor = { name: string; handle: string };
/** Target-audience description captured by the brief. */
export type BriefAudience = {
primary: string;
secondary?: string;
age_range: string;
gender: string;
interests: string[];
};
/** Payload sent by `useCreateBrief` as the JSON body of `POST /briefs`. */
export type BriefCreateInput = {
client_name: string;
category: string;
brand: BriefBrand;
competitors: BriefCompetitor[];
audience: BriefAudience;
geo: string;
language: string;
business_question: string;
kpis: string[];
budget_usd: number;
date_window_days: number;
// Only 'tiktok' is accepted at the type level.
platforms: ('tiktok')[];
context_vision?: string;
prior_report_id?: string | null;
// Engagement floor for the brief.
min_likes: number;
min_plays: number;
min_stl_pct: number;
};
/** One validation problem: `path` locates the offending field, `message` describes it. */
export type BriefIssue = { path: (string | number)[]; message: string; code?: string };
/** Query hook for the brief list (`GET /briefs`), cached under the `['briefs']` key. */
export function useBriefs() {
  const loadBriefs = () => fetcher<{ briefs: BriefSummary[] }>('/briefs');
  return useQuery<{ briefs: BriefSummary[] }>({
    queryKey: ['briefs'],
    queryFn: loadBriefs,
  });
}
/**
 * Query hook for a single brief (`GET /briefs/:id`), cached under `['brief', id]`.
 * The query stays disabled while `id` is undefined or empty.
 */
export function useBrief(id: string | undefined) {
  const hasId = Boolean(id);
  const loadBrief = () => fetcher<{ brief: BriefSummary }>(`/briefs/${id}`);
  return useQuery<{ brief: BriefSummary }>({
    queryKey: ['brief', id],
    queryFn: loadBrief,
    enabled: hasId,
  });
}
/**
 * Mutation hook that creates a brief (`POST /briefs` with the input as JSON) and
 * invalidates the `['briefs']` list query on success.
 */
export function useCreateBrief() {
  const queryClient = useQueryClient();

  const createBrief = (input: BriefCreateInput) =>
    fetcher<{ brief: BriefSummary }>('/briefs', {
      method: 'POST',
      body: JSON.stringify(input),
    });

  const refreshBriefList = () => {
    queryClient.invalidateQueries({ queryKey: ['briefs'] });
  };

  return useMutation<{ brief: BriefSummary }, Error, BriefCreateInput>({
    mutationFn: createBrief,
    onSuccess: refreshBriefList,
  });
}
/**
 * Mutation hook that deletes a brief by id (`DELETE /briefs/:id`).
 *
 * On success it drops the cached `['brief', id]` detail entry, so a deleted brief can
 * no longer be rendered from cache, and invalidates the `['briefs']` list query.
 */
export function useDeleteBrief() {
  const qc = useQueryClient();
  return useMutation<void, Error, string>({
    mutationFn: (id) => fetcher<void>(`/briefs/${id}`, { method: 'DELETE' }),
    // The second onSuccess argument is the mutation variable, i.e. the deleted id.
    onSuccess: (_data, id) => {
      qc.removeQueries({ queryKey: ['brief', id] });
      qc.invalidateQueries({ queryKey: ['briefs'] });
    },
  });
}

View file

@ -0,0 +1,46 @@
import { QueryClient } from '@tanstack/react-query';
/** One validation problem reported by the API: `path` locates the field, `message` describes it. */
export type ApiIssue = { path: (string | number)[]; message: string; code?: string };

/** Error thrown by {@link fetcher} for every non-2xx response. */
export class ApiError extends Error {
  /** HTTP status code of the failed response. */
  status: number;
  /** Validation issues from the response body, when the server sent an `issues` array. */
  issues?: ApiIssue[];

  constructor(status: number, message: string, issues?: ApiIssue[]) {
    super(message);
    // Without this, logs and stack traces label the error as a plain "Error".
    this.name = 'ApiError';
    this.status = status;
    if (issues) this.issues = issues;
  }
}

/**
 * Maps a caller-supplied path onto the `/api` prefix. Only `/api` itself or paths
 * under `/api/` count as already prefixed, so `/apify` becomes `/api/apify`.
 */
function toApiUrl(path: string): string {
  if (path === '/api' || path.startsWith('/api/')) return path;
  return `/api${path.startsWith('/') ? path : `/${path}`}`;
}

/**
 * JSON fetch wrapper for the backend API.
 *
 * - Prefixes `path` with `/api` unless it is already under `/api/`.
 * - Sends cookies (`credentials: 'include'`) unless `init` overrides it.
 * - Merges `init.headers` with a default `Content-Type: application/json`;
 *   a Content-Type supplied by the caller wins.
 *
 * @param path API path, with or without the `/api` prefix or a leading slash.
 * @param init Extra `fetch` options.
 * @returns The parsed JSON body, or `undefined` for a 204 response.
 * @throws {ApiError} For any non-2xx response. The message is the body's `error`
 *   string when present, otherwise the HTTP status text.
 */
export async function fetcher<T = unknown>(path: string, init?: RequestInit): Promise<T> {
  // Build the headers separately and apply them AFTER spreading `init`; spreading
  // `init` last would replace the merged headers with `init.headers` and drop the
  // default Content-Type.
  const headers = new Headers(init?.headers);
  if (!headers.has('Content-Type')) headers.set('Content-Type', 'application/json');

  const res = await fetch(toApiUrl(path), {
    credentials: 'include',
    ...init,
    headers,
  });

  if (!res.ok) {
    let msg = res.statusText;
    let issues: ApiIssue[] | undefined;
    try {
      const body: unknown = await res.json();
      if (typeof body === 'object' && body !== null) {
        const { error, issues: rawIssues } = body as { error?: unknown; issues?: unknown };
        if (typeof error === 'string' && error) msg = error;
        if (Array.isArray(rawIssues)) issues = rawIssues as ApiIssue[];
      }
    } catch {
      // Deliberate best-effort: an empty or non-JSON error body falls back to statusText.
    }
    throw new ApiError(res.status, msg, issues);
  }

  if (res.status === 204) return undefined as T;
  return (await res.json()) as T;
}
/** How long fetched data counts as fresh before React Query considers it stale. */
const FIVE_MINUTES_MS = 5 * 60 * 1000;

/** Shared React Query client: five-minute stale time, no automatic retries. */
export const queryClient = new QueryClient({
  defaultOptions: {
    queries: { staleTime: FIVE_MINUTES_MS, retry: false },
  },
});

View file

@ -0,0 +1,37 @@
import { useEffect, type ReactNode } from 'react';
import { Navigate } from 'react-router-dom';
import { useMe } from './useMe';
import { ApiError } from '../api/client';
import { useTeamStore } from '../store/team';
type Props = { children?: ReactNode };

/**
 * Gate for authenticated pages. Loads the current session with `useMe`, mirrors the
 * user and active team into the team store, and renders `children` once that succeeds.
 * A spinner is shown while loading; any failure redirects to `/login`.
 */
export default function ProtectedRoute({ children }: Props) {
  const me = useMe();
  const session = me.data;
  const failure = me.error;
  const setUser = useTeamStore((state) => state.setUser);
  const setActiveTeam = useTeamStore((state) => state.setActiveTeam);

  useEffect(() => {
    if (!session) return;
    setUser(session.user);
    setActiveTeam(session.active_team);
  }, [session, setUser, setActiveTeam]);

  if (me.isLoading) {
    return (
      <div className="flex items-center justify-center h-screen">
        <div className="h-8 w-8 rounded-full border-2 border-border-subtle border-t-accent animate-spin" />
      </div>
    );
  }

  // An expired session (401) and every other failure (network, 5xx, ...) lead to the
  // same place: the login page.
  const isUnauthorized = failure instanceof ApiError && failure.status === 401;
  if (isUnauthorized || failure) {
    return <Navigate to="/login" replace />;
  }

  return <>{children}</>;
}

View file

@ -0,0 +1,59 @@
// Ambient declaration for the `msal` global that this module reads from `window`
// after loading /msal-browser.min.js. It is typed `any`, so calls on it are unchecked.
declare global {
interface Window {
msal?: any;
}
}
const tenantId = import.meta.env.VITE_AZURE_TENANT_ID as string | undefined;
const clientId = import.meta.env.VITE_AZURE_CLIENT_ID as string | undefined;

// Both caches hold the in-flight promise rather than the finished value, so concurrent
// callers share one script load and one fully initialised client.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- window.msal is untyped
let pcaPromise: Promise<any> | null = null;
let msalScriptPromise: Promise<void> | null = null;

/**
 * Loads /msal-browser.min.js once. Resolves immediately when `window.msal` already
 * exists; a failed load clears the cache so a later call can retry.
 * NOTE(review): the script path is absolute — confirm it resolves when the app is
 * served under a sub-path.
 */
function ensureMsalLoaded(): Promise<void> {
  if (window.msal) return Promise.resolve();
  if (!msalScriptPromise) {
    msalScriptPromise = new Promise<void>((resolve, reject) => {
      const s = document.createElement('script');
      s.src = '/msal-browser.min.js';
      s.async = true;
      s.onload = () => resolve();
      s.onerror = () => {
        msalScriptPromise = null;
        s.remove();
        reject(new Error('Failed to load msal-browser.min.js'));
      };
      document.head.appendChild(s);
    });
  }
  return msalScriptPromise;
}

/** Builds and initialises a new PublicClientApplication from the Vite env config. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- window.msal is untyped
async function createMsal(): Promise<any> {
  // Check configuration before touching the network: a missing id can never succeed.
  if (!tenantId || !clientId) {
    throw new Error('Missing VITE_AZURE_TENANT_ID or VITE_AZURE_CLIENT_ID');
  }
  await ensureMsalLoaded();
  const app = new window.msal.PublicClientApplication({
    auth: {
      clientId,
      authority: `https://login.microsoftonline.com/${tenantId}`,
      // NOTE(review): assumes the SPA's /login route sits at the origin root — confirm
      // for sub-path deployments.
      redirectUri: window.location.origin + '/login',
    },
    cache: { cacheLocation: 'sessionStorage' },
  });
  await app.initialize();
  return app;
}

/**
 * Returns the shared MSAL client, creating and initialising it on first use.
 *
 * Every caller gets the same promise, so nobody can observe a client whose
 * `initialize()` has not finished. A failed attempt is not cached: the next call
 * starts over.
 *
 * @throws Error when the Azure env vars are missing or the MSAL script fails to load.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- window.msal is untyped
export function getMsal(): Promise<any> {
  if (!pcaPromise) {
    pcaPromise = createMsal().catch((err: unknown) => {
      pcaPromise = null;
      throw err;
    });
  }
  return pcaPromise;
}
/** Starts the Microsoft sign-in redirect, requesting the openid, profile and email scopes. */
export async function loginWithMicrosoft(): Promise<void> {
  const loginRequest = { scopes: ['openid', 'profile', 'email'] };
  const msalApp = await getMsal();
  await msalApp.loginRedirect(loginRequest);
}
/**
 * Completes a sign-in redirect: asks MSAL for the redirect result and, when it carries
 * an id token, posts that token to `/api/sso/token-exchange`.
 *
 * @returns The parsed exchange response, or `null` when there is no id token to exchange.
 * @throws Error('Token exchange failed') when the exchange request is not OK.
 */
export async function handleRedirectAndExchange(): Promise<{ ok: boolean } | null> {
  const msalApp = await getMsal();
  const redirectResult = await msalApp.handleRedirectPromise();
  const idToken = redirectResult?.idToken;
  if (!idToken) return null;

  const exchange = await fetch('/api/sso/token-exchange', {
    method: 'POST',
    credentials: 'include',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ idToken }),
  });
  if (exchange.ok) return exchange.json();
  throw new Error('Token exchange failed');
}

View file

@ -0,0 +1,16 @@
import { useQuery } from '@tanstack/react-query';
import { fetcher } from '../api/client';
import type { User, Team } from '../store/team';
/**
 * Response shape of `GET /me`: the signed-in user, the teams listed as memberships,
 * and the currently active team (`null` when there is none).
 */
export type MeResponse = {
user: User;
memberships: Team[];
active_team: Team | null;
};
/** Query hook for the current session (`GET /me`), cached under the `['me']` key. */
export function useMe() {
  const loadMe = () => fetcher<MeResponse>('/me');
  return useQuery<MeResponse>({ queryKey: ['me'], queryFn: loadMe });
}

View file

@ -0,0 +1,26 @@
import { useMe } from '../auth/useMe';
import TeamSwitcher from './TeamSwitcher';
/**
 * Top bar: product name on the left; team switcher, the signed-in user's email
 * (blank until it is known) and a sign-out link on the right.
 */
export default function Header() {
  const me = useMe();
  const email = me.data?.user?.email ?? '';

  const brand = (
    <div className="flex items-center gap-2">
      <span className="text-accent font-semibold">Social Listening</span>
      <span className="text-text-muted text-sm">V2</span>
    </div>
  );

  return (
    <header className="h-14 px-6 flex items-center justify-between bg-bg-panel border-b border-border-subtle">
      {brand}
      <div className="flex items-center gap-4">
        <TeamSwitcher />
        <span className="text-sm text-text-muted">{email}</span>
        <a href="/api/logout" className="text-sm text-text-muted hover:text-text-body">
          Sign out
        </a>
      </div>
    </header>
  );
}

View file

@ -0,0 +1,17 @@
import { Outlet } from 'react-router-dom';
import Header from './Header';
import Sidebar from './Sidebar';
/**
 * Page chrome: header on top, sidebar on the left, and the matched child
 * route rendered through <Outlet /> in the scrollable main area.
 */
export default function Shell() {
  const content = (
    <main className="flex-1 overflow-y-auto p-8">
      <Outlet />
    </main>
  );

  return (
    <div className="min-h-screen flex flex-col">
      <Header />
      <div className="flex flex-1 min-h-0">
        <Sidebar />
        {content}
      </div>
    </div>
  );
}

View file

@ -0,0 +1,28 @@
import { NavLink } from 'react-router-dom';
import { useMe } from '../auth/useMe';
// Tailwind class fragments for sidebar links: shared layout plus one of two colour states.
const linkBase = 'block px-4 py-2 rounded text-sm transition-colors';
const linkInactive = 'text-text-muted hover:bg-bg-field hover:text-text-body';
const linkActive = 'bg-bg-field text-accent';

/**
 * `className` callback for NavLink: base classes followed by the active or
 * inactive colour classes, separated by a single space.
 */
function cls({ isActive }: { isActive: boolean }) {
  const stateClasses = isActive ? linkActive : linkInactive;
  return [linkBase, stateClasses].join(' ');
}
/**
 * Left-hand navigation. The Admin link is rendered only when the `/me`
 * payload marks the user with `is_super_admin === true`.
 */
export default function Sidebar() {
  const me = useMe();
  const showAdminLink = me.data?.user?.is_super_admin === true;

  const adminLink = showAdminLink ? (
    <NavLink to="/admin/users" className={cls}>Admin</NavLink>
  ) : null;

  return (
    <nav className="w-56 shrink-0 bg-bg-panel border-r border-border-subtle p-3 space-y-1">
      <NavLink to="/" end className={cls}>Home</NavLink>
      <NavLink to="/briefs" className={cls}>Briefs</NavLink>
      <NavLink to="/teams" className={cls}>Teams</NavLink>
      <NavLink to="/help" className={cls}>Help</NavLink>
      {adminLink}
    </nav>
  );
}

View file

@ -0,0 +1,32 @@
import { useMutation, useQueryClient } from '@tanstack/react-query';
import { useMe } from '../auth/useMe';
import { fetcher } from '../api/client';
/**
 * Dropdown that switches the user's active team.
 *
 * Renders nothing until `/me` has loaded or when the user has no memberships.
 * Picking a team PATCHes `/me/active-team` and then invalidates the `['me']`
 * query so every consumer refetches.
 *
 * While no team is active, a disabled placeholder option is shown. Without
 * it the controlled value `''` matches no option, the browser displays the
 * first team as if it were selected, and choosing that team never fires a
 * `change` event, so it could not be activated.
 */
export default function TeamSwitcher() {
  const { data } = useMe();
  const qc = useQueryClient();
  const mut = useMutation({
    mutationFn: (team_id: string) =>
      fetcher('/me/active-team', {
        method: 'PATCH',
        body: JSON.stringify({ team_id }),
      }),
    onSuccess: () => qc.invalidateQueries({ queryKey: ['me'] }),
  });

  if (!data || data.memberships.length === 0) return null;

  const activeId = data.active_team?.id ?? '';

  return (
    <select
      aria-label="Active team"
      value={activeId}
      onChange={(e) => {
        const teamId = e.target.value;
        // Skip the placeholder and no-op re-selections of the current team.
        if (teamId !== '' && teamId !== activeId) mut.mutate(teamId);
      }}
      className="bg-bg-field border border-border-input rounded px-2 py-1 text-sm text-text-body"
    >
      {activeId === '' && (
        <option value="" disabled>
          Select a team
        </option>
      )}
      {data.memberships.map((t) => (
        <option key={t.id} value={t.id}>
          {t.name}
        </option>
      ))}
    </select>
  );
}

View file

@ -0,0 +1,17 @@
import React from 'react';
import ReactDOM from 'react-dom/client';
import { QueryClientProvider } from '@tanstack/react-query';
import { BrowserRouter } from 'react-router-dom';
import App from './App';
import { queryClient } from './api/client';
import './styles.css';
// Mount the SPA on #root, wrapped in StrictMode, the React Query client and the browser router.
const appRoot = ReactDOM.createRoot(document.getElementById('root')!);

appRoot.render(
  <React.StrictMode>
    <QueryClientProvider client={queryClient}>
      <BrowserRouter>
        <App />
      </BrowserRouter>
    </QueryClientProvider>
  </React.StrictMode>,
);

View file

@ -0,0 +1,40 @@
/**
 * Super-admin user table (Phase A scaffold).
 *
 * The user list is a hard-coded empty array for now, so only the placeholder
 * row is ever shown; the per-user row markup is already in place.
 */
export default function AdminUsers() {
  type UserRow = { id: string; email: string; is_super_admin: boolean };
  const users: UserRow[] = [];

  const placeholderRow =
    users.length === 0 ? (
      <tr>
        <td colSpan={3} className="px-4 py-6 text-center text-text-dim">
          Phase A scaffold. Will read /api/admin/users.
        </td>
      </tr>
    ) : null;

  const userRows = users.map((row) => (
    <tr key={row.id} className="border-t border-border-subtle">
      <td className="px-4 py-2">{row.email}</td>
      <td className="px-4 py-2">{row.is_super_admin ? 'Yes' : 'No'}</td>
      <td className="px-4 py-2">
        <button className="text-accent hover:underline" type="button">
          Promote to super-admin
        </button>
      </td>
    </tr>
  ));

  return (
    <div className="space-y-6">
      <h1 className="text-2xl font-semibold">Admin: Users</h1>
      <div className="bg-bg-panel border border-border-subtle rounded-lg overflow-hidden">
        <table className="w-full text-sm">
          <thead className="bg-bg-field text-text-muted">
            <tr>
              <th className="text-left px-4 py-2">Email</th>
              <th className="text-left px-4 py-2">Super-admin</th>
              <th className="text-left px-4 py-2">Actions</th>
            </tr>
          </thead>
          <tbody>
            {placeholderRow}
            {userRows}
          </tbody>
        </table>
      </div>
    </div>
  );
}

View file

@ -0,0 +1,191 @@
/**
 * Static Help & FAQ page.
 *
 * Purely presentational: no props, state or data fetching. The page is a
 * stack of sections (what's new, how it works, brief field guide, tips,
 * per-stage descriptions, FAQ) built from the FieldGuide, Tip, Stage and Faq
 * helper components defined below in this file.
 *
 * NOTE(review): the copy describes an 8-stage pipeline, while the V2 change
 * description mentions a 10-stage pipeline — confirm the wording still
 * matches what operators will see.
 */
export default function Help() {
return (
<div className="max-w-3xl space-y-8">
<h1 className="text-2xl font-semibold">Help & FAQ</h1>
<section className="bg-bg-panel border border-accent rounded-lg p-5">
<h2 className="text-sm font-semibold text-accent mb-2">V2 what's new</h2>
<p className="text-xs text-text-muted leading-relaxed">
Multi-team workspaces, Microsoft SSO, configurable engagement-quality floor,
manifest-gated linking, React dashboard.
</p>
</section>
<section className="bg-bg-panel border border-border-subtle rounded-lg p-6 space-y-3">
<h2 className="text-xl font-medium text-accent">How It Works</h2>
<p className="text-sm text-text-muted leading-relaxed">
The pipeline runs 8 stages automatically. You fill in a brief, hit Run,
and get a client-ready report with trends, audience insights, content
opportunities, and creator spotlights.
</p>
<div className="grid grid-cols-4 gap-3 mt-4">
{[
{ range: '1-2', label: 'Brief & Strategy' },
{ range: '3-5', label: 'Scrape & Enrich' },
{ range: '6-7', label: 'Review & Research' },
{ range: '8', label: 'Final Report' },
].map((s) => (
<div key={s.range} className="bg-bg-field rounded-lg p-3 text-center">
<div className="text-xl font-bold text-accent">{s.range}</div>
<div className="text-[10px] text-text-muted mt-1">{s.label}</div>
</div>
))}
</div>
</section>
<section className="bg-bg-panel border border-border-subtle rounded-lg p-6 space-y-5">
<h2 className="text-xl font-medium text-accent">Brief Fields Guide</h2>
<FieldGuide
name="Client name"
body="The brand or company you're researching. Used in the report header and to give the AI agents context about the brand."
example="H&M, Nespresso, The Ordinary"
/>
<FieldGuide
name="Category"
body="The market category or niche. This shapes what the AI looks for in the data — trends are reported relative to this space."
example="fast fashion, specialty coffee, skincare, home fitness"
/>
<FieldGuide
name="Brand & competitors"
body="The brand handle and 3-15 competitor handles on TikTok (without the @). The pipeline scrapes these directly to build creator spotlights and a competitive baseline."
tip="Pick competitors who actually compete for the same audience, not just the biggest names."
/>
<FieldGuide
name="Audience"
body="Primary (and optional secondary) audience, age range, gender, and at least three interests. The AI uses this to frame audience insights and to filter for content that lands with the right viewers."
/>
<FieldGuide
name="Business question"
body="The single question this report should answer, in at least 8 words. This is the most impactful field for report quality — without a clear question the AI defaults to a generic category overview."
example="How is Gen Z talking about sustainability in skincare, and where do The Ordinary and CeraVe sit in that conversation?"
/>
<FieldGuide
name="KPIs"
body="At least two measurable goals the report should speak to (share of voice, engagement rate, sentiment lift, etc.). KPIs steer the analytics layer."
/>
<FieldGuide
name="Context / vision"
body="Free-text guidance that's injected into every AI stage. Tell it what you need from the report, what to focus on, who the audience is, or any constraints to shape the analysis."
tip="Be specific. 'Focus on sustainability' is OK. 'Focus on how Gen Z talks about sustainability in skincare, especially The Ordinary vs. CeraVe' is much better."
/>
<FieldGuide
name="Quality floor (min likes / plays / STL%)"
body="Engagement thresholds applied at scrape time so noise never enters the pipeline. STL% = (likes + saves + comments + shares) / plays × 100. Defaults are sensible; raise for noisy categories."
tip="If a category is huge (gaming, beauty), bump min plays and min STL% to keep only content that actually resonated."
/>
<FieldGuide
name="Apify budget"
body="USD cap on data scraping. 70% goes to discovery, 30% to enrichment. Hard ceiling is 95% of this value."
tip="$50-100 is typical. Below $25 you'll only see headline trends; above $150 you mostly buy duplication."
/>
</section>
<section className="bg-bg-panel border border-border-subtle rounded-lg p-6 space-y-3">
<h2 className="text-xl font-medium text-accent">Tips for Better Reports</h2>
<Tip
title="1. Be specific with the business question"
body="A vague question yields a vague report. Phrase it as the decision you're trying to make, not the topic you're curious about."
/>
<Tip
title="2. Use the context field"
body="The single most impactful field for report quality. Tell the AI what business question you're answering, who the report is for, and what kind of insights matter most."
/>
<Tip
title="3. Match budget to scope"
body="A wide category with a tight budget gets you thin coverage. Either narrow the brief or raise the budget."
/>
<Tip
title="4. Pick competitors who actually compete"
body="Three direct competitors beat ten loose ones. The competitor handles drive creator spotlights and the share-of-voice analysis."
/>
<Tip
title="5. Tune the quality floor"
body="Defaults work for most briefs. For noisy categories, raise min plays and min STL% so the report only reflects content that landed."
/>
<Tip
title="6. Save and iterate"
body="If the first report isn't sharp enough, adjust the brief and run again. Each run preserves the previous report for compare."
/>
</section>
<section className="bg-bg-panel border border-border-subtle rounded-lg p-6 space-y-3">
<h2 className="text-xl font-medium text-accent">What Each Stage Does</h2>
<Stage n="1" title="Brief Validation" body="Validates form inputs against the V2 schema. Checks required fields, audience completeness, business question word count." />
<Stage n="2" title="Strategy Review" body="Two AI agents (Community Manager + Brand Strategist) review your brief and generate hypotheses about what trends and insights to look for." />
<Stage n="3" title="Discovery Scrape" body="Scrapes TikTok via Apify using your brand and competitor handles. This is where most of the Apify budget goes (70%)." />
<Stage n="4" title="Data Review" body="AI agents review the scraped data, select the most relevant videos, and refine hypotheses based on what was found." />
<Stage n="5" title="Enrichment Scrape" body="Pulls comments, transcripts, and thumbnails for the top videos. Uses the remaining 30% of the Apify budget." />
<Stage n="6" title="Pre-Report Review" body="AI agents do a final review of enriched data and generate desk research queries to validate findings." />
<Stage n="7" title="Desk Research" body="Web searches corroborate claims and add industry context to the report." />
<Stage n="8" title="Report Generation" body="Claude generates the final report: executive summary, trends, audience insights, content opportunities, creator spotlights, visual language. Outputs HTML, JSON, Markdown." />
</section>
<section className="bg-bg-panel border border-border-subtle rounded-lg p-6 space-y-3">
<h2 className="text-xl font-medium text-accent">FAQ</h2>
<Faq
q="How long does a run take?"
a="Typically 5-15 minutes depending on data volume. Stage 3 (scraping) and Stage 8 (report generation) take the longest."
/>
<Faq
q="What does it cost?"
a="Apify cost is set by your budget field. Claude API cost varies but is usually $1-4 per run on top of the Apify spend. Total cost is shown in the live tracker during the run."
/>
<Faq
q="Can I run it again with tweaks?"
a="Yes. Adjust the brief and run again. Set prior_report_id to a previous run to get month-over-month comparison."
/>
<Faq
q="What if a stage fails?"
a="The pipeline shows the error in the log. Common causes: Apify budget exhausted, API rate limits, or invalid brief fields."
/>
<Faq
q="Why TikTok-only?"
a="V2 ships TikTok-only because it's the richest signal source for trend reports. Other platforms can be added without breaking the brief schema."
/>
</section>
</div>
);
}
/**
 * One entry of the brief field guide: field name, explanatory body, and
 * optional "Example:" / "Tip:" lines that appear only when provided.
 */
function FieldGuide(props: {
  name: string; body: string; example?: string; tip?: string;
}) {
  const { name, body, example, tip } = props;
  const exampleLine = example && <p className="text-xs text-accent mt-1">Example: {example}</p>;
  const tipLine = tip && <p className="text-xs text-text-dim mt-1">Tip: {tip}</p>;

  return (
    <div>
      <div className="text-sm font-semibold text-text-body mb-1">{name}</div>
      <p className="text-xs text-text-muted leading-relaxed">{body}</p>
      {exampleLine}
      {tipLine}
    </div>
  );
}
/** A numbered tip: bold title with a muted explanatory paragraph below it. */
function Tip(props: { title: string; body: string }) {
  const heading = <div className="text-sm font-semibold">{props.title}</div>;
  return (
    <div>
      {heading}
      <p className="text-xs text-text-muted leading-relaxed mt-1">{props.body}</p>
    </div>
  );
}
/** Pipeline stage blurb: an accent "Stage <n> <title>" heading followed by its description. */
function Stage(props: { n: string; title: string; body: string }) {
  const { n, title, body } = props;
  return (
    <div>
      <div className="text-sm">
        <span className="text-accent font-semibold">Stage {n} {title}</span>
      </div>
      <p className="text-xs text-text-muted leading-relaxed mt-1">{body}</p>
    </div>
  );
}
/** FAQ entry: the question in bold, the answer in muted text underneath. */
function Faq(props: { q: string; a: string }) {
  const question = <div className="text-sm font-semibold">{props.q}</div>;
  return (
    <div>
      {question}
      <p className="text-xs text-text-muted leading-relaxed mt-1">{props.a}</p>
    </div>
  );
}

View file

@ -0,0 +1,22 @@
import { useTeamStore } from '../store/team';
/**
 * Landing page (Phase A scaffold): page title, a badge with the active
 * team's name when the team store holds one, and a placeholder for the
 * recent-reports list.
 */
export default function Home() {
  const team = useTeamStore((state) => state.activeTeam);

  const teamBadge = team && (
    <span className="px-2 py-0.5 text-xs rounded bg-bg-field border border-border-subtle text-text-muted">
      {team.name}
    </span>
  );

  return (
    <div className="space-y-6">
      <div className="flex items-center gap-3">
        <h1 className="text-2xl font-semibold">Home</h1>
        {teamBadge}
      </div>
      <section className="bg-bg-panel border border-border-subtle rounded-lg p-6">
        <h2 className="text-lg font-medium mb-2">Recent reports</h2>
        <p className="text-text-muted text-sm">Phase A scaffold. Recent reports list will live here.</p>
      </section>
    </div>
  );
}

View file

@ -0,0 +1,81 @@
import { useEffect, useState } from 'react';
import { useNavigate, useSearchParams } from 'react-router-dom';
import { handleRedirectAndExchange, loginWithMicrosoft } from '../auth/msal';
import { fetcher } from '../api/client';
/**
 * Sign-in page.
 *
 * On mount it finishes any pending Microsoft redirect and, when the token
 * exchange reports `ok`, navigates to `/`. The button starts a new SSO
 * redirect. The emergency password form is shown only when the URL carries
 * `?password=1`.
 */
export default function Login() {
  const [searchParams] = useSearchParams();
  const navigate = useNavigate();
  const [err, setErr] = useState<string | null>(null);
  const [busy, setBusy] = useState(false);
  const showPassword = searchParams.get('password') === '1';

  useEffect(() => {
    void (async () => {
      try {
        const exchange = await handleRedirectAndExchange();
        if (exchange?.ok) {
          navigate('/', { replace: true });
        }
      } catch (e: any) {
        setErr(e.message ?? 'Sign-in failed');
      }
    })();
  }, [navigate]);

  async function onSso() {
    setErr(null);
    setBusy(true);
    try {
      await loginWithMicrosoft();
    } catch (e: any) {
      // Only re-enable the button on failure; on success the browser leaves the page.
      setErr(e?.message ?? 'SSO failed');
      setBusy(false);
    }
  }

  const errorLine = err && <div className="text-red-400 text-sm mt-3">{err}</div>;
  const passwordForm = showPassword && <PasswordFallback onError={setErr} />;

  return (
    <div className="min-h-screen flex items-center justify-center px-4">
      <div className="w-full max-w-sm bg-bg-panel border border-border-subtle rounded-lg p-8">
        <div className="text-center mb-6">
          <div className="text-accent text-xl font-semibold">Social Listening</div>
          <div className="text-text-muted text-sm">V2 Operator</div>
        </div>
        <button
          onClick={onSso}
          disabled={busy}
          className="w-full bg-accent hover:bg-accent-hover text-black font-medium py-2 rounded transition-colors disabled:opacity-60"
        >
          Sign in with Microsoft
        </button>
        {errorLine}
        {passwordForm}
      </div>
    </div>
  );
}
/**
 * Emergency password sign-in form.
 *
 * Submitting POSTs `{ password }` to `/login` through `fetcher`; on success
 * it navigates to `/`, on failure it reports the message through `onError`.
 *
 * @param onError receives `null` when a submit starts and the error message
 *   when the request fails.
 */
function PasswordFallback({ onError }: { onError: (s: string | null) => void }) {
  const navigate = useNavigate();
  const [pw, setPw] = useState('');
  const [busy, setBusy] = useState(false);

  async function submit(e: React.FormEvent) {
    e.preventDefault();
    onError(null);
    setBusy(true);
    try {
      const body = JSON.stringify({ password: pw });
      await fetcher('/login', { method: 'POST', body });
      navigate('/', { replace: true });
    } catch (err: any) {
      onError(err?.message ?? 'Login failed');
    } finally {
      setBusy(false);
    }
  }

  return (
    <form onSubmit={submit} className="mt-6 pt-6 border-t border-border-subtle space-y-3">
      <div className="text-xs text-text-dim uppercase tracking-wide">Emergency password</div>
      <input
        type="password"
        value={pw}
        onChange={(event) => setPw(event.target.value)}
        className="w-full bg-bg-field border border-border-input rounded px-3 py-2 text-sm"
        placeholder="Password"
      />
      <button
        type="submit"
        disabled={busy}
        className="w-full border border-border-input hover:border-accent text-text-body py-2 rounded text-sm"
      >
        Sign in with password
      </button>
    </form>
  );
}

View file

@ -0,0 +1,16 @@
import { useParams } from 'react-router-dom';
/** Report page (Phase A scaffold): shows the `:id` route parameter and a placeholder panel. */
export default function ReportDetail() {
  const params = useParams();
  const reportId = params.id;

  return (
    <div className="space-y-6">
      <h1 className="text-2xl font-semibold">Report <span className="text-text-muted">{reportId}</span></h1>
      <div className="bg-bg-panel border border-border-subtle rounded-lg p-6">
        <p className="text-text-muted text-sm">
          Phase A scaffold. Live SSE feed of pipeline events + final report viewer link
          will live here.
        </p>
      </div>
    </div>
  );
}

View file

@ -0,0 +1,16 @@
import { useParams } from 'react-router-dom';
/**
 * Team page (Phase A scaffold): shows the `:id` route parameter and a
 * placeholder panel describing what will be built here.
 *
 * The role list in the placeholder copy names the four membership roles of
 * the V2 model (owner/admin/editor/viewer); it previously said
 * "owner/admin/member", which is not a role set V2 has.
 */
export default function TeamDetail() {
  const { id } = useParams();
  return (
    <div className="space-y-6">
      <h1 className="text-2xl font-semibold">Team <span className="text-text-muted">{id}</span></h1>
      <div className="bg-bg-panel border border-border-subtle rounded-lg p-6">
        <p className="text-text-muted text-sm">
          Phase A scaffold. Team detail (members, invites, role management
          owner/admin/editor/viewer) will live here.
        </p>
      </div>
    </div>
  );
}

View file

@ -0,0 +1,12 @@
/** Teams index (Phase A scaffold): a heading plus a placeholder panel. */
export default function TeamsList() {
  const placeholder = (
    <div className="bg-bg-panel border border-border-subtle rounded-lg p-6">
      <p className="text-text-muted text-sm">
        Phase A scaffold. Teams list with create-team button will live here.
      </p>
    </div>
  );

  return (
    <div className="space-y-6">
      <h1 className="text-2xl font-semibold">Teams</h1>
      {placeholder}
    </div>
  );
}

View file

@ -0,0 +1,30 @@
import { create } from 'zustand';
/** Client-side view of a user record; only `id` and `email` are required. */
export type User = {
id: string;
email: string;
display_name?: string;
// The sidebar shows the Admin link only when this is exactly `true`.
is_super_admin?: boolean;
};
/** Client-side view of a team; `id` is used as the option value and `name` as the label in the team switcher. */
export type Team = {
id: string;
name: string;
// NOTE(review): presumably the current user's role in this team — confirm against the API.
role?: string;
};
/** State and actions held by `useTeamStore`. */
type TeamState = {
user: User | null;
activeTeam: Team | null;
setUser: (u: User | null) => void;
setActiveTeam: (t: Team | null) => void;
// Resets both `user` and `activeTeam` to null.
clear: () => void;
};
/**
 * Zustand store for the signed-in user and the active team. Both start as
 * null; `clear` puts both back to null.
 */
export const useTeamStore = create<TeamState>((set) => {
  return {
    user: null,
    activeTeam: null,
    setUser: (nextUser) => set({ user: nextUser }),
    setActiveTeam: (nextTeam) => set({ activeTeam: nextTeam }),
    clear: () => set({ user: null, activeTeam: null }),
  };
});

View file

@ -0,0 +1,17 @@
/* Pull in Tailwind's base, components and utilities layers. */
@tailwind base;
@tailwind components;
@tailwind utilities;
/* Let the #root mount node (and its ancestors) fill the full height. */
html,
body,
#root {
height: 100%;
}
/* Global dark theme defaults; the colours equal `bg.base` and `text.body` in the Tailwind theme. */
body {
background-color: #0a0a0a;
color: #e0e0e0;
font-family: 'Montserrat', system-ui, sans-serif;
-webkit-font-smoothing: antialiased;
-moz-osx-font-smoothing: grayscale;
}

View file

@ -0,0 +1,35 @@
import type { Config } from 'tailwindcss';
/**
 * Tailwind configuration for the operator app.
 *
 * Extends the default theme with the dark palette used by the components
 * (e.g. `bg-bg-panel`, `border-border-subtle`, `text-text-muted`,
 * `text-accent`, `hover:bg-accent-hover`) and makes Montserrat the sans font.
 */
const config: Config = {
// Files scanned for class names.
content: ['./index.html', './src/**/*.{ts,tsx}'],
theme: {
extend: {
colors: {
// Surface backgrounds -> `bg-bg-*`.
bg: {
base: '#0a0a0a',
panel: '#141414',
field: '#1a1a1a',
},
// Border colours -> `border-border-*`.
border: {
subtle: '#2a2a2a',
input: '#333',
},
// Brand accent; DEFAULT backs the bare `accent` colour name.
accent: {
DEFAULT: '#f5a623',
hover: '#e69920',
},
// Text colours -> `text-text-*`.
text: {
body: '#e0e0e0',
muted: '#888',
dim: '#666',
},
},
fontFamily: {
sans: ['Montserrat', 'system-ui', 'sans-serif'],
},
},
},
plugins: [],
};
export default config;

View file

@ -0,0 +1,14 @@
{
"extends": "../tsconfig.base.json",
"compilerOptions": {
"lib": ["ES2022", "DOM", "DOM.Iterable"],
"jsx": "react-jsx",
"useDefineForClassFields": true,
"allowImportingTsExtensions": false,
"isolatedModules": true,
"noEmit": true,
"types": ["vite/client"]
},
"include": ["src"],
"references": [{ "path": "./tsconfig.node.json" }]
}

View file

@ -0,0 +1,11 @@
{
"compilerOptions": {
"composite": true,
"module": "ESNext",
"moduleResolution": "bundler",
"allowSyntheticDefaultImports": true,
"strict": true,
"skipLibCheck": true
},
"include": ["vite.config.ts", "tailwind.config.ts", "postcss.config.js"]
}

View file

@ -0,0 +1,19 @@
import { defineConfig } from 'vite';
import react from '@vitejs/plugin-react';
// Dev server on :5173; requests under /api are proxied to the backend on :3457.
const devServer = {
  port: 5173,
  proxy: {
    '/api': {
      target: 'http://localhost:3457',
      changeOrigin: true,
    },
  },
};

// Production bundle goes to dist/, which is emptied before each build.
const buildOptions = {
  outDir: 'dist',
  emptyOutDir: true,
};

export default defineConfig({
  plugins: [react()],
  server: devServer,
  build: buildOptions,
});

4137
v2/package-lock.json generated Normal file

File diff suppressed because it is too large Load diff

28
v2/package.json Normal file
View file

@ -0,0 +1,28 @@
{
"name": "social-reporting-v2",
"version": "0.1.0",
"private": true,
"type": "module",
"workspaces": [
"operator-app"
],
"scripts": {
"server": "tsx watch server/index.ts",
"server:prod": "tsx server/index.ts",
"pipe": "tsx pipeline/cli.ts",
"ui:dev": "npm run dev --workspace operator-app",
"ui:build": "npm run build --workspace operator-app",
"test": "vitest run",
"test:watch": "vitest"
},
"dependencies": {
"postgres": "^3.4.8",
"tsx": "^4.7.0",
"typescript": "^5.4.0",
"zod": "^3.23.0"
},
"devDependencies": {
"@types/node": "^20.11.0",
"vitest": "^1.6.0"
}
}

0
v2/pipeline/.gitkeep Normal file
View file

View file

@ -0,0 +1,74 @@
import { describe, it, expect } from 'vitest';
import { applyEngagementFloor, computeStlPct, type EngagementFloor } from '../lib/engagement_floor.js';
// Baseline floor shared by the cases below: 1,000 likes, 10,000 plays, STL% check disabled.
const BASE: EngagementFloor = { min_likes: 1000, min_plays: 10000, min_stl_pct: 0 };

describe('applyEngagementFloor', () => {
  it('keeps items above the floor', () => {
    const videos = [
      { plays: 50000, likes: 5000 },
      { plays: 100000, likes: 8000 },
    ];
    const result = applyEngagementFloor(videos, BASE);
    expect(result.kept).toHaveLength(2);
    expect(result.counters.kept_after_floor).toBe(2);
    expect(result.counters.dropped_min_likes + result.counters.dropped_min_plays).toBe(0);
  });

  it('drops by min_plays', () => {
    const videos = [{ plays: 5000, likes: 5000 }];
    const result = applyEngagementFloor(videos, BASE);
    expect(result.kept).toHaveLength(0);
    expect(result.counters.dropped_min_plays).toBe(1);
  });

  it('drops by min_likes', () => {
    const videos = [{ plays: 50000, likes: 50 }];
    const result = applyEngagementFloor(videos, BASE);
    expect(result.kept).toHaveLength(0);
    expect(result.counters.dropped_min_likes).toBe(1);
  });

  it('drops zero-plays items as a special case', () => {
    const videos = [{ plays: 0, likes: 99999 }, { plays: -1, likes: 99999 }];
    const result = applyEngagementFloor(videos, BASE);
    expect(result.counters.dropped_zero_plays).toBe(2);
  });

  it('applies stl% floor when configured', () => {
    const videos = [
      // STL = (1000+0+0+0)/10000 * 100 = 10%
      { plays: 10000, likes: 1000, saves: 0, comments_count: 0, shares: 0 },
      // STL = (1000+0+0+0)/100000 * 100 = 1%
      { plays: 100000, likes: 1000, saves: 0, comments_count: 0, shares: 0 },
    ];
    const floor = { ...BASE, min_stl_pct: 5 };
    const result = applyEngagementFloor(videos, floor);
    expect(result.kept).toHaveLength(1);
    expect(result.kept[0]?.plays).toBe(10000);
    expect(result.counters.dropped_min_stl).toBe(1);
  });

  it('counters sum to raw_returned', () => {
    const videos = [
      { plays: 0, likes: 0 },
      { plays: 5000, likes: 5000 },
      { plays: 50000, likes: 50 },
      { plays: 100000, likes: 5000 },
    ];
    const { counters: c } = applyEngagementFloor(videos, BASE);
    // Every input must land in exactly one bucket.
    const buckets = [
      c.dropped_zero_plays,
      c.dropped_min_plays,
      c.dropped_min_likes,
      c.dropped_min_stl,
      c.kept_after_floor,
    ];
    const accounted = buckets.reduce((sum, count) => sum + count, 0);
    expect(accounted).toBe(c.raw_returned);
  });
});
describe('computeStlPct', () => {
  it('returns 0 for zero plays', () => {
    const pct = computeStlPct({ plays: 0, likes: 100 });
    expect(pct).toBe(0);
  });

  it('sums likes+saves+comments+shares', () => {
    const video = { plays: 1000, likes: 50, saves: 30, comments_count: 10, shares: 10 };
    // (50 + 30 + 10 + 10) / 1000 * 100
    expect(computeStlPct(video)).toBe(10);
  });
});

View file

@ -0,0 +1,70 @@
// Comprehensive URL-form fixture for extractTikTokId.
// Every form V1 has seen drift in goes here. Add new mutation forms here, not in code.
import { describe, it, expect } from 'vitest';
import { extractTikTokId, canonicalTikTokUrl } from '../lib/ids.js';
/**
 * `[input, expected id]` pairs that extractTikTokId must resolve. Consumed by
 * the `extractTikTokId` suite below via `it.each`.
 */
const HANDLED = [
// Standard www form
['https://www.tiktok.com/@dove/video/7280000000000000000', '7280000000000000000'],
// Without www
['https://tiktok.com/@dove/video/7280000000000000000', '7280000000000000000'],
// With trailing query params (most common drift cause)
['https://www.tiktok.com/@dove/video/7280000000000000000?is_from_webapp=1&sender_device=pc', '7280000000000000000'],
// Trailing slash
['https://www.tiktok.com/@dove/video/7280000000000000000/', '7280000000000000000'],
// Mobile m.tiktok.com /v/.html form
['https://m.tiktok.com/v/7280000000000000000.html', '7280000000000000000'],
// Older share /t/<id> form
['https://www.tiktok.com/t/7280000000000000000', '7280000000000000000'],
// Bare numeric id
['7280000000000000000', '7280000000000000000'],
// 18-digit ids (older content)
['https://www.tiktok.com/@dove/video/728000000000000000', '728000000000000000'],
// Capital case in handle (drift case from V1)
['https://www.tiktok.com/@DoveBeauty/video/7280000000000000000', '7280000000000000000'],
// Embedded in JSON-ish text (Apify response field bleed)
['"webVideoUrl":"https://www.tiktok.com/@dove/video/7280000000000000000"', '7280000000000000000'],
];
/**
 * `[input, reason]` pairs for which extractTikTokId must return null. The
 * reason string only appears in the generated test title.
 */
const REJECTED: Array<[string | null | undefined | number, string]> = [
[null, 'null input'],
[undefined, 'undefined input'],
['', 'empty string'],
[' ', 'whitespace only'],
['https://www.tiktok.com/@dove', 'profile URL, no video id'],
['https://vm.tiktok.com/ZMabc123/', 'short link — needs resolveShortLink first'],
['https://example.com/video/12345', 'wrong domain pattern OK in URL but id length must be 15-21'],
[123, 'numeric input but too short'],
];
describe('extractTikTokId', () => {
  it.each(HANDLED)('extracts %s → %s', (input, expected) => {
    expect(extractTikTokId(input)).toBe(expected);
  });

  it.each(REJECTED)('rejects %p (%s)', (input, _reason) => {
    expect(extractTikTokId(input)).toBeNull();
  });

  it('extracts the SAME id from every shape — the linking-fix invariant', () => {
    const targetId = HANDLED[0]![1];
    // Select the rows by their expected id rather than by position, so adding
    // or reordering fixture rows cannot silently shrink or mis-scope the check.
    const sameVideoRows = HANDLED.filter(([, expected]) => expected === targetId);
    // Guard against fixture edits that would make the invariant vacuous.
    expect(sameVideoRows.length).toBeGreaterThan(1);

    const distinctIds = new Set(sameVideoRows.map(([url]) => extractTikTokId(url)));
    // Every URL shape of the same video must collapse to exactly one id.
    expect(Array.from(distinctIds)).toEqual([targetId]);
  });
});
describe('canonicalTikTokUrl', () => {
  const CANONICAL = 'https://www.tiktok.com/@dove/video/7280000000000000000';

  it('round-trips id through canonical URL', () => {
    const built = canonicalTikTokUrl('7280000000000000000', 'dove');
    expect(built).toBe(CANONICAL);
    expect(extractTikTokId(built)).toBe('7280000000000000000');
  });

  it('strips a leading @ from handle', () => {
    const built = canonicalTikTokUrl('7280000000000000000', '@dove');
    expect(built).toBe(CANONICAL);
  });
});

View file

@ -0,0 +1,96 @@
// THE LINKING FIX TEST.
// V1 bug: assets joined to videos via Map.get(url) silently dropped on URL-form drift.
// V2 fix: every Apify response is matched back to the canonical TikTok id via
// extractTikTokId, mismatches go to drift_log.jsonl, never silently null.
//
// This test simulates exactly the kind of drift V1 saw — same logical video, different
// URL shapes returned by different Apify actors — and proves V2 collapses them to one id.
import { describe, it, expect, beforeEach } from 'vitest';
import { mkdirSync, rmSync, existsSync, readFileSync } from 'node:fs';
import { resolve } from 'node:path';
import { groupByCanonicalId } from '../stages/stage_4_pass2_enrich.js';
import { resetDriftCounter, getDriftCount } from '../lib/drift_log.js';
import { PATHS } from '../lib/paths.js';
const FIXTURE_ROOT = resolve('briefs/__linking_test_root__');
const REPORT_ID = 'r1';

// Each test starts from an empty fixture root and a zeroed drift counter.
beforeEach(() => {
  process.env.BRIEFS_ROOT = FIXTURE_ROOT;
  const leftoverFromPreviousTest = existsSync(FIXTURE_ROOT);
  if (leftoverFromPreviousTest) {
    rmSync(FIXTURE_ROOT, { recursive: true });
  }
  mkdirSync(FIXTURE_ROOT, { recursive: true });
  resetDriftCounter();
});
describe('groupByCanonicalId — the V1-bug fix', () => {
  const VIDEO_ID = '7280000000000000000';
  const SELECTION = new Set([VIDEO_ID, '7281111111111111111']);

  it('matches the SAME id from every URL form V1 has seen drift in', () => {
    const actorItems = [
      { webVideoUrl: 'https://www.tiktok.com/@dove/video/7280000000000000000' },
      { videoUrl: 'https://tiktok.com/@dove/video/7280000000000000000' },
      { url: 'https://www.tiktok.com/@dove/video/7280000000000000000?is_from_webapp=1' },
      { postUrl: 'https://www.tiktok.com/@dove/video/7280000000000000000/' },
      { webVideoUrl: 'https://m.tiktok.com/v/7280000000000000000.html' },
      { url: 'https://www.tiktok.com/@DoveBeauty/video/7280000000000000000' },
    ];
    const buckets = groupByCanonicalId(REPORT_ID, 'TIKTOK_TRANSCRIPTS', actorItems, SELECTION);
    // All inputs must share ONE bucket. Under V1 they would have been spread
    // over 6 different (or no!) buckets and most assets would silently null.
    expect(buckets.size).toBe(1);
    expect(buckets.get(VIDEO_ID)).toHaveLength(actorItems.length);
    expect(getDriftCount()).toBe(0);
  });

  it('logs drift loudly (not silently) when an item has no extractable id', () => {
    const actorItems = [
      { videoUrl: 'https://www.tiktok.com/@dove' }, // profile URL, no id
      { videoUrl: 'https://vm.tiktok.com/ZMabc123/' }, // unresolved short link
      { videoUrl: 'https://www.tiktok.com/@dove/video/7280000000000000000' }, // valid one
    ];
    const buckets = groupByCanonicalId(REPORT_ID, 'TIKTOK_COMMENTS', actorItems, SELECTION);
    expect(buckets.size).toBe(1);
    expect(buckets.get(VIDEO_ID)).toHaveLength(1);
    expect(getDriftCount()).toBe(2);

    // One JSON line per drift event.
    const rawLog = readFileSync(PATHS.driftLog(REPORT_ID), 'utf-8');
    const logLines = rawLog.trim().split('\n');
    expect(logLines).toHaveLength(2);
    const driftEvents = logLines.map((line) => JSON.parse(line));
    expect(driftEvents.every((ev) => ev.actor === 'TIKTOK_COMMENTS')).toBe(true);
    expect(driftEvents.every((ev) => ev.reason === 'no-id-extracted')).toBe(true);
  });

  it('logs drift when actor returns a video that was not selected (out-of-set)', () => {
    const actorItems = [
      // valid + in selection
      { webVideoUrl: 'https://www.tiktok.com/@dove/video/7280000000000000000' },
      // valid id but NOT in our selection — must not silently land in any bucket
      { webVideoUrl: 'https://www.tiktok.com/@other/video/7299999999999999999' },
    ];
    const buckets = groupByCanonicalId(REPORT_ID, 'TIKTOK_TRANSCRIPTS', actorItems, SELECTION);
    expect(buckets.size).toBe(1);
    expect(buckets.get(VIDEO_ID)).toHaveLength(1);
    expect(buckets.has('7299999999999999999')).toBe(false);
    expect(getDriftCount()).toBe(1);

    const rawLog = readFileSync(PATHS.driftLog(REPORT_ID), 'utf-8').trim();
    const driftEvent = JSON.parse(rawLog);
    expect(driftEvent.reason).toBe('id-not-in-selection');
    expect(driftEvent.extracted_id).toBe('7299999999999999999');
  });

  it('groups multiple items that legitimately point at the same video (e.g. duplicate transcripts)', () => {
    const actorItems = [
      { videoUrl: 'https://www.tiktok.com/@dove/video/7280000000000000000', text: 'first' },
      { videoUrl: 'https://www.tiktok.com/@dove/video/7280000000000000000?_t=abc', text: 'second' },
    ];
    const buckets = groupByCanonicalId(REPORT_ID, 'TIKTOK_TRANSCRIPTS', actorItems, SELECTION);
    expect(buckets.get(VIDEO_ID)).toHaveLength(2);
    expect(getDriftCount()).toBe(0);
  });
});

View file

@ -0,0 +1,114 @@
import { describe, it, expect, beforeEach } from 'vitest';
import { mkdirSync, writeFileSync, rmSync, existsSync } from 'node:fs';
import { join, resolve } from 'node:path';
import { buildManifest } from '../lib/manifest.js';
const FIXTURE_ROOT = resolve('briefs/__manifest_test_root__');
const REPORT_ID = 'r1';

/**
 * Writes a fake enriched-video bundle under
 * `<FIXTURE_ROOT>/<REPORT_ID>/enriched/<id>/`.
 *
 * By default the bundle has metadata, a 10,000-byte cover, a transcript,
 * 7 comments and 5 frames. Options let a test leave parts out:
 * - `transcript: false` skips transcript.json and nulls it in bundle.json
 * - `commentsCount` of 0 or less skips comments.json
 * - `framesCount` sets how many frame files are written
 * - `coverBytes` of 0 or less skips cover.jpg
 */
function fakeBundle(id: string, opts: { transcript?: boolean; commentsCount?: number; framesCount?: number; coverBytes?: number } = {}) {
  const bundleDir = join(FIXTURE_ROOT, REPORT_ID, 'enriched', id);
  const framesDir = join(bundleDir, 'frames');
  const withTranscript = opts.transcript !== false;
  const coverSize = opts.coverBytes ?? 10_000;
  const commentTotal = opts.commentsCount ?? 7;
  const frameTotal = opts.framesCount ?? 5;

  const frameFile = (n: number) => `${String(n).padStart(4, '0')}.jpg`;
  const writeJson = (file: string, value: unknown) =>
    writeFileSync(join(bundleDir, file), JSON.stringify(value));

  mkdirSync(framesDir, { recursive: true });
  writeJson('metadata.json', { id });

  if (coverSize > 0) {
    writeFileSync(join(bundleDir, 'cover.jpg'), Buffer.alloc(coverSize, 0xff));
  }

  if (withTranscript) {
    writeJson('transcript.json', {
      language_detected: 'en',
      text_original: 'hello world',
      text_en: 'hello world',
      source: 'apify-tiktok-subtitles',
    });
  }

  if (commentTotal > 0) {
    const commentRows = Array.from({ length: commentTotal }, (_, i) => ({
      rank: i + 1,
      author_handle: `u${i}`,
      text_original: `c${i}`,
      text_en: `c${i}`,
      likes: 100 - i,
      replies_count: 0,
      posted_at: '',
    }));
    writeJson('comments.json', commentRows);
  }

  for (let n = 1; n <= frameTotal; n++) {
    writeFileSync(join(framesDir, frameFile(n)), Buffer.alloc(1024));
  }

  writeJson('bundle.json', {
    id,
    metadata: { id },
    transcript: withTranscript ? { text_en: 'hello world' } : null,
    comments: Array.from({ length: commentTotal }, () => ({ text_en: 'x' })),
    frames: Array.from({ length: frameTotal }, (_, i) => ({ index: i + 1, path: `frames/${frameFile(i + 1)}` })),
    cover_local: 'cover.jpg',
    _validation: { all_ok: true, missing: [] },
  });
}
// Every test starts from an empty fixture root, with BRIEFS_ROOT pointed at it.
beforeEach(() => {
  const leftoverFromPreviousTest = existsSync(FIXTURE_ROOT);
  process.env.BRIEFS_ROOT = FIXTURE_ROOT;
  if (leftoverFromPreviousTest) {
    rmSync(FIXTURE_ROOT, { recursive: true });
  }
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});
// buildManifest suite: each case writes fixture bundles with fakeBundle, then
// checks the per-video flags and the summary that buildManifest derives from them.
describe('buildManifest', () => {
// Baseline: default fixtures are complete, so every video is all_ok and coverage is 100.
it('passes with three fully-bundled videos', () => {
fakeBundle('111111111111111111');
fakeBundle('222222222222222222');
fakeBundle('333333333333333333');
const m = buildManifest(REPORT_ID, ['111111111111111111', '222222222222222222', '333333333333333333']);
expect(m.summary.all_ok).toBe(3);
expect(m.summary.coverage_pct).toBe(100);
for (const v of m.videos) {
expect(v.all_ok).toBe(true);
expect(v.missing).toHaveLength(0);
}
});
// One of two videos lacks transcript.json → 1 all_ok, 50% coverage, 'transcript' listed as missing.
it('flags missing transcript', () => {
fakeBundle('111111111111111111');
fakeBundle('222222222222222222', { transcript: false });
const m = buildManifest(REPORT_ID, ['111111111111111111', '222222222222222222']);
expect(m.summary.all_ok).toBe(1);
expect(m.summary.coverage_pct).toBe(50);
const v = m.videos.find((x) => x.id === '222222222222222222');
expect(v?.missing).toContain('transcript');
});
// 3 comments is below the minimum named in the test title (5).
it('flags too few comments (<5)', () => {
fakeBundle('111111111111111111', { commentsCount: 3 });
const m = buildManifest(REPORT_ID, ['111111111111111111']);
const v = m.videos[0]!;
expect(v.comments.ok).toBe(false);
expect(v.missing).toContain('comments');
});
// 2 frames is below the minimum named in the test title (3).
it('flags too few frames (<3)', () => {
fakeBundle('111111111111111111', { framesCount: 2 });
const m = buildManifest(REPORT_ID, ['111111111111111111']);
const v = m.videos[0]!;
expect(v.frames.ok).toBe(false);
expect(v.missing).toContain('frames');
});
// A 1000-byte cover is below the size floor named in the test title (5KB).
it('flags too-small cover (<5KB)', () => {
fakeBundle('111111111111111111', { coverBytes: 1000 });
const m = buildManifest(REPORT_ID, ['111111111111111111']);
const v = m.videos[0]!;
expect(v.cover.ok).toBe(false);
expect(v.missing).toContain('cover');
});
// 3 complete bundles out of 4 must report exactly 75, not a fractional value.
it('coverage_pct is a clean percentage', () => {
fakeBundle('111111111111111111');
fakeBundle('222222222222222222');
fakeBundle('333333333333333333', { transcript: false });
fakeBundle('444444444444444444');
const m = buildManifest(REPORT_ID, ['111111111111111111', '222222222222222222', '333333333333333333', '444444444444444444']);
expect(m.summary.coverage_pct).toBe(75);
});
});

View file

@ -0,0 +1,90 @@
import { describe, it, expect, beforeEach } from 'vitest';
import { mkdirSync, writeFileSync, rmSync, existsSync, readFileSync } from 'node:fs';
import { resolve, join } from 'node:path';
import { runMomCompare } from '../lib/mom_compare.js';
import type { Trend } from '../stages/stage_8_trends.js';
const FIXTURE_ROOT = resolve('briefs/__mom_test_root__');
/**
 * Builds a minimal Trend fixture.
 *
 * The slug is the lower-cased name with each whitespace run replaced by '-'.
 * The video list sets `supporting_video_ids` and also drives the `videos`,
 * `unique_creators` and `organic` counts. `plays` becomes `kpis.plays_total`.
 * All other fields are fixed placeholders.
 */
function trend(id: string, name: string, category: string, videos: string[], plays: number): Trend {
  const slug = name.toLowerCase().replace(/\s+/g, '-');
  const videoCount = videos.length;

  const relevance = { score: 0.7, tier: 'core', justification: 'test' };
  const kpis = {
    plays_total: plays,
    videos: videoCount,
    unique_creators: videoCount,
    avg_stl_pct: 5,
    paid_organic_split: { paid: 0, organic: videoCount, unclear: 0 },
  };

  return {
    trend_id: id,
    slug,
    name,
    category,
    narrative: 'placeholder narrative for testing.',
    lens_tags: ['narrative'],
    top_atomic_ids: [],
    supporting_video_ids: videos,
    business_question_relevance: relevance,
    kpis,
  };
}
/** Creates FIXTURE_ROOT/<reportId>/ and writes `trends` there as pretty-printed trends.json. */
function setupReport(reportId: string, trends: Trend[]) {
  const reportDir = join(FIXTURE_ROOT, reportId);
  const trendsFile = join(reportDir, 'trends.json');
  mkdirSync(reportDir, { recursive: true });
  writeFileSync(trendsFile, JSON.stringify(trends, null, 2));
}
// Point BRIEFS_ROOT at the fixture directory and recreate it empty before each test.
beforeEach(() => {
  const leftoverFromPreviousTest = existsSync(FIXTURE_ROOT);
  process.env.BRIEFS_ROOT = FIXTURE_ROOT;
  if (leftoverFromPreviousTest) {
    rmSync(FIXTURE_ROOT, { recursive: true });
  }
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});
// runMomCompare suite: writes trends.json for a 'current' and a 'prior' report,
// then checks how the comparison classifies each trend and what it writes to disk.
describe('runMomCompare', () => {
it('classifies new / returning / faded correctly', async () => {
const prior = [
trend('TR-001', 'The Ceremonial Hair Wash', 'Hair Rituals', ['v1', 'v2', 'v3'], 1_000_000),
trend('TR-002', 'Anti-Influencer Beauty', 'Anti-Beauty Backlash', ['v4', 'v5'], 500_000),
trend('TR-003', 'Dropped Trend', 'Old Category', ['v9'], 100_000),
];
const current = [
// returning: same name + shared videos
trend('TR-100', 'The Ceremonial Hair Wash', 'Hair Rituals', ['v1', 'v2', 'v6'], 1_500_000),
// new: completely different
trend('TR-101', 'Scalp as Self', 'Hair Rituals', ['v7', 'v8'], 700_000),
];
setupReport('current', current);
setupReport('prior', prior);
const { result } = await runMomCompare('current', 'prior');
// TR-100 pairs with prior TR-001; the other two prior trends have no current match.
expect(result.returning_trends).toHaveLength(1);
const ret = result.returning_trends[0]!;
expect(ret.trend_id).toBe('TR-100');
expect(ret.prior_trend_id).toBe('TR-001');
expect(ret.velocity_delta.plays_total_pct).toBe(50); // (1.5M - 1M) / 1M = 50%
expect(result.new_trends).toHaveLength(1);
expect(result.new_trends[0]?.trend_id).toBe('TR-101');
// Sorted before comparing so the assertion does not depend on output order.
expect(result.faded_trends.map((f) => f.prior_trend_id).sort()).toEqual(['TR-002', 'TR-003']);
});
// Only 'current' is set up; comparing against a report with no fixture must reject.
it('fails loudly when prior report does not exist', async () => {
setupReport('current', [trend('TR-1', 'X', 'A', ['v1'], 1)]);
await expect(runMomCompare('current', 'missing')).rejects.toThrow(/Prior report 'missing' not found/);
});
// Output files are expected under the CURRENT report's outputs/compare/ directory.
it('writes the four compare/*.json files to outputs/compare/', async () => {
setupReport('current', [trend('TR-100', 'Same', 'Cat', ['v1'], 1_000_000)]);
setupReport('prior', [trend('TR-001', 'Same', 'Cat', ['v1'], 1_000_000)]);
await runMomCompare('current', 'prior');
const outDir = join(FIXTURE_ROOT, 'current', 'outputs', 'compare');
expect(existsSync(join(outDir, 'new_trends.json'))).toBe(true);
expect(existsSync(join(outDir, 'returning_trends.json'))).toBe(true);
expect(existsSync(join(outDir, 'faded_trends.json'))).toBe(true);
expect(existsSync(join(outDir, 'category_momentum.json'))).toBe(true);
const ret = JSON.parse(readFileSync(join(outDir, 'returning_trends.json'), 'utf-8'));
expect(ret).toHaveLength(1);
});
});

View file

@ -0,0 +1,86 @@
import { describe, it, expect } from 'vitest';
import { matchRecipe, parseFilterExpression, applyFilter, RECIPES } from '../lib/recipes.js';
import type { Pass1Video } from '../stages/stage_2_pass1_scrape.js';
/**
 * Builds a Pass1Video fixture from the engagement numbers a filter can rank on.
 * `daysAgo` (default 5) sets `posted_at` relative to the current time.
 * Handle, caption, hashtags, shares, duration and asset URLs are fixed placeholders.
 */
const fakeVid = (id: string, plays: number, likes: number, saves: number, comments: number, stl_pct: number, daysAgo = 5): Pass1Video => {
  const postedAt = new Date(Date.now() - daysAgo * 86400 * 1000);
  const canonicalUrl = `https://www.tiktok.com/@creator/video/${id}`;
  return {
    id,
    handle: 'creator',
    url_canonical: canonicalUrl,
    caption: '',
    hashtags: [],
    plays,
    likes,
    saves,
    comments_count: comments,
    shares: 0,
    stl_pct,
    duration_sec: 30,
    posted_at: postedAt.toISOString(),
    cover: null,
    download_url: null,
    _source: 'test',
    _scraped_at: new Date().toISOString(),
  };
};
// matchRecipe suite: maps a free-text business question to a recipe id.
// Each case pins one or two example questions per recipe.
describe('matchRecipe', () => {
it('hooks-related questions → A', () => {
expect(matchRecipe('What hooks stop the scroll for our audience?')).toBe('A');
expect(matchRecipe('How do creators use the first three seconds in beauty?')).toBe('A');
});
it('cultural-moment questions → B', () => {
expect(matchRecipe('Why is hair washing emerging as a cultural moment?')).toBe('B');
expect(matchRecipe('What is shifting in the everything-shower trend?')).toBe('B');
});
it('competitor questions → C', () => {
expect(matchRecipe('How does Dove position vs Olay in haircare?')).toBe('C');
});
it('audience sentiment questions → D', () => {
expect(matchRecipe('What do users actually feel about scalp products?')).toBe('D');
});
// A question expected to match none of the recipe-specific cues resolves to B.
it('falls back to B', () => {
expect(matchRecipe('Tell me about beauty content please please please now')).toBe('B');
});
// NOTE(review): this only asserts the key set of RECIPES is exactly A–D; it does
// not call matchRecipe, so "reachable" here relies on the cases above.
it('every recipe id is reachable', () => {
expect(Object.keys(RECIPES).sort()).toEqual(['A', 'B', 'C', 'D']);
});
});
// Filter-expression suite: parseFilterExpression turns a string into a filter,
// applyFilter runs it over the four fixture videos and returns the selected ids.
describe('parseFilterExpression + applyFilter', () => {
// Fixture args are (id, plays, likes, saves, comments, stl_pct).
const videos: Pass1Video[] = [
fakeVid('111111111111111111', 1_000_000, 100_000, 10_000, 5000, 11.5),
fakeVid('222222222222222222', 500_000, 50_000, 3_000, 2000, 9.0),
fakeVid('333333333333333333', 50_000, 5_000, 1_500, 500, 7.0),
fakeVid('444444444444444444', 100_000, 8_000, 7_000, 1500, 16.5),
];
// Ids are sorted on both sides so the assertions are order-independent.
it('top_by_plays:2', () => {
const f = parseFilterExpression('top_by_plays:2');
const ids = applyFilter(videos, f).sort();
expect(ids).toEqual(['111111111111111111', '222222222222222222'].sort());
});
it('AND intersects', () => {
// top_by_plays:3 = vids 1,2,4 (top 3 by plays). top_by_stl:3 = vids 4,1,2 (highest STL with ≥10k plays).
// Intersection = 1,2,4.
const f = parseFilterExpression('top_by_plays:3 AND top_by_stl:3');
const ids = applyFilter(videos, f).sort();
expect(ids).toEqual(['111111111111111111', '222222222222222222', '444444444444444444'].sort());
});
it('OR unions', () => {
const f = parseFilterExpression('top_by_plays:2 OR top_by_saves:1');
const ids = applyFilter(videos, f).sort();
// top_by_plays:2 = {1,2}; top_by_saves:1 = {1} (video 1 has 10k saves > video 4's 7k).
// Union = {1,2}.
expect(ids).toEqual(['111111111111111111', '222222222222222222'].sort());
});
// Smoke test only: checks a parenthesised expression parses and selects something,
// without pinning the exact ids.
it('parens force grouping', () => {
const f = parseFilterExpression('(top_by_plays:3 AND top_by_stl:2) OR top_by_saves:1');
const ids = applyFilter(videos, f);
expect(ids.length).toBeGreaterThan(0);
});
// manual_ids returns the listed ids even when one ('999…') is not in `videos`.
it('manual_ids passes through', () => {
const f = parseFilterExpression('manual_ids:111111111111111111,999999999999999999');
const ids = applyFilter(videos, f).sort();
expect(ids).toContain('111111111111111111');
expect(ids).toContain('999999999999999999');
});
it('throws on unknown primitive', () => {
expect(() => parseFilterExpression('top_by_unicorn:5')).toThrow();
});
it('throws on missing close paren', () => {
expect(() => parseFilterExpression('(top_by_plays:5')).toThrow();
});
});

263
v2/pipeline/cli.ts Normal file
View file

@ -0,0 +1,263 @@
#!/usr/bin/env tsx
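// V2 pipeline CLI. Follows the command list in brief §11:
// pnpm pipe brief|seed|scrape1|select|scrape2|validate|analyse|insights|trends|qa|build --report <id>
// plus the one-off `backfill-covers` fix-up. (§11 also lists `deploy`; this CLI has no handler for it.)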
// Each subcommand loads inputs from DB + disk, runs its stage, writes outputs to
// briefs/{report_id}/. Idempotent + resumable via .state/stage{N}.done sentinels.
import { writeFileSync, existsSync } from 'node:fs';
import { sql } from '../server/db/client.js';
import { getBriefById } from '../server/db/briefs.js';
import { BRIEF_INPUT } from '../server/schemas/brief.js';
import { runStage1Seeds } from './stages/stage_1_seeds.js';
import { runStage2Pass1Scrape } from './stages/stage_2_pass1_scrape.js';
import { runStage3Select } from './stages/stage_3_select.js';
import { runStage4Pass2Enrich } from './stages/stage_4_pass2_enrich.js';
import { backfillCoversFromRawDumps } from './lib/backfill_covers.js';
import { runStage5Manifest } from './stages/stage_5_manifest.js';
import { runStage6Analyse } from './stages/stage_6_analyse.js';
import { runStage7AtomicInsights } from './stages/stage_7_atomic_insights.js';
import { runStage8Trends } from './stages/stage_8_trends.js';
import { runStage9Qa } from './stages/stage_9_qa.js';
import { runStage10Build } from './stages/stage_10_build.js';
import { runMomCompare } from './lib/mom_compare.js';
import { PATHS, ensureDir } from './lib/paths.js';
import { onClaudeUsage } from './lib/claude.js';
import { onApifyCost } from './lib/apify_client.js';
import { onDriftEvent } from './lib/drift_log.js';
import type { RecipeId } from './lib/recipes.js';
/** Parsed command line: the subcommand, the `--report` value (if any) and every `--flag`. */
interface Args {
  command: string;
  reportId: string | null;
  flags: Record<string, string | boolean>;
}

/**
 * Parses `process.argv`-style input.
 *
 * argv[2] is the subcommand ('' when absent). Every later token starting with
 * `--` becomes a flag: if the following token is non-empty and does not itself
 * start with `--`, it is consumed as the flag's string value; otherwise the
 * flag is `true`. Tokens that are neither a flag nor a consumed value are ignored.
 *
 * @returns the command, all flags, and `reportId` — the `--report` value when it
 *          was given as a string, else null.
 */
function parseArgs(argv: string[]): Args {
  const command = argv[2] === undefined ? '' : argv[2];
  const tail = argv.slice(3);
  const flags: Record<string, string | boolean> = {};

  let cursor = 0;
  while (cursor < tail.length) {
    const token = tail[cursor] as string;
    cursor += 1;
    if (!token.startsWith('--')) continue;

    const flagName = token.slice(2);
    const candidate = tail[cursor];
    if (candidate && !candidate.startsWith('--')) {
      flags[flagName] = candidate;
      cursor += 1; // the value token has been consumed
    } else {
      flags[flagName] = true;
    }
  }

  const { report } = flags;
  return { command, reportId: typeof report === 'string' ? report : null, flags };
}
/**
 * Prints the CLI help to stderr and exits with status 1.
 *
 * The listing covers every subcommand `main` dispatches on and every flag it
 * reads, so an operator does not have to read the source to find
 * `--drop-failing` or `--target`.
 */
function usage(): never {
  console.error(`Usage: pnpm pipe <command> --report <brief-id> [flags]
Commands:
  brief            dump the validated brief to disk as JSON
  seed             Stage 1   expand the brief into hashtag tiers + search terms (writes seeds.json)
  scrape1          Stage 2   broad Apify pull, hashtag floor applied
  backfill-covers  fix-up    patch cover/mp4 URLs into pass1 records from the raw dumps
  select           Stage 3   recipe-led selection
  scrape2          Stage 4   deep enrichment per video
  validate         Stage 5   manifest gate
  analyse          Stage 6   per-video Claude analysis
  insights         Stage 7   atomic insight extraction
  trends           Stage 8   trend synthesis
  qa               Stage 9   paid/organic + coverage gates
  build            Stage 10  dashboard + claude.ai HTML bundle
Flags:
  --report <brief-id>   UUID of the brief in the briefs table
  --force               Rerun a stage even if its .done sentinel exists
  --recipe <A|B|C|D>    (select) force a specific recipe
  --custom <expr>       (select) custom filter expression
  --drop-failing        (validate) drop videos that fail the gate and backfill
  --target <name>       (build) all | compare | dashboard | html (default: all)
`);
  process.exit(1);
}
/**
 * Fetches the brief row for `reportId` and validates its `brief_yaml` payload
 * against BRIEF_INPUT.
 *
 * @returns the raw DB row together with the parsed brief.
 * @throws Error `Brief not found: <id>` when no row exists; whatever
 *         BRIEF_INPUT.parse throws when the stored payload is invalid.
 */
async function loadBrief(reportId: string): Promise<{ briefRow: NonNullable<Awaited<ReturnType<typeof getBriefById>>>; brief: ReturnType<typeof BRIEF_INPUT.parse> }> {
  const row = await getBriefById(reportId);
  if (row) {
    return { briefRow: row, brief: BRIEF_INPUT.parse(row.brief_yaml) };
  }
  throw new Error(`Brief not found: ${reportId}`);
}
/** Alias of loadBrief, used where the caller needs the DB row as well as the parsed brief. */
async function loadBriefAndRow(reportId: string): ReturnType<typeof loadBrief> {
  const loaded = await loadBrief(reportId);
  return loaded;
}
/**
 * Registers console reporters for the three pipeline event streams:
 * Claude usage and Apify cost (each with a running USD total kept in this
 * closure) and drift events (logged as warnings).
 */
function logCost(): void {
  const running = { claude: 0, apify: 0 };

  onClaudeUsage((usage, label) => {
    running.claude += usage.cost_usd;
    const spent = usage.cost_usd.toFixed(4);
    const total = running.claude.toFixed(4);
    console.log(`[claude] ${label}: ${usage.input_tokens} in / ${usage.output_tokens} out / $${spent} (running: $${total})`);
  });

  onApifyCost((event) => {
    running.apify += event.cost_usd;
    const spent = event.cost_usd.toFixed(4);
    const total = running.apify.toFixed(4);
    console.log(`[apify] ${event.label}: $${spent} (running: $${total})`);
  });

  onDriftEvent((drift) => {
    const shownId = drift.extracted_id ?? '?';
    console.warn(`[drift] ${drift.actor} ${drift.reason}: id=${shownId}`);
  });
}
/**
 * Marks stage `n` of a report as finished by writing `payload` (pretty-printed
 * JSON) to the stage's sentinel file, creating the state directory first.
 */
function writeStageDone(reportId: string, n: number, payload: Record<string, unknown>): void {
  const stateDir = PATHS.stateDir(reportId);
  ensureDir(stateDir);
  const sentinelPath = PATHS.stageDone(reportId, n);
  const body = JSON.stringify(payload, null, 2);
  writeFileSync(sentinelPath, body);
}
/**
 * Reports whether stage `n` already has a sentinel file on disk.
 * With `force` set the answer is always false, so the caller reruns the stage.
 */
function isStageDone(reportId: string, n: number, force = false): boolean {
  return !force && existsSync(PATHS.stageDone(reportId, n));
}
/**
 * CLI entry point: parses argv, dispatches on the subcommand, and closes the
 * DB connection when the subcommand finishes normally.
 *
 * Most stage commands follow the same shape: skip if the stage sentinel exists
 * (unless --force), run the stage, write the sentinel, print a one-line summary.
 * Exceptions are noted inline. Exit codes used here: 3 = manifest gate failed,
 * 4 = `build --target compare` without a prior report. usage() exits on a
 * missing or unknown command and on a missing --report.
 */
async function main(): Promise<void> {
const { command, reportId, flags } = parseArgs(process.argv);
if (!command) usage();
if (!reportId) { console.error('Missing --report <brief-id>'); usage(); }
const force = !!flags.force;
// Register the cost/drift console reporters before any stage can emit events.
logCost();
switch (command) {
case 'seed': {
if (isStageDone(reportId, 1, force)) {
console.log(`[stage 1] already done; pass --force to rerun. Output: ${PATHS.seedsJson(reportId)}`);
break;
}
const { brief } = await loadBrief(reportId);
const result = await runStage1Seeds({ reportId, brief });
writeStageDone(reportId, 1, { command, at: new Date().toISOString(), outputs: result.outputs });
console.log(`[stage 1] OK — seeds → ${result.outputs.seeds}`);
break;
}
// Not a numbered stage: no sentinel is checked or written.
case 'brief': {
const { brief } = await loadBrief(reportId);
const path = PATHS.briefJson(reportId);
writeFileSync(path, JSON.stringify(brief, null, 2));
console.log(`[brief] dumped → ${path}`);
break;
}
// One-off fix-up; works from files on disk only and never loads the brief.
case 'backfill-covers': {
const result = backfillCoversFromRawDumps(reportId);
console.log(`[backfill] patched ${result.patched} of ${result.total} pass1 records with cover/mp4 URLs`);
break;
}
case 'scrape1': {
if (isStageDone(reportId, 2, force)) {
console.log(`[stage 2] already done; pass --force to rerun. Output: ${PATHS.pass1Videos(reportId)}`);
break;
}
const { brief } = await loadBrief(reportId);
const result = await runStage2Pass1Scrape({ reportId, brief });
writeStageDone(reportId, 2, { command, at: new Date().toISOString(), outputs: result.outputs, total_videos: result.total_videos, total_cost_usd: result.total_cost_usd });
console.log(`[stage 2] OK — ${result.total_videos} videos, $${result.total_cost_usd.toFixed(2)}`);
break;
}
case 'select': {
if (isStageDone(reportId, 3, force)) {
console.log(`[stage 3] already done; pass --force to rerun. Output: ${PATHS.selectedIds(reportId)}`);
break;
}
const { brief } = await loadBrief(reportId);
// --recipe is upper-cased and cast, not validated here; --custom is passed through as-is.
const recipe = (typeof flags.recipe === 'string' ? flags.recipe.toUpperCase() : undefined) as RecipeId | undefined;
const customFilter = typeof flags.custom === 'string' ? flags.custom : undefined;
// Optional keys are only set when provided, so they are absent rather than undefined.
const argsObj: Parameters<typeof runStage3Select>[0] = { reportId, brief };
if (recipe) argsObj.forceRecipe = recipe;
if (customFilter) argsObj.customFilter = customFilter;
const result = await runStage3Select(argsObj);
writeStageDone(reportId, 3, { command, at: new Date().toISOString(), outputs: result.outputs, rules: result.rules });
console.log(`[stage 3] OK — ${result.selected.length} selected, recipe=${result.rules.recipe_id}`);
break;
}
case 'scrape2': {
if (isStageDone(reportId, 4, force)) {
console.log(`[stage 4] already done; pass --force to rerun. Output: ${PATHS.enriched(reportId)}`);
break;
}
const { brief } = await loadBrief(reportId);
const result = await runStage4Pass2Enrich({ reportId, brief });
writeStageDone(reportId, 4, { command, at: new Date().toISOString(), outputs: result.outputs, total_attempted: result.total_attempted, total_bundled: result.total_bundled, total_dropped: result.total_dropped, drift_events: result.drift_events });
console.log(`[stage 4] OK — bundled=${result.total_bundled} dropped=${result.total_dropped} drift=${result.drift_events}`);
break;
}
// The gate always reruns (no sentinel check). The sentinel is written before the
// pass/fail branch, so it exists after a failed gate too, with `passed: false`.
case 'validate': {
const dropFailing = !!flags['drop-failing'];
const { brief } = await loadBrief(reportId);
const result = await runStage5Manifest({ reportId, brief, dropFailing });
writeStageDone(reportId, 5, { command, at: new Date().toISOString(), passed: result.passed, coverage_pct: result.manifest.summary.coverage_pct, backfill_rounds: result.backfill_rounds });
if (!result.passed) {
console.error(`[stage 5] FAIL — coverage ${result.manifest.summary.coverage_pct}% (${result.manifest.summary.all_ok}/${result.manifest.selected_count}). Missing per video printed in manifest.json.`);
process.exit(3);
}
console.log(`[stage 5] PASS — coverage 100%`);
break;
}
// Stages 6, 7 and 9 take only the report id; they do not load the brief.
case 'analyse': {
if (isStageDone(reportId, 6, force)) {
console.log(`[stage 6] already done; pass --force to rerun.`);
break;
}
const result = await runStage6Analyse(reportId);
writeStageDone(reportId, 6, { command, at: new Date().toISOString(), total: result.total, cached: result.cached, fresh: result.fresh });
console.log(`[stage 6] OK — ${result.total} analyses (${result.cached} cached, ${result.fresh} fresh)`);
break;
}
case 'insights': {
if (isStageDone(reportId, 7, force)) {
console.log(`[stage 7] already done; pass --force to rerun.`);
break;
}
const result = await runStage7AtomicInsights(reportId);
writeStageDone(reportId, 7, { command, at: new Date().toISOString(), total_insights: result.total_insights, by_type: result.by_type });
console.log(`[stage 7] OK — ${result.total_insights} atomic insights (hook=${result.by_type.hook} visual=${result.by_type.visual} audio=${result.by_type.audio} narrative=${result.by_type.narrative})`);
break;
}
case 'trends': {
if (isStageDone(reportId, 8, force)) {
console.log(`[stage 8] already done; pass --force to rerun.`);
break;
}
const { brief } = await loadBrief(reportId);
const result = await runStage8Trends(reportId, brief);
writeStageDone(reportId, 8, { command, at: new Date().toISOString(), total_trends: result.total_trends, core_trends: result.core_trends, peripheral_trends: result.peripheral_trends, dropped_trends: result.dropped_trends });
console.log(`[stage 8] OK — ${result.total_trends} trends across ${result.categories.length} categories`);
break;
}
case 'qa': {
if (isStageDone(reportId, 9, force)) {
console.log(`[stage 9] already done; pass --force to rerun.`);
break;
}
const result = await runStage9Qa(reportId);
writeStageDone(reportId, 9, { command, at: new Date().toISOString(), paid_creators: result.paid_creators, mixed_creators: result.mixed_creators, coverage_pct: result.coverage_pct });
console.log(`[stage 9] OK — paid=${result.paid_creators} mixed=${result.mixed_creators} coverage=${result.coverage_pct}%`);
break;
}
// build always reruns (no sentinel check). --target picks which parts run:
// 'all' (default) = compare when a prior report is linked, then the stage 10 build;
// 'compare' = compare only; 'dashboard' and 'html' both run the same stage 10 build.
// Any other --target value matches neither branch and the command does nothing.
case 'build': {
const target = typeof flags.target === 'string' ? flags.target : 'all';
const { brief, briefRow } = await loadBriefAndRow(reportId);
if (target === 'all' || target === 'compare') {
if (briefRow.prior_report_id) {
console.log(`[mom] running compare against prior_report_id=${briefRow.prior_report_id}`);
await runMomCompare(reportId, briefRow.prior_report_id);
} else if (target === 'compare') {
// With target=all a missing prior report is skipped silently; an explicit
// compare request without one is an error.
console.error('[mom] target=compare but brief has no prior_report_id; refusing per §16');
process.exit(4);
}
}
if (target === 'all' || target === 'dashboard' || target === 'html') {
const result = await runStage10Build(reportId, brief);
writeStageDone(reportId, 10, { command, at: new Date().toISOString(), trend_count: result.trend_count, html_size_bytes: result.html_size_bytes });
console.log(`[stage 10] OK — dataset=${(result.dataset_size_bytes / 1024).toFixed(1)} KB, html=${(result.html_size_bytes / 1024).toFixed(1)} KB, trends=${result.trend_count}`);
}
break;
}
default:
console.error(`Unknown command: ${command}`);
usage();
}
// Reached only when the subcommand completed without calling process.exit.
await sql.end({ timeout: 1 });
}
// Top-level runner: any rejection from main() is printed (message only for Error
// instances, the raw value otherwise) and the process exits with status 1.
main().catch((err) => {
console.error('[pipe] error:', err instanceof Error ? err.message : err);
process.exit(1);
});

View file

@ -0,0 +1,141 @@
// Apify wrapper for V2.
// Adapted from V1 agents/social-listening/apify.ts.
// Changes vs V1:
// - V2 is TikTok-only (Instagram/YouTube/Twitter actors removed).
// - Cost callback signature switched from (costUsd, label, runId) to a structured event.
// - Soft-cap pattern preserved (V1 stage3-discovery-scrape.ts:199-232).
// - Returns raw items; id normalisation happens in the caller via extractTikTokId.
import { envStr, envBool } from '../../server/lib/env.js';
/**
 * Apify actor ids for the TikTok scrapes V2 uses, keyed by purpose.
 * NOTE(review): the values are opaque ids — presumably what callers pass to
 * runActor as `actorId`; confirm against the Apify console before changing any.
 */
export const ACTORS = {
TIKTOK_HASHTAG: 'GdWCkxBtKWOsKjdch',
TIKTOK_PROFILE: 'OtzYfK1ndEGdwWFKQ',
TIKTOK_COMMENTS: 'BDec00yAmCm1QbMEI',
TIKTOK_TRANSCRIPTS:'emQXBCL3xePZYgJyn',
} as const;
// Base URL for every Apify REST call made by runActor.
const APIFY_BASE = 'https://api.apify.com/v2';
// Token is read once at module load; APIFY_TOKEN wins, APIFY_API_TOKEN is the fallback name.
const APIFY_TOKEN = envStr('APIFY_TOKEN') || envStr('APIFY_API_TOKEN');
// Live scraping is opt-in: unless APIFY_LIVE_APPROVED is set, runActor returns a dry-run result.
const IS_LIVE = envBool('APIFY_LIVE_APPROVED', false);
// TEST_MODE selects the smaller actor limits in defaultLimits().
const IS_TEST = envBool('TEST_MODE', false);
/** True when APIFY_LIVE_APPROVED was set at module load. */
export function isLive(): boolean { return IS_LIVE; }
/** True when TEST_MODE was set at module load. */
export function isTest(): boolean { return IS_TEST; }
// ─── Budget tracking (per-pipeline, reset between reports) ─────────────
// Module-level budget state. `_running` is the USD spent since the last
// resetBudget; the hard ceiling defaults to "no limit" and the soft cap to "unset".
let _running = 0;
let _hardCeiling: number = Number.POSITIVE_INFINITY;
let _softCap: number | null = null;

/** Starts a fresh budget: zero spend, no soft cap, and the given hard ceiling (USD). */
export function resetBudget(opts: { hardCeilingUsd: number }): void {
  const { hardCeilingUsd } = opts;
  _hardCeiling = hardCeilingUsd;
  _softCap = null;
  _running = 0;
}

/** USD spent since the last resetBudget. */
export function getRunningCost(): number {
  return _running;
}

/** Sets the soft cap in USD, or clears it with null. */
export function setSoftCap(cap: number | null): void {
  _softCap = cap;
}

/** Current soft cap in USD, or null when none is set. */
export function getSoftCap(): number | null {
  return _softCap;
}

/** True once spend has reached the soft cap (when one is set) or the hard ceiling. */
export function isBudgetExceeded(): boolean {
  const softCapHit = _softCap !== null && _running >= _softCap;
  return softCapHit || _running >= _hardCeiling;
}
/** Structured cost notification that runActor passes to the onApifyCost callback. */
export interface ApifyCostEvent {
/** USD cost of the run as reported by Apify (0 when the cost lookup failed). */
cost_usd: number;
/** Caller-supplied label identifying the scrape in logs. */
label: string;
/** Apify run id. */
run_id: string;
/** Id of the run's default dataset. */
dataset_id: string;
/** Actor that was run. */
actor_id: string;
}
// The single registered cost listener (null until onApifyCost is called).
let _onCost: ((e: ApifyCostEvent) => void) | null = null;

/**
 * Registers the callback that receives one ApifyCostEvent per run.
 * Only one listener is kept: a later call replaces the earlier callback.
 */
export function onApifyCost(cb: (e: ApifyCostEvent) => void): void {
  _onCost = cb;
}
/** Result of runActor. `items` are raw actor output; id normalisation is the caller's job. */
export interface ApifyRunResult<T = unknown> {
/** Dataset items; always empty for DRY_RUN and BUDGET_SKIP. */
items: T[];
/** Apify run id, or the placeholder 'dry-run' / 'budget-skip'. */
run_id: string;
/** Default dataset id of the run, or the same placeholder as run_id. */
dataset_id: string;
/** USD cost of the run; 0 for skipped runs. */
cost_usd: number;
/** OK = actor ran; BUDGET_SKIP = budget already exhausted; DRY_RUN = live mode not approved. */
status: 'OK' | 'BUDGET_SKIP' | 'DRY_RUN';
}
/**
 * Starts an Apify actor, polls it to completion, and returns its dataset items.
 *
 * Short-circuits without any network call when live mode is not approved
 * (status DRY_RUN) or the budget is already exhausted (status BUDGET_SKIP).
 *
 * Spend is added to the running budget and reported to the onApifyCost listener
 * as soon as the run reaches a terminal state, before the run outcome is checked
 * and before the dataset is downloaded. A failed run or a failed download
 * therefore still counts against the budget.
 *
 * @param actorId Apify actor id (see ACTORS).
 * @param input   Actor input, sent as the JSON body of the run request.
 * @param label   Human-readable tag used in log lines and cost events.
 * @returns items plus run/dataset ids and cost. `items` is empty (with a
 *          warning logged) if the dataset response is non-OK or not a JSON array.
 * @throws Error if the token is missing, the run cannot be started, polling
 *         exceeds the poll limit, or the run ends in a non-SUCCEEDED state.
 */
export async function runActor<T = unknown>(
  actorId: string,
  input: Record<string, unknown>,
  label: string,
): Promise<ApifyRunResult<T>> {
  if (!IS_LIVE) {
    console.log(`[apify dry-run] ${label} actor=${actorId}`);
    return { items: [] as T[], run_id: 'dry-run', dataset_id: 'dry-run', cost_usd: 0, status: 'DRY_RUN' };
  }
  if (isBudgetExceeded()) {
    console.log(`[apify] budget $${_running.toFixed(2)} reached — skipping ${label}`);
    return { items: [] as T[], run_id: 'budget-skip', dataset_id: 'budget-skip', cost_usd: 0, status: 'BUDGET_SKIP' };
  }
  if (!APIFY_TOKEN) throw new Error('APIFY_TOKEN not set');
  const authHeaders = { Authorization: `Bearer ${APIFY_TOKEN}` };

  // 1. Start the run.
  const startRes = await fetch(`${APIFY_BASE}/acts/${actorId}/runs`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', ...authHeaders },
    body: JSON.stringify(input),
  });
  if (!startRes.ok) {
    const errText = await startRes.text();
    throw new Error(`Apify start failed for ${label}: ${startRes.status} ${errText}`);
  }
  const startData = await startRes.json() as { data: { id: string; defaultDatasetId: string; status: string } };
  const runId = startData.data.id;
  const datasetId = startData.data.defaultDatasetId;
  console.log(`[apify] ${label} started runId=${runId}`);

  // 2. Poll every 5 s until the run reaches a terminal state.
  let status = startData.data.status;
  let pollCount = 0;
  const MAX_POLLS = 120;
  while (status !== 'SUCCEEDED' && status !== 'FAILED' && status !== 'ABORTED' && status !== 'TIMED-OUT') {
    if (pollCount++ > MAX_POLLS) throw new Error(`Apify ${label} timed out`);
    await new Promise((r) => setTimeout(r, 5000));
    try {
      const pollRes = await fetch(`${APIFY_BASE}/actor-runs/${runId}`, { headers: authHeaders });
      if (!pollRes.ok) throw new Error(`HTTP ${pollRes.status}`);
      const pollData = await pollRes.json() as { data?: { status?: string } };
      if (pollData.data?.status) status = pollData.data.status;
    } catch (err) {
      // Transient failures are tolerated, but logged so a persistent one
      // (e.g. a revoked token) is visible before the poll limit is hit.
      console.warn(`[apify] ${label} poll failed (${err instanceof Error ? err.message : String(err)}) — retrying`);
    }
    if (pollCount % 6 === 0) console.log(`[apify] ${label} status=${status} (${pollCount * 5}s)`);
  }

  // 3. Look up the cost (best effort) and account for it immediately.
  let costUsd = 0;
  try {
    const costRes = await fetch(`${APIFY_BASE}/actor-runs/${runId}`, { headers: authHeaders });
    if (costRes.ok) {
      const costData = await costRes.json() as { data?: { usageTotalUsd?: number } };
      const reported = costData.data?.usageTotalUsd;
      if (typeof reported === 'number' && Number.isFinite(reported)) costUsd = reported;
    } else {
      console.warn(`[apify] ${label} cost lookup failed: HTTP ${costRes.status} — recording $0`);
    }
  } catch (err) {
    console.warn(`[apify] ${label} cost lookup failed (${err instanceof Error ? err.message : String(err)}) — recording $0`);
  }
  _running += costUsd;
  if (_onCost) _onCost({ cost_usd: costUsd, label, run_id: runId, dataset_id: datasetId, actor_id: actorId });

  if (status !== 'SUCCEEDED') throw new Error(`Apify ${label} ended ${status}`);

  // 4. Download the dataset. Problems here yield an empty result, but never silently.
  const itemsRes = await fetch(`${APIFY_BASE}/datasets/${datasetId}/items?format=json`, { headers: authHeaders });
  let items: T[] = [];
  if (!itemsRes.ok) {
    console.warn(`[apify] ${label} dataset ${datasetId} fetch failed: HTTP ${itemsRes.status} — returning 0 items`);
  } else {
    const ct = itemsRes.headers.get('content-type') || '';
    const text = await itemsRes.text();
    if (ct.includes('json') && text.trim().startsWith('[')) {
      try {
        items = JSON.parse(text) as T[];
      } catch (err) {
        console.warn(`[apify] ${label} dataset ${datasetId} is not valid JSON (${err instanceof Error ? err.message : String(err)}) — returning 0 items`);
      }
    } else {
      console.warn(`[apify] ${label} unexpected response (${ct})`);
    }
  }

  console.log(`[apify] ${label} done ${items.length} items $${costUsd.toFixed(4)} (running $${_running.toFixed(2)})`);
  return { items, run_id: runId, dataset_id: datasetId, cost_usd: costUsd, status: 'OK' };
}
/**
 * Result limits sent to the actor as part of its Apify input.
 * TEST_MODE uses 50 for all three limits; otherwise the limits are
 * 200 per page, 100 results, 200 max results.
 */
export function defaultLimits() {
  if (IS_TEST) {
    return { resultsPerPage: 50, resultsLimit: 50, maxResults: 50 };
  }
  return { resultsPerPage: 200, resultsLimit: 100, maxResults: 200 };
}

View file

@ -0,0 +1,68 @@
// One-shot fix-up: read pass1/raw/*.json dumps and patch cover URLs into existing
// pass1_videos.json + per-video metadata.json. Used when Stage 2's normaliseRaw
// missed a field shape and we don't want to re-spend Apify by re-running Stage 2.
import { readFileSync, writeFileSync, readdirSync, existsSync, statSync } from 'node:fs';
import { join } from 'node:path';
import { extractTikTokId } from './ids.js';
import { PATHS } from './paths.js';
import type { Pass1Video } from '../stages/stage_2_pass1_scrape.js';
/**
 * Loose shape of one item in a pass1/raw/*.json dump. Every field is optional
 * because only the fields read by the backfill are declared, and a given dump
 * may carry any subset of them.
 */
interface RawAny {
/** Video id; preferred over webVideoUrl when resolving the canonical id. */
id?: string;
/** Video URL; used to resolve the canonical id when `id` is absent. */
webVideoUrl?: string;
/** Cover sources are tried in order coverUrl → originalCoverUrl; downloadAddr is the first-choice download URL. */
videoMeta?: { coverUrl?: string; originalCoverUrl?: string; downloadAddr?: string };
/** First entry is the fallback download URL. */
mediaUrls?: string[];
/** `default` is the last-resort cover source. */
covers?: { default?: string };
}
/**
 * Patches missing cover and download URLs into pass1_videos.json and into any
 * existing per-video metadata.json, using the raw dumps under pass1/raw/.
 *
 * Existing non-empty values are never overwritten. When several raw items
 * describe the same video, cover and download URL are merged independently:
 * each keeps the first non-empty value seen, so an item that contributes a
 * cover cannot wipe out a download URL found earlier (or vice versa).
 *
 * @param reportId Report whose pass1 and enriched directories are patched.
 * @returns `patched` — number of pass1 records that gained a cover and/or a
 *          download URL; `total` — number of pass1 records.
 * @throws Error if pass1/raw or pass1_videos.json is missing. JSON parse errors
 *         in any file that is read also propagate.
 */
export function backfillCoversFromRawDumps(reportId: string): { patched: number; total: number } {
  const rawDir = join(PATHS.pass1(reportId), 'raw');
  const pass1Path = PATHS.pass1Videos(reportId);
  if (!existsSync(rawDir) || !existsSync(pass1Path)) {
    throw new Error('pass1/raw or pass1_videos.json missing — nothing to backfill');
  }

  // Build id → { cover, download_url } from the raw dumps.
  const fix = new Map<string, { cover: string | null; download_url: string | null }>();
  for (const f of readdirSync(rawDir)) {
    if (!f.endsWith('.json')) continue;
    const parsed: unknown = JSON.parse(readFileSync(join(rawDir, f), 'utf-8'));
    if (!Array.isArray(parsed)) {
      console.warn(`[backfill] ${f} is not a JSON array — skipped`);
      continue;
    }
    for (const r of parsed as Array<RawAny | null>) {
      if (!r || typeof r !== 'object') continue;
      const id = extractTikTokId(r.id || r.webVideoUrl || '');
      if (!id) continue;
      const cover = r.videoMeta?.coverUrl ?? r.videoMeta?.originalCoverUrl ?? r.covers?.default ?? null;
      const downloadUrl = r.videoMeta?.downloadAddr ?? r.mediaUrls?.[0] ?? null;
      const prev = fix.get(id);
      fix.set(id, prev
        ? { cover: prev.cover || cover, download_url: prev.download_url || downloadUrl }
        : { cover, download_url: downloadUrl });
    }
  }

  // Patch pass1_videos.json.
  const pass1: Pass1Video[] = JSON.parse(readFileSync(pass1Path, 'utf-8'));
  let patched = 0;
  for (const v of pass1) {
    const f = fix.get(v.id);
    if (!f) continue;
    let changed = false;
    if (!v.cover && f.cover) { v.cover = f.cover; changed = true; }
    if (!v.download_url && f.download_url) { v.download_url = f.download_url; changed = true; }
    if (changed) patched++;
  }
  writeFileSync(pass1Path, JSON.stringify(pass1, null, 2));

  // Patch per-video metadata.json files (only those that exist already from Stage 4).
  const enrichedDir = PATHS.enriched(reportId);
  if (existsSync(enrichedDir)) {
    for (const id of readdirSync(enrichedDir)) {
      if (!statSync(join(enrichedDir, id)).isDirectory()) continue;
      const metaPath = join(enrichedDir, id, 'metadata.json');
      if (!existsSync(metaPath)) continue;
      const meta: Pass1Video = JSON.parse(readFileSync(metaPath, 'utf-8'));
      const f = fix.get(id);
      if (!f) continue;
      let changed = false;
      if (!meta.cover && f.cover) { meta.cover = f.cover; changed = true; }
      if (!meta.download_url && f.download_url) { meta.download_url = f.download_url; changed = true; }
      if (changed) writeFileSync(metaPath, JSON.stringify(meta, null, 2));
    }
  }
  return { patched, total: pass1.length };
}

109
v2/pipeline/lib/claude.ts Normal file
View file

@ -0,0 +1,109 @@
// Thin Claude API client. Adapted from V1 agents/social-listening/claude-cli.ts:62-285,
// trimmed to what V2 stages need: text + JSON modes, retry-on-invalid-JSON 2x, usage callback.
import { envStr } from '../../server/lib/env.js';
// Model used when a caller does not pass `opts.model`; also the pricing fallback
// for models missing from PRICING.
const DEFAULT_MODEL = 'claude-opus-4-7';
// Anthropic Messages endpoint used for every call.
const API_BASE = 'https://api.anthropic.com/v1/messages';
// Price table used by cost(): USD per one million tokens, input and output priced separately.
const PRICING: Record<string, { input: number; output: number }> = {
'claude-opus-4-7': { input: 5, output: 25 },
'claude-opus-4-6': { input: 5, output: 25 },
'claude-sonnet-4-6': { input: 3, output: 15 },
'claude-haiku-4-5': { input: 1, output: 5 },
};
/** Token usage and derived cost of one API call, as passed to the onClaudeUsage callback. */
export interface Usage {
/** Input tokens as reported by the API response. */
input_tokens: number;
/** Output tokens as reported by the API response. */
output_tokens: number;
/** Cost in USD computed locally from PRICING (not reported by the API). */
cost_usd: number;
/** Model name that was sent in the request. */
model: string;
}
// The single registered usage listener (null until onClaudeUsage is called).
let onUsageCb: ((u: Usage, label: string) => void) | null = null;

/**
 * Registers the callback invoked with the usage and label of every callClaude call.
 * Only one listener is kept: a later call replaces the earlier callback.
 */
export function onClaudeUsage(cb: (u: Usage, label: string) => void): void {
  const listener = cb;
  onUsageCb = listener;
}
/**
 * USD cost of a call: `ti` input tokens and `to` output tokens, priced per
 * million tokens from PRICING. Models missing from the table are priced as
 * DEFAULT_MODEL.
 */
function cost(model: string, ti: number, to: number): number {
  const rates = PRICING[model] ?? PRICING[DEFAULT_MODEL]!;
  const inputUsd = ti * rates.input / 1_000_000;
  const outputUsd = to * rates.output / 1_000_000;
  return inputUsd + outputUsd;
}
/** The subset of the Messages API response body that this client reads. */
interface ApiResponse {
/** Content blocks; only blocks with type 'text' contribute to the returned text. */
content: Array<{ type: string; text?: string }>;
/** Why generation stopped, as reported by the API. */
stop_reason: string;
/** Token counts used to compute cost. */
usage: { input_tokens: number; output_tokens: number };
}
/**
 * Sends one single-turn user message to the Messages API and returns the
 * concatenated text blocks plus usage.
 *
 * A response that stopped because it hit `max_tokens` is still returned, but a
 * warning is logged: its text is cut off mid-output, which is the usual reason
 * a downstream JSON parse fails.
 *
 * @param prompt    Full prompt, sent as the only message.
 * @param model     Model name; also selects the pricing row.
 * @param maxTokens Output token limit for the request.
 * @returns trimmed text (text blocks joined with '\n') and usage with computed cost.
 * @throws Error if ANTHROPIC_API_KEY is unset or the API returns a non-2xx status.
 */
async function callRaw(prompt: string, model: string, maxTokens: number): Promise<{ text: string; usage: Usage }> {
  const apiKey = envStr('ANTHROPIC_API_KEY');
  if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');
  const res = await fetch(API_BASE, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      // No temperature is sent; sampling settings are left at the API defaults.
      model,
      max_tokens: maxTokens,
      messages: [{ role: 'user', content: prompt }],
    }),
  });
  if (!res.ok) {
    const errText = await res.text();
    throw new Error(`Anthropic API error ${res.status}: ${errText}`);
  }
  const data = await res.json() as ApiResponse;
  if (data.stop_reason === 'max_tokens') {
    console.warn(`[claude] ${model} response stopped at max_tokens=${maxTokens}; output is truncated`);
  }
  const text = data.content.filter((b) => b.type === 'text').map((b) => b.text ?? '').join('\n').trim();
  const usage: Usage = {
    input_tokens: data.usage.input_tokens,
    output_tokens: data.usage.output_tokens,
    cost_usd: cost(model, data.usage.input_tokens, data.usage.output_tokens),
    model,
  };
  return { text, usage };
}
/**
 * Text-mode Claude call.
 *
 * @param prompt sent as the only user message
 * @param opts.model defaults to DEFAULT_MODEL
 * @param opts.maxTokens defaults to 16384
 * @param opts.label tag passed to the usage callback; defaults to 'call'
 * @returns the reply text
 */
export async function callClaude(
  prompt: string,
  opts: { model?: string; maxTokens?: number; label?: string } = {},
): Promise<string> {
  const model = opts.model ?? DEFAULT_MODEL;
  const maxTokens = opts.maxTokens ?? 16384;
  const reply = await callRaw(prompt, model, maxTokens);
  // Usage is reported only when a listener has been registered.
  onUsageCb?.(reply.usage, opts.label ?? 'call');
  return reply.text;
}
/**
 * Best-effort extraction of a JSON value from a model reply.
 *
 * Candidates are tried in order and the first one that parses wins:
 *   1. the whole reply, trimmed (a clean JSON reply needs no slicing);
 *   2. the body of the first ```json fence, else of the first bare ``` fence;
 *   3. the greedy `{...}` and `[...]` spans, longest first, so the outermost
 *      structure is preferred over something nested inside it.
 *
 * The ordering in step 3 matters: a reply such as `[{"a":1}]` contains a
 * parseable `{"a":1}` span, and trying the object span first would silently
 * unwrap the array into its only element.
 *
 * @returns the parsed value cast to T (NOT validated against T), or null when
 *   no candidate parses. A reply that is literally `null` is therefore
 *   indistinguishable from a parse failure.
 */
function tryParseJson<T>(text: string): T | null {
  const fence = text.match(/```json\s*\n?([\s\S]*?)```/) ?? text.match(/```\s*\n?([\s\S]*?)```/);
  // Stable sort: on equal length the object span keeps priority over the array span.
  const spans = [text.match(/(\{[\s\S]*\})/)?.[1], text.match(/(\[[\s\S]*\])/)?.[1]]
    .filter((s): s is string => !!s)
    .sort((a, b) => b.length - a.length);
  const candidates = [text.trim(), fence?.[1]?.trim(), ...spans].filter((x): x is string => !!x);
  for (const c of candidates) {
    try { return JSON.parse(c) as T; } catch { /* try the next candidate */ }
  }
  return null;
}
/**
 * JSON-mode Claude call: appends a "JSON only" instruction to the prompt and
 * parses the reply with tryParseJson().
 *
 * The same prompt is sent up to three times (one initial attempt plus two
 * retries); every unparseable reply is logged with its first 300 characters.
 *
 * @returns the first reply that parses, cast to T without schema validation
 * @throws Error after the third unparseable reply; errors from the underlying
 *   call propagate immediately and are not retried
 */
export async function callClaudeJSON<T>(
  prompt: string,
  opts: { model?: string; maxTokens?: number; label?: string } = {},
): Promise<T> {
  const fullPrompt = `${prompt}\n\nCRITICAL: Return ONLY valid JSON. No markdown outside the JSON. No prose before or after.`;
  const totalAttempts = 3;
  let lastErr = '';
  let attemptNo = 1;
  while (attemptNo <= totalAttempts) {
    const raw = await callClaude(fullPrompt, opts);
    const parsed = tryParseJson<T>(raw);
    if (parsed !== null) return parsed;
    lastErr = `attempt ${attemptNo}: could not parse JSON. First 300 chars: ${raw.slice(0, 300)}`;
    console.warn(`[claude] ${lastErr}`);
    attemptNo += 1;
  }
  throw new Error(`callClaudeJSON failed after ${totalAttempts} attempts. ${lastErr}`);
}

View file

@ -0,0 +1,54 @@
// Drift log: when an Apify actor returns a video URL/id we can't match to our
// canonical TikTok id, we record it. This is the V2 commitment that drift is
// LOUD, not silent. Every line in drift_log.jsonl points at a specific actor
// response that needs human review.
import { appendFileSync, mkdirSync, existsSync, writeFileSync } from 'node:fs';
import { dirname } from 'node:path';
import { PATHS } from './paths.js';
/** Machine-readable reason an actor response was recorded as drift. */
export type DriftReason =
| 'no-id-extracted'
| 'id-not-in-selection'
| 'duplicate-id-different-url'
| 'metadata-missing-fields'
| 'date-out-of-window';
/** One line of drift_log.jsonl, as serialised by logDrift(). */
export interface DriftEntry {
/** ISO-8601 timestamp; logDrift() fills this in at write time. */
at: string;
actor: string; // human label e.g. "TIKTOK_TRANSCRIPTS"
reason: DriftReason;
/** URL as returned by the actor, or null when none was available. */
source_url: string | null;
/** Id extracted from the response, or null when none was extracted. */
extracted_id: string | null;
/** Free-form extra detail supplied by the caller. */
context?: Record<string, unknown>;
}
// Number of logDrift() calls since module load or the last resetDriftCounter().
let driftCounter = 0;
// Optional listener that logDrift() invokes with each entry after appending it to the log.
let onDrift: ((entry: DriftEntry) => void) | null = null;
/** Register the drift listener. Only one is kept: a later call replaces the earlier callback. */
export function onDriftEvent(cb: (entry: DriftEntry) => void): void {
onDrift = cb;
}
/** Current value of the in-process drift counter (not derived from the log file). */
export function getDriftCount(): number {
return driftCounter;
}
/** Zero the in-process drift counter; the log file on disk is left untouched. */
export function resetDriftCounter(): void {
driftCounter = 0;
}
/**
 * Empty the report's drift log at the start of a fresh Stage 4 run.
 * A log that does not exist yet is not created; the in-process counter is
 * not reset here.
 */
export function clearDriftLog(reportId: string): void {
  const logFile = PATHS.driftLog(reportId);
  if (!existsSync(logFile)) return;
  writeFileSync(logFile, '');
}
/**
 * Record one drift event: timestamp it, append it as a JSON line to the
 * report's drift log (creating the parent directory if needed), bump the
 * in-process counter, notify the registered listener and emit a console
 * warning.
 *
 * @param reportId report whose drift log receives the line
 * @param entry event details; `at` is filled in here
 */
export function logDrift(reportId: string, entry: Omit<DriftEntry, 'at'>): void {
  const stamped: DriftEntry = { at: new Date().toISOString(), ...entry };
  const logFile = PATHS.driftLog(reportId);
  const parentDir = dirname(logFile);
  if (!existsSync(parentDir)) mkdirSync(parentDir, { recursive: true });
  appendFileSync(logFile, JSON.stringify(stamped) + '\n');
  driftCounter += 1;
  onDrift?.(stamped);
  const idLabel = entry.extracted_id ?? '(none)';
  const urlLabel = entry.source_url ?? '(none)';
  console.warn(`[drift] ${entry.actor} ${entry.reason}: id=${idLabel} url=${urlLabel}`);
}

View file

@ -0,0 +1,60 @@
// The V2 quality knob — drops Apify items below a per-brief engagement threshold.
// Applied AFTER actors return (Apify-side filtering is best-effort: TikTok hashtag
// scraper accepts `minPlayCount` but most others don't, so we always re-validate
// locally for correctness).
/** Per-brief thresholds enforced by applyEngagementFloor(). */
export interface EngagementFloor {
/** Items with fewer likes than this are dropped. */
min_likes: number;
/** Items with fewer plays than this are dropped. */
min_plays: number;
/** Minimum computeStlPct() value; the check is skipped entirely when this is 0 or less. */
min_stl_pct: number;
}
/** Tallies from one applyEngagementFloor() pass; each dropped item is counted under the first check it fails. */
export interface EngagementCounters {
/** Number of items passed in. */
raw_returned: number;
dropped_min_likes: number;
dropped_min_plays: number;
dropped_min_stl: number;
/** Items whose plays value is missing, zero or negative. */
dropped_zero_plays: number;
/** Number of items that passed every check. */
kept_after_floor: number;
}
/** Minimum set of engagement fields an item needs to be run through the floor. */
export interface FloorableItem {
plays: number;
likes: number;
saves?: number;
comments_count?: number;
shares?: number;
}
/**
 * Engagement rate as a percentage of plays:
 * (likes + saves + comments + shares) / plays * 100.
 *
 * Missing counters count as 0. Returns 0 when plays is missing, zero or
 * negative, so the division is never by zero.
 */
export function computeStlPct(item: FloorableItem): number {
  const plays = item.plays;
  if (!plays || plays <= 0) return 0;
  const likes = item.likes ?? 0;
  const saves = item.saves ?? 0;
  const comments = item.comments_count ?? 0;
  const shares = item.shares ?? 0;
  const interactions = likes + saves + comments + shares;
  return (interactions / plays) * 100;
}
/**
 * Split scraped items into those that clear the engagement floor and a tally
 * of why the rest were dropped.
 *
 * Checks run in a fixed order and an item is counted under the FIRST one it
 * fails: no plays → below min_plays → below min_likes → below min_stl_pct.
 * The STL check is skipped when `floor.min_stl_pct` is 0 or less.
 *
 * @returns `kept` in input order, plus the counters for this pass
 */
export function applyEngagementFloor<T extends FloorableItem>(
  items: T[],
  floor: EngagementFloor,
): { kept: T[]; counters: EngagementCounters } {
  type DropKey = 'dropped_zero_plays' | 'dropped_min_plays' | 'dropped_min_likes' | 'dropped_min_stl';

  // Name of the counter to bump for a rejected item, or null when it passes.
  const rejectionOf = (video: T): DropKey | null => {
    if (!video.plays || video.plays <= 0) return 'dropped_zero_plays';
    if (video.plays < floor.min_plays) return 'dropped_min_plays';
    if (video.likes < floor.min_likes) return 'dropped_min_likes';
    if (floor.min_stl_pct > 0 && computeStlPct(video) < floor.min_stl_pct) return 'dropped_min_stl';
    return null;
  };

  const counters: EngagementCounters = {
    raw_returned: items.length,
    dropped_min_likes: 0,
    dropped_min_plays: 0,
    dropped_min_stl: 0,
    dropped_zero_plays: 0,
    kept_after_floor: 0,
  };
  const kept: T[] = [];
  for (const video of items) {
    const reason = rejectionOf(video);
    if (reason === null) kept.push(video);
    else counters[reason] += 1;
  }
  counters.kept_after_floor = kept.length;
  return { kept, counters };
}

56
v2/pipeline/lib/frames.ts Normal file
View file

@ -0,0 +1,56 @@
// ffmpeg-based frame extraction. Cap based on video length per V3 brief §4 stage 4:
// ≤15s : 1 fps (max 15)
// 16–60s: 1/2 fps (max 30)
// 61–180s: 1/4 fps (max 45)
// >180s : 1/6 fps (max 60)
// All frames downscaled to 720px wide jpg.
import { spawnSync } from 'node:child_process';
import { mkdirSync, existsSync, readdirSync } from 'node:fs';
/** Inputs to extractFrames(). */
export interface FrameExtractOpts {
/** path to local mp4 file. */
mp4Path: string;
/** output directory; will be created. Files written as 0001.jpg, 0002.jpg, … */
outDir: string;
/** video duration in seconds (used to pick fps + cap). */
durationSec: number;
/** override fps and cap (testing only). */
override?: { fps?: number; cap?: number };
}
/** Outcome of extractFrames(). */
export interface FrameExtractResult {
/** True only when ffmpeg exited 0 and at least one .jpg is present in outDir. */
ok: boolean;
frames: string[]; // basenames written
/** Sampling rate that was used; 0 when the mp4 was missing. */
fps: number;
/** Frame cap that was used; 0 when the mp4 was missing. */
cap: number;
/** Failure description; set when the mp4 is missing or ffmpeg fails. */
error?: string;
}
/**
 * Pick the sampling rate and frame cap for a video of the given length.
 *
 * Tiers are inclusive upper bounds on duration; anything longer than the last
 * tier (and any duration that compares false against all of them, such as NaN)
 * gets the sparsest setting.
 */
export function chooseFpsAndCap(durationSec: number): { fps: number; cap: number } {
  const tiers = [
    { upTo: 15, fps: 1, cap: 15 },
    { upTo: 60, fps: 0.5, cap: 30 },
    { upTo: 180, fps: 0.25, cap: 45 },
  ];
  for (const tier of tiers) {
    if (durationSec <= tier.upTo) return { fps: tier.fps, cap: tier.cap };
  }
  return { fps: 1 / 6, cap: 60 };
}
/**
 * Extract still frames from a local mp4 by running ffmpeg synchronously.
 *
 * Frames are sampled at the chosen fps, scaled to 720px wide, capped at `cap`
 * frames and written to `outDir` as 0001.jpg, 0002.jpg, ….
 *
 * Never throws for ffmpeg problems: a missing mp4, an ffmpeg binary that
 * cannot be started and a non-zero ffmpeg exit all come back as
 * `{ ok: false, error }`.
 *
 * Note: `frames` lists every .jpg found in `outDir` after the run, so files
 * left behind by an earlier run into the same directory are included.
 *
 * @param opts see FrameExtractOpts; `override` replaces the duration-based
 *   fps/cap choice (missing override fields default to fps 1, cap 15)
 */
export function extractFrames(opts: FrameExtractOpts): FrameExtractResult {
  if (!existsSync(opts.mp4Path)) return { ok: false, frames: [], fps: 0, cap: 0, error: 'mp4 missing' };
  mkdirSync(opts.outDir, { recursive: true });
  const { fps, cap } = opts.override
    ? { fps: opts.override.fps ?? 1, cap: opts.override.cap ?? 15 }
    : chooseFpsAndCap(opts.durationSec);
  const args = [
    '-y', '-i', opts.mp4Path,
    '-vf', `fps=${fps},scale=720:-2`,
    '-frames:v', String(cap),
    '-q:v', '4',
    `${opts.outDir}/%04d.jpg`,
  ];
  const res = spawnSync('ffmpeg', args, { encoding: 'utf-8' });
  // When the process could not be started (e.g. ffmpeg is not installed),
  // spawnSync reports it via `error` and leaves status/stderr null.
  if (res.error) {
    return { ok: false, frames: [], fps, cap, error: `ffmpeg spawn failed: ${res.error.message}` };
  }
  if (res.status !== 0) {
    const stderrTail = (res.stderr ?? '').slice(-400);
    return { ok: false, frames: [], fps, cap, error: `ffmpeg exit ${res.status}: ${stderrTail}` };
  }
  const files = existsSync(opts.outDir) ? readdirSync(opts.outDir).filter((f) => f.endsWith('.jpg')).sort() : [];
  return { ok: files.length > 0, frames: files, fps, cap };
}

111
v2/pipeline/lib/ids.ts Normal file
View file

@ -0,0 +1,111 @@
// THE LINKING FIX — single canonical TikTok-video-id extractor.
//
// V1 joined per-video assets via `Map.get(url)` against URLs returned by *different*
// Apify actors. The actors return slightly different URL shapes (with/without `www`,
// with/without trailing query params, `vm.tiktok.com` shortlinks). Any drift silently
// produced `undefined` and dropped the asset, so trends ended up citing the wrong
// videos.
//
// V2 collapses every URL form to the 19-digit TikTok numeric id at scrape-normalise
// time. That id is the row PK in `videos`, the folder name in `enriched/{id}/`, and
// the join key for every Apify response. URL is presentation, not key.
//
// This module is the ONLY place URL→id conversion happens. If a URL form makes it
// past Stage 2 without a numeric id, it is logged to drift_log.jsonl and the asset
// lands as `failed` in the manifest. It does not silently drop.
/** Shape of a bare TikTok video id: 15–21 decimal digits and nothing else. */
const ID_RX = /^\d{15,21}$/;
/**
 * Accepts any TikTok URL form, returns the 19-ish-digit numeric id, or null.
 *
 * Handled forms (see ids.test.ts for the full fixture):
 *   - https://www.tiktok.com/@handle/video/7280000000000000000
 *   - https://www.tiktok.com/@handle/video/7280000000000000000?is_from_webapp=1
 *   - https://tiktok.com/@handle/video/7280000000000000000
 *   - https://m.tiktok.com/v/7280000000000000000.html
 *   - https://www.tiktok.com/t/7280000000000000000   (older share link)
 *   - 7280000000000000000                            (raw id)
 *
 * NOT handled (returns null, callers should resolve before calling):
 *   - https://vm.tiktok.com/ZMabc123/   (short link — needs HEAD resolve)
 *   - https://vt.tiktok.com/ZSabc123/   (regional short link)
 *
 * For short links, use `resolveShortLink(url)` first to get the redirect target,
 * then pass that to `extractTikTokId`.
 *
 * Numeric input: a JS number above Number.MAX_SAFE_INTEGER has already been
 * rounded (typical 19-digit ids are far above it), so stringifying it would
 * return a plausible-looking but WRONG id. Such input returns null instead, so
 * the caller treats it as drift. Pass ids as strings (or bigint).
 *
 * @param input a URL, a raw id, or any value; non-strings are stringified
 * @returns the id as a digit string, or null when none can be extracted
 */
export function extractTikTokId(input: unknown): string | null {
  if (input === null || input === undefined) return null;
  // Refuse numbers that cannot represent an integer id exactly (also NaN,
  // Infinity and fractions, none of which could match below anyway).
  if (typeof input === 'number' && !Number.isSafeInteger(input)) return null;
  const s = String(input).trim();
  if (!s) return null;
  // Raw numeric id.
  if (ID_RX.test(s)) return s;
  // Anything with a /video/<digits> segment.
  const videoMatch = s.match(/\/video\/(\d{15,21})\b/);
  if (videoMatch?.[1]) return videoMatch[1];
  // Older share form: /v/<digits>.html
  const vMatch = s.match(/\/v\/(\d{15,21})\b/);
  if (vMatch?.[1]) return vMatch[1];
  // Older share form: /t/<digits>
  const tMatch = s.match(/\/t\/(\d{15,21})\b/);
  if (tMatch?.[1]) return tMatch[1];
  // Last-ditch: the first standalone run of 17–20 digits, i.e. one that is not
  // part of a longer digit run. The range is narrower than ID_RX so shorter
  // numeric query values (timestamps and the like) are not mistaken for an id.
  const fallback = s.match(/(?:^|[^\d])(\d{17,20})(?:$|[^\d])/);
  if (fallback?.[1]) return fallback[1];
  return null;
}
/**
 * Build the canonical video URL from an id and a handle. We never store input
 * URLs in the `videos` table — only the canonical form, derived from the id.
 *
 * @param id numeric video id (used verbatim)
 * @param handle account handle; a single leading `@` is stripped if present
 */
export function canonicalTikTokUrl(id: string, handle: string): string {
  const bareHandle = handle.startsWith('@') ? handle.slice(1) : handle;
  return 'https://www.tiktok.com/@' + bareHandle + '/video/' + id;
}
/**
 * Resolve a TikTok short link (vm.tiktok.com / vt.tiktok.com) by issuing a
 * HEAD request with redirects disabled and reading the `Location` header.
 *
 * The header value is returned as-is; callers must pass it back through
 * `extractTikTokId` to get the numeric id.
 *
 * NOTE: Apify's hashtag scraper already returns full URLs in `webVideoUrl` for the
 * cases we care about, so this is rarely hit. Kept for defence in depth.
 *
 * @returns the redirect target, or null when the request fails or the
 *   response carries no (or an empty) `Location` header
 */
export async function resolveShortLink(shortUrl: string): Promise<string | null> {
  try {
    const { headers } = await fetch(shortUrl, { method: 'HEAD', redirect: 'manual' });
    return headers.get('location') || null;
  } catch {
    // Any failure is reported as "could not resolve".
    return null;
  }
}
/**
 * Shape of a drift record as seen from the id-extraction module.
 * NOTE(review): the drift-log module declares a similar `DriftEntry` whose
 * reason union has two additional members — confirm whether the two types are
 * meant to stay separate.
 */
export interface DriftEvent {
/** Human-readable label of the actor whose response drifted. */
actor: string;
reason: 'no-id-extracted' | 'id-not-in-selection' | 'duplicate-id-different-url';
source_url: string | null;
extracted_id: string | null;
context?: Record<string, unknown>;
/** Timestamp string (presumably ISO-8601 — confirm against the writer). */
at: string;
}
/**
 * Runtime list of the drift reasons accepted by `DriftEvent['reason']`, in the
 * same order as that union. It is a plain constant: nothing is loaded from a
 * fixture file here.
 * NOTE(review): the drift-log module's `DriftReason` additionally lists
 * 'metadata-missing-fields' and 'date-out-of-window' — confirm whether this
 * list should include them.
 */
export const DRIFT_REASONS = [
'no-id-extracted',
'id-not-in-selection',
'duplicate-id-different-url',
] as const;

208
v2/pipeline/lib/manifest.ts Normal file
View file

@ -0,0 +1,208 @@
// Manifest validation. Hard gate: refuses to advance unless every selected
// video has every required asset, content-validated.
//
// Validity rules (V3 brief §4 stage 5):
// - metadata.ok : full record present in bundle.json
// - transcript.ok : non-empty text_en
// - comments.ok : ≥5 comments, all with text_en
// - frames.ok : ≥3 frames extracted (the brief says ≥3)
// - cover.ok : local cover.jpg present and >5 KB
import { existsSync, readFileSync, readdirSync, statSync, writeFileSync, mkdirSync } from 'node:fs';
import { join } from 'node:path';
import { z } from 'zod';
import { PATHS } from './paths.js';
import type { VideoBundle } from '../stages/stage_4_pass2_enrich.js';
// Every per-video asset the manifest tracks. The order matches the order in
// which validateOne() lists missing assets.
export const ASSET_KINDS = ['metadata', 'cover', 'transcript', 'comments', 'frames', 'bundle'] as const;
export type AssetKind = typeof ASSET_KINDS[number];
// Schema that bundle.json must satisfy for `bundle.ok`:
//   - transcript is either null or has a non-empty text_en;
//   - every comment has a non-empty text_en (no minimum count is enforced here);
//   - cover_local is either null or a string;
//   - metadata is accepted as-is (z.unknown()).
const BUNDLE_VALIDATION = z.object({
id: z.string(),
metadata: z.unknown(),
transcript: z.union([z.null(), z.object({ text_en: z.string().min(1) })]),
comments: z.array(z.object({ text_en: z.string().min(1) })),
frames: z.array(z.object({ index: z.number(), path: z.string() })),
cover_local: z.union([z.null(), z.string()]),
_validation: z.object({ all_ok: z.boolean(), missing: z.array(z.string()) }),
});
/**
 * Validation result for one selected video. Each asset field carries `ok`
 * plus, depending on the outcome, the path that was checked, a count, or an
 * error description.
 */
export interface ManifestVideoEntry {
/** Video id; also the name of its folder under enriched/. */
id: string;
metadata: { ok: boolean; path?: string; error?: string };
cover: { ok: boolean; path?: string; error?: string };
transcript: { ok: boolean; path?: string; language_detected?: string; error?: string };
/** `count` is the number of comments that have a non-empty text_en. */
comments: { ok: boolean; path?: string; count?: number; error?: string };
/** `count` is the number of .jpg files found in frames/. */
frames: { ok: boolean; path?: string; count?: number; error?: string };
bundle: { ok: boolean; path?: string; error?: string };
/** True when `missing` is empty. */
all_ok: boolean;
/** Asset kinds whose `ok` is false. */
missing: AssetKind[];
}
/** Whole-report manifest as built by buildManifest() and written to manifest.json. */
export interface Manifest {
report_id: string;
/** Number of ids passed to buildManifest(). */
selected_count: number;
/** Per-asset counts of videos whose asset is ok. */
summary: {
metadata_ok: number;
transcript_ok: number;
comments_ok: number;
frames_ok: number;
cover_ok: number;
bundle_ok: number;
/** Number of videos with every asset ok. */
all_ok: number;
/** all_ok as a percentage of all videos, rounded to two decimals; 0 when there are no videos. */
coverage_pct: number;
};
videos: ManifestVideoEntry[];
/** ISO-8601 timestamp of when the manifest was built. */
built_at: string;
}
/**
 * Error type that carries the manifest it was raised for, so a handler can
 * inspect which videos and assets are failing. Nothing in the visible part of
 * this module throws it.
 */
export class HardGateError extends Error {
/** The manifest passed to the constructor. */
manifest: Manifest;
constructor(message: string, manifest: Manifest) {
super(message);
// Override the default 'Error' name so the type is visible in logs and stack traces.
this.name = 'HardGateError';
this.manifest = manifest;
}
}
/**
 * Validate every asset of one video inside its enriched/<id>/ folder.
 *
 * Checks run in a fixed order — metadata, cover, transcript, comments, frames,
 * bundle — and never short-circuit: every asset gets its own verdict. For the
 * JSON assets, any exception raised while reading, parsing or inspecting the
 * file is reported as `parse: <message>` rather than thrown.
 *
 * Rules applied:
 *   - metadata.json   exists and parses as JSON
 *   - cover.jpg       exists and is larger than 5,000 bytes
 *   - transcript.json has a text_en that is non-empty after trimming
 *   - comments.json   has at least 5 entries with a non-empty string text_en
 *   - frames/         has at least 3 .jpg files (waived by MANIFEST_FRAMES_OPTIONAL=true)
 *   - bundle.json     satisfies BUNDLE_VALIDATION
 *
 * @returns the per-video manifest entry, with `missing` listing failed assets
 */
function validateOne(reportId: string, id: string): ManifestVideoEntry {
  type Check = { ok: boolean; path?: string; count?: number; language_detected?: string; error?: string };

  const dir = PATHS.enrichedVideo(reportId, id);

  // Shared skeleton for the JSON assets: missing file → 'missing'; otherwise
  // parse and hand the value to `judge`, turning any exception into a
  // 'parse: …' failure.
  const inspectJson = (file: string, judge: (data: unknown) => Check): Check => {
    if (!existsSync(file)) return { ok: false, error: 'missing' };
    try {
      return judge(JSON.parse(readFileSync(file, 'utf-8')));
    } catch (e) {
      return { ok: false, error: `parse: ${(e as Error).message}` };
    }
  };

  // metadata.json — only has to parse.
  const metaPath = join(dir, 'metadata.json');
  const metadata = inspectJson(metaPath, () => ({ ok: true, path: metaPath }));

  // cover.jpg — must exist and exceed the size floor.
  const coverPath = join(dir, 'cover.jpg');
  let cover: Check;
  if (!existsSync(coverPath)) {
    cover = { ok: false, error: 'missing' };
  } else {
    const sz = statSync(coverPath).size;
    cover = sz > 5_000
      ? { ok: true, path: coverPath }
      : { ok: false, error: `too small (${sz} bytes)` };
  }

  // transcript.json — needs non-blank text_en; language is passed through when present.
  const tPath = join(dir, 'transcript.json');
  const transcript = inspectJson(tPath, (data) => {
    const t = data as { text_en?: string; language_detected?: string };
    if (!(t.text_en && t.text_en.trim().length > 0)) return { ok: false, error: 'empty text_en' };
    const okEntry: { ok: true; path: string; language_detected?: string } = { ok: true, path: tPath };
    if (t.language_detected) okEntry.language_detected = t.language_detected;
    return okEntry;
  });

  // comments.json — counts only comments that carry a non-empty text_en.
  const cPath = join(dir, 'comments.json');
  const comments = inspectJson(cPath, (data) => {
    const arr = data as Array<{ text_en?: string }>;
    const validCount = arr.filter((c) => typeof c.text_en === 'string' && c.text_en.length > 0).length;
    return validCount >= 5
      ? { ok: true, path: cPath, count: validCount }
      : { ok: false, count: validCount, error: `only ${validCount} comments with text_en` };
  });

  // frames/ — when MANIFEST_FRAMES_OPTIONAL=true, frames are advisory (mp4 download
  // requires shouldDownloadVideos:true on Stage 2 which costs more; many runs skip).
  const framesOptional = process.env.MANIFEST_FRAMES_OPTIONAL === 'true';
  const framesDir = join(dir, 'frames');
  let frames: Check;
  if (existsSync(framesDir)) {
    try {
      const fileCount = readdirSync(framesDir).filter((f) => f.endsWith('.jpg')).length;
      frames = fileCount >= 3 || framesOptional
        ? { ok: true, path: framesDir, count: fileCount }
        : { ok: false, count: fileCount, error: `only ${fileCount} frames extracted` };
    } catch (e) {
      frames = { ok: false, error: (e as Error).message };
    }
  } else if (framesOptional) {
    frames = { ok: true, count: 0 };
  } else {
    frames = { ok: false, error: 'missing' };
  }

  // bundle.json — schema check
  const bPath = join(dir, 'bundle.json');
  const bundle = inspectJson(bPath, (data) => {
    const parsed = BUNDLE_VALIDATION.safeParse(data);
    return parsed.success
      ? { ok: true, path: bPath }
      : { ok: false, error: `schema: ${parsed.error.message.slice(0, 200)}` };
  });

  const entry: ManifestVideoEntry = {
    id,
    metadata,
    cover,
    transcript,
    comments,
    frames,
    bundle,
    all_ok: false,
    missing: [],
  };
  const kinds: AssetKind[] = ['metadata', 'cover', 'transcript', 'comments', 'frames', 'bundle'];
  const missing = kinds.filter((kind) => !entry[kind].ok);
  entry.missing = missing;
  entry.all_ok = missing.length === 0;
  return entry;
}
/**
 * Validate every selected video and roll the results up into a manifest.
 *
 * @param reportId report whose enriched/ tree is inspected
 * @param ids selected video ids; one manifest entry is produced per id, in order
 * @returns the manifest; `summary.coverage_pct` is the share of fully-ok
 *   videos as a percentage rounded to two decimals (0 for an empty selection)
 */
export function buildManifest(reportId: string, ids: string[]): Manifest {
  const videos = ids.map((id) => validateOne(reportId, id));
  const countWhere = (isOk: (v: ManifestVideoEntry) => boolean): number => videos.filter(isOk).length;

  const fullyOk = countWhere((v) => v.all_ok);
  const coveragePct = videos.length === 0
    ? 0
    : Math.round((fullyOk / videos.length) * 10000) / 100;

  return {
    report_id: reportId,
    selected_count: ids.length,
    summary: {
      metadata_ok: countWhere((v) => v.metadata.ok),
      transcript_ok: countWhere((v) => v.transcript.ok),
      comments_ok: countWhere((v) => v.comments.ok),
      frames_ok: countWhere((v) => v.frames.ok),
      cover_ok: countWhere((v) => v.cover.ok),
      bundle_ok: countWhere((v) => v.bundle.ok),
      all_ok: fullyOk,
      coverage_pct: coveragePct,
    },
    videos,
    built_at: new Date().toISOString(),
  };
}
/**
 * Write the manifest as pretty-printed JSON to the report's manifest.json,
 * creating the parent directory if necessary.
 *
 * @returns the path that was written
 */
export function writeManifest(reportId: string, manifest: Manifest): string {
  const path = PATHS.manifestJson(reportId);
  // Derive the parent directory with node:path rather than by cutting at the
  // last '/': on a path that uses '\' separators that cut yields an empty
  // string, and mkdirSync('') throws.
  mkdirSync(join(path, '..'), { recursive: true });
  writeFileSync(path, JSON.stringify(manifest, null, 2));
  return path;
}
/**
 * Read the report's manifest.json back from disk.
 *
 * @returns the parsed manifest (cast, not schema-checked), or null when the
 *   file does not exist
 * @throws SyntaxError when the file exists but is not valid JSON
 */
export function loadManifest(reportId: string): Manifest | null {
  const manifestPath = PATHS.manifestJson(reportId);
  return existsSync(manifestPath)
    ? (JSON.parse(readFileSync(manifestPath, 'utf-8')) as Manifest)
    : null;
}
/**
 * Read one video's bundle.json from its enriched/<videoId>/ folder.
 *
 * @returns the parsed bundle (cast, not schema-checked), or null when the
 *   file does not exist
 * @throws SyntaxError when the file exists but is not valid JSON
 */
export function loadBundle(reportId: string, videoId: string): VideoBundle | null {
  const bundlePath = `${PATHS.enrichedVideo(reportId, videoId)}/bundle.json`;
  return existsSync(bundlePath)
    ? (JSON.parse(readFileSync(bundlePath, 'utf-8')) as VideoBundle)
    : null;
}

View file

@ -0,0 +1,160 @@
// §16 — Month-over-month trend comparison.
//
// Inputs: current report + prior report (both must exist on disk; brief.prior_report_id
// resolves to a prior report's fs_root). Per V3, fails LOUDLY if prior_report_id is set
// but the prior report is missing — never silent-skip.
//
// Algorithm:
// 1. Trend matching: for every current trend, find closest prior trend by
// - editorial-name similarity (cheap token-level Jaccard on words; see nameSimilarity)
// - shared video ids (Jaccard)
// - shared category (soft tie-breaker)
// match_score = 0.5 * name + 0.3 * videos + 0.2 * category. Threshold 0.45 = returning.
// 2. Faded: every prior trend with no match above threshold.
// 3. Velocity delta for returning trends.
// 4. Category momentum.
import { writeFileSync, readFileSync, existsSync, mkdirSync } from 'node:fs';
import { join, resolve } from 'node:path';
import { PATHS } from './paths.js';
import type { Trend } from '../stages/stage_8_trends.js';
/**
 * Directory that holds every report folder: $BRIEFS_ROOT when set to a
 * non-empty value, otherwise `briefs/` under the current working directory.
 * Read on every call, so changes to the environment take effect immediately.
 */
function root(): string {
  const override = process.env.BRIEFS_ROOT;
  return override ? override : resolve(process.cwd(), 'briefs');
}
/**
 * Load a report's trends.json.
 *
 * @returns the parsed file contents (not schema-checked)
 * @throws Error naming the report and the expected path when the file is absent
 */
function loadTrends(reportId: string): Trend[] {
  const trendsFile = PATHS.trends(reportId);
  if (existsSync(trendsFile)) return JSON.parse(readFileSync(trendsFile, 'utf-8'));
  throw new Error(`trends.json missing for ${reportId} at ${trendsFile}`);
}
/**
 * True when the prior report has a trends.json on disk. The path is built
 * directly from root() rather than through PATHS.
 */
function priorReportRootExists(priorReportId: string): boolean {
  const priorTrendsFile = join(root(), priorReportId, 'trends.json');
  return existsSync(priorTrendsFile);
}
// Cheap normalised string similarity: Jaccard index over the two names' word
// sets. Names are lower-cased, every character outside [a-z0-9 ] becomes a
// space, and words shorter than three characters are ignored. Returns 0 when
// either name has no usable words.
function nameSimilarity(a: string, b: string): number {
  const wordsOf = (s: string): Set<string> => {
    const words = new Set<string>();
    for (const w of s.toLowerCase().replace(/[^a-z0-9 ]/g, ' ').split(/\s+/)) {
      if (w.length >= 3) words.add(w);
    }
    return words;
  };
  const left = wordsOf(a);
  const right = wordsOf(b);
  if (left.size === 0 || right.size === 0) return 0;
  const shared = [...left].filter((w) => right.has(w)).length;
  return shared / (left.size + right.size - shared);
}
/**
 * Jaccard index of two lists treated as sets: |A ∩ B| / |A ∪ B|.
 * Duplicates within a list are ignored. Two empty lists score 0.
 */
function jaccard<T>(a: T[], b: T[]): number {
  const left = new Set(a);
  const right = new Set(b);
  if (left.size === 0 && right.size === 0) return 0;
  const shared = [...left].filter((item) => right.has(item)).length;
  const unionSize = left.size + right.size - shared;
  return shared / unionSize;
}
// Minimum match_score for a current trend to count as "returning" rather than "new".
const RETURNING_THRESHOLD = 0.45;
/** Output of runMomCompare(); each top-level field is also written to its own JSON file. */
export interface MomResult {
/** Current trends with no prior match at or above the threshold. */
new_trends: Array<{ trend_id: string; name: string; rationale: string }>;
/** Current trends paired with the prior trend they matched. */
returning_trends: Array<{
trend_id: string;
prior_trend_id: string;
/** Weighted similarity, rounded to two decimals. */
match_score: number;
/** plays_total_pct: rounded % change in plays_total vs the prior trend; video_count: difference in kpis.videos. */
velocity_delta: { plays_total_pct: number; video_count: number };
}>;
/** Prior trends that no current trend matched. */
faded_trends: Array<{ prior_trend_id: string; name: string }>;
/** One row per category seen in either report. */
category_momentum: Array<{
category: string;
new: number; returning: number; faded: number;
/** Rounded % change in summed plays_total for the category; 0 when the prior sum is not positive. */
plays_delta_pct: number;
/** 'expanding' above +15, 'contracting' below -15, otherwise 'stable'. */
label: 'expanding' | 'stable' | 'contracting';
}>;
}
/**
 * Compare the current report's trends with a prior report's and write the
 * result to outputs/compare/ under the current report.
 *
 * Matching is greedy: current trends are visited in file order and each one
 * claims the best-scoring prior trend that has not been claimed yet, provided
 * the score reaches RETURNING_THRESHOLD. The outcome therefore depends on the
 * order of the current trends.
 *
 * @param reportId current report
 * @param priorReportId report to compare against
 * @returns the comparison plus the paths of the four JSON files written
 * @throws Error when the prior report has no trends.json on disk, or when
 *   either trends.json cannot be loaded
 */
export async function runMomCompare(reportId: string, priorReportId: string): Promise<{ ok: true; outputs: Record<string, string>; result: MomResult }> {
if (!priorReportRootExists(priorReportId)) {
throw new Error(`Prior report '${priorReportId}' not found on disk (expected trends.json at ${join(root(), priorReportId, 'trends.json')}). brief.prior_report_id was set; build fails loudly per §16.`);
}
const current = loadTrends(reportId);
const prior = loadTrends(priorReportId);
// Match current → prior
// usedPrior holds prior trend ids already claimed, so each prior trend is matched at most once.
const usedPrior = new Set<string>();
const returning: MomResult['returning_trends'] = [];
const newTrends: MomResult['new_trends'] = [];
for (const c of current) {
let best: { p: Trend; score: number } | null = null;
for (const p of prior) {
if (usedPrior.has(p.trend_id)) continue;
// Weighted blend: 50% name similarity, 30% shared supporting videos, 20% same category.
const score = 0.5 * nameSimilarity(c.name, p.name)
+ 0.3 * jaccard(c.supporting_video_ids, p.supporting_video_ids)
+ 0.2 * (c.category === p.category ? 1 : 0);
// Strict '>' keeps the earliest prior trend on a tie.
if (!best || score > best.score) best = { p, score };
}
if (best && best.score >= RETURNING_THRESHOLD) {
usedPrior.add(best.p.trend_id);
// Percentage change in plays; reported as 0 when the prior total is not positive.
const playsPct = best.p.kpis.plays_total > 0
? Math.round(((c.kpis.plays_total - best.p.kpis.plays_total) / best.p.kpis.plays_total) * 100)
: 0;
returning.push({
trend_id: c.trend_id,
prior_trend_id: best.p.trend_id,
match_score: Math.round(best.score * 100) / 100,
velocity_delta: {
plays_total_pct: playsPct,
video_count: c.kpis.videos - best.p.kpis.videos,
},
});
} else {
newTrends.push({ trend_id: c.trend_id, name: c.name, rationale: 'no prior match above threshold' });
}
}
// Faded: every prior trend that nothing claimed.
const faded: MomResult['faded_trends'] = prior
.filter((p) => !usedPrior.has(p.trend_id))
.map((p) => ({ prior_trend_id: p.trend_id, name: p.name }));
// Category momentum
// Covers every category present in either report.
const cats = new Set<string>([...current.map((t) => t.category), ...prior.map((t) => t.category)]);
const newSet = new Set(newTrends.map((n) => n.trend_id));
const returningSet = new Set(returning.map((r) => r.trend_id));
const fadedPriorSet = new Set(faded.map((f) => f.prior_trend_id));
const categoryMomentum = [...cats].map((cat) => {
const newCount = current.filter((t) => t.category === cat && newSet.has(t.trend_id)).length;
const retCount = current.filter((t) => t.category === cat && returningSet.has(t.trend_id)).length;
const fadedCount = prior.filter((t) => t.category === cat && fadedPriorSet.has(t.trend_id)).length;
const curPlays = current.filter((t) => t.category === cat).reduce((s, t) => s + t.kpis.plays_total, 0);
const priorPlays = prior.filter((t) => t.category === cat).reduce((s, t) => s + t.kpis.plays_total, 0);
// A category with no prior plays reports a delta of 0 (and is therefore labelled 'stable').
const playsDeltaPct = priorPlays > 0 ? Math.round(((curPlays - priorPlays) / priorPlays) * 100) : 0;
const label: 'expanding' | 'stable' | 'contracting' =
playsDeltaPct > 15 ? 'expanding' : playsDeltaPct < -15 ? 'contracting' : 'stable';
return { category: cat, new: newCount, returning: retCount, faded: fadedCount, plays_delta_pct: playsDeltaPct, label };
// Largest plays growth first.
}).sort((a, b) => b.plays_delta_pct - a.plays_delta_pct);
const result: MomResult = {
new_trends: newTrends,
returning_trends: returning,
faded_trends: faded,
category_momentum: categoryMomentum,
};
// Write outputs to outputs/compare/
const dir = join(PATHS.outputsDir(reportId), 'compare');
mkdirSync(dir, { recursive: true });
writeFileSync(join(dir, 'new_trends.json'), JSON.stringify(result.new_trends, null, 2));
writeFileSync(join(dir, 'returning_trends.json'), JSON.stringify(result.returning_trends, null, 2));
writeFileSync(join(dir, 'faded_trends.json'), JSON.stringify(result.faded_trends, null, 2));
writeFileSync(join(dir, 'category_momentum.json'), JSON.stringify(result.category_momentum, null, 2));
return {
ok: true,
outputs: {
new_trends: join(dir, 'new_trends.json'),
returning_trends: join(dir, 'returning_trends.json'),
faded_trends: join(dir, 'faded_trends.json'),
category_momentum: join(dir, 'category_momentum.json'),
},
result,
};
}

51
v2/pipeline/lib/paths.ts Normal file
View file

@ -0,0 +1,51 @@
// Resolves the on-disk briefs/<report_id>/ tree per V3 brief §10.
// Used by every stage so paths stay consistent.
import { mkdirSync, existsSync } from 'node:fs';
import { resolve, join } from 'node:path';
// Resolved on every call (not cached) so tests can override the location
// per-suite via process.env.BRIEFS_ROOT. An unset or empty variable falls back
// to `briefs/` under the current working directory.
function root(): string {
  const override = process.env.BRIEFS_ROOT;
  return override ? override : resolve(process.cwd(), 'briefs');
}
/**
 * Absolute path of a report's folder under the briefs root.
 * Side effect: the folder is created (recursively) when it does not exist.
 */
export function reportRoot(reportId: string): string {
  const dir = resolve(root(), reportId);
  if (existsSync(dir)) return dir;
  mkdirSync(dir, { recursive: true });
  return dir;
}
/**
 * Join path segments onto a report's folder. Because this goes through
 * reportRoot(), the report folder itself is created if missing; the joined
 * sub-path is not.
 */
export function reportPath(reportId: string, ...parts: string[]): string {
  return join(reportRoot(reportId), ...parts);
}
/**
 * Create the directory (and any missing parents) unless something already
 * exists at that path, then return the path unchanged.
 */
export function ensureDir(p: string): string {
  const alreadyThere = existsSync(p);
  if (!alreadyThere) {
    mkdirSync(p, { recursive: true });
  }
  return p;
}
// Named accessors for every file and folder in a report's on-disk tree.
// Each accessor goes through reportPath(), which calls reportRoot(), so merely
// resolving a path creates the report's root folder when it is missing.
// Sub-folders and files themselves are not created here.
export const PATHS = {
briefYaml: (id: string) => reportPath(id, 'brief.yaml'),
briefJson: (id: string) => reportPath(id, 'brief.json'),
seedsJson: (id: string) => reportPath(id, 'seeds.json'),
pass1: (id: string) => reportPath(id, 'pass1'),
pass1Videos: (id: string) => reportPath(id, 'pass1', 'pass1_videos.json'),
spendLog: (id: string) => reportPath(id, 'pass1', 'spend_log.json'),
pass2: (id: string) => reportPath(id, 'pass2'),
selectedIds: (id: string) => reportPath(id, 'pass2', 'selected_video_ids.json'),
selectionRules: (id: string) => reportPath(id, 'pass2', 'selection_rules.json'),
driftLog: (id: string) => reportPath(id, 'pass2', 'drift_log.jsonl'),
enriched: (id: string) => reportPath(id, 'enriched'),
// Per-video asset folder; `vid` is the video id.
enrichedVideo: (id: string, vid: string) => reportPath(id, 'enriched', vid),
manifestJson: (id: string) => reportPath(id, 'manifest.json'),
analysisDir: (id: string) => reportPath(id, 'analysis'),
atomicInsights: (id: string) => reportPath(id, 'atomic_insights.json'),
trends: (id: string) => reportPath(id, 'trends.json'),
categories: (id: string) => reportPath(id, 'categories.json'),
qaDir: (id: string) => reportPath(id, 'qa'),
outputsDir: (id: string) => reportPath(id, 'outputs'),
datasetV2: (id: string) => reportPath(id, 'outputs', 'dataset_v2.json'),
dashboardHtml: (id: string) => reportPath(id, 'outputs', 'dashboard.html'),
stateDir: (id: string) => reportPath(id, '.state'),
// Sentinel file for stage `n`, named stage<n>.done.
stageDone: (id: string, n: number) => reportPath(id, '.state', `stage${n}.done`),
};

210
v2/pipeline/lib/recipes.ts Normal file
View file

@ -0,0 +1,210 @@
// §4.5b selection recipes + filter primitives. Recipe is matched from the brief's
// business_question by trigger phrases; user can override by passing a recipe label
// or a custom filter expression.
import type { Pass1Video } from '../stages/stage_2_pass1_scrape.js';
/** Label of one of the four built-in selection recipes. */
export type RecipeId = 'A' | 'B' | 'C' | 'D';
/** Definition of a selection recipe. */
export interface RecipeDef {
id: RecipeId;
/** Human-readable name of the question type the recipe answers. */
name: string;
/** Phrases that select this recipe; matchRecipe() looks for them as case-insensitive substrings of the business question. */
triggers: string[];
/** Filter expression used when the user supplies none (key:value terms combined with AND / OR and parentheses). */
default_filter: string;
/** Why this filter suits the question type. */
rationale: string;
}
// Built-in recipes keyed by id. Note that matchRecipe() tests them in its own
// priority order, not in the order they are declared here.
export const RECIPES: Record<RecipeId, RecipeDef> = {
A: {
id: 'A',
name: 'What stops the scroll',
triggers: ['hook', 'stops the scroll', 'first three seconds', 'attention'],
default_filter: 'top_by_stl:80 OR top_by_velocity:40',
rationale: 'STL% is the clearest hook-quality proxy; velocity catches what is catching on right now.',
},
B: {
id: 'B',
name: 'Why is X having a moment',
triggers: ['cultural moment', 'why is', 'emerging', 'shift', 'trend'],
default_filter: 'top_by_saves:60 AND (top_by_plays:100 OR top_by_comments:50)',
rationale: 'Saves signal personal resonance; plays + comments capture mass and conversation.',
},
C: {
id: 'C',
name: 'How does X position vs competitors',
// ' vs ' keeps its surrounding spaces so it only matches the standalone word.
triggers: ['competitor', 'positioning', 'market share', ' vs '],
default_filter: 'top_by_plays:80',
rationale: 'Forces the brand and competitor sets in (handles preselected by Stage 2), then adds the cultural top to compare against.',
},
D: {
id: 'D',
name: 'What do users actually feel about X',
triggers: ['what do users', 'audience feeling', 'reception', 'reaction', 'sentiment'],
default_filter: 'top_by_comments:60 AND top_by_stl:40',
rationale: 'Comments carry the truth; STL% filters out videos no one watched long enough to react to.',
},
};
/**
 * Pick the recipe whose trigger phrases appear in the business question.
 *
 * Recipes are tested in the order A → C → D → B (most specific to most
 * general) and the first one with any trigger occurring as a case-insensitive
 * substring wins. When nothing matches, recipe B is returned.
 */
export function matchRecipe(businessQuestion: string): RecipeId {
  const question = businessQuestion.toLowerCase();
  const priority: RecipeId[] = ['A', 'C', 'D', 'B'];
  const hit = priority.find((id) =>
    RECIPES[id].triggers.some((phrase) => question.includes(phrase.toLowerCase())),
  );
  return hit ?? 'B';
}
// ─── Filter primitives ─────────────────────────────────────────────────
/** A filter in function form: maps the candidate videos to the set of selected ids. */
type FilterFn = (videos: Pass1Video[]) => Set<string>;
/** A parsed filter expression together with the text it was parsed from. */
interface ParsedFilter {
expr: FilterAst;
raw: string;
}
/**
 * Syntax tree of a filter expression. Leaves select ids directly; AND / OR
 * combine two sub-trees. Evaluated by evalAst().
 */
type FilterAst =
| { kind: 'top_by_plays'; n: number }
// min_plays: evalAst() defaults it to 10000 when omitted.
| { kind: 'top_by_stl'; n: number; min_plays?: number }
| { kind: 'top_by_comments'; n: number }
| { kind: 'top_by_saves'; n: number }
// min_age_days: topByVelocity() defaults it to 2 when omitted.
| { kind: 'top_by_velocity'; n: number; min_age_days?: number }
| { kind: 'manual_ids'; ids: string[] }
| { kind: 'AND'; left: FilterAst; right: FilterAst }
| { kind: 'OR'; left: FilterAst; right: FilterAst };
/**
 * Ids of the `n` videos with the highest value of `key`.
 *
 * Values are coerced with Number(); a missing value counts as 0. The input
 * array is not reordered. Ties keep their input order.
 */
function topNByKey<K extends keyof Pass1Video>(videos: Pass1Video[], n: number, key: K): Set<string> {
  const metric = (v: Pass1Video): number => Number(v[key] ?? 0);
  const ranked = videos.slice().sort((x, y) => metric(y) - metric(x));
  const ids = new Set<string>();
  for (const v of ranked.slice(0, n)) ids.add(v.id);
  return ids;
}
/**
 * Ids of the `n` videos gaining plays fastest, where velocity is plays per day
 * since posting (age is floored at one day for the division).
 *
 * Excluded from the ranking:
 *   - videos younger than `minAgeDays` (this also covers posting dates in the future);
 *   - videos whose velocity cannot be computed — an unparseable `posted_at` or
 *     a non-numeric `plays`. Without this guard such videos would carry a NaN
 *     velocity, which makes the sort comparator return NaN and leaves the
 *     resulting order unspecified.
 *
 * @param videos candidate videos
 * @param n maximum number of ids to return
 * @param minAgeDays minimum age in days for a video to be ranked (default 2)
 */
function topByVelocity(videos: Pass1Video[], n: number, minAgeDays = 2): Set<string> {
  const MS_PER_DAY = 1000 * 60 * 60 * 24;
  const now = Date.now();
  const scored: Array<{ id: string; velocity: number }> = [];
  for (const v of videos) {
    const ageDays = (now - new Date(v.posted_at).getTime()) / MS_PER_DAY;
    if (!Number.isFinite(ageDays) || ageDays < minAgeDays) continue;
    const velocity = v.plays / Math.max(ageDays, 1);
    if (!Number.isFinite(velocity)) continue;
    scored.push({ id: v.id, velocity });
  }
  scored.sort((a, b) => b.velocity - a.velocity);
  return new Set(scored.slice(0, n).map((s) => s.id));
}
/**
 * Evaluate a filter tree against the candidate videos and return the selected ids.
 *
 * Leaves delegate to the ranking helpers; AND is set intersection and OR is
 * set union, each evaluating its left operand before its right. `manual_ids`
 * returns the listed ids as-is, whether or not they occur in `videos`.
 */
function evalAst(ast: FilterAst, videos: Pass1Video[]): Set<string> {
  switch (ast.kind) {
    case 'AND': {
      const left = evalAst(ast.left, videos);
      const right = evalAst(ast.right, videos);
      const both = new Set<string>();
      for (const id of left) {
        if (right.has(id)) both.add(id);
      }
      return both;
    }
    case 'OR': {
      const union = new Set(evalAst(ast.left, videos));
      for (const id of evalAst(ast.right, videos)) union.add(id);
      return union;
    }
    case 'manual_ids':
      return new Set(ast.ids);
    case 'top_by_plays':
      return topNByKey(videos, ast.n, 'plays');
    case 'top_by_comments':
      return topNByKey(videos, ast.n, 'comments_count');
    case 'top_by_saves':
      return topNByKey(videos, ast.n, 'saves');
    case 'top_by_velocity':
      return topByVelocity(videos, ast.n, ast.min_age_days);
    case 'top_by_stl': {
      // Only videos with enough plays are ranked by STL%.
      const minP = ast.min_plays ?? 10000;
      const eligible = videos.filter((v) => v.plays >= minP);
      return topNByKey(eligible, ast.n, 'stl_pct');
    }
  }
}
// ─── Parser ────────────────────────────────────────────────────────────
/** One lexical unit of a filter expression. */
interface Token { type: 'TOKEN' | 'AND' | 'OR' | 'LPAREN' | 'RPAREN'; value: string }

/**
 * Split a filter expression into tokens.
 *
 * Whitespace separates tokens and is discarded; each paren is its own token;
 * any other run of characters is a word. The words AND / OR (any letter case)
 * become operator tokens with an upper-cased value; every other word is kept
 * verbatim as a TOKEN.
 */
function tokenise(input: string): Token[] {
  const lexemes = input.match(/[()]|[^\s()]+/g) ?? [];
  return lexemes.map((lexeme): Token => {
    if (lexeme === '(') return { type: 'LPAREN', value: '(' };
    if (lexeme === ')') return { type: 'RPAREN', value: ')' };
    const keyword = lexeme.toUpperCase();
    if (keyword === 'AND') return { type: 'AND', value: 'AND' };
    if (keyword === 'OR') return { type: 'OR', value: 'OR' };
    return { type: 'TOKEN', value: lexeme };
  });
}
/**
 * Parse one `key:value` primitive into an AST leaf.
 *
 * Accepted forms: `top_by_plays:100`, `top_by_stl:40`, `top_by_comments:60`,
 * `top_by_saves:20`, `top_by_velocity:30`, or `manual_ids:7280…,7281…`.
 *
 * @throws Error when the colon is missing, the primitive is unknown, or the
 *   count is not a plain non-negative integer. Counts used to go through
 *   `parseInt`, which silently accepted `abc` (NaN → empty result), `10abc`,
 *   and negative numbers (`slice(0, -5)` keeps everything but the last five).
 */
function parseTokenFilter(value: string): FilterAst {
  const colon = value.indexOf(':');
  if (colon === -1) throw new Error(`Bad filter token: '${value}' (expected key:value)`);
  const key = value.slice(0, colon);
  const rhs = value.slice(colon + 1);

  // Strict count parser: digits only, and small enough to be an exact integer.
  const count = (): number => {
    const n = /^\d+$/.test(rhs) ? Number(rhs) : NaN;
    if (!Number.isSafeInteger(n)) {
      throw new Error(`Bad count in filter token: '${value}' (expected ${key}:<non-negative integer>)`);
    }
    return n;
  };

  switch (key) {
    case 'top_by_plays': return { kind: 'top_by_plays', n: count() };
    case 'top_by_stl': return { kind: 'top_by_stl', n: count() };
    case 'top_by_comments': return { kind: 'top_by_comments', n: count() };
    case 'top_by_saves': return { kind: 'top_by_saves', n: count() };
    case 'top_by_velocity': return { kind: 'top_by_velocity', n: count() };
    case 'manual_ids': return { kind: 'manual_ids', ids: rhs.split(/[,\s]+/).filter(Boolean) };
    default: throw new Error(`Unknown filter primitive: ${key}`);
  }
}
// Recursive-descent parser. AND and OR share a single precedence level and
// associate to the LEFT, per the V3 brief: "left-to-right with explicit
// parentheses required for nesting".
/**
 * Turn a token stream into a filter AST.
 *
 * @throws Error on premature end of input, a missing closing paren, an
 *   operator or paren where a primitive was expected, leftover tokens after
 *   the expression, or anything parseTokenFilter rejects.
 */
function parse(tokens: Token[]): FilterAst {
  let cursor = 0;

  // primary := '(' expression ')' | TOKEN
  function primary(): FilterAst {
    const tok = tokens[cursor++];
    if (tok === undefined) throw new Error('Unexpected end of filter expression');
    switch (tok.type) {
      case 'LPAREN': {
        const grouped = expression();
        const closer = tokens[cursor++];
        if (closer?.type !== 'RPAREN') throw new Error('Missing closing paren');
        return grouped;
      }
      case 'TOKEN':
        return parseTokenFilter(tok.value);
      default:
        throw new Error(`Unexpected token ${tok.value}`);
    }
  }

  // expression := primary (('AND' | 'OR') primary)*
  function expression(): FilterAst {
    let acc = primary();
    for (;;) {
      const op = tokens[cursor];
      if (op === undefined || (op.type !== 'AND' && op.type !== 'OR')) return acc;
      cursor++;
      const operand = primary();
      // Fold to the left: the accumulated tree becomes the left child.
      acc = op.type === 'AND'
        ? { kind: 'AND', left: acc, right: operand }
        : { kind: 'OR', left: acc, right: operand };
    }
  }

  const root = expression();
  if (cursor < tokens.length) throw new Error(`Trailing tokens after expression at position ${cursor}`);
  return root;
}
/**
 * Parse a filter expression string (e.g. `top_by_comments:60 AND top_by_stl:40`).
 *
 * @returns The AST together with the untouched source text.
 * @throws Error when the expression contains no tokens or fails to parse.
 */
export function parseFilterExpression(raw: string): ParsedFilter {
  const tokens = tokenise(raw);
  if (!tokens.length) throw new Error('Empty filter expression');
  const expr = parse(tokens);
  return { raw, expr };
}
/** Run a parsed filter over `videos` and return the selected ids as an array, in the order the result set yields them. */
export function applyFilter(videos: Pass1Video[], parsed: ParsedFilter): string[] {
  return Array.from(evalAst(parsed.expr, videos));
}
/** Adapt a function that returns an id array into a FilterFn that returns the same ids as a Set (duplicates collapse). */
export function makeFilter(fn: (videos: Pass1Video[]) => string[]): FilterFn {
  return (videos) => {
    const ids = fn(videos);
    return new Set(ids);
  };
}

21
v2/pipeline/lib/retry.ts Normal file
View file

@ -0,0 +1,21 @@
// 3 attempts, exponential backoff (1s, 4s, 16s) per V3 brief §4 / §13.
/**
 * Run `fn`, retrying on rejection.
 *
 * @param fn   Operation to attempt.
 * @param opts `label` prefixes log and error messages; `maxAttempts` (default 3)
 *   is the total number of calls and is clamped to at least 1; `backoffMs`
 *   (default [1000, 4000, 16000]) gives the wait after each failed attempt,
 *   with the last entry reused when there are more attempts than entries.
 *   No wait happens after the final attempt, so the defaults only ever sleep
 *   1s and 4s.
 * @returns Whatever `fn` resolves to on its first success.
 * @throws Error `"<label> failed after <n> attempts: <reason>"` once every
 *   attempt has failed; the last rejection value is attached as `cause`.
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  opts: { label: string; maxAttempts?: number; backoffMs?: number[] } = { label: 'op' },
): Promise<T> {
  const requested = opts.maxAttempts ?? 3;
  // Guard against 0 / negative / NaN, which would otherwise skip fn entirely.
  const max = Number.isFinite(requested) && requested >= 1 ? Math.floor(requested) : 1;
  const backoff = opts.backoffMs ?? [1000, 4000, 16000];
  // Rejections are not guaranteed to be Error instances (or even defined), so
  // never dereference `.message` blindly inside the catch block.
  const describe = (e: unknown): string => (e instanceof Error ? e.message : String(e));

  let lastErr: unknown;
  for (let attempt = 1; attempt <= max; attempt++) {
    try {
      return await fn();
    } catch (err) {
      lastErr = err;
      if (attempt === max) break;
      const wait = backoff[attempt - 1] ?? backoff[backoff.length - 1] ?? 1000;
      console.warn(`[retry] ${opts.label} attempt ${attempt}/${max} failed: ${describe(err)}; retrying in ${wait}ms`);
      await new Promise((r) => setTimeout(r, wait));
    }
  }
  throw Object.assign(
    new Error(`${opts.label} failed after ${max} attempts: ${describe(lastErr)}`),
    { cause: lastErr },
  );
}

View file

@ -0,0 +1,12 @@
import { readFileSync } from 'node:fs';
import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
// ES modules have no built-in __dirname; derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Rubrics live in the `prompts` directory one level above this module.
const PROMPTS_DIR = resolve(__dirname, '..', 'prompts');

/**
 * Read a rubric file from the prompts directory and return its text (UTF-8).
 *
 * @param name Rubric file name; `.md` is appended when not already present.
 */
export function loadRubric(name: string): string {
  const hasExtension = name.endsWith('.md');
  const rubricPath = resolve(PROMPTS_DIR, hasExtension ? name : `${name}.md`);
  return readFileSync(rubricPath, 'utf-8');
}

View file

@ -0,0 +1,53 @@
// Claude-based translation. Batches comments (BATCH_SIZE per call, currently 10) to keep cost down.
import { callClaudeJSON } from './claude.js';
import { z } from 'zod';
// Expected JSON reply for a batch translation: one entry per input item, where
// `i` is the 1-based item number used in the prompt and `text_en` its translation.
const BATCH_SCHEMA = z.object({
translations: z.array(z.object({ i: z.number(), text_en: z.string() })),
});
/**
 * Translate a single piece of text to English with one Claude call.
 *
 * @param text       Text to translate; blank input short-circuits to ''.
 * @param sourceLang Optional language hint woven into the prompt.
 * @returns The model's reply, trimmed.
 */
export async function translateTextToEn(text: string, sourceLang?: string): Promise<string> {
  if (text.trim().length === 0) return '';
  const hint = sourceLang ? ` Source language hint: ${sourceLang}.` : '';
  const prompt = `Translate the following text to English. If it is already English, return it verbatim.${hint} Preserve emoji and punctuation. Return ONLY the translated text, no commentary.\n\nTEXT:\n${text}`;
  // callClaude is pulled in lazily, only when a translation is actually needed.
  const claude = await import('./claude.js');
  const reply = await claude.callClaude(prompt, { label: 'translate_text', maxTokens: 4096 });
  return reply.trim();
}
const BATCH_SIZE = 10; // smaller batches dodge max_tokens truncation on long comments

/**
 * Translate one batch of strings with a single Claude call.
 *
 * Items are sent as a numbered list (1-based) with internal whitespace
 * collapsed; the reply is matched back to the inputs by that number.
 *
 * @returns One string per input, in input order. An item the model left out
 *   of its reply falls back to the ORIGINAL text rather than an empty string,
 *   matching what translateBatchToEn does when a whole batch fails, so no
 *   comment is silently blanked.
 * @throws Error when the reply does not match BATCH_SCHEMA (or the call fails).
 */
async function translateBatchOnce(items: string[], sourceLang?: string): Promise<string[]> {
  if (items.length === 0) return [];
  const numbered = items.map((t, i) => `${i + 1}. ${t.replace(/\s+/g, ' ').trim()}`).join('\n');
  const prompt = `Translate each numbered item to English. If an item is already English, copy it verbatim.${sourceLang ? ` Source language hint: ${sourceLang}.` : ''} Preserve emoji and punctuation.\n\nReturn ONLY this JSON shape (no other text):\n{"translations":[{"i":1,"text_en":"..."},{"i":2,"text_en":"..."}]}\n\nTEXTS:\n${numbered}`;
  const parsed = await callClaudeJSON<unknown>(prompt, { label: 'translate_batch', maxTokens: 8192 });
  const ok = BATCH_SCHEMA.safeParse(parsed);
  if (!ok.success) throw new Error(`translate_batch returned bad shape: ${ok.error.message}`);
  const byNumber = new Map(ok.data.translations.map((t) => [t.i, t.text_en]));
  return items.map((original, i) => byNumber.get(i + 1) ?? original);
}
/**
 * Translate many strings to English, BATCH_SIZE items per Claude call.
 *
 * Batches run sequentially. Translation is best-effort: if a batch fails for
 * any reason, a warning is logged and that batch's original strings are used,
 * so the output always has one entry per input, in input order.
 */
export async function translateBatchToEn(items: string[], sourceLang?: string): Promise<string[]> {
  if (items.length === 0) return [];
  const out: string[] = [];
  for (let i = 0; i < items.length; i += BATCH_SIZE) {
    const slice = items.slice(i, i + BATCH_SIZE);
    try {
      const translated = await translateBatchOnce(slice, sourceLang);
      out.push(...translated);
    } catch (err) {
      // A rejection need not be an Error; dereferencing `.message` blindly
      // would throw here and defeat the fallback below.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`[translate] batch ${i / BATCH_SIZE + 1} failed (${reason.slice(0, 80)}); using originals`);
      out.push(...slice);
    }
  }
  return out;
}
/**
 * Cheap script-based guess at whether `text` is English.
 *
 * Only LETTERS are considered: the text counts as English when more than 60%
 * of its letters (Unicode category L) are ASCII A–Z / a–z. Digits, emoji,
 * punctuation and whitespace are ignored, so "love it 😍😍😍" or "10/10" are
 * not mistaken for foreign text. Text with no letters at all returns true
 * (there is nothing to translate).
 *
 * This cannot tell English from other Latin-script languages.
 */
export function isLikelyEnglish(text: string): boolean {
  const letters = text.match(/\p{L}/gu);
  if (!letters) return true;
  const asciiLetters = letters.filter((ch) => /^[A-Za-z]$/.test(ch)).length;
  return asciiLetters / letters.length > 0.6;
}

View file

@ -0,0 +1,44 @@
# Atomic insight extraction (Stage 7)
You are extracting small, evidence-grounded observations from a batch of TikTok video analyses. The output is intermediate scaffolding — hundreds of small facts that Stage 8 will cluster into editorial trends.
## Rules
- Each observation must cite at least 1 video id. Most should cite 2–10.
- Err on the side of MORE, smaller insights rather than fewer, broader ones. We want 200–500 atomic insights across the whole report.
- Four types ONLY: `hook`, `visual`, `audio`, `narrative`.
- If a new observation strengthens an existing one (same pattern, more videos), ADD video ids to that atomic_id rather than creating a new one. The running list of existing observations is provided.
- Never duplicate an existing atomic_id's observation text.
- English only.
## Type definitions
- **hook** — a recurring opening pattern (first 3 seconds: line, gesture, framing).
- **visual** — a recurring aesthetic element (lighting, palette, composition, on-screen text style, setting, transition).
- **audio** — a recurring sound, music style, or voice device (whisper, ASMR, audio meme, voiceover cadence).
- **narrative** — a recurring thesis, tension, or worldview (what creators are saying, the conflict they keep returning to).
## Output JSON shape
```json
{
"additions": [
{
"atomic_id": "ATM-XXXX",
"type": "hook|visual|audio|narrative",
"observation": "string, ≤25 words, specific and observational",
"supporting_video_ids": ["string id", "..."]
}
],
"extensions": [
{
"atomic_id": "ATM-EXISTING-ID",
"added_video_ids": ["string id"]
}
]
}
```
`additions` are net-new observations. `extensions` add video ids to existing atomic_ids. The orchestrator will allocate fresh ids for `additions` (the `ATM-XXXX` you supply is a within-batch placeholder; the orchestrator will rewrite to global ids).
Return ONLY the JSON.

View file

@ -0,0 +1,32 @@
# §4.5d — Category quality rubric (Stage 8a)
Generate 5–10 brief-driven categories. Each category groups trends in this report.
**Good categories are:**
- Editorial and evocative (could be a magazine section name).
- Mutually exclusive (no significant overlap with another).
- 2–5 words.
- Cultural, not descriptive.
*Good Dove examples:* "Hair Rituals", "Self-Image Drama", "Anti-Beauty Backlash", "Grooming as Identity".
**Reject categories that are:**
- Descriptive containers ("Hair Care Videos", "Beauty Content").
- Mechanically derived from data ("#hairtok content", "Top Plays").
- Redundant with another ("Hair Routines" if "Hair Rituals" exists).
- Genre labels ("Tutorials", "Reviews").
For each rejected candidate, include `{name, reason}` in the `rejected` array so the orchestrator can re-roll.
## Output JSON
```json
{
"categories": [
{"name": "string 2-5 words", "rationale": "string"}
],
"rejected": [{"name": "string", "reason": "string"}]
}
```
Return ONLY the JSON.

View file

@ -0,0 +1,17 @@
# §4.5e — Editorial naming rubric (Stage 8b)
Every trend needs an editorial name. The name is what a strategist will quote in the deck.
**Good trend names are:**
- Phrased like a magazine headline or cultural call-out.
- Specific enough to be recognisable, abstract enough to hold many videos.
*Good examples:* "The Ceremonial Hair Wash", "The 5-Minute Reset", "Anti-Influencer Beauty", "The Confession Routine".
**Reject names that are:**
- Hashtag literals ("#hairtok trend").
- Generic descriptors ("Hair videos", "Self-care content").
- Feature lists ("Videos with shower scenes and ASMR").
- Brand-supplied marketing language ("The Dove Difference").
If asked to QA a name, return `{ok: false, reason: "..."}` for any that fail; otherwise `{ok: true}`.

View file

@ -0,0 +1,64 @@
# Per-video analysis (Stage 6)
You are analysing a single TikTok video for a brand strategist. The video has been pre-validated: caption, transcript, comments, and frame stills are all present and tied to the same video id. Use ALL inputs.
Your job: produce a structured JSON record describing what the video is, how it works as a piece of content, and what audience signals it carries.
## Inputs
You will receive: the canonical video id, the handle, plays/likes/saves/comments_count/shares/stl%, the caption + hashtags, the English transcript, up to 30 top comments (numbered, with like counts), and references to N frame stills.
## Non-negotiable rules
- Quote evidence verbatim. Hooks come from the transcript, audience signals come from comments. Never paraphrase a quote.
- Paid-vs-organic label uses ONLY computable signals: caption ad tags (`#ad`, `#sponsored`, `#gifted`, `#paidpartnership`), brand handle mention in caption, on-screen disclosure visible in a frame, or this creator appearing in ≥3 selected videos with brand mentions. If none fire, label is `unclear`. **Do not infer paid status from "the video looks polished".**
- English-only output. If a comment is not English, the bundle has already translated it; quote the `text_en` field.
- No marketing language. No brand voice. Editorial, observational, specific.
## Output JSON shape (exact)
```json
{
"id": "string",
"what_happens": "string, 2 sentences plain description",
"hook": {
"first_3_seconds": "verbatim transcript snippet",
"pattern": "shock|question|reveal|relatable|tutorial-promise|other",
"why_it_stops_scroll": "string, 1 sentence"
},
"visual_aesthetic": {
"lighting": "natural|harsh|soft|neon|warm|cool|mixed",
"colour_palette": ["#hex","#hex","..."],
"setting": "bathroom|bedroom|outdoor|studio|kitchen|other",
"talent": "single-creator|duo|group|none",
"products_visible": ["product names…"],
"on_screen_text_examples": ["…","…"]
},
"format": "tutorial|confession|hot-take|review|routine|transformation|hack|skit|asmr",
"audio": {
"music_present": true,
"music_mood": "upbeat|melancholic|dreamy|aggressive|none",
"voiceover": true,
"asmr_elements": false
},
"narrative": {
"thesis": "string, what the video is really saying",
"tension": "string, what is the conflict or interest",
"resolution": "string, how the video lands"
},
"audience_signals": {
"comment_themes": ["theme 1","theme 2","..."],
"comment_sentiment_split": {"positive": 0, "neutral": 0, "critical": 0},
"verbatim_quotes": [
{"text": "verbatim english quote", "likes": 0, "theme": "label"}
]
},
"paid_or_organic": {
"label": "paid|organic|unclear",
"reasoning": "string, what evidence supports the label",
"evidence_signals_used": ["caption_ad_tag","caption_brand_handle","on_screen_disclosure","creator_repeat_in_report"]
}
}
```
Return ONLY the JSON. No prose before or after.

View file

@ -0,0 +1,27 @@
# §4.5c — Trend relevance calibration (Stage 8b.5)
Score each trend's `business_question_relevance` from 0.0 to 1.0. The score must be calibrated, not free-floating. Use these anchors EVERY time:
| Score band | Tier | Definition | Worked example for "Why is hair washing emerging as a cultural moment?" |
|---|---|---|---|
| ≥0.80 | core | Trend directly answers the business question or names the territory the brand should claim. | "The Ceremonial Hair Wash": directly explains the cultural moment. **0.85** |
| 0.60–0.79 | core | Trend supports the answer materially; lead-supporting trend. | "Scalp as Self": adjacent ritual, reinforces the territory. **0.70** |
| 0.35–0.59 | peripheral | Trend gives context, useful but not the headline. | "Hair Texture Confidence": same audience, supports framing, not the answer. **0.45** |
| <0.35 | dropped | Real, well-evidenced trend that does not advance the business question. | "Skincare Minimalism": same audience, different category. **0.20** |
Rules:
- A trend can be excellent on its own merits and still score low if it does not advance the business question. That is correct.
- Tier follows directly from score. Do not set tier independently.
- Calibrate against the anchors above; do not anchor against other trends in this report.
## Output JSON
```json
{
"score": 0.0,
"tier": "core|peripheral|dropped",
"justification": "string, 1 sentence referencing the business question"
}
```
Return ONLY the JSON.

View file

@ -0,0 +1,46 @@
# Seed expansion rubric (§4.5a)
You are seeding a TikTok social-listening scrape. Your job is to turn the brief into three tiers of hashtags, a list of search terms, and (only when high-confidence) creator handles. The rubric below is non-negotiable; every tag and term you propose must satisfy it.
## Hashtag tiers
- **Anchor (5–8 tags):** huge volume (millions of views in 30 days), unmistakably on-topic, native to the audience's content language. *Example for Dove on hair washing: `#hairtok`, `#showertok`, `#haircare`.*
- **Discovery (15–20 tags):** medium volume, niche-specific, where rituals and behaviours live. *Example: `#scalpcare`, `#curlyhair`, `#everythingshower`, `#hairporosity`.*
- **Edge (5–10 tags):** small but live (posts in the last 14 days), capturing emergent vocabulary. *Example: `#hairhealing`, `#scalpritual`.*
**Reject hashtags that are:**
- Too broad (`#beauty`, `#viral`).
- Brand-locked self-references (`#dove` for the Dove brief).
- Dead (no posts in the last 14 days).
- Unrelated trends (`#mealprep` on a beauty brief).
For each rejected candidate, include `{tag, reason}` in the `rejected` array so the user can override.
## Search terms (10–20)
- **Good:** how a real person describes the behaviour. *"everything shower routine", "scalp massage at night", "hair washing too much".*
- **Bad:** marketing copy ("luxurious haircare experience"), too narrow ("Dove shampoo review"), too generic ("hair tips").
## Creator handles
- Only include handles you are highly confident exist (mainstream coverage, brand reports, returned in seed-research).
- Otherwise leave the array empty. Pass 1 will surface organic creators via hashtag scrapes anyway. Inventing handles wastes Apify budget.
## Output schema
Return ONLY valid JSON in this exact shape:
```json
{
"hashtags": {
"anchor": [{"tag": "#string", "rationale": "string, max 12 words"}],
"discovery": [{"tag": "#string", "rationale": "string"}],
"edge": [{"tag": "#string", "rationale": "string"}]
},
"search_terms": [{"term": "string", "rationale": "string"}],
"handles": [{"handle": "string-no-at", "type": "brand|competitor|creator", "rationale": "string"}],
"rejected": [{"tag": "#string", "reason": "string"}]
}
```
All `tag` values MUST start with `#`. All `handle` values MUST NOT start with `@`. Handles array may be empty.

View file

@ -0,0 +1,40 @@
# Stage 8b — Trend synthesis
You are clustering atomic insights into editorial trends. Target 50 trends per report; hard floor 35. **Never split a weak trend to hit a number.**
## Inputs
- The brief (brand, audience, business question, KPIs, context vision).
- The list of brief-driven categories already chosen for this report.
- The atomic insights list (each with id, type, observation, supporting_video_ids, frequency).
- The §4.5e editorial naming rubric (apply it to every name you choose).
## Per-trend rules
Each trend MUST:
- Have an editorial name conforming to §4.5e.
- Have a 2–3 sentence narrative.
- Cite at least 5 supporting `video_ids` (drawn from the atomic insights).
- Have a `category` field referencing one of the brief-driven categories exactly.
- Have a `lens_tags` array, subset of `["hooks","visual","audio","sentiment","narrative"]`.
- Have `top_atomic_ids` listing the atomic insights that anchor it.
## Output JSON
```json
{
"trends": [
{
"slug": "kebab-case-slug",
"name": "string editorial name",
"category": "string matching one of the report categories exactly",
"narrative": "2-3 sentences, observational, English",
"lens_tags": ["hooks","visual","audio","sentiment","narrative"],
"top_atomic_ids": ["ATM-XXXX","..."],
"supporting_video_ids": ["string id","..."]
}
]
}
```
Return ONLY the JSON.

View file

@ -0,0 +1,179 @@
// Stage 10 — output assembly.
//
// Produces:
// outputs/dataset_v2.json — joined brief + categories + trends + lenses + qa + compare.
// outputs/dashboard.html — self-contained HTML with covers base64-inlined (≤3 MB).
//
// The full React/Vite per-report dashboard (10a) is scaffolded by Phase F-UI work
// outside this file; here we produce the data + portable claude.ai bundle.
import { writeFileSync, readFileSync, existsSync, statSync, mkdirSync } from 'node:fs';
import { join } from 'node:path';
import { PATHS, ensureDir } from '../lib/paths.js';
import type { BriefInput } from '../../server/schemas/brief.js';
import type { Trend } from './stage_8_trends.js';
// Size limit (bytes) for base64-inlining a cover: inlineCoverIfPresent returns
// null for any cover.jpg larger than this.
const COVER_INLINE_MAX_BYTES = 250_000; // ~250 KB per cover ceiling, downscaled separately
/**
 * Best-effort JSON loader for optional stage artefacts.
 *
 * @returns The parsed value (cast to `T`, NOT validated), or null when the file
 *   does not exist, cannot be read, or is not valid JSON.
 *
 * A file that exists but cannot be read or parsed is logged with a warning.
 * Without it, a corrupt artefact is indistinguishable from a missing one and
 * the report is silently built without that data.
 */
function readJson<T>(path: string): T | null {
  if (!existsSync(path)) return null;
  try {
    return JSON.parse(readFileSync(path, 'utf-8')) as T;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`[stage_10] ignoring unreadable JSON at ${path}: ${reason}`);
    return null;
  }
}
/**
 * Return a video's cover.jpg as a `data:image/jpeg;base64,…` URI.
 *
 * @returns null when the cover file is absent or larger than
 *   COVER_INLINE_MAX_BYTES (oversized covers are skipped; stage 10a should
 *   downscale before inline).
 */
function inlineCoverIfPresent(reportId: string, videoId: string): string | null {
  const coverPath = join(PATHS.enrichedVideo(reportId, videoId), 'cover.jpg');
  const usable = existsSync(coverPath) && statSync(coverPath).size <= COVER_INLINE_MAX_BYTES;
  if (!usable) return null;
  const encoded = readFileSync(coverPath).toString('base64');
  return `data:image/jpeg;base64,${encoded}`;
}
/**
 * Shape of outputs/dataset_v2.json as assembled by runStage10Build.
 * Fields typed `unknown` are copied from other stages' JSON files as-is and
 * are null when the corresponding file could not be read.
 */
export interface DatasetV2 {
// The brief passed to runStage10Build.
brief: BriefInput;
// ISO timestamp taken when the dataset was assembled.
generated_at: string;
categories: { name: string; rationale: string }[];
// Each trend plus `top_videos`: up to 8 of its supporting videos with pass-1
// stats and, when available and small enough, a base64 data-URI cover.
trends: Array<Trend & { top_videos?: Array<{ id: string; handle: string; plays: number; stl_pct: number; cover_b64?: string | null }> }>;
qa: {
paid_organic_review: unknown;
coverage_check: unknown;
};
// Comparison data from outputs/compare/*.json; null when returning_trends.json is unavailable.
compare: unknown | null;
methodology: {
pass1_spend_log: unknown;
manifest_summary: unknown;
selection_rules: unknown;
};
}
/** Summary returned by runStage10Build after both output files are written. */
export interface Stage10Result {
ok: true;
// Paths of the written files, keyed `dataset_v2` and `dashboard_html`.
outputs: Record<string, string>;
// UTF-8 byte length of the dataset JSON text.
dataset_size_bytes: number;
// UTF-8 byte length of the HTML document.
html_size_bytes: number;
trend_count: number;
// Number of cover images inlined; a video listed under several trends is counted once per trend.
inlined_covers: number;
}
/**
 * Stage 10: assemble the final dataset and the self-contained HTML bundle.
 *
 * Reads the earlier stages' JSON artefacts through readJson (a missing file
 * yields null, or an empty list for trends / pass-1 videos), enriches each
 * trend with up to 8 supporting videos, then writes the dataset JSON and the
 * dashboard HTML to the paths given by PATHS.
 *
 * @param reportId Report whose artefacts are read and whose outputs are written.
 * @param brief    Brief embedded in the dataset and used for the HTML title/header.
 * @returns Output paths, byte sizes, trend count and number of inlined covers.
 */
export async function runStage10Build(reportId: string, brief: BriefInput): Promise<Stage10Result> {
ensureDir(PATHS.outputsDir(reportId));
const trends = readJson<Trend[]>(PATHS.trends(reportId)) ?? [];
const categoriesData = readJson<{ categories: { name: string; rationale: string }[] }>(PATHS.categories(reportId));
const paidOrganic = readJson<unknown>(join(PATHS.qaDir(reportId), 'paid_organic_review.json'));
const coverage = readJson<unknown>(join(PATHS.qaDir(reportId), 'coverage_check.json'));
const spendLog = readJson<unknown>(PATHS.spendLog(reportId));
const selectionRules = readJson<unknown>(PATHS.selectionRules(reportId));
const manifestData = readJson<{ summary: unknown }>(PATHS.manifestJson(reportId));
// Lift handle/plays/stl from pass1 to enrich top_videos.
const pass1 = readJson<Array<{ id: string; handle: string; plays: number; stl_pct: number }>>(PATHS.pass1Videos(reportId)) ?? [];
const pass1Map = new Map(pass1.map((v) => [v.id, v]));
// Compose the dataset.
// `inlined` counts every cover that gets embedded; the same video appearing
// under two trends is read and counted twice.
let inlined = 0;
const enrichedTrends = trends.map((t) => {
// Only the first 8 supporting videos are surfaced per trend.
const supporting = t.supporting_video_ids.slice(0, 8);
const top_videos = supporting.map((id) => {
const p = pass1Map.get(id);
const cover_b64 = inlineCoverIfPresent(reportId, id);
if (cover_b64) inlined++;
return {
id,
// Videos absent from the pass-1 file get placeholder stats.
handle: p?.handle ?? 'unknown',
plays: p?.plays ?? 0,
stl_pct: p?.stl_pct ?? 0,
cover_b64,
};
});
return { ...t, top_videos };
});
// returning_trends.json acts as the switch: the other compare files are only
// read when it is present.
const compare = readJson<unknown>(join(PATHS.outputsDir(reportId), 'compare', 'returning_trends.json'));
const dataset: DatasetV2 = {
brief,
generated_at: new Date().toISOString(),
categories: categoriesData?.categories ?? [],
trends: enrichedTrends,
qa: { paid_organic_review: paidOrganic, coverage_check: coverage },
compare: compare ? {
new_trends: readJson<unknown>(join(PATHS.outputsDir(reportId), 'compare', 'new_trends.json')),
returning_trends: compare,
faded_trends: readJson<unknown>(join(PATHS.outputsDir(reportId), 'compare', 'faded_trends.json')),
category_momentum:readJson<unknown>(join(PATHS.outputsDir(reportId), 'compare', 'category_momentum.json')),
} : null,
methodology: {
pass1_spend_log: spendLog,
manifest_summary: manifestData?.summary ?? null,
selection_rules: selectionRules,
},
};
const datasetJson = JSON.stringify(dataset, null, 2);
const datasetPath = PATHS.datasetV2(reportId);
writeFileSync(datasetPath, datasetJson);
// Self-contained HTML bundle (10b). Minimal skeleton — claude.ai will render rich UI on upload.
// The full dataset JSON is embedded in a <script type="application/json"> tag.
// NOTE(review): `business_question_relevance.tier` is interpolated below without
// escapeHtml, and a trend lacking `business_question_relevance` would throw here — confirm
// both are guaranteed by the Trend schema.
const html = `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>${escapeHtml(brief.client_name)} Social Listening V2</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", system-ui, sans-serif; background: #0a0a0a; color: #e0e0e0; padding: 32px; }
header { max-width: 1100px; margin: 0 auto 32px; }
h1 { font-size: 28px; font-weight: 800; letter-spacing: -0.5px; }
.muted { color: #888; margin-top: 6px; font-size: 14px; }
.grid { max-width: 1100px; margin: 0 auto; display: grid; grid-template-columns: repeat(auto-fill, minmax(260px, 1fr)); gap: 16px; }
.card { background: #141414; border: 1px solid #2a2a2a; border-radius: 12px; padding: 16px; }
.card h3 { font-size: 14px; font-weight: 700; color: #f5a623; margin-bottom: 6px; }
.card p { font-size: 13px; line-height: 1.45; color: #c8c8c8; }
.badge { display: inline-block; font-size: 10px; padding: 2px 8px; border-radius: 999px; background: #2a2a2a; color: #aaa; text-transform: uppercase; letter-spacing: 1px; margin-right: 4px; margin-bottom: 6px; }
.tier-core { color: #f5a623; }
.tier-peripheral { color: #888; }
</style>
</head>
<body>
<header>
<h1>${escapeHtml(brief.client_name)} Social Listening</h1>
<div class="muted">${escapeHtml(brief.business_question)}</div>
<div class="muted">${dataset.trends.length} trends across ${dataset.categories.length} categories generated ${new Date(dataset.generated_at).toUTCString()}</div>
</header>
<main class="grid">
${dataset.trends.map((t) => `
<div class="card">
<span class="badge tier-${t.business_question_relevance.tier}">${t.business_question_relevance.tier}</span>
<span class="badge">${escapeHtml(t.category)}</span>
<h3>${escapeHtml(t.name)}</h3>
<p>${escapeHtml(t.narrative)}</p>
</div>`).join('\n')}
</main>
<script type="application/json" id="atrium-data">${escapeJsonForScript(datasetJson)}</script>
</body>
</html>`;
const htmlPath = PATHS.dashboardHtml(reportId);
writeFileSync(htmlPath, html);
return {
ok: true,
outputs: { dataset_v2: datasetPath, dashboard_html: htmlPath },
// Sizes are UTF-8 byte lengths of the strings just written.
dataset_size_bytes: Buffer.byteLength(datasetJson),
html_size_bytes: Buffer.byteLength(html),
trend_count: trends.length,
inlined_covers: inlined,
};
}
/**
 * Escape a string for insertion into HTML text or a quoted attribute value.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with entities in a single pass. The
 * single quote is included so the result is also safe inside single-quoted
 * attributes.
 */
function escapeHtml(s: string): string {
  const entities: Record<string, string> = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return s.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
}
/**
 * Make a JSON document safe to embed inside a `<script>` element.
 *
 * Every `<` is rewritten as the JSON escape `\u003c`. In valid JSON a `<` can
 * only occur inside a string literal, so the parsed value is unchanged, while
 * the HTML parser can no longer see `</script` (in any spelling, e.g.
 * `</script >`) or `<!--` inside the payload. Matching only the exact text
 * `</script>` misses those variants.
 */
function escapeJsonForScript(s: string): string {
  return s.replace(/</g, '\\u003c');
}
// No-op reference: this bare expression statement does nothing at runtime and
// only mentions the imported `mkdirSync`, which nothing else in this module calls.
// NOTE(review): presumably here to keep unused-import checks quiet — confirm, or drop the import.
mkdirSync; // touch (used inside ensureDir, kept here for ESM clarity)

View file

@ -0,0 +1,72 @@
// Stage 1: turn a brief into a tier-labelled seed list using the §4.5a rubric.
import { writeFileSync } from 'node:fs';
import { z } from 'zod';
import { callClaudeJSON } from '../lib/claude.js';
import { loadRubric } from '../lib/rubrics.js';
import { PATHS } from '../lib/paths.js';
import type { BriefInput } from '../../server/schemas/brief.js';
// One hashtag entry; the same shape is used by all three tiers. Tags must start with '#'.
const HASHTAG_ENTRY = z.object({ tag: z.string().startsWith('#'), rationale: z.string() });

// One creator / brand / competitor handle, stored without a leading '@'.
const HANDLE_ENTRY = z.object({
  handle: z.string().min(1).regex(/^[^@]/, 'handles must not start with @'),
  type: z.enum(['brand', 'competitor', 'creator']),
  rationale: z.string(),
});

/**
 * Validation schema for the Stage 1 seeds JSON returned by Claude: three
 * hashtag tiers, search terms, handles, and the candidates the model rejected
 * (`rejected` defaults to an empty list when absent).
 */
export const SEEDS_SCHEMA = z.object({
  hashtags: z.object({
    anchor: z.array(HASHTAG_ENTRY),
    discovery: z.array(HASHTAG_ENTRY),
    edge: z.array(HASHTAG_ENTRY),
  }),
  search_terms: z.array(z.object({ term: z.string().min(1), rationale: z.string() })),
  handles: z.array(HANDLE_ENTRY),
  rejected: z.array(z.object({ tag: z.string(), reason: z.string() })).default([]),
});

/** Parsed seeds, as inferred from SEEDS_SCHEMA. */
export type Seeds = z.infer<typeof SEEDS_SCHEMA>;
/** Result of a stage run: a success flag plus the files written, keyed by a short output name. */
export interface StageRunResult {
ok: boolean;
// Output name → file path (e.g. runStage1Seeds returns `{ seeds: <path> }`).
outputs: Record<string, string>;
}
/**
 * Stage 1: ask Claude for a tiered seed list (hashtags, search terms, handles)
 * for the brief, validate it against SEEDS_SCHEMA, make sure the brand and
 * competitor handles from the brief are present, and write the result to the
 * seeds JSON path.
 *
 * Handles injected from the brief are normalised the same way the schema
 * requires of Claude's handles (no leading '@') and are de-duplicated
 * case-insensitively against everything already in the list, so a repeated
 * competitor or an '@'-prefixed handle never produces a duplicate or invalid
 * entry.
 *
 * @throws Error when Claude's reply does not satisfy SEEDS_SCHEMA.
 */
export async function runStage1Seeds(input: { reportId: string; brief: BriefInput }): Promise<StageRunResult> {
  const rubric = loadRubric('seed_quality');
  const briefBlock = `# Brief\n\n` +
    `Brand: ${input.brief.brand.name} (@${input.brief.brand.handle})\n` +
    (input.brief.brand.positioning ? `Positioning: ${input.brief.brand.positioning}\n` : '') +
    `Category: ${input.brief.category}\n` +
    `Geo: ${input.brief.geo} | Language: ${input.brief.language}\n` +
    `Audience: ${input.brief.audience.primary} (${input.brief.audience.age_range} ${input.brief.audience.gender})\n` +
    `Audience interests: ${input.brief.audience.interests.join(', ')}\n` +
    `Competitors: ${input.brief.competitors.map((c) => `${c.name} (@${c.handle})`).join(', ')}\n` +
    `Business question: ${input.brief.business_question}\n` +
    `KPIs: ${input.brief.kpis.join(' | ')}\n` +
    (input.brief.context_vision ? `\nReport context / vision:\n${input.brief.context_vision}\n` : '');
  const prompt = `${rubric}\n\n---\n\n${briefBlock}\n\nReturn the seeds JSON now.`;
  const raw = await callClaudeJSON<unknown>(prompt, { label: 'stage_1_seeds', maxTokens: 8192 });
  const parsed = SEEDS_SCHEMA.safeParse(raw);
  if (!parsed.success) {
    throw new Error(`Stage 1 returned invalid seeds JSON: ${parsed.error.message}`);
  }

  // Inject brand + competitor handles unconditionally if Claude didn't list them.
  const handles = [...parsed.data.handles];
  const seen = new Set(handles.map((h) => h.handle.toLowerCase()));
  // Normalise a brief handle and reserve it; null means "empty or already listed".
  const claim = (rawHandle: string): string | null => {
    const handle = rawHandle.trim().replace(/^@+/, '');
    const key = handle.toLowerCase();
    if (!handle || seen.has(key)) return null;
    seen.add(key);
    return handle;
  };

  const brandHandle = claim(input.brief.brand.handle);
  if (brandHandle) {
    // The brand goes first so it leads the handle list.
    handles.unshift({ handle: brandHandle, type: 'brand', rationale: 'Brand from brief' });
  }
  for (const c of input.brief.competitors) {
    const competitorHandle = claim(c.handle);
    if (competitorHandle) {
      handles.push({ handle: competitorHandle, type: 'competitor', rationale: `Competitor: ${c.name}` });
    }
  }

  const enriched = { ...parsed.data, handles };
  const outPath = PATHS.seedsJson(input.reportId);
  writeFileSync(outPath, JSON.stringify(enriched, null, 2));
  return { ok: true, outputs: { seeds: outPath } };
}

View file

@ -0,0 +1,275 @@
// Stage 2: broad TikTok pull driven by seeds.json. Budget-bounded; date-filtered;
// engagement-floored; deduped by canonical TikTok video id. Writes pass1_videos.json
// and a spend log that tracks raw_returned vs kept_after_floor per scrape.
import { writeFileSync, readFileSync, mkdirSync, existsSync } from 'node:fs';
import { ACTORS, defaultLimits, runActor, resetBudget, setSoftCap, getRunningCost, isBudgetExceeded, onApifyCost } from '../lib/apify_client.js';
import { extractTikTokId, canonicalTikTokUrl } from '../lib/ids.js';
import { applyEngagementFloor, type EngagementFloor, type EngagementCounters, computeStlPct } from '../lib/engagement_floor.js';
import { PATHS } from '../lib/paths.js';
import type { BriefInput } from '../../server/schemas/brief.js';
import type { Seeds } from './stage_1_seeds.js';
/** One normalised video from the pass-1 scrape, as produced by normaliseRaw. */
export interface Pass1Video {
// Video id as returned by extractTikTokId.
id: string;
// Author name with any leading '@' removed.
handle: string;
// Built by canonicalTikTokUrl(id, handle).
url_canonical: string;
caption: string;
// Lower-cased, each prefixed with '#'.
hashtags: string[];
// Engagement counters; a counter missing from the raw item is stored as 0.
plays: number;
likes: number;
saves: number;
comments_count: number;
shares: number;
// Result of computeStlPct over the counters above, rounded to 2 decimals.
stl_pct: number;
duration_sec: number;
posted_at: string; // ISO
// Cover image URL, or null when the raw item carried none.
cover: string | null;
/** mp4 direct download URL — ephemeral (~14 day TTL). Stage 4 fetches within hours. */
download_url: string | null;
_source: string; // e.g. "hashtag:hairtok", "profile:dove", "search:everything shower"
// ISO timestamp taken when the item was normalised.
_scraped_at: string;
}
// Loose shape of a raw scraped item before normalisation. Every field is
// optional; normaliseRaw tries several alternatives for the URL, cover and
// download address. (Presumably the scraper actors' output — field names are
// taken as-is from what normaliseRaw reads.)
interface RawTikTok {
id?: string;
// Candidate video URLs, tried in this order: webVideoUrl, videoUrl, url.
webVideoUrl?: string;
videoUrl?: string;
url?: string;
authorMeta?: { name?: string };
// Caption text.
text?: string;
// Either plain strings or objects carrying a `name`.
hashtags?: Array<{ name?: string } | string>;
playCount?: number;
// Mapped to `likes`.
diggCount?: number;
// Mapped to `saves`.
collectCount?: number;
commentCount?: number;
shareCount?: number;
videoMeta?: {
duration?: number;
downloadAddr?: string;
coverUrl?: string;
originalCoverUrl?: string;
};
// First entry is the fallback download URL when videoMeta.downloadAddr is absent.
mediaUrls?: string[];
// Posting time: ISO string preferred, else `createTime` in epoch seconds.
createTimeISO?: string;
createTime?: number;
covers?: { default?: string };
}
/**
 * Resolve a raw item's publish time to an ISO-8601 string.
 *
 * Prefers `createTimeISO`; falls back to `createTime`, which is treated as
 * epoch seconds. Returns null when neither field yields a valid date, so a
 * single malformed item is skipped by the caller instead of aborting the whole
 * scrape with the RangeError that `toISOString()` throws on an invalid Date.
 *
 * @param raw Raw actor item.
 * @returns ISO timestamp, or null when no usable date is present.
 */
function parseDate(raw: RawTikTok): string | null {
  const candidates: Date[] = [];
  if (raw.createTimeISO) candidates.push(new Date(raw.createTimeISO));
  if (typeof raw.createTime === 'number') candidates.push(new Date(raw.createTime * 1000));
  for (const candidate of candidates) {
    // getTime() is NaN for an unparseable input; toISOString() would throw on it.
    if (!Number.isNaN(candidate.getTime())) return candidate.toISOString();
  }
  return null;
}
/**
 * Flatten the actor's hashtag list into lower-cased, "#"-prefixed strings.
 * Entries may be plain strings or `{ name }` objects; entries without a usable
 * name are dropped. A non-array input yields an empty list.
 */
function normaliseTags(raw: RawTikTok['hashtags']): string[] {
  if (!Array.isArray(raw)) return [];
  const tags: string[] = [];
  for (const entry of raw) {
    const name = typeof entry === 'string' ? entry : entry?.name;
    if (!name) continue;
    const lowered = name.toLowerCase();
    tags.push(name.startsWith('#') ? lowered : `#${lowered}`);
  }
  return tags;
}
/**
 * Convert one raw actor item into a Pass1Video.
 *
 * Returns null (the item is skipped) when no TikTok id can be extracted, when
 * the author handle is missing, or when no publish date can be parsed.
 *
 * @param raw    Raw item as returned by the scrape actor.
 * @param source Provenance tag stored on the record, e.g. "hashtag:hairtok".
 * @returns The normalised record, or null when a required field is missing.
 */
function normaliseRaw(raw: RawTikTok, source: string): Pass1Video | null {
  const url = raw.webVideoUrl || raw.videoUrl || raw.url || '';
  const id = extractTikTokId(raw.id || url);
  if (!id) return null;
  const handle = (raw.authorMeta?.name || '').replace(/^@/, '');
  if (!handle) return null;
  const posted = parseDate(raw);
  if (!posted) return null;

  // Missing counters are treated as zero so the engagement floor can still be applied.
  const plays = raw.playCount ?? 0;
  const likes = raw.diggCount ?? 0;
  const saves = raw.collectCount ?? 0;
  const comments = raw.commentCount ?? 0;
  const shares = raw.shareCount ?? 0;
  const stl = computeStlPct({ plays, likes, saves, comments_count: comments, shares });

  return {
    id,
    handle,
    url_canonical: canonicalTikTokUrl(id, handle),
    caption: raw.text || '',
    hashtags: normaliseTags(raw.hashtags),
    plays,
    likes,
    saves,
    comments_count: comments,
    shares,
    stl_pct: Math.round(stl * 100) / 100,
    duration_sec: raw.videoMeta?.duration ?? 0,
    posted_at: posted,
    // hashtag scraper returns cover URL at videoMeta.coverUrl/originalCoverUrl;
    // older actor versions used `covers.default`. Try every shape. `||` (not `??`)
    // so an empty-string field falls through to the next candidate instead of
    // being stored as "".
    cover: raw.videoMeta?.coverUrl || raw.videoMeta?.originalCoverUrl || raw.covers?.default || null,
    // mp4 download is empty unless `shouldDownloadVideos: true` was passed; we keep
    // the URL if present, otherwise accept that frame extraction is best-effort.
    // An empty `downloadAddr` must not mask a usable `mediaUrls[0]`.
    download_url: raw.videoMeta?.downloadAddr || raw.mediaUrls?.[0] || null,
    _source: source,
    _scraped_at: new Date().toISOString(),
  };
}
/** One row of the spend log: the outcome of a single scrape job. */
interface SpendEntry {
// Human-readable job label, e.g. "hashtag:<tag> (<tier>)".
label: string;
source_kind: 'hashtag' | 'profile' | 'search';
// The tag, handle, or search term the job was run for.
source_value: string;
// Copied from the actor run result.
cost_usd: number;
run_id: string;
// Number of raw items the actor returned, before any filtering.
raw_returned: number;
// Taken from the engagement-floor counters.
kept_after_floor: number;
// Videos from this job whose id had not been seen in an earlier job.
kept_after_dedup: number;
floor_counters: EngagementCounters;
}
/** Inputs to runStage2Pass1Scrape. */
interface Stage2Args {
// Identifies the report whose on-disk paths (seeds, pass1, spend log) are used.
reportId: string;
// Supplies budget_usd, date_window_days, geo and the engagement-floor fields.
brief: BriefInput;
}
/**
 * True when `iso` is no older than `days` days before now.
 * An unparseable timestamp compares as NaN and therefore yields false.
 */
function inDateWindow(iso: string, days: number): boolean {
  const MS_PER_DAY = 24 * 60 * 60 * 1000;
  const earliestMs = Date.now() - days * MS_PER_DAY;
  const postedMs = new Date(iso).getTime();
  return postedMs >= earliestMs;
}
/**
 * Stage 2 entry point: run one scrape job per seed, normalise and filter the
 * results, dedupe by video id, and write pass1_videos.json plus the spend log.
 *
 * Jobs run sequentially in a fixed priority order and stop as soon as
 * isBudgetExceeded() reports true. A job whose actor call throws is logged and
 * skipped; it does not abort the stage.
 *
 * @throws Error when seeds.json for the report does not exist.
 * @returns Paths of the two output files, the number of unique videos kept and
 *          the running cost reported by the Apify client.
 */
export async function runStage2Pass1Scrape(args: Stage2Args): Promise<{ ok: true; outputs: Record<string, string>; total_videos: number; total_cost_usd: number }> {
const { reportId, brief } = args;
const seedsPath = PATHS.seedsJson(reportId);
if (!existsSync(seedsPath)) throw new Error(`seeds.json missing at ${seedsPath}. Run stage 1 first.`);
const seeds = JSON.parse(readFileSync(seedsPath, 'utf-8')) as Seeds;
// Budget: hard ceiling 95% of brief.budget_usd, soft Pass-1 cap 50%.
const hardCeiling = brief.budget_usd * 0.95;
const pass1Cap = brief.budget_usd * 0.5;
resetBudget({ hardCeilingUsd: hardCeiling });
setSoftCap(pass1Cap);
mkdirSync(PATHS.pass1(reportId), { recursive: true });
const rawDumpsDir = `${PATHS.pass1(reportId)}/raw`;
mkdirSync(rawDumpsDir, { recursive: true });
// Insertion-ordered id -> video map; the first job to return a video wins.
const seenIds = new Map<string, Pass1Video>();
const spendLog: SpendEntry[] = [];
const floor: EngagementFloor = {
min_likes: brief.min_likes,
min_plays: brief.min_plays,
min_stl_pct: brief.min_stl_pct,
};
const limits = defaultLimits();
const dateDays = brief.date_window_days;
// Cost callback writes raw cost to a side log; per-scrape totals computed below.
// NOTE(review): the callback registered here is a no-op; totals are read via getRunningCost().
onApifyCost(() => { /* aggregated through getRunningCost() */ });
type ScrapeJob =
| { kind: 'hashtag'; tag: string; tier: 'anchor' | 'discovery' | 'edge' }
| { kind: 'profile'; handle: string }
| { kind: 'search'; term: string };
// Job priority: anchor hashtags, profiles, discovery hashtags, search terms, edge hashtags.
// Later entries are the first to be skipped once the budget check trips.
const order: ScrapeJob[] = [];
for (const t of seeds.hashtags.anchor) order.push({ kind: 'hashtag', tag: t.tag, tier: 'anchor' });
for (const h of seeds.handles) order.push({ kind: 'profile', handle: h.handle });
for (const t of seeds.hashtags.discovery) order.push({ kind: 'hashtag', tag: t.tag, tier: 'discovery' });
for (const s of seeds.search_terms) order.push({ kind: 'search', term: s.term });
for (const t of seeds.hashtags.edge) order.push({ kind: 'hashtag', tag: t.tag, tier: 'edge' });
for (const job of order) {
if (isBudgetExceeded()) {
console.log(`[stage 2] Pass-1 cap reached at $${getRunningCost().toFixed(2)} — stopping`);
break;
}
const label = job.kind === 'hashtag'
? `hashtag:${job.tag} (${job.tier})`
: job.kind === 'profile'
? `profile:${job.handle}`
: `search:${job.term}`;
// Build the actor id and input for this job kind.
let actor: string;
let input: Record<string, unknown>;
if (job.kind === 'hashtag') {
actor = ACTORS.TIKTOK_HASHTAG;
input = {
hashtags: [job.tag.replace(/^#/, '')],
resultsPerPage: limits.resultsPerPage,
shouldDownloadVideos: false,
shouldDownloadCovers: false,
proxyCountryCode: brief.geo,
// engagement floor applied actor-side where supported
minPlayCount: brief.min_plays,
};
} else if (job.kind === 'profile') {
actor = ACTORS.TIKTOK_PROFILE;
input = {
profiles: [job.handle.replace(/^@/, '')],
resultsPerPage: limits.resultsPerPage,
shouldDownloadVideos: false,
shouldDownloadCovers: false,
};
} else {
actor = ACTORS.TIKTOK_HASHTAG; // hashtag actor accepts search terms via "searchQueries"
input = {
searchQueries: [job.term],
resultsPerPage: limits.resultsPerPage,
shouldDownloadVideos: false,
shouldDownloadCovers: false,
proxyCountryCode: brief.geo,
minPlayCount: brief.min_plays,
};
}
let res;
try {
res = await runActor<RawTikTok>(actor, input, label);
} catch (err) {
// A failed job is skipped; no spend-log entry is recorded for it.
console.warn(`[stage 2] ${label} failed: ${(err as Error).message}`);
continue;
}
// Persist raw dump for forensics
if (res.status === 'OK') {
writeFileSync(`${rawDumpsDir}/${res.run_id}.json`, JSON.stringify(res.items, null, 2));
}
// Provenance tag stored on each video; unlike `label` it omits the hashtag tier.
const sourceTag = job.kind === 'hashtag' ? `hashtag:${job.tag}` : job.kind === 'profile' ? `profile:${job.handle}` : `search:${job.term}`;
// Normalise, then drop items that are unusable or outside the date window.
const normalised: Pass1Video[] = [];
for (const raw of res.items) {
const v = normaliseRaw(raw, sourceTag);
if (!v) continue;
if (!inDateWindow(v.posted_at, dateDays)) continue;
normalised.push(v);
}
// Re-validate the engagement floor locally, then dedupe against earlier jobs.
const { kept, counters } = applyEngagementFloor(normalised, floor);
let newCount = 0;
for (const v of kept) {
if (!seenIds.has(v.id)) { seenIds.set(v.id, v); newCount++; }
}
spendLog.push({
label,
source_kind: job.kind,
source_value: job.kind === 'hashtag' ? job.tag : job.kind === 'profile' ? job.handle : job.term,
cost_usd: res.cost_usd,
run_id: res.run_id,
raw_returned: res.items.length,
kept_after_floor: counters.kept_after_floor,
kept_after_dedup: newCount,
floor_counters: counters,
});
console.log(`[stage 2] ${label}: raw=${res.items.length} → kept=${counters.kept_after_floor} → new=${newCount}`);
}
// Output is ordered by plays, highest first; Stages 3 and 5 use this order as the ranking.
const allVideos = [...seenIds.values()].sort((a, b) => b.plays - a.plays);
writeFileSync(PATHS.pass1Videos(reportId), JSON.stringify(allVideos, null, 2));
writeFileSync(PATHS.spendLog(reportId), JSON.stringify({
pass: 'pass1',
hard_ceiling_usd: hardCeiling,
pass1_cap_usd: pass1Cap,
total_cost_usd: getRunningCost(),
total_videos: allVideos.length,
entries: spendLog,
}, null, 2));
return {
ok: true,
outputs: {
pass1_videos: PATHS.pass1Videos(reportId),
spend_log: PATHS.spendLog(reportId),
},
total_videos: allVideos.length,
total_cost_usd: getRunningCost(),
};
}

View file

@ -0,0 +1,88 @@
// Stage 3: pick which videos go deep. Recipe-led; user can accept or override.
// Output: selected_video_ids.json + selection_rules.json (audit trail).
import { writeFileSync, readFileSync, mkdirSync, existsSync } from 'node:fs';
import { matchRecipe, parseFilterExpression, applyFilter, RECIPES, type RecipeId } from '../lib/recipes.js';
import { PATHS } from '../lib/paths.js';
import type { Pass1Video } from './stage_2_pass1_scrape.js';
import type { BriefInput } from '../../server/schemas/brief.js';
/** Audit record of how the Stage 3 selection was made; written to selection_rules.json. */
export interface SelectionRules {
recipe_id: RecipeId;
recipe_name: string;
// The filter expression that was actually applied.
filter_expression: string;
// 'custom' when a custom filter was supplied, 'user_override' when only a recipe
// was forced, 'recipe_default' when the recipe was matched from the business question.
filter_source: 'recipe_default' | 'user_override' | 'custom';
// Number of videos in pass1_videos.json.
total_pass1: number;
selected_count: number;
business_question: string;
// ISO timestamp of the selection run.
applied_at: string;
}
/** Inputs to runStage3Select. */
export interface Stage3Args {
reportId: string;
// Provides business_question, used to match a recipe when none is forced.
brief: BriefInput;
/** force a specific recipe (CLI: --recipe A|B|C|D). */
forceRecipe?: RecipeId;
/** user-supplied custom filter expression (CLI: --custom "..."). Takes precedence over the recipe's default filter. */
customFilter?: string;
}
/**
 * Stage 3 entry point: choose which pass-1 videos go on to deep enrichment.
 *
 * Precedence: a custom filter wins over everything; otherwise a forced recipe's
 * default filter is used; otherwise the recipe is matched from the brief's
 * business question. Writes selected_video_ids.json and selection_rules.json.
 *
 * @throws Error when pass1_videos.json is missing or contains no videos.
 * @returns Output paths, the selected ids (ordered by pass-1 rank) and the audit record.
 */
export async function runStage3Select(args: Stage3Args): Promise<{ ok: true; outputs: Record<string, string>; selected: string[]; rules: SelectionRules }> {
  const { reportId, brief, forceRecipe, customFilter } = args;

  const pass1Path = PATHS.pass1Videos(reportId);
  if (!existsSync(pass1Path)) throw new Error(`pass1_videos.json missing at ${pass1Path}. Run stage 2 first.`);
  const videos = JSON.parse(readFileSync(pass1Path, 'utf-8')) as Pass1Video[];
  if (videos.length === 0) throw new Error('No pass1 videos to select from.');

  // Resolve recipe + filter. matchRecipe is only consulted when no recipe was forced.
  const usingCustom = Boolean(customFilter);
  const recipeId: RecipeId = usingCustom
    ? forceRecipe ?? matchRecipe(brief.business_question)
    : forceRecipe
      ? forceRecipe
      : matchRecipe(brief.business_question);
  const filterSource: SelectionRules['filter_source'] = usingCustom
    ? 'custom'
    : forceRecipe
      ? 'user_override'
      : 'recipe_default';
  const filterExpression: string = customFilter ? customFilter : RECIPES[recipeId].default_filter;

  const selected = applyFilter(videos, parseFilterExpression(filterExpression));

  // Order the selected ids by their position in pass1_videos.json so downstream
  // stages see a stable ranking; ids not found there sort last.
  const rankById = new Map<string, number>();
  videos.forEach((video, position) => {
    rankById.set(video.id, position);
  });
  const rankOf = (videoId: string): number => rankById.get(videoId) ?? Number.MAX_SAFE_INTEGER;
  selected.sort((left, right) => rankOf(left) - rankOf(right));

  const recipeName = RECIPES[recipeId].name;
  const rules: SelectionRules = {
    recipe_id: recipeId,
    recipe_name: recipeName,
    filter_expression: filterExpression,
    filter_source: filterSource,
    total_pass1: videos.length,
    selected_count: selected.length,
    business_question: brief.business_question,
    applied_at: new Date().toISOString(),
  };

  mkdirSync(PATHS.pass2(reportId), { recursive: true });
  const selectedIdsPath = PATHS.selectedIds(reportId);
  const rulesPath = PATHS.selectionRules(reportId);
  writeFileSync(selectedIdsPath, JSON.stringify(selected, null, 2));
  writeFileSync(rulesPath, JSON.stringify(rules, null, 2));
  console.log(`[stage 3] recipe ${recipeId} (${RECIPES[recipeId].name}) → ${selected.length} videos selected from ${videos.length}`);

  return {
    ok: true,
    outputs: {
      selected_ids: PATHS.selectedIds(reportId),
      selection_rules: PATHS.selectionRules(reportId),
    },
    selected,
    rules,
  };
}

View file

@ -0,0 +1,393 @@
// Stage 4 — deep per-video enrichment. THE LINKING FIX.
//
// Every Apify response is matched back to the canonical TikTok id via
// extractTikTokId, never via URL string equality. Drift is logged loudly to
// drift_log.jsonl and surfaces as `failed` assets in the manifest.
//
// Per-video folder layout (V3 brief §4 stage 4):
// enriched/{video_id}/
// metadata.json
// cover.jpg
// mp4.bin (downloaded for frame extraction; deleted after)
// transcript.json { language_detected, text_original, text_en, source }
// comments.json [{rank, author_handle, text_original, text_en, likes, replies_count, posted_at}]
// frames/0001.jpg, 0002.jpg, …
// bundle.json — last write, the only file Stage 6 reads
import { writeFileSync, readFileSync, existsSync, mkdirSync, rmSync } from 'node:fs';
import { join } from 'node:path';
import { ACTORS, runActor } from '../lib/apify_client.js';
import { extractTikTokId } from '../lib/ids.js';
import { logDrift, resetDriftCounter, clearDriftLog, getDriftCount } from '../lib/drift_log.js';
import { withRetry } from '../lib/retry.js';
import { extractFrames } from '../lib/frames.js';
import { translateTextToEn, translateBatchToEn, isLikelyEnglish } from '../lib/translate.js';
import { PATHS } from '../lib/paths.js';
import type { Pass1Video } from './stage_2_pass1_scrape.js';
import type { BriefInput } from '../../server/schemas/brief.js';
/**
 * Loose shape of one item from the transcripts actor. The URL-ish fields and
 * `id` are the ones groupByCanonicalId probes to recover the video id.
 */
interface RawTranscript {
videoUrl?: string;
postUrl?: string;
url?: string;
webVideoUrl?: string;
videoWebUrl?: string;
submittedVideoUrl?: string;
input?: string;
id?: string;
// Language code reported by the actor; lower-cased before use.
language?: string;
// Candidate transcript fields, read in the order transcript, text, subtitles.
text?: string;
subtitles?: string;
transcript?: string; // emQXBCL3xePZYgJyn returns WEBVTT in this field
success?: boolean;
}
/**
 * Loose shape of one item from the comments actor. Several alternative field
 * names are declared because the consumer falls back between them.
 */
interface RawComment {
videoUrl?: string;
postUrl?: string;
url?: string;
webVideoUrl?: string;
videoWebUrl?: string; // BDec00yAmCm1QbMEI uses this
submittedVideoUrl?: string; // and this
input?: string; // and this (echoes the input URL)
text?: string;
// Author handle: `uniqueId` is preferred, then `user.uniqueId`.
uniqueId?: string;
user?: { uniqueId?: string };
// Like count: `diggCount` is preferred, then `likeCount`.
diggCount?: number;
likeCount?: number;
// Reply count: `replyCommentTotal` is preferred, then `replyCount`.
replyCount?: number;
replyCommentTotal?: number;
// Treated as epoch seconds when `createTimeISO` is absent.
createTime?: number;
createTimeISO?: string;
}
// Transcript as stored in transcript.json / bundle.json. `language_detected` is the
// actor's language code or 'unknown'; `text_en` equals `text_original` when the text
// already looks English or when translation failed.
interface BundleTranscript { language_detected: string; text_original: string; text_en: string; source: 'apify-tiktok-subtitles' }
/** One comment as stored in comments.json / bundle.json. */
interface BundleComment {
// 1-based position after sorting by likes, highest first.
rank: number;
// 'unknown' when the raw comment carried no handle.
author_handle: string;
text_original: string;
// Falls back to the original text when no translation is available.
text_en: string;
likes: number;
replies_count: number;
// ISO timestamp, or an empty string when the raw comment had no creation time.
posted_at: string;
}
/** Per-video bundle written to bundle.json as the last step of enrichment. */
export interface VideoBundle {
id: string;
// The canonical URL from the pass-1 record.
url: string;
handle: string;
// The full pass-1 record, embedded for self-containment.
metadata: Pass1Video;
// null when no non-empty transcript was available.
transcript: BundleTranscript | null;
// Empty when fewer than the minimum number of comments was found.
comments: BundleComment[];
// `path` is relative to the video's enriched folder, e.g. "frames/<name>".
frames: Array<{ index: number; path: string }>;
// Local path of the downloaded cover, or null when the download failed or was too small.
cover_local: string | null;
// `missing` lists the asset kinds that failed validation; `all_ok` is true when it is empty.
_validation: { all_ok: boolean; checked_at: string; missing: string[] };
}
/** One entry of dropped_videos.json: a selected video that could not be bundled. */
export interface DroppedVideo {
id: string;
// Human-readable explanation, e.g. which required assets were missing.
reason: string;
// Absent when the drop happened before a pass-1 record could be found.
handle?: string;
}
// Upper bound on comments kept per video; also sent to the comments actor as maxComments.
const TARGET_COMMENTS = 30;
// Fewer comments than this marks the video's comments as missing.
const MIN_COMMENTS = 5;
// Number of videos processed concurrently by the Stage 4 worker pool.
const MAX_CONCURRENCY = 4;
// Group raw actor items by canonical TikTok id, logging drift on the way.
// Exported so unit tests can verify the V1-bug fix (URL drift → null asset) without
// running a real Apify call.
//
// Drifty actors return the video URL under non-deterministic field names:
// - TIKTOK_HASHTAG/PROFILE: `webVideoUrl`
// - TIKTOK_TRANSCRIPTS: `url` + `id` (numeric)
// - TIKTOK_COMMENTS: `videoWebUrl`, `submittedVideoUrl`, `input`
// Every field is tried. A usable `id` wins; when `id` is absent, empty, or does not
// yield a TikTok id, the URL fields are used instead (same `id || url` preference
// as Stage 2's normaliseRaw).
//
// Items whose id cannot be extracted are logged as 'no-id-extracted'; items whose
// id is not in `selectedIds` are logged as 'id-not-in-selection'. Both are skipped.
//
// NOTE(review): if an actor reuses `id` for something other than the video (e.g. a
// comment id), this would be mis-keyed — confirm against the actor's output schema.
export function groupByCanonicalId<T extends {
  videoUrl?: string; postUrl?: string; url?: string; webVideoUrl?: string;
  videoWebUrl?: string; submittedVideoUrl?: string; input?: string; id?: string;
}>(
  reportId: string,
  actorLabel: string,
  items: T[],
  selectedIds: Set<string>,
): Map<string, T[]> {
  const out = new Map<string, T[]>();
  for (const item of items) {
    const sourceUrl =
      item.webVideoUrl || item.videoWebUrl || item.submittedVideoUrl ||
      item.videoUrl || item.postUrl || item.url || item.input || '';
    // Prefer the explicit id, but never let an empty/unusable one mask a good URL.
    const id = (item.id ? extractTikTokId(item.id) : null) || extractTikTokId(sourceUrl);
    if (!id) {
      logDrift(reportId, { actor: actorLabel, reason: 'no-id-extracted', source_url: sourceUrl || null, extracted_id: null, context: { item_keys: Object.keys(item) } });
      continue;
    }
    if (!selectedIds.has(id)) {
      logDrift(reportId, { actor: actorLabel, reason: 'id-not-in-selection', source_url: sourceUrl || null, extracted_id: id });
      continue;
    }
    const bucket = out.get(id);
    if (bucket) {
      bucket.push(item);
    } else {
      out.set(id, [item]);
    }
  }
  return out;
}
/**
 * Fetch `url` and write the response body to `dest`.
 *
 * Never rejects: a non-2xx status or any thrown error is reported through the
 * result as `{ ok: false, bytes: 0, error }`.
 *
 * @returns `{ ok: true, bytes }` with the number of bytes written on success.
 */
async function downloadFile(url: string, dest: string): Promise<{ ok: boolean; bytes: number; error?: string }> {
  const failure = (error: string): { ok: boolean; bytes: number; error?: string } => ({ ok: false, bytes: 0, error });
  try {
    const response = await fetch(url);
    if (response.ok) {
      const payload = Buffer.from(await response.arrayBuffer());
      writeFileSync(dest, payload);
      return { ok: true, bytes: payload.length };
    }
    return failure(`HTTP ${response.status}`);
  } catch (cause) {
    return failure((cause as Error).message);
  }
}
/**
 * Reduce a WEBVTT document to its spoken text: drops the header line, the cue
 * timing lines (both `hh:mm:ss.ttt` and the short `mm:ss.ttt` form) and bare
 * cue-number lines, then collapses runs of blank lines. Plain text passes
 * through unchanged apart from trimming.
 */
function stripWebVtt(raw: string): string {
  return raw
    .replace(/^WEBVTT(?:[ \t][^\r\n]*)?[\r\n]+/i, '')
    .replace(/(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}\s*-->\s*(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}.*$/gm, '')
    .replace(/^\d+\s*$/gm, '') // cue numbers
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/**
 * Build the per-video folder for one selected video and write its bundle.json.
 *
 * Writes, in order: metadata.json, cover.jpg, transcript.json, comments.json and
 * frames/, then validates and writes bundle.json last. Cover and frames are
 * best-effort; a missing transcript or fewer than MIN_COMMENTS comments marks
 * the video as dropped (the bundle is still written for forensics).
 *
 * @param opts.reportId      Report whose enriched folder is written to.
 * @param opts.meta          Pass-1 record of the video.
 * @param opts.transcripts   Raw transcript items grouped by canonical video id.
 * @param opts.commentsByVid Raw comment items grouped by canonical video id.
 * @returns `{ id, bundle }` when the required assets are present, otherwise `{ id, dropped }`.
 */
async function processOneVideo(opts: {
  reportId: string;
  meta: Pass1Video;
  transcripts: Map<string, RawTranscript[]>;
  commentsByVid: Map<string, RawComment[]>;
}): Promise<{ id: string; bundle?: VideoBundle; dropped?: DroppedVideo }> {
  const { reportId, meta, transcripts, commentsByVid } = opts;
  const id = meta.id;
  const dir = PATHS.enrichedVideo(reportId, id);
  mkdirSync(dir, { recursive: true });

  // 1. metadata.json — always written (re-derivable from pass1, but we want self-containment)
  writeFileSync(join(dir, 'metadata.json'), JSON.stringify(meta, null, 2));

  // 2. cover.jpg
  // NOTE(review): downloadFile reports failures through its result instead of throwing,
  // so whether withRetry re-attempts a failed download depends on withRetry's contract — confirm.
  let coverLocal: string | null = null;
  const coverUrl = meta.cover;
  if (coverUrl) {
    const dest = join(dir, 'cover.jpg');
    const dl = await withRetry(() => downloadFile(coverUrl, dest), { label: `cover ${id}` })
      .catch((e) => ({ ok: false, bytes: 0, error: (e as Error).message }));
    // Tiny responses are treated as failed downloads rather than real images.
    if (dl.ok && dl.bytes > 5_000) coverLocal = dest;
  }

  // 3. transcript.json — must have non-empty text or video is dropped
  const tRaw = (transcripts.get(id) ?? [])[0];
  const rawTranscript = tRaw?.transcript || tRaw?.text || tRaw?.subtitles || '';
  // Strip WEBVTT formatting (timestamps + cue blocks) — keep spoken text only.
  const transcriptText = stripWebVtt(rawTranscript);
  let transcriptBundle: BundleTranscript | null = null;
  if (transcriptText) {
    const lang = (tRaw?.language || '').toLowerCase();
    const looksEn = lang === 'en' || lang.startsWith('en-') || (lang === '' && isLikelyEnglish(transcriptText));
    let text_en = transcriptText;
    if (!looksEn) {
      try {
        text_en = await withRetry(() => translateTextToEn(transcriptText, lang || undefined), { label: `translate transcript ${id}` });
      } catch (err) {
        text_en = transcriptText; // best-effort: keep original if translation fails
        console.warn(`[stage 4] transcript translation failed for ${id}: ${(err as Error).message}`);
      }
    }
    transcriptBundle = { language_detected: lang || 'unknown', text_original: transcriptText, text_en, source: 'apify-tiktok-subtitles' };
    writeFileSync(join(dir, 'transcript.json'), JSON.stringify(transcriptBundle, null, 2));
  }

  // 4. comments.json — target 30, minimum 5; below 5 video is dropped from selection
  const likesOf = (c: RawComment): number => c.diggCount ?? c.likeCount ?? 0;
  const rawComments = (commentsByVid.get(id) ?? []).slice().sort((a, b) => likesOf(b) - likesOf(a));
  const top = rawComments.slice(0, TARGET_COMMENTS);
  const commentBundle: BundleComment[] = [];
  if (top.length >= MIN_COMMENTS) {
    // The transcript's language is used as the best available guess for the comments.
    const langGuess = transcriptBundle?.language_detected || 'unknown';
    const looksEn = langGuess === 'en' || langGuess.startsWith('en-');
    const originals = top.map((c) => (c.text || '').trim());
    let translations: string[];
    // Explicit arrow so `every` does not pass its index/array arguments into isLikelyEnglish.
    if (looksEn || originals.every((text) => isLikelyEnglish(text))) {
      translations = originals;
    } else {
      // 'unknown' is a placeholder, not a language code — send no hint in that case,
      // matching how the transcript path passes `lang || undefined`.
      const langHint = langGuess === 'unknown' ? undefined : langGuess;
      try {
        translations = await withRetry(() => translateBatchToEn(originals, langHint), { label: `translate comments ${id}` });
      } catch (err) {
        console.warn(`[stage 4] comments translation failed for ${id}: ${(err as Error).message}`);
        translations = originals;
      }
    }
    top.forEach((c, i) => {
      const posted = c.createTimeISO ?? (typeof c.createTime === 'number' ? new Date(c.createTime * 1000).toISOString() : '');
      commentBundle.push({
        rank: i + 1,
        author_handle: c.uniqueId || c.user?.uniqueId || 'unknown',
        text_original: originals[i] ?? '',
        text_en: translations[i] ?? originals[i] ?? '',
        likes: likesOf(c),
        replies_count: c.replyCommentTotal ?? c.replyCount ?? 0,
        posted_at: posted,
      });
    });
    writeFileSync(join(dir, 'comments.json'), JSON.stringify(commentBundle, null, 2));
  }

  // 5. frames — best-effort. mp4 URL is ephemeral; if it expired, log and continue.
  let frames: Array<{ index: number; path: string }> = [];
  const mp4Url = meta.download_url;
  if (mp4Url) {
    const mp4Path = join(dir, 'mp4.bin');
    const dl = await withRetry(() => downloadFile(mp4Url, mp4Path), { label: `mp4 ${id}` })
      .catch((e) => ({ ok: false, bytes: 0, error: (e as Error).message }));
    if (dl.ok && dl.bytes > 50_000) {
      const result = extractFrames({ mp4Path, outDir: join(dir, 'frames'), durationSec: meta.duration_sec });
      if (result.ok) {
        frames = result.frames.map((name, i) => ({ index: i + 1, path: `frames/${name}` }));
      } else {
        logDrift(reportId, { actor: 'ffmpeg', reason: 'metadata-missing-fields', source_url: mp4Url, extracted_id: id, context: { error: result.error } });
      }
    } else {
      logDrift(reportId, { actor: 'mp4-download', reason: 'metadata-missing-fields', source_url: mp4Url, extracted_id: id, context: { error: dl.error ?? 'download failed', bytes: dl.bytes } });
    }
    // Always remove the temporary mp4: a download that succeeded but was too small
    // has also written mp4.bin and must not be left behind in the bundle folder.
    try { rmSync(mp4Path, { force: true }); } catch { /* non-fatal */ }
  }

  // 6. Validate the per-video bundle and write it last (Stage 6 reads only this file).
  const missing: string[] = [];
  if (!transcriptBundle) missing.push('transcript');
  if (commentBundle.length < MIN_COMMENTS) missing.push('comments');
  if (frames.length < 3) missing.push('frames');
  if (!coverLocal) missing.push('cover');
  // If transcript or comments are below threshold, mark as dropped — bundle still written for forensics.
  const dropped = missing.includes('transcript') || missing.includes('comments')
    ? { id, handle: meta.handle, reason: `Missing required asset(s): ${missing.join(', ')}` }
    : undefined;
  const bundle: VideoBundle = {
    id,
    url: meta.url_canonical,
    handle: meta.handle,
    metadata: meta,
    transcript: transcriptBundle,
    comments: commentBundle,
    frames,
    cover_local: coverLocal,
    _validation: { all_ok: missing.length === 0, checked_at: new Date().toISOString(), missing },
  };
  writeFileSync(join(dir, 'bundle.json'), JSON.stringify(bundle, null, 2));
  return dropped ? { id, dropped } : { id, bundle };
}
/**
 * Map `fn` over `items` with at most `concurrency` calls in flight, returning
 * results in input order.
 *
 * `concurrency` is clamped to at least 1 (and floored), so 0, a negative
 * number or NaN degrade to sequential processing instead of spawning no
 * workers and silently returning an empty result.
 *
 * Rejects with the first error thrown by `fn`.
 *
 * @param items       Inputs to process.
 * @param concurrency Maximum number of simultaneous `fn` calls.
 * @param fn          Async mapper applied to each item.
 * @returns Results positioned at the same index as their input.
 */
async function inFlight<T, R>(items: T[], concurrency: number, fn: (x: T) => Promise<R>): Promise<R[]> {
  const results = new Array<R>(items.length);
  const workerCount = Math.min(items.length, Math.max(1, Math.floor(concurrency) || 1));
  let next = 0;
  // Each worker repeatedly claims the next unprocessed index until none are left.
  async function worker(): Promise<void> {
    while (next < items.length) {
      const idx = next++;
      results[idx] = await fn(items[idx]!);
    }
  }
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
/** Inputs to runStage4Pass2Enrich. */
export interface Stage4Args {
reportId: string;
// NOTE(review): `brief` is accepted but not read by the visible Stage 4 code.
brief: BriefInput;
/** override for tests / partial reruns. When set, these ids are enriched instead of the full selection. */
onlyIds?: string[];
}
/** Summary returned by runStage4Pass2Enrich. */
export interface Stage4Result {
ok: true;
// Paths of the enriched folder, the dropped-videos log and the drift log.
outputs: Record<string, string>;
// Number of ids the run was asked to enrich.
total_attempted: number;
// Videos for which a non-dropped bundle was produced.
total_bundled: number;
// Videos dropped, including ids with no pass-1 record.
total_dropped: number;
// Value of the drift counter at the end of the run.
drift_events: number;
}
/** Minimal shape needed to recover a canonical video id from a raw actor item. */
type RawIdBearing = {
  videoUrl?: string; postUrl?: string; url?: string; webVideoUrl?: string;
  videoWebUrl?: string; submittedVideoUrl?: string; input?: string; id?: string;
};

/** Canonical video id of a raw actor item (same field order as groupByCanonicalId), without drift logging. */
function canonicalIdOfRawItem(item: RawIdBearing): string | null {
  const sourceUrl =
    item.webVideoUrl || item.videoWebUrl || item.submittedVideoUrl ||
    item.videoUrl || item.postUrl || item.url || item.input || '';
  return extractTikTokId(item.id ?? sourceUrl) || null;
}

/**
 * Load a bulk actor response from its on-disk cache, calling the actor only for
 * wanted ids the cache does not cover, and merging the new items back into it.
 *
 * Coverage = ids recorded in the `<cache>.requested.json` sidecar (ids that were
 * asked for, even if the actor returned nothing) plus ids that have at least one
 * cached item. The cache file itself stays a plain JSON array.
 *
 * @returns All cached + fetched items, and the ids known to have been requested.
 */
async function loadOrExtendActorCache<T extends RawIdBearing>(opts: {
  cachePath: string;
  what: string;
  wanted: Array<{ id: string; url: string }>;
  fetchItems: (urls: string[]) => Promise<T[]>;
}): Promise<{ items: T[]; requestedIds: Set<string> }> {
  const { cachePath, what, wanted, fetchItems } = opts;
  const requestedPath = cachePath.replace(/\.json$/, '.requested.json');
  const hadCache = existsSync(cachePath);
  const cached: T[] = hadCache ? JSON.parse(readFileSync(cachePath, 'utf-8')) : [];

  // The sidecar is only meaningful alongside its cache file; if the cache was
  // deleted to force a refetch, ignore a stale sidecar.
  const requestedIds = new Set<string>();
  if (hadCache && existsSync(requestedPath)) {
    const recorded: unknown = JSON.parse(readFileSync(requestedPath, 'utf-8'));
    if (Array.isArray(recorded)) {
      for (const entry of recorded) if (typeof entry === 'string') requestedIds.add(entry);
    }
  }
  const covered = new Set(requestedIds);
  for (const item of cached) {
    const itemId = canonicalIdOfRawItem(item);
    if (itemId) covered.add(itemId);
  }

  const missing = wanted.filter((w) => !covered.has(w.id));
  if (missing.length === 0) {
    if (hadCache) console.log(`[stage 4] using cached ${what} response (no Apify call)`);
    return { items: cached, requestedIds };
  }

  console.log(`[stage 4] bulk ${what} call for ${missing.length} videos`);
  const fetched = await fetchItems(missing.map((w) => w.url));
  const merged = cached.concat(fetched);
  for (const w of missing) requestedIds.add(w.id);
  writeFileSync(cachePath, JSON.stringify(merged));
  writeFileSync(requestedPath, JSON.stringify([...requestedIds]));
  return { items: merged, requestedIds };
}

/**
 * Stage 4 entry point: deep-enrich the selected videos (or `onlyIds`) into
 * per-video bundles.
 *
 * Transcripts and comments are fetched with one bulk actor call each and cached
 * on disk. The cache is coverage-aware: a rerun for ids that are not yet cached
 * (e.g. Stage 5 backfill candidates passed via `onlyIds`) fetches just those ids
 * instead of reusing a response that cannot contain them — which previously left
 * every backfill candidate without transcript or comments.
 *
 * NOTE(review): the drift log is cleared and dropped_videos.json is overwritten on
 * every call, including partial `onlyIds` runs.
 *
 * @throws Error when selected_video_ids.json or pass1_videos.json is missing.
 * @returns Output paths plus attempted / bundled / dropped / drift counts.
 */
export async function runStage4Pass2Enrich(args: Stage4Args): Promise<Stage4Result> {
  const { reportId, onlyIds } = args;
  resetDriftCounter();
  clearDriftLog(reportId);

  const selectedPath = PATHS.selectedIds(reportId);
  const pass1Path = PATHS.pass1Videos(reportId);
  if (!existsSync(selectedPath)) throw new Error(`selected_video_ids.json missing. Run select first.`);
  if (!existsSync(pass1Path)) throw new Error(`pass1_videos.json missing. Run scrape1 first.`);
  const allSelected: string[] = JSON.parse(readFileSync(selectedPath, 'utf-8'));
  const ids = onlyIds ?? allSelected;
  const idSet = new Set(ids);
  const pass1: Pass1Video[] = JSON.parse(readFileSync(pass1Path, 'utf-8'));
  const metaById = new Map(pass1.map((v) => [v.id, v]));

  // Bulk Apify calls — one per actor for the ids not yet cached. Cheaper than per-video.
  // We CACHE the raw responses to disk so reruns can skip Apify entirely.
  const wanted: Array<{ id: string; url: string }> = [];
  for (const id of ids) {
    const url = metaById.get(id)?.url_canonical;
    if (url) wanted.push({ id, url });
  }
  mkdirSync(PATHS.pass2(reportId), { recursive: true });
  const tCachePath = `${PATHS.pass2(reportId)}/_cache_transcripts.json`;
  const cCachePath = `${PATHS.pass2(reportId)}/_cache_comments.json`;

  const tCache = await loadOrExtendActorCache<RawTranscript>({
    cachePath: tCachePath,
    what: 'transcripts',
    wanted,
    fetchItems: async (urls) =>
      (await runActor<RawTranscript>(ACTORS.TIKTOK_TRANSCRIPTS, { videos: urls }, 'TIKTOK_TRANSCRIPTS')).items,
  });
  // Ids requested in earlier runs are legitimate cache content, not drift.
  const transcripts = groupByCanonicalId(
    reportId, 'TIKTOK_TRANSCRIPTS', tCache.items, new Set([...idSet, ...tCache.requestedIds]),
  );

  const cCache = await loadOrExtendActorCache<RawComment>({
    cachePath: cCachePath,
    what: 'comments',
    wanted,
    fetchItems: async (urls) =>
      (await runActor<RawComment>(ACTORS.TIKTOK_COMMENTS, { postURLs: urls, maxComments: TARGET_COMMENTS }, 'TIKTOK_COMMENTS')).items,
  });
  const commentsByVid = groupByCanonicalId(
    reportId, 'TIKTOK_COMMENTS', cCache.items, new Set([...idSet, ...cCache.requestedIds]),
  );

  mkdirSync(PATHS.enriched(reportId), { recursive: true });
  const items = ids.map((id) => metaById.get(id)).filter((m): m is Pass1Video => !!m);
  // Ids without a pass-1 record cannot be enriched at all; record them as dropped.
  const droppedExtras: DroppedVideo[] = [];
  for (const id of ids) {
    if (!metaById.has(id)) {
      droppedExtras.push({ id, reason: 'pass1 record missing' });
    }
  }
  const results = await inFlight(items, MAX_CONCURRENCY, (meta) =>
    processOneVideo({ reportId, meta, transcripts, commentsByVid }),
  );
  const bundled = results.filter((r) => !!r.bundle).length;
  const dropped = [...droppedExtras, ...results.filter((r) => r.dropped).map((r) => r.dropped!)];

  // Persist dropped log
  const droppedPath = `${PATHS.pass2(reportId)}/dropped_videos.json`;
  writeFileSync(droppedPath, JSON.stringify(dropped, null, 2));
  console.log(`[stage 4] done. attempted=${ids.length} bundled=${bundled} dropped=${dropped.length} drift=${getDriftCount()}`);
  return {
    ok: true,
    outputs: {
      enriched_dir: PATHS.enriched(reportId),
      dropped: droppedPath,
      drift_log: PATHS.driftLog(reportId),
    },
    total_attempted: ids.length,
    total_bundled: bundled,
    total_dropped: dropped.length,
    drift_events: getDriftCount(),
  };
}

View file

@ -0,0 +1,114 @@
// Stage 5: validate every selected video has every required asset.
// Hard gate: returns ok=false / passed=false when coverage_pct < 100; Stage 6
// then refuses to start and throws HardGateError.
// Auto-backfill on { dropFailing: true }: read pass1 ranking, mark failing
// ids dropped, walk next-best ids, re-run Stage 4 just for those, re-validate.
import { writeFileSync, readFileSync, existsSync } from 'node:fs';
import { buildManifest, writeManifest, HardGateError, type Manifest } from '../lib/manifest.js';
import { PATHS } from '../lib/paths.js';
import { runStage4Pass2Enrich } from './stage_4_pass2_enrich.js';
import type { Pass1Video } from './stage_2_pass1_scrape.js';
import type { BriefInput } from '../../server/schemas/brief.js';
/** Inputs to runStage5Manifest. */
export interface Stage5Args {
reportId: string;
// Forwarded to Stage 4 when backfill candidates are enriched.
brief: BriefInput;
// When true, failing videos are dropped and replaced from the pass-1 ranking. Default false.
dropFailing?: boolean;
/** stop after N backfill rounds (safety). default 3. */
maxBackfillRounds?: number;
}
/** Outcome of runStage5Manifest. */
export interface Stage5Result {
// Same value as `passed`.
ok: boolean;
// The last manifest built (also written to disk).
manifest: Manifest;
// True only when manifest coverage is exactly 100%.
passed: boolean;
// Number of backfill rounds that were started.
backfill_rounds: number;
// Every replacement id that was pulled in across all rounds.
backfilled_ids: string[];
// Paths of the manifest and the backfill log.
outputs: Record<string, string>;
}
/**
 * Stage 5 entry point: build the asset manifest for the selected videos and,
 * when `dropFailing` is set, replace failing videos with the next-best pass-1
 * candidates for up to `maxBackfillRounds` rounds.
 *
 * Each round drops the failing ids, picks the same number of replacements from
 * the pass-1 ranking, runs Stage 4 for just those ids, rewrites
 * selected_video_ids.json and rebuilds the manifest. Ids dropped in ANY earlier
 * round are excluded from the candidate walk, so a video that already failed is
 * never picked again as its own replacement.
 *
 * The manifest and backfill_log.json are always written. This function does not
 * throw on incomplete coverage; it reports it through `ok` / `passed`.
 *
 * @throws Error when selected_video_ids.json or pass1_videos.json is missing.
 */
export async function runStage5Manifest(args: Stage5Args): Promise<Stage5Result> {
  const { reportId, brief, dropFailing = false, maxBackfillRounds = 3 } = args;
  const selectedPath = PATHS.selectedIds(reportId);
  const pass1Path = PATHS.pass1Videos(reportId);
  if (!existsSync(selectedPath)) throw new Error(`selected_video_ids.json missing. Run select first.`);
  if (!existsSync(pass1Path)) throw new Error(`pass1_videos.json missing. Run scrape1 first.`);
  let selected: string[] = JSON.parse(readFileSync(selectedPath, 'utf-8'));
  const pass1: Pass1Video[] = JSON.parse(readFileSync(pass1Path, 'utf-8'));
  // pass1_videos.json order is the ranking used to pick replacements.
  const ranked = pass1.map((v) => v.id);

  let manifest = buildManifest(reportId, selected);
  let rounds = 0;
  const backfilled: string[] = [];
  // Every video dropped so far, across all rounds, keyed by id.
  const droppedSoFar = new Map<string, Manifest['videos'][number]>();

  while (manifest.summary.coverage_pct < 100 && dropFailing && rounds < maxBackfillRounds) {
    rounds++;
    const failing = manifest.videos.filter((v) => !v.all_ok);
    const failingIds = new Set(failing.map((v) => v.id));
    for (const v of failing) droppedSoFar.set(v.id, v);
    const survivors = selected.filter((id) => !failingIds.has(id));
    const droppedNow = [...failingIds];
    console.log(`[stage 5] backfill round ${rounds}: dropping ${droppedNow.length}, finding replacements`);

    // Walk next-best ids from pass1, excluding already-selected and already-dropped
    // (in this round or any earlier one).
    const seen = new Set([...survivors, ...droppedSoFar.keys()]);
    const candidates: string[] = [];
    for (const id of ranked) {
      if (seen.has(id)) continue;
      candidates.push(id);
      if (candidates.length >= droppedNow.length) break;
    }
    if (candidates.length === 0) {
      console.warn(`[stage 5] no backfill candidates left after round ${rounds}; manifest will not be 100%.`);
      selected = survivors;
      writeFileSync(selectedPath, JSON.stringify(selected, null, 2));
      break;
    }
    backfilled.push(...candidates);

    // Run Stage 4 just for the new candidates (don't re-enrich survivors).
    await runStage4Pass2Enrich({ reportId, brief, onlyIds: candidates });
    selected = [...survivors, ...candidates];
    writeFileSync(selectedPath, JSON.stringify(selected, null, 2));
    manifest = buildManifest(reportId, selected);
  }

  // Always write the latest manifest, regardless of pass/fail.
  const manifestPath = writeManifest(reportId, manifest);

  // Backfill log. `dropped_ids` keeps its original meaning (videos still failing in the
  // final manifest); `dropped_across_rounds` adds everything removed during backfill.
  const backfillLogPath = `${PATHS.pass2(reportId)}/backfill_log.json`;
  writeFileSync(backfillLogPath, JSON.stringify({
    rounds,
    dropped_ids: manifest.videos.filter((v) => !v.all_ok).map((v) => ({ id: v.id, missing: v.missing })),
    dropped_across_rounds: [...droppedSoFar.values()].map((v) => ({ id: v.id, missing: v.missing })),
    backfilled_ids: backfilled,
    final_selected_count: selected.length,
    final_coverage_pct: manifest.summary.coverage_pct,
  }, null, 2));

  const passed = manifest.summary.coverage_pct === 100;
  if (!passed) {
    if (dropFailing) {
      console.error(`[stage 5] manifest still incomplete after ${rounds} backfill rounds. Coverage ${manifest.summary.coverage_pct}%.`);
    } else {
      console.error(`[stage 5] manifest incomplete. Coverage ${manifest.summary.coverage_pct}%. Run with --drop-failing to auto-backfill.`);
    }
  } else {
    console.log(`[stage 5] manifest PASS — coverage 100%, ${selected.length} videos ready for analysis.`);
  }
  return {
    ok: passed,
    manifest,
    passed,
    backfill_rounds: rounds,
    backfilled_ids: backfilled,
    outputs: {
      manifest: manifestPath,
      backfill_log: backfillLogPath,
    },
  };
}
export { HardGateError };

View file

@ -0,0 +1,163 @@
// Stage 6 — per-video Claude analysis. Reads bundle.json for each manifest-passing
// video, builds a single prompt with caption + transcript + comments + frame refs,
// emits a structured JSON record per the V3 schema. Cached: skips files that exist.
import { writeFileSync, existsSync, readFileSync } from 'node:fs';
import { join } from 'node:path';
import { z } from 'zod';
import { callClaudeJSON } from '../lib/claude.js';
import { loadRubric } from '../lib/rubrics.js';
import { loadManifest, loadBundle, HardGateError } from '../lib/manifest.js';
import { PATHS, ensureDir } from '../lib/paths.js';
import type { VideoBundle } from './stage_4_pass2_enrich.js';
/**
 * Zod schema for one per-video analysis record. The model's JSON reply is
 * validated against it before being accepted, and cached analysis files are
 * re-validated against it on reuse.
 */
export const ANALYSIS_SCHEMA = z.object({
// Video id; the caller overwrites this with the bundle's id after parsing.
id: z.string(),
what_happens: z.string(),
hook: z.object({
first_3_seconds: z.string(),
pattern: z.string(),
why_it_stops_scroll: z.string(),
}),
visual_aesthetic: z.object({
lighting: z.string(),
colour_palette: z.array(z.string()),
setting: z.string(),
talent: z.string(),
// Optional in the reply; default to an empty list when omitted.
products_visible: z.array(z.string()).default([]),
on_screen_text_examples: z.array(z.string()).default([]),
}),
format: z.string(),
audio: z.object({
music_present: z.boolean(),
music_mood: z.string(),
voiceover: z.boolean(),
asmr_elements: z.boolean(),
}),
narrative: z.object({
thesis: z.string(),
tension: z.string(),
resolution: z.string(),
}),
audience_signals: z.object({
comment_themes: z.array(z.string()),
comment_sentiment_split: z.object({ positive: z.number(), neutral: z.number(), critical: z.number() }),
verbatim_quotes: z.array(z.object({ text: z.string(), likes: z.number(), theme: z.string() })),
}),
paid_or_organic: z.object({
label: z.enum(['paid', 'organic', 'unclear']),
reasoning: z.string(),
// Optional in the reply; defaults to an empty list when omitted.
evidence_signals_used: z.array(z.string()).default([]),
}),
});
/** Parsed analysis record, inferred from ANALYSIS_SCHEMA. */
export type Analysis = z.infer<typeof ANALYSIS_SCHEMA>;
/**
 * Assemble the single analysis prompt for one video: the rubric, then the
 * video's metadata, English transcript, top comments and a note on how many
 * frames were extracted.
 *
 * @param bundle Per-video bundle loaded from bundle.json.
 * @param rubric Rubric text placed at the top of the prompt.
 * @returns The complete prompt string.
 */
function buildPrompt(bundle: VideoBundle, rubric: string): string {
  const video = bundle.metadata;
  const count = (n: number): string => n.toLocaleString();

  const metaLines: string[] = [];
  metaLines.push(`id: ${video.id}`);
  metaLines.push(`handle: @${video.handle}`);
  metaLines.push(
    `plays: ${count(video.plays)}, likes: ${count(video.likes)}, saves: ${count(video.saves)}, comments: ${count(video.comments_count)}, shares: ${count(video.shares)}`,
  );
  metaLines.push(`STL%: ${video.stl_pct}, duration: ${video.duration_sec}s, posted: ${video.posted_at}`);
  metaLines.push(`caption: ${video.caption}`);
  metaLines.push(`hashtags: ${video.hashtags.join(' ')}`);
  const meta = metaLines.join('\n');

  let transcript = '(no transcript)';
  if (bundle.transcript) transcript = bundle.transcript.text_en;

  // Comments are numbered by their position in the bundle.
  const commentLines: string[] = [];
  bundle.comments.forEach((entry, position) => {
    commentLines.push(`${position + 1}. (${entry.likes} likes) @${entry.author_handle}: ${entry.text_en}`);
  });
  const comments = commentLines.join('\n');

  const frameNote = `${bundle.frames.length} frames extracted (1fps cap, 720px wide). Reference them when commenting on visual_aesthetic.`;
  return `${rubric}
---
# Video metadata
${meta}
# English transcript
${transcript}
# Top comments (English)
${comments}
# Frame note
${frameNote}
Return the analysis JSON now.`;
}
/**
 * Runs the per-video analysis for one bundle: builds the prompt, asks the model
 * for JSON and validates the reply against ANALYSIS_SCHEMA.
 *
 * @param reportId - Accepted for signature symmetry; not read by this function.
 * @param bundle - Video bundle to analyse.
 * @param rubric - Rubric text forwarded to the prompt builder.
 * @returns The validated analysis, with `id` forced to the bundle's id.
 * @throws Error when the model's JSON does not satisfy the schema.
 */
async function analyseOne(reportId: string, bundle: VideoBundle, rubric: string): Promise<Analysis> {
  const reply = await callClaudeJSON<unknown>(buildPrompt(bundle, rubric), {
    label: `analyse:${bundle.id}`,
    maxTokens: 4096,
  });
  const validation = ANALYSIS_SCHEMA.safeParse(reply);
  if (validation.success) {
    // The model may echo a different id, so ours always wins.
    return { ...validation.data, id: bundle.id };
  }
  throw new Error(`Analysis JSON failed schema for ${bundle.id}: ${validation.error.message}`);
}
// Upper bound on the number of worker loops started by runStage6Analyse.
const MAX_CONCURRENCY = 4;
/** Summary returned by runStage6Analyse. */
export interface Stage6Result {
ok: true;
// Named output locations (runStage6Analyse sets `analyses_dir`).
outputs: Record<string, string>;
// Number of manifest videos flagged all_ok.
total: number;
// Videos whose existing analysis file passed schema validation and was reused.
cached: number;
// Videos analysed during this run.
fresh: number;
}
/**
 * Stage 6 — per-video analysis.
 *
 * Refuses to start unless the manifest reports 100% coverage, then analyses every
 * `all_ok` video with at most MAX_CONCURRENCY workers. A video whose analysis file
 * already exists and passes ANALYSIS_SCHEMA is counted as cached and skipped.
 *
 * On the first failure no further videos are started; analyses already in flight are
 * allowed to finish (and be written) before the original error is rethrown, so the
 * stage never returns or throws while background work is still running.
 *
 * @param reportId - Report whose manifest and bundles are processed.
 * @returns Counts of total, cached and freshly analysed videos plus the output dir.
 * @throws Error when the manifest or a bundle is missing, or an analysis fails.
 * @throws HardGateError when manifest coverage is below 100%.
 */
export async function runStage6Analyse(reportId: string): Promise<Stage6Result> {
  const manifest = loadManifest(reportId);
  if (!manifest) throw new Error(`manifest.json missing. Run validate first.`);
  if (manifest.summary.coverage_pct < 100) {
    throw new HardGateError(
      `manifest coverage ${manifest.summary.coverage_pct}% — refusing to start analysis`,
      manifest,
    );
  }
  const passing = manifest.videos.filter((v) => v.all_ok).map((v) => v.id);
  const rubric = loadRubric('per_video_analysis');
  ensureDir(PATHS.analysisDir(reportId));
  let cached = 0;
  let fresh = 0;

  async function workOne(id: string): Promise<void> {
    const out = join(PATHS.analysisDir(reportId), `${id}.json`);
    if (existsSync(out)) {
      try {
        ANALYSIS_SCHEMA.parse(JSON.parse(readFileSync(out, 'utf-8')));
        cached++;
        return;
      } catch {
        /* unreadable or stale file: fall through and re-analyse */
      }
    }
    const bundle = loadBundle(reportId, id);
    if (!bundle) throw new Error(`bundle.json missing for ${id}`);
    const analysis = await analyseOne(reportId, bundle, rubric);
    writeFileSync(out, JSON.stringify(analysis, null, 2));
    fresh++;
  }

  // Concurrency-limited sweep over a shared cursor.
  let cursor = 0;
  let aborted = false;
  async function worker(): Promise<void> {
    while (!aborted && cursor < passing.length) {
      const id = passing[cursor++]!;
      try {
        await workOne(id);
      } catch (err) {
        // Stop every worker from picking up new videos once one has failed.
        aborted = true;
        console.error(`[stage 6] ${id} failed: ${err instanceof Error ? err.message : String(err)}`);
        throw err;
      }
    }
  }

  const workerCount = Math.min(MAX_CONCURRENCY, passing.length);
  const settled = await Promise.allSettled(Array.from({ length: workerCount }, () => worker()));
  const failure = settled.find((r): r is PromiseRejectedResult => r.status === 'rejected');
  if (failure) throw failure.reason;

  return {
    ok: true,
    outputs: { analyses_dir: PATHS.analysisDir(reportId) },
    total: passing.length,
    cached,
    fresh,
  };
}

View file

@ -0,0 +1,143 @@
// Stage 7 — turn per-video analyses into 200–500 atomic insights.
// Iterates analyses in batches of 20; each batch sees the running list so it can
// extend existing atomic_ids rather than duplicate.
import { writeFileSync, readFileSync, readdirSync, existsSync } from 'node:fs';
import { z } from 'zod';
import { callClaudeJSON } from '../lib/claude.js';
import { loadRubric } from '../lib/rubrics.js';
import { PATHS } from '../lib/paths.js';
import { ANALYSIS_SCHEMA, type Analysis } from './stage_6_analyse.js';
// The four insight types accepted from the model.
const TYPE = z.enum(['hook', 'visual', 'audio', 'narrative']);
/**
 * Shape of one batch reply. `additions` are new insights; `extensions` attach more
 * video ids to an insight that already exists. Both default to [] when omitted.
 */
const BATCH_SCHEMA = z.object({
additions: z.array(z.object({
// Parsed but not used by runStage7AtomicInsights, which assigns its own ATM-#### id.
atomic_id: z.string(),
type: TYPE,
// At least 8 characters.
observation: z.string().min(8),
supporting_video_ids: z.array(z.string()).min(1),
})).default([]),
extensions: z.array(z.object({
// Id of the insight being extended; looked up by runStage7AtomicInsights.
atomic_id: z.string(),
added_video_ids: z.array(z.string()).min(1),
})).default([]),
});
/** One atomic insight as written to the atomic-insights output file. */
export interface AtomicInsight {
atomic_id: string; // "ATM-0001"
type: 'hook' | 'visual' | 'audio' | 'narrative';
observation: string;
// De-duplicated ids of the videos that support the observation.
supporting_video_ids: string[];
frequency: number; // = supporting_video_ids.length, single source of truth
}
// Number of analyses summarised into each model call.
const BATCH_SIZE = 20;
/**
 * Formats a sequence number as an atomic-insight id, zero-padded to at least
 * four digits (1 -> "ATM-0001"). Longer numbers are kept in full.
 */
function nextId(n: number): string {
  const digits = `${n}`;
  const zeros = '0'.repeat(Math.max(0, 4 - digits.length));
  return ['ATM-', zeros, digits].join('');
}
/**
 * Loads every schema-valid analysis file from the report's analysis directory.
 *
 * File names are sorted before reading so the order of the returned array — and
 * therefore the composition of each stage 7 batch — does not depend on the order
 * in which the filesystem happens to list the directory. Files that cannot be parsed
 * or fail ANALYSIS_SCHEMA are skipped, with a warning so the loss is visible.
 *
 * @param reportId - Report whose analysis directory is read.
 * @returns Valid analyses in file-name order; [] when the directory does not exist.
 */
function loadAllAnalyses(reportId: string): Analysis[] {
  const dir = PATHS.analysisDir(reportId);
  if (!existsSync(dir)) return [];
  const files = readdirSync(dir).filter((f) => f.endsWith('.json')).sort();
  const out: Analysis[] = [];
  for (const f of files) {
    try {
      const data: unknown = JSON.parse(readFileSync(`${dir}/${f}`, 'utf-8'));
      const parsed = ANALYSIS_SCHEMA.safeParse(data);
      if (parsed.success) {
        out.push(parsed.data);
      } else {
        console.warn(`[stage 7] skipping ${f}: does not match the analysis schema`);
      }
    } catch (err) {
      console.warn(`[stage 7] skipping ${f}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }
  return out;
}
/**
 * Condenses one analysis into an eight-line text summary for the atomic-insights prompt.
 *
 * The heading carries only the video id: an Analysis has no creator handle, so the
 * previous `(@${''})` interpolation always rendered a dangling "(@)".
 * Long free-text fields are truncated and list fields are capped to keep the prompt small.
 *
 * @param a - Validated per-video analysis.
 * @returns Newline-separated summary block.
 */
function summariseAnalysis(a: Analysis): string {
  return [
    `### Video ${a.id}`,
    `format: ${a.format}; what_happens: ${a.what_happens}`,
    // Newlines are flattened so the quoted hook stays on one line.
    `hook: pattern=${a.hook.pattern}; first_3s="${a.hook.first_3_seconds.replace(/\n/g, ' ').slice(0, 200)}"`,
    `visual: lighting=${a.visual_aesthetic.lighting}, setting=${a.visual_aesthetic.setting}, talent=${a.visual_aesthetic.talent}, on_screen_text=${a.visual_aesthetic.on_screen_text_examples.slice(0, 3).join(' | ')}`,
    `audio: music=${a.audio.music_present} mood=${a.audio.music_mood} voiceover=${a.audio.voiceover} asmr=${a.audio.asmr_elements}`,
    `narrative: thesis="${a.narrative.thesis.slice(0, 200)}"; tension="${a.narrative.tension.slice(0, 150)}"`,
    `audience: themes=${a.audience_signals.comment_themes.slice(0, 5).join(' | ')}`,
    `paid_or_organic: ${a.paid_or_organic.label}`,
  ].join('\n');
}
/**
 * Renders the most recent `cap` insights, one per line, for inclusion in the next
 * batch prompt. Returns a fixed placeholder when there is nothing to show.
 *
 * @param insights - Insights collected so far, oldest first.
 * @param cap - How many trailing entries to include (default 80).
 */
function summariseRunningList(insights: AtomicInsight[], cap = 80): string {
  const recent = insights.slice(-cap);
  if (recent.length === 0) {
    return '(no atomic insights yet)';
  }
  const rendered: string[] = [];
  for (const entry of recent) {
    rendered.push(`${entry.atomic_id} [${entry.type}, freq ${entry.frequency}] ${entry.observation}`);
  }
  return rendered.join('\n');
}
/** Summary returned by runStage7AtomicInsights. */
export interface Stage7Result {
ok: true;
// Named output locations (runStage7AtomicInsights sets `atomic_insights`).
outputs: Record<string, string>;
// Number of insights written.
total_insights: number;
// Insight count per type.
by_type: Record<'hook' | 'visual' | 'audio' | 'narrative', number>;
}
/**
 * Stage 7 — distils per-video analyses into atomic insights.
 *
 * Analyses are processed in batches of BATCH_SIZE. Each batch prompt contains the
 * tail of the running insight list so the model can extend existing insights instead
 * of duplicating them. New insights get ids assigned here (the model's own `atomic_id`
 * on additions is ignored); extensions naming an unknown id are dropped. A batch whose
 * reply fails BATCH_SCHEMA is skipped with a warning.
 *
 * The prompt header now states how many running-list entries are actually shown
 * (at most the cap) rather than the total collected so far.
 *
 * @param reportId - Report whose analyses are read and whose insights file is written.
 * @returns Total insight count, per-type counts and the output path.
 * @throws Error when no valid analyses are found.
 */
export async function runStage7AtomicInsights(reportId: string): Promise<Stage7Result> {
  const analyses = loadAllAnalyses(reportId);
  if (analyses.length === 0) throw new Error('No analyses found. Run stage 6 first.');
  const rubric = loadRubric('atomic_insights');
  const insights: AtomicInsight[] = [];
  const byId = new Map<string, AtomicInsight>();
  // Maximum number of running-list entries shown to the model per batch.
  const runningListCap = 80;
  let counter = 0;

  for (let start = 0; start < analyses.length; start += BATCH_SIZE) {
    const batch = analyses.slice(start, start + BATCH_SIZE);
    const summaries = batch.map(summariseAnalysis).join('\n\n');
    const running = summariseRunningList(insights, runningListCap);
    const shown = Math.min(insights.length, runningListCap);
    const prompt = [
      `${rubric}`,
      '---',
      `# Existing atomic insights (running list, truncated to recent ${shown} entries)`,
      running,
      `# This batch (${batch.length} videos)`,
      summaries,
      'Return the JSON now.',
    ].join('\n');

    const raw = await callClaudeJSON<unknown>(prompt, { label: `atomic_insights:batch_${start}`, maxTokens: 8192 });
    const parsed = BATCH_SCHEMA.safeParse(raw);
    if (!parsed.success) {
      console.warn(`[stage 7] batch starting at ${start} returned bad shape: ${parsed.error.message.slice(0, 200)}; skipping.`);
      continue;
    }

    for (const a of parsed.data.additions) {
      counter += 1;
      const id = nextId(counter);
      const supporting = [...new Set(a.supporting_video_ids)];
      const item: AtomicInsight = {
        atomic_id: id,
        type: a.type,
        observation: a.observation.trim(),
        supporting_video_ids: supporting,
        frequency: supporting.length,
      };
      insights.push(item);
      byId.set(id, item);
    }

    for (const ext of parsed.data.extensions) {
      const target = byId.get(ext.atomic_id);
      if (!target) continue;
      // Set union keeps supporting ids unique; frequency always mirrors the list length.
      const merged = new Set([...target.supporting_video_ids, ...ext.added_video_ids]);
      target.supporting_video_ids = [...merged];
      target.frequency = target.supporting_video_ids.length;
    }

    console.log(`[stage 7] batch ${start / BATCH_SIZE + 1}: +${parsed.data.additions.length} additions, ${parsed.data.extensions.length} extensions (total ${insights.length})`);
  }

  const byType: Stage7Result['by_type'] = { hook: 0, visual: 0, audio: 0, narrative: 0 };
  for (const i of insights) byType[i.type]++;
  const out = PATHS.atomicInsights(reportId);
  writeFileSync(out, JSON.stringify(insights, null, 2));
  return { ok: true, outputs: { atomic_insights: out }, total_insights: insights.length, by_type: byType };
}

View file

@ -0,0 +1,293 @@
// Stage 8 — trend synthesis. Three sub-steps:
// 8a: brief-driven categories (§4.5d).
// 8b: cluster atomic insights into trends (§4.5e naming).
// 8b.5: business_question_relevance per trend (§4.5c calibration anchors). Drop <0.35.
import { writeFileSync, readFileSync, readdirSync, existsSync } from 'node:fs';
import { z } from 'zod';
import { callClaudeJSON } from '../lib/claude.js';
import { loadRubric } from '../lib/rubrics.js';
import { PATHS } from '../lib/paths.js';
import type { AtomicInsight } from './stage_7_atomic_insights.js';
import type { BriefInput } from '../../server/schemas/brief.js';
import { ANALYSIS_SCHEMA, type Analysis } from './stage_6_analyse.js';
// ─── 8a categories ───
/**
 * Reply shape for step 8a: at least three chosen categories, each with a rationale,
 * plus an optional list of rejected candidates (defaults to []).
 */
const CATEGORIES_SCHEMA = z.object({
categories: z.array(z.object({ name: z.string().min(2), rationale: z.string() })).min(3),
rejected: z.array(z.object({ name: z.string(), reason: z.string() })).default([]),
});
/** Static type inferred from CATEGORIES_SCHEMA. */
type Categories = z.infer<typeof CATEGORIES_SCHEMA>;
// ─── 8b trends (raw from Claude) ───
// V3 brief mandates ≥5 supporting videos per trend in production. Override via env
// for small corpora (smoke tests, brand-new accounts) where 5 isn't always reachable.
/**
 * Parses the MIN_SUPPORTING_VIDEOS_PER_TREND override.
 * Falls back to 5 when the value is unset, empty, non-numeric or negative. A bare
 * parseInt would yield NaN for such input, and a NaN minimum on the schema would
 * presumably never reject anything, silently disabling the floor.
 */
function parseMinSupporting(raw: string | undefined): number {
  const parsed = Number.parseInt(raw ?? '', 10);
  return Number.isInteger(parsed) && parsed >= 0 ? parsed : 5;
}
const MIN_SUPPORTING = parseMinSupporting(process.env.MIN_SUPPORTING_VIDEOS_PER_TREND);
/**
 * Reply shape for step 8b: a non-empty list of trends as returned by the model,
 * before relevance scoring and KPI computation.
 */
const RAW_TRENDS_SCHEMA = z.object({
trends: z.array(z.object({
slug: z.string().min(2),
name: z.string().min(3),
category: z.string().min(1),
narrative: z.string().min(20),
lens_tags: z.array(z.enum(['hooks', 'visual', 'audio', 'sentiment', 'narrative'])).min(1),
// Optional; defaults to [] when omitted.
top_atomic_ids: z.array(z.string()).default([]),
// Must list at least MIN_SUPPORTING ids; because the whole reply is parsed as one
// object, a single trend below the floor fails validation of the entire reply.
supporting_video_ids: z.array(z.string()).min(MIN_SUPPORTING),
})).min(1),
});
// ─── 8b.5 relevance ───
/** Reply shape for step 8b.5: a relevance score in [0, 1], a tier and a justification. */
const RELEVANCE_SCHEMA = z.object({
score: z.number().min(0).max(1),
tier: z.enum(['core', 'peripheral', 'dropped']),
justification: z.string(),
});
/** Static type inferred from RELEVANCE_SCHEMA. */
type Relevance = z.infer<typeof RELEVANCE_SCHEMA>;
// ─── Trend final shape (what we write to trends.json) ───
/** Aggregate metrics computed per trend by computeKpis. */
export interface TrendKpis {
// Sum of plays over supporting videos found in the pass-1 records.
plays_total: number;
// Number of supporting video ids on the trend.
videos: number;
// Distinct creator handles among supporting videos found in the pass-1 records.
unique_creators: number;
// Mean stl_pct over those videos, rounded to two decimals (0 when none were found).
avg_stl_pct: number;
// Count of supporting videos per paid/organic label.
paid_organic_split: { paid: number; organic: number; unclear: number };
}
/** One synthesised trend: the model's fields plus id, relevance and KPIs. */
export interface Trend {
trend_id: string; // "TR-001"
slug: string;
name: string;
category: string;
narrative: string;
lens_tags: string[];
top_atomic_ids: string[];
supporting_video_ids: string[];
business_question_relevance: Relevance;
kpis: TrendKpis;
}
/**
 * Reads every `.json` file in the report's analysis directory and indexes the
 * schema-valid ones by analysis id. Unreadable or invalid files are ignored.
 *
 * @param reportId - Report whose analysis directory is read.
 * @returns Map of video id to analysis; empty when the directory does not exist.
 */
function loadAnalyses(reportId: string): Map<string, Analysis> {
  const dir = PATHS.analysisDir(reportId);
  const analysesById = new Map<string, Analysis>();
  if (existsSync(dir)) {
    const jsonFiles = readdirSync(dir).filter((name) => name.endsWith('.json'));
    jsonFiles.forEach((name) => {
      try {
        const contents = readFileSync(`${dir}/${name}`, 'utf-8');
        const result = ANALYSIS_SCHEMA.safeParse(JSON.parse(contents));
        if (result.success) {
          analysesById.set(result.data.id, result.data);
        }
      } catch {
        /* skip */
      }
    });
  }
  return analysesById;
}
/**
 * Renders the client brief as a compact text block for prompts.
 *
 * The optional brand positioning is appended after the handle with an " — "
 * separator; previously it was concatenated directly ("(@dove)Real beauty").
 * The context line is omitted entirely when `context_vision` is empty.
 *
 * @param brief - Brief supplied at intake.
 * @returns Newline-separated brief summary.
 */
function briefBlock(brief: BriefInput): string {
  const positioning = brief.brand.positioning ? ` — ${brief.brand.positioning}` : '';
  return [
    `Brand: ${brief.brand.name} (@${brief.brand.handle})${positioning}`,
    `Category: ${brief.category}`,
    `Audience: ${brief.audience.primary} (${brief.audience.age_range} ${brief.audience.gender})`,
    `Geo/Language: ${brief.geo} / ${brief.language}`,
    `Business question: ${brief.business_question}`,
    `KPIs: ${brief.kpis.join(' | ')}`,
    brief.context_vision ? `Context: ${brief.context_vision}` : '',
  ].filter(Boolean).join('\n');
}
/**
 * Step 8a: asks the model for brief-driven categories and validates the reply.
 *
 * @param brief - Client brief rendered into the prompt.
 * @param atomicSummary - Pre-rendered summary of atomic insights.
 * @returns The parsed categories reply.
 * @throws When the reply does not satisfy CATEGORIES_SCHEMA.
 */
async function generateCategories(brief: BriefInput, atomicSummary: string): Promise<Categories> {
  const sections = [
    `${loadRubric('category_quality')}`,
    '---',
    '# Brief',
    `${briefBlock(brief)}`,
    '# Atomic insights summary',
    `${atomicSummary}`,
    'Generate the categories JSON now.',
  ];
  const reply = await callClaudeJSON<unknown>(sections.join('\n'), {
    label: 'stage_8a_categories',
    maxTokens: 4096,
  });
  return CATEGORIES_SCHEMA.parse(reply);
}
/**
 * Renders atomic insights for the stage 8 prompts: one heading per insight type
 * (hook, visual, audio, narrative — always in that order), followed by that type's
 * insights sorted by descending frequency and cut to a per-type limit.
 *
 * @param insights - All atomic insights.
 * @param cap - Overall budget; each type gets max(20, floor(cap / 4)) entries.
 * @returns Newline-separated summary.
 */
function summariseAtomicsForPrompt(insights: AtomicInsight[], cap = 200): string {
  const lensOrder = ['hook', 'visual', 'audio', 'narrative'];
  const grouped = new Map<string, AtomicInsight[]>();
  lensOrder.forEach((lens) => grouped.set(lens, []));
  insights.forEach((insight) => {
    const bucket = grouped.get(insight.type);
    if (bucket) {
      bucket.push(insight);
    } else {
      grouped.set(insight.type, [insight]);
    }
  });

  const limit = Math.max(20, Math.floor(cap / 4));
  const sections = lensOrder.map((lens) => {
    const ranked = [...(grouped.get(lens) ?? [])]
      .sort((left, right) => right.frequency - left.frequency)
      .slice(0, limit);
    const rows = ranked.map((entry) => {
      // Only the first six supporting ids are listed; an ellipsis marks the rest.
      const shownIds = entry.supporting_video_ids.slice(0, 6).join(', ');
      const overflow = entry.supporting_video_ids.length > 6 ? ', …' : '';
      return `- ${entry.atomic_id} [freq ${entry.frequency}] ${entry.observation} (videos: ${shownIds}${overflow})`;
    });
    return [`## ${lens.toUpperCase()}`, ...rows].join('\n');
  });
  return sections.join('\n');
}
/**
 * Step 8b: asks the model to cluster atomic insights into named trends within the
 * chosen categories, then validates the reply.
 *
 * @param brief - Client brief rendered into the prompt.
 * @param categoryNames - Categories selected in step 8a, listed as a numbered list.
 * @param atomicSummary - Pre-rendered summary of atomic insights.
 * @returns The parsed raw-trends reply.
 * @throws When the reply does not satisfy RAW_TRENDS_SCHEMA.
 */
async function clusterTrends(brief: BriefInput, categoryNames: string[], atomicSummary: string): Promise<z.infer<typeof RAW_TRENDS_SCHEMA>> {
  const synthesisRubric = loadRubric('trend_synthesis');
  const namingRubric = loadRubric('editorial_naming');
  const numberedCategories = categoryNames.map((name, position) => `${position + 1}. ${name}`);
  const sections = [
    `${synthesisRubric}`,
    '---',
    `${namingRubric}`,
    '---',
    '# Brief',
    `${briefBlock(brief)}`,
    '# Categories chosen for this report',
    numberedCategories.join('\n'),
    '# Atomic insights (top by frequency, by type)',
    `${atomicSummary}`,
    'Cluster atomic insights into trends now. Aim for 50 trends; floor 35; never split weak trends. Return ONLY the JSON.',
  ];
  const reply = await callClaudeJSON<unknown>(sections.join('\n'), {
    label: 'stage_8b_trends',
    maxTokens: 16384,
  });
  return RAW_TRENDS_SCHEMA.parse(reply);
}
/**
 * Step 8b.5: asks the model how relevant one trend is to the brief's business
 * question, using the calibration rubric, and validates the reply.
 *
 * @param brief - Client brief; only `business_question` is used here.
 * @param trendName - Trend name (also truncated into the call label).
 * @param narrative - Trend narrative.
 * @returns The parsed relevance verdict.
 * @throws When the reply does not satisfy RELEVANCE_SCHEMA.
 */
async function scoreRelevance(brief: BriefInput, trendName: string, narrative: string): Promise<Relevance> {
  const sections = [
    `${loadRubric('relevance_calibration')}`,
    '---',
    '# Brief business question',
    `${brief.business_question}`,
    '# Trend to score',
    `NAME: ${trendName}`,
    `NARRATIVE: ${narrative}`,
    'Score it now. Return ONLY the JSON.',
  ];
  const callLabel = `stage_8b5_relevance:${trendName.slice(0, 40)}`;
  const reply = await callClaudeJSON<unknown>(sections.join('\n'), { label: callLabel, maxTokens: 1024 });
  return RELEVANCE_SCHEMA.parse(reply);
}
/**
 * Computes the KPI block for one trend.
 *
 * Supporting ids are de-duplicated first, so a video the model listed twice is not
 * counted twice in plays or averages. The paid/organic label comes from the analysis
 * and is counted for every unique id — including ids with no pass-1 record — so the
 * split always sums to `videos`. Plays, STL% and creators only use ids that have a
 * pass-1 record.
 *
 * @param trend - Object carrying the trend's supporting video ids.
 * @param analyses - Per-video analyses keyed by video id.
 * @param pass1 - Pass-1 engagement records keyed by video id.
 * @returns Aggregated KPIs; avg_stl_pct is rounded to two decimals and is 0 when no
 *          supporting id has a pass-1 record.
 */
function computeKpis(trend: { supporting_video_ids: string[] }, analyses: Map<string, Analysis>, pass1: Map<string, { plays: number; likes: number; saves: number; comments_count: number; shares: number; stl_pct: number; handle: string }>): TrendKpis {
  const uniqueIds = [...new Set(trend.supporting_video_ids)];
  let plays = 0, sumStl = 0, withMeta = 0, paid = 0, organic = 0, unclear = 0;
  const creators = new Set<string>();

  for (const id of uniqueIds) {
    // A missing analysis is treated the same as an 'unclear' label.
    const label = analyses.get(id)?.paid_or_organic.label;
    if (label === 'paid') paid++;
    else if (label === 'organic') organic++;
    else unclear++;

    const meta = pass1.get(id);
    if (!meta) continue;
    plays += meta.plays;
    sumStl += meta.stl_pct;
    withMeta += 1;
    creators.add(meta.handle);
  }

  return {
    plays_total: plays,
    videos: uniqueIds.length,
    unique_creators: creators.size,
    avg_stl_pct: withMeta === 0 ? 0 : Math.round((sumStl / withMeta) * 100) / 100,
    paid_organic_split: { paid, organic, unclear },
  };
}
/** Summary returned by runStage8Trends. */
export interface Stage8Result {
ok: true;
// Named output locations (`categories` and `trends`).
outputs: Record<string, string>;
// Category names chosen in step 8a.
categories: string[];
// Number of trends written (dropped trends excluded).
total_trends: number;
core_trends: number;
peripheral_trends: number;
// Trends discarded because their relevance score was below 0.35.
dropped_trends: number;
}
/**
 * Stage 8 — trend synthesis.
 *
 *  8a   generate brief-driven categories and write them to disk;
 *  8b   cluster atomic insights into raw trends;
 *  8b.5 score each trend's relevance to the business question, drop those scoring
 *       below 0.35, and attach KPIs to the rest.
 *
 * A trend that is kept is stored with a tier derived from the same rule used for the
 * returned counts (core when the model says core or the score is >= 0.6, otherwise
 * peripheral), so trends.json can never contain a kept trend labelled 'dropped' or
 * disagree with `core_trends` / `peripheral_trends`. If relevance scoring throws, the
 * trend is kept as peripheral with score 0.5.
 *
 * @param reportId - Report whose artefacts are read and written.
 * @param brief - Client brief driving categories and relevance.
 * @returns Category names, trend counts by tier and output paths.
 * @throws Error when atomic_insights.json is missing, not an array, or empty; also
 *         propagates schema failures from steps 8a and 8b.
 */
export async function runStage8Trends(reportId: string, brief: BriefInput): Promise<Stage8Result> {
  // Load atomic insights
  const atomicsPath = PATHS.atomicInsights(reportId);
  if (!existsSync(atomicsPath)) throw new Error('atomic_insights.json missing. Run stage 7 first.');
  const parsedInsights: unknown = JSON.parse(readFileSync(atomicsPath, 'utf-8'));
  if (!Array.isArray(parsedInsights)) throw new Error('atomic_insights.json is not an array. Re-run stage 7.');
  const insights = parsedInsights as AtomicInsight[];
  if (insights.length === 0) throw new Error('No atomic insights to synthesise from.');

  // Load per-video analyses + pass1 for KPI computation
  const analyses = loadAnalyses(reportId);
  const pass1Path = PATHS.pass1Videos(reportId);
  type LiteMeta = { plays: number; likes: number; saves: number; comments_count: number; shares: number; stl_pct: number; handle: string };
  const pass1Lite = new Map<string, LiteMeta>();
  if (existsSync(pass1Path)) {
    const arr = JSON.parse(readFileSync(pass1Path, 'utf-8')) as Array<{ id: string } & LiteMeta>;
    for (const v of arr) pass1Lite.set(v.id, v);
  }

  // 8a — categories
  const atomicSummary = summariseAtomicsForPrompt(insights);
  const categories = await generateCategories(brief, atomicSummary);
  writeFileSync(PATHS.categories(reportId), JSON.stringify(categories, null, 2));
  const categoryNames = categories.categories.map((c) => c.name);
  console.log(`[stage 8a] ${categoryNames.length} categories: ${categoryNames.join(', ')}`);

  // 8b — cluster
  const rawTrends = await clusterTrends(brief, categoryNames, atomicSummary);

  // 8b.5 — relevance scoring + filter
  const finalTrends: Trend[] = [];
  let dropped = 0, core = 0, peripheral = 0;
  for (const r of rawTrends.trends) {
    let relevance: Relevance;
    try {
      relevance = await scoreRelevance(brief, r.name, r.narrative);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`[stage 8b.5] relevance scoring failed for "${r.name}": ${reason}; defaulting to peripheral 0.5`);
      relevance = { score: 0.5, tier: 'peripheral', justification: 'scoring failed; defaulted' };
    }
    if (relevance.score < 0.35) { dropped++; continue; }

    // Persist the tier actually used for counting, not whatever label the model attached.
    const tier: Relevance['tier'] = relevance.tier === 'core' || relevance.score >= 0.6 ? 'core' : 'peripheral';
    if (tier === 'core') core++; else peripheral++;

    const trendId = `TR-${String(finalTrends.length + 1).padStart(3, '0')}`;
    const kpis = computeKpis({ supporting_video_ids: r.supporting_video_ids }, analyses, pass1Lite);
    finalTrends.push({
      trend_id: trendId,
      slug: r.slug,
      name: r.name,
      category: r.category,
      narrative: r.narrative,
      lens_tags: r.lens_tags,
      top_atomic_ids: r.top_atomic_ids,
      supporting_video_ids: r.supporting_video_ids,
      business_question_relevance: { ...relevance, tier },
      kpis,
    });
  }

  writeFileSync(PATHS.trends(reportId), JSON.stringify(finalTrends, null, 2));
  console.log(`[stage 8] ${finalTrends.length} trends (core=${core}, peripheral=${peripheral}, dropped=${dropped})`);
  if (finalTrends.length < 35) {
    console.warn(`[stage 8] WARNING: only ${finalTrends.length} trends (floor 35). Brief may be too narrow or atomic insights too thin.`);
  }
  return {
    ok: true,
    outputs: {
      categories: PATHS.categories(reportId),
      trends: PATHS.trends(reportId),
    },
    categories: categoryNames,
    total_trends: finalTrends.length,
    core_trends: core,
    peripheral_trends: peripheral,
    dropped_trends: dropped,
  };
}

View file

@ -0,0 +1,122 @@
// Stage 9 — QA gates.
//
// 9a (automated): aggregate paid/organic flags per creator.
// 9b (automated): re-validate manifest coverage = 100%.
// 9c, 9d (human): CM + Strategist checklists are surfaced in the operator UI;
// the orchestrator just records signoffs and refuses to advance without both.
import { writeFileSync, readFileSync, readdirSync, existsSync, mkdirSync } from 'node:fs';
import { join } from 'node:path';
import { ANALYSIS_SCHEMA, type Analysis } from './stage_6_analyse.js';
import { loadManifest, HardGateError } from '../lib/manifest.js';
import { PATHS } from '../lib/paths.js';
/** Per-creator roll-up of paid/organic labels, written to paid_organic_review.json. */
interface PaidOrganicCreator {
// Creator handle, or 'unknown' when no handle is known for a video.
handle: string;
videos_in_report: number;
paid_videos: number;
organic_videos: number;
unclear_videos: number;
// 'mixed' when the creator has both paid and organic videos.
label: 'paid' | 'organic' | 'mixed' | 'unclear';
// De-duplicated evidence signals across the creator's analyses.
evidence_signals: string[];
// True when the creator has at least one paid video.
needs_human_confirm: boolean;
}
/**
 * Rolls per-video paid/organic labels up to one row per creator.
 *
 * Videos with no (or an empty) handle are grouped under 'unknown'. Each row collects
 * label counts and the distinct evidence signals, then gets an overall label and a
 * flag requesting human confirmation whenever any video was labelled paid.
 *
 * @param analyses - Per-video analyses.
 * @param handleByVideoId - Creator handle for each video id.
 * @returns Rows sorted by number of videos, largest first.
 */
function aggregatePaidOrganic(analyses: Analysis[], handleByVideoId: Map<string, string>): PaidOrganicCreator[] {
  const rows = new Map<string, PaidOrganicCreator>();

  const rowFor = (handle: string): PaidOrganicCreator => {
    const existing = rows.get(handle);
    if (existing) return existing;
    const created: PaidOrganicCreator = {
      handle,
      videos_in_report: 0,
      paid_videos: 0,
      organic_videos: 0,
      unclear_videos: 0,
      label: 'unclear',
      evidence_signals: [],
      needs_human_confirm: false,
    };
    rows.set(handle, created);
    return created;
  };

  analyses.forEach((analysis) => {
    const row = rowFor(handleByVideoId.get(analysis.id) || 'unknown');
    row.videos_in_report += 1;
    switch (analysis.paid_or_organic.label) {
      case 'paid':
        row.paid_videos += 1;
        break;
      case 'organic':
        row.organic_videos += 1;
        break;
      default:
        row.unclear_videos += 1;
    }
    analysis.paid_or_organic.evidence_signals_used
      .filter((signal) => !row.evidence_signals.includes(signal))
      .forEach((signal) => {
        // Re-check so duplicates within one analysis are still added only once.
        if (!row.evidence_signals.includes(signal)) row.evidence_signals.push(signal);
      });
  });

  const result = Array.from(rows.values());
  result.forEach((row) => {
    const hasPaid = row.paid_videos > 0;
    const hasOrganic = row.organic_videos > 0;
    if (hasPaid) {
      row.label = hasOrganic ? 'mixed' : 'paid';
    } else {
      row.label = hasOrganic ? 'organic' : 'unclear';
    }
    row.needs_human_confirm = hasPaid || row.label === 'mixed';
  });
  return result.sort((left, right) => right.videos_in_report - left.videos_in_report);
}
/** Summary returned by runStage9Qa. */
export interface Stage9Result {
ok: boolean;
// Named output locations (`paid_organic_review` and `coverage_check`).
outputs: Record<string, string>;
// Creators whose overall label is 'paid'.
paid_creators: number;
// Creators whose overall label is 'mixed'.
mixed_creators: number;
// Coverage percentage taken from the manifest summary.
coverage_pct: number;
}
/**
 * Stage 9 — automated QA gates.
 *
 * First re-checks that manifest coverage is 100% (9b), then aggregates the
 * paid/organic labels per creator (9a). Writes paid_organic_review.json and
 * coverage_check.json into the report's QA directory.
 *
 * @param reportId - Report being checked.
 * @returns Paid/mixed creator counts, coverage and output paths.
 * @throws Error when the manifest is missing.
 * @throws HardGateError when coverage is below 100%.
 */
export async function runStage9Qa(reportId: string): Promise<Stage9Result> {
  // 9b — coverage gate runs first: it is cheap and fails fast.
  const manifest = loadManifest(reportId);
  if (!manifest) {
    throw new Error('manifest.json missing. Run validate first.');
  }
  const coveragePct = manifest.summary.coverage_pct;
  if (coveragePct < 100) {
    throw new HardGateError(`coverage ${coveragePct}% — refusing QA`, manifest);
  }

  // 9a — collect every schema-valid analysis.
  const analysisDir = PATHS.analysisDir(reportId);
  const analysisFiles = existsSync(analysisDir)
    ? readdirSync(analysisDir).filter((name) => name.endsWith('.json'))
    : [];
  const analyses: Analysis[] = [];
  for (const name of analysisFiles) {
    try {
      const result = ANALYSIS_SCHEMA.safeParse(JSON.parse(readFileSync(join(analysisDir, name), 'utf-8')));
      if (result.success) analyses.push(result.data);
    } catch {
      /* skip */
    }
  }

  // Analyses do not carry the creator handle, so it is looked up from pass-1 records.
  const handleById = new Map<string, string>();
  const pass1Path = PATHS.pass1Videos(reportId);
  if (existsSync(pass1Path)) {
    const records = JSON.parse(readFileSync(pass1Path, 'utf-8')) as Array<{ id: string; handle: string }>;
    records.forEach((record) => {
      handleById.set(record.id, record.handle);
    });
  }

  const creators = aggregatePaidOrganic(analyses, handleById);
  const qaDir = PATHS.qaDir(reportId);
  mkdirSync(qaDir, { recursive: true });
  const paidPath = join(qaDir, 'paid_organic_review.json');
  writeFileSync(paidPath, JSON.stringify(creators, null, 2));

  // 9b — record the coverage check next to the review file.
  const coveragePath = join(qaDir, 'coverage_check.json');
  const coverageRecord = {
    passed: true,
    coverage_pct: coveragePct,
    selected_count: manifest.selected_count,
    all_ok_count: manifest.summary.all_ok,
    checked_at: new Date().toISOString(),
  };
  writeFileSync(coveragePath, JSON.stringify(coverageRecord, null, 2));

  let paid = 0;
  let mixed = 0;
  for (const creator of creators) {
    if (creator.label === 'paid') paid += 1;
    else if (creator.label === 'mixed') mixed += 1;
  }
  console.log(`[stage 9] paid creators: ${paid}, mixed: ${mixed}, coverage: ${coveragePct}%`);

  return {
    ok: true,
    outputs: { paid_organic_review: paidPath, coverage_check: coveragePath },
    paid_creators: paid,
    mixed_creators: mixed,
    coverage_pct: coveragePct,
  };
}

View file

@ -0,0 +1,28 @@
import { describe, it, expect } from 'vitest';
import { roleAtLeast, type TeamRole } from '../db/memberships.js';
// Role-hierarchy checks for roleAtLeast(actual, required).
describe('roleAtLeast', () => {
  // Asserts, in order, that `actor` does / does not satisfy each required role.
  const expectOutcomes = (actor: TeamRole, cases: Array<[TeamRole, boolean]>): void => {
    cases.forEach(([required, allowed]) => {
      expect(roleAtLeast(actor, required)).toBe(allowed);
    });
  };

  it('owner satisfies every requirement', () => {
    expectOutcomes('owner', [
      ['owner', true],
      ['admin', true],
      ['editor', true],
      ['viewer', true],
    ]);
  });

  it('viewer cannot edit', () => {
    expectOutcomes('viewer', [
      ['editor', false],
      ['admin', false],
      ['owner', false],
    ]);
  });

  it('editor can view and edit but not admin', () => {
    expectOutcomes('editor', [
      ['viewer', true],
      ['editor', true],
      ['admin', false],
    ]);
  });

  it('admin can manage team but cannot do owner-only ops', () => {
    expectOutcomes('admin', [
      ['editor', true],
      ['admin', true],
      ['owner', false],
    ]);
  });
});

View file

@ -0,0 +1,44 @@
import { describe, it, expect, beforeAll } from 'vitest';
// The secret must be in the environment before the session module is first imported,
// which is why every test loads the module dynamically.
beforeAll(() => {
  process.env.SESSION_SECRET = 'test-secret-deterministic-for-tests-only';
});

describe('session sign/verify roundtrip', () => {
  type SessionModule = typeof import('../auth/session.js');
  type SessionRequest = Parameters<SessionModule['getSession']>[0];

  const loadSessionModule = (): Promise<SessionModule> => import('../auth/session.js');

  // Minimal stand-in for an IncomingMessage: only `headers` is provided.
  const requestWith = (headers: Record<string, string>): SessionRequest =>
    ({ headers }) as unknown as SessionRequest;

  const issueForAlice = (mod: SessionModule): string =>
    mod.issueSession({
      user_id: '11111111-1111-1111-1111-111111111111',
      email: 'alice@example.com',
      active_team_id: '22222222-2222-2222-2222-222222222222',
      auth_method: 'azure-sso',
    });

  it('issues a token that getSession can recover', async () => {
    const mod = await loadSessionModule();
    const token = issueForAlice(mod);
    const session = mod.getSession(requestWith({ cookie: `${mod.COOKIE_NAME}=${token}` }));
    expect(session).not.toBeNull();
    expect(session?.email).toBe('alice@example.com');
    expect(session?.active_team_id).toBe('22222222-2222-2222-2222-222222222222');
  });

  it('rejects a tampered token', async () => {
    const mod = await loadSessionModule();
    const token = issueForAlice(mod);
    // Swap the first character so the payload no longer matches its signature.
    const tampered = token.replace(/^./, (c) => (c === 'a' ? 'b' : 'a'));
    expect(mod.getSession(requestWith({ cookie: `${mod.COOKIE_NAME}=${tampered}` }))).toBeNull();
  });

  it('returns null when no cookie is present', async () => {
    const mod = await loadSessionModule();
    expect(mod.getSession(requestWith({}))).toBeNull();
  });
});

103
v2/server/auth/jwks.ts Normal file
View file

@ -0,0 +1,103 @@
// Lifted from V1 agents/social-listening/dashboard/server.ts:127-194.
// Azure AD ID-token validation: JWKS fetch with 24h cache + RSA-SHA256 signature check.
import { createPublicKey, createVerify } from 'node:crypto';
import { envStr } from '../lib/env.js';
// Azure tenant id and application (client) id, obtained through envStr at module load.
const TENANT_ID = envStr('AZURE_TENANT_ID');
const CLIENT_ID = envStr('AZURE_CLIENT_ID');
// SSO is considered configured only when both ids are truthy.
export const SSO_ENABLED = !!(TENANT_ID && CLIENT_ID);
// Module-level cache of the last fetched signing keys and when they were fetched (epoch ms).
let jwksCache: { keys: Record<string, string>[]; fetchedAt: number } | null = null;
// Cached keys are reused for 24 hours.
const JWKS_CACHE_TTL = 24 * 60 * 60 * 1000;
/**
 * Returns the tenant's signing keys, serving them from the module cache while it
 * is younger than JWKS_CACHE_TTL and fetching the discovery document otherwise.
 *
 * The response is validated before it is cached: a body without a `keys` array used
 * to be stored as-is and made every later key lookup throw for the whole TTL.
 * Non-object entries are discarded.
 *
 * @returns The list of JWK objects.
 * @throws Error when the HTTP request fails or the body has no `keys` array.
 */
async function getSigningKeys(): Promise<Record<string, string>[]> {
  if (jwksCache && Date.now() - jwksCache.fetchedAt < JWKS_CACHE_TTL) {
    return jwksCache.keys;
  }
  const url = `https://login.microsoftonline.com/${TENANT_ID}/discovery/v2.0/keys`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error(`JWKS fetch failed: ${resp.status}`);
  const data: unknown = await resp.json();
  const rawKeys = typeof data === 'object' && data !== null ? (data as { keys?: unknown }).keys : undefined;
  if (!Array.isArray(rawKeys)) throw new Error('JWKS fetch failed: response has no keys array');
  const keys = rawKeys.filter(
    (k): k is Record<string, string> => typeof k === 'object' && k !== null,
  );
  jwksCache = { keys, fetchedAt: Date.now() };
  return keys;
}
/**
 * Decodes a base64url string (as used in JWT segments) into bytes by mapping the
 * URL-safe characters back to the standard alphabet before decoding.
 */
function base64urlDecode(str: string): Buffer {
  const standardAlphabet = str.replace(/[-_]/g, (ch) => (ch === '-' ? '+' : '/'));
  return Buffer.from(standardAlphabet, 'base64');
}
/**
 * Claims read from a decoded Azure ID token payload. The index signature allows
 * any additional claims to be carried through untouched.
 */
export interface AzureClaims {
// Required by verifyAzureIdToken; a token without it is rejected.
oid: string;
sub: string;
email?: string;
preferred_username?: string;
name?: string;
tid: string;
// Compared against the configured client id.
aud: string;
// Compared against the tenant's v2.0 issuer URL.
iss: string;
// Compared against the current time in seconds.
exp: number;
nbf?: number;
[key: string]: unknown;
}
/** Outcome of verifyAzureIdToken: `claims` on success, `error` on failure. */
export interface VerifyResult {
valid: boolean;
claims?: AzureClaims;
error?: string;
}
/**
 * Validates an Azure AD ID token: structure, algorithm, audience, issuer, validity
 * window (with 300 s of clock skew) and RSA-SHA256 signature against the tenant's
 * published keys. An unknown `kid` triggers one cache-busting refetch of the keys.
 *
 * Hardening over the previous version:
 *  - a header or payload that decodes to a non-object (e.g. `null`) is rejected
 *    instead of throwing a TypeError outside the try block;
 *  - `exp` must be a number — a token without it is no longer treated as never expiring;
 *  - `alg` must be RS256, the only algorithm this function verifies;
 *  - a failure to fetch the signing keys is reported as an invalid result rather than
 *    rejecting the promise.
 *
 * @param idToken - Compact JWT string.
 * @returns `{ valid: true, claims }` on success, otherwise `{ valid: false, error }`.
 */
export async function verifyAzureIdToken(idToken: string): Promise<VerifyResult> {
  if (!SSO_ENABLED) return { valid: false, error: 'SSO not configured' };
  const parts = idToken.split('.');
  if (parts.length !== 3) return { valid: false, error: 'Malformed JWT' };
  const headerB64 = parts[0]!;
  const payloadB64 = parts[1]!;
  const signatureB64 = parts[2]!;

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value);

  let header: Record<string, string>, payload: AzureClaims;
  try {
    const rawHeader: unknown = JSON.parse(base64urlDecode(headerB64).toString());
    const rawPayload: unknown = JSON.parse(base64urlDecode(payloadB64).toString());
    if (!isRecord(rawHeader) || !isRecord(rawPayload)) {
      return { valid: false, error: 'Invalid JWT encoding' };
    }
    header = rawHeader as Record<string, string>;
    payload = rawPayload as AzureClaims;
  } catch {
    return { valid: false, error: 'Invalid JWT encoding' };
  }

  if (header.alg !== 'RS256') return { valid: false, error: 'Unsupported signing algorithm' };
  if (payload.aud !== CLIENT_ID) return { valid: false, error: 'Invalid audience' };
  if (payload.iss !== `https://login.microsoftonline.com/${TENANT_ID}/v2.0`) {
    return { valid: false, error: 'Invalid issuer' };
  }

  const now = Math.floor(Date.now() / 1000);
  if (typeof payload.exp !== 'number') return { valid: false, error: 'Token missing exp claim' };
  if (payload.exp < now - 300) {
    return { valid: false, error: 'Token expired' };
  }
  if (typeof payload.nbf === 'number' && payload.nbf > now + 300) {
    return { valid: false, error: 'Token not yet valid' };
  }

  let key: Record<string, string> | undefined;
  try {
    let keys = await getSigningKeys();
    key = keys.find((k) => k.kid === header.kid);
    if (!key) {
      // The key may have been rotated since the cache was filled: refetch once.
      jwksCache = null;
      keys = await getSigningKeys();
      key = keys.find((k) => k.kid === header.kid);
    }
  } catch (err) {
    return { valid: false, error: `Signing keys unavailable: ${err instanceof Error ? err.message : String(err)}` };
  }
  if (!key) return { valid: false, error: 'Signing key not found' };

  try {
    const publicKey = createPublicKey({
      key: { kty: key.kty, n: key.n, e: key.e },
      format: 'jwk',
    });
    const verifier = createVerify('RSA-SHA256');
    verifier.update(`${headerB64}.${payloadB64}`);
    if (!verifier.verify(publicKey, base64urlDecode(signatureB64))) {
      return { valid: false, error: 'Invalid signature' };
    }
  } catch (err) {
    return { valid: false, error: `Signature verification error: ${err instanceof Error ? err.message : String(err)}` };
  }

  if (!payload.oid) return { valid: false, error: 'Token missing oid claim' };
  return { valid: true, claims: payload };
}

View file

@ -0,0 +1,74 @@
// Emergency password login. OFF in prod by default. Lifted from V1 dashboard/server.ts:351-386
// but adapted to V2: still upserts a user row keyed on a deterministic synthetic oid so the
// same DASH_USER lands in the users table consistently.
import type { IncomingMessage, ServerResponse } from 'node:http';
import { envBool, envStr } from '../lib/env.js';
import { parseJSONBody, sendJSON, clientIp } from '../lib/http.js';
import { issueSession, setSessionCookie } from './session.js';
import { landSsoUser } from './upsert-user.js';
// Failed logins are counted per IP inside a 15-minute window...
const RATE_LIMIT_WINDOW_MS = 15 * 60 * 1000;
// ...and an IP is blocked once it reaches this many failures in the window.
const RATE_LIMIT_MAX = 5;
// In-memory failure counters keyed by client IP: count and window start (epoch ms).
const attempts = new Map<string, { count: number; firstAt: number }>();
/**
 * Tells whether `ip` has used up its failed-login budget for the current window.
 * A record whose window has elapsed is removed and treated as not limited.
 */
function isRateLimited(ip: string): boolean {
  const record = attempts.get(ip);
  if (record === undefined) {
    return false;
  }
  const windowElapsed = Date.now() - record.firstAt > RATE_LIMIT_WINDOW_MS;
  if (windowElapsed) {
    attempts.delete(ip);
  }
  return !windowElapsed && record.count >= RATE_LIMIT_MAX;
}
/**
 * Records one failed login for `ip`, starting a new window when none is active.
 *
 * Expired records were previously removed only when the same IP came back, so IPs
 * that failed once and never returned stayed in the map forever. Once the map has
 * grown past a threshold, every call now sweeps out records whose window has elapsed.
 */
function recordAttempt(ip: string): void {
  const now = Date.now();
  const sweepThreshold = 1000;
  if (attempts.size >= sweepThreshold) {
    for (const [knownIp, record] of attempts) {
      if (now - record.firstAt > RATE_LIMIT_WINDOW_MS) attempts.delete(knownIp);
    }
  }
  const r = attempts.get(ip);
  if (!r || now - r.firstAt > RATE_LIMIT_WINDOW_MS) {
    attempts.set(ip, { count: 1, firstAt: now });
  } else {
    r.count++;
  }
}
/**
 * Emergency username/password login endpoint.
 *
 * Responds 404 unless ALLOW_PASSWORD_FALLBACK is enabled, 429 when the client IP is
 * rate limited, 503 when DASH_USER / DASH_PASS are not configured, 400 on an
 * unreadable body and 401 on wrong credentials (which also counts as a failed
 * attempt). On success the user is landed through the same path as SSO users with a
 * deterministic synthetic oid, a session cookie is set and 200 is returned.
 *
 * Credentials are compared in constant time: both sides are hashed to fixed-length
 * digests and checked with `timingSafeEqual`, and both fields are always evaluated,
 * so response timing does not reveal how much of a guess matched. Non-string
 * `username` / `password` values are treated as empty.
 *
 * @param req - Incoming request carrying a JSON body `{ username, password }`.
 * @param res - Response to write to.
 */
export async function handlePasswordLogin(req: IncomingMessage, res: ServerResponse): Promise<void> {
  if (!envBool('ALLOW_PASSWORD_FALLBACK', false)) {
    sendJSON(res, 404, { ok: false, error: 'Password login not enabled' });
    return;
  }
  const ip = clientIp(req);
  if (isRateLimited(ip)) {
    sendJSON(res, 429, { ok: false, error: 'Too many attempts. Try again in 15 minutes.' });
    return;
  }
  const DASH_USER = envStr('DASH_USER');
  const DASH_PASS = envStr('DASH_PASS');
  if (!DASH_USER || !DASH_PASS) {
    sendJSON(res, 503, { ok: false, error: 'Password login misconfigured' });
    return;
  }

  let username = '', password = '';
  try {
    const body = await parseJSONBody<{ username?: unknown; password?: unknown }>(req);
    username = typeof body.username === 'string' ? body.username : '';
    password = typeof body.password === 'string' ? body.password : '';
  } catch { sendJSON(res, 400, { ok: false, error: 'Invalid body' }); return; }

  // Loaded here because this module has no static node:crypto import.
  const { createHash, timingSafeEqual } = await import('node:crypto');
  const digest = (value: string): Buffer => createHash('sha256').update(value).digest();
  const userOk = timingSafeEqual(digest(username), digest(DASH_USER));
  const passOk = timingSafeEqual(digest(password), digest(DASH_PASS));
  if (!(userOk && passOk)) {
    recordAttempt(ip);
    sendJSON(res, 401, { ok: false, error: 'Invalid username or password' });
    return;
  }
  attempts.delete(ip);

  // Land the password user the same way SSO users land — deterministic synthetic oid.
  const oid = `pw:${DASH_USER}`;
  const email = `${DASH_USER}@local.password-fallback`;
  const landed = await landSsoUser({ oid, email, display_name: DASH_USER });
  const token = issueSession({
    user_id: landed.user.id,
    email: landed.user.email,
    active_team_id: landed.active_team_id,
    auth_method: 'password',
  });
  setSessionCookie(res, token);
  sendJSON(res, 200, { ok: true, user: { id: landed.user.id, email: landed.user.email, name: landed.user.display_name } });
}

60
v2/server/auth/session.ts Normal file
View file

@ -0,0 +1,60 @@
// Lifted from V1 agents/social-listening/dashboard/server.ts:75-92.
// V2 session payload is richer: carries user_id + active_team_id (not just username).
import { createHmac, randomBytes, timingSafeEqual } from 'node:crypto';
import type { IncomingMessage, ServerResponse } from 'node:http';
import { envStr, IS_PRODUCTION } from '../lib/env.js';
import { parseCookies } from '../lib/http.js';
export const SESSION_MAX_AGE_SEC = 60 * 60 * 24; // 24h
const SECRET = envStr('SESSION_SECRET') || randomBytes(32).toString('hex');
export const COOKIE_NAME = 'sl_session_v2';
// Claims carried inside the signed session token.
export interface SessionData {
// Id of the signed-in user.
user_id: string;
// Email of the signed-in user at the time the session was issued.
email: string;
// Team the session is currently scoped to; null when none is selected.
active_team_id: string | null;
// How the user authenticated.
auth_method: 'azure-sso' | 'password';
// Expiry as epoch milliseconds (compared against Date.now() on every read).
exp: number;
}
// Fail fast at module load: production must not run on the random fallback secret.
if (IS_PRODUCTION && !envStr('SESSION_SECRET')) {
throw new Error('SESSION_SECRET must be set in production');
}
/**
 * Append an HMAC-SHA256 signature to a payload.
 *
 * @param payload Text to authenticate.
 * @returns `<payload>.<hex signature>`, keyed with the module secret.
 */
function sign(payload: string): string {
  const mac = createHmac('sha256', SECRET);
  mac.update(payload);
  return [payload, mac.digest('hex')].join('.');
}
/**
 * Mint a signed session token.
 *
 * @param data Session claims without `exp`; the expiry is stamped here as
 *   now + SESSION_MAX_AGE_SEC, in epoch milliseconds.
 * @returns The signed token (JSON claims followed by `.` and the signature).
 */
export function issueSession(data: Omit<SessionData, 'exp'>): string {
  const expiresAt = Date.now() + SESSION_MAX_AGE_SEC * 1000;
  const claims = { ...data, exp: expiresAt };
  return sign(JSON.stringify(claims));
}
/**
 * Recover and verify the session carried by the request's session cookie.
 *
 * The cookie value has the form `<json claims>.<hex hmac>`. The signature is
 * recomputed over the claims and compared in constant time, so response timing
 * does not leak how many leading characters of a forged signature were right.
 *
 * @param req Incoming request whose `Cookie` header is inspected.
 * @returns The decoded claims, or `null` when the cookie is absent, malformed,
 *   fails signature verification, lacks a numeric `exp`, or has expired.
 */
export function getSession(req: IncomingMessage): SessionData | null {
  const token = parseCookies(req)[COOKIE_NAME];
  if (!token) return null;
  // The JSON claims may themselves contain dots, so split on the LAST one.
  const dot = token.lastIndexOf('.');
  if (dot === -1) return null;
  const payload = token.slice(0, dot);
  const presented = Buffer.from(token.slice(dot + 1), 'utf8');
  const expected = Buffer.from(
    createHmac('sha256', SECRET).update(payload).digest('hex'),
    'utf8',
  );
  // timingSafeEqual throws on unequal lengths, so rule that out first.
  if (presented.length !== expected.length) return null;
  if (!timingSafeEqual(presented, expected)) return null;
  try {
    const data = JSON.parse(payload) as SessionData;
    // A missing or non-numeric exp must not be treated as "never expires".
    if (typeof data.exp !== 'number' || Date.now() > data.exp) return null;
    return data;
  } catch {
    // Unparseable claims (or a non-object payload) are treated as no session.
    return null;
  }
}
/**
 * Attach the session token to the response as an HttpOnly, SameSite=Strict cookie.
 *
 * @param res Response to set the `Set-Cookie` header on.
 * @param token Signed session token to store.
 */
export function setSessionCookie(res: ServerResponse, token: string): void {
  const attributes = [
    `${COOKIE_NAME}=${token}`,
    'Path=/',
    'HttpOnly',
    'SameSite=Strict',
    `Max-Age=${SESSION_MAX_AGE_SEC}`,
  ];
  let cookie = attributes.join('; ');
  // Only mark the cookie Secure in production.
  if (IS_PRODUCTION) cookie += '; Secure';
  res.setHeader('Set-Cookie', cookie);
}
/**
 * Expire the session cookie on the client.
 *
 * Sends an empty value with `Max-Age=0`, using the same Path / HttpOnly /
 * SameSite / Secure attributes that setSessionCookie uses so the two
 * Set-Cookie headers describe the same cookie.
 *
 * @param res Response to set the `Set-Cookie` header on.
 */
export function clearSessionCookie(res: ServerResponse): void {
  const secure = IS_PRODUCTION ? '; Secure' : '';
  res.setHeader(
    'Set-Cookie',
    `${COOKIE_NAME}=; Path=/; HttpOnly; SameSite=Strict; Max-Age=0${secure}`,
  );
}

View file

@ -0,0 +1,78 @@
// V2 SSO persistence: every Azure sign-in upserts a user row and ensures
// the user has at least one team (personal team auto-created on first sign-in).
import { upsertSsoUser, setSuperAdmin, type UserRow } from '../db/users.js';
import { createTeam, getTeamBySlug } from '../db/teams.js';
import { addMembership, userHasAnyMembership } from '../db/memberships.js';
import { envStr } from '../lib/env.js';
/**
 * Derive a URL-safe slug from the local part of an email address.
 *
 * Lower-cases the text before the first `@`, collapses every run of characters
 * outside `a-z0-9` into a single hyphen, and trims a leading/trailing hyphen.
 *
 * @param email Address to derive the slug from.
 * @returns The slug, or `'user'` when nothing usable remains.
 */
function emailToSlug(email: string): string {
  const [localPart = 'user'] = email.split('@');
  const slug = localPart
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/(^-|-$)/g, '');
  return slug === '' ? 'user' : slug;
}
/**
 * Find an unused slug for a user's personal team.
 *
 * Tries `<base>-personal`, then `<base>-personal-2`, `-3`, … until
 * getTeamBySlug reports no existing team for the candidate.
 *
 * @param email Email the base slug is derived from.
 * @returns The first candidate slug that had no matching team when checked.
 */
async function uniquePersonalSlug(email: string): Promise<string> {
  const stem = `${emailToSlug(email)}-personal`;
  let slug = stem;
  // The counter starts at 2: the un-suffixed stem is the implicit first candidate.
  for (let suffix = 2; await getTeamBySlug(slug); suffix += 1) {
    slug = `${stem}-${suffix}`;
  }
  return slug;
}
// Outcome of landSsoUser.
export interface SsoLandingResult {
// The upserted user row (re-read after promotion when a bootstrap happened).
user: UserRow;
// Team the new session should be scoped to.
active_team_id: string;
// True only when this call promoted the user to super-admin.
bootstrapped_super_admin: boolean;
}
/**
* Single entry point for SSO sign-in persistence.
* 1. Upsert user keyed on Azure oid.
* 2. Promote to super-admin if email matches BOOTSTRAP_SUPER_ADMIN_EMAIL.
* 3. If user has no team memberships, create a personal team and add them as owner.
* 4. Pick an active team for the session.
*
* @param input Identity to land: `oid` is the upsert key, `email` and
*   `display_name` are written to the user row.
* @returns The user row, the team id to scope the session to, and whether this
*   call performed the super-admin bootstrap.
* @throws If the membership check reports a membership but the follow-up lookup
*   returns no row; errors from the underlying DB helpers also propagate.
*
* NOTE(review): the steps run as separate statements; no transaction is opened
* in this function, so a failure part-way may leave earlier writes in place.
*/
export async function landSsoUser(input: {
oid: string;
email: string;
display_name: string;
}): Promise<SsoLandingResult> {
let user = await upsertSsoUser(input);
// Bootstrap super-admin from env on first sign-in (sticky once true).
let bootstrapped = false;
// Both sides are lower-cased, so the email match is case-insensitive.
const bootstrapEmail = envStr('BOOTSTRAP_SUPER_ADMIN_EMAIL').toLowerCase();
if (bootstrapEmail && !user.is_super_admin && user.email.toLowerCase() === bootstrapEmail) {
user = await setSuperAdmin(user.id, true);
bootstrapped = true;
console.log(`[auth] Bootstrapped super-admin: ${user.email}`);
}
// Ensure at least one team membership.
const hasTeam = await userHasAnyMembership(user.id);
let activeTeamId: string;
if (!hasTeam) {
// No memberships yet: create a personal team and make the user its owner.
const slug = await uniquePersonalSlug(user.email);
const personalTeam = await createTeam({
slug,
name: `${user.display_name || user.email}'s workspace`,
isPersonal: true,
});
await addMembership({ teamId: personalTeam.id, userId: user.id, role: 'owner', addedBy: null });
activeTeamId = personalTeam.id;
} else {
// Pick the user's first team as the default active team.
// The SQL client is imported lazily, only on this branch.
const { sql } = await import('../db/client.js');
// "First" means the membership with the earliest added_at.
const [row] = await sql<{ team_id: string }[]>`
SELECT team_id FROM team_memberships WHERE user_id = ${user.id}
ORDER BY added_at ASC LIMIT 1
`;
if (!row) throw new Error('landSsoUser: user has membership but no row returned');
activeTeamId = row.team_id;
}
return { user, active_team_id: activeTeamId, bootstrapped_super_admin: bootstrapped };
}

69
v2/server/db/briefs.ts Normal file
View file

@ -0,0 +1,69 @@
import { sql } from './client.js';
import type { BriefInput } from '../schemas/brief.js';
// Shape of a `briefs` row as returned by `SELECT *` / `RETURNING *`.
export interface BriefRow {
id: string;
// Owning team and the user who created the brief.
team_id: string;
owner_id: string;
slug: string;
client_name: string;
category: string;
business_question: string;
date_window_days: number;
budget_usd: number;
platforms: string[];
// JSON column; createBrief writes `{ positioning, brand }` or NULL.
positioning: unknown;
// JSON column; createBrief writes the brief's `kpis` value.
kpis: unknown;
context_vision: string | null;
// Per-brief engagement floor thresholds.
min_likes: number;
min_plays: number;
min_stl_pct: number;
prior_report_id: string | null;
// JSON column; createBrief stores the whole brief input here (despite the name).
brief_yaml: unknown;
created_at: Date;
}
/**
* Insert a new brief and return the stored row.
*
* Scalar fields are copied from `input.brief`; `platforms` is cast to text[].
* The `positioning` column receives a JSON object `{ positioning, brand }` when
* `brief.brand.positioning` is truthy and NULL otherwise. `kpis` and the whole
* brief (into `brief_yaml`) are stored as JSON. `context_vision` and
* `prior_report_id` default to NULL when absent.
*
* @param input Team, owner, slug and the brief payload.
* @returns The inserted row (`RETURNING *`).
* @throws If the insert returns no row; database errors also propagate.
*/
export async function createBrief(input: {
team_id: string;
owner_id: string;
slug: string;
brief: BriefInput;
}): Promise<BriefRow> {
const b = input.brief;
const [row] = await sql<BriefRow[]>`
INSERT INTO briefs (
team_id, owner_id, slug, client_name, category, business_question,
date_window_days, budget_usd, platforms,
positioning, kpis, context_vision,
min_likes, min_plays, min_stl_pct,
prior_report_id, brief_yaml
) VALUES (
${input.team_id}, ${input.owner_id}, ${input.slug},
${b.client_name}, ${b.category}, ${b.business_question},
${b.date_window_days}, ${b.budget_usd}, ${b.platforms}::text[],
${b.brand.positioning ? sql.json({ positioning: b.brand.positioning, brand: b.brand }) : null},
${sql.json(b.kpis)}, ${b.context_vision ?? null},
${b.min_likes}, ${b.min_plays}, ${b.min_stl_pct},
${b.prior_report_id ?? null}, ${sql.json(b)}
)
RETURNING *
`;
if (!row) throw new Error('createBrief: no row returned');
return row;
}
/**
* List every brief belonging to a team, newest first.
*
* @param teamId Team whose briefs are returned.
* @returns Matching rows ordered by `created_at` descending.
*/
export async function listBriefsForTeam(teamId: string): Promise<BriefRow[]> {
return sql<BriefRow[]>`
SELECT * FROM briefs WHERE team_id = ${teamId} ORDER BY created_at DESC
`;
}
/**
 * Look up a single brief by primary key.
 *
 * @param id Brief id.
 * @returns The row, or `null` when no brief has that id.
 */
export async function getBriefById(id: string): Promise<BriefRow | null> {
  const matches = await sql<BriefRow[]>`SELECT * FROM briefs WHERE id = ${id}`;
  const first = matches[0];
  return first ?? null;
}
/**
 * Delete a brief by primary key.
 *
 * @param id Brief id.
 * @returns `true` when at least one row was deleted, `false` otherwise.
 */
export async function deleteBrief(id: string): Promise<boolean> {
  const { count } = await sql`DELETE FROM briefs WHERE id = ${id}`;
  return count > 0;
}

13
v2/server/db/client.ts Normal file
View file

@ -0,0 +1,13 @@
import postgres from 'postgres';
import { envStr } from '../lib/env.js';
// Connection string: DATABASE_URL when set, otherwise a localhost default on port 5437.
// NOTE(review): the fallback embeds placeholder credentials and is used in every
// environment when DATABASE_URL is empty — confirm production always sets it.
const DATABASE_URL = envStr('DATABASE_URL') ||
'postgresql://srv2_user:change-me-please@localhost:5437/social_reporting_v2';
// Shared client for the whole server; the options below are passed straight to
// the `postgres` library (pool size and timeouts).
export const sql = postgres(DATABASE_URL, {
max: 10,
idle_timeout: 30,
connect_timeout: 10,
});
// Type of the shared client, for helpers that accept it as a parameter.
export type Sql = typeof sql;

View file

@ -0,0 +1,81 @@
import { sql } from './client.js';
// Roles a user can hold within a team (ranked by ROLE_RANK in this module).
export type TeamRole = 'owner' | 'admin' | 'editor' | 'viewer';
// Shape of a `team_memberships` row.
export interface MembershipRow {
team_id: string;
user_id: string;
role: TeamRole;
// User who added this member; null when not recorded (e.g. auto-created personal team).
added_by: string | null;
added_at: Date;
}
// Membership row joined with the member's email and display name from `users`.
export interface MembershipWithUser extends MembershipRow {
email: string;
display_name: string;
}
/**
* Fetch one user's membership in one team.
*
* @param teamId Team id.
* @param userId User id.
* @returns The membership row, or `null` when the user is not a member.
*/
export async function getMembership(teamId: string, userId: string): Promise<MembershipRow | null> {
const [row] = await sql<MembershipRow[]>`
SELECT * FROM team_memberships WHERE team_id = ${teamId} AND user_id = ${userId}
`;
return row ?? null;
}
/**
* Insert a team membership.
*
* @param input Team, user, role, and the id of the user who added them (or null).
* @returns The inserted row (`RETURNING *`).
* @throws If the insert returns no row; database errors also propagate.
*/
export async function addMembership(input: {
teamId: string;
userId: string;
role: TeamRole;
addedBy: string | null;
}): Promise<MembershipRow> {
const [row] = await sql<MembershipRow[]>`
INSERT INTO team_memberships (team_id, user_id, role, added_by)
VALUES (${input.teamId}, ${input.userId}, ${input.role}, ${input.addedBy})
RETURNING *
`;
if (!row) throw new Error('addMembership: no row returned');
return row;
}
/**
* Change a member's role within a team.
*
* @param teamId Team id.
* @param userId User id.
* @param role New role to store.
* @returns The updated row, or `null` when no such membership exists.
*/
export async function updateMembershipRole(
teamId: string,
userId: string,
role: TeamRole,
): Promise<MembershipRow | null> {
const [row] = await sql<MembershipRow[]>`
UPDATE team_memberships SET role = ${role}
WHERE team_id = ${teamId} AND user_id = ${userId}
RETURNING *
`;
return row ?? null;
}
/**
* Remove a user from a team.
*
* @param teamId Team id.
* @param userId User id.
* @returns `true` when a membership row was deleted, `false` when none matched.
*/
export async function removeMembership(teamId: string, userId: string): Promise<boolean> {
const result = await sql`
DELETE FROM team_memberships WHERE team_id = ${teamId} AND user_id = ${userId}
`;
return result.count > 0;
}
/**
* List a team's members together with each member's email and display name.
*
* @param teamId Team id.
* @returns Memberships joined to `users`, ordered by `added_at` ascending.
*/
export async function listMembers(teamId: string): Promise<MembershipWithUser[]> {
return sql<MembershipWithUser[]>`
SELECT m.*, u.email, u.display_name
FROM team_memberships m
JOIN users u ON u.id = m.user_id
WHERE m.team_id = ${teamId}
ORDER BY m.added_at ASC
`;
}
/**
* Report whether a user belongs to at least one team.
*
* @param userId User id.
* @returns `true` when any membership row exists for the user; `false` otherwise
*   (including the defensive case where the query yields no row).
*/
export async function userHasAnyMembership(userId: string): Promise<boolean> {
const [row] = await sql<{ exists: boolean }[]>`
SELECT EXISTS(SELECT 1 FROM team_memberships WHERE user_id = ${userId}) AS exists
`;
return row?.exists ?? false;
}
/** Privilege ordering of team roles: a higher number means more privilege. */
const ROLE_RANK: Record<TeamRole, number> = { viewer: 0, editor: 1, admin: 2, owner: 3 };

/**
 * Check whether a role meets a minimum required role.
 *
 * @param actual Role the user holds.
 * @param required Minimum role needed.
 * @returns `true` when `actual` ranks at or above `required`.
 */
export function roleAtLeast(actual: TeamRole, required: TeamRole): boolean {
  const held = ROLE_RANK[actual];
  const needed = ROLE_RANK[required];
  return held >= needed;
}

44
v2/server/db/teams.ts Normal file
View file

@ -0,0 +1,44 @@
import { sql } from './client.js';
import type { TeamRole } from './memberships.js';
// Shape of a `teams` row.
export interface TeamRow {
id: string;
// Looked up by getTeamBySlug.
slug: string;
name: string;
// Set from createTeam's `isPersonal` input.
is_personal: boolean;
created_at: Date;
}
/**
 * Look up a team by primary key.
 *
 * @param id Team id.
 * @returns The row, or `null` when no team has that id.
 */
export async function getTeamById(id: string): Promise<TeamRow | null> {
  const matches = await sql<TeamRow[]>`SELECT * FROM teams WHERE id = ${id}`;
  const first = matches[0];
  return first ?? null;
}
/**
 * Look up a team by slug.
 *
 * @param slug Team slug.
 * @returns The row, or `null` when no team has that slug.
 */
export async function getTeamBySlug(slug: string): Promise<TeamRow | null> {
  const matches = await sql<TeamRow[]>`SELECT * FROM teams WHERE slug = ${slug}`;
  const first = matches[0];
  return first ?? null;
}
/**
* Insert a new team.
*
* @param input Slug, display name, and whether this is a personal team.
* @returns The inserted row (`RETURNING *`).
* @throws If the insert returns no row; database errors also propagate.
*/
export async function createTeam(input: { slug: string; name: string; isPersonal: boolean }): Promise<TeamRow> {
const [row] = await sql<TeamRow[]>`
INSERT INTO teams (slug, name, is_personal)
VALUES (${input.slug}, ${input.name}, ${input.isPersonal})
RETURNING *
`;
if (!row) throw new Error('createTeam: no row returned');
return row;
}
// A team row plus the role the queried user holds in it.
export interface TeamWithRole extends TeamRow {
role: TeamRole;
}
/**
* List the teams a user belongs to, with the user's role in each.
*
* @param userId User id.
* @returns Teams ordered with personal teams first, then by `created_at` ascending.
*/
export async function listTeamsForUser(userId: string): Promise<TeamWithRole[]> {
return sql<TeamWithRole[]>`
SELECT t.*, m.role
FROM teams t
JOIN team_memberships m ON m.team_id = t.id
WHERE m.user_id = ${userId}
ORDER BY t.is_personal DESC, t.created_at ASC
`;
}

59
v2/server/db/users.ts Normal file
View file

@ -0,0 +1,59 @@
import { sql } from './client.js';
// Shape of a `users` row.
export interface UserRow {
id: string;
// Conflict key for upsertSsoUser.
azure_oid: string;
email: string;
display_name: string;
is_super_admin: boolean;
// Not written by any helper in this module.
password_hash: string | null;
created_at: Date;
// Set to NOW() on every upsertSsoUser call.
last_login_at: Date | null;
}
/**
 * Look up a user by primary key.
 *
 * @param id User id.
 * @returns The row, or `null` when no user has that id.
 */
export async function getUserById(id: string): Promise<UserRow | null> {
  const matches = await sql<UserRow[]>`SELECT * FROM users WHERE id = ${id}`;
  const first = matches[0];
  return first ?? null;
}
/**
 * Look up a user by their `azure_oid` value.
 *
 * @param oid Value to match against `azure_oid`.
 * @returns The row, or `null` when no user matches.
 */
export async function getUserByOid(oid: string): Promise<UserRow | null> {
  const matches = await sql<UserRow[]>`SELECT * FROM users WHERE azure_oid = ${oid}`;
  const first = matches[0];
  return first ?? null;
}
/**
 * Look up a user by email.
 *
 * The value is compared with `=` as given; no case normalisation happens here.
 *
 * @param email Email to match.
 * @returns The first matching row, or `null` when none matches.
 */
export async function getUserByEmail(email: string): Promise<UserRow | null> {
  const matches = await sql<UserRow[]>`SELECT * FROM users WHERE email = ${email}`;
  const first = matches[0];
  return first ?? null;
}
// Identity fields written by upsertSsoUser.
export interface UpsertSsoUserInput {
// Stored in `azure_oid`; the upsert conflict key.
oid: string;
email: string;
display_name: string;
}
/**
* Insert a user, or refresh the existing one that has the same `azure_oid`.
*
* On conflict the stored email and display name are overwritten with the
* supplied values. `last_login_at` is set to NOW() in both cases.
*
* @param input Oid, email and display name.
* @returns The inserted or updated row (`RETURNING *`).
* @throws If the statement returns no row; database errors also propagate.
*/
export async function upsertSsoUser(input: UpsertSsoUserInput): Promise<UserRow> {
const [row] = await sql<UserRow[]>`
INSERT INTO users (azure_oid, email, display_name, last_login_at)
VALUES (${input.oid}, ${input.email}, ${input.display_name}, NOW())
ON CONFLICT (azure_oid) DO UPDATE SET
email = EXCLUDED.email,
display_name = EXCLUDED.display_name,
last_login_at = NOW()
RETURNING *
`;
if (!row) throw new Error('upsertSsoUser: no row returned');
return row;
}
/**
* Set or clear a user's super-admin flag.
*
* @param userId User id.
* @param isSuper New value for `is_super_admin`.
* @returns The updated row.
* @throws If no user has that id; database errors also propagate.
*/
export async function setSuperAdmin(userId: string, isSuper: boolean): Promise<UserRow> {
const [row] = await sql<UserRow[]>`
UPDATE users SET is_super_admin = ${isSuper} WHERE id = ${userId} RETURNING *
`;
if (!row) throw new Error('setSuperAdmin: user not found');
return row;
}
/**
 * List every user, newest first.
 *
 * @returns All `users` rows ordered by `created_at` descending.
 */
export async function listAllUsers(): Promise<UserRow[]> {
  const users = await sql<UserRow[]>`SELECT * FROM users ORDER BY created_at DESC`;
  return users;
}

175
v2/server/index.ts Normal file
View file

@ -0,0 +1,175 @@
#!/usr/bin/env tsx
// V2 HTTP server: serves the operator-app SPA + JSON API.
// Routing is plain http + URL pattern matching (no Express).
import { createServer, type IncomingMessage, type ServerResponse } from 'node:http';
import { existsSync, readFileSync, statSync } from 'node:fs';
import { join, resolve, normalize, extname } from 'node:path';
import { dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { envInt, envStr, IS_PRODUCTION } from './lib/env.js';
import { assertComposeNameOrExit } from './lib/compose-name-guard.js';
import { sendJSON, sendText } from './lib/http.js';
import { handleSsoTokenExchange, handleLogout } from './routes/sso.js';
import { handlePasswordLogin } from './auth/password-fallback.js';
import { handleGetMe, handlePatchActiveTeam, handleGetSession } from './routes/me.js';
import {
handleListTeams, handleCreateTeam, handleGetTeam,
handleAddMember, handleUpdateMemberRole, handleRemoveMember,
} from './routes/teams.js';
import { handleListAllUsers, handleToggleSuperAdmin } from './routes/admin.js';
import {
handleListBriefs, handleCreateBrief, handleGetBrief, handleDeleteBrief,
} from './routes/briefs.js';
// Boot guard: runs before anything else is set up.
assertComposeNameOrExit();
// Listening port; 3457 when PORT is unset or does not parse as an integer.
const PORT = envInt('PORT', 3457);
// Derive this module's file and directory from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Location of the built operator-app SPA, resolved relative to this file.
const SPA_DIST = resolve(__dirname, '../operator-app/dist');
// CORS setting read by applyCors: '*' or one exact origin; empty sends no Allow-Origin header.
const ALLOWED_ORIGIN = envStr('ALLOWED_ORIGIN');
// Content-Type by lower-cased file extension for static files; serveStatic falls
// back to application/octet-stream for extensions not listed here.
const MIME: Record<string, string> = {
'.html': 'text/html; charset=utf-8',
'.js': 'application/javascript; charset=utf-8',
'.mjs': 'application/javascript; charset=utf-8',
'.css': 'text/css; charset=utf-8',
'.json': 'application/json; charset=utf-8',
'.svg': 'image/svg+xml',
'.png': 'image/png',
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.ico': 'image/x-icon',
'.woff2':'font/woff2',
'.woff': 'font/woff',
};
/**
 * Set the fixed security headers sent on every response: frame denial,
 * MIME-sniffing protection, no referrer, and the Content-Security-Policy.
 *
 * @param res Response to decorate.
 */
function setSecurityHeaders(res: ServerResponse): void {
  // One CSP directive per entry; joined with "; " into the header value.
  const cspDirectives = [
    "default-src 'self'",
    "script-src 'self' 'unsafe-inline'",
    "style-src 'self' 'unsafe-inline' https://fonts.googleapis.com",
    "font-src 'self' https://fonts.gstatic.com",
    "img-src 'self' data: blob: https:",
    "connect-src 'self' https://login.microsoftonline.com",
    "frame-src 'self' https://login.microsoftonline.com",
  ];
  res.setHeader('X-Frame-Options', 'DENY');
  res.setHeader('X-Content-Type-Options', 'nosniff');
  res.setHeader('Referrer-Policy', 'no-referrer');
  res.setHeader('Content-Security-Policy', cspDirectives.join('; '));
}
/**
 * Set CORS response headers according to ALLOWED_ORIGIN.
 *
 * - `'*'`: any origin is allowed (no credentials header).
 * - a specific origin: only an exactly matching request `Origin` is echoed
 *   back, with credentials allowed. `Vary: Origin` is sent on every response
 *   in this mode, matched or not, because the headers depend on the request's
 *   Origin and a shared cache must not reuse one variant for the other.
 * - empty: no Allow-Origin header is sent.
 *
 * Allow-Methods and Allow-Headers are always sent.
 *
 * @param req Incoming request (its `Origin` header is read).
 * @param res Response to decorate.
 */
function applyCors(req: IncomingMessage, res: ServerResponse): void {
  const origin = req.headers.origin ?? '';
  if (ALLOWED_ORIGIN === '*') {
    res.setHeader('Access-Control-Allow-Origin', '*');
  } else if (ALLOWED_ORIGIN) {
    res.setHeader('Vary', 'Origin');
    if (origin === ALLOWED_ORIGIN) {
      res.setHeader('Access-Control-Allow-Origin', ALLOWED_ORIGIN);
      res.setHeader('Access-Control-Allow-Credentials', 'true');
    }
  }
  res.setHeader('Access-Control-Allow-Methods', 'GET, POST, PATCH, DELETE, OPTIONS');
  res.setHeader('Access-Control-Allow-Headers', 'Content-Type');
}
/**
 * Try to answer the request with a file from the built SPA directory.
 *
 * @param req Incoming request (not inspected here).
 * @param res Response to write the file to.
 * @param urlPath Request pathname; `/` maps to `/index.html`.
 * @returns `true` when a file was served; `false` when the SPA is not built,
 *   the resolved path leaves SPA_DIST, or no regular file exists there.
 */
function serveStatic(req: IncomingMessage, res: ServerResponse, urlPath: string): boolean {
  if (!existsSync(SPA_DIST)) return false;

  const requested = urlPath === '/' ? '/index.html' : urlPath;
  // Defence in depth — the URL parser already strips `..`, but normalize anyway.
  const cleaned = normalize(requested).replace(/^(\.\.[/\\])+/, '');
  const candidate = join(SPA_DIST, cleaned);
  if (!candidate.startsWith(SPA_DIST)) return false;

  const isServableFile = existsSync(candidate) && statSync(candidate).isFile();
  if (!isServableFile) return false;

  const ext = extname(candidate).toLowerCase();
  const contentType = MIME[ext] || 'application/octet-stream';
  // HTML is always revalidated; other assets may be cached for an hour.
  const cacheControl = ext === '.html' ? 'no-cache' : 'public, max-age=3600';
  res.setHeader('Cache-Control', cacheControl);
  res.writeHead(200, { 'Content-Type': contentType });
  res.end(readFileSync(candidate));
  return true;
}
/**
 * Serve the SPA's index.html for client-side routes, or a 503 plain-text hint
 * when the SPA has not been built.
 *
 * @param res Response to write to.
 */
function spaFallback(res: ServerResponse): void {
  const indexFile = join(SPA_DIST, 'index.html');
  if (!existsSync(indexFile)) {
    sendText(res, 503, 'Operator-app not built yet. Run: npm run ui:build');
    return;
  }
  const headers = { 'Content-Type': 'text/html; charset=utf-8', 'Cache-Control': 'no-cache' };
  res.writeHead(200, headers);
  res.end(readFileSync(indexFile));
}
// One entry in the routing table.
interface Route {
// HTTP method, compared for exact equality with req.method.
method: string;
// Matched against the URL pathname; capture groups become `params`.
pattern: RegExp;
// Handler; `params` holds the pattern's capture groups in order.
handle: (req: IncomingMessage, res: ServerResponse, params: string[]) => Promise<void> | void;
}
// API routing table, scanned top to bottom by route(); the first entry whose
// method and pattern both match handles the request.
// Id segments are captured with [0-9a-f-]{36} (36 characters of lowercase hex or
// hyphen). The `!` on destructured params relies on each pattern having that
// many capture groups.
const ROUTES: Route[] = [
// Health and authentication.
{ method: 'GET', pattern: /^\/api\/health$/, handle: (_req, res) => sendJSON(res, 200, { ok: true }) },
{ method: 'GET', pattern: /^\/api\/auth$/, handle: handleGetSession },
{ method: 'POST', pattern: /^\/api\/sso\/token-exchange$/, handle: handleSsoTokenExchange },
{ method: 'POST', pattern: /^\/api\/login$/, handle: handlePasswordLogin },
// NOTE(review): logout is routed on GET — confirm that is intended.
{ method: 'GET', pattern: /^\/api\/logout$/, handle: handleLogout },
// Current user.
{ method: 'GET', pattern: /^\/api\/me$/, handle: handleGetMe },
{ method: 'PATCH', pattern: /^\/api\/me\/active-team$/, handle: handlePatchActiveTeam },
// Teams and their members.
{ method: 'GET', pattern: /^\/api\/teams$/, handle: handleListTeams },
{ method: 'POST', pattern: /^\/api\/teams$/, handle: handleCreateTeam },
{ method: 'GET', pattern: /^\/api\/teams\/([0-9a-f-]{36})$/, handle: (req, res, [id]) => handleGetTeam(req, res, id!) },
{ method: 'POST', pattern: /^\/api\/teams\/([0-9a-f-]{36})\/members$/, handle: (req, res, [id]) => handleAddMember(req, res, id!) },
{ method: 'PATCH', pattern: /^\/api\/teams\/([0-9a-f-]{36})\/members\/([0-9a-f-]{36})\/role$/, handle: (req, res, [t, u]) => handleUpdateMemberRole(req, res, t!, u!) },
{ method: 'DELETE', pattern: /^\/api\/teams\/([0-9a-f-]{36})\/members\/([0-9a-f-]{36})$/, handle: (req, res, [t, u]) => handleRemoveMember(req, res, t!, u!) },
// Admin.
{ method: 'GET', pattern: /^\/api\/admin\/users$/, handle: handleListAllUsers },
{ method: 'PATCH', pattern: /^\/api\/admin\/users\/([0-9a-f-]{36})\/super$/, handle: (req, res, [id]) => handleToggleSuperAdmin(req, res, id!) },
// Briefs.
{ method: 'GET', pattern: /^\/api\/briefs$/, handle: handleListBriefs },
{ method: 'POST', pattern: /^\/api\/briefs$/, handle: handleCreateBrief },
{ method: 'GET', pattern: /^\/api\/briefs\/([0-9a-f-]{36})$/, handle: (req, res, [id]) => handleGetBrief(req, res, id!) },
{ method: 'DELETE', pattern: /^\/api\/briefs\/([0-9a-f-]{36})$/, handle: (req, res, [id]) => handleDeleteBrief(req, res, id!) },
];
/**
 * Dispatch one request.
 *
 * Order of resolution:
 * 1. the first ROUTES entry whose method and pattern match the pathname;
 * 2. any other `/api/…` path → JSON 404;
 * 3. GET → a static file from the SPA build, else the SPA index fallback;
 * 4. anything else → JSON 404.
 *
 * A request target that cannot be parsed as a URL (for example `//`) is
 * answered with a JSON 400 instead of surfacing as an unhandled exception.
 *
 * @param req Incoming request.
 * @param res Response to write to.
 */
async function route(req: IncomingMessage, res: ServerResponse): Promise<void> {
  let pathname: string;
  try {
    // The base is only there to make relative request targets parseable.
    pathname = new URL(req.url || '/', `http://localhost:${PORT}`).pathname;
  } catch {
    sendJSON(res, 400, { error: 'Bad request' });
    return;
  }
  const method = req.method || 'GET';
  for (const r of ROUTES) {
    if (r.method !== method) continue;
    const m = r.pattern.exec(pathname);
    if (!m) continue;
    // m[0] is the whole match; handlers only receive the capture groups.
    await r.handle(req, res, m.slice(1));
    return;
  }
  if (pathname.startsWith('/api/')) {
    sendJSON(res, 404, { error: 'Not found' });
    return;
  }
  if (method === 'GET') {
    if (serveStatic(req, res, pathname)) return;
    spaFallback(res);
    return;
  }
  sendJSON(res, 404, { error: 'Not found' });
}
// Per-request pipeline: security headers and CORS are applied to every response,
// CORS preflights are answered immediately, and everything else goes to route().
const server = createServer(async (req, res) => {
  setSecurityHeaders(res);
  applyCors(req, res);

  const isPreflight = req.method === 'OPTIONS';
  if (isPreflight) {
    res.writeHead(204);
    res.end();
    return;
  }

  try {
    await route(req, res);
  } catch (err) {
    console.error('[server] unhandled error:', err);
    // Only send a 500 if the handler had not already started the response.
    if (res.headersSent) return;
    sendJSON(res, 500, { error: 'Internal server error' });
  }
});

server.listen(PORT, () => {
  const mode = IS_PRODUCTION ? 'prod' : 'dev';
  console.log(`[v2] listening on :${PORT} (${mode})`);
});

View file

@ -0,0 +1,26 @@
import { envStr, IS_PRODUCTION } from './env.js';
/** Compose project name this server expects to run under. */
const EXPECTED = 'social-reporting-v2';

/**
 * Boot guard for the Docker Compose project name.
 *
 * V2 ships as its own Compose project and must share neither containers nor
 * volumes with V1. Compose derives the project name from the parent directory
 * by default, which on the shared optical-dev server collapses everything under
 * `deploy/` onto one project name and lets apps silently evict each other's
 * data. Per CLAUDE.md every compose file pins a unique `name:`; this guard
 * catches the case where COMPOSE_PROJECT_NAME did not reach the process.
 *
 * A mismatch is logged as a warning in dev; in production it is logged as an
 * error and the process exits with code 1.
 */
export function assertComposeNameOrExit(): void {
  const actual = envStr('COMPOSE_PROJECT_NAME');
  if (actual !== EXPECTED) {
    const shown = actual || '(unset)';
    const msg =
      `[compose-name-guard] COMPOSE_PROJECT_NAME='${shown}' — expected '${EXPECTED}'. ` +
      `If you're running outside Docker this is fine; otherwise check docker-compose.v2.yml.`;
    if (IS_PRODUCTION) {
      console.error(msg);
      process.exit(1);
    }
    console.warn(msg);
  }
}

50
v2/server/lib/env.ts Normal file
View file

@ -0,0 +1,50 @@
import { readFileSync } from 'node:fs';
import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
// This module's file and directory, derived from import.meta.url; loadDotenv
// resolves two of its .env candidates relative to __dirname.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/**
 * Read KEY=VALUE pairs from the first readable `.env` file.
 *
 * Candidates are tried in order: two and three directories above this module,
 * then the current working directory. Only the first file that can be read is
 * parsed. Blank lines, `#` comment lines and lines without `=` are skipped.
 * Keys and values are trimmed, and one quote character (`"` or `'`) is stripped
 * from each end of the value.
 *
 * @returns The parsed pairs; an empty object when no candidate is readable.
 */
function loadDotenv(): Record<string, string> {
  const parsed: Record<string, string> = {};
  const searchPaths = [
    resolve(__dirname, '../../.env'),
    resolve(__dirname, '../../../.env'),
    resolve(process.cwd(), '.env'),
  ];
  for (const file of searchPaths) {
    try {
      const lines = readFileSync(file, 'utf-8').split('\n');
      for (const line of lines) {
        const entry = line.trim();
        if (entry === '' || entry.startsWith('#')) continue;
        const sepAt = entry.indexOf('=');
        if (sepAt < 0) continue;
        const key = entry.slice(0, sepAt).trim();
        const value = entry.slice(sepAt + 1).trim().replace(/^["']|["']$/g, '');
        parsed[key] = value;
      }
      // First readable file wins; later candidates are not consulted.
      break;
    } catch { /* try next */ }
  }
  return parsed;
}
/** Values parsed from the `.env` file once, at module load. */
const fileEnv = loadDotenv();

/**
 * Read a configuration value as a string.
 *
 * Precedence: `process.env`, then the loaded `.env` file, then `fallback`.
 * A variable that is present but empty counts as set and returns `''`.
 *
 * @param key Variable name.
 * @param fallback Value used when the key is absent from both sources.
 * @returns The resolved value.
 */
export function envStr(key: string, fallback = ''): string {
  const fromProcess = process.env[key];
  if (fromProcess != null) return fromProcess;
  const fromFile = fileEnv[key];
  return fromFile ?? fallback;
}
/**
 * Read a configuration value as a base-10 integer.
 *
 * @param key Variable name.
 * @param fallback Value used when the variable is empty/absent or does not
 *   parse to a finite number.
 * @returns The parsed integer or `fallback`.
 */
export function envInt(key: string, fallback: number): number {
  const raw = envStr(key);
  if (raw === '') return fallback;
  const parsed = Number.parseInt(raw, 10);
  return Number.isFinite(parsed) ? parsed : fallback;
}
/**
 * Read a configuration value as a boolean.
 *
 * `true`, `1` and `yes` (case-insensitive) are true; any other non-empty value
 * is false.
 *
 * @param key Variable name.
 * @param fallback Value used when the variable is empty or absent.
 * @returns The interpreted flag.
 */
export function envBool(key: string, fallback = false): boolean {
  const flag = envStr(key).toLowerCase();
  if (flag === '') return fallback;
  return ['true', '1', 'yes'].includes(flag);
}
// True only when NODE_ENV (from process.env or the .env file) is exactly 'production'.
export const IS_PRODUCTION = envStr('NODE_ENV') === 'production';

52
v2/server/lib/http.ts Normal file
View file

@ -0,0 +1,52 @@
import type { IncomingMessage, ServerResponse } from 'node:http';
// Largest request body, in bytes, that parseBody accepts before aborting the request.
export const MAX_BODY_SIZE = 1024 * 1024; // 1 MB
/**
 * Send a JSON response and end it.
 *
 * The body is serialised BEFORE any header is written. If serialisation throws
 * (circular reference, BigInt, …) nothing has been sent yet, so the caller can
 * still produce an error response instead of being left with a response whose
 * headers are out but which never ends.
 *
 * @param res Response to write to.
 * @param status HTTP status code.
 * @param data Value to serialise with JSON.stringify.
 * @throws Whatever JSON.stringify throws for unserialisable data.
 */
export function sendJSON(res: ServerResponse, status: number, data: unknown): void {
  const body = JSON.stringify(data);
  res.writeHead(status, { 'Content-Type': 'application/json' });
  res.end(body);
}
/**
 * Send a text response and end it.
 *
 * @param res Response to write to.
 * @param status HTTP status code.
 * @param text Response body.
 * @param contentType Content-Type header value; defaults to `text/plain`.
 */
export function sendText(res: ServerResponse, status: number, text: string, contentType = 'text/plain'): void {
  const headers = { 'Content-Type': contentType };
  res.writeHead(status, headers);
  res.end(text);
}
/**
 * Collect the request body into a string.
 *
 * @param req Request stream to read.
 * @returns Promise resolving to the concatenated body once the stream ends.
 *   It rejects with `Request body too large` (after destroying the request)
 *   when more than MAX_BODY_SIZE bytes arrive, and with the stream's error if
 *   the request emits `error`.
 */
export function parseBody(req: IncomingMessage): Promise<string> {
  return new Promise<string>((fulfil, fail) => {
    const parts: Buffer[] = [];
    let received = 0;

    const onData = (chunk: Buffer): void => {
      received += chunk.length;
      if (received <= MAX_BODY_SIZE) {
        parts.push(chunk);
        return;
      }
      // Over the limit: stop reading and fail the parse.
      req.destroy();
      fail(new Error('Request body too large'));
    };
    const onEnd = (): void => fulfil(Buffer.concat(parts).toString());

    req.on('data', onData);
    req.on('end', onEnd);
    req.on('error', fail);
  });
}
/**
 * Read the request body and parse it as JSON.
 *
 * The result is cast to `T` without validation; callers must check the shape.
 *
 * @param req Request stream to read.
 * @returns The parsed value.
 * @throws When the body cannot be read (see parseBody) or is not valid JSON.
 */
export async function parseJSONBody<T = unknown>(req: IncomingMessage): Promise<T> {
  const text = await parseBody(req);
  const parsed = JSON.parse(text) as T;
  return parsed;
}
/**
 * Parse the request's `Cookie` header into a name → value map.
 *
 * Pairs are split on `;` and each pair on its first `=`, so values may contain
 * `=`. Names and values are trimmed but not URL-decoded. Segments without `=`
 * are ignored, and a later duplicate name overwrites an earlier one.
 *
 * @param req Request whose `Cookie` header is read.
 * @returns The parsed cookies; empty when the header is missing.
 */
export function parseCookies(req: IncomingMessage): Record<string, string> {
  const jar: Record<string, string> = {};
  const rawHeader = req.headers.cookie || '';
  rawHeader.split(';').forEach((segment) => {
    const sepAt = segment.indexOf('=');
    if (sepAt !== -1) {
      const name = segment.slice(0, sepAt).trim();
      jar[name] = segment.slice(sepAt + 1).trim();
    }
  });
  return jar;
}
/**
 * Best-effort client address for a request.
 *
 * Uses the first comma-separated entry of `X-Forwarded-For` when it is
 * non-empty, otherwise the socket's remote address, otherwise `'unknown'`.
 *
 * NOTE(review): the header value is taken as sent by the peer; whether it can
 * be trusted depends on the proxy in front of this server — confirm.
 *
 * @param req Incoming request.
 * @returns The resolved address string.
 */
export function clientIp(req: IncomingMessage): string {
  const forwarded = req.headers['x-forwarded-for'] as string | undefined;
  const firstHop = forwarded?.split(',')[0]?.trim();
  if (firstHop) return firstHop;
  return req.socket.remoteAddress || 'unknown';
}

View file

@ -0,0 +1,17 @@
import type { IncomingMessage, ServerResponse } from 'node:http';
import { getSession, type SessionData } from '../auth/session.js';
import { sendJSON } from '../lib/http.js';
// Request after requireAuth has succeeded: the verified session is attached to it.
export interface AuthedReq extends IncomingMessage {
session: SessionData;
}
/**
 * Gate a handler on a valid session.
 *
 * On success the session is attached to the request as `session` and returned.
 * On failure a JSON 401 has already been sent and the caller should stop.
 *
 * @param req Incoming request.
 * @param res Response used for the 401.
 * @returns The session, or `null` when the request is not authenticated.
 */
export function requireAuth(req: IncomingMessage, res: ServerResponse): SessionData | null {
  const session = getSession(req);
  if (session) {
    (req as AuthedReq).session = session;
    return session;
  }
  sendJSON(res, 401, { error: 'Not authenticated' });
  return null;
}

View file

@ -0,0 +1,20 @@
import type { ServerResponse } from 'node:http';
import { getUserById } from '../db/users.js';
import { sendJSON } from '../lib/http.js';
import type { SessionData } from '../auth/session.js';
/**
 * Gate a handler on the session's user being a super-admin.
 *
 * The flag is read from the database on every call rather than from the
 * session. When the check fails the error response has already been sent:
 * 401 if the user row no longer exists, 403 if the user is not a super-admin.
 *
 * @param res Response used for the error reply.
 * @param session Verified session of the caller.
 * @returns `true` when the caller may proceed, `false` otherwise.
 */
export async function requireSuperAdmin(
  res: ServerResponse,
  session: SessionData,
): Promise<boolean> {
  const user = await getUserById(session.user_id);
  if (user?.is_super_admin) return true;
  if (!user) {
    sendJSON(res, 401, { error: 'User not found' });
  } else {
    sendJSON(res, 403, { error: 'Super-admin required' });
  }
  return false;
}

Some files were not shown because too many files have changed in this diff Show more