Phase 2: deploy machinery for Dev/Prod cutover

- deploy.sh dev|prod with --dry-run; auto-rollback if /health is not
  healthy within 60s; checkpoint saved to .last_deploy_rollback before reset
- deploy/rollback.sh last|<sha> with the same Docker compose dance
- deploy/health-check.sh — curl wrapper for monitoring/oncall
- deploy/apache-{dev,prod}.conf — Location blocks proxying /hm-aiqc/
  to gunicorn on 127.0.0.1:5050 with X-Script-Name set so wsgi.py's
  ReverseProxied middleware emits prefixed URLs
- deploy/.env.{dev,prod}.example — starter envs with Azure SSO config
This commit is contained in:
nickviljoen 2026-05-09 14:08:06 +02:00
parent 84326352b2
commit e772095158
7 changed files with 387 additions and 0 deletions

184
deploy.sh Executable file
View file

@ -0,0 +1,184 @@
#!/bin/bash
# HM AI QC deploy script (Docker Compose).
#
# Usage:
#   deploy.sh dev                    Deploy origin/develop HEAD to this server
#   deploy.sh prod <tag>             Deploy a specific tag to this server
#   deploy.sh dev --dry-run          Show what would change, make no changes
#   deploy.sh prod <tag> --dry-run
#
# Runs on the target server (optical-dev / optical-prod), not your laptop.
# Saves a rollback checkpoint to .last_deploy_rollback before changing
# anything, and auto-rolls back if the post-deploy /health probe fails.
#
# Differences from the AI QC sibling script (intentional):
# * Docker Compose, not systemd. `docker compose up -d` replaces
# systemctl restart; `docker compose build` replaces pip install.
# * `flask db upgrade` runs as a one-shot container before bringing up
# the web service, so schema changes apply atomically with the deploy.
# * No "delete frontend / build frontend / copy to /var/www/html" steps
# from the IT spec — HM QC ships Flask templates, not an SPA bundle.
# Strict mode: abort on command failure, unset variables, and failed
# pipeline stages.
set -euo pipefail

# Both paths can be overridden from the environment.
APP_DIR=${APP_DIR:-/opt/hm-aiqc}
HEALTH_URL=${HEALTH_URL:-http://127.0.0.1:5050/health}
# Receives the pre-deploy commit SHA just before the checkout is changed.
ROLLBACK_FILE="$APP_DIR/.last_deploy_rollback"

MODE=${1:-}
shift || true # tolerate being called with no arguments at all
DRY_RUN=false
TARGET_TAG=""

case "$MODE" in
  dev | prod)
    # Every remaining argument must be recognised. Silently ignoring a
    # mistyped flag (e.g. --dryrun) would turn an intended preview into a
    # real deploy prompt.
    for arg in "$@"; do
      case "$arg" in
        --dry-run)
          DRY_RUN=true
          ;;
        -*)
          echo "Unknown option: $arg"
          echo "Try: $(basename "$0") --help"
          exit 1
          ;;
        *)
          # The only positional argument is the tag, and only in prod mode.
          if [[ "$MODE" == "prod" && -z "$TARGET_TAG" ]]; then
            TARGET_TAG=$arg
          else
            echo "Unexpected argument: $arg"
            echo "Try: $(basename "$0") --help"
            exit 1
          fi
          ;;
      esac
    done
    if [[ "$MODE" == "prod" && -z "$TARGET_TAG" ]]; then
      echo "Usage: $0 prod <tag> [--dry-run]"
      exit 1
    fi
    ;;
  "" | -h | --help)
    cat <<EOF
Usage:
$(basename "$0") dev [--dry-run] Deploy latest develop to this server
$(basename "$0") prod <tag> [--dry-run] Deploy a specific tag to this server
Run on the target server. Requires permission to talk to docker.
EOF
    exit 0
    ;;
  *)
    echo "Unknown mode: $MODE"
    echo "Try: $(basename "$0") --help"
    exit 1
    ;;
esac
# --- Pre-flight: must run inside the app checkout, with an .env present ---
cd "$APP_DIR"
if [[ ! -d .git ]]; then
  echo "ERROR: $APP_DIR is not a git repo"
  exit 1
fi
if [[ ! -f .env ]]; then
  echo "ERROR: $APP_DIR/.env not found. Copy from deploy/.env.${MODE}.example and fill in."
  exit 1
fi

CURRENT_REV=$(git rev-parse HEAD)
CURRENT_SHORT=$(git rev-parse --short HEAD)
echo "============================================"
echo " HM AI QC deploy ($MODE)"
echo "============================================"
echo "Server: $(hostname)"
echo "Current: $CURRENT_SHORT $(git log -1 --format='%s' HEAD)"
echo ""
echo "Fetching latest refs..."
git fetch --tags --prune --quiet

# --- Resolve the target: develop HEAD for dev, the named tag for prod ---
if [[ "$MODE" == "dev" ]]; then
  TARGET_REF="origin/develop"
else
  if ! git rev-parse --verify --quiet "refs/tags/$TARGET_TAG^{commit}" > /dev/null; then
    echo "ERROR: Tag '$TARGET_TAG' not found after fetch"
    exit 1
  fi
  TARGET_REF="refs/tags/$TARGET_TAG"
fi
# Peel to the underlying commit. For an annotated tag a plain rev-parse
# returns the tag object's SHA, which can never equal HEAD, so the
# "already deployed" check below would never fire.
TARGET_REV=$(git rev-parse "$TARGET_REF^{commit}")
TARGET_SHORT=$(git rev-parse --short "$TARGET_REV")
if [[ "$CURRENT_REV" == "$TARGET_REV" ]]; then
  echo "Already at $TARGET_SHORT — nothing to do."
  exit 0
fi

# --- Preview what the deploy would apply ---
echo "Target: $TARGET_SHORT $(git log -1 --format='%s' "$TARGET_REV")"
echo ""
echo "Commits to apply:"
# Let git do the truncation and the counting. Piping into `head` makes git
# die of SIGPIPE on long histories, which `pipefail` + `set -e` turn into a
# silent abort of the whole script.
git log --oneline -n 20 "$CURRENT_REV..$TARGET_REV"
CHANGE_COUNT=$(git rev-list --count "$CURRENT_REV..$TARGET_REV")
if ((CHANGE_COUNT > 20)); then
  echo " ... and $((CHANGE_COUNT - 20)) more"
fi
echo ""
# Capture the file list before matching: `git diff | grep -q` can report
# failure under pipefail when grep exits early, hiding the migration note.
changed_files=$(git diff --name-only "$CURRENT_REV" "$TARGET_REV")
if grep -qE "(^|/)migrations/versions/" <<<"$changed_files"; then
  echo "Note: Alembic migrations changed — flask db upgrade will run."
  echo ""
fi

if [[ "$DRY_RUN" == "true" ]]; then
  echo "Dry run — no changes made."
  exit 0
fi

# Anything other than a single y/Y cancels the deploy.
read -r -p "Proceed with deploy? (y/N): " confirm
if [[ ! $confirm =~ ^[Yy]$ ]]; then
  echo "Cancelled."
  exit 0
fi
# Save the pre-deploy commit before touching the checkout, so
# deploy/rollback.sh can return to it even if this script dies part-way.
echo "$CURRENT_REV" > "$ROLLBACK_FILE"
echo "Applying changes..."
git reset --hard "$TARGET_REV"
echo "Building images..."
# No container has been restarted yet, so a failed build is undone by
# restoring the checkout. Left to `set -e`, the script would abort with HEAD
# already on the target and the next run would report "nothing to do".
if ! docker compose build; then
  echo "ERROR: image build failed — restoring checkout to $CURRENT_SHORT."
  git reset --hard "$CURRENT_REV"
  exit 1
fi
echo "Starting services (entrypoint runs flask db upgrade first)..."
# A failed `up` may leave services half-replaced, so do not guess at a
# recovery here: stop and point the operator at the saved checkpoint.
if ! docker compose up -d; then
  echo "ERROR: docker compose up failed; checkout is at $TARGET_SHORT."
  echo "Revert with: deploy/rollback.sh last"
  exit 1
fi
# Poll $HEALTH_URL every 2s until it answers 2xx, or give up.
# 30 attempts x 2s gives a 60s window, which allows for migration time on
# first boot of a major release.
#
# Globals:  HEALTH_URL (read); HEALTH_TIMEOUT (read, optional, default 5) —
#           per-probe cap in seconds.
# Outputs:  one line on stdout when the service becomes healthy.
# Returns:  0 once a probe succeeds, 1 if every attempt fails.
wait_for_health() {
  local max_attempts=30 # 30 × 2s = 60s window
  local i               # keep the loop counter out of the caller's scope
  for ((i = 1; i <= max_attempts; i++)); do
    sleep 2
    # --max-time bounds each probe: without it, a backend that accepts the
    # connection but never answers would block here forever and the
    # rollback path would never run.
    if curl -sf -o /dev/null --max-time "${HEALTH_TIMEOUT:-5}" "$HEALTH_URL"; then
      echo " healthy after ${i}x2s"
      return 0
    fi
  done
  return 1
}
# --- Post-deploy verification ---
# Exit codes: 0 = deploy healthy, 1 = deploy failed but the rollback is
# healthy, 2 = the rollback failed as well.
echo "Smoke testing $HEALTH_URL..."
if wait_for_health; then
  NEW_SHORT=$(git rev-parse --short HEAD)
  echo ""
  echo "Deploy OK. Now at $NEW_SHORT."
  echo "Rollback target saved: $CURRENT_SHORT (run deploy/rollback.sh last to revert)"
  exit 0
fi

echo ""
echo "Smoke test failed after 60s — rolling back to $CURRENT_SHORT..."
# Run the rollback steps as one `if` condition. As separate statements under
# `set -e`, a failing reset/build/up would end the script right there and the
# "ROLLBACK ALSO FAILED" diagnosis below would never be printed.
if git reset --hard "$CURRENT_REV" \
  && docker compose build \
  && docker compose up -d \
  && wait_for_health; then
  echo "Rolled back successfully. Service healthy at $CURRENT_SHORT."
  echo "Investigate: docker compose logs --tail=200 web"
  exit 1
fi

# The last two lines are printed as suggested commands, not executed.
echo "ROLLBACK ALSO FAILED. Service is in a broken state."
echo "docker compose ps"
echo "docker compose logs --tail=200 web"
exit 2

31
deploy/.env.dev.example Normal file
View file

@ -0,0 +1,31 @@
# HM AI QC — Dev environment starter
# Copy to /opt/hm-aiqc/.env and fill in real secrets.
# deploy.sh refuses to run while $APP_DIR/.env (default /opt/hm-aiqc/.env) is missing.
# Azure AD authentication (shared with AI QC sibling project)
AZURE_TENANT_ID=e519c2e6-bc6d-4fdf-8d9c-923c2f002385
AZURE_CLIENT_ID=9079054c-9620-4757-a256-23413042f1ef
ENVIRONMENT=development
# Box Configuration
BOX_CONFIG_PATH=config/box_config.json
BOX_REPORT_FOLDER_ID=133295752718
BOX_CAMPAIGNS_FOLDER_ID=156182880490
# Flask
FLASK_APP=app:app
# NOTE(review): "production" even in this dev template — presumably deliberate
# to keep Flask debug mode off on a shared server; confirm.
FLASK_ENV=production
# Generate with: python -c 'import secrets; print(secrets.token_urlsafe(48))'
SECRET_KEY=replace-me-with-a-long-random-string
# Server
# NOTE(review): the deploy scripts and Apache config reach the app on host
# port 5050; presumably the Compose file maps that to this port — confirm.
HOST=0.0.0.0
PORT=5000
# Database — absolute path inside the container, mapped to ./database on host
DATABASE_URI=sqlite:////app/database/qc_platform.db
# LLM Provider Keys (NO HARDCODED KEYS — set real values below)
# Left blank on purpose: fill these in only in the real .env, never in this template.
OPENAI_API_KEY=
GOOGLE_API_KEY=
ANTHROPIC_API_KEY=

31
deploy/.env.prod.example Normal file
View file

@ -0,0 +1,31 @@
# HM AI QC — Prod environment starter
# Copy to /opt/hm-aiqc/.env and fill in real secrets.
# deploy.sh refuses to run while $APP_DIR/.env (default /opt/hm-aiqc/.env) is missing.
# Azure AD authentication (shared with AI QC sibling project)
AZURE_TENANT_ID=e519c2e6-bc6d-4fdf-8d9c-923c2f002385
AZURE_CLIENT_ID=9079054c-9620-4757-a256-23413042f1ef
ENVIRONMENT=production
# Box Configuration
BOX_CONFIG_PATH=config/box_config.json
BOX_REPORT_FOLDER_ID=133295752718
BOX_CAMPAIGNS_FOLDER_ID=156182880490
# Flask
FLASK_APP=app:app
FLASK_ENV=production
# Generate with: python -c 'import secrets; print(secrets.token_urlsafe(48))'
SECRET_KEY=replace-me-with-a-long-random-string
# Server
# NOTE(review): the deploy scripts and Apache config reach the app on host
# port 5050; presumably the Compose file maps that to this port — confirm.
HOST=0.0.0.0
PORT=5000
# Database — absolute path inside the container, mapped to ./database on host
DATABASE_URI=sqlite:////app/database/qc_platform.db
# LLM Provider Keys (NO HARDCODED KEYS — set real values below)
# Left blank on purpose: fill these in only in the real .env, never in this template.
OPENAI_API_KEY=
GOOGLE_API_KEY=
ANTHROPIC_API_KEY=

28
deploy/apache-dev.conf Normal file
View file

@ -0,0 +1,28 @@
# Apache Location block for HM AI QC on optical-dev.
#
# Insert inside the existing <VirtualHost *:443> for
#   ServerName optical-dev.oliver.solutions
# and reload:
#   sudo systemctl reload apache2
#
# Required modules (enable once if not already):
#   sudo a2enmod proxy proxy_http headers rewrite
# (rewrite is needed for the trailing-slash redirect below.)
#
# The X-Script-Name header is read by wsgi.py's ReverseProxied middleware
# so url_for() generates URLs prefixed with /hm-aiqc.

# timeout=600 covers long-running endpoints (QC execution, Box searches,
# video analysis). It is set on ProxyPass because ProxyTimeout is only
# accepted at server / virtual-host level, not inside <Location>.
ProxyPass /hm-aiqc/ http://127.0.0.1:5050/ timeout=600
ProxyPassReverse /hm-aiqc/ http://127.0.0.1:5050/

# Bare /hm-aiqc (no trailing slash) → redirect with slash so the prefix matches
RewriteEngine On
RewriteRule ^/hm-aiqc$ /hm-aiqc/ [R=301,L]

<Location /hm-aiqc/>
    RequestHeader set X-Script-Name "/hm-aiqc"
    RequestHeader set X-Forwarded-Proto "https"
    ProxyPreserveHost On
</Location>

28
deploy/apache-prod.conf Normal file
View file

@ -0,0 +1,28 @@
# Apache Location block for HM AI QC on optical-prod.
#
# Insert inside the existing <VirtualHost *:443> for
#   ServerName optical-prod.oliver.solutions
# and reload:
#   sudo systemctl reload apache2
#
# Required modules (enable once if not already):
#   sudo a2enmod proxy proxy_http headers rewrite
# (rewrite is needed for the trailing-slash redirect below.)
#
# The X-Script-Name header is read by wsgi.py's ReverseProxied middleware
# so url_for() generates URLs prefixed with /hm-aiqc.

# timeout=600 covers long-running endpoints (QC execution, Box searches,
# video analysis). It is set on ProxyPass because ProxyTimeout is only
# accepted at server / virtual-host level, not inside <Location>.
ProxyPass /hm-aiqc/ http://127.0.0.1:5050/ timeout=600
ProxyPassReverse /hm-aiqc/ http://127.0.0.1:5050/

# Bare /hm-aiqc (no trailing slash) → redirect with slash so the prefix matches
RewriteEngine On
RewriteRule ^/hm-aiqc$ /hm-aiqc/ [R=301,L]

<Location /hm-aiqc/>
    RequestHeader set X-Script-Name "/hm-aiqc"
    RequestHeader set X-Forwarded-Proto "https"
    ProxyPreserveHost On
</Location>

10
deploy/health-check.sh Executable file
View file

@ -0,0 +1,10 @@
#!/bin/bash
# Quick "is the app alive?" check. Prints status and exits 0 (healthy) or 1 (not).
#
# Environment:
#   HEALTH_URL      endpoint to probe (default http://127.0.0.1:5050/health)
#   HEALTH_TIMEOUT  seconds before the probe gives up (default 5)
HEALTH_URL=${HEALTH_URL:-http://127.0.0.1:5050/health}
HEALTH_TIMEOUT=${HEALTH_TIMEOUT:-5}

# --max-time stops a hung backend from wedging whatever monitor calls this.
# -S keeps curl's own error text (which -s alone suppresses) so a failure can
# say why; on success $output is just the response body.
if output=$(curl -sSf --max-time "$HEALTH_TIMEOUT" "$HEALTH_URL" 2>&1); then
  echo "OK $output"
  exit 0
fi

echo "DOWN no response from $HEALTH_URL"
# The reason goes to stderr so the stdout status line keeps its format.
if [[ -n "$output" ]]; then
  echo "$output" >&2
fi
exit 1

75
deploy/rollback.sh Executable file
View file

@ -0,0 +1,75 @@
#!/bin/bash
# Emergency rollback for HM AI QC.
#
# Usage:
#   rollback.sh last            Roll back to the checkpoint saved by deploy.sh
#   rollback.sh <commit-hash>   Roll back to an explicit commit
#
# Note: Alembic downgrade is intentionally NOT run here — schema rollbacks
# are risky on data-bearing tables. If the bad deploy added a column the
# rolled-back code doesn't know about, that's almost always fine. If it
# dropped or renamed a column, restore from the daily DB backup.
# Strict mode: abort on command failure, unset variables, and failed
# pipeline stages.
set -euo pipefail

# Both paths can be overridden from the environment.
APP_DIR=${APP_DIR:-/opt/hm-aiqc}
HEALTH_URL=${HEALTH_URL:-http://127.0.0.1:5050/health}
ROLLBACK_FILE="$APP_DIR/.last_deploy_rollback"

# No argument, or the literal "last", means "use the saved checkpoint".
TARGET=${1:-}
if [[ -z "$TARGET" || "$TARGET" == "last" ]]; then
  if [[ ! -f "$ROLLBACK_FILE" ]]; then
    echo "No .last_deploy_rollback file. Specify a commit hash explicitly."
    echo "Usage: $(basename "$0") last | <commit-hash>"
    exit 1
  fi
  TARGET=$(<"$ROLLBACK_FILE")
fi

cd "$APP_DIR"
if ! git rev-parse --verify --quiet "$TARGET^{commit}" > /dev/null; then
  echo "ERROR: Commit '$TARGET' not found"
  exit 1
fi

CURRENT_REV=$(git rev-parse HEAD)
CURRENT_SHORT=$(git rev-parse --short HEAD)
# Resolve with the same ^{commit} peel used for validation above. Without
# it, an annotated tag resolves to the tag object's SHA, which never equals
# HEAD, so the "already there" check below could not fire.
TARGET_REV=$(git rev-parse "$TARGET^{commit}")
TARGET_SHORT=$(git rev-parse --short "$TARGET_REV")
if [[ "$CURRENT_REV" == "$TARGET_REV" ]]; then
  echo "Already at $TARGET_SHORT — nothing to do."
  exit 0
fi
# Show where we are and where we are going, then ask before changing anything.
printf '%s\n' \
  "============================================" \
  " HM AI QC rollback" \
  "============================================"
printf 'Current: %s %s\n' "$CURRENT_SHORT" "$(git log -1 --format='%s' HEAD)"
printf 'Target: %s %s\n' "$TARGET_SHORT" "$(git log -1 --format='%s' "$TARGET")"
printf '\n'

# Only a single "y" or "Y" proceeds; any other answer cancels cleanly.
read -r -p "Proceed? (y/N): " confirm
case "$confirm" in
  [Yy]) ;;
  *)
    echo "Cancelled."
    exit 0
    ;;
esac

# Move the checkout to the target, then rebuild and restart the services.
git reset --hard "$TARGET_REV"
docker compose build
docker compose up -d
# 60s window — same as deploy.sh: 30 probes, 2s apart.
# Each probe is capped by --max-time (HEALTH_TIMEOUT, default 5s). Without a
# cap, a backend that accepts the connection but never answers would hang
# this loop and the "unhealthy" verdict below would never be reached.
for i in {1..30}; do
  sleep 2
  if curl -sf -o /dev/null --max-time "${HEALTH_TIMEOUT:-5}" "$HEALTH_URL"; then
    echo "Rollback OK. Now at $TARGET_SHORT."
    exit 0
  fi
done

# The second line is printed as a suggested command, not executed.
echo "Service unhealthy after rollback."
echo "docker compose logs --tail=200 web"
exit 1