ai_qc/backend/scripts/deploy.sh
nickviljoen 9771feaa3a Make deploy.sh smoke test retry for 30s instead of giving up at 3s
The service takes ~4s to come up on dev (75 QC modules + 14 profiles
import on start), just over the previous 3s sleep. This caused a
false-negative rollback. Now we poll /health every 2s for up to 30s
before declaring failure; same logic for the rollback-restart path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 21:09:10 +02:00

174 lines
4.7 KiB
Bash
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# AI QC deploy script.
#
# Usage:
#   deploy.sh dev                Pull latest develop → restart service
#   deploy.sh prod <tag>         Check out a specific tag → restart service
#   deploy.sh dev --dry-run      Show what would change, make no changes
#
# Runs on the target server (not your laptop). Needs sudo for systemctl.
# Saves a rollback checkpoint to .last_deploy_rollback before changing anything,
# and auto-rolls back if the post-deploy smoke test fails.
set -euo pipefail

# Deployment configuration read by the rest of the script.
APP_DIR=/opt/ai_qc
SERVICE=ai-qc.service
HEALTH_URL=http://127.0.0.1:7183/health
ROLLBACK_FILE="$APP_DIR/.last_deploy_rollback"

# Print the help text to stdout.
usage() {
  cat <<EOF
Usage:
$(basename "$0") dev [--dry-run] Deploy latest develop to this server
$(basename "$0") prod <tag> [--dry-run] Deploy a specific tag to this server
Run on the target server. Requires sudo for systemctl restart.
EOF
}

# First positional argument selects the mode; `shift` fails harmlessly when
# no arguments were given, hence the `|| true`.
MODE=${1:-}
shift || true
DRY_RUN=false
TARGET_TAG=""

case "$MODE" in
  dev)
    ;;
  prod)
    TARGET_TAG=${1:-}
    shift || true
    # A leading '-' means an option was given where the tag belongs
    # (e.g. `deploy.sh prod --dry-run`); treat that as a missing tag.
    if [[ -z "$TARGET_TAG" || "$TARGET_TAG" == -* ]]; then
      echo "Usage: $0 prod <tag> [--dry-run]"
      exit 1
    fi
    ;;
  "" | -h | --help)
    usage
    exit 0
    ;;
  *)
    echo "Unknown mode: $MODE"
    echo "Try: $(basename "$0") --help"
    exit 1
    ;;
esac

# Remaining arguments are options. Anything unrecognised is rejected rather
# than ignored, so a mistyped --dry-run can never fall through to a real deploy.
for arg in "$@"; do
  case "$arg" in
    --dry-run)
      DRY_RUN=true
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $arg"
      echo "Try: $(basename "$0") --help"
      exit 1
      ;;
  esac
done
# --- Resolve current and target revisions -----------------------------------
cd "$APP_DIR"
if [[ ! -d .git ]]; then
  echo "ERROR: $APP_DIR is not a git repo"
  exit 1
fi

CURRENT_REV=$(git rev-parse HEAD)
CURRENT_SHORT=$(git rev-parse --short HEAD)

echo "============================================"
echo " AI QC deploy ($MODE)"
echo "============================================"
echo "Server: $(hostname)"
echo "Current: $CURRENT_SHORT $(git log -1 --format='%s' HEAD)"
echo ""
echo "Fetching latest refs..."
git fetch --tags --prune --quiet

# dev tracks origin/develop; prod deploys exactly the requested tag.
if [[ "$MODE" == "dev" ]]; then
  TARGET_REF="origin/develop"
else
  if ! git rev-parse --verify --quiet "refs/tags/$TARGET_TAG^{commit}" > /dev/null; then
    echo "ERROR: Tag '$TARGET_TAG' not found after fetch"
    exit 1
  fi
  TARGET_REF="refs/tags/$TARGET_TAG"
fi

TARGET_REV=$(git rev-parse "$TARGET_REF")
TARGET_SHORT=$(git rev-parse --short "$TARGET_REF")

if [[ "$CURRENT_REV" == "$TARGET_REV" ]]; then
  echo "Already at $TARGET_SHORT — nothing to do."
  exit 0
fi

# --- Preview ------------------------------------------------------------------
echo "Target: $TARGET_SHORT $(git log -1 --format='%s' "$TARGET_REF")"
echo ""
echo "Commits to apply:"
# Let git do the limiting and counting. Piping into `head` under
# `set -o pipefail` can kill git with SIGPIPE on a long history and abort the
# script. --no-pager keeps git from opening a pager on an interactive terminal.
git --no-pager log --oneline -n 20 "$CURRENT_REV..$TARGET_REV"
CHANGE_COUNT=$(git rev-list --count "$CURRENT_REV..$TARGET_REV")
if (( CHANGE_COUNT > 20 )); then
  echo " ... and $((CHANGE_COUNT - 20)) more"
fi
echo ""

# Capture the changed-file list first instead of piping into `grep -q`:
# grep exits on the first match, and under pipefail a SIGPIPE'd `git diff`
# would make the whole test false and skip a needed pip install.
REQS_CHANGED=false
changed_files=$(git diff --name-only "$CURRENT_REV" "$TARGET_REV")
if grep -qE '(^|/)requirements\.txt$' <<< "$changed_files"; then
  REQS_CHANGED=true
  echo "Note: requirements.txt changed — pip install will run."
  echo ""
fi

if [[ "$DRY_RUN" == "true" ]]; then
  echo "Dry run — no changes made."
  exit 0
fi

# --- Confirm ------------------------------------------------------------------
# `read` returns non-zero at EOF (e.g. stdin closed); say why we stop instead
# of letting `set -e` end the script silently.
if ! read -r -p "Proceed with deploy? (y/N): " confirm; then
  echo ""
  echo "ERROR: could not read confirmation from stdin"
  exit 1
fi
if [[ ! $confirm =~ ^[Yy]$ ]]; then
  echo "Cancelled."
  exit 0
fi

# --- Apply --------------------------------------------------------------------
# Record the rollback checkpoint before touching the working tree.
echo "$CURRENT_REV" > "$ROLLBACK_FILE"
echo "Applying changes..."
git reset --hard "$TARGET_REV"

if [[ "$REQS_CHANGED" == "true" ]]; then
  echo "Installing updated dependencies..."
  if ! "$APP_DIR/venv/bin/pip" install -q -r "$APP_DIR/requirements.txt"; then
    # The service has not been restarted yet, so put the old code back on disk
    # rather than leaving a half-applied deploy behind.
    echo "ERROR: pip install failed — restoring working tree to $CURRENT_SHORT."
    git reset --hard "$CURRENT_REV"
    echo "Service was not restarted. The venv may be partially updated; check it before retrying."
    exit 1
  fi
fi
#######################################
# Poll $HEALTH_URL until curl reports success (`curl -f`), or give up.
# Startup takes ~4s on dev (75 QC modules + 14 profiles load on import).
# Globals:   HEALTH_URL (read)
# Arguments: $1 - maximum number of attempts (default 15)
#            $2 - seconds to sleep before each attempt (default 2)
#            Defaults give a 15 × 2s = 30s polling window.
# Outputs:   one " healthy after <n>x<interval>s" line on stdout on success
# Returns:   0 as soon as the endpoint is healthy, 1 if every attempt failed
#######################################
wait_for_health() {
  local max_attempts=${1:-15}
  local interval=${2:-2}
  local i # keep the loop counter out of the caller's scope

  for ((i = 1; i <= max_attempts; i++)); do
    sleep "$interval"
    # --max-time bounds each probe so a service that accepts the connection
    # but never answers cannot hang the deploy indefinitely.
    if curl -sf -o /dev/null --max-time 5 "$HEALTH_URL"; then
      echo " healthy after ${i}x${interval}s"
      return 0
    fi
  done
  return 1
}
# --- Restart and smoke test ---------------------------------------------------
# Every restart is checked explicitly: under `set -e` a bare failing
# `systemctl restart` would end the script before any rollback could run.
deploy_ok=false
echo "Restarting $SERVICE..."
if sudo systemctl restart "$SERVICE"; then
  echo "Smoke testing $HEALTH_URL..."
  if wait_for_health; then
    deploy_ok=true
  else
    echo ""
    echo "Smoke test failed after 30s — rolling back to $CURRENT_SHORT..."
  fi
else
  echo ""
  echo "Restart of $SERVICE failed — rolling back to $CURRENT_SHORT..."
fi

if [[ "$deploy_ok" == "true" ]]; then
  NEW_SHORT=$(git rev-parse --short HEAD)
  echo ""
  echo "Deploy OK. Now at $NEW_SHORT."
  echo "Rollback target saved: $CURRENT_SHORT (run rollback.sh last to revert)"
  exit 0
fi

# --- Rollback -----------------------------------------------------------------
rollback_ok=false
if git reset --hard "$CURRENT_REV"; then
  # If this deploy installed new requirements, put the previous set back so the
  # old code does not run against the new dependencies. Best effort only:
  # pip re-applies the old requirements file but does not remove packages
  # that only the new one introduced.
  if [[ "$REQS_CHANGED" == "true" ]]; then
    echo "Restoring previous dependencies..."
    if ! "$APP_DIR/venv/bin/pip" install -q -r "$APP_DIR/requirements.txt"; then
      echo "WARNING: could not reinstall previous dependencies; venv may not match $CURRENT_SHORT."
    fi
  fi
  if sudo systemctl restart "$SERVICE" && wait_for_health; then
    rollback_ok=true
  fi
fi

if [[ "$rollback_ok" == "true" ]]; then
  echo "Rolled back successfully. Service healthy at $CURRENT_SHORT."
  echo "Investigate: sudo journalctl -u $SERVICE -n 100"
  exit 1
fi

echo "ROLLBACK ALSO FAILED. Service is in a broken state."
echo "sudo systemctl status $SERVICE"
echo "sudo journalctl -u $SERVICE -n 100"
exit 2