# Captured from the GitHub Actions run page: "(build): front" — run #535.

# =============================================================================
# Production CI/CD Pipeline — Quality Gates + Blue-Green Deployment
# =============================================================================
#
# Pipeline:
#   0. Change Detection: decides whether the extension test suite must run
#   1. Quality Gates (parallel): lint, typecheck, security, extension tests
#   2. Build Verification: extension + Next.js builds (smoke test)
#   3. Deploy Gate: parses commit message with a strict regex (anti-footgun)
#   4. Deploy: docker compose build on the prod host → blue-green rollout
#
# Triggers:
#   - push to main: full pipeline. Deploy is gated by the commit-message
#     convention `(build): front`, `(build): back`, or `(build): front back`,
#     parsed by `.github/scripts/evaluate-deploy-trigger.sh prod` in
#     `deploy-gate`. The Uptime Kuma deploy uses `(build): uptime-kuma` via
#     the same script's `uptime-kuma` mode in `uptime-kuma-gate`.
#   - pull_request to main: quality gates + build verification only.
#   - schedule (Sunday 00:00 UTC): security audit only.
#   - workflow_dispatch: manual run with `force_deploy` / `skip_quality_gates`.
#
# Architectural notes (decided 2026-05, see commit history):
#   - Images are built on the prod self-hosted runner (`prod.docs.plus`).
#     Disk pressure is mitigated by a pre-build disk guard, not by pushing
#     to a registry. If pressure resurfaces, revisit M3 (ghcr.io push).
#   - Rollback uses an on-disk tag stash on the prod host:
#     /opt/projects/prod.docs.plus/.deploy/last-good-tag.
#   - All third-party actions (workflow + composite) are pinned to a commit
#     SHA. Renovate/Dependabot should bump them; never use floating tags.
# =============================================================================
name: CI/CD Production

# Triggers: pushes and pull requests targeting main, a weekly cron, and a
# manual dispatch that exposes two boolean escape hatches.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  schedule:
    # Weekly security scan (Sunday 00:00 UTC)
    - cron: '0 0 * * 0'
  workflow_dispatch:
    inputs:
      skip_quality_gates:
        type: boolean
        description: 'Skip quality gates (emergency deploy)'
        required: false
        default: false
      force_deploy:
        type: boolean
        description: 'Force deployment (bypass commit-message gate)'
        required: false
        default: false
# Two concurrency groups:
#   - This workflow-level group de-duplicates runs per ref.
#   - The `deploy` job has its own group with cancel-in-progress: false.
#
# A workflow-level cancellation cancels EVERY job of the run, including a
# `deploy` job that is mid-rollout; the job-level group cannot prevent that.
# A mid-deploy SIGTERM corrupts blue-green state, so only pull_request runs
# (which never deploy) are cancelled in progress. Runs for push, schedule and
# workflow_dispatch queue behind the active run instead of killing it.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-quality
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
# Every `run:` step uses bash unless it says otherwise, so `set -e` and
# `-o pipefail` semantics are the same on hosted and self-hosted runners.
defaults:
  run:
    shell: bash
# Least privilege for the whole workflow: read-only repository contents.
# Jobs restate (or would widen) their own permissions explicitly.
permissions:
  contents: read
# Workflow-wide environment shared by all jobs.
env:
  # Host-side env file, and the name of its per-run copy in the workspace.
  ENV_SOURCE: /opt/projects/prod.docs.plus/.env
  ENV_FILE: .env.production
  COMPOSE_FILE: docker-compose.prod.yml
  # Images are tagged with the commit being deployed.
  DEPLOY_TAG: ${{ github.sha }}
  # Where the prod host stashes the last successfully deployed SHA for rollback.
  DEPLOY_STATE_DIR: /opt/projects/prod.docs.plus/.deploy
  LAST_GOOD_TAG_FILE: /opt/projects/prod.docs.plus/.deploy/last-good-tag
jobs:
# ===========================================================================
# STAGE 0 — CHANGE DETECTION (cheap; gates the expensive extension suite)
# ===========================================================================
# The clean-room extension suite (~14 min) is the pipeline's long pole and is
# a hard deploy gate. It only needs to run when something that affects an
# extension build/test actually changed. This job emits a boolean the
# extension-tests job keys off; lint/typecheck/security stay always-on
# because they are fast and repo-global (typecheck still catches extension
# TYPE regressions on every push even when the Cypress suite is skipped).
# ===========================================================================
changes:
  name: 🔎 Detect Changes
  # Scheduled runs only perform the security audit, so no diff is needed.
  if: github.event_name != 'schedule'
  runs-on: ubuntu-latest
  timeout-minutes: 5
  permissions:
    contents: read
  # `extensions` is the boolean string the extension-tests job keys off.
  outputs:
    extensions: ${{ steps.filter.outputs.extensions }}
  steps:
    - name: 📦 Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: 🔎 Filter extension-affecting paths
      id: filter
      uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
      with:
        # Filter rules live in a separate file in the repository.
        filters: .github/filters/extensions.yaml
# ===========================================================================
# STAGE 1 — QUALITY GATES (parallel, fast feedback)
# ===========================================================================
lint:
  name: 🔍 Lint & Format
  if: github.event_name != 'schedule'
  runs-on: ubuntu-latest
  timeout-minutes: 10
  permissions:
    contents: read
  steps:
    - name: 📦 Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: 🥟 Setup Environment
      uses: ./.github/actions/setup-bun
    # Three package scripts run in sequence; the first failure fails the step.
    - name: 🔍 Lint, format & styles
      run: |
        bun run lint
        bun run format
        bun run lint:styles
typecheck:
  name: 📝 Type Check
  if: github.event_name != 'schedule'
  runs-on: ubuntu-latest
  timeout-minutes: 15
  permissions:
    contents: read
  steps:
    - name: 📦 Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: 🥟 Setup Environment
      uses: ./.github/actions/setup-bun
    # Extensions are built first because the type check needs their output.
    - name: 🔧 Build Extensions (required for types)
      uses: ./.github/actions/build-extensions
    - name: 📝 Type Check All
      run: bun run typecheck
security:
  name: 🔒 Security Audit
  runs-on: ubuntu-latest
  timeout-minutes: 10
  # No `if:` here: this is the one job that also runs on the weekly schedule.
  permissions:
    contents: read
  steps:
    - name: 📦 Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: 🥟 Setup Environment
      uses: ./.github/actions/setup-bun
      with:
        ignore-scripts: 'true'
    - name: 🔍 Bun Audit
      run: |
        set -o pipefail
        echo "🔍 Checking for known vulnerabilities..."
        # Capture both human-readable and machine-readable output.
        # `bun pm audit` exits non-zero when vulns exist; we always want both files.
        bun pm audit 2>&1 | tee audit-results.txt || true
        # Keep whatever JSON the audit wrote even when it exits non-zero.
        # The previous `|| echo '{}' > audit-results.json` replaced the real
        # report with `{}` exactly when vulnerabilities were found, so the
        # gate below could never fail. Only substitute `{}` when the command
        # produced no output at all.
        bun pm audit --json > audit-results.json 2>/dev/null || true
        if [ ! -s audit-results.json ]; then
          echo "::warning::bun pm audit --json produced no output; treating the report as empty."
          echo '{}' > audit-results.json
        fi
        # Structured parsing — replaces brittle `grep -ci "critical"` which
        # matched the summary header line and produced false positives.
        # Expected shape: { "vulnerabilities": { "critical": N, "high": N, ... } }
        CRITICAL=$(bun -e 'const a=JSON.parse(require("fs").readFileSync("audit-results.json","utf8"));process.stdout.write(String(a?.vulnerabilities?.critical||0))')
        HIGH=$(bun -e 'const a=JSON.parse(require("fs").readFileSync("audit-results.json","utf8"));process.stdout.write(String(a?.vulnerabilities?.high||0))')
        echo ""
        echo "📊 Summary: critical=${CRITICAL}, high=${HIGH}"
        # Defensive: if both are 0 AND the JSON looks empty, the audit
        # shape may have changed (Bun has changed it before). Print the
        # raw JSON head so a future regression doesn't silently neutralize
        # this gate. Caps at 4 KB to keep logs tidy.
        if [ "${CRITICAL}" -eq 0 ] && [ "${HIGH}" -eq 0 ]; then
          BYTES=$(wc -c < audit-results.json | tr -d ' ')
          if [ "${BYTES}" -lt 32 ]; then
            echo "::warning::audit-results.json is suspiciously small (${BYTES} bytes). Bun audit JSON shape may have changed."
            echo "--- audit-results.json (head 4KB) ---"
            head -c 4096 audit-results.json || true
            echo ""
            echo "--- end ---"
          fi
        fi
        if [ "${CRITICAL}" -gt 0 ] || [ "${HIGH}" -gt 0 ]; then
          echo "::error::Critical/High vulnerabilities detected (critical=${CRITICAL}, high=${HIGH})"
          exit 1
        fi
        echo "✅ No critical/high vulnerabilities"
    # Uploaded even when the audit step fails, so the report is always kept.
    - name: 📤 Upload Audit Results
      if: always()
      uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
      with:
        name: security-audit-${{ github.sha }}
        path: |
          audit-results.txt
          audit-results.json
        retention-days: 30
extension-tests:
  name: 🧪 Extension Tests
  needs: [changes]
  # Runs when the change filter reports extension-affecting paths, and always
  # on workflow_dispatch (manual runs have no reliable diff base). Never runs
  # on schedule. When skipped for "nothing relevant changed", the build job
  # accepts the skip only if the changes job itself succeeded.
  if: |
    github.event_name != 'schedule' &&
    (needs.changes.outputs.extensions == 'true' || github.event_name == 'workflow_dispatch')
  runs-on: ubuntu-latest
  timeout-minutes: 45
  permissions:
    contents: read
  steps:
    - name: 📦 Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: 🥟 Setup Environment
      uses: ./.github/actions/setup-bun
    - name: 🔧 Build extensions (release gate)
      uses: ./.github/actions/build-extensions
    - name: 🌲 Setup Cypress binary
      uses: ./.github/actions/setup-cypress
    - name: 🧪 Clean-room extension suites
      run: bash scripts/run-tests.sh --extensions
      env:
        # NOTE(review): presumably tells the runner script that the dist
        # output from the build step above already exists — confirm in
        # scripts/run-tests.sh.
        EXTENSION_DIST_READY: '1'
    - name: ✈️ Publish preflight (all five)
      run: bash scripts/extension-preflight.sh
# ===========================================================================
# STAGE 2 — BUILD VERIFICATION (smoke test, no artifacts produced)
# ===========================================================================
# Note: this job intentionally does NOT build Docker images. The deploy job
# rebuilds them on the prod host anyway (decided 2026-05); duplicating the
# docker build here would just slow the pipeline without sharing cache. The
# webapp/admin Next.js compile here catches type/build regressions early.
# ===========================================================================
build:
  name: 🏗️ Build Verification
  needs: [changes, lint, typecheck, security, extension-tests]
  # `always()` lets this job evaluate its own condition even though
  # extension-tests may be skipped. Requirements:
  #   - never on schedule, and the changes job must have succeeded;
  #   - lint, typecheck and security each succeeded, OR this is a manual run
  #     with skip_quality_gates set;
  #   - extension-tests succeeded, OR was skipped while the change filter did
  #     not report extension changes, OR skip_quality_gates applies.
  if: |
    always() &&
    github.event_name != 'schedule' &&
    needs.changes.result == 'success' &&
    (needs.lint.result == 'success' || (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates)) &&
    (needs.typecheck.result == 'success' || (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates)) &&
    (needs.security.result == 'success' || (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates)) &&
    (needs.extension-tests.result == 'success' ||
    (needs.extension-tests.result == 'skipped' && needs.changes.outputs.extensions != 'true') ||
    (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates))
  runs-on: ubuntu-latest
  timeout-minutes: 35
  permissions:
    contents: read
  steps:
    - name: 📦 Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: 🥟 Setup Environment
      uses: ./.github/actions/setup-bun
    - name: 🔧 Build TipTap Extensions
      uses: ./.github/actions/build-extensions
    # Each NEXT_PUBLIC_* value falls back to a local placeholder when the
    # corresponding secret is not set.
    - name: 🏗️ Build Webapp
      env:
        NEXT_PUBLIC_SUPABASE_URL: ${{ secrets.NEXT_PUBLIC_SUPABASE_URL || 'http://localhost:54321' }}
        NEXT_PUBLIC_SUPABASE_ANON_KEY: ${{ secrets.NEXT_PUBLIC_SUPABASE_ANON_KEY || 'dummy-key' }}
      run: bun run --filter @docs.plus/webapp build:ci
    - name: 🏗️ Build Admin Dashboard
      env:
        NEXT_PUBLIC_SUPABASE_URL: ${{ secrets.NEXT_PUBLIC_SUPABASE_URL || 'http://localhost:54321' }}
        NEXT_PUBLIC_SUPABASE_ANON_KEY: ${{ secrets.NEXT_PUBLIC_SUPABASE_ANON_KEY || 'dummy-key' }}
        NEXT_PUBLIC_API_URL: ${{ secrets.NEXT_PUBLIC_API_URL || 'http://localhost:3003' }}
        NEXT_PUBLIC_APP_URL: ${{ secrets.NEXT_PUBLIC_APP_URL || 'http://localhost:3000' }}
      run: bun run --filter @docs.plus/admin-dashboard build:ci
# ===========================================================================
# STAGE 2.5 — DEPLOY GATE (precise commit-message parsing)
# ===========================================================================
# Replaces the previous loose `contains(...)` chain in the deploy job's `if:`.
# The old check matched any commit whose body contained both "build" and
# "front" or "back" anywhere (e.g. "fix iOS back gesture build crash" would
# have triggered a production deploy). This job parses the convention
# documented in AGENTS.md (`(build): front|back|front back`) with a real
# regex and surfaces the result as a job output.
# ===========================================================================
deploy-gate:
  name: 🚦 Deploy Gate
  needs: [build]
  # Only after a successful build, and only for pushes or a manual run that
  # explicitly asks for a forced deploy.
  if: |
    always() &&
    needs.build.result == 'success' &&
    (github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && inputs.force_deploy))
  runs-on: ubuntu-latest
  timeout-minutes: 2
  permissions:
    contents: read
  # The deploy job reads `deploy`; `reason` is exposed alongside it.
  outputs:
    deploy: ${{ steps.gate.outputs.deploy }}
    reason: ${{ steps.gate.outputs.reason }}
  steps:
    - name: 📦 Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: 🚦 Evaluate trigger
      id: gate
      # Event data is passed through env rather than interpolated into the
      # script text.
      env:
        EVENT_NAME: ${{ github.event_name }}
        REF: ${{ github.ref }}
        FORCE_DEPLOY: ${{ inputs.force_deploy }}
        COMMIT_MSG: ${{ github.event.head_commit.message }}
      run: |
        set -euo pipefail
        # 1) Manual override: a forced dispatch always deploys.
        if [ "${EVENT_NAME}" = "workflow_dispatch" ] && [ "${FORCE_DEPLOY}" = "true" ]; then
          echo "deploy=true" >> "$GITHUB_OUTPUT"
          echo "reason=workflow_dispatch+force_deploy" >> "$GITHUB_OUTPUT"
          echo "✅ Deploying: workflow_dispatch with force_deploy=true"
          exit 0
        fi
        # 2) Anything that is not a push to main never deploys.
        if [ "${EVENT_NAME}" != "push" ] || [ "${REF}" != "refs/heads/main" ]; then
          echo "deploy=false" >> "$GITHUB_OUTPUT"
          echo "reason=non-main push" >> "$GITHUB_OUTPUT"
          echo "ℹ️ Skipping deploy: not a push to main"
          exit 0
        fi
        # 3) Otherwise the repository script decides. This step writes no
        #    outputs of its own in either branch below.
        #    NOTE(review): presumably the script writes deploy/reason to
        #    GITHUB_OUTPUT itself — confirm in evaluate-deploy-trigger.sh.
        if bash .github/scripts/evaluate-deploy-trigger.sh prod; then
          echo "✅ Deploying: matched (build): front|back convention"
        else
          echo "ℹ️ Skipping deploy: commit does not match '(build): front|back|front back'"
          echo "ℹ️ Subject line was:"
          printf '%s\n' "${COMMIT_MSG}" | head -1
        fi
# ===========================================================================
# STAGE 3 — PRODUCTION DEPLOYMENT
# ===========================================================================
# IMPORTANT: this job intentionally opts OUT of cancel-in-progress at the job
# level. Mid-deploy SIGTERM during `docker compose up --scale` can leave the
# cluster with a mix of old+new containers and corrupt blue-green state.
# ===========================================================================
deploy:
  name: 🚀 Deploy Production
  runs-on: prod.docs.plus
  timeout-minutes: 30
  needs: [deploy-gate]
  # Separate concurrency group with cancel-in-progress: false. If two pushes
  # arrive close together, the second waits for the first to finish.
  concurrency:
    group: ${{ github.workflow }}-deploy
    cancel-in-progress: false
  # An `if:` without a status function carries an implicit `success()`.
  # NOTE(review): that check is understood to cover transitive ancestors too,
  # so a skipped `extension-tests` (the normal case when no extension paths
  # changed) would skip this job even though the gate reported deploy=true —
  # confirm against the status-check documentation. `!cancelled()` replaces the
  # implicit check, and the gate's own result is verified explicitly, so a
  # cancelled run or a failed/skipped gate still never deploys.
  if: ${{ !cancelled() && needs.deploy-gate.result == 'success' && needs.deploy-gate.outputs.deploy == 'true' }}
  environment:
    name: production
    url: https://docs.plus
  permissions:
    contents: read
  steps:
    - name: 📦 Checkout Code
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # Only the commit being deployed is fetched.
        fetch-depth: 1
- name: 🔐 Prepare Environment
  run: |
    # Compose `--env-file` is the single source of truth. We do NOT also
    # `set -a; source` it elsewhere — that path was double-loading and
    # leaking vars to subshells unintentionally.
    cp "${ENV_SOURCE}" "${ENV_FILE}"
    # If the host env file does not end with a newline, a plain append would
    # glue DEPLOY_TAG onto its last variable. Terminate the last line first.
    if [ -s "${ENV_FILE}" ] && [ -n "$(tail -c 1 "${ENV_FILE}")" ]; then
      echo "" >> "${ENV_FILE}"
    fi
    echo "DEPLOY_TAG=${DEPLOY_TAG}" >> "${ENV_FILE}"
    # Stash the previous successful tag (if any) for the rollback step.
    mkdir -p "${DEPLOY_STATE_DIR}"
    if [ -f "${LAST_GOOD_TAG_FILE}" ]; then
      # First line only, whitespace removed: the value is written to
      # GITHUB_ENV, where a stray extra line would define another variable.
      PREVIOUS_TAG=$(head -n 1 "${LAST_GOOD_TAG_FILE}" | tr -d '[:space:]')
      echo "PREVIOUS_TAG=${PREVIOUS_TAG}" >> "$GITHUB_ENV"
      echo "ℹ️ Previous good tag: ${PREVIOUS_TAG}"
    else
      echo "PREVIOUS_TAG=" >> "$GITHUB_ENV"
      echo "ℹ️ No previous good tag stashed (first deploy or fresh state dir)"
    fi
    echo "✅ Environment ready"
- name: 💾 Pre-deploy disk guard
  run: |
    echo "📊 Disk before prune:"
    df -h / | tail -1
    # Reclaim space BEFORE building: --no-cache builds can otherwise fill the
    # root volume between deploys and kill the build step with OOM/ENOSPC
    # (job ends in <2min with no error). Both prunes are best-effort and only
    # touch unused images / build cache older than 24h.
    docker image prune -af --filter "until=24h" 2>/dev/null || true
    docker builder prune -af --filter "until=24h" 2>/dev/null || true
    # Hard floor: below 10 GB free we stop here, loudly, instead of failing
    # silently in the middle of a build. `df --output=avail` reports 1K blocks.
    AVAIL_KB=$(df --output=avail / | tail -1)
    AVAIL_GB=$((AVAIL_KB / 1024 / 1024))
    echo "📊 Disk after prune: ${AVAIL_GB} GB free"
    if [ "${AVAIL_GB}" -lt 10 ]; then
      echo "::error::Less than 10 GB free on /. Aborting deploy. SSH to host and run 'docker system prune -af --volumes'."
      df -h /
      docker system df
      exit 1
    fi
- name: 📂 Verify build context (monorepo root)
  run: |
    # The shared email-templates package must be present in the checkout...
    if [ ! -d packages/email-templates ]; then
      echo "::error::packages/email-templates missing. Build context must be repo root (context: .). Check checkout includes the workspace."
      exit 1
    fi
    # ...and both Dockerfiles must mention it.
    for DOCKERFILE in \
      apps/hocuspocus.server/docker/Dockerfile.bun \
      apps/webapp/docker/Dockerfile.bun; do
      if ! grep -q 'email-templates' "${DOCKERFILE}"; then
        echo "::error::${DOCKERFILE} must COPY packages/email-templates."
        exit 1
      fi
    done
    echo "✅ Build context OK (repo root, email-templates present)"
- name: 🏗️ Build Docker Images
  env:
    DOCKER_BUILDKIT: '1'
    COMPOSE_DOCKER_CLI_BUILD: '1'
  run: |
    echo "🔨 Building images with tag: ${DEPLOY_TAG}"
    # Backend images. hocuspocus-server and hocuspocus-worker share the
    # `docsplus-hocuspocus` image, so only hocuspocus-server is built here;
    # the worker reuses the resulting tag at `up` time. Building both with
    # --no-cache would duplicate context transfer and tie up the bake plan.
    #
    # --no-cache stays as long as the prod entrypoint script changes are
    # layered late in the Dockerfile and layer ordering is not yet stable.
    # Once the entrypoint COPY is the last layer it can be dropped, saving
    # ~5 min per deploy.
    docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
      build --no-cache rest-api hocuspocus-server
    # Frontend images (this invocation does not pass --no-cache).
    docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
      build --parallel webapp admin-dashboard
    echo "✅ Images built"
- name: 🔧 Ensure Infrastructure
  run: |
    echo "🔧 Ensuring infrastructure..."
    # Creating the shared network is best-effort: it usually exists already.
    docker network create docsplus-network 2>/dev/null || true
    # Bring up Traefik and Redis; --no-recreate leaves running ones untouched.
    docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
      up -d --no-recreate traefik redis
    # Second attempt for Traefik only, if it is still not running.
    if ! docker ps --filter "name=traefik" --filter "status=running" --format '{{.Names}}' | grep -q traefik; then
      echo "⚠️ Traefik not running, starting..."
      docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" up -d traefik
      sleep 15
    fi
    # Poll for a healthy Traefik: 30 attempts, 2s apart. A timeout only logs
    # a warning; the deploy continues regardless.
    echo "⏳ Waiting for Traefik..."
    for i in {1..30}; do
      if docker ps --filter "name=traefik" --filter "health=healthy" --format '{{.Names}}' | grep -q traefik; then
        echo "✅ Traefik healthy"
        break
      fi
      [ "${i}" -eq 30 ] && echo "⚠️ Traefik health timeout, continuing..."
      sleep 2
    done
- name: 🚀 Deploy Services (Blue-Green)
  run: |
    echo "🚀 Starting zero-downtime deployment..."
    # deploy_service SERVICE TARGET
    #   Scales SERVICE up by TARGET extra containers, waits for the new ones
    #   to report healthy, then scales back down to TARGET.
    deploy_service() {
      local SERVICE="$1"
      local TARGET="$2"
      local CURRENT BASELINE_HEALTHY
      CURRENT=$(docker ps --filter "label=com.docker.compose.service=${SERVICE}" -q | wc -l | tr -d ' ')
      # Healthy containers that exist BEFORE the scale-up. The wait below must
      # see TARGET healthy containers on top of these; comparing the total
      # against TARGET alone was satisfied instantly by the old containers,
      # so the scale-down never actually waited for the new ones.
      BASELINE_HEALTHY=$(docker ps --filter "label=com.docker.compose.service=${SERVICE}" --filter "health=healthy" -q | wc -l | tr -d ' ')
      local SCALE_UP=$((CURRENT + TARGET))
      local REQUIRED=$((BASELINE_HEALTHY + TARGET))
      echo ""
      echo "📦 Deploying ${SERVICE} (current: ${CURRENT}, target: ${TARGET})..."
      # Scale UP first (keeps old containers serving traffic)
      if [ "${SCALE_UP}" -gt "${CURRENT}" ]; then
        echo "⬆️ Scaling up to ${SCALE_UP}..."
        docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
          up -d --no-deps --scale "${SERVICE}=${SCALE_UP}" "${SERVICE}"
        # Wait for healthy. 60×2s = 120s — Next.js cold start can hit
        # 60-90s right after a --no-cache build. Was 30×2s = 60s before
        # which produced false-fail rollbacks.
        echo "⏳ Waiting for healthy containers..."
        local HEALTHY=0
        local READY=0
        for i in {1..60}; do
          HEALTHY=$(docker ps --filter "label=com.docker.compose.service=${SERVICE}" --filter "health=healthy" -q | wc -l | tr -d ' ')
          if [ "${HEALTHY}" -ge "${REQUIRED}" ]; then
            echo "✅ ${HEALTHY} healthy containers"
            READY=1
            break
          fi
          [ $((i % 10)) -eq 0 ] && echo " ... ${HEALTHY}/${REQUIRED} healthy (attempt ${i}/60)"
          sleep 2
        done
        # Still best-effort, as before: a timeout does not abort here.
        # It is now reported instead of passing silently.
        if [ "${READY}" -ne 1 ]; then
          echo "::warning::${SERVICE}: only ${HEALTHY}/${REQUIRED} containers healthy after 120s; continuing to scale down."
        fi
      fi
      # Scale to target (compose removes old containers)
      echo "📏 Scaling to target: ${TARGET}..."
      docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
        up -d --no-deps --scale "${SERVICE}=${TARGET}" "${SERVICE}"
      sleep 2
    }
    deploy_service "webapp" 2
    deploy_service "rest-api" 2
    deploy_service "hocuspocus-server" 2
    deploy_service "hocuspocus-worker" 1
    deploy_service "admin-dashboard" 1
    echo ""
    echo "✅ All services deployed"
- name: 🩺 Verify Deployment
  run: |
    echo "🩺 Verifying deployment..."
    sleep 10
    # Infrastructure check
    echo "📊 Infrastructure:"
    for svc in traefik docsplus-redis; do
      if docker ps --filter "name=${svc}" --filter "status=running" --format '{{.Names}}' | grep -q "${svc}"; then
        echo " ✅ ${svc}: running"
      else
        echo " ❌ ${svc}: NOT running"
        docker logs "${svc}" --tail 30 2>/dev/null || true
        exit 1
      fi
    done
    # Service check: at least one running container per service is required.
    # The healthy count is printed for information only.
    echo "📊 Services:"
    for svc in webapp rest-api hocuspocus-server hocuspocus-worker admin-dashboard; do
      RUNNING=$(docker ps --filter "label=com.docker.compose.service=${svc}" --filter "status=running" --format "{{.Names}}" | wc -l)
      HEALTHY=$(docker ps --filter "label=com.docker.compose.service=${svc}" --filter "health=healthy" --format "{{.Names}}" | wc -l)
      if [ "${RUNNING}" -gt 0 ]; then
        echo " ✅ ${svc}: ${RUNNING} running, ${HEALTHY} healthy"
      else
        echo " ❌ ${svc}: NOT running"
        exit 1
      fi
    done
    # Internal smoke test — hit container health endpoints via the
    # docker network, NOT via the public DNS+TLS stack. A transient
    # ACME / Let's Encrypt hiccup must not trigger a false-fail rollback.
    echo ""
    echo "🔍 Internal smoke tests..."
    # smoke SERVICE PORT PATH: runs a fetch inside a SERVICE container
    # against localhost and returns non-zero unless the response is OK.
    # A failing call aborts this step (bash -e).
    smoke() {
      local SVC="$1" PORT="$2" PATH_="$3"
      if docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" exec -T "${SVC}" \
        bun -e "fetch('http://localhost:${PORT}${PATH_}').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"; then
        echo " ✅ ${SVC} internal health"
      else
        echo " ❌ ${SVC} internal health"
        return 1
      fi
    }
    smoke webapp 3000 /api/health
    smoke rest-api 4000 /health
    smoke hocuspocus-server 4001 /health
    smoke hocuspocus-worker 4002 /health
    smoke admin-dashboard 3100 /api/health
    # Public-URL probe is now informational only — does NOT fail the deploy.
    # Real public-availability monitoring belongs in uptime-kuma, not here.
    #
    # No `-f` and no `|| echo "000"`: curl already prints the status through
    # -w (000 when the connection fails), so the old form logged concatenated
    # codes such as 404000 or 000000. `|| true` only absorbs the exit status.
    echo ""
    echo "🌐 Public URL probe (informational):"
    PUBLIC_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 https://docs.plus/ 2>/dev/null || true)
    echo " https://docs.plus/ → ${PUBLIC_CODE:-000}"
    API_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 https://prodback.docs.plus/api/health 2>/dev/null || true)
    echo " https://prodback.docs.plus/api/health → ${API_CODE:-000}"
    echo ""
    echo "✅ Deployment verified"
- name: 📁 Sync compose files for break-glass
  if: success()
  run: |
    # Copy-only step. Its predecessor ("Sync Production Directory") re-`up`'d
    # services from a different cwd, which broke the blue-green guarantee.
    # The active compose + env files are copied to a stable path so that a
    # human on the host does not need to know the runner's _work directory:
    #   cd /opt/projects/prod.docs.plus/.deploy/current
    #   docker compose -f docker-compose.prod.yml --env-file .env.production ps
    mkdir -p "${DEPLOY_STATE_DIR}/current"
    cp "${COMPOSE_FILE}" "${DEPLOY_STATE_DIR}/current/${COMPOSE_FILE}"
    cp "${ENV_FILE}" "${DEPLOY_STATE_DIR}/current/${ENV_FILE}"
    echo "✅ Synced compose+env to ${DEPLOY_STATE_DIR}/current/"
- name: 💾 Stash this tag as last-good
  # Success path only; failures are handled by the rollback step.
  if: success()
  run: |
    # The tag written here is the next deploy's rollback target.
    mkdir -p "${DEPLOY_STATE_DIR}"
    # Keep the outgoing value as last-good-tag.previous for one-step-back
    # debugging.
    if [ -f "${LAST_GOOD_TAG_FILE}" ]; then
      cp "${LAST_GOOD_TAG_FILE}" "${LAST_GOOD_TAG_FILE}.previous"
    fi
    echo "${DEPLOY_TAG}" > "${LAST_GOOD_TAG_FILE}"
    echo "✅ Stashed last-good-tag = ${DEPLOY_TAG}"
- name: 🧹 Cleanup
  if: success()
  continue-on-error: true # cleanup failure shouldn't fail an otherwise green deploy
  run: |
    # Dangling images only (no -a), so tagged images are left in place.
    # NOTE(review): the filtered prune selects a subset of what the
    # unfiltered one already removed — confirm whether both are intended.
    docker image prune -f
    docker image prune -f --filter "until=24h" 2>/dev/null || true
    echo "✅ Cleanup complete"
- name: 📊 Summary
  if: success()
  # Purely informational. A failure here must not turn a verified deploy red,
  # because that would fire the `failure()` rollback step that follows.
  continue-on-error: true
  run: |
    echo "======================================"
    echo "✅ DEPLOYMENT SUCCESSFUL"
    echo "======================================"
    echo "Tag: ${DEPLOY_TAG}"
    echo "Previous tag: ${PREVIOUS_TAG:-<none>}"
    echo ""
    echo "Services:"
    # grep exits 1 when nothing matches, which would fail the step under
    # bash -eo pipefail; the listing is best-effort.
    docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | grep -E "(traefik|docsplus|webapp|rest-api|hocuspocus)" | head -15 || true
    echo ""
    echo "URLs:"
    echo " - https://docs.plus"
    echo " - https://prodback.docs.plus"
    echo "======================================"
- name: 🚨 Rollback on Failure
  if: failure()
  run: |
    echo "⚠️ Deployment failed — attempting rollback..."
    if [ -z "${PREVIOUS_TAG:-}" ]; then
      echo "::warning::No PREVIOUS_TAG stashed — cannot auto-rollback."
      echo "📊 Current state:"
      docker ps --format "table {{.Names}}\t{{.Status}}" | head -15
      exit 0
    fi
    # `failure()` also fires when a step BEFORE the rollout failed (disk
    # guard, build-context check, image build). Production is untouched in
    # that case, and a --force-recreate of every service would cause downtime
    # for nothing. Only roll back if some container was created from this
    # run's tag.
    # NOTE(review): assumes compose image references end in ":${DEPLOY_TAG}",
    # as the image inspect below does for PREVIOUS_TAG — confirm against
    # docker-compose.prod.yml.
    DEPLOYED_IMAGES=$(docker ps -a --format '{{.Image}}' 2>/dev/null || true)
    if ! grep -qF -- ":${DEPLOY_TAG}" <<< "${DEPLOYED_IMAGES}"; then
      echo "ℹ️ No container was created from tag ${DEPLOY_TAG}; production was not modified. Skipping rollback."
      docker ps --format "table {{.Names}}\t{{.Status}}" | head -15
      exit 0
    fi
    echo "↩️ Rolling back to: ${PREVIOUS_TAG}"
    # Multi-image precondition (A2): ALL service images for the previous
    # tag must still exist locally. The cleanup step honors --filter
    # until=24h, so within a 24h window this works reliably; outside
    # that window we fail loudly rather than partially-rollback into a
    # mixed-version cluster.
    MISSING=()
    for img in docsplus-webapp docsplus-rest-api docsplus-hocuspocus docsplus-admin; do
      if ! docker image inspect "${img}:${PREVIOUS_TAG}" >/dev/null 2>&1; then
        MISSING+=("${img}:${PREVIOUS_TAG}")
      fi
    done
    if [ "${#MISSING[@]}" -gt 0 ]; then
      echo "::error::Cannot auto-rollback. Missing images for previous tag:"
      for img in "${MISSING[@]}"; do
        echo " - ${img}"
      done
      echo "Manual recovery: bring traffic back via the existing healthy containers."
      docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" up -d --no-recreate 2>/dev/null || true
      exit 1
    fi
    # Override DEPLOY_TAG in the env file and re-deploy with previous images.
    sed -i.bak "s|^DEPLOY_TAG=.*|DEPLOY_TAG=${PREVIOUS_TAG}|" "${ENV_FILE}"
    rm -f "${ENV_FILE}.bak"
    docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
      up -d --force-recreate \
      webapp rest-api hocuspocus-server hocuspocus-worker admin-dashboard
    # Post-rollback verification (A3). Give containers a moment to bind
    # ports + pass first healthcheck, then run the same internal smoke
    # set the forward path runs. If rollback itself can't come healthy,
    # we want the workflow to fail RED so the on-call sees it instead
    # of a misleading "rollback complete" green check.
    echo ""
    echo "⏳ Waiting 30s for rolled-back containers to settle..."
    sleep 30
    # smoke SERVICE PORT PATH: same in-container probe as the verify step;
    # returns non-zero unless the response is OK.
    smoke() {
      local SVC="$1" PORT="$2" PATH_="$3"
      if docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" exec -T "${SVC}" \
        bun -e "fetch('http://localhost:${PORT}${PATH_}').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"; then
        echo " ✅ ${SVC} internal health (post-rollback)"
        return 0
      else
        echo " ❌ ${SVC} internal health (post-rollback)"
        return 1
      fi
    }
    # Run every probe, even after a failure, so the log shows the full picture.
    ROLLBACK_OK=1
    smoke webapp 3000 /api/health || ROLLBACK_OK=0
    smoke rest-api 4000 /health || ROLLBACK_OK=0
    smoke hocuspocus-server 4001 /health || ROLLBACK_OK=0
    smoke hocuspocus-worker 4002 /health || ROLLBACK_OK=0
    smoke admin-dashboard 3100 /api/health || ROLLBACK_OK=0
    echo ""
    echo "📊 Post-rollback state:"
    docker ps --format "table {{.Names}}\t{{.Status}}" | head -15
    if [ "${ROLLBACK_OK}" -ne 1 ]; then
      echo "::error::Rollback to ${PREVIOUS_TAG} did not pass smoke tests. Manual intervention required."
      exit 1
    fi
    # Production now runs PREVIOUS_TAG again. Make the stash say so, in case
    # the failing step came after this run's tag had already been stashed.
    mkdir -p "${DEPLOY_STATE_DIR}"
    echo "${PREVIOUS_TAG}" > "${LAST_GOOD_TAG_FILE}"
    echo "✅ Rollback to ${PREVIOUS_TAG} verified healthy"
# ===========================================================================
# UPTIME KUMA (optional monitoring service)
# ===========================================================================
uptime-kuma-gate:
  name: 🚦 Uptime Kuma Gate
  # Independent of the main pipeline: no `needs`, pushes to main only.
  if: github.event_name == 'push' && github.ref == 'refs/heads/main'
  runs-on: ubuntu-latest
  timeout-minutes: 2
  permissions:
    contents: read
  outputs:
    deploy: ${{ steps.gate.outputs.deploy }}
  steps:
    - name: 📦 Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: 🚦 Evaluate trigger
      id: gate
      env:
        COMMIT_MSG: ${{ github.event.head_commit.message }}
      run: |
        set -euo pipefail
        # The repository script decides; this step only logs the outcome and
        # writes no outputs of its own.
        # NOTE(review): presumably the script writes `deploy` to
        # GITHUB_OUTPUT itself — confirm in evaluate-deploy-trigger.sh.
        if bash .github/scripts/evaluate-deploy-trigger.sh uptime-kuma; then
          echo "✅ Uptime Kuma: matched (build): uptime-kuma convention"
        else
          echo "ℹ️ Skipping Uptime Kuma: commit does not match '(build): uptime-kuma'"
        fi
deploy-uptime-kuma:
  name: 🔔 Deploy Uptime Kuma
  runs-on: prod.docs.plus
  timeout-minutes: 10
  needs: [uptime-kuma-gate]
  if: needs.uptime-kuma-gate.outputs.deploy == 'true'
  permissions:
    contents: read
  steps:
    - name: 🚀 Deploy
      run: |
        # Pinned by digest (not :latest) so the same uptime-kuma version
        # is reproducible across redeploys. To bump, look up new digest:
        # docker pull louislam/uptime-kuma:1 && docker inspect ...
        UPTIME_KUMA_IMAGE='louislam/uptime-kuma:1@sha256:bb1bcecbc3e3ffb1cb0f8fc5f9c3cdaa78c1dfb56d98d64e06da13ebfc6dba0d'
        # Pull BEFORE touching the running container: if the digest cannot
        # be pulled, the step fails here and the existing monitor stays up.
        docker pull "${UPTIME_KUMA_IMAGE}"
        docker network create docsplus-network 2>/dev/null || true
        # Replace the container; stop/rm are best-effort on a first deploy.
        # Data lives in the named volume mounted below.
        docker stop uptime-kuma 2>/dev/null || true
        docker rm uptime-kuma 2>/dev/null || true
        docker run -d \
          --name uptime-kuma \
          --network docsplus-network \
          --restart unless-stopped \
          -v uptime-kuma-data:/app/data \
          --label "traefik.enable=true" \
          --label "traefik.http.routers.uptime.rule=Host(\`status.docs.plus\`)" \
          --label "traefik.http.routers.uptime.entrypoints=websecure" \
          --label "traefik.http.routers.uptime.tls.certresolver=letsencrypt" \
          --label "traefik.http.services.uptime.loadbalancer.server.port=3001" \
          "${UPTIME_KUMA_IMAGE}"
        sleep 15
        # Do not report success for a container that exited during startup.
        if [ "$(docker inspect -f '{{.State.Running}}' uptime-kuma 2>/dev/null || echo false)" != "true" ]; then
          echo "::error::uptime-kuma container is not running after start."
          docker logs uptime-kuma --tail 30 2>/dev/null || true
          exit 1
        fi
        echo "✅ Uptime Kuma deployed at https://status.docs.plus"