(build): front #535
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # ============================================================================= | |
| # Production CI/CD Pipeline — Quality Gates + Blue-Green Deployment | |
| # ============================================================================= | |
| # | |
| # Pipeline: | |
| # 1. Quality Gates (parallel): lint, typecheck, security, extension tests (path-gated) | |
| # 2. Build Verification: extension + Next.js builds (smoke test) | |
| # 3. Deploy Gate: parses commit message with a strict regex (anti-footgun) | |
| # 4. Deploy: docker compose build on the prod host → blue-green rollout | |
| # | |
| # Triggers: | |
| # - push to main: full pipeline. Deploy gated by commit-message convention | |
| # `(build): front`, `(build): back`, or `(build): front back` parsed by | |
| # `.github/scripts/evaluate-deploy-trigger.sh prod` in `deploy-gate`. | |
| # Uptime Kuma deploy uses `(build): uptime-kuma` via the same script's | |
| # `uptime-kuma` mode in `uptime-kuma-gate`. | |
| # - pull_request to main: quality gates + build verification only. | |
| # - schedule (Sunday 00:00 UTC): security audit only. | |
| # - workflow_dispatch: manual run with `force_deploy` / `skip_quality_gates`. | |
| # | |
| # Architectural notes (decided 2026-05, see commit history): | |
| # - Images are built on the prod self-hosted runner (`prod.docs.plus`). | |
| # Disk pressure is mitigated by a pre-build disk guard, not by pushing | |
| # to a registry. If pressure resurfaces, revisit M3 (ghcr.io push). | |
| # - Rollback uses an on-disk tag stash on the prod host: | |
| # /opt/projects/prod.docs.plus/.deploy/last-good-tag. | |
| # - All third-party actions (workflow + composite) are pinned to commit | |
| # SHA. Renovate/Dependabot should bump them; never use floating tags. | |
| # ============================================================================= | |
| # Workflow name and the four events that start a run. | |
| name: CI/CD Production | |
| on: | |
| push: | |
| branches: [main] | |
| pull_request: | |
| branches: [main] | |
| schedule: | |
| # Weekly security scan (Sunday 00:00 UTC) | |
| - cron: '0 0 * * 0' | |
| workflow_dispatch: | |
| # Manual-run switches. Both default to false and are only honoured in | |
| # conditions that also check github.event_name == 'workflow_dispatch'. | |
| inputs: | |
| skip_quality_gates: | |
| description: 'Skip quality gates (emergency deploy)' | |
| required: false | |
| default: false | |
| type: boolean | |
| force_deploy: | |
| description: 'Force deployment (bypass commit-message gate)' | |
| required: false | |
| default: false | |
| type: boolean | |
| # Workflow-level concurrency. | |
| # A workflow-level group with cancel-in-progress cancels the WHOLE run (every | |
| # job, including a deploy that is already rolling containers) as soon as a | |
| # newer run joins the same group. A job-level `concurrency` block cannot opt a | |
| # job out of that, so the group key is built per event: | |
| # - pull_request: keyed by ref, so a new push to the PR cancels the stale run | |
| # (cheap to redo, and PR runs never reach the deploy jobs). | |
| # - push / schedule / workflow_dispatch: keyed by run_id, i.e. unique per run, | |
| # so nothing can cancel a run that may be mid-deploy (mid-deploy SIGTERM | |
| # corrupts blue-green state). | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.ref || github.run_id }}-quality | |
| cancel-in-progress: true | |
| # Default to bash so set -e/-o pipefail behavior is consistent across runners. | |
| defaults: | |
| run: | |
| shell: bash | |
| # Workflow-level least privilege; per-job overrides where needed. | |
| permissions: | |
| contents: read | |
| # Shared by every job; the path values below refer to the prod host's filesystem. | |
| env: | |
| # Host-side env file; the deploy job copies it into the workspace as ENV_FILE. | |
| ENV_SOURCE: /opt/projects/prod.docs.plus/.env | |
| ENV_FILE: .env.production | |
| COMPOSE_FILE: docker-compose.prod.yml | |
| # The deploy job appends this value to ENV_FILE as DEPLOY_TAG (commit SHA of the run). | |
| DEPLOY_TAG: ${{ github.sha }} | |
| # Where the prod host stashes the last successfully deployed SHA for rollback. | |
| DEPLOY_STATE_DIR: /opt/projects/prod.docs.plus/.deploy | |
| # Keep in sync with DEPLOY_STATE_DIR (same directory, spelled out in full). | |
| LAST_GOOD_TAG_FILE: /opt/projects/prod.docs.plus/.deploy/last-good-tag | |
| jobs: | |
| # =========================================================================== | |
| # STAGE 0 — CHANGE DETECTION (cheap; gates the expensive extension suite) | |
| # =========================================================================== | |
| # The clean-room extension suite (~14 min) is the pipeline's long pole and is | |
| # a hard deploy gate. It only needs to run when something that affects an | |
| # extension build/test actually changed. This job emits a boolean the | |
| # extension-tests job keys off; lint/typecheck/security stay always-on | |
| # because they are fast and repo-global (typecheck still catches extension | |
| # TYPE regressions on every push even when the Cypress suite is skipped). | |
| # =========================================================================== | |
| # Emits the `extensions` output; extension-tests and build compare it against | |
| # the string 'true'. Skipped on the weekly schedule run. | |
| changes: | |
| name: 🔎 Detect Changes | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 5 | |
| if: github.event_name != 'schedule' | |
| permissions: | |
| contents: read | |
| outputs: | |
| extensions: ${{ steps.filter.outputs.extensions }} | |
| steps: | |
| - name: 📦 Checkout | |
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
| # Filter definitions live in the repository, not inline in this workflow. | |
| - name: 🔎 Filter extension-affecting paths | |
| id: filter | |
| uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 | |
| with: | |
| filters: .github/filters/extensions.yaml | |
| # =========================================================================== | |
| # STAGE 1 — QUALITY GATES (parallel, fast feedback) | |
| # =========================================================================== | |
| # Lint, formatting and stylesheet checks. Skipped on the weekly schedule run. | |
| lint: | |
| name: 🔍 Lint & Format | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 10 | |
| if: github.event_name != 'schedule' | |
| permissions: | |
| contents: read | |
| steps: | |
| - name: 📦 Checkout | |
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
| - name: 🥟 Setup Environment | |
| uses: ./.github/actions/setup-bun | |
| # All three commands share one step, so the first failing command ends it. | |
| # NOTE(review): confirm `bun run format` is a check and not a rewrite in CI. | |
| - name: 🔍 Lint, format & styles | |
| run: | | |
| bun run lint | |
| bun run format | |
| bun run lint:styles | |
| # Repo-wide type check. Skipped on the weekly schedule run. | |
| typecheck: | |
| name: 📝 Type Check | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 15 | |
| if: github.event_name != 'schedule' | |
| permissions: | |
| contents: read | |
| steps: | |
| - name: 📦 Checkout | |
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
| - name: 🥟 Setup Environment | |
| uses: ./.github/actions/setup-bun | |
| # Extensions are built before the type check because it needs their types. | |
| - name: 🔧 Build Extensions (required for types) | |
| uses: ./.github/actions/build-extensions | |
| - name: 📝 Type Check All | |
| run: bun run typecheck | |
| # Dependency audit gate: fails on any critical/high finding. | |
| # No `if:` on purpose: this is the only job that also runs on the weekly schedule. | |
| security: | |
| name: 🔒 Security Audit | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 10 | |
| permissions: | |
| contents: read | |
| steps: | |
| - name: 📦 Checkout | |
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
| - name: 🥟 Setup Environment | |
| uses: ./.github/actions/setup-bun | |
| with: | |
| ignore-scripts: 'true' | |
| - name: 🔍 Bun Audit | |
| run: | | |
| set -o pipefail | |
| echo "🔍 Checking for known vulnerabilities..." | |
| # Human-readable report. `bun pm audit` exits non-zero when vulns exist, so | |
| # its status is ignored here; the gate is decided from the JSON below. | |
| bun pm audit 2>&1 | tee audit-results.txt || true | |
| # Machine-readable report. Keep whatever JSON was written even when the | |
| # command exits non-zero: a `|| echo '{}' > file` fallback would overwrite | |
| # the report with `{}` exactly when vulnerabilities are found, zeroing the | |
| # counts and letting the gate pass. | |
| AUDIT_EXIT=0 | |
| bun pm audit --json > audit-results.json 2>/dev/null || AUDIT_EXIT=$? | |
| # Fall back to an empty object only when nothing parseable was produced. | |
| if ! bun -e 'JSON.parse(require("fs").readFileSync("audit-results.json","utf8"))' >/dev/null 2>&1; then | |
| echo '{}' > audit-results.json | |
| fi | |
| # Structured parsing — replaces brittle `grep -ci "critical"` which | |
| # matched the summary header line and produced false positives. | |
| # Expected shape: { "vulnerabilities": { "critical": N, "high": N, ... } } | |
| # Also counts a per-package advisory list ({ "<pkg>": [{ "severity": "high" }] }). | |
| # NOTE(review): confirm which of the two shapes the pinned Bun version emits. | |
| count_severity() { | |
| SEVERITY="$1" bun -e ' | |
| const report = JSON.parse(require("fs").readFileSync("audit-results.json", "utf8")); | |
| const level = process.env.SEVERITY; | |
| let total = Number(report?.vulnerabilities?.[level]) || 0; | |
| for (const advisories of Object.values(report || {})) { | |
| if (Array.isArray(advisories)) { | |
| total += advisories.filter((a) => a?.severity === level).length; | |
| } | |
| } | |
| process.stdout.write(String(total)); | |
| ' | |
| } | |
| CRITICAL=$(count_severity critical) | |
| HIGH=$(count_severity high) | |
| echo "" | |
| echo "📊 Summary: critical=${CRITICAL}, high=${HIGH}" | |
| # Defensive: if both are 0 AND the JSON looks empty, the audit | |
| # shape may have changed (Bun has changed it before). Print the | |
| # raw JSON head so a future regression doesn't silently neutralize | |
| # this gate. Caps at 4 KB to keep logs tidy. | |
| if [ "${CRITICAL}" -eq 0 ] && [ "${HIGH}" -eq 0 ]; then | |
| BYTES=$(wc -c < audit-results.json | tr -d ' ') | |
| if [ "${BYTES}" -lt 32 ]; then | |
| echo "::warning::audit-results.json is suspiciously small (${BYTES} bytes). Bun audit JSON shape may have changed." | |
| echo "--- audit-results.json (head 4KB) ---" | |
| head -c 4096 audit-results.json || true | |
| echo "" | |
| echo "--- end ---" | |
| fi | |
| # A non-zero exit with nothing counted means either lower-severity findings | |
| # only, or the audit itself failed. Surface it instead of passing silently. | |
| if [ "${AUDIT_EXIT}" -ne 0 ]; then | |
| echo "::warning::bun pm audit --json exited ${AUDIT_EXIT} but no critical/high findings were parsed. Check audit-results.txt." | |
| fi | |
| fi | |
| if [ "${CRITICAL}" -gt 0 ] || [ "${HIGH}" -gt 0 ]; then | |
| echo "::error::Critical/High vulnerabilities detected (critical=${CRITICAL}, high=${HIGH})" | |
| exit 1 | |
| fi | |
| echo "✅ No critical/high vulnerabilities" | |
| # always(): the reports are uploaded even when the audit step failed the job. | |
| - name: 📤 Upload Audit Results | |
| if: always() | |
| uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 | |
| with: | |
| name: security-audit-${{ github.sha }} | |
| path: | | |
| audit-results.txt | |
| audit-results.json | |
| retention-days: 30 | |
| # Clean-room extension suite plus publish preflight; gated on the `changes` output. | |
| extension-tests: | |
| name: 🧪 Extension Tests | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 45 | |
| needs: [changes] | |
| # Run only when extension-affecting paths changed. workflow_dispatch always | |
| # runs the suite (manual runs have no reliable diff base). When this job is | |
| # skipped because nothing extension-related changed, the build gate below | |
| # accepts that skip only if the changes job succeeded. | |
| if: | | |
| github.event_name != 'schedule' && | |
| (needs.changes.outputs.extensions == 'true' || github.event_name == 'workflow_dispatch') | |
| permissions: | |
| contents: read | |
| steps: | |
| - name: 📦 Checkout | |
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
| - name: 🥟 Setup Environment | |
| uses: ./.github/actions/setup-bun | |
| - name: 🔧 Build extensions (release gate) | |
| uses: ./.github/actions/build-extensions | |
| - name: 🌲 Setup Cypress binary | |
| uses: ./.github/actions/setup-cypress | |
| # EXTENSION_DIST_READY is handed to the script as an env var; presumably it | |
| # signals that the dist from the build step above can be reused — verify in | |
| # scripts/run-tests.sh. | |
| - name: 🧪 Clean-room extension suites | |
| env: | |
| EXTENSION_DIST_READY: '1' | |
| run: bash scripts/run-tests.sh --extensions | |
| - name: ✈️ Publish preflight (all five) | |
| run: bash scripts/extension-preflight.sh | |
| # =========================================================================== | |
| # STAGE 2 — BUILD VERIFICATION (smoke test, no artifacts produced) | |
| # =========================================================================== | |
| # Note: this job intentionally does NOT build Docker images. The deploy job | |
| # rebuilds them on the prod host anyway (decided 2026-05); duplicating the | |
| # docker build here would just slow the pipeline without sharing cache. The | |
| # webapp/admin Next.js compile here catches type/build regressions early. | |
| # =========================================================================== | |
| # Compiles the extensions, webapp and admin dashboard; runs only after the | |
| # quality gates pass (or are explicitly skipped on a manual run). | |
| build: | |
| name: 🏗️ Build Verification | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 35 | |
| needs: [changes, lint, typecheck, security, extension-tests] | |
| # always() keeps this condition evaluated when a dependency was skipped; | |
| # each gate must then be 'success' unless a manual run set skip_quality_gates. | |
| # A skipped extension-tests job is accepted only when `changes` reported that | |
| # no extension-affecting path changed. | |
| if: | | |
| always() && | |
| github.event_name != 'schedule' && | |
| needs.changes.result == 'success' && | |
| (needs.lint.result == 'success' || (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates)) && | |
| (needs.typecheck.result == 'success' || (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates)) && | |
| (needs.security.result == 'success' || (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates)) && | |
| (needs.extension-tests.result == 'success' || | |
| (needs.extension-tests.result == 'skipped' && needs.changes.outputs.extensions != 'true') || | |
| (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates)) | |
| permissions: | |
| contents: read | |
| steps: | |
| - name: 📦 Checkout | |
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
| - name: 🥟 Setup Environment | |
| uses: ./.github/actions/setup-bun | |
| - name: 🔧 Build TipTap Extensions | |
| uses: ./.github/actions/build-extensions | |
| # Each NEXT_PUBLIC_* value falls back to a local placeholder when the | |
| # corresponding secret is empty or unavailable, so the compile still runs. | |
| - name: 🏗️ Build Webapp | |
| run: bun run --filter @docs.plus/webapp build:ci | |
| env: | |
| NEXT_PUBLIC_SUPABASE_URL: ${{ secrets.NEXT_PUBLIC_SUPABASE_URL || 'http://localhost:54321' }} | |
| NEXT_PUBLIC_SUPABASE_ANON_KEY: ${{ secrets.NEXT_PUBLIC_SUPABASE_ANON_KEY || 'dummy-key' }} | |
| - name: 🏗️ Build Admin Dashboard | |
| run: bun run --filter @docs.plus/admin-dashboard build:ci | |
| env: | |
| NEXT_PUBLIC_SUPABASE_URL: ${{ secrets.NEXT_PUBLIC_SUPABASE_URL || 'http://localhost:54321' }} | |
| NEXT_PUBLIC_SUPABASE_ANON_KEY: ${{ secrets.NEXT_PUBLIC_SUPABASE_ANON_KEY || 'dummy-key' }} | |
| NEXT_PUBLIC_API_URL: ${{ secrets.NEXT_PUBLIC_API_URL || 'http://localhost:3003' }} | |
| NEXT_PUBLIC_APP_URL: ${{ secrets.NEXT_PUBLIC_APP_URL || 'http://localhost:3000' }} | |
| # =========================================================================== | |
| # STAGE 2.5 — DEPLOY GATE (precise commit-message parsing) | |
| # =========================================================================== | |
| # Replaces the previous loose `contains(...)` chain in the deploy job's `if:`. | |
| # The old check matched any commit whose body contained both "build" and | |
| # "front" or "back" anywhere (e.g. "fix iOS back gesture build crash" would | |
| # have triggered a production deploy). This job parses the convention | |
| # documented in AGENTS.md (`(build): front|back|front back`) with a real | |
| # regex and surfaces the result as a job output. | |
| # =========================================================================== | |
| # Decides whether the deploy job runs and exposes the verdict as the | |
| # `deploy` / `reason` job outputs. | |
| deploy-gate: | |
| name: 🚦 Deploy Gate | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 2 | |
| needs: [build] | |
| # Only pushes, or manual runs with force_deploy, get past this condition. | |
| if: | | |
| always() && | |
| needs.build.result == 'success' && | |
| (github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && inputs.force_deploy)) | |
| permissions: | |
| contents: read | |
| outputs: | |
| deploy: ${{ steps.gate.outputs.deploy }} | |
| reason: ${{ steps.gate.outputs.reason }} | |
| steps: | |
| - name: 📦 Checkout | |
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
| # Event data reaches the script through env vars rather than being | |
| # interpolated into the shell text, so a commit message cannot inject commands. | |
| # The two early exits write deploy/reason themselves; on the script path | |
| # neither branch writes an output, so presumably | |
| # evaluate-deploy-trigger.sh appends `deploy=` (and `reason=`) to | |
| # $GITHUB_OUTPUT itself — verify in the script. | |
| - name: 🚦 Evaluate trigger | |
| id: gate | |
| env: | |
| EVENT_NAME: ${{ github.event_name }} | |
| REF: ${{ github.ref }} | |
| FORCE_DEPLOY: ${{ inputs.force_deploy }} | |
| COMMIT_MSG: ${{ github.event.head_commit.message }} | |
| run: | | |
| set -euo pipefail | |
| if [ "${EVENT_NAME}" = "workflow_dispatch" ] && [ "${FORCE_DEPLOY}" = "true" ]; then | |
| echo "deploy=true" >> "$GITHUB_OUTPUT" | |
| echo "reason=workflow_dispatch+force_deploy" >> "$GITHUB_OUTPUT" | |
| echo "✅ Deploying: workflow_dispatch with force_deploy=true" | |
| exit 0 | |
| fi | |
| if [ "${EVENT_NAME}" != "push" ] || [ "${REF}" != "refs/heads/main" ]; then | |
| echo "deploy=false" >> "$GITHUB_OUTPUT" | |
| echo "reason=non-main push" >> "$GITHUB_OUTPUT" | |
| echo "ℹ️ Skipping deploy: not a push to main" | |
| exit 0 | |
| fi | |
| if bash .github/scripts/evaluate-deploy-trigger.sh prod; then | |
| echo "✅ Deploying: matched (build): front|back convention" | |
| else | |
| echo "ℹ️ Skipping deploy: commit does not match '(build): front|back|front back'" | |
| echo "ℹ️ Subject line was:" | |
| printf '%s\n' "${COMMIT_MSG}" | head -1 | |
| fi | |
| # =========================================================================== | |
| # STAGE 3 — PRODUCTION DEPLOYMENT | |
| # =========================================================================== | |
| # IMPORTANT: this job intentionally opts OUT of cancel-in-progress at the job | |
| # level. Mid-deploy SIGTERM during `docker compose up --scale` can leave the | |
| # cluster with a mix of old+new containers and corrupt blue-green state. | |
| # =========================================================================== | |
| # Runs on the self-hosted prod runner, only when deploy-gate output deploy == 'true'. | |
| deploy: | |
| name: 🚀 Deploy Production | |
| runs-on: prod.docs.plus | |
| timeout-minutes: 30 | |
| needs: [deploy-gate] | |
| # Separate concurrency group with cancel-in-progress: false. If two pushes | |
| # arrive close together, the second waits for the first to finish. | |
| # NOTE(review): this group only serializes this job. It does not shield the | |
| # job from a workflow-level cancel-in-progress — confirm against the | |
| # workflow-level `concurrency` settings. | |
| concurrency: | |
| group: ${{ github.workflow }}-deploy | |
| cancel-in-progress: false | |
| if: needs.deploy-gate.outputs.deploy == 'true' | |
| environment: | |
| name: production | |
| url: https://docs.plus | |
| permissions: | |
| contents: read | |
| steps: | |
| # Shallow checkout: only the deployed commit is fetched. | |
| - name: 📦 Checkout Code | |
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
| with: | |
| fetch-depth: 1 | |
| # Builds the compose env file in the workspace and exports PREVIOUS_TAG | |
| # (possibly empty) to later steps through $GITHUB_ENV. | |
| - name: 🔐 Prepare Environment | |
| run: | | |
| # Compose `--env-file` is the single source of truth. We do NOT also | |
| # `set -a; source` it elsewhere — that path was double-loading and | |
| # leaking vars to subshells unintentionally. | |
| cp "${ENV_SOURCE}" "${ENV_FILE}" | |
| # Leading newline: if the source file does not end with one, a plain | |
| # `echo >>` would glue DEPLOY_TAG onto the last variable's value. | |
| printf '\nDEPLOY_TAG=%s\n' "${DEPLOY_TAG}" >> "${ENV_FILE}" | |
| # Stash the previous successful tag (if any) for the rollback step. | |
| mkdir -p "${DEPLOY_STATE_DIR}" | |
| # Whitespace is stripped so a stray blank or extra line in the stash file | |
| # cannot corrupt $GITHUB_ENV; an empty file counts as "no previous tag". | |
| PREVIOUS_TAG="" | |
| if [ -f "${LAST_GOOD_TAG_FILE}" ]; then | |
| PREVIOUS_TAG=$(tr -d '[:space:]' < "${LAST_GOOD_TAG_FILE}") | |
| fi | |
| echo "PREVIOUS_TAG=${PREVIOUS_TAG}" >> "$GITHUB_ENV" | |
| if [ -n "${PREVIOUS_TAG}" ]; then | |
| echo "ℹ️ Previous good tag: ${PREVIOUS_TAG}" | |
| else | |
| echo "ℹ️ No previous good tag stashed (first deploy or fresh state dir)" | |
| fi | |
| echo "✅ Environment ready" | |
| # Prunes images and build cache older than 24h, then aborts the deploy when | |
| # the root filesystem has less than 10 GB free. AVAIL_GB is whole GB rounded | |
| # down by integer division (assumes df reports 1K blocks — TODO confirm on host). | |
| - name: 💾 Pre-deploy disk guard | |
| run: | | |
| echo "📊 Disk before prune:" | |
| df -h / | tail -1 | |
| # Free space proactively. Without this, --no-cache builds can fill | |
| # the root volume between deploys and silently OOM/ENOSPC the build | |
| # step (job ends in <2min with no error). Runs before build, not after. | |
| docker image prune -af --filter "until=24h" 2>/dev/null || true | |
| docker builder prune -af --filter "until=24h" 2>/dev/null || true | |
| # Hard guard: refuse to build when <10 GB free. Fail loud here | |
| # rather than fail silently mid-build. | |
| AVAIL_KB=$(df --output=avail / | tail -1) | |
| AVAIL_GB=$((AVAIL_KB / 1024 / 1024)) | |
| echo "📊 Disk after prune: ${AVAIL_GB} GB free" | |
| if [ "${AVAIL_GB}" -lt 10 ]; then | |
| echo "::error::Less than 10 GB free on /. Aborting deploy. SSH to host and run 'docker system prune -af --volumes'." | |
| df -h / | |
| docker system df | |
| exit 1 | |
| fi | |
| # Cheap pre-build sanity check: the email-templates workspace package must be | |
| # in the checkout, and both Dockerfiles must mention it (plain grep for the | |
| # string 'email-templates', not a parse of the COPY instructions). | |
| - name: 📂 Verify build context (monorepo root) | |
| run: | | |
| if [ ! -d packages/email-templates ]; then | |
| echo "::error::packages/email-templates missing. Build context must be repo root (context: .). Check checkout includes the workspace." | |
| exit 1 | |
| fi | |
| if ! grep -q 'email-templates' apps/hocuspocus.server/docker/Dockerfile.bun; then | |
| echo "::error::apps/hocuspocus.server/docker/Dockerfile.bun must COPY packages/email-templates." | |
| exit 1 | |
| fi | |
| if ! grep -q 'email-templates' apps/webapp/docker/Dockerfile.bun; then | |
| echo "::error::apps/webapp/docker/Dockerfile.bun must COPY packages/email-templates." | |
| exit 1 | |
| fi | |
| echo "✅ Build context OK (repo root, email-templates present)" | |
| # Builds the service images on the prod host in two compose invocations: | |
| # rest-api + hocuspocus-server with --no-cache, then webapp + admin-dashboard | |
| # with --parallel (the second invocation does not pass --no-cache). | |
| - name: 🏗️ Build Docker Images | |
| env: | |
| DOCKER_BUILDKIT: '1' | |
| COMPOSE_DOCKER_CLI_BUILD: '1' | |
| run: | | |
| echo "🔨 Building images with tag: ${DEPLOY_TAG}" | |
| # hocuspocus-server and hocuspocus-worker share `docsplus-hocuspocus`; | |
| # building both via compose with --no-cache duplicates context transfer | |
| # and ties up the bake plan. Build via hocuspocus-server only; the | |
| # worker reuses the resulting tag at `up` time. | |
| # | |
| # --no-cache: required as long as the prod entrypoint script changes | |
| # are layered late in the Dockerfile and we don't yet have stable | |
| # layer ordering. If/when entrypoint COPY moves to the last layer, | |
| # we can drop --no-cache and gain ~5 min per deploy. | |
| docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \ | |
| build --no-cache rest-api hocuspocus-server | |
| docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \ | |
| build --parallel webapp admin-dashboard | |
| echo "✅ Images built" | |
| # Makes sure the shared network, Traefik and Redis exist before services roll. | |
| # The Traefik health wait is best-effort: after 30 attempts (2s apart) it | |
| # only logs a warning and the job continues. | |
| - name: 🔧 Ensure Infrastructure | |
| run: | | |
| echo "🔧 Ensuring infrastructure..." | |
| docker network create docsplus-network 2>/dev/null || true | |
| # Start Traefik and Redis (--no-recreate keeps existing if running) | |
| docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \ | |
| up -d --no-recreate traefik redis | |
| # Force-start Traefik if somehow not running | |
| if ! docker ps --filter "name=traefik" --filter "status=running" --format '{{.Names}}' | grep -q traefik; then | |
| echo "⚠️ Traefik not running, starting..." | |
| docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" up -d traefik | |
| sleep 15 | |
| fi | |
| # Wait for healthy | |
| echo "⏳ Waiting for Traefik..." | |
| for i in {1..30}; do | |
| if docker ps --filter "name=traefik" --filter "health=healthy" --format '{{.Names}}' | grep -q traefik; then | |
| echo "✅ Traefik healthy" | |
| break | |
| fi | |
| [ "${i}" -eq 30 ] && echo "⚠️ Traefik health timeout, continuing..." | |
| sleep 2 | |
| done | |
| # Rolls each service over to the freshly built images, one service at a time. | |
| - name: 🚀 Deploy Services (Blue-Green) | |
| run: | | |
| echo "🚀 Starting zero-downtime deployment..." | |
| # deploy_service SERVICE TARGET | |
| # 1. remember the containers that exist before the rollout, | |
| # 2. scale up by TARGET so new containers start next to them, | |
| # 3. wait (up to 60×2s) until TARGET *new* containers report healthy, | |
| # 4. scale back to TARGET so compose removes the surplus. | |
| # NOTE(review): `up` is called without --no-recreate; confirm compose does not | |
| # also recreate the existing containers when the image tag changed. | |
| deploy_service() { | |
| local SERVICE="$1" | |
| local TARGET="$2" | |
| local FILTER="label=com.docker.compose.service=${SERVICE}" | |
| # Space-delimited IDs present before the rollout. They are excluded from the | |
| # health count below: counting them let the wait pass instantly whenever the | |
| # old containers were healthy, before a single new container had come up. | |
| local OLD_IDS | |
| OLD_IDS=" $(docker ps --filter "${FILTER}" -q | tr '\n' ' ')" | |
| local CURRENT | |
| CURRENT=$(wc -w <<< "${OLD_IDS}" | tr -d ' ') | |
| local SCALE_UP=$((CURRENT + TARGET)) | |
| echo "" | |
| echo "📦 Deploying ${SERVICE} (current: ${CURRENT}, target: ${TARGET})..." | |
| # Scale UP first (keeps old containers serving traffic) | |
| if [ "${SCALE_UP}" -gt "${CURRENT}" ]; then | |
| echo "⬆️ Scaling up to ${SCALE_UP}..." | |
| docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \ | |
| up -d --no-deps --scale "${SERVICE}=${SCALE_UP}" "${SERVICE}" | |
| # Wait for healthy. 60×2s = 120s — Next.js cold start can hit | |
| # 60-90s right after a --no-cache build. Was 30×2s = 60s before | |
| # which produced false-fail rollbacks. | |
| echo "⏳ Waiting for healthy containers..." | |
| local HEALTHY=0 | |
| local READY=0 | |
| local CID | |
| for i in {1..60}; do | |
| HEALTHY=0 | |
| for CID in $(docker ps --filter "${FILTER}" --filter "health=healthy" -q); do | |
| case "${OLD_IDS}" in | |
| *" ${CID} "*) ;; | |
| *) HEALTHY=$((HEALTHY + 1)) ;; | |
| esac | |
| done | |
| if [ "${HEALTHY}" -ge "${TARGET}" ]; then | |
| echo "✅ ${HEALTHY} healthy containers" | |
| READY=1 | |
| break | |
| fi | |
| [ $((i % 10)) -eq 0 ] && echo " ... ${HEALTHY}/${TARGET} healthy (attempt ${i}/60)" | |
| sleep 2 | |
| done | |
| # Still best-effort (the verify step decides pass/fail), but no longer silent. | |
| if [ "${READY}" -ne 1 ]; then | |
| echo "::warning::${SERVICE}: only ${HEALTHY}/${TARGET} new containers healthy after 120s; scaling to target anyway." | |
| fi | |
| fi | |
| # Scale to target (compose removes old containers) | |
| echo "📏 Scaling to target: ${TARGET}..." | |
| docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \ | |
| up -d --no-deps --scale "${SERVICE}=${TARGET}" "${SERVICE}" | |
| sleep 2 | |
| } | |
| deploy_service "webapp" 2 | |
| deploy_service "rest-api" 2 | |
| deploy_service "hocuspocus-server" 2 | |
| deploy_service "hocuspocus-worker" 1 | |
| deploy_service "admin-dashboard" 1 | |
| echo "" | |
| echo "✅ All services deployed" | |
| # Post-rollout verification in three parts: | |
| # 1. traefik and docsplus-redis must be running (fatal otherwise); | |
| # 2. every service needs at least one running container (the healthy count | |
| # is printed but does not gate); | |
| # 3. each service must answer its health endpoint from inside its own | |
| # container via `bun -e fetch(...)`, so every image must ship bun. | |
| # The public-URL probes at the end are informational only. | |
| # `PATH_` carries a trailing underscore so the helper never shadows $PATH. | |
| - name: 🩺 Verify Deployment | |
| run: | | |
| echo "🩺 Verifying deployment..." | |
| sleep 10 | |
| # Infrastructure check | |
| echo "📊 Infrastructure:" | |
| for svc in traefik docsplus-redis; do | |
| if docker ps --filter "name=${svc}" --filter "status=running" --format '{{.Names}}' | grep -q "${svc}"; then | |
| echo " ✅ ${svc}: running" | |
| else | |
| echo " ❌ ${svc}: NOT running" | |
| docker logs "${svc}" --tail 30 2>/dev/null || true | |
| exit 1 | |
| fi | |
| done | |
| # Service running + healthy check | |
| echo "📊 Services:" | |
| for svc in webapp rest-api hocuspocus-server hocuspocus-worker admin-dashboard; do | |
| RUNNING=$(docker ps --filter "label=com.docker.compose.service=${svc}" --filter "status=running" --format "{{.Names}}" | wc -l) | |
| HEALTHY=$(docker ps --filter "label=com.docker.compose.service=${svc}" --filter "health=healthy" --format "{{.Names}}" | wc -l) | |
| if [ "${RUNNING}" -gt 0 ]; then | |
| echo " ✅ ${svc}: ${RUNNING} running, ${HEALTHY} healthy" | |
| else | |
| echo " ❌ ${svc}: NOT running" | |
| exit 1 | |
| fi | |
| done | |
| # Internal smoke test — hit container health endpoints via the | |
| # docker network, NOT via the public DNS+TLS stack. A transient | |
| # ACME / Let's Encrypt hiccup must not trigger a false-fail rollback. | |
| echo "" | |
| echo "🔍 Internal smoke tests..." | |
| smoke() { | |
| local SVC="$1" PORT="$2" PATH_="$3" | |
| if docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" exec -T "${SVC}" \ | |
| bun -e "fetch('http://localhost:${PORT}${PATH_}').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"; then | |
| echo " ✅ ${SVC} internal health" | |
| else | |
| echo " ❌ ${SVC} internal health" | |
| return 1 | |
| fi | |
| } | |
| smoke webapp 3000 /api/health | |
| smoke rest-api 4000 /health | |
| smoke hocuspocus-server 4001 /health | |
| smoke hocuspocus-worker 4002 /health | |
| smoke admin-dashboard 3100 /api/health | |
| # Public-URL probe is now informational only — does NOT fail the deploy. | |
| # Real public-availability monitoring belongs in uptime-kuma, not here. | |
| echo "" | |
| echo "🌐 Public URL probe (informational):" | |
| PUBLIC_CODE=$(curl -sf -o /dev/null -w "%{http_code}" --max-time 10 https://docs.plus/ 2>/dev/null || echo "000") | |
| echo " https://docs.plus/ → ${PUBLIC_CODE}" | |
| API_CODE=$(curl -sf -o /dev/null -w "%{http_code}" --max-time 10 https://prodback.docs.plus/api/health 2>/dev/null || echo "000") | |
| echo " https://prodback.docs.plus/api/health → ${API_CODE}" | |
| echo "" | |
| echo "✅ Deployment verified" | |
| # Copies the compose file and the generated env file to a fixed directory on | |
| # the host; nothing is started or restarted here. | |
| # NOTE(review): the copied env file originates from ENV_SOURCE — confirm the | |
| # permissions on the target directory are appropriate for its contents. | |
| - name: 📁 Sync compose files for break-glass | |
| if: success() | |
| run: | | |
| # Replaces the previous "Sync Production Directory" step which re-`up`'d | |
| # services from a different cwd — that re-up broke the blue-green | |
| # guarantee. Now we only COPY the active compose + env files to a | |
| # stable path so a human SSH'd in can run, e.g.: | |
| # cd /opt/projects/prod.docs.plus/.deploy/current | |
| # docker compose -f docker-compose.prod.yml --env-file .env.production ps | |
| # without having to know the runner's _work directory. | |
| mkdir -p "${DEPLOY_STATE_DIR}/current" | |
| cp "${COMPOSE_FILE}" "${DEPLOY_STATE_DIR}/current/${COMPOSE_FILE}" | |
| cp "${ENV_FILE}" "${DEPLOY_STATE_DIR}/current/${ENV_FILE}" | |
| echo "✅ Synced compose+env to ${DEPLOY_STATE_DIR}/current/" | |
| # Records DEPLOY_TAG in LAST_GOOD_TAG_FILE, which the Prepare Environment | |
| # step of the next run reads back as PREVIOUS_TAG. The prior value is kept | |
| # next to it with a `.previous` suffix. | |
| - name: 💾 Stash this tag as last-good | |
| # Only on success — failure path is handled by the rollback step. | |
| if: success() | |
| run: | | |
| # Persist current tag for the next deploy's rollback target. | |
| mkdir -p "${DEPLOY_STATE_DIR}" | |
| # Keep the previous one as last-good-tag.previous for one-step-back debugging. | |
| if [ -f "${LAST_GOOD_TAG_FILE}" ]; then | |
| cp "${LAST_GOOD_TAG_FILE}" "${LAST_GOOD_TAG_FILE}.previous" | |
| fi | |
| echo "${DEPLOY_TAG}" > "${LAST_GOOD_TAG_FILE}" | |
| echo "✅ Stashed last-good-tag = ${DEPLOY_TAG}" | |
| # Post-deploy image pruning; both commands omit -a, so only dangling images | |
| # are affected. | |
| # NOTE(review): the first prune is unfiltered, so the `until=24h` prune after | |
| # it can only target images the first one already removed — confirm whether | |
| # only the filtered form was intended. | |
| - name: 🧹 Cleanup | |
| if: success() | |
| continue-on-error: true # cleanup failure shouldn't fail an otherwise green deploy | |
| run: | | |
| docker image prune -f | |
| docker image prune -f --filter "until=24h" 2>/dev/null || true | |
| echo "✅ Cleanup complete" | |
| # Informational recap of a successful deploy. Nothing in this step may fail | |
| # the job: a failure here would trip the `if: failure()` rollback step and | |
| # roll back a deployment that already passed verification. | |
| - name: 📊 Summary | |
| if: success() | |
| run: | | |
| echo "======================================" | |
| echo "✅ DEPLOYMENT SUCCESSFUL" | |
| echo "======================================" | |
| echo "Tag: ${DEPLOY_TAG}" | |
| echo "Previous tag: ${PREVIOUS_TAG:-<none>}" | |
| echo "" | |
| echo "Services:" | |
| # `|| true`: under `-eo pipefail` a grep with no match, or a SIGPIPE when | |
| # `head` closes the pipe early, would otherwise fail the step. | |
| docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | grep -E "(traefik|docsplus|webapp|rest-api|hocuspocus)" | head -15 || true | |
| echo "" | |
| echo "URLs:" | |
| echo " - https://docs.plus" | |
| echo " - https://prodback.docs.plus" | |
| echo "======================================" | |
| # Runs when any earlier step of this job failed. Re-deploys PREVIOUS_TAG and | |
| # verifies it with the same internal smoke tests as the forward path. | |
| # Exit status: 0 when there is nothing to roll back or the rollback verified | |
| # healthy; 1 when images are missing or the rolled-back services fail smoke. | |
| - name: 🚨 Rollback on Failure | |
| if: failure() | |
| run: | | |
| echo "⚠️ Deployment failed — attempting rollback..." | |
| if [ -z "${PREVIOUS_TAG:-}" ]; then | |
| echo "::warning::No PREVIOUS_TAG stashed — cannot auto-rollback." | |
| echo "📊 Current state:" | |
| docker ps --format "table {{.Names}}\t{{.Status}}" | head -15 || true | |
| exit 0 | |
| fi | |
| # Guard: only roll back when the failed release actually reached a container. | |
| # A failure in the disk guard, build-context check or image build leaves the | |
| # running containers untouched; force-recreating them in that case would cause | |
| # downtime for no benefit. `-a` also catches containers from the failed tag | |
| # that were created but crashed. | |
| TOUCHED=0 | |
| for img in docsplus-webapp docsplus-rest-api docsplus-hocuspocus docsplus-admin; do | |
| if [ -n "$(docker ps -aq --filter "ancestor=${img}:${DEPLOY_TAG}" 2>/dev/null || true)" ]; then | |
| TOUCHED=1 | |
| fi | |
| done | |
| if [ "${TOUCHED}" -eq 0 ]; then | |
| echo "ℹ️ No container was created from ${DEPLOY_TAG}; running containers were not replaced. Skipping rollback." | |
| docker ps --format "table {{.Names}}\t{{.Status}}" | head -15 || true | |
| exit 0 | |
| fi | |
| echo "↩️ Rolling back to: ${PREVIOUS_TAG}" | |
| # Multi-image precondition (A2): ALL service images for the previous | |
| # tag must still exist locally. The cleanup step honors --filter | |
| # until=24h, so within a 24h window this works reliably; outside | |
| # that window we fail loudly rather than partially-rollback into a | |
| # mixed-version cluster. | |
| MISSING=() | |
| for img in docsplus-webapp docsplus-rest-api docsplus-hocuspocus docsplus-admin; do | |
| if ! docker image inspect "${img}:${PREVIOUS_TAG}" >/dev/null 2>&1; then | |
| MISSING+=("${img}:${PREVIOUS_TAG}") | |
| fi | |
| done | |
| if [ "${#MISSING[@]}" -gt 0 ]; then | |
| echo "::error::Cannot auto-rollback. Missing images for previous tag:" | |
| for img in "${MISSING[@]}"; do | |
| echo " - ${img}" | |
| done | |
| echo "Manual recovery: bring traffic back via the existing healthy containers." | |
| docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" up -d --no-recreate 2>/dev/null || true | |
| exit 1 | |
| fi | |
| # Override DEPLOY_TAG in the env file and re-deploy with previous images. | |
| sed -i.bak "s|^DEPLOY_TAG=.*|DEPLOY_TAG=${PREVIOUS_TAG}|" "${ENV_FILE}" | |
| rm -f "${ENV_FILE}.bak" | |
| docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \ | |
| up -d --force-recreate \ | |
| webapp rest-api hocuspocus-server hocuspocus-worker admin-dashboard | |
| # Post-rollback verification (A3). Give containers a moment to bind | |
| # ports + pass first healthcheck, then run the same internal smoke | |
| # set the forward path runs. If rollback itself can't come healthy, | |
| # we want the workflow to fail RED so the on-call sees it instead | |
| # of a misleading "rollback complete" green check. | |
| echo "" | |
| echo "⏳ Waiting 30s for rolled-back containers to settle..." | |
| sleep 30 | |
| smoke() { | |
| local SVC="$1" PORT="$2" PATH_="$3" | |
| if docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" exec -T "${SVC}" \ | |
| bun -e "fetch('http://localhost:${PORT}${PATH_}').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"; then | |
| echo " ✅ ${SVC} internal health (post-rollback)" | |
| return 0 | |
| else | |
| echo " ❌ ${SVC} internal health (post-rollback)" | |
| return 1 | |
| fi | |
| } | |
| ROLLBACK_OK=1 | |
| smoke webapp 3000 /api/health || ROLLBACK_OK=0 | |
| smoke rest-api 4000 /health || ROLLBACK_OK=0 | |
| smoke hocuspocus-server 4001 /health || ROLLBACK_OK=0 | |
| smoke hocuspocus-worker 4002 /health || ROLLBACK_OK=0 | |
| smoke admin-dashboard 3100 /api/health || ROLLBACK_OK=0 | |
| echo "" | |
| echo "📊 Post-rollback state:" | |
| # `|| true`: the listing must not abort the step before the verdict below. | |
| docker ps --format "table {{.Names}}\t{{.Status}}" | head -15 || true | |
| if [ "${ROLLBACK_OK}" -ne 1 ]; then | |
| echo "::error::Rollback to ${PREVIOUS_TAG} did not pass smoke tests. Manual intervention required." | |
| exit 1 | |
| fi | |
| echo "✅ Rollback to ${PREVIOUS_TAG} verified healthy" | |
| # =========================================================================== | |
| # UPTIME KUMA (optional monitoring service) | |
| # =========================================================================== | |
| # Gate for the Uptime Kuma deploy. It has no `needs`, so it runs on every | |
| # push to main independently of the quality gates and the build job. | |
| uptime-kuma-gate: | |
| name: 🚦 Uptime Kuma Gate | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 2 | |
| if: github.event_name == 'push' && github.ref == 'refs/heads/main' | |
| permissions: | |
| contents: read | |
| outputs: | |
| deploy: ${{ steps.gate.outputs.deploy }} | |
| steps: | |
| - name: 📦 Checkout | |
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | |
| # Neither branch below writes the `deploy` output, so presumably | |
| # evaluate-deploy-trigger.sh appends it to $GITHUB_OUTPUT itself — verify | |
| # in the script. The commit message is passed via env, not interpolated. | |
| - name: 🚦 Evaluate trigger | |
| id: gate | |
| env: | |
| COMMIT_MSG: ${{ github.event.head_commit.message }} | |
| run: | | |
| set -euo pipefail | |
| if bash .github/scripts/evaluate-deploy-trigger.sh uptime-kuma; then | |
| echo "✅ Uptime Kuma: matched (build): uptime-kuma convention" | |
| else | |
| echo "ℹ️ Skipping Uptime Kuma: commit does not match '(build): uptime-kuma'" | |
| fi | |
| # Replaces the uptime-kuma container on the prod host with the pinned image. | |
| # Runs only when uptime-kuma-gate output deploy == 'true'. | |
| deploy-uptime-kuma: | |
| name: 🔔 Deploy Uptime Kuma | |
| runs-on: prod.docs.plus | |
| timeout-minutes: 10 | |
| needs: [uptime-kuma-gate] | |
| if: needs.uptime-kuma-gate.outputs.deploy == 'true' | |
| permissions: | |
| contents: read | |
| steps: | |
| - name: 🚀 Deploy | |
| run: | | |
| # Pinned by digest (not :latest) so the same uptime-kuma version | |
| # is reproducible across redeploys. To bump, look up new digest: | |
| # docker pull louislam/uptime-kuma:1 && docker inspect ... | |
| UPTIME_KUMA_IMAGE='louislam/uptime-kuma:1@sha256:bb1bcecbc3e3ffb1cb0f8fc5f9c3cdaa78c1dfb56d98d64e06da13ebfc6dba0d' | |
| # Pull BEFORE touching the running container: if the registry or the digest | |
| # is unavailable the step fails here and the existing monitor keeps running. | |
| docker pull "${UPTIME_KUMA_IMAGE}" | |
| docker network create docsplus-network 2>/dev/null || true | |
| docker stop uptime-kuma 2>/dev/null || true | |
| docker rm uptime-kuma 2>/dev/null || true | |
| docker run -d \ | |
| --name uptime-kuma \ | |
| --network docsplus-network \ | |
| --restart unless-stopped \ | |
| -v uptime-kuma-data:/app/data \ | |
| --label "traefik.enable=true" \ | |
| --label "traefik.http.routers.uptime.rule=Host(\`status.docs.plus\`)" \ | |
| --label "traefik.http.routers.uptime.entrypoints=websecure" \ | |
| --label "traefik.http.routers.uptime.tls.certresolver=letsencrypt" \ | |
| --label "traefik.http.services.uptime.loadbalancer.server.port=3001" \ | |
| "${UPTIME_KUMA_IMAGE}" | |
| sleep 15 | |
| # Do not report success for a container that exited during startup. | |
| if [ "$(docker inspect -f '{{.State.Running}}' uptime-kuma 2>/dev/null || true)" != "true" ]; then | |
| echo "::error::uptime-kuma container is not running after start" | |
| docker logs uptime-kuma --tail 30 2>/dev/null || true | |
| exit 1 | |
| fi | |
| echo "✅ Uptime Kuma deployed at https://status.docs.plus" | |