diff --git a/.github/workflows/catalyst-build.yaml b/.github/workflows/catalyst-build.yaml index c1e3abd9..a9e21d94 100644 --- a/.github/workflows/catalyst-build.yaml +++ b/.github/workflows/catalyst-build.yaml @@ -308,6 +308,108 @@ jobs: - name: Checkout uses: actions/checkout@v4 + # In-flight provisioning guard — t13/t17/t21 incident, 2026-05-17. + # + # The mothership catalyst-api Pod is single-replica and is rolled + # by Flux whenever this workflow bumps the image SHA. The OpenTofu + # workdir lives on a /tmp emptyDir that dies with the Pod, so any + # in-flight `tofu apply` is killed mid-resource. The on-disk + # deployment record is rewritten to status=failed on the new Pod's + # restoreFromStore (deployments.go:413), but the Hetzner resources + # tagged with the abandoned deployment-id remain orphans that + # require manual `hcloud` cleanup. Three consecutive provs + # (t13/t17/t21) died this way during 2026-05-17, each costing + # ~15 minutes of provisioning time plus cleanup overhead. + # + # This step polls the public, read-only in-flight-count endpoint + # on the mothership catalyst-api (added in this PR, served at + # console.openova.io/api/v1/deployments/in-flight-count). The + # endpoint counts ONLY Phase-0 in-flight statuses (pending / + # provisioning / tofu-applying / flux-bootstrapping) — Phase-1 is + # observational and resumes across Pod restarts, so it does not + # block. When count==0 we proceed with the values.yaml bump. + # + # Timeout policy: cap at MAX_WAIT_SECONDS (default 30 minutes — + # the upper bound on a healthy multi-region prov). If a prov is + # still in flight after the cap, we proceed anyway and log a + # WARNING. Blocking deploys indefinitely on a stuck prov would + # mean an operator can never ship a fix for whatever is causing + # the stuck prov (the worst possible failure mode for a CI gate). 
+ # + # Endpoint outage policy: if the curl fails for any reason + # (network blip, mothership down, endpoint not yet deployed on + # the live SHA), we proceed with the bump after logging. Same + # rationale — a broken gate must not block all future deploys. + # First-time-rollout consideration: the endpoint does not exist + # on the LIVE mothership until THIS PR's image lands, so the + # first run after merge will fall through the "endpoint not + # found" branch and proceed normally. Subsequent runs benefit + # from the gate. + - name: Wait for in-flight provisioning to drain + env: + # Override-able via repo variables/secrets if a different + # mothership URL is in play (Sovereign chroot self-deploy, + # staging, etc.). Default targets the production mothership. + CATALYST_API_URL: ${{ vars.CATALYST_API_URL || 'https://console.openova.io' }} + MAX_WAIT_SECONDS: '1800' # 30 min hard cap + POLL_INTERVAL_SECONDS: '20' + run: | + set -u + ENDPOINT="${CATALYST_API_URL%/}/api/v1/deployments/in-flight-count" + echo "Polling ${ENDPOINT} every ${POLL_INTERVAL_SECONDS}s (cap ${MAX_WAIT_SECONDS}s)" + + START=$(date +%s) + ATTEMPT=0 + while : ; do + ATTEMPT=$((ATTEMPT + 1)) + HTTP_CODE=$(curl -sSL --max-time 10 -o /tmp/inflight.json -w '%{http_code}' \ + "${ENDPOINT}" 2>/dev/null) || HTTP_CODE="000" + + if [ "$HTTP_CODE" = "000" ]; then + # Transport failure (DNS, connect refused, timeout): curl exits + # non-zero and HTTP_CODE is pinned to "000". Do NOT block the + # deploy — fail-open per "broken gate must not halt all deploys". + echo "WARN: ${ENDPOINT} unreachable on attempt ${ATTEMPT} (curl failed). Proceeding without gate." + break + fi + + if [ "$HTTP_CODE" = "404" ]; then + # First-rollout case — the endpoint is not yet present on + # the LIVE catalyst-api. Once this PR merges, subsequent + # runs will see the endpoint and start gating properly. + echo "INFO: ${ENDPOINT} returned 404 — endpoint not yet deployed on live mothership. Proceeding (first-rollout fall-through)."
+ break + fi + + if [ "$HTTP_CODE" != "200" ]; then + # Any other non-200 (curl runs without -f, so the body is on disk): log + proceed (fail-open). + echo "WARN: ${ENDPOINT} returned HTTP ${HTTP_CODE} on attempt ${ATTEMPT}. Body:" + cat /tmp/inflight.json 2>/dev/null || true + echo + echo "Proceeding without gate (fail-open)." + break + fi + + COUNT=$(jq -r '.count // 0' /tmp/inflight.json 2>/dev/null || echo "0") + IDS=$(jq -r '.ids // [] | join(",")' /tmp/inflight.json 2>/dev/null || echo "") + + if [ "$COUNT" -eq 0 ] 2>/dev/null; then + echo "OK: 0 deployments in-flight. Safe to bump catalyst-api image." + break + fi + + ELAPSED=$(($(date +%s) - START)) + if [ "$ELAPSED" -ge "$MAX_WAIT_SECONDS" ]; then + echo "WARN: ${COUNT} deployment(s) still in-flight after ${ELAPSED}s (cap ${MAX_WAIT_SECONDS}s)." + echo "WARN: in-flight ids: ${IDS}" + echo "WARN: proceeding with image bump anyway — stuck provs must not block all future deploys." + break + fi + + echo "WAIT: attempt ${ATTEMPT} — ${COUNT} deployment(s) in-flight (ids: ${IDS}); elapsed=${ELAPSED}s. Sleeping ${POLL_INTERVAL_SECONDS}s." + sleep "${POLL_INTERVAL_SECONDS}" + done + - name: Update SHA tags in values.yaml and deployment manifests # The catalyst-ui and catalyst-api images are referenced in two places: # diff --git a/products/catalyst/bootstrap/api/cmd/api/main.go b/products/catalyst/bootstrap/api/cmd/api/main.go index 4866b0ed..ce4dce00 100644 --- a/products/catalyst/bootstrap/api/cmd/api/main.go +++ b/products/catalyst/bootstrap/api/cmd/api/main.go @@ -375,6 +375,21 @@ func main() { // endpoint. See handler/auth_test_session.go for the rationale. r.Post("/api/v1/auth/test-session", h.HandleAuthTestSession) + // /api/v1/deployments/in-flight-count — public, read-only count of + // deployments in any Phase-0 in-flight status (pending / + // provisioning / tofu-applying / flux-bootstrapping).
The CI + // deploy-bot (.github/workflows/catalyst-build.yaml) polls this + // before pushing a values.yaml image-SHA bump, to avoid rolling the + // catalyst-api Pod mid-tofu-apply (the OpenTofu workdir lives on a + // /tmp emptyDir that dies with the Pod, abandoning the prov and + // leaking Hetzner resources). MUST live outside RequireSession — + // the deploy-bot has no session cookie and runs from a GHA runner. + // Same posture as /healthz, /readyz, /api/v1/version. The response + // is count+IDs only; no FQDNs or owner emails. See handler/ + // deployments_in_flight_count.go for the full rationale and the + // t13/t17/t21 incident history that motivated this gate. + r.Get("/api/v1/deployments/in-flight-count", h.InFlightCount) + // /api/v1/subdomains/check — public, read-only availability query. // Same model as a username-availability check on a signup form: an // anonymous visitor lands on the wizard's Domain step BEFORE they diff --git a/products/catalyst/bootstrap/api/internal/handler/deployments_in_flight_count.go b/products/catalyst/bootstrap/api/internal/handler/deployments_in_flight_count.go new file mode 100644 index 00000000..86cc50a7 --- /dev/null +++ b/products/catalyst/bootstrap/api/internal/handler/deployments_in_flight_count.go @@ -0,0 +1,115 @@ +// In-flight deployment count — public, unauthenticated. +// +// Endpoint: GET /api/v1/deployments/in-flight-count +// +// Returns the number of deployments currently in a non-terminal Phase-0 +// status (pending / provisioning / tofu-applying / flux-bootstrapping) +// across ALL owners. The caller is the CI deploy-bot +// (.github/workflows/catalyst-build.yaml), which uses this to GATE the +// values.yaml image-SHA bump: a non-zero count means rolling the +// catalyst-api Deployment now would kill an active tofu apply +// mid-flight, abandoning the prov and leaking Hetzner resources. 
+// +// Background — t13/t17/t21 incident, 2026-05-17/18: +// +// catalyst-api is single-replica with strategy: Recreate (Sovereign +// chart) / RollingUpdate maxUnavailable=1 (contabo-mkt). When the +// deploy-bot bumps the image SHA, Flux reconciles the Deployment and +// the OLD Pod gets SIGTERM'd. The OpenTofu workdir lives on a /tmp +// emptyDir that dies with the Pod (provisioner constraint — fresh state +// every run), so any in-flight `tofu apply` is killed mid-resource. +// The on-disk deployment record is rewritten to status=failed on the +// NEW Pod's restoreFromStore path (see deployments.go:413 — issue #530 +// follow-up), but the Hetzner resources tagged with the abandoned +// deployment-id remain orphans. Three consecutive provs (t13/t17/t21) +// died this way during 2026-05-17. +// +// Why this is the right shape: +// +// - Phase-0 in-flight is the only status that cannot survive a Pod +// restart. Phase-1 (HelmRelease watch) is observational and resumes +// via resumePhase1Watch on the new Pod, so we do NOT count +// "phase1-watching" as a blocking state — isPhase0InFlightStatus +// (deployments.go:440) is the canonical predicate. +// +// - "Count + deployment IDs only" (no FQDNs, no owner emails) keeps +// information disclosure to the bare minimum. Same posture as +// /api/v1/subdomains/check — pre-auth surface, read-only, smallest +// answer that still drives the caller's decision. +// +// - HTTP 200 + count=0 is the green light. The deploy-bot polls until +// count==0 OR a hard timeout, then proceeds with the bump. +// +// - No auth gate. The deploy-bot has no session cookie and runs from +// a GitHub Actions runner. Same precedent as /healthz, /readyz, +// /api/v1/version, /api/v1/subdomains/check. +// +// Adopted deployments are excluded from the count — once AdoptedAt is +// set, Phase-0 is long-since complete (handover already fired), so a +// stale "phase1-watching" or "ready" status with AdoptedAt set should +// not gate a deploy.
+ We rely on isPhase0InFlightStatus to filter +// terminal statuses anyway, so the AdoptedAt check is belt-and-braces. + +package handler + +import ( + "encoding/json" + "net/http" + "sort" +) + +// inFlightCountResponse is the wire shape of the public count endpoint. +// +// Count is the authoritative number the deploy-bot gates on. IDs is a +// best-effort sorted list of deployment IDs in non-terminal Phase-0 +// status — useful for CI logs ("waiting on dep-abc123, dep-def456") +// without leaking owner emails or FQDNs. +type inFlightCountResponse struct { + Count int `json:"count"` + IDs []string `json:"ids"` +} + +// InFlightCount handles GET /api/v1/deployments/in-flight-count. +// +// Public + unauthenticated. Returns a count of deployments in any +// Phase-0 in-flight status (see isPhase0InFlightStatus). The deploy-bot +// in .github/workflows/catalyst-build.yaml polls this before bumping +// the values.yaml image SHA — if count > 0, it waits and retries to +// avoid rolling the catalyst-api Pod mid-tofu-apply. +func (h *Handler) InFlightCount(w http.ResponseWriter, r *http.Request) { + ids := make([]string, 0) // non-nil so an empty result encodes as [] rather than null + + h.deployments.Range(func(_, val any) bool { + dep, ok := val.(*Deployment) + if !ok || dep == nil { + return true + } + dep.mu.Lock() // snapshot the three fields under the lock; classify after releasing it + status := dep.Status + adopted := dep.AdoptedAt != nil + id := dep.ID + dep.mu.Unlock() + + if adopted { + return true + } + if isPhase0InFlightStatus(status) { + ids = append(ids, id) + } + return true + }) + + sort.Strings(ids) + + resp := inFlightCountResponse{ + Count: len(ids), + IDs: ids, + } + + w.Header().Set("Content-Type", "application/json") + // Discourage caching: a deploy-bot retry seconds later needs the + // fresh state, not whatever an upstream proxy stored.
+ w.Header().Set("Cache-Control", "no-store") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(resp) // status already written above; a failed body write cannot change the response +} diff --git a/products/catalyst/bootstrap/api/internal/handler/deployments_in_flight_count_test.go b/products/catalyst/bootstrap/api/internal/handler/deployments_in_flight_count_test.go new file mode 100644 index 00000000..8a92d1ab --- /dev/null +++ b/products/catalyst/bootstrap/api/internal/handler/deployments_in_flight_count_test.go @@ -0,0 +1,152 @@ +// Tests for the public /api/v1/deployments/in-flight-count endpoint. +// +// Contract: +// - Counts ONLY Phase-0 in-flight statuses (pending, provisioning, +// tofu-applying, flux-bootstrapping). NOT phase1-watching (Phase-1 +// resumes across catalyst-api Pod restarts via resumePhase1Watch). +// - Excludes adopted deployments (post-handover; customer-owned). +// - Returns 200 + {count, ids} on every call. No auth gate. +// +// The CI deploy-bot gates the catalyst-api image-SHA bump on this +// endpoint returning count=0, to avoid rolling the Pod mid-tofu-apply +// (t13/t17/t21 incident, 2026-05-17). +package handler + +import ( + "encoding/json" + "log/slog" + "net/http" + "net/http/httptest" + "reflect" + "testing" + "time" + + "github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner" +) + +func newDepForInFlight(h *Handler, id, status string, adopted bool) { // test helper: stores a Deployment with the given id/status in h.deployments + dep := &Deployment{ + ID: id, + Status: status, + Request: provisioner.Request{SovereignFQDN: id + ".example.com"}, + StartedAt: time.Now().Add(-1 * time.Minute), + eventsCh: make(chan provisioner.Event), + done: make(chan struct{}), + } + close(dep.eventsCh) + close(dep.done) + if adopted { // AdoptedAt is set only when the caller asks for an adopted deployment + now := time.Now() + dep.AdoptedAt = &now + } + h.deployments.Store(id, dep) +} + +// Phase-0 in-flight statuses MUST be counted. Phase-1 + terminal + +// adopted statuses MUST be excluded.
+func TestInFlightCount_OnlyPhase0InFlightCounts(t *testing.T) { + h := &Handler{log: slog.Default()} + + // Phase-0 in-flight — every one of these counts. + newDepForInFlight(h, "dep-pending", "pending", false) + newDepForInFlight(h, "dep-provisioning", "provisioning", false) + newDepForInFlight(h, "dep-tofu-applying", "tofu-applying", false) + newDepForInFlight(h, "dep-flux-bootstrapping", "flux-bootstrapping", false) + + // Phase-1 — does NOT count (resumable across Pod restarts). + newDepForInFlight(h, "dep-phase1-watching", "phase1-watching", false) + + // Terminal — does NOT count. + newDepForInFlight(h, "dep-ready", "ready", false) + newDepForInFlight(h, "dep-failed", "failed", false) + newDepForInFlight(h, "dep-wiped", "wiped", false) + + // Adopted — does NOT count even if status somehow regressed to a Phase-0 value (pins the AdoptedAt filter). + newDepForInFlight(h, "dep-adopted-ready", "ready", true) + newDepForInFlight(h, "dep-adopted-stuck", "tofu-applying", true) + + w := httptest.NewRecorder() + r := httptest.NewRequest(http.MethodGet, "/api/v1/deployments/in-flight-count", nil) + h.InFlightCount(w, r) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200; got %d body=%s", w.Code, w.Body.String()) + } + if got := w.Header().Get("Content-Type"); got != "application/json" { + t.Fatalf("expected Content-Type application/json; got %q", got) + } + if got := w.Header().Get("Cache-Control"); got != "no-store" { + t.Fatalf("expected Cache-Control no-store; got %q", got) + } + + var body inFlightCountResponse + if err := json.NewDecoder(w.Body).Decode(&body); err != nil { + t.Fatalf("decode failed: %v", err) + } + if body.Count != 4 { + t.Fatalf("expected count=4 (Phase-0 in-flight only); got %d ids=%v", body.Count, body.IDs) + } + wantIDs := []string{ + "dep-flux-bootstrapping", + "dep-pending", + "dep-provisioning", + "dep-tofu-applying", + } + if !reflect.DeepEqual(body.IDs, wantIDs) { + t.Fatalf("expected sorted ids=%v; got %v", wantIDs, body.IDs) + } +} + +// No deployments → count=0, ids=[].
This is the green-light state the +// deploy-bot gates on. +func TestInFlightCount_EmptyStore(t *testing.T) { + h := &Handler{log: slog.Default()} + + w := httptest.NewRecorder() + r := httptest.NewRequest(http.MethodGet, "/api/v1/deployments/in-flight-count", nil) + h.InFlightCount(w, r) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200; got %d", w.Code) + } + var body inFlightCountResponse + if err := json.NewDecoder(w.Body).Decode(&body); err != nil { + t.Fatalf("decode failed: %v", err) + } + if body.Count != 0 { + t.Fatalf("expected count=0; got %d", body.Count) + } + if body.IDs == nil { + t.Fatalf("expected ids=[] (non-nil); got nil — wire shape MUST be a JSON array, not null") + } + if len(body.IDs) != 0 { + t.Fatalf("expected ids=[] empty; got %v", body.IDs) + } +} + +// Only terminal, adopted, or phase1-watching deployments → count=0. Same green-light state. +func TestInFlightCount_AllTerminal(t *testing.T) { + h := &Handler{log: slog.Default()} + + newDepForInFlight(h, "dep-ready", "ready", false) + newDepForInFlight(h, "dep-failed", "failed", false) + newDepForInFlight(h, "dep-adopted", "ready", true) + // phase1-watching is observational — Pod restart resumes it via + // resumePhase1Watch, so it MUST NOT block a deploy bump. + newDepForInFlight(h, "dep-phase1", "phase1-watching", false) + + w := httptest.NewRecorder() + r := httptest.NewRequest(http.MethodGet, "/api/v1/deployments/in-flight-count", nil) + h.InFlightCount(w, r) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200; got %d", w.Code) + } + var body inFlightCountResponse + if err := json.NewDecoder(w.Body).Decode(&body); err != nil { + t.Fatalf("decode failed: %v", err) + } + if body.Count != 0 { + t.Fatalf("expected count=0 (terminal + phase1 + adopted excluded); got %d ids=%v", body.Count, body.IDs) + } +}