fix(handover): PR N — fallback to per-FQDN cert when wildcard 429s (#1594)

t143 caught the LE PROD rate limit (429: too many certificates (50)
already issued for omani.works in last 168h0m0s, retry after
2026-05-17 10:28:32 UTC). The chart renders TWO cert names:
- sovereign-wildcard-tls (canonical, hit 429)
- sovereign-wildcard-tls-<fqdn> (per-FQDN, was already issued before
  the rate limit was hit, Ready=True)

waitForWildcardCert only checked the canonical name. With the limit
hit, handover waited the full 10-min budget before firing degraded.

Fix: when the canonical cert is unavailable, list namespace certs
matching `sovereign-wildcard-tls-*` prefix and return Ready=True if
ANY sibling is Ready. The operator's console.<fqdn> TLS handshake
will succeed against either secret, since both certificates are
wildcards covering *.<fqdn>.

Bumps chart 1.4.150 -> 1.4.151 + bootstrap-kit pin so the fix lands
on next fresh prov.

Co-authored-by: hatiyildiz <hatice.yildiz@openova.io>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
e3mrah 2026-05-17 13:02:17 +04:00 committed by GitHub
parent 13c9684cc1
commit b27bdeee05
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 33 additions and 6 deletions

View File

@ -502,7 +502,7 @@ spec:
# - PR #1585: D17 /app/$componentId route-collision fix (catalyst-ui 2ab8a0e)
# Caught on t136/t138 fresh-prov runs that bootstrap-kit was
# still pinned to 1.4.147 → none of the fixes reached the chroot.
version: 1.4.150
version: 1.4.151
sourceRef:
kind: HelmRepository
name: bp-catalyst-platform

View File

@ -1205,10 +1205,37 @@ func wildcardCertReady(ctx context.Context, dyn dynamic.Interface) (bool, string
u, err := dyn.Resource(certificateGVR).
Namespace(sovereignWildcardCertNamespace).
Get(ctx, sovereignWildcardCertName, metav1.GetOptions{})
if err != nil {
return false, "<not-found>", err
if err == nil {
return certificateReady(u)
}
return certificateReady(u)
// PR N (2026-05-17 t143 LE rate-limit incident): when the canonical
// `sovereign-wildcard-tls` cert is unavailable (404 / 429 LE rate
// limit on the parent domain / DNS01 propagation lag), fall back to
// ANY per-FQDN sibling cert matching `sovereign-wildcard-tls-*`
// that's already Ready=True. The chart renders both names in
// multi-zone configurations (sovereign-wildcard-tls per-zone +
// sovereign-wildcard-tls-<fqdn> per-FQDN); either reaching Ready
// proves the operator's console.<fqdn> TLS handshake will succeed.
// Without this fallback, handover waits the full 10-min budget
// before firing degraded — the operator's browser can't reach the
// new Sovereign for that whole window.
list, listErr := dyn.Resource(certificateGVR).
Namespace(sovereignWildcardCertNamespace).
List(ctx, metav1.ListOptions{})
if listErr == nil && list != nil {
for i := range list.Items {
item := &list.Items[i]
name := item.GetName()
if !strings.HasPrefix(name, sovereignWildcardCertName+"-") {
continue
}
ok, _, _ := certificateReady(item)
if ok {
return true, "True (via fallback " + name + ")", nil
}
}
}
return false, "<not-found>", err
}
// certificateReady — returns (ready, observedStatus, nil) for a

View File

@ -1058,8 +1058,8 @@ name: bp-catalyst-platform
# Fix #154 (HR-timeout audit). Those bumped the HelmRelease
# install.timeout. This bumps the chart-INTERNAL wait loop budget
# inside the pre-install hook Job, which is a different seam.
version: 1.4.150
appVersion: 1.4.150
version: 1.4.151
appVersion: 1.4.151
# 1.4.141 (qa-loop Fix #185, prov #38/#39/#41 recurrence — pre-install
# hook unschedulable on saturated worker):
#