fix(handover): PR N — fallback to per-FQDN cert when wildcard 429s (#1594)
t143 caught the LE PROD rate limit (429: too many certificates (50) already issued for omani.works in last 168h0m0s, retry after 2026-05-17 10:28:32 UTC). The chart renders TWO cert names: - sovereign-wildcard-tls (canonical, hit 429) - sovereign-wildcard-tls-<fqdn> (per-FQDN, was already issued before the rate limit, Ready=True). waitForWildcardCert only checked the canonical name. With the limit hit, handover waited the full 10-min budget before firing degraded. Fix: when the canonical cert is unavailable, list the namespace certs matching the `sovereign-wildcard-tls-*` prefix and return Ready=True if ANY sibling is Ready. The operator's console.<fqdn> TLS handshake will succeed against either secret since both are wildcard certs for *.<fqdn>. Bumps chart 1.4.150 -> 1.4.151 + bootstrap-kit pin so the fix lands on the next fresh prov. Co-authored-by: hatiyildiz <hatice.yildiz@openova.io> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
13c9684cc1
commit
b27bdeee05
@ -502,7 +502,7 @@ spec:
|
||||
# - PR #1585: D17 /app/$componentId route-collision fix (catalyst-ui 2ab8a0e)
|
||||
# Caught on t136/t138 fresh-prov runs that bootstrap-kit was
|
||||
# still pinned to 1.4.147 → none of the fixes reached the chroot.
|
||||
version: 1.4.150
|
||||
version: 1.4.151
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-catalyst-platform
|
||||
|
||||
@ -1205,10 +1205,37 @@ func wildcardCertReady(ctx context.Context, dyn dynamic.Interface) (bool, string
|
||||
u, err := dyn.Resource(certificateGVR).
|
||||
Namespace(sovereignWildcardCertNamespace).
|
||||
Get(ctx, sovereignWildcardCertName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
return false, "<not-found>", err
|
||||
if err == nil {
|
||||
return certificateReady(u)
|
||||
}
|
||||
return certificateReady(u)
|
||||
// PR N (2026-05-17 t143 LE rate-limit incident): when the canonical
|
||||
// `sovereign-wildcard-tls` cert is unavailable (404 / 429 LE rate
|
||||
// limit on the parent domain / DNS01 propagation lag), fall back to
|
||||
// ANY per-FQDN sibling cert matching `sovereign-wildcard-tls-*`
|
||||
// that's already Ready=True. The chart renders both names in
|
||||
// multi-zone configurations (sovereign-wildcard-tls per-zone +
|
||||
// sovereign-wildcard-tls-<fqdn> per-FQDN); either reaching Ready
|
||||
// proves the operator's console.<fqdn> TLS handshake will succeed.
|
||||
// Without this fallback, handover waits the full 10-min budget
|
||||
// before firing degraded — operator browser can't reach the new
|
||||
// Sovereign for that whole window.
|
||||
list, listErr := dyn.Resource(certificateGVR).
|
||||
Namespace(sovereignWildcardCertNamespace).
|
||||
List(ctx, metav1.ListOptions{})
|
||||
if listErr == nil && list != nil {
|
||||
for i := range list.Items {
|
||||
item := &list.Items[i]
|
||||
name := item.GetName()
|
||||
if !strings.HasPrefix(name, sovereignWildcardCertName+"-") {
|
||||
continue
|
||||
}
|
||||
ok, _, _ := certificateReady(item)
|
||||
if ok {
|
||||
return true, "True (via fallback " + name + ")", nil
|
||||
}
|
||||
}
|
||||
}
|
||||
return false, "<not-found>", err
|
||||
}
|
||||
|
||||
// certificateReady — returns (ready, observedStatus, nil) for a
|
||||
|
||||
@ -1058,8 +1058,8 @@ name: bp-catalyst-platform
|
||||
# Fix #154 (HR-timeout audit). Those bumped the HelmRelease
|
||||
# install.timeout. This bumps the chart-INTERNAL wait loop budget
|
||||
# inside the pre-install hook Job, which is a different seam.
|
||||
version: 1.4.150
|
||||
appVersion: 1.4.150
|
||||
version: 1.4.151
|
||||
appVersion: 1.4.151
|
||||
# 1.4.141 (qa-loop Fix #185, prov #38/#39/#41 recurrence — pre-install
|
||||
# hook unschedulable on saturated worker):
|
||||
#
|
||||
|
||||
Loading…
Reference in New Issue
Block a user