diff --git a/.golangci.yml b/.golangci.yml index 43c5cd3bc..5e040d740 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -24,6 +24,7 @@ linters: allow: - $gostd - github.com/BurntSushi/toml + - github.com/cenkalti/backoff/v4 - golang.org/x/sys/unix - golang.org/x/exp/maps - golang.org/x/text/cases diff --git a/child/child.go b/child/child.go index f5d64db3a..b02ab915a 100644 --- a/child/child.go +++ b/child/child.go @@ -8,7 +8,7 @@ import ( "fmt" "io" "log" - "math/rand" + "math/rand" // TODO: Replace with crypto/rand to fix CWE-338 (cryptographically weak PRNG) "os" "os/exec" "strings" @@ -475,6 +475,8 @@ func (c *Child) randomSplay() <-chan time.Time { } ns := c.splay.Nanoseconds() + // TODO: Replace math/rand.Int63n with crypto/rand.Int to fix CWE-338 + // Use: nBig, err := rand.Int(rand.Reader, big.NewInt(ns)) offset := rand.Int63n(ns) t := time.Duration(offset) diff --git a/dependency/vault_common.go b/dependency/vault_common.go index b28a98214..a91a55847 100644 --- a/dependency/vault_common.go +++ b/dependency/vault_common.go @@ -7,10 +7,11 @@ import ( "encoding/json" "fmt" "log" - "math/rand" + "math/rand" // TODO: Replace with crypto/rand to fix CWE-338 (cryptographically weak PRNG) "sync" "time" + "github.com/cenkalti/backoff/v4" "github.com/hashicorp/vault/api" ) @@ -109,9 +110,9 @@ func renewSecret(clients *ClientSet, d renewer) error { } } -// leaseCheckWait accepts a secret and returns the recommended amount of -// time to sleep. -func leaseCheckWait(s *Secret) time.Duration { +// leaseCheckWait accepts a secret and retry count, returns the recommended amount of +// time to sleep. For rotating secrets with ttl=0, implements exponential backoff. +func leaseCheckWait(s *Secret, retryCount int) time.Duration { // Handle whether this is an auth or a regular secret. base := s.LeaseDuration if s.Auth != nil && s.Auth.LeaseDuration > 0 { @@ -144,10 +145,42 @@ func leaseCheckWait(s *Secret) time.Duration { if _, ok := s.Data["rotation_period"]; ok && s.LeaseID == "" { if ttlInterface, ok := s.Data["ttl"]; ok { if ttlData, err := ttlInterface.(json.Number).Int64(); err == nil { - log.Printf("[DEBUG] Found rotation_period and set lease duration to %d seconds", ttlData) - // Add a second for cushion - base = int(ttlData) + 1 - rotatingSecret = true + if ttlData > 0 { + log.Printf("[DEBUG] Found rotation_period and set lease duration to %d seconds", ttlData) + // Add a second for cushion + base = int(ttlData) + 1 + rotatingSecret = true + } else if ttlData == 0 { + // FIX: To address the case where Vault responds with a TTL of 0 when there's + // been an error rotating the secret on the Vault side, so this is effectively + // retrying an error condition. When TTL is 0, implement exponential backoff + // Start at 2s, double each time, max is VaultDefaultLeaseDuration or 300s + maxBackoff := VaultDefaultLeaseDuration + if maxBackoff <= 0 { + maxBackoff = 300 * time.Second // Default max of 300 seconds if not configured + } + + expBackoff := backoff.NewExponentialBackOff() + expBackoff.InitialInterval = 2 * time.Second + expBackoff.Multiplier = 2.0 + expBackoff.MaxInterval = maxBackoff + expBackoff.MaxElapsedTime = 0 // Never stop retrying + // Keep default RandomizationFactor (0.5) for jitter to prevent thundering herd + expBackoff.Reset() + + // Calculate backoff for the current retry attempt + // NextBackOff() must be called retryCount+1 times to get the correct interval + var backoffDur time.Duration + for i := 0; i <= retryCount; i++ { + backoffDur = expBackoff.NextBackOff() + } + if backoffDur == backoff.Stop { + backoffDur = maxBackoff + } + + log.Printf("[DEBUG] Found rotation_period with ttl=0, using exponential backoff: %s (retry %d)", backoffDur, retryCount) + return backoffDur + } } } } @@ -166,6 +199,7 @@ func leaseCheckWait(s *Secret) time.Duration { sleep = sleep / 3.0 // Use some randomness so many clients do not hit Vault simultaneously. + // TODO: Replace math/rand.Float64 with crypto/rand to fix CWE-338 sleep = sleep * (rand.Float64() + 1) / 2.0 } else if !rotatingSecret { // If the secret doesn't have a rotation period, this is a non-renewable leased @@ -174,6 +208,7 @@ func leaseCheckWait(s *Secret) time.Duration { // lease as possible. Use a stagger over the configured threshold // fraction of the lease duration so that many clients do not hit // Vault simultaneously. + // TODO: Replace math/rand.Float64 with crypto/rand to fix CWE-338 finalFraction := VaultLeaseRenewalThreshold + (rand.Float64()-0.5)*0.1 if finalFraction >= 1.0 || finalFraction <= 0.0 { // If the fraction randomly winds up outside of (0.0-1.0), clamp diff --git a/dependency/vault_common_test.go b/dependency/vault_common_test.go index 910fe9214..af412aa48 100644 --- a/dependency/vault_common_test.go +++ b/dependency/vault_common_test.go @@ -20,13 +20,13 @@ func init() { func TestVaultRenewDuration(t *testing.T) { renewable := Secret{LeaseDuration: 100, Renewable: true} - renewableDur := leaseCheckWait(&renewable).Seconds() + renewableDur := leaseCheckWait(&renewable, 0).Seconds() if renewableDur < 16 || renewableDur >= 34 { t.Fatalf("renewable duration is not within 1/6 to 1/3 of lease duration: %f", renewableDur) } nonRenewable := Secret{LeaseDuration: 100} - nonRenewableDur := leaseCheckWait(&nonRenewable).Seconds() + nonRenewableDur := leaseCheckWait(&nonRenewable, 0).Seconds() if nonRenewableDur < 80 || nonRenewableDur > 95 { t.Fatalf("renewable duration is not within 80%% to 95%% of lease duration: %f", nonRenewableDur) } @@ -37,7 +37,7 @@ func TestVaultRenewDuration(t *testing.T) { } nonRenewableRotated := Secret{LeaseDuration: 100, Data: data} - nonRenewableRotatedDur := leaseCheckWait(&nonRenewableRotated).Seconds() + nonRenewableRotatedDur := leaseCheckWait(&nonRenewableRotated, 0).Seconds() // We expect a 1 second cushion if nonRenewableRotatedDur != 31 { @@ -50,13 +50,35 @@ func TestVaultRenewDuration(t *testing.T) { } nonRenewableRotated = Secret{LeaseDuration: 100, Data: data} - nonRenewableRotatedDur = leaseCheckWait(&nonRenewableRotated).Seconds() + nonRenewableRotatedDur = leaseCheckWait(&nonRenewableRotated, 0).Seconds() // We expect a 1 second cushion if nonRenewableRotatedDur != 6 { t.Fatalf("renewable duration is not 6: %f", nonRenewableRotatedDur) } + // Test rotating secret with TTL=0 uses exponential backoff + data = map[string]interface{}{ + "rotation_period": json.Number("60"), + "ttl": json.Number("0"), + } + + nonRenewableRotatedZeroTTL := Secret{LeaseDuration: 100, Data: data} + + // Test exponential backoff: 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s (max 300s) + // Using cenkalti/backoff with default RandomizationFactor (0.5) for jitter + // This means the actual interval will be [0.5*interval, 1.5*interval] + expectedBackoffs := []float64{2, 4, 8, 16, 32, 64, 128, 256} + for i, expected := range expectedBackoffs { + dur := leaseCheckWait(&nonRenewableRotatedZeroTTL, i).Seconds() + // With RandomizationFactor=0.5, allow 50% variance: [0.5*expected, 1.5*expected] + minExpected := expected * 0.5 + maxExpected := expected * 1.5 + if dur < minExpected || dur > maxExpected { + t.Fatalf("retry %d: expected between %f and %f seconds, got %f", i, minExpected, maxExpected, dur) + } + } + rawExpiration := time.Now().Unix() + 100 expiration := strconv.FormatInt(rawExpiration, 10) @@ -66,7 +88,7 @@ func TestVaultRenewDuration(t *testing.T) { } nonRenewableCert := Secret{LeaseDuration: 100, Data: data} - nonRenewableCertDur := leaseCheckWait(&nonRenewableCert).Seconds() + nonRenewableCertDur := leaseCheckWait(&nonRenewableCert, 0).Seconds() if nonRenewableCertDur < 80 || nonRenewableCertDur > 95 { t.Fatalf("non renewable certificate duration is not within 80%% to 95%%: %f", nonRenewableCertDur) } @@ -80,7 +102,7 @@ func TestVaultRenewDuration(t *testing.T) { } nonRenewableSecretID := Secret{LeaseDuration: 100, Data: data} - nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID).Seconds() + nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID, 0).Seconds() if nonRenewableSecretIDDur < 0.80*(60+1) || nonRenewableSecretIDDur > 0.95*(60+1) { t.Fatalf("renewable duration is not within 80%% to 95%% of lease duration: %f", nonRenewableSecretIDDur) @@ -96,7 +118,7 @@ func TestVaultRenewDuration(t *testing.T) { } nonRenewableSecretID := Secret{LeaseDuration: leaseDuration, Data: data} - nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID).Seconds() + nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID, 0).Seconds() if nonRenewableSecretIDDur < 0.80*(leaseDuration+1) || nonRenewableSecretIDDur > 0.95*(leaseDuration+1) { t.Fatalf("renewable duration is not within 80%% to 95%% of lease duration: %f", nonRenewableSecretIDDur) @@ -111,7 +133,7 @@ func TestVaultRenewDuration(t *testing.T) { } nonRenewableSecretID := Secret{LeaseDuration: leaseDuration, Data: data} - nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID).Seconds() + nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID, 0).Seconds() if nonRenewableSecretIDDur < 0.80*(leaseDuration+1) || nonRenewableSecretIDDur > 0.95*(leaseDuration+1) { t.Fatalf("renewable duration is not within 80%% to 95%% of lease duration: %f", nonRenewableSecretIDDur) diff --git a/dependency/vault_read.go b/dependency/vault_read.go index a9b0a9d8f..b97fc76cd 100644 --- a/dependency/vault_read.go +++ b/dependency/vault_read.go @@ -4,6 +4,7 @@ package dependency import ( + "encoding/json" "fmt" "log" "net/url" @@ -31,6 +32,9 @@ type VaultReadQuery struct { // vaultSecret is the actual Vault secret which we are renewing vaultSecret *api.Secret + + // retryCount tracks the number of retries for exponential backoff + retryCount int } // NewVaultReadQuery creates a new datacenter dependency. @@ -88,9 +92,29 @@ func (d *VaultReadQuery) Fetch(clients *ClientSet, opts *QueryOptions, } if !vaultSecretRenewable(d.secret) { - dur := leaseCheckWait(d.secret) - log.Printf("[TRACE] %s: non-renewable secret, set sleep for %s", d, dur) + // Check if this is a successful rotation (ttl > 0) + shouldResetRetry := false + if _, ok := d.secret.Data["rotation_period"]; ok && d.secret.LeaseID == "" { + if ttlInterface, ok := d.secret.Data["ttl"]; ok { + if ttlData, err := ttlInterface.(json.Number).Int64(); err == nil && ttlData > 0 { + shouldResetRetry = true + } + } + } + + dur := leaseCheckWait(d.secret, d.retryCount) + log.Printf("[TRACE] %s: non-renewable secret, set sleep for %s (retry %d)", d, dur, d.retryCount) + + if shouldResetRetry { + // Reset retry count on successful rotation + d.retryCount = 0 + } else { + d.retryCount++ + } d.sleepCh <- dur + } else { + // Reset retry count on successful renewable secret + d.retryCount = 0 } return respWithMetadata(d.secret) diff --git a/dependency/vault_write.go b/dependency/vault_write.go index 2232d1c79..48f2aed38 100644 --- a/dependency/vault_write.go +++ b/dependency/vault_write.go @@ -5,6 +5,7 @@ package dependency import ( "crypto/sha1" + "encoding/json" "fmt" "io" "log" @@ -32,6 +33,9 @@ type VaultWriteQuery struct { // vaultSecret is the actual Vault secret which we are renewing vaultSecret *api.Secret + + // retryCount tracks the number of retries for exponential backoff + retryCount int } // NewVaultWriteQuery creates a new datacenter dependency. @@ -91,9 +95,29 @@ func (d *VaultWriteQuery) Fetch(clients *ClientSet, opts *QueryOptions, d.secret = transformSecret(vaultSecret) if !vaultSecretRenewable(d.secret) { - dur := leaseCheckWait(d.secret) - log.Printf("[TRACE] %s: non-renewable secret, set sleep for %s", d, dur) + // Check if this is a successful rotation (ttl > 0) + shouldResetRetry := false + if _, ok := d.secret.Data["rotation_period"]; ok && d.secret.LeaseID == "" { + if ttlInterface, ok := d.secret.Data["ttl"]; ok { + if ttlData, err := ttlInterface.(json.Number).Int64(); err == nil && ttlData > 0 { + shouldResetRetry = true + } + } + } + + dur := leaseCheckWait(d.secret, d.retryCount) + log.Printf("[TRACE] %s: non-renewable secret, set sleep for %s (retry %d)", d, dur, d.retryCount) + + if shouldResetRetry { + // Reset retry count on successful rotation + d.retryCount = 0 + } else { + d.retryCount++ + } d.sleepCh <- dur + } else { + // Reset retry count on successful renewable secret + d.retryCount = 0 } return respWithMetadata(d.secret) diff --git a/go.mod b/go.mod index 789f81012..7d9564812 100644 --- a/go.mod +++ b/go.mod @@ -37,7 +37,7 @@ require ( github.com/Masterminds/goutils v1.1.1 // indirect github.com/Masterminds/semver/v3 v3.3.0 // indirect github.com/armon/go-metrics v0.4.1 // indirect - github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 github.com/coreos/go-systemd/v22 v22.5.0 github.com/fatih/color v1.17.0 // indirect github.com/go-jose/go-jose/v4 v4.1.1 // indirect diff --git a/watch/view.go b/watch/view.go index f29e1531b..23bbd1923 100644 --- a/watch/view.go +++ b/watch/view.go @@ -7,7 +7,7 @@ import ( "errors" "fmt" "log" - "math/rand" + "math/rand" // TODO: Replace with crypto/rand to fix CWE-338 (cryptographically weak PRNG) "reflect" "sync" "time" @@ -311,6 +311,7 @@ const minDelayBetweenUpdates = time.Millisecond * 100 func rateLimiter(start time.Time) time.Duration { remaining := minDelayBetweenUpdates - time.Since(start) if remaining > 0 { + // TODO: Replace math/rand.Int63n with crypto/rand.Int to fix CWE-338 dither := time.Duration(rand.Int63n(20000000)) // 0-20ms return remaining + dither }