-
Notifications
You must be signed in to change notification settings - Fork 790
Fix rotating secret with TTL=0 to use exponential backoff retry instead of 1 sec #2134
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
10ff875
b0fc949
0b584b7
d482dc5
6532fac
7df06e8
d7246f0
dbd6de2
f6ce3df
92cd7f9
8f245e3
68c34d8
d7a66ff
dd7e8f0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -7,10 +7,11 @@ import ( | |||
| "encoding/json" | ||||
| "fmt" | ||||
| "log" | ||||
| "math/rand" | ||||
| "math/rand" // TODO: Replace with crypto/rand to fix CWE-338 (cryptographically weak PRNG) | ||||
| "sync" | ||||
| "time" | ||||
|
|
||||
| "github.com/cenkalti/backoff/v4" | ||||
| "github.com/hashicorp/vault/api" | ||||
| ) | ||||
|
|
||||
|
|
@@ -109,9 +110,9 @@ func renewSecret(clients *ClientSet, d renewer) error { | |||
| } | ||||
| } | ||||
|
|
||||
| // leaseCheckWait accepts a secret and returns the recommended amount of | ||||
| // time to sleep. | ||||
| func leaseCheckWait(s *Secret) time.Duration { | ||||
| // leaseCheckWait accepts a secret and retry count, returns the recommended amount of | ||||
| // time to sleep. For rotating secrets with ttl=0, implements exponential backoff. | ||||
| func leaseCheckWait(s *Secret, retryCount int) time.Duration { | ||||
| // Handle whether this is an auth or a regular secret. | ||||
| base := s.LeaseDuration | ||||
| if s.Auth != nil && s.Auth.LeaseDuration > 0 { | ||||
|
|
@@ -144,10 +145,42 @@ func leaseCheckWait(s *Secret) time.Duration { | |||
| if _, ok := s.Data["rotation_period"]; ok && s.LeaseID == "" { | ||||
| if ttlInterface, ok := s.Data["ttl"]; ok { | ||||
| if ttlData, err := ttlInterface.(json.Number).Int64(); err == nil { | ||||
| log.Printf("[DEBUG] Found rotation_period and set lease duration to %d seconds", ttlData) | ||||
| // Add a second for cushion | ||||
| base = int(ttlData) + 1 | ||||
| rotatingSecret = true | ||||
| if ttlData > 0 { | ||||
| log.Printf("[DEBUG] Found rotation_period and set lease duration to %d seconds", ttlData) | ||||
| // Add a second for cushion | ||||
| base = int(ttlData) + 1 | ||||
| rotatingSecret = true | ||||
| } else if ttlData == 0 { | ||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So in looking at consul-template a bit more it looks like there's already a notion of retries with backoff that is set in the RetryConfig: consul-template/config/vault.go Line 66 in dd7e8f0
I'm wondering if we've tried just returning an error from VaultReadQuery.Fetch() and VaultWriteQuery.Fetch() in the ttl=0 case. Would that trigger the existing retry mechanism?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @tvoran thanks for that input, thats alot easier than I thought - here is the PR - #2136 |
||||
| // FIX: To address the case where Vault responds with a TTL of 0 when there's | ||||
| // been an error rotating the secret on the Vault side, so this is effectively | ||||
| // retrying an error condition. When TTL is 0, implement exponential backoff | ||||
| // Start at 2s, double each time, max is VaultDefaultLeaseDuration or 300s | ||||
| maxBackoff := VaultDefaultLeaseDuration | ||||
| if maxBackoff <= 0 { | ||||
| maxBackoff = 300 * time.Second // Default max of 300 seconds if not configured | ||||
| } | ||||
|
|
||||
| expBackoff := backoff.NewExponentialBackOff() | ||||
| expBackoff.InitialInterval = 2 * time.Second | ||||
| expBackoff.Multiplier = 2.0 | ||||
| expBackoff.MaxInterval = maxBackoff | ||||
| expBackoff.MaxElapsedTime = 0 // Never stop retrying | ||||
| // Keep default RandomizationFactor (0.5) for jitter to prevent thundering herd | ||||
| expBackoff.Reset() | ||||
|
|
||||
| // Calculate backoff for the current retry attempt | ||||
| // NextBackOff() must be called retryCount+1 times to get the correct interval | ||||
| var backoffDur time.Duration | ||||
| for i := 0; i <= retryCount; i++ { | ||||
| backoffDur = expBackoff.NextBackOff() | ||||
| } | ||||
| if backoffDur == backoff.Stop { | ||||
| backoffDur = maxBackoff | ||||
| } | ||||
|
|
||||
| log.Printf("[DEBUG] Found rotation_period with ttl=0, using exponential backoff: %s (retry %d)", backoffDur, retryCount) | ||||
| return backoffDur | ||||
| } | ||||
| } | ||||
| } | ||||
| } | ||||
|
|
@@ -166,6 +199,7 @@ func leaseCheckWait(s *Secret) time.Duration { | |||
| sleep = sleep / 3.0 | ||||
|
|
||||
| // Use some randomness so many clients do not hit Vault simultaneously. | ||||
| // TODO: Replace math/rand.Float64 with crypto/rand to fix CWE-338 | ||||
| sleep = sleep * (rand.Float64() + 1) / 2.0 | ||||
| } else if !rotatingSecret { | ||||
| // If the secret doesn't have a rotation period, this is a non-renewable leased | ||||
|
|
@@ -174,6 +208,7 @@ func leaseCheckWait(s *Secret) time.Duration { | |||
| // lease as possible. Use a stagger over the configured threshold | ||||
| // fraction of the lease duration so that many clients do not hit | ||||
| // Vault simultaneously. | ||||
| // TODO: Replace math/rand.Float64 with crypto/rand to fix CWE-338 | ||||
| finalFraction := VaultLeaseRenewalThreshold + (rand.Float64()-0.5)*0.1 | ||||
| if finalFraction >= 1.0 || finalFraction <= 0.0 { | ||||
| // If the fraction randomly winds up outside of (0.0-1.0), clamp | ||||
|
|
||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Feels like the retryCount should be reset when this condition is hit? Since we're treating ttl=0 as an error to retry, and ttl > 0 as a successful fetch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good catch, I made that change. done