Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ linters:
allow:
- $gostd
- github.com/BurntSushi/toml
- github.com/cenkalti/backoff/v4
- golang.org/x/sys/unix
- golang.org/x/exp/maps
- golang.org/x/text/cases
Expand Down
4 changes: 3 additions & 1 deletion child/child.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
"fmt"
"io"
"log"
"math/rand"
"math/rand" // TODO: Replace with crypto/rand to fix CWE-338 (cryptographically weak PRNG)
"os"
"os/exec"
"strings"
Expand Down Expand Up @@ -475,6 +475,8 @@ func (c *Child) randomSplay() <-chan time.Time {
}

ns := c.splay.Nanoseconds()
// TODO: Replace math/rand.Int63n with crypto/rand.Int to fix CWE-338
// Use: nBig, err := rand.Int(rand.Reader, big.NewInt(ns))
offset := rand.Int63n(ns)
t := time.Duration(offset)

Expand Down
51 changes: 43 additions & 8 deletions dependency/vault_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ import (
"encoding/json"
"fmt"
"log"
"math/rand"
"math/rand" // TODO: Replace with crypto/rand to fix CWE-338 (cryptographically weak PRNG)
"sync"
"time"

"github.com/cenkalti/backoff/v4"
"github.com/hashicorp/vault/api"
)

Expand Down Expand Up @@ -109,9 +110,9 @@ func renewSecret(clients *ClientSet, d renewer) error {
}
}

// leaseCheckWait accepts a secret and returns the recommended amount of
// time to sleep.
func leaseCheckWait(s *Secret) time.Duration {
// leaseCheckWait accepts a secret and retry count, returns the recommended amount of
// time to sleep. For rotating secrets with ttl=0, implements exponential backoff.
func leaseCheckWait(s *Secret, retryCount int) time.Duration {
// Handle whether this is an auth or a regular secret.
base := s.LeaseDuration
if s.Auth != nil && s.Auth.LeaseDuration > 0 {
Expand Down Expand Up @@ -144,10 +145,42 @@ func leaseCheckWait(s *Secret) time.Duration {
if _, ok := s.Data["rotation_period"]; ok && s.LeaseID == "" {
if ttlInterface, ok := s.Data["ttl"]; ok {
if ttlData, err := ttlInterface.(json.Number).Int64(); err == nil {
log.Printf("[DEBUG] Found rotation_period and set lease duration to %d seconds", ttlData)
// Add a second for cushion
base = int(ttlData) + 1
rotatingSecret = true
if ttlData > 0 {
log.Printf("[DEBUG] Found rotation_period and set lease duration to %d seconds", ttlData)
// Add a second for cushion
base = int(ttlData) + 1
rotatingSecret = true
Comment on lines +148 to +152
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels like the retryCount should be reset when this condition is hit? Since we're treating ttl=0 as an error to retry, and ttl > 0 as a successful fetch?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch, I made that change. done

} else if ttlData == 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So in looking at consul-template a bit more it looks like there's already a notion of retries with backoff that is set in the RetryConfig:

Retry *RetryConfig `mapstructure:"retry"`

I'm wondering if we've tried just returning an error from VaultReadQuery.Fetch() and VaultWriteQuery.Fetch() in the ttl=0 case. Would that trigger the existing retry mechanism?

Copy link
Collaborator Author

@aswanidutt aswanidutt Feb 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tvoran thanks for that input, thats alot easier than I thought - here is the PR - #2136
but the only caveat is that there is default max 12 retries and maxbackoff of 1 min for retry.go. So, I had to use cenkaltiv4 in view.go to customize the retry backoff with max backoff of 5m and unlimited retries.

// FIX: To address the case where Vault responds with a TTL of 0 when there's
// been an error rotating the secret on the Vault side, so this is effectively
// retrying an error condition. When TTL is 0, implement exponential backoff
// Start at 2s, double each time, max is VaultDefaultLeaseDuration or 300s
maxBackoff := VaultDefaultLeaseDuration
if maxBackoff <= 0 {
maxBackoff = 300 * time.Second // Default max of 300 seconds if not configured
}

expBackoff := backoff.NewExponentialBackOff()
expBackoff.InitialInterval = 2 * time.Second
expBackoff.Multiplier = 2.0
expBackoff.MaxInterval = maxBackoff
expBackoff.MaxElapsedTime = 0 // Never stop retrying
// Keep default RandomizationFactor (0.5) for jitter to prevent thundering herd
expBackoff.Reset()

// Calculate backoff for the current retry attempt
// NextBackOff() must be called retryCount+1 times to get the correct interval
var backoffDur time.Duration
for i := 0; i <= retryCount; i++ {
backoffDur = expBackoff.NextBackOff()
}
if backoffDur == backoff.Stop {
backoffDur = maxBackoff
}

log.Printf("[DEBUG] Found rotation_period with ttl=0, using exponential backoff: %s (retry %d)", backoffDur, retryCount)
return backoffDur
}
}
}
}
Expand All @@ -166,6 +199,7 @@ func leaseCheckWait(s *Secret) time.Duration {
sleep = sleep / 3.0

// Use some randomness so many clients do not hit Vault simultaneously.
// TODO: Replace math/rand.Float64 with crypto/rand to fix CWE-338
sleep = sleep * (rand.Float64() + 1) / 2.0
} else if !rotatingSecret {
// If the secret doesn't have a rotation period, this is a non-renewable leased
Expand All @@ -174,6 +208,7 @@ func leaseCheckWait(s *Secret) time.Duration {
// lease as possible. Use a stagger over the configured threshold
// fraction of the lease duration so that many clients do not hit
// Vault simultaneously.
// TODO: Replace math/rand.Float64 with crypto/rand to fix CWE-338
finalFraction := VaultLeaseRenewalThreshold + (rand.Float64()-0.5)*0.1
if finalFraction >= 1.0 || finalFraction <= 0.0 {
// If the fraction randomly winds up outside of (0.0-1.0), clamp
Expand Down
38 changes: 30 additions & 8 deletions dependency/vault_common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ func init() {

func TestVaultRenewDuration(t *testing.T) {
renewable := Secret{LeaseDuration: 100, Renewable: true}
renewableDur := leaseCheckWait(&renewable).Seconds()
renewableDur := leaseCheckWait(&renewable, 0).Seconds()
if renewableDur < 16 || renewableDur >= 34 {
t.Fatalf("renewable duration is not within 1/6 to 1/3 of lease duration: %f", renewableDur)
}

nonRenewable := Secret{LeaseDuration: 100}
nonRenewableDur := leaseCheckWait(&nonRenewable).Seconds()
nonRenewableDur := leaseCheckWait(&nonRenewable, 0).Seconds()
if nonRenewableDur < 80 || nonRenewableDur > 95 {
t.Fatalf("renewable duration is not within 80%% to 95%% of lease duration: %f", nonRenewableDur)
}
Expand All @@ -37,7 +37,7 @@ func TestVaultRenewDuration(t *testing.T) {
}

nonRenewableRotated := Secret{LeaseDuration: 100, Data: data}
nonRenewableRotatedDur := leaseCheckWait(&nonRenewableRotated).Seconds()
nonRenewableRotatedDur := leaseCheckWait(&nonRenewableRotated, 0).Seconds()

// We expect a 1 second cushion
if nonRenewableRotatedDur != 31 {
Expand All @@ -50,13 +50,35 @@ func TestVaultRenewDuration(t *testing.T) {
}

nonRenewableRotated = Secret{LeaseDuration: 100, Data: data}
nonRenewableRotatedDur = leaseCheckWait(&nonRenewableRotated).Seconds()
nonRenewableRotatedDur = leaseCheckWait(&nonRenewableRotated, 0).Seconds()

// We expect a 1 second cushion
if nonRenewableRotatedDur != 6 {
t.Fatalf("renewable duration is not 6: %f", nonRenewableRotatedDur)
}

// Test rotating secret with TTL=0 uses exponential backoff
data = map[string]interface{}{
"rotation_period": json.Number("60"),
"ttl": json.Number("0"),
}

nonRenewableRotatedZeroTTL := Secret{LeaseDuration: 100, Data: data}

// Test exponential backoff: 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s (max 300s)
// Using cenkalti/backoff with default RandomizationFactor (0.5) for jitter
// This means the actual interval will be [0.5*interval, 1.5*interval]
expectedBackoffs := []float64{2, 4, 8, 16, 32, 64, 128, 256}
for i, expected := range expectedBackoffs {
dur := leaseCheckWait(&nonRenewableRotatedZeroTTL, i).Seconds()
// With RandomizationFactor=0.5, allow 50% variance: [0.5*expected, 1.5*expected]
minExpected := expected * 0.5
maxExpected := expected * 1.5
if dur < minExpected || dur > maxExpected {
t.Fatalf("retry %d: expected between %f and %f seconds, got %f", i, minExpected, maxExpected, dur)
}
}

rawExpiration := time.Now().Unix() + 100
expiration := strconv.FormatInt(rawExpiration, 10)

Expand All @@ -66,7 +88,7 @@ func TestVaultRenewDuration(t *testing.T) {
}

nonRenewableCert := Secret{LeaseDuration: 100, Data: data}
nonRenewableCertDur := leaseCheckWait(&nonRenewableCert).Seconds()
nonRenewableCertDur := leaseCheckWait(&nonRenewableCert, 0).Seconds()
if nonRenewableCertDur < 80 || nonRenewableCertDur > 95 {
t.Fatalf("non renewable certificate duration is not within 80%% to 95%%: %f", nonRenewableCertDur)
}
Expand All @@ -80,7 +102,7 @@ func TestVaultRenewDuration(t *testing.T) {
}

nonRenewableSecretID := Secret{LeaseDuration: 100, Data: data}
nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID).Seconds()
nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID, 0).Seconds()

if nonRenewableSecretIDDur < 0.80*(60+1) || nonRenewableSecretIDDur > 0.95*(60+1) {
t.Fatalf("renewable duration is not within 80%% to 95%% of lease duration: %f", nonRenewableSecretIDDur)
Expand All @@ -96,7 +118,7 @@ func TestVaultRenewDuration(t *testing.T) {
}

nonRenewableSecretID := Secret{LeaseDuration: leaseDuration, Data: data}
nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID).Seconds()
nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID, 0).Seconds()

if nonRenewableSecretIDDur < 0.80*(leaseDuration+1) || nonRenewableSecretIDDur > 0.95*(leaseDuration+1) {
t.Fatalf("renewable duration is not within 80%% to 95%% of lease duration: %f", nonRenewableSecretIDDur)
Expand All @@ -111,7 +133,7 @@ func TestVaultRenewDuration(t *testing.T) {
}

nonRenewableSecretID := Secret{LeaseDuration: leaseDuration, Data: data}
nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID).Seconds()
nonRenewableSecretIDDur := leaseCheckWait(&nonRenewableSecretID, 0).Seconds()

if nonRenewableSecretIDDur < 0.80*(leaseDuration+1) || nonRenewableSecretIDDur > 0.95*(leaseDuration+1) {
t.Fatalf("renewable duration is not within 80%% to 95%% of lease duration: %f", nonRenewableSecretIDDur)
Expand Down
28 changes: 26 additions & 2 deletions dependency/vault_read.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package dependency

import (
"encoding/json"
"fmt"
"log"
"net/url"
Expand Down Expand Up @@ -31,6 +32,9 @@ type VaultReadQuery struct {

// vaultSecret is the actual Vault secret which we are renewing
vaultSecret *api.Secret

// retryCount tracks the number of retries for exponential backoff
retryCount int
}

// NewVaultReadQuery creates a new datacenter dependency.
Expand Down Expand Up @@ -88,9 +92,29 @@ func (d *VaultReadQuery) Fetch(clients *ClientSet, opts *QueryOptions,
}

if !vaultSecretRenewable(d.secret) {
dur := leaseCheckWait(d.secret)
log.Printf("[TRACE] %s: non-renewable secret, set sleep for %s", d, dur)
// Check if this is a successful rotation (ttl > 0)
shouldResetRetry := false
if _, ok := d.secret.Data["rotation_period"]; ok && d.secret.LeaseID == "" {
if ttlInterface, ok := d.secret.Data["ttl"]; ok {
if ttlData, err := ttlInterface.(json.Number).Int64(); err == nil && ttlData > 0 {
shouldResetRetry = true
}
}
}

dur := leaseCheckWait(d.secret, d.retryCount)
log.Printf("[TRACE] %s: non-renewable secret, set sleep for %s (retry %d)", d, dur, d.retryCount)

if shouldResetRetry {
// Reset retry count on successful rotation
d.retryCount = 0
} else {
d.retryCount++
}
d.sleepCh <- dur
} else {
// Reset retry count on successful renewable secret
d.retryCount = 0
}

return respWithMetadata(d.secret)
Expand Down
28 changes: 26 additions & 2 deletions dependency/vault_write.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package dependency

import (
"crypto/sha1"
"encoding/json"
"fmt"
"io"
"log"
Expand Down Expand Up @@ -32,6 +33,9 @@ type VaultWriteQuery struct {

// vaultSecret is the actual Vault secret which we are renewing
vaultSecret *api.Secret

// retryCount tracks the number of retries for exponential backoff
retryCount int
}

// NewVaultWriteQuery creates a new datacenter dependency.
Expand Down Expand Up @@ -91,9 +95,29 @@ func (d *VaultWriteQuery) Fetch(clients *ClientSet, opts *QueryOptions,
d.secret = transformSecret(vaultSecret)

if !vaultSecretRenewable(d.secret) {
dur := leaseCheckWait(d.secret)
log.Printf("[TRACE] %s: non-renewable secret, set sleep for %s", d, dur)
// Check if this is a successful rotation (ttl > 0)
shouldResetRetry := false
if _, ok := d.secret.Data["rotation_period"]; ok && d.secret.LeaseID == "" {
if ttlInterface, ok := d.secret.Data["ttl"]; ok {
if ttlData, err := ttlInterface.(json.Number).Int64(); err == nil && ttlData > 0 {
shouldResetRetry = true
}
}
}

dur := leaseCheckWait(d.secret, d.retryCount)
log.Printf("[TRACE] %s: non-renewable secret, set sleep for %s (retry %d)", d, dur, d.retryCount)

if shouldResetRetry {
// Reset retry count on successful rotation
d.retryCount = 0
} else {
d.retryCount++
}
d.sleepCh <- dur
} else {
// Reset retry count on successful renewable secret
d.retryCount = 0
}

return respWithMetadata(d.secret)
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ require (
github.com/Masterminds/goutils v1.1.1 // indirect
github.com/Masterminds/semver/v3 v3.3.0 // indirect
github.com/armon/go-metrics v0.4.1 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/cenkalti/backoff/v4 v4.3.0
github.com/coreos/go-systemd/v22 v22.5.0
github.com/fatih/color v1.17.0 // indirect
github.com/go-jose/go-jose/v4 v4.1.1 // indirect
Expand Down
3 changes: 2 additions & 1 deletion watch/view.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"errors"
"fmt"
"log"
"math/rand"
"math/rand" // TODO: Replace with crypto/rand to fix CWE-338 (cryptographically weak PRNG)
"reflect"
"sync"
"time"
Expand Down Expand Up @@ -311,6 +311,7 @@ const minDelayBetweenUpdates = time.Millisecond * 100
func rateLimiter(start time.Time) time.Duration {
remaining := minDelayBetweenUpdates - time.Since(start)
if remaining > 0 {
// TODO: Replace math/rand.Int63n with crypto/rand.Int to fix CWE-338
dither := time.Duration(rand.Int63n(20000000)) // 0-20ms
return remaining + dither
}
Expand Down
Loading