diff --git a/internal/app/manager.go b/internal/app/manager.go index 865c370..d6af8bf 100644 --- a/internal/app/manager.go +++ b/internal/app/manager.go @@ -71,6 +71,14 @@ func (app *App) stateManager() appState { var switchover Switchover if err := app.dcs.Get(pathCurrentSwitch, &switchover); err == nil { + if !switchover.InitiatedAt.IsZero() && time.Since(switchover.InitiatedAt) > app.config.Valkey.SwitchoverTimeout { + app.logger.Error(fmt.Sprintf("Switchover: %s => %s timed out after %s", switchover.From, switchover.To, time.Since(switchover.InitiatedAt))) + err = app.failSwitchover(&switchover, fmt.Errorf("switchover timed out after %s", time.Since(switchover.InitiatedAt))) + if err != nil { + app.logger.Error("Failed to report switchover timeout", slog.Any("error", err)) + } + return stateManager + } err = app.approveSwitchover(&switchover, activeNodes, shardState) if err != nil { app.logger.Error("Unable to perform switchover", slog.Any("error", err)) diff --git a/internal/app/switchover.go b/internal/app/switchover.go index 49f479e..a0505f3 100644 --- a/internal/app/switchover.go +++ b/internal/app/switchover.go @@ -167,8 +167,7 @@ func (app *App) performSwitchover(shardState map[string]*HostState, activeNodes }, activeNodes) if err, ok := errsRO[oldMaster]; ok && err != nil && shardState[oldMaster].PingOk { - err = fmt.Errorf("failed to set old master %s read-only: %s", oldMaster, err.Error()) - app.logger.Error("Switchover", slog.Any("error", err)) + return fmt.Errorf("failed to set old master %s read-only: %s", oldMaster, err.Error()) } poisonPill, err := app.getPoisonPill() diff --git a/internal/config/config.go b/internal/config/config.go index a13fd99..b63233c 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -37,6 +37,7 @@ type ValkeyConfig struct { StaleReplicaLagOpen time.Duration `yaml:"stale_replica_lag_open"` DestructiveReplicationRepairTimeout time.Duration `yaml:"destructive_replication_repair_timeout"` FailoverCooldown time.Duration `yaml:"failover_cooldown"` + SwitchoverTimeout time.Duration `yaml:"switchover_timeout"` MaxParallelSyncs int `yaml:"max_parallel_syncs"` ClusterBusPort int `yaml:"cluster_bus_port"` ReservedConnections int `yaml:"reserved_connections"` @@ -105,6 +106,7 @@ func DefaultValkeyConfig() ValkeyConfig { StaleReplicaLagOpen: 10 * time.Second, BusyTimeout: 5 * time.Second, DestructiveReplicationRepairTimeout: 30 * time.Minute, + SwitchoverTimeout: 10 * time.Minute, MaxParallelSyncs: 1, ReservedConnections: 10, AllowDataLoss: false,