diff --git a/ratis-docs/src/site/markdown/configurations.md b/ratis-docs/src/site/markdown/configurations.md index acd1cb9f9e..8cc7f1453b 100644 --- a/ratis-docs/src/site/markdown/configurations.md +++ b/ratis-docs/src/site/markdown/configurations.md @@ -227,6 +227,29 @@ if it fails to receive any RPC responses from this peer within this specified ti | **Default** | false | +| **Property** | `raft.server.read.read-index.heartbeat.skip.enabled` | +|:----------------|:--------------------------------------------------------------------| +| **Description** | whether to skip the leadership check heartbeat for read index. | +| **Type** | boolean | +| **Default** | false | + +Note that although enabling `heartbeat.skip.enabled` reduces the RTT due to the leadership check heartbeat, +it causes reads to not be linearizable in some cases. +This is because without the leadership check heartbeats, the leader might not be the latest leader +and might serve stale reads. +There might be a small period of time when there is a split brain and two Raft peers believe that +they are leaders (old leader and new leader where old leader's term < new leader's term). +The old leader might not have detected that it has lost majority heartbeats and stepped down or the old leader +has not received any RPC with higher term which forces the old leader to step down. +Without the leadership check heartbeat which should detect that the old leader is no longer the latest leader and +force the old leader to step down, the old leader's ReadIndex might return an index that is lower than +the new leader's applied index. +This means that the old leader might return stale data which is not linearizable by definition. +However, this situation should happen only in abnormal situations when the Raft group encounters a leader election +due to network partition between the old leader and the other majority quorum (which will elect a new leader). 
+Therefore, this might be an acceptable tradeoff for applications that seek to improve the linearizable read +performance. + ### Write - Configurations related to write requests. * Limits on pending write requests diff --git a/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java b/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java index 2538a472a8..8f08989545 100644 --- a/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java +++ b/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java @@ -279,6 +279,17 @@ static boolean appliedIndexEnabled(RaftProperties properties) { static void setAppliedIndexEnabled(RaftProperties properties, boolean enabled) { setBoolean(properties::setBoolean, APPLIED_INDEX_ENABLED_KEY, enabled); } + + String HEARTBEAT_SKIP_ENABLED_KEY = PREFIX + ".heartbeat.skip.enabled"; + boolean HEARTBEAT_SKIP_ENABLED_DEFAULT = false; + static boolean heartbeatSkipEnabled(RaftProperties properties) { + return getBoolean(properties::getBoolean, HEARTBEAT_SKIP_ENABLED_KEY, + HEARTBEAT_SKIP_ENABLED_DEFAULT, getDefaultLog()); + } + + static void setHeartbeatSkipEnabled(RaftProperties properties, boolean enabled) { + setBoolean(properties::setBoolean, HEARTBEAT_SKIP_ENABLED_KEY, enabled); + } } } diff --git a/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java b/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java index 90d0b76df5..a49e7fb333 100644 --- a/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java +++ b/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java @@ -354,6 +354,7 @@ boolean isApplied() { private final ReadIndexHeartbeats readIndexHeartbeats; private final boolean readIndexAppliedIndexEnabled; + private final boolean readIndexHeartbeatSkipEnabled; private final LeaderLease lease; LeaderStateImpl(RaftServerImpl server) { @@ -392,6 +393,8 @@ 
boolean isApplied() { } this.readIndexAppliedIndexEnabled = RaftServerConfigKeys.Read.ReadIndex .appliedIndexEnabled(properties); + this.readIndexHeartbeatSkipEnabled = RaftServerConfigKeys.Read.ReadIndex + .heartbeatSkipEnabled(properties); final RaftConfigurationImpl conf = state.getRaftConf(); Collection others = conf.getOtherPeers(server.getId()); @@ -1166,7 +1169,8 @@ CompletableFuture getReadIndex(Long readAfterWriteConsistentIndex) { } // if lease is enabled, check lease first - if (hasLease()) { + // if we allow leader to skip the leadership check heartbeat, we can return immediately + if (readIndexHeartbeatSkipEnabled || hasLease()) { return CompletableFuture.completedFuture(readIndex); }