From 680b7a9c2847404756bcec2e5f7694383511513e Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Wed, 17 Dec 2025 15:33:52 -0500 Subject: [PATCH] osiris_writer: Track replica staleness This introduces a metric calculated at every batch which records the difference between the timestamps of the last chunks in the logs of the writer and its replicas. This can be used to watch replicas catch up on replication, or to diagnose situations when replication is being starved out (network-wise) by high-throughput publishing. The replication diff should usually be low but, for streams seeing traffic, non-zero. This calculation is also used in the stream coordinator when adding a member, via `osiris_writer:query_replication_state/1`. With this change the stream coordinator could be updated to use the counter instead of calling the writer. --- src/osiris_writer.erl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/osiris_writer.erl b/src/osiris_writer.erl index 2ca0c2d9..bf85734d 100644 --- a/src/osiris_writer.erl +++ b/src/osiris_writer.erl @@ -42,11 +42,14 @@ -define(C_COMMITTED_OFFSET, ?C_NUM_LOG_FIELDS + 1). -define(C_READERS, ?C_NUM_LOG_FIELDS + 2). -define(C_EPOCH, ?C_NUM_LOG_FIELDS + 3). +-define(C_REPLICA_STALENESS, ?C_NUM_LOG_FIELDS + 4). -define(ADD_COUNTER_FIELDS, [ {committed_offset, ?C_COMMITTED_OFFSET, counter, "Last committed offset"}, {readers, ?C_READERS, counter, "Number of readers"}, - {epoch, ?C_EPOCH, counter, "Current epoch"} + {epoch, ?C_EPOCH, counter, "Current epoch"}, + {replica_staleness, ?C_REPLICA_STALENESS, gauge, + "Timestamp difference between last chunk written and oldest last chunk of any replica"} ] ). @@ -288,7 +291,14 @@ handle_batch(Commands, LastChId = case osiris_log:tail_info(State2#?MODULE.log) of - {_, {_, TailChId, _TailTs}} -> + {_, {_, TailChId, TailTs}} -> + {Max, Min} = maps:fold(fun(_, {_, Ts}, {Max, Min}) -> + {max(Ts, Max), + min(Ts, Min)} + end, + {TailTs, TailTs}, + State2#?MODULE.replica_state), + counters:put(Cnt, ?C_REPLICA_STALENESS, Max - Min), TailChId; _ -> -1