diff --git a/server/etcdserver/api/v3rpc/metrics.go b/server/etcdserver/api/v3rpc/metrics.go index d79506e07321..9b99af09e960 100644 --- a/server/etcdserver/api/v3rpc/metrics.go +++ b/server/etcdserver/api/v3rpc/metrics.go @@ -52,6 +52,51 @@ var ( }, []string{"type", "client_api_version"}, ) + + watchSendLoopWatchStreamTime = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: "etcd_debugging", + Subsystem: "server", + Name: "watch_send_loop_watch_stream_time_seconds", + Help: "The total duration in seconds of running through the send loop watch stream response all events.", + }, + ) + + watchSendLoopWatchStreamTimePerEvent = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: "etcd_debugging", + Subsystem: "server", + Name: "watch_send_loop_watch_stream_time_per_event_seconds", + Help: "The average duration in seconds of running through the send loop watch stream response, per event.", + // lowest bucket start of upper bound 0.0001 sec (0.1 ms) with factor 2 + // highest bucket start of 0.0001 sec * 2^15 == 3.2768 sec + Buckets: prometheus.ExponentialBuckets(0.0001, 2, 16), + }, + ) + + watchSendLoopControlStreamTime = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: "etcd_debugging", + Subsystem: "server", + Name: "watch_send_loop_control_stream_time_seconds", + Help: "The total duration in seconds of running through the send loop control stream response.", + // lowest bucket start of upper bound 0.0001 sec (0.1 ms) with factor 2 + // highest bucket start of 0.0001 sec * 2^15 == 3.2768 sec + Buckets: prometheus.ExponentialBuckets(0.0001, 2, 16), + }, + ) + + watchSendLoopProgressTime = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: "etcd_debugging", + Subsystem: "server", + Name: "watch_send_loop_progress_time_seconds", + Help: "The total duration in seconds of running through the progress loop control stream response.", + // lowest bucket start of upper bound 0.0001 sec (0.1 ms) with factor 2 + // highest bucket start of 0.0001 sec * 2^15 == 3.2768 sec + Buckets: prometheus.ExponentialBuckets(0.0001, 2, 16), + }, + ) ) func init() { @@ -59,4 +104,8 @@ func init() { prometheus.MustRegister(receivedBytes) prometheus.MustRegister(streamFailures) prometheus.MustRegister(clientRequests) + prometheus.MustRegister(watchSendLoopWatchStreamTime) + prometheus.MustRegister(watchSendLoopWatchStreamTimePerEvent) + prometheus.MustRegister(watchSendLoopControlStreamTime) + prometheus.MustRegister(watchSendLoopProgressTime) } diff --git a/server/etcdserver/api/v3rpc/watch.go b/server/etcdserver/api/v3rpc/watch.go index d4a5bc35141e..31151d6dfb3a 100644 --- a/server/etcdserver/api/v3rpc/watch.go +++ b/server/etcdserver/api/v3rpc/watch.go @@ -405,6 +405,7 @@ func (sws *serverWatchStream) sendLoop() { return } + start := time.Now() // TODO: evs is []mvccpb.Event type // either return []*mvccpb.Event from the mvcc package // or define protocol buffer with []mvccpb.Event. @@ -475,11 +476,15 @@ func (sws *serverWatchStream) sendLoop() { } sws.mu.Unlock() + totalDur := time.Since(start) + watchSendLoopWatchStreamTime.Observe(totalDur.Seconds()) + watchSendLoopWatchStreamTimePerEvent.Observe(totalDur.Seconds() / float64(len(evs))) + case c, ok := <-sws.ctrlStream: if !ok { return } - + start := time.Now() if err := sws.gRPCStream.Send(c); err != nil { if isClientCtxErr(sws.gRPCStream.Context().Err(), err) { sws.lg.Debug("failed to send watch control response to gRPC stream", zap.Error(err)) @@ -517,7 +522,11 @@ func (sws *serverWatchStream) sendLoop() { delete(pending, wid) } + watchSendLoopControlStreamTime.Observe(time.Since(start).Seconds()) + case <-progressTicker.C: + start := time.Now() + sws.mu.Lock() for id, ok := range sws.progress { if ok { @@ -526,6 +535,7 @@ func (sws *serverWatchStream) sendLoop() { sws.progress[id] = true } sws.mu.Unlock() + watchSendLoopProgressTime.Observe(time.Since(start).Seconds()) case <-sws.closec: return diff --git a/server/etcdserver/metrics.go b/server/etcdserver/metrics.go index 7176d30adbc0..52b3f582e22e 100644 --- a/server/etcdserver/metrics.go +++ b/server/etcdserver/metrics.go @@ -19,6 +19,7 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" "go.uber.org/zap" "go.etcd.io/etcd/api/v3/version" @@ -162,6 +163,10 @@ var ( ) func init() { + // register more extensive go runtime metrics, for that we need to unregister the default metrics first + prometheus.Unregister(collectors.NewGoCollector()) + prometheus.MustRegister(collectors.NewGoCollector(collectors.WithGoCollectorRuntimeMetrics(collectors.MetricsAll))) + prometheus.MustRegister(hasLeader) prometheus.MustRegister(isLeader) prometheus.MustRegister(leaderChanges) diff --git a/tests/e2e/metrics_test.go b/tests/e2e/metrics_test.go index c76de746df6c..aa6489c9a795 100644 --- a/tests/e2e/metrics_test.go +++ b/tests/e2e/metrics_test.go @@ -221,6 +221,7 @@ func TestNoMetricsMissing(t *testing.T) { "etcd_snap_db_save_total_duration_seconds", "etcd_snap_fsync_duration_seconds", "go_gc_duration_seconds", + "go_gc_heap_allocs_by_size_bytes", "go_gc_gogc_percent", "go_gc_gomemlimit_bytes", "go_goroutines", @@ -248,6 +249,7 @@ func TestNoMetricsMissing(t *testing.T) { "go_memstats_stack_sys_bytes", "go_memstats_sys_bytes", "go_sched_gomaxprocs_threads", + "go_sched_pauses_stopping_gc_seconds", "go_threads", "grpc_server_handled_total", "grpc_server_msg_received_total",