Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions server/etcdserver/api/v3rpc/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,60 @@ var (
},
[]string{"type", "client_api_version"},
)

// watchSendLoopWatchStreamTime is a histogram of the time, in seconds, that
// serverWatchStream.sendLoop spends handling a single watch-stream response
// (all of its events together). It is exposed as
// etcd_debugging_server_watch_send_loop_watch_stream_time_seconds.
// No Buckets are configured here, so the Prometheus client's default bucket
// layout is used, unlike the sibling send-loop histograms.
watchSendLoopWatchStreamTime = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "server",
Name: "watch_send_loop_watch_stream_time_seconds",
Help: "The total duration in seconds of running through the send loop watch stream response all events.",
},
)

// watchSendLoopWatchStreamTimePerEvent is a histogram of the time, in seconds,
// that serverWatchStream.sendLoop spends on a watch-stream response divided by
// the number of events in that response. It is exposed as
// etcd_debugging_server_watch_send_loop_watch_stream_time_per_event_seconds.
watchSendLoopWatchStreamTimePerEvent = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "server",
Name: "watch_send_loop_watch_stream_time_per_event_seconds",
Help: "The average duration in seconds of running through the send loop watch stream response, per event.",
// 16 exponential buckets: the smallest upper bound is 0.0001 sec (0.1 ms)
// and each following bound doubles, so the largest finite upper bound is
// 0.0001 sec * 2^15 == 3.2768 sec.
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 16),
},
)

// watchSendLoopControlStreamTime is a histogram of the time, in seconds, that
// serverWatchStream.sendLoop spends in its control-stream branch, measured
// from just before the control response is sent on the gRPC stream until the
// end of that branch. It is exposed as
// etcd_debugging_server_watch_send_loop_control_stream_time_seconds.
watchSendLoopControlStreamTime = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "server",
Name: "watch_send_loop_control_stream_time_seconds",
Help: "The total duration in seconds of running through the send loop control stream response.",
// 16 exponential buckets: the smallest upper bound is 0.0001 sec (0.1 ms)
// and each following bound doubles, so the largest finite upper bound is
// 0.0001 sec * 2^15 == 3.2768 sec.
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 16),
},
)

// watchSendLoopProgressTime is a histogram of the time, in seconds, that
// serverWatchStream.sendLoop spends in its progress-ticker branch, i.e. the
// pass over sws.progress performed while holding sws.mu. It is exposed as
// etcd_debugging_server_watch_send_loop_progress_time_seconds.
watchSendLoopProgressTime = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "server",
Name: "watch_send_loop_progress_time_seconds",
Help: "The total duration in seconds of running through the progress loop control stream response.",
// 16 exponential buckets: the smallest upper bound is 0.0001 sec (0.1 ms)
// and each following bound doubles, so the largest finite upper bound is
// 0.0001 sec * 2^15 == 3.2768 sec.
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 16),
},
)
)

// init registers every v3rpc metric collector declared in this file with the
// default Prometheus registry. The collectors are registered in the order
// listed, and MustRegister panics on the first registration that fails.
func init() {
	prometheus.MustRegister(
		sentBytes,
		receivedBytes,
		streamFailures,
		clientRequests,
		watchSendLoopWatchStreamTime,
		watchSendLoopWatchStreamTimePerEvent,
		watchSendLoopControlStreamTime,
		watchSendLoopProgressTime,
	)
}
12 changes: 11 additions & 1 deletion server/etcdserver/api/v3rpc/watch.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ func (sws *serverWatchStream) sendLoop() {
return
}

start := time.Now()
// TODO: evs is []mvccpb.Event type
// either return []*mvccpb.Event from the mvcc package
// or define protocol buffer with []mvccpb.Event.
Expand Down Expand Up @@ -475,11 +476,15 @@ func (sws *serverWatchStream) sendLoop() {
}
sws.mu.Unlock()

// Record how long this watch-stream response took to process in total.
totalDur := time.Since(start)
watchSendLoopWatchStreamTime.Observe(totalDur.Seconds())
// len(evs) can be zero when the response carries no events. Dividing by zero
// on floats does not panic; it yields +Inf (or NaN for a zero duration), and
// observing such a value would leave the histogram's sum non-finite from then
// on. Only record the per-event average when there is at least one event.
if n := len(evs); n > 0 {
	watchSendLoopWatchStreamTimePerEvent.Observe(totalDur.Seconds() / float64(n))
}

case c, ok := <-sws.ctrlStream:
if !ok {
return
}

start := time.Now()
if err := sws.gRPCStream.Send(c); err != nil {
if isClientCtxErr(sws.gRPCStream.Context().Err(), err) {
sws.lg.Debug("failed to send watch control response to gRPC stream", zap.Error(err))
Expand Down Expand Up @@ -517,7 +522,11 @@ func (sws *serverWatchStream) sendLoop() {
delete(pending, wid)
}

watchSendLoopControlStreamTime.Observe(time.Since(start).Seconds())

case <-progressTicker.C:
start := time.Now()

sws.mu.Lock()
for id, ok := range sws.progress {
if ok {
Expand All @@ -526,6 +535,7 @@ func (sws *serverWatchStream) sendLoop() {
sws.progress[id] = true
}
sws.mu.Unlock()
watchSendLoopProgressTime.Observe(time.Since(start).Seconds())

case <-sws.closec:
return
Expand Down
5 changes: 5 additions & 0 deletions server/etcdserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"go.uber.org/zap"

"go.etcd.io/etcd/api/v3/version"
Expand Down Expand Up @@ -162,6 +163,10 @@ var (
)

func init() {
// register more extensive go runtime metrics, for that we need to unregister the default metrics first
prometheus.Unregister(collectors.NewGoCollector())
prometheus.MustRegister(collectors.NewGoCollector(collectors.WithGoCollectorRuntimeMetrics(collectors.MetricsAll)))

prometheus.MustRegister(hasLeader)
prometheus.MustRegister(isLeader)
prometheus.MustRegister(leaderChanges)
Expand Down
2 changes: 2 additions & 0 deletions tests/e2e/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ func TestNoMetricsMissing(t *testing.T) {
"etcd_snap_db_save_total_duration_seconds",
"etcd_snap_fsync_duration_seconds",
"go_gc_duration_seconds",
"go_gc_heap_allocs_by_size_bytes",
"go_gc_gogc_percent",
"go_gc_gomemlimit_bytes",
"go_goroutines",
Expand Down Expand Up @@ -248,6 +249,7 @@ func TestNoMetricsMissing(t *testing.T) {
"go_memstats_stack_sys_bytes",
"go_memstats_sys_bytes",
"go_sched_gomaxprocs_threads",
"go_sched_pauses_stopping_gc_seconds",
"go_threads",
"grpc_server_handled_total",
"grpc_server_msg_received_total",
Expand Down