From 643a7080c19944576d7c6d443e21a0cb5a4487fb Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 14 Jan 2026 11:50:32 -0800 Subject: [PATCH 01/30] Add worker heartbeat support Implement worker heartbeats that report worker status, slot usage, and metrics to the server via the WorkerHeartbeat RPC. Key changes: - Add HeartbeatMetricsHandler to capture metrics needed for heartbeats - Add internal_worker_heartbeat.go for heartbeat worker management - Add hostmetrics package for CPU/memory usage reporting - Plumb heartbeat data through workers (workflow, activity, nexus, local activity) - Add integration tests for worker heartbeat functionality - Fix nil pointer in shutdownWorker when heartbeating is disabled --- contrib/resourcetuner/go.mod | 4 +- contrib/resourcetuner/resourcetuner.go | 137 +-- go.mod | 17 + go.sum | 52 + internal/client.go | 24 +- internal/cmd/build/main.go | 2 + internal/common/metrics/heartbeat_handler.go | 519 +++++++++ internal/internal_nexus_task_poller.go | 6 +- internal/internal_task_pollers.go | 55 +- internal/internal_task_pollers_test.go | 9 +- internal/internal_worker.go | 395 ++++++- internal/internal_worker_base.go | 7 - internal/internal_worker_base_test.go | 6 - internal/internal_worker_heartbeat.go | 143 +++ internal/internal_workers_test.go | 3 - internal/internal_workflow_client.go | 155 ++- internal/tuning.go | 29 + internal/worker.go | 12 + test/go.mod | 2 +- test/integration_test.go | 1 - test/worker_heartbeat_test.go | 997 ++++++++++++++++++ .../hostmetrics}/cgroups.go | 2 +- .../hostmetrics}/cgroups_notlinux.go | 5 +- worker/hostmetrics/hostmetrics.go | 121 +++ worker/hostmetrics/hostmetrics_test.go | 49 + worker/worker.go | 7 + 26 files changed, 2555 insertions(+), 204 deletions(-) create mode 100644 internal/common/metrics/heartbeat_handler.go create mode 100644 internal/internal_worker_heartbeat.go create mode 100644 test/worker_heartbeat_test.go rename {contrib/resourcetuner => worker/hostmetrics}/cgroups.go (99%) rename 
{contrib/resourcetuner => worker/hostmetrics}/cgroups_notlinux.go (87%) create mode 100644 worker/hostmetrics/hostmetrics.go create mode 100644 worker/hostmetrics/hostmetrics_test.go diff --git a/contrib/resourcetuner/go.mod b/contrib/resourcetuner/go.mod index dde1cc877..bc34753c4 100644 --- a/contrib/resourcetuner/go.mod +++ b/contrib/resourcetuner/go.mod @@ -5,8 +5,6 @@ go 1.23.0 toolchain go1.23.6 require ( - github.com/containerd/cgroups/v3 v3.0.3 - github.com/shirou/gopsutil/v4 v4.24.8 github.com/stretchr/testify v1.10.0 go.einride.tech/pid v0.1.3 go.temporal.io/sdk v1.29.1 @@ -14,6 +12,7 @@ require ( require ( github.com/cilium/ebpf v0.11.0 // indirect + github.com/containerd/cgroups/v3 v3.0.3 // indirect github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect @@ -30,6 +29,7 @@ require ( github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/robfig/cron v1.2.0 // indirect + github.com/shirou/gopsutil/v4 v4.24.8 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/stretchr/objx v0.5.2 // indirect diff --git a/contrib/resourcetuner/resourcetuner.go b/contrib/resourcetuner/resourcetuner.go index 1145d3cd7..7e01c7f5e 100644 --- a/contrib/resourcetuner/resourcetuner.go +++ b/contrib/resourcetuner/resourcetuner.go @@ -3,16 +3,14 @@ package resourcetuner import ( "context" "errors" - "runtime" "sync" "time" - "github.com/shirou/gopsutil/v4/cpu" - "github.com/shirou/gopsutil/v4/mem" "go.einride.tech/pid" "go.temporal.io/sdk/client" "go.temporal.io/sdk/log" "go.temporal.io/sdk/worker" + "go.temporal.io/sdk/worker/hostmetrics" ) // Metric names emitted by the resource-based tuner @@ -36,12 +34,30 @@ type ResourceBasedTunerOptions struct { WorkflowRampThrottle time.Duration } +// resourceBasedTuner wraps a 
WorkerTuner and implements HostMetricsProvider +// so the SDK can reuse metrics instead of collecting them twice. +type resourceBasedTuner struct { + worker.WorkerTuner + hostMetrics *hostmetrics.PSUtilSystemInfoSupplier +} + +func (t *resourceBasedTuner) GetCpuUsage() (float64, error) { + return t.hostMetrics.GetCpuUsage() +} + +func (t *resourceBasedTuner) GetMemoryUsage() (float64, error) { + return t.hostMetrics.GetMemoryUsage() +} + // NewResourceBasedTuner creates a WorkerTuner that dynamically adjusts the number of slots based // on system resources. Specify the target CPU and memory usage as a value between 0 and 1. func NewResourceBasedTuner(opts ResourceBasedTunerOptions) (worker.WorkerTuner, error) { + hostMetrics := hostmetrics.NewPSUtilSystemInfoSupplier(nil) + options := DefaultResourceControllerOptions() options.MemTargetPercent = opts.TargetMem options.CpuTargetPercent = opts.TargetCpu + options.InfoSupplier = &hostMetricsInfoSupplier{provider: hostMetrics} controller := NewResourceController(options) wfSS := &ResourceBasedSlotSupplier{controller: controller, options: DefaultWorkflowResourceBasedSlotSupplierOptions()} @@ -72,7 +88,23 @@ func NewResourceBasedTuner(opts ResourceBasedTunerOptions) (worker.WorkerTuner, if err != nil { return nil, err } - return compositeTuner, nil + return &resourceBasedTuner{ + WorkerTuner: compositeTuner, + hostMetrics: hostMetrics, + }, nil +} + +// hostMetricsInfoSupplier adapts hostmetrics.PSUtilSystemInfoSupplier to SystemInfoSupplier +type hostMetricsInfoSupplier struct { + provider *hostmetrics.PSUtilSystemInfoSupplier +} + +func (s *hostMetricsInfoSupplier) GetMemoryUsage(ctx *SystemInfoContext) (float64, error) { + return s.provider.GetMemoryUsageWithLogger(ctx.Logger) +} + +func (s *hostMetricsInfoSupplier) GetCpuUsage(ctx *SystemInfoContext) (float64, error) { + return s.provider.GetCpuUsageWithLogger(ctx.Logger) } // ResourceBasedSlotSupplierOptions configures a particular ResourceBasedSlotSupplier. 
@@ -182,6 +214,9 @@ func (r *ResourceBasedSlotSupplier) ReleaseSlot(worker.SlotReleaseInfo) {} func (r *ResourceBasedSlotSupplier) MaxSlots() int { return 0 } +func (r *ResourceBasedSlotSupplier) Kind() string { + return "ResourceBased" +} // SystemInfoSupplier implementations provide information about system resources. type SystemInfoSupplier interface { @@ -255,13 +290,11 @@ type ResourceController struct { // the controller looks at overall system resources, multiple instances with different configs can // only conflict with one another. func NewResourceController(options ResourceControllerOptions) *ResourceController { - var infoSupplier SystemInfoSupplier - if options.InfoSupplier == nil { - infoSupplier = &psUtilSystemInfoSupplier{ - cGroupInfo: newCGroupInfo(), + infoSupplier := options.InfoSupplier + if infoSupplier == nil { + infoSupplier = &hostMetricsInfoSupplier{ + provider: hostmetrics.NewPSUtilSystemInfoSupplier(nil), } - } else { - infoSupplier = options.InfoSupplier } return &ResourceController{ options: options, @@ -329,87 +362,3 @@ func (rc *ResourceController) publishResourceMetrics(metricsHandler client.Metri metricsHandler.Gauge(resourceSlotsMemUsage).Update(memUsage * 100) metricsHandler.Gauge(resourceSlotsCPUUsage).Update(cpuUsage * 100) } - -type psUtilSystemInfoSupplier struct { - logger log.Logger - mu sync.Mutex - lastRefresh time.Time - - lastMemStat *mem.VirtualMemoryStat - lastCpuUsage float64 - - stopTryingToGetCGroupInfo bool - cGroupInfo cGroupInfo -} - -type cGroupInfo interface { - // Update requests an update of the cgroup stats. This is a no-op if not in a cgroup. Returns - // true if cgroup stats should continue to be updated, false if not in a cgroup or the returned - // error is considered unrecoverable. - Update() (bool, error) - // GetLastMemUsage returns last known memory usage as a fraction of the cgroup limit. 0 if not - // in a cgroup or limit is not set. 
- GetLastMemUsage() float64 - // GetLastCPUUsage returns last known CPU usage as a fraction of the cgroup limit. 0 if not in a - // cgroup or limit is not set. - GetLastCPUUsage() float64 -} - -func (p *psUtilSystemInfoSupplier) GetMemoryUsage(infoContext *SystemInfoContext) (float64, error) { - if err := p.maybeRefresh(infoContext); err != nil { - return 0, err - } - lastCGroupMem := p.cGroupInfo.GetLastMemUsage() - if lastCGroupMem != 0 { - return lastCGroupMem, nil - } - return p.lastMemStat.UsedPercent / 100, nil -} - -func (p *psUtilSystemInfoSupplier) GetCpuUsage(infoContext *SystemInfoContext) (float64, error) { - if err := p.maybeRefresh(infoContext); err != nil { - return 0, err - } - - lastCGroupCPU := p.cGroupInfo.GetLastCPUUsage() - if lastCGroupCPU != 0 { - return lastCGroupCPU, nil - } - return p.lastCpuUsage / 100, nil -} - -func (p *psUtilSystemInfoSupplier) maybeRefresh(infoContext *SystemInfoContext) error { - if time.Since(p.lastRefresh) < 100*time.Millisecond { - return nil - } - p.mu.Lock() - defer p.mu.Unlock() - // Double check refresh is still needed - if time.Since(p.lastRefresh) < 100*time.Millisecond { - return nil - } - ctx, cancelFn := context.WithTimeout(context.Background(), 1*time.Second) - defer cancelFn() - memStat, err := mem.VirtualMemoryWithContext(ctx) - if err != nil { - return err - } - cpuUsage, err := cpu.PercentWithContext(ctx, 0, false) - if err != nil { - return err - } - - p.lastMemStat = memStat - p.lastCpuUsage = cpuUsage[0] - - if runtime.GOOS == "linux" && !p.stopTryingToGetCGroupInfo { - continueUpdates, err := p.cGroupInfo.Update() - if err != nil { - infoContext.Logger.Warn("Failed to get cgroup stats", "error", err) - } - p.stopTryingToGetCGroupInfo = !continueUpdates - } - - p.lastRefresh = time.Now() - return nil -} diff --git a/go.mod b/go.mod index 46648f756..fe0edf0dd 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.23.0 toolchain go1.23.6 require ( + github.com/containerd/cgroups/v3 v3.0.3 
github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a github.com/gogo/protobuf v1.3.2 github.com/golang/mock v1.6.0 @@ -13,6 +14,7 @@ require ( github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.2 github.com/nexus-rpc/sdk-go v0.5.1 github.com/robfig/cron v1.2.0 + github.com/shirou/gopsutil/v4 v4.24.8 github.com/stretchr/testify v1.10.0 go.temporal.io/api v1.59.0 golang.org/x/sync v0.13.0 @@ -23,10 +25,25 @@ require ( ) require ( + github.com/cilium/ebpf v0.11.0 // indirect + github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect + github.com/godbus/dbus/v5 v5.0.4 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/opencontainers/runtime-spec v1.0.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect + github.com/shoenig/go-m1cpu v0.1.6 // indirect + github.com/sirupsen/logrus v1.9.3 // indirect github.com/stretchr/objx v0.5.2 // indirect + github.com/tklauser/go-sysconf v0.3.12 // indirect + github.com/tklauser/numcpus v0.6.1 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect + go.einride.tech/pid v0.1.3 // indirect + go.temporal.io/sdk/contrib/resourcetuner v0.0.0-20260112203102-5b6df8e02dcf // indirect + golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 // indirect golang.org/x/net v0.39.0 // indirect golang.org/x/text v0.24.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240827150818-7e3bb234dfed // indirect diff --git a/go.sum b/go.sum index 2f5906d58..b069bc428 100644 --- a/go.sum +++ b/go.sum @@ -1,11 +1,25 @@ +github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= +github.com/cilium/ebpf v0.11.0/go.mod h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs= +github.com/containerd/cgroups/v3 v3.0.3 
h1:S5ByHZ/h9PMe5IOQoN7E+nMc2UcLEM/V48DGDJ9kip0= +github.com/containerd/cgroups/v3 v3.0.3/go.mod h1:8HBe7V3aWGLFPd/k03swSIsGjZhHI2WzJmticMgVuz0= +github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= +github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a/go.mod h1:7Ga40egUymuWXxAe151lTNnCv97MddSOVsjpPPkityA= +github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= +github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/mock v1.6.0 h1:ErTB+efbowRARo13NNdxyJji2egdxLGQhRaY+DUumQc= github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/uuid v1.6.0 
h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -20,26 +34,58 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/nexus-rpc/sdk-go v0.5.1 h1:UFYYfoHlQc+Pn9gQpmn9QE7xluewAn2AO1OSkAh7YFU= github.com/nexus-rpc/sdk-go v0.5.1/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ= github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/shirou/gopsutil/v4 v4.24.8 h1:pVQjIenQkIhqO81mwTaXjTzOMT7d3TZkf43PlVFHENI= +github.com/shirou/gopsutil/v4 v4.24.8/go.mod h1:wE0OrJtj4dG+hYkxqDH3QiBICdKSf04/npcvLLc/oRg= +github.com/shoenig/go-m1cpu v0.1.6 
h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= +github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= +github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= +github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= +github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +go.einride.tech/pid v0.1.3 
h1:yWAKSmD2Z10jxd4gYFhOjbBNqXeIQwAtnCO/XKCT7sQ= +go.einride.tech/pid v0.1.3/go.mod h1:33JSUbKrH/4v8DZf/0K8IC8Enjd92wB2birp+bCYQso= go.temporal.io/api v1.59.0 h1:QUpAju1KKs9xBfGSI0Uwdyg06k6dRCJH+Zm3G1Jc9Vk= go.temporal.io/api v1.59.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +go.temporal.io/sdk/contrib/resourcetuner v0.0.0-20260112203102-5b6df8e02dcf h1:hfa3sOvh1ZoC2SH5FKA5UdivU5X3AjARYEzSUy0ObUc= +go.temporal.io/sdk/contrib/resourcetuner v0.0.0-20260112203102-5b6df8e02dcf/go.mod h1:UvUEaYWquPBbehPnQJ6St0iDJVDV1HFXRSicol9Z+ek= +go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= +go.uber.org/goleak v1.1.12/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 h1:aAcj0Da7eBAtrTp03QXWvm88pSyOt+UgdZw2BFZ+lEw= +golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8/go.mod h1:CQ1k9gNrJ50XIzaKCRR2hssIjF07kZFEiieALBM/ARQ= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -58,10 +104,15 @@ golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys 
v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -91,5 +142,6 @@ google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/client.go b/internal/client.go index 24c7f6737..6a1a2b905 100644 --- a/internal/client.go +++ b/internal/client.go @@ -4,6 +4,7 @@ import ( "context" "crypto/tls" "fmt" + "github.com/google/uuid" "sync/atomic" "time" @@ -533,6 +534,13 @@ type ( // // 
NOTE: Experimental Plugins []ClientPlugin + + // WorkerHeartbeatInterval is the interval at which the worker will send heartbeats to the server. + // + // default: 60s. To disable, set to 0. + // + // NOTE: Experimental + WorkerHeartbeatInterval *time.Duration } // HeadersProvider returns a map of gRPC headers that should be used on every request. @@ -1132,7 +1140,9 @@ func NewServiceClient(workflowServiceClient workflowservice.WorkflowServiceClien // Collect set of applicable worker plugins and interceptors var workerPlugins []WorkerPlugin + var clientPluginNames []string for _, plugin := range options.Plugins { + clientPluginNames = append(clientPluginNames, plugin.Name()) if workerPlugin, _ := plugin.(WorkerPlugin); workerPlugin != nil { workerPlugins = append(workerPlugins, workerPlugin) } @@ -1144,6 +1154,15 @@ func NewServiceClient(workflowServiceClient workflowservice.WorkflowServiceClien } } + var heartbeatInterval time.Duration + if options.WorkerHeartbeatInterval == nil { + heartbeatInterval = time.Second * 60 + } else if *options.WorkerHeartbeatInterval == 0 { + heartbeatInterval = time.Second * 0 + } else { + heartbeatInterval = *options.WorkerHeartbeatInterval + } + client := &WorkflowClient{ workflowService: workflowServiceClient, conn: conn, @@ -1157,11 +1176,14 @@ func NewServiceClient(workflowServiceClient workflowservice.WorkflowServiceClien contextPropagators: options.ContextPropagators, workerPlugins: workerPlugins, workerInterceptors: workerInterceptors, + clientPluginNames: clientPluginNames, excludeInternalFromRetry: options.ConnectionOptions.excludeInternalFromRetry, eagerDispatcher: &eagerWorkflowDispatcher{ workersByTaskQueue: make(map[string]map[eagerWorker]struct{}), }, - getSystemInfoTimeout: options.ConnectionOptions.GetSystemInfoTimeout, + getSystemInfoTimeout: options.ConnectionOptions.GetSystemInfoTimeout, + workerHeartbeatInterval: heartbeatInterval, + workerGroupingKey: uuid.NewString(), } // Create outbound interceptor by 
wrapping backwards through chain diff --git a/internal/cmd/build/main.go b/internal/cmd/build/main.go index ad17d4977..04086145d 100644 --- a/internal/cmd/build/main.go +++ b/internal/cmd/build/main.go @@ -155,6 +155,8 @@ func (b *builder) integrationTest() error { "--dynamic-config-value", `system.refreshNexusEndpointsMinWait="0s"`, // Make Nexus tests faster "--dynamic-config-value", `component.nexusoperations.recordCancelRequestCompletionEvents=true`, // Defaults to false until after OSS 1.28 is released "--dynamic-config-value", `history.enableRequestIdRefLinks=true`, + "--dynamic-config-value", "frontend.WorkerHeartbeatsEnabled=true", + "--dynamic-config-value", "frontend.ListWorkersEnabled=true", }, }) if err != nil { diff --git a/internal/common/metrics/heartbeat_handler.go b/internal/common/metrics/heartbeat_handler.go new file mode 100644 index 000000000..17b439d03 --- /dev/null +++ b/internal/common/metrics/heartbeat_handler.go @@ -0,0 +1,519 @@ +package metrics + +import ( + "sync/atomic" + "time" +) + +// HeartbeatMetricsHandler wraps a metrics handler and captures specific metrics +// in memory that are needed for worker heartbeats +type HeartbeatMetricsHandler struct { + underlying Handler + + // Current worker type tag for this handler instance (set via WithTags) + workerType string + + stickyCacheHit *atomic.Uint64 + stickyCacheMiss *atomic.Uint64 + stickyCacheSize *atomic.Uint64 + + workflowTaskFailures *atomic.Uint64 + activityTaskFailures *atomic.Uint64 + localActivityTaskFailures *atomic.Uint64 + nexusTaskFailures *atomic.Uint64 + + workflowSlotsAvailable *atomic.Uint64 + workflowSlotsUsed *atomic.Uint64 + activitySlotsAvailable *atomic.Uint64 + activitySlotsUsed *atomic.Uint64 + localActivitySlotsAvailable *atomic.Uint64 + localActivitySlotsUsed *atomic.Uint64 + nexusSlotsAvailable *atomic.Uint64 + nexusSlotsUsed *atomic.Uint64 + + // Task processed counters (per worker type) - incremented each time execution latency is recorded + 
workflowTasksProcessed *atomic.Uint64 + activityTasksProcessed *atomic.Uint64 + localActivityTasksProcessed *atomic.Uint64 + nexusTasksProcessed *atomic.Uint64 + + // Current poller type tag for this handler instance (set via WithTags) + pollerType string + + workflowPollerCount *atomic.Uint64 + workflowStickyPollerCount *atomic.Uint64 + activityPollerCount *atomic.Uint64 + nexusPollerCount *atomic.Uint64 + + // Last successful poll times (per poller type) - stored as Unix nanoseconds + // NOTE: These are only kept in memory, there is no corresponding metric exported for these. + workflowLastPoll *atomic.Int64 + workflowStickyLastPoll *atomic.Int64 + activityLastPoll *atomic.Int64 + nexusLastPoll *atomic.Int64 +} + +// NewHeartbeatMetricsHandler creates a new handler that captures specific metrics +// for worker heartbeats while passing all metrics to the underlying handler. +func NewHeartbeatMetricsHandler(underlying Handler) *HeartbeatMetricsHandler { + return &HeartbeatMetricsHandler{ + underlying: underlying, + + stickyCacheHit: new(atomic.Uint64), + stickyCacheMiss: new(atomic.Uint64), + stickyCacheSize: new(atomic.Uint64), + + workflowTaskFailures: new(atomic.Uint64), + activityTaskFailures: new(atomic.Uint64), + localActivityTaskFailures: new(atomic.Uint64), + nexusTaskFailures: new(atomic.Uint64), + + workflowSlotsAvailable: new(atomic.Uint64), + workflowSlotsUsed: new(atomic.Uint64), + activitySlotsAvailable: new(atomic.Uint64), + activitySlotsUsed: new(atomic.Uint64), + localActivitySlotsAvailable: new(atomic.Uint64), + localActivitySlotsUsed: new(atomic.Uint64), + nexusSlotsAvailable: new(atomic.Uint64), + nexusSlotsUsed: new(atomic.Uint64), + + workflowTasksProcessed: new(atomic.Uint64), + activityTasksProcessed: new(atomic.Uint64), + localActivityTasksProcessed: new(atomic.Uint64), + nexusTasksProcessed: new(atomic.Uint64), + + workflowPollerCount: new(atomic.Uint64), + workflowStickyPollerCount: new(atomic.Uint64), + activityPollerCount: 
new(atomic.Uint64), + nexusPollerCount: new(atomic.Uint64), + + workflowLastPoll: new(atomic.Int64), + workflowStickyLastPoll: new(atomic.Int64), + activityLastPoll: new(atomic.Int64), + nexusLastPoll: new(atomic.Int64), + } +} + +func (h *HeartbeatMetricsHandler) WithTags(tags map[string]string) Handler { + // Track the worker type if present in tags + workerType := h.workerType + if wt, ok := tags[WorkerTypeTagName]; ok { + workerType = wt + } + + // Track the poller type if present in tags + pollerType := h.pollerType + if pt, ok := tags[PollerTypeTagName]; ok { + pollerType = pt + } + + return &HeartbeatMetricsHandler{ + underlying: h.underlying.WithTags(tags), + workerType: workerType, + pollerType: pollerType, + + stickyCacheHit: h.stickyCacheHit, + stickyCacheMiss: h.stickyCacheMiss, + stickyCacheSize: h.stickyCacheSize, + + workflowTaskFailures: h.workflowTaskFailures, + activityTaskFailures: h.activityTaskFailures, + localActivityTaskFailures: h.localActivityTaskFailures, + nexusTaskFailures: h.nexusTaskFailures, + + workflowSlotsAvailable: h.workflowSlotsAvailable, + workflowSlotsUsed: h.workflowSlotsUsed, + activitySlotsAvailable: h.activitySlotsAvailable, + activitySlotsUsed: h.activitySlotsUsed, + localActivitySlotsAvailable: h.localActivitySlotsAvailable, + localActivitySlotsUsed: h.localActivitySlotsUsed, + nexusSlotsAvailable: h.nexusSlotsAvailable, + nexusSlotsUsed: h.nexusSlotsUsed, + + workflowTasksProcessed: h.workflowTasksProcessed, + activityTasksProcessed: h.activityTasksProcessed, + localActivityTasksProcessed: h.localActivityTasksProcessed, + nexusTasksProcessed: h.nexusTasksProcessed, + + workflowPollerCount: h.workflowPollerCount, + workflowStickyPollerCount: h.workflowStickyPollerCount, + activityPollerCount: h.activityPollerCount, + nexusPollerCount: h.nexusPollerCount, + + workflowLastPoll: h.workflowLastPoll, + workflowStickyLastPoll: h.workflowStickyLastPoll, + activityLastPoll: h.activityLastPoll, + nexusLastPoll: h.nexusLastPoll, + 
} +} + +func (h *HeartbeatMetricsHandler) Counter(name string) Counter { + underlying := h.underlying.Counter(name) + + switch name { + case StickyCacheHit: + return &capturingCounter{ + underlying: underlying, + value: h.stickyCacheHit, + } + case StickyCacheMiss: + return &capturingCounter{ + underlying: underlying, + value: h.stickyCacheMiss, + } + case WorkflowTaskExecutionFailureCounter: + return &capturingCounter{ + underlying: underlying, + value: h.workflowTaskFailures, + } + case ActivityExecutionFailedCounter: + return &capturingCounter{ + underlying: underlying, + value: h.activityTaskFailures, + } + case LocalActivityExecutionFailedCounter: + return &capturingCounter{ + underlying: underlying, + value: h.localActivityTaskFailures, + } + case NexusTaskExecutionFailedCounter: + return &capturingCounter{ + underlying: underlying, + value: h.nexusTaskFailures, + } + default: + return underlying + } +} + +func (h *HeartbeatMetricsHandler) Gauge(name string) Gauge { + underlying := h.underlying.Gauge(name) + + switch name { + case StickyCacheSize: + return &capturingGauge{ + underlying: underlying, + value: h.stickyCacheSize, + } + case WorkerTaskSlotsAvailable: + var valuePtr *atomic.Uint64 + switch h.workerType { + case "WorkflowWorker": + valuePtr = h.workflowSlotsAvailable + case "ActivityWorker": + valuePtr = h.activitySlotsAvailable + case "LocalActivityWorker": + valuePtr = h.localActivitySlotsAvailable + case "NexusWorker": + valuePtr = h.nexusSlotsAvailable + } + if valuePtr != nil { + return &capturingGauge{ + underlying: underlying, + value: valuePtr, + } + } + case WorkerTaskSlotsUsed: + var valuePtr *atomic.Uint64 + switch h.workerType { + case "WorkflowWorker": + valuePtr = h.workflowSlotsUsed + case "ActivityWorker": + valuePtr = h.activitySlotsUsed + case "LocalActivityWorker": + valuePtr = h.localActivitySlotsUsed + case "NexusWorker": + valuePtr = h.nexusSlotsUsed + } + if valuePtr != nil { + return &capturingGauge{ + underlying: underlying, 
+ value: valuePtr, + } + } + case NumPoller: + var valuePtr *atomic.Uint64 + switch h.pollerType { + case PollerTypeWorkflowTask: + valuePtr = h.workflowPollerCount + case PollerTypeWorkflowStickyTask: + valuePtr = h.workflowStickyPollerCount + case PollerTypeActivityTask: + valuePtr = h.activityPollerCount + case PollerTypeNexusTask: + valuePtr = h.nexusPollerCount + } + if valuePtr != nil { + return &capturingGauge{ + underlying: underlying, + value: valuePtr, + } + } + } + + return underlying +} + +func (h *HeartbeatMetricsHandler) Timer(name string) Timer { + underlying := h.underlying.Timer(name) + + // Capture execution latency timers to count processed tasks + switch name { + case WorkflowTaskExecutionLatency: + return &capturingTimer{ + underlying: underlying, + counter: h.workflowTasksProcessed, + } + case ActivityExecutionLatency: + return &capturingTimer{ + underlying: underlying, + counter: h.activityTasksProcessed, + } + case LocalActivityExecutionLatency: + return &capturingTimer{ + underlying: underlying, + counter: h.localActivityTasksProcessed, + } + case NexusTaskExecutionLatency: + return &capturingTimer{ + underlying: underlying, + counter: h.nexusTasksProcessed, + } + } + + return underlying +} + +// GetStickyCacheHit returns the total number of sticky cache hits. +func (h *HeartbeatMetricsHandler) GetStickyCacheHit() int32 { + return int32(h.stickyCacheHit.Load()) +} + +// GetStickyCacheMiss returns the total number of sticky cache misses. +func (h *HeartbeatMetricsHandler) GetStickyCacheMiss() int32 { + return int32(h.stickyCacheMiss.Load()) +} + +// GetStickyCacheSize returns the current sticky cache size. +func (h *HeartbeatMetricsHandler) GetStickyCacheSize() int32 { + return int32(h.stickyCacheSize.Load()) +} + +// GetWorkflowTaskFailures returns the total number of workflow task failures. 
+func (h *HeartbeatMetricsHandler) GetWorkflowTaskFailures() int64 { + return int64(h.workflowTaskFailures.Load()) +} + +// GetActivityTaskFailures returns the total number of activity task failures. +func (h *HeartbeatMetricsHandler) GetActivityTaskFailures() int64 { + return int64(h.activityTaskFailures.Load()) +} + +// GetLocalActivityTaskFailures returns the total number of local activity task failures. +func (h *HeartbeatMetricsHandler) GetLocalActivityTaskFailures() int64 { + return int64(h.localActivityTaskFailures.Load()) +} + +// GetNexusTaskFailures returns the total number of nexus task failures. +func (h *HeartbeatMetricsHandler) GetNexusTaskFailures() int64 { + return int64(h.nexusTaskFailures.Load()) +} + +// GetWorkflowSlotsAvailable returns the current workflow slots available. +func (h *HeartbeatMetricsHandler) GetWorkflowSlotsAvailable() int32 { + return int32(h.workflowSlotsAvailable.Load()) +} + +// GetWorkflowSlotsUsed returns the current workflow slots used. +func (h *HeartbeatMetricsHandler) GetWorkflowSlotsUsed() int32 { + return int32(h.workflowSlotsUsed.Load()) +} + +// GetActivitySlotsAvailable returns the current activity slots available. +func (h *HeartbeatMetricsHandler) GetActivitySlotsAvailable() int32 { + return int32(h.activitySlotsAvailable.Load()) +} + +// GetActivitySlotsUsed returns the current activity slots used. +func (h *HeartbeatMetricsHandler) GetActivitySlotsUsed() int32 { + return int32(h.activitySlotsUsed.Load()) +} + +// GetLocalActivitySlotsAvailable returns the current local activity slots available. +func (h *HeartbeatMetricsHandler) GetLocalActivitySlotsAvailable() int32 { + return int32(h.localActivitySlotsAvailable.Load()) +} + +// GetLocalActivitySlotsUsed returns the current local activity slots used. +func (h *HeartbeatMetricsHandler) GetLocalActivitySlotsUsed() int32 { + return int32(h.localActivitySlotsUsed.Load()) +} + +// GetNexusSlotsAvailable returns the current nexus slots available. 
+func (h *HeartbeatMetricsHandler) GetNexusSlotsAvailable() int32 { + return int32(h.nexusSlotsAvailable.Load()) +} + +// GetNexusSlotsUsed returns the current nexus slots used. +func (h *HeartbeatMetricsHandler) GetNexusSlotsUsed() int32 { + return int32(h.nexusSlotsUsed.Load()) +} + +// GetWorkflowTasksProcessed returns the total number of workflow tasks processed. +func (h *HeartbeatMetricsHandler) GetWorkflowTasksProcessed() int64 { + return int64(h.workflowTasksProcessed.Load()) +} + +// GetActivityTasksProcessed returns the total number of activity tasks processed. +func (h *HeartbeatMetricsHandler) GetActivityTasksProcessed() int64 { + return int64(h.activityTasksProcessed.Load()) +} + +// GetLocalActivityTasksProcessed returns the total number of local activity tasks processed. +func (h *HeartbeatMetricsHandler) GetLocalActivityTasksProcessed() int64 { + return int64(h.localActivityTasksProcessed.Load()) +} + +// GetNexusTasksProcessed returns the total number of nexus tasks processed. +func (h *HeartbeatMetricsHandler) GetNexusTasksProcessed() int64 { + return int64(h.nexusTasksProcessed.Load()) +} + +// GetWorkflowPollerCount returns the current number of workflow task pollers. +func (h *HeartbeatMetricsHandler) GetWorkflowPollerCount() int32 { + return int32(h.workflowPollerCount.Load()) +} + +// GetWorkflowStickyPollerCount returns the current number of workflow sticky task pollers. +func (h *HeartbeatMetricsHandler) GetWorkflowStickyPollerCount() int32 { + return int32(h.workflowStickyPollerCount.Load()) +} + +// GetActivityPollerCount returns the current number of activity task pollers. +func (h *HeartbeatMetricsHandler) GetActivityPollerCount() int32 { + return int32(h.activityPollerCount.Load()) +} + +// GetNexusPollerCount returns the current number of nexus task pollers. 
+func (h *HeartbeatMetricsHandler) GetNexusPollerCount() int32 { + return int32(h.nexusPollerCount.Load()) +} + +// RecordWorkflowPollSuccess records a successful workflow task poll. +func (h *HeartbeatMetricsHandler) RecordWorkflowPollSuccess() { + h.workflowLastPoll.Store(time.Now().UnixNano()) +} + +// RecordWorkflowStickyPollSuccess records a successful workflow sticky task poll. +func (h *HeartbeatMetricsHandler) RecordWorkflowStickyPollSuccess() { + h.workflowStickyLastPoll.Store(time.Now().UnixNano()) +} + +// RecordActivityPollSuccess records a successful activity task poll. +func (h *HeartbeatMetricsHandler) RecordActivityPollSuccess() { + h.activityLastPoll.Store(time.Now().UnixNano()) +} + +// RecordNexusPollSuccess records a successful nexus task poll. +func (h *HeartbeatMetricsHandler) RecordNexusPollSuccess() { + h.nexusLastPoll.Store(time.Now().UnixNano()) +} + +// GetWorkflowLastPollTime returns the last successful workflow task poll time. +func (h *HeartbeatMetricsHandler) GetWorkflowLastPollTime() time.Time { + nanos := h.workflowLastPoll.Load() + if nanos == 0 { + return time.Time{} + } + return time.Unix(0, nanos) +} + +// GetWorkflowStickyLastPollTime returns the last successful workflow sticky task poll time. +func (h *HeartbeatMetricsHandler) GetWorkflowStickyLastPollTime() time.Time { + nanos := h.workflowStickyLastPoll.Load() + if nanos == 0 { + return time.Time{} + } + return time.Unix(0, nanos) +} + +// GetActivityLastPollTime returns the last successful activity task poll time. +func (h *HeartbeatMetricsHandler) GetActivityLastPollTime() time.Time { + nanos := h.activityLastPoll.Load() + if nanos == 0 { + return time.Time{} + } + return time.Unix(0, nanos) +} + +// GetNexusLastPollTime returns the last successful nexus task poll time. 
+func (h *HeartbeatMetricsHandler) GetNexusLastPollTime() time.Time { + nanos := h.nexusLastPoll.Load() + if nanos == 0 { + return time.Time{} + } + return time.Unix(0, nanos) +} + +// PollSuccessRecorder is an optional interface for recording successful poll times. +type PollSuccessRecorder interface { + RecordWorkflowPollSuccess() + RecordWorkflowStickyPollSuccess() + RecordActivityPollSuccess() + RecordNexusPollSuccess() +} + +// RecordPollSuccess records a successful poll time if the handler supports it. +// pollerType should be one of PollerTypeWorkflowTask, PollerTypeWorkflowStickyTask, +// PollerTypeActivityTask, or PollerTypeNexusTask. +func RecordPollSuccess(h Handler, pollerType string) { + recorder, ok := h.(PollSuccessRecorder) + if !ok { + return + } + switch pollerType { + case PollerTypeWorkflowTask: + recorder.RecordWorkflowPollSuccess() + case PollerTypeWorkflowStickyTask: + recorder.RecordWorkflowStickyPollSuccess() + case PollerTypeActivityTask: + recorder.RecordActivityPollSuccess() + case PollerTypeNexusTask: + recorder.RecordNexusPollSuccess() + } +} + +// capturingCounter wraps a counter and captures its value in memory for heartbeat reporting. +type capturingCounter struct { + underlying Counter + value *atomic.Uint64 +} + +func (c *capturingCounter) Inc(delta int64) { + c.underlying.Inc(delta) + if delta > 0 { + c.value.Add(uint64(delta)) + } +} + +// capturingGauge wraps a gauge and captures its value in memory for heartbeat reporting. +type capturingGauge struct { + underlying Gauge + value *atomic.Uint64 +} + +func (g *capturingGauge) Update(f float64) { + g.underlying.Update(f) + g.value.Store(uint64(f)) +} + +// capturingTimer wraps a timer and increments a counter each time Record is called. 
+type capturingTimer struct { + underlying Timer + counter *atomic.Uint64 +} + +func (t *capturingTimer) Record(d time.Duration) { + t.underlying.Record(d) + t.counter.Add(1) +} diff --git a/internal/internal_nexus_task_poller.go b/internal/internal_nexus_task_poller.go index d7e17b544..a98e484e3 100644 --- a/internal/internal_nexus_task_poller.go +++ b/internal/internal_nexus_task_poller.go @@ -90,11 +90,9 @@ func (ntp *nexusTaskPoller) poll(ctx context.Context) (taskForWorker, error) { return nil, nil } - return &nexusTask{task: response}, nil -} + metrics.RecordPollSuccess(ntp.metricsHandler, metrics.PollerTypeNexusTask) -func (ntp *nexusTaskPoller) Cleanup() error { - return nil + return &nexusTask{task: response}, nil } // PollTask polls a new task diff --git a/internal/internal_task_pollers.go b/internal/internal_task_pollers.go index dba946879..67afb18b6 100644 --- a/internal/internal_task_pollers.go +++ b/internal/internal_task_pollers.go @@ -13,13 +13,10 @@ import ( "google.golang.org/protobuf/types/known/durationpb" "google.golang.org/protobuf/types/known/wrapperspb" - "github.com/google/uuid" - commonpb "go.temporal.io/api/common/v1" deploymentpb "go.temporal.io/api/deployment/v1" enumspb "go.temporal.io/api/enums/v1" historypb "go.temporal.io/api/history/v1" - "go.temporal.io/api/serviceerror" taskqueuepb "go.temporal.io/api/taskqueue/v1" "go.temporal.io/api/workflowservice/v1" @@ -53,9 +50,6 @@ type ( taskPoller interface { // PollTask polls for one new task PollTask() (taskForWorker, error) - // Called when the poller will no longer be polled. Presently only useful for - // workflow workers. 
- Cleanup() error } // taskProcessor interface to process tasks @@ -315,6 +309,7 @@ func newWorkflowTaskProcessor( contextManager WorkflowContextManager, service workflowservice.WorkflowServiceClient, params workerExecutionParameters, + stickyUUID string, ) *workflowTaskProcessor { return &workflowTaskProcessor{ basePoller: basePoller{ @@ -334,7 +329,7 @@ func newWorkflowTaskProcessor( logger: params.Logger, dataConverter: params.DataConverter, failureConverter: params.FailureConverter, - stickyUUID: uuid.NewString(), + stickyUUID: stickyUUID, StickyScheduleToStartTimeout: params.StickyScheduleToStartTimeout, stickyCacheSize: params.cache.MaxWorkflowCacheSize(), eagerActivityExecutor: params.eagerActivityExecutor, @@ -343,36 +338,6 @@ func newWorkflowTaskProcessor( } } -// Best-effort attempt to indicate to Matching service that this workflow task -// poller's sticky queue will no longer be polled. Should be called when the -// poller is stopping. Failure to call ShutdownWorker is logged, but otherwise -// ignored. -func (wtp *workflowTaskPoller) Cleanup() error { - ctx := context.Background() - grpcCtx, cancel := newGRPCContext(ctx, grpcMetricsHandler(wtp.metricsHandler)) - defer cancel() - - _, err := wtp.service.ShutdownWorker(grpcCtx, &workflowservice.ShutdownWorkerRequest{ - Namespace: wtp.namespace, - StickyTaskQueue: getWorkerTaskQueue(wtp.stickyUUID), - Identity: wtp.identity, - Reason: "graceful shutdown", - }) - - // we ignore unimplemented - if _, isUnimplemented := err.(*serviceerror.Unimplemented); isUnimplemented { - return nil - } - - if err != nil { - traceLog(func() { - wtp.logger.Debug("ShutdownWorker failed.", tagError, err) - }) - } - - return err -} - // PollTask polls a new task func (wtp *workflowTaskPoller) PollTask() (taskForWorker, error) { // Get the task. 
@@ -737,10 +702,6 @@ func newLocalActivityPoller( } } -func (latp *localActivityTaskPoller) Cleanup() error { - return nil -} - func (latp *localActivityTaskPoller) PollTask() (taskForWorker, error) { return latp.laTunnel.getTask(), nil } @@ -1008,6 +969,12 @@ func (wtp *workflowTaskPoller) poll(ctx context.Context) (taskForWorker, error) return &workflowTask{}, nil } + if request.TaskQueue.GetKind() == enumspb.TASK_QUEUE_KIND_STICKY { + metrics.RecordPollSuccess(wtp.metricsHandler, metrics.PollerTypeWorkflowStickyTask) + } else { + metrics.RecordPollSuccess(wtp.metricsHandler, metrics.PollerTypeWorkflowTask) + } + wtp.updateBacklog(request.TaskQueue.GetKind(), response.GetBacklogCountHint()) task := wtp.toWorkflowTask(response) @@ -1206,6 +1173,8 @@ func (atp *activityTaskPoller) poll(ctx context.Context) (taskForWorker, error) return &activityTask{}, nil } + metrics.RecordPollSuccess(atp.metricsHandler, metrics.PollerTypeActivityTask) + workflowType := response.WorkflowType.GetName() activityType := response.ActivityType.GetName() metricsHandler := atp.metricsHandler.WithTags(metrics.ActivityTags(workflowType, activityType, atp.taskQueueName)) @@ -1216,10 +1185,6 @@ func (atp *activityTaskPoller) poll(ctx context.Context) (taskForWorker, error) return &activityTask{task: response}, nil } -func (atp *activityTaskPoller) Cleanup() error { - return nil -} - // PollTask polls a new task func (atp *activityTaskPoller) PollTask() (taskForWorker, error) { // Get the task. 
diff --git a/internal/internal_task_pollers_test.go b/internal/internal_task_pollers_test.go index af70a53f6..8fde8791e 100644 --- a/internal/internal_task_pollers_test.go +++ b/internal/internal_task_pollers_test.go @@ -4,6 +4,7 @@ import ( "context" "encoding/binary" "errors" + "github.com/google/uuid" "sync/atomic" "testing" "time" @@ -97,7 +98,7 @@ func TestWFTRacePrevention(t *testing.T) { return &workflowservice.RespondWorkflowTaskFailedResponse{}, nil }) - poller := newWorkflowTaskProcessor(taskHandler, contextManager, client, params) + poller := newWorkflowTaskProcessor(taskHandler, contextManager, client, params, uuid.NewString()) t.Log("Issue task0") go func() { resultsChan <- poller.processWorkflowTask(&task0) }() @@ -188,7 +189,7 @@ func TestWFTCorruption(t *testing.T) { return nil, errors.New("Failure responding to workflow task") }) - poller := newWorkflowTaskProcessor(taskHandler, contextManager, client, params) + poller := newWorkflowTaskProcessor(taskHandler, contextManager, client, params, uuid.NewString()) processTaskDone := make(chan struct{}) go func() { require.Error(t, poller.processWorkflowTask(&task0)) @@ -329,7 +330,7 @@ func TestWFTReset(t *testing.T) { client.EXPECT().RespondWorkflowTaskCompleted(gomock.Any(), gomock.Any()). 
Return(&workflowservice.RespondWorkflowTaskCompletedResponse{}, nil) - poller := newWorkflowTaskProcessor(taskHandler, contextManager, client, params) + poller := newWorkflowTaskProcessor(taskHandler, contextManager, client, params, uuid.NewString()) // Send a full history as part of the speculative WFT require.NoError(t, poller.processWorkflowTask(&task0)) originalCachedExecution := cache.getWorkflowContext(runID) @@ -403,7 +404,7 @@ func TestWFTPanicInTaskHandler(t *testing.T) { task0 = workflowTask{task: &pollResp0} ) - poller := newWorkflowTaskProcessor(taskHandler, contextManager, client, params) + poller := newWorkflowTaskProcessor(taskHandler, contextManager, client, params, uuid.NewString()) require.Error(t, poller.processWorkflowTask(&task0)) // Workflow should not be in cache require.Nil(t, cache.getWorkflowContext(runID)) diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 7b45787c1..13f326fe7 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -6,6 +6,9 @@ import ( "context" "errors" "fmt" + workerpb "go.temporal.io/api/worker/v1" + "google.golang.org/protobuf/types/known/durationpb" + "google.golang.org/protobuf/types/known/timestamppb" "io" "math" "os" @@ -24,6 +27,7 @@ import ( deploymentpb "go.temporal.io/api/deployment/v1" enumspb "go.temporal.io/api/enums/v1" historypb "go.temporal.io/api/history/v1" + "go.temporal.io/api/serviceerror" "go.temporal.io/api/temporalproto" "go.temporal.io/api/workflowservice/v1" "go.temporal.io/api/workflowservicemock/v1" @@ -35,6 +39,7 @@ import ( "go.temporal.io/sdk/internal/common/util" ilog "go.temporal.io/sdk/internal/log" "go.temporal.io/sdk/log" + "go.temporal.io/sdk/worker/hostmetrics" ) const ( @@ -81,6 +86,7 @@ type ( identity string stopC chan struct{} localActivityStopC chan struct{} + stickyUUID string // Used for ShutdownWorker call } // ActivityWorker wraps the code for hosting activity types. 
@@ -223,6 +229,22 @@ type ( // The build id specific to this worker BuildID string } + + // workerHeartbeatManager includes all information needed to report worker heartbeats. + workerHeartbeatManager struct { + heartbeatWorker *sharedNamespaceWorker + heartbeatMetrics *metrics.HeartbeatMetricsHandler + heartbeatCallback func() *workerpb.WorkerHeartbeat + + // Slot suppliers for heartbeat reporting + workflowTaskSlotSupplier *trackingSlotSupplier + activityTaskSlotSupplier *trackingSlotSupplier + localActivitySlotSupplier *trackingSlotSupplier + nexusTaskSlotSupplier *trackingSlotSupplier + + // Host metrics provider for CPU/memory reporting in heartbeats + hostMetricsProvider HostMetricsProvider + } ) var debugMode = os.Getenv("TEMPORAL_DEBUG") != "" @@ -322,7 +344,9 @@ func newWorkflowTaskWorkerInternal( if client != nil { service = client.workflowService } - taskProcessor := newWorkflowTaskProcessor(taskHandler, contextManager, service, params) + // Generate stickyUUID here so it can be stored in workflowWorker for ShutdownWorker call + stickyUUID := uuid.NewString() + taskProcessor := newWorkflowTaskProcessor(taskHandler, contextManager, service, params, stickyUUID) var scalableTaskPollers []scalableTaskPoller switch params.WorkflowTaskPollerBehavior.(type) { @@ -412,6 +436,7 @@ func newWorkflowTaskWorkerInternal( identity: params.Identity, stopC: stopC, localActivityStopC: laStopChannel, + stickyUUID: stickyUUID, } } @@ -1175,6 +1200,8 @@ type AggregatedWorker struct { workerInstanceKey string plugins []WorkerPlugin pluginRegistryOptions *WorkerPluginConfigureWorkerRegistryOptions // Never nil + + workerHeartbeatManager *workerHeartbeatManager } // RegisterWorkflow registers workflow implementation with the AggregatedWorker @@ -1350,6 +1377,15 @@ func (aw *AggregatedWorker) start() error { if err := aw.nexusWorker.Start(); err != nil { return fmt.Errorf("failed to start a nexus worker: %w", err) } + if aw.nexusWorker.worker != nil && aw.workerHeartbeatManager 
!= nil { + aw.workerHeartbeatManager.nexusTaskSlotSupplier = aw.nexusWorker.worker.slotSupplier + } + } + + if aw.client.workerHeartbeatInterval > 0 { + if err := aw.registerHeartbeatWorker(); err != nil { + return fmt.Errorf("failed to register heartbeat worker: %w", err) + } } aw.logger.Info("Started Worker") return nil @@ -1439,6 +1475,8 @@ func (aw *AggregatedWorker) Stop() { close(aw.stopC) } + aw.shutdownWorker() + // Issue stop through plugins stop := func(context.Context, WorkerPluginStopWorkerOptions) { if !util.IsInterfaceNil(aw.workflowWorker) { @@ -1468,9 +1506,74 @@ func (aw *AggregatedWorker) Stop() { WorkerInstanceKey: aw.workerInstanceKey, }) + aw.unregisterHeartbeatWorker() + aw.logger.Info("Stopped Worker") } +func (aw *AggregatedWorker) registerHeartbeatWorker() error { + hw, err := aw.client.getOrCreateHeartbeatWorker(aw.executionParams.Namespace) + if err != nil { + return err + } + + // Server doesn't support heartbeating. + if hw == nil { + return nil + } + + aw.workerHeartbeatManager.heartbeatWorker = hw + hw.registerCallback(aw.workerInstanceKey, aw.workerHeartbeatManager.heartbeatCallback) + + return nil +} + +func (aw *AggregatedWorker) unregisterHeartbeatWorker() { + if aw.workerHeartbeatManager != nil && aw.workerHeartbeatManager.heartbeatWorker != nil { + aw.workerHeartbeatManager.heartbeatWorker.unregisterCallback(aw.workerInstanceKey) + aw.workerHeartbeatManager.heartbeatWorker = nil + } +} + +// shutdownWorker sends a ShutdownWorker RPC to notify the server that this worker is shutting down. +// When StickyTaskQueue is non-empty, this is a best-effort attempt to indicate to Matching service +// that this workflow task poller's sticky queue will no longer be polled. +// +// NOTE: errors are logged but don't fail the shutdown. 
+func (aw *AggregatedWorker) shutdownWorker() { + ctx := context.Background() + grpcCtx, cancel := newGRPCContext(ctx, grpcMetricsHandler(aw.executionParams.MetricsHandler)) + defer cancel() + + var heartbeat *workerpb.WorkerHeartbeat + if aw.workerHeartbeatManager != nil && aw.workerHeartbeatManager.heartbeatCallback != nil { + heartbeat = aw.workerHeartbeatManager.heartbeatCallback() + heartbeat.Status = enumspb.WORKER_STATUS_SHUTTING_DOWN + } + + var stickyTaskQueue string + if aw.workflowWorker != nil && aw.workflowWorker.stickyUUID != "" { + stickyTaskQueue = getWorkerTaskQueue(aw.workflowWorker.stickyUUID) + } + + _, err := aw.client.workflowService.ShutdownWorker(grpcCtx, &workflowservice.ShutdownWorkerRequest{ + Namespace: aw.executionParams.Namespace, + StickyTaskQueue: stickyTaskQueue, + Identity: aw.executionParams.Identity, + Reason: "graceful shutdown", + WorkerHeartbeat: heartbeat, + }) + + // Ignore unimplemented (server doesn't support it) and unavailable (server shutting down) + if _, isUnimplemented := err.(*serviceerror.Unimplemented); isUnimplemented { + return + } + + if err != nil { + aw.logger.Debug("ShutdownWorker failed.", tagError, err) + } +} + // WorkflowReplayer is used to replay workflow code from an event history type WorkflowReplayer struct { registry *registry @@ -2024,6 +2127,19 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke // should take a pointer to this struct and wait for it to be populated when the worker is run. 
var capabilities workflowservice.GetSystemInfoResponse_Capabilities + baseMetricsHandler := client.metricsHandler.WithTags(metrics.TaskQueueTags(taskQueue)) + var metricsHandler metrics.Handler + var heartbeatMetrics *metrics.HeartbeatMetricsHandler + var heartbeatManager *workerHeartbeatManager + + if client.workerHeartbeatInterval != 0 { + heartbeatMetrics = metrics.NewHeartbeatMetricsHandler(baseMetricsHandler) + metricsHandler = heartbeatMetrics + heartbeatManager = &workerHeartbeatManager{} + } else { + metricsHandler = baseMetricsHandler + } + cache := NewWorkerCache() workerParams := workerExecutionParameters{ Namespace: client.namespace, @@ -2035,7 +2151,7 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke WorkerBuildID: options.BuildID, UseBuildIDForVersioning: options.UseBuildIDForVersioning || options.DeploymentOptions.UseVersioning, DeploymentOptions: options.DeploymentOptions, - MetricsHandler: client.metricsHandler.WithTags(metrics.TaskQueueTags(taskQueue)), + MetricsHandler: metricsHandler, Logger: client.logger, EnableLoggingInReplay: options.EnableLoggingInReplay, BackgroundContext: backgroundActivityContext, @@ -2145,19 +2261,190 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke }) } + // Initialize host metrics provider for CPU/memory reporting. + // If the tuner implements HostMetricsProvider, use it to avoid double-measurement of system. 
+	var hostMetricsProvider HostMetricsProvider
+	if provider, ok := options.Tuner.(HostMetricsProvider); ok {
+		hostMetricsProvider = provider
+	} else if client.workerHeartbeatInterval != 0 {
+		hostMetricsProvider = hostmetrics.NewPSUtilSystemInfoSupplier(workerParams.Logger)
+	}
+
+	var heartbeatCallback func() *workerpb.WorkerHeartbeat
+	if heartbeatManager != nil {
+		startTime := timestamppb.New(time.Now())
+		hostname, _ := os.Hostname()
+		previousHeartbeatTime := time.Now()
+
+		pluginInfos := collectPluginInfos(client.clientPluginNames, plugins)
+
+		var prevWorkflowProcessed, prevWorkflowFailed int64
+		var prevActivityProcessed, prevActivityFailed int64
+		var prevLocalActivityProcessed, prevLocalActivityFailed int64
+		var prevNexusProcessed, prevNexusFailed int64
+
+		heartbeatCallback = func() *workerpb.WorkerHeartbeat {
+			heartbeatTime := time.Now()
+			elapsedSinceLastHeartbeat := heartbeatTime.Sub(previousHeartbeatTime)
+			previousHeartbeatTime = heartbeatTime
+
+			var stickyCacheHit, stickyCacheMiss, stickyCacheSize int32
+			if aw.workerHeartbeatManager.heartbeatMetrics != nil {
+				stickyCacheHit = aw.workerHeartbeatManager.heartbeatMetrics.GetStickyCacheHit()
+				stickyCacheMiss = aw.workerHeartbeatManager.heartbeatMetrics.GetStickyCacheMiss()
+				stickyCacheSize = aw.workerHeartbeatManager.heartbeatMetrics.GetStickyCacheSize()
+			}
+			var deploymentVersion *deploymentpb.WorkerDeploymentVersion
+			if options.DeploymentOptions.UseVersioning {
+				deploymentVersion = &deploymentpb.WorkerDeploymentVersion{
+					DeploymentName: options.DeploymentOptions.Version.DeploymentName,
+					BuildId:        options.DeploymentOptions.Version.BuildID,
+				}
+			}
+
+			var workflowTaskSlotsInfo *workerpb.WorkerSlotsInfo
+			var activityTaskSlotsInfo *workerpb.WorkerSlotsInfo
+			var localActivitySlotsInfo *workerpb.WorkerSlotsInfo
+			var nexusTaskSlotsInfo *workerpb.WorkerSlotsInfo
+
+			if aw.workerHeartbeatManager.heartbeatMetrics != nil {
+				if aw.workerHeartbeatManager.workflowTaskSlotSupplier != nil {
+					
workflowTaskSlotsInfo = buildSlotsInfo( + aw.workerHeartbeatManager.workflowTaskSlotSupplier.GetSlotSupplierKind(), + aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowSlotsAvailable(), + aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowSlotsUsed(), + aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowTasksProcessed(), + aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowTaskFailures(), + &prevWorkflowProcessed, + &prevWorkflowFailed, + ) + } + if aw.workerHeartbeatManager.activityTaskSlotSupplier != nil { + activityTaskSlotsInfo = buildSlotsInfo( + aw.workerHeartbeatManager.activityTaskSlotSupplier.GetSlotSupplierKind(), + aw.workerHeartbeatManager.heartbeatMetrics.GetActivitySlotsAvailable(), + aw.workerHeartbeatManager.heartbeatMetrics.GetActivitySlotsUsed(), + aw.workerHeartbeatManager.heartbeatMetrics.GetActivityTasksProcessed(), + aw.workerHeartbeatManager.heartbeatMetrics.GetActivityTaskFailures(), + &prevActivityProcessed, + &prevActivityFailed, + ) + } + if aw.workerHeartbeatManager.localActivitySlotSupplier != nil { + localActivitySlotsInfo = buildSlotsInfo( + aw.workerHeartbeatManager.localActivitySlotSupplier.GetSlotSupplierKind(), + aw.workerHeartbeatManager.heartbeatMetrics.GetLocalActivitySlotsAvailable(), + aw.workerHeartbeatManager.heartbeatMetrics.GetLocalActivitySlotsUsed(), + aw.workerHeartbeatManager.heartbeatMetrics.GetLocalActivityTasksProcessed(), + aw.workerHeartbeatManager.heartbeatMetrics.GetLocalActivityTaskFailures(), + &prevLocalActivityProcessed, + &prevLocalActivityFailed, + ) + } + if aw.workerHeartbeatManager.nexusTaskSlotSupplier != nil { + nexusTaskSlotsInfo = buildSlotsInfo( + aw.workerHeartbeatManager.nexusTaskSlotSupplier.GetSlotSupplierKind(), + aw.workerHeartbeatManager.heartbeatMetrics.GetNexusSlotsAvailable(), + aw.workerHeartbeatManager.heartbeatMetrics.GetNexusSlotsUsed(), + aw.workerHeartbeatManager.heartbeatMetrics.GetNexusTasksProcessed(), + 
aw.workerHeartbeatManager.heartbeatMetrics.GetNexusTaskFailures(), + &prevNexusProcessed, + &prevNexusFailed, + ) + } + } + + var workflowPollerInfo, workflowStickyPollerInfo, activityPollerInfo, nexusPollerInfo *workerpb.WorkerPollerInfo + if aw.workerHeartbeatManager.heartbeatMetrics != nil { + workflowPollerInfo = buildPollerInfo( + aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowPollerCount(), + aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowLastPollTime(), + options.WorkflowTaskPollerBehavior, + ) + workflowStickyPollerInfo = buildPollerInfo( + aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowStickyPollerCount(), + aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowStickyLastPollTime(), + options.WorkflowTaskPollerBehavior, + ) + activityPollerInfo = buildPollerInfo( + aw.workerHeartbeatManager.heartbeatMetrics.GetActivityPollerCount(), + aw.workerHeartbeatManager.heartbeatMetrics.GetActivityLastPollTime(), + options.ActivityTaskPollerBehavior, + ) + nexusPollerInfo = buildPollerInfo( + aw.workerHeartbeatManager.heartbeatMetrics.GetNexusPollerCount(), + aw.workerHeartbeatManager.heartbeatMetrics.GetNexusLastPollTime(), + options.NexusTaskPollerBehavior, + ) + } + + hb := &workerpb.WorkerHeartbeat{ + WorkerInstanceKey: aw.workerInstanceKey, + WorkerIdentity: aw.client.identity, + HostInfo: &workerpb.WorkerHostInfo{ + HostName: hostname, + WorkerGroupingKey: aw.client.workerGroupingKey, + ProcessId: strconv.Itoa(os.Getpid()), + CurrentHostCpuUsage: getCpuUsage(hostMetricsProvider), + CurrentHostMemUsage: getMemUsage(hostMetricsProvider), + }, + TaskQueue: aw.executionParams.TaskQueue, + DeploymentVersion: deploymentVersion, + SdkName: SDKName, + SdkVersion: SDKVersion, + Status: enumspb.WORKER_STATUS_RUNNING, + StartTime: startTime, + HeartbeatTime: timestamppb.New(heartbeatTime), + ElapsedSinceLastHeartbeat: durationpb.New(elapsedSinceLastHeartbeat), + WorkflowTaskSlotsInfo: workflowTaskSlotsInfo, + ActivityTaskSlotsInfo: 
activityTaskSlotsInfo, + NexusTaskSlotsInfo: nexusTaskSlotsInfo, + LocalActivitySlotsInfo: localActivitySlotsInfo, + WorkflowPollerInfo: workflowPollerInfo, + WorkflowStickyPollerInfo: workflowStickyPollerInfo, + ActivityPollerInfo: activityPollerInfo, + NexusPollerInfo: nexusPollerInfo, + TotalStickyCacheHit: stickyCacheHit, + TotalStickyCacheMiss: stickyCacheMiss, + CurrentStickyCacheSize: stickyCacheSize, + Plugins: pluginInfos, + } + return hb + } + } + + if heartbeatManager != nil { + heartbeatManager.heartbeatMetrics = heartbeatMetrics + heartbeatManager.heartbeatCallback = heartbeatCallback + heartbeatManager.hostMetricsProvider = hostMetricsProvider + } + aw = &AggregatedWorker{ - client: client, - workflowWorker: workflowWorker, - activityWorker: activityWorker, - sessionWorker: sessionWorker, - logger: workerParams.Logger, - registry: registry, - stopC: make(chan struct{}), - capabilities: &capabilities, - executionParams: workerParams, - workerInstanceKey: workerInstanceKey, - plugins: plugins, - pluginRegistryOptions: &pluginRegistryOptions, + client: client, + workflowWorker: workflowWorker, + activityWorker: activityWorker, + sessionWorker: sessionWorker, + logger: workerParams.Logger, + registry: registry, + stopC: make(chan struct{}), + capabilities: &capabilities, + executionParams: workerParams, + workerInstanceKey: workerInstanceKey, + plugins: plugins, + pluginRegistryOptions: &pluginRegistryOptions, + workerHeartbeatManager: heartbeatManager, + } + + if aw.workerHeartbeatManager != nil { + if workflowWorker != nil && workflowWorker.worker != nil { + aw.workerHeartbeatManager.workflowTaskSlotSupplier = workflowWorker.worker.slotSupplier + } + if workflowWorker != nil && workflowWorker.localActivityWorker != nil { + aw.workerHeartbeatManager.localActivitySlotSupplier = workflowWorker.localActivityWorker.slotSupplier + } + if activityWorker != nil && activityWorker.worker != nil { + aw.workerHeartbeatManager.activityTaskSlotSupplier = 
activityWorker.worker.slotSupplier + } } // Set memoized start as a once-value that invokes plugins first @@ -2476,3 +2763,83 @@ func workerDeploymentVersionFromProtoOrString(wd *deploymentpb.WorkerDeploymentV BuildID: wd.BuildId, } } + +func buildSlotsInfo( + supplierKind string, + slotsAvailable int32, + slotsUsed int32, + totalProcessed int64, + totalFailed int64, + prevProcessed *int64, + prevFailed *int64, +) *workerpb.WorkerSlotsInfo { + intervalProcessed := totalProcessed - *prevProcessed + intervalFailed := totalFailed - *prevFailed + + // Update previous totals for next interval + *prevProcessed = totalProcessed + *prevFailed = totalFailed + + totalProcessedTasks := int32(totalProcessed) + totalFailedTasks := int32(totalFailed) + lastIntervalProcessed := int32(intervalProcessed) + lastIntervalFailed := int32(intervalFailed) + + return &workerpb.WorkerSlotsInfo{ + CurrentAvailableSlots: slotsAvailable, + CurrentUsedSlots: slotsUsed, + SlotSupplierKind: supplierKind, + TotalProcessedTasks: totalProcessedTasks, + TotalFailedTasks: totalFailedTasks, + LastIntervalProcessedTasks: lastIntervalProcessed, + LastIntervalFailureTasks: lastIntervalFailed, + } +} + +func buildPollerInfo(currentPollers int32, lastSuccessfulPollTime time.Time, pollerBehavior PollerBehavior) *workerpb.WorkerPollerInfo { + var isAutoscaling bool + switch pollerBehavior.(type) { + case *pollerBehaviorAutoscaling: + isAutoscaling = true + } + + return &workerpb.WorkerPollerInfo{ + CurrentPollers: currentPollers, + LastSuccessfulPollTime: timestamppb.New(lastSuccessfulPollTime), + IsAutoscaling: isAutoscaling, + } +} + +func getCpuUsage(provider HostMetricsProvider) float32 { + if provider == nil { + return 0 + } + cpu, _ := provider.GetCpuUsage() + return float32(cpu) +} + +func getMemUsage(provider HostMetricsProvider) float32 { + if provider == nil { + return 0 + } + mem, _ := provider.GetMemoryUsage() + return float32(mem) +} + +// collectPluginInfos collects plugin names from client and 
worker plugins, +// deduplicates them, and returns a slice of PluginInfo for heartbeat reporting. +func collectPluginInfos(clientPluginNames []string, workerPlugins []WorkerPlugin) []*workerpb.PluginInfo { + set := make(map[string]struct{}, len(clientPluginNames)+len(workerPlugins)) + for _, name := range clientPluginNames { + set[name] = struct{}{} + } + for _, plugin := range workerPlugins { + set[plugin.Name()] = struct{}{} + } + + result := make([]*workerpb.PluginInfo, 0, len(set)) + for name := range set { + result = append(result, &workerpb.PluginInfo{Name: name}) + } + return result +} diff --git a/internal/internal_worker_base.go b/internal/internal_worker_base.go index 923e7fd29..756667de5 100644 --- a/internal/internal_worker_base.go +++ b/internal/internal_worker_base.go @@ -694,13 +694,6 @@ func (bw *baseWorker) Stop() { close(bw.stopCh) bw.limiterContextCancel() - for _, taskWorker := range bw.options.taskPollers { - err := taskWorker.taskPoller.Cleanup() - if err != nil { - bw.logger.Error("Couldn't cleanup task worker", tagError, err) - } - } - if success := awaitWaitGroup(&bw.stopWG, bw.options.stopTimeout); !success { traceLog(func() { bw.logger.Info("Worker graceful stop timed out.", "Stop timeout", bw.options.stopTimeout) diff --git a/internal/internal_worker_base_test.go b/internal/internal_worker_base_test.go index e900dbba7..a9de195d1 100644 --- a/internal/internal_worker_base_test.go +++ b/internal/internal_worker_base_test.go @@ -242,12 +242,6 @@ func (p *semaphoreProbeTaskPoller) PollTask() (taskForWorker, error) { return nil, nil } -// Cleanup implements taskPoller. 
-func (p *semaphoreProbeTaskPoller) Cleanup() error { - p.Close() - return nil -} - func (p *semaphoreProbeTaskPoller) Allow(n int) { for range n { for { diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go new file mode 100644 index 000000000..47fd9cf4d --- /dev/null +++ b/internal/internal_worker_heartbeat.go @@ -0,0 +1,143 @@ +package internal + +import ( + "context" + workerpb "go.temporal.io/api/worker/v1" + "go.temporal.io/api/workflowservice/v1" + "go.temporal.io/sdk/log" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "sync" + "sync/atomic" + "time" +) + +// sharedNamespaceWorker is the background nexus worker that handles heartbeating for +// all workers in a specific namespace for a specific client. +type sharedNamespaceWorker struct { + client *WorkflowClient + namespace string + taskQueue string // temporal-sys/worker-commands/{namespace}/{workerGroupingKey} + interval time.Duration + logger log.Logger + + nexusWorker *nexusWorker + + mu sync.RWMutex + callbacks map[string]func() *workerpb.WorkerHeartbeat // workerInstanceKey -> callback + + stopC chan struct{} + stoppedC chan struct{} + started atomic.Bool +} + +func (hw *sharedNamespaceWorker) createNexusWorker() (*nexusWorker, error) { + tuner, err := NewFixedSizeTuner(FixedSizeTunerOptions{ + NumNexusSlots: 5}) + if err != nil { + return nil, err + } + + params := workerExecutionParameters{ + Namespace: hw.namespace, + TaskQueue: hw.taskQueue, + Tuner: tuner, + NexusTaskPollerBehavior: NewPollerBehaviorSimpleMaximum(PollerBehaviorSimpleMaximumOptions{MaximumNumberOfPollers: 1}), + } + + nw, err := newNexusWorker(nexusWorkerOptions{ + executionParameters: params, + client: hw.client, + workflowService: hw.client.workflowService, + }) + + return nw, err +} + +func (hw *sharedNamespaceWorker) run() { + defer close(hw.stoppedC) + + hw.started.Store(true) + + if err := hw.nexusWorker.Start(); err != nil { + return + } + defer
hw.nexusWorker.Stop() + + ticker := time.NewTicker(hw.interval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + hw.sendHeartbeats() + case <-hw.stopC: + return + } + } +} + +func (hw *sharedNamespaceWorker) sendHeartbeats() { + hw.mu.RLock() + callbacks := make([]func() *workerpb.WorkerHeartbeat, 0, len(hw.callbacks)) + for _, cb := range hw.callbacks { + callbacks = append(callbacks, cb) + } + hw.mu.RUnlock() + + if len(callbacks) == 0 { + return + } + + heartbeats := make([]*workerpb.WorkerHeartbeat, 0, len(callbacks)) + for _, cb := range callbacks { + hb := cb() + heartbeats = append(heartbeats, hb) + } + + if len(heartbeats) == 0 { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + _, err := hw.client.RecordWorkerHeartbeat(ctx, &workflowservice.RecordWorkerHeartbeatRequest{ + Namespace: hw.namespace, + WorkerHeartbeat: heartbeats, + }) + + if err != nil { + if status.Code(err) == codes.Unimplemented { + // Server doesn't support heartbeats, shutdown worker. + // Must stop asynchronously: stop() blocks on stoppedC, which only + // closes when run() returns, and run() is our caller here. + go hw.stop() + return + } + hw.logger.Warn("Failed to send heartbeat", "Error", err) + } +} + +func (hw *sharedNamespaceWorker) registerCallback( + workerInstanceKey string, + callback func() *workerpb.WorkerHeartbeat, +) { + hw.mu.Lock() + defer hw.mu.Unlock() + hw.callbacks[workerInstanceKey] = callback +} + +func (hw *sharedNamespaceWorker) unregisterCallback(workerInstanceKey string) { + shouldStop := hw.client.unregisterHeartbeatCallback(hw.namespace, workerInstanceKey) + if shouldStop { + hw.stop() + } +} + +func (hw *sharedNamespaceWorker) stop() { + if !hw.started.CompareAndSwap(true, false) { + return + } + + close(hw.stopC) + <-hw.stoppedC +} diff --git a/internal/internal_workers_test.go b/internal/internal_workers_test.go index acb799a52..a2d41af1f 100644 --- a/internal/internal_workers_test.go +++ b/internal/internal_workers_test.go @@ -76,7 +76,6 @@ func (s *WorkersTestSuite) TestWorkflowWorker() {
s.service.EXPECT().DescribeNamespace(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) s.service.EXPECT().PollWorkflowTaskQueue(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.PollWorkflowTaskQueueResponse{}, nil).AnyTimes() s.service.EXPECT().RespondWorkflowTaskCompleted(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() - s.service.EXPECT().ShutdownWorker(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.ShutdownWorkerResponse{}, nil).Times(1) ctx, cancel := context.WithCancelCause(context.Background()) executionParameters := workerExecutionParameters{ @@ -168,7 +167,6 @@ func (s *WorkersTestSuite) TestWorkflowWorkerSlotSupplier() { pollRespondedCh <- struct{}{} }). Return(nil, nil).AnyTimes() - s.service.EXPECT().ShutdownWorker(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.ShutdownWorkerResponse{}, nil).Times(1) ctx, cancel := context.WithCancelCause(context.Background()) wfCss := &CountingSlotSupplier{} @@ -444,7 +442,6 @@ func (s *WorkersTestSuite) TestActivityWorkerStop() { func (s *WorkersTestSuite) TestPollWorkflowTaskQueue_InternalServiceError() { s.service.EXPECT().DescribeNamespace(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) s.service.EXPECT().PollWorkflowTaskQueue(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.PollWorkflowTaskQueueResponse{}, serviceerror.NewInternal("")).AnyTimes() - s.service.EXPECT().ShutdownWorker(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.ShutdownWorkerResponse{}, nil).Times(1) executionParameters := workerExecutionParameters{ Namespace: DefaultNamespace, diff --git a/internal/internal_workflow_client.go b/internal/internal_workflow_client.go index 3c0db0c9a..21675f6f8 100644 --- a/internal/internal_workflow_client.go +++ b/internal/internal_workflow_client.go @@ -4,6 +4,8 @@ import ( "context" "errors" "fmt" + namespacepb "go.temporal.io/api/namespace/v1" + workerpb 
"go.temporal.io/api/worker/v1" "io" "math" "reflect" @@ -59,24 +61,31 @@ const ( type ( // WorkflowClient is the client for starting a workflow execution. WorkflowClient struct { - workflowService workflowservice.WorkflowServiceClient - conn *grpc.ClientConn - namespace string - registry *registry - logger log.Logger - metricsHandler metrics.Handler - identity string - dataConverter converter.DataConverter - failureConverter converter.FailureConverter - contextPropagators []ContextPropagator - workerPlugins []WorkerPlugin - workerInterceptors []WorkerInterceptor - interceptor ClientOutboundInterceptor - excludeInternalFromRetry *atomic.Bool - capabilities *workflowservice.GetSystemInfoResponse_Capabilities - capabilitiesLock sync.RWMutex - eagerDispatcher *eagerWorkflowDispatcher - getSystemInfoTimeout time.Duration + workflowService workflowservice.WorkflowServiceClient + conn *grpc.ClientConn + namespace string + registry *registry + logger log.Logger + metricsHandler metrics.Handler + identity string + dataConverter converter.DataConverter + failureConverter converter.FailureConverter + contextPropagators []ContextPropagator + workerPlugins []WorkerPlugin + workerInterceptors []WorkerInterceptor + clientPluginNames []string + interceptor ClientOutboundInterceptor + excludeInternalFromRetry *atomic.Bool + capabilities *workflowservice.GetSystemInfoResponse_Capabilities + capabilitiesLock sync.RWMutex + namespaceCapabilities *namespacepb.NamespaceInfo_Capabilities + namespaceCapabilitiesLock sync.RWMutex + eagerDispatcher *eagerWorkflowDispatcher + getSystemInfoTimeout time.Duration + workerHeartbeatInterval time.Duration + workerGroupingKey string + heartbeatWorkers map[string]*sharedNamespaceWorker + heartbeatWorkersMu sync.RWMutex // The pointer value is shared across multiple clients. If non-nil, only // access/mutate atomically. 
@@ -1366,6 +1375,33 @@ func (wc *WorkflowClient) loadCapabilities(ctx context.Context) (*workflowservic return capabilities, nil } +// Get namespace capabilities, lazily fetching from server if not already obtained. +func (wc *WorkflowClient) loadNamespaceCapabilities(ctx context.Context) (*namespacepb.NamespaceInfo_Capabilities, error) { + wc.namespaceCapabilitiesLock.RLock() + capabilities := wc.namespaceCapabilities + wc.namespaceCapabilitiesLock.RUnlock() + if capabilities != nil { + return capabilities, nil + } + + grpcCtx, cancel := newGRPCContext(ctx, grpcTimeout(wc.getSystemInfoTimeout)) + defer cancel() + resp, err := wc.workflowService.DescribeNamespace(grpcCtx, &workflowservice.DescribeNamespaceRequest{Namespace: wc.namespace}) + if _, isUnimplemented := err.(*serviceerror.Unimplemented); err != nil && !isUnimplemented { + return nil, fmt.Errorf("failed reaching server: %w", err) + } + if resp != nil && resp.NamespaceInfo.Capabilities != nil { + capabilities = resp.NamespaceInfo.Capabilities + } else { + capabilities = &namespacepb.NamespaceInfo_Capabilities{} + } + + wc.namespaceCapabilitiesLock.Lock() + wc.namespaceCapabilities = capabilities + wc.namespaceCapabilitiesLock.Unlock() + return capabilities, nil +} + func (wc *WorkflowClient) ensureInitialized(ctx context.Context) error { // Just loading the capabilities is enough _, err := wc.loadCapabilities(ctx) @@ -1393,6 +1429,89 @@ func (wc *WorkflowClient) WorkerDeploymentClient() WorkerDeploymentClient { } } +func (wc *WorkflowClient) RecordWorkerHeartbeat(ctx context.Context, request *workflowservice.RecordWorkerHeartbeatRequest) (*workflowservice.RecordWorkerHeartbeatResponse, error) { + if err := wc.ensureInitialized(ctx); err != nil { + return nil, err + } + + grpcCtx, cancel := newGRPCContext(ctx, defaultGrpcRetryParameters(ctx)) + defer cancel() + resp, err := wc.workflowService.RecordWorkerHeartbeat(grpcCtx, request) + if err != nil { + return nil, err + } + + return resp, nil +} + +func (wc 
*WorkflowClient) getOrCreateHeartbeatWorker(namespace string) (*sharedNamespaceWorker, error) { + wc.heartbeatWorkersMu.Lock() + defer wc.heartbeatWorkersMu.Unlock() + + if hw, ok := wc.heartbeatWorkers[namespace]; ok { + return hw, nil + } + + capabilities, err := wc.loadNamespaceCapabilities(context.Background()) + if err != nil { + return nil, fmt.Errorf("failed to get namespace capabilities: %w", err) + } + if !capabilities.WorkerHeartbeats { + wc.logger.Debug("Worker heartbeating configured, but server version does not support it.") + return nil, nil + } + + hw := &sharedNamespaceWorker{ + client: wc, + namespace: namespace, + taskQueue: fmt.Sprintf("temporal-sys/worker-commands/%s/%s", namespace, wc.workerGroupingKey), + interval: wc.workerHeartbeatInterval, + callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), + stopC: make(chan struct{}), + stoppedC: make(chan struct{}), + logger: wc.logger, + } + + nexusWorker, err := hw.createNexusWorker() + if err != nil { + return nil, fmt.Errorf("failed to create nexus worker for heartbeating: %w", err) + } + hw.nexusWorker = nexusWorker + + if wc.heartbeatWorkers == nil { + wc.heartbeatWorkers = make(map[string]*sharedNamespaceWorker) + } + wc.heartbeatWorkers[namespace] = hw + + go hw.run() + + return hw, nil +} + +// unregisterHeartbeatCallback removes a callback from the heartbeat worker for the given namespace. +// Returns true if the heartbeat worker should be stopped (no more callbacks remain). +// This method holds heartbeatWorkersMu while checking and removing, preventing races where +// a new worker could get a reference to an about-to-be-stopped heartbeat worker. 
+func (wc *WorkflowClient) unregisterHeartbeatCallback(namespace string, workerInstanceKey string) bool { + wc.heartbeatWorkersMu.Lock() + defer wc.heartbeatWorkersMu.Unlock() + + hw, ok := wc.heartbeatWorkers[namespace] + if !ok { + return false + } + + hw.mu.Lock() + delete(hw.callbacks, workerInstanceKey) + shouldStop := len(hw.callbacks) == 0 + hw.mu.Unlock() + + if shouldStop { + delete(wc.heartbeatWorkers, namespace) + } + return shouldStop +} + // Close client and clean up underlying resources. func (wc *WorkflowClient) Close() { // If there's a set of unclosed clients, we have to decrement it and then diff --git a/internal/tuning.go b/internal/tuning.go index 8d146800f..4f85db366 100644 --- a/internal/tuning.go +++ b/internal/tuning.go @@ -3,6 +3,7 @@ package internal import ( "context" "fmt" + "reflect" "sync" "sync/atomic" @@ -130,6 +131,26 @@ type SlotSupplier interface { MaxSlots() int } +// SlotSupplierKinder is an optional interface that slot suppliers can implement to provide +// a custom kind/type name. If not implemented, getSlotSupplierKind will use reflection. +type SlotSupplierKinder interface { + Kind() string +} + +// getSlotSupplierKind returns the kind/type name of a slot supplier. If the supplier implements +// SlotSupplierKinder, it returns the result of Kind(). Otherwise, it uses reflection to get the +// type name. +func getSlotSupplierKind(s SlotSupplier) string { + if k, ok := s.(SlotSupplierKinder); ok { + return k.Kind() + } + t := reflect.TypeOf(s) + if t.Kind() == reflect.Ptr { + return t.Elem().Name() + } + return t.Name() +} + // CompositeTuner allows you to build a tuner from multiple slot suppliers. 
type CompositeTuner struct { workflowSlotSupplier SlotSupplier @@ -284,6 +305,9 @@ func (f *FixedSizeSlotSupplier) ReleaseSlot(SlotReleaseInfo) { func (f *FixedSizeSlotSupplier) MaxSlots() int { return f.numSlots } +func (f *FixedSizeSlotSupplier) Kind() string { + return "Fixed" +} type slotReservationData struct { taskQueue string @@ -478,6 +502,7 @@ func (t *trackingSlotSupplier) ReleaseSlot(permit *SlotPermit, reason SlotReleas if permit.extraReleaseCallback != nil { permit.extraReleaseCallback() } + t.publishMetrics(usedSlots) } @@ -487,3 +512,7 @@ func (t *trackingSlotSupplier) publishMetrics(usedSlots int) { } t.taskSlotsUsedGauge.Update(float64(usedSlots)) } + +func (t *trackingSlotSupplier) GetSlotSupplierKind() string { + return getSlotSupplierKind(t.inner) +} diff --git a/internal/worker.go b/internal/worker.go index 83efaf21c..9eefa81da 100644 --- a/internal/worker.go +++ b/internal/worker.go @@ -35,6 +35,18 @@ type ( isPollerBehavior() } + // HostMetricsProvider provides host-level CPU and memory metrics for worker heartbeats. + // Implement this interface to provide custom metrics collection, or use the default + // implementation provided by the SDK in the worker/hostmetrics package. + // + // Exposed as: [go.temporal.io/sdk/worker.HostMetricsProvider] + HostMetricsProvider interface { + // GetCpuUsage returns the current host CPU usage as a fraction (0.0-1.0) + GetCpuUsage() (float64, error) + // GetMemoryUsage returns the current host memory usage as a fraction (0.0-1.0) + GetMemoryUsage() (float64, error) + } + // PollerBehaviorAutoscalingOptions is the options for NewPollerBehaviorAutoscaling. 
// // Exposed as: [go.temporal.io/sdk/worker.PollerBehaviorAutoscalingOptions] diff --git a/test/go.mod b/test/go.mod index 25fd4a34b..2b8801962 100644 --- a/test/go.mod +++ b/test/go.mod @@ -19,7 +19,7 @@ require ( go.temporal.io/sdk v1.29.1 go.temporal.io/sdk/contrib/opentelemetry v0.0.0-00010101000000-000000000000 go.temporal.io/sdk/contrib/opentracing v0.0.0-00010101000000-000000000000 - go.temporal.io/sdk/contrib/resourcetuner v0.0.0-00010101000000-000000000000 + go.temporal.io/sdk/contrib/resourcetuner v0.0.0-20260112203102-5b6df8e02dcf go.temporal.io/sdk/contrib/tally v0.0.0-00010101000000-000000000000 go.uber.org/goleak v1.1.12 google.golang.org/grpc v1.67.1 diff --git a/test/integration_test.go b/test/integration_test.go index e63503db3..02fdca747 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -3376,7 +3376,6 @@ func (ts *IntegrationTestSuite) TestSlotSupplierWFTFailMetrics() { run, err := ts.client.ExecuteWorkflow(ctx, wfOptions, waitsToProceedWorkflow) ts.NoError(err) ts.NotNil(run) - ts.NoError(err) <-actStarted // The workflow task will fail once and then pass diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go new file mode 100644 index 000000000..e64152abc --- /dev/null +++ b/test/worker_heartbeat_test.go @@ -0,0 +1,997 @@ +package test_test + +import ( + "context" + "errors" + "fmt" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/google/uuid" + "github.com/nexus-rpc/sdk-go/nexus" + "github.com/stretchr/testify/require" + "github.com/stretchr/testify/suite" + "go.temporal.io/api/enums/v1" + workerpb "go.temporal.io/api/worker/v1" + "go.temporal.io/api/workflowservice/v1" + "go.temporal.io/sdk/activity" + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/contrib/resourcetuner" + "go.temporal.io/sdk/internal" + ilog "go.temporal.io/sdk/internal/log" + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/worker" + "go.temporal.io/sdk/workflow" + 
"google.golang.org/protobuf/types/known/timestamppb" +) + +type WorkerHeartbeatTestSuite struct { + *require.Assertions + suite.Suite + ConfigAndClientSuiteBase + worker worker.Worker +} + +func TestWorkerHeartbeatSuite(t *testing.T) { + suite.Run(t, new(WorkerHeartbeatTestSuite)) +} + +func (ts *WorkerHeartbeatTestSuite) SetupSuite() { + ts.Assertions = require.New(ts.T()) + ts.NoError(ts.InitConfigAndNamespace()) +} + +func (ts *WorkerHeartbeatTestSuite) TearDownSuite() { + ts.Assertions = require.New(ts.T()) +} + +func (ts *WorkerHeartbeatTestSuite) SetupTest() { + var err error + heartbeatInterval := 100 * time.Millisecond + + // Create a client with heartbeating enabled + ts.client, err = client.Dial(client.Options{ + HostPort: ts.config.ServiceAddr, + Namespace: ts.config.Namespace, + Logger: ilog.NewDefaultLogger(), + WorkerHeartbeatInterval: &heartbeatInterval, + ConnectionOptions: client.ConnectionOptions{TLS: ts.config.TLS}, + Identity: "WorkerHeartbeatTest", + }) + ts.NoError(err) + + ts.taskQueueName = taskQueuePrefix + "-" + ts.T().Name() +} + +func (ts *WorkerHeartbeatTestSuite) TearDownTest() { + if ts.worker != nil { + ts.worker.Stop() + } + if ts.client != nil { + ts.client.Close() + } +} + +// assertRecentTimestamp asserts the timestamp is within maxAge of now +func (ts *WorkerHeartbeatTestSuite) assertRecentTimestamp(timestamp *timestamppb.Timestamp, maxAge time.Duration, name string) { + ts.NotNil(timestamp, "%s should not be nil", name) + ts.False(timestamp.AsTime().IsZero(), "%s should not be zero", name) + ts.WithinDuration(time.Now(), timestamp.AsTime(), maxAge, "%s should be recent", name) +} + +// TestWorkerHeartbeat verifies that worker heartbeats are sent to the server +// and can be queried via ListWorkers and DescribeWorker APIs +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { + workerStartTime := time.Now() + + worker.SetStickyWorkflowCacheSize(5) + ts.worker = worker.New(ts.client, ts.taskQueueName, worker.Options{ + 
MaxConcurrentWorkflowTaskExecutionSize: 5, + MaxConcurrentActivityExecutionSize: 5, + DisableEagerActivities: true, + }) + ts.worker.RegisterWorkflow(workflowWithBlockingActivity) + ts.worker.RegisterActivity(blockingActivity) + // Register a nexus service so the nexus worker is created and slot info is populated + nexusService := nexus.NewService("test-heartbeat") + ts.NoError(nexusService.Register(noopNexusOp)) + ts.worker.RegisterNexusService(nexusService) + ts.Nil(ts.worker.Start()) + + ctx := context.Background() + wfOptions := ts.startWorkflowOptions("test-worker-heartbeat") + + run, err := ts.client.ExecuteWorkflow(ctx, wfOptions, workflowWithBlockingActivity) + ts.NoError(err) + ts.NotNil(run) + // Wait for activity to start + select { + case <-blockingActivityStarted: + ts.T().Log("Activity started") + case <-time.After(5 * time.Second): + ts.Fail("Timeout waiting for activity to start") + } + + // Wait for heartbeat to capture the in-flight activity + time.Sleep(100 * time.Millisecond) + + // Get worker info and verify activity slot is used + workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + ts.logWorkerInfo(workerInfo) + + ts.Equal(enums.WORKER_STATUS_RUNNING, workerInfo.Status) + + workflowTaskSlots := workerInfo.WorkflowTaskSlotsInfo + ts.Equal(int32(1), workflowTaskSlots.TotalProcessedTasks) + ts.Equal(int32(5), workflowTaskSlots.CurrentAvailableSlots) + ts.Equal(int32(0), workflowTaskSlots.CurrentUsedSlots) + ts.Equal("Fixed", workflowTaskSlots.SlotSupplierKind) + activityTaskSlots := workerInfo.ActivityTaskSlotsInfo + ts.Equal(int32(0), activityTaskSlots.TotalProcessedTasks) + ts.Equal(int32(4), activityTaskSlots.CurrentAvailableSlots) + ts.Equal(int32(1), activityTaskSlots.CurrentUsedSlots) + ts.Equal("Fixed", activityTaskSlots.SlotSupplierKind) + nexusTaskSlots := workerInfo.NexusTaskSlotsInfo + ts.NotNil(nexusTaskSlots) + ts.Equal(int32(0), 
nexusTaskSlots.TotalProcessedTasks) + ts.Equal(int32(1000), nexusTaskSlots.CurrentAvailableSlots) + ts.Equal(int32(0), nexusTaskSlots.CurrentUsedSlots) + ts.Equal("Fixed", nexusTaskSlots.SlotSupplierKind) + localActivityTaskSlots := workerInfo.LocalActivitySlotsInfo + ts.Equal(int32(0), localActivityTaskSlots.TotalProcessedTasks) + ts.Equal(int32(1000), localActivityTaskSlots.CurrentAvailableSlots) + ts.Equal(int32(0), localActivityTaskSlots.CurrentUsedSlots) + ts.Equal("Fixed", localActivityTaskSlots.SlotSupplierKind) + + workflowPollerInfo := workerInfo.WorkflowPollerInfo + ts.Equal(int32(1), workflowPollerInfo.CurrentPollers) + stickyPollerInfo := workerInfo.WorkflowStickyPollerInfo + ts.NotEqual(int32(0), stickyPollerInfo.CurrentPollers) + nexusPollerInfo := workerInfo.NexusPollerInfo + ts.Equal(int32(2), nexusPollerInfo.CurrentPollers) + activityPollerInfo := workerInfo.ActivityPollerInfo + ts.NotEqual(int32(0), activityPollerInfo.CurrentPollers) + + ts.Equal(int32(1), workerInfo.CurrentStickyCacheSize) + + ts.assertRecentTimestamp(workerInfo.StartTime, 10*time.Second, "StartTime") + ts.assertRecentTimestamp(workerInfo.HeartbeatTime, 5*time.Second, "HeartbeatTime") + + ts.WithinDuration(workerStartTime, workerInfo.StartTime.AsTime(), 5*time.Second, + "StartTime should match worker creation time") + + ts.True(workerInfo.HeartbeatTime.AsTime().After(workerInfo.StartTime.AsTime()) || + workerInfo.HeartbeatTime.AsTime().Equal(workerInfo.StartTime.AsTime()), + "HeartbeatTime should be >= StartTime") + + ts.NotNil(workerInfo.ElapsedSinceLastHeartbeat) + elapsed := workerInfo.ElapsedSinceLastHeartbeat.AsDuration() + ts.True(elapsed <= 500*time.Millisecond, + "ElapsedSinceLastHeartbeat should be <= 500ms (got %v)", elapsed) + + ts.assertRecentTimestamp(workerInfo.WorkflowPollerInfo.LastSuccessfulPollTime, 5*time.Second, + "WorkflowPollerInfo.LastSuccessfulPollTime") + ts.assertRecentTimestamp(workerInfo.ActivityPollerInfo.LastSuccessfulPollTime, 5*time.Second, + 
"ActivityPollerInfo.LastSuccessfulPollTime") + + // Store values to compare after shutdown + firstStartTime := workerInfo.StartTime.AsTime() + firstHeartbeatTime := workerInfo.HeartbeatTime.AsTime() + + // Signal activity to complete + blockingActivityComplete <- struct{}{} + + ts.NoError(run.Get(ctx, nil)) + ts.worker.Stop() + + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + + // After shutdown checks + ts.Equal("WorkerHeartbeatTest", workerInfo.WorkerIdentity) + hostInfo := workerInfo.HostInfo + fmt.Println("hostInfo", hostInfo) + ts.NotEqual("", hostInfo.HostName) + ts.NotEqual("", hostInfo.ProcessId) + ts.NotEqual("", hostInfo.WorkerGroupingKey) + + ts.GreaterOrEqual(hostInfo.CurrentHostCpuUsage, float32(0.0)) + ts.GreaterOrEqual(hostInfo.CurrentHostMemUsage, float32(0.0)) + + ts.Equal(ts.taskQueueName, workerInfo.TaskQueue) + ts.Equal(internal.SDKName, workerInfo.SdkName) + ts.Equal(internal.SDKVersion, workerInfo.SdkVersion) + ts.Equal(enums.WORKER_STATUS_SHUTTING_DOWN, workerInfo.Status) + + // Timestamp validations - second heartbeat check (after shutdown) + // StartTime should be unchanged + ts.Equal(firstStartTime, workerInfo.StartTime.AsTime(), + "StartTime should not change between heartbeats") + + // HeartbeatTime should have advanced + ts.True(workerInfo.HeartbeatTime.AsTime().After(firstHeartbeatTime), + "HeartbeatTime should advance between heartbeats") + + fmt.Println("aa") + workflowTaskSlots = workerInfo.WorkflowTaskSlotsInfo + ts.Equal(int32(2), workflowTaskSlots.TotalProcessedTasks) + ts.Equal("Fixed", workflowTaskSlots.SlotSupplierKind) + activityTaskSlots = workerInfo.ActivityTaskSlotsInfo + ts.Equal(int32(1), activityTaskSlots.TotalProcessedTasks) + ts.Equal(int32(5), activityTaskSlots.CurrentAvailableSlots) + ts.Equal(int32(0), activityTaskSlots.CurrentUsedSlots) + ts.Equal(int32(1), activityTaskSlots.LastIntervalProcessedTasks) + ts.Equal("Fixed", 
activityTaskSlots.SlotSupplierKind) + fmt.Println("bb") + nexusTaskSlots = workerInfo.NexusTaskSlotsInfo + ts.NotNil(nexusTaskSlots) + ts.Equal(int32(0), nexusTaskSlots.TotalProcessedTasks) + ts.Equal(int32(1000), nexusTaskSlots.CurrentAvailableSlots) + ts.Equal(int32(0), nexusTaskSlots.CurrentUsedSlots) + ts.Equal("Fixed", nexusTaskSlots.SlotSupplierKind) + localActivityTaskSlots = workerInfo.LocalActivitySlotsInfo + ts.Equal(int32(0), localActivityTaskSlots.TotalProcessedTasks) + ts.Equal(int32(1000), localActivityTaskSlots.CurrentAvailableSlots) + ts.Equal(int32(0), localActivityTaskSlots.CurrentUsedSlots) + ts.Equal("Fixed", localActivityTaskSlots.SlotSupplierKind) + // + + workflowPollerInfo = workerInfo.WorkflowPollerInfo + ts.Equal(int32(1), workflowPollerInfo.CurrentPollers) + ts.False(workflowPollerInfo.IsAutoscaling) + ts.assertRecentTimestamp(workflowPollerInfo.LastSuccessfulPollTime, 10*time.Second, + "WorkflowPollerInfo.LastSuccessfulPollTime after shutdown") + + stickyPollerInfo = workerInfo.WorkflowStickyPollerInfo + ts.NotEqual(int32(0), stickyPollerInfo.CurrentPollers) + ts.False(stickyPollerInfo.IsAutoscaling) + ts.assertRecentTimestamp(stickyPollerInfo.LastSuccessfulPollTime, 10*time.Second, + "WorkflowStickyPollerInfo.LastSuccessfulPollTime after shutdown") + + nexusPollerInfo = workerInfo.NexusPollerInfo + ts.Equal(int32(2), nexusPollerInfo.CurrentPollers) + ts.False(nexusPollerInfo.IsAutoscaling) + // Nexus poller has no successful polls since we didn't execute any nexus operations + + activityPollerInfo = workerInfo.ActivityPollerInfo + ts.NotEqual(int32(0), activityPollerInfo.CurrentPollers) + ts.False(activityPollerInfo.IsAutoscaling) + ts.assertRecentTimestamp(activityPollerInfo.LastSuccessfulPollTime, 10*time.Second, + "ActivityPollerInfo.LastSuccessfulPollTime after shutdown") + + ts.Equal(int32(1), workerInfo.TotalStickyCacheHit) +} + +// TestWorkerHeartbeatDeploymentVersion verifies that deployment version info is +// included in 
heartbeats when versioning is enabled. This test doesn't run workflows +// since versioned workers require additional server-side setup for task routing. +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatDeploymentVersion() { + ctx := context.Background() + + taskQueue := ts.taskQueueName + "-deployment-version" + + w := worker.New(ts.client, taskQueue, worker.Options{ + DeploymentOptions: worker.DeploymentOptions{ + UseVersioning: true, + Version: worker.WorkerDeploymentVersion{ + DeploymentName: "test-deployment", + BuildID: "test_build_id", + }, + DefaultVersioningBehavior: internal.VersioningBehaviorAutoUpgrade, + }, + }) + w.RegisterWorkflow(simpleWorkflow) + ts.NoError(w.Start()) + defer w.Stop() + + // Wait for heartbeat to be sent + time.Sleep(200 * time.Millisecond) + + // Get worker info and verify deployment version + workerInfo := ts.getWorkerInfo(ctx, taskQueue) + ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + + ts.NotNil(workerInfo.DeploymentVersion, "DeploymentVersion should be set") + ts.Equal("test_build_id", workerInfo.DeploymentVersion.BuildId) + ts.Equal("test-deployment", workerInfo.DeploymentVersion.DeploymentName) + + ts.logWorkerInfo(workerInfo) +} + +// TestWorkerHeartbeatDisabled verifies that when heartbeating is disabled, +// workers should not appear in ListWorkers +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatDisabled() { + ctx := context.Background() + + // Create a separate client with heartbeating disabled + heartbeatInterval := time.Duration(0) + clientNoHeartbeat, err := client.Dial(client.Options{ + HostPort: ts.config.ServiceAddr, + Namespace: ts.config.Namespace, + Logger: ilog.NewDefaultLogger(), + WorkerHeartbeatInterval: &heartbeatInterval, + ConnectionOptions: client.ConnectionOptions{TLS: ts.config.TLS}, + }) + ts.NoError(err) + defer clientNoHeartbeat.Close() + + taskQueueNoHeartbeat := taskQueuePrefix + "-no-heartbeat-" + ts.T().Name() + + // Create and start worker with no 
heartbeating + workerNoHeartbeat := worker.New(clientNoHeartbeat, taskQueueNoHeartbeat, worker.Options{}) + workerNoHeartbeat.RegisterWorkflow(simpleWorkflow) + ts.NoError(workerNoHeartbeat.Start()) + defer workerNoHeartbeat.Stop() + + // Wait a bit + time.Sleep(500 * time.Millisecond) + + // Get the internal client + internalClient := clientNoHeartbeat.(internal.Client) + workflowClient := internalClient.(*internal.WorkflowClient) + + // List workers - should not find the worker without heartbeating + listResp, err := workflowClient.WorkflowService().ListWorkers(ctx, &workflowservice.ListWorkersRequest{ + Namespace: ts.config.Namespace, + Query: fmt.Sprintf(`TaskQueue="%s"`, taskQueueNoHeartbeat), + PageSize: 10, + }) + + ts.NoError(err, "ListWorkers failed") + foundWorker := false + for _, workerInfo := range listResp.WorkersInfo { + if workerInfo.WorkerHeartbeat.TaskQueue == taskQueueNoHeartbeat { + foundWorker = true + break + } + } + ts.False(foundWorker, "Should not find worker without heartbeating enabled") +} + +// Get worker info from the server +func (ts *WorkerHeartbeatTestSuite) getWorkerInfo(ctx context.Context, taskQueue string) *workerpb.WorkerHeartbeat { + // Get the internal client to access the workflow service directly + internalClient := ts.client.(internal.Client) + workflowClient := internalClient.(*internal.WorkflowClient) + + // List workers in this namespace + listResp, err := workflowClient.WorkflowService().ListWorkers(ctx, &workflowservice.ListWorkersRequest{ + Namespace: ts.config.Namespace, + Query: fmt.Sprintf(`TaskQueue="%s"`, taskQueue), + PageSize: 10, + }) + if err != nil { + ts.T().Logf("ListWorkers failed: %v (may not be implemented on this server)", err) + return nil + } + + if len(listResp.WorkersInfo) == 0 { + ts.T().Logf("No workers found for task queue %s", taskQueue) + return nil + } + + // Find our worker in the list + var workerInstanceKey string + for _, workerInfo := range listResp.WorkersInfo { + if 
workerInfo.WorkerHeartbeat.TaskQueue == taskQueue { + workerInstanceKey = workerInfo.WorkerHeartbeat.WorkerInstanceKey + break + } + } + + if workerInstanceKey == "" { + ts.T().Logf("Could not find worker with task queue %s in list", taskQueue) + return nil + } + + // Describe the specific worker + describeResp, err := workflowClient.WorkflowService().DescribeWorker(ctx, &workflowservice.DescribeWorkerRequest{ + Namespace: ts.config.Namespace, + WorkerInstanceKey: workerInstanceKey, + }) + if err != nil { + ts.T().Logf("DescribeWorker failed: %v", err) + return nil + } + + return describeResp.WorkerInfo.WorkerHeartbeat +} + +func (ts *WorkerHeartbeatTestSuite) logWorkerInfo(workerInfo *workerpb.WorkerHeartbeat) { + ts.T().Logf("=== Worker Heartbeat Info ===") + ts.T().Logf("Worker Instance Key: %s", workerInfo.WorkerInstanceKey) + ts.T().Logf("Worker Identity: %s", workerInfo.WorkerIdentity) + ts.T().Logf("Task Queue: %s", workerInfo.TaskQueue) + ts.T().Logf("SDK Name: %s", workerInfo.SdkName) + ts.T().Logf("SDK Version: %s", workerInfo.SdkVersion) + ts.T().Logf("Status: %s", workerInfo.Status) + ts.T().Logf("Total Sticky Cache Hit: %d", workerInfo.TotalStickyCacheHit) + ts.T().Logf("Total Sticky Cache Miss: %d", workerInfo.TotalStickyCacheMiss) + ts.T().Logf("Current Sticky Cache Size: %d", workerInfo.CurrentStickyCacheSize) + if workerInfo.HostInfo != nil { + ts.T().Logf("Host Name: %s", workerInfo.HostInfo.HostName) + ts.T().Logf("Process ID: %s", workerInfo.HostInfo.ProcessId) + } + if workerInfo.WorkflowTaskSlotsInfo != nil { + ts.T().Logf("=== Workflow Task Slots Info ===") + ts.T().Logf(" Current Available Slots: %d", workerInfo.WorkflowTaskSlotsInfo.CurrentAvailableSlots) + ts.T().Logf(" Current Used Slots: %d", workerInfo.WorkflowTaskSlotsInfo.CurrentUsedSlots) + ts.T().Logf(" Slot Supplier Kind: %s", workerInfo.WorkflowTaskSlotsInfo.SlotSupplierKind) + ts.T().Logf(" Total Processed Tasks: %d", workerInfo.WorkflowTaskSlotsInfo.TotalProcessedTasks) + 
ts.T().Logf(" Total Failed Tasks: %d", workerInfo.WorkflowTaskSlotsInfo.TotalFailedTasks) + ts.T().Logf(" Last Interval Processed: %d", workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks) + ts.T().Logf(" Last Interval Failed: %d", workerInfo.WorkflowTaskSlotsInfo.LastIntervalFailureTasks) + } + if workerInfo.ActivityTaskSlotsInfo != nil { + ts.T().Logf("=== Activity Task Slots Info ===") + ts.T().Logf(" Current Available Slots: %d", workerInfo.ActivityTaskSlotsInfo.CurrentAvailableSlots) + ts.T().Logf(" Current Used Slots: %d", workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots) + ts.T().Logf(" Slot Supplier Kind: %s", workerInfo.ActivityTaskSlotsInfo.SlotSupplierKind) + ts.T().Logf(" Total Processed Tasks: %d", workerInfo.ActivityTaskSlotsInfo.TotalProcessedTasks) + ts.T().Logf(" Total Failed Tasks: %d", workerInfo.ActivityTaskSlotsInfo.TotalFailedTasks) + } + if workerInfo.LocalActivitySlotsInfo != nil { + ts.T().Logf("=== Local Activity Slots Info ===") + ts.T().Logf(" Current Available Slots: %d", workerInfo.LocalActivitySlotsInfo.CurrentAvailableSlots) + ts.T().Logf(" Current Used Slots: %d", workerInfo.LocalActivitySlotsInfo.CurrentUsedSlots) + ts.T().Logf(" Slot Supplier Kind: %s", workerInfo.LocalActivitySlotsInfo.SlotSupplierKind) + ts.T().Logf(" Total Processed Tasks: %d", workerInfo.LocalActivitySlotsInfo.TotalProcessedTasks) + ts.T().Logf(" Total Failed Tasks: %d", workerInfo.LocalActivitySlotsInfo.TotalFailedTasks) + } + if workerInfo.NexusTaskSlotsInfo != nil { + ts.T().Logf("=== Nexus Task Slots Info ===") + ts.T().Logf(" Current Available Slots: %d", workerInfo.NexusTaskSlotsInfo.CurrentAvailableSlots) + ts.T().Logf(" Current Used Slots: %d", workerInfo.NexusTaskSlotsInfo.CurrentUsedSlots) + ts.T().Logf(" Slot Supplier Kind: %s", workerInfo.NexusTaskSlotsInfo.SlotSupplierKind) + ts.T().Logf(" Total Processed Tasks: %d", workerInfo.NexusTaskSlotsInfo.TotalProcessedTasks) + ts.T().Logf(" Total Failed Tasks: %d", 
workerInfo.NexusTaskSlotsInfo.TotalFailedTasks) + } + if workerInfo.WorkflowPollerInfo != nil { + ts.T().Logf("=== Workflow Poller Info ===") + ts.T().Logf(" Current Pollers: %d", workerInfo.WorkflowPollerInfo.CurrentPollers) + ts.T().Logf(" Last Successful Poll Time: %v", workerInfo.WorkflowPollerInfo.LastSuccessfulPollTime.AsTime()) + ts.T().Logf(" Is Autoscaling: %v", workerInfo.WorkflowPollerInfo.IsAutoscaling) + } + if workerInfo.WorkflowStickyPollerInfo != nil { + ts.T().Logf("=== Workflow Sticky Poller Info ===") + ts.T().Logf(" Current Pollers: %d", workerInfo.WorkflowStickyPollerInfo.CurrentPollers) + ts.T().Logf(" Last Successful Poll Time: %v", workerInfo.WorkflowStickyPollerInfo.LastSuccessfulPollTime.AsTime()) + ts.T().Logf(" Is Autoscaling: %v", workerInfo.WorkflowStickyPollerInfo.IsAutoscaling) + } + if workerInfo.ActivityPollerInfo != nil { + ts.T().Logf("=== Activity Poller Info ===") + ts.T().Logf(" Current Pollers: %d", workerInfo.ActivityPollerInfo.CurrentPollers) + ts.T().Logf(" Last Successful Poll Time: %v", workerInfo.ActivityPollerInfo.LastSuccessfulPollTime.AsTime()) + ts.T().Logf(" Is Autoscaling: %v", workerInfo.ActivityPollerInfo.IsAutoscaling) + } + if workerInfo.NexusPollerInfo != nil { + ts.T().Logf("=== Nexus Poller Info ===") + ts.T().Logf(" Current Pollers: %d", workerInfo.NexusPollerInfo.CurrentPollers) + ts.T().Logf(" Last Successful Poll Time: %v", workerInfo.NexusPollerInfo.LastSuccessfulPollTime.AsTime()) + ts.T().Logf(" Is Autoscaling: %v", workerInfo.NexusPollerInfo.IsAutoscaling) + } + if len(workerInfo.Plugins) > 0 { + ts.T().Logf("=== Plugins ===") + for _, plugin := range workerInfo.Plugins { + ts.T().Logf(" Name: %s", plugin.Name) + } + } +} + +// Simple workflow for testing +func simpleWorkflow(ctx workflow.Context) (string, error) { + return "hello", nil +} + +// Simple nexus operation for testing - just returns immediately +var noopNexusOp = nexus.NewSyncOperation("noop", func(ctx context.Context, input 
nexus.NoValue, opts nexus.StartOperationOptions) (nexus.NoValue, error) { + return nil, nil +}) + +var ( + blockingActivityStarted = make(chan struct{}, 10) + blockingActivityComplete = make(chan struct{}, 10) +) + +func blockingActivity(ctx context.Context) (string, error) { + // Signal that activity has started + select { + case blockingActivityStarted <- struct{}{}: + default: + } + fmt.Println("ACTIVITY STARTED") + + // Wait for signal to complete + select { + case <-blockingActivityComplete: + fmt.Println("ACTIVITY COMPLETED") + return "done", nil + case <-ctx.Done(): + fmt.Println("ACTIVITY TIMED OUT") + return "", ctx.Err() + } +} + +func workflowWithBlockingActivity(ctx workflow.Context) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, blockingActivity).Get(ctx, &result) + return result, err +} + +var failingActivityCallCount atomic.Int32 + +func failingActivity(ctx context.Context) error { + failingActivityCallCount.Add(1) + return temporal.NewApplicationError("intentional failure", "TEST_ERROR") +} + +// Workflow that executes a failing activity with limited retries +func workflowWithFailingActivity(ctx workflow.Context) error { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Second, + RetryPolicy: &temporal.RetryPolicy{ + MaximumAttempts: 1, + }, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + return workflow.ExecuteActivity(ctx, failingActivity).Get(ctx, nil) +} + +// Workflow that panics (simulates workflow task failure) +var failingWorkflowShouldFail atomic.Bool + +func failingWorkflow(ctx workflow.Context) (string, error) { + if failingWorkflowShouldFail.Load() { + return "", errors.New("intentional workflow failure") + } + return "success", nil +} + +// TestWorkerHeartbeatWithActivityInFlight verifies that activity slots are tracked +// correctly when activities are in 
flight +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWithActivityInFlight() { + ctx := context.Background() + + blockingActivityStarted = make(chan struct{}, 10) + blockingActivityComplete = make(chan struct{}, 10) + + ts.worker = worker.New(ts.client, ts.taskQueueName, worker.Options{ + MaxConcurrentActivityExecutionSize: 5, + }) + ts.worker.RegisterWorkflow(workflowWithBlockingActivity) + ts.worker.RegisterActivity(blockingActivity) + ts.NoError(ts.worker.Start()) + + workflowOptions := client.StartWorkflowOptions{ + ID: "test-activity-in-flight-" + uuid.NewString(), + TaskQueue: ts.taskQueueName, + } + + run, err := ts.client.ExecuteWorkflow(ctx, workflowOptions, workflowWithBlockingActivity) + ts.NoError(err) + + // Wait for activity to start + select { + case <-blockingActivityStarted: + ts.T().Log("Activity started") + case <-time.After(10 * time.Second): + ts.Fail("Timeout waiting for activity to start") + } + + time.Sleep(150 * time.Millisecond) + + workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + + ts.T().Logf("Activity slots used: %d, available: %d", + workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots, + workerInfo.ActivityTaskSlotsInfo.CurrentAvailableSlots) + + ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots, int32(1), + "Should have at least 1 activity slot used") + ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.CurrentAvailableSlots, int32(0), + "Available slots should be non-negative") + + blockingActivityComplete <- struct{}{} + + var result string + err = run.Get(ctx, &result) + ts.NoError(err) + ts.Equal("done", result) + + time.Sleep(150 * time.Millisecond) + + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.True(workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil) + ts.T().Logf("After completion - Activity slots used: %d, available: %d", + workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots, + 
workerInfo.ActivityTaskSlotsInfo.CurrentAvailableSlots) + ts.Equal(int32(0), workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots) + ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.TotalProcessedTasks, int32(1)) +} + +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatStickyCacheMiss() { + ctx := context.Background() + + wf1ActivityStarted := make(chan struct{}, 1) + wf1ActivityComplete := make(chan struct{}, 1) + wf2ActivityStarted := make(chan struct{}, 1) + wf2ActivityComplete := make(chan struct{}, 1) + + stickyCacheMissActivity := func(ctx context.Context, marker string) (string, error) { + switch marker { + case "wf1": + select { + case wf1ActivityStarted <- struct{}{}: + default: + } + select { + case <-wf1ActivityComplete: + return marker, nil + case <-ctx.Done(): + return "", ctx.Err() + } + case "wf2": + select { + case wf2ActivityStarted <- struct{}{}: + default: + } + select { + case <-wf2ActivityComplete: + return marker, nil + case <-ctx.Done(): + return "", ctx.Err() + } + } + return marker, nil + } + + stickyCacheMissWorkflow := func(ctx workflow.Context, marker string) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + var result string + err := workflow.ExecuteActivity(ctx, stickyCacheMissActivity, marker).Get(ctx, &result) + return result, err + } + + worker.SetStickyWorkflowCacheSize(1) + ts.worker = worker.New(ts.client, ts.taskQueueName, worker.Options{ + MaxConcurrentWorkflowTaskExecutionSize: 2, + DisableEagerActivities: true, + }) + ts.worker.RegisterWorkflow(stickyCacheMissWorkflow) + ts.worker.RegisterActivity(stickyCacheMissActivity) + ts.NoError(ts.worker.Start()) + + wf1Options := client.StartWorkflowOptions{ + ID: "test-sticky-miss-wf1-" + uuid.NewString(), + TaskQueue: ts.taskQueueName, + } + run1, err := ts.client.ExecuteWorkflow(ctx, wf1Options, stickyCacheMissWorkflow, "wf1") + ts.NoError(err) + + select { + case <-wf1ActivityStarted: 
+ ts.T().Log("wf1 activity started") + case <-time.After(10 * time.Second): + ts.Fail("Timeout waiting for wf1 activity to start") + } + + // this should evict wf1 from the cache + wf2Options := client.StartWorkflowOptions{ + ID: "test-sticky-miss-wf2-" + uuid.NewString(), + TaskQueue: ts.taskQueueName, + } + run2, err := ts.client.ExecuteWorkflow(ctx, wf2Options, stickyCacheMissWorkflow, "wf2") + ts.NoError(err) + + select { + case <-wf2ActivityStarted: + ts.T().Log("wf2 activity started") + case <-time.After(10 * time.Second): + ts.Fail("Timeout waiting for wf2 activity to start") + } + + // wf1 should experience a cache miss when it resumes + wf1ActivityComplete <- struct{}{} + var result1 string + ts.NoError(run1.Get(ctx, &result1)) + ts.Equal("wf1", result1) + + wf2ActivityComplete <- struct{}{} + var result2 string + ts.NoError(run2.Get(ctx, &result2)) + ts.Equal("wf2", result2) + + // Wait for heartbeat + time.Sleep(150 * time.Millisecond) + + workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + + ts.GreaterOrEqual(workerInfo.TotalStickyCacheMiss, int32(1), + "Should have at least 1 sticky cache miss") +} + +// TestWorkerHeartbeatMultipleWorkers verifies that multiple workers can heartbeat +// simultaneously and be tracked separately +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatMultipleWorkers() { + ctx := context.Background() + + taskQueue1 := ts.taskQueueName + "-worker1" + taskQueue2 := ts.taskQueueName + "-worker2" + + worker1 := worker.New(ts.client, taskQueue1, worker.Options{}) + worker1.RegisterWorkflow(simpleWorkflow) + ts.NoError(worker1.Start()) + defer worker1.Stop() + + worker2 := worker.New(ts.client, taskQueue2, worker.Options{}) + worker2.RegisterWorkflow(simpleWorkflow) + ts.NoError(worker2.Start()) + defer worker2.Stop() + + // Run workflow on each worker + var wg sync.WaitGroup + for i, tq := range []string{taskQueue1, taskQueue2} { + wg.Add(1) + go 
func(idx int, taskQueue string) { + defer wg.Done() + workflowOptions := client.StartWorkflowOptions{ + ID: fmt.Sprintf("test-multi-worker-%d-%s", idx, uuid.NewString()), + TaskQueue: taskQueue, + } + run, err := ts.client.ExecuteWorkflow(ctx, workflowOptions, simpleWorkflow) + ts.NoError(err) + err = run.Get(ctx, nil) + ts.NoError(err) + }(i, tq) + } + wg.Wait() + + // Wait for heartbeats + time.Sleep(150 * time.Millisecond) + + // Verify both workers are tracked + workerInfo1 := ts.getWorkerInfo(ctx, taskQueue1) + workerInfo2 := ts.getWorkerInfo(ctx, taskQueue2) + + ts.NotNil(workerInfo1, "Should find worker1") + ts.NotNil(workerInfo2, "Should find worker2") + + ts.NotEqual(workerInfo1.WorkerInstanceKey, workerInfo2.WorkerInstanceKey, + "Different workers should have different instance keys") + + ts.Equal(taskQueue1, workerInfo1.TaskQueue) + ts.Equal(taskQueue2, workerInfo2.TaskQueue) + + ts.Equal(workerInfo1.HostInfo.WorkerGroupingKey, workerInfo2.HostInfo.WorkerGroupingKey, + "Workers should share the same client and worker grouping key") + +} + +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatFailureMetrics() { + ctx := context.Background() + + // Reset call counter + failingActivityCallCount.Store(0) + + ts.worker = worker.New(ts.client, ts.taskQueueName, worker.Options{}) + ts.worker.RegisterWorkflow(workflowWithFailingActivity) + ts.worker.RegisterActivity(failingActivity) + ts.NoError(ts.worker.Start()) + + // Run workflow that will have a failing activity + workflowOptions := client.StartWorkflowOptions{ + ID: "test-failure-metrics-" + uuid.NewString(), + TaskQueue: ts.taskQueueName, + } + + run, err := ts.client.ExecuteWorkflow(ctx, workflowOptions, workflowWithFailingActivity) + ts.NoError(err) + + // Wait for workflow to complete (will fail due to activity failure) + err = run.Get(ctx, nil) + ts.Error(err, "Workflow should fail due to activity failure") + + // Wait for heartbeat to capture failure metrics + time.Sleep(150 * time.Millisecond) + + 
// Get worker info and verify failure counts + workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + ts.NotNil(workerInfo.ActivityTaskSlotsInfo) + ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.LastIntervalFailureTasks, int32(1), + "Should have at least 1 activity failure") + ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.TotalFailedTasks, int32(1), + "Should have tracked at least 1 activity task failure") + + // Last interval should go back to 0 on next heartbeat + time.Sleep(150 * time.Millisecond) + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.Equal(int32(0), workerInfo.ActivityTaskSlotsInfo.LastIntervalFailureTasks) +} + +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWorkflowTaskProcessed() { + ctx := context.Background() + + ts.worker = worker.New(ts.client, ts.taskQueueName, worker.Options{}) + ts.worker.RegisterWorkflow(simpleWorkflow) + ts.NoError(ts.worker.Start()) + + numWorkflows := 3 + for i := 0; i < numWorkflows; i++ { + workflowOptions := client.StartWorkflowOptions{ + ID: fmt.Sprintf("test-wf-processed-%d-%s", i, uuid.NewString()), + TaskQueue: ts.taskQueueName, + } + run, err := ts.client.ExecuteWorkflow(ctx, workflowOptions, simpleWorkflow) + ts.NoError(err) + err = run.Get(ctx, nil) + ts.NoError(err) + } + + // Wait for heartbeat + time.Sleep(150 * time.Millisecond) + + workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + ts.NotNil(workerInfo.WorkflowTaskSlotsInfo) + ts.Equal(int32(numWorkflows), workerInfo.WorkflowTaskSlotsInfo.TotalProcessedTasks) + ts.GreaterOrEqual(workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks, int32(1), + "Should have processed at least 1 workflow task in last interval") + + // Last interval should go back to 0 on next heartbeat + time.Sleep(150 * time.Millisecond) + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + 
ts.NotNil(workerInfo) + ts.Equal(int32(0), workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks) +} + +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatResourceBasedTuner() { + ctx := context.Background() + + tuner, err := resourcetuner.NewResourceBasedTuner(resourcetuner.ResourceBasedTunerOptions{ + TargetMem: 0.8, + TargetCpu: 0.9, + }) + ts.NoError(err) + + tunerWorkflow := func(ctx workflow.Context) error { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + return workflow.ExecuteActivity(ctx, "tunerActivity").Get(ctx, nil) + } + + tunerActivity := func(ctx context.Context) error { + activity.GetLogger(ctx).Info("tunerActivity executed") + return nil + } + + autoscalingBehavior := worker.NewPollerBehaviorAutoscaling(worker.PollerBehaviorAutoscalingOptions{ + InitialNumberOfPollers: 5, + MinimumNumberOfPollers: 1, + MaximumNumberOfPollers: 200, + }) + + ts.worker = worker.New(ts.client, ts.taskQueueName, worker.Options{ + Tuner: tuner, + WorkflowTaskPollerBehavior: autoscalingBehavior, + ActivityTaskPollerBehavior: autoscalingBehavior, + NexusTaskPollerBehavior: autoscalingBehavior, + }) + ts.worker.RegisterWorkflowWithOptions(tunerWorkflow, workflow.RegisterOptions{Name: "tunerWorkflow"}) + ts.worker.RegisterActivityWithOptions(tunerActivity, activity.RegisterOptions{Name: "tunerActivity"}) + ts.NoError(ts.worker.Start()) + + // Run a workflow + workflowOptions := client.StartWorkflowOptions{ + ID: "test-resource-tuner-" + uuid.NewString(), + TaskQueue: ts.taskQueueName, + } + run, err := ts.client.ExecuteWorkflow(ctx, workflowOptions, "tunerWorkflow") + ts.NoError(err) + ts.NoError(run.Get(ctx, nil)) + + // Wait for heartbeat + time.Sleep(150 * time.Millisecond) + + workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + + ts.NotNil(workerInfo.WorkflowTaskSlotsInfo) + ts.Equal("ResourceBased", 
workerInfo.WorkflowTaskSlotsInfo.SlotSupplierKind) + + ts.NotNil(workerInfo.ActivityTaskSlotsInfo) + ts.Equal("ResourceBased", workerInfo.ActivityTaskSlotsInfo.SlotSupplierKind) + + ts.NotNil(workerInfo.LocalActivitySlotsInfo) + ts.Equal("ResourceBased", workerInfo.LocalActivitySlotsInfo.SlotSupplierKind) + + ts.NotNil(workerInfo.WorkflowPollerInfo) + ts.True(workerInfo.WorkflowPollerInfo.IsAutoscaling) + + ts.NotNil(workerInfo.WorkflowStickyPollerInfo) + ts.True(workerInfo.WorkflowStickyPollerInfo.IsAutoscaling) + + ts.NotNil(workerInfo.ActivityPollerInfo) + ts.True(workerInfo.ActivityPollerInfo.IsAutoscaling) +} + +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatPlugins() { + ctx := context.Background() + + clientPlugin, err := temporal.NewSimplePlugin(temporal.SimplePluginOptions{ + Name: "test-client-plugin", + }) + ts.NoError(err) + + workerPlugin, err := temporal.NewSimplePlugin(temporal.SimplePluginOptions{ + Name: "test-worker-plugin", + }) + ts.NoError(err) + + duplicatePlugin, err := temporal.NewSimplePlugin(temporal.SimplePluginOptions{ + Name: "test-client-plugin", + }) + ts.NoError(err) + + // Create a new client with the plugin + heartbeatInterval := 100 * time.Millisecond + pluginClient, err := client.Dial(client.Options{ + HostPort: ts.config.ServiceAddr, + Namespace: ts.config.Namespace, + Logger: ilog.NewDefaultLogger(), + WorkerHeartbeatInterval: &heartbeatInterval, + ConnectionOptions: client.ConnectionOptions{TLS: ts.config.TLS}, + Identity: "PluginTest", + Plugins: []client.Plugin{clientPlugin}, + }) + ts.NoError(err) + defer pluginClient.Close() + + // Create worker with additional plugins (including duplicate) + ts.worker = worker.New(pluginClient, ts.taskQueueName, worker.Options{ + Plugins: []worker.Plugin{workerPlugin, duplicatePlugin}, + }) + ts.worker.RegisterWorkflow(simpleWorkflow) + ts.NoError(ts.worker.Start()) + + workflowOptions := client.StartWorkflowOptions{ + ID: "test-plugins-" + uuid.NewString(), + TaskQueue: 
ts.taskQueueName, + } + run, err := pluginClient.ExecuteWorkflow(ctx, workflowOptions, simpleWorkflow) + ts.NoError(err) + ts.NoError(run.Get(ctx, nil)) + + // Wait for heartbeat + time.Sleep(150 * time.Millisecond) + + workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) + ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + + // Verify plugin names are reported + ts.NotNil(workerInfo.Plugins) + ts.Len(workerInfo.Plugins, 2, "Should have 2 unique plugins (duplicates deduped)") + + pluginNames := make(map[string]bool) + for _, plugin := range workerInfo.Plugins { + pluginNames[plugin.Name] = true + } + ts.True(pluginNames["test-client-plugin"], "Should contain client plugin") + ts.True(pluginNames["test-worker-plugin"], "Should contain worker plugin") +} diff --git a/contrib/resourcetuner/cgroups.go b/worker/hostmetrics/cgroups.go similarity index 99% rename from contrib/resourcetuner/cgroups.go rename to worker/hostmetrics/cgroups.go index f6615296f..f10614c33 100644 --- a/contrib/resourcetuner/cgroups.go +++ b/worker/hostmetrics/cgroups.go @@ -1,6 +1,6 @@ //go:build linux -package resourcetuner +package hostmetrics import ( "errors" diff --git a/contrib/resourcetuner/cgroups_notlinux.go b/worker/hostmetrics/cgroups_notlinux.go similarity index 87% rename from contrib/resourcetuner/cgroups_notlinux.go rename to worker/hostmetrics/cgroups_notlinux.go index 068e4220f..ca3d940d7 100644 --- a/contrib/resourcetuner/cgroups_notlinux.go +++ b/worker/hostmetrics/cgroups_notlinux.go @@ -1,6 +1,6 @@ //go:build !linux -package resourcetuner +package hostmetrics import "errors" @@ -8,8 +8,7 @@ func newCGroupInfo() cGroupInfo { return &cGroupInfoImpl{} } -type cGroupInfoImpl struct { -} +type cGroupInfoImpl struct{} func (p *cGroupInfoImpl) Update() (bool, error) { return false, errors.New("cgroup is not supported on this platform") diff --git a/worker/hostmetrics/hostmetrics.go b/worker/hostmetrics/hostmetrics.go new file mode 100644 index 
000000000..e2dd79653 --- /dev/null +++ b/worker/hostmetrics/hostmetrics.go @@ -0,0 +1,121 @@ +// Package hostmetrics provides host-level CPU and memory metrics collection +// for worker heartbeats. It uses gopsutil for system metrics and supports +// cgroup metrics for containerized environments. +package hostmetrics + +import ( + "context" + "runtime" + "sync" + "time" + + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/mem" + "go.temporal.io/sdk/log" +) + +// PSUtilSystemInfoSupplier implements worker.HostMetricsProvider using gopsutil. +type PSUtilSystemInfoSupplier struct { + mu sync.Mutex + lastRefresh time.Time + lastMemStat *mem.VirtualMemoryStat + lastCpuUsage float64 + cGroupInfo cGroupInfo + stopTryingToGetCGroupInfo bool + logger log.Logger +} + +// NewPSUtilSystemInfoSupplier creates a new PSUtilSystemInfoSupplier. +func NewPSUtilSystemInfoSupplier(logger log.Logger) *PSUtilSystemInfoSupplier { + return &PSUtilSystemInfoSupplier{ + logger: logger, + cGroupInfo: newCGroupInfo(), + } +} + +// GetCpuUsage returns the current host CPU usage as a fraction (0.0-1.0). +// In containerized environments, it prefers cgroup metrics if available. +func (p *PSUtilSystemInfoSupplier) GetCpuUsage() (float64, error) { + return p.GetCpuUsageWithLogger(p.logger) +} + +// GetCpuUsageWithLogger is like GetCpuUsage but uses the provided logger for warnings. +func (p *PSUtilSystemInfoSupplier) GetCpuUsageWithLogger(logger log.Logger) (float64, error) { + if err := p.maybeRefresh(logger); err != nil { + return 0, err + } + // Prefer cgroup metrics in containerized environments + if p.cGroupInfo != nil { + if cgroupCPU := p.cGroupInfo.GetLastCPUUsage(); cgroupCPU != 0 { + return cgroupCPU, nil + } + } + return p.lastCpuUsage / 100, nil +} + +// GetMemoryUsage returns the current host memory usage as a fraction (0.0-1.0). +// In containerized environments, it prefers cgroup metrics if available. 
+func (p *PSUtilSystemInfoSupplier) GetMemoryUsage() (float64, error) { + return p.GetMemoryUsageWithLogger(p.logger) +} + +// GetMemoryUsageWithLogger is like GetMemoryUsage but uses the provided logger for warnings. +func (p *PSUtilSystemInfoSupplier) GetMemoryUsageWithLogger(logger log.Logger) (float64, error) { + if err := p.maybeRefresh(logger); err != nil { + return 0, err + } + if cgroupMem := p.cGroupInfo.GetLastMemUsage(); cgroupMem != 0 { + return cgroupMem, nil + } + return p.lastMemStat.UsedPercent / 100, nil +} + +func (p *PSUtilSystemInfoSupplier) maybeRefresh(logger log.Logger) error { + if time.Since(p.lastRefresh) < 100*time.Millisecond { + return nil + } + p.mu.Lock() + defer p.mu.Unlock() + // Double check refresh is still needed + if time.Since(p.lastRefresh) < 100*time.Millisecond { + return nil + } + + ctx, cancelFn := context.WithTimeout(context.Background(), 1*time.Second) + defer cancelFn() + memStat, err := mem.VirtualMemoryWithContext(ctx) + if err != nil { + return err + } + cpuUsage, err := cpu.PercentWithContext(ctx, 0, false) + if err != nil { + return err + } + + p.lastMemStat = memStat + p.lastCpuUsage = cpuUsage[0] + + // Try cgroup metrics on Linux for containerized environments + if runtime.GOOS == "linux" && !p.stopTryingToGetCGroupInfo && p.cGroupInfo != nil { + continueUpdates, err := p.cGroupInfo.Update() + if err != nil && logger != nil { + logger.Warn("Failed to get cgroup stats", "error", err) + } + p.stopTryingToGetCGroupInfo = !continueUpdates + } + + p.lastRefresh = time.Now() + return nil +} + +type cGroupInfo interface { + // Update requests an update of the cgroup stats. Returns true if cgroup stats + // should continue to be updated, false if not in a cgroup or error is unrecoverable. + Update() (bool, error) + // GetLastMemUsage returns last known memory usage as a fraction of cgroup limit. + // Returns 0 if not in a cgroup or limit is not set. 
+ GetLastMemUsage() float64 + // GetLastCPUUsage returns last known CPU usage as a fraction of cgroup limit. + // Returns 0 if not in a cgroup or limit is not set. + GetLastCPUUsage() float64 +} diff --git a/worker/hostmetrics/hostmetrics_test.go b/worker/hostmetrics/hostmetrics_test.go new file mode 100644 index 000000000..243db4a75 --- /dev/null +++ b/worker/hostmetrics/hostmetrics_test.go @@ -0,0 +1,49 @@ +package hostmetrics + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestPSUtilSystemInfoSupplier_GetCpuUsage(t *testing.T) { + p := NewPSUtilSystemInfoSupplier(nil) + + cpu, err := p.GetCpuUsage() + require.NoError(t, err) + assert.GreaterOrEqual(t, cpu, 0.0) + assert.LessOrEqual(t, cpu, 1.0) +} + +func TestPSUtilSystemInfoSupplier_GetMemoryUsage(t *testing.T) { + p := NewPSUtilSystemInfoSupplier(nil) + + mem, err := p.GetMemoryUsage() + require.NoError(t, err) + assert.GreaterOrEqual(t, mem, 0.0) + assert.LessOrEqual(t, mem, 1.0) +} + +func TestPSUtilSystemInfoSupplier_RateLimiting(t *testing.T) { + p := NewPSUtilSystemInfoSupplier(nil) + + // First call should refresh + _, err := p.GetCpuUsage() + require.NoError(t, err) + firstRefresh := p.lastRefresh + + // Immediate second call should use cached value + _, err = p.GetCpuUsage() + require.NoError(t, err) + assert.Equal(t, firstRefresh, p.lastRefresh) + + // Wait past the refresh interval + time.Sleep(150 * time.Millisecond) + + // Third call should refresh + _, err = p.GetCpuUsage() + require.NoError(t, err) + assert.NotEqual(t, firstRefresh, p.lastRefresh) +} diff --git a/worker/worker.go b/worker/worker.go index f0a5b8eb2..e961bb2ca 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -236,6 +236,13 @@ type ( // ReplayWorkflowHistoryOptions are options for replaying a workflow. 
ReplayWorkflowHistoryOptions = internal.ReplayWorkflowHistoryOptions + + // HostMetricsProvider provides host-level CPU and memory metrics for worker heartbeats. + // Implement this interface to provide custom metrics collection, or use the default + // implementation provided by the SDK in the worker/hostmetrics package. + // + // NOTE: Experimental + HostMetricsProvider = internal.HostMetricsProvider ) var _ WorkflowRegistry = (WorkflowReplayer)(nil) From 6af4db866f6a706459ca13dad4697ffa0ccd00c6 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 22 Jan 2026 16:54:09 -0800 Subject: [PATCH 02/30] vendor gopsutil --- .github/workflows/ci.yml | 46 ++++ go.mod | 18 +- go.sum | 54 +--- internal/sysinfo/LICENSE | 30 +++ internal/sysinfo/README.md | 9 + internal/sysinfo/common.go | 112 +++++++++ internal/sysinfo/common_darwin.go | 61 +++++ internal/sysinfo/cpu.go | 141 +++++++++++ internal/sysinfo/cpu_darwin.go | 101 ++++++++ internal/sysinfo/cpu_linux.go | 134 ++++++++++ internal/sysinfo/cpu_unsupported.go | 18 ++ internal/sysinfo/cpu_windows.go | 141 +++++++++++ internal/sysinfo/mem.go | 50 ++++ internal/sysinfo/mem_darwin.go | 76 ++++++ internal/sysinfo/mem_linux.go | 100 ++++++++ internal/sysinfo/mem_unsupported.go | 18 ++ internal/sysinfo/mem_windows.go | 50 ++++ .../sysinfo/scripts/compare_with_gopsutil.sh | 236 ++++++++++++++++++ worker/hostmetrics/cgroups.go | 152 ++++++----- worker/hostmetrics/hostmetrics.go | 13 +- .../scripts/compare_with_containerd.sh | 153 ++++++++++++ 21 files changed, 1575 insertions(+), 138 deletions(-) create mode 100644 internal/sysinfo/LICENSE create mode 100644 internal/sysinfo/README.md create mode 100644 internal/sysinfo/common.go create mode 100644 internal/sysinfo/common_darwin.go create mode 100644 internal/sysinfo/cpu.go create mode 100644 internal/sysinfo/cpu_darwin.go create mode 100644 internal/sysinfo/cpu_linux.go create mode 100644 internal/sysinfo/cpu_unsupported.go create mode 100644 internal/sysinfo/cpu_windows.go 
create mode 100644 internal/sysinfo/mem.go create mode 100644 internal/sysinfo/mem_darwin.go create mode 100644 internal/sysinfo/mem_linux.go create mode 100644 internal/sysinfo/mem_unsupported.go create mode 100644 internal/sysinfo/mem_windows.go create mode 100755 internal/sysinfo/scripts/compare_with_gopsutil.sh create mode 100755 worker/hostmetrics/scripts/compare_with_containerd.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0008a2e18..f0866ec90 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,3 +135,49 @@ jobs: go-repo-path: ${{github.event.pull_request.head.repo.full_name}} version: ${{github.event.pull_request.head.ref}} version-is-repo-ref: true + + # Verify internal/sysinfo matches gopsutil on all platforms + sysinfo-compare: + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-intel, macos-arm, windows-latest] + include: + - os: macos-intel + runsOn: macos-15-intel + - os: macos-arm + runsOn: macos-14 + runs-on: ${{ matrix.runsOn || matrix.os }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: stable + + - name: Compare sysinfo with gopsutil (Unix) + if: runner.os != 'Windows' + run: ./internal/sysinfo/scripts/compare_with_gopsutil.sh + + - name: Compare sysinfo with gopsutil (Windows) + if: runner.os == 'Windows' + shell: bash + run: ./internal/sysinfo/scripts/compare_with_gopsutil.sh + + cgroups-compare: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Compare cgroups with containerd/cgroups + run: | + docker run --rm \ + -v "${{ github.workspace }}":/workspace \ + -w /workspace \ + --memory=512m \ + --cpus=1 \ + golang:1.23 \ + ./worker/hostmetrics/scripts/compare_with_containerd.sh diff --git a/go.mod b/go.mod index fe0edf0dd..2de405460 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.23.0 toolchain go1.23.6 require ( - 
github.com/containerd/cgroups/v3 v3.0.3 + github.com/ebitengine/purego v0.9.1 github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a github.com/gogo/protobuf v1.3.2 github.com/golang/mock v1.6.0 @@ -14,7 +14,6 @@ require ( github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.2 github.com/nexus-rpc/sdk-go v0.5.1 github.com/robfig/cron v1.2.0 - github.com/shirou/gopsutil/v4 v4.24.8 github.com/stretchr/testify v1.10.0 go.temporal.io/api v1.59.0 golang.org/x/sync v0.13.0 @@ -25,25 +24,10 @@ require ( ) require ( - github.com/cilium/ebpf v0.11.0 // indirect - github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/davecgh/go-spew v1.1.1 // indirect - github.com/go-ole/go-ole v1.2.6 // indirect - github.com/godbus/dbus/v5 v5.0.4 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect - github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect - github.com/opencontainers/runtime-spec v1.0.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect - github.com/shoenig/go-m1cpu v0.1.6 // indirect - github.com/sirupsen/logrus v1.9.3 // indirect github.com/stretchr/objx v0.5.2 // indirect - github.com/tklauser/go-sysconf v0.3.12 // indirect - github.com/tklauser/numcpus v0.6.1 // indirect - github.com/yusufpapurcu/wmi v1.2.4 // indirect - go.einride.tech/pid v0.1.3 // indirect - go.temporal.io/sdk/contrib/resourcetuner v0.0.0-20260112203102-5b6df8e02dcf // indirect - golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 // indirect golang.org/x/net v0.39.0 // indirect golang.org/x/text v0.24.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240827150818-7e3bb234dfed // indirect diff --git a/go.sum b/go.sum index b069bc428..33e474181 100644 --- a/go.sum +++ b/go.sum @@ -1,25 +1,13 @@ -github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= -github.com/cilium/ebpf v0.11.0/go.mod 
h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs= -github.com/containerd/cgroups/v3 v3.0.3 h1:S5ByHZ/h9PMe5IOQoN7E+nMc2UcLEM/V48DGDJ9kip0= -github.com/containerd/cgroups/v3 v3.0.3/go.mod h1:8HBe7V3aWGLFPd/k03swSIsGjZhHI2WzJmticMgVuz0= -github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= -github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A= +github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a/go.mod h1:7Ga40egUymuWXxAe151lTNnCv97MddSOVsjpPPkityA= -github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= -github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= -github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= -github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= -github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= -github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/mock v1.6.0 h1:ErTB+efbowRARo13NNdxyJji2egdxLGQhRaY+DUumQc= github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= -github.com/google/go-cmp v0.5.6/go.mod 
h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -34,58 +22,26 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= -github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/nexus-rpc/sdk-go v0.5.1 h1:UFYYfoHlQc+Pn9gQpmn9QE7xluewAn2AO1OSkAh7YFU= github.com/nexus-rpc/sdk-go v0.5.1/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk= -github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= -github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= -github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ= github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= 
-github.com/shirou/gopsutil/v4 v4.24.8 h1:pVQjIenQkIhqO81mwTaXjTzOMT7d3TZkf43PlVFHENI= -github.com/shirou/gopsutil/v4 v4.24.8/go.mod h1:wE0OrJtj4dG+hYkxqDH3QiBICdKSf04/npcvLLc/oRg= -github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= -github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= -github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= -github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= -github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= -github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= -github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= -github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= -github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod 
h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= -github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= -go.einride.tech/pid v0.1.3 h1:yWAKSmD2Z10jxd4gYFhOjbBNqXeIQwAtnCO/XKCT7sQ= -go.einride.tech/pid v0.1.3/go.mod h1:33JSUbKrH/4v8DZf/0K8IC8Enjd92wB2birp+bCYQso= go.temporal.io/api v1.59.0 h1:QUpAju1KKs9xBfGSI0Uwdyg06k6dRCJH+Zm3G1Jc9Vk= go.temporal.io/api v1.59.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= -go.temporal.io/sdk/contrib/resourcetuner v0.0.0-20260112203102-5b6df8e02dcf h1:hfa3sOvh1ZoC2SH5FKA5UdivU5X3AjARYEzSUy0ObUc= -go.temporal.io/sdk/contrib/resourcetuner v0.0.0-20260112203102-5b6df8e02dcf/go.mod h1:UvUEaYWquPBbehPnQJ6St0iDJVDV1HFXRSicol9Z+ek= -go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= -go.uber.org/goleak v1.1.12/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 h1:aAcj0Da7eBAtrTp03QXWvm88pSyOt+UgdZw2BFZ+lEw= -golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8/go.mod h1:CQ1k9gNrJ50XIzaKCRR2hssIjF07kZFEiieALBM/ARQ= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -104,15 +60,10 @@ golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -142,6 +93,5 @@ google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod 
h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/sysinfo/LICENSE b/internal/sysinfo/LICENSE new file mode 100644 index 000000000..ca62f1f51 --- /dev/null +++ b/internal/sysinfo/LICENSE @@ -0,0 +1,30 @@ +This package contains code derived from gopsutil: +https://github.com/shirou/gopsutil + +gopsutil is distributed under BSD license reproduced below. + +Copyright (c) 2014, WAKAYAMA Shirou +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the gopsutil authors nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/internal/sysinfo/README.md b/internal/sysinfo/README.md new file mode 100644 index 000000000..86c6a1ca1 --- /dev/null +++ b/internal/sysinfo/README.md @@ -0,0 +1,9 @@ +This package is vendored based off of the [gopsutil](https://github.com/shirou/gopsutil) +package, where we've stripped everything except the CPU and mem measuring functionality. +We also only need to support Darwin, Linux, and Windows measurements, as those are +the platforms the SDK itself supports. `LICENSE` has been included in this directory +to honor the BSD license of gopsutil. + +When making changes to update with upstream, use the `scripts/compare_with_gopsutil.sh` +to compare the results of the vendored package with using the library directly. +CI also runs this script to ensure there are no unexpected discrepancies. diff --git a/internal/sysinfo/common.go b/internal/sysinfo/common.go new file mode 100644 index 000000000..7026956d9 --- /dev/null +++ b/internal/sysinfo/common.go @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. +package sysinfo + +import ( + "bufio" + "context" + "errors" + "io" + "os" + "path/filepath" + "strings" + "time" +) + +var ErrNotImplemented = errors.New("not implemented on this platform") + +var Timeout = 3 * time.Second + +// EnvKey is the type for context keys used to pass environment variables. +type EnvKey string + +// EnvKeyType is the type alias for environment variable keys. +type EnvKeyType = string + +// EnvMap is the type alias for environment variable maps. +type EnvMap = map[EnvKeyType]string + +// ReadLines reads contents from a file and splits them by new lines. +func ReadLines(filename string) ([]string, error) { + return ReadLinesOffsetN(filename, 0, -1) +} + +// ReadLinesOffsetN reads contents from file and splits them by new line. 
+// The offset tells at which line number to start. +// The count determines the number of lines to read (starting from offset): +// n >= 0: at most n lines +// n < 0: whole file +func ReadLinesOffsetN(filename string, offset uint, n int) ([]string, error) { + f, err := os.Open(filename) + if err != nil { + return []string{""}, err + } + defer f.Close() + + var ret []string + + r := bufio.NewReader(f) + for i := uint(0); i < uint(n)+offset || n < 0; i++ { + line, err := r.ReadString('\n') + if err != nil { + if err == io.EOF && line != "" { + ret = append(ret, strings.Trim(line, "\n")) + } + break + } + if i < offset { + continue + } + ret = append(ret, strings.Trim(line, "\n")) + } + + return ret, nil +} + +// GetEnvWithContext retrieves the environment variable key. +// If it does not exist it returns the default. +func GetEnvWithContext(ctx context.Context, key string, dfault string, combineWith ...string) string { + var value string + if env, ok := ctx.Value(EnvKey("env")).(EnvMap); ok { + value = env[key] + } + if value == "" { + value = os.Getenv(key) + } + if value == "" { + value = dfault + } + + return combine(value, combineWith) +} + +func combine(value string, combineWith []string) string { + switch len(combineWith) { + case 0: + return value + case 1: + return filepath.Join(value, combineWith[0]) + default: + all := make([]string, len(combineWith)+1) + all[0] = value + copy(all[1:], combineWith) + return filepath.Join(all...) + } +} + +func HostProcWithContext(ctx context.Context, combineWith ...string) string { + return GetEnvWithContext(ctx, "HOST_PROC", "/proc", combineWith...) +} + +// Sleep sleeps for the specified duration, respecting context cancellation. 
+func Sleep(ctx context.Context, interval time.Duration) error { + timer := time.NewTimer(interval) + defer timer.Stop() + select { + case <-ctx.Done(): + return ctx.Err() + case <-timer.C: + return nil + } +} diff --git a/internal/sysinfo/common_darwin.go b/internal/sysinfo/common_darwin.go new file mode 100644 index 000000000..e95fa65bb --- /dev/null +++ b/internal/sysinfo/common_darwin.go @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. +//go:build darwin + +package sysinfo + +import ( + "github.com/ebitengine/purego" +) + +const ( + systemLibPath = "/usr/lib/libSystem.B.dylib" + + // mach/processor_info.h + processorCpuLoadInfo = 2 + + // mach/host_info.h + hostVMInfo = 2 + hostCpuLoadInfo = 3 + hostVMInfoCount = 0xf + + // Status codes + kernSuccess = 0 +) + +type systemLib struct { + handle uintptr + + hostProcessorInfo func(host uint32, flavor int32, outProcessorCount *uint32, + outProcessorInfo uintptr, outProcessorInfoCnt *uint32) int32 + hostStatistics func(host uint32, flavor int32, hostInfoOut uintptr, hostInfoOutCnt *uint32) int32 + machHostSelf func() uint32 + machTaskSelf func() uint32 + vmDeallocate func(targetTask uint32, vmAddress, vmSize uintptr) int32 +} + +func newSystemLib() (*systemLib, error) { + handle, err := purego.Dlopen(systemLibPath, purego.RTLD_LAZY|purego.RTLD_GLOBAL) + if err != nil { + return nil, err + } + + sys := &systemLib{handle: handle} + + purego.RegisterLibFunc(&sys.hostProcessorInfo, handle, "host_processor_info") + purego.RegisterLibFunc(&sys.hostStatistics, handle, "host_statistics") + purego.RegisterLibFunc(&sys.machHostSelf, handle, "mach_host_self") + purego.RegisterLibFunc(&sys.machTaskSelf, handle, "mach_task_self") + purego.RegisterLibFunc(&sys.vmDeallocate, handle, "vm_deallocate") + + return sys, nil +} + +func (s *systemLib) 
Dlsym(symbol string) (uintptr, error) { + return purego.Dlsym(s.handle, symbol) +} + +func (s *systemLib) close() { + purego.Dlclose(s.handle) +} diff --git a/internal/sysinfo/cpu.go b/internal/sysinfo/cpu.go new file mode 100644 index 000000000..7bd7967b8 --- /dev/null +++ b/internal/sysinfo/cpu.go @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. +package sysinfo + +import ( + "context" + "fmt" + "math" + "runtime" + "sync" + "time" +) + +// TimesStat contains the amounts of time the CPU has spent performing different +// kinds of work. Time units are in seconds. It is based on linux /proc/stat file. +type TimesStat struct { + CPU string `json:"cpu"` + User float64 `json:"user"` + System float64 `json:"system"` + Idle float64 `json:"idle"` + Nice float64 `json:"nice"` + Iowait float64 `json:"iowait"` + Irq float64 `json:"irq"` + Softirq float64 `json:"softirq"` + Steal float64 `json:"steal"` + Guest float64 `json:"guest"` + GuestNice float64 `json:"guestNice"` +} + +type lastPercent struct { + sync.Mutex + lastCPUTimes []TimesStat + lastPerCPUTimes []TimesStat +} + +var lastCPUPercent lastPercent + +func init() { + lastCPUPercent.Lock() + lastCPUPercent.lastCPUTimes, _ = Times(false) + lastCPUPercent.lastPerCPUTimes, _ = Times(true) + lastCPUPercent.Unlock() +} + +func (c TimesStat) Total() float64 { + total := c.User + c.System + c.Idle + c.Nice + c.Iowait + c.Irq + + c.Softirq + c.Steal + c.Guest + c.GuestNice + return total +} + +func getAllBusy(t TimesStat) (float64, float64) { + tot := t.Total() + if runtime.GOOS == "linux" { + tot -= t.Guest // Linux 2.6.24+ + tot -= t.GuestNice // Linux 3.2.0+ + } + busy := tot - t.Idle - t.Iowait + return tot, busy +} + +func calculateBusy(t1, t2 TimesStat) float64 { + t1All, t1Busy := getAllBusy(t1) + t2All, t2Busy := getAllBusy(t2) + + if 
t2Busy <= t1Busy { + return 0 + } + if t2All <= t1All { + return 100 + } + return math.Min(100, math.Max(0, (t2Busy-t1Busy)/(t2All-t1All)*100)) +} + +func calculateAllBusy(t1, t2 []TimesStat) ([]float64, error) { + if len(t1) != len(t2) { + return nil, fmt.Errorf( + "received two CPU counts: %d != %d", + len(t1), len(t2), + ) + } + + ret := make([]float64, len(t1)) + for i, t := range t2 { + ret[i] = calculateBusy(t1[i], t) + } + return ret, nil +} + +// Percent calculates the percentage of cpu used either per CPU or combined. +// If an interval of 0 is given it will compare the current cpu times against the last call. +// Returns one value per cpu, or a single value if percpu is set to false. +func Percent(interval time.Duration, percpu bool) ([]float64, error) { + return PercentWithContext(context.Background(), interval, percpu) +} + +func PercentWithContext(ctx context.Context, interval time.Duration, percpu bool) ([]float64, error) { + if interval <= 0 { + return percentUsedFromLastCallWithContext(ctx, percpu) + } + + // Get CPU usage at the start of the interval. + cpuTimes1, err := TimesWithContext(ctx, percpu) + if err != nil { + return nil, err + } + + if err := Sleep(ctx, interval); err != nil { + return nil, err + } + + // And at the end of the interval. + cpuTimes2, err := TimesWithContext(ctx, percpu) + if err != nil { + return nil, err + } + + return calculateAllBusy(cpuTimes1, cpuTimes2) +} + +func percentUsedFromLastCallWithContext(ctx context.Context, percpu bool) ([]float64, error) { + cpuTimes, err := TimesWithContext(ctx, percpu) + if err != nil { + return nil, err + } + lastCPUPercent.Lock() + defer lastCPUPercent.Unlock() + var lastTimes []TimesStat + if percpu { + lastTimes = lastCPUPercent.lastPerCPUTimes + lastCPUPercent.lastPerCPUTimes = cpuTimes + } else { + lastTimes = lastCPUPercent.lastCPUTimes + lastCPUPercent.lastCPUTimes = cpuTimes + } + + if lastTimes == nil { + return nil, fmt.Errorf("error getting times for cpu percent. 
lastTimes was nil") + } + return calculateAllBusy(lastTimes, cpuTimes) +} diff --git a/internal/sysinfo/cpu_darwin.go b/internal/sysinfo/cpu_darwin.go new file mode 100644 index 000000000..09e2d3b79 --- /dev/null +++ b/internal/sysinfo/cpu_darwin.go @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. +//go:build darwin + +package sysinfo + +import ( + "context" + "errors" + "fmt" + "unsafe" +) + +// mach/machine.h +const ( + cpuStateUser = 0 + cpuStateSystem = 1 + cpuStateIdle = 2 + cpuStateNice = 3 + cpuStateMax = 4 +) + +type hostCpuLoadInfoData struct { + cpuTicks [cpuStateMax]uint32 +} + +var ClocksPerSec = float64(100) + +func Times(percpu bool) ([]TimesStat, error) { + return TimesWithContext(context.Background(), percpu) +} + +func TimesWithContext(_ context.Context, percpu bool) ([]TimesStat, error) { + sys, err := newSystemLib() + if err != nil { + return nil, err + } + defer sys.close() + + if percpu { + return perCPUTimes(sys) + } + return allCPUTimes(sys) +} + +func perCPUTimes(sys *systemLib) ([]TimesStat, error) { + var count, ncpu uint32 + var cpuload *hostCpuLoadInfoData + + status := sys.hostProcessorInfo(sys.machHostSelf(), processorCpuLoadInfo, + &ncpu, uintptr(unsafe.Pointer(&cpuload)), &count) + + if status != kernSuccess { + return nil, fmt.Errorf("host_processor_info error=%d", status) + } + + if cpuload == nil { + return nil, errors.New("host_processor_info returned nil cpuload") + } + + defer sys.vmDeallocate(sys.machTaskSelf(), uintptr(unsafe.Pointer(cpuload)), uintptr(ncpu)) + + ret := []TimesStat{} + loads := unsafe.Slice(cpuload, ncpu) + + for i := 0; i < int(ncpu); i++ { + c := TimesStat{ + CPU: fmt.Sprintf("cpu%d", i), + User: float64(loads[i].cpuTicks[cpuStateUser]) / ClocksPerSec, + System: float64(loads[i].cpuTicks[cpuStateSystem]) / ClocksPerSec, + Nice: 
float64(loads[i].cpuTicks[cpuStateNice]) / ClocksPerSec, + Idle: float64(loads[i].cpuTicks[cpuStateIdle]) / ClocksPerSec, + } + ret = append(ret, c) + } + + return ret, nil +} + +func allCPUTimes(sys *systemLib) ([]TimesStat, error) { + var cpuload hostCpuLoadInfoData + count := uint32(cpuStateMax) + + status := sys.hostStatistics(sys.machHostSelf(), hostCpuLoadInfo, + uintptr(unsafe.Pointer(&cpuload)), &count) + + if status != kernSuccess { + return nil, fmt.Errorf("host_statistics error=%d", status) + } + + c := TimesStat{ + CPU: "cpu-total", + User: float64(cpuload.cpuTicks[cpuStateUser]) / ClocksPerSec, + System: float64(cpuload.cpuTicks[cpuStateSystem]) / ClocksPerSec, + Nice: float64(cpuload.cpuTicks[cpuStateNice]) / ClocksPerSec, + Idle: float64(cpuload.cpuTicks[cpuStateIdle]) / ClocksPerSec, + } + + return []TimesStat{c}, nil +} diff --git a/internal/sysinfo/cpu_linux.go b/internal/sysinfo/cpu_linux.go new file mode 100644 index 000000000..2feee4237 --- /dev/null +++ b/internal/sysinfo/cpu_linux.go @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. +//go:build linux + +package sysinfo + +import ( + "context" + "errors" + "strconv" + "strings" +) + +// ClocksPerSec is the number of clock ticks per second. +// On Linux, this is typically 100 (USER_HZ). 
+var ClocksPerSec = float64(100) + +func Times(percpu bool) ([]TimesStat, error) { + return TimesWithContext(context.Background(), percpu) +} + +func TimesWithContext(ctx context.Context, percpu bool) ([]TimesStat, error) { + filename := HostProcWithContext(ctx, "stat") + lines := []string{} + if percpu { + statlines, err := ReadLines(filename) + if err != nil || len(statlines) < 2 { + return []TimesStat{}, nil + } + for _, line := range statlines[1:] { + if !strings.HasPrefix(line, "cpu") { + break + } + lines = append(lines, line) + } + } else { + var err error + lines, err = ReadLinesOffsetN(filename, 0, 1) + if err != nil || len(lines) == 0 { + return []TimesStat{}, nil + } + } + + ret := make([]TimesStat, 0, len(lines)) + + for _, line := range lines { + ct, err := parseStatLine(line) + if err != nil { + continue + } + ret = append(ret, *ct) + } + return ret, nil +} + +func parseStatLine(line string) (*TimesStat, error) { + fields := strings.Fields(line) + + if len(fields) < 8 { + return nil, errors.New("stat does not contain cpu info") + } + + if !strings.HasPrefix(fields[0], "cpu") { + return nil, errors.New("not contain cpu") + } + + cpu := fields[0] + if cpu == "cpu" { + cpu = "cpu-total" + } + user, err := strconv.ParseFloat(fields[1], 64) + if err != nil { + return nil, err + } + nice, err := strconv.ParseFloat(fields[2], 64) + if err != nil { + return nil, err + } + system, err := strconv.ParseFloat(fields[3], 64) + if err != nil { + return nil, err + } + idle, err := strconv.ParseFloat(fields[4], 64) + if err != nil { + return nil, err + } + iowait, err := strconv.ParseFloat(fields[5], 64) + if err != nil { + return nil, err + } + irq, err := strconv.ParseFloat(fields[6], 64) + if err != nil { + return nil, err + } + softirq, err := strconv.ParseFloat(fields[7], 64) + if err != nil { + return nil, err + } + + ct := &TimesStat{ + CPU: cpu, + User: user / ClocksPerSec, + Nice: nice / ClocksPerSec, + System: system / ClocksPerSec, + Idle: idle / 
ClocksPerSec, + Iowait: iowait / ClocksPerSec, + Irq: irq / ClocksPerSec, + Softirq: softirq / ClocksPerSec, + } + if len(fields) > 8 { // Linux >= 2.6.11 + steal, err := strconv.ParseFloat(fields[8], 64) + if err != nil { + return nil, err + } + ct.Steal = steal / ClocksPerSec + } + if len(fields) > 9 { // Linux >= 2.6.24 + guest, err := strconv.ParseFloat(fields[9], 64) + if err != nil { + return nil, err + } + ct.Guest = guest / ClocksPerSec + } + if len(fields) > 10 { // Linux >= 3.2.0 + guestNice, err := strconv.ParseFloat(fields[10], 64) + if err != nil { + return nil, err + } + ct.GuestNice = guestNice / ClocksPerSec + } + + return ct, nil +} diff --git a/internal/sysinfo/cpu_unsupported.go b/internal/sysinfo/cpu_unsupported.go new file mode 100644 index 000000000..b848076ad --- /dev/null +++ b/internal/sysinfo/cpu_unsupported.go @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. +//go:build !linux && !darwin && !windows + +package sysinfo + +import ( + "context" +) + +func Times(percpu bool) ([]TimesStat, error) { + return TimesWithContext(context.Background(), percpu) +} + +func TimesWithContext(ctx context.Context, percpu bool) ([]TimesStat, error) { + return nil, ErrNotImplemented +} diff --git a/internal/sysinfo/cpu_windows.go b/internal/sysinfo/cpu_windows.go new file mode 100644 index 000000000..133c05ded --- /dev/null +++ b/internal/sysinfo/cpu_windows.go @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
+//go:build windows + +package sysinfo + +import ( + "context" + "fmt" + "unsafe" + + "golang.org/x/sys/windows" +) + +// SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION +// https://docs.microsoft.com/en-us/windows/desktop/api/winternl/nf-winternl-ntquerysysteminformation#system_processor_performance_information +type win32_SystemProcessorPerformanceInformation struct { + IdleTime int64 + KernelTime int64 + UserTime int64 + DpcTime int64 + InterruptTime int64 + InterruptCount uint64 +} + +const ( + ClocksPerSec = 10000000.0 + + // systemProcessorPerformanceInformationClass information class to query with NTQuerySystemInformation + // https://processhacker.sourceforge.io/doc/ntexapi_8h.html#ad5d815b48e8f4da1ef2eb7a2f18a54e0 + win32_SystemProcessorPerformanceInformationClass = 8 + + // size of systemProcessorPerformanceInfoSize in memory + win32_SystemProcessorPerformanceInfoSize = uint32(unsafe.Sizeof(win32_SystemProcessorPerformanceInformation{})) +) + +var ( + modkernel32 = windows.NewLazySystemDLL("kernel32.dll") + modNt = windows.NewLazySystemDLL("ntdll.dll") + procGetSystemTimes = modkernel32.NewProc("GetSystemTimes") + procNtQuerySystemInformation = modNt.NewProc("NtQuerySystemInformation") +) + +type fileTime struct { + dwLowDateTime uint32 + dwHighDateTime uint32 +} + +func Times(percpu bool) ([]TimesStat, error) { + return TimesWithContext(context.Background(), percpu) +} + +func TimesWithContext(_ context.Context, percpu bool) ([]TimesStat, error) { + if percpu { + return perCPUTimes() + } + + var ret []TimesStat + var lpIdleTime fileTime + var lpKernelTime fileTime + var lpUserTime fileTime + r, _, err := procGetSystemTimes.Call( + uintptr(unsafe.Pointer(&lpIdleTime)), + uintptr(unsafe.Pointer(&lpKernelTime)), + uintptr(unsafe.Pointer(&lpUserTime))) + if r == 0 { + return nil, err + } + + LOT := float64(0.0000001) + HIT := (LOT * 4294967296.0) + idle := ((HIT * float64(lpIdleTime.dwHighDateTime)) + (LOT * float64(lpIdleTime.dwLowDateTime))) + user := ((HIT * 
float64(lpUserTime.dwHighDateTime)) + (LOT * float64(lpUserTime.dwLowDateTime))) + kernel := ((HIT * float64(lpKernelTime.dwHighDateTime)) + (LOT * float64(lpKernelTime.dwLowDateTime))) + system := (kernel - idle) + + ret = append(ret, TimesStat{ + CPU: "cpu-total", + Idle: idle, + User: user, + System: system, + }) + return ret, nil +} + +func perCPUTimes() ([]TimesStat, error) { + var ret []TimesStat + stats, err := perfInfo() + if err != nil { + return nil, err + } + for core, v := range stats { + c := TimesStat{ + CPU: fmt.Sprintf("cpu%d", core), + User: float64(v.UserTime) / ClocksPerSec, + System: float64(v.KernelTime-v.IdleTime) / ClocksPerSec, + Idle: float64(v.IdleTime) / ClocksPerSec, + Irq: float64(v.InterruptTime) / ClocksPerSec, + } + ret = append(ret, c) + } + return ret, nil +} + +// makes call to Windows API function to retrieve performance information for each core +func perfInfo() ([]win32_SystemProcessorPerformanceInformation, error) { + // Make maxResults large for safety. + // We can't invoke the api call with a results array that's too small. + // If we have more than 2056 cores on a single host, then it's probably the future. + maxBuffer := 2056 + // buffer for results from the windows proc + resultBuffer := make([]win32_SystemProcessorPerformanceInformation, maxBuffer) + // size of the buffer in memory + bufferSize := uintptr(win32_SystemProcessorPerformanceInfoSize) * uintptr(maxBuffer) + // size of the returned response + var retSize uint32 + + // Invoke windows api proc. + // The returned err from the windows dll proc will always be non-nil even when successful. 
+ // See https://godoc.org/golang.org/x/sys/windows#LazyProc.Call for more information + retCode, _, err := procNtQuerySystemInformation.Call( + win32_SystemProcessorPerformanceInformationClass, // System Information Class -> SystemProcessorPerformanceInformation + uintptr(unsafe.Pointer(&resultBuffer[0])), // pointer to first element in result buffer + bufferSize, // size of the buffer in memory + uintptr(unsafe.Pointer(&retSize)), // pointer to the size of the returned results the windows proc will set this + ) + + // check return code for errors + if retCode != 0 { + return nil, fmt.Errorf("call to NtQuerySystemInformation returned %d. err: %s", retCode, err.Error()) + } + + // calculate the number of returned elements based on the returned size + numReturnedElements := retSize / win32_SystemProcessorPerformanceInfoSize + + // trim results to the number of returned elements + resultBuffer = resultBuffer[:numReturnedElements] + + return resultBuffer, nil +} diff --git a/internal/sysinfo/mem.go b/internal/sysinfo/mem.go new file mode 100644 index 000000000..223257f6d --- /dev/null +++ b/internal/sysinfo/mem.go @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. +package sysinfo + +// VirtualMemoryStat contains memory usage statistics. +type VirtualMemoryStat struct { + // Total amount of RAM on this system + Total uint64 `json:"total"` + + // RAM available for programs to allocate + Available uint64 `json:"available"` + + // RAM used by programs + Used uint64 `json:"used"` + + // Percentage of RAM used by programs + UsedPercent float64 `json:"usedPercent"` + + // This is the kernel's notion of free memory; RAM chips whose bits nobody + // cares about the value of right now. For a human consumable number, + // Available is what you really want. 
+ Free uint64 `json:"free"` + + // OS X / BSD specific numbers: + // http://www.macyourself.com/2010/02/17/what-is-free-wired-active-and-inactive-system-memory-ram/ + Active uint64 `json:"active"` + Inactive uint64 `json:"inactive"` + Wired uint64 `json:"wired"` + + // Linux specific numbers + // https://blogs.oracle.com/linux/understanding-linux-kernel-memory-statistics + // https://www.kernel.org/doc/Documentation/filesystems/proc.txt + // https://www.kernel.org/doc/Documentation/vm/overcommit-accounting + // https://www.kernel.org/doc/Documentation/vm/transhuge.txt + // + Buffers uint64 `json:"buffers"` + Cached uint64 `json:"cached"` + WriteBack uint64 `json:"writeBack"` + Dirty uint64 `json:"dirty"` + WriteBackTmp uint64 `json:"writeBackTmp"` + Shared uint64 `json:"shared"` + Slab uint64 `json:"slab"` + Sreclaimable uint64 `json:"sreclaimable"` + Sunreclaim uint64 `json:"sunreclaim"` + PageTables uint64 `json:"pageTables"` + SwapCached uint64 `json:"swapCached"` + CommitLimit uint64 `json:"commitLimit"` + CommittedAS uint64 `json:"committedAS"` +} diff --git a/internal/sysinfo/mem_darwin.go b/internal/sysinfo/mem_darwin.go new file mode 100644 index 000000000..29d585570 --- /dev/null +++ b/internal/sysinfo/mem_darwin.go @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
+//go:build darwin + +package sysinfo + +import ( + "context" + "fmt" + "unsafe" + + "golang.org/x/sys/unix" +) + +type vmStatisticsData struct { + freeCount uint32 + activeCount uint32 + inactiveCount uint32 + wireCount uint32 + _ [44]byte +} + +func getHwMemsize() (uint64, error) { + total, err := unix.SysctlUint64("hw.memsize") + if err != nil { + return 0, err + } + return total, nil +} + +func VirtualMemory() (*VirtualMemoryStat, error) { + return VirtualMemoryWithContext(context.Background()) +} + +func VirtualMemoryWithContext(_ context.Context) (*VirtualMemoryStat, error) { + sys, err := newSystemLib() + if err != nil { + return nil, err + } + defer sys.close() + + count := uint32(hostVMInfoCount) + var vmstat vmStatisticsData + + status := sys.hostStatistics(sys.machHostSelf(), hostVMInfo, + uintptr(unsafe.Pointer(&vmstat)), &count) + + if status != kernSuccess { + return nil, fmt.Errorf("host_statistics error=%d", status) + } + + pageSizeAddr, _ := sys.Dlsym("vm_kernel_page_size") + pageSize := **(**uint64)(unsafe.Pointer(&pageSizeAddr)) + total, err := getHwMemsize() + if err != nil { + return nil, err + } + totalCount := uint32(total / pageSize) + + availableCount := vmstat.inactiveCount + vmstat.freeCount + usedPercent := 100 * float64(totalCount-availableCount) / float64(totalCount) + + usedCount := totalCount - availableCount + + return &VirtualMemoryStat{ + Total: total, + Available: pageSize * uint64(availableCount), + Used: pageSize * uint64(usedCount), + UsedPercent: usedPercent, + Free: pageSize * uint64(vmstat.freeCount), + Active: pageSize * uint64(vmstat.activeCount), + Inactive: pageSize * uint64(vmstat.inactiveCount), + Wired: pageSize * uint64(vmstat.wireCount), + }, nil +} diff --git a/internal/sysinfo/mem_linux.go b/internal/sysinfo/mem_linux.go new file mode 100644 index 000000000..3e6652884 --- /dev/null +++ b/internal/sysinfo/mem_linux.go @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from 
github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. +//go:build linux + +package sysinfo + +import ( + "context" + "strconv" + "strings" +) + +func VirtualMemory() (*VirtualMemoryStat, error) { + return VirtualMemoryWithContext(context.Background()) +} + +func VirtualMemoryWithContext(ctx context.Context) (*VirtualMemoryStat, error) { + filename := HostProcWithContext(ctx, "meminfo") + lines, err := ReadLines(filename) + if err != nil { + return nil, err + } + + ret := &VirtualMemoryStat{} + var memAvailable, memFree, cached uint64 + memAvailablePresent := false + + for _, line := range lines { + fields := strings.Split(line, ":") + if len(fields) != 2 { + continue + } + key := strings.TrimSpace(fields[0]) + value := strings.TrimSpace(fields[1]) + value = strings.Replace(value, " kB", "", -1) + + v, err := strconv.ParseUint(value, 10, 64) + if err != nil { + continue + } + v *= 1024 // Convert kB to bytes + + switch key { + case "MemTotal": + ret.Total = v + case "MemFree": + memFree = v + ret.Free = v + case "MemAvailable": + memAvailablePresent = true + memAvailable = v + case "Buffers": + ret.Buffers = v + case "Cached": + cached = v + ret.Cached = v + case "Active": + ret.Active = v + case "Inactive": + ret.Inactive = v + case "Writeback": + ret.WriteBack = v + case "WritebackTmp": + ret.WriteBackTmp = v + case "Dirty": + ret.Dirty = v + case "Shmem": + ret.Shared = v + case "Slab": + ret.Slab = v + case "SReclaimable": + ret.Sreclaimable = v + case "SUnreclaim": + ret.Sunreclaim = v + case "PageTables": + ret.PageTables = v + case "SwapCached": + ret.SwapCached = v + case "CommitLimit": + ret.CommitLimit = v + case "Committed_AS": + ret.CommittedAS = v + } + } + + ret.Cached += ret.Sreclaimable + + // Calculate Available if not present (kernel < 3.14) + if memAvailablePresent { + ret.Available = memAvailable + } else { + ret.Available = memFree + cached + } + 
+ ret.Used = ret.Total - ret.Available + ret.UsedPercent = float64(ret.Used) / float64(ret.Total) * 100.0 + + return ret, nil +} diff --git a/internal/sysinfo/mem_unsupported.go b/internal/sysinfo/mem_unsupported.go new file mode 100644 index 000000000..7c72a5dcd --- /dev/null +++ b/internal/sysinfo/mem_unsupported.go @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. +//go:build !linux && !darwin && !windows + +package sysinfo + +import ( + "context" +) + +func VirtualMemory() (*VirtualMemoryStat, error) { + return VirtualMemoryWithContext(context.Background()) +} + +func VirtualMemoryWithContext(ctx context.Context) (*VirtualMemoryStat, error) { + return nil, ErrNotImplemented +} diff --git a/internal/sysinfo/mem_windows.go b/internal/sysinfo/mem_windows.go new file mode 100644 index 000000000..e013e5337 --- /dev/null +++ b/internal/sysinfo/mem_windows.go @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) +// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
+//go:build windows + +package sysinfo + +import ( + "context" + "unsafe" + + "golang.org/x/sys/windows" +) + +var procGlobalMemoryStatusEx = windows.NewLazySystemDLL("kernel32.dll").NewProc("GlobalMemoryStatusEx") + +type memoryStatusEx struct { + cbSize uint32 + dwMemoryLoad uint32 + ullTotalPhys uint64 + ullAvailPhys uint64 + ullTotalPageFile uint64 + ullAvailPageFile uint64 + ullTotalVirtual uint64 + ullAvailVirtual uint64 + ullAvailExtendedVirtual uint64 +} + +func VirtualMemory() (*VirtualMemoryStat, error) { + return VirtualMemoryWithContext(context.Background()) +} + +func VirtualMemoryWithContext(ctx context.Context) (*VirtualMemoryStat, error) { + var memInfo memoryStatusEx + memInfo.cbSize = uint32(unsafe.Sizeof(memInfo)) + mem, _, err := procGlobalMemoryStatusEx.Call(uintptr(unsafe.Pointer(&memInfo))) + if mem == 0 { + return nil, err + } + + ret := &VirtualMemoryStat{ + Total: memInfo.ullTotalPhys, + Available: memInfo.ullAvailPhys, + Free: memInfo.ullAvailPhys, + UsedPercent: float64(memInfo.dwMemoryLoad), + } + + ret.Used = ret.Total - ret.Available + return ret, nil +} diff --git a/internal/sysinfo/scripts/compare_with_gopsutil.sh b/internal/sysinfo/scripts/compare_with_gopsutil.sh new file mode 100755 index 000000000..64e473adc --- /dev/null +++ b/internal/sysinfo/scripts/compare_with_gopsutil.sh @@ -0,0 +1,236 @@ +#!/bin/bash +# Compare internal/sysinfo implementation against gopsutil +# Usage: ./internal/sysinfo/scripts/compare_with_gopsutil.sh + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SYSINFO_DIR="$(dirname "$SCRIPT_DIR")" +REPO_ROOT="$(cd "$SYSINFO_DIR/../.." && pwd)" + +TEST_FILE="$SYSINFO_DIR/compare_test.go" + +cleanup() { + echo "Cleaning up..." + rm -f "$TEST_FILE" + cd "$REPO_ROOT" && go mod tidy 2>/dev/null + echo "Done." 
+} + +trap cleanup EXIT + +echo "=== Comparing internal/sysinfo against gopsutil ===" +echo "" + +# Create the comparison test file +cat > "$TEST_FILE" << 'TESTEOF' +//go:build compare_gopsutil + +package sysinfo_test + +import ( + "context" + "math" + "testing" + "time" + + gopsutil_cpu "github.com/shirou/gopsutil/v4/cpu" + gopsutil_mem "github.com/shirou/gopsutil/v4/mem" + "go.temporal.io/sdk/internal/sysinfo" +) + +func TestCPUTimesMatchGopsutil(t *testing.T) { + ctx := context.Background() + + t.Run("total", func(t *testing.T) { + gTimes, gErr := gopsutil_cpu.TimesWithContext(ctx, false) + sTimes, sErr := sysinfo.TimesWithContext(ctx, false) + + if gErr != nil || sErr != nil { + t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) + } + + if len(gTimes) != len(sTimes) { + t.Fatalf("length mismatch: gopsutil=%d, sysinfo=%d", len(gTimes), len(sTimes)) + } + + g, s := gTimes[0], sTimes[0] + t.Logf("gopsutil: CPU=%s User=%.4f System=%.4f Idle=%.4f Nice=%.4f Iowait=%.4f", + g.CPU, g.User, g.System, g.Idle, g.Nice, g.Iowait) + t.Logf("sysinfo: CPU=%s User=%.4f System=%.4f Idle=%.4f Nice=%.4f Iowait=%.4f", + s.CPU, s.User, s.System, s.Idle, s.Nice, s.Iowait) + + assertClose(t, "User", g.User, s.User, 0.01) + assertClose(t, "System", g.System, s.System, 0.01) + assertClose(t, "Idle", g.Idle, s.Idle, 0.01) + assertClose(t, "Nice", g.Nice, s.Nice, 0.01) + assertClose(t, "Iowait", g.Iowait, s.Iowait, 0.01) + }) + + t.Run("percpu", func(t *testing.T) { + gTimes, gErr := gopsutil_cpu.TimesWithContext(ctx, true) + sTimes, sErr := sysinfo.TimesWithContext(ctx, true) + + if gErr != nil || sErr != nil { + t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) + } + + if len(gTimes) != len(sTimes) { + t.Fatalf("length mismatch: gopsutil=%d, sysinfo=%d", len(gTimes), len(sTimes)) + } + + t.Logf("Found %d CPUs", len(gTimes)) + for i := range gTimes { + g, s := gTimes[i], sTimes[i] + if g.CPU != s.CPU { + t.Errorf("CPU[%d] name mismatch: gopsutil=%s, sysinfo=%s", i, g.CPU, 
s.CPU) + } + assertClose(t, "User", g.User, s.User, 0.01) + assertClose(t, "System", g.System, s.System, 0.01) + assertClose(t, "Idle", g.Idle, s.Idle, 0.01) + } + }) +} + +func TestCPUPercentMatchesGopsutil(t *testing.T) { + ctx := context.Background() + + t.Run("with_interval", func(t *testing.T) { + interval := 200 * time.Millisecond + + // Run both concurrently so they measure the same time window + var gPercent, sPercent []float64 + var gErr, sErr error + + done := make(chan struct{}) + go func() { + gPercent, gErr = gopsutil_cpu.PercentWithContext(ctx, interval, false) + done <- struct{}{} + }() + go func() { + sPercent, sErr = sysinfo.PercentWithContext(ctx, interval, false) + done <- struct{}{} + }() + <-done + <-done + + if gErr != nil || sErr != nil { + t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) + } + + if len(gPercent) != len(sPercent) { + t.Fatalf("length mismatch: gopsutil=%d, sysinfo=%d", len(gPercent), len(sPercent)) + } + + t.Logf("gopsutil CPU%%: %.2f", gPercent[0]) + t.Logf("sysinfo CPU%%: %.2f", sPercent[0]) + + // Allow some variance since measurements aren't perfectly synchronized + if math.Abs(gPercent[0]-sPercent[0]) > 5.0 { + t.Errorf("CPU percent differs by more than 5%%: gopsutil=%.2f, sysinfo=%.2f", + gPercent[0], sPercent[0]) + } + }) + + t.Run("without_interval", func(t *testing.T) { + gopsutil_cpu.PercentWithContext(ctx, 0, false) + sysinfo.PercentWithContext(ctx, 0, false) + + time.Sleep(50 * time.Millisecond) + + gPercent, gErr := gopsutil_cpu.PercentWithContext(ctx, 0, false) + sPercent, sErr := sysinfo.PercentWithContext(ctx, 0, false) + + if gErr != nil || sErr != nil { + t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) + } + + t.Logf("gopsutil CPU%% (cached): %.2f", gPercent[0]) + t.Logf("sysinfo CPU%% (cached): %.2f", sPercent[0]) + + if gPercent[0] < 0 || gPercent[0] > 100 { + t.Errorf("gopsutil returned invalid percent: %.2f", gPercent[0]) + } + if sPercent[0] < 0 || sPercent[0] > 100 { + t.Errorf("sysinfo 
returned invalid percent: %.2f", sPercent[0]) + } + }) +} + +func TestMemoryMatchesGopsutil(t *testing.T) { + ctx := context.Background() + + gMem, gErr := gopsutil_mem.VirtualMemoryWithContext(ctx) + sMem, sErr := sysinfo.VirtualMemoryWithContext(ctx) + + if gErr != nil || sErr != nil { + t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) + } + + t.Logf("gopsutil: Total=%d Available=%d Used=%d UsedPercent=%.2f Free=%d", + gMem.Total, gMem.Available, gMem.Used, gMem.UsedPercent, gMem.Free) + t.Logf("sysinfo: Total=%d Available=%d Used=%d UsedPercent=%.2f Free=%d", + sMem.Total, sMem.Available, sMem.Used, sMem.UsedPercent, sMem.Free) + + // Total should be exactly the same (doesn't change) + if gMem.Total != sMem.Total { + t.Errorf("Total mismatch: gopsutil=%d, sysinfo=%d", gMem.Total, sMem.Total) + } + + // Other memory values can change between calls, allow 0.1% tolerance + tolerance := float64(gMem.Total) * 0.001 + + if math.Abs(float64(gMem.Available)-float64(sMem.Available)) > tolerance { + t.Errorf("Available differs by more than 0.1%%: gopsutil=%d, sysinfo=%d", gMem.Available, sMem.Available) + } + + if math.Abs(float64(gMem.Used)-float64(sMem.Used)) > tolerance { + t.Errorf("Used differs by more than 0.1%%: gopsutil=%d, sysinfo=%d", gMem.Used, sMem.Used) + } + + if math.Abs(gMem.UsedPercent-sMem.UsedPercent) > 0.1 { + t.Errorf("UsedPercent mismatch: gopsutil=%.4f, sysinfo=%.4f", gMem.UsedPercent, sMem.UsedPercent) + } + + if math.Abs(float64(gMem.Free)-float64(sMem.Free)) > tolerance { + t.Errorf("Free differs by more than 0.1%%: gopsutil=%d, sysinfo=%d", gMem.Free, sMem.Free) + } +} + +func assertClose(t *testing.T, name string, expected, actual, tolerance float64) { + t.Helper() + if expected == 0 && actual == 0 { + return + } + diff := math.Abs(expected - actual) + relativeDiff := diff / math.Max(math.Abs(expected), 1.0) + if relativeDiff > tolerance { + t.Errorf("%s: values differ by %.2f%% (expected=%.4f, actual=%.4f)", + name, relativeDiff*100, 
expected, actual) + } +} +TESTEOF + +echo "1. Created comparison test file" + +# Add gopsutil dependency +cd "$REPO_ROOT" +echo "2. Adding gopsutil dependency..." +go get github.com/shirou/gopsutil/v4@v4.24.8 2>/dev/null + +echo "3. Running go mod tidy..." +go mod tidy 2>/dev/null + +echo "4. Running comparison tests..." +echo "" +go test -v -tags=compare_gopsutil ./internal/sysinfo/... +TEST_RESULT=$? + +echo "" +if [ $TEST_RESULT -eq 0 ]; then + echo "=== All comparisons PASSED ===" +else + echo "=== Some comparisons FAILED ===" +fi + +exit $TEST_RESULT diff --git a/worker/hostmetrics/cgroups.go b/worker/hostmetrics/cgroups.go index f10614c33..dfa42abf1 100644 --- a/worker/hostmetrics/cgroups.go +++ b/worker/hostmetrics/cgroups.go @@ -4,40 +4,53 @@ package hostmetrics import ( "errors" - "fmt" "io/fs" "os" + "path/filepath" "strconv" "strings" "time" - - "github.com/containerd/cgroups/v3/cgroup2" - "github.com/containerd/cgroups/v3/cgroup2/stats" ) +const cgroupBasePath = "/sys/fs/cgroup" + func newCGroupInfo() cGroupInfo { return &cGroupInfoImpl{} } type cGroupInfoImpl struct { - lastCGroupMemStat *stats.MemoryStat - cgroupCpuCalc cgroupCpuCalc + lastMemUsage uint64 + lastMemLimit uint64 + cgroupCpuCalc cgroupCpuCalc } func (p *cGroupInfoImpl) Update() (bool, error) { - err := p.updateCGroupStats() - // Stop updates if not in a container. No need to return the error and log it. - if !errors.Is(err, fs.ErrNotExist) { + memUsage, memLimit, err := readMemoryStat() + if errors.Is(err, fs.ErrNotExist) { + // Stop updates if not in a container. No need to return the error and log it. 
return false, nil } else if err != nil { return true, err } + + // Only update if limit is set + if memLimit != 0 { + p.lastMemUsage = memUsage + p.lastMemLimit = memLimit + } + + cpuUsageUsec, err := readCPUUsage() + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return true, err + } + + p.cgroupCpuCalc.updateCpuUsage(cpuUsageUsec) return true, nil } func (p *cGroupInfoImpl) GetLastMemUsage() float64 { - if p.lastCGroupMemStat != nil { - return float64(p.lastCGroupMemStat.Usage) / float64(p.lastCGroupMemStat.UsageLimit) + if p.lastMemLimit != 0 { + return float64(p.lastMemUsage) / float64(p.lastMemLimit) } return 0 } @@ -46,69 +59,71 @@ func (p *cGroupInfoImpl) GetLastCPUUsage() float64 { return p.cgroupCpuCalc.lastCalculatedPercent } -func (p *cGroupInfoImpl) updateCGroupStats() error { - control, err := cgroup2.Load("/") - if err != nil { - return fmt.Errorf("failed to get cgroup mem stats %v", err) - } - metrics, err := control.Stat() +type cgroupCpuCalc struct { + lastRefresh time.Time + lastCpuUsage uint64 + lastCalculatedPercent float64 +} + +func (p *cgroupCpuCalc) updateCpuUsage(currentCpuUsageUsec uint64) { + cpuQuota, cpuPeriod, err := readCpuMax() if err != nil { - return fmt.Errorf("failed to get cgroup mem stats %v", err) + return // No CPU limit set or file doesn't exist } - // Only update if a limit has been set - if metrics.Memory.UsageLimit != 0 { - p.lastCGroupMemStat = metrics.Memory + + now := time.Now() + if p.lastCpuUsage == 0 || p.lastRefresh.IsZero() { + p.lastCpuUsage = currentCpuUsageUsec + p.lastRefresh = now + return } - err = p.cgroupCpuCalc.updateCpuUsage(metrics) - if err != nil { - return fmt.Errorf("failed to get cgroup cpu usage %v", err) + timeDelta := now.Sub(p.lastRefresh).Microseconds() + cpuUsageDelta := float64(currentCpuUsageUsec - p.lastCpuUsage) + + if cpuQuota > 0 && timeDelta > 0 { + p.lastCalculatedPercent = cpuUsageDelta * float64(cpuPeriod) / float64(cpuQuota*timeDelta) } - return nil -} -type cgroupCpuCalc 
struct { - lastRefresh time.Time - lastCpuUsage uint64 - lastCalculatedPercent float64 + // Update for next call + p.lastCpuUsage = currentCpuUsageUsec + p.lastRefresh = now } -func (p *cgroupCpuCalc) updateCpuUsage(metrics *stats.Metrics) error { - // Read CPU quota and period from cpu.max - cpuQuota, cpuPeriod, err := readCpuMax("/sys/fs/cgroup/cpu.max") - // We might simply be in a container with an unset cpu.max in which case we don't want to error - if err == nil { - // CPU usage calculation based on delta - currentCpuUsage := metrics.CPU.UsageUsec - now := time.Now() - - if p.lastCpuUsage == 0 || p.lastRefresh.IsZero() { - p.lastCpuUsage = currentCpuUsage - p.lastRefresh = now - return nil - } - - // Time passed between this and last check - timeDelta := now.Sub(p.lastRefresh).Microseconds() // Convert to microseconds +// readMemoryStat reads memory.current and memory.max from cgroup v2. +// Returns (usage, limit, error). Limit is 0 if set to "max" (unlimited). +func readMemoryStat() (uint64, uint64, error) { + usage, err := readIntFromFile(filepath.Join(cgroupBasePath, "memory.current"), false) + if err != nil { + return 0, 0, err + } - // Calculate CPU usage percentage based on the delta - cpuUsageDelta := float64(currentCpuUsage - p.lastCpuUsage) + limit, err := readIntFromFile(filepath.Join(cgroupBasePath, "memory.max"), true) + if err != nil { + return 0, 0, err + } - if cpuQuota > 0 { - p.lastCalculatedPercent = cpuUsageDelta * float64(cpuPeriod) / float64(cpuQuota*timeDelta) - } + return usage, limit, nil +} - // Update for next call - p.lastCpuUsage = currentCpuUsage - p.lastRefresh = now +// readCPUUsage reads usage_usec from cpu.stat. 
+func readCPUUsage() (uint64, error) { + data, err := os.ReadFile(filepath.Join(cgroupBasePath, "cpu.stat")) + if err != nil { + return 0, err } - return nil + for _, line := range strings.Split(string(data), "\n") { + if strings.HasPrefix(line, "usage_usec ") { + return strconv.ParseUint(strings.TrimPrefix(line, "usage_usec "), 10, 64) + } + } + return 0, errors.New("usage_usec not found in cpu.stat") } -// readCpuMax reads the cpu.max file to get the CPU quota and period -func readCpuMax(path string) (quota int64, period int64, err error) { - data, err := os.ReadFile(path) +// readCpuMax reads the cpu.max file to get the CPU quota and period. +func readCpuMax() (quota int64, period int64, err error) { + data, err := os.ReadFile(filepath.Join(cgroupBasePath, "cpu.max")) if err != nil { return 0, 0, err } @@ -117,9 +132,8 @@ func readCpuMax(path string) (quota int64, period int64, err error) { return 0, 0, errors.New("invalid format in cpu.max") } - // Parse the quota (first value) if parts[0] == "max" { - quota = 0 // Unlimited quota + quota = 0 } else { quota, err = strconv.ParseInt(parts[0], 10, 64) if err != nil { @@ -127,7 +141,6 @@ func readCpuMax(path string) (quota int64, period int64, err error) { } } - // Parse the period (second value) period, err = strconv.ParseInt(parts[1], 10, 64) if err != nil { return 0, 0, err @@ -135,3 +148,18 @@ func readCpuMax(path string) (quota int64, period int64, err error) { return quota, period, nil } + +// readIntFromFile reads a file containing a single uint64 value and +// can optionally detect if the file has "max" as its value, where +// it returns 0 as the value read. 
+func readIntFromFile(path string, canBeMax bool) (uint64, error) { + data, err := os.ReadFile(path) + if err != nil { + return 0, err + } + s := strings.TrimSpace(string(data)) + if canBeMax && s == "max" { + return 0, nil + } + return strconv.ParseUint(s, 10, 64) +} diff --git a/worker/hostmetrics/hostmetrics.go b/worker/hostmetrics/hostmetrics.go index e2dd79653..dafdc29c1 100644 --- a/worker/hostmetrics/hostmetrics.go +++ b/worker/hostmetrics/hostmetrics.go @@ -1,5 +1,5 @@ // Package hostmetrics provides host-level CPU and memory metrics collection -// for worker heartbeats. It uses gopsutil for system metrics and supports +// for worker heartbeats. It supports Linux, macOS, and Windows, with // cgroup metrics for containerized environments. package hostmetrics @@ -9,16 +9,15 @@ import ( "sync" "time" - "github.com/shirou/gopsutil/v4/cpu" - "github.com/shirou/gopsutil/v4/mem" + "go.temporal.io/sdk/internal/sysinfo" "go.temporal.io/sdk/log" ) -// PSUtilSystemInfoSupplier implements worker.HostMetricsProvider using gopsutil. +// PSUtilSystemInfoSupplier implements worker.HostMetricsProvider for system metrics. 
type PSUtilSystemInfoSupplier struct { mu sync.Mutex lastRefresh time.Time - lastMemStat *mem.VirtualMemoryStat + lastMemStat *sysinfo.VirtualMemoryStat lastCpuUsage float64 cGroupInfo cGroupInfo stopTryingToGetCGroupInfo bool @@ -83,11 +82,11 @@ func (p *PSUtilSystemInfoSupplier) maybeRefresh(logger log.Logger) error { ctx, cancelFn := context.WithTimeout(context.Background(), 1*time.Second) defer cancelFn() - memStat, err := mem.VirtualMemoryWithContext(ctx) + memStat, err := sysinfo.VirtualMemoryWithContext(ctx) if err != nil { return err } - cpuUsage, err := cpu.PercentWithContext(ctx, 0, false) + cpuUsage, err := sysinfo.PercentWithContext(ctx, 0, false) if err != nil { return err } diff --git a/worker/hostmetrics/scripts/compare_with_containerd.sh b/worker/hostmetrics/scripts/compare_with_containerd.sh new file mode 100755 index 000000000..6ed1b4dd4 --- /dev/null +++ b/worker/hostmetrics/scripts/compare_with_containerd.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# Compare direct cgroup reads against containerd/cgroups/v3 +# Must run inside a Linux container with cgroup v2 and resource limits set +# +# Usage: ./worker/hostmetrics/scripts/compare_with_containerd.sh +# +# Or via Docker: +# docker run --rm -v "$(pwd)":/workspace -w /workspace \ +# --memory=512m --cpus=1 golang:1.23 \ +# ./worker/hostmetrics/scripts/compare_with_containerd.sh + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HOSTMETRICS_DIR="$(dirname "$SCRIPT_DIR")" +REPO_ROOT="$(cd "$HOSTMETRICS_DIR/../.." && pwd)" + +TEST_FILE="$HOSTMETRICS_DIR/compare_cgroups_test.go" + +cleanup() { + echo "Cleaning up..." + rm -f "$TEST_FILE" + cd "$REPO_ROOT" && go mod tidy 2>/dev/null + echo "Done." +} + +trap cleanup EXIT + +echo "=== Comparing cgroup implementation against containerd/cgroups ===" +echo "" + +if [[ "$(uname)" != "Linux" ]]; then + echo "ERROR: This test must run on Linux (inside a container)" + exit 1 +fi + +if [[ ! 
-f /sys/fs/cgroup/memory.current ]]; then + echo "ERROR: cgroup v2 not available (missing /sys/fs/cgroup/memory.current)" + exit 1 +fi + +echo "1. Cgroup v2 files found:" +echo " memory.current: $(cat /sys/fs/cgroup/memory.current)" +echo " memory.max: $(cat /sys/fs/cgroup/memory.max)" +echo " cpu.stat usage_usec: $(grep usage_usec /sys/fs/cgroup/cpu.stat | awk '{print $2}')" +echo " cpu.max: $(cat /sys/fs/cgroup/cpu.max 2>/dev/null || echo 'not set')" +echo "" + +# Create the comparison test file +cat > "$TEST_FILE" << 'TESTEOF' +//go:build linux && compare_cgroups + +package hostmetrics + +import ( + "testing" + + "github.com/containerd/cgroups/v3/cgroup2" +) + +func TestCgroupMemoryMatchesContainerd(t *testing.T) { + // Get values from containerd/cgroups + control, err := cgroup2.Load("/") + if err != nil { + t.Skipf("Not in cgroup v2 environment: %v", err) + } + metrics, err := control.Stat() + if err != nil { + t.Fatalf("containerd Stat() failed: %v", err) + } + + // Get values from our direct reads + memUsage, memLimit, err := readMemoryStat() + if err != nil { + t.Fatalf("readMemoryStat() failed: %v", err) + } + + t.Logf("containerd: Usage=%d UsageLimit=%d", metrics.Memory.Usage, metrics.Memory.UsageLimit) + t.Logf("direct: Usage=%d UsageLimit=%d", memUsage, memLimit) + + // Memory usage can change between reads, allow 1MB tolerance + if absDiff(metrics.Memory.Usage, memUsage) > 1024*1024 { + t.Errorf("Memory usage mismatch: containerd=%d, direct=%d (diff=%d)", + metrics.Memory.Usage, memUsage, absDiff(metrics.Memory.Usage, memUsage)) + } + + // Memory limit should match exactly (or both be 0/max for unlimited) + // containerd returns max uint64 for unlimited, we return 0 + containerdLimit := metrics.Memory.UsageLimit + if containerdLimit == ^uint64(0) { + containerdLimit = 0 // Treat max uint64 as unlimited (0) + } + if containerdLimit != memLimit { + t.Errorf("Memory limit mismatch: containerd=%d, direct=%d", + metrics.Memory.UsageLimit, memLimit) + } +} + 
+func TestCgroupCPUMatchesContainerd(t *testing.T) { + control, err := cgroup2.Load("/") + if err != nil { + t.Skipf("Not in cgroup v2 environment: %v", err) + } + metrics, err := control.Stat() + if err != nil { + t.Fatalf("containerd Stat() failed: %v", err) + } + + cpuUsage, err := readCPUUsage() + if err != nil { + t.Fatalf("readCPUUsage() failed: %v", err) + } + + t.Logf("containerd: UsageUsec=%d", metrics.CPU.UsageUsec) + t.Logf("direct: UsageUsec=%d", cpuUsage) + + // CPU usage increases over time, allow 100ms tolerance for timing between reads + if absDiff(metrics.CPU.UsageUsec, cpuUsage) > 100000 { + t.Errorf("CPU usage mismatch: containerd=%d, direct=%d (diff=%d)", + metrics.CPU.UsageUsec, cpuUsage, absDiff(metrics.CPU.UsageUsec, cpuUsage)) + } +} + +func absDiff(a, b uint64) uint64 { + if a > b { + return a - b + } + return b - a +} +TESTEOF + +echo "2. Created comparison test file" + +cd "$REPO_ROOT" +echo "3. Adding containerd/cgroups dependency..." +go get github.com/containerd/cgroups/v3@v3.0.3 2>/dev/null + +echo "4. Running go mod tidy..." +go mod tidy 2>/dev/null + +echo "5. Running comparison tests..." +echo "" +go test -v -tags=compare_cgroups ./worker/hostmetrics/... +TEST_RESULT=$? 
+ +echo "" +if [ $TEST_RESULT -eq 0 ]; then + echo "=== All comparisons PASSED ===" +else + echo "=== Some comparisons FAILED ===" +fi + +exit $TEST_RESULT From 04ff20fe18321c2d07b4b5260d29764b76768108 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 22 Jan 2026 18:04:02 -0800 Subject: [PATCH 03/30] PR feedback --- contrib/resourcetuner/resourcetuner.go | 2 +- internal/client.go | 4 + internal/common/metrics/heartbeat_handler.go | 519 ---------------- internal/internal_nexus_task_poller.go | 2 +- internal/internal_task_pollers.go | 6 +- internal/internal_worker.go | 223 ++----- internal/internal_worker_heartbeat.go | 126 +++- internal/internal_worker_heartbeat_metrics.go | 559 ++++++++++++++++++ internal/internal_workflow_client.go | 74 +-- internal/tuning.go | 11 +- internal/worker.go | 12 +- test/go.mod | 16 +- test/go.sum | 42 +- test/worker_heartbeat_test.go | 174 +++--- worker/hostmetrics/hostmetrics.go | 8 +- worker/worker.go | 10 +- 16 files changed, 835 insertions(+), 953 deletions(-) delete mode 100644 internal/common/metrics/heartbeat_handler.go create mode 100644 internal/internal_worker_heartbeat_metrics.go diff --git a/contrib/resourcetuner/resourcetuner.go b/contrib/resourcetuner/resourcetuner.go index 7e01c7f5e..cf4e9f5ae 100644 --- a/contrib/resourcetuner/resourcetuner.go +++ b/contrib/resourcetuner/resourcetuner.go @@ -34,7 +34,7 @@ type ResourceBasedTunerOptions struct { WorkflowRampThrottle time.Duration } -// resourceBasedTuner wraps a WorkerTuner and implements HostMetricsProvider +// resourceBasedTuner wraps a WorkerTuner and implements TunerHostMetricsProvider // so the SDK can reuse metrics instead of collecting them twice. 
type resourceBasedTuner struct { worker.WorkerTuner diff --git a/internal/client.go b/internal/client.go index 6a1a2b905..24cc18d09 100644 --- a/internal/client.go +++ b/internal/client.go @@ -1186,6 +1186,10 @@ func NewServiceClient(workflowServiceClient workflowservice.WorkflowServiceClien workerGroupingKey: uuid.NewString(), } + if heartbeatInterval > 0 { + client.heartbeatManager = NewHeartbeatManager(client, heartbeatInterval, client.logger) + } + // Create outbound interceptor by wrapping backwards through chain client.interceptor = &workflowClientInterceptor{client: client} for i := len(options.Interceptors) - 1; i >= 0; i-- { diff --git a/internal/common/metrics/heartbeat_handler.go b/internal/common/metrics/heartbeat_handler.go deleted file mode 100644 index 17b439d03..000000000 --- a/internal/common/metrics/heartbeat_handler.go +++ /dev/null @@ -1,519 +0,0 @@ -package metrics - -import ( - "sync/atomic" - "time" -) - -// HeartbeatMetricsHandler wraps a metrics handler and captures specific metrics -// in memory that are needed for worker heartbeats -type HeartbeatMetricsHandler struct { - underlying Handler - - // Current worker type tag for this handler instance (set via WithTags) - workerType string - - stickyCacheHit *atomic.Uint64 - stickyCacheMiss *atomic.Uint64 - stickyCacheSize *atomic.Uint64 - - workflowTaskFailures *atomic.Uint64 - activityTaskFailures *atomic.Uint64 - localActivityTaskFailures *atomic.Uint64 - nexusTaskFailures *atomic.Uint64 - - workflowSlotsAvailable *atomic.Uint64 - workflowSlotsUsed *atomic.Uint64 - activitySlotsAvailable *atomic.Uint64 - activitySlotsUsed *atomic.Uint64 - localActivitySlotsAvailable *atomic.Uint64 - localActivitySlotsUsed *atomic.Uint64 - nexusSlotsAvailable *atomic.Uint64 - nexusSlotsUsed *atomic.Uint64 - - // Task processed counters (per worker type) - incremented each time execution latency is recorded - workflowTasksProcessed *atomic.Uint64 - activityTasksProcessed *atomic.Uint64 - 
localActivityTasksProcessed *atomic.Uint64 - nexusTasksProcessed *atomic.Uint64 - - // Current poller type tag for this handler instance (set via WithTags) - pollerType string - - workflowPollerCount *atomic.Uint64 - workflowStickyPollerCount *atomic.Uint64 - activityPollerCount *atomic.Uint64 - nexusPollerCount *atomic.Uint64 - - // Last successful poll times (per poller type) - stored as Unix nanoseconds - // NOTE: These are only kept in memory, there is no corresponding metric exported for these. - workflowLastPoll *atomic.Int64 - workflowStickyLastPoll *atomic.Int64 - activityLastPoll *atomic.Int64 - nexusLastPoll *atomic.Int64 -} - -// NewHeartbeatMetricsHandler creates a new handler that captures specific metrics -// for worker heartbeats while passing all metrics to the underlying handler. -func NewHeartbeatMetricsHandler(underlying Handler) *HeartbeatMetricsHandler { - return &HeartbeatMetricsHandler{ - underlying: underlying, - - stickyCacheHit: new(atomic.Uint64), - stickyCacheMiss: new(atomic.Uint64), - stickyCacheSize: new(atomic.Uint64), - - workflowTaskFailures: new(atomic.Uint64), - activityTaskFailures: new(atomic.Uint64), - localActivityTaskFailures: new(atomic.Uint64), - nexusTaskFailures: new(atomic.Uint64), - - workflowSlotsAvailable: new(atomic.Uint64), - workflowSlotsUsed: new(atomic.Uint64), - activitySlotsAvailable: new(atomic.Uint64), - activitySlotsUsed: new(atomic.Uint64), - localActivitySlotsAvailable: new(atomic.Uint64), - localActivitySlotsUsed: new(atomic.Uint64), - nexusSlotsAvailable: new(atomic.Uint64), - nexusSlotsUsed: new(atomic.Uint64), - - workflowTasksProcessed: new(atomic.Uint64), - activityTasksProcessed: new(atomic.Uint64), - localActivityTasksProcessed: new(atomic.Uint64), - nexusTasksProcessed: new(atomic.Uint64), - - workflowPollerCount: new(atomic.Uint64), - workflowStickyPollerCount: new(atomic.Uint64), - activityPollerCount: new(atomic.Uint64), - nexusPollerCount: new(atomic.Uint64), - - workflowLastPoll: 
new(atomic.Int64), - workflowStickyLastPoll: new(atomic.Int64), - activityLastPoll: new(atomic.Int64), - nexusLastPoll: new(atomic.Int64), - } -} - -func (h *HeartbeatMetricsHandler) WithTags(tags map[string]string) Handler { - // Track the worker type if present in tags - workerType := h.workerType - if wt, ok := tags[WorkerTypeTagName]; ok { - workerType = wt - } - - // Track the poller type if present in tags - pollerType := h.pollerType - if pt, ok := tags[PollerTypeTagName]; ok { - pollerType = pt - } - - return &HeartbeatMetricsHandler{ - underlying: h.underlying.WithTags(tags), - workerType: workerType, - pollerType: pollerType, - - stickyCacheHit: h.stickyCacheHit, - stickyCacheMiss: h.stickyCacheMiss, - stickyCacheSize: h.stickyCacheSize, - - workflowTaskFailures: h.workflowTaskFailures, - activityTaskFailures: h.activityTaskFailures, - localActivityTaskFailures: h.localActivityTaskFailures, - nexusTaskFailures: h.nexusTaskFailures, - - workflowSlotsAvailable: h.workflowSlotsAvailable, - workflowSlotsUsed: h.workflowSlotsUsed, - activitySlotsAvailable: h.activitySlotsAvailable, - activitySlotsUsed: h.activitySlotsUsed, - localActivitySlotsAvailable: h.localActivitySlotsAvailable, - localActivitySlotsUsed: h.localActivitySlotsUsed, - nexusSlotsAvailable: h.nexusSlotsAvailable, - nexusSlotsUsed: h.nexusSlotsUsed, - - workflowTasksProcessed: h.workflowTasksProcessed, - activityTasksProcessed: h.activityTasksProcessed, - localActivityTasksProcessed: h.localActivityTasksProcessed, - nexusTasksProcessed: h.nexusTasksProcessed, - - workflowPollerCount: h.workflowPollerCount, - workflowStickyPollerCount: h.workflowStickyPollerCount, - activityPollerCount: h.activityPollerCount, - nexusPollerCount: h.nexusPollerCount, - - workflowLastPoll: h.workflowLastPoll, - workflowStickyLastPoll: h.workflowStickyLastPoll, - activityLastPoll: h.activityLastPoll, - nexusLastPoll: h.nexusLastPoll, - } -} - -func (h *HeartbeatMetricsHandler) Counter(name string) Counter { - 
underlying := h.underlying.Counter(name) - - switch name { - case StickyCacheHit: - return &capturingCounter{ - underlying: underlying, - value: h.stickyCacheHit, - } - case StickyCacheMiss: - return &capturingCounter{ - underlying: underlying, - value: h.stickyCacheMiss, - } - case WorkflowTaskExecutionFailureCounter: - return &capturingCounter{ - underlying: underlying, - value: h.workflowTaskFailures, - } - case ActivityExecutionFailedCounter: - return &capturingCounter{ - underlying: underlying, - value: h.activityTaskFailures, - } - case LocalActivityExecutionFailedCounter: - return &capturingCounter{ - underlying: underlying, - value: h.localActivityTaskFailures, - } - case NexusTaskExecutionFailedCounter: - return &capturingCounter{ - underlying: underlying, - value: h.nexusTaskFailures, - } - default: - return underlying - } -} - -func (h *HeartbeatMetricsHandler) Gauge(name string) Gauge { - underlying := h.underlying.Gauge(name) - - switch name { - case StickyCacheSize: - return &capturingGauge{ - underlying: underlying, - value: h.stickyCacheSize, - } - case WorkerTaskSlotsAvailable: - var valuePtr *atomic.Uint64 - switch h.workerType { - case "WorkflowWorker": - valuePtr = h.workflowSlotsAvailable - case "ActivityWorker": - valuePtr = h.activitySlotsAvailable - case "LocalActivityWorker": - valuePtr = h.localActivitySlotsAvailable - case "NexusWorker": - valuePtr = h.nexusSlotsAvailable - } - if valuePtr != nil { - return &capturingGauge{ - underlying: underlying, - value: valuePtr, - } - } - case WorkerTaskSlotsUsed: - var valuePtr *atomic.Uint64 - switch h.workerType { - case "WorkflowWorker": - valuePtr = h.workflowSlotsUsed - case "ActivityWorker": - valuePtr = h.activitySlotsUsed - case "LocalActivityWorker": - valuePtr = h.localActivitySlotsUsed - case "NexusWorker": - valuePtr = h.nexusSlotsUsed - } - if valuePtr != nil { - return &capturingGauge{ - underlying: underlying, - value: valuePtr, - } - } - case NumPoller: - var valuePtr *atomic.Uint64 
- switch h.pollerType { - case PollerTypeWorkflowTask: - valuePtr = h.workflowPollerCount - case PollerTypeWorkflowStickyTask: - valuePtr = h.workflowStickyPollerCount - case PollerTypeActivityTask: - valuePtr = h.activityPollerCount - case PollerTypeNexusTask: - valuePtr = h.nexusPollerCount - } - if valuePtr != nil { - return &capturingGauge{ - underlying: underlying, - value: valuePtr, - } - } - } - - return underlying -} - -func (h *HeartbeatMetricsHandler) Timer(name string) Timer { - underlying := h.underlying.Timer(name) - - // Capture execution latency timers to count processed tasks - switch name { - case WorkflowTaskExecutionLatency: - return &capturingTimer{ - underlying: underlying, - counter: h.workflowTasksProcessed, - } - case ActivityExecutionLatency: - return &capturingTimer{ - underlying: underlying, - counter: h.activityTasksProcessed, - } - case LocalActivityExecutionLatency: - return &capturingTimer{ - underlying: underlying, - counter: h.localActivityTasksProcessed, - } - case NexusTaskExecutionLatency: - return &capturingTimer{ - underlying: underlying, - counter: h.nexusTasksProcessed, - } - } - - return underlying -} - -// GetStickyCacheHit returns the total number of sticky cache hits. -func (h *HeartbeatMetricsHandler) GetStickyCacheHit() int32 { - return int32(h.stickyCacheHit.Load()) -} - -// GetStickyCacheMiss returns the total number of sticky cache misses. -func (h *HeartbeatMetricsHandler) GetStickyCacheMiss() int32 { - return int32(h.stickyCacheMiss.Load()) -} - -// GetStickyCacheSize returns the current sticky cache size. -func (h *HeartbeatMetricsHandler) GetStickyCacheSize() int32 { - return int32(h.stickyCacheSize.Load()) -} - -// GetWorkflowTaskFailures returns the total number of workflow task failures. -func (h *HeartbeatMetricsHandler) GetWorkflowTaskFailures() int64 { - return int64(h.workflowTaskFailures.Load()) -} - -// GetActivityTaskFailures returns the total number of activity task failures. 
-func (h *HeartbeatMetricsHandler) GetActivityTaskFailures() int64 { - return int64(h.activityTaskFailures.Load()) -} - -// GetLocalActivityTaskFailures returns the total number of local activity task failures. -func (h *HeartbeatMetricsHandler) GetLocalActivityTaskFailures() int64 { - return int64(h.localActivityTaskFailures.Load()) -} - -// GetNexusTaskFailures returns the total number of nexus task failures. -func (h *HeartbeatMetricsHandler) GetNexusTaskFailures() int64 { - return int64(h.nexusTaskFailures.Load()) -} - -// GetWorkflowSlotsAvailable returns the current workflow slots available. -func (h *HeartbeatMetricsHandler) GetWorkflowSlotsAvailable() int32 { - return int32(h.workflowSlotsAvailable.Load()) -} - -// GetWorkflowSlotsUsed returns the current workflow slots used. -func (h *HeartbeatMetricsHandler) GetWorkflowSlotsUsed() int32 { - return int32(h.workflowSlotsUsed.Load()) -} - -// GetActivitySlotsAvailable returns the current activity slots available. -func (h *HeartbeatMetricsHandler) GetActivitySlotsAvailable() int32 { - return int32(h.activitySlotsAvailable.Load()) -} - -// GetActivitySlotsUsed returns the current activity slots used. -func (h *HeartbeatMetricsHandler) GetActivitySlotsUsed() int32 { - return int32(h.activitySlotsUsed.Load()) -} - -// GetLocalActivitySlotsAvailable returns the current local activity slots available. -func (h *HeartbeatMetricsHandler) GetLocalActivitySlotsAvailable() int32 { - return int32(h.localActivitySlotsAvailable.Load()) -} - -// GetLocalActivitySlotsUsed returns the current local activity slots used. -func (h *HeartbeatMetricsHandler) GetLocalActivitySlotsUsed() int32 { - return int32(h.localActivitySlotsUsed.Load()) -} - -// GetNexusSlotsAvailable returns the current nexus slots available. -func (h *HeartbeatMetricsHandler) GetNexusSlotsAvailable() int32 { - return int32(h.nexusSlotsAvailable.Load()) -} - -// GetNexusSlotsUsed returns the current nexus slots used. 
-func (h *HeartbeatMetricsHandler) GetNexusSlotsUsed() int32 { - return int32(h.nexusSlotsUsed.Load()) -} - -// GetWorkflowTasksProcessed returns the total number of workflow tasks processed. -func (h *HeartbeatMetricsHandler) GetWorkflowTasksProcessed() int64 { - return int64(h.workflowTasksProcessed.Load()) -} - -// GetActivityTasksProcessed returns the total number of activity tasks processed. -func (h *HeartbeatMetricsHandler) GetActivityTasksProcessed() int64 { - return int64(h.activityTasksProcessed.Load()) -} - -// GetLocalActivityTasksProcessed returns the total number of local activity tasks processed. -func (h *HeartbeatMetricsHandler) GetLocalActivityTasksProcessed() int64 { - return int64(h.localActivityTasksProcessed.Load()) -} - -// GetNexusTasksProcessed returns the total number of nexus tasks processed. -func (h *HeartbeatMetricsHandler) GetNexusTasksProcessed() int64 { - return int64(h.nexusTasksProcessed.Load()) -} - -// GetWorkflowPollerCount returns the current number of workflow task pollers. -func (h *HeartbeatMetricsHandler) GetWorkflowPollerCount() int32 { - return int32(h.workflowPollerCount.Load()) -} - -// GetWorkflowStickyPollerCount returns the current number of workflow sticky task pollers. -func (h *HeartbeatMetricsHandler) GetWorkflowStickyPollerCount() int32 { - return int32(h.workflowStickyPollerCount.Load()) -} - -// GetActivityPollerCount returns the current number of activity task pollers. -func (h *HeartbeatMetricsHandler) GetActivityPollerCount() int32 { - return int32(h.activityPollerCount.Load()) -} - -// GetNexusPollerCount returns the current number of nexus task pollers. -func (h *HeartbeatMetricsHandler) GetNexusPollerCount() int32 { - return int32(h.nexusPollerCount.Load()) -} - -// RecordWorkflowPollSuccess records a successful workflow task poll. 
-func (h *HeartbeatMetricsHandler) RecordWorkflowPollSuccess() { - h.workflowLastPoll.Store(time.Now().UnixNano()) -} - -// RecordWorkflowStickyPollSuccess records a successful workflow sticky task poll. -func (h *HeartbeatMetricsHandler) RecordWorkflowStickyPollSuccess() { - h.workflowStickyLastPoll.Store(time.Now().UnixNano()) -} - -// RecordActivityPollSuccess records a successful activity task poll. -func (h *HeartbeatMetricsHandler) RecordActivityPollSuccess() { - h.activityLastPoll.Store(time.Now().UnixNano()) -} - -// RecordNexusPollSuccess records a successful nexus task poll. -func (h *HeartbeatMetricsHandler) RecordNexusPollSuccess() { - h.nexusLastPoll.Store(time.Now().UnixNano()) -} - -// GetWorkflowLastPollTime returns the last successful workflow task poll time. -func (h *HeartbeatMetricsHandler) GetWorkflowLastPollTime() time.Time { - nanos := h.workflowLastPoll.Load() - if nanos == 0 { - return time.Time{} - } - return time.Unix(0, nanos) -} - -// GetWorkflowStickyLastPollTime returns the last successful workflow sticky task poll time. -func (h *HeartbeatMetricsHandler) GetWorkflowStickyLastPollTime() time.Time { - nanos := h.workflowStickyLastPoll.Load() - if nanos == 0 { - return time.Time{} - } - return time.Unix(0, nanos) -} - -// GetActivityLastPollTime returns the last successful activity task poll time. -func (h *HeartbeatMetricsHandler) GetActivityLastPollTime() time.Time { - nanos := h.activityLastPoll.Load() - if nanos == 0 { - return time.Time{} - } - return time.Unix(0, nanos) -} - -// GetNexusLastPollTime returns the last successful nexus task poll time. -func (h *HeartbeatMetricsHandler) GetNexusLastPollTime() time.Time { - nanos := h.nexusLastPoll.Load() - if nanos == 0 { - return time.Time{} - } - return time.Unix(0, nanos) -} - -// PollSuccessRecorder is an optional interface for recording successful poll times. 
-type PollSuccessRecorder interface { - RecordWorkflowPollSuccess() - RecordWorkflowStickyPollSuccess() - RecordActivityPollSuccess() - RecordNexusPollSuccess() -} - -// RecordPollSuccess records a successful poll time if the handler supports it. -// pollerType should be one of PollerTypeWorkflowTask, PollerTypeWorkflowStickyTask, -// PollerTypeActivityTask, or PollerTypeNexusTask. -func RecordPollSuccess(h Handler, pollerType string) { - recorder, ok := h.(PollSuccessRecorder) - if !ok { - return - } - switch pollerType { - case PollerTypeWorkflowTask: - recorder.RecordWorkflowPollSuccess() - case PollerTypeWorkflowStickyTask: - recorder.RecordWorkflowStickyPollSuccess() - case PollerTypeActivityTask: - recorder.RecordActivityPollSuccess() - case PollerTypeNexusTask: - recorder.RecordNexusPollSuccess() - } -} - -// capturingCounter wraps a counter and captures its value in memory for heartbeat reporting. -type capturingCounter struct { - underlying Counter - value *atomic.Uint64 -} - -func (c *capturingCounter) Inc(delta int64) { - c.underlying.Inc(delta) - if delta > 0 { - c.value.Add(uint64(delta)) - } -} - -// capturingGauge wraps a gauge and captures its value in memory for heartbeat reporting. -type capturingGauge struct { - underlying Gauge - value *atomic.Uint64 -} - -func (g *capturingGauge) Update(f float64) { - g.underlying.Update(f) - g.value.Store(uint64(f)) -} - -// capturingTimer wraps a timer and increments a counter each time Record is called. 
-type capturingTimer struct { - underlying Timer - counter *atomic.Uint64 -} - -func (t *capturingTimer) Record(d time.Duration) { - t.underlying.Record(d) - t.counter.Add(1) -} diff --git a/internal/internal_nexus_task_poller.go b/internal/internal_nexus_task_poller.go index a98e484e3..43d4441b3 100644 --- a/internal/internal_nexus_task_poller.go +++ b/internal/internal_nexus_task_poller.go @@ -90,7 +90,7 @@ func (ntp *nexusTaskPoller) poll(ctx context.Context) (taskForWorker, error) { return nil, nil } - metrics.RecordPollSuccess(ntp.metricsHandler, metrics.PollerTypeNexusTask) + RecordPollSuccess(ntp.metricsHandler, metrics.PollerTypeNexusTask) return &nexusTask{task: response}, nil } diff --git a/internal/internal_task_pollers.go b/internal/internal_task_pollers.go index 67afb18b6..cd2224e25 100644 --- a/internal/internal_task_pollers.go +++ b/internal/internal_task_pollers.go @@ -970,9 +970,9 @@ func (wtp *workflowTaskPoller) poll(ctx context.Context) (taskForWorker, error) } if request.TaskQueue.GetKind() == enumspb.TASK_QUEUE_KIND_STICKY { - metrics.RecordPollSuccess(wtp.metricsHandler, metrics.PollerTypeWorkflowStickyTask) + RecordPollSuccess(wtp.metricsHandler, metrics.PollerTypeWorkflowStickyTask) } else { - metrics.RecordPollSuccess(wtp.metricsHandler, metrics.PollerTypeWorkflowTask) + RecordPollSuccess(wtp.metricsHandler, metrics.PollerTypeWorkflowTask) } wtp.updateBacklog(request.TaskQueue.GetKind(), response.GetBacklogCountHint()) @@ -1173,7 +1173,7 @@ func (atp *activityTaskPoller) poll(ctx context.Context) (taskForWorker, error) return &activityTask{}, nil } - metrics.RecordPollSuccess(atp.metricsHandler, metrics.PollerTypeActivityTask) + RecordPollSuccess(atp.metricsHandler, metrics.PollerTypeActivityTask) workflowType := response.WorkflowType.GetName() activityType := response.ActivityType.GetName() diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 13f326fe7..265e1c02d 100644 --- a/internal/internal_worker.go +++ 
b/internal/internal_worker.go @@ -232,8 +232,7 @@ type ( // workerHeartbeatManager includes all information needed to report worker heartbeats. workerHeartbeatManager struct { - heartbeatWorker *sharedNamespaceWorker - heartbeatMetrics *metrics.HeartbeatMetricsHandler + heartbeatMetrics *HeartbeatMetricsHandler heartbeatCallback func() *workerpb.WorkerHeartbeat // Slot suppliers for heartbeat reporting @@ -243,7 +242,7 @@ type ( nexusTaskSlotSupplier *trackingSlotSupplier // Host metrics provider for CPU/memory reporting in heartbeats - hostMetricsProvider HostMetricsProvider + hostMetricsProvider TunerHostMetricsProvider } ) @@ -1512,27 +1511,24 @@ func (aw *AggregatedWorker) Stop() { } func (aw *AggregatedWorker) registerHeartbeatWorker() error { - hw, err := aw.client.getOrCreateHeartbeatWorker(aw.executionParams.Namespace) - if err != nil { - return err - } - - // Server doesn't support heartbeating. - if hw == nil { + if aw.client.heartbeatManager == nil { return nil } - - aw.workerHeartbeatManager.heartbeatWorker = hw - hw.registerCallback(aw.workerInstanceKey, aw.workerHeartbeatManager.heartbeatCallback) - - return nil + return aw.client.heartbeatManager.RegisterWorker( + aw.executionParams.Namespace, + aw.workerInstanceKey, + aw.workerHeartbeatManager.heartbeatCallback, + ) } func (aw *AggregatedWorker) unregisterHeartbeatWorker() { - if aw.workerHeartbeatManager != nil && aw.workerHeartbeatManager.heartbeatWorker != nil { - aw.workerHeartbeatManager.heartbeatWorker.unregisterCallback(aw.workerInstanceKey) - aw.workerHeartbeatManager.heartbeatWorker = nil + if aw.client.heartbeatManager == nil || aw.workerHeartbeatManager == nil { + return } + aw.client.heartbeatManager.UnregisterWorker( + aw.executionParams.Namespace, + aw.workerInstanceKey, + ) } // shutdownWorker sends a ShutdownWorker RPC to notify the server that this worker is shutting down. 
@@ -1568,9 +1564,12 @@ func (aw *AggregatedWorker) shutdownWorker() { if _, isUnimplemented := err.(*serviceerror.Unimplemented); isUnimplemented { return } + if _, isUnavailable := err.(*serviceerror.Unavailable); isUnavailable { + return + } if err != nil { - aw.logger.Debug("ShutdownWorker failed.", tagError, err) + aw.logger.Debug("ShutdownWorker rpc errored during worker shutdown.", tagError, err) } } @@ -2129,11 +2128,11 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke baseMetricsHandler := client.metricsHandler.WithTags(metrics.TaskQueueTags(taskQueue)) var metricsHandler metrics.Handler - var heartbeatMetrics *metrics.HeartbeatMetricsHandler + var heartbeatMetrics *HeartbeatMetricsHandler var heartbeatManager *workerHeartbeatManager if client.workerHeartbeatInterval != 0 { - heartbeatMetrics = metrics.NewHeartbeatMetricsHandler(baseMetricsHandler) + heartbeatMetrics = NewHeartbeatMetricsHandler(baseMetricsHandler) metricsHandler = heartbeatMetrics heartbeatManager = &workerHeartbeatManager{} } else { @@ -2262,9 +2261,9 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke } // Initialize host metrics provider for CPU/memory reporting. - // If the tuner implements HostMetricsProvider, use it to avoid double-measurement of system. - var hostMetricsProvider HostMetricsProvider - if provider, ok := options.Tuner.(HostMetricsProvider); ok { + // If the tuner implements TunerHostMetricsProvider, use it to avoid double-measurement of system. 
+ var hostMetricsProvider TunerHostMetricsProvider + if provider, ok := options.Tuner.(TunerHostMetricsProvider); ok { hostMetricsProvider = provider } else if client.workerHeartbeatInterval != 0 { hostMetricsProvider = hostmetrics.NewPSUtilSystemInfoSupplier(workerParams.Logger) @@ -2283,17 +2282,25 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke var prevLocalActivityProcessed, prevLocalActivityFailed int64 var prevNexusProcessed, prevNexusFailed int64 + populateOpts := &PopulateHeartbeatOptions{ + WorkflowPollerBehavior: options.WorkflowTaskPollerBehavior, + ActivityPollerBehavior: options.ActivityTaskPollerBehavior, + NexusPollerBehavior: options.NexusTaskPollerBehavior, + PrevWorkflowProcessed: &prevWorkflowProcessed, + PrevWorkflowFailed: &prevWorkflowFailed, + PrevActivityProcessed: &prevActivityProcessed, + PrevActivityFailed: &prevActivityFailed, + PrevLocalActivityProcessed: &prevLocalActivityProcessed, + PrevLocalActivityFailed: &prevLocalActivityFailed, + PrevNexusProcessed: &prevNexusProcessed, + PrevNexusFailed: &prevNexusFailed, + } + heartbeatCallback = func() *workerpb.WorkerHeartbeat { heartbeatTime := time.Now() - elapsedSinceLastHeartbeat := previousHeartbeatTime.Sub(heartbeatTime) + elapsedSinceLastHeartbeat := heartbeatTime.Sub(previousHeartbeatTime) previousHeartbeatTime = heartbeatTime - var stickyCacheHit, stickyCacheMiss, stickyCacheSize int32 - if aw.workerHeartbeatManager.heartbeatMetrics != nil { - stickyCacheHit = aw.workerHeartbeatManager.heartbeatMetrics.GetStickyCacheHit() - stickyCacheMiss = aw.workerHeartbeatManager.heartbeatMetrics.GetStickyCacheMiss() - stickyCacheSize = aw.workerHeartbeatManager.heartbeatMetrics.GetStickyCacheSize() - } var deploymentVersion *deploymentpb.WorkerDeploymentVersion if options.DeploymentOptions.UseVersioning { deploymentVersion = &deploymentpb.WorkerDeploymentVersion{ @@ -2302,82 +2309,6 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options 
Worke } } - var workflowTaskSlotsInfo *workerpb.WorkerSlotsInfo - var activityTaskSlotsInfo *workerpb.WorkerSlotsInfo - var localActivitySlotsInfo *workerpb.WorkerSlotsInfo - var nexusTaskSlotsInfo *workerpb.WorkerSlotsInfo - - if aw.workerHeartbeatManager.heartbeatMetrics != nil { - if aw.workerHeartbeatManager.workflowTaskSlotSupplier != nil { - workflowTaskSlotsInfo = buildSlotsInfo( - aw.workerHeartbeatManager.workflowTaskSlotSupplier.GetSlotSupplierKind(), - aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowSlotsAvailable(), - aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowSlotsUsed(), - aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowTasksProcessed(), - aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowTaskFailures(), - &prevWorkflowProcessed, - &prevWorkflowFailed, - ) - } - if aw.workerHeartbeatManager.activityTaskSlotSupplier != nil { - activityTaskSlotsInfo = buildSlotsInfo( - aw.workerHeartbeatManager.activityTaskSlotSupplier.GetSlotSupplierKind(), - aw.workerHeartbeatManager.heartbeatMetrics.GetActivitySlotsAvailable(), - aw.workerHeartbeatManager.heartbeatMetrics.GetActivitySlotsUsed(), - aw.workerHeartbeatManager.heartbeatMetrics.GetActivityTasksProcessed(), - aw.workerHeartbeatManager.heartbeatMetrics.GetActivityTaskFailures(), - &prevActivityProcessed, - &prevActivityFailed, - ) - } - if aw.workerHeartbeatManager.localActivitySlotSupplier != nil { - localActivitySlotsInfo = buildSlotsInfo( - aw.workerHeartbeatManager.localActivitySlotSupplier.GetSlotSupplierKind(), - aw.workerHeartbeatManager.heartbeatMetrics.GetLocalActivitySlotsAvailable(), - aw.workerHeartbeatManager.heartbeatMetrics.GetLocalActivitySlotsUsed(), - aw.workerHeartbeatManager.heartbeatMetrics.GetLocalActivityTasksProcessed(), - aw.workerHeartbeatManager.heartbeatMetrics.GetLocalActivityTaskFailures(), - &prevLocalActivityProcessed, - &prevLocalActivityFailed, - ) - } - if aw.workerHeartbeatManager.nexusTaskSlotSupplier != nil { - nexusTaskSlotsInfo = 
buildSlotsInfo( - aw.workerHeartbeatManager.nexusTaskSlotSupplier.GetSlotSupplierKind(), - aw.workerHeartbeatManager.heartbeatMetrics.GetNexusSlotsAvailable(), - aw.workerHeartbeatManager.heartbeatMetrics.GetNexusSlotsUsed(), - aw.workerHeartbeatManager.heartbeatMetrics.GetNexusTasksProcessed(), - aw.workerHeartbeatManager.heartbeatMetrics.GetNexusTaskFailures(), - &prevNexusProcessed, - &prevNexusFailed, - ) - } - } - - var workflowPollerInfo, workflowStickyPollerInfo, activityPollerInfo, nexusPollerInfo *workerpb.WorkerPollerInfo - if aw.workerHeartbeatManager.heartbeatMetrics != nil { - workflowPollerInfo = buildPollerInfo( - aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowPollerCount(), - aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowLastPollTime(), - options.WorkflowTaskPollerBehavior, - ) - workflowStickyPollerInfo = buildPollerInfo( - aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowStickyPollerCount(), - aw.workerHeartbeatManager.heartbeatMetrics.GetWorkflowStickyLastPollTime(), - options.WorkflowTaskPollerBehavior, - ) - activityPollerInfo = buildPollerInfo( - aw.workerHeartbeatManager.heartbeatMetrics.GetActivityPollerCount(), - aw.workerHeartbeatManager.heartbeatMetrics.GetActivityLastPollTime(), - options.ActivityTaskPollerBehavior, - ) - nexusPollerInfo = buildPollerInfo( - aw.workerHeartbeatManager.heartbeatMetrics.GetNexusPollerCount(), - aw.workerHeartbeatManager.heartbeatMetrics.GetNexusLastPollTime(), - options.NexusTaskPollerBehavior, - ) - } - hb := &workerpb.WorkerHeartbeat{ WorkerInstanceKey: aw.workerInstanceKey, WorkerIdentity: aw.client.identity, @@ -2396,19 +2327,25 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke StartTime: startTime, HeartbeatTime: timestamppb.New(heartbeatTime), ElapsedSinceLastHeartbeat: durationpb.New(elapsedSinceLastHeartbeat), - WorkflowTaskSlotsInfo: workflowTaskSlotsInfo, - ActivityTaskSlotsInfo: activityTaskSlotsInfo, - NexusTaskSlotsInfo: nexusTaskSlotsInfo, 
- LocalActivitySlotsInfo: localActivitySlotsInfo, - WorkflowPollerInfo: workflowPollerInfo, - WorkflowStickyPollerInfo: workflowStickyPollerInfo, - ActivityPollerInfo: activityPollerInfo, - NexusPollerInfo: nexusPollerInfo, - TotalStickyCacheHit: stickyCacheHit, - TotalStickyCacheMiss: stickyCacheMiss, - CurrentStickyCacheSize: stickyCacheSize, Plugins: pluginInfos, } + + if aw.workerHeartbeatManager.heartbeatMetrics != nil { + if aw.workerHeartbeatManager.workflowTaskSlotSupplier != nil { + populateOpts.WorkflowSlotSupplierKind = aw.workerHeartbeatManager.workflowTaskSlotSupplier.GetSlotSupplierKind() + } + if aw.workerHeartbeatManager.activityTaskSlotSupplier != nil { + populateOpts.ActivitySlotSupplierKind = aw.workerHeartbeatManager.activityTaskSlotSupplier.GetSlotSupplierKind() + } + if aw.workerHeartbeatManager.localActivitySlotSupplier != nil { + populateOpts.LocalActivitySlotSupplierKind = aw.workerHeartbeatManager.localActivitySlotSupplier.GetSlotSupplierKind() + } + if aw.workerHeartbeatManager.nexusTaskSlotSupplier != nil { + populateOpts.NexusSlotSupplierKind = aw.workerHeartbeatManager.nexusTaskSlotSupplier.GetSlotSupplierKind() + } + aw.workerHeartbeatManager.heartbeatMetrics.PopulateHeartbeat(hb, populateOpts) + } + return hb } } @@ -2764,53 +2701,7 @@ func workerDeploymentVersionFromProtoOrString(wd *deploymentpb.WorkerDeploymentV } } -func buildSlotsInfo( - supplierKind string, - slotsAvailable int32, - slotsUsed int32, - totalProcessed int64, - totalFailed int64, - prevProcessed *int64, - prevFailed *int64, -) *workerpb.WorkerSlotsInfo { - intervalProcessed := totalProcessed - *prevProcessed - intervalFailed := totalFailed - *prevFailed - - // Update previous totals for next interval - *prevProcessed = totalProcessed - *prevFailed = totalFailed - - totalProcessedTasks := int32(totalProcessed) - totalFailedTasks := int32(totalFailed) - lastIntervalProcessed := int32(intervalProcessed) - lastIntervalFailed := int32(intervalFailed) - - return 
&workerpb.WorkerSlotsInfo{ - CurrentAvailableSlots: slotsAvailable, - CurrentUsedSlots: slotsUsed, - SlotSupplierKind: supplierKind, - TotalProcessedTasks: totalProcessedTasks, - TotalFailedTasks: totalFailedTasks, - LastIntervalProcessedTasks: lastIntervalProcessed, - LastIntervalFailureTasks: lastIntervalFailed, - } -} - -func buildPollerInfo(currentPollers int32, lastSuccessfulPollTime time.Time, pollerBehavior PollerBehavior) *workerpb.WorkerPollerInfo { - var isAutoscaling bool - switch pollerBehavior.(type) { - case *pollerBehaviorAutoscaling: - isAutoscaling = true - } - - return &workerpb.WorkerPollerInfo{ - CurrentPollers: currentPollers, - LastSuccessfulPollTime: timestamppb.New(lastSuccessfulPollTime), - IsAutoscaling: isAutoscaling, - } -} - -func getCpuUsage(provider HostMetricsProvider) float32 { +func getCpuUsage(provider TunerHostMetricsProvider) float32 { if provider == nil { return 0 } @@ -2818,7 +2709,7 @@ func getCpuUsage(provider HostMetricsProvider) float32 { return float32(cpu) } -func getMemUsage(provider HostMetricsProvider) float32 { +func getMemUsage(provider TunerHostMetricsProvider) float32 { if provider == nil { return 0 } diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 47fd9cf4d..2cf4ade2a 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -2,16 +2,109 @@ package internal import ( "context" + "fmt" + "sync" + "sync/atomic" + "time" + + "github.com/nexus-rpc/sdk-go/nexus" workerpb "go.temporal.io/api/worker/v1" "go.temporal.io/api/workflowservice/v1" "go.temporal.io/sdk/log" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "sync" - "sync/atomic" - "time" ) +// HeartbeatManager manages heartbeat workers across namespaces for a client. 
+type HeartbeatManager struct { + client *WorkflowClient + interval time.Duration + logger log.Logger + + mu sync.Mutex + workers map[string]*sharedNamespaceWorker // namespace -> worker +} + +// NewHeartbeatManager creates a new HeartbeatManager. +func NewHeartbeatManager(client *WorkflowClient, interval time.Duration, logger log.Logger) *HeartbeatManager { + return &HeartbeatManager{ + client: client, + interval: interval, + logger: logger, + workers: make(map[string]*sharedNamespaceWorker), + } +} + +// RegisterWorker registers a worker's heartbeat callback with the shared heartbeat worker for the namespace. +func (m *HeartbeatManager) RegisterWorker( + namespace string, + workerInstanceKey string, + callback func() *workerpb.WorkerHeartbeat, +) error { + m.mu.Lock() + defer m.mu.Unlock() + + hw, ok := m.workers[namespace] + if !ok { + capabilities, err := m.client.loadNamespaceCapabilities(context.Background()) + if err != nil { + return fmt.Errorf("failed to get namespace capabilities: %w", err) + } + if !capabilities.WorkerHeartbeats { + m.logger.Debug("Worker heartbeating configured, but server version does not support it.") + return nil + } + + hw = &sharedNamespaceWorker{ + client: m.client, + namespace: namespace, + taskQueue: fmt.Sprintf("temporal-sys/worker-commands/%s/%s", namespace, m.client.workerGroupingKey), + interval: m.interval, + callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), + stopC: make(chan struct{}), + stoppedC: make(chan struct{}), + logger: m.logger, + } + + nexusWorker, err := hw.createNexusWorker() + if err != nil { + return fmt.Errorf("failed to create nexus worker for heartbeating: %w", err) + } + hw.nexusWorker = nexusWorker + + m.workers[namespace] = hw + go hw.run() + } + + hw.mu.Lock() + hw.callbacks[workerInstanceKey] = callback + hw.mu.Unlock() + + return nil +} + +// UnregisterWorker removes a worker's heartbeat callback. If no callbacks remain for the namespace, +// the shared heartbeat worker is stopped. 
+func (m *HeartbeatManager) UnregisterWorker(namespace, workerInstanceKey string) { + m.mu.Lock() + defer m.mu.Unlock() + + hw, ok := m.workers[namespace] + if !ok { + return + } + + hw.mu.Lock() + delete(hw.callbacks, workerInstanceKey) + remaining := len(hw.callbacks) + hw.mu.Unlock() + + if remaining == 0 { + hw.stop() + delete(m.workers, namespace) + } +} + // sharedNamespaceWorker is the background nexus worker that handles heartbeating for // all workers in a specific namespace for a specific client. type sharedNamespaceWorker struct { @@ -45,13 +138,22 @@ func (hw *sharedNamespaceWorker) createNexusWorker() (*nexusWorker, error) { NexusTaskPollerBehavior: NewPollerBehaviorSimpleMaximum(PollerBehaviorSimpleMaximumOptions{MaximumNumberOfPollers: 1}), } + reg := nexus.NewServiceRegistry() + handler, err := reg.NewHandler() + if err != nil { + return nil, err + } + + // TODO: Register worker commands here + nw, err := newNexusWorker(nexusWorkerOptions{ executionParameters: params, client: hw.client, workflowService: hw.client.workflowService, + handler: handler, }) - return nw, nil + return nw, err } func (hw *sharedNamespaceWorker) run() { @@ -117,22 +219,6 @@ func (hw *sharedNamespaceWorker) sendHeartbeats() { } } -func (hw *sharedNamespaceWorker) registerCallback( - workerInstanceKey string, - callback func() *workerpb.WorkerHeartbeat, -) { - hw.mu.Lock() - defer hw.mu.Unlock() - hw.callbacks[workerInstanceKey] = callback -} - -func (hw *sharedNamespaceWorker) unregisterCallback(workerInstanceKey string) { - shouldStop := hw.client.unregisterHeartbeatCallback(hw.namespace, workerInstanceKey) - if shouldStop { - hw.stop() - } -} - func (hw *sharedNamespaceWorker) stop() { if !hw.started.CompareAndSwap(true, false) { return diff --git a/internal/internal_worker_heartbeat_metrics.go b/internal/internal_worker_heartbeat_metrics.go new file mode 100644 index 000000000..b71423bee --- /dev/null +++ b/internal/internal_worker_heartbeat_metrics.go @@ -0,0 +1,559 @@ 
+package internal + +import ( + "sync/atomic" + "time" + + workerpb "go.temporal.io/api/worker/v1" + "google.golang.org/protobuf/types/known/timestamppb" + + "go.temporal.io/sdk/internal/common/metrics" +) + +type heartbeatMetric int + +const ( + metricStickyCacheHit heartbeatMetric = iota + metricStickyCacheMiss + metricStickyCacheSize + + metricWorkflowTaskFailures + metricActivityTaskFailures + metricLocalActivityTaskFailures + metricNexusTaskFailures + + metricWorkflowSlotsAvailable + metricWorkflowSlotsUsed + metricActivitySlotsAvailable + metricActivitySlotsUsed + metricLocalActivitySlotsAvailable + metricLocalActivitySlotsUsed + metricNexusSlotsAvailable + metricNexusSlotsUsed + + metricWorkflowTasksProcessed + metricActivityTasksProcessed + metricLocalActivityTasksProcessed + metricNexusTasksProcessed + + metricWorkflowPollerCount + metricWorkflowStickyPollerCount + metricActivityPollerCount + metricNexusPollerCount + + metricWorkflowLastPoll + metricWorkflowStickyLastPoll + metricActivityLastPoll + metricNexusLastPoll + + metricCount +) + +var counterMetricMap = map[string]heartbeatMetric{ + metrics.StickyCacheHit: metricStickyCacheHit, + metrics.StickyCacheMiss: metricStickyCacheMiss, + metrics.WorkflowTaskExecutionFailureCounter: metricWorkflowTaskFailures, + metrics.ActivityExecutionFailedCounter: metricActivityTaskFailures, + metrics.LocalActivityExecutionFailedCounter: metricLocalActivityTaskFailures, + metrics.NexusTaskExecutionFailedCounter: metricNexusTaskFailures, +} + +var timerMetricMap = map[string]heartbeatMetric{ + metrics.WorkflowTaskExecutionLatency: metricWorkflowTasksProcessed, + metrics.ActivityExecutionLatency: metricActivityTasksProcessed, + metrics.LocalActivityExecutionLatency: metricLocalActivityTasksProcessed, + metrics.NexusTaskExecutionLatency: metricNexusTasksProcessed, +} + +var slotsAvailableByWorkerType = map[string]heartbeatMetric{ + "WorkflowWorker": metricWorkflowSlotsAvailable, + "ActivityWorker": 
metricActivitySlotsAvailable, + "LocalActivityWorker": metricLocalActivitySlotsAvailable, + "NexusWorker": metricNexusSlotsAvailable, +} + +var slotsUsedByWorkerType = map[string]heartbeatMetric{ + "WorkflowWorker": metricWorkflowSlotsUsed, + "ActivityWorker": metricActivitySlotsUsed, + "LocalActivityWorker": metricLocalActivitySlotsUsed, + "NexusWorker": metricNexusSlotsUsed, +} + +var pollerCountByPollerType = map[string]heartbeatMetric{ + metrics.PollerTypeWorkflowTask: metricWorkflowPollerCount, + metrics.PollerTypeWorkflowStickyTask: metricWorkflowStickyPollerCount, + metrics.PollerTypeActivityTask: metricActivityPollerCount, + metrics.PollerTypeNexusTask: metricNexusPollerCount, +} + +// HeartbeatMetricsHandler wraps a metrics handler and captures specific metrics +// in memory that are needed for worker heartbeats +type HeartbeatMetricsHandler struct { + underlying metrics.Handler + workerType string + pollerType string + metrics map[heartbeatMetric]*atomic.Uint64 +} + +// NewHeartbeatMetricsHandler creates a new handler that captures specific metrics +// for worker heartbeats while passing all metrics to the underlying handler. 
+func NewHeartbeatMetricsHandler(underlying metrics.Handler) *HeartbeatMetricsHandler { + m := make(map[heartbeatMetric]*atomic.Uint64, metricCount) + for i := range heartbeatMetric(metricCount) { + m[i] = new(atomic.Uint64) + } + return &HeartbeatMetricsHandler{ + underlying: underlying, + metrics: m, + } +} + +func (h *HeartbeatMetricsHandler) WithTags(tags map[string]string) metrics.Handler { + cpy := *h + cpy.underlying = h.underlying.WithTags(tags) + if wt, ok := tags[metrics.WorkerTypeTagName]; ok { + cpy.workerType = wt + } + if pt, ok := tags[metrics.PollerTypeTagName]; ok { + cpy.pollerType = pt + } + return &cpy +} + +func (h *HeartbeatMetricsHandler) Counter(name string) metrics.Counter { + underlying := h.underlying.Counter(name) + if metric, ok := counterMetricMap[name]; ok { + return &capturingCounter{ + underlying: underlying, + value: h.metrics[metric], + } + } + return underlying +} + +func (h *HeartbeatMetricsHandler) Gauge(name string) metrics.Gauge { + underlying := h.underlying.Gauge(name) + + switch name { + case metrics.StickyCacheSize: + return &capturingGauge{ + underlying: underlying, + value: h.metrics[metricStickyCacheSize], + } + case metrics.WorkerTaskSlotsAvailable: + if metric, ok := slotsAvailableByWorkerType[h.workerType]; ok { + return &capturingGauge{ + underlying: underlying, + value: h.metrics[metric], + } + } + case metrics.WorkerTaskSlotsUsed: + if metric, ok := slotsUsedByWorkerType[h.workerType]; ok { + return &capturingGauge{ + underlying: underlying, + value: h.metrics[metric], + } + } + case metrics.NumPoller: + if metric, ok := pollerCountByPollerType[h.pollerType]; ok { + return &capturingGauge{ + underlying: underlying, + value: h.metrics[metric], + } + } + } + + return underlying +} + +func (h *HeartbeatMetricsHandler) Timer(name string) metrics.Timer { + underlying := h.underlying.Timer(name) + if metric, ok := timerMetricMap[name]; ok { + return &capturingTimer{ + underlying: underlying, + counter: 
h.metrics[metric], + } + } + return underlying +} + +// PopulateHeartbeatOptions contains external dependencies needed to populate heartbeat metrics. +type PopulateHeartbeatOptions struct { + WorkflowSlotSupplierKind string + ActivitySlotSupplierKind string + LocalActivitySlotSupplierKind string + NexusSlotSupplierKind string + + WorkflowPollerBehavior PollerBehavior + ActivityPollerBehavior PollerBehavior + NexusPollerBehavior PollerBehavior + + // For delta calculations between heartbeats (mutated by PopulateHeartbeat) + PrevWorkflowProcessed *int64 + PrevWorkflowFailed *int64 + PrevActivityProcessed *int64 + PrevActivityFailed *int64 + PrevLocalActivityProcessed *int64 + PrevLocalActivityFailed *int64 + PrevNexusProcessed *int64 + PrevNexusFailed *int64 +} + +// PopulateHeartbeat fills in the metrics-related fields of the passed in WorkerHeartbeat proto, as well as updates +// references in the PopulateHeartbeatOptions for future delta calculations. +func (h *HeartbeatMetricsHandler) PopulateHeartbeat(hb *workerpb.WorkerHeartbeat, opts *PopulateHeartbeatOptions) { + hb.TotalStickyCacheHit = int32(h.metrics[metricStickyCacheHit].Load()) + hb.TotalStickyCacheMiss = int32(h.metrics[metricStickyCacheMiss].Load()) + hb.CurrentStickyCacheSize = int32(h.metrics[metricStickyCacheSize].Load()) + + if opts.WorkflowSlotSupplierKind != "" { + hb.WorkflowTaskSlotsInfo = buildSlotsInfo( + opts.WorkflowSlotSupplierKind, + int32(h.metrics[metricWorkflowSlotsAvailable].Load()), + int32(h.metrics[metricWorkflowSlotsUsed].Load()), + int64(h.metrics[metricWorkflowTasksProcessed].Load()), + int64(h.metrics[metricWorkflowTaskFailures].Load()), + opts.PrevWorkflowProcessed, + opts.PrevWorkflowFailed, + ) + } + + if opts.ActivitySlotSupplierKind != "" { + hb.ActivityTaskSlotsInfo = buildSlotsInfo( + opts.ActivitySlotSupplierKind, + int32(h.metrics[metricActivitySlotsAvailable].Load()), + int32(h.metrics[metricActivitySlotsUsed].Load()), + 
int64(h.metrics[metricActivityTasksProcessed].Load()), + int64(h.metrics[metricActivityTaskFailures].Load()), + opts.PrevActivityProcessed, + opts.PrevActivityFailed, + ) + } + + if opts.LocalActivitySlotSupplierKind != "" { + hb.LocalActivitySlotsInfo = buildSlotsInfo( + opts.LocalActivitySlotSupplierKind, + int32(h.metrics[metricLocalActivitySlotsAvailable].Load()), + int32(h.metrics[metricLocalActivitySlotsUsed].Load()), + int64(h.metrics[metricLocalActivityTasksProcessed].Load()), + int64(h.metrics[metricLocalActivityTaskFailures].Load()), + opts.PrevLocalActivityProcessed, + opts.PrevLocalActivityFailed, + ) + } + + if opts.NexusSlotSupplierKind != "" { + hb.NexusTaskSlotsInfo = buildSlotsInfo( + opts.NexusSlotSupplierKind, + int32(h.metrics[metricNexusSlotsAvailable].Load()), + int32(h.metrics[metricNexusSlotsUsed].Load()), + int64(h.metrics[metricNexusTasksProcessed].Load()), + int64(h.metrics[metricNexusTaskFailures].Load()), + opts.PrevNexusProcessed, + opts.PrevNexusFailed, + ) + } + + hb.WorkflowPollerInfo = buildPollerInfo( + int32(h.metrics[metricWorkflowPollerCount].Load()), + h.getLastPollTime(metricWorkflowLastPoll), + opts.WorkflowPollerBehavior, + ) + hb.WorkflowStickyPollerInfo = buildPollerInfo( + int32(h.metrics[metricWorkflowStickyPollerCount].Load()), + h.getLastPollTime(metricWorkflowStickyLastPoll), + opts.WorkflowPollerBehavior, + ) + hb.ActivityPollerInfo = buildPollerInfo( + int32(h.metrics[metricActivityPollerCount].Load()), + h.getLastPollTime(metricActivityLastPoll), + opts.ActivityPollerBehavior, + ) + hb.NexusPollerInfo = buildPollerInfo( + int32(h.metrics[metricNexusPollerCount].Load()), + h.getLastPollTime(metricNexusLastPoll), + opts.NexusPollerBehavior, + ) +} + +func (h *HeartbeatMetricsHandler) getLastPollTime(metric heartbeatMetric) time.Time { + nanos := h.metrics[metric].Load() + if nanos == 0 { + return time.Time{} + } + return time.Unix(0, int64(nanos)) +} + +func buildSlotsInfo( + supplierKind string, + slotsAvailable 
int32, + slotsUsed int32, + totalProcessed int64, + totalFailed int64, + prevProcessed *int64, + prevFailed *int64, +) *workerpb.WorkerSlotsInfo { + intervalProcessed := totalProcessed - *prevProcessed + intervalFailed := totalFailed - *prevFailed + + *prevProcessed = totalProcessed + *prevFailed = totalFailed + + return &workerpb.WorkerSlotsInfo{ + CurrentAvailableSlots: slotsAvailable, + CurrentUsedSlots: slotsUsed, + SlotSupplierKind: supplierKind, + TotalProcessedTasks: int32(totalProcessed), + TotalFailedTasks: int32(totalFailed), + LastIntervalProcessedTasks: int32(intervalProcessed), + LastIntervalFailureTasks: int32(intervalFailed), + } +} + +func buildPollerInfo(currentPollers int32, lastSuccessfulPollTime time.Time, pollerBehavior PollerBehavior) *workerpb.WorkerPollerInfo { + var isAutoscaling bool + switch pollerBehavior.(type) { + case *pollerBehaviorAutoscaling: + isAutoscaling = true + } + + return &workerpb.WorkerPollerInfo{ + CurrentPollers: currentPollers, + LastSuccessfulPollTime: timestamppb.New(lastSuccessfulPollTime), + IsAutoscaling: isAutoscaling, + } +} + +// GetStickyCacheHit returns the total number of sticky cache hits. +func (h *HeartbeatMetricsHandler) GetStickyCacheHit() int32 { + return int32(h.metrics[metricStickyCacheHit].Load()) +} + +// GetStickyCacheMiss returns the total number of sticky cache misses. +func (h *HeartbeatMetricsHandler) GetStickyCacheMiss() int32 { + return int32(h.metrics[metricStickyCacheMiss].Load()) +} + +// GetStickyCacheSize returns the current sticky cache size. +func (h *HeartbeatMetricsHandler) GetStickyCacheSize() int32 { + return int32(h.metrics[metricStickyCacheSize].Load()) +} + +// GetWorkflowTaskFailures returns the total number of workflow task failures. +func (h *HeartbeatMetricsHandler) GetWorkflowTaskFailures() int64 { + return int64(h.metrics[metricWorkflowTaskFailures].Load()) +} + +// GetActivityTaskFailures returns the total number of activity task failures. 
+func (h *HeartbeatMetricsHandler) GetActivityTaskFailures() int64 { + return int64(h.metrics[metricActivityTaskFailures].Load()) +} + +// GetLocalActivityTaskFailures returns the total number of local activity task failures. +func (h *HeartbeatMetricsHandler) GetLocalActivityTaskFailures() int64 { + return int64(h.metrics[metricLocalActivityTaskFailures].Load()) +} + +// GetNexusTaskFailures returns the total number of nexus task failures. +func (h *HeartbeatMetricsHandler) GetNexusTaskFailures() int64 { + return int64(h.metrics[metricNexusTaskFailures].Load()) +} + +// GetWorkflowSlotsAvailable returns the current workflow slots available. +func (h *HeartbeatMetricsHandler) GetWorkflowSlotsAvailable() int32 { + return int32(h.metrics[metricWorkflowSlotsAvailable].Load()) +} + +// GetWorkflowSlotsUsed returns the current workflow slots used. +func (h *HeartbeatMetricsHandler) GetWorkflowSlotsUsed() int32 { + return int32(h.metrics[metricWorkflowSlotsUsed].Load()) +} + +// GetActivitySlotsAvailable returns the current activity slots available. +func (h *HeartbeatMetricsHandler) GetActivitySlotsAvailable() int32 { + return int32(h.metrics[metricActivitySlotsAvailable].Load()) +} + +// GetActivitySlotsUsed returns the current activity slots used. +func (h *HeartbeatMetricsHandler) GetActivitySlotsUsed() int32 { + return int32(h.metrics[metricActivitySlotsUsed].Load()) +} + +// GetLocalActivitySlotsAvailable returns the current local activity slots available. +func (h *HeartbeatMetricsHandler) GetLocalActivitySlotsAvailable() int32 { + return int32(h.metrics[metricLocalActivitySlotsAvailable].Load()) +} + +// GetLocalActivitySlotsUsed returns the current local activity slots used. +func (h *HeartbeatMetricsHandler) GetLocalActivitySlotsUsed() int32 { + return int32(h.metrics[metricLocalActivitySlotsUsed].Load()) +} + +// GetNexusSlotsAvailable returns the current nexus slots available. 
+func (h *HeartbeatMetricsHandler) GetNexusSlotsAvailable() int32 { + return int32(h.metrics[metricNexusSlotsAvailable].Load()) +} + +// GetNexusSlotsUsed returns the current nexus slots used. +func (h *HeartbeatMetricsHandler) GetNexusSlotsUsed() int32 { + return int32(h.metrics[metricNexusSlotsUsed].Load()) +} + +// GetWorkflowTasksProcessed returns the total number of workflow tasks processed. +func (h *HeartbeatMetricsHandler) GetWorkflowTasksProcessed() int64 { + return int64(h.metrics[metricWorkflowTasksProcessed].Load()) +} + +// GetActivityTasksProcessed returns the total number of activity tasks processed. +func (h *HeartbeatMetricsHandler) GetActivityTasksProcessed() int64 { + return int64(h.metrics[metricActivityTasksProcessed].Load()) +} + +// GetLocalActivityTasksProcessed returns the total number of local activity tasks processed. +func (h *HeartbeatMetricsHandler) GetLocalActivityTasksProcessed() int64 { + return int64(h.metrics[metricLocalActivityTasksProcessed].Load()) +} + +// GetNexusTasksProcessed returns the total number of nexus tasks processed. +func (h *HeartbeatMetricsHandler) GetNexusTasksProcessed() int64 { + return int64(h.metrics[metricNexusTasksProcessed].Load()) +} + +// GetWorkflowPollerCount returns the current number of workflow task pollers. +func (h *HeartbeatMetricsHandler) GetWorkflowPollerCount() int32 { + return int32(h.metrics[metricWorkflowPollerCount].Load()) +} + +// GetWorkflowStickyPollerCount returns the current number of workflow sticky task pollers. +func (h *HeartbeatMetricsHandler) GetWorkflowStickyPollerCount() int32 { + return int32(h.metrics[metricWorkflowStickyPollerCount].Load()) +} + +// GetActivityPollerCount returns the current number of activity task pollers. +func (h *HeartbeatMetricsHandler) GetActivityPollerCount() int32 { + return int32(h.metrics[metricActivityPollerCount].Load()) +} + +// GetNexusPollerCount returns the current number of nexus task pollers. 
+func (h *HeartbeatMetricsHandler) GetNexusPollerCount() int32 { + return int32(h.metrics[metricNexusPollerCount].Load()) +} + +// RecordWorkflowPollSuccess records a successful workflow task poll. +func (h *HeartbeatMetricsHandler) RecordWorkflowPollSuccess() { + h.metrics[metricWorkflowLastPoll].Store(uint64(time.Now().UnixNano())) +} + +// RecordWorkflowStickyPollSuccess records a successful workflow sticky task poll. +func (h *HeartbeatMetricsHandler) RecordWorkflowStickyPollSuccess() { + h.metrics[metricWorkflowStickyLastPoll].Store(uint64(time.Now().UnixNano())) +} + +// RecordActivityPollSuccess records a successful activity task poll. +func (h *HeartbeatMetricsHandler) RecordActivityPollSuccess() { + h.metrics[metricActivityLastPoll].Store(uint64(time.Now().UnixNano())) +} + +// RecordNexusPollSuccess records a successful nexus task poll. +func (h *HeartbeatMetricsHandler) RecordNexusPollSuccess() { + h.metrics[metricNexusLastPoll].Store(uint64(time.Now().UnixNano())) +} + +// GetWorkflowLastPollTime returns the last successful workflow task poll time. +func (h *HeartbeatMetricsHandler) GetWorkflowLastPollTime() time.Time { + nanos := h.metrics[metricWorkflowLastPoll].Load() + if nanos == 0 { + return time.Time{} + } + return time.Unix(0, int64(nanos)) +} + +// GetWorkflowStickyLastPollTime returns the last successful workflow sticky task poll time. +func (h *HeartbeatMetricsHandler) GetWorkflowStickyLastPollTime() time.Time { + nanos := h.metrics[metricWorkflowStickyLastPoll].Load() + if nanos == 0 { + return time.Time{} + } + return time.Unix(0, int64(nanos)) +} + +// GetActivityLastPollTime returns the last successful activity task poll time. +func (h *HeartbeatMetricsHandler) GetActivityLastPollTime() time.Time { + nanos := h.metrics[metricActivityLastPoll].Load() + if nanos == 0 { + return time.Time{} + } + return time.Unix(0, int64(nanos)) +} + +// GetNexusLastPollTime returns the last successful nexus task poll time. 
+func (h *HeartbeatMetricsHandler) GetNexusLastPollTime() time.Time { + nanos := h.metrics[metricNexusLastPoll].Load() + if nanos == 0 { + return time.Time{} + } + return time.Unix(0, int64(nanos)) +} + +// PollSuccessRecorder is an optional interface for recording successful poll times. +type PollSuccessRecorder interface { + RecordWorkflowPollSuccess() + RecordWorkflowStickyPollSuccess() + RecordActivityPollSuccess() + RecordNexusPollSuccess() +} + +// RecordPollSuccess records a successful poll time if the handler supports it. +// pollerType should be one of PollerTypeWorkflowTask, PollerTypeWorkflowStickyTask, +// PollerTypeActivityTask, or PollerTypeNexusTask. +func RecordPollSuccess(h metrics.Handler, pollerType string) { + recorder, ok := h.(PollSuccessRecorder) + if !ok { + return + } + switch pollerType { + case metrics.PollerTypeWorkflowTask: + recorder.RecordWorkflowPollSuccess() + case metrics.PollerTypeWorkflowStickyTask: + recorder.RecordWorkflowStickyPollSuccess() + case metrics.PollerTypeActivityTask: + recorder.RecordActivityPollSuccess() + case metrics.PollerTypeNexusTask: + recorder.RecordNexusPollSuccess() + } +} + +// capturingCounter wraps a counter and captures its value in memory for heartbeat reporting. +type capturingCounter struct { + underlying metrics.Counter + value *atomic.Uint64 +} + +func (c *capturingCounter) Inc(delta int64) { + c.underlying.Inc(delta) + if delta > 0 { + c.value.Add(uint64(delta)) + } +} + +// capturingGauge wraps a gauge and captures its value in memory for heartbeat reporting. +type capturingGauge struct { + underlying metrics.Gauge + value *atomic.Uint64 +} + +func (g *capturingGauge) Update(f float64) { + g.underlying.Update(f) + g.value.Store(uint64(f)) +} + +// capturingTimer wraps a timer and increments a counter each time Record is called. 
+type capturingTimer struct { + underlying metrics.Timer + counter *atomic.Uint64 +} + +func (t *capturingTimer) Record(d time.Duration) { + t.underlying.Record(d) + t.counter.Add(1) +} diff --git a/internal/internal_workflow_client.go b/internal/internal_workflow_client.go index 21675f6f8..94d3db685 100644 --- a/internal/internal_workflow_client.go +++ b/internal/internal_workflow_client.go @@ -4,8 +4,6 @@ import ( "context" "errors" "fmt" - namespacepb "go.temporal.io/api/namespace/v1" - workerpb "go.temporal.io/api/worker/v1" "io" "math" "reflect" @@ -23,6 +21,7 @@ import ( commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" historypb "go.temporal.io/api/history/v1" + namespacepb "go.temporal.io/api/namespace/v1" "go.temporal.io/api/operatorservice/v1" querypb "go.temporal.io/api/query/v1" "go.temporal.io/api/sdk/v1" @@ -84,8 +83,7 @@ type ( getSystemInfoTimeout time.Duration workerHeartbeatInterval time.Duration workerGroupingKey string - heartbeatWorkers map[string]*sharedNamespaceWorker - heartbeatWorkersMu sync.RWMutex + heartbeatManager *HeartbeatManager // The pointer value is shared across multiple clients. If non-nil, only // access/mutate atomically. 
@@ -1444,74 +1442,6 @@ func (wc *WorkflowClient) RecordWorkerHeartbeat(ctx context.Context, request *wo return resp, nil } -func (wc *WorkflowClient) getOrCreateHeartbeatWorker(namespace string) (*sharedNamespaceWorker, error) { - wc.heartbeatWorkersMu.Lock() - defer wc.heartbeatWorkersMu.Unlock() - - if hw, ok := wc.heartbeatWorkers[namespace]; ok { - return hw, nil - } - - capabilities, err := wc.loadNamespaceCapabilities(context.Background()) - if err != nil { - return nil, fmt.Errorf("failed to get namespace capabilities: %w", err) - } - if !capabilities.WorkerHeartbeats { - wc.logger.Debug("Worker heartbeating configured, but server version does not support it.") - return nil, nil - } - - hw := &sharedNamespaceWorker{ - client: wc, - namespace: namespace, - taskQueue: fmt.Sprintf("temporal-sys/worker-commands/%s/%s", namespace, wc.workerGroupingKey), - interval: wc.workerHeartbeatInterval, - callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), - stopC: make(chan struct{}), - stoppedC: make(chan struct{}), - logger: wc.logger, - } - - nexusWorker, err := hw.createNexusWorker() - if err != nil { - return nil, fmt.Errorf("failed to create nexus worker for heartbeating: %w", err) - } - hw.nexusWorker = nexusWorker - - if wc.heartbeatWorkers == nil { - wc.heartbeatWorkers = make(map[string]*sharedNamespaceWorker) - } - wc.heartbeatWorkers[namespace] = hw - - go hw.run() - - return hw, nil -} - -// unregisterHeartbeatCallback removes a callback from the heartbeat worker for the given namespace. -// Returns true if the heartbeat worker should be stopped (no more callbacks remain). -// This method holds heartbeatWorkersMu while checking and removing, preventing races where -// a new worker could get a reference to an about-to-be-stopped heartbeat worker. 
-func (wc *WorkflowClient) unregisterHeartbeatCallback(namespace string, workerInstanceKey string) bool { - wc.heartbeatWorkersMu.Lock() - defer wc.heartbeatWorkersMu.Unlock() - - hw, ok := wc.heartbeatWorkers[namespace] - if !ok { - return false - } - - hw.mu.Lock() - delete(hw.callbacks, workerInstanceKey) - shouldStop := len(hw.callbacks) == 0 - hw.mu.Unlock() - - if shouldStop { - delete(wc.heartbeatWorkers, namespace) - } - return shouldStop -} - // Close client and clean up underlying resources. func (wc *WorkflowClient) Close() { // If there's a set of unclosed clients, we have to decrement it and then diff --git a/internal/tuning.go b/internal/tuning.go index 4f85db366..72a6c3ac6 100644 --- a/internal/tuning.go +++ b/internal/tuning.go @@ -131,17 +131,10 @@ type SlotSupplier interface { MaxSlots() int } -// SlotSupplierKinder is an optional interface that slot suppliers can implement to provide -// a custom kind/type name. If not implemented, getSlotSupplierKind will use reflection. -type SlotSupplierKinder interface { - Kind() string -} - // getSlotSupplierKind returns the kind/type name of a slot supplier. If the supplier implements -// SlotSupplierKinder, it returns the result of Kind(). Otherwise, it uses reflection to get the -// type name. +// a Kind() string method, it uses that. Otherwise, it falls back to reflection on the type name. func getSlotSupplierKind(s SlotSupplier) string { - if k, ok := s.(SlotSupplierKinder); ok { + if k, ok := s.(interface{ Kind() string }); ok { return k.Kind() } t := reflect.TypeOf(s) diff --git a/internal/worker.go b/internal/worker.go index 9eefa81da..7b73c1668 100644 --- a/internal/worker.go +++ b/internal/worker.go @@ -35,12 +35,14 @@ type ( isPollerBehavior() } - // HostMetricsProvider provides host-level CPU and memory metrics for worker heartbeats. - // Implement this interface to provide custom metrics collection, or use the default - // implementation provided by the SDK in the worker/hostmetrics package. 
+ // TunerHostMetricsProvider provides host-level CPU and memory metrics for worker heartbeats. + // This interface is typically implemented by a [WorkerTuner] to provide metrics from the same + // source used for tuning decisions, avoiding double-measurement. If the tuner passed to + // WorkerOptions implements this interface, the SDK will automatically use it for heartbeat + // metrics. Alternatively, use the default implementation in the worker/hostmetrics package. // - // Exposed as: [go.temporal.io/sdk/worker.HostMetricsProvider] - HostMetricsProvider interface { + // Exposed as: [go.temporal.io/sdk/worker.TunerHostMetricsProvider] + TunerHostMetricsProvider interface { // GetCpuUsage returns the current host CPU usage as a fraction (0.0-1.0) GetCpuUsage() (float64, error) // GetMemoryUsage returns the current host memory usage as a fraction (0.0-1.0) diff --git a/test/go.mod b/test/go.mod index 2b8801962..c3efe5c97 100644 --- a/test/go.mod +++ b/test/go.mod @@ -27,35 +27,21 @@ require ( ) require ( - github.com/cilium/ebpf v0.11.0 // indirect - github.com/containerd/cgroups/v3 v3.0.3 // indirect - github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/ebitengine/purego v0.9.1 // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/go-ole/go-ole v1.2.6 // indirect - github.com/godbus/dbus/v5 v5.0.4 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.2 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect - github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect - github.com/opencontainers/runtime-spec v1.0.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/robfig/cron v1.2.0 // 
indirect - github.com/shirou/gopsutil/v4 v4.24.8 // indirect - github.com/shoenig/go-m1cpu v0.1.6 // indirect - github.com/sirupsen/logrus v1.9.3 // indirect github.com/stretchr/objx v0.5.2 // indirect - github.com/tklauser/go-sysconf v0.3.12 // indirect - github.com/tklauser/numcpus v0.6.1 // indirect github.com/twmb/murmur3 v1.1.5 // indirect - github.com/yusufpapurcu/wmi v1.2.4 // indirect go.einride.tech/pid v0.1.3 // indirect go.opentelemetry.io/otel/metric v1.28.0 // indirect go.uber.org/atomic v1.9.0 // indirect - golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 // indirect golang.org/x/net v0.39.0 // indirect golang.org/x/sync v0.13.0 // indirect golang.org/x/sys v0.32.0 // indirect diff --git a/test/go.sum b/test/go.sum index 252849014..25f0898b2 100644 --- a/test/go.sum +++ b/test/go.sum @@ -9,19 +9,13 @@ github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+Ce github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cactus/go-statsd-client/statsd v0.0.0-20200423205355-cb0885a1018c/go.mod h1:l/bIBLeOl9eX+wxJAzxS4TveKRtAqlyDpHjhkfO0MEI= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= -github.com/cilium/ebpf v0.11.0/go.mod h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs= -github.com/containerd/cgroups/v3 v3.0.3 h1:S5ByHZ/h9PMe5IOQoN7E+nMc2UcLEM/V48DGDJ9kip0= -github.com/containerd/cgroups/v3 v3.0.3/go.mod h1:8HBe7V3aWGLFPd/k03swSIsGjZhHI2WzJmticMgVuz0= -github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= -github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A= +github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a/go.mod h1:7Ga40egUymuWXxAe151lTNnCv97MddSOVsjpPPkityA= -github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= -github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= @@ -33,11 +27,7 @@ github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= -github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= -github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= -github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= @@ -58,7 +48,6 @@ github.com/google/go-cmp v0.3.1/go.mod 
h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -86,8 +75,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= -github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -97,8 +84,6 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/nexus-rpc/sdk-go v0.5.1 h1:UFYYfoHlQc+Pn9gQpmn9QE7xluewAn2AO1OSkAh7YFU= github.com/nexus-rpc/sdk-go v0.5.1/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk= -github.com/opencontainers/runtime-spec 
v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= -github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -106,8 +91,6 @@ github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= -github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= @@ -126,17 +109,9 @@ github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ= github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= -github.com/shirou/gopsutil/v4 v4.24.8 h1:pVQjIenQkIhqO81mwTaXjTzOMT7d3TZkf43PlVFHENI= -github.com/shirou/gopsutil/v4 v4.24.8/go.mod h1:wE0OrJtj4dG+hYkxqDH3QiBICdKSf04/npcvLLc/oRg= -github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= -github.com/shoenig/go-m1cpu 
v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= -github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= -github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= @@ -147,10 +122,6 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= -github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= -github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= -github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/twmb/murmur3 v1.1.5 h1:i9OLS9fkuLzBXjt6dptlAEyk58fJsSTXbRg3SgVyqgk= github.com/twmb/murmur3 v1.1.5/go.mod h1:Qq/R7NUyOfr65zD+6Q5IHKsJLwP7exErjN6lyyq3OSQ= github.com/uber-go/tally/v4 v4.1.1 h1:jhy6WOZp4nHyCqeV43x3Wz370LXUGBhgW2JmzOIHCWI= @@ -158,8 +129,6 @@ github.com/uber-go/tally/v4 v4.1.1/go.mod h1:aXeSTDMl4tNosyf6rdU8jlgScHyjEGGtfJ/ 
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= -github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.einride.tech/pid v0.1.3 h1:yWAKSmD2Z10jxd4gYFhOjbBNqXeIQwAtnCO/XKCT7sQ= go.einride.tech/pid v0.1.3/go.mod h1:33JSUbKrH/4v8DZf/0K8IC8Enjd92wB2birp+bCYQso= go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo= @@ -183,8 +152,6 @@ golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnf golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 h1:aAcj0Da7eBAtrTp03QXWvm88pSyOt+UgdZw2BFZ+lEw= -golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8/go.mod h1:CQ1k9gNrJ50XIzaKCRR2hssIjF07kZFEiieALBM/ARQ= golang.org/x/lint v0.0.0-20190930215403-16217165b5de h1:5hukYrvBGR8/eNkX5mdUezrA6JiaEZDtJb9Ei+1LlBs= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -218,21 +185,16 @@ golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index e64152abc..6cb114180 100644 --- 
a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -115,11 +115,12 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { } // Wait for heartbeat to capture the in-flight activity - time.Sleep(100 * time.Millisecond) - - // Get worker info and verify activity slot is used - workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + var workerInfo *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && + workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots >= 1 + }, time.Second, 50*time.Millisecond, "Should find worker with activity slot used") ts.logWorkerInfo(workerInfo) ts.Equal(enums.WORKER_STATUS_RUNNING, workerInfo.Status) @@ -193,7 +194,6 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { // After shutdown checks ts.Equal("WorkerHeartbeatTest", workerInfo.WorkerIdentity) hostInfo := workerInfo.HostInfo - fmt.Println("hostInfo", hostInfo) ts.NotEqual("", hostInfo.HostName) ts.NotEqual("", hostInfo.ProcessId) ts.NotEqual("", hostInfo.WorkerGroupingKey) @@ -208,14 +208,11 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { // Timestamp validations - second heartbeat check (after shutdown) // StartTime should be unchanged - ts.Equal(firstStartTime, workerInfo.StartTime.AsTime(), - "StartTime should not change between heartbeats") + ts.Equal(firstStartTime, workerInfo.StartTime.AsTime()) // HeartbeatTime should have advanced - ts.True(workerInfo.HeartbeatTime.AsTime().After(firstHeartbeatTime), - "HeartbeatTime should advance between heartbeats") + ts.True(workerInfo.HeartbeatTime.AsTime().After(firstHeartbeatTime)) - fmt.Println("aa") workflowTaskSlots = workerInfo.WorkflowTaskSlotsInfo ts.Equal(int32(2), workflowTaskSlots.TotalProcessedTasks) ts.Equal("Fixed", workflowTaskSlots.SlotSupplierKind) @@ 
-225,7 +222,6 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.Equal(int32(0), activityTaskSlots.CurrentUsedSlots) ts.Equal(int32(1), activityTaskSlots.LastIntervalProcessedTasks) ts.Equal("Fixed", activityTaskSlots.SlotSupplierKind) - fmt.Println("bb") nexusTaskSlots = workerInfo.NexusTaskSlotsInfo ts.NotNil(nexusTaskSlots) ts.Equal(int32(0), nexusTaskSlots.TotalProcessedTasks) @@ -237,7 +233,6 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.Equal(int32(1000), localActivityTaskSlots.CurrentAvailableSlots) ts.Equal(int32(0), localActivityTaskSlots.CurrentUsedSlots) ts.Equal("Fixed", localActivityTaskSlots.SlotSupplierKind) - // workflowPollerInfo = workerInfo.WorkflowPollerInfo ts.Equal(int32(1), workflowPollerInfo.CurrentPollers) @@ -288,13 +283,13 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatDeploymentVersion() { defer w.Stop() // Wait for heartbeat to be sent - time.Sleep(200 * time.Millisecond) - - // Get worker info and verify deployment version - workerInfo := ts.getWorkerInfo(ctx, taskQueue) - ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + var workerInfo *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, taskQueue) + return workerInfo != nil && workerInfo.DeploymentVersion != nil + }, time.Second, 50*time.Millisecond, "Should find worker with deployment version") - ts.NotNil(workerInfo.DeploymentVersion, "DeploymentVersion should be set") + ts.NotNil(workerInfo.DeploymentVersion) ts.Equal("test_build_id", workerInfo.DeploymentVersion.BuildId) ts.Equal("test-deployment", workerInfo.DeploymentVersion.DeploymentName) @@ -502,15 +497,12 @@ func blockingActivity(ctx context.Context) (string, error) { case blockingActivityStarted <- struct{}{}: default: } - fmt.Println("ACTIVITY STARTED") // Wait for signal to complete select { case <-blockingActivityComplete: - fmt.Println("ACTIVITY COMPLETED") return "done", nil case <-ctx.Done(): - 
fmt.Println("ACTIVITY TIMED OUT") return "", ctx.Err() } } @@ -587,19 +579,17 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWithActivityInFlight() { ts.Fail("Timeout waiting for activity to start") } - time.Sleep(150 * time.Millisecond) - - workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") + var workerInfo *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && + workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots >= 1 + }, time.Second, 50*time.Millisecond, "Should have at least 1 activity slot used") ts.T().Logf("Activity slots used: %d, available: %d", workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots, workerInfo.ActivityTaskSlotsInfo.CurrentAvailableSlots) - - ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots, int32(1), - "Should have at least 1 activity slot used") - ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.CurrentAvailableSlots, int32(0), - "Available slots should be non-negative") + ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.CurrentAvailableSlots, int32(0)) blockingActivityComplete <- struct{}{} @@ -608,14 +598,15 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWithActivityInFlight() { ts.NoError(err) ts.Equal("done", result) - time.Sleep(150 * time.Millisecond) + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && + workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots == 0 + }, time.Second, 50*time.Millisecond, "Activity slot should be released after completion") - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.True(workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil) ts.T().Logf("After completion - Activity slots used: %d, available: %d", 
workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots, workerInfo.ActivityTaskSlotsInfo.CurrentAvailableSlots) - ts.Equal(int32(0), workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots) ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.TotalProcessedTasks, int32(1)) } @@ -714,14 +705,12 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatStickyCacheMiss() { ts.NoError(run2.Get(ctx, &result2)) ts.Equal("wf2", result2) - // Wait for heartbeat - time.Sleep(150 * time.Millisecond) - - workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") - - ts.GreaterOrEqual(workerInfo.TotalStickyCacheMiss, int32(1), - "Should have at least 1 sticky cache miss") + // Wait for heartbeat to capture sticky cache miss + var workerInfo *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.TotalStickyCacheMiss >= 1 + }, time.Second, 50*time.Millisecond, "Should have at least 1 sticky cache miss") } // TestWorkerHeartbeatMultipleWorkers verifies that multiple workers can heartbeat @@ -760,15 +749,13 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatMultipleWorkers() { } wg.Wait() - // Wait for heartbeats - time.Sleep(150 * time.Millisecond) - // Verify both workers are tracked - workerInfo1 := ts.getWorkerInfo(ctx, taskQueue1) - workerInfo2 := ts.getWorkerInfo(ctx, taskQueue2) - - ts.NotNil(workerInfo1, "Should find worker1") - ts.NotNil(workerInfo2, "Should find worker2") + var workerInfo1, workerInfo2 *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo1 = ts.getWorkerInfo(ctx, taskQueue1) + workerInfo2 = ts.getWorkerInfo(ctx, taskQueue2) + return workerInfo1 != nil && workerInfo2 != nil + }, time.Second, 50*time.Millisecond, "Should find both workers") ts.NotEqual(workerInfo1.WorkerInstanceKey, workerInfo2.WorkerInstanceKey, "Different workers should have different instance keys") @@ -803,24 
+790,24 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatFailureMetrics() { // Wait for workflow to complete (will fail due to activity failure) err = run.Get(ctx, nil) - ts.Error(err, "Workflow should fail due to activity failure") + ts.Error(err) // Wait for heartbeat to capture failure metrics - time.Sleep(150 * time.Millisecond) + var workerInfo *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && + workerInfo.ActivityTaskSlotsInfo.TotalFailedTasks >= 1 + }, time.Second, 50*time.Millisecond, "Should have tracked at least 1 activity task failure") - // Get worker info and verify failure counts - workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") - ts.NotNil(workerInfo.ActivityTaskSlotsInfo) - ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.LastIntervalFailureTasks, int32(1), - "Should have at least 1 activity failure") - ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.TotalFailedTasks, int32(1), - "Should have tracked at least 1 activity task failure") + ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.LastIntervalFailureTasks, int32(1)) // Last interval should go back to 0 on next heartbeat - time.Sleep(150 * time.Millisecond) - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.Equal(int32(0), workerInfo.ActivityTaskSlotsInfo.LastIntervalFailureTasks) + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && + workerInfo.ActivityTaskSlotsInfo.LastIntervalFailureTasks == 0 + }, time.Second, 50*time.Millisecond, "Last interval failure count should reset to 0") } func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWorkflowTaskProcessed() { @@ -842,21 +829,22 @@ func (ts *WorkerHeartbeatTestSuite) 
TestWorkerHeartbeatWorkflowTaskProcessed() { ts.NoError(err) } - // Wait for heartbeat - time.Sleep(150 * time.Millisecond) + // Wait for heartbeat to capture processed tasks + var workerInfo *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && + workerInfo.WorkflowTaskSlotsInfo.TotalProcessedTasks == int32(numWorkflows) + }, time.Second, 50*time.Millisecond, "Should have processed all workflow tasks") - workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") - ts.NotNil(workerInfo.WorkflowTaskSlotsInfo) - ts.Equal(int32(numWorkflows), workerInfo.WorkflowTaskSlotsInfo.TotalProcessedTasks) - ts.GreaterOrEqual(workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks, int32(1), - "Should have processed at least 1 workflow task in last interval") + ts.GreaterOrEqual(workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks, int32(1)) // Last interval should go back to 0 on next heartbeat - time.Sleep(150 * time.Millisecond) - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.NotNil(workerInfo) - ts.Equal(int32(0), workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks) + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && + workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks == 0 + }, time.Second, 50*time.Millisecond, "Last interval processed count should reset to 0") } func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatResourceBasedTuner() { @@ -906,14 +894,13 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatResourceBasedTuner() { ts.NoError(err) ts.NoError(run.Get(ctx, nil)) - // Wait for heartbeat - time.Sleep(150 * time.Millisecond) - - workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.NotNil(workerInfo, "Should 
find worker in ListWorkers/DescribeWorker") - - ts.NotNil(workerInfo.WorkflowTaskSlotsInfo) - ts.Equal("ResourceBased", workerInfo.WorkflowTaskSlotsInfo.SlotSupplierKind) + // Wait for heartbeat with resource-based tuner info + var workerInfo *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && + workerInfo.WorkflowTaskSlotsInfo.SlotSupplierKind == "ResourceBased" + }, time.Second, 50*time.Millisecond, "Should find worker with ResourceBased slot supplier") ts.NotNil(workerInfo.ActivityTaskSlotsInfo) ts.Equal("ResourceBased", workerInfo.ActivityTaskSlotsInfo.SlotSupplierKind) @@ -978,20 +965,17 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatPlugins() { ts.NoError(err) ts.NoError(run.Get(ctx, nil)) - // Wait for heartbeat - time.Sleep(150 * time.Millisecond) - - workerInfo := ts.getWorkerInfo(ctx, ts.taskQueueName) - ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") - - // Verify plugin names are reported - ts.NotNil(workerInfo.Plugins) - ts.Len(workerInfo.Plugins, 2, "Should have 2 unique plugins (duplicates deduped)") + // Wait for heartbeat with plugin info + var workerInfo *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && len(workerInfo.Plugins) == 2 + }, time.Second, 50*time.Millisecond, "Should have 2 unique plugins (duplicates deduped)") pluginNames := make(map[string]bool) for _, plugin := range workerInfo.Plugins { pluginNames[plugin.Name] = true } - ts.True(pluginNames["test-client-plugin"], "Should contain client plugin") - ts.True(pluginNames["test-worker-plugin"], "Should contain worker plugin") + ts.True(pluginNames["test-client-plugin"]) + ts.True(pluginNames["test-worker-plugin"]) } diff --git a/worker/hostmetrics/hostmetrics.go b/worker/hostmetrics/hostmetrics.go index dafdc29c1..474c5f8d0 100644 
--- a/worker/hostmetrics/hostmetrics.go +++ b/worker/hostmetrics/hostmetrics.go @@ -13,7 +13,7 @@ import ( "go.temporal.io/sdk/log" ) -// PSUtilSystemInfoSupplier implements worker.HostMetricsProvider for system metrics. +// PSUtilSystemInfoSupplier implements worker.TunerHostMetricsProvider for system metrics. type PSUtilSystemInfoSupplier struct { mu sync.Mutex lastRefresh time.Time @@ -63,8 +63,10 @@ func (p *PSUtilSystemInfoSupplier) GetMemoryUsageWithLogger(logger log.Logger) ( if err := p.maybeRefresh(logger); err != nil { return 0, err } - if cgroupMem := p.cGroupInfo.GetLastMemUsage(); cgroupMem != 0 { - return cgroupMem, nil + if p.cGroupInfo != nil { + if cgroupMem := p.cGroupInfo.GetLastMemUsage(); cgroupMem != 0 { + return cgroupMem, nil + } } return p.lastMemStat.UsedPercent / 100, nil } diff --git a/worker/worker.go b/worker/worker.go index e961bb2ca..6895b2df6 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -237,12 +237,14 @@ type ( // ReplayWorkflowHistoryOptions are options for replaying a workflow. ReplayWorkflowHistoryOptions = internal.ReplayWorkflowHistoryOptions - // HostMetricsProvider provides host-level CPU and memory metrics for worker heartbeats. - // Implement this interface to provide custom metrics collection, or use the default - // implementation provided by the SDK in the worker/hostmetrics package. + // TunerHostMetricsProvider provides host-level CPU and memory metrics for worker heartbeats. + // This interface is typically implemented by a [WorkerTuner] to provide metrics from the same + // source used for tuning decisions, avoiding double-measurement. If the tuner passed to + // [Options] implements this interface, the SDK will automatically use it for heartbeat metrics. + // Alternatively, use the default implementation in the worker/hostmetrics package. 
// // NOTE: Experimental - HostMetricsProvider = internal.HostMetricsProvider + TunerHostMetricsProvider = internal.TunerHostMetricsProvider ) var _ WorkflowRegistry = (WorkflowReplayer)(nil) From 819080b2393eeaecf165b25e1b49160111473cfd Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 27 Jan 2026 13:55:13 -0800 Subject: [PATCH 04/30] Sort plugin names --- contrib/datadog/go.mod | 2 +- contrib/datadog/go.sum | 1 + contrib/resourcetuner/cgroups.go | 137 ++++++++++ .../resourcetuner}/cgroups_notlinux.go | 5 +- contrib/resourcetuner/go.mod | 4 +- contrib/resourcetuner/resourcetuner.go | 137 ++++++---- internal/internal_worker.go | 40 +-- internal/internal_worker_heartbeat.go | 12 +- internal/internal_worker_heartbeat_metrics.go | 171 +------------ internal/sysinfo/LICENSE | 30 --- internal/sysinfo/README.md | 9 - internal/sysinfo/common.go | 112 --------- internal/sysinfo/common_darwin.go | 61 ----- internal/sysinfo/cpu.go | 141 ----------- internal/sysinfo/cpu_darwin.go | 101 -------- internal/sysinfo/cpu_linux.go | 134 ---------- internal/sysinfo/cpu_unsupported.go | 18 -- internal/sysinfo/cpu_windows.go | 141 ----------- internal/sysinfo/mem.go | 50 ---- internal/sysinfo/mem_darwin.go | 76 ------ internal/sysinfo/mem_linux.go | 100 -------- internal/sysinfo/mem_unsupported.go | 18 -- internal/sysinfo/mem_windows.go | 50 ---- .../sysinfo/scripts/compare_with_gopsutil.sh | 236 ------------------ internal/worker.go | 2 +- worker/hostmetrics/cgroups.go | 165 ------------ worker/hostmetrics/hostmetrics.go | 122 --------- worker/hostmetrics/hostmetrics_test.go | 49 ---- .../scripts/compare_with_containerd.sh | 153 ------------ worker/worker.go | 2 +- 30 files changed, 272 insertions(+), 2007 deletions(-) create mode 100644 contrib/resourcetuner/cgroups.go rename {worker/hostmetrics => contrib/resourcetuner}/cgroups_notlinux.go (87%) delete mode 100644 internal/sysinfo/LICENSE delete mode 100644 internal/sysinfo/README.md delete mode 100644 internal/sysinfo/common.go 
delete mode 100644 internal/sysinfo/common_darwin.go delete mode 100644 internal/sysinfo/cpu.go delete mode 100644 internal/sysinfo/cpu_darwin.go delete mode 100644 internal/sysinfo/cpu_linux.go delete mode 100644 internal/sysinfo/cpu_unsupported.go delete mode 100644 internal/sysinfo/cpu_windows.go delete mode 100644 internal/sysinfo/mem.go delete mode 100644 internal/sysinfo/mem_darwin.go delete mode 100644 internal/sysinfo/mem_linux.go delete mode 100644 internal/sysinfo/mem_unsupported.go delete mode 100644 internal/sysinfo/mem_windows.go delete mode 100755 internal/sysinfo/scripts/compare_with_gopsutil.sh delete mode 100644 worker/hostmetrics/cgroups.go delete mode 100644 worker/hostmetrics/hostmetrics.go delete mode 100644 worker/hostmetrics/hostmetrics_test.go delete mode 100755 worker/hostmetrics/scripts/compare_with_containerd.sh diff --git a/contrib/datadog/go.mod b/contrib/datadog/go.mod index 6d1653dc4..4406f3f53 100644 --- a/contrib/datadog/go.mod +++ b/contrib/datadog/go.mod @@ -22,7 +22,7 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dustin/go-humanize v1.0.1 // indirect - github.com/ebitengine/purego v0.5.0 // indirect + github.com/ebitengine/purego v0.9.1 // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/mock v1.6.0 // indirect diff --git a/contrib/datadog/go.sum b/contrib/datadog/go.sum index a6e877d5f..c3f65cf00 100644 --- a/contrib/datadog/go.sum +++ b/contrib/datadog/go.sum @@ -32,6 +32,7 @@ github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+m github.com/dvyukov/go-fuzz v0.0.0-20210103155950-6a8e9d1f2415/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw= github.com/ebitengine/purego v0.5.0 h1:JrMGKfRIAM4/QVKaesIIT7m/UVjTj5GYhRSQYwfVdpo= github.com/ebitengine/purego v0.5.0/go.mod 
h1:ah1In8AOtksoNK6yk5z1HTJeUkC1Ez4Wk2idgGslMwQ= +github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a/go.mod h1:7Ga40egUymuWXxAe151lTNnCv97MddSOVsjpPPkityA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= diff --git a/contrib/resourcetuner/cgroups.go b/contrib/resourcetuner/cgroups.go new file mode 100644 index 000000000..f6615296f --- /dev/null +++ b/contrib/resourcetuner/cgroups.go @@ -0,0 +1,137 @@ +//go:build linux + +package resourcetuner + +import ( + "errors" + "fmt" + "io/fs" + "os" + "strconv" + "strings" + "time" + + "github.com/containerd/cgroups/v3/cgroup2" + "github.com/containerd/cgroups/v3/cgroup2/stats" +) + +func newCGroupInfo() cGroupInfo { + return &cGroupInfoImpl{} +} + +type cGroupInfoImpl struct { + lastCGroupMemStat *stats.MemoryStat + cgroupCpuCalc cgroupCpuCalc +} + +func (p *cGroupInfoImpl) Update() (bool, error) { + err := p.updateCGroupStats() + // Stop updates if not in a container. No need to return the error and log it. 
+ if !errors.Is(err, fs.ErrNotExist) { + return false, nil + } else if err != nil { + return true, err + } + return true, nil +} + +func (p *cGroupInfoImpl) GetLastMemUsage() float64 { + if p.lastCGroupMemStat != nil { + return float64(p.lastCGroupMemStat.Usage) / float64(p.lastCGroupMemStat.UsageLimit) + } + return 0 +} + +func (p *cGroupInfoImpl) GetLastCPUUsage() float64 { + return p.cgroupCpuCalc.lastCalculatedPercent +} + +func (p *cGroupInfoImpl) updateCGroupStats() error { + control, err := cgroup2.Load("/") + if err != nil { + return fmt.Errorf("failed to get cgroup mem stats %v", err) + } + metrics, err := control.Stat() + if err != nil { + return fmt.Errorf("failed to get cgroup mem stats %v", err) + } + // Only update if a limit has been set + if metrics.Memory.UsageLimit != 0 { + p.lastCGroupMemStat = metrics.Memory + } + + err = p.cgroupCpuCalc.updateCpuUsage(metrics) + if err != nil { + return fmt.Errorf("failed to get cgroup cpu usage %v", err) + } + return nil +} + +type cgroupCpuCalc struct { + lastRefresh time.Time + lastCpuUsage uint64 + lastCalculatedPercent float64 +} + +func (p *cgroupCpuCalc) updateCpuUsage(metrics *stats.Metrics) error { + // Read CPU quota and period from cpu.max + cpuQuota, cpuPeriod, err := readCpuMax("/sys/fs/cgroup/cpu.max") + // We might simply be in a container with an unset cpu.max in which case we don't want to error + if err == nil { + // CPU usage calculation based on delta + currentCpuUsage := metrics.CPU.UsageUsec + now := time.Now() + + if p.lastCpuUsage == 0 || p.lastRefresh.IsZero() { + p.lastCpuUsage = currentCpuUsage + p.lastRefresh = now + return nil + } + + // Time passed between this and last check + timeDelta := now.Sub(p.lastRefresh).Microseconds() // Convert to microseconds + + // Calculate CPU usage percentage based on the delta + cpuUsageDelta := float64(currentCpuUsage - p.lastCpuUsage) + + if cpuQuota > 0 { + p.lastCalculatedPercent = cpuUsageDelta * float64(cpuPeriod) / 
float64(cpuQuota*timeDelta) + } + + // Update for next call + p.lastCpuUsage = currentCpuUsage + p.lastRefresh = now + } + + return nil +} + +// readCpuMax reads the cpu.max file to get the CPU quota and period +func readCpuMax(path string) (quota int64, period int64, err error) { + data, err := os.ReadFile(path) + if err != nil { + return 0, 0, err + } + parts := strings.Fields(string(data)) + if len(parts) != 2 { + return 0, 0, errors.New("invalid format in cpu.max") + } + + // Parse the quota (first value) + if parts[0] == "max" { + quota = 0 // Unlimited quota + } else { + quota, err = strconv.ParseInt(parts[0], 10, 64) + if err != nil { + return 0, 0, err + } + } + + // Parse the period (second value) + period, err = strconv.ParseInt(parts[1], 10, 64) + if err != nil { + return 0, 0, err + } + + return quota, period, nil +} diff --git a/worker/hostmetrics/cgroups_notlinux.go b/contrib/resourcetuner/cgroups_notlinux.go similarity index 87% rename from worker/hostmetrics/cgroups_notlinux.go rename to contrib/resourcetuner/cgroups_notlinux.go index ca3d940d7..068e4220f 100644 --- a/worker/hostmetrics/cgroups_notlinux.go +++ b/contrib/resourcetuner/cgroups_notlinux.go @@ -1,6 +1,6 @@ //go:build !linux -package hostmetrics +package resourcetuner import "errors" @@ -8,7 +8,8 @@ func newCGroupInfo() cGroupInfo { return &cGroupInfoImpl{} } -type cGroupInfoImpl struct{} +type cGroupInfoImpl struct { +} func (p *cGroupInfoImpl) Update() (bool, error) { return false, errors.New("cgroup is not supported on this platform") diff --git a/contrib/resourcetuner/go.mod b/contrib/resourcetuner/go.mod index bc34753c4..dde1cc877 100644 --- a/contrib/resourcetuner/go.mod +++ b/contrib/resourcetuner/go.mod @@ -5,6 +5,8 @@ go 1.23.0 toolchain go1.23.6 require ( + github.com/containerd/cgroups/v3 v3.0.3 + github.com/shirou/gopsutil/v4 v4.24.8 github.com/stretchr/testify v1.10.0 go.einride.tech/pid v0.1.3 go.temporal.io/sdk v1.29.1 @@ -12,7 +14,6 @@ require ( require ( 
github.com/cilium/ebpf v0.11.0 // indirect - github.com/containerd/cgroups/v3 v3.0.3 // indirect github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect @@ -29,7 +30,6 @@ require ( github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/robfig/cron v1.2.0 // indirect - github.com/shirou/gopsutil/v4 v4.24.8 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/stretchr/objx v0.5.2 // indirect diff --git a/contrib/resourcetuner/resourcetuner.go b/contrib/resourcetuner/resourcetuner.go index cf4e9f5ae..1145d3cd7 100644 --- a/contrib/resourcetuner/resourcetuner.go +++ b/contrib/resourcetuner/resourcetuner.go @@ -3,14 +3,16 @@ package resourcetuner import ( "context" "errors" + "runtime" "sync" "time" + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/mem" "go.einride.tech/pid" "go.temporal.io/sdk/client" "go.temporal.io/sdk/log" "go.temporal.io/sdk/worker" - "go.temporal.io/sdk/worker/hostmetrics" ) // Metric names emitted by the resource-based tuner @@ -34,30 +36,12 @@ type ResourceBasedTunerOptions struct { WorkflowRampThrottle time.Duration } -// resourceBasedTuner wraps a WorkerTuner and implements TunerHostMetricsProvider -// so the SDK can reuse metrics instead of collecting them twice. -type resourceBasedTuner struct { - worker.WorkerTuner - hostMetrics *hostmetrics.PSUtilSystemInfoSupplier -} - -func (t *resourceBasedTuner) GetCpuUsage() (float64, error) { - return t.hostMetrics.GetCpuUsage() -} - -func (t *resourceBasedTuner) GetMemoryUsage() (float64, error) { - return t.hostMetrics.GetMemoryUsage() -} - // NewResourceBasedTuner creates a WorkerTuner that dynamically adjusts the number of slots based // on system resources. 
Specify the target CPU and memory usage as a value between 0 and 1. func NewResourceBasedTuner(opts ResourceBasedTunerOptions) (worker.WorkerTuner, error) { - hostMetrics := hostmetrics.NewPSUtilSystemInfoSupplier(nil) - options := DefaultResourceControllerOptions() options.MemTargetPercent = opts.TargetMem options.CpuTargetPercent = opts.TargetCpu - options.InfoSupplier = &hostMetricsInfoSupplier{provider: hostMetrics} controller := NewResourceController(options) wfSS := &ResourceBasedSlotSupplier{controller: controller, options: DefaultWorkflowResourceBasedSlotSupplierOptions()} @@ -88,23 +72,7 @@ func NewResourceBasedTuner(opts ResourceBasedTunerOptions) (worker.WorkerTuner, if err != nil { return nil, err } - return &resourceBasedTuner{ - WorkerTuner: compositeTuner, - hostMetrics: hostMetrics, - }, nil -} - -// hostMetricsInfoSupplier adapts hostmetrics.PSUtilSystemInfoSupplier to SystemInfoSupplier -type hostMetricsInfoSupplier struct { - provider *hostmetrics.PSUtilSystemInfoSupplier -} - -func (s *hostMetricsInfoSupplier) GetMemoryUsage(ctx *SystemInfoContext) (float64, error) { - return s.provider.GetMemoryUsageWithLogger(ctx.Logger) -} - -func (s *hostMetricsInfoSupplier) GetCpuUsage(ctx *SystemInfoContext) (float64, error) { - return s.provider.GetCpuUsageWithLogger(ctx.Logger) + return compositeTuner, nil } // ResourceBasedSlotSupplierOptions configures a particular ResourceBasedSlotSupplier. @@ -214,9 +182,6 @@ func (r *ResourceBasedSlotSupplier) ReleaseSlot(worker.SlotReleaseInfo) {} func (r *ResourceBasedSlotSupplier) MaxSlots() int { return 0 } -func (r *ResourceBasedSlotSupplier) Kind() string { - return "ResourceBased" -} // SystemInfoSupplier implementations provide information about system resources. type SystemInfoSupplier interface { @@ -290,11 +255,13 @@ type ResourceController struct { // the controller looks at overall system resources, multiple instances with different configs can // only conflict with one another. 
func NewResourceController(options ResourceControllerOptions) *ResourceController { - infoSupplier := options.InfoSupplier - if infoSupplier == nil { - infoSupplier = &hostMetricsInfoSupplier{ - provider: hostmetrics.NewPSUtilSystemInfoSupplier(nil), + var infoSupplier SystemInfoSupplier + if options.InfoSupplier == nil { + infoSupplier = &psUtilSystemInfoSupplier{ + cGroupInfo: newCGroupInfo(), } + } else { + infoSupplier = options.InfoSupplier } return &ResourceController{ options: options, @@ -362,3 +329,87 @@ func (rc *ResourceController) publishResourceMetrics(metricsHandler client.Metri metricsHandler.Gauge(resourceSlotsMemUsage).Update(memUsage * 100) metricsHandler.Gauge(resourceSlotsCPUUsage).Update(cpuUsage * 100) } + +type psUtilSystemInfoSupplier struct { + logger log.Logger + mu sync.Mutex + lastRefresh time.Time + + lastMemStat *mem.VirtualMemoryStat + lastCpuUsage float64 + + stopTryingToGetCGroupInfo bool + cGroupInfo cGroupInfo +} + +type cGroupInfo interface { + // Update requests an update of the cgroup stats. This is a no-op if not in a cgroup. Returns + // true if cgroup stats should continue to be updated, false if not in a cgroup or the returned + // error is considered unrecoverable. + Update() (bool, error) + // GetLastMemUsage returns last known memory usage as a fraction of the cgroup limit. 0 if not + // in a cgroup or limit is not set. + GetLastMemUsage() float64 + // GetLastCPUUsage returns last known CPU usage as a fraction of the cgroup limit. 0 if not in a + // cgroup or limit is not set. 
+ GetLastCPUUsage() float64 +} + +func (p *psUtilSystemInfoSupplier) GetMemoryUsage(infoContext *SystemInfoContext) (float64, error) { + if err := p.maybeRefresh(infoContext); err != nil { + return 0, err + } + lastCGroupMem := p.cGroupInfo.GetLastMemUsage() + if lastCGroupMem != 0 { + return lastCGroupMem, nil + } + return p.lastMemStat.UsedPercent / 100, nil +} + +func (p *psUtilSystemInfoSupplier) GetCpuUsage(infoContext *SystemInfoContext) (float64, error) { + if err := p.maybeRefresh(infoContext); err != nil { + return 0, err + } + + lastCGroupCPU := p.cGroupInfo.GetLastCPUUsage() + if lastCGroupCPU != 0 { + return lastCGroupCPU, nil + } + return p.lastCpuUsage / 100, nil +} + +func (p *psUtilSystemInfoSupplier) maybeRefresh(infoContext *SystemInfoContext) error { + if time.Since(p.lastRefresh) < 100*time.Millisecond { + return nil + } + p.mu.Lock() + defer p.mu.Unlock() + // Double check refresh is still needed + if time.Since(p.lastRefresh) < 100*time.Millisecond { + return nil + } + ctx, cancelFn := context.WithTimeout(context.Background(), 1*time.Second) + defer cancelFn() + memStat, err := mem.VirtualMemoryWithContext(ctx) + if err != nil { + return err + } + cpuUsage, err := cpu.PercentWithContext(ctx, 0, false) + if err != nil { + return err + } + + p.lastMemStat = memStat + p.lastCpuUsage = cpuUsage[0] + + if runtime.GOOS == "linux" && !p.stopTryingToGetCGroupInfo { + continueUpdates, err := p.cGroupInfo.Update() + if err != nil { + infoContext.Logger.Warn("Failed to get cgroup stats", "error", err) + } + p.stopTryingToGetCGroupInfo = !continueUpdates + } + + p.lastRefresh = time.Now() + return nil +} diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 265e1c02d..fd886c139 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -14,6 +14,7 @@ import ( "os" "reflect" "runtime" + "sort" "strconv" "strings" "sync" @@ -39,7 +40,6 @@ import ( "go.temporal.io/sdk/internal/common/util" ilog 
"go.temporal.io/sdk/internal/log" "go.temporal.io/sdk/log" - "go.temporal.io/sdk/worker/hostmetrics" ) const ( @@ -239,7 +239,7 @@ type ( workflowTaskSlotSupplier *trackingSlotSupplier activityTaskSlotSupplier *trackingSlotSupplier localActivitySlotSupplier *trackingSlotSupplier - nexusTaskSlotSupplier *trackingSlotSupplier + nexusTaskSlotSupplier *trackingSlotSupplier // TODO: nexus worker only gets started when worker is started, need to find a way to send kind over to heartbeat callback // Host metrics provider for CPU/memory reporting in heartbeats hostMetricsProvider TunerHostMetricsProvider @@ -1201,6 +1201,7 @@ type AggregatedWorker struct { pluginRegistryOptions *WorkerPluginConfigureWorkerRegistryOptions // Never nil workerHeartbeatManager *workerHeartbeatManager + heartbeatCallback func() *workerpb.WorkerHeartbeat } // RegisterWorkflow registers workflow implementation with the AggregatedWorker @@ -1514,21 +1515,14 @@ func (aw *AggregatedWorker) registerHeartbeatWorker() error { if aw.client.heartbeatManager == nil { return nil } - return aw.client.heartbeatManager.RegisterWorker( - aw.executionParams.Namespace, - aw.workerInstanceKey, - aw.workerHeartbeatManager.heartbeatCallback, - ) + return aw.client.heartbeatManager.RegisterWorker(aw) } func (aw *AggregatedWorker) unregisterHeartbeatWorker() { if aw.client.heartbeatManager == nil || aw.workerHeartbeatManager == nil { return } - aw.client.heartbeatManager.UnregisterWorker( - aw.executionParams.Namespace, - aw.workerInstanceKey, - ) + aw.client.heartbeatManager.UnregisterWorker(aw) } // shutdownWorker sends a ShutdownWorker RPC to notify the server that this worker is shutting down. @@ -2260,13 +2254,11 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke }) } - // Initialize host metrics provider for CPU/memory reporting. - // If the tuner implements TunerHostMetricsProvider, use it to avoid double-measurement of system. 
+ // If the tuner implements TunerHostMetricsProvider, use it for CPU/memory reporting in heartbeats. + // Otherwise, heartbeats will report 0 for CPU/memory usage. var hostMetricsProvider TunerHostMetricsProvider if provider, ok := options.Tuner.(TunerHostMetricsProvider); ok { hostMetricsProvider = provider - } else if client.workerHeartbeatInterval != 0 { - hostMetricsProvider = hostmetrics.NewPSUtilSystemInfoSupplier(workerParams.Logger) } var heartbeatCallback func() *workerpb.WorkerHeartbeat @@ -2721,16 +2713,24 @@ func getMemUsage(provider TunerHostMetricsProvider) float32 { // deduplicates them, and returns a slice of PluginInfo for heartbeat reporting. func collectPluginInfos(clientPluginNames []string, workerPlugins []WorkerPlugin) []*workerpb.PluginInfo { set := make(map[string]struct{}, len(clientPluginNames)+len(workerPlugins)) + result := make([]*workerpb.PluginInfo, 0, len(clientPluginNames)+len(workerPlugins)) for _, name := range clientPluginNames { + if _, found := set[name]; !found { + set[name] = struct{}{} + result = append(result, &workerpb.PluginInfo{Name: name}) + } set[name] = struct{}{} } for _, plugin := range workerPlugins { - set[plugin.Name()] = struct{}{} + if _, found := set[plugin.Name()]; !found { + set[plugin.Name()] = struct{}{} + result = append(result, &workerpb.PluginInfo{Name: plugin.Name()}) + } } - result := make([]*workerpb.PluginInfo, 0, len(set)) - for name := range set { - result = append(result, &workerpb.PluginInfo{Name: name}) - } + sort.Slice(result, func(i, j int) bool { + return result[i].Name < result[j].Name + }) + return result } diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 2cf4ade2a..5d3b5daa8 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -37,13 +37,12 @@ func NewHeartbeatManager(client *WorkflowClient, interval time.Duration, logger // RegisterWorker registers a worker's heartbeat callback with the shared 
heartbeat worker for the namespace. func (m *HeartbeatManager) RegisterWorker( - namespace string, - workerInstanceKey string, - callback func() *workerpb.WorkerHeartbeat, + worker *AggregatedWorker, ) error { m.mu.Lock() defer m.mu.Unlock() + namespace := worker.executionParams.Namespace hw, ok := m.workers[namespace] if !ok { capabilities, err := m.client.loadNamespaceCapabilities(context.Background()) @@ -77,7 +76,7 @@ func (m *HeartbeatManager) RegisterWorker( } hw.mu.Lock() - hw.callbacks[workerInstanceKey] = callback + hw.callbacks[worker.workerInstanceKey] = worker.heartbeatCallback hw.mu.Unlock() return nil @@ -85,17 +84,18 @@ func (m *HeartbeatManager) RegisterWorker( // UnregisterWorker removes a worker's heartbeat callback. If no callbacks remain for the namespace, // the shared heartbeat worker is stopped. -func (m *HeartbeatManager) UnregisterWorker(namespace, workerInstanceKey string) { +func (m *HeartbeatManager) UnregisterWorker(worker *AggregatedWorker) { m.mu.Lock() defer m.mu.Unlock() + namespace := worker.executionParams.Namespace hw, ok := m.workers[namespace] if !ok { return } hw.mu.Lock() - delete(hw.callbacks, workerInstanceKey) + delete(hw.callbacks, worker.workerInstanceKey) remaining := len(hw.callbacks) hw.mu.Unlock() diff --git a/internal/internal_worker_heartbeat_metrics.go b/internal/internal_worker_heartbeat_metrics.go index b71423bee..8906aaa06 100644 --- a/internal/internal_worker_heartbeat_metrics.go +++ b/internal/internal_worker_heartbeat_metrics.go @@ -324,121 +324,6 @@ func buildPollerInfo(currentPollers int32, lastSuccessfulPollTime time.Time, pol } } -// GetStickyCacheHit returns the total number of sticky cache hits. -func (h *HeartbeatMetricsHandler) GetStickyCacheHit() int32 { - return int32(h.metrics[metricStickyCacheHit].Load()) -} - -// GetStickyCacheMiss returns the total number of sticky cache misses. 
-func (h *HeartbeatMetricsHandler) GetStickyCacheMiss() int32 { - return int32(h.metrics[metricStickyCacheMiss].Load()) -} - -// GetStickyCacheSize returns the current sticky cache size. -func (h *HeartbeatMetricsHandler) GetStickyCacheSize() int32 { - return int32(h.metrics[metricStickyCacheSize].Load()) -} - -// GetWorkflowTaskFailures returns the total number of workflow task failures. -func (h *HeartbeatMetricsHandler) GetWorkflowTaskFailures() int64 { - return int64(h.metrics[metricWorkflowTaskFailures].Load()) -} - -// GetActivityTaskFailures returns the total number of activity task failures. -func (h *HeartbeatMetricsHandler) GetActivityTaskFailures() int64 { - return int64(h.metrics[metricActivityTaskFailures].Load()) -} - -// GetLocalActivityTaskFailures returns the total number of local activity task failures. -func (h *HeartbeatMetricsHandler) GetLocalActivityTaskFailures() int64 { - return int64(h.metrics[metricLocalActivityTaskFailures].Load()) -} - -// GetNexusTaskFailures returns the total number of nexus task failures. -func (h *HeartbeatMetricsHandler) GetNexusTaskFailures() int64 { - return int64(h.metrics[metricNexusTaskFailures].Load()) -} - -// GetWorkflowSlotsAvailable returns the current workflow slots available. -func (h *HeartbeatMetricsHandler) GetWorkflowSlotsAvailable() int32 { - return int32(h.metrics[metricWorkflowSlotsAvailable].Load()) -} - -// GetWorkflowSlotsUsed returns the current workflow slots used. -func (h *HeartbeatMetricsHandler) GetWorkflowSlotsUsed() int32 { - return int32(h.metrics[metricWorkflowSlotsUsed].Load()) -} - -// GetActivitySlotsAvailable returns the current activity slots available. -func (h *HeartbeatMetricsHandler) GetActivitySlotsAvailable() int32 { - return int32(h.metrics[metricActivitySlotsAvailable].Load()) -} - -// GetActivitySlotsUsed returns the current activity slots used. 
-func (h *HeartbeatMetricsHandler) GetActivitySlotsUsed() int32 { - return int32(h.metrics[metricActivitySlotsUsed].Load()) -} - -// GetLocalActivitySlotsAvailable returns the current local activity slots available. -func (h *HeartbeatMetricsHandler) GetLocalActivitySlotsAvailable() int32 { - return int32(h.metrics[metricLocalActivitySlotsAvailable].Load()) -} - -// GetLocalActivitySlotsUsed returns the current local activity slots used. -func (h *HeartbeatMetricsHandler) GetLocalActivitySlotsUsed() int32 { - return int32(h.metrics[metricLocalActivitySlotsUsed].Load()) -} - -// GetNexusSlotsAvailable returns the current nexus slots available. -func (h *HeartbeatMetricsHandler) GetNexusSlotsAvailable() int32 { - return int32(h.metrics[metricNexusSlotsAvailable].Load()) -} - -// GetNexusSlotsUsed returns the current nexus slots used. -func (h *HeartbeatMetricsHandler) GetNexusSlotsUsed() int32 { - return int32(h.metrics[metricNexusSlotsUsed].Load()) -} - -// GetWorkflowTasksProcessed returns the total number of workflow tasks processed. -func (h *HeartbeatMetricsHandler) GetWorkflowTasksProcessed() int64 { - return int64(h.metrics[metricWorkflowTasksProcessed].Load()) -} - -// GetActivityTasksProcessed returns the total number of activity tasks processed. -func (h *HeartbeatMetricsHandler) GetActivityTasksProcessed() int64 { - return int64(h.metrics[metricActivityTasksProcessed].Load()) -} - -// GetLocalActivityTasksProcessed returns the total number of local activity tasks processed. -func (h *HeartbeatMetricsHandler) GetLocalActivityTasksProcessed() int64 { - return int64(h.metrics[metricLocalActivityTasksProcessed].Load()) -} - -// GetNexusTasksProcessed returns the total number of nexus tasks processed. -func (h *HeartbeatMetricsHandler) GetNexusTasksProcessed() int64 { - return int64(h.metrics[metricNexusTasksProcessed].Load()) -} - -// GetWorkflowPollerCount returns the current number of workflow task pollers. 
-func (h *HeartbeatMetricsHandler) GetWorkflowPollerCount() int32 { - return int32(h.metrics[metricWorkflowPollerCount].Load()) -} - -// GetWorkflowStickyPollerCount returns the current number of workflow sticky task pollers. -func (h *HeartbeatMetricsHandler) GetWorkflowStickyPollerCount() int32 { - return int32(h.metrics[metricWorkflowStickyPollerCount].Load()) -} - -// GetActivityPollerCount returns the current number of activity task pollers. -func (h *HeartbeatMetricsHandler) GetActivityPollerCount() int32 { - return int32(h.metrics[metricActivityPollerCount].Load()) -} - -// GetNexusPollerCount returns the current number of nexus task pollers. -func (h *HeartbeatMetricsHandler) GetNexusPollerCount() int32 { - return int32(h.metrics[metricNexusPollerCount].Load()) -} - // RecordWorkflowPollSuccess records a successful workflow task poll. func (h *HeartbeatMetricsHandler) RecordWorkflowPollSuccess() { h.metrics[metricWorkflowLastPoll].Store(uint64(time.Now().UnixNano())) @@ -459,67 +344,23 @@ func (h *HeartbeatMetricsHandler) RecordNexusPollSuccess() { h.metrics[metricNexusLastPoll].Store(uint64(time.Now().UnixNano())) } -// GetWorkflowLastPollTime returns the last successful workflow task poll time. -func (h *HeartbeatMetricsHandler) GetWorkflowLastPollTime() time.Time { - nanos := h.metrics[metricWorkflowLastPoll].Load() - if nanos == 0 { - return time.Time{} - } - return time.Unix(0, int64(nanos)) -} - -// GetWorkflowStickyLastPollTime returns the last successful workflow sticky task poll time. -func (h *HeartbeatMetricsHandler) GetWorkflowStickyLastPollTime() time.Time { - nanos := h.metrics[metricWorkflowStickyLastPoll].Load() - if nanos == 0 { - return time.Time{} - } - return time.Unix(0, int64(nanos)) -} - -// GetActivityLastPollTime returns the last successful activity task poll time. 
-func (h *HeartbeatMetricsHandler) GetActivityLastPollTime() time.Time { - nanos := h.metrics[metricActivityLastPoll].Load() - if nanos == 0 { - return time.Time{} - } - return time.Unix(0, int64(nanos)) -} - -// GetNexusLastPollTime returns the last successful nexus task poll time. -func (h *HeartbeatMetricsHandler) GetNexusLastPollTime() time.Time { - nanos := h.metrics[metricNexusLastPoll].Load() - if nanos == 0 { - return time.Time{} - } - return time.Unix(0, int64(nanos)) -} - -// PollSuccessRecorder is an optional interface for recording successful poll times. -type PollSuccessRecorder interface { - RecordWorkflowPollSuccess() - RecordWorkflowStickyPollSuccess() - RecordActivityPollSuccess() - RecordNexusPollSuccess() -} - -// RecordPollSuccess records a successful poll time if the handler supports it. +// RecordPollSuccess records a successful poll time if the handler is a *HeartbeatMetricsHandler. // pollerType should be one of PollerTypeWorkflowTask, PollerTypeWorkflowStickyTask, // PollerTypeActivityTask, or PollerTypeNexusTask. 
func RecordPollSuccess(h metrics.Handler, pollerType string) { - recorder, ok := h.(PollSuccessRecorder) + hm, ok := h.(*HeartbeatMetricsHandler) if !ok { return } switch pollerType { case metrics.PollerTypeWorkflowTask: - recorder.RecordWorkflowPollSuccess() + hm.RecordWorkflowPollSuccess() case metrics.PollerTypeWorkflowStickyTask: - recorder.RecordWorkflowStickyPollSuccess() + hm.RecordWorkflowStickyPollSuccess() case metrics.PollerTypeActivityTask: - recorder.RecordActivityPollSuccess() + hm.RecordActivityPollSuccess() case metrics.PollerTypeNexusTask: - recorder.RecordNexusPollSuccess() + hm.RecordNexusPollSuccess() } } diff --git a/internal/sysinfo/LICENSE b/internal/sysinfo/LICENSE deleted file mode 100644 index ca62f1f51..000000000 --- a/internal/sysinfo/LICENSE +++ /dev/null @@ -1,30 +0,0 @@ -This package contains code derived from gopsutil: -https://github.com/shirou/gopsutil - -gopsutil is distributed under BSD license reproduced below. - -Copyright (c) 2014, WAKAYAMA Shirou -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of the gopsutil authors nor the names of its contributors - may be used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/internal/sysinfo/README.md b/internal/sysinfo/README.md deleted file mode 100644 index 86c6a1ca1..000000000 --- a/internal/sysinfo/README.md +++ /dev/null @@ -1,9 +0,0 @@ -This package is vendored based off of the [gopsutil](https://github.com/shirou/gopsutil) -package, where we've stripped everything except the CPU and mem measuring functionality. -We also only need to support Darwin, Linux, and Windows measurements, as those are -the platforms the SDK itself supports. `LICENSE` has been included in this directory -to honor the BSD license of gopsutil. - -When making changes to update with upstream, use the `scripts/compare_with_gopsutil.sh` -to compare the results of the vendored package with using the library directly. -CI also runs this script to ensure there are no unexpected discrepancies. diff --git a/internal/sysinfo/common.go b/internal/sysinfo/common.go deleted file mode 100644 index 7026956d9..000000000 --- a/internal/sysinfo/common.go +++ /dev/null @@ -1,112 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
-package sysinfo - -import ( - "bufio" - "context" - "errors" - "io" - "os" - "path/filepath" - "strings" - "time" -) - -var ErrNotImplemented = errors.New("not implemented on this platform") - -var Timeout = 3 * time.Second - -// EnvKey is the type for context keys used to pass environment variables. -type EnvKey string - -// EnvKeyType is the type alias for environment variable keys. -type EnvKeyType = string - -// EnvMap is the type alias for environment variable maps. -type EnvMap = map[EnvKeyType]string - -// ReadLines reads contents from a file and splits them by new lines. -func ReadLines(filename string) ([]string, error) { - return ReadLinesOffsetN(filename, 0, -1) -} - -// ReadLinesOffsetN reads contents from file and splits them by new line. -// The offset tells at which line number to start. -// The count determines the number of lines to read (starting from offset): -// n >= 0: at most n lines -// n < 0: whole file -func ReadLinesOffsetN(filename string, offset uint, n int) ([]string, error) { - f, err := os.Open(filename) - if err != nil { - return []string{""}, err - } - defer f.Close() - - var ret []string - - r := bufio.NewReader(f) - for i := uint(0); i < uint(n)+offset || n < 0; i++ { - line, err := r.ReadString('\n') - if err != nil { - if err == io.EOF && line != "" { - ret = append(ret, strings.Trim(line, "\n")) - } - break - } - if i < offset { - continue - } - ret = append(ret, strings.Trim(line, "\n")) - } - - return ret, nil -} - -// GetEnvWithContext retrieves the environment variable key. -// If it does not exist it returns the default. 
-func GetEnvWithContext(ctx context.Context, key string, dfault string, combineWith ...string) string { - var value string - if env, ok := ctx.Value(EnvKey("env")).(EnvMap); ok { - value = env[key] - } - if value == "" { - value = os.Getenv(key) - } - if value == "" { - value = dfault - } - - return combine(value, combineWith) -} - -func combine(value string, combineWith []string) string { - switch len(combineWith) { - case 0: - return value - case 1: - return filepath.Join(value, combineWith[0]) - default: - all := make([]string, len(combineWith)+1) - all[0] = value - copy(all[1:], combineWith) - return filepath.Join(all...) - } -} - -func HostProcWithContext(ctx context.Context, combineWith ...string) string { - return GetEnvWithContext(ctx, "HOST_PROC", "/proc", combineWith...) -} - -// Sleep sleeps for the specified duration, respecting context cancellation. -func Sleep(ctx context.Context, interval time.Duration) error { - timer := time.NewTimer(interval) - defer timer.Stop() - select { - case <-ctx.Done(): - return ctx.Err() - case <-timer.C: - return nil - } -} diff --git a/internal/sysinfo/common_darwin.go b/internal/sysinfo/common_darwin.go deleted file mode 100644 index e95fa65bb..000000000 --- a/internal/sysinfo/common_darwin.go +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
-//go:build darwin - -package sysinfo - -import ( - "github.com/ebitengine/purego" -) - -const ( - systemLibPath = "/usr/lib/libSystem.B.dylib" - - // mach/processor_info.h - processorCpuLoadInfo = 2 - - // mach/host_info.h - hostVMInfo = 2 - hostCpuLoadInfo = 3 - hostVMInfoCount = 0xf - - // Status codes - kernSuccess = 0 -) - -type systemLib struct { - handle uintptr - - hostProcessorInfo func(host uint32, flavor int32, outProcessorCount *uint32, - outProcessorInfo uintptr, outProcessorInfoCnt *uint32) int32 - hostStatistics func(host uint32, flavor int32, hostInfoOut uintptr, hostInfoOutCnt *uint32) int32 - machHostSelf func() uint32 - machTaskSelf func() uint32 - vmDeallocate func(targetTask uint32, vmAddress, vmSize uintptr) int32 -} - -func newSystemLib() (*systemLib, error) { - handle, err := purego.Dlopen(systemLibPath, purego.RTLD_LAZY|purego.RTLD_GLOBAL) - if err != nil { - return nil, err - } - - sys := &systemLib{handle: handle} - - purego.RegisterLibFunc(&sys.hostProcessorInfo, handle, "host_processor_info") - purego.RegisterLibFunc(&sys.hostStatistics, handle, "host_statistics") - purego.RegisterLibFunc(&sys.machHostSelf, handle, "mach_host_self") - purego.RegisterLibFunc(&sys.machTaskSelf, handle, "mach_task_self") - purego.RegisterLibFunc(&sys.vmDeallocate, handle, "vm_deallocate") - - return sys, nil -} - -func (s *systemLib) Dlsym(symbol string) (uintptr, error) { - return purego.Dlsym(s.handle, symbol) -} - -func (s *systemLib) close() { - purego.Dlclose(s.handle) -} diff --git a/internal/sysinfo/cpu.go b/internal/sysinfo/cpu.go deleted file mode 100644 index 7bd7967b8..000000000 --- a/internal/sysinfo/cpu.go +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
-package sysinfo - -import ( - "context" - "fmt" - "math" - "runtime" - "sync" - "time" -) - -// TimesStat contains the amounts of time the CPU has spent performing different -// kinds of work. Time units are in seconds. It is based on linux /proc/stat file. -type TimesStat struct { - CPU string `json:"cpu"` - User float64 `json:"user"` - System float64 `json:"system"` - Idle float64 `json:"idle"` - Nice float64 `json:"nice"` - Iowait float64 `json:"iowait"` - Irq float64 `json:"irq"` - Softirq float64 `json:"softirq"` - Steal float64 `json:"steal"` - Guest float64 `json:"guest"` - GuestNice float64 `json:"guestNice"` -} - -type lastPercent struct { - sync.Mutex - lastCPUTimes []TimesStat - lastPerCPUTimes []TimesStat -} - -var lastCPUPercent lastPercent - -func init() { - lastCPUPercent.Lock() - lastCPUPercent.lastCPUTimes, _ = Times(false) - lastCPUPercent.lastPerCPUTimes, _ = Times(true) - lastCPUPercent.Unlock() -} - -func (c TimesStat) Total() float64 { - total := c.User + c.System + c.Idle + c.Nice + c.Iowait + c.Irq + - c.Softirq + c.Steal + c.Guest + c.GuestNice - return total -} - -func getAllBusy(t TimesStat) (float64, float64) { - tot := t.Total() - if runtime.GOOS == "linux" { - tot -= t.Guest // Linux 2.6.24+ - tot -= t.GuestNice // Linux 3.2.0+ - } - busy := tot - t.Idle - t.Iowait - return tot, busy -} - -func calculateBusy(t1, t2 TimesStat) float64 { - t1All, t1Busy := getAllBusy(t1) - t2All, t2Busy := getAllBusy(t2) - - if t2Busy <= t1Busy { - return 0 - } - if t2All <= t1All { - return 100 - } - return math.Min(100, math.Max(0, (t2Busy-t1Busy)/(t2All-t1All)*100)) -} - -func calculateAllBusy(t1, t2 []TimesStat) ([]float64, error) { - if len(t1) != len(t2) { - return nil, fmt.Errorf( - "received two CPU counts: %d != %d", - len(t1), len(t2), - ) - } - - ret := make([]float64, len(t1)) - for i, t := range t2 { - ret[i] = calculateBusy(t1[i], t) - } - return ret, nil -} - -// Percent calculates the percentage of cpu used either per CPU or combined. 
-// If an interval of 0 is given it will compare the current cpu times against the last call. -// Returns one value per cpu, or a single value if percpu is set to false. -func Percent(interval time.Duration, percpu bool) ([]float64, error) { - return PercentWithContext(context.Background(), interval, percpu) -} - -func PercentWithContext(ctx context.Context, interval time.Duration, percpu bool) ([]float64, error) { - if interval <= 0 { - return percentUsedFromLastCallWithContext(ctx, percpu) - } - - // Get CPU usage at the start of the interval. - cpuTimes1, err := TimesWithContext(ctx, percpu) - if err != nil { - return nil, err - } - - if err := Sleep(ctx, interval); err != nil { - return nil, err - } - - // And at the end of the interval. - cpuTimes2, err := TimesWithContext(ctx, percpu) - if err != nil { - return nil, err - } - - return calculateAllBusy(cpuTimes1, cpuTimes2) -} - -func percentUsedFromLastCallWithContext(ctx context.Context, percpu bool) ([]float64, error) { - cpuTimes, err := TimesWithContext(ctx, percpu) - if err != nil { - return nil, err - } - lastCPUPercent.Lock() - defer lastCPUPercent.Unlock() - var lastTimes []TimesStat - if percpu { - lastTimes = lastCPUPercent.lastPerCPUTimes - lastCPUPercent.lastPerCPUTimes = cpuTimes - } else { - lastTimes = lastCPUPercent.lastCPUTimes - lastCPUPercent.lastCPUTimes = cpuTimes - } - - if lastTimes == nil { - return nil, fmt.Errorf("error getting times for cpu percent. lastTimes was nil") - } - return calculateAllBusy(lastTimes, cpuTimes) -} diff --git a/internal/sysinfo/cpu_darwin.go b/internal/sysinfo/cpu_darwin.go deleted file mode 100644 index 09e2d3b79..000000000 --- a/internal/sysinfo/cpu_darwin.go +++ /dev/null @@ -1,101 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
-//go:build darwin - -package sysinfo - -import ( - "context" - "errors" - "fmt" - "unsafe" -) - -// mach/machine.h -const ( - cpuStateUser = 0 - cpuStateSystem = 1 - cpuStateIdle = 2 - cpuStateNice = 3 - cpuStateMax = 4 -) - -type hostCpuLoadInfoData struct { - cpuTicks [cpuStateMax]uint32 -} - -var ClocksPerSec = float64(100) - -func Times(percpu bool) ([]TimesStat, error) { - return TimesWithContext(context.Background(), percpu) -} - -func TimesWithContext(_ context.Context, percpu bool) ([]TimesStat, error) { - sys, err := newSystemLib() - if err != nil { - return nil, err - } - defer sys.close() - - if percpu { - return perCPUTimes(sys) - } - return allCPUTimes(sys) -} - -func perCPUTimes(sys *systemLib) ([]TimesStat, error) { - var count, ncpu uint32 - var cpuload *hostCpuLoadInfoData - - status := sys.hostProcessorInfo(sys.machHostSelf(), processorCpuLoadInfo, - &ncpu, uintptr(unsafe.Pointer(&cpuload)), &count) - - if status != kernSuccess { - return nil, fmt.Errorf("host_processor_info error=%d", status) - } - - if cpuload == nil { - return nil, errors.New("host_processor_info returned nil cpuload") - } - - defer sys.vmDeallocate(sys.machTaskSelf(), uintptr(unsafe.Pointer(cpuload)), uintptr(ncpu)) - - ret := []TimesStat{} - loads := unsafe.Slice(cpuload, ncpu) - - for i := 0; i < int(ncpu); i++ { - c := TimesStat{ - CPU: fmt.Sprintf("cpu%d", i), - User: float64(loads[i].cpuTicks[cpuStateUser]) / ClocksPerSec, - System: float64(loads[i].cpuTicks[cpuStateSystem]) / ClocksPerSec, - Nice: float64(loads[i].cpuTicks[cpuStateNice]) / ClocksPerSec, - Idle: float64(loads[i].cpuTicks[cpuStateIdle]) / ClocksPerSec, - } - ret = append(ret, c) - } - - return ret, nil -} - -func allCPUTimes(sys *systemLib) ([]TimesStat, error) { - var cpuload hostCpuLoadInfoData - count := uint32(cpuStateMax) - - status := sys.hostStatistics(sys.machHostSelf(), hostCpuLoadInfo, - uintptr(unsafe.Pointer(&cpuload)), &count) - - if status != kernSuccess { - return nil, 
fmt.Errorf("host_statistics error=%d", status) - } - - c := TimesStat{ - CPU: "cpu-total", - User: float64(cpuload.cpuTicks[cpuStateUser]) / ClocksPerSec, - System: float64(cpuload.cpuTicks[cpuStateSystem]) / ClocksPerSec, - Nice: float64(cpuload.cpuTicks[cpuStateNice]) / ClocksPerSec, - Idle: float64(cpuload.cpuTicks[cpuStateIdle]) / ClocksPerSec, - } - - return []TimesStat{c}, nil -} diff --git a/internal/sysinfo/cpu_linux.go b/internal/sysinfo/cpu_linux.go deleted file mode 100644 index 2feee4237..000000000 --- a/internal/sysinfo/cpu_linux.go +++ /dev/null @@ -1,134 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. -//go:build linux - -package sysinfo - -import ( - "context" - "errors" - "strconv" - "strings" -) - -// ClocksPerSec is the number of clock ticks per second. -// On Linux, this is typically 100 (USER_HZ). 
-var ClocksPerSec = float64(100) - -func Times(percpu bool) ([]TimesStat, error) { - return TimesWithContext(context.Background(), percpu) -} - -func TimesWithContext(ctx context.Context, percpu bool) ([]TimesStat, error) { - filename := HostProcWithContext(ctx, "stat") - lines := []string{} - if percpu { - statlines, err := ReadLines(filename) - if err != nil || len(statlines) < 2 { - return []TimesStat{}, nil - } - for _, line := range statlines[1:] { - if !strings.HasPrefix(line, "cpu") { - break - } - lines = append(lines, line) - } - } else { - var err error - lines, err = ReadLinesOffsetN(filename, 0, 1) - if err != nil || len(lines) == 0 { - return []TimesStat{}, nil - } - } - - ret := make([]TimesStat, 0, len(lines)) - - for _, line := range lines { - ct, err := parseStatLine(line) - if err != nil { - continue - } - ret = append(ret, *ct) - } - return ret, nil -} - -func parseStatLine(line string) (*TimesStat, error) { - fields := strings.Fields(line) - - if len(fields) < 8 { - return nil, errors.New("stat does not contain cpu info") - } - - if !strings.HasPrefix(fields[0], "cpu") { - return nil, errors.New("not contain cpu") - } - - cpu := fields[0] - if cpu == "cpu" { - cpu = "cpu-total" - } - user, err := strconv.ParseFloat(fields[1], 64) - if err != nil { - return nil, err - } - nice, err := strconv.ParseFloat(fields[2], 64) - if err != nil { - return nil, err - } - system, err := strconv.ParseFloat(fields[3], 64) - if err != nil { - return nil, err - } - idle, err := strconv.ParseFloat(fields[4], 64) - if err != nil { - return nil, err - } - iowait, err := strconv.ParseFloat(fields[5], 64) - if err != nil { - return nil, err - } - irq, err := strconv.ParseFloat(fields[6], 64) - if err != nil { - return nil, err - } - softirq, err := strconv.ParseFloat(fields[7], 64) - if err != nil { - return nil, err - } - - ct := &TimesStat{ - CPU: cpu, - User: user / ClocksPerSec, - Nice: nice / ClocksPerSec, - System: system / ClocksPerSec, - Idle: idle / 
ClocksPerSec, - Iowait: iowait / ClocksPerSec, - Irq: irq / ClocksPerSec, - Softirq: softirq / ClocksPerSec, - } - if len(fields) > 8 { // Linux >= 2.6.11 - steal, err := strconv.ParseFloat(fields[8], 64) - if err != nil { - return nil, err - } - ct.Steal = steal / ClocksPerSec - } - if len(fields) > 9 { // Linux >= 2.6.24 - guest, err := strconv.ParseFloat(fields[9], 64) - if err != nil { - return nil, err - } - ct.Guest = guest / ClocksPerSec - } - if len(fields) > 10 { // Linux >= 3.2.0 - guestNice, err := strconv.ParseFloat(fields[10], 64) - if err != nil { - return nil, err - } - ct.GuestNice = guestNice / ClocksPerSec - } - - return ct, nil -} diff --git a/internal/sysinfo/cpu_unsupported.go b/internal/sysinfo/cpu_unsupported.go deleted file mode 100644 index b848076ad..000000000 --- a/internal/sysinfo/cpu_unsupported.go +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. -//go:build !linux && !darwin && !windows - -package sysinfo - -import ( - "context" -) - -func Times(percpu bool) ([]TimesStat, error) { - return TimesWithContext(context.Background(), percpu) -} - -func TimesWithContext(ctx context.Context, percpu bool) ([]TimesStat, error) { - return nil, ErrNotImplemented -} diff --git a/internal/sysinfo/cpu_windows.go b/internal/sysinfo/cpu_windows.go deleted file mode 100644 index 133c05ded..000000000 --- a/internal/sysinfo/cpu_windows.go +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
-//go:build windows - -package sysinfo - -import ( - "context" - "fmt" - "unsafe" - - "golang.org/x/sys/windows" -) - -// SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION -// https://docs.microsoft.com/en-us/windows/desktop/api/winternl/nf-winternl-ntquerysysteminformation#system_processor_performance_information -type win32_SystemProcessorPerformanceInformation struct { - IdleTime int64 - KernelTime int64 - UserTime int64 - DpcTime int64 - InterruptTime int64 - InterruptCount uint64 -} - -const ( - ClocksPerSec = 10000000.0 - - // systemProcessorPerformanceInformationClass information class to query with NTQuerySystemInformation - // https://processhacker.sourceforge.io/doc/ntexapi_8h.html#ad5d815b48e8f4da1ef2eb7a2f18a54e0 - win32_SystemProcessorPerformanceInformationClass = 8 - - // size of systemProcessorPerformanceInfoSize in memory - win32_SystemProcessorPerformanceInfoSize = uint32(unsafe.Sizeof(win32_SystemProcessorPerformanceInformation{})) -) - -var ( - modkernel32 = windows.NewLazySystemDLL("kernel32.dll") - modNt = windows.NewLazySystemDLL("ntdll.dll") - procGetSystemTimes = modkernel32.NewProc("GetSystemTimes") - procNtQuerySystemInformation = modNt.NewProc("NtQuerySystemInformation") -) - -type fileTime struct { - dwLowDateTime uint32 - dwHighDateTime uint32 -} - -func Times(percpu bool) ([]TimesStat, error) { - return TimesWithContext(context.Background(), percpu) -} - -func TimesWithContext(_ context.Context, percpu bool) ([]TimesStat, error) { - if percpu { - return perCPUTimes() - } - - var ret []TimesStat - var lpIdleTime fileTime - var lpKernelTime fileTime - var lpUserTime fileTime - r, _, err := procGetSystemTimes.Call( - uintptr(unsafe.Pointer(&lpIdleTime)), - uintptr(unsafe.Pointer(&lpKernelTime)), - uintptr(unsafe.Pointer(&lpUserTime))) - if r == 0 { - return nil, err - } - - LOT := float64(0.0000001) - HIT := (LOT * 4294967296.0) - idle := ((HIT * float64(lpIdleTime.dwHighDateTime)) + (LOT * float64(lpIdleTime.dwLowDateTime))) - user := ((HIT * 
float64(lpUserTime.dwHighDateTime)) + (LOT * float64(lpUserTime.dwLowDateTime))) - kernel := ((HIT * float64(lpKernelTime.dwHighDateTime)) + (LOT * float64(lpKernelTime.dwLowDateTime))) - system := (kernel - idle) - - ret = append(ret, TimesStat{ - CPU: "cpu-total", - Idle: idle, - User: user, - System: system, - }) - return ret, nil -} - -func perCPUTimes() ([]TimesStat, error) { - var ret []TimesStat - stats, err := perfInfo() - if err != nil { - return nil, err - } - for core, v := range stats { - c := TimesStat{ - CPU: fmt.Sprintf("cpu%d", core), - User: float64(v.UserTime) / ClocksPerSec, - System: float64(v.KernelTime-v.IdleTime) / ClocksPerSec, - Idle: float64(v.IdleTime) / ClocksPerSec, - Irq: float64(v.InterruptTime) / ClocksPerSec, - } - ret = append(ret, c) - } - return ret, nil -} - -// makes call to Windows API function to retrieve performance information for each core -func perfInfo() ([]win32_SystemProcessorPerformanceInformation, error) { - // Make maxResults large for safety. - // We can't invoke the api call with a results array that's too small. - // If we have more than 2056 cores on a single host, then it's probably the future. - maxBuffer := 2056 - // buffer for results from the windows proc - resultBuffer := make([]win32_SystemProcessorPerformanceInformation, maxBuffer) - // size of the buffer in memory - bufferSize := uintptr(win32_SystemProcessorPerformanceInfoSize) * uintptr(maxBuffer) - // size of the returned response - var retSize uint32 - - // Invoke windows api proc. - // The returned err from the windows dll proc will always be non-nil even when successful. 
- // See https://godoc.org/golang.org/x/sys/windows#LazyProc.Call for more information - retCode, _, err := procNtQuerySystemInformation.Call( - win32_SystemProcessorPerformanceInformationClass, // System Information Class -> SystemProcessorPerformanceInformation - uintptr(unsafe.Pointer(&resultBuffer[0])), // pointer to first element in result buffer - bufferSize, // size of the buffer in memory - uintptr(unsafe.Pointer(&retSize)), // pointer to the size of the returned results the windows proc will set this - ) - - // check return code for errors - if retCode != 0 { - return nil, fmt.Errorf("call to NtQuerySystemInformation returned %d. err: %s", retCode, err.Error()) - } - - // calculate the number of returned elements based on the returned size - numReturnedElements := retSize / win32_SystemProcessorPerformanceInfoSize - - // trim results to the number of returned elements - resultBuffer = resultBuffer[:numReturnedElements] - - return resultBuffer, nil -} diff --git a/internal/sysinfo/mem.go b/internal/sysinfo/mem.go deleted file mode 100644 index 223257f6d..000000000 --- a/internal/sysinfo/mem.go +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. -package sysinfo - -// VirtualMemoryStat contains memory usage statistics. -type VirtualMemoryStat struct { - // Total amount of RAM on this system - Total uint64 `json:"total"` - - // RAM available for programs to allocate - Available uint64 `json:"available"` - - // RAM used by programs - Used uint64 `json:"used"` - - // Percentage of RAM used by programs - UsedPercent float64 `json:"usedPercent"` - - // This is the kernel's notion of free memory; RAM chips whose bits nobody - // cares about the value of right now. For a human consumable number, - // Available is what you really want. 
- Free uint64 `json:"free"` - - // OS X / BSD specific numbers: - // http://www.macyourself.com/2010/02/17/what-is-free-wired-active-and-inactive-system-memory-ram/ - Active uint64 `json:"active"` - Inactive uint64 `json:"inactive"` - Wired uint64 `json:"wired"` - - // Linux specific numbers - // https://blogs.oracle.com/linux/understanding-linux-kernel-memory-statistics - // https://www.kernel.org/doc/Documentation/filesystems/proc.txt - // https://www.kernel.org/doc/Documentation/vm/overcommit-accounting - // https://www.kernel.org/doc/Documentation/vm/transhuge.txt - // - Buffers uint64 `json:"buffers"` - Cached uint64 `json:"cached"` - WriteBack uint64 `json:"writeBack"` - Dirty uint64 `json:"dirty"` - WriteBackTmp uint64 `json:"writeBackTmp"` - Shared uint64 `json:"shared"` - Slab uint64 `json:"slab"` - Sreclaimable uint64 `json:"sreclaimable"` - Sunreclaim uint64 `json:"sunreclaim"` - PageTables uint64 `json:"pageTables"` - SwapCached uint64 `json:"swapCached"` - CommitLimit uint64 `json:"commitLimit"` - CommittedAS uint64 `json:"committedAS"` -} diff --git a/internal/sysinfo/mem_darwin.go b/internal/sysinfo/mem_darwin.go deleted file mode 100644 index 29d585570..000000000 --- a/internal/sysinfo/mem_darwin.go +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
-//go:build darwin - -package sysinfo - -import ( - "context" - "fmt" - "unsafe" - - "golang.org/x/sys/unix" -) - -type vmStatisticsData struct { - freeCount uint32 - activeCount uint32 - inactiveCount uint32 - wireCount uint32 - _ [44]byte -} - -func getHwMemsize() (uint64, error) { - total, err := unix.SysctlUint64("hw.memsize") - if err != nil { - return 0, err - } - return total, nil -} - -func VirtualMemory() (*VirtualMemoryStat, error) { - return VirtualMemoryWithContext(context.Background()) -} - -func VirtualMemoryWithContext(_ context.Context) (*VirtualMemoryStat, error) { - sys, err := newSystemLib() - if err != nil { - return nil, err - } - defer sys.close() - - count := uint32(hostVMInfoCount) - var vmstat vmStatisticsData - - status := sys.hostStatistics(sys.machHostSelf(), hostVMInfo, - uintptr(unsafe.Pointer(&vmstat)), &count) - - if status != kernSuccess { - return nil, fmt.Errorf("host_statistics error=%d", status) - } - - pageSizeAddr, _ := sys.Dlsym("vm_kernel_page_size") - pageSize := **(**uint64)(unsafe.Pointer(&pageSizeAddr)) - total, err := getHwMemsize() - if err != nil { - return nil, err - } - totalCount := uint32(total / pageSize) - - availableCount := vmstat.inactiveCount + vmstat.freeCount - usedPercent := 100 * float64(totalCount-availableCount) / float64(totalCount) - - usedCount := totalCount - availableCount - - return &VirtualMemoryStat{ - Total: total, - Available: pageSize * uint64(availableCount), - Used: pageSize * uint64(usedCount), - UsedPercent: usedPercent, - Free: pageSize * uint64(vmstat.freeCount), - Active: pageSize * uint64(vmstat.activeCount), - Inactive: pageSize * uint64(vmstat.inactiveCount), - Wired: pageSize * uint64(vmstat.wireCount), - }, nil -} diff --git a/internal/sysinfo/mem_linux.go b/internal/sysinfo/mem_linux.go deleted file mode 100644 index 3e6652884..000000000 --- a/internal/sysinfo/mem_linux.go +++ /dev/null @@ -1,100 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from 
github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. -//go:build linux - -package sysinfo - -import ( - "context" - "strconv" - "strings" -) - -func VirtualMemory() (*VirtualMemoryStat, error) { - return VirtualMemoryWithContext(context.Background()) -} - -func VirtualMemoryWithContext(ctx context.Context) (*VirtualMemoryStat, error) { - filename := HostProcWithContext(ctx, "meminfo") - lines, err := ReadLines(filename) - if err != nil { - return nil, err - } - - ret := &VirtualMemoryStat{} - var memAvailable, memFree, cached uint64 - memAvailablePresent := false - - for _, line := range lines { - fields := strings.Split(line, ":") - if len(fields) != 2 { - continue - } - key := strings.TrimSpace(fields[0]) - value := strings.TrimSpace(fields[1]) - value = strings.Replace(value, " kB", "", -1) - - v, err := strconv.ParseUint(value, 10, 64) - if err != nil { - continue - } - v *= 1024 // Convert kB to bytes - - switch key { - case "MemTotal": - ret.Total = v - case "MemFree": - memFree = v - ret.Free = v - case "MemAvailable": - memAvailablePresent = true - memAvailable = v - case "Buffers": - ret.Buffers = v - case "Cached": - cached = v - ret.Cached = v - case "Active": - ret.Active = v - case "Inactive": - ret.Inactive = v - case "Writeback": - ret.WriteBack = v - case "WritebackTmp": - ret.WriteBackTmp = v - case "Dirty": - ret.Dirty = v - case "Shmem": - ret.Shared = v - case "Slab": - ret.Slab = v - case "SReclaimable": - ret.Sreclaimable = v - case "SUnreclaim": - ret.Sunreclaim = v - case "PageTables": - ret.PageTables = v - case "SwapCached": - ret.SwapCached = v - case "CommitLimit": - ret.CommitLimit = v - case "Committed_AS": - ret.CommittedAS = v - } - } - - ret.Cached += ret.Sreclaimable - - // Calculate Available if not present (kernel < 3.14) - if memAvailablePresent { - ret.Available = memAvailable - } else { - ret.Available = memFree + cached - } - 
- ret.Used = ret.Total - ret.Available - ret.UsedPercent = float64(ret.Used) / float64(ret.Total) * 100.0 - - return ret, nil -} diff --git a/internal/sysinfo/mem_unsupported.go b/internal/sysinfo/mem_unsupported.go deleted file mode 100644 index 7c72a5dcd..000000000 --- a/internal/sysinfo/mem_unsupported.go +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. -//go:build !linux && !darwin && !windows - -package sysinfo - -import ( - "context" -) - -func VirtualMemory() (*VirtualMemoryStat, error) { - return VirtualMemoryWithContext(context.Background()) -} - -func VirtualMemoryWithContext(ctx context.Context) (*VirtualMemoryStat, error) { - return nil, ErrNotImplemented -} diff --git a/internal/sysinfo/mem_windows.go b/internal/sysinfo/mem_windows.go deleted file mode 100644 index e013e5337..000000000 --- a/internal/sysinfo/mem_windows.go +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// Derived from github.com/shirou/gopsutil/v4 (Copyright (c) 2014, WAKAYAMA Shirou) -// Modified to include only CPU percentage and memory usage for Linux/Darwin/Windows. 
-//go:build windows - -package sysinfo - -import ( - "context" - "unsafe" - - "golang.org/x/sys/windows" -) - -var procGlobalMemoryStatusEx = windows.NewLazySystemDLL("kernel32.dll").NewProc("GlobalMemoryStatusEx") - -type memoryStatusEx struct { - cbSize uint32 - dwMemoryLoad uint32 - ullTotalPhys uint64 - ullAvailPhys uint64 - ullTotalPageFile uint64 - ullAvailPageFile uint64 - ullTotalVirtual uint64 - ullAvailVirtual uint64 - ullAvailExtendedVirtual uint64 -} - -func VirtualMemory() (*VirtualMemoryStat, error) { - return VirtualMemoryWithContext(context.Background()) -} - -func VirtualMemoryWithContext(ctx context.Context) (*VirtualMemoryStat, error) { - var memInfo memoryStatusEx - memInfo.cbSize = uint32(unsafe.Sizeof(memInfo)) - mem, _, err := procGlobalMemoryStatusEx.Call(uintptr(unsafe.Pointer(&memInfo))) - if mem == 0 { - return nil, err - } - - ret := &VirtualMemoryStat{ - Total: memInfo.ullTotalPhys, - Available: memInfo.ullAvailPhys, - Free: memInfo.ullAvailPhys, - UsedPercent: float64(memInfo.dwMemoryLoad), - } - - ret.Used = ret.Total - ret.Available - return ret, nil -} diff --git a/internal/sysinfo/scripts/compare_with_gopsutil.sh b/internal/sysinfo/scripts/compare_with_gopsutil.sh deleted file mode 100755 index 64e473adc..000000000 --- a/internal/sysinfo/scripts/compare_with_gopsutil.sh +++ /dev/null @@ -1,236 +0,0 @@ -#!/bin/bash -# Compare internal/sysinfo implementation against gopsutil -# Usage: ./internal/sysinfo/scripts/compare_with_gopsutil.sh - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SYSINFO_DIR="$(dirname "$SCRIPT_DIR")" -REPO_ROOT="$(cd "$SYSINFO_DIR/../.." && pwd)" - -TEST_FILE="$SYSINFO_DIR/compare_test.go" - -cleanup() { - echo "Cleaning up..." - rm -f "$TEST_FILE" - cd "$REPO_ROOT" && go mod tidy 2>/dev/null - echo "Done." 
-} - -trap cleanup EXIT - -echo "=== Comparing internal/sysinfo against gopsutil ===" -echo "" - -# Create the comparison test file -cat > "$TEST_FILE" << 'TESTEOF' -//go:build compare_gopsutil - -package sysinfo_test - -import ( - "context" - "math" - "testing" - "time" - - gopsutil_cpu "github.com/shirou/gopsutil/v4/cpu" - gopsutil_mem "github.com/shirou/gopsutil/v4/mem" - "go.temporal.io/sdk/internal/sysinfo" -) - -func TestCPUTimesMatchGopsutil(t *testing.T) { - ctx := context.Background() - - t.Run("total", func(t *testing.T) { - gTimes, gErr := gopsutil_cpu.TimesWithContext(ctx, false) - sTimes, sErr := sysinfo.TimesWithContext(ctx, false) - - if gErr != nil || sErr != nil { - t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) - } - - if len(gTimes) != len(sTimes) { - t.Fatalf("length mismatch: gopsutil=%d, sysinfo=%d", len(gTimes), len(sTimes)) - } - - g, s := gTimes[0], sTimes[0] - t.Logf("gopsutil: CPU=%s User=%.4f System=%.4f Idle=%.4f Nice=%.4f Iowait=%.4f", - g.CPU, g.User, g.System, g.Idle, g.Nice, g.Iowait) - t.Logf("sysinfo: CPU=%s User=%.4f System=%.4f Idle=%.4f Nice=%.4f Iowait=%.4f", - s.CPU, s.User, s.System, s.Idle, s.Nice, s.Iowait) - - assertClose(t, "User", g.User, s.User, 0.01) - assertClose(t, "System", g.System, s.System, 0.01) - assertClose(t, "Idle", g.Idle, s.Idle, 0.01) - assertClose(t, "Nice", g.Nice, s.Nice, 0.01) - assertClose(t, "Iowait", g.Iowait, s.Iowait, 0.01) - }) - - t.Run("percpu", func(t *testing.T) { - gTimes, gErr := gopsutil_cpu.TimesWithContext(ctx, true) - sTimes, sErr := sysinfo.TimesWithContext(ctx, true) - - if gErr != nil || sErr != nil { - t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) - } - - if len(gTimes) != len(sTimes) { - t.Fatalf("length mismatch: gopsutil=%d, sysinfo=%d", len(gTimes), len(sTimes)) - } - - t.Logf("Found %d CPUs", len(gTimes)) - for i := range gTimes { - g, s := gTimes[i], sTimes[i] - if g.CPU != s.CPU { - t.Errorf("CPU[%d] name mismatch: gopsutil=%s, sysinfo=%s", i, g.CPU, 
s.CPU) - } - assertClose(t, "User", g.User, s.User, 0.01) - assertClose(t, "System", g.System, s.System, 0.01) - assertClose(t, "Idle", g.Idle, s.Idle, 0.01) - } - }) -} - -func TestCPUPercentMatchesGopsutil(t *testing.T) { - ctx := context.Background() - - t.Run("with_interval", func(t *testing.T) { - interval := 200 * time.Millisecond - - // Run both concurrently so they measure the same time window - var gPercent, sPercent []float64 - var gErr, sErr error - - done := make(chan struct{}) - go func() { - gPercent, gErr = gopsutil_cpu.PercentWithContext(ctx, interval, false) - done <- struct{}{} - }() - go func() { - sPercent, sErr = sysinfo.PercentWithContext(ctx, interval, false) - done <- struct{}{} - }() - <-done - <-done - - if gErr != nil || sErr != nil { - t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) - } - - if len(gPercent) != len(sPercent) { - t.Fatalf("length mismatch: gopsutil=%d, sysinfo=%d", len(gPercent), len(sPercent)) - } - - t.Logf("gopsutil CPU%%: %.2f", gPercent[0]) - t.Logf("sysinfo CPU%%: %.2f", sPercent[0]) - - // Allow some variance since measurements aren't perfectly synchronized - if math.Abs(gPercent[0]-sPercent[0]) > 5.0 { - t.Errorf("CPU percent differs by more than 5%%: gopsutil=%.2f, sysinfo=%.2f", - gPercent[0], sPercent[0]) - } - }) - - t.Run("without_interval", func(t *testing.T) { - gopsutil_cpu.PercentWithContext(ctx, 0, false) - sysinfo.PercentWithContext(ctx, 0, false) - - time.Sleep(50 * time.Millisecond) - - gPercent, gErr := gopsutil_cpu.PercentWithContext(ctx, 0, false) - sPercent, sErr := sysinfo.PercentWithContext(ctx, 0, false) - - if gErr != nil || sErr != nil { - t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) - } - - t.Logf("gopsutil CPU%% (cached): %.2f", gPercent[0]) - t.Logf("sysinfo CPU%% (cached): %.2f", sPercent[0]) - - if gPercent[0] < 0 || gPercent[0] > 100 { - t.Errorf("gopsutil returned invalid percent: %.2f", gPercent[0]) - } - if sPercent[0] < 0 || sPercent[0] > 100 { - t.Errorf("sysinfo 
returned invalid percent: %.2f", sPercent[0]) - } - }) -} - -func TestMemoryMatchesGopsutil(t *testing.T) { - ctx := context.Background() - - gMem, gErr := gopsutil_mem.VirtualMemoryWithContext(ctx) - sMem, sErr := sysinfo.VirtualMemoryWithContext(ctx) - - if gErr != nil || sErr != nil { - t.Fatalf("errors: gopsutil=%v, sysinfo=%v", gErr, sErr) - } - - t.Logf("gopsutil: Total=%d Available=%d Used=%d UsedPercent=%.2f Free=%d", - gMem.Total, gMem.Available, gMem.Used, gMem.UsedPercent, gMem.Free) - t.Logf("sysinfo: Total=%d Available=%d Used=%d UsedPercent=%.2f Free=%d", - sMem.Total, sMem.Available, sMem.Used, sMem.UsedPercent, sMem.Free) - - // Total should be exactly the same (doesn't change) - if gMem.Total != sMem.Total { - t.Errorf("Total mismatch: gopsutil=%d, sysinfo=%d", gMem.Total, sMem.Total) - } - - // Other memory values can change between calls, allow 0.1% tolerance - tolerance := float64(gMem.Total) * 0.001 - - if math.Abs(float64(gMem.Available)-float64(sMem.Available)) > tolerance { - t.Errorf("Available differs by more than 0.1%%: gopsutil=%d, sysinfo=%d", gMem.Available, sMem.Available) - } - - if math.Abs(float64(gMem.Used)-float64(sMem.Used)) > tolerance { - t.Errorf("Used differs by more than 0.1%%: gopsutil=%d, sysinfo=%d", gMem.Used, sMem.Used) - } - - if math.Abs(gMem.UsedPercent-sMem.UsedPercent) > 0.1 { - t.Errorf("UsedPercent mismatch: gopsutil=%.4f, sysinfo=%.4f", gMem.UsedPercent, sMem.UsedPercent) - } - - if math.Abs(float64(gMem.Free)-float64(sMem.Free)) > tolerance { - t.Errorf("Free differs by more than 0.1%%: gopsutil=%d, sysinfo=%d", gMem.Free, sMem.Free) - } -} - -func assertClose(t *testing.T, name string, expected, actual, tolerance float64) { - t.Helper() - if expected == 0 && actual == 0 { - return - } - diff := math.Abs(expected - actual) - relativeDiff := diff / math.Max(math.Abs(expected), 1.0) - if relativeDiff > tolerance { - t.Errorf("%s: values differ by %.2f%% (expected=%.4f, actual=%.4f)", - name, relativeDiff*100, 
expected, actual) - } -} -TESTEOF - -echo "1. Created comparison test file" - -# Add gopsutil dependency -cd "$REPO_ROOT" -echo "2. Adding gopsutil dependency..." -go get github.com/shirou/gopsutil/v4@v4.24.8 2>/dev/null - -echo "3. Running go mod tidy..." -go mod tidy 2>/dev/null - -echo "4. Running comparison tests..." -echo "" -go test -v -tags=compare_gopsutil ./internal/sysinfo/... -TEST_RESULT=$? - -echo "" -if [ $TEST_RESULT -eq 0 ]; then - echo "=== All comparisons PASSED ===" -else - echo "=== Some comparisons FAILED ===" -fi - -exit $TEST_RESULT diff --git a/internal/worker.go b/internal/worker.go index 7b73c1668..56e7aa7c0 100644 --- a/internal/worker.go +++ b/internal/worker.go @@ -39,7 +39,7 @@ type ( // This interface is typically implemented by a [WorkerTuner] to provide metrics from the same // source used for tuning decisions, avoiding double-measurement. If the tuner passed to // WorkerOptions implements this interface, the SDK will automatically use it for heartbeat - // metrics. Alternatively, use the default implementation in the worker/hostmetrics package. + // metrics. Otherwise, heartbeats will report 0 for CPU/memory usage. 
// // Exposed as: [go.temporal.io/sdk/worker.TunerHostMetricsProvider] TunerHostMetricsProvider interface { diff --git a/worker/hostmetrics/cgroups.go b/worker/hostmetrics/cgroups.go deleted file mode 100644 index dfa42abf1..000000000 --- a/worker/hostmetrics/cgroups.go +++ /dev/null @@ -1,165 +0,0 @@ -//go:build linux - -package hostmetrics - -import ( - "errors" - "io/fs" - "os" - "path/filepath" - "strconv" - "strings" - "time" -) - -const cgroupBasePath = "/sys/fs/cgroup" - -func newCGroupInfo() cGroupInfo { - return &cGroupInfoImpl{} -} - -type cGroupInfoImpl struct { - lastMemUsage uint64 - lastMemLimit uint64 - cgroupCpuCalc cgroupCpuCalc -} - -func (p *cGroupInfoImpl) Update() (bool, error) { - memUsage, memLimit, err := readMemoryStat() - if errors.Is(err, fs.ErrNotExist) { - // Stop updates if not in a container. No need to return the error and log it. - return false, nil - } else if err != nil { - return true, err - } - - // Only update if limit is set - if memLimit != 0 { - p.lastMemUsage = memUsage - p.lastMemLimit = memLimit - } - - cpuUsageUsec, err := readCPUUsage() - if err != nil && !errors.Is(err, fs.ErrNotExist) { - return true, err - } - - p.cgroupCpuCalc.updateCpuUsage(cpuUsageUsec) - return true, nil -} - -func (p *cGroupInfoImpl) GetLastMemUsage() float64 { - if p.lastMemLimit != 0 { - return float64(p.lastMemUsage) / float64(p.lastMemLimit) - } - return 0 -} - -func (p *cGroupInfoImpl) GetLastCPUUsage() float64 { - return p.cgroupCpuCalc.lastCalculatedPercent -} - -type cgroupCpuCalc struct { - lastRefresh time.Time - lastCpuUsage uint64 - lastCalculatedPercent float64 -} - -func (p *cgroupCpuCalc) updateCpuUsage(currentCpuUsageUsec uint64) { - cpuQuota, cpuPeriod, err := readCpuMax() - if err != nil { - return // No CPU limit set or file doesn't exist - } - - now := time.Now() - if p.lastCpuUsage == 0 || p.lastRefresh.IsZero() { - p.lastCpuUsage = currentCpuUsageUsec - p.lastRefresh = now - return - } - - timeDelta := 
now.Sub(p.lastRefresh).Microseconds() - cpuUsageDelta := float64(currentCpuUsageUsec - p.lastCpuUsage) - - if cpuQuota > 0 && timeDelta > 0 { - p.lastCalculatedPercent = cpuUsageDelta * float64(cpuPeriod) / float64(cpuQuota*timeDelta) - } - - // Update for next call - p.lastCpuUsage = currentCpuUsageUsec - p.lastRefresh = now -} - -// readMemoryStat reads memory.current and memory.max from cgroup v2. -// Returns (usage, limit, error). Limit is 0 if set to "max" (unlimited). -func readMemoryStat() (uint64, uint64, error) { - usage, err := readIntFromFile(filepath.Join(cgroupBasePath, "memory.current"), false) - if err != nil { - return 0, 0, err - } - - limit, err := readIntFromFile(filepath.Join(cgroupBasePath, "memory.max"), true) - if err != nil { - return 0, 0, err - } - - return usage, limit, nil -} - -// readCPUUsage reads usage_usec from cpu.stat. -func readCPUUsage() (uint64, error) { - data, err := os.ReadFile(filepath.Join(cgroupBasePath, "cpu.stat")) - if err != nil { - return 0, err - } - - for _, line := range strings.Split(string(data), "\n") { - if strings.HasPrefix(line, "usage_usec ") { - return strconv.ParseUint(strings.TrimPrefix(line, "usage_usec "), 10, 64) - } - } - return 0, errors.New("usage_usec not found in cpu.stat") -} - -// readCpuMax reads the cpu.max file to get the CPU quota and period. 
-func readCpuMax() (quota int64, period int64, err error) { - data, err := os.ReadFile(filepath.Join(cgroupBasePath, "cpu.max")) - if err != nil { - return 0, 0, err - } - parts := strings.Fields(string(data)) - if len(parts) != 2 { - return 0, 0, errors.New("invalid format in cpu.max") - } - - if parts[0] == "max" { - quota = 0 - } else { - quota, err = strconv.ParseInt(parts[0], 10, 64) - if err != nil { - return 0, 0, err - } - } - - period, err = strconv.ParseInt(parts[1], 10, 64) - if err != nil { - return 0, 0, err - } - - return quota, period, nil -} - -// readIntFromFile reads a file containing a single uint64 value and -// can optionally detect if the file has "max" as its value, where -// it returns 0 as the value read. -func readIntFromFile(path string, canBeMax bool) (uint64, error) { - data, err := os.ReadFile(path) - if err != nil { - return 0, err - } - s := strings.TrimSpace(string(data)) - if canBeMax && s == "max" { - return 0, nil - } - return strconv.ParseUint(s, 10, 64) -} diff --git a/worker/hostmetrics/hostmetrics.go b/worker/hostmetrics/hostmetrics.go deleted file mode 100644 index 474c5f8d0..000000000 --- a/worker/hostmetrics/hostmetrics.go +++ /dev/null @@ -1,122 +0,0 @@ -// Package hostmetrics provides host-level CPU and memory metrics collection -// for worker heartbeats. It supports Linux, macOS, and Windows, with -// cgroup metrics for containerized environments. -package hostmetrics - -import ( - "context" - "runtime" - "sync" - "time" - - "go.temporal.io/sdk/internal/sysinfo" - "go.temporal.io/sdk/log" -) - -// PSUtilSystemInfoSupplier implements worker.TunerHostMetricsProvider for system metrics. -type PSUtilSystemInfoSupplier struct { - mu sync.Mutex - lastRefresh time.Time - lastMemStat *sysinfo.VirtualMemoryStat - lastCpuUsage float64 - cGroupInfo cGroupInfo - stopTryingToGetCGroupInfo bool - logger log.Logger -} - -// NewPSUtilSystemInfoSupplier creates a new PSUtilSystemInfoSupplier. 
-func NewPSUtilSystemInfoSupplier(logger log.Logger) *PSUtilSystemInfoSupplier { - return &PSUtilSystemInfoSupplier{ - logger: logger, - cGroupInfo: newCGroupInfo(), - } -} - -// GetCpuUsage returns the current host CPU usage as a fraction (0.0-1.0). -// In containerized environments, it prefers cgroup metrics if available. -func (p *PSUtilSystemInfoSupplier) GetCpuUsage() (float64, error) { - return p.GetCpuUsageWithLogger(p.logger) -} - -// GetCpuUsageWithLogger is like GetCpuUsage but uses the provided logger for warnings. -func (p *PSUtilSystemInfoSupplier) GetCpuUsageWithLogger(logger log.Logger) (float64, error) { - if err := p.maybeRefresh(logger); err != nil { - return 0, err - } - // Prefer cgroup metrics in containerized environments - if p.cGroupInfo != nil { - if cgroupCPU := p.cGroupInfo.GetLastCPUUsage(); cgroupCPU != 0 { - return cgroupCPU, nil - } - } - return p.lastCpuUsage / 100, nil -} - -// GetMemoryUsage returns the current host memory usage as a fraction (0.0-1.0). -// In containerized environments, it prefers cgroup metrics if available. -func (p *PSUtilSystemInfoSupplier) GetMemoryUsage() (float64, error) { - return p.GetMemoryUsageWithLogger(p.logger) -} - -// GetMemoryUsageWithLogger is like GetMemoryUsage but uses the provided logger for warnings. 
-func (p *PSUtilSystemInfoSupplier) GetMemoryUsageWithLogger(logger log.Logger) (float64, error) { - if err := p.maybeRefresh(logger); err != nil { - return 0, err - } - if p.cGroupInfo != nil { - if cgroupMem := p.cGroupInfo.GetLastMemUsage(); cgroupMem != 0 { - return cgroupMem, nil - } - } - return p.lastMemStat.UsedPercent / 100, nil -} - -func (p *PSUtilSystemInfoSupplier) maybeRefresh(logger log.Logger) error { - if time.Since(p.lastRefresh) < 100*time.Millisecond { - return nil - } - p.mu.Lock() - defer p.mu.Unlock() - // Double check refresh is still needed - if time.Since(p.lastRefresh) < 100*time.Millisecond { - return nil - } - - ctx, cancelFn := context.WithTimeout(context.Background(), 1*time.Second) - defer cancelFn() - memStat, err := sysinfo.VirtualMemoryWithContext(ctx) - if err != nil { - return err - } - cpuUsage, err := sysinfo.PercentWithContext(ctx, 0, false) - if err != nil { - return err - } - - p.lastMemStat = memStat - p.lastCpuUsage = cpuUsage[0] - - // Try cgroup metrics on Linux for containerized environments - if runtime.GOOS == "linux" && !p.stopTryingToGetCGroupInfo && p.cGroupInfo != nil { - continueUpdates, err := p.cGroupInfo.Update() - if err != nil && logger != nil { - logger.Warn("Failed to get cgroup stats", "error", err) - } - p.stopTryingToGetCGroupInfo = !continueUpdates - } - - p.lastRefresh = time.Now() - return nil -} - -type cGroupInfo interface { - // Update requests an update of the cgroup stats. Returns true if cgroup stats - // should continue to be updated, false if not in a cgroup or error is unrecoverable. - Update() (bool, error) - // GetLastMemUsage returns last known memory usage as a fraction of cgroup limit. - // Returns 0 if not in a cgroup or limit is not set. - GetLastMemUsage() float64 - // GetLastCPUUsage returns last known CPU usage as a fraction of cgroup limit. - // Returns 0 if not in a cgroup or limit is not set. 
- GetLastCPUUsage() float64 -} diff --git a/worker/hostmetrics/hostmetrics_test.go b/worker/hostmetrics/hostmetrics_test.go deleted file mode 100644 index 243db4a75..000000000 --- a/worker/hostmetrics/hostmetrics_test.go +++ /dev/null @@ -1,49 +0,0 @@ -package hostmetrics - -import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestPSUtilSystemInfoSupplier_GetCpuUsage(t *testing.T) { - p := NewPSUtilSystemInfoSupplier(nil) - - cpu, err := p.GetCpuUsage() - require.NoError(t, err) - assert.GreaterOrEqual(t, cpu, 0.0) - assert.LessOrEqual(t, cpu, 1.0) -} - -func TestPSUtilSystemInfoSupplier_GetMemoryUsage(t *testing.T) { - p := NewPSUtilSystemInfoSupplier(nil) - - mem, err := p.GetMemoryUsage() - require.NoError(t, err) - assert.GreaterOrEqual(t, mem, 0.0) - assert.LessOrEqual(t, mem, 1.0) -} - -func TestPSUtilSystemInfoSupplier_RateLimiting(t *testing.T) { - p := NewPSUtilSystemInfoSupplier(nil) - - // First call should refresh - _, err := p.GetCpuUsage() - require.NoError(t, err) - firstRefresh := p.lastRefresh - - // Immediate second call should use cached value - _, err = p.GetCpuUsage() - require.NoError(t, err) - assert.Equal(t, firstRefresh, p.lastRefresh) - - // Wait past the refresh interval - time.Sleep(150 * time.Millisecond) - - // Third call should refresh - _, err = p.GetCpuUsage() - require.NoError(t, err) - assert.NotEqual(t, firstRefresh, p.lastRefresh) -} diff --git a/worker/hostmetrics/scripts/compare_with_containerd.sh b/worker/hostmetrics/scripts/compare_with_containerd.sh deleted file mode 100755 index 6ed1b4dd4..000000000 --- a/worker/hostmetrics/scripts/compare_with_containerd.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/bin/bash -# Compare direct cgroup reads against containerd/cgroups/v3 -# Must run inside a Linux container with cgroup v2 and resource limits set -# -# Usage: ./worker/hostmetrics/scripts/compare_with_containerd.sh -# -# Or via Docker: -# docker run --rm -v 
"$(pwd)":/workspace -w /workspace \ -# --memory=512m --cpus=1 golang:1.23 \ -# ./worker/hostmetrics/scripts/compare_with_containerd.sh - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -HOSTMETRICS_DIR="$(dirname "$SCRIPT_DIR")" -REPO_ROOT="$(cd "$HOSTMETRICS_DIR/../.." && pwd)" - -TEST_FILE="$HOSTMETRICS_DIR/compare_cgroups_test.go" - -cleanup() { - echo "Cleaning up..." - rm -f "$TEST_FILE" - cd "$REPO_ROOT" && go mod tidy 2>/dev/null - echo "Done." -} - -trap cleanup EXIT - -echo "=== Comparing cgroup implementation against containerd/cgroups ===" -echo "" - -if [[ "$(uname)" != "Linux" ]]; then - echo "ERROR: This test must run on Linux (inside a container)" - exit 1 -fi - -if [[ ! -f /sys/fs/cgroup/memory.current ]]; then - echo "ERROR: cgroup v2 not available (missing /sys/fs/cgroup/memory.current)" - exit 1 -fi - -echo "1. Cgroup v2 files found:" -echo " memory.current: $(cat /sys/fs/cgroup/memory.current)" -echo " memory.max: $(cat /sys/fs/cgroup/memory.max)" -echo " cpu.stat usage_usec: $(grep usage_usec /sys/fs/cgroup/cpu.stat | awk '{print $2}')" -echo " cpu.max: $(cat /sys/fs/cgroup/cpu.max 2>/dev/null || echo 'not set')" -echo "" - -# Create the comparison test file -cat > "$TEST_FILE" << 'TESTEOF' -//go:build linux && compare_cgroups - -package hostmetrics - -import ( - "testing" - - "github.com/containerd/cgroups/v3/cgroup2" -) - -func TestCgroupMemoryMatchesContainerd(t *testing.T) { - // Get values from containerd/cgroups - control, err := cgroup2.Load("/") - if err != nil { - t.Skipf("Not in cgroup v2 environment: %v", err) - } - metrics, err := control.Stat() - if err != nil { - t.Fatalf("containerd Stat() failed: %v", err) - } - - // Get values from our direct reads - memUsage, memLimit, err := readMemoryStat() - if err != nil { - t.Fatalf("readMemoryStat() failed: %v", err) - } - - t.Logf("containerd: Usage=%d UsageLimit=%d", metrics.Memory.Usage, metrics.Memory.UsageLimit) - t.Logf("direct: Usage=%d UsageLimit=%d", 
memUsage, memLimit) - - // Memory usage can change between reads, allow 1MB tolerance - if absDiff(metrics.Memory.Usage, memUsage) > 1024*1024 { - t.Errorf("Memory usage mismatch: containerd=%d, direct=%d (diff=%d)", - metrics.Memory.Usage, memUsage, absDiff(metrics.Memory.Usage, memUsage)) - } - - // Memory limit should match exactly (or both be 0/max for unlimited) - // containerd returns max uint64 for unlimited, we return 0 - containerdLimit := metrics.Memory.UsageLimit - if containerdLimit == ^uint64(0) { - containerdLimit = 0 // Treat max uint64 as unlimited (0) - } - if containerdLimit != memLimit { - t.Errorf("Memory limit mismatch: containerd=%d, direct=%d", - metrics.Memory.UsageLimit, memLimit) - } -} - -func TestCgroupCPUMatchesContainerd(t *testing.T) { - control, err := cgroup2.Load("/") - if err != nil { - t.Skipf("Not in cgroup v2 environment: %v", err) - } - metrics, err := control.Stat() - if err != nil { - t.Fatalf("containerd Stat() failed: %v", err) - } - - cpuUsage, err := readCPUUsage() - if err != nil { - t.Fatalf("readCPUUsage() failed: %v", err) - } - - t.Logf("containerd: UsageUsec=%d", metrics.CPU.UsageUsec) - t.Logf("direct: UsageUsec=%d", cpuUsage) - - // CPU usage increases over time, allow 100ms tolerance for timing between reads - if absDiff(metrics.CPU.UsageUsec, cpuUsage) > 100000 { - t.Errorf("CPU usage mismatch: containerd=%d, direct=%d (diff=%d)", - metrics.CPU.UsageUsec, cpuUsage, absDiff(metrics.CPU.UsageUsec, cpuUsage)) - } -} - -func absDiff(a, b uint64) uint64 { - if a > b { - return a - b - } - return b - a -} -TESTEOF - -echo "2. Created comparison test file" - -cd "$REPO_ROOT" -echo "3. Adding containerd/cgroups dependency..." -go get github.com/containerd/cgroups/v3@v3.0.3 2>/dev/null - -echo "4. Running go mod tidy..." -go mod tidy 2>/dev/null - -echo "5. Running comparison tests..." -echo "" -go test -v -tags=compare_cgroups ./worker/hostmetrics/... -TEST_RESULT=$? 
- -echo "" -if [ $TEST_RESULT -eq 0 ]; then - echo "=== All comparisons PASSED ===" -else - echo "=== Some comparisons FAILED ===" -fi - -exit $TEST_RESULT diff --git a/worker/worker.go b/worker/worker.go index 6895b2df6..11785175c 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -241,7 +241,7 @@ type ( // This interface is typically implemented by a [WorkerTuner] to provide metrics from the same // source used for tuning decisions, avoiding double-measurement. If the tuner passed to // [Options] implements this interface, the SDK will automatically use it for heartbeat metrics. - // Alternatively, use the default implementation in the worker/hostmetrics package. + // Otherwise, heartbeats will report 0 for CPU/memory usage. // // NOTE: Experimental TunerHostMetricsProvider = internal.TunerHostMetricsProvider From c156e0d4594ea1f10aef9c0ad4604d300267f79f Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 27 Jan 2026 15:52:01 -0800 Subject: [PATCH 05/30] Create new hostinfo package --- .../{resourcetuner => hostinfo}/cgroups.go | 4 +- .../cgroups_notlinux.go | 2 +- contrib/{resourcetuner => hostinfo}/go.mod | 7 +- contrib/{resourcetuner => hostinfo}/go.sum | 8 +- contrib/hostinfo/hostinfo.go | 109 +++++++ contrib/hostinfo/hostinfo_test.go | 51 ++++ contrib/resourcetuner/resourcetuner_test.go | 107 ------- internal/internal_worker.go | 40 ++- .../resource_tuner.go | 289 ++++++++---------- internal/tuning.go | 20 +- internal/worker.go | 14 - worker/tuning.go | 72 +++++ worker/worker.go | 9 - 13 files changed, 396 insertions(+), 336 deletions(-) rename contrib/{resourcetuner => hostinfo}/cgroups.go (98%) rename contrib/{resourcetuner => hostinfo}/cgroups_notlinux.go (94%) rename contrib/{resourcetuner => hostinfo}/go.mod (92%) rename contrib/{resourcetuner => hostinfo}/go.sum (96%) create mode 100644 contrib/hostinfo/hostinfo.go create mode 100644 contrib/hostinfo/hostinfo_test.go delete mode 100644 contrib/resourcetuner/resourcetuner_test.go rename 
contrib/resourcetuner/resourcetuner.go => internal/resource_tuner.go (62%) diff --git a/contrib/resourcetuner/cgroups.go b/contrib/hostinfo/cgroups.go similarity index 98% rename from contrib/resourcetuner/cgroups.go rename to contrib/hostinfo/cgroups.go index f6615296f..50d69c7e0 100644 --- a/contrib/resourcetuner/cgroups.go +++ b/contrib/hostinfo/cgroups.go @@ -1,6 +1,6 @@ //go:build linux -package resourcetuner +package hostinfo import ( "errors" @@ -27,7 +27,7 @@ type cGroupInfoImpl struct { func (p *cGroupInfoImpl) Update() (bool, error) { err := p.updateCGroupStats() // Stop updates if not in a container. No need to return the error and log it. - if !errors.Is(err, fs.ErrNotExist) { + if errors.Is(err, fs.ErrNotExist) { return false, nil } else if err != nil { return true, err diff --git a/contrib/resourcetuner/cgroups_notlinux.go b/contrib/hostinfo/cgroups_notlinux.go similarity index 94% rename from contrib/resourcetuner/cgroups_notlinux.go rename to contrib/hostinfo/cgroups_notlinux.go index 068e4220f..80dfabec1 100644 --- a/contrib/resourcetuner/cgroups_notlinux.go +++ b/contrib/hostinfo/cgroups_notlinux.go @@ -1,6 +1,6 @@ //go:build !linux -package resourcetuner +package hostinfo import "errors" diff --git a/contrib/resourcetuner/go.mod b/contrib/hostinfo/go.mod similarity index 92% rename from contrib/resourcetuner/go.mod rename to contrib/hostinfo/go.mod index dde1cc877..82b31e33d 100644 --- a/contrib/resourcetuner/go.mod +++ b/contrib/hostinfo/go.mod @@ -1,4 +1,4 @@ -module go.temporal.io/sdk/contrib/resourcetuner +module go.temporal.io/sdk/contrib/hostinfo go 1.23.0 @@ -7,8 +7,6 @@ toolchain go1.23.6 require ( github.com/containerd/cgroups/v3 v3.0.3 github.com/shirou/gopsutil/v4 v4.24.8 - github.com/stretchr/testify v1.10.0 - go.einride.tech/pid v0.1.3 go.temporal.io/sdk v1.29.1 ) @@ -31,8 +29,9 @@ require ( github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/robfig/cron v1.2.0 // indirect 
github.com/shoenig/go-m1cpu v0.1.6 // indirect - github.com/sirupsen/logrus v1.9.3 // indirect + github.com/sirupsen/logrus v1.9.0 // indirect github.com/stretchr/objx v0.5.2 // indirect + github.com/stretchr/testify v1.10.0 // indirect github.com/tklauser/go-sysconf v0.3.12 // indirect github.com/tklauser/numcpus v0.6.1 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect diff --git a/contrib/resourcetuner/go.sum b/contrib/hostinfo/go.sum similarity index 96% rename from contrib/resourcetuner/go.sum rename to contrib/hostinfo/go.sum index 00877786f..698b47712 100644 --- a/contrib/resourcetuner/go.sum +++ b/contrib/hostinfo/go.sum @@ -54,8 +54,8 @@ github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFt github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= @@ -71,8 +71,6 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= -go.einride.tech/pid v0.1.3 
h1:yWAKSmD2Z10jxd4gYFhOjbBNqXeIQwAtnCO/XKCT7sQ= -go.einride.tech/pid v0.1.3/go.mod h1:33JSUbKrH/4v8DZf/0K8IC8Enjd92wB2birp+bCYQso= go.temporal.io/api v1.59.0 h1:QUpAju1KKs9xBfGSI0Uwdyg06k6dRCJH+Zm3G1Jc9Vk= go.temporal.io/api v1.59.0/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= @@ -141,5 +139,3 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EV gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= -gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= diff --git a/contrib/hostinfo/hostinfo.go b/contrib/hostinfo/hostinfo.go new file mode 100644 index 000000000..a529aa837 --- /dev/null +++ b/contrib/hostinfo/hostinfo.go @@ -0,0 +1,109 @@ +package hostinfo + +import ( + "context" + "runtime" + "sync" + "time" + + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/mem" + "go.temporal.io/sdk/worker" +) + +// NewSystemInfoSupplier creates a SystemInfoSupplier using gopsutil. +// Supports cgroup metrics in containerized Linux environments. +func NewSystemInfoSupplier() worker.SystemInfoSupplier { + return &psUtilSystemInfoSupplier{ + cGroupInfo: newCGroupInfo(), + } +} + +// NewResourceBasedTuner creates a resource-based tuner with gopsutil-based system info. 
+func NewResourceBasedTuner(opts worker.ResourceBasedTunerOptions) (worker.WorkerTuner, error) { + opts.InfoSupplier = NewSystemInfoSupplier() + return worker.NewResourceBasedTuner(opts) +} + +type psUtilSystemInfoSupplier struct { + mu sync.Mutex + lastRefresh time.Time + + lastMemStat *mem.VirtualMemoryStat + lastCpuUsage float64 + + stopTryingToGetCGroupInfo bool + cGroupInfo cGroupInfo +} + +type cGroupInfo interface { + // Update requests an update of the cgroup stats. This is a no-op if not in a cgroup. Returns + // true if cgroup stats should continue to be updated, false if not in a cgroup or the returned + // error is considered unrecoverable. + Update() (bool, error) + // GetLastMemUsage returns last known memory usage as a fraction of the cgroup limit. 0 if not + // in a cgroup or limit is not set. + GetLastMemUsage() float64 + // GetLastCPUUsage returns last known CPU usage as a fraction of the cgroup limit. 0 if not in a + // cgroup or limit is not set. + GetLastCPUUsage() float64 +} + +func (p *psUtilSystemInfoSupplier) GetMemoryUsage(infoContext *worker.SystemInfoContext) (float64, error) { + if err := p.maybeRefresh(infoContext); err != nil { + return 0, err + } + lastCGroupMem := p.cGroupInfo.GetLastMemUsage() + if lastCGroupMem != 0 { + return lastCGroupMem, nil + } + return p.lastMemStat.UsedPercent / 100, nil +} + +func (p *psUtilSystemInfoSupplier) GetCpuUsage(infoContext *worker.SystemInfoContext) (float64, error) { + if err := p.maybeRefresh(infoContext); err != nil { + return 0, err + } + + lastCGroupCPU := p.cGroupInfo.GetLastCPUUsage() + if lastCGroupCPU != 0 { + return lastCGroupCPU, nil + } + return p.lastCpuUsage / 100, nil +} + +func (p *psUtilSystemInfoSupplier) maybeRefresh(infoContext *worker.SystemInfoContext) error { + if time.Since(p.lastRefresh) < 100*time.Millisecond { + return nil + } + p.mu.Lock() + defer p.mu.Unlock() + // Double check refresh is still needed + if time.Since(p.lastRefresh) < 100*time.Millisecond { + return 
nil + } + ctx, cancelFn := context.WithTimeout(context.Background(), 1*time.Second) + defer cancelFn() + memStat, err := mem.VirtualMemoryWithContext(ctx) + if err != nil { + return err + } + cpuUsage, err := cpu.PercentWithContext(ctx, 0, false) + if err != nil { + return err + } + + p.lastMemStat = memStat + p.lastCpuUsage = cpuUsage[0] + + if runtime.GOOS == "linux" && !p.stopTryingToGetCGroupInfo { + continueUpdates, err := p.cGroupInfo.Update() + if err != nil { + infoContext.Logger.Warn("Failed to get cgroup stats", "error", err) + } + p.stopTryingToGetCGroupInfo = !continueUpdates + } + + p.lastRefresh = time.Now() + return nil +} diff --git a/contrib/hostinfo/hostinfo_test.go b/contrib/hostinfo/hostinfo_test.go new file mode 100644 index 000000000..4162f23de --- /dev/null +++ b/contrib/hostinfo/hostinfo_test.go @@ -0,0 +1,51 @@ +package hostinfo + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.temporal.io/sdk/internal/log" + "go.temporal.io/sdk/worker" +) + +func TestGetMemoryCpuUsage(t *testing.T) { + supplier := NewSystemInfoSupplier() + ctx := &worker.SystemInfoContext{Logger: log.NewNopLogger()} + + usage, err := supplier.GetMemoryUsage(ctx) + require.NoError(t, err) + assert.GreaterOrEqual(t, usage, 0.0) + assert.LessOrEqual(t, usage, 1.0) + + usage, err = supplier.GetCpuUsage(ctx) + require.NoError(t, err) + assert.GreaterOrEqual(t, usage, 0.0) + assert.LessOrEqual(t, usage, 1.0) +} + +func TestMaybeRefreshRateLimiting(t *testing.T) { + supplier := NewSystemInfoSupplier().(*psUtilSystemInfoSupplier) + ctx := &worker.SystemInfoContext{Logger: log.NewNopLogger()} + + // First call should refresh + firstUsage, err := supplier.GetMemoryUsage(ctx) + require.NoError(t, err) + firstRefresh := supplier.lastRefresh + + // Immediate second call should not refresh (rate limited) + secondUsage, err := supplier.GetMemoryUsage(ctx) + require.NoError(t, err) + assert.Equal(t, firstRefresh, 
supplier.lastRefresh) + + assert.Equal(t, firstUsage, secondUsage) +} + +func TestNewResourceBasedTuner(t *testing.T) { + tuner, err := NewResourceBasedTuner(worker.ResourceBasedTunerOptions{ + TargetMem: 0.8, + TargetCpu: 0.9, + }) + require.NoError(t, err) + require.NotNil(t, tuner) +} diff --git a/contrib/resourcetuner/resourcetuner_test.go b/contrib/resourcetuner/resourcetuner_test.go deleted file mode 100644 index 603d53da8..000000000 --- a/contrib/resourcetuner/resourcetuner_test.go +++ /dev/null @@ -1,107 +0,0 @@ -package resourcetuner - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "go.temporal.io/sdk/client" - "go.temporal.io/sdk/internal/common/metrics" - "go.temporal.io/sdk/internal/log" -) - -type FakeSystemInfoSupplier struct { - memUse float64 - cpuUse float64 -} - -func (f FakeSystemInfoSupplier) GetMemoryUsage(_ *SystemInfoContext) (float64, error) { - return f.memUse, nil -} - -func (f FakeSystemInfoSupplier) GetCpuUsage(_ *SystemInfoContext) (float64, error) { - return f.cpuUse, nil -} - -func TestPidDecisions(t *testing.T) { - logger := &log.NoopLogger{} - metricsHandler := client.MetricsNopHandler - fakeSupplier := &FakeSystemInfoSupplier{memUse: 0.5, cpuUse: 0.5} - rcOpts := DefaultResourceControllerOptions() - rcOpts.MemTargetPercent = 0.8 - rcOpts.CpuTargetPercent = 0.9 - rcOpts.InfoSupplier = fakeSupplier - rc := NewResourceController(rcOpts) - - for i := 0; i < 10; i++ { - decision, err := rc.pidDecision(logger, metricsHandler) - assert.NoError(t, err) - assert.True(t, decision) - - assert.InDelta(t, 1.5, rc.memPid.State.ControlSignal, 0.001) - assert.InDelta(t, 2.0, rc.cpuPid.State.ControlSignal, 0.001) - } - - fakeSupplier.memUse = 0.8 - fakeSupplier.cpuUse = 0.9 - for i := 0; i < 10; i++ { - decision, err := rc.pidDecision(logger, metricsHandler) - assert.NoError(t, err) - assert.False(t, decision) - } - - fakeSupplier.memUse = 0.7 - fakeSupplier.cpuUse = 0.9 - for i := 0; i < 10; i++ { - decision, err := 
rc.pidDecision(logger, metricsHandler) - assert.NoError(t, err) - assert.False(t, decision) - } - - fakeSupplier.memUse = 0.7 - fakeSupplier.cpuUse = 0.7 - for i := 0; i < 10; i++ { - decision, err := rc.pidDecision(logger, metricsHandler) - assert.NoError(t, err) - assert.True(t, decision) - } -} - -func TestPidDecisionEmitsUsageMetrics(t *testing.T) { - logger := &log.NoopLogger{} - metricsHandler := metrics.NewCapturingHandler() - fakeSupplier := &FakeSystemInfoSupplier{memUse: 0.25, cpuUse: 0.75} - - rcOpts := DefaultResourceControllerOptions() - rcOpts.InfoSupplier = fakeSupplier - rc := NewResourceController(rcOpts) - - _, err := rc.pidDecision(logger, metricsHandler) - assert.NoError(t, err) - - gauges := metricsHandler.Gauges() - assert.Len(t, gauges, 2) - - gaugesByName := make(map[string]float64) - for _, gauge := range gauges { - gaugesByName[gauge.Name] = gauge.Value() - } - - assert.Equal(t, 25.0, gaugesByName[resourceSlotsMemUsage]) - assert.Equal(t, 75.0, gaugesByName[resourceSlotsCPUUsage]) - - fakeSupplier.memUse = 0.7 - fakeSupplier.cpuUse = 0.9 - _, err = rc.pidDecision(logger, metricsHandler) - assert.NoError(t, err) - - gauges = metricsHandler.Gauges() - assert.Len(t, gauges, 2) - - gaugesByName = make(map[string]float64) - for _, gauge := range gauges { - gaugesByName[gauge.Name] = gauge.Value() - } - - assert.Equal(t, 70.0, gaugesByName[resourceSlotsMemUsage]) - assert.Equal(t, 90.0, gaugesByName[resourceSlotsCPUUsage]) -} diff --git a/internal/internal_worker.go b/internal/internal_worker.go index fd886c139..227c148a6 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -241,8 +241,8 @@ type ( localActivitySlotSupplier *trackingSlotSupplier nexusTaskSlotSupplier *trackingSlotSupplier // TODO: nexus worker only gets started when worker is started, need to find a way to send kind over to heartbeat callback - // Host metrics provider for CPU/memory reporting in heartbeats - hostMetricsProvider TunerHostMetricsProvider + 
// SystemInfoSupplier for CPU/memory reporting in heartbeats + systemInfoSupplier SystemInfoSupplier } ) @@ -2254,11 +2254,11 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke }) } - // If the tuner implements TunerHostMetricsProvider, use it for CPU/memory reporting in heartbeats. - // Otherwise, heartbeats will report 0 for CPU/memory usage. - var hostMetricsProvider TunerHostMetricsProvider - if provider, ok := options.Tuner.(TunerHostMetricsProvider); ok { - hostMetricsProvider = provider + // Get SystemInfoSupplier from tuner's slot supplier if it implements HasSystemInfoSupplier. + // If not available, heartbeats will report 0 for CPU/memory usage. + var systemInfoSupplier SystemInfoSupplier + if sis, ok := options.Tuner.GetWorkflowTaskSlotSupplier().(HasSystemInfoSupplier); ok { + systemInfoSupplier = sis.GetSystemInfoSupplier() } var heartbeatCallback func() *workerpb.WorkerHeartbeat @@ -2308,8 +2308,8 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke HostName: hostname, WorkerGroupingKey: aw.client.workerGroupingKey, ProcessId: strconv.Itoa(os.Getpid()), - CurrentHostCpuUsage: getCpuUsage(hostMetricsProvider), - CurrentHostMemUsage: getMemUsage(hostMetricsProvider), + CurrentHostCpuUsage: getCpuUsage(systemInfoSupplier, workerParams.Logger), + CurrentHostMemUsage: getMemUsage(systemInfoSupplier, workerParams.Logger), }, TaskQueue: aw.executionParams.TaskQueue, DeploymentVersion: deploymentVersion, @@ -2345,7 +2345,7 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke if heartbeatManager != nil { heartbeatManager.heartbeatMetrics = heartbeatMetrics heartbeatManager.heartbeatCallback = heartbeatCallback - heartbeatManager.hostMetricsProvider = hostMetricsProvider + heartbeatManager.systemInfoSupplier = systemInfoSupplier } aw = &AggregatedWorker{ @@ -2693,19 +2693,27 @@ func workerDeploymentVersionFromProtoOrString(wd *deploymentpb.WorkerDeploymentV } } -func 
getCpuUsage(provider TunerHostMetricsProvider) float32 { - if provider == nil { +func getCpuUsage(supplier SystemInfoSupplier, logger log.Logger) float32 { + if supplier == nil { + return 0 + } + cpu, err := supplier.GetCpuUsage(&SystemInfoContext{Logger: logger}) + if err != nil { + logger.Warn("Failed to get CPU usage for heartbeat", "error", err) return 0 } - cpu, _ := provider.GetCpuUsage() return float32(cpu) } -func getMemUsage(provider TunerHostMetricsProvider) float32 { - if provider == nil { +func getMemUsage(supplier SystemInfoSupplier, logger log.Logger) float32 { + if supplier == nil { + return 0 + } + mem, err := supplier.GetMemoryUsage(&SystemInfoContext{Logger: logger}) + if err != nil { + logger.Warn("Failed to get memory usage for heartbeat", "error", err) return 0 } - mem, _ := provider.GetMemoryUsage() return float32(mem) } diff --git a/contrib/resourcetuner/resourcetuner.go b/internal/resource_tuner.go similarity index 62% rename from contrib/resourcetuner/resourcetuner.go rename to internal/resource_tuner.go index 1145d3cd7..43e55a99f 100644 --- a/contrib/resourcetuner/resourcetuner.go +++ b/internal/resource_tuner.go @@ -1,18 +1,13 @@ -package resourcetuner +package internal import ( "context" "errors" - "runtime" "sync" "time" - "github.com/shirou/gopsutil/v4/cpu" - "github.com/shirou/gopsutil/v4/mem" - "go.einride.tech/pid" - "go.temporal.io/sdk/client" + "go.temporal.io/sdk/internal/common/metrics" "go.temporal.io/sdk/log" - "go.temporal.io/sdk/worker" ) // Metric names emitted by the resource-based tuner @@ -21,6 +16,39 @@ const ( resourceSlotsMemUsage = "temporal_resource_slots_mem_usage" ) +// SystemInfoSupplier implementations provide information about system resources. +// +// Exposed as: [go.temporal.io/sdk/worker.SystemInfoSupplier] +type SystemInfoSupplier interface { + // GetMemoryUsage returns the current system memory usage as a fraction of total memory between + // 0 and 1. 
+ GetMemoryUsage(infoContext *SystemInfoContext) (float64, error) + // GetCpuUsage returns the current system CPU usage as a fraction of total CPU usage between 0 + // and 1. + GetCpuUsage(infoContext *SystemInfoContext) (float64, error) +} + +// SystemInfoContext provides context for SystemInfoSupplier calls. +// +// Exposed as: [go.temporal.io/sdk/worker.SystemInfoContext] +type SystemInfoContext struct { + Logger log.Logger +} + +// TODO: Worried this is too invisible for custom slot suppliers to know to implement +// +// HasSystemInfoSupplier is an optional interface that SlotSupplier implementations can implement +// to expose their SystemInfoSupplier. This allows the SDK to access system metrics (CPU/memory) +// for features like worker heartbeats without coupling to specific SlotSupplier implementations. +// +// Exposed as: [go.temporal.io/sdk/worker.HasSystemInfoSupplier] +type HasSystemInfoSupplier interface { + GetSystemInfoSupplier() SystemInfoSupplier +} + +// ResourceBasedTunerOptions configures a resource-based tuner. +// +// Exposed as: [go.temporal.io/sdk/worker.ResourceBasedTunerOptions] type ResourceBasedTunerOptions struct { // TargetMem is the target overall system memory usage as value 0 and 1 that the controller will // attempt to maintain. Must be set nonzero. @@ -28,6 +56,9 @@ type ResourceBasedTunerOptions struct { // TargetCpu is the target overall system CPU usage as value 0 and 1 that the controller will // attempt to maintain. Must be set nonzero. TargetCpu float64 + // InfoSupplier provides CPU and memory usage information. This is required. + // Use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based implementation. + InfoSupplier SystemInfoSupplier // Passed to ResourceBasedSlotSupplierOptions.RampThrottle for activities. // If not set, the default value is 50ms. 
ActivityRampThrottle time.Duration @@ -38,11 +69,22 @@ type ResourceBasedTunerOptions struct { // NewResourceBasedTuner creates a WorkerTuner that dynamically adjusts the number of slots based // on system resources. Specify the target CPU and memory usage as a value between 0 and 1. -func NewResourceBasedTuner(opts ResourceBasedTunerOptions) (worker.WorkerTuner, error) { - options := DefaultResourceControllerOptions() - options.MemTargetPercent = opts.TargetMem - options.CpuTargetPercent = opts.TargetCpu - controller := NewResourceController(options) +// +// InfoSupplier is required - use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based +// implementation, or provide your own. +// +// Exposed as: [go.temporal.io/sdk/worker.NewResourceBasedTuner] +func NewResourceBasedTuner(opts ResourceBasedTunerOptions) (WorkerTuner, error) { + if opts.InfoSupplier == nil { + return nil, errors.New("InfoSupplier is required for resource-based tuning") + } + + controllerOpts := DefaultResourceControllerOptions() + controllerOpts.MemTargetPercent = opts.TargetMem + controllerOpts.CpuTargetPercent = opts.TargetCpu + controllerOpts.InfoSupplier = opts.InfoSupplier + controller := NewResourceController(controllerOpts) + wfSS := &ResourceBasedSlotSupplier{controller: controller, options: DefaultWorkflowResourceBasedSlotSupplierOptions()} if opts.WorkflowRampThrottle != 0 { @@ -62,20 +104,19 @@ func NewResourceBasedTuner(opts ResourceBasedTunerOptions) (worker.WorkerTuner, options: DefaultWorkflowResourceBasedSlotSupplierOptions()} sessSS := &ResourceBasedSlotSupplier{controller: controller, options: DefaultActivityResourceBasedSlotSupplierOptions()} - compositeTuner, err := worker.NewCompositeTuner(worker.CompositeTunerOptions{ + + return NewCompositeTuner(CompositeTunerOptions{ WorkflowSlotSupplier: wfSS, ActivitySlotSupplier: actSS, LocalActivitySlotSupplier: laSS, NexusSlotSupplier: nexusSS, SessionActivitySlotSupplier: sessSS, }) - if err != nil { - return nil, err - } 
- return compositeTuner, nil } // ResourceBasedSlotSupplierOptions configures a particular ResourceBasedSlotSupplier. +// +// Exposed as: [go.temporal.io/sdk/worker.ResourceBasedSlotSupplierOptions] type ResourceBasedSlotSupplierOptions struct { // MinSlots is minimum number of slots that will be issued without any resource checks. MinSlots int @@ -87,6 +128,9 @@ type ResourceBasedSlotSupplierOptions struct { RampThrottle time.Duration } +// DefaultWorkflowResourceBasedSlotSupplierOptions returns default options for workflow slot suppliers. +// +// Exposed as: [go.temporal.io/sdk/worker.DefaultWorkflowResourceBasedSlotSupplierOptions] func DefaultWorkflowResourceBasedSlotSupplierOptions() ResourceBasedSlotSupplierOptions { return ResourceBasedSlotSupplierOptions{ MinSlots: 5, @@ -94,6 +138,10 @@ func DefaultWorkflowResourceBasedSlotSupplierOptions() ResourceBasedSlotSupplier RampThrottle: 0 * time.Second, } } + +// DefaultActivityResourceBasedSlotSupplierOptions returns default options for activity slot suppliers. +// +// Exposed as: [go.temporal.io/sdk/worker.DefaultActivityResourceBasedSlotSupplierOptions] func DefaultActivityResourceBasedSlotSupplierOptions() ResourceBasedSlotSupplierOptions { return ResourceBasedSlotSupplierOptions{ MinSlots: 1, @@ -102,8 +150,9 @@ func DefaultActivityResourceBasedSlotSupplierOptions() ResourceBasedSlotSupplier } } -// ResourceBasedSlotSupplier is a worker.SlotSupplier that issues slots based on system resource -// usage. +// ResourceBasedSlotSupplier is a SlotSupplier that issues slots based on system resource usage. +// +// Exposed as: [go.temporal.io/sdk/worker.ResourceBasedSlotSupplier] type ResourceBasedSlotSupplier struct { controller *ResourceController options ResourceBasedSlotSupplierOptions @@ -115,12 +164,14 @@ type ResourceBasedSlotSupplier struct { // NewResourceBasedSlotSupplier creates a ResourceBasedSlotSupplier given the provided // ResourceController and ResourceBasedSlotSupplierOptions. 
All ResourceBasedSlotSupplier instances // must use the same ResourceController. +// +// Exposed as: [go.temporal.io/sdk/worker.NewResourceBasedSlotSupplier] func NewResourceBasedSlotSupplier( controller *ResourceController, options ResourceBasedSlotSupplierOptions, ) (*ResourceBasedSlotSupplier, error) { if options.MinSlots < 0 || options.MaxSlots < 0 || options.MinSlots > options.MaxSlots { - return nil, errors.New("MinSlots and Max slots must be non-negative and MinSlots must be less than or equal to MaxSlots") + return nil, errors.New("MinSlots and MaxSlots must be non-negative and MinSlots must be less than or equal to MaxSlots") } if options.RampThrottle < 0 { return nil, errors.New("RampThrottle must be non-negative") @@ -128,16 +179,14 @@ func NewResourceBasedSlotSupplier( return &ResourceBasedSlotSupplier{controller: controller, options: options}, nil } -func (r *ResourceBasedSlotSupplier) ReserveSlot(ctx context.Context, info worker.SlotReservationInfo) (*worker.SlotPermit, error) { +func (r *ResourceBasedSlotSupplier) ReserveSlot(ctx context.Context, info SlotReservationInfo) (*SlotPermit, error) { for { if info.NumIssuedSlots() < r.options.MinSlots { - return &worker.SlotPermit{}, nil + return &SlotPermit{}, nil } if r.options.RampThrottle > 0 { r.lastIssuedMu.Lock() mustWaitFor := r.options.RampThrottle - time.Since(r.lastSlotIssuedAt) - // Deal with last issued possibly being unset, or, on windows seemingly sometimes can - // have zero values if called rapidly enough. 
if mustWaitFor > 0 { select { case <-time.After(mustWaitFor): @@ -157,7 +206,7 @@ func (r *ResourceBasedSlotSupplier) ReserveSlot(ctx context.Context, info worker } } -func (r *ResourceBasedSlotSupplier) TryReserveSlot(info worker.SlotReservationInfo) *worker.SlotPermit { +func (r *ResourceBasedSlotSupplier) TryReserveSlot(info SlotReservationInfo) *SlotPermit { r.lastIssuedMu.Lock() defer r.lastIssuedMu.Unlock() @@ -171,35 +220,28 @@ func (r *ResourceBasedSlotSupplier) TryReserveSlot(info worker.SlotReservationIn } if decision { r.lastSlotIssuedAt = time.Now() - return &worker.SlotPermit{} + return &SlotPermit{} } } return nil } -func (r *ResourceBasedSlotSupplier) MarkSlotUsed(worker.SlotMarkUsedInfo) {} -func (r *ResourceBasedSlotSupplier) ReleaseSlot(worker.SlotReleaseInfo) {} +func (r *ResourceBasedSlotSupplier) MarkSlotUsed(SlotMarkUsedInfo) {} +func (r *ResourceBasedSlotSupplier) ReleaseSlot(SlotReleaseInfo) {} func (r *ResourceBasedSlotSupplier) MaxSlots() int { return 0 } -// SystemInfoSupplier implementations provide information about system resources. -type SystemInfoSupplier interface { - // GetMemoryUsage returns the current system memory usage as a fraction of total memory between - // 0 and 1. - GetMemoryUsage(infoContext *SystemInfoContext) (float64, error) - // GetCpuUsage returns the current system CPU usage as a fraction of total CPU usage between 0 - // and 1. - GetCpuUsage(infoContext *SystemInfoContext) (float64, error) -} - -type SystemInfoContext struct { - Logger log.Logger +// GetSystemInfoSupplier returns the SystemInfoSupplier used by this slot supplier's controller. +func (r *ResourceBasedSlotSupplier) GetSystemInfoSupplier() SystemInfoSupplier { + return r.controller.infoSupplier } // ResourceControllerOptions contains configurable parameters for a ResourceController. // It is recommended to use DefaultResourceControllerOptions to create a ResourceControllerOptions // and only modify the mem/cpu target percent fields. 
+// +// Exposed as: [go.temporal.io/sdk/worker.ResourceControllerOptions] type ResourceControllerOptions struct { // MemTargetPercent is the target overall system memory usage as value 0 and 1 that the // controller will attempt to maintain. @@ -207,8 +249,7 @@ type ResourceControllerOptions struct { // CpuTargetPercent is the target overall system CPU usage as value 0 and 1 that the controller // will attempt to maintain. CpuTargetPercent float64 - // SystemInfoSupplier is the supplier that the controller will use to get system resources. - // Leave this nil to use the default implementation. + // InfoSupplier is the supplier that the controller will use to get system resources. InfoSupplier SystemInfoSupplier MemOutputThreshold float64 @@ -223,6 +264,8 @@ type ResourceControllerOptions struct { } // DefaultResourceControllerOptions returns a ResourceControllerOptions with default values. +// +// Exposed as: [go.temporal.io/sdk/worker.DefaultResourceControllerOptions] func DefaultResourceControllerOptions() ResourceControllerOptions { return ResourceControllerOptions{ MemTargetPercent: 0.8, @@ -238,52 +281,62 @@ func DefaultResourceControllerOptions() ResourceControllerOptions { } } -// A ResourceController is used by ResourceBasedSlotSupplier to make decisions about whether slots +// pidController implements a simple PID controller for resource-based tuning. 
+// This is the standard PID formula: output = Kp*error + Ki*integral + Kd*derivative +type pidController struct { + pGain, iGain, dGain float64 + + prevError float64 + integral float64 + controlSignal float64 +} + +func (c *pidController) update(reference, actual float64, dt time.Duration) { + err := reference - actual + c.integral += err * dt.Seconds() + derivative := (err - c.prevError) / dt.Seconds() + c.controlSignal = c.pGain*err + c.iGain*c.integral + c.dGain*derivative + c.prevError = err +} + +// ResourceController is used by ResourceBasedSlotSupplier to make decisions about whether slots // should be issued based on system resource usage. +// +// Exposed as: [go.temporal.io/sdk/worker.ResourceController] type ResourceController struct { options ResourceControllerOptions mu sync.Mutex infoSupplier SystemInfoSupplier lastRefresh time.Time - memPid *pid.Controller - cpuPid *pid.Controller + memPid *pidController + cpuPid *pidController } // NewResourceController creates a new ResourceController with the provided options. // WARNING: It is important that you do not create multiple ResourceController instances. Since // the controller looks at overall system resources, multiple instances with different configs can // only conflict with one another. 
+// +// Exposed as: [go.temporal.io/sdk/worker.NewResourceController] func NewResourceController(options ResourceControllerOptions) *ResourceController { - var infoSupplier SystemInfoSupplier - if options.InfoSupplier == nil { - infoSupplier = &psUtilSystemInfoSupplier{ - cGroupInfo: newCGroupInfo(), - } - } else { - infoSupplier = options.InfoSupplier - } return &ResourceController{ options: options, - infoSupplier: infoSupplier, - memPid: &pid.Controller{ - Config: pid.ControllerConfig{ - ProportionalGain: options.MemPGain, - IntegralGain: options.MemIGain, - DerivativeGain: options.MemDGain, - }, + infoSupplier: options.InfoSupplier, + memPid: &pidController{ + pGain: options.MemPGain, + iGain: options.MemIGain, + dGain: options.MemDGain, }, - cpuPid: &pid.Controller{ - Config: pid.ControllerConfig{ - ProportionalGain: options.CpuPGain, - IntegralGain: options.CpuIGain, - DerivativeGain: options.CpuDGain, - }, + cpuPid: &pidController{ + pGain: options.CpuPGain, + iGain: options.CpuIGain, + dGain: options.CpuDGain, }, } } -func (rc *ResourceController) pidDecision(logger log.Logger, metricsHandler client.MetricsHandler) (bool, error) { +func (rc *ResourceController) pidDecision(logger log.Logger, metricsHandler metrics.Handler) (bool, error) { rc.mu.Lock() defer rc.mu.Unlock() @@ -306,110 +359,18 @@ func (rc *ResourceController) pidDecision(logger log.Logger, metricsHandler clie if elapsedTime <= 0 { elapsedTime = 1 * time.Millisecond } - rc.memPid.Update(pid.ControllerInput{ - ReferenceSignal: rc.options.MemTargetPercent, - ActualSignal: memUsage, - SamplingInterval: elapsedTime, - }) - rc.cpuPid.Update(pid.ControllerInput{ - ReferenceSignal: rc.options.CpuTargetPercent, - ActualSignal: cpuUsage, - SamplingInterval: elapsedTime, - }) + rc.memPid.update(rc.options.MemTargetPercent, memUsage, elapsedTime) + rc.cpuPid.update(rc.options.CpuTargetPercent, cpuUsage, elapsedTime) rc.lastRefresh = time.Now() - return rc.memPid.State.ControlSignal > 
rc.options.MemOutputThreshold && - rc.cpuPid.State.ControlSignal > rc.options.CpuOutputThreshold, nil + return rc.memPid.controlSignal > rc.options.MemOutputThreshold && + rc.cpuPid.controlSignal > rc.options.CpuOutputThreshold, nil } -func (rc *ResourceController) publishResourceMetrics(metricsHandler client.MetricsHandler, memUsage, cpuUsage float64) { +func (rc *ResourceController) publishResourceMetrics(metricsHandler metrics.Handler, memUsage, cpuUsage float64) { if metricsHandler == nil { return } metricsHandler.Gauge(resourceSlotsMemUsage).Update(memUsage * 100) metricsHandler.Gauge(resourceSlotsCPUUsage).Update(cpuUsage * 100) } - -type psUtilSystemInfoSupplier struct { - logger log.Logger - mu sync.Mutex - lastRefresh time.Time - - lastMemStat *mem.VirtualMemoryStat - lastCpuUsage float64 - - stopTryingToGetCGroupInfo bool - cGroupInfo cGroupInfo -} - -type cGroupInfo interface { - // Update requests an update of the cgroup stats. This is a no-op if not in a cgroup. Returns - // true if cgroup stats should continue to be updated, false if not in a cgroup or the returned - // error is considered unrecoverable. - Update() (bool, error) - // GetLastMemUsage returns last known memory usage as a fraction of the cgroup limit. 0 if not - // in a cgroup or limit is not set. - GetLastMemUsage() float64 - // GetLastCPUUsage returns last known CPU usage as a fraction of the cgroup limit. 0 if not in a - // cgroup or limit is not set. 
- GetLastCPUUsage() float64 -} - -func (p *psUtilSystemInfoSupplier) GetMemoryUsage(infoContext *SystemInfoContext) (float64, error) { - if err := p.maybeRefresh(infoContext); err != nil { - return 0, err - } - lastCGroupMem := p.cGroupInfo.GetLastMemUsage() - if lastCGroupMem != 0 { - return lastCGroupMem, nil - } - return p.lastMemStat.UsedPercent / 100, nil -} - -func (p *psUtilSystemInfoSupplier) GetCpuUsage(infoContext *SystemInfoContext) (float64, error) { - if err := p.maybeRefresh(infoContext); err != nil { - return 0, err - } - - lastCGroupCPU := p.cGroupInfo.GetLastCPUUsage() - if lastCGroupCPU != 0 { - return lastCGroupCPU, nil - } - return p.lastCpuUsage / 100, nil -} - -func (p *psUtilSystemInfoSupplier) maybeRefresh(infoContext *SystemInfoContext) error { - if time.Since(p.lastRefresh) < 100*time.Millisecond { - return nil - } - p.mu.Lock() - defer p.mu.Unlock() - // Double check refresh is still needed - if time.Since(p.lastRefresh) < 100*time.Millisecond { - return nil - } - ctx, cancelFn := context.WithTimeout(context.Background(), 1*time.Second) - defer cancelFn() - memStat, err := mem.VirtualMemoryWithContext(ctx) - if err != nil { - return err - } - cpuUsage, err := cpu.PercentWithContext(ctx, 0, false) - if err != nil { - return err - } - - p.lastMemStat = memStat - p.lastCpuUsage = cpuUsage[0] - - if runtime.GOOS == "linux" && !p.stopTryingToGetCGroupInfo { - continueUpdates, err := p.cGroupInfo.Update() - if err != nil { - infoContext.Logger.Warn("Failed to get cgroup stats", "error", err) - } - p.stopTryingToGetCGroupInfo = !continueUpdates - } - - p.lastRefresh = time.Now() - return nil -} diff --git a/internal/tuning.go b/internal/tuning.go index 72a6c3ac6..f6597680f 100644 --- a/internal/tuning.go +++ b/internal/tuning.go @@ -3,7 +3,6 @@ package internal import ( "context" "fmt" - "reflect" "sync" "sync/atomic" @@ -131,17 +130,15 @@ type SlotSupplier interface { MaxSlots() int } -// getSlotSupplierKind returns the kind/type name of a slot 
supplier. If the supplier implements -// a Kind() string method, it uses that. Otherwise, it falls back to reflection on the type name. func getSlotSupplierKind(s SlotSupplier) string { - if k, ok := s.(interface{ Kind() string }); ok { - return k.Kind() + switch s.(type) { + case *FixedSizeSlotSupplier: + return "Fixed" + case *ResourceBasedSlotSupplier: + return "ResourceBased" + default: + return "Custom" } - t := reflect.TypeOf(s) - if t.Kind() == reflect.Ptr { - return t.Elem().Name() - } - return t.Name() } // CompositeTuner allows you to build a tuner from multiple slot suppliers. @@ -298,9 +295,6 @@ func (f *FixedSizeSlotSupplier) ReleaseSlot(SlotReleaseInfo) { func (f *FixedSizeSlotSupplier) MaxSlots() int { return f.numSlots } -func (f *FixedSizeSlotSupplier) Kind() string { - return "Fixed" -} type slotReservationData struct { taskQueue string diff --git a/internal/worker.go b/internal/worker.go index 56e7aa7c0..83efaf21c 100644 --- a/internal/worker.go +++ b/internal/worker.go @@ -35,20 +35,6 @@ type ( isPollerBehavior() } - // TunerHostMetricsProvider provides host-level CPU and memory metrics for worker heartbeats. - // This interface is typically implemented by a [WorkerTuner] to provide metrics from the same - // source used for tuning decisions, avoiding double-measurement. If the tuner passed to - // WorkerOptions implements this interface, the SDK will automatically use it for heartbeat - // metrics. Otherwise, heartbeats will report 0 for CPU/memory usage. - // - // Exposed as: [go.temporal.io/sdk/worker.TunerHostMetricsProvider] - TunerHostMetricsProvider interface { - // GetCpuUsage returns the current host CPU usage as a fraction (0.0-1.0) - GetCpuUsage() (float64, error) - // GetMemoryUsage returns the current host memory usage as a fraction (0.0-1.0) - GetMemoryUsage() (float64, error) - } - // PollerBehaviorAutoscalingOptions is the options for NewPollerBehaviorAutoscaling. 
// // Exposed as: [go.temporal.io/sdk/worker.PollerBehaviorAutoscalingOptions] diff --git a/worker/tuning.go b/worker/tuning.go index a04800c5c..1a88aa778 100644 --- a/worker/tuning.go +++ b/worker/tuning.go @@ -46,3 +46,75 @@ func NewCompositeTuner(options CompositeTunerOptions) (WorkerTuner, error) { func NewFixedSizeSlotSupplier(numSlots int) (SlotSupplier, error) { return internal.NewFixedSizeSlotSupplier(numSlots) } + +// SystemInfoSupplier implementations provide information about system resources. +// Use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based implementation, +// or provide your own. +type SystemInfoSupplier = internal.SystemInfoSupplier + +// SystemInfoContext provides context for SystemInfoSupplier calls. +type SystemInfoContext = internal.SystemInfoContext + +// HasSystemInfoSupplier is an optional interface that SlotSupplier implementations can implement +// to expose their SystemInfoSupplier. +type HasSystemInfoSupplier = internal.HasSystemInfoSupplier + +// ResourceBasedTunerOptions configures a resource-based tuner. +type ResourceBasedTunerOptions = internal.ResourceBasedTunerOptions + +// NewResourceBasedTuner creates a WorkerTuner that dynamically adjusts the number of slots based +// on system resources. Specify the target CPU and memory usage as a value between 0 and 1. +// +// InfoSupplier is required - use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based +// implementation, or provide your own. +func NewResourceBasedTuner(opts ResourceBasedTunerOptions) (WorkerTuner, error) { + return internal.NewResourceBasedTuner(opts) +} + +// ResourceBasedSlotSupplierOptions configures a particular ResourceBasedSlotSupplier. +type ResourceBasedSlotSupplierOptions = internal.ResourceBasedSlotSupplierOptions + +// ResourceBasedSlotSupplier is a SlotSupplier that issues slots based on system resource usage. 
+type ResourceBasedSlotSupplier = internal.ResourceBasedSlotSupplier + +// NewResourceBasedSlotSupplier creates a ResourceBasedSlotSupplier given the provided +// ResourceController and ResourceBasedSlotSupplierOptions. All ResourceBasedSlotSupplier instances +// must use the same ResourceController. +func NewResourceBasedSlotSupplier( + controller *ResourceController, + options ResourceBasedSlotSupplierOptions, +) (*ResourceBasedSlotSupplier, error) { + return internal.NewResourceBasedSlotSupplier(controller, options) +} + +// ResourceControllerOptions contains configurable parameters for a ResourceController. +// It is recommended to use DefaultResourceControllerOptions to create a ResourceControllerOptions +// and only modify the mem/cpu target percent fields. +type ResourceControllerOptions = internal.ResourceControllerOptions + +// ResourceController is used by ResourceBasedSlotSupplier to make decisions about whether slots +// should be issued based on system resource usage. +type ResourceController = internal.ResourceController + +// NewResourceController creates a new ResourceController with the provided options. +// WARNING: It is important that you do not create multiple ResourceController instances. Since +// the controller looks at overall system resources, multiple instances with different configs can +// only conflict with one another. +func NewResourceController(options ResourceControllerOptions) *ResourceController { + return internal.NewResourceController(options) +} + +// DefaultResourceControllerOptions returns a ResourceControllerOptions with default values. +func DefaultResourceControllerOptions() ResourceControllerOptions { + return internal.DefaultResourceControllerOptions() +} + +// DefaultWorkflowResourceBasedSlotSupplierOptions returns default options for workflow slot suppliers. 
+func DefaultWorkflowResourceBasedSlotSupplierOptions() ResourceBasedSlotSupplierOptions { + return internal.DefaultWorkflowResourceBasedSlotSupplierOptions() +} + +// DefaultActivityResourceBasedSlotSupplierOptions returns default options for activity slot suppliers. +func DefaultActivityResourceBasedSlotSupplierOptions() ResourceBasedSlotSupplierOptions { + return internal.DefaultActivityResourceBasedSlotSupplierOptions() +} diff --git a/worker/worker.go b/worker/worker.go index 11785175c..f0a5b8eb2 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -236,15 +236,6 @@ type ( // ReplayWorkflowHistoryOptions are options for replaying a workflow. ReplayWorkflowHistoryOptions = internal.ReplayWorkflowHistoryOptions - - // TunerHostMetricsProvider provides host-level CPU and memory metrics for worker heartbeats. - // This interface is typically implemented by a [WorkerTuner] to provide metrics from the same - // source used for tuning decisions, avoiding double-measurement. If the tuner passed to - // [Options] implements this interface, the SDK will automatically use it for heartbeat metrics. - // Otherwise, heartbeats will report 0 for CPU/memory usage. 
- // - // NOTE: Experimental - TunerHostMetricsProvider = internal.TunerHostMetricsProvider ) var _ WorkflowRegistry = (WorkflowReplayer)(nil) From ebf1064820b590fb657a7e82c2f0f6047dc58740 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 27 Jan 2026 23:14:53 -0800 Subject: [PATCH 06/30] make methods/structs private, remove aw.workerHeartbeatManager --- internal/client.go | 2 +- internal/internal_worker.go | 101 ++++++++------------------ internal/internal_worker_heartbeat.go | 20 ++--- internal/internal_workflow_client.go | 2 +- 4 files changed, 43 insertions(+), 82 deletions(-) diff --git a/internal/client.go b/internal/client.go index 24cc18d09..bf0c063a2 100644 --- a/internal/client.go +++ b/internal/client.go @@ -1187,7 +1187,7 @@ func NewServiceClient(workflowServiceClient workflowservice.WorkflowServiceClien } if heartbeatInterval > 0 { - client.heartbeatManager = NewHeartbeatManager(client, heartbeatInterval, client.logger) + client.heartbeatManager = newHeartbeatManager(client, heartbeatInterval, client.logger) } // Create outbound interceptor by wrapping backwards through chain diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 227c148a6..3b18db627 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -229,21 +229,6 @@ type ( // The build id specific to this worker BuildID string } - - // workerHeartbeatManager includes all information needed to report worker heartbeats. 
- workerHeartbeatManager struct { - heartbeatMetrics *HeartbeatMetricsHandler - heartbeatCallback func() *workerpb.WorkerHeartbeat - - // Slot suppliers for heartbeat reporting - workflowTaskSlotSupplier *trackingSlotSupplier - activityTaskSlotSupplier *trackingSlotSupplier - localActivitySlotSupplier *trackingSlotSupplier - nexusTaskSlotSupplier *trackingSlotSupplier // TODO: nexus worker only gets started when worker is started, need to find a way to send kind over to heartbeat callback - - // SystemInfoSupplier for CPU/memory reporting in heartbeats - systemInfoSupplier SystemInfoSupplier - } ) var debugMode = os.Getenv("TEMPORAL_DEBUG") != "" @@ -1200,8 +1185,8 @@ type AggregatedWorker struct { plugins []WorkerPlugin pluginRegistryOptions *WorkerPluginConfigureWorkerRegistryOptions // Never nil - workerHeartbeatManager *workerHeartbeatManager - heartbeatCallback func() *workerpb.WorkerHeartbeat + heartbeatMetrics *HeartbeatMetricsHandler + heartbeatCallback func() *workerpb.WorkerHeartbeat } // RegisterWorkflow registers workflow implementation with the AggregatedWorker @@ -1377,9 +1362,6 @@ func (aw *AggregatedWorker) start() error { if err := aw.nexusWorker.Start(); err != nil { return fmt.Errorf("failed to start a nexus worker: %w", err) } - if aw.nexusWorker.worker != nil && aw.workerHeartbeatManager != nil { - aw.workerHeartbeatManager.nexusTaskSlotSupplier = aw.nexusWorker.worker.slotSupplier - } } if aw.client.workerHeartbeatInterval > 0 { @@ -1515,14 +1497,14 @@ func (aw *AggregatedWorker) registerHeartbeatWorker() error { if aw.client.heartbeatManager == nil { return nil } - return aw.client.heartbeatManager.RegisterWorker(aw) + return aw.client.heartbeatManager.registerWorker(aw) } func (aw *AggregatedWorker) unregisterHeartbeatWorker() { - if aw.client.heartbeatManager == nil || aw.workerHeartbeatManager == nil { + if aw.client.heartbeatManager == nil { return } - aw.client.heartbeatManager.UnregisterWorker(aw) + 
aw.client.heartbeatManager.unregisterWorker(aw) } // shutdownWorker sends a ShutdownWorker RPC to notify the server that this worker is shutting down. @@ -1536,8 +1518,8 @@ func (aw *AggregatedWorker) shutdownWorker() { defer cancel() var heartbeat *workerpb.WorkerHeartbeat - if aw.workerHeartbeatManager != nil && aw.workerHeartbeatManager.heartbeatCallback != nil { - heartbeat = aw.workerHeartbeatManager.heartbeatCallback() + if aw.heartbeatCallback != nil { + heartbeat = aw.heartbeatCallback() heartbeat.Status = enumspb.WORKER_STATUS_SHUTTING_DOWN } @@ -2123,12 +2105,10 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke baseMetricsHandler := client.metricsHandler.WithTags(metrics.TaskQueueTags(taskQueue)) var metricsHandler metrics.Handler var heartbeatMetrics *HeartbeatMetricsHandler - var heartbeatManager *workerHeartbeatManager if client.workerHeartbeatInterval != 0 { heartbeatMetrics = NewHeartbeatMetricsHandler(baseMetricsHandler) metricsHandler = heartbeatMetrics - heartbeatManager = &workerHeartbeatManager{} } else { metricsHandler = baseMetricsHandler } @@ -2262,7 +2242,7 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke } var heartbeatCallback func() *workerpb.WorkerHeartbeat - if heartbeatManager != nil { + if client.workerHeartbeatInterval != 0 { startTime := timestamppb.New(time.Now()) hostname, _ := os.Hostname() previousHeartbeatTime := time.Now() @@ -2322,58 +2302,39 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke Plugins: pluginInfos, } - if aw.workerHeartbeatManager.heartbeatMetrics != nil { - if aw.workerHeartbeatManager.workflowTaskSlotSupplier != nil { - populateOpts.WorkflowSlotSupplierKind = aw.workerHeartbeatManager.workflowTaskSlotSupplier.GetSlotSupplierKind() - } - if aw.workerHeartbeatManager.activityTaskSlotSupplier != nil { - populateOpts.ActivitySlotSupplierKind = 
aw.workerHeartbeatManager.activityTaskSlotSupplier.GetSlotSupplierKind() + if aw.heartbeatMetrics != nil { + if aw.workflowWorker != nil { + populateOpts.WorkflowSlotSupplierKind = aw.workflowWorker.worker.slotSupplier.GetSlotSupplierKind() + populateOpts.LocalActivitySlotSupplierKind = aw.workflowWorker.localActivityWorker.slotSupplier.GetSlotSupplierKind() } - if aw.workerHeartbeatManager.localActivitySlotSupplier != nil { - populateOpts.LocalActivitySlotSupplierKind = aw.workerHeartbeatManager.localActivitySlotSupplier.GetSlotSupplierKind() + if aw.activityWorker != nil { + populateOpts.ActivitySlotSupplierKind = aw.activityWorker.worker.slotSupplier.GetSlotSupplierKind() } - if aw.workerHeartbeatManager.nexusTaskSlotSupplier != nil { - populateOpts.NexusSlotSupplierKind = aw.workerHeartbeatManager.nexusTaskSlotSupplier.GetSlotSupplierKind() + if aw.nexusWorker != nil { + populateOpts.NexusSlotSupplierKind = aw.nexusWorker.worker.slotSupplier.GetSlotSupplierKind() } - aw.workerHeartbeatManager.heartbeatMetrics.PopulateHeartbeat(hb, populateOpts) + aw.heartbeatMetrics.PopulateHeartbeat(hb, populateOpts) } return hb } } - if heartbeatManager != nil { - heartbeatManager.heartbeatMetrics = heartbeatMetrics - heartbeatManager.heartbeatCallback = heartbeatCallback - heartbeatManager.systemInfoSupplier = systemInfoSupplier - } - aw = &AggregatedWorker{ - client: client, - workflowWorker: workflowWorker, - activityWorker: activityWorker, - sessionWorker: sessionWorker, - logger: workerParams.Logger, - registry: registry, - stopC: make(chan struct{}), - capabilities: &capabilities, - executionParams: workerParams, - workerInstanceKey: workerInstanceKey, - plugins: plugins, - pluginRegistryOptions: &pluginRegistryOptions, - workerHeartbeatManager: heartbeatManager, - } - - if aw.workerHeartbeatManager != nil { - if workflowWorker != nil && workflowWorker.worker != nil { - aw.workerHeartbeatManager.workflowTaskSlotSupplier = workflowWorker.worker.slotSupplier - } - if 
workflowWorker != nil && workflowWorker.localActivityWorker != nil { - aw.workerHeartbeatManager.localActivitySlotSupplier = workflowWorker.localActivityWorker.slotSupplier - } - if activityWorker != nil && activityWorker.worker != nil { - aw.workerHeartbeatManager.activityTaskSlotSupplier = activityWorker.worker.slotSupplier - } + client: client, + workflowWorker: workflowWorker, + activityWorker: activityWorker, + sessionWorker: sessionWorker, + logger: workerParams.Logger, + registry: registry, + stopC: make(chan struct{}), + capabilities: &capabilities, + executionParams: workerParams, + workerInstanceKey: workerInstanceKey, + plugins: plugins, + pluginRegistryOptions: &pluginRegistryOptions, + heartbeatMetrics: heartbeatMetrics, + heartbeatCallback: heartbeatCallback, } // Set memoized start as a once-value that invokes plugins first diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 5d3b5daa8..1317d8ac5 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -15,8 +15,8 @@ import ( "google.golang.org/grpc/status" ) -// HeartbeatManager manages heartbeat workers across namespaces for a client. -type HeartbeatManager struct { +// heartbeatManager manages heartbeat workers across namespaces for a client. +type heartbeatManager struct { client *WorkflowClient interval time.Duration logger log.Logger @@ -25,9 +25,9 @@ type HeartbeatManager struct { workers map[string]*sharedNamespaceWorker // namespace -> worker } -// NewHeartbeatManager creates a new HeartbeatManager. -func NewHeartbeatManager(client *WorkflowClient, interval time.Duration, logger log.Logger) *HeartbeatManager { - return &HeartbeatManager{ +// newHeartbeatManager creates a new heartbeatManager. 
+func newHeartbeatManager(client *WorkflowClient, interval time.Duration, logger log.Logger) *heartbeatManager { + return &heartbeatManager{ client: client, interval: interval, logger: logger, @@ -35,8 +35,8 @@ func NewHeartbeatManager(client *WorkflowClient, interval time.Duration, logger } } -// RegisterWorker registers a worker's heartbeat callback with the shared heartbeat worker for the namespace. -func (m *HeartbeatManager) RegisterWorker( +// registerWorker registers a worker's heartbeat callback with the shared heartbeat worker for the namespace. +func (m *heartbeatManager) registerWorker( worker *AggregatedWorker, ) error { m.mu.Lock() @@ -82,9 +82,9 @@ func (m *HeartbeatManager) RegisterWorker( return nil } -// UnregisterWorker removes a worker's heartbeat callback. If no callbacks remain for the namespace, +// unregisterWorker removes a worker's heartbeat callback. If no callbacks remain for the namespace, // the shared heartbeat worker is stopped. -func (m *HeartbeatManager) UnregisterWorker(worker *AggregatedWorker) { +func (m *heartbeatManager) unregisterWorker(worker *AggregatedWorker) { m.mu.Lock() defer m.mu.Unlock() @@ -204,7 +204,7 @@ func (hw *sharedNamespaceWorker) sendHeartbeats() { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - _, err := hw.client.RecordWorkerHeartbeat(ctx, &workflowservice.RecordWorkerHeartbeatRequest{ + _, err := hw.client.RecordWorkerHeartbeat(context.Background(), &workflowservice.RecordWorkerHeartbeatRequest{ Namespace: hw.namespace, WorkerHeartbeat: heartbeats, }) diff --git a/internal/internal_workflow_client.go b/internal/internal_workflow_client.go index 94d3db685..78f788284 100644 --- a/internal/internal_workflow_client.go +++ b/internal/internal_workflow_client.go @@ -83,7 +83,7 @@ type ( getSystemInfoTimeout time.Duration workerHeartbeatInterval time.Duration workerGroupingKey string - heartbeatManager *HeartbeatManager + heartbeatManager *heartbeatManager // The pointer 
value is shared across multiple clients. If non-nil, only // access/mutate atomically. From b8893b97f83c5abdf50eaca01dc6c7e0db0e0f5a Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 27 Jan 2026 23:28:36 -0800 Subject: [PATCH 07/30] tighten lock, consolidate describeNamespace calls to a single call in AggregatedWorker.start() --- internal/internal_nexus_worker.go | 4 ---- internal/internal_worker.go | 30 +++++---------------------- internal/internal_worker_heartbeat.go | 13 ++++-------- internal/internal_workflow_client.go | 2 +- 4 files changed, 10 insertions(+), 39 deletions(-) diff --git a/internal/internal_nexus_worker.go b/internal/internal_nexus_worker.go index ba38bd6ef..d5f844ed5 100644 --- a/internal/internal_nexus_worker.go +++ b/internal/internal_nexus_worker.go @@ -78,10 +78,6 @@ func newNexusWorker(opts nexusWorkerOptions) (*nexusWorker, error) { // Start the worker. func (w *nexusWorker) Start() error { - err := verifyNamespaceExist(w.workflowService, w.executionParameters.MetricsHandler, w.executionParameters.Namespace, w.worker.logger) - if err != nil { - return err - } w.worker.Start() return nil } diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 3b18db627..6388adcac 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -284,23 +284,6 @@ func (params *workerExecutionParameters) isInternalWorker() bool { return params.Namespace == "temporal-system" || params.TaskQueue == "temporal-sys-per-ns-tq" } -// verifyNamespaceExist does a DescribeNamespace operation on the specified namespace with backoff/retry -func verifyNamespaceExist( - client workflowservice.WorkflowServiceClient, - metricsHandler metrics.Handler, - namespace string, - logger log.Logger, -) error { - ctx := context.Background() - if namespace == "" { - return errors.New("namespace cannot be empty") - } - grpcCtx, cancel := newGRPCContext(ctx, grpcMetricsHandler(metricsHandler), defaultGrpcRetryParameters(ctx)) - defer cancel() - _, 
err := client.DescribeNamespace(grpcCtx, &workflowservice.DescribeNamespaceRequest{Namespace: namespace}) - return err -} - func newWorkflowWorkerInternal(client *WorkflowClient, params workerExecutionParameters, ppMgr pressurePointMgr, overrides *workerOverrides, registry *registry) *workflowWorker { workerStopChannel := make(chan struct{}) params.WorkerStopChannel = getReadOnlyChannel(workerStopChannel) @@ -426,10 +409,6 @@ func newWorkflowTaskWorkerInternal( // Start the worker. func (ww *workflowWorker) Start() error { - err := verifyNamespaceExist(ww.workflowService, ww.executionParameters.MetricsHandler, ww.executionParameters.Namespace, ww.worker.logger) - if err != nil { - return err - } ww.localActivityWorker.Start() ww.worker.Start() return nil // TODO: propagate error @@ -573,10 +552,6 @@ func newActivityWorker( // Start the worker. func (aw *activityWorker) Start() error { - err := verifyNamespaceExist(aw.workflowService, aw.executionParameters.MetricsHandler, aw.executionParameters.Namespace, aw.worker.logger) - if err != nil { - return err - } aw.worker.Start() return nil // TODO: propagate errors } @@ -1296,6 +1271,11 @@ func (aw *AggregatedWorker) start() error { } proto.Merge(aw.capabilities, capabilities) + // Load namespace capabilities (also verifies namespace exists and caches the result) + if _, err := aw.client.loadNamespaceCapabilities(context.Background()); err != nil { + return err + } + if !util.IsInterfaceNil(aw.workflowWorker) { if err := aw.workflowWorker.Start(); err != nil { return err diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 1317d8ac5..8346f45aa 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -40,10 +40,10 @@ func (m *heartbeatManager) registerWorker( worker *AggregatedWorker, ) error { m.mu.Lock() - defer m.mu.Unlock() namespace := worker.executionParams.Namespace hw, ok := m.workers[namespace] + m.mu.Unlock() if !ok { 
capabilities, err := m.client.loadNamespaceCapabilities(context.Background()) if err != nil { @@ -71,7 +71,9 @@ func (m *heartbeatManager) registerWorker( } hw.nexusWorker = nexusWorker + m.mu.Lock() m.workers[namespace] = hw + m.mu.Unlock() go hw.run() } @@ -197,14 +199,7 @@ func (hw *sharedNamespaceWorker) sendHeartbeats() { heartbeats = append(heartbeats, hb) } - if len(heartbeats) == 0 { - return - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - _, err := hw.client.RecordWorkerHeartbeat(context.Background(), &workflowservice.RecordWorkerHeartbeatRequest{ + _, err := hw.client.recordWorkerHeartbeat(context.Background(), &workflowservice.RecordWorkerHeartbeatRequest{ Namespace: hw.namespace, WorkerHeartbeat: heartbeats, }) diff --git a/internal/internal_workflow_client.go b/internal/internal_workflow_client.go index 78f788284..fd6f37691 100644 --- a/internal/internal_workflow_client.go +++ b/internal/internal_workflow_client.go @@ -1427,7 +1427,7 @@ func (wc *WorkflowClient) WorkerDeploymentClient() WorkerDeploymentClient { } } -func (wc *WorkflowClient) RecordWorkerHeartbeat(ctx context.Context, request *workflowservice.RecordWorkerHeartbeatRequest) (*workflowservice.RecordWorkerHeartbeatResponse, error) { +func (wc *WorkflowClient) recordWorkerHeartbeat(ctx context.Context, request *workflowservice.RecordWorkerHeartbeatRequest) (*workflowservice.RecordWorkerHeartbeatResponse, error) { if err := wc.ensureInitialized(ctx); err != nil { return nil, err } From a6135de06226630977188e892232026cdcc6f071 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 28 Jan 2026 14:02:28 -0800 Subject: [PATCH 08/30] simplify heartbeat metrics, decouple poller/worker type from WithTags() --- internal/internal_nexus_task_poller.go | 2 +- internal/internal_task_pollers.go | 9 +- internal/internal_worker.go | 6 +- internal/internal_worker_base.go | 3 + internal/internal_worker_heartbeat_metrics.go | 314 +++++++----------- 5 files 
changed, 134 insertions(+), 200 deletions(-) diff --git a/internal/internal_nexus_task_poller.go b/internal/internal_nexus_task_poller.go index 43d4441b3..36c6f051d 100644 --- a/internal/internal_nexus_task_poller.go +++ b/internal/internal_nexus_task_poller.go @@ -90,7 +90,7 @@ func (ntp *nexusTaskPoller) poll(ctx context.Context) (taskForWorker, error) { return nil, nil } - RecordPollSuccess(ntp.metricsHandler, metrics.PollerTypeNexusTask) + recordPollSuccessIfHeartbeat(ntp.metricsHandler, metrics.PollerTypeNexusTask) return &nexusTask{task: response}, nil } diff --git a/internal/internal_task_pollers.go b/internal/internal_task_pollers.go index cd2224e25..bd02fcded 100644 --- a/internal/internal_task_pollers.go +++ b/internal/internal_task_pollers.go @@ -202,6 +202,9 @@ type ( ) func newNumPollerMetric(metricsHandler metrics.Handler, pollerType string) *numPollerMetric { + if heartbeatHandler, isHeartbeat := metricsHandler.(*heartbeatMetricsHandler); isHeartbeat { + metricsHandler = heartbeatHandler.forPoller(pollerType) + } return &numPollerMetric{ gauge: metricsHandler.WithTags(metrics.PollerTags(pollerType)).Gauge(metrics.NumPoller), } @@ -970,9 +973,9 @@ func (wtp *workflowTaskPoller) poll(ctx context.Context) (taskForWorker, error) } if request.TaskQueue.GetKind() == enumspb.TASK_QUEUE_KIND_STICKY { - RecordPollSuccess(wtp.metricsHandler, metrics.PollerTypeWorkflowStickyTask) + recordPollSuccessIfHeartbeat(wtp.metricsHandler, metrics.PollerTypeWorkflowStickyTask) } else { - RecordPollSuccess(wtp.metricsHandler, metrics.PollerTypeWorkflowTask) + recordPollSuccessIfHeartbeat(wtp.metricsHandler, metrics.PollerTypeWorkflowTask) } wtp.updateBacklog(request.TaskQueue.GetKind(), response.GetBacklogCountHint()) @@ -1173,7 +1176,7 @@ func (atp *activityTaskPoller) poll(ctx context.Context) (taskForWorker, error) return &activityTask{}, nil } - RecordPollSuccess(atp.metricsHandler, metrics.PollerTypeActivityTask) + recordPollSuccessIfHeartbeat(atp.metricsHandler, 
metrics.PollerTypeActivityTask) workflowType := response.WorkflowType.GetName() activityType := response.ActivityType.GetName() diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 6388adcac..0e5c58732 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -1160,7 +1160,7 @@ type AggregatedWorker struct { plugins []WorkerPlugin pluginRegistryOptions *WorkerPluginConfigureWorkerRegistryOptions // Never nil - heartbeatMetrics *HeartbeatMetricsHandler + heartbeatMetrics *heartbeatMetricsHandler heartbeatCallback func() *workerpb.WorkerHeartbeat } @@ -2084,10 +2084,10 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke baseMetricsHandler := client.metricsHandler.WithTags(metrics.TaskQueueTags(taskQueue)) var metricsHandler metrics.Handler - var heartbeatMetrics *HeartbeatMetricsHandler + var heartbeatMetrics *heartbeatMetricsHandler if client.workerHeartbeatInterval != 0 { - heartbeatMetrics = NewHeartbeatMetricsHandler(baseMetricsHandler) + heartbeatMetrics = newHeartbeatMetricsHandler(baseMetricsHandler) metricsHandler = heartbeatMetrics } else { metricsHandler = baseMetricsHandler diff --git a/internal/internal_worker_base.go b/internal/internal_worker_base.go index 756667de5..7eeb9e0b5 100644 --- a/internal/internal_worker_base.go +++ b/internal/internal_worker_base.go @@ -326,6 +326,9 @@ func newBaseWorker( ) *baseWorker { ctx, cancel := context.WithCancel(context.Background()) logger := log.With(options.logger, tagWorkerType, options.workerType) + if heartbeatHandler, isHeartbeat := options.metricsHandler.(*heartbeatMetricsHandler); isHeartbeat { + options.metricsHandler = heartbeatHandler.forWorker(options.workerType).WithTags(metrics.WorkerTags(options.workerType)) + } metricsHandler := options.metricsHandler.WithTags(metrics.WorkerTags(options.workerType)) tss := newTrackingSlotSupplier(options.slotSupplier, trackingSlotSupplierOptions{ logger: logger, diff --git 
a/internal/internal_worker_heartbeat_metrics.go b/internal/internal_worker_heartbeat_metrics.go index 8906aaa06..acf75ab46 100644 --- a/internal/internal_worker_heartbeat_metrics.go +++ b/internal/internal_worker_heartbeat_metrics.go @@ -10,155 +10,101 @@ import ( "go.temporal.io/sdk/internal/common/metrics" ) -type heartbeatMetric int - -const ( - metricStickyCacheHit heartbeatMetric = iota - metricStickyCacheMiss - metricStickyCacheSize - - metricWorkflowTaskFailures - metricActivityTaskFailures - metricLocalActivityTaskFailures - metricNexusTaskFailures - - metricWorkflowSlotsAvailable - metricWorkflowSlotsUsed - metricActivitySlotsAvailable - metricActivitySlotsUsed - metricLocalActivitySlotsAvailable - metricLocalActivitySlotsUsed - metricNexusSlotsAvailable - metricNexusSlotsUsed - - metricWorkflowTasksProcessed - metricActivityTasksProcessed - metricLocalActivityTasksProcessed - metricNexusTasksProcessed - - metricWorkflowPollerCount - metricWorkflowStickyPollerCount - metricActivityPollerCount - metricNexusPollerCount - - metricWorkflowLastPoll - metricWorkflowStickyLastPoll - metricActivityLastPoll - metricNexusLastPoll - - metricCount -) - -var counterMetricMap = map[string]heartbeatMetric{ - metrics.StickyCacheHit: metricStickyCacheHit, - metrics.StickyCacheMiss: metricStickyCacheMiss, - metrics.WorkflowTaskExecutionFailureCounter: metricWorkflowTaskFailures, - metrics.ActivityExecutionFailedCounter: metricActivityTaskFailures, - metrics.LocalActivityExecutionFailedCounter: metricLocalActivityTaskFailures, - metrics.NexusTaskExecutionFailedCounter: metricNexusTaskFailures, -} - -var timerMetricMap = map[string]heartbeatMetric{ - metrics.WorkflowTaskExecutionLatency: metricWorkflowTasksProcessed, - metrics.ActivityExecutionLatency: metricActivityTasksProcessed, - metrics.LocalActivityExecutionLatency: metricLocalActivityTasksProcessed, - metrics.NexusTaskExecutionLatency: metricNexusTasksProcessed, -} - -var slotsAvailableByWorkerType = 
map[string]heartbeatMetric{ - "WorkflowWorker": metricWorkflowSlotsAvailable, - "ActivityWorker": metricActivitySlotsAvailable, - "LocalActivityWorker": metricLocalActivitySlotsAvailable, - "NexusWorker": metricNexusSlotsAvailable, -} - -var slotsUsedByWorkerType = map[string]heartbeatMetric{ - "WorkflowWorker": metricWorkflowSlotsUsed, - "ActivityWorker": metricActivitySlotsUsed, - "LocalActivityWorker": metricLocalActivitySlotsUsed, - "NexusWorker": metricNexusSlotsUsed, -} +// Metrics we capture for heartbeat reporting. +var ( + capturedCounters = map[string]bool{ + metrics.StickyCacheHit: true, + metrics.StickyCacheMiss: true, + metrics.WorkflowTaskExecutionFailureCounter: true, + metrics.ActivityExecutionFailedCounter: true, + metrics.LocalActivityExecutionFailedCounter: true, + metrics.NexusTaskExecutionFailedCounter: true, + } -var pollerCountByPollerType = map[string]heartbeatMetric{ - metrics.PollerTypeWorkflowTask: metricWorkflowPollerCount, - metrics.PollerTypeWorkflowStickyTask: metricWorkflowStickyPollerCount, - metrics.PollerTypeActivityTask: metricActivityPollerCount, - metrics.PollerTypeNexusTask: metricNexusPollerCount, -} + // Timer recordings are counted (not their latencies) to track tasks processed. + capturedTimers = map[string]bool{ + metrics.WorkflowTaskExecutionLatency: true, + metrics.ActivityExecutionLatency: true, + metrics.LocalActivityExecutionLatency: true, + metrics.NexusTaskExecutionLatency: true, + } +) -// HeartbeatMetricsHandler wraps a metrics handler and captures specific metrics -// in memory that are needed for worker heartbeats -type HeartbeatMetricsHandler struct { +// heartbeatMetricsHandler wraps a metrics handler and captures specific metrics +// in memory for worker heartbeats. +type heartbeatMetricsHandler struct { underlying metrics.Handler workerType string pollerType string - metrics map[heartbeatMetric]*atomic.Uint64 + + // All instances share the same underlying map (set on creation, never replaced). 
+ // Keys are metric names, or "metricName:workerType" / "metricName:pollerType" for typed metrics. + metrics map[string]*atomic.Int64 } -// NewHeartbeatMetricsHandler creates a new handler that captures specific metrics +// newHeartbeatMetricsHandler creates a new handler that captures specific metrics // for worker heartbeats while passing all metrics to the underlying handler. -func NewHeartbeatMetricsHandler(underlying metrics.Handler) *HeartbeatMetricsHandler { - m := make(map[heartbeatMetric]*atomic.Uint64, metricCount) - for i := range heartbeatMetric(metricCount) { - m[i] = new(atomic.Uint64) - } - return &HeartbeatMetricsHandler{ +func newHeartbeatMetricsHandler(underlying metrics.Handler) *heartbeatMetricsHandler { + return &heartbeatMetricsHandler{ underlying: underlying, - metrics: m, + metrics: make(map[string]*atomic.Int64), } } -func (h *HeartbeatMetricsHandler) WithTags(tags map[string]string) metrics.Handler { +// forWorker creates a new handler that captures metrics specific to a worker type, for worker heartbeating. +// This should be called explicitly before calling WithTags on the returned handler. +func (h *heartbeatMetricsHandler) forWorker(workerType string) metrics.Handler { + cpy := *h + cpy.workerType = workerType + return &cpy +} + +// forPoller creates a new handler that captures metrics specific to a poller type, for worker heartbeating. +// This should be called explicitly before calling WithTags on the returned handler. 
+func (h *heartbeatMetricsHandler) forPoller(pollerType string) metrics.Handler { + cpy := *h + cpy.pollerType = pollerType + return &cpy +} + +func (h *heartbeatMetricsHandler) WithTags(tags map[string]string) metrics.Handler { cpy := *h cpy.underlying = h.underlying.WithTags(tags) - if wt, ok := tags[metrics.WorkerTypeTagName]; ok { - cpy.workerType = wt - } - if pt, ok := tags[metrics.PollerTypeTagName]; ok { - cpy.pollerType = pt - } return &cpy } -func (h *HeartbeatMetricsHandler) Counter(name string) metrics.Counter { +func (h *heartbeatMetricsHandler) Counter(name string) metrics.Counter { underlying := h.underlying.Counter(name) - if metric, ok := counterMetricMap[name]; ok { + if capturedCounters[name] { return &capturingCounter{ underlying: underlying, - value: h.metrics[metric], + value: h.getOrCreate(name), } } return underlying } -func (h *HeartbeatMetricsHandler) Gauge(name string) metrics.Gauge { +func (h *heartbeatMetricsHandler) Gauge(name string) metrics.Gauge { underlying := h.underlying.Gauge(name) switch name { case metrics.StickyCacheSize: return &capturingGauge{ underlying: underlying, - value: h.metrics[metricStickyCacheSize], + value: h.getOrCreate(name), } - case metrics.WorkerTaskSlotsAvailable: - if metric, ok := slotsAvailableByWorkerType[h.workerType]; ok { + case metrics.WorkerTaskSlotsAvailable, metrics.WorkerTaskSlotsUsed: + if h.workerType != "" { return &capturingGauge{ underlying: underlying, - value: h.metrics[metric], - } - } - case metrics.WorkerTaskSlotsUsed: - if metric, ok := slotsUsedByWorkerType[h.workerType]; ok { - return &capturingGauge{ - underlying: underlying, - value: h.metrics[metric], + value: h.getOrCreate(name + ":" + h.workerType), } } case metrics.NumPoller: - if metric, ok := pollerCountByPollerType[h.pollerType]; ok { + if h.pollerType != "" { return &capturingGauge{ underlying: underlying, - value: h.metrics[metric], + value: h.getOrCreate(name + ":" + h.pollerType), } } } @@ -166,17 +112,33 @@ func (h 
*HeartbeatMetricsHandler) Gauge(name string) metrics.Gauge { return underlying } -func (h *HeartbeatMetricsHandler) Timer(name string) metrics.Timer { +func (h *heartbeatMetricsHandler) Timer(name string) metrics.Timer { underlying := h.underlying.Timer(name) - if metric, ok := timerMetricMap[name]; ok { + if capturedTimers[name] { return &capturingTimer{ underlying: underlying, - counter: h.metrics[metric], + counter: h.getOrCreate(name), } } return underlying } +func (h *heartbeatMetricsHandler) getOrCreate(key string) *atomic.Int64 { + if v, ok := h.metrics[key]; ok { + return v + } + v := new(atomic.Int64) + h.metrics[key] = v + return v +} + +func (h *heartbeatMetricsHandler) get(key string) int64 { + if v, ok := h.metrics[key]; ok { + return v.Load() + } + return 0 +} + // PopulateHeartbeatOptions contains external dependencies needed to populate heartbeat metrics. type PopulateHeartbeatOptions struct { WorkflowSlotSupplierKind string @@ -188,7 +150,7 @@ type PopulateHeartbeatOptions struct { ActivityPollerBehavior PollerBehavior NexusPollerBehavior PollerBehavior - // For delta calculations between heartbeats (mutated by PopulateHeartbeat) + // For delta calculations between heartbeats (mutated by PopulateHeartbeat). PrevWorkflowProcessed *int64 PrevWorkflowFailed *int64 PrevActivityProcessed *int64 @@ -199,20 +161,19 @@ type PopulateHeartbeatOptions struct { PrevNexusFailed *int64 } -// PopulateHeartbeat fills in the metrics-related fields of the passed in WorkerHeartbeat proto, as well as updates -// references in the PopulateHeartbeatOptions for future delta calculations. 
-func (h *HeartbeatMetricsHandler) PopulateHeartbeat(hb *workerpb.WorkerHeartbeat, opts *PopulateHeartbeatOptions) { - hb.TotalStickyCacheHit = int32(h.metrics[metricStickyCacheHit].Load()) - hb.TotalStickyCacheMiss = int32(h.metrics[metricStickyCacheMiss].Load()) - hb.CurrentStickyCacheSize = int32(h.metrics[metricStickyCacheSize].Load()) +// PopulateHeartbeat fills in the metrics-related fields of the WorkerHeartbeat proto. +func (h *heartbeatMetricsHandler) PopulateHeartbeat(hb *workerpb.WorkerHeartbeat, opts *PopulateHeartbeatOptions) { + hb.TotalStickyCacheHit = int32(h.get(metrics.StickyCacheHit)) + hb.TotalStickyCacheMiss = int32(h.get(metrics.StickyCacheMiss)) + hb.CurrentStickyCacheSize = int32(h.get(metrics.StickyCacheSize)) if opts.WorkflowSlotSupplierKind != "" { hb.WorkflowTaskSlotsInfo = buildSlotsInfo( opts.WorkflowSlotSupplierKind, - int32(h.metrics[metricWorkflowSlotsAvailable].Load()), - int32(h.metrics[metricWorkflowSlotsUsed].Load()), - int64(h.metrics[metricWorkflowTasksProcessed].Load()), - int64(h.metrics[metricWorkflowTaskFailures].Load()), + int32(h.get(metrics.WorkerTaskSlotsAvailable+":"+"WorkflowWorker")), + int32(h.get(metrics.WorkerTaskSlotsUsed+":"+"WorkflowWorker")), + h.get(metrics.WorkflowTaskExecutionLatency), + h.get(metrics.WorkflowTaskExecutionFailureCounter), opts.PrevWorkflowProcessed, opts.PrevWorkflowFailed, ) @@ -221,10 +182,10 @@ func (h *HeartbeatMetricsHandler) PopulateHeartbeat(hb *workerpb.WorkerHeartbeat if opts.ActivitySlotSupplierKind != "" { hb.ActivityTaskSlotsInfo = buildSlotsInfo( opts.ActivitySlotSupplierKind, - int32(h.metrics[metricActivitySlotsAvailable].Load()), - int32(h.metrics[metricActivitySlotsUsed].Load()), - int64(h.metrics[metricActivityTasksProcessed].Load()), - int64(h.metrics[metricActivityTaskFailures].Load()), + int32(h.get(metrics.WorkerTaskSlotsAvailable+":"+"ActivityWorker")), + int32(h.get(metrics.WorkerTaskSlotsUsed+":"+"ActivityWorker")), + h.get(metrics.ActivityExecutionLatency), + 
h.get(metrics.ActivityExecutionFailedCounter), opts.PrevActivityProcessed, opts.PrevActivityFailed, ) @@ -233,10 +194,10 @@ func (h *HeartbeatMetricsHandler) PopulateHeartbeat(hb *workerpb.WorkerHeartbeat if opts.LocalActivitySlotSupplierKind != "" { hb.LocalActivitySlotsInfo = buildSlotsInfo( opts.LocalActivitySlotSupplierKind, - int32(h.metrics[metricLocalActivitySlotsAvailable].Load()), - int32(h.metrics[metricLocalActivitySlotsUsed].Load()), - int64(h.metrics[metricLocalActivityTasksProcessed].Load()), - int64(h.metrics[metricLocalActivityTaskFailures].Load()), + int32(h.get(metrics.WorkerTaskSlotsAvailable+":"+"LocalActivityWorker")), + int32(h.get(metrics.WorkerTaskSlotsUsed+":"+"LocalActivityWorker")), + h.get(metrics.LocalActivityExecutionLatency), + h.get(metrics.LocalActivityExecutionFailedCounter), opts.PrevLocalActivityProcessed, opts.PrevLocalActivityFailed, ) @@ -245,43 +206,43 @@ func (h *HeartbeatMetricsHandler) PopulateHeartbeat(hb *workerpb.WorkerHeartbeat if opts.NexusSlotSupplierKind != "" { hb.NexusTaskSlotsInfo = buildSlotsInfo( opts.NexusSlotSupplierKind, - int32(h.metrics[metricNexusSlotsAvailable].Load()), - int32(h.metrics[metricNexusSlotsUsed].Load()), - int64(h.metrics[metricNexusTasksProcessed].Load()), - int64(h.metrics[metricNexusTaskFailures].Load()), + int32(h.get(metrics.WorkerTaskSlotsAvailable+":"+"NexusWorker")), + int32(h.get(metrics.WorkerTaskSlotsUsed+":"+"NexusWorker")), + h.get(metrics.NexusTaskExecutionLatency), + h.get(metrics.NexusTaskExecutionFailedCounter), opts.PrevNexusProcessed, opts.PrevNexusFailed, ) } hb.WorkflowPollerInfo = buildPollerInfo( - int32(h.metrics[metricWorkflowPollerCount].Load()), - h.getLastPollTime(metricWorkflowLastPoll), + int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeWorkflowTask)), + h.getLastPollTime(metrics.PollerTypeWorkflowTask), opts.WorkflowPollerBehavior, ) hb.WorkflowStickyPollerInfo = buildPollerInfo( - int32(h.metrics[metricWorkflowStickyPollerCount].Load()), - 
h.getLastPollTime(metricWorkflowStickyLastPoll), + int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeWorkflowStickyTask)), + h.getLastPollTime(metrics.PollerTypeWorkflowStickyTask), opts.WorkflowPollerBehavior, ) hb.ActivityPollerInfo = buildPollerInfo( - int32(h.metrics[metricActivityPollerCount].Load()), - h.getLastPollTime(metricActivityLastPoll), + int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeActivityTask)), + h.getLastPollTime(metrics.PollerTypeActivityTask), opts.ActivityPollerBehavior, ) hb.NexusPollerInfo = buildPollerInfo( - int32(h.metrics[metricNexusPollerCount].Load()), - h.getLastPollTime(metricNexusLastPoll), + int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeNexusTask)), + h.getLastPollTime(metrics.PollerTypeNexusTask), opts.NexusPollerBehavior, ) } -func (h *HeartbeatMetricsHandler) getLastPollTime(metric heartbeatMetric) time.Time { - nanos := h.metrics[metric].Load() - if nanos == 0 { - return time.Time{} +func (h *heartbeatMetricsHandler) getLastPollTime(pollerType string) time.Time { + nanos := h.get(pollerType) + if nanos != 0 { + return time.Unix(0, nanos) } - return time.Unix(0, int64(nanos)) + return time.Time{} } func buildSlotsInfo( @@ -324,74 +285,41 @@ func buildPollerInfo(currentPollers int32, lastSuccessfulPollTime time.Time, pol } } -// RecordWorkflowPollSuccess records a successful workflow task poll. -func (h *HeartbeatMetricsHandler) RecordWorkflowPollSuccess() { - h.metrics[metricWorkflowLastPoll].Store(uint64(time.Now().UnixNano())) -} - -// RecordWorkflowStickyPollSuccess records a successful workflow sticky task poll. -func (h *HeartbeatMetricsHandler) RecordWorkflowStickyPollSuccess() { - h.metrics[metricWorkflowStickyLastPoll].Store(uint64(time.Now().UnixNano())) -} - -// RecordActivityPollSuccess records a successful activity task poll. 
-func (h *HeartbeatMetricsHandler) RecordActivityPollSuccess() { - h.metrics[metricActivityLastPoll].Store(uint64(time.Now().UnixNano())) -} - -// RecordNexusPollSuccess records a successful nexus task poll. -func (h *HeartbeatMetricsHandler) RecordNexusPollSuccess() { - h.metrics[metricNexusLastPoll].Store(uint64(time.Now().UnixNano())) -} - -// RecordPollSuccess records a successful poll time if the handler is a *HeartbeatMetricsHandler. -// pollerType should be one of PollerTypeWorkflowTask, PollerTypeWorkflowStickyTask, -// PollerTypeActivityTask, or PollerTypeNexusTask. -func RecordPollSuccess(h metrics.Handler, pollerType string) { - hm, ok := h.(*HeartbeatMetricsHandler) - if !ok { - return - } - switch pollerType { - case metrics.PollerTypeWorkflowTask: - hm.RecordWorkflowPollSuccess() - case metrics.PollerTypeWorkflowStickyTask: - hm.RecordWorkflowStickyPollSuccess() - case metrics.PollerTypeActivityTask: - hm.RecordActivityPollSuccess() - case metrics.PollerTypeNexusTask: - hm.RecordNexusPollSuccess() +// recordPollSuccessIfHeartbeat records a successful poll time if the handler is a *heartbeatMetricsHandler. +func recordPollSuccessIfHeartbeat(h metrics.Handler, pollerType string) { + if hm, ok := h.(*heartbeatMetricsHandler); ok { + hm.getOrCreate(pollerType).Store(time.Now().UnixNano()) } } -// capturingCounter wraps a counter and captures its value in memory for heartbeat reporting. +// capturingCounter wraps a counter and captures its value in memory. type capturingCounter struct { underlying metrics.Counter - value *atomic.Uint64 + value *atomic.Int64 } func (c *capturingCounter) Inc(delta int64) { c.underlying.Inc(delta) if delta > 0 { - c.value.Add(uint64(delta)) + c.value.Add(delta) } } -// capturingGauge wraps a gauge and captures its value in memory for heartbeat reporting. +// capturingGauge wraps a gauge and captures its value in memory. 
type capturingGauge struct { underlying metrics.Gauge - value *atomic.Uint64 + value *atomic.Int64 } func (g *capturingGauge) Update(f float64) { g.underlying.Update(f) - g.value.Store(uint64(f)) + g.value.Store(int64(f)) } // capturingTimer wraps a timer and increments a counter each time Record is called. type capturingTimer struct { underlying metrics.Timer - counter *atomic.Uint64 + counter *atomic.Int64 } func (t *capturingTimer) Record(d time.Duration) { From 4f43e75745dc2173bc72ca450abdab33ebd099e9 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 28 Jan 2026 15:04:02 -0800 Subject: [PATCH 09/30] remove unused nexus worker, tighten heartbeat callback and make concurrent-safe --- .github/workflows/ci.yml | 46 -------------------- contrib/datadog/go.sum | 3 +- contrib/hostinfo/go.mod | 2 +- go.mod | 1 - go.sum | 2 - internal/internal_worker.go | 56 +++++++++++++----------- internal/internal_worker_heartbeat.go | 62 ++++----------------------- internal/internal_workflow_client.go | 8 ++-- test/go.mod | 5 +-- test/go.sum | 6 --- test/integration_test.go | 3 +- test/worker_heartbeat_test.go | 3 +- test/worker_tuner_test.go | 20 ++++----- 13 files changed, 56 insertions(+), 161 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f0866ec90..0008a2e18 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,49 +135,3 @@ jobs: go-repo-path: ${{github.event.pull_request.head.repo.full_name}} version: ${{github.event.pull_request.head.ref}} version-is-repo-ref: true - - # Verify internal/sysinfo matches gopsutil on all platforms - sysinfo-compare: - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, macos-intel, macos-arm, windows-latest] - include: - - os: macos-intel - runsOn: macos-15-intel - - os: macos-arm - runsOn: macos-14 - runs-on: ${{ matrix.runsOn || matrix.os }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Go - uses: actions/setup-go@v5 - with: - 
go-version: stable - - - name: Compare sysinfo with gopsutil (Unix) - if: runner.os != 'Windows' - run: ./internal/sysinfo/scripts/compare_with_gopsutil.sh - - - name: Compare sysinfo with gopsutil (Windows) - if: runner.os == 'Windows' - shell: bash - run: ./internal/sysinfo/scripts/compare_with_gopsutil.sh - - cgroups-compare: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Compare cgroups with containerd/cgroups - run: | - docker run --rm \ - -v "${{ github.workspace }}":/workspace \ - -w /workspace \ - --memory=512m \ - --cpus=1 \ - golang:1.23 \ - ./worker/hostmetrics/scripts/compare_with_containerd.sh diff --git a/contrib/datadog/go.sum b/contrib/datadog/go.sum index c3f65cf00..442961a56 100644 --- a/contrib/datadog/go.sum +++ b/contrib/datadog/go.sum @@ -30,8 +30,7 @@ github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25Kn github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/dvyukov/go-fuzz v0.0.0-20210103155950-6a8e9d1f2415/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw= -github.com/ebitengine/purego v0.5.0 h1:JrMGKfRIAM4/QVKaesIIT7m/UVjTj5GYhRSQYwfVdpo= -github.com/ebitengine/purego v0.5.0/go.mod h1:ah1In8AOtksoNK6yk5z1HTJeUkC1Ez4Wk2idgGslMwQ= +github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A= github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a/go.mod h1:7Ga40egUymuWXxAe151lTNnCv97MddSOVsjpPPkityA= diff --git a/contrib/hostinfo/go.mod b/contrib/hostinfo/go.mod index 82b31e33d..ce5512cb3 100644 --- a/contrib/hostinfo/go.mod +++ b/contrib/hostinfo/go.mod @@ -7,6 +7,7 @@ toolchain go1.23.6 require 
( github.com/containerd/cgroups/v3 v3.0.3 github.com/shirou/gopsutil/v4 v4.24.8 + github.com/stretchr/testify v1.10.0 go.temporal.io/sdk v1.29.1 ) @@ -31,7 +32,6 @@ require ( github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/sirupsen/logrus v1.9.0 // indirect github.com/stretchr/objx v0.5.2 // indirect - github.com/stretchr/testify v1.10.0 // indirect github.com/tklauser/go-sysconf v0.3.12 // indirect github.com/tklauser/numcpus v0.6.1 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect diff --git a/go.mod b/go.mod index 2de405460..46648f756 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,6 @@ go 1.23.0 toolchain go1.23.6 require ( - github.com/ebitengine/purego v0.9.1 github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a github.com/gogo/protobuf v1.3.2 github.com/golang/mock v1.6.0 diff --git a/go.sum b/go.sum index 33e474181..2f5906d58 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,5 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A= -github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a/go.mod h1:7Ga40egUymuWXxAe151lTNnCv97MddSOVsjpPPkityA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 0e5c58732..c68c311a4 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -2225,8 +2225,8 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke if client.workerHeartbeatInterval != 0 { startTime := timestamppb.New(time.Now()) hostname, _ := os.Hostname() + pid := strconv.Itoa(os.Getpid()) 
previousHeartbeatTime := time.Now() - pluginInfos := collectPluginInfos(client.clientPluginNames, plugins) var prevWorkflowProcessed, prevWorkflowFailed int64 @@ -2248,28 +2248,45 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke PrevNexusFailed: &prevNexusFailed, } + var deploymentVersion *deploymentpb.WorkerDeploymentVersion + if options.DeploymentOptions.UseVersioning { + deploymentVersion = &deploymentpb.WorkerDeploymentVersion{ + DeploymentName: options.DeploymentOptions.Version.DeploymentName, + BuildId: options.DeploymentOptions.Version.BuildID, + } + } + + // The callback can be invoked concurrently from the heartbeat worker goroutine and the shutdown path + var mu sync.Mutex heartbeatCallback = func() *workerpb.WorkerHeartbeat { + cpuUsage := getCpuUsage(systemInfoSupplier, workerParams.Logger) + memUsage := getMemUsage(systemInfoSupplier, workerParams.Logger) + if aw.workflowWorker != nil { + populateOpts.WorkflowSlotSupplierKind = aw.workflowWorker.worker.slotSupplier.GetSlotSupplierKind() + populateOpts.LocalActivitySlotSupplierKind = aw.workflowWorker.localActivityWorker.slotSupplier.GetSlotSupplierKind() + } + if aw.activityWorker != nil { + populateOpts.ActivitySlotSupplierKind = aw.activityWorker.worker.slotSupplier.GetSlotSupplierKind() + } + if aw.nexusWorker != nil { + populateOpts.NexusSlotSupplierKind = aw.nexusWorker.worker.slotSupplier.GetSlotSupplierKind() + } heartbeatTime := time.Now() + + mu.Lock() + defer mu.Unlock() elapsedSinceLastHeartbeat := heartbeatTime.Sub(previousHeartbeatTime) previousHeartbeatTime = heartbeatTime - var deploymentVersion *deploymentpb.WorkerDeploymentVersion - if options.DeploymentOptions.UseVersioning { - deploymentVersion = &deploymentpb.WorkerDeploymentVersion{ - DeploymentName: options.DeploymentOptions.Version.DeploymentName, - BuildId: options.DeploymentOptions.Version.BuildID, - } - } - hb := &workerpb.WorkerHeartbeat{ WorkerInstanceKey: aw.workerInstanceKey, 
WorkerIdentity: aw.client.identity, HostInfo: &workerpb.WorkerHostInfo{ HostName: hostname, WorkerGroupingKey: aw.client.workerGroupingKey, - ProcessId: strconv.Itoa(os.Getpid()), - CurrentHostCpuUsage: getCpuUsage(systemInfoSupplier, workerParams.Logger), - CurrentHostMemUsage: getMemUsage(systemInfoSupplier, workerParams.Logger), + ProcessId: pid, + CurrentHostCpuUsage: cpuUsage, + CurrentHostMemUsage: memUsage, }, TaskQueue: aw.executionParams.TaskQueue, DeploymentVersion: deploymentVersion, @@ -2281,20 +2298,7 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke ElapsedSinceLastHeartbeat: durationpb.New(elapsedSinceLastHeartbeat), Plugins: pluginInfos, } - - if aw.heartbeatMetrics != nil { - if aw.workflowWorker != nil { - populateOpts.WorkflowSlotSupplierKind = aw.workflowWorker.worker.slotSupplier.GetSlotSupplierKind() - populateOpts.LocalActivitySlotSupplierKind = aw.workflowWorker.localActivityWorker.slotSupplier.GetSlotSupplierKind() - } - if aw.activityWorker != nil { - populateOpts.ActivitySlotSupplierKind = aw.activityWorker.worker.slotSupplier.GetSlotSupplierKind() - } - if aw.nexusWorker != nil { - populateOpts.NexusSlotSupplierKind = aw.nexusWorker.worker.slotSupplier.GetSlotSupplierKind() - } - aw.heartbeatMetrics.PopulateHeartbeat(hb, populateOpts) - } + aw.heartbeatMetrics.PopulateHeartbeat(hb, populateOpts) return hb } diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 8346f45aa..07860de26 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -7,7 +7,6 @@ import ( "sync/atomic" "time" - "github.com/nexus-rpc/sdk-go/nexus" workerpb "go.temporal.io/api/worker/v1" "go.temporal.io/api/workflowservice/v1" "go.temporal.io/sdk/log" @@ -57,7 +56,6 @@ func (m *heartbeatManager) registerWorker( hw = &sharedNamespaceWorker{ client: m.client, namespace: namespace, - taskQueue: fmt.Sprintf("temporal-sys/worker-commands/%s/%s", namespace, 
m.client.workerGroupingKey), interval: m.interval, callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), stopC: make(chan struct{}), @@ -65,12 +63,6 @@ func (m *heartbeatManager) registerWorker( logger: m.logger, } - nexusWorker, err := hw.createNexusWorker() - if err != nil { - return fmt.Errorf("failed to create nexus worker for heartbeating: %w", err) - } - hw.nexusWorker = nexusWorker - m.mu.Lock() m.workers[namespace] = hw m.mu.Unlock() @@ -107,17 +99,13 @@ func (m *heartbeatManager) unregisterWorker(worker *AggregatedWorker) { } } -// sharedNamespaceWorker is the background nexus worker that handles heartbeating for -// all workers in a specific namespace for a specific client. +// sharedNamespaceWorker handles heartbeating for all workers in a specific namespace for a specific client. type sharedNamespaceWorker struct { client *WorkflowClient namespace string - taskQueue string // temporal-sys/worker-commands/{namespace}/{workerGroupingKey} interval time.Duration logger log.Logger - nexusWorker *nexusWorker - mu sync.RWMutex callbacks map[string]func() *workerpb.WorkerHeartbeat // workerInstanceKey -> callback @@ -126,62 +114,28 @@ type sharedNamespaceWorker struct { started atomic.Bool } -func (hw *sharedNamespaceWorker) createNexusWorker() (*nexusWorker, error) { - tuner, err := NewFixedSizeTuner(FixedSizeTunerOptions{ - NumNexusSlots: 5}) - if err != nil { - return nil, err - } - - params := workerExecutionParameters{ - Namespace: hw.namespace, - TaskQueue: hw.taskQueue, - Tuner: tuner, - NexusTaskPollerBehavior: NewPollerBehaviorSimpleMaximum(PollerBehaviorSimpleMaximumOptions{MaximumNumberOfPollers: 1}), - } - - reg := nexus.NewServiceRegistry() - handler, err := reg.NewHandler() - if err != nil { - return nil, err - } - - // TODO: Register worker commands here - - nw, err := newNexusWorker(nexusWorkerOptions{ - executionParameters: params, - client: hw.client, - workflowService: hw.client.workflowService, - handler: handler, - }) - - return nw, 
err -} - func (hw *sharedNamespaceWorker) run() { defer close(hw.stoppedC) hw.started.Store(true) - if err := hw.nexusWorker.Start(); err != nil { - return - } - defer hw.nexusWorker.Stop() - ticker := time.NewTicker(hw.interval) defer ticker.Stop() for { select { case <-ticker.C: - hw.sendHeartbeats() + if err := hw.sendHeartbeats(); err != nil { + hw.logger.Warn("Stopping heartbeat worker", "error", err) + return + } case <-hw.stopC: return } } } -func (hw *sharedNamespaceWorker) sendHeartbeats() { +func (hw *sharedNamespaceWorker) sendHeartbeats() error { hw.mu.RLock() callbacks := make([]func() *workerpb.WorkerHeartbeat, 0, len(hw.callbacks)) for _, cb := range hw.callbacks { @@ -190,7 +144,7 @@ func (hw *sharedNamespaceWorker) sendHeartbeats() { hw.mu.RUnlock() if len(callbacks) == 0 { - return + return nil } heartbeats := make([]*workerpb.WorkerHeartbeat, 0, len(callbacks)) @@ -208,10 +162,10 @@ func (hw *sharedNamespaceWorker) sendHeartbeats() { if status.Code(err) == codes.Unimplemented { // Server doesn't support heartbeats, shutdown worker hw.stop() - return } hw.logger.Warn("Failed to send heartbeat", "Error", err) } + return err } func (hw *sharedNamespaceWorker) stop() { diff --git a/internal/internal_workflow_client.go b/internal/internal_workflow_client.go index fd6f37691..8ee86b514 100644 --- a/internal/internal_workflow_client.go +++ b/internal/internal_workflow_client.go @@ -1385,12 +1385,12 @@ func (wc *WorkflowClient) loadNamespaceCapabilities(ctx context.Context) (*names grpcCtx, cancel := newGRPCContext(ctx, grpcTimeout(wc.getSystemInfoTimeout)) defer cancel() resp, err := wc.workflowService.DescribeNamespace(grpcCtx, &workflowservice.DescribeNamespaceRequest{Namespace: wc.namespace}) - if _, isUnimplemented := err.(*serviceerror.Unimplemented); err != nil && !isUnimplemented { + var unimplemented *serviceerror.Unimplemented + if err != nil && !errors.As(err, &unimplemented) { return nil, fmt.Errorf("failed reaching server: %w", err) } - if 
resp != nil && resp.NamespaceInfo.Capabilities != nil { - capabilities = resp.NamespaceInfo.Capabilities - } else { + capabilities = resp.GetNamespaceInfo().GetCapabilities() + if capabilities == nil { capabilities = &namespacepb.NamespaceInfo_Capabilities{} } diff --git a/test/go.mod b/test/go.mod index c3efe5c97..1131bcc76 100644 --- a/test/go.mod +++ b/test/go.mod @@ -19,7 +19,6 @@ require ( go.temporal.io/sdk v1.29.1 go.temporal.io/sdk/contrib/opentelemetry v0.0.0-00010101000000-000000000000 go.temporal.io/sdk/contrib/opentracing v0.0.0-00010101000000-000000000000 - go.temporal.io/sdk/contrib/resourcetuner v0.0.0-20260112203102-5b6df8e02dcf go.temporal.io/sdk/contrib/tally v0.0.0-00010101000000-000000000000 go.uber.org/goleak v1.1.12 google.golang.org/grpc v1.67.1 @@ -28,7 +27,6 @@ require ( require ( github.com/davecgh/go-spew v1.1.1 // indirect - github.com/ebitengine/purego v0.9.1 // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect @@ -39,7 +37,6 @@ require ( github.com/robfig/cron v1.2.0 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/twmb/murmur3 v1.1.5 // indirect - go.einride.tech/pid v0.1.3 // indirect go.opentelemetry.io/otel/metric v1.28.0 // indirect go.uber.org/atomic v1.9.0 // indirect golang.org/x/net v0.39.0 // indirect @@ -54,8 +51,8 @@ require ( replace ( go.temporal.io/sdk => ../ + go.temporal.io/sdk/contrib/hostinfo => ../contrib/hostinfo go.temporal.io/sdk/contrib/opentelemetry => ../contrib/opentelemetry go.temporal.io/sdk/contrib/opentracing => ../contrib/opentracing - go.temporal.io/sdk/contrib/resourcetuner => ../contrib/resourcetuner go.temporal.io/sdk/contrib/tally => ../contrib/tally ) diff --git a/test/go.sum b/test/go.sum index 25f0898b2..6e480067a 100644 --- a/test/go.sum +++ b/test/go.sum @@ -12,8 +12,6 @@ github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XL 
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A= -github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a/go.mod h1:7Ga40egUymuWXxAe151lTNnCv97MddSOVsjpPPkityA= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= @@ -129,8 +127,6 @@ github.com/uber-go/tally/v4 v4.1.1/go.mod h1:aXeSTDMl4tNosyf6rdU8jlgScHyjEGGtfJ/ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -go.einride.tech/pid v0.1.3 h1:yWAKSmD2Z10jxd4gYFhOjbBNqXeIQwAtnCO/XKCT7sQ= -go.einride.tech/pid v0.1.3/go.mod h1:33JSUbKrH/4v8DZf/0K8IC8Enjd92wB2birp+bCYQso= go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo= go.opentelemetry.io/otel v1.28.0/go.mod h1:q68ijF8Fc8CnMHKyzqL6akLO46ePnjkgfIMIjUIX9z4= go.opentelemetry.io/otel/metric v1.28.0 h1:f0HGvSl1KRAU1DLgLGFjrwVyismPlnuU6JD6bOeuA5Q= @@ -250,5 +246,3 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= -gotest.tools/v3 v3.5.1/go.mod 
h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= diff --git a/test/integration_test.go b/test/integration_test.go index 02fdca747..0881f7396 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -37,7 +37,6 @@ import ( "go.temporal.io/sdk/contrib/opentelemetry" sdkopentracing "go.temporal.io/sdk/contrib/opentracing" - "go.temporal.io/sdk/contrib/resourcetuner" "go.temporal.io/sdk/converter" "go.temporal.io/sdk/test" @@ -242,7 +241,7 @@ func (ts *IntegrationTestSuite) SetupTest() { options.MaxConcurrentLocalActivityExecutionSize = 2 } if strings.Contains(ts.T().Name(), "ResourceBasedSlotSupplier") { - tuner, err := resourcetuner.NewResourceBasedTuner(resourcetuner.ResourceBasedTunerOptions{ + tuner, err := worker.NewResourceBasedTuner(worker.ResourceBasedTunerOptions{ TargetMem: 0.9, TargetCpu: 0.9, }) diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index 6cb114180..cdb1f5abd 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -18,7 +18,6 @@ import ( "go.temporal.io/api/workflowservice/v1" "go.temporal.io/sdk/activity" "go.temporal.io/sdk/client" - "go.temporal.io/sdk/contrib/resourcetuner" "go.temporal.io/sdk/internal" ilog "go.temporal.io/sdk/internal/log" "go.temporal.io/sdk/temporal" @@ -850,7 +849,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWorkflowTaskProcessed() { func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatResourceBasedTuner() { ctx := context.Background() - tuner, err := resourcetuner.NewResourceBasedTuner(resourcetuner.ResourceBasedTunerOptions{ + tuner, err := worker.NewResourceBasedTuner(worker.ResourceBasedTunerOptions{ TargetMem: 0.8, TargetCpu: 0.9, }) diff --git a/test/worker_tuner_test.go b/test/worker_tuner_test.go index fd1baf387..966398b7c 100644 --- a/test/worker_tuner_test.go +++ b/test/worker_tuner_test.go @@ -4,11 +4,9 @@ import ( "context" "testing" - "go.temporal.io/sdk/worker" - "github.com/stretchr/testify/require" 
"github.com/stretchr/testify/suite" - "go.temporal.io/sdk/contrib/resourcetuner" + "go.temporal.io/sdk/worker" ) type WorkerTunerTestSuite struct { @@ -58,12 +56,12 @@ func (ts *WorkerTunerTestSuite) TestCompositeWorkerTuner() { wfSS, err := worker.NewFixedSizeSlotSupplier(10) ts.NoError(err) - controllerOpts := resourcetuner.DefaultResourceControllerOptions() + controllerOpts := worker.DefaultResourceControllerOptions() controllerOpts.MemTargetPercent = 0.8 controllerOpts.CpuTargetPercent = 0.9 - controller := resourcetuner.NewResourceController(controllerOpts) - actSS, err := resourcetuner.NewResourceBasedSlotSupplier(controller, - resourcetuner.ResourceBasedSlotSupplierOptions{ + controller := worker.NewResourceController(controllerOpts) + actSS, err := worker.NewResourceBasedSlotSupplier(controller, + worker.ResourceBasedSlotSupplierOptions{ MinSlots: 10, MaxSlots: 20, RampThrottle: 0, @@ -112,12 +110,12 @@ func (ts *WorkerTunerTestSuite) TestResourceBasedSmallSlots() { wfSS, err := worker.NewFixedSizeSlotSupplier(10) ts.NoError(err) - controllerOpts := resourcetuner.DefaultResourceControllerOptions() + controllerOpts := worker.DefaultResourceControllerOptions() controllerOpts.MemTargetPercent = 0.8 controllerOpts.CpuTargetPercent = 0.9 - controller := resourcetuner.NewResourceController(controllerOpts) - actSS, err := resourcetuner.NewResourceBasedSlotSupplier(controller, - resourcetuner.ResourceBasedSlotSupplierOptions{ + controller := worker.NewResourceController(controllerOpts) + actSS, err := worker.NewResourceBasedSlotSupplier(controller, + worker.ResourceBasedSlotSupplierOptions{ MinSlots: 1, MaxSlots: 4, RampThrottle: 0, From e7fbc0323536155f530cbdc7c626cf3221cdc581 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 28 Jan 2026 22:50:00 -0800 Subject: [PATCH 10/30] Fix tests --- internal/internal_nexus_worker.go | 13 +++++++++++++ internal/internal_worker.go | 19 ++++++++++++++----- internal/internal_worker_heartbeat.go | 2 +- 
internal/internal_worker_heartbeat_metrics.go | 18 +++++++++--------- internal/internal_workflow_client.go | 5 +++-- 5 files changed, 40 insertions(+), 17 deletions(-) diff --git a/internal/internal_nexus_worker.go b/internal/internal_nexus_worker.go index d5f844ed5..ce28d27d2 100644 --- a/internal/internal_nexus_worker.go +++ b/internal/internal_nexus_worker.go @@ -16,6 +16,7 @@ type nexusWorkerOptions struct { type nexusWorker struct { executionParameters workerExecutionParameters workflowService workflowservice.WorkflowServiceClient + client *WorkflowClient worker *baseWorker stopC chan struct{} } @@ -68,9 +69,16 @@ func newNexusWorker(opts nexusWorkerOptions) (*nexusWorker, error) { baseWorker := newBaseWorker(bwo) + // Type assert to get the concrete client for namespace capabilities loading + var workflowClient *WorkflowClient + if wc, ok := opts.client.(*WorkflowClient); ok { + workflowClient = wc + } + return &nexusWorker{ executionParameters: opts.executionParameters, workflowService: opts.workflowService, + client: workflowClient, worker: baseWorker, stopC: workerStopChannel, }, nil @@ -78,6 +86,11 @@ func newNexusWorker(opts nexusWorkerOptions) (*nexusWorker, error) { // Start the worker. 
func (w *nexusWorker) Start() error { + if w.client != nil { + if _, err := w.client.loadNamespaceCapabilities(w.executionParameters.MetricsHandler); err != nil { + return err + } + } w.worker.Start() return nil } diff --git a/internal/internal_worker.go b/internal/internal_worker.go index c68c311a4..da2d538d9 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -81,6 +81,7 @@ type ( workflowWorker struct { executionParameters workerExecutionParameters workflowService workflowservice.WorkflowServiceClient + client *WorkflowClient worker *baseWorker localActivityWorker *baseWorker identity string @@ -94,6 +95,7 @@ type ( activityWorker struct { executionParameters workerExecutionParameters workflowService workflowservice.WorkflowServiceClient + client *WorkflowClient poller taskPoller worker *baseWorker identity string @@ -398,6 +400,7 @@ func newWorkflowTaskWorkerInternal( return &workflowWorker{ executionParameters: params, workflowService: service, + client: client, worker: worker, localActivityWorker: localActivityWorker, identity: params.Identity, @@ -409,6 +412,11 @@ func newWorkflowTaskWorkerInternal( // Start the worker. func (ww *workflowWorker) Start() error { + if ww.client != nil { + if _, err := ww.client.loadNamespaceCapabilities(ww.executionParameters.MetricsHandler); err != nil { + return err + } + } ww.localActivityWorker.Start() ww.worker.Start() return nil // TODO: propagate error @@ -543,6 +551,7 @@ func newActivityWorker( return &activityWorker{ executionParameters: params, workflowService: service, + client: client, worker: base, poller: poller, identity: params.Identity, @@ -552,6 +561,11 @@ func newActivityWorker( // Start the worker. 
func (aw *activityWorker) Start() error { + if aw.client != nil { + if _, err := aw.client.loadNamespaceCapabilities(aw.executionParameters.MetricsHandler); err != nil { + return err + } + } aw.worker.Start() return nil // TODO: propagate errors } @@ -1271,11 +1285,6 @@ func (aw *AggregatedWorker) start() error { } proto.Merge(aw.capabilities, capabilities) - // Load namespace capabilities (also verifies namespace exists and caches the result) - if _, err := aw.client.loadNamespaceCapabilities(context.Background()); err != nil { - return err - } - if !util.IsInterfaceNil(aw.workflowWorker) { if err := aw.workflowWorker.Start(); err != nil { return err diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 41ab1ea1b..f554a3c8c 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -44,7 +44,7 @@ func (m *heartbeatManager) registerWorker( hw, ok := m.workers[namespace] m.mu.Unlock() if !ok { - capabilities, err := m.client.loadNamespaceCapabilities(context.Background()) + capabilities, err := m.client.loadNamespaceCapabilities(worker.heartbeatMetrics) if err != nil { return fmt.Errorf("failed to get namespace capabilities: %w", err) } diff --git a/internal/internal_worker_heartbeat_metrics.go b/internal/internal_worker_heartbeat_metrics.go index acf75ab46..b5bdf5a5d 100644 --- a/internal/internal_worker_heartbeat_metrics.go +++ b/internal/internal_worker_heartbeat_metrics.go @@ -1,6 +1,7 @@ package internal import ( + "sync" "sync/atomic" "time" @@ -37,9 +38,8 @@ type heartbeatMetricsHandler struct { workerType string pollerType string - // All instances share the same underlying map (set on creation, never replaced). // Keys are metric names, or "metricName:workerType" / "metricName:pollerType" for typed metrics. 
- metrics map[string]*atomic.Int64 + metrics *sync.Map } // newHeartbeatMetricsHandler creates a new handler that captures specific metrics @@ -47,7 +47,7 @@ type heartbeatMetricsHandler struct { func newHeartbeatMetricsHandler(underlying metrics.Handler) *heartbeatMetricsHandler { return &heartbeatMetricsHandler{ underlying: underlying, - metrics: make(map[string]*atomic.Int64), + metrics: &sync.Map{}, } } @@ -124,17 +124,17 @@ func (h *heartbeatMetricsHandler) Timer(name string) metrics.Timer { } func (h *heartbeatMetricsHandler) getOrCreate(key string) *atomic.Int64 { - if v, ok := h.metrics[key]; ok { - return v + if v, ok := h.metrics.Load(key); ok { + return v.(*atomic.Int64) } v := new(atomic.Int64) - h.metrics[key] = v - return v + actual, _ := h.metrics.LoadOrStore(key, v) + return actual.(*atomic.Int64) } func (h *heartbeatMetricsHandler) get(key string) int64 { - if v, ok := h.metrics[key]; ok { - return v.Load() + if v, ok := h.metrics.Load(key); ok { + return v.(*atomic.Int64).Load() } return 0 } diff --git a/internal/internal_workflow_client.go b/internal/internal_workflow_client.go index 8ee86b514..6d16b911a 100644 --- a/internal/internal_workflow_client.go +++ b/internal/internal_workflow_client.go @@ -1374,7 +1374,8 @@ func (wc *WorkflowClient) loadCapabilities(ctx context.Context) (*workflowservic } // Get namespace capabilities, lazily fetching from server if not already obtained. 
-func (wc *WorkflowClient) loadNamespaceCapabilities(ctx context.Context) (*namespacepb.NamespaceInfo_Capabilities, error) { +func (wc *WorkflowClient) loadNamespaceCapabilities(metricsHandler metrics.Handler) (*namespacepb.NamespaceInfo_Capabilities, error) { + ctx := contextWithNewHeader(context.Background()) wc.namespaceCapabilitiesLock.RLock() capabilities := wc.namespaceCapabilities wc.namespaceCapabilitiesLock.RUnlock() @@ -1382,7 +1383,7 @@ func (wc *WorkflowClient) loadNamespaceCapabilities(ctx context.Context) (*names return capabilities, nil } - grpcCtx, cancel := newGRPCContext(ctx, grpcTimeout(wc.getSystemInfoTimeout)) + grpcCtx, cancel := newGRPCContext(ctx, grpcMetricsHandler(metricsHandler), defaultGrpcRetryParameters(ctx)) defer cancel() resp, err := wc.workflowService.DescribeNamespace(grpcCtx, &workflowservice.DescribeNamespaceRequest{Namespace: wc.namespace}) var unimplemented *serviceerror.Unimplemented From edf6e1106e0b840e99ec522d4375b8e0fcfc1bc4 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 28 Jan 2026 23:32:57 -0800 Subject: [PATCH 11/30] Fix cursor discovered bugs, fix integ tests --- internal/internal_worker.go | 1 - internal/internal_worker_heartbeat.go | 25 +++++++++++------ internal/resource_tuner.go | 3 ++ test/go.mod | 16 +++++++++++ test/go.sum | 40 +++++++++++++++++++++++++++ test/integration_test.go | 6 ++-- test/worker_tuner_test.go | 3 ++ worker/tuning.go | 8 ++++-- 8 files changed, 88 insertions(+), 14 deletions(-) diff --git a/internal/internal_worker.go b/internal/internal_worker.go index da2d538d9..461a7c5a9 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -2681,7 +2681,6 @@ func collectPluginInfos(clientPluginNames []string, workerPlugins []WorkerPlugin set[name] = struct{}{} result = append(result, &workerpb.PluginInfo{Name: name}) } - set[name] = struct{}{} } for _, plugin := range workerPlugins { if _, found := set[plugin.Name()]; !found { diff --git 
a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index f554a3c8c..390788117 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -38,9 +38,8 @@ func newHeartbeatManager(client *WorkflowClient, interval time.Duration, logger func (m *heartbeatManager) registerWorker( worker *AggregatedWorker, ) error { - m.mu.Lock() - namespace := worker.executionParams.Namespace + m.mu.Lock() hw, ok := m.workers[namespace] m.mu.Unlock() if !ok { @@ -53,7 +52,7 @@ func (m *heartbeatManager) registerWorker( return nil } - hw = &sharedNamespaceWorker{ + newHw := &sharedNamespaceWorker{ client: m.client, namespace: namespace, interval: m.interval, @@ -64,9 +63,16 @@ func (m *heartbeatManager) registerWorker( } m.mu.Lock() - m.workers[namespace] = hw - m.mu.Unlock() - go hw.run() + if existing, ok := m.workers[namespace]; ok { + m.mu.Unlock() + hw = existing + } else { + m.workers[namespace] = newHw + m.mu.Unlock() + hw = newHw + go hw.run() + } + } hw.mu.Lock() @@ -160,12 +166,13 @@ func (hw *sharedNamespaceWorker) sendHeartbeats() error { if err != nil { if status.Code(err) == codes.Unimplemented { - // Server doesn't support heartbeats, shutdown worker - hw.stop() + // Server doesn't support heartbeats; return error to stop the worker. 
+ return fmt.Errorf("server does not support worker heartbeats: %w", err) } + // For other errors, log and continue heartbeating hw.logger.Warn("Failed to send heartbeat", "Error", err) } - return err + return nil } func (hw *sharedNamespaceWorker) stop() { diff --git a/internal/resource_tuner.go b/internal/resource_tuner.go index 43e55a99f..e01d9f058 100644 --- a/internal/resource_tuner.go +++ b/internal/resource_tuner.go @@ -320,6 +320,9 @@ type ResourceController struct { // // Exposed as: [go.temporal.io/sdk/worker.NewResourceController] func NewResourceController(options ResourceControllerOptions) *ResourceController { + if options.InfoSupplier == nil { + panic("InfoSupplier is required - use contrib/hostinfo.NewSystemInfoSupplier() or provide your own") + } return &ResourceController{ options: options, infoSupplier: options.InfoSupplier, diff --git a/test/go.mod b/test/go.mod index 1131bcc76..48937f45a 100644 --- a/test/go.mod +++ b/test/go.mod @@ -17,6 +17,7 @@ require ( go.opentelemetry.io/otel/trace v1.28.0 go.temporal.io/api v1.59.0 go.temporal.io/sdk v1.29.1 + go.temporal.io/sdk/contrib/hostinfo v0.0.0-00010101000000-000000000000 go.temporal.io/sdk/contrib/opentelemetry v0.0.0-00010101000000-000000000000 go.temporal.io/sdk/contrib/opentracing v0.0.0-00010101000000-000000000000 go.temporal.io/sdk/contrib/tally v0.0.0-00010101000000-000000000000 @@ -26,19 +27,34 @@ require ( ) require ( + github.com/cilium/ebpf v0.11.0 // indirect + github.com/containerd/cgroups/v3 v3.0.3 // indirect + github.com/coreos/go-systemd/v22 v22.3.2 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect + github.com/godbus/dbus/v5 v5.0.4 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.2 // indirect 
github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/opencontainers/runtime-spec v1.0.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/robfig/cron v1.2.0 // indirect + github.com/shirou/gopsutil/v4 v4.24.8 // indirect + github.com/shoenig/go-m1cpu v0.1.6 // indirect + github.com/sirupsen/logrus v1.9.0 // indirect github.com/stretchr/objx v0.5.2 // indirect + github.com/tklauser/go-sysconf v0.3.12 // indirect + github.com/tklauser/numcpus v0.6.1 // indirect github.com/twmb/murmur3 v1.1.5 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect go.opentelemetry.io/otel/metric v1.28.0 // indirect go.uber.org/atomic v1.9.0 // indirect + golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 // indirect golang.org/x/net v0.39.0 // indirect golang.org/x/sync v0.13.0 // indirect golang.org/x/sys v0.32.0 // indirect diff --git a/test/go.sum b/test/go.sum index 6e480067a..3c33999ad 100644 --- a/test/go.sum +++ b/test/go.sum @@ -9,11 +9,19 @@ github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+Ce github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cactus/go-statsd-client/statsd v0.0.0-20200423205355-cb0885a1018c/go.mod h1:l/bIBLeOl9eX+wxJAzxS4TveKRtAqlyDpHjhkfO0MEI= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= +github.com/cilium/ebpf v0.11.0/go.mod h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs= +github.com/containerd/cgroups/v3 v3.0.3 h1:S5ByHZ/h9PMe5IOQoN7E+nMc2UcLEM/V48DGDJ9kip0= +github.com/containerd/cgroups/v3 v3.0.3/go.mod h1:8HBe7V3aWGLFPd/k03swSIsGjZhHI2WzJmticMgVuz0= +github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= 
+github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a h1:yDWHCSQ40h88yih2JAcL6Ls/kVkSE8GFACTGVnMPruw= github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a/go.mod h1:7Ga40egUymuWXxAe151lTNnCv97MddSOVsjpPPkityA= +github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= +github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= @@ -25,7 +33,11 @@ github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf 
v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= @@ -46,6 +58,7 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -73,6 +86,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -82,6 +97,8 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/nexus-rpc/sdk-go v0.5.1 h1:UFYYfoHlQc+Pn9gQpmn9QE7xluewAn2AO1OSkAh7YFU= 
github.com/nexus-rpc/sdk-go v0.5.1/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -89,6 +106,8 @@ github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= @@ -107,9 +126,17 @@ github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ= github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/shirou/gopsutil/v4 v4.24.8 h1:pVQjIenQkIhqO81mwTaXjTzOMT7d3TZkf43PlVFHENI= +github.com/shirou/gopsutil/v4 v4.24.8/go.mod 
h1:wE0OrJtj4dG+hYkxqDH3QiBICdKSf04/npcvLLc/oRg= +github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= +github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= +github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= +github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= @@ -120,6 +147,10 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= +github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/twmb/murmur3 v1.1.5 h1:i9OLS9fkuLzBXjt6dptlAEyk58fJsSTXbRg3SgVyqgk= github.com/twmb/murmur3 v1.1.5/go.mod h1:Qq/R7NUyOfr65zD+6Q5IHKsJLwP7exErjN6lyyq3OSQ= 
github.com/uber-go/tally/v4 v4.1.1 h1:jhy6WOZp4nHyCqeV43x3Wz370LXUGBhgW2JmzOIHCWI= @@ -127,6 +158,8 @@ github.com/uber-go/tally/v4 v4.1.1/go.mod h1:aXeSTDMl4tNosyf6rdU8jlgScHyjEGGtfJ/ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo= go.opentelemetry.io/otel v1.28.0/go.mod h1:q68ijF8Fc8CnMHKyzqL6akLO46ePnjkgfIMIjUIX9z4= go.opentelemetry.io/otel/metric v1.28.0 h1:f0HGvSl1KRAU1DLgLGFjrwVyismPlnuU6JD6bOeuA5Q= @@ -148,6 +181,8 @@ golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnf golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 h1:aAcj0Da7eBAtrTp03QXWvm88pSyOt+UgdZw2BFZ+lEw= +golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8/go.mod h1:CQ1k9gNrJ50XIzaKCRR2hssIjF07kZFEiieALBM/ARQ= golang.org/x/lint v0.0.0-20190930215403-16217165b5de h1:5hukYrvBGR8/eNkX5mdUezrA6JiaEZDtJb9Ei+1LlBs= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -181,16 +216,21 @@ golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term 
v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= diff --git a/test/integration_test.go b/test/integration_test.go index 0881f7396..0d7e99276 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -5,6 +5,7 @@ import ( "errors" "flag" "fmt" + "go.temporal.io/sdk/contrib/hostinfo" "math" "math/rand" "os" @@ -242,8 +243,9 @@ func (ts *IntegrationTestSuite) SetupTest() { } if strings.Contains(ts.T().Name(), "ResourceBasedSlotSupplier") { tuner, err := worker.NewResourceBasedTuner(worker.ResourceBasedTunerOptions{ - TargetMem: 0.9, - TargetCpu: 0.9, + TargetMem: 0.9, + TargetCpu: 0.9, + InfoSupplier: hostinfo.NewSystemInfoSupplier(), }) ts.NoError(err) options.Tuner = tuner diff --git a/test/worker_tuner_test.go b/test/worker_tuner_test.go index 966398b7c..0a66f3674 100644 --- a/test/worker_tuner_test.go +++ b/test/worker_tuner_test.go @@ -6,6 +6,7 @@ import ( "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" + "go.temporal.io/sdk/contrib/hostinfo" "go.temporal.io/sdk/worker" ) @@ -59,6 +60,7 @@ func (ts *WorkerTunerTestSuite) TestCompositeWorkerTuner() { controllerOpts := worker.DefaultResourceControllerOptions() controllerOpts.MemTargetPercent = 0.8 controllerOpts.CpuTargetPercent = 0.9 + controllerOpts.InfoSupplier = hostinfo.NewSystemInfoSupplier() controller := worker.NewResourceController(controllerOpts) actSS, err := worker.NewResourceBasedSlotSupplier(controller, worker.ResourceBasedSlotSupplierOptions{ @@ -113,6 +115,7 @@ func (ts *WorkerTunerTestSuite) TestResourceBasedSmallSlots() { controllerOpts := worker.DefaultResourceControllerOptions() controllerOpts.MemTargetPercent = 0.8 controllerOpts.CpuTargetPercent = 0.9 + controllerOpts.InfoSupplier = hostinfo.NewSystemInfoSupplier() controller := worker.NewResourceController(controllerOpts) actSS, err := worker.NewResourceBasedSlotSupplier(controller, worker.ResourceBasedSlotSupplierOptions{ diff --git a/worker/tuning.go 
b/worker/tuning.go index 1a88aa778..d18278ef3 100644 --- a/worker/tuning.go +++ b/worker/tuning.go @@ -97,8 +97,12 @@ type ResourceControllerOptions = internal.ResourceControllerOptions type ResourceController = internal.ResourceController // NewResourceController creates a new ResourceController with the provided options. -// WARNING: It is important that you do not create multiple ResourceController instances. Since -// the controller looks at overall system resources, multiple instances with different configs can +// +// InfoSupplier is required - use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based +// implementation, or provide your own. +// +// WARNING: It is important that you do not create multiple InfoSupplier instances. Since +// InfoSupplier looks at overall system resources, multiple instances with different configs can // only conflict with one another. func NewResourceController(options ResourceControllerOptions) *ResourceController { return internal.NewResourceController(options) From 73f4a10f2aa03024cc0ddb8dc2d23b8cf790018c Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Sun, 1 Feb 2026 23:05:37 -0800 Subject: [PATCH 12/30] Rename hostinfo to sysinfo, add interval enforcement, rename mutexes, clarify code --- contrib/{hostinfo => sysinfo}/cgroups.go | 2 +- .../{hostinfo => sysinfo}/cgroups_notlinux.go | 2 +- contrib/{hostinfo => sysinfo}/go.mod | 0 contrib/{hostinfo => sysinfo}/go.sum | 0 .../hostinfo.go => sysinfo/sysinfo.go} | 22 ++-- .../sysinfo_test.go} | 6 +- internal/client.go | 6 +- internal/internal_worker.go | 32 +++--- internal/internal_worker_base.go | 2 +- internal/internal_worker_heartbeat.go | 108 ++++++++++-------- internal/internal_worker_heartbeat_metrics.go | 80 +++++++------ internal/resource_tuner.go | 6 +- test/go.mod | 4 +- test/integration_test.go | 4 +- test/worker_heartbeat_test.go | 6 +- test/worker_tuner_test.go | 6 +- worker/tuning.go | 6 +- 17 files changed, 160 insertions(+), 132 deletions(-) rename 
contrib/{hostinfo => sysinfo}/cgroups.go (99%) rename contrib/{hostinfo => sysinfo}/cgroups_notlinux.go (95%) rename contrib/{hostinfo => sysinfo}/go.mod (100%) rename contrib/{hostinfo => sysinfo}/go.sum (100%) rename contrib/{hostinfo/hostinfo.go => sysinfo/sysinfo.go} (87%) rename contrib/{hostinfo/hostinfo_test.go => sysinfo/sysinfo_test.go} (91%) diff --git a/contrib/hostinfo/cgroups.go b/contrib/sysinfo/cgroups.go similarity index 99% rename from contrib/hostinfo/cgroups.go rename to contrib/sysinfo/cgroups.go index 50d69c7e0..ea5e7ec19 100644 --- a/contrib/hostinfo/cgroups.go +++ b/contrib/sysinfo/cgroups.go @@ -1,6 +1,6 @@ //go:build linux -package hostinfo +package sysinfo import ( "errors" diff --git a/contrib/hostinfo/cgroups_notlinux.go b/contrib/sysinfo/cgroups_notlinux.go similarity index 95% rename from contrib/hostinfo/cgroups_notlinux.go rename to contrib/sysinfo/cgroups_notlinux.go index 80dfabec1..d89de073b 100644 --- a/contrib/hostinfo/cgroups_notlinux.go +++ b/contrib/sysinfo/cgroups_notlinux.go @@ -1,6 +1,6 @@ //go:build !linux -package hostinfo +package sysinfo import "errors" diff --git a/contrib/hostinfo/go.mod b/contrib/sysinfo/go.mod similarity index 100% rename from contrib/hostinfo/go.mod rename to contrib/sysinfo/go.mod diff --git a/contrib/hostinfo/go.sum b/contrib/sysinfo/go.sum similarity index 100% rename from contrib/hostinfo/go.sum rename to contrib/sysinfo/go.sum diff --git a/contrib/hostinfo/hostinfo.go b/contrib/sysinfo/sysinfo.go similarity index 87% rename from contrib/hostinfo/hostinfo.go rename to contrib/sysinfo/sysinfo.go index a529aa837..e96bbb696 100644 --- a/contrib/hostinfo/hostinfo.go +++ b/contrib/sysinfo/sysinfo.go @@ -1,4 +1,4 @@ -package hostinfo +package sysinfo import ( "context" @@ -11,17 +11,25 @@ import ( "go.temporal.io/sdk/worker" ) -// NewSystemInfoSupplier creates a SystemInfoSupplier using gopsutil. 
+var ( + sysInfoOnce sync.Once + sysInfoInstance *psUtilSystemInfoSupplier +) + +// SysInfoProvider returns a shared SystemInfoSupplier using gopsutil. // Supports cgroup metrics in containerized Linux environments. -func NewSystemInfoSupplier() worker.SystemInfoSupplier { - return &psUtilSystemInfoSupplier{ - cGroupInfo: newCGroupInfo(), - } +func SysInfoProvider() worker.SystemInfoSupplier { + sysInfoOnce.Do(func() { + sysInfoInstance = &psUtilSystemInfoSupplier{ + cGroupInfo: newCGroupInfo(), + } + }) + return sysInfoInstance } // NewResourceBasedTuner creates a resource-based tuner with gopsutil-based system info. func NewResourceBasedTuner(opts worker.ResourceBasedTunerOptions) (worker.WorkerTuner, error) { - opts.InfoSupplier = NewSystemInfoSupplier() + opts.InfoSupplier = SysInfoProvider() return worker.NewResourceBasedTuner(opts) } diff --git a/contrib/hostinfo/hostinfo_test.go b/contrib/sysinfo/sysinfo_test.go similarity index 91% rename from contrib/hostinfo/hostinfo_test.go rename to contrib/sysinfo/sysinfo_test.go index 4162f23de..b1671d1e7 100644 --- a/contrib/hostinfo/hostinfo_test.go +++ b/contrib/sysinfo/sysinfo_test.go @@ -1,4 +1,4 @@ -package hostinfo +package sysinfo import ( "testing" @@ -10,7 +10,7 @@ import ( ) func TestGetMemoryCpuUsage(t *testing.T) { - supplier := NewSystemInfoSupplier() + supplier := SysInfoProvider() ctx := &worker.SystemInfoContext{Logger: log.NewNopLogger()} usage, err := supplier.GetMemoryUsage(ctx) @@ -25,7 +25,7 @@ func TestGetMemoryCpuUsage(t *testing.T) { } func TestMaybeRefreshRateLimiting(t *testing.T) { - supplier := NewSystemInfoSupplier().(*psUtilSystemInfoSupplier) + supplier := SysInfoProvider().(*psUtilSystemInfoSupplier) ctx := &worker.SystemInfoContext{Logger: log.NewNopLogger()} // First call should refresh diff --git a/internal/client.go b/internal/client.go index 8c9c96c1a..48530d5c1 100644 --- a/internal/client.go +++ b/internal/client.go @@ -536,6 +536,7 @@ type ( Plugins []ClientPlugin // 
WorkerHeartbeatInterval is the interval at which the worker will send heartbeats to the server. + // Interval must be between 1s and 60s, inclusive. // // default: 60s. To disable, set to 0. // @@ -1162,8 +1163,11 @@ func NewServiceClient(workflowServiceClient workflowservice.WorkflowServiceClien if options.WorkerHeartbeatInterval == nil { heartbeatInterval = time.Second * 60 } else if *options.WorkerHeartbeatInterval == 0 { - heartbeatInterval = time.Second * 0 + heartbeatInterval = 0 } else { + if heartbeatInterval < time.Second || heartbeatInterval > 60*time.Second { + panic("WorkerHeartbeatInterval must be between 1 second and 60 seconds") + } heartbeatInterval = *options.WorkerHeartbeatInterval } diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 461a7c5a9..3ebc3cf75 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -2243,18 +2243,18 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke var prevLocalActivityProcessed, prevLocalActivityFailed int64 var prevNexusProcessed, prevNexusFailed int64 - populateOpts := &PopulateHeartbeatOptions{ - WorkflowPollerBehavior: options.WorkflowTaskPollerBehavior, - ActivityPollerBehavior: options.ActivityTaskPollerBehavior, - NexusPollerBehavior: options.NexusTaskPollerBehavior, - PrevWorkflowProcessed: &prevWorkflowProcessed, - PrevWorkflowFailed: &prevWorkflowFailed, - PrevActivityProcessed: &prevActivityProcessed, - PrevActivityFailed: &prevActivityFailed, - PrevLocalActivityProcessed: &prevLocalActivityProcessed, - PrevLocalActivityFailed: &prevLocalActivityFailed, - PrevNexusProcessed: &prevNexusProcessed, - PrevNexusFailed: &prevNexusFailed, + populateOpts := &populateHeartbeatOptions{ + workflowPollerBehavior: options.WorkflowTaskPollerBehavior, + activityPollerBehavior: options.ActivityTaskPollerBehavior, + nexusPollerBehavior: options.NexusTaskPollerBehavior, + prevWorkflowProcessed: &prevWorkflowProcessed, + prevWorkflowFailed: 
&prevWorkflowFailed, + prevActivityProcessed: &prevActivityProcessed, + prevActivityFailed: &prevActivityFailed, + prevLocalActivityProcessed: &prevLocalActivityProcessed, + prevLocalActivityFailed: &prevLocalActivityFailed, + prevNexusProcessed: &prevNexusProcessed, + prevNexusFailed: &prevNexusFailed, } var deploymentVersion *deploymentpb.WorkerDeploymentVersion @@ -2271,14 +2271,14 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke cpuUsage := getCpuUsage(systemInfoSupplier, workerParams.Logger) memUsage := getMemUsage(systemInfoSupplier, workerParams.Logger) if aw.workflowWorker != nil { - populateOpts.WorkflowSlotSupplierKind = aw.workflowWorker.worker.slotSupplier.GetSlotSupplierKind() - populateOpts.LocalActivitySlotSupplierKind = aw.workflowWorker.localActivityWorker.slotSupplier.GetSlotSupplierKind() + populateOpts.workflowSlotSupplierKind = aw.workflowWorker.worker.slotSupplier.GetSlotSupplierKind() + populateOpts.localActivitySlotSupplierKind = aw.workflowWorker.localActivityWorker.slotSupplier.GetSlotSupplierKind() } if aw.activityWorker != nil { - populateOpts.ActivitySlotSupplierKind = aw.activityWorker.worker.slotSupplier.GetSlotSupplierKind() + populateOpts.activitySlotSupplierKind = aw.activityWorker.worker.slotSupplier.GetSlotSupplierKind() } if aw.nexusWorker != nil { - populateOpts.NexusSlotSupplierKind = aw.nexusWorker.worker.slotSupplier.GetSlotSupplierKind() + populateOpts.nexusSlotSupplierKind = aw.nexusWorker.worker.slotSupplier.GetSlotSupplierKind() } heartbeatTime := time.Now() diff --git a/internal/internal_worker_base.go b/internal/internal_worker_base.go index 7eeb9e0b5..fdb3e3051 100644 --- a/internal/internal_worker_base.go +++ b/internal/internal_worker_base.go @@ -327,7 +327,7 @@ func newBaseWorker( ctx, cancel := context.WithCancel(context.Background()) logger := log.With(options.logger, tagWorkerType, options.workerType) if heartbeatHandler, isHeartbeat := 
options.metricsHandler.(*heartbeatMetricsHandler); isHeartbeat { - options.metricsHandler = heartbeatHandler.forWorker(options.workerType).WithTags(metrics.WorkerTags(options.workerType)) + options.metricsHandler = heartbeatHandler.forWorker(options.workerType) } metricsHandler := options.metricsHandler.WithTags(metrics.WorkerTags(options.workerType)) tss := newTrackingSlotSupplier(options.slotSupplier, trackingSlotSupplierOptions{ diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 390788117..2efc4d133 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -20,8 +20,8 @@ type heartbeatManager struct { interval time.Duration logger log.Logger - mu sync.Mutex - workers map[string]*sharedNamespaceWorker // namespace -> worker + workersMutex sync.Mutex + workers map[string]*sharedNamespaceWorker // namespace -> worker } // newHeartbeatManager creates a new heartbeatManager. @@ -38,46 +38,17 @@ func newHeartbeatManager(client *WorkflowClient, interval time.Duration, logger func (m *heartbeatManager) registerWorker( worker *AggregatedWorker, ) error { - namespace := worker.executionParams.Namespace - m.mu.Lock() - hw, ok := m.workers[namespace] - m.mu.Unlock() - if !ok { - capabilities, err := m.client.loadNamespaceCapabilities(worker.heartbeatMetrics) - if err != nil { - return fmt.Errorf("failed to get namespace capabilities: %w", err) - } - if !capabilities.GetWorkerHeartbeats() { - m.logger.Debug("Worker heartbeating configured, but server version does not support it.") - return nil - } - - newHw := &sharedNamespaceWorker{ - client: m.client, - namespace: namespace, - interval: m.interval, - callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), - stopC: make(chan struct{}), - stoppedC: make(chan struct{}), - logger: m.logger, - } - - m.mu.Lock() - if existing, ok := m.workers[namespace]; ok { - m.mu.Unlock() - hw = existing - } else { - m.workers[namespace] = newHw - 
m.mu.Unlock() - hw = newHw - go hw.run() - } - + hw, err := m.getOrCreateSharedNamespaceWorker(worker) + if err != nil { + return err + } + if hw == nil { + return nil // heartbeats not supported } - hw.mu.Lock() + hw.callbacksMutex.Lock() hw.callbacks[worker.workerInstanceKey] = worker.heartbeatCallback - hw.mu.Unlock() + hw.callbacksMutex.Unlock() return nil } @@ -85,8 +56,8 @@ func (m *heartbeatManager) registerWorker( // unregisterWorker removes a worker's heartbeat callback. If no callbacks remain for the namespace, // the shared heartbeat worker is stopped. func (m *heartbeatManager) unregisterWorker(worker *AggregatedWorker) { - m.mu.Lock() - defer m.mu.Unlock() + m.workersMutex.Lock() + defer m.workersMutex.Unlock() namespace := worker.executionParams.Namespace hw, ok := m.workers[namespace] @@ -94,10 +65,10 @@ func (m *heartbeatManager) unregisterWorker(worker *AggregatedWorker) { return } - hw.mu.Lock() + hw.callbacksMutex.Lock() delete(hw.callbacks, worker.workerInstanceKey) remaining := len(hw.callbacks) - hw.mu.Unlock() + hw.callbacksMutex.Unlock() if remaining == 0 { hw.stop() @@ -105,6 +76,47 @@ func (m *heartbeatManager) unregisterWorker(worker *AggregatedWorker) { } } +func (m *heartbeatManager) getOrCreateSharedNamespaceWorker(worker *AggregatedWorker) (*sharedNamespaceWorker, error) { + namespace := worker.executionParams.Namespace + m.workersMutex.Lock() + hw, ok := m.workers[namespace] + m.workersMutex.Unlock() + if !ok { + capabilities, err := m.client.loadNamespaceCapabilities(worker.heartbeatMetrics) + if err != nil { + return nil, fmt.Errorf("failed to get namespace capabilities: %w", err) + } + if !capabilities.GetWorkerHeartbeats() { + m.logger.Debug("Worker heartbeating configured, but server version does not support it.") + return nil, nil + } + + m.workersMutex.Lock() + if existing, ok := m.workers[namespace]; ok { + m.workersMutex.Unlock() + hw = existing + } else { + newHw := &sharedNamespaceWorker{ + client: m.client, + namespace: 
namespace, + interval: m.interval, + callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), + stopC: make(chan struct{}), + stoppedC: make(chan struct{}), + logger: m.logger, + } + m.workers[namespace] = newHw + m.workersMutex.Unlock() + hw = newHw + if hw.started.Swap(true) { + panic("heartbeat worker already started") + } + go hw.run() + } + } + return hw, nil +} + // sharedNamespaceWorker handles heartbeating for all workers in a specific namespace for a specific client. type sharedNamespaceWorker struct { client *WorkflowClient @@ -112,8 +124,8 @@ type sharedNamespaceWorker struct { interval time.Duration logger log.Logger - mu sync.RWMutex - callbacks map[string]func() *workerpb.WorkerHeartbeat // workerInstanceKey -> callback + callbacksMutex sync.RWMutex + callbacks map[string]func() *workerpb.WorkerHeartbeat // workerInstanceKey -> callback stopC chan struct{} stoppedC chan struct{} @@ -123,8 +135,6 @@ type sharedNamespaceWorker struct { func (hw *sharedNamespaceWorker) run() { defer close(hw.stoppedC) - hw.started.Store(true) - ticker := time.NewTicker(hw.interval) defer ticker.Stop() @@ -142,12 +152,12 @@ func (hw *sharedNamespaceWorker) run() { } func (hw *sharedNamespaceWorker) sendHeartbeats() error { - hw.mu.RLock() + hw.callbacksMutex.RLock() callbacks := make([]func() *workerpb.WorkerHeartbeat, 0, len(hw.callbacks)) for _, cb := range hw.callbacks { callbacks = append(callbacks, cb) } - hw.mu.RUnlock() + hw.callbacksMutex.RUnlock() if len(callbacks) == 0 { return nil diff --git a/internal/internal_worker_heartbeat_metrics.go b/internal/internal_worker_heartbeat_metrics.go index b5bdf5a5d..843e39e41 100644 --- a/internal/internal_worker_heartbeat_metrics.go +++ b/internal/internal_worker_heartbeat_metrics.go @@ -139,101 +139,101 @@ func (h *heartbeatMetricsHandler) get(key string) int64 { return 0 } -// PopulateHeartbeatOptions contains external dependencies needed to populate heartbeat metrics. 
-type PopulateHeartbeatOptions struct { - WorkflowSlotSupplierKind string - ActivitySlotSupplierKind string - LocalActivitySlotSupplierKind string - NexusSlotSupplierKind string +// populateHeartbeatOptions contains external dependencies needed to populate heartbeat metrics. +type populateHeartbeatOptions struct { + workflowSlotSupplierKind string + activitySlotSupplierKind string + localActivitySlotSupplierKind string + nexusSlotSupplierKind string - WorkflowPollerBehavior PollerBehavior - ActivityPollerBehavior PollerBehavior - NexusPollerBehavior PollerBehavior + workflowPollerBehavior PollerBehavior + activityPollerBehavior PollerBehavior + nexusPollerBehavior PollerBehavior // For delta calculations between heartbeats (mutated by PopulateHeartbeat). - PrevWorkflowProcessed *int64 - PrevWorkflowFailed *int64 - PrevActivityProcessed *int64 - PrevActivityFailed *int64 - PrevLocalActivityProcessed *int64 - PrevLocalActivityFailed *int64 - PrevNexusProcessed *int64 - PrevNexusFailed *int64 + prevWorkflowProcessed *int64 + prevWorkflowFailed *int64 + prevActivityProcessed *int64 + prevActivityFailed *int64 + prevLocalActivityProcessed *int64 + prevLocalActivityFailed *int64 + prevNexusProcessed *int64 + prevNexusFailed *int64 } // PopulateHeartbeat fills in the metrics-related fields of the WorkerHeartbeat proto. 
-func (h *heartbeatMetricsHandler) PopulateHeartbeat(hb *workerpb.WorkerHeartbeat, opts *PopulateHeartbeatOptions) { +func (h *heartbeatMetricsHandler) PopulateHeartbeat(hb *workerpb.WorkerHeartbeat, opts *populateHeartbeatOptions) { hb.TotalStickyCacheHit = int32(h.get(metrics.StickyCacheHit)) hb.TotalStickyCacheMiss = int32(h.get(metrics.StickyCacheMiss)) hb.CurrentStickyCacheSize = int32(h.get(metrics.StickyCacheSize)) - if opts.WorkflowSlotSupplierKind != "" { + if opts.workflowSlotSupplierKind != "" { hb.WorkflowTaskSlotsInfo = buildSlotsInfo( - opts.WorkflowSlotSupplierKind, + opts.workflowSlotSupplierKind, int32(h.get(metrics.WorkerTaskSlotsAvailable+":"+"WorkflowWorker")), int32(h.get(metrics.WorkerTaskSlotsUsed+":"+"WorkflowWorker")), h.get(metrics.WorkflowTaskExecutionLatency), h.get(metrics.WorkflowTaskExecutionFailureCounter), - opts.PrevWorkflowProcessed, - opts.PrevWorkflowFailed, + opts.prevWorkflowProcessed, + opts.prevWorkflowFailed, ) } - if opts.ActivitySlotSupplierKind != "" { + if opts.activitySlotSupplierKind != "" { hb.ActivityTaskSlotsInfo = buildSlotsInfo( - opts.ActivitySlotSupplierKind, + opts.activitySlotSupplierKind, int32(h.get(metrics.WorkerTaskSlotsAvailable+":"+"ActivityWorker")), int32(h.get(metrics.WorkerTaskSlotsUsed+":"+"ActivityWorker")), h.get(metrics.ActivityExecutionLatency), h.get(metrics.ActivityExecutionFailedCounter), - opts.PrevActivityProcessed, - opts.PrevActivityFailed, + opts.prevActivityProcessed, + opts.prevActivityFailed, ) } - if opts.LocalActivitySlotSupplierKind != "" { + if opts.localActivitySlotSupplierKind != "" { hb.LocalActivitySlotsInfo = buildSlotsInfo( - opts.LocalActivitySlotSupplierKind, + opts.localActivitySlotSupplierKind, int32(h.get(metrics.WorkerTaskSlotsAvailable+":"+"LocalActivityWorker")), int32(h.get(metrics.WorkerTaskSlotsUsed+":"+"LocalActivityWorker")), h.get(metrics.LocalActivityExecutionLatency), h.get(metrics.LocalActivityExecutionFailedCounter), - opts.PrevLocalActivityProcessed, - 
opts.PrevLocalActivityFailed, + opts.prevLocalActivityProcessed, + opts.prevLocalActivityFailed, ) } - if opts.NexusSlotSupplierKind != "" { + if opts.nexusSlotSupplierKind != "" { hb.NexusTaskSlotsInfo = buildSlotsInfo( - opts.NexusSlotSupplierKind, + opts.nexusSlotSupplierKind, int32(h.get(metrics.WorkerTaskSlotsAvailable+":"+"NexusWorker")), int32(h.get(metrics.WorkerTaskSlotsUsed+":"+"NexusWorker")), h.get(metrics.NexusTaskExecutionLatency), h.get(metrics.NexusTaskExecutionFailedCounter), - opts.PrevNexusProcessed, - opts.PrevNexusFailed, + opts.prevNexusProcessed, + opts.prevNexusFailed, ) } hb.WorkflowPollerInfo = buildPollerInfo( int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeWorkflowTask)), h.getLastPollTime(metrics.PollerTypeWorkflowTask), - opts.WorkflowPollerBehavior, + opts.workflowPollerBehavior, ) hb.WorkflowStickyPollerInfo = buildPollerInfo( int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeWorkflowStickyTask)), h.getLastPollTime(metrics.PollerTypeWorkflowStickyTask), - opts.WorkflowPollerBehavior, + opts.workflowPollerBehavior, ) hb.ActivityPollerInfo = buildPollerInfo( int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeActivityTask)), h.getLastPollTime(metrics.PollerTypeActivityTask), - opts.ActivityPollerBehavior, + opts.activityPollerBehavior, ) hb.NexusPollerInfo = buildPollerInfo( int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeNexusTask)), h.getLastPollTime(metrics.PollerTypeNexusTask), - opts.NexusPollerBehavior, + opts.nexusPollerBehavior, ) } @@ -245,6 +245,10 @@ func (h *heartbeatMetricsHandler) getLastPollTime(pollerType string) time.Time { return time.Time{} } +func (h *heartbeatMetricsHandler) Unwrap() metrics.Handler { + return h.underlying +} + func buildSlotsInfo( supplierKind string, slotsAvailable int32, diff --git a/internal/resource_tuner.go b/internal/resource_tuner.go index e01d9f058..86f612b6e 100644 --- a/internal/resource_tuner.go +++ b/internal/resource_tuner.go @@ -57,7 +57,7 @@ type ResourceBasedTunerOptions 
struct { // attempt to maintain. Must be set nonzero. TargetCpu float64 // InfoSupplier provides CPU and memory usage information. This is required. - // Use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based implementation. + // Use contrib/sysinfo.SysInfoProvider() for a gopsutil-based implementation. InfoSupplier SystemInfoSupplier // Passed to ResourceBasedSlotSupplierOptions.RampThrottle for activities. // If not set, the default value is 50ms. @@ -70,7 +70,7 @@ type ResourceBasedTunerOptions struct { // NewResourceBasedTuner creates a WorkerTuner that dynamically adjusts the number of slots based // on system resources. Specify the target CPU and memory usage as a value between 0 and 1. // -// InfoSupplier is required - use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based +// InfoSupplier is required - use contrib/sysinfo.SysInfoProvider() for a gopsutil-based // implementation, or provide your own. // // Exposed as: [go.temporal.io/sdk/worker.NewResourceBasedTuner] @@ -321,7 +321,7 @@ type ResourceController struct { // Exposed as: [go.temporal.io/sdk/worker.NewResourceController] func NewResourceController(options ResourceControllerOptions) *ResourceController { if options.InfoSupplier == nil { - panic("InfoSupplier is required - use contrib/hostinfo.NewSystemInfoSupplier() or provide your own") + panic("InfoSupplier is required - use contrib/sysinfo.SysInfoProvider() or provide your own") } return &ResourceController{ options: options, diff --git a/test/go.mod b/test/go.mod index 48937f45a..812dd87ac 100644 --- a/test/go.mod +++ b/test/go.mod @@ -17,7 +17,7 @@ require ( go.opentelemetry.io/otel/trace v1.28.0 go.temporal.io/api v1.59.0 go.temporal.io/sdk v1.29.1 - go.temporal.io/sdk/contrib/hostinfo v0.0.0-00010101000000-000000000000 + go.temporal.io/sdk/contrib/sysinfo v0.0.0-00010101000000-000000000000 go.temporal.io/sdk/contrib/opentelemetry v0.0.0-00010101000000-000000000000 go.temporal.io/sdk/contrib/opentracing 
v0.0.0-00010101000000-000000000000 go.temporal.io/sdk/contrib/tally v0.0.0-00010101000000-000000000000 @@ -67,7 +67,7 @@ require ( replace ( go.temporal.io/sdk => ../ - go.temporal.io/sdk/contrib/hostinfo => ../contrib/hostinfo + go.temporal.io/sdk/contrib/sysinfo => ../contrib/sysinfo go.temporal.io/sdk/contrib/opentelemetry => ../contrib/opentelemetry go.temporal.io/sdk/contrib/opentracing => ../contrib/opentracing go.temporal.io/sdk/contrib/tally => ../contrib/tally diff --git a/test/integration_test.go b/test/integration_test.go index 0d7e99276..c411a076a 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -5,7 +5,7 @@ import ( "errors" "flag" "fmt" - "go.temporal.io/sdk/contrib/hostinfo" + "go.temporal.io/sdk/contrib/sysinfo" "math" "math/rand" "os" @@ -245,7 +245,7 @@ func (ts *IntegrationTestSuite) SetupTest() { tuner, err := worker.NewResourceBasedTuner(worker.ResourceBasedTunerOptions{ TargetMem: 0.9, TargetCpu: 0.9, - InfoSupplier: hostinfo.NewSystemInfoSupplier(), + InfoSupplier: sysinfo.SysInfoProvider(), }) ts.NoError(err) options.Tuner = tuner diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index cdb1f5abd..d683483d6 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -18,6 +18,7 @@ import ( "go.temporal.io/api/workflowservice/v1" "go.temporal.io/sdk/activity" "go.temporal.io/sdk/client" + "go.temporal.io/sdk/contrib/sysinfo" "go.temporal.io/sdk/internal" ilog "go.temporal.io/sdk/internal/log" "go.temporal.io/sdk/temporal" @@ -850,8 +851,9 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatResourceBasedTuner() { ctx := context.Background() tuner, err := worker.NewResourceBasedTuner(worker.ResourceBasedTunerOptions{ - TargetMem: 0.8, - TargetCpu: 0.9, + TargetMem: 0.8, + TargetCpu: 0.9, + InfoSupplier: sysinfo.SysInfoProvider(), }) ts.NoError(err) diff --git a/test/worker_tuner_test.go b/test/worker_tuner_test.go index 0a66f3674..227e4effc 100644 --- 
a/test/worker_tuner_test.go +++ b/test/worker_tuner_test.go @@ -6,7 +6,7 @@ import ( "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" - "go.temporal.io/sdk/contrib/hostinfo" + "go.temporal.io/sdk/contrib/sysinfo" "go.temporal.io/sdk/worker" ) @@ -60,7 +60,7 @@ func (ts *WorkerTunerTestSuite) TestCompositeWorkerTuner() { controllerOpts := worker.DefaultResourceControllerOptions() controllerOpts.MemTargetPercent = 0.8 controllerOpts.CpuTargetPercent = 0.9 - controllerOpts.InfoSupplier = hostinfo.NewSystemInfoSupplier() + controllerOpts.InfoSupplier = sysinfo.SysInfoProvider() controller := worker.NewResourceController(controllerOpts) actSS, err := worker.NewResourceBasedSlotSupplier(controller, worker.ResourceBasedSlotSupplierOptions{ @@ -115,7 +115,7 @@ func (ts *WorkerTunerTestSuite) TestResourceBasedSmallSlots() { controllerOpts := worker.DefaultResourceControllerOptions() controllerOpts.MemTargetPercent = 0.8 controllerOpts.CpuTargetPercent = 0.9 - controllerOpts.InfoSupplier = hostinfo.NewSystemInfoSupplier() + controllerOpts.InfoSupplier = sysinfo.SysInfoProvider() controller := worker.NewResourceController(controllerOpts) actSS, err := worker.NewResourceBasedSlotSupplier(controller, worker.ResourceBasedSlotSupplierOptions{ diff --git a/worker/tuning.go b/worker/tuning.go index d18278ef3..0894b0a9b 100644 --- a/worker/tuning.go +++ b/worker/tuning.go @@ -48,7 +48,7 @@ func NewFixedSizeSlotSupplier(numSlots int) (SlotSupplier, error) { } // SystemInfoSupplier implementations provide information about system resources. -// Use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based implementation, +// Use contrib/sysinfo.SysInfoProvider() for a gopsutil-based implementation, // or provide your own. 
type SystemInfoSupplier = internal.SystemInfoSupplier @@ -65,7 +65,7 @@ type ResourceBasedTunerOptions = internal.ResourceBasedTunerOptions // NewResourceBasedTuner creates a WorkerTuner that dynamically adjusts the number of slots based // on system resources. Specify the target CPU and memory usage as a value between 0 and 1. // -// InfoSupplier is required - use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based +// InfoSupplier is required - use contrib/sysinfo.SysInfoProvider() for a gopsutil-based // implementation, or provide your own. func NewResourceBasedTuner(opts ResourceBasedTunerOptions) (WorkerTuner, error) { return internal.NewResourceBasedTuner(opts) @@ -98,7 +98,7 @@ type ResourceController = internal.ResourceController // NewResourceController creates a new ResourceController with the provided options. // -// InfoSupplier is required - use contrib/hostinfo.NewSystemInfoSupplier() for a gopsutil-based +// InfoSupplier is required - use contrib/sysinfo.SysInfoProvider() for a gopsutil-based // implementation, or provide your own. // // WARNING: It is important that you do not create multiple InfoSupplier instances. 
Since From 972555ab6a1667ee493d41a382a1d115f9410172 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Mon, 2 Feb 2026 10:32:25 -0800 Subject: [PATCH 13/30] fix bugs cursor found, sync.oncevalue, separate poll time tracking out of metrics --- contrib/sysinfo/go.mod | 2 +- contrib/sysinfo/sysinfo.go | 16 +++--- internal/client.go | 2 +- internal/internal_nexus_task_poller.go | 5 +- internal/internal_task_pollers.go | 18 +++++-- internal/internal_worker.go | 17 +++++-- internal/internal_worker_heartbeat.go | 20 ++++++++ internal/internal_worker_heartbeat_metrics.go | 25 +++------- test/worker_heartbeat_test.go | 49 +++++++------------ 9 files changed, 81 insertions(+), 73 deletions(-) diff --git a/contrib/sysinfo/go.mod b/contrib/sysinfo/go.mod index ce5512cb3..5a2c53a22 100644 --- a/contrib/sysinfo/go.mod +++ b/contrib/sysinfo/go.mod @@ -1,4 +1,4 @@ -module go.temporal.io/sdk/contrib/hostinfo +module go.temporal.io/sdk/contrib/sysinfo go 1.23.0 diff --git a/contrib/sysinfo/sysinfo.go b/contrib/sysinfo/sysinfo.go index e96bbb696..b6865e452 100644 --- a/contrib/sysinfo/sysinfo.go +++ b/contrib/sysinfo/sysinfo.go @@ -11,20 +11,16 @@ import ( "go.temporal.io/sdk/worker" ) -var ( - sysInfoOnce sync.Once - sysInfoInstance *psUtilSystemInfoSupplier -) +var sysInfoProvider = sync.OnceValue(func() *psUtilSystemInfoSupplier { + return &psUtilSystemInfoSupplier{ + cGroupInfo: newCGroupInfo(), + } +}) // SysInfoProvider returns a shared SystemInfoSupplier using gopsutil. // Supports cgroup metrics in containerized Linux environments. func SysInfoProvider() worker.SystemInfoSupplier { - sysInfoOnce.Do(func() { - sysInfoInstance = &psUtilSystemInfoSupplier{ - cGroupInfo: newCGroupInfo(), - } - }) - return sysInfoInstance + return sysInfoProvider() } // NewResourceBasedTuner creates a resource-based tuner with gopsutil-based system info. 
diff --git a/internal/client.go b/internal/client.go index 48530d5c1..41b2d22aa 100644 --- a/internal/client.go +++ b/internal/client.go @@ -1165,7 +1165,7 @@ func NewServiceClient(workflowServiceClient workflowservice.WorkflowServiceClien } else if *options.WorkerHeartbeatInterval == 0 { heartbeatInterval = 0 } else { - if heartbeatInterval < time.Second || heartbeatInterval > 60*time.Second { + if *options.WorkerHeartbeatInterval < time.Second || *options.WorkerHeartbeatInterval > 60*time.Second { panic("WorkerHeartbeatInterval must be between 1 second and 60 seconds") } heartbeatInterval = *options.WorkerHeartbeatInterval diff --git a/internal/internal_nexus_task_poller.go b/internal/internal_nexus_task_poller.go index 36c6f051d..a67caf3af 100644 --- a/internal/internal_nexus_task_poller.go +++ b/internal/internal_nexus_task_poller.go @@ -42,6 +42,7 @@ func newNexusTaskPoller( useBuildIDVersioning: params.UseBuildIDForVersioning, workerDeploymentVersion: params.DeploymentOptions.Version, capabilities: params.capabilities, + pollTimeTracker: params.pollTimeTracker, }, taskHandler: taskHandler, service: service, @@ -90,7 +91,9 @@ func (ntp *nexusTaskPoller) poll(ctx context.Context) (taskForWorker, error) { return nil, nil } - recordPollSuccessIfHeartbeat(ntp.metricsHandler, metrics.PollerTypeNexusTask) + if ntp.pollTimeTracker != nil { + ntp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeNexusTask) + } return &nexusTask{task: response}, nil } diff --git a/internal/internal_task_pollers.go b/internal/internal_task_pollers.go index bd02fcded..4495b00e8 100644 --- a/internal/internal_task_pollers.go +++ b/internal/internal_task_pollers.go @@ -79,6 +79,8 @@ type ( workerDeploymentVersion WorkerDeploymentVersion // Server's capabilities capabilities *workflowservice.GetSystemInfoResponse_Capabilities + // tracks timestamp for last poll request, for worker heartbeating + pollTimeTracker *pollTimeTracker } // numPollerMetric tracks the number of active pollers and 
publishes a metric on it. @@ -322,6 +324,7 @@ func newWorkflowTaskProcessor( useBuildIDVersioning: params.UseBuildIDForVersioning, workerDeploymentVersion: params.DeploymentOptions.Version, capabilities: params.capabilities, + pollTimeTracker: params.pollTimeTracker, }, service: service, namespace: params.Namespace, @@ -972,10 +975,12 @@ func (wtp *workflowTaskPoller) poll(ctx context.Context) (taskForWorker, error) return &workflowTask{}, nil } - if request.TaskQueue.GetKind() == enumspb.TASK_QUEUE_KIND_STICKY { - recordPollSuccessIfHeartbeat(wtp.metricsHandler, metrics.PollerTypeWorkflowStickyTask) - } else { - recordPollSuccessIfHeartbeat(wtp.metricsHandler, metrics.PollerTypeWorkflowTask) + if wtp.pollTimeTracker != nil { + if request.TaskQueue.GetKind() == enumspb.TASK_QUEUE_KIND_STICKY { + wtp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeWorkflowStickyTask) + } else { + wtp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeWorkflowTask) + } } wtp.updateBacklog(request.TaskQueue.GetKind(), response.GetBacklogCountHint()) @@ -1125,6 +1130,7 @@ func newActivityTaskPoller(taskHandler ActivityTaskHandler, service workflowserv useBuildIDVersioning: params.UseBuildIDForVersioning, workerDeploymentVersion: params.DeploymentOptions.Version, capabilities: params.capabilities, + pollTimeTracker: params.pollTimeTracker, }, taskHandler: taskHandler, service: service, @@ -1176,7 +1182,9 @@ func (atp *activityTaskPoller) poll(ctx context.Context) (taskForWorker, error) return &activityTask{}, nil } - recordPollSuccessIfHeartbeat(atp.metricsHandler, metrics.PollerTypeActivityTask) + if atp.pollTimeTracker != nil { + atp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeActivityTask) + } workflowType := response.WorkflowType.GetName() activityType := response.ActivityType.GetName() diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 3ebc3cf75..d0ca0353a 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -212,6 
+212,8 @@ type ( eagerActivityExecutor *eagerActivityExecutor capabilities *workflowservice.GetSystemInfoResponse_Capabilities + + pollTimeTracker *pollTimeTracker } // HistoryJSONOptions are options for HistoryFromJSON. @@ -2094,10 +2096,12 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke baseMetricsHandler := client.metricsHandler.WithTags(metrics.TaskQueueTags(taskQueue)) var metricsHandler metrics.Handler var heartbeatMetrics *heartbeatMetricsHandler + var pollTracker *pollTimeTracker if client.workerHeartbeatInterval != 0 { heartbeatMetrics = newHeartbeatMetricsHandler(baseMetricsHandler) metricsHandler = heartbeatMetrics + pollTracker = newPollTimeTracker() } else { metricsHandler = baseMetricsHandler } @@ -2135,7 +2139,8 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke taskQueue: taskQueue, maxConcurrent: options.MaxConcurrentEagerActivityExecutionSize, }), - capabilities: &capabilities, + capabilities: &capabilities, + pollTimeTracker: pollTracker, } if options.MaxConcurrentWorkflowTaskPollers != 0 { @@ -2237,6 +2242,9 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke pid := strconv.Itoa(os.Getpid()) previousHeartbeatTime := time.Now() pluginInfos := collectPluginInfos(client.clientPluginNames, plugins) + if pollTracker == nil { + panic("pollTracker must not be nil when heartbeats are enabled") + } var prevWorkflowProcessed, prevWorkflowFailed int64 var prevActivityProcessed, prevActivityFailed int64 @@ -2255,6 +2263,7 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke prevLocalActivityFailed: &prevLocalActivityFailed, prevNexusProcessed: &prevNexusProcessed, prevNexusFailed: &prevNexusFailed, + pollTimeTracker: pollTracker, } var deploymentVersion *deploymentpb.WorkerDeploymentVersion @@ -2270,6 +2279,9 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke heartbeatCallback = func() 
*workerpb.WorkerHeartbeat { cpuUsage := getCpuUsage(systemInfoSupplier, workerParams.Logger) memUsage := getMemUsage(systemInfoSupplier, workerParams.Logger) + + mu.Lock() + defer mu.Unlock() if aw.workflowWorker != nil { populateOpts.workflowSlotSupplierKind = aw.workflowWorker.worker.slotSupplier.GetSlotSupplierKind() populateOpts.localActivitySlotSupplierKind = aw.workflowWorker.localActivityWorker.slotSupplier.GetSlotSupplierKind() @@ -2281,9 +2293,6 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke populateOpts.nexusSlotSupplierKind = aw.nexusWorker.worker.slotSupplier.GetSlotSupplierKind() } heartbeatTime := time.Now() - - mu.Lock() - defer mu.Unlock() elapsedSinceLastHeartbeat := heartbeatTime.Sub(previousHeartbeatTime) previousHeartbeatTime = heartbeatTime diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 2efc4d133..3c9d9d3b4 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -193,3 +193,23 @@ func (hw *sharedNamespaceWorker) stop() { close(hw.stopC) <-hw.stoppedC } + +// pollTimeTracker tracks the last successful poll time for each poller type. 
+type pollTimeTracker struct { + times sync.Map // pollerType (string) -> time.Time (stored as int64 nanos) +} + +func newPollTimeTracker() *pollTimeTracker { + return &pollTimeTracker{} +} + +func (p *pollTimeTracker) recordPollSuccess(pollerType string) { + p.times.Store(pollerType, time.Now().UnixNano()) +} + +func (p *pollTimeTracker) getLastPollTime(pollerType string) time.Time { + if v, ok := p.times.Load(pollerType); ok { + return time.Unix(0, v.(int64)) + } + return time.Time{} +} diff --git a/internal/internal_worker_heartbeat_metrics.go b/internal/internal_worker_heartbeat_metrics.go index 843e39e41..f31b239b1 100644 --- a/internal/internal_worker_heartbeat_metrics.go +++ b/internal/internal_worker_heartbeat_metrics.go @@ -159,6 +159,8 @@ type populateHeartbeatOptions struct { prevLocalActivityFailed *int64 prevNexusProcessed *int64 prevNexusFailed *int64 + + pollTimeTracker *pollTimeTracker } // PopulateHeartbeat fills in the metrics-related fields of the WorkerHeartbeat proto. 
@@ -217,34 +219,26 @@ func (h *heartbeatMetricsHandler) PopulateHeartbeat(hb *workerpb.WorkerHeartbeat hb.WorkflowPollerInfo = buildPollerInfo( int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeWorkflowTask)), - h.getLastPollTime(metrics.PollerTypeWorkflowTask), + opts.pollTimeTracker.getLastPollTime(metrics.PollerTypeWorkflowTask), opts.workflowPollerBehavior, ) hb.WorkflowStickyPollerInfo = buildPollerInfo( int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeWorkflowStickyTask)), - h.getLastPollTime(metrics.PollerTypeWorkflowStickyTask), + opts.pollTimeTracker.getLastPollTime(metrics.PollerTypeWorkflowStickyTask), opts.workflowPollerBehavior, ) hb.ActivityPollerInfo = buildPollerInfo( int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeActivityTask)), - h.getLastPollTime(metrics.PollerTypeActivityTask), + opts.pollTimeTracker.getLastPollTime(metrics.PollerTypeActivityTask), opts.activityPollerBehavior, ) hb.NexusPollerInfo = buildPollerInfo( int32(h.get(metrics.NumPoller+":"+metrics.PollerTypeNexusTask)), - h.getLastPollTime(metrics.PollerTypeNexusTask), + opts.pollTimeTracker.getLastPollTime(metrics.PollerTypeNexusTask), opts.nexusPollerBehavior, ) } -func (h *heartbeatMetricsHandler) getLastPollTime(pollerType string) time.Time { - nanos := h.get(pollerType) - if nanos != 0 { - return time.Unix(0, nanos) - } - return time.Time{} -} - func (h *heartbeatMetricsHandler) Unwrap() metrics.Handler { return h.underlying } @@ -289,13 +283,6 @@ func buildPollerInfo(currentPollers int32, lastSuccessfulPollTime time.Time, pol } } -// recordPollSuccessIfHeartbeat records a successful poll time if the handler is a *heartbeatMetricsHandler. -func recordPollSuccessIfHeartbeat(h metrics.Handler, pollerType string) { - if hm, ok := h.(*heartbeatMetricsHandler); ok { - hm.getOrCreate(pollerType).Store(time.Now().UnixNano()) - } -} - // capturingCounter wraps a counter and captures its value in memory. 
type capturingCounter struct { underlying metrics.Counter diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index d683483d6..47fd43088 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -49,7 +49,7 @@ func (ts *WorkerHeartbeatTestSuite) TearDownSuite() { func (ts *WorkerHeartbeatTestSuite) SetupTest() { var err error - heartbeatInterval := 100 * time.Millisecond + heartbeatInterval := 1 * time.Second // Create a client with heartbeating enabled ts.client, err = client.Dial(client.Options{ @@ -117,11 +117,9 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { // Wait for heartbeat to capture the in-flight activity var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots >= 1 - }, time.Second, 50*time.Millisecond, "Should find worker with activity slot used") - ts.logWorkerInfo(workerInfo) + }, 5*time.Second, 200*time.Millisecond, "Should find worker with activity slot used") ts.Equal(enums.WORKER_STATUS_RUNNING, workerInfo.Status) @@ -170,8 +168,8 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.NotNil(workerInfo.ElapsedSinceLastHeartbeat) elapsed := workerInfo.ElapsedSinceLastHeartbeat.AsDuration() - ts.True(elapsed <= 500*time.Millisecond, - "ElapsedSinceLastHeartbeat should be <= 500ms (got %v)", elapsed) + ts.True(elapsed <= 5*time.Second, + "ElapsedSinceLastHeartbeat should be <= 5s (got %v)", elapsed) ts.assertRecentTimestamp(workerInfo.WorkflowPollerInfo.LastSuccessfulPollTime, 5*time.Second, "WorkflowPollerInfo.LastSuccessfulPollTime") @@ -188,7 +186,6 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.NoError(run.Get(ctx, nil)) ts.worker.Stop() - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) ts.NotNil(workerInfo, "Should find worker in 
ListWorkers/DescribeWorker") // After shutdown checks @@ -287,13 +284,11 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatDeploymentVersion() { ts.Eventually(func() bool { workerInfo = ts.getWorkerInfo(ctx, taskQueue) return workerInfo != nil && workerInfo.DeploymentVersion != nil - }, time.Second, 50*time.Millisecond, "Should find worker with deployment version") + }, 5*time.Second, 200*time.Millisecond, "Should find worker with deployment version") ts.NotNil(workerInfo.DeploymentVersion) ts.Equal("test_build_id", workerInfo.DeploymentVersion.BuildId) ts.Equal("test-deployment", workerInfo.DeploymentVersion.DeploymentName) - - ts.logWorkerInfo(workerInfo) } // TestWorkerHeartbeatDisabled verifies that when heartbeating is disabled, @@ -322,7 +317,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatDisabled() { defer workerNoHeartbeat.Stop() // Wait a bit - time.Sleep(500 * time.Millisecond) + time.Sleep(2 * time.Second) // Get the internal client internalClient := clientNoHeartbeat.(internal.Client) @@ -364,7 +359,6 @@ func (ts *WorkerHeartbeatTestSuite) getWorkerInfo(ctx context.Context, taskQueue } if len(listResp.WorkersInfo) == 0 { - ts.T().Logf("No workers found for task queue %s", taskQueue) return nil } @@ -581,10 +575,9 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWithActivityInFlight() { var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots >= 1 - }, time.Second, 50*time.Millisecond, "Should have at least 1 activity slot used") + }, 5*time.Second, 200*time.Millisecond, "Should have at least 1 activity slot used") ts.T().Logf("Activity slots used: %d, available: %d", workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots, @@ -599,10 +592,9 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWithActivityInFlight() { ts.Equal("done", 
result) ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots == 0 - }, time.Second, 50*time.Millisecond, "Activity slot should be released after completion") + }, 5*time.Second, 200*time.Millisecond, "Activity slot should be released after completion") ts.T().Logf("After completion - Activity slots used: %d, available: %d", workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots, @@ -708,9 +700,8 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatStickyCacheMiss() { // Wait for heartbeat to capture sticky cache miss var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.TotalStickyCacheMiss >= 1 - }, time.Second, 50*time.Millisecond, "Should have at least 1 sticky cache miss") + }, 5*time.Second, 200*time.Millisecond, "Should have at least 1 sticky cache miss") } // TestWorkerHeartbeatMultipleWorkers verifies that multiple workers can heartbeat @@ -755,7 +746,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatMultipleWorkers() { workerInfo1 = ts.getWorkerInfo(ctx, taskQueue1) workerInfo2 = ts.getWorkerInfo(ctx, taskQueue2) return workerInfo1 != nil && workerInfo2 != nil - }, time.Second, 50*time.Millisecond, "Should find both workers") + }, 5*time.Second, 200*time.Millisecond, "Should find both workers") ts.NotEqual(workerInfo1.WorkerInstanceKey, workerInfo2.WorkerInstanceKey, "Different workers should have different instance keys") @@ -795,19 +786,17 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatFailureMetrics() { // Wait for heartbeat to capture failure metrics var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && 
workerInfo.ActivityTaskSlotsInfo.TotalFailedTasks >= 1 - }, time.Second, 50*time.Millisecond, "Should have tracked at least 1 activity task failure") + }, 5*time.Second, 200*time.Millisecond, "Should have tracked at least 1 activity task failure") ts.GreaterOrEqual(workerInfo.ActivityTaskSlotsInfo.LastIntervalFailureTasks, int32(1)) // Last interval should go back to 0 on next heartbeat ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && workerInfo.ActivityTaskSlotsInfo.LastIntervalFailureTasks == 0 - }, time.Second, 50*time.Millisecond, "Last interval failure count should reset to 0") + }, 5*time.Second, 200*time.Millisecond, "Last interval failure count should reset to 0") } func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWorkflowTaskProcessed() { @@ -832,19 +821,17 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWorkflowTaskProcessed() { // Wait for heartbeat to capture processed tasks var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && workerInfo.WorkflowTaskSlotsInfo.TotalProcessedTasks == int32(numWorkflows) - }, time.Second, 50*time.Millisecond, "Should have processed all workflow tasks") + }, 5*time.Second, 200*time.Millisecond, "Should have processed all workflow tasks") ts.GreaterOrEqual(workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks, int32(1)) // Last interval should go back to 0 on next heartbeat ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks == 0 - }, time.Second, 50*time.Millisecond, "Last interval processed count should reset to 0") + }, 5*time.Second, 200*time.Millisecond, "Last interval processed count should reset 
to 0") } func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatResourceBasedTuner() { @@ -898,10 +885,9 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatResourceBasedTuner() { // Wait for heartbeat with resource-based tuner info var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && workerInfo.WorkflowTaskSlotsInfo.SlotSupplierKind == "ResourceBased" - }, time.Second, 50*time.Millisecond, "Should find worker with ResourceBased slot supplier") + }, 5*time.Second, 200*time.Millisecond, "Should find worker with ResourceBased slot supplier") ts.NotNil(workerInfo.ActivityTaskSlotsInfo) ts.Equal("ResourceBased", workerInfo.ActivityTaskSlotsInfo.SlotSupplierKind) @@ -938,7 +924,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatPlugins() { ts.NoError(err) // Create a new client with the plugin - heartbeatInterval := 100 * time.Millisecond + heartbeatInterval := 1 * time.Second pluginClient, err := client.Dial(client.Options{ HostPort: ts.config.ServiceAddr, Namespace: ts.config.Namespace, @@ -969,9 +955,8 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatPlugins() { // Wait for heartbeat with plugin info var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { - workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && len(workerInfo.Plugins) == 2 - }, time.Second, 50*time.Millisecond, "Should have 2 unique plugins (duplicates deduped)") + }, 5*time.Second, 200*time.Millisecond, "Should have 2 unique plugins (duplicates deduped)") pluginNames := make(map[string]bool) for _, plugin := range workerInfo.Plugins { From f9527326b27056c5ffdbdba78444c1f79ea82707 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 3 Feb 2026 11:03:04 -0800 Subject: [PATCH 14/30] Add back resource tuner tests that got dropped --- internal/resource_tuner_test.go | 105 
++++++++++++++++++++++++++++++++ test/worker_heartbeat_test.go | 11 ++++ 2 files changed, 116 insertions(+) create mode 100644 internal/resource_tuner_test.go diff --git a/internal/resource_tuner_test.go b/internal/resource_tuner_test.go new file mode 100644 index 000000000..0e0d6c790 --- /dev/null +++ b/internal/resource_tuner_test.go @@ -0,0 +1,105 @@ +package internal + +import ( + "github.com/stretchr/testify/assert" + "go.temporal.io/sdk/internal/common/metrics" + "go.temporal.io/sdk/internal/log" + "testing" +) + +type FakeSystemInfoSupplier struct { + memUse float64 + cpuUse float64 +} + +func (f FakeSystemInfoSupplier) GetMemoryUsage(_ *SystemInfoContext) (float64, error) { + return f.memUse, nil +} + +func (f FakeSystemInfoSupplier) GetCpuUsage(_ *SystemInfoContext) (float64, error) { + return f.cpuUse, nil +} + +func TestPidDecisions(t *testing.T) { + logger := &log.NoopLogger{} + metricsHandler := metrics.NopHandler + fakeSupplier := &FakeSystemInfoSupplier{memUse: 0.5, cpuUse: 0.5} + rcOpts := DefaultResourceControllerOptions() + rcOpts.MemTargetPercent = 0.8 + rcOpts.CpuTargetPercent = 0.9 + rcOpts.InfoSupplier = fakeSupplier + rc := NewResourceController(rcOpts) + + for i := 0; i < 10; i++ { + decision, err := rc.pidDecision(logger, metricsHandler) + assert.NoError(t, err) + assert.True(t, decision) + + assert.InDelta(t, 1.5, rc.memPid.controlSignal, 0.001) + assert.InDelta(t, 2.0, rc.cpuPid.controlSignal, 0.001) + } + + fakeSupplier.memUse = 0.8 + fakeSupplier.cpuUse = 0.9 + for i := 0; i < 10; i++ { + decision, err := rc.pidDecision(logger, metricsHandler) + assert.NoError(t, err) + assert.False(t, decision) + } + + fakeSupplier.memUse = 0.7 + fakeSupplier.cpuUse = 0.9 + for i := 0; i < 10; i++ { + decision, err := rc.pidDecision(logger, metricsHandler) + assert.NoError(t, err) + assert.False(t, decision) + } + + fakeSupplier.memUse = 0.7 + fakeSupplier.cpuUse = 0.7 + for i := 0; i < 10; i++ { + decision, err := rc.pidDecision(logger, 
metricsHandler) + assert.NoError(t, err) + assert.True(t, decision) + } +} + +func TestPidDecisionEmitsUsageMetrics(t *testing.T) { + logger := &log.NoopLogger{} + metricsHandler := metrics.NewCapturingHandler() + fakeSupplier := &FakeSystemInfoSupplier{memUse: 0.25, cpuUse: 0.75} + + rcOpts := DefaultResourceControllerOptions() + rcOpts.InfoSupplier = fakeSupplier + rc := NewResourceController(rcOpts) + + _, err := rc.pidDecision(logger, metricsHandler) + assert.NoError(t, err) + + gauges := metricsHandler.Gauges() + assert.Len(t, gauges, 2) + + gaugesByName := make(map[string]float64) + for _, gauge := range gauges { + gaugesByName[gauge.Name] = gauge.Value() + } + + assert.Equal(t, 25.0, gaugesByName[resourceSlotsMemUsage]) + assert.Equal(t, 75.0, gaugesByName[resourceSlotsCPUUsage]) + + fakeSupplier.memUse = 0.7 + fakeSupplier.cpuUse = 0.9 + _, err = rc.pidDecision(logger, metricsHandler) + assert.NoError(t, err) + + gauges = metricsHandler.Gauges() + assert.Len(t, gauges, 2) + + gaugesByName = make(map[string]float64) + for _, gauge := range gauges { + gaugesByName[gauge.Name] = gauge.Value() + } + + assert.Equal(t, 70.0, gaugesByName[resourceSlotsMemUsage]) + assert.Equal(t, 90.0, gaugesByName[resourceSlotsCPUUsage]) +} diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index 47fd43088..749592d4c 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -117,6 +117,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { // Wait for heartbeat to capture the in-flight activity var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots >= 1 }, 5*time.Second, 200*time.Millisecond, "Should find worker with activity slot used") @@ -186,6 +187,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { 
ts.NoError(run.Get(ctx, nil)) ts.worker.Stop() + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) ts.NotNil(workerInfo, "Should find worker in ListWorkers/DescribeWorker") // After shutdown checks @@ -575,6 +577,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWithActivityInFlight() { var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots >= 1 }, 5*time.Second, 200*time.Millisecond, "Should have at least 1 activity slot used") @@ -592,6 +595,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWithActivityInFlight() { ts.Equal("done", result) ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots == 0 }, 5*time.Second, 200*time.Millisecond, "Activity slot should be released after completion") @@ -700,6 +704,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatStickyCacheMiss() { // Wait for heartbeat to capture sticky cache miss var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.TotalStickyCacheMiss >= 1 }, 5*time.Second, 200*time.Millisecond, "Should have at least 1 sticky cache miss") } @@ -786,6 +791,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatFailureMetrics() { // Wait for heartbeat to capture failure metrics var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && workerInfo.ActivityTaskSlotsInfo.TotalFailedTasks >= 1 }, 5*time.Second, 200*time.Millisecond, "Should have tracked at least 1 activity task failure") @@ -794,6 +800,7 @@ func 
(ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatFailureMetrics() { // Last interval should go back to 0 on next heartbeat ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && workerInfo.ActivityTaskSlotsInfo.LastIntervalFailureTasks == 0 }, 5*time.Second, 200*time.Millisecond, "Last interval failure count should reset to 0") @@ -821,6 +828,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWorkflowTaskProcessed() { // Wait for heartbeat to capture processed tasks var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && workerInfo.WorkflowTaskSlotsInfo.TotalProcessedTasks == int32(numWorkflows) }, 5*time.Second, 200*time.Millisecond, "Should have processed all workflow tasks") @@ -829,6 +837,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWorkflowTaskProcessed() { // Last interval should go back to 0 on next heartbeat ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && workerInfo.WorkflowTaskSlotsInfo.LastIntervalProcessedTasks == 0 }, 5*time.Second, 200*time.Millisecond, "Last interval processed count should reset to 0") @@ -885,6 +894,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatResourceBasedTuner() { // Wait for heartbeat with resource-based tuner info var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && workerInfo.WorkflowTaskSlotsInfo.SlotSupplierKind == "ResourceBased" }, 5*time.Second, 200*time.Millisecond, "Should find worker with ResourceBased slot supplier") @@ -955,6 +965,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatPlugins() { 
// Wait for heartbeat with plugin info var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && len(workerInfo.Plugins) == 2 }, 5*time.Second, 200*time.Millisecond, "Should have 2 unique plugins (duplicates deduped)") From a25d85dd4e826b2e3b5dfcbd8b1936e7afd59c14 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 3 Feb 2026 13:14:36 -0800 Subject: [PATCH 15/30] Fix tests --- contrib/sysinfo/sysinfo.go | 14 +++++++++----- contrib/sysinfo/sysinfo_test.go | 4 ++-- test/worker_heartbeat_test.go | 5 +++++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/contrib/sysinfo/sysinfo.go b/contrib/sysinfo/sysinfo.go index b6865e452..9c729f1ec 100644 --- a/contrib/sysinfo/sysinfo.go +++ b/contrib/sysinfo/sysinfo.go @@ -4,6 +4,7 @@ import ( "context" "runtime" "sync" + "sync/atomic" "time" "github.com/shirou/gopsutil/v4/cpu" @@ -31,7 +32,7 @@ func NewResourceBasedTuner(opts worker.ResourceBasedTunerOptions) (worker.Worker type psUtilSystemInfoSupplier struct { mu sync.Mutex - lastRefresh time.Time + lastRefresh atomic.Int64 // UnixNano, atomic for lock-free reads in maybeRefresh lastMemStat *mem.VirtualMemoryStat lastCpuUsage float64 @@ -57,6 +58,8 @@ func (p *psUtilSystemInfoSupplier) GetMemoryUsage(infoContext *worker.SystemInfo if err := p.maybeRefresh(infoContext); err != nil { return 0, err } + p.mu.Lock() + defer p.mu.Unlock() lastCGroupMem := p.cGroupInfo.GetLastMemUsage() if lastCGroupMem != 0 { return lastCGroupMem, nil @@ -68,7 +71,8 @@ func (p *psUtilSystemInfoSupplier) GetCpuUsage(infoContext *worker.SystemInfoCon if err := p.maybeRefresh(infoContext); err != nil { return 0, err } - + p.mu.Lock() + defer p.mu.Unlock() lastCGroupCPU := p.cGroupInfo.GetLastCPUUsage() if lastCGroupCPU != 0 { return lastCGroupCPU, nil @@ -77,13 +81,13 @@ func (p *psUtilSystemInfoSupplier) GetCpuUsage(infoContext *worker.SystemInfoCon } func (p *psUtilSystemInfoSupplier) 
maybeRefresh(infoContext *worker.SystemInfoContext) error { - if time.Since(p.lastRefresh) < 100*time.Millisecond { + if time.Since(time.Unix(0, p.lastRefresh.Load())) < 100*time.Millisecond { return nil } p.mu.Lock() defer p.mu.Unlock() // Double check refresh is still needed - if time.Since(p.lastRefresh) < 100*time.Millisecond { + if time.Since(time.Unix(0, p.lastRefresh.Load())) < 100*time.Millisecond { return nil } ctx, cancelFn := context.WithTimeout(context.Background(), 1*time.Second) @@ -108,6 +112,6 @@ func (p *psUtilSystemInfoSupplier) maybeRefresh(infoContext *worker.SystemInfoCo p.stopTryingToGetCGroupInfo = !continueUpdates } - p.lastRefresh = time.Now() + p.lastRefresh.Store(time.Now().UnixNano()) return nil } diff --git a/contrib/sysinfo/sysinfo_test.go b/contrib/sysinfo/sysinfo_test.go index b1671d1e7..073ee2c5a 100644 --- a/contrib/sysinfo/sysinfo_test.go +++ b/contrib/sysinfo/sysinfo_test.go @@ -31,12 +31,12 @@ func TestMaybeRefreshRateLimiting(t *testing.T) { // First call should refresh firstUsage, err := supplier.GetMemoryUsage(ctx) require.NoError(t, err) - firstRefresh := supplier.lastRefresh + firstRefresh := supplier.lastRefresh.Load() // Immediate second call should not refresh (rate limited) secondUsage, err := supplier.GetMemoryUsage(ctx) require.NoError(t, err) - assert.Equal(t, firstRefresh, supplier.lastRefresh) + assert.Equal(t, firstRefresh, supplier.lastRefresh.Load()) assert.Equal(t, firstUsage, secondUsage) } diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index 749592d4c..239d7606a 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "runtime" "sync" "sync/atomic" "testing" @@ -68,9 +69,11 @@ func (ts *WorkerHeartbeatTestSuite) SetupTest() { func (ts *WorkerHeartbeatTestSuite) TearDownTest() { if ts.worker != nil { ts.worker.Stop() + ts.worker = nil } if ts.client != nil { ts.client.Close() + ts.client = nil } } @@ -652,6 
+655,8 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatStickyCacheMiss() { return result, err } + // GC ensures previous worker's cache finalizer runs, allowing cache to be recreated with new size + runtime.GC() worker.SetStickyWorkflowCacheSize(1) ts.worker = worker.New(ts.client, ts.taskQueueName, worker.Options{ MaxConcurrentWorkflowTaskExecutionSize: 2, From 53da340f2b8f4e7661ad9f37f261c60cc0c1a796 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 3 Feb 2026 13:54:07 -0800 Subject: [PATCH 16/30] Fix tests, disable heartbeating for normal tests, bump dev server version --- internal/cmd/build/main.go | 2 +- test/integration_test.go | 10 ++++++---- test/test_utils_test.go | 2 ++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/internal/cmd/build/main.go b/internal/cmd/build/main.go index 04086145d..dccc240fa 100644 --- a/internal/cmd/build/main.go +++ b/internal/cmd/build/main.go @@ -121,7 +121,7 @@ func (b *builder) integrationTest() error { if *devServerFlag { devServer, err := testsuite.StartDevServer(context.Background(), testsuite.DevServerOptions{ CachedDownload: testsuite.CachedDownload{ - Version: "v1.5.0-rc", + Version: "v1.5.1", }, ClientOptions: &client.Options{ HostPort: "127.0.0.1:7233", diff --git a/test/integration_test.go b/test/integration_test.go index c411a076a..bc2443f85 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -169,6 +169,7 @@ func (ts *IntegrationTestSuite) SetupTest() { var err error trafficController := test.NewSimpleTrafficController() + disableHeartbeat := time.Duration(0) ts.client, err = client.Dial(client.Options{ HostPort: ts.config.ServiceAddr, Namespace: ts.config.Namespace, @@ -177,10 +178,11 @@ func (ts *IntegrationTestSuite) SetupTest() { NewKeysPropagator([]string{testContextKey1}), NewKeysPropagator([]string{testContextKey2}), }, - MetricsHandler: metricsHandler, - TrafficController: trafficController, - Interceptors: clientInterceptors, - ConnectionOptions: 
client.ConnectionOptions{TLS: ts.config.TLS}, + MetricsHandler: metricsHandler, + TrafficController: trafficController, + Interceptors: clientInterceptors, + ConnectionOptions: client.ConnectionOptions{TLS: ts.config.TLS}, + WorkerHeartbeatInterval: &disableHeartbeat, }) ts.NoError(err) diff --git a/test/test_utils_test.go b/test/test_utils_test.go index 644b5514b..88e8040ef 100644 --- a/test/test_utils_test.go +++ b/test/test_utils_test.go @@ -230,6 +230,7 @@ func (ts *ConfigAndClientSuiteBase) InitClient() error { } func (ts *ConfigAndClientSuiteBase) newClient() (client.Client, error) { + disableHeartbeat := time.Duration(0) return client.Dial(client.Options{ HostPort: ts.config.ServiceAddr, Namespace: ts.config.Namespace, @@ -238,6 +239,7 @@ func (ts *ConfigAndClientSuiteBase) newClient() (client.Client, error) { TLS: ts.config.TLS, GetSystemInfoTimeout: ctxTimeout, }, + WorkerHeartbeatInterval: &disableHeartbeat, }) } From 21552069ab0895680eb3d78a5154a32f68231e50 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 4 Feb 2026 10:07:56 -0800 Subject: [PATCH 17/30] Finish renames of sysInfoProvider, handle Time.IsZero(), make pollTimeTracker more idiomatic, move namespace capabilities to AW, remove Get from func names --- contrib/sysinfo/cgroups.go | 6 +-- contrib/sysinfo/cgroups_test.go | 22 ++++++++ contrib/sysinfo/sysinfo.go | 16 ++---- contrib/sysinfo/sysinfo_test.go | 21 +++----- internal/internal_nexus_task_poller.go | 4 +- internal/internal_nexus_worker.go | 13 ----- internal/internal_task_pollers.go | 14 ++--- internal/internal_worker.go | 51 +++++++----------- internal/internal_worker_heartbeat.go | 26 ++++------ internal/internal_worker_heartbeat_metrics.go | 6 ++- internal/internal_workers_test.go | 7 --- internal/internal_workflow_client.go | 4 +- internal/resource_tuner.go | 46 ++++++++-------- internal/resource_tuner_test.go | 4 +- test/worker_heartbeat_test.go | 52 +++++++++++++++++-- worker/tuning.go | 14 ++--- 16 files changed, 158 
insertions(+), 148 deletions(-) create mode 100644 contrib/sysinfo/cgroups_test.go diff --git a/contrib/sysinfo/cgroups.go b/contrib/sysinfo/cgroups.go index ea5e7ec19..18d342b93 100644 --- a/contrib/sysinfo/cgroups.go +++ b/contrib/sysinfo/cgroups.go @@ -49,11 +49,11 @@ func (p *cGroupInfoImpl) GetLastCPUUsage() float64 { func (p *cGroupInfoImpl) updateCGroupStats() error { control, err := cgroup2.Load("/") if err != nil { - return fmt.Errorf("failed to get cgroup mem stats %v", err) + return fmt.Errorf("failed to load cgroup: %w", err) } metrics, err := control.Stat() if err != nil { - return fmt.Errorf("failed to get cgroup mem stats %v", err) + return fmt.Errorf("failed to get cgroup stats: %w", err) } // Only update if a limit has been set if metrics.Memory.UsageLimit != 0 { @@ -62,7 +62,7 @@ func (p *cGroupInfoImpl) updateCGroupStats() error { err = p.cgroupCpuCalc.updateCpuUsage(metrics) if err != nil { - return fmt.Errorf("failed to get cgroup cpu usage %v", err) + return fmt.Errorf("failed to get cgroup cpu usage: %w", err) } return nil } diff --git a/contrib/sysinfo/cgroups_test.go b/contrib/sysinfo/cgroups_test.go new file mode 100644 index 000000000..0ec95389b --- /dev/null +++ b/contrib/sysinfo/cgroups_test.go @@ -0,0 +1,22 @@ +//go:build linux + +package sysinfo + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// TestCGroupInfoUpdateOutsideContainer verifies that Update() gracefully handles +// running outside a cgroup environment by returning (false, nil) instead of an error. +// This exercises the errors.Is(err, fs.ErrNotExist) check in cGroupInfoImpl.Update(). +func TestCGroupInfoUpdateOutsideContainer(t *testing.T) { + info := newCGroupInfo().(*cGroupInfoImpl) + continueUpdates, err := info.Update() + + // When not in a cgroup (fs.ErrNotExist from cgroup2.Load), Update should + // return false with no error, signaling to stop trying cgroup updates. 
+ assert.False(t, continueUpdates, "should return false when cgroup files don't exist") + assert.NoError(t, err, "should not return error when cgroup files don't exist") +} diff --git a/contrib/sysinfo/sysinfo.go b/contrib/sysinfo/sysinfo.go index 9c729f1ec..204b6ea63 100644 --- a/contrib/sysinfo/sysinfo.go +++ b/contrib/sysinfo/sysinfo.go @@ -18,18 +18,12 @@ var sysInfoProvider = sync.OnceValue(func() *psUtilSystemInfoSupplier { } }) -// SysInfoProvider returns a shared SystemInfoSupplier using gopsutil. +// SysInfoProvider returns a shared SysInfoProvider using gopsutil. // Supports cgroup metrics in containerized Linux environments. -func SysInfoProvider() worker.SystemInfoSupplier { +func SysInfoProvider() worker.SysInfoProvider { return sysInfoProvider() } -// NewResourceBasedTuner creates a resource-based tuner with gopsutil-based system info. -func NewResourceBasedTuner(opts worker.ResourceBasedTunerOptions) (worker.WorkerTuner, error) { - opts.InfoSupplier = SysInfoProvider() - return worker.NewResourceBasedTuner(opts) -} - type psUtilSystemInfoSupplier struct { mu sync.Mutex lastRefresh atomic.Int64 // UnixNano, atomic for lock-free reads in maybeRefresh @@ -54,7 +48,7 @@ type cGroupInfo interface { GetLastCPUUsage() float64 } -func (p *psUtilSystemInfoSupplier) GetMemoryUsage(infoContext *worker.SystemInfoContext) (float64, error) { +func (p *psUtilSystemInfoSupplier) MemoryUsage(infoContext *worker.SysInfoContext) (float64, error) { if err := p.maybeRefresh(infoContext); err != nil { return 0, err } @@ -67,7 +61,7 @@ func (p *psUtilSystemInfoSupplier) GetMemoryUsage(infoContext *worker.SystemInfo return p.lastMemStat.UsedPercent / 100, nil } -func (p *psUtilSystemInfoSupplier) GetCpuUsage(infoContext *worker.SystemInfoContext) (float64, error) { +func (p *psUtilSystemInfoSupplier) CpuUsage(infoContext *worker.SysInfoContext) (float64, error) { if err := p.maybeRefresh(infoContext); err != nil { return 0, err } @@ -80,7 +74,7 @@ func (p 
*psUtilSystemInfoSupplier) GetCpuUsage(infoContext *worker.SystemInfoCon return p.lastCpuUsage / 100, nil } -func (p *psUtilSystemInfoSupplier) maybeRefresh(infoContext *worker.SystemInfoContext) error { +func (p *psUtilSystemInfoSupplier) maybeRefresh(infoContext *worker.SysInfoContext) error { if time.Since(time.Unix(0, p.lastRefresh.Load())) < 100*time.Millisecond { return nil } diff --git a/contrib/sysinfo/sysinfo_test.go b/contrib/sysinfo/sysinfo_test.go index 073ee2c5a..8e8bfb6be 100644 --- a/contrib/sysinfo/sysinfo_test.go +++ b/contrib/sysinfo/sysinfo_test.go @@ -11,14 +11,14 @@ import ( func TestGetMemoryCpuUsage(t *testing.T) { supplier := SysInfoProvider() - ctx := &worker.SystemInfoContext{Logger: log.NewNopLogger()} + ctx := &worker.SysInfoContext{Logger: log.NewNopLogger()} - usage, err := supplier.GetMemoryUsage(ctx) + usage, err := supplier.MemoryUsage(ctx) require.NoError(t, err) assert.GreaterOrEqual(t, usage, 0.0) assert.LessOrEqual(t, usage, 1.0) - usage, err = supplier.GetCpuUsage(ctx) + usage, err = supplier.CpuUsage(ctx) require.NoError(t, err) assert.GreaterOrEqual(t, usage, 0.0) assert.LessOrEqual(t, usage, 1.0) @@ -26,26 +26,17 @@ func TestGetMemoryCpuUsage(t *testing.T) { func TestMaybeRefreshRateLimiting(t *testing.T) { supplier := SysInfoProvider().(*psUtilSystemInfoSupplier) - ctx := &worker.SystemInfoContext{Logger: log.NewNopLogger()} + ctx := &worker.SysInfoContext{Logger: log.NewNopLogger()} // First call should refresh - firstUsage, err := supplier.GetMemoryUsage(ctx) + firstUsage, err := supplier.MemoryUsage(ctx) require.NoError(t, err) firstRefresh := supplier.lastRefresh.Load() // Immediate second call should not refresh (rate limited) - secondUsage, err := supplier.GetMemoryUsage(ctx) + secondUsage, err := supplier.MemoryUsage(ctx) require.NoError(t, err) assert.Equal(t, firstRefresh, supplier.lastRefresh.Load()) assert.Equal(t, firstUsage, secondUsage) } - -func TestNewResourceBasedTuner(t *testing.T) { - tuner, err := 
NewResourceBasedTuner(worker.ResourceBasedTunerOptions{ - TargetMem: 0.8, - TargetCpu: 0.9, - }) - require.NoError(t, err) - require.NotNil(t, tuner) -} diff --git a/internal/internal_nexus_task_poller.go b/internal/internal_nexus_task_poller.go index a67caf3af..0e29affe6 100644 --- a/internal/internal_nexus_task_poller.go +++ b/internal/internal_nexus_task_poller.go @@ -91,9 +91,7 @@ func (ntp *nexusTaskPoller) poll(ctx context.Context) (taskForWorker, error) { return nil, nil } - if ntp.pollTimeTracker != nil { - ntp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeNexusTask) - } + ntp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeNexusTask) return &nexusTask{task: response}, nil } diff --git a/internal/internal_nexus_worker.go b/internal/internal_nexus_worker.go index ce28d27d2..d5f844ed5 100644 --- a/internal/internal_nexus_worker.go +++ b/internal/internal_nexus_worker.go @@ -16,7 +16,6 @@ type nexusWorkerOptions struct { type nexusWorker struct { executionParameters workerExecutionParameters workflowService workflowservice.WorkflowServiceClient - client *WorkflowClient worker *baseWorker stopC chan struct{} } @@ -69,16 +68,9 @@ func newNexusWorker(opts nexusWorkerOptions) (*nexusWorker, error) { baseWorker := newBaseWorker(bwo) - // Type assert to get the concrete client for namespace capabilities loading - var workflowClient *WorkflowClient - if wc, ok := opts.client.(*WorkflowClient); ok { - workflowClient = wc - } - return &nexusWorker{ executionParameters: opts.executionParameters, workflowService: opts.workflowService, - client: workflowClient, worker: baseWorker, stopC: workerStopChannel, }, nil @@ -86,11 +78,6 @@ func newNexusWorker(opts nexusWorkerOptions) (*nexusWorker, error) { // Start the worker. 
func (w *nexusWorker) Start() error { - if w.client != nil { - if _, err := w.client.loadNamespaceCapabilities(w.executionParameters.MetricsHandler); err != nil { - return err - } - } w.worker.Start() return nil } diff --git a/internal/internal_task_pollers.go b/internal/internal_task_pollers.go index 4495b00e8..cf72e8203 100644 --- a/internal/internal_task_pollers.go +++ b/internal/internal_task_pollers.go @@ -975,12 +975,10 @@ func (wtp *workflowTaskPoller) poll(ctx context.Context) (taskForWorker, error) return &workflowTask{}, nil } - if wtp.pollTimeTracker != nil { - if request.TaskQueue.GetKind() == enumspb.TASK_QUEUE_KIND_STICKY { - wtp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeWorkflowStickyTask) - } else { - wtp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeWorkflowTask) - } + if request.TaskQueue.GetKind() == enumspb.TASK_QUEUE_KIND_STICKY { + wtp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeWorkflowStickyTask) + } else { + wtp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeWorkflowTask) } wtp.updateBacklog(request.TaskQueue.GetKind(), response.GetBacklogCountHint()) @@ -1182,9 +1180,7 @@ func (atp *activityTaskPoller) poll(ctx context.Context) (taskForWorker, error) return &activityTask{}, nil } - if atp.pollTimeTracker != nil { - atp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeActivityTask) - } + atp.pollTimeTracker.recordPollSuccess(metrics.PollerTypeActivityTask) workflowType := response.WorkflowType.GetName() activityType := response.ActivityType.GetName() diff --git a/internal/internal_worker.go b/internal/internal_worker.go index d0ca0353a..e21dd0088 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -81,7 +81,6 @@ type ( workflowWorker struct { executionParameters workerExecutionParameters workflowService workflowservice.WorkflowServiceClient - client *WorkflowClient worker *baseWorker localActivityWorker *baseWorker identity string @@ -95,7 +94,6 @@ type ( activityWorker struct { 
executionParameters workerExecutionParameters workflowService workflowservice.WorkflowServiceClient - client *WorkflowClient poller taskPoller worker *baseWorker identity string @@ -272,6 +270,9 @@ func ensureRequiredParams(params *workerExecutionParameters) { NumNexusSlots: defaultMaxConcurrentTaskExecutionSize, }) } + if params.pollTimeTracker == nil { + params.pollTimeTracker = &pollTimeTracker{} + } } // getBuildID returns either the user-defined build ID if it was provided, or an autogenerated one @@ -402,7 +403,6 @@ func newWorkflowTaskWorkerInternal( return &workflowWorker{ executionParameters: params, workflowService: service, - client: client, worker: worker, localActivityWorker: localActivityWorker, identity: params.Identity, @@ -414,11 +414,6 @@ func newWorkflowTaskWorkerInternal( // Start the worker. func (ww *workflowWorker) Start() error { - if ww.client != nil { - if _, err := ww.client.loadNamespaceCapabilities(ww.executionParameters.MetricsHandler); err != nil { - return err - } - } ww.localActivityWorker.Start() ww.worker.Start() return nil // TODO: propagate error @@ -553,7 +548,6 @@ func newActivityWorker( return &activityWorker{ executionParameters: params, workflowService: service, - client: client, worker: base, poller: poller, identity: params.Identity, @@ -563,11 +557,6 @@ func newActivityWorker( // Start the worker. 
func (aw *activityWorker) Start() error { - if aw.client != nil { - if _, err := aw.client.loadNamespaceCapabilities(aw.executionParameters.MetricsHandler); err != nil { - return err - } - } aw.worker.Start() return nil // TODO: propagate errors } @@ -1287,6 +1276,10 @@ func (aw *AggregatedWorker) start() error { } proto.Merge(aw.capabilities, capabilities) + if _, err := aw.client.loadNamespaceCapabilities(aw.executionParams.MetricsHandler); err != nil { + return err + } + if !util.IsInterfaceNil(aw.workflowWorker) { if err := aw.workflowWorker.Start(); err != nil { return err @@ -2096,12 +2089,10 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke baseMetricsHandler := client.metricsHandler.WithTags(metrics.TaskQueueTags(taskQueue)) var metricsHandler metrics.Handler var heartbeatMetrics *heartbeatMetricsHandler - var pollTracker *pollTimeTracker if client.workerHeartbeatInterval != 0 { heartbeatMetrics = newHeartbeatMetricsHandler(baseMetricsHandler) metricsHandler = heartbeatMetrics - pollTracker = newPollTimeTracker() } else { metricsHandler = baseMetricsHandler } @@ -2139,8 +2130,7 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke taskQueue: taskQueue, maxConcurrent: options.MaxConcurrentEagerActivityExecutionSize, }), - capabilities: &capabilities, - pollTimeTracker: pollTracker, + capabilities: &capabilities, } if options.MaxConcurrentWorkflowTaskPollers != 0 { @@ -2228,11 +2218,11 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke }) } - // Get SystemInfoSupplier from tuner's slot supplier if it implements HasSystemInfoSupplier. + // Get SysInfoProvider from tuner's slot supplier if it implements HasSysInfoProvider. // If not available, heartbeats will report 0 for CPU/memory usage. 
- var systemInfoSupplier SystemInfoSupplier - if sis, ok := options.Tuner.GetWorkflowTaskSlotSupplier().(HasSystemInfoSupplier); ok { - systemInfoSupplier = sis.GetSystemInfoSupplier() + var sysInfoProvider SysInfoProvider + if sis, ok := options.Tuner.GetWorkflowTaskSlotSupplier().(HasSysInfoProvider); ok { + sysInfoProvider = sis.SysInfoProvider() } var heartbeatCallback func() *workerpb.WorkerHeartbeat @@ -2242,9 +2232,6 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke pid := strconv.Itoa(os.Getpid()) previousHeartbeatTime := time.Now() pluginInfos := collectPluginInfos(client.clientPluginNames, plugins) - if pollTracker == nil { - panic("pollTracker must not be nil when heartbeats are enabled") - } var prevWorkflowProcessed, prevWorkflowFailed int64 var prevActivityProcessed, prevActivityFailed int64 @@ -2263,7 +2250,7 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke prevLocalActivityFailed: &prevLocalActivityFailed, prevNexusProcessed: &prevNexusProcessed, prevNexusFailed: &prevNexusFailed, - pollTimeTracker: pollTracker, + pollTimeTracker: workerParams.pollTimeTracker, } var deploymentVersion *deploymentpb.WorkerDeploymentVersion @@ -2277,8 +2264,8 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke // The callback can be invoked concurrently from the heartbeat worker goroutine and the shutdown path var mu sync.Mutex heartbeatCallback = func() *workerpb.WorkerHeartbeat { - cpuUsage := getCpuUsage(systemInfoSupplier, workerParams.Logger) - memUsage := getMemUsage(systemInfoSupplier, workerParams.Logger) + cpuUsage := getCpuUsage(sysInfoProvider, workerParams.Logger) + memUsage := getMemUsage(sysInfoProvider, workerParams.Logger) mu.Lock() defer mu.Unlock() @@ -2656,11 +2643,11 @@ func workerDeploymentVersionFromProtoOrString(wd *deploymentpb.WorkerDeploymentV } } -func getCpuUsage(supplier SystemInfoSupplier, logger log.Logger) float32 { +func 
getCpuUsage(supplier SysInfoProvider, logger log.Logger) float32 { if supplier == nil { return 0 } - cpu, err := supplier.GetCpuUsage(&SystemInfoContext{Logger: logger}) + cpu, err := supplier.CpuUsage(&SysInfoContext{Logger: logger}) if err != nil { logger.Warn("Failed to get CPU usage for heartbeat", "error", err) return 0 @@ -2668,11 +2655,11 @@ func getCpuUsage(supplier SystemInfoSupplier, logger log.Logger) float32 { return float32(cpu) } -func getMemUsage(supplier SystemInfoSupplier, logger log.Logger) float32 { +func getMemUsage(supplier SysInfoProvider, logger log.Logger) float32 { if supplier == nil { return 0 } - mem, err := supplier.GetMemoryUsage(&SystemInfoContext{Logger: logger}) + mem, err := supplier.MemoryUsage(&SysInfoContext{Logger: logger}) if err != nil { logger.Warn("Failed to get memory usage for heartbeat", "error", err) return 0 diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 3c9d9d3b4..c1b8ed136 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -77,23 +77,20 @@ func (m *heartbeatManager) unregisterWorker(worker *AggregatedWorker) { } func (m *heartbeatManager) getOrCreateSharedNamespaceWorker(worker *AggregatedWorker) (*sharedNamespaceWorker, error) { + capabilities, err := m.client.loadNamespaceCapabilities(worker.heartbeatMetrics) + if err != nil { + return nil, fmt.Errorf("failed to get namespace capabilities: %w", err) + } + if !capabilities.GetWorkerHeartbeats() { + m.logger.Debug("Worker heartbeating configured, but server version does not support it.") + return nil, nil + } namespace := worker.executionParams.Namespace m.workersMutex.Lock() + defer m.workersMutex.Unlock() hw, ok := m.workers[namespace] - m.workersMutex.Unlock() if !ok { - capabilities, err := m.client.loadNamespaceCapabilities(worker.heartbeatMetrics) - if err != nil { - return nil, fmt.Errorf("failed to get namespace capabilities: %w", err) - } - if 
!capabilities.GetWorkerHeartbeats() { - m.logger.Debug("Worker heartbeating configured, but server version does not support it.") - return nil, nil - } - - m.workersMutex.Lock() if existing, ok := m.workers[namespace]; ok { - m.workersMutex.Unlock() hw = existing } else { newHw := &sharedNamespaceWorker{ @@ -106,7 +103,6 @@ func (m *heartbeatManager) getOrCreateSharedNamespaceWorker(worker *AggregatedWo logger: m.logger, } m.workers[namespace] = newHw - m.workersMutex.Unlock() hw = newHw if hw.started.Swap(true) { panic("heartbeat worker already started") @@ -199,10 +195,6 @@ type pollTimeTracker struct { times sync.Map // pollerType (string) -> time.Time (stored as int64 nanos) } -func newPollTimeTracker() *pollTimeTracker { - return &pollTimeTracker{} -} - func (p *pollTimeTracker) recordPollSuccess(pollerType string) { p.times.Store(pollerType, time.Now().UnixNano()) } diff --git a/internal/internal_worker_heartbeat_metrics.go b/internal/internal_worker_heartbeat_metrics.go index f31b239b1..1804b456a 100644 --- a/internal/internal_worker_heartbeat_metrics.go +++ b/internal/internal_worker_heartbeat_metrics.go @@ -275,10 +275,14 @@ func buildPollerInfo(currentPollers int32, lastSuccessfulPollTime time.Time, pol case *pollerBehaviorAutoscaling: isAutoscaling = true } + var pollTime *timestamppb.Timestamp + if !lastSuccessfulPollTime.IsZero() { + pollTime = timestamppb.New(lastSuccessfulPollTime) + } return &workerpb.WorkerPollerInfo{ CurrentPollers: currentPollers, - LastSuccessfulPollTime: timestamppb.New(lastSuccessfulPollTime), + LastSuccessfulPollTime: pollTime, IsAutoscaling: isAutoscaling, } } diff --git a/internal/internal_workers_test.go b/internal/internal_workers_test.go index a2d41af1f..8c6509d62 100644 --- a/internal/internal_workers_test.go +++ b/internal/internal_workers_test.go @@ -73,7 +73,6 @@ func TestWorkersTestSuite(t *testing.T) { } func (s *WorkersTestSuite) TestWorkflowWorker() { - s.service.EXPECT().DescribeNamespace(gomock.Any(), 
gomock.Any(), gomock.Any()).Return(nil, nil) s.service.EXPECT().PollWorkflowTaskQueue(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.PollWorkflowTaskQueueResponse{}, nil).AnyTimes() s.service.EXPECT().RespondWorkflowTaskCompleted(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() @@ -156,7 +155,6 @@ func (s *WorkersTestSuite) TestWorkflowWorkerSlotSupplier() { unblockPollCh := make(chan struct{}) pollRespondedCh := make(chan struct{}) - s.service.EXPECT().DescribeNamespace(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) s.service.EXPECT().PollWorkflowTaskQueue(gomock.Any(), gomock.Any(), gomock.Any()). Do(func(ctx, in interface{}, opts ...interface{}) { <-unblockPollCh @@ -221,7 +219,6 @@ func (s *WorkersTestSuite) TestActivityWorkerSlotSupplier() { unblockPollCh := make(chan struct{}) pollRespondedCh := make(chan struct{}) - s.service.EXPECT().DescribeNamespace(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) s.service.EXPECT().PollActivityTaskQueue(gomock.Any(), gomock.Any(), gomock.Any()). Do(func(ctx, in interface{}, opts ...interface{}) { <-unblockPollCh @@ -301,7 +298,6 @@ func (s *WorkersTestSuite) TestErrorProneSlotSupplier() { unblockPollCh := make(chan struct{}) pollRespondedCh := make(chan struct{}) - s.service.EXPECT().DescribeNamespace(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) s.service.EXPECT().PollActivityTaskQueue(gomock.Any(), gomock.Any(), gomock.Any()). 
Do(func(ctx, in interface{}, opts ...interface{}) { <-unblockPollCh @@ -346,7 +342,6 @@ func (s *WorkersTestSuite) TestErrorProneSlotSupplier() { } func (s *WorkersTestSuite) TestActivityWorker() { - s.service.EXPECT().DescribeNamespace(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) s.service.EXPECT().PollActivityTaskQueue(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.PollActivityTaskQueueResponse{}, nil).AnyTimes() s.service.EXPECT().RespondActivityTaskCompleted(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.RespondActivityTaskCompletedResponse{}, nil).AnyTimes() @@ -392,7 +387,6 @@ func (s *WorkersTestSuite) TestActivityWorkerStop() { WorkflowNamespace: "namespace", } - s.service.EXPECT().DescribeNamespace(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) s.service.EXPECT().PollActivityTaskQueue(gomock.Any(), gomock.Any(), gomock.Any()).Return(pats, nil).AnyTimes() s.service.EXPECT().RespondActivityTaskCompleted(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.RespondActivityTaskCompletedResponse{}, nil).AnyTimes() @@ -440,7 +434,6 @@ func (s *WorkersTestSuite) TestActivityWorkerStop() { } func (s *WorkersTestSuite) TestPollWorkflowTaskQueue_InternalServiceError() { - s.service.EXPECT().DescribeNamespace(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil) s.service.EXPECT().PollWorkflowTaskQueue(gomock.Any(), gomock.Any(), gomock.Any()).Return(&workflowservice.PollWorkflowTaskQueueResponse{}, serviceerror.NewInternal("")).AnyTimes() executionParameters := workerExecutionParameters{ diff --git a/internal/internal_workflow_client.go b/internal/internal_workflow_client.go index 6d16b911a..328c7a83e 100644 --- a/internal/internal_workflow_client.go +++ b/internal/internal_workflow_client.go @@ -1390,7 +1390,9 @@ func (wc *WorkflowClient) loadNamespaceCapabilities(metricsHandler metrics.Handl if err != nil && !errors.As(err, &unimplemented) { return nil, fmt.Errorf("failed reaching 
server: %w", err) } - capabilities = resp.GetNamespaceInfo().GetCapabilities() + if resp != nil { + capabilities = resp.GetNamespaceInfo().GetCapabilities() + } if capabilities == nil { capabilities = &namespacepb.NamespaceInfo_Capabilities{} } diff --git a/internal/resource_tuner.go b/internal/resource_tuner.go index 86f612b6e..34f199b38 100644 --- a/internal/resource_tuner.go +++ b/internal/resource_tuner.go @@ -16,34 +16,32 @@ const ( resourceSlotsMemUsage = "temporal_resource_slots_mem_usage" ) -// SystemInfoSupplier implementations provide information about system resources. +// SysInfoProvider implementations provide information about system resources. // -// Exposed as: [go.temporal.io/sdk/worker.SystemInfoSupplier] -type SystemInfoSupplier interface { - // GetMemoryUsage returns the current system memory usage as a fraction of total memory between +// Exposed as: [go.temporal.io/sdk/worker.SysInfoProvider] +type SysInfoProvider interface { + // MemoryUsage returns the current system memory usage as a fraction of total memory between // 0 and 1. - GetMemoryUsage(infoContext *SystemInfoContext) (float64, error) - // GetCpuUsage returns the current system CPU usage as a fraction of total CPU usage between 0 + MemoryUsage(infoContext *SysInfoContext) (float64, error) + // CpuUsage returns the current system CPU usage as a fraction of total CPU usage between 0 // and 1. - GetCpuUsage(infoContext *SystemInfoContext) (float64, error) + CpuUsage(infoContext *SysInfoContext) (float64, error) } -// SystemInfoContext provides context for SystemInfoSupplier calls. +// SysInfoContext provides context for SysInfoProvider calls. 
// -// Exposed as: [go.temporal.io/sdk/worker.SystemInfoContext] -type SystemInfoContext struct { +// Exposed as: [go.temporal.io/sdk/worker.SysInfoContext] +type SysInfoContext struct { Logger log.Logger } -// TODO: Worried this is too invisible for custom slot suppliers to know to implement -// -// HasSystemInfoSupplier is an optional interface that SlotSupplier implementations can implement -// to expose their SystemInfoSupplier. This allows the SDK to access system metrics (CPU/memory) +// HasSysInfoProvider is an optional interface that SlotSupplier implementations can implement +// to expose their SysInfoProvider. This allows the SDK to access system metrics (CPU/memory) // for features like worker heartbeats without coupling to specific SlotSupplier implementations. // -// Exposed as: [go.temporal.io/sdk/worker.HasSystemInfoSupplier] -type HasSystemInfoSupplier interface { - GetSystemInfoSupplier() SystemInfoSupplier +// Exposed as: [go.temporal.io/sdk/worker.HasSysInfoProvider] +type HasSysInfoProvider interface { + SysInfoProvider() SysInfoProvider } // ResourceBasedTunerOptions configures a resource-based tuner. @@ -58,7 +56,7 @@ type ResourceBasedTunerOptions struct { TargetCpu float64 // InfoSupplier provides CPU and memory usage information. This is required. // Use contrib/sysinfo.SysInfoProvider() for a gopsutil-based implementation. - InfoSupplier SystemInfoSupplier + InfoSupplier SysInfoProvider // Passed to ResourceBasedSlotSupplierOptions.RampThrottle for activities. // If not set, the default value is 50ms. ActivityRampThrottle time.Duration @@ -232,8 +230,8 @@ func (r *ResourceBasedSlotSupplier) MaxSlots() int { return 0 } -// GetSystemInfoSupplier returns the SystemInfoSupplier used by this slot supplier's controller. -func (r *ResourceBasedSlotSupplier) GetSystemInfoSupplier() SystemInfoSupplier { +// GetSysInfoProvider returns the SysInfoProvider used by this slot supplier's controller. 
+func (r *ResourceBasedSlotSupplier) SysInfoProvider() SysInfoProvider { return r.controller.infoSupplier } @@ -250,7 +248,7 @@ type ResourceControllerOptions struct { // will attempt to maintain. CpuTargetPercent float64 // InfoSupplier is the supplier that the controller will use to get system resources. - InfoSupplier SystemInfoSupplier + InfoSupplier SysInfoProvider MemOutputThreshold float64 CpuOutputThreshold float64 @@ -307,7 +305,7 @@ type ResourceController struct { options ResourceControllerOptions mu sync.Mutex - infoSupplier SystemInfoSupplier + infoSupplier SysInfoProvider lastRefresh time.Time memPid *pidController cpuPid *pidController @@ -343,11 +341,11 @@ func (rc *ResourceController) pidDecision(logger log.Logger, metricsHandler metr rc.mu.Lock() defer rc.mu.Unlock() - memUsage, err := rc.infoSupplier.GetMemoryUsage(&SystemInfoContext{Logger: logger}) + memUsage, err := rc.infoSupplier.MemoryUsage(&SysInfoContext{Logger: logger}) if err != nil { return false, err } - cpuUsage, err := rc.infoSupplier.GetCpuUsage(&SystemInfoContext{Logger: logger}) + cpuUsage, err := rc.infoSupplier.CpuUsage(&SysInfoContext{Logger: logger}) if err != nil { return false, err } diff --git a/internal/resource_tuner_test.go b/internal/resource_tuner_test.go index 0e0d6c790..d97554401 100644 --- a/internal/resource_tuner_test.go +++ b/internal/resource_tuner_test.go @@ -12,11 +12,11 @@ type FakeSystemInfoSupplier struct { cpuUse float64 } -func (f FakeSystemInfoSupplier) GetMemoryUsage(_ *SystemInfoContext) (float64, error) { +func (f FakeSystemInfoSupplier) MemoryUsage(_ *SysInfoContext) (float64, error) { return f.memUse, nil } -func (f FakeSystemInfoSupplier) GetCpuUsage(_ *SystemInfoContext) (float64, error) { +func (f FakeSystemInfoSupplier) CpuUsage(_ *SysInfoContext) (float64, error) { return f.cpuUse, nil } diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index 239d7606a..21d31ba7a 100644 --- a/test/worker_heartbeat_test.go +++ 
b/test/worker_heartbeat_test.go @@ -2,7 +2,6 @@ package test_test import ( "context" - "errors" "fmt" "runtime" "sync" @@ -537,12 +536,14 @@ func workflowWithFailingActivity(ctx workflow.Context) error { return workflow.ExecuteActivity(ctx, failingActivity).Get(ctx, nil) } -// Workflow that panics (simulates workflow task failure) +// Workflow that panics to simulate a workflow task failure. The flag controls +// whether it panics, allowing tests to toggle it off so the workflow can +// eventually complete after the server retries the task. var failingWorkflowShouldFail atomic.Bool func failingWorkflow(ctx workflow.Context) (string, error) { if failingWorkflowShouldFail.Load() { - return "", errors.New("intentional workflow failure") + panic("intentional workflow task failure") } return "success", nil } @@ -811,6 +812,51 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatFailureMetrics() { }, 5*time.Second, 200*time.Millisecond, "Last interval failure count should reset to 0") } +func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWorkflowTaskFailureMetrics() { + ctx := context.Background() + + failingWorkflowShouldFail.Store(true) + defer failingWorkflowShouldFail.Store(false) + + ts.worker = worker.New(ts.client, ts.taskQueueName, worker.Options{}) + ts.worker.RegisterWorkflow(failingWorkflow) + ts.NoError(ts.worker.Start()) + + workflowOptions := client.StartWorkflowOptions{ + ID: "test-wf-task-failure-" + uuid.NewString(), + TaskQueue: ts.taskQueueName, + } + + _, err := ts.client.ExecuteWorkflow(ctx, workflowOptions, failingWorkflow) + ts.NoError(err) + + var workerInfo *workerpb.WorkerHeartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && + workerInfo.WorkflowTaskSlotsInfo.TotalFailedTasks >= 1 + }, 5*time.Second, 200*time.Millisecond, "Should have tracked at least 1 workflow task failure") + + 
ts.GreaterOrEqual(workerInfo.WorkflowTaskSlotsInfo.TotalFailedTasks, int32(1)) + ts.GreaterOrEqual(workerInfo.WorkflowTaskSlotsInfo.LastIntervalFailureTasks, int32(1)) + + // Stop panicking so the workflow can complete on the next retry + failingWorkflowShouldFail.Store(false) + + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && + workerInfo.WorkflowTaskSlotsInfo.TotalProcessedTasks >= 1 + }, 5*time.Second, 200*time.Millisecond, "Should have processed at least 1 workflow task after recovery") + + // Last interval failure count should reset to 0 on a subsequent heartbeat + ts.Eventually(func() bool { + workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) + return workerInfo != nil && workerInfo.WorkflowTaskSlotsInfo != nil && + workerInfo.WorkflowTaskSlotsInfo.LastIntervalFailureTasks == 0 + }, 5*time.Second, 200*time.Millisecond, "Last interval failure count should reset to 0") +} + func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWorkflowTaskProcessed() { ctx := context.Background() diff --git a/worker/tuning.go b/worker/tuning.go index 0894b0a9b..7b7362dd1 100644 --- a/worker/tuning.go +++ b/worker/tuning.go @@ -47,17 +47,17 @@ func NewFixedSizeSlotSupplier(numSlots int) (SlotSupplier, error) { return internal.NewFixedSizeSlotSupplier(numSlots) } -// SystemInfoSupplier implementations provide information about system resources. +// SysInfoProvider implementations provide information about system resources. // Use contrib/sysinfo.SysInfoProvider() for a gopsutil-based implementation, // or provide your own. -type SystemInfoSupplier = internal.SystemInfoSupplier +type SysInfoProvider = internal.SysInfoProvider -// SystemInfoContext provides context for SystemInfoSupplier calls. -type SystemInfoContext = internal.SystemInfoContext +// SysInfoContext provides context for SysInfoProvider calls. 
+type SysInfoContext = internal.SysInfoContext -// HasSystemInfoSupplier is an optional interface that SlotSupplier implementations can implement -// to expose their SystemInfoSupplier. -type HasSystemInfoSupplier = internal.HasSystemInfoSupplier +// HasSysInfoProvider is an optional interface that SlotSupplier implementations can implement +// to expose their SysInfoProvider. +type HasSysInfoProvider = internal.HasSysInfoProvider // ResourceBasedTunerOptions configures a resource-based tuner. type ResourceBasedTunerOptions = internal.ResourceBasedTunerOptions From dd0215920cd6dd4c61913cc94c84ff8fa67bb99a Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 5 Feb 2026 19:38:36 -0800 Subject: [PATCH 18/30] Fix tests --- contrib/sysinfo/cgroups_test.go | 22 ---------------------- test/worker_heartbeat_test.go | 8 ++++---- 2 files changed, 4 insertions(+), 26 deletions(-) delete mode 100644 contrib/sysinfo/cgroups_test.go diff --git a/contrib/sysinfo/cgroups_test.go b/contrib/sysinfo/cgroups_test.go deleted file mode 100644 index 0ec95389b..000000000 --- a/contrib/sysinfo/cgroups_test.go +++ /dev/null @@ -1,22 +0,0 @@ -//go:build linux - -package sysinfo - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -// TestCGroupInfoUpdateOutsideContainer verifies that Update() gracefully handles -// running outside a cgroup environment by returning (false, nil) instead of an error. -// This exercises the errors.Is(err, fs.ErrNotExist) check in cGroupInfoImpl.Update(). -func TestCGroupInfoUpdateOutsideContainer(t *testing.T) { - info := newCGroupInfo().(*cGroupInfoImpl) - continueUpdates, err := info.Update() - - // When not in a cgroup (fs.ErrNotExist from cgroup2.Load), Update should - // return false with no error, signaling to stop trying cgroup updates. 
- assert.False(t, continueUpdates, "should return false when cgroup files don't exist") - assert.NoError(t, err, "should not return error when cgroup files don't exist") -} diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index 21d31ba7a..d572ebcb4 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -149,11 +149,11 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.Equal("Fixed", localActivityTaskSlots.SlotSupplierKind) workflowPollerInfo := workerInfo.WorkflowPollerInfo - ts.Equal(int32(1), workflowPollerInfo.CurrentPollers) + ts.NotEqual(int32(0), workflowPollerInfo.CurrentPollers) stickyPollerInfo := workerInfo.WorkflowStickyPollerInfo ts.NotEqual(int32(0), stickyPollerInfo.CurrentPollers) nexusPollerInfo := workerInfo.NexusPollerInfo - ts.Equal(int32(2), nexusPollerInfo.CurrentPollers) + ts.NotEqual(int32(0), nexusPollerInfo.CurrentPollers) activityPollerInfo := workerInfo.ActivityPollerInfo ts.NotEqual(int32(0), activityPollerInfo.CurrentPollers) @@ -236,7 +236,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.Equal("Fixed", localActivityTaskSlots.SlotSupplierKind) workflowPollerInfo = workerInfo.WorkflowPollerInfo - ts.Equal(int32(1), workflowPollerInfo.CurrentPollers) + ts.NotEqual(int32(0), workflowPollerInfo.CurrentPollers) ts.False(workflowPollerInfo.IsAutoscaling) ts.assertRecentTimestamp(workflowPollerInfo.LastSuccessfulPollTime, 10*time.Second, "WorkflowPollerInfo.LastSuccessfulPollTime after shutdown") @@ -248,7 +248,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { "WorkflowStickyPollerInfo.LastSuccessfulPollTime after shutdown") nexusPollerInfo = workerInfo.NexusPollerInfo - ts.Equal(int32(2), nexusPollerInfo.CurrentPollers) + ts.NotEqual(int32(0), nexusPollerInfo.CurrentPollers) ts.False(nexusPollerInfo.IsAutoscaling) // Nexus poller has no successful polls since we didn't execute any nexus operations From 
bb556cbdc967643a906b4d2e57b858bf52bd3d47 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Mon, 9 Feb 2026 11:01:33 -0800 Subject: [PATCH 19/30] remove extra default logger addition, remove dead code --- internal/client.go | 4 ---- internal/internal_worker_heartbeat.go | 31 +++++++++++---------------- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/internal/client.go b/internal/client.go index 41b2d22aa..89133710a 100644 --- a/internal/client.go +++ b/internal/client.go @@ -1139,10 +1139,6 @@ func NewServiceClient(workflowServiceClient workflowservice.WorkflowServiceClien options.ConnectionOptions.GetSystemInfoTimeout = defaultGetSystemInfoTimeout } - if options.Logger == nil { - options.Logger = ilog.NewDefaultLogger() - } - // Collect set of applicable worker plugins and interceptors var workerPlugins []WorkerPlugin var clientPluginNames []string diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index c1b8ed136..2e012d458 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -90,25 +90,20 @@ func (m *heartbeatManager) getOrCreateSharedNamespaceWorker(worker *AggregatedWo defer m.workersMutex.Unlock() hw, ok := m.workers[namespace] if !ok { - if existing, ok := m.workers[namespace]; ok { - hw = existing - } else { - newHw := &sharedNamespaceWorker{ - client: m.client, - namespace: namespace, - interval: m.interval, - callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), - stopC: make(chan struct{}), - stoppedC: make(chan struct{}), - logger: m.logger, - } - m.workers[namespace] = newHw - hw = newHw - if hw.started.Swap(true) { - panic("heartbeat worker already started") - } - go hw.run() + hw = &sharedNamespaceWorker{ + client: m.client, + namespace: namespace, + interval: m.interval, + callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), + stopC: make(chan struct{}), + stoppedC: make(chan struct{}), + logger: m.logger, + } + m.workers[namespace] = 
hw + if hw.started.Swap(true) { + panic("heartbeat worker already started") } + go hw.run() } return hw, nil } From da40521420775adea477300718eb31967721d923 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Mon, 9 Feb 2026 16:54:32 -0800 Subject: [PATCH 20/30] forgot a change.. --- internal/cmd/build/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/cmd/build/main.go b/internal/cmd/build/main.go index 5a72c938b..1ed939df3 100644 --- a/internal/cmd/build/main.go +++ b/internal/cmd/build/main.go @@ -159,10 +159,10 @@ func (b *builder) integrationTest() error { "--dynamic-config-value", "history.enableChasm=true", "--dynamic-config-value", "history.enableTransitionHistory=true", "--dynamic-config-value", `component.nexusoperations.useSystemCallbackURL=false`, - "--dynamic-config-value", `component.nexusoperations.callback.endpoint.template="http://localhost:7243/namespaces/{{.NamespaceName}}/nexus/callback"`}, + "--dynamic-config-value", `component.nexusoperations.callback.endpoint.template="http://localhost:7243/namespaces/{{.NamespaceName}}/nexus/callback"`, "--dynamic-config-value", "frontend.WorkerHeartbeatsEnabled=true", "--dynamic-config-value", "frontend.ListWorkersEnabled=true", - }) + }}) if err != nil { return fmt.Errorf("failed starting dev server: %w", err) } From 04f5d4d95365eb53885ba0a84cb8b3421d46a7e0 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Mon, 9 Feb 2026 17:35:53 -0800 Subject: [PATCH 21/30] fix unit tests --- internal/cmd/build/main.go | 3 ++- internal/internal_worker.go | 6 +++--- internal/internal_worker_heartbeat.go | 8 +++++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/internal/cmd/build/main.go b/internal/cmd/build/main.go index 1ed939df3..6c0de7573 100644 --- a/internal/cmd/build/main.go +++ b/internal/cmd/build/main.go @@ -162,7 +162,8 @@ func (b *builder) integrationTest() error { "--dynamic-config-value", 
`component.nexusoperations.callback.endpoint.template="http://localhost:7243/namespaces/{{.NamespaceName}}/nexus/callback"`, "--dynamic-config-value", "frontend.WorkerHeartbeatsEnabled=true", "--dynamic-config-value", "frontend.ListWorkersEnabled=true", - }}) + }, + }) if err != nil { return fmt.Errorf("failed starting dev server: %w", err) } diff --git a/internal/internal_worker.go b/internal/internal_worker.go index d520d0ecf..609f2d33a 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -2243,9 +2243,9 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke var prevNexusProcessed, prevNexusFailed int64 populateOpts := &populateHeartbeatOptions{ - workflowPollerBehavior: options.WorkflowTaskPollerBehavior, - activityPollerBehavior: options.ActivityTaskPollerBehavior, - nexusPollerBehavior: options.NexusTaskPollerBehavior, + workflowPollerBehavior: workerParams.WorkflowTaskPollerBehavior, + activityPollerBehavior: workerParams.ActivityTaskPollerBehavior, + nexusPollerBehavior: workerParams.NexusTaskPollerBehavior, prevWorkflowProcessed: &prevWorkflowProcessed, prevWorkflowFailed: &prevWorkflowFailed, prevActivityProcessed: &prevActivityProcessed, diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 2e012d458..52399bd78 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -3,6 +3,7 @@ package internal import ( "context" "fmt" + ilog "go.temporal.io/sdk/internal/log" "sync" "sync/atomic" "time" @@ -26,6 +27,9 @@ type heartbeatManager struct { // newHeartbeatManager creates a new heartbeatManager. 
func newHeartbeatManager(client *WorkflowClient, interval time.Duration, logger log.Logger) *heartbeatManager { + if logger == nil { + logger = ilog.NewDefaultLogger() + } return &heartbeatManager{ client: client, interval: interval, @@ -82,7 +86,9 @@ func (m *heartbeatManager) getOrCreateSharedNamespaceWorker(worker *AggregatedWo return nil, fmt.Errorf("failed to get namespace capabilities: %w", err) } if !capabilities.GetWorkerHeartbeats() { - m.logger.Debug("Worker heartbeating configured, but server version does not support it.") + if m.logger != nil { + m.logger.Debug("Worker heartbeating configured, but server version does not support it.") + } return nil, nil } namespace := worker.executionParams.Namespace From a5c85d0371304e08b64f984d74ca025011a78f3d Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 10 Feb 2026 13:50:47 -0800 Subject: [PATCH 22/30] Fix eventually expectation for slower CI machines, fix race with heartbeat worker creation and callback registration --- internal/internal_worker_heartbeat.go | 68 ++++++++++++--------------- test/worker_heartbeat_test.go | 12 +++-- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index 52399bd78..babe64b42 100644 --- a/internal/internal_worker_heartbeat.go +++ b/internal/internal_worker_heartbeat.go @@ -42,12 +42,37 @@ func newHeartbeatManager(client *WorkflowClient, interval time.Duration, logger func (m *heartbeatManager) registerWorker( worker *AggregatedWorker, ) error { - hw, err := m.getOrCreateSharedNamespaceWorker(worker) + capabilities, err := m.client.loadNamespaceCapabilities(worker.heartbeatMetrics) if err != nil { - return err + return fmt.Errorf("failed to get namespace capabilities: %w", err) } - if hw == nil { - return nil // heartbeats not supported + if !capabilities.GetWorkerHeartbeats() { + if m.logger != nil { + m.logger.Debug("Worker heartbeating configured, but server version does not support 
it.") + } + return nil + } + + namespace := worker.executionParams.Namespace + m.workersMutex.Lock() + defer m.workersMutex.Unlock() + + hw, ok := m.workers[namespace] + if !ok { + hw = &sharedNamespaceWorker{ + client: m.client, + namespace: namespace, + interval: m.interval, + callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), + stopC: make(chan struct{}), + stoppedC: make(chan struct{}), + logger: m.logger, + } + m.workers[namespace] = hw + if hw.started.Swap(true) { + panic("heartbeat worker already started") + } + go hw.run() } hw.callbacksMutex.Lock() @@ -80,40 +105,6 @@ func (m *heartbeatManager) unregisterWorker(worker *AggregatedWorker) { } } -func (m *heartbeatManager) getOrCreateSharedNamespaceWorker(worker *AggregatedWorker) (*sharedNamespaceWorker, error) { - capabilities, err := m.client.loadNamespaceCapabilities(worker.heartbeatMetrics) - if err != nil { - return nil, fmt.Errorf("failed to get namespace capabilities: %w", err) - } - if !capabilities.GetWorkerHeartbeats() { - if m.logger != nil { - m.logger.Debug("Worker heartbeating configured, but server version does not support it.") - } - return nil, nil - } - namespace := worker.executionParams.Namespace - m.workersMutex.Lock() - defer m.workersMutex.Unlock() - hw, ok := m.workers[namespace] - if !ok { - hw = &sharedNamespaceWorker{ - client: m.client, - namespace: namespace, - interval: m.interval, - callbacks: make(map[string]func() *workerpb.WorkerHeartbeat), - stopC: make(chan struct{}), - stoppedC: make(chan struct{}), - logger: m.logger, - } - m.workers[namespace] = hw - if hw.started.Swap(true) { - panic("heartbeat worker already started") - } - go hw.run() - } - return hw, nil -} - // sharedNamespaceWorker handles heartbeating for all workers in a specific namespace for a specific client. 
type sharedNamespaceWorker struct { client *WorkflowClient @@ -121,6 +112,7 @@ type sharedNamespaceWorker struct { interval time.Duration logger log.Logger + // callbacksMutex guards the callbacks map below callbacksMutex sync.RWMutex callbacks map[string]func() *workerpb.WorkerHeartbeat // workerInstanceKey -> callback diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index d572ebcb4..a9768034e 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -116,13 +116,19 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.Fail("Timeout waiting for activity to start") } - // Wait for heartbeat to capture the in-flight activity + // Wait for heartbeat to capture the in-flight activity and all pollers to be actively polling. + // Poller counts are only non-zero while a poll RPC is in-flight, so on slower CI machines + // the first heartbeat may fire before all pollers have entered their first long-poll. var workerInfo *workerpb.WorkerHeartbeat ts.Eventually(func() bool { workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil &&
passes CI --- test/worker_heartbeat_test.go | 45 +++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index a9768034e..438623784 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -116,19 +116,13 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.Fail("Timeout waiting for activity to start") } - // Wait for heartbeat to capture the in-flight activity and all pollers to be actively polling. - // Poller counts are only non-zero while a poll RPC is in-flight, so on slower CI machines - // the first heartbeat may fire before all pollers have entered their first long-poll. var workerInfo *workerpb.WorkerHeartbeat + // Wait for heartbeat to capture the in-flight activity ts.Eventually(func() bool { workerInfo = ts.getWorkerInfo(ctx, ts.taskQueueName) return workerInfo != nil && workerInfo.ActivityTaskSlotsInfo != nil && - workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots >= 1 && - workerInfo.WorkflowPollerInfo.GetCurrentPollers() > 0 && - workerInfo.WorkflowStickyPollerInfo.GetCurrentPollers() > 0 && - workerInfo.ActivityPollerInfo.GetCurrentPollers() > 0 && - workerInfo.NexusPollerInfo.GetCurrentPollers() > 0 - }, 5*time.Second, 200*time.Millisecond, "Should find worker with activity slot used and all pollers active") + workerInfo.ActivityTaskSlotsInfo.CurrentUsedSlots >= 1 + }, 5*time.Second, 200*time.Millisecond, "Should find worker with activity slot used") ts.Equal(enums.WORKER_STATUS_RUNNING, workerInfo.Status) @@ -156,14 +150,16 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { workflowPollerInfo := workerInfo.WorkflowPollerInfo ts.NotEqual(int32(0), workflowPollerInfo.CurrentPollers) - stickyPollerInfo := workerInfo.WorkflowStickyPollerInfo - ts.NotEqual(int32(0), stickyPollerInfo.CurrentPollers) nexusPollerInfo := workerInfo.NexusPollerInfo ts.NotEqual(int32(0), 
nexusPollerInfo.CurrentPollers) activityPollerInfo := workerInfo.ActivityPollerInfo ts.NotEqual(int32(0), activityPollerInfo.CurrentPollers) - ts.Equal(int32(1), workerInfo.CurrentStickyCacheSize) + if ts.config.maxWorkflowCacheSize > 0 { + stickyPollerInfo := workerInfo.WorkflowStickyPollerInfo + ts.NotEqual(int32(0), stickyPollerInfo.CurrentPollers) + ts.Equal(int32(1), workerInfo.CurrentStickyCacheSize) + } ts.assertRecentTimestamp(workerInfo.StartTime, 10*time.Second, "StartTime") ts.assertRecentTimestamp(workerInfo.HeartbeatTime, 5*time.Second, "HeartbeatTime") @@ -247,11 +243,13 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.assertRecentTimestamp(workflowPollerInfo.LastSuccessfulPollTime, 10*time.Second, "WorkflowPollerInfo.LastSuccessfulPollTime after shutdown") - stickyPollerInfo = workerInfo.WorkflowStickyPollerInfo - ts.NotEqual(int32(0), stickyPollerInfo.CurrentPollers) - ts.False(stickyPollerInfo.IsAutoscaling) - ts.assertRecentTimestamp(stickyPollerInfo.LastSuccessfulPollTime, 10*time.Second, - "WorkflowStickyPollerInfo.LastSuccessfulPollTime after shutdown") + if ts.config.maxWorkflowCacheSize > 0 { + stickyPollerInfo := workerInfo.WorkflowStickyPollerInfo + ts.NotEqual(int32(0), stickyPollerInfo.CurrentPollers) + ts.False(stickyPollerInfo.IsAutoscaling) + ts.assertRecentTimestamp(stickyPollerInfo.LastSuccessfulPollTime, 10*time.Second, + "WorkflowStickyPollerInfo.LastSuccessfulPollTime after shutdown") + } nexusPollerInfo = workerInfo.NexusPollerInfo ts.NotEqual(int32(0), nexusPollerInfo.CurrentPollers) @@ -264,7 +262,9 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { ts.assertRecentTimestamp(activityPollerInfo.LastSuccessfulPollTime, 10*time.Second, "ActivityPollerInfo.LastSuccessfulPollTime after shutdown") - ts.Equal(int32(1), workerInfo.TotalStickyCacheHit) + if ts.config.maxWorkflowCacheSize > 0 { + ts.Equal(int32(1), workerInfo.TotalStickyCacheHit) + } } // TestWorkerHeartbeatDeploymentVersion verifies 
that deployment version info is @@ -617,6 +617,9 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatWithActivityInFlight() { } func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatStickyCacheMiss() { + if ts.config.maxWorkflowCacheSize == 0 { + ts.T().Skip("Sticky cache disabled") + } ctx := context.Background() wf1ActivityStarted := make(chan struct{}, 1) @@ -965,8 +968,10 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatResourceBasedTuner() { ts.NotNil(workerInfo.WorkflowPollerInfo) ts.True(workerInfo.WorkflowPollerInfo.IsAutoscaling) - ts.NotNil(workerInfo.WorkflowStickyPollerInfo) - ts.True(workerInfo.WorkflowStickyPollerInfo.IsAutoscaling) + if ts.config.maxWorkflowCacheSize > 0 { + ts.NotNil(workerInfo.WorkflowStickyPollerInfo) + ts.True(workerInfo.WorkflowStickyPollerInfo.IsAutoscaling) + } ts.NotNil(workerInfo.ActivityPollerInfo) ts.True(workerInfo.ActivityPollerInfo.IsAutoscaling) From faeba63ca2d029b4b3ca48292596e3b9a655f4f2 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 10 Feb 2026 15:34:47 -0800 Subject: [PATCH 24/30] loosen workerInfo.CurrentStickyCacheSize and workerInfo.TotalStickyCacheHit checks --- test/worker_heartbeat_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index 438623784..a8c957e93 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -158,7 +158,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { if ts.config.maxWorkflowCacheSize > 0 { stickyPollerInfo := workerInfo.WorkflowStickyPollerInfo ts.NotEqual(int32(0), stickyPollerInfo.CurrentPollers) - ts.Equal(int32(1), workerInfo.CurrentStickyCacheSize) + ts.GreaterOrEqual(workerInfo.CurrentStickyCacheSize, int32(1)) } ts.assertRecentTimestamp(workerInfo.StartTime, 10*time.Second, "StartTime") @@ -263,7 +263,7 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatBasic() { "ActivityPollerInfo.LastSuccessfulPollTime after 
shutdown") if ts.config.maxWorkflowCacheSize > 0 { - ts.Equal(int32(1), workerInfo.TotalStickyCacheHit) + ts.GreaterOrEqual(workerInfo.TotalStickyCacheHit, int32(1)) } } From 004032a24fab4408be1fa19a9355efff503bc3ec Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 10 Feb 2026 16:40:00 -0800 Subject: [PATCH 25/30] Fix up TestWorkerHeartbeatStickyCacheMiss --- test/worker_heartbeat_test.go | 100 ++++++++++------------------------ 1 file changed, 30 insertions(+), 70 deletions(-) diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index a8c957e93..1da461cc9 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -3,7 +3,6 @@ package test_test import ( "context" "fmt" - "runtime" "sync" "sync/atomic" "testing" @@ -622,99 +621,60 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatStickyCacheMiss() { } ctx := context.Background() - wf1ActivityStarted := make(chan struct{}, 1) - wf1ActivityComplete := make(chan struct{}, 1) - wf2ActivityStarted := make(chan struct{}, 1) - wf2ActivityComplete := make(chan struct{}, 1) - - stickyCacheMissActivity := func(ctx context.Context, marker string) (string, error) { - switch marker { - case "wf1": - select { - case wf1ActivityStarted <- struct{}{}: - default: - } - select { - case <-wf1ActivityComplete: - return marker, nil - case <-ctx.Done(): - return "", ctx.Err() - } - case "wf2": - select { - case wf2ActivityStarted <- struct{}{}: - default: - } - select { - case <-wf2ActivityComplete: - return marker, nil - case <-ctx.Done(): - return "", ctx.Err() - } + activityStarted := make(chan struct{}, 1) + activityComplete := make(chan struct{}, 1) + + cacheMissActivity := func(ctx context.Context) (string, error) { + select { + case activityStarted <- struct{}{}: + default: + } + select { + case <-activityComplete: + return "done", nil + case <-ctx.Done(): + return "", ctx.Err() } - return marker, nil } - stickyCacheMissWorkflow := func(ctx workflow.Context, marker string) 
(string, error) { + cacheMissWorkflow := func(ctx workflow.Context) (string, error) { ao := workflow.ActivityOptions{ StartToCloseTimeout: 30 * time.Second, } ctx = workflow.WithActivityOptions(ctx, ao) var result string - err := workflow.ExecuteActivity(ctx, stickyCacheMissActivity, marker).Get(ctx, &result) + err := workflow.ExecuteActivity(ctx, cacheMissActivity).Get(ctx, &result) return result, err } - // GC ensures previous worker's cache finalizer runs, allowing cache to be recreated with new size - runtime.GC() - worker.SetStickyWorkflowCacheSize(1) ts.worker = worker.New(ts.client, ts.taskQueueName, worker.Options{ - MaxConcurrentWorkflowTaskExecutionSize: 2, - DisableEagerActivities: true, + DisableEagerActivities: true, }) - ts.worker.RegisterWorkflow(stickyCacheMissWorkflow) - ts.worker.RegisterActivity(stickyCacheMissActivity) + ts.worker.RegisterWorkflow(cacheMissWorkflow) + ts.worker.RegisterActivity(cacheMissActivity) ts.NoError(ts.worker.Start()) - wf1Options := client.StartWorkflowOptions{ - ID: "test-sticky-miss-wf1-" + uuid.NewString(), + wfOptions := client.StartWorkflowOptions{ + ID: "test-sticky-miss-" + uuid.NewString(), TaskQueue: ts.taskQueueName, } - run1, err := ts.client.ExecuteWorkflow(ctx, wf1Options, stickyCacheMissWorkflow, "wf1") + run, err := ts.client.ExecuteWorkflow(ctx, wfOptions, cacheMissWorkflow) ts.NoError(err) select { - case <-wf1ActivityStarted: - ts.T().Log("wf1 activity started") - case <-time.After(10 * time.Second): - ts.Fail("Timeout waiting for wf1 activity to start") - } - - // this should evict wf1 from the cache - wf2Options := client.StartWorkflowOptions{ - ID: "test-sticky-miss-wf2-" + uuid.NewString(), - TaskQueue: ts.taskQueueName, - } - run2, err := ts.client.ExecuteWorkflow(ctx, wf2Options, stickyCacheMissWorkflow, "wf2") - ts.NoError(err) - - select { - case <-wf2ActivityStarted: - ts.T().Log("wf2 activity started") + case <-activityStarted: + ts.T().Log("Activity started") case <-time.After(10 * 
time.Second): - ts.Fail("Timeout waiting for wf2 activity to start") + ts.Fail("Timeout waiting for activity to start") } - // wf1 should experience a cache miss when it resumes - wf1ActivityComplete <- struct{}{} - var result1 string - ts.NoError(run1.Get(ctx, &result1)) - ts.Equal("wf1", result1) + // Purge the cache so the workflow's sticky task triggers a cache miss on resume + worker.PurgeStickyWorkflowCache() - wf2ActivityComplete <- struct{}{} - var result2 string - ts.NoError(run2.Get(ctx, &result2)) - ts.Equal("wf2", result2) + activityComplete <- struct{}{} + var result string + ts.NoError(run.Get(ctx, &result)) + ts.Equal("done", result) // Wait for heartbeat to capture sticky cache miss var workerInfo *workerpb.WorkerHeartbeat From 257e264010188a242b922f679ad5c4e8cbd13716 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 11 Feb 2026 09:24:06 -0800 Subject: [PATCH 26/30] Add comment, minor fix --- internal/internal_worker.go | 5 +--- internal/internal_worker_heartbeat.go | 1 + internal/internal_worker_heartbeat_metrics.go | 28 +++++++++---------- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 795571f52..91c943047 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -1522,13 +1522,10 @@ func (aw *AggregatedWorker) shutdownWorker() { WorkerHeartbeat: heartbeat, }) - // Ignore unimplemented (server doesn't support it) and unavailable (server shutting down) + // Ignore unimplemented (server doesn't support it) if _, isUnimplemented := err.(*serviceerror.Unimplemented); isUnimplemented { return } - if _, isUnavailable := err.(*serviceerror.Unavailable); isUnavailable { - return - } if err != nil { aw.logger.Debug("ShutdownWorker rpc errored during worker shutdown.", tagError, err) diff --git a/internal/internal_worker_heartbeat.go b/internal/internal_worker_heartbeat.go index babe64b42..74ddb9f73 100644 --- a/internal/internal_worker_heartbeat.go +++ 
b/internal/internal_worker_heartbeat.go @@ -58,6 +58,7 @@ func (m *heartbeatManager) registerWorker( defer m.workersMutex.Unlock() hw, ok := m.workers[namespace] + // If this is the first worker on the namespace, start a new shared namespace worker. if !ok { hw = &sharedNamespaceWorker{ client: m.client, diff --git a/internal/internal_worker_heartbeat_metrics.go b/internal/internal_worker_heartbeat_metrics.go index 1804b456a..fdf11c4e8 100644 --- a/internal/internal_worker_heartbeat_metrics.go +++ b/internal/internal_worker_heartbeat_metrics.go @@ -13,21 +13,21 @@ import ( // Metrics we capture for heartbeat reporting. var ( - capturedCounters = map[string]bool{ - metrics.StickyCacheHit: true, - metrics.StickyCacheMiss: true, - metrics.WorkflowTaskExecutionFailureCounter: true, - metrics.ActivityExecutionFailedCounter: true, - metrics.LocalActivityExecutionFailedCounter: true, - metrics.NexusTaskExecutionFailedCounter: true, + capturedCounters = map[string]struct{}{ + metrics.StickyCacheHit: {}, + metrics.StickyCacheMiss: {}, + metrics.WorkflowTaskExecutionFailureCounter: {}, + metrics.ActivityExecutionFailedCounter: {}, + metrics.LocalActivityExecutionFailedCounter: {}, + metrics.NexusTaskExecutionFailedCounter: {}, } // Timer recordings are counted (not their latencies) to track tasks processed. 
- capturedTimers = map[string]bool{ - metrics.WorkflowTaskExecutionLatency: true, - metrics.ActivityExecutionLatency: true, - metrics.LocalActivityExecutionLatency: true, - metrics.NexusTaskExecutionLatency: true, + capturedTimers = map[string]struct{}{ + metrics.WorkflowTaskExecutionLatency: {}, + metrics.ActivityExecutionLatency: {}, + metrics.LocalActivityExecutionLatency: {}, + metrics.NexusTaskExecutionLatency: {}, } ) @@ -75,7 +75,7 @@ func (h *heartbeatMetricsHandler) WithTags(tags map[string]string) metrics.Handl func (h *heartbeatMetricsHandler) Counter(name string) metrics.Counter { underlying := h.underlying.Counter(name) - if capturedCounters[name] { + if _, ok := capturedCounters[name]; ok { return &capturingCounter{ underlying: underlying, value: h.getOrCreate(name), @@ -114,7 +114,7 @@ func (h *heartbeatMetricsHandler) Gauge(name string) metrics.Gauge { func (h *heartbeatMetricsHandler) Timer(name string) metrics.Timer { underlying := h.underlying.Timer(name) - if capturedTimers[name] { + if _, ok := capturedTimers[name]; ok { return &capturingTimer{ underlying: underlying, counter: h.getOrCreate(name), From 8d7aa2e6df89181f2fba9f5811d8ca9d9b99b1bd Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 12 Feb 2026 13:10:41 -0800 Subject: [PATCH 27/30] Make SHUTTING_DOWN status atomic, plumb workerInstanceKeys to workflow/activity poll requests, make WorkerHeartbeatInterval not a pointer --- internal/client.go | 16 ++++++++-------- internal/cmd/build/main.go | 2 -- internal/internal_task_pollers.go | 6 ++++++ internal/internal_worker.go | 17 ++++++++++++++--- test/integration_test.go | 3 +-- test/test_utils_test.go | 3 +-- test/worker_heartbeat_test.go | 10 +++------- 7 files changed, 33 insertions(+), 24 deletions(-) diff --git a/internal/client.go b/internal/client.go index c97c83bb4..430c77859 100644 --- a/internal/client.go +++ b/internal/client.go @@ -604,12 +604,12 @@ type ( Plugins []ClientPlugin // WorkerHeartbeatInterval is the interval at 
which the worker will send heartbeats to the server. - // Interval must be between 1s and 60s, inclusive. + // Interval must be between 1s and 60s, inclusive, or a negative value to disable. // - // default: 60s. To disable, set to 0. + // default: 0 defaults to 60s interval. // // NOTE: Experimental - WorkerHeartbeatInterval *time.Duration + WorkerHeartbeatInterval time.Duration } // HeadersProvider returns a map of gRPC headers that should be used on every request. @@ -1222,15 +1222,15 @@ func NewServiceClient(workflowServiceClient workflowservice.WorkflowServiceClien } var heartbeatInterval time.Duration - if options.WorkerHeartbeatInterval == nil { - heartbeatInterval = time.Second * 60 - } else if *options.WorkerHeartbeatInterval == 0 { + if options.WorkerHeartbeatInterval < 0 { heartbeatInterval = 0 + } else if options.WorkerHeartbeatInterval == 0 { + heartbeatInterval = 60 * time.Second } else { - if *options.WorkerHeartbeatInterval < time.Second || *options.WorkerHeartbeatInterval > 60*time.Second { + if options.WorkerHeartbeatInterval < time.Second || options.WorkerHeartbeatInterval > 60*time.Second { panic("WorkerHeartbeatInterval must be between 1 second and 60 seconds") } - heartbeatInterval = *options.WorkerHeartbeatInterval + heartbeatInterval = options.WorkerHeartbeatInterval } client := &WorkflowClient{ diff --git a/internal/cmd/build/main.go b/internal/cmd/build/main.go index 6c0de7573..6804caffb 100644 --- a/internal/cmd/build/main.go +++ b/internal/cmd/build/main.go @@ -160,8 +160,6 @@ func (b *builder) integrationTest() error { "--dynamic-config-value", "history.enableTransitionHistory=true", "--dynamic-config-value", `component.nexusoperations.useSystemCallbackURL=false`, "--dynamic-config-value", `component.nexusoperations.callback.endpoint.template="http://localhost:7243/namespaces/{{.NamespaceName}}/nexus/callback"`, - "--dynamic-config-value", "frontend.WorkerHeartbeatsEnabled=true", - "--dynamic-config-value", 
"frontend.ListWorkersEnabled=true", }, }) if err != nil { diff --git a/internal/internal_task_pollers.go b/internal/internal_task_pollers.go index cf72e8203..c5e6dff2b 100644 --- a/internal/internal_task_pollers.go +++ b/internal/internal_task_pollers.go @@ -81,6 +81,8 @@ type ( capabilities *workflowservice.GetSystemInfoResponse_Capabilities // tracks timestamp for last poll request, for worker heartbeating pollTimeTracker *pollTimeTracker + // Unique identifier for worker + workerInstanceKey string } // numPollerMetric tracks the number of active pollers and publishes a metric on it. @@ -325,6 +327,7 @@ func newWorkflowTaskProcessor( workerDeploymentVersion: params.DeploymentOptions.Version, capabilities: params.capabilities, pollTimeTracker: params.pollTimeTracker, + workerInstanceKey: params.workerInstanceKey, }, service: service, namespace: params.Namespace, @@ -932,6 +935,7 @@ func (wtp *workflowTaskPoller) getNextPollRequest() (request *workflowservice.Po wtp.useBuildIDVersioning, wtp.workerDeploymentVersion, ), + WorkerInstanceKey: wtp.workerInstanceKey, } if wtp.getCapabilities().BuildIdBasedVersioning { //lint:ignore SA1019 ignore deprecated versioning APIs @@ -1129,6 +1133,7 @@ func newActivityTaskPoller(taskHandler ActivityTaskHandler, service workflowserv workerDeploymentVersion: params.DeploymentOptions.Version, capabilities: params.capabilities, pollTimeTracker: params.pollTimeTracker, + workerInstanceKey: params.workerInstanceKey, }, taskHandler: taskHandler, service: service, @@ -1168,6 +1173,7 @@ func (atp *activityTaskPoller) poll(ctx context.Context) (taskForWorker, error) atp.useBuildIDVersioning, atp.workerDeploymentVersion, ), + WorkerInstanceKey: atp.workerInstanceKey, } response, err := atp.pollActivityTaskQueue(ctx, request) diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 91c943047..17e88e644 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -216,6 +216,8 @@ type ( capabilities 
*workflowservice.GetSystemInfoResponse_Capabilities pollTimeTracker *pollTimeTracker + + workerInstanceKey string } // HistoryJSONOptions are options for HistoryFromJSON. @@ -1158,6 +1160,7 @@ type AggregatedWorker struct { registry *registry // Stores a boolean indicating whether the worker has already been started. started atomic.Bool + shuttingDown atomic.Bool stopC chan struct{} fatalErr error fatalErrLock sync.Mutex @@ -1499,6 +1502,8 @@ func (aw *AggregatedWorker) unregisterHeartbeatWorker() { // // NOTE: errors are logged but don't fail the shutdown. func (aw *AggregatedWorker) shutdownWorker() { + aw.shuttingDown.Store(true) + ctx := context.Background() grpcCtx, cancel := newGRPCContext(ctx, grpcMetricsHandler(aw.executionParams.MetricsHandler)) defer cancel() @@ -1506,7 +1511,6 @@ func (aw *AggregatedWorker) shutdownWorker() { var heartbeat *workerpb.WorkerHeartbeat if aw.heartbeatCallback != nil { heartbeat = aw.heartbeatCallback() - heartbeat.Status = enumspb.WORKER_STATUS_SHUTTING_DOWN } var stickyTaskQueue string @@ -2129,7 +2133,9 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke taskQueue: taskQueue, maxConcurrent: options.MaxConcurrentEagerActivityExecutionSize, }), - capabilities: &capabilities, + capabilities: &capabilities, + pollTimeTracker: &pollTimeTracker{}, + workerInstanceKey: workerInstanceKey, } if options.MaxConcurrentWorkflowTaskPollers != 0 { @@ -2282,6 +2288,11 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke elapsedSinceLastHeartbeat := heartbeatTime.Sub(previousHeartbeatTime) previousHeartbeatTime = heartbeatTime + status := enumspb.WORKER_STATUS_RUNNING + if aw.shuttingDown.Load() { + status = enumspb.WORKER_STATUS_SHUTTING_DOWN + } + hb := &workerpb.WorkerHeartbeat{ WorkerInstanceKey: aw.workerInstanceKey, WorkerIdentity: aw.client.identity, @@ -2296,7 +2307,7 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke DeploymentVersion: 
deploymentVersion, SdkName: SDKName, SdkVersion: SDKVersion, - Status: enumspb.WORKER_STATUS_RUNNING, + Status: status, StartTime: startTime, HeartbeatTime: timestamppb.New(heartbeatTime), ElapsedSinceLastHeartbeat: durationpb.New(elapsedSinceLastHeartbeat), diff --git a/test/integration_test.go b/test/integration_test.go index ea26b3a68..763e41b0c 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -169,7 +169,6 @@ func (ts *IntegrationTestSuite) SetupTest() { var err error trafficController := test.NewSimpleTrafficController() - disableHeartbeat := time.Duration(0) ts.client, err = client.Dial(client.Options{ HostPort: ts.config.ServiceAddr, Namespace: ts.config.Namespace, @@ -182,7 +181,7 @@ func (ts *IntegrationTestSuite) SetupTest() { TrafficController: trafficController, Interceptors: clientInterceptors, ConnectionOptions: client.ConnectionOptions{TLS: ts.config.TLS}, - WorkerHeartbeatInterval: &disableHeartbeat, + WorkerHeartbeatInterval: -1, }) ts.NoError(err) diff --git a/test/test_utils_test.go b/test/test_utils_test.go index 88e8040ef..ece9100e1 100644 --- a/test/test_utils_test.go +++ b/test/test_utils_test.go @@ -230,7 +230,6 @@ func (ts *ConfigAndClientSuiteBase) InitClient() error { } func (ts *ConfigAndClientSuiteBase) newClient() (client.Client, error) { - disableHeartbeat := time.Duration(0) return client.Dial(client.Options{ HostPort: ts.config.ServiceAddr, Namespace: ts.config.Namespace, @@ -239,7 +238,7 @@ func (ts *ConfigAndClientSuiteBase) newClient() (client.Client, error) { TLS: ts.config.TLS, GetSystemInfoTimeout: ctxTimeout, }, - WorkerHeartbeatInterval: &disableHeartbeat, + WorkerHeartbeatInterval: -1, }) } diff --git a/test/worker_heartbeat_test.go b/test/worker_heartbeat_test.go index 1da461cc9..2ace6e39d 100644 --- a/test/worker_heartbeat_test.go +++ b/test/worker_heartbeat_test.go @@ -48,14 +48,12 @@ func (ts *WorkerHeartbeatTestSuite) TearDownSuite() { func (ts *WorkerHeartbeatTestSuite) SetupTest() { var err 
error - heartbeatInterval := 1 * time.Second - // Create a client with heartbeating enabled ts.client, err = client.Dial(client.Options{ HostPort: ts.config.ServiceAddr, Namespace: ts.config.Namespace, Logger: ilog.NewDefaultLogger(), - WorkerHeartbeatInterval: &heartbeatInterval, + WorkerHeartbeatInterval: 1 * time.Second, ConnectionOptions: client.ConnectionOptions{TLS: ts.config.TLS}, Identity: "WorkerHeartbeatTest", }) @@ -306,12 +304,11 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatDisabled() { ctx := context.Background() // Create a separate client with heartbeating disabled - heartbeatInterval := time.Duration(0) clientNoHeartbeat, err := client.Dial(client.Options{ HostPort: ts.config.ServiceAddr, Namespace: ts.config.Namespace, Logger: ilog.NewDefaultLogger(), - WorkerHeartbeatInterval: &heartbeatInterval, + WorkerHeartbeatInterval: -1, ConnectionOptions: client.ConnectionOptions{TLS: ts.config.TLS}, }) ts.NoError(err) @@ -956,12 +953,11 @@ func (ts *WorkerHeartbeatTestSuite) TestWorkerHeartbeatPlugins() { ts.NoError(err) // Create a new client with the plugin - heartbeatInterval := 1 * time.Second pluginClient, err := client.Dial(client.Options{ HostPort: ts.config.ServiceAddr, Namespace: ts.config.Namespace, Logger: ilog.NewDefaultLogger(), - WorkerHeartbeatInterval: &heartbeatInterval, + WorkerHeartbeatInterval: 1 * time.Second, ConnectionOptions: client.ConnectionOptions{TLS: ts.config.TLS}, Identity: "PluginTest", Plugins: []client.Plugin{clientPlugin}, From 41d6afde5e5b2138982b186c6c75ad2e68d98797 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Fri, 13 Feb 2026 11:02:25 -0800 Subject: [PATCH 28/30] bring back listworkers dynamic config, fix identity in heartbeat --- internal/cmd/build/main.go | 1 + internal/internal_worker.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/cmd/build/main.go b/internal/cmd/build/main.go index 6804caffb..80376e1eb 100644 --- a/internal/cmd/build/main.go +++ 
b/internal/cmd/build/main.go @@ -160,6 +160,7 @@ func (b *builder) integrationTest() error { "--dynamic-config-value", "history.enableTransitionHistory=true", "--dynamic-config-value", `component.nexusoperations.useSystemCallbackURL=false`, "--dynamic-config-value", `component.nexusoperations.callback.endpoint.template="http://localhost:7243/namespaces/{{.NamespaceName}}/nexus/callback"`, + "--dynamic-config-value", "frontend.ListWorkersEnabled=true", }, }) if err != nil { diff --git a/internal/internal_worker.go b/internal/internal_worker.go index 0c81a4c6f..f21c63a66 100644 --- a/internal/internal_worker.go +++ b/internal/internal_worker.go @@ -2296,7 +2296,7 @@ func NewAggregatedWorker(client *WorkflowClient, taskQueue string, options Worke hb := &workerpb.WorkerHeartbeat{ WorkerInstanceKey: aw.workerInstanceKey, - WorkerIdentity: aw.client.identity, + WorkerIdentity: aw.executionParams.Identity, HostInfo: &workerpb.WorkerHostInfo{ HostName: hostname, WorkerGroupingKey: aw.client.workerGroupingKey, From e297ae16a72e7c5c44deb4036da097542ea0c057 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Fri, 13 Feb 2026 11:36:13 -0800 Subject: [PATCH 29/30] Add dynamic config for listWorkers for docker test --- .github/workflows/docker/dynamic-config-custom.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker/dynamic-config-custom.yaml b/.github/workflows/docker/dynamic-config-custom.yaml index 49dddbb77..8da54a9ee 100644 --- a/.github/workflows/docker/dynamic-config-custom.yaml +++ b/.github/workflows/docker/dynamic-config-custom.yaml @@ -49,4 +49,6 @@ history.enableChasm: history.enableTransitionHistory: - value: true component.nexusoperations.useSystemCallbackURL: - - value: false \ No newline at end of file + - value: false +frontend.ListWorkersEnabled: + - value: true From 1cb7c909c67792b545a9dab335417299405c3082 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Fri, 13 Feb 2026 12:15:49 -0800 Subject: [PATCH 30/30] server 
v1.29.1 still requires dynamic config for heartbeating --- .github/workflows/docker/dynamic-config-custom.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docker/dynamic-config-custom.yaml b/.github/workflows/docker/dynamic-config-custom.yaml index 8da54a9ee..acaddf3c5 100644 --- a/.github/workflows/docker/dynamic-config-custom.yaml +++ b/.github/workflows/docker/dynamic-config-custom.yaml @@ -50,5 +50,7 @@ history.enableTransitionHistory: - value: true component.nexusoperations.useSystemCallbackURL: - value: false +frontend.WorkerHeartbeatsEnabled: + - value: true frontend.ListWorkersEnabled: - value: true