From 2e4cddc146be88b149ea9f99f3ad9a8f39688618 Mon Sep 17 00:00:00 2001 From: luccabb Date: Thu, 12 Mar 2026 13:31:19 -0700 Subject: [PATCH] go module updates (#95) Summary: - Renames shelper module path from github.com/fairinternal/fair-cluster-monitoring/shelper to github.com/facebookresearch/gcm/shelper to match the public repo - Updates slurmprocessor to reference the renamed shelper module, fixes package declaration from main to slurmprocessor, and removes the internal attributesprocessor dependency (used an internal package that external modules can't import) - Both modules are now published and fetchable via go get github.com/facebookresearch/gcm/shelper@v0.0.1 and go get github.com/facebookresearch/gcm/slurmprocessor@v0.0.1 - Updates Helm chart values and Kubernetes deployment docs Pull Request resolved: https://github.com/facebookresearch/gcm/pull/95 Test Plan: - go get github.com/facebookresearch/gcm/shelper@v0.0.1 resolves successfully from a fresh module - go get github.com/facebookresearch/gcm/slurmprocessor@v0.0.1 resolves successfully from a fresh module - Helm chart deploys correctly with updated values Reviewed By: gunchu Differential Revision: D96259734 Pulled By: luccabb --- .gitignore | 2 ++ charts/gcm/README.md | 22 ++++++++++------- charts/gcm/values.yaml | 4 ++-- shelper/go.mod | 2 +- slurmprocessor/common.go | 4 ++-- slurmprocessor/config.go | 2 +- slurmprocessor/config_test.go | 2 +- slurmprocessor/factory.go | 11 ++++----- slurmprocessor/go.mod | 5 ++-- slurmprocessor/go.sum | 6 ++--- slurmprocessor/logs.go | 4 ++-- slurmprocessor/metrics.go | 4 ++-- slurmprocessor/traces.go | 4 ++-- .../kubernetes_deployment.md | 12 ++++++---- .../GCM_Monitoring/kubernetes_deployment.md | 24 ++++++++++++++----- 15 files changed, 64 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index d3ae9cd..e9cc3c8 100644 --- a/.gitignore +++ b/.gitignore @@ -282,3 +282,5 @@ go.work.sum # Meta-internal CI skycastle/ scrut/ + +.claude/ diff --git 
a/charts/gcm/README.md b/charts/gcm/README.md index a32aed3..845f5f7 100644 --- a/charts/gcm/README.md +++ b/charts/gcm/README.md @@ -23,22 +23,18 @@ The chart is published to GHCR as an OCI artifact and versioned alongside GCM re ```shell helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ - --set healthChecks.cluster=my-cluster \ - --set healthChecks.sink=otel \ - --set monitoring.sink=otel \ - --set monitoring.cluster=my-cluster + -f /custom-values.yaml \ + --namespace ``` **DCGM 3** (for older NVIDIA drivers R535/R525): ```shell helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ + -f /custom-values.yaml \ + --namespace \ --set monitoring.image.tag=dcgm3 \ --set healthChecks.image.tag=dcgm3 \ - --set healthChecks.cluster=my-cluster \ - --set healthChecks.sink=otel \ - --set monitoring.sink=otel \ - --set monitoring.cluster=my-cluster ``` To pin a specific chart version, add `--version X.Y.Z`. @@ -48,11 +44,15 @@ Health checks and monitoring are independent — you can deploy either or both: ```shell # Health checks only helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ + -f /custom-values.yaml \ + --namespace \ --set monitoring.enabled=false \ --set healthChecks.cluster=my-cluster # Monitoring only helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ + -f /custom-values.yaml \ + --namespace \ --set healthChecks.enabled=false \ --set monitoring.sink=otel \ --set monitoring.cluster=my-cluster @@ -186,6 +186,8 @@ Sink-specific options can be passed via `sinkOpts` (OmegaConf dot-list syntax). 
```shell # Monitoring: send GPU metrics to an OpenTelemetry collector helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ + -f /custom-values.yaml \ + --namespace \ --set monitoring.sink=otel \ --set monitoring.cluster=my-cluster \ --set monitoring.sinkOpts[0]=otel_endpoint=http://otel-collector:4318 \ @@ -193,6 +195,8 @@ helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ # Health checks: send results to an OpenTelemetry collector helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ + -f /custom-values.yaml \ + --namespace \ --set healthChecks.sink=otel \ --set healthChecks.cluster=my-cluster \ --set healthChecks.sinkOpts[0]=otel_endpoint=http://otel-collector:4318 \ @@ -207,6 +211,8 @@ For clusters that use **labels** instead of taints to identify GPU nodes, use `n ```shell helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ + -f /custom-values.yaml \ + --namespace \ --set monitoring.nodeSelector."nvidia\.com/gpu\.present"=true \ --set healthChecks.nodeSelector."nvidia\.com/gpu\.present"=true ``` diff --git a/charts/gcm/values.yaml b/charts/gcm/values.yaml index 5b8e765..d4266aa 100644 --- a/charts/gcm/values.yaml +++ b/charts/gcm/values.yaml @@ -5,8 +5,8 @@ # Usage: # helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ -# -f charts/gcm/.yaml \ -# --namespace monitoring +# -f /custom-values.yaml \ +# --namespace imagePullSecrets: [] nameOverride: "" diff --git a/shelper/go.mod b/shelper/go.mod index 89f3e00..3c4823b 100644 --- a/shelper/go.mod +++ b/shelper/go.mod @@ -1,4 +1,4 @@ -module github.com/fairinternal/fair-cluster-monitoring/shelper +module github.com/facebookresearch/gcm/shelper go 1.24.3 diff --git a/slurmprocessor/common.go b/slurmprocessor/common.go index f2c5b53..7f8e5d4 100644 --- a/slurmprocessor/common.go +++ b/slurmprocessor/common.go @@ -1,13 +1,13 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. // All rights reserved. 
-package main +package slurmprocessor import ( "context" "log" "strings" - shelper "github.com/fairinternal/fair-cluster-monitoring/shelper" + shelper "github.com/facebookresearch/gcm/shelper" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/consumer" "go.opentelemetry.io/collector/pdata/pcommon" diff --git a/slurmprocessor/config.go b/slurmprocessor/config.go index 985a81e..a8e7ee9 100644 --- a/slurmprocessor/config.go +++ b/slurmprocessor/config.go @@ -1,6 +1,6 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. // All rights reserved. -package main +package slurmprocessor // Config Structure type Config struct { diff --git a/slurmprocessor/config_test.go b/slurmprocessor/config_test.go index 876b05f..bb28bce 100644 --- a/slurmprocessor/config_test.go +++ b/slurmprocessor/config_test.go @@ -1,6 +1,6 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. // All rights reserved. -package main +package slurmprocessor import ( "path" diff --git a/slurmprocessor/factory.go b/slurmprocessor/factory.go index eb5db6d..afb842c 100644 --- a/slurmprocessor/factory.go +++ b/slurmprocessor/factory.go @@ -1,18 +1,17 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. // All rights reserved. 
-package main +package slurmprocessor import ( "context" "log" - "github.com/open-telemetry/opentelemetry-collector-contrib/processor/attributesprocessor/internal/metadata" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/consumer" "go.opentelemetry.io/collector/processor" "go.opentelemetry.io/collector/processor/processorhelper" - shelper "github.com/fairinternal/fair-cluster-monitoring/shelper" + shelper "github.com/facebookresearch/gcm/shelper" ) const ( @@ -25,9 +24,9 @@ func NewFactory() processor.Factory { return processor.NewFactory( component.MustNewType(typeStr), createDefaultConfig, - processor.WithTraces(createTracesProcessor, metadata.TracesStability), - processor.WithMetrics(createMetricsProcessor, metadata.MetricsStability), - processor.WithLogs(createLogsProcessor, metadata.LogsStability), + processor.WithTraces(createTracesProcessor, component.StabilityLevelAlpha), + processor.WithMetrics(createMetricsProcessor, component.StabilityLevelAlpha), + processor.WithLogs(createLogsProcessor, component.StabilityLevelAlpha), ) } diff --git a/slurmprocessor/go.mod b/slurmprocessor/go.mod index 46ab892..1beca3f 100644 --- a/slurmprocessor/go.mod +++ b/slurmprocessor/go.mod @@ -1,9 +1,9 @@ -module github.com/fairinternal/gpu-cluster-monitoring/slurmprocessor +module github.com/facebookresearch/gcm/slurmprocessor go 1.24.3 require ( - github.com/open-telemetry/opentelemetry-collector-contrib/processor/attributesprocessor v0.126.0 + github.com/facebookresearch/gcm/shelper v0.0.1 github.com/stretchr/testify v1.10.0 go.opentelemetry.io/collector/component v1.32.0 go.opentelemetry.io/collector/consumer v1.32.0 @@ -23,7 +23,6 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/ebitengine/purego v0.8.3 // indirect - github.com/fairinternal/fair-cluster-monitoring/shelper v0.0.0-20250620180146-cf5a014efe9f // indirect github.com/go-logr/logr v1.4.2 // indirect 
github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect diff --git a/slurmprocessor/go.sum b/slurmprocessor/go.sum index 7feb8ad..430a147 100644 --- a/slurmprocessor/go.sum +++ b/slurmprocessor/go.sum @@ -12,8 +12,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/ebitengine/purego v0.8.3 h1:K+0AjQp63JEZTEMZiwsI9g0+hAMNohwUOtY0RPGexmc= github.com/ebitengine/purego v0.8.3/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= -github.com/fairinternal/fair-cluster-monitoring/shelper v0.0.0-20250620180146-cf5a014efe9f h1:6O4bbLKO9n8m2myxE5oMk7+RAxGy9UYGDJlB/pWRzQM= -github.com/fairinternal/fair-cluster-monitoring/shelper v0.0.0-20250620180146-cf5a014efe9f/go.mod h1:8iCqKvY1tift2boY06yo0PsgbhPwi/NSmyzNLRRZyMQ= +github.com/facebookresearch/gcm/shelper v0.0.1 h1:QLRBEoZuI6dFX0AUvizC3j0bxrCQrmRiErtsgg1tV98= +github.com/facebookresearch/gcm/shelper v0.0.1/go.mod h1:51syI2aPfL2BYW0zEYjblygcomw9SsMTte7LxgD+b9Q= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/foxboron/go-tpm-keyfiles v0.0.0-20250323135004-b31fac66206e h1:2jjYsGgM13xId2Ku+UGDQTO5It50LhT6lljiVJvBj1Y= @@ -83,8 +83,6 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/open-telemetry/opentelemetry-collector-contrib/processor/attributesprocessor v0.126.0 h1:ezG3TqbSnQG9JcaLMh0cTts/Jvek6mlj/WApOC3wQtE= 
-github.com/open-telemetry/opentelemetry-collector-contrib/processor/attributesprocessor v0.126.0/go.mod h1:xbMS6tl+zIdD26RQXr6VdP2bDuBCBEdV6pC0WgNKiUI= github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/slurmprocessor/logs.go b/slurmprocessor/logs.go index 55fafaa..0729621 100644 --- a/slurmprocessor/logs.go +++ b/slurmprocessor/logs.go @@ -1,12 +1,12 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. // All rights reserved. -package main +package slurmprocessor import ( "context" "log" - shelper "github.com/fairinternal/fair-cluster-monitoring/shelper" + shelper "github.com/facebookresearch/gcm/shelper" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/consumer" "go.opentelemetry.io/collector/pdata/plog" diff --git a/slurmprocessor/metrics.go b/slurmprocessor/metrics.go index c4639a2..e63aae3 100644 --- a/slurmprocessor/metrics.go +++ b/slurmprocessor/metrics.go @@ -1,12 +1,12 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. // All rights reserved. -package main +package slurmprocessor import ( "context" "log" - shelper "github.com/fairinternal/fair-cluster-monitoring/shelper" + shelper "github.com/facebookresearch/gcm/shelper" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/consumer" "go.opentelemetry.io/collector/pdata/pmetric" diff --git a/slurmprocessor/traces.go b/slurmprocessor/traces.go index 45fc13d..a495ac5 100644 --- a/slurmprocessor/traces.go +++ b/slurmprocessor/traces.go @@ -1,12 +1,12 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. // All rights reserved. 
-package main +package slurmprocessor import ( "context" "log" - shelper "github.com/fairinternal/fair-cluster-monitoring/shelper" + shelper "github.com/facebookresearch/gcm/shelper" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/consumer" "go.opentelemetry.io/collector/pdata/ptrace" diff --git a/website/docs/GCM_Health_Checks/kubernetes_deployment.md b/website/docs/GCM_Health_Checks/kubernetes_deployment.md index 235803c..e9e64eb 100644 --- a/website/docs/GCM_Health_Checks/kubernetes_deployment.md +++ b/website/docs/GCM_Health_Checks/kubernetes_deployment.md @@ -30,16 +30,20 @@ The recommended way to deploy on Kubernetes is via the [GCM Helm chart](https:// ```shell helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ - --set healthChecks.cluster=my-cluster \ - --set healthChecks.sink=otel + -f /custom-values.yaml \ + --namespace \ + --set monitoring.enabled=false \ + --set healthChecks.enabled=true ``` Or from source: ```shell helm install gcm charts/gcm \ - --set healthChecks.cluster=my-cluster \ - --set healthChecks.sink=otel + -f /custom-values.yaml \ + --namespace \ + --set monitoring.enabled=false \ + --set healthChecks.enabled=true ``` See the [Helm chart README](https://github.com/facebookresearch/gcm/tree/main/charts/gcm/README.md) for full configuration options. 
diff --git a/website/docs/GCM_Monitoring/kubernetes_deployment.md b/website/docs/GCM_Monitoring/kubernetes_deployment.md index 9b93e6d..b1b659a 100644 --- a/website/docs/GCM_Monitoring/kubernetes_deployment.md +++ b/website/docs/GCM_Monitoring/kubernetes_deployment.md @@ -29,16 +29,20 @@ The recommended way to deploy on Kubernetes is via the [GCM Helm chart](https:// ```shell helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ - --set monitoring.sink=otel \ - --set monitoring.cluster=my-cluster + -f /custom-values.yaml \ + --namespace \ + --set monitoring.enabled=true \ + --set healthChecks.enabled=false ``` Or from source: ```shell helm install gcm charts/gcm \ - --set monitoring.sink=otel \ - --set monitoring.cluster=my-cluster + -f /custom-values.yaml \ + --namespace \ + --set monitoring.enabled=true \ + --set healthChecks.enabled=false ``` See the [Helm chart README](https://github.com/facebookresearch/gcm/tree/main/charts/gcm/README.md) for full configuration options. @@ -57,7 +61,11 @@ See the [Helm chart README](https://github.com/facebookresearch/gcm/tree/main/ch ### Sending Metrics to OpenTelemetry ```shell -helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ +helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ + -f /custom-values.yaml \ + --namespace \ + --set monitoring.enabled=true \ + --set healthChecks.enabled=false \ --set monitoring.sink=otel \ --set monitoring.cluster=my-cluster \ --set monitoring.extraEnv[0].name=OTEL_EXPORTER_OTLP_ENDPOINT \ @@ -67,7 +75,11 @@ helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ Sink-specific options can also be passed via `monitoring.sinkOpts`: ```shell -helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ +helm install gcm oci://ghcr.io/facebookresearch/charts/gcm \ + -f /custom-values.yaml \ + --namespace \ + --set monitoring.enabled=true \ + --set healthChecks.enabled=false \ --set monitoring.sink=otel \ --set monitoring.cluster=my-cluster \ --set 
monitoring.sinkOpts[0]=otel_endpoint=http://otel-collector:4318 \