diff --git a/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml b/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml index b3284a29..8fbc8022 100644 --- a/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml +++ b/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml @@ -38,6 +38,8 @@ spec: value: "{{ .Values.environment.resourceReservationNamespace }}" - name: PROMETHEUS_URL value: "{{ .Values.prometheus.url }}" + - name: DISABLE_NODE_LABELING + value: "{{ .Values.statusUpdater.disableNodeLabeling }}" restartPolicy: Always serviceAccountName: status-updater imagePullSecrets: diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml index 10320900..9226dc24 100644 --- a/deploy/fake-gpu-operator/values.yaml +++ b/deploy/fake-gpu-operator/values.yaml @@ -22,6 +22,7 @@ devicePlugin: statusUpdater: enabled: true + disableNodeLabeling: false image: pullPolicy: Always repository: ghcr.io/run-ai/fake-gpu-operator/status-updater diff --git a/internal/common/constants/constants.go b/internal/common/constants/constants.go index 341b896b..b011024b 100644 --- a/internal/common/constants/constants.go +++ b/internal/common/constants/constants.go @@ -26,4 +26,5 @@ const ( EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE" EnvResourceReservationNamespace = "RESOURCE_RESERVATION_NAMESPACE" EnvPrometheusURL = "PROMETHEUS_URL" + EnvDisableNodeLabeling = "DISABLE_NODE_LABELING" ) diff --git a/internal/status-updater/app.go b/internal/status-updater/app.go index e6d4fd9a..91496d27 100644 --- a/internal/status-updater/app.go +++ b/internal/status-updater/app.go @@ -30,6 +30,7 @@ type StatusUpdaterAppConfiguration struct { TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validate:"required"` TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validate:"required"` PrometheusURL string `mapstructure:"PROMETHEUS_URL"` + DisableNodeLabeling bool `mapstructure:"DISABLE_NODE_LABELING"` } type StatusUpdaterApp struct { @@ -67,8 +68,10 @@ func (app *StatusUpdaterApp) Init(stopCh chan struct{}) { app.kubeClient = KubeClientFn(clusterConfig) dynamicClient := DynamicClientFn(clusterConfig) + disableNodeLabeling := viper.GetBool(constants.EnvDisableNodeLabeling) + app.Controllers = append(app.Controllers, podcontroller.NewPodController(app.kubeClient, dynamicClient, app.wg)) - app.Controllers = append(app.Controllers, nodecontroller.NewNodeController(app.kubeClient, app.wg)) + app.Controllers = append(app.Controllers, nodecontroller.NewNodeController(app.kubeClient, app.wg, disableNodeLabeling)) } func (app *StatusUpdaterApp) Name() string { diff --git a/internal/status-updater/controllers/node/controller.go b/internal/status-updater/controllers/node/controller.go index bbb5c5d1..35e049cb 100644 --- a/internal/status-updater/controllers/node/controller.go +++ b/internal/status-updater/controllers/node/controller.go @@ -36,7 +36,7 @@ type NodeController struct { var _ controllers.Interface = &NodeController{} -func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *NodeController { +func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup, disableNodeLabeling bool) *NodeController { clusterTopology, err := topology.GetClusterTopologyFromCM(kubeClient) if err != nil { log.Fatalf("Failed to get cluster topology: %v", err) @@ -45,7 +45,7 @@ func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *Nod c := &NodeController{ kubeClient: kubeClient, informer: informers.NewSharedInformerFactory(kubeClient, 0).Core().V1().Nodes().Informer(), - handler: nodehandler.NewNodeHandler(kubeClient, clusterTopology), + handler: nodehandler.NewNodeHandler(kubeClient, clusterTopology, disableNodeLabeling), clusterTopology: clusterTopology, } diff --git a/internal/status-updater/handlers/node/handler.go b/internal/status-updater/handlers/node/handler.go index 74cd8987..b15842b6 100644 --- a/internal/status-updater/handlers/node/handler.go +++ b/internal/status-updater/handlers/node/handler.go @@ -19,14 +19,16 @@ type NodeHandler struct { kubeClient kubernetes.Interface clusterTopology *topology.ClusterTopology + disableLabeling bool } var _ Interface = &NodeHandler{} -func NewNodeHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology) *NodeHandler { +func NewNodeHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology, disableLabeling bool) *NodeHandler { return &NodeHandler{ kubeClient: kubeClient, clusterTopology: clusterTopology, + disableLabeling: disableLabeling, } } @@ -38,6 +40,11 @@ func (p *NodeHandler) HandleAdd(node *v1.Node) error { return fmt.Errorf("failed to create node topology ConfigMap: %w", err) } + if p.disableLabeling { + log.Printf("Skipping node labeling for %s (disabled via config)\n", node.Name) + return nil + } + err = p.labelNode(node) if err != nil { return fmt.Errorf("failed to label node: %w", err) @@ -54,6 +61,11 @@ func (p *NodeHandler) HandleDelete(node *v1.Node) error { return fmt.Errorf("failed to delete node topology: %w", err) } + if p.disableLabeling { + log.Printf("Skipping node unlabeling for %s (disabled via config)\n", node.Name) + return nil + } + err = p.unlabelNode(node) if err != nil { return fmt.Errorf("failed to unlabel node: %w", err)