Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ spec:
value: "{{ .Values.environment.resourceReservationNamespace }}"
- name: PROMETHEUS_URL
value: "{{ .Values.prometheus.url }}"
- name: DISABLE_NODE_LABELING
value: "{{ .Values.statusUpdater.disableNodeLabeling }}"
restartPolicy: Always
serviceAccountName: status-updater
imagePullSecrets:
Expand Down
1 change: 1 addition & 0 deletions deploy/fake-gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ devicePlugin:

statusUpdater:
enabled: true
disableNodeLabeling: false
image:
pullPolicy: Always
repository: ghcr.io/run-ai/fake-gpu-operator/status-updater
Expand Down
1 change: 1 addition & 0 deletions internal/common/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ const (
EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE"
EnvResourceReservationNamespace = "RESOURCE_RESERVATION_NAMESPACE"
EnvPrometheusURL = "PROMETHEUS_URL"
EnvDisableNodeLabeling = "DISABLE_NODE_LABELING"
)
5 changes: 4 additions & 1 deletion internal/status-updater/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ type StatusUpdaterAppConfiguration struct {
TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validate:"required"`
TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validate:"required"`
PrometheusURL string `mapstructure:"PROMETHEUS_URL"`
DisableNodeLabeling bool `mapstructure:"DISABLE_NODE_LABELING"`
}

type StatusUpdaterApp struct {
Expand Down Expand Up @@ -67,8 +68,10 @@ func (app *StatusUpdaterApp) Init(stopCh chan struct{}) {
app.kubeClient = KubeClientFn(clusterConfig)
dynamicClient := DynamicClientFn(clusterConfig)

disableNodeLabeling := viper.GetBool(constants.EnvDisableNodeLabeling)

app.Controllers = append(app.Controllers, podcontroller.NewPodController(app.kubeClient, dynamicClient, app.wg))
app.Controllers = append(app.Controllers, nodecontroller.NewNodeController(app.kubeClient, app.wg))
app.Controllers = append(app.Controllers, nodecontroller.NewNodeController(app.kubeClient, app.wg, disableNodeLabeling))
}

func (app *StatusUpdaterApp) Name() string {
Expand Down
4 changes: 2 additions & 2 deletions internal/status-updater/controllers/node/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ type NodeController struct {

var _ controllers.Interface = &NodeController{}

func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *NodeController {
func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup, disableNodeLabeling bool) *NodeController {
clusterTopology, err := topology.GetClusterTopologyFromCM(kubeClient)
if err != nil {
log.Fatalf("Failed to get cluster topology: %v", err)
Expand All @@ -45,7 +45,7 @@ func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *Nod
c := &NodeController{
kubeClient: kubeClient,
informer: informers.NewSharedInformerFactory(kubeClient, 0).Core().V1().Nodes().Informer(),
handler: nodehandler.NewNodeHandler(kubeClient, clusterTopology),
handler: nodehandler.NewNodeHandler(kubeClient, clusterTopology, disableNodeLabeling),
clusterTopology: clusterTopology,
}

Expand Down
14 changes: 13 additions & 1 deletion internal/status-updater/handlers/node/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@ type NodeHandler struct {
kubeClient kubernetes.Interface

clusterTopology *topology.ClusterTopology
disableLabeling bool
}

var _ Interface = &NodeHandler{}

func NewNodeHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology) *NodeHandler {
func NewNodeHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology, disableLabeling bool) *NodeHandler {
return &NodeHandler{
kubeClient: kubeClient,
clusterTopology: clusterTopology,
disableLabeling: disableLabeling,
}
}

Expand All @@ -38,6 +40,11 @@ func (p *NodeHandler) HandleAdd(node *v1.Node) error {
return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
}

if p.disableLabeling {
log.Printf("Skipping node labeling for %s (disabled via config)\n", node.Name)
return nil
}

err = p.labelNode(node)
if err != nil {
return fmt.Errorf("failed to label node: %w", err)
Expand All @@ -54,6 +61,11 @@ func (p *NodeHandler) HandleDelete(node *v1.Node) error {
return fmt.Errorf("failed to delete node topology: %w", err)
}

if p.disableLabeling {
log.Printf("Skipping node unlabeling for %s (disabled via config)\n", node.Name)
return nil
}

err = p.unlabelNode(node)
if err != nil {
return fmt.Errorf("failed to unlabel node: %w", err)
Expand Down
Loading