From cfcf533380f14dd100f7245b5612604ac6a91a37 Mon Sep 17 00:00:00 2001
From: Eliran Wolff <eliranw@nvidia.com>
Date: Sun, 22 Feb 2026 17:36:40 +0200
Subject: [PATCH 1/3] fix: KWOK node GPU metrics not visible in RunAI UI

The centralized KWOK status-exporter produces DCGM metrics for virtual
GPU nodes, but RunAI's recording rules derive the node label by joining
with kube_pod_info on the exporter pod IP. Since the centralized pod
runs on a system node, KWOK GPU metrics were attributed to the wrong
node and never appeared in the UI.

Changes:
- Set Hostname label to the actual KWOK node name in multi_node_exporter
  so metrics can be correlated to the correct virtual node
- Add a dedicated Service for the KWOK exporter, and add a component
  selector to the DaemonSet Service so it no longer also matches KWOK pods
- Add component label to DaemonSet pod template to distinguish from KWOK
- Create a PrometheusRule (deployed to runai namespace) that produces
  runai_dcgm_gpu_utilization, runai_dcgm_gpu_used_mebibytes, and
  runai_dcgm_gpu_total_mebibytes recording rules using Hostname as the
  node label, bypassing the kube_pod_info join

Fixes: RUN-36987
---
 .../templates/status-exporter/_helpers.tpl    |  1 +
 .../status-exporter/kwok-prometheusrule.yaml  | 67 +++++++++++++++++++
 .../status-exporter/kwok-service.yaml         | 21 ++++++
 .../templates/status-exporter/service.yaml    |  1 +
 .../export/metrics/multi_node_exporter.go     |  7 ++
 5 files changed, 97 insertions(+)
 create mode 100644 deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml
 create mode 100644 deploy/fake-gpu-operator/templates/status-exporter/kwok-service.yaml

diff --git a/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl b/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
index ccd30b3..566fa2c 100644
--- a/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
+++ b/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
@@ -16,6 +16,7 @@ matchLabels:
 {{- define "fake-gpu-operator.status-exporter.common.podTemplate.metadata" -}}
 labels:
   app: nvidia-dcgm-exporter
+  component: status-exporter
   app.kubernetes.io/name: nvidia-container-toolkit
 annotations:
   checksum/hostpath-init-configmap: {{ include (print $.Template.BasePath "/status-exporter/hostpath-init-configmap.yaml") . | sha256sum }}
diff --git a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml
new file mode 100644
index 0000000..eb2c14b
--- /dev/null
+++ b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml
@@ -0,0 +1,67 @@
+{{- if .Values.statusExporter.enabled -}}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: fake-gpu-operator-kwok-dcgm
+  # Quoted so a namespace such as "123" or "on" stays a YAML string.
+  namespace: {{ .Values.prometheus.namespace | default "runai" | quote }}
+  labels:
+    app: nvidia-dcgm-exporter
+    component: status-exporter-kwok
+spec:
+  groups:
+    - name: kwok-dcgm-metrics
+      rules:
+        # For KWOK nodes the centralized exporter sets Hostname = node name.
+        # The standard RunAI dcgmMetricsRule joins on kube_pod_info to derive
+        # the node, which fails for the centralized pod. Each rule below copies
+        # Hostname into node (and exported_pod/exported_namespace into
+        # pod_name/pod_namespace), then joins nodepool in by node.
+        # Regular (non-KWOK) DCGM metrics carry a hashed Hostname that matches
+        # no node in runai_node_nodepool_excluded, so the join drops them.
+
+        - record: runai_dcgm_gpu_utilization
+          expr: |
+            label_replace(
+              label_replace(
+                label_replace(
+                  DCGM_FI_DEV_GPU_UTIL,
+                  "pod_name", "$1", "exported_pod", "(.+)"
+                ),
+                "pod_namespace", "$1", "exported_namespace", "(.+)"
+              ),
+              "node", "$1", "Hostname", "(.+)"
+            )
+            * on(node) group_left(nodepool)
+            runai_node_nodepool_excluded
+
+        - record: runai_dcgm_gpu_used_mebibytes
+          expr: |
+            label_replace(
+              label_replace(
+                label_replace(
+                  DCGM_FI_DEV_FB_USED,
+                  "pod_name", "$1", "exported_pod", "(.+)"
+                ),
+                "pod_namespace", "$1", "exported_namespace", "(.+)"
+              ),
+              "node", "$1", "Hostname", "(.+)"
+            )
+            * on(node) group_left(nodepool)
+            runai_node_nodepool_excluded
+
+        - record: runai_dcgm_gpu_total_mebibytes
+          expr: |
+            label_replace(
+              label_replace(
+                label_replace(
+                  (DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE),
+                  "pod_name", "$1", "exported_pod", "(.+)"
+                ),
+                "pod_namespace", "$1", "exported_namespace", "(.+)"
+              ),
+              "node", "$1", "Hostname", "(.+)"
+            )
+            * on(node) group_left(nodepool)
+            runai_node_nodepool_excluded
+{{- end -}}
diff --git a/deploy/fake-gpu-operator/templates/status-exporter/kwok-service.yaml b/deploy/fake-gpu-operator/templates/status-exporter/kwok-service.yaml
new file mode 100644
index 0000000..89fd39f
--- /dev/null
+++ b/deploy/fake-gpu-operator/templates/status-exporter/kwok-service.yaml
@@ -0,0 +1,21 @@
+{{- if .Values.statusExporter.enabled -}}
+apiVersion: v1
+kind: Service
+metadata:
+  name: nvidia-dcgm-exporter-kwok
+  labels:
+    app: nvidia-dcgm-exporter
+    component: status-exporter-kwok
+  annotations:
+    prometheus.io/scrape: "true"
+spec:
+  type: ClusterIP
+  selector:  # a pod must carry both labels below to be selected
+    app: nvidia-dcgm-exporter
+    component: status-exporter-kwok
+  ports:
+    - name: gpu-metrics  # TCP 9400 on both the Service and the pod
+      protocol: TCP
+      port: 9400
+      targetPort: 9400
+{{- end -}}
diff --git a/deploy/fake-gpu-operator/templates/status-exporter/service.yaml b/deploy/fake-gpu-operator/templates/status-exporter/service.yaml
index 59ceaab..7a63c8b 100644
--- a/deploy/fake-gpu-operator/templates/status-exporter/service.yaml
+++ b/deploy/fake-gpu-operator/templates/status-exporter/service.yaml
@@ -15,5 +15,6 @@ spec:
       targetPort: 9400
   selector:
     app: nvidia-dcgm-exporter
+    component: status-exporter
   type: ClusterIP
 {{- end -}}
diff --git a/internal/status-exporter/export/metrics/multi_node_exporter.go b/internal/status-exporter/export/metrics/multi_node_exporter.go
index a34cd62..b6888a3 100644
--- a/internal/status-exporter/export/metrics/multi_node_exporter.go
+++ b/internal/status-exporter/export/metrics/multi_node_exporter.go
@@ -96,6 +96,12 @@ func (e *MultiNodeMetricsExporter) exportNode(nodeName string, nodeTopology *top
 		log.Printf("Exporting metrics for KWOK node %s, gpu %s\n", nodeName, gpu.ID)
 		labels := buildGpuMetricLabels(nodeName, gpuIdx, &gpu, nodeTopology)
 
+		// Override Hostname with the node name for KWOK nodes so that the
+		// metrics-exporter can match metrics to the correct virtual node.
+		// The default generateFakeHostname produces a hash that cannot be
+		// correlated back to the KWOK node name.
+		labels["Hostname"] = nodeName
+
 		utilization := gpu.Status.PodGpuUsageStatus.Utilization()
 		fbUsed := gpu.Status.PodGpuUsageStatus.FbUsed(nodeTopology.GpuMemory)
 
@@ -111,6 +117,7 @@ func (e *MultiNodeMetricsExporter) exportNode(nodeName string, nodeTopology *top
 func (e *MultiNodeMetricsExporter) deleteNodeMetrics(nodeName string, nodeTopology *topology.NodeTopology) {
 	for gpuIdx, gpu := range nodeTopology.Gpus {
 		labels := buildGpuMetricLabels(nodeName, gpuIdx, &gpu, nodeTopology)
+		labels["Hostname"] = nodeName
 
 		// Delete the metric series for this GPU
 		gpuUtilization.Delete(labels)

From 836717568faffa344328b306d3f7921f109c7427 Mon Sep 17 00:00:00 2001
From: Eliran Wolff <eliranw@nvidia.com>
Date: Mon, 23 Feb 2026 14:38:13 +0200
Subject: [PATCH 2/3] feat: add toggle to disable KWOK PrometheusRule

Add statusExporter.kwok.prometheusRule.enabled (default true) to allow
disabling the KWOK DCGM recording rules without affecting the rest of
the status-exporter.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../templates/status-exporter/kwok-prometheusrule.yaml          | 2 +-
 deploy/fake-gpu-operator/values.yaml                            | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml
index eb2c14b..efb4128 100644
--- a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml
+++ b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.statusExporter.enabled -}}
+{{- if and .Values.statusExporter.enabled (ne (.Values.statusExporter.kwok.prometheusRule.enabled | toString) "false") -}}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml
index b36887b..8ab21d6 100644
--- a/deploy/fake-gpu-operator/values.yaml
+++ b/deploy/fake-gpu-operator/values.yaml
@@ -65,6 +65,8 @@ statusExporter:
   topologyMaxExportInterval: 10s
   # If using many KWOK nodes, you may need to increase the resources for the KWOK status-exporter
   kwok:
+    prometheusRule:
+      enabled: true
     resources:
       requests:
         cpu: "50m"

From 3b0fe19f0a74cec3e37cc9e67ff191d3bc27a29a Mon Sep 17 00:00:00 2001
From: Eliran Wolff <eliranw@nvidia.com>
Date: Mon, 23 Feb 2026 15:31:43 +0200
Subject: [PATCH 3/3] fix: skip PrometheusRule when CRD is not available

Add .Capabilities.APIVersions check so the KWOK PrometheusRule is only
rendered on clusters that have the Prometheus Operator installed. Fixes
integration test failures on clusters without the CRD.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../templates/status-exporter/kwok-prometheusrule.yaml          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml
index efb4128..2dd62ed 100644
--- a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml
+++ b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml
@@ -1,4 +1,4 @@
-{{- if and .Values.statusExporter.enabled (ne (.Values.statusExporter.kwok.prometheusRule.enabled | toString) "false") -}}
+{{- if and .Values.statusExporter.enabled (ne (.Values.statusExporter.kwok.prometheusRule.enabled | toString) "false") (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") -}}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata: