From cfcf533380f14dd100f7245b5612604ac6a91a37 Mon Sep 17 00:00:00 2001 From: Eliran Wolff Date: Sun, 22 Feb 2026 17:36:40 +0200 Subject: [PATCH 1/3] fix: KWOK node GPU metrics not visible in RunAI UI The centralized KWOK status-exporter produces DCGM metrics for virtual GPU nodes, but RunAI's recording rules derive the node label by joining with kube_pod_info on the exporter pod IP. Since the centralized pod runs on a system node, KWOK GPU metrics were attributed to the wrong node and never appeared in the UI. Changes: - Set Hostname label to the actual KWOK node name in multi_node_exporter so metrics can be correlated to the correct virtual node - Add a dedicated Service for the KWOK exporter with component selector to prevent the DaemonSet service from also matching KWOK pods - Add component label to DaemonSet pod template to distinguish from KWOK - Create a PrometheusRule (deployed to runai namespace) that produces runai_dcgm_gpu_utilization, runai_dcgm_gpu_used_mebibytes, and runai_dcgm_gpu_total_mebibytes recording rules using Hostname as the node label, bypassing the kube_pod_info join Fixes: RUN-36987 --- .../templates/status-exporter/_helpers.tpl | 1 + .../status-exporter/kwok-prometheusrule.yaml | 67 +++++++++++++++++++ .../status-exporter/kwok-service.yaml | 21 ++++++ .../templates/status-exporter/service.yaml | 1 + .../export/metrics/multi_node_exporter.go | 7 ++ 5 files changed, 97 insertions(+) create mode 100644 deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml create mode 100644 deploy/fake-gpu-operator/templates/status-exporter/kwok-service.yaml diff --git a/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl b/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl index ccd30b3..566fa2c 100644 --- a/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl +++ b/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl @@ -16,6 +16,7 @@ matchLabels: {{- define "fake-gpu-operator.status-exporter.common.podTemplate.metadata" -}} labels: app: nvidia-dcgm-exporter + component: status-exporter app.kubernetes.io/name: nvidia-container-toolkit annotations: checksum/hostpath-init-configmap: {{ include (print $.Template.BasePath "/status-exporter/hostpath-init-configmap.yaml") . | sha256sum }} diff --git a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml new file mode 100644 index 0000000..eb2c14b --- /dev/null +++ b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml @@ -0,0 +1,67 @@ +{{- if .Values.statusExporter.enabled -}} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: fake-gpu-operator-kwok-dcgm + namespace: {{ .Values.prometheus.namespace | default "runai" }} + labels: + app: nvidia-dcgm-exporter + component: status-exporter-kwok +spec: + groups: + - name: kwok-dcgm-metrics + rules: + # For KWOK nodes the centralized exporter sets Hostname = node name. + # The standard RunAI dcgmMetricsRule joins on kube_pod_info to derive + # the node, which fails for the centralized pod. These rules use + # Hostname directly as the node label instead. + # + # Regular (non-KWOK) DCGM metrics have Hostname set to a hash that + # does not match any real node in runai_node_nodepool_excluded, so they + # are naturally filtered out by the join. + + - record: runai_dcgm_gpu_utilization + expr: | + label_replace( + label_replace( + label_replace( + DCGM_FI_DEV_GPU_UTIL, + "pod_name", "$1", "exported_pod", "(.+)" + ), + "pod_namespace", "$1", "exported_namespace", "(.+)" + ), + "node", "$1", "Hostname", "(.+)" + ) + * on(node) group_left(nodepool) + runai_node_nodepool_excluded + + - record: runai_dcgm_gpu_used_mebibytes + expr: | + label_replace( + label_replace( + label_replace( + DCGM_FI_DEV_FB_USED, + "pod_name", "$1", "exported_pod", "(.+)" + ), + "pod_namespace", "$1", "exported_namespace", "(.+)" + ), + "node", "$1", "Hostname", "(.+)" + ) + * on(node) group_left(nodepool) + runai_node_nodepool_excluded + + - record: runai_dcgm_gpu_total_mebibytes + expr: | + label_replace( + label_replace( + label_replace( + (DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE), + "pod_name", "$1", "exported_pod", "(.+)" + ), + "pod_namespace", "$1", "exported_namespace", "(.+)" + ), + "node", "$1", "Hostname", "(.+)" + ) + * on(node) group_left(nodepool) + runai_node_nodepool_excluded +{{- end -}} diff --git a/deploy/fake-gpu-operator/templates/status-exporter/kwok-service.yaml b/deploy/fake-gpu-operator/templates/status-exporter/kwok-service.yaml new file mode 100644 index 0000000..89fd39f --- /dev/null +++ b/deploy/fake-gpu-operator/templates/status-exporter/kwok-service.yaml @@ -0,0 +1,21 @@ +{{- if .Values.statusExporter.enabled -}} +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: "true" + labels: + app: nvidia-dcgm-exporter + component: status-exporter-kwok + name: nvidia-dcgm-exporter-kwok +spec: + ports: + - name: gpu-metrics + port: 9400 + protocol: TCP + targetPort: 9400 + selector: + app: nvidia-dcgm-exporter + component: status-exporter-kwok + type: ClusterIP +{{- end -}} diff --git a/deploy/fake-gpu-operator/templates/status-exporter/service.yaml b/deploy/fake-gpu-operator/templates/status-exporter/service.yaml index 59ceaab..7a63c8b 100644 --- a/deploy/fake-gpu-operator/templates/status-exporter/service.yaml +++ b/deploy/fake-gpu-operator/templates/status-exporter/service.yaml @@ -15,5 +15,6 @@ spec: targetPort: 9400 selector: app: nvidia-dcgm-exporter + component: status-exporter type: ClusterIP {{- end -}} diff --git a/internal/status-exporter/export/metrics/multi_node_exporter.go b/internal/status-exporter/export/metrics/multi_node_exporter.go index a34cd62..b6888a3 100644 --- a/internal/status-exporter/export/metrics/multi_node_exporter.go +++ b/internal/status-exporter/export/metrics/multi_node_exporter.go @@ -96,6 +96,12 @@ func (e *MultiNodeMetricsExporter) exportNode(nodeName string, nodeTopology *top log.Printf("Exporting metrics for KWOK node %s, gpu %s\n", nodeName, gpu.ID) labels := buildGpuMetricLabels(nodeName, gpuIdx, &gpu, nodeTopology) + // Override Hostname with the node name for KWOK nodes so that the + // metrics-exporter can match metrics to the correct virtual node. + // The default generateFakeHostname produces a hash that cannot be + // correlated back to the KWOK node name. + labels["Hostname"] = nodeName + utilization := gpu.Status.PodGpuUsageStatus.Utilization() fbUsed := gpu.Status.PodGpuUsageStatus.FbUsed(nodeTopology.GpuMemory) @@ -111,6 +117,7 @@ func (e *MultiNodeMetricsExporter) exportNode(nodeName string, nodeTopology *top func (e *MultiNodeMetricsExporter) deleteNodeMetrics(nodeName string, nodeTopology *topology.NodeTopology) { for gpuIdx, gpu := range nodeTopology.Gpus { labels := buildGpuMetricLabels(nodeName, gpuIdx, &gpu, nodeTopology) + labels["Hostname"] = nodeName // Delete the metric series for this GPU gpuUtilization.Delete(labels) From 836717568faffa344328b306d3f7921f109c7427 Mon Sep 17 00:00:00 2001 From: Eliran Wolff Date: Mon, 23 Feb 2026 14:38:13 +0200 Subject: [PATCH 2/3] feat: add toggle to disable KWOK PrometheusRule Add statusExporter.kwok.prometheusRule.enabled (default true) to allow disabling the KWOK DCGM recording rules without affecting the rest of the status-exporter. Co-Authored-By: Claude Opus 4.6 --- .../templates/status-exporter/kwok-prometheusrule.yaml | 2 +- deploy/fake-gpu-operator/values.yaml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml index eb2c14b..efb4128 100644 --- a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml +++ b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml @@ -1,4 +1,4 @@ -{{- if .Values.statusExporter.enabled -}} +{{- if and .Values.statusExporter.enabled (ne (.Values.statusExporter.kwok.prometheusRule.enabled | toString) "false") -}} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml index b36887b..8ab21d6 100644 --- a/deploy/fake-gpu-operator/values.yaml +++ b/deploy/fake-gpu-operator/values.yaml @@ -65,6 +65,8 @@ statusExporter: topologyMaxExportInterval: 10s # If using many KWOK nodes, you may need to increase the resources for the KWOK status-exporter kwok: + prometheusRule: + enabled: true resources: requests: cpu: "50m" From 3b0fe19f0a74cec3e37cc9e67ff191d3bc27a29a Mon Sep 17 00:00:00 2001 From: Eliran Wolff Date: Mon, 23 Feb 2026 15:31:43 +0200 Subject: [PATCH 3/3] fix: skip PrometheusRule when CRD is not available Add .Capabilities.APIVersions check so the KWOK PrometheusRule is only rendered on clusters that have the Prometheus Operator installed. Fixes integration test failures on clusters without the CRD. Co-Authored-By: Claude Opus 4.6 --- .../templates/status-exporter/kwok-prometheusrule.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml index efb4128..2dd62ed 100644 --- a/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml +++ b/deploy/fake-gpu-operator/templates/status-exporter/kwok-prometheusrule.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.statusExporter.enabled (ne (.Values.statusExporter.kwok.prometheusRule.enabled | toString) "false") -}} +{{- if and .Values.statusExporter.enabled (ne (.Values.statusExporter.kwok.prometheusRule.enabled | toString) "false") (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") -}} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: