From 63e6e44f2f203c98ee7faebfb9177ae124ff1906 Mon Sep 17 00:00:00 2001 From: Vitaliy Emporopulo Date: Tue, 13 May 2025 11:30:39 +0300 Subject: [PATCH] Replace DCGM_FI_PROF_GR_ENGINE_ACTIVE with DCGM_FI_DEV_GPU_UTIL * Profiling metrics DCGM_FI_PROF_* aren't available on pre-Volta GPUs * Consolidate the default metrics into the configmap so that installing the console plugin doesn't break other tools --- .../templates/configmap.yaml | 56 ++++++++++++++++--- .../GPUDashboard/Cards/GPUDashboardGraphs.tsx | 11 ++-- .../GPUDashboard/Cards/WorkloadsCard.tsx | 2 +- src/hooks/use-gpus-info.ts | 2 +- src/utils/cluster-overview.ts | 4 +- src/utils/project-overview.ts | 2 +- 6 files changed, 58 insertions(+), 19 deletions(-) diff --git a/deployment/console-plugin-nvidia-gpu/templates/configmap.yaml b/deployment/console-plugin-nvidia-gpu/templates/configmap.yaml index 62c0854..0046daa 100644 --- a/deployment/console-plugin-nvidia-gpu/templates/configmap.yaml +++ b/deployment/console-plugin-nvidia-gpu/templates/configmap.yaml @@ -6,14 +6,54 @@ metadata: {{- include "console-plugin-nvidia-gpu.labels" . | nindent 4 }} data: dcgm-metrics.csv: | - DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization. - DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization. - DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization. - DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization. - DCGM_FI_DEV_POWER_USAGE, gauge, power usage. + # === Added by the console plugin === DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, gauge, power mgmt limit. - DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp. - DCGM_FI_DEV_SM_CLOCK, gauge, sm clock. DCGM_FI_DEV_MAX_SM_CLOCK, gauge, max sm clock. - DCGM_FI_DEV_MEM_CLOCK, gauge, mem clock. DCGM_FI_DEV_MAX_MEM_CLOCK, gauge, max mem clock. + + # === Available by default === + # Clocks + DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). + + # Temperature + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). + DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + + # Power + DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). + DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + + # PCIE + DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + + # Utilization (the sample period varies depending on the product) + DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). + DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). + DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). + DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). + + # Errors and violations + DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. + + # Memory usage + DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). + DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + + # NVLink + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. + + # VGPU License status + DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + + # Remapped rows + DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors + DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors + DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + + # DCP metrics + DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. + DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. + DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. + DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. \ No newline at end of file diff --git a/src/components/GPUDashboard/Cards/GPUDashboardGraphs.tsx b/src/components/GPUDashboard/Cards/GPUDashboardGraphs.tsx index 7eb9524..d39b8f3 100644 --- a/src/components/GPUDashboard/Cards/GPUDashboardGraphs.tsx +++ b/src/components/GPUDashboard/Cards/GPUDashboardGraphs.tsx @@ -7,7 +7,6 @@ import { humanizeDegrees, humanizeHertz, humanizePercentage, - humanizeRatio, humanizeWatts, } from '../../../utils/units'; import { useTranslation } from '../../../i18n'; @@ -17,7 +16,7 @@ import { useTranslation } from '../../../i18n'; // /* - these are ok: + these are ok: DCGM_FI_DEV_GPU_UTIL, gauge, gpu utilization. DCGM_FI_DEV_POWER_USAGE, gauge, power usage. DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp. @@ -32,7 +31,7 @@ import { useTranslation } from '../../../i18n'; */ /* Used metrics - DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization. + DCGM_FI_DEV_GPU_UTIL, gauge, gpu utilization. DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization. DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization. DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization. @@ -57,9 +56,9 @@ export const GPUDashboardGraphs: React.FC = () => { ariaTitle={t('Donut GPU utilization')} ariaRangeTitle={t('GPU utilization over time')} ariaDesc={t('Sparkline GPU utilization')} - query={`sum(DCGM_FI_PROF_GR_ENGINE_ACTIVE{UUID="${selectedGPU?.uuid}"})`} - maxDomain={1} - humanize={humanizeRatio} + query={`sum(DCGM_FI_DEV_GPU_UTIL{UUID="${selectedGPU?.uuid}"})`} + maxDomain={100} + humanize={humanizePercentage} /> diff --git a/src/components/GPUDashboard/Cards/WorkloadsCard.tsx b/src/components/GPUDashboard/Cards/WorkloadsCard.tsx index 897b7d4..1238af0 100644 --- a/src/components/GPUDashboard/Cards/WorkloadsCard.tsx +++ b/src/components/GPUDashboard/Cards/WorkloadsCard.tsx @@ -127,7 +127,7 @@ const WorkloadsCard: React.FC = () => { const [gpuMemoryMetrics, gpuMetricsLoaded, gpuMetricsError] = usePrometheusPoll({ endpoint: PrometheusEndpoint.QUERY_RANGE, query: - 'sum (DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+"}) by (exported_namespace, exported_pod, UUID)', + 'sum (DCGM_FI_DEV_GPU_UTIL{exported_pod=~".+"}) by (exported_namespace, exported_pod, UUID)', timespan: ONE_DAY, }); diff --git a/src/hooks/use-gpus-info.ts b/src/hooks/use-gpus-info.ts index a90495a..13ec7dd 100644 --- a/src/hooks/use-gpus-info.ts +++ b/src/hooks/use-gpus-info.ts @@ -20,7 +20,7 @@ export type GPUInfo = { export const useGPUsInfo = (): [GPUInfo[], /* loaded */ boolean, /* error */ unknown] => { const [result, loaded, error] = usePrometheusPoll({ endpoint: PrometheusEndpoint.QUERY, - query: 'DCGM_FI_PROF_GR_ENGINE_ACTIVE', + query: 'DCGM_FI_DEV_GPU_UTIL', }); const gpus = useDeepCompareMemoize( diff --git a/src/utils/cluster-overview.ts b/src/utils/cluster-overview.ts index 6ade092..5b78436 100644 --- a/src/utils/cluster-overview.ts +++ b/src/utils/cluster-overview.ts @@ -1,9 +1,9 @@ import { GetQuery } from '@openshift-console/dynamic-plugin-sdk'; export const getGPUUtilizationQuery: GetQuery = () => - 'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+"})) or vector(0)'; + 'count(count by (UUID,GPU_I_ID) (DCGM_FI_DEV_GPU_UTIL{exported_pod=~".+"})) or vector(0)'; export const getGPUTotalUtilizationQuery: GetQuery = () => - 'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)) or vector(0)'; + 'count(count by (UUID,GPU_I_ID) (DCGM_FI_DEV_GPU_UTIL)) or vector(0)'; export const getPowerUsageUtilizationQuery: GetQuery = () => 'sum(max by (UUID) (DCGM_FI_DEV_POWER_USAGE))'; diff --git a/src/utils/project-overview.ts b/src/utils/project-overview.ts index 6b9d5f8..4d2355d 100644 --- a/src/utils/project-overview.ts +++ b/src/utils/project-overview.ts @@ -1,2 +1,2 @@ export const getGPUPodsQuery = (project: string) => - `count((kube_pod_status_phase > 0) * on(pod) group_left(gpu,device,instance,modelName) label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+", exported_namespace="${project}"}, "pod", "$1", "exported_pod", "(.*)"))`; + `count((kube_pod_status_phase > 0) * on(pod) group_left(gpu,device,instance,modelName) label_replace(DCGM_FI_DEV_GPU_UTIL{exported_pod=~".+", exported_namespace="${project}"}, "pod", "$1", "exported_pod", "(.*)"))`;