From 54fb0ac4ea7c424c52ac7051e016f96c01bab079 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Tue, 31 Mar 2026 16:18:51 -0700 Subject: [PATCH] feat(evidence): add NIM support to evidence collection and update conformance docs Add NIM Operator and NIM inference metrics paths to evidence collection, and update all conformance documentation to reflect NIM on EKS as the certified product. Evidence collection: - Add collect_service_metrics_nim() for NIM /v1/metrics endpoint - Add collect_operator_nim() for NIM Operator CRDs/webhooks/reconciliation - Detection priority: Dynamo > NIM Operator > Kubeflow Trainer Documentation: - Update PRODUCT.yaml platform to "NVIDIA NIM on EKS" - Update submission README and evidence index for NIM - Refresh all 9 evidence files with NIM-based conformance results (9/9 PASS) --- .../cncf/evidence/ai-service-metrics.md | 224 ---- .../cncf/evidence/robust-operator.md | 184 ---- docs/conformance/cncf/index.md | 80 +- docs/conformance/cncf/submission/README.md | 25 - .../nim-eks}/PRODUCT.yaml | 73 +- docs/conformance/cncf/v1.35/nim-eks/README.md | 25 + .../nim-eks}/evidence/accelerator-metrics.md | 998 +++++++++--------- .../nim-eks/evidence/ai-service-metrics.md | 114 ++ .../nim-eks}/evidence/cluster-autoscaling.md | 122 +-- .../nim-eks}/evidence/dra-support.md | 38 +- .../nim-eks}/evidence/gang-scheduling.md | 54 +- .../{ => v1.35/nim-eks}/evidence/index.md | 15 +- .../nim-eks}/evidence/inference-gateway.md | 62 +- .../nim-eks}/evidence/pod-autoscaling.md | 38 +- .../v1.35/nim-eks/evidence/robust-operator.md | 179 ++++ .../evidence/secure-accelerator-access.md | 302 +++--- pkg/evidence/scripts/collect-evidence.sh | 348 +++++- 17 files changed, 1496 insertions(+), 1385 deletions(-) delete mode 100644 docs/conformance/cncf/evidence/ai-service-metrics.md delete mode 100644 docs/conformance/cncf/evidence/robust-operator.md delete mode 100644 docs/conformance/cncf/submission/README.md rename docs/conformance/cncf/{submission => 
v1.35/nim-eks}/PRODUCT.yaml (83%) create mode 100644 docs/conformance/cncf/v1.35/nim-eks/README.md rename docs/conformance/cncf/{ => v1.35/nim-eks}/evidence/accelerator-metrics.md (59%) create mode 100644 docs/conformance/cncf/v1.35/nim-eks/evidence/ai-service-metrics.md rename docs/conformance/cncf/{ => v1.35/nim-eks}/evidence/cluster-autoscaling.md (54%) rename docs/conformance/cncf/{ => v1.35/nim-eks}/evidence/dra-support.md (70%) rename docs/conformance/cncf/{ => v1.35/nim-eks}/evidence/gang-scheduling.md (82%) rename docs/conformance/cncf/{ => v1.35/nim-eks}/evidence/index.md (54%) rename docs/conformance/cncf/{ => v1.35/nim-eks}/evidence/inference-gateway.md (67%) rename docs/conformance/cncf/{ => v1.35/nim-eks}/evidence/pod-autoscaling.md (84%) create mode 100644 docs/conformance/cncf/v1.35/nim-eks/evidence/robust-operator.md rename docs/conformance/cncf/{ => v1.35/nim-eks}/evidence/secure-accelerator-access.md (66%) diff --git a/docs/conformance/cncf/evidence/ai-service-metrics.md b/docs/conformance/cncf/evidence/ai-service-metrics.md deleted file mode 100644 index 768ed0a69..000000000 --- a/docs/conformance/cncf/evidence/ai-service-metrics.md +++ /dev/null @@ -1,224 +0,0 @@ -# AI Service Metrics (Prometheus Discovery) - -**Kubernetes Version:** v1.35 -**Platform:** linux/amd64 -**Validated on:** EKS / p5.48xlarge / NVIDIA H100 80GB HBM3 - ---- - -Demonstrates that Prometheus discovers and collects metrics from AI workloads -that expose them in Prometheus exposition format, using PodMonitor and -ServiceMonitor CRDs for automatic target discovery across both inference and -training workloads. - -## Inference: Dynamo Platform (PodMonitor) - -**Cluster:** `aicr-cuj2` (EKS, inference) -**Generated:** 2026-03-25 10:18:30 UTC - -The Dynamo operator auto-creates PodMonitors for worker and frontend pods. -The Dynamo vLLM runtime exposes both Dynamo-specific and embedded vLLM metrics -on port 9090 (`system` port) in Prometheus format. 
- -### Dynamo Workload Pods - -**Dynamo workload pods** -``` -$ kubectl get pods -n dynamo-workload -o wide -NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -vllm-agg-0-frontend-qqrff 1/1 Running 0 3m29s 10.0.159.241 ip-10-0-184-187.ec2.internal -vllm-agg-0-vllmdecodeworker-95ths 1/1 Running 0 3m29s 10.0.214.229 ip-10-0-180-136.ec2.internal -``` - -### Worker Metrics Endpoint - -**Worker metrics (sampled after 10 inference requests)** -``` -dynamo_component_request_bytes_total{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 11230 -dynamo_component_request_duration_seconds_sum{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 0.984 -dynamo_component_request_duration_seconds_count{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 10 -dynamo_component_requests_total{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 10 -dynamo_component_response_bytes_total{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 31826 -dynamo_component_uptime_seconds 223.250 -vllm:engine_sleep_state{engine="0",model_name="Qwen/Qwen3-0.6B",sleep_state="awake"} 1.0 -vllm:prefix_cache_queries_total{engine="0",model_name="Qwen/Qwen3-0.6B"} 50.0 -``` - -### PodMonitors (Auto-Created by Dynamo Operator) - -**Dynamo PodMonitors** -``` -$ kubectl get podmonitors -n dynamo-system -NAME AGE -dynamo-frontend 11d -dynamo-planner 11d -dynamo-worker 11d -``` - -**Worker PodMonitor spec** -``` -$ kubectl get podmonitor dynamo-worker -n dynamo-system -o yaml -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: dynamo-worker - namespace: dynamo-system -spec: - namespaceSelector: - any: true - podMetricsEndpoints: - - interval: 5s - path: /metrics - port: system - selector: - matchLabels: - nvidia.com/dynamo-component-type: worker - nvidia.com/metrics-enabled: "true" -``` - -### Prometheus Target Discovery - -**Prometheus 
scrape targets (active)** -``` -{ - "job": "dynamo-system/dynamo-frontend", - "endpoint": "http://10.0.159.241:8000/metrics", - "health": "up", - "lastScrape": "2026-03-25T10:19:21.101766071Z" -} -{ - "job": "dynamo-system/dynamo-worker", - "endpoint": "http://10.0.214.229:9090/metrics", - "health": "up", - "lastScrape": "2026-03-25T10:19:22.70334816Z" -} -``` - -### Dynamo Metrics in Prometheus - -**Dynamo metrics queried from Prometheus (after 10 inference requests)** -``` -dynamo_component_requests_total{endpoint="generate"} = 10 -dynamo_component_request_bytes_total{endpoint="generate"} = 11230 -dynamo_component_response_bytes_total{endpoint="generate"} = 31826 -dynamo_component_request_duration_seconds_count{endpoint="generate"} = 10 -dynamo_component_request_duration_seconds_sum{endpoint="generate"} = 0.984 -dynamo_component_uptime_seconds = 223.250 -dynamo_frontend_input_sequence_tokens_sum = 50 -dynamo_frontend_input_sequence_tokens_count = 10 -dynamo_frontend_inter_token_latency_seconds_sum = 0.866 -dynamo_frontend_inter_token_latency_seconds_count = 490 -dynamo_frontend_model_context_length = 40960 -dynamo_frontend_model_total_kv_blocks = 37710 -``` - -**Result: PASS** — Prometheus discovers Dynamo inference workloads (frontend + worker) via operator-managed PodMonitors and actively scrapes their Prometheus-format metrics endpoints. Application-level AI inference metrics (request count, request duration, inter-token latency, token throughput, KV cache utilization) are collected and queryable. - ---- - -## Training: PyTorch Workload (ServiceMonitor) - -**Cluster:** `aicr-cuj1` (EKS, training) -**Generated:** 2026-03-25 11:03:00 UTC - -A PyTorch training workload runs a GPU training loop and exposes training-level -metrics (step count, loss, throughput, GPU memory) on port 8080 in Prometheus -format, discovered via ServiceMonitor. 
- -### Training Workload Pod - -**Training pod** -``` -$ kubectl get pods -n trainer-metrics-test -o wide -NAME READY STATUS RESTARTS AGE -pytorch-training-job 1/1 Running 0 2m -``` - -### Training Metrics Endpoint - -**Training metrics (after 100 training steps)** -``` -# HELP training_step_total Total training steps completed -# TYPE training_step_total counter -training_step_total 100 -# HELP training_loss Current training loss -# TYPE training_loss gauge -training_loss 1.334257 -# HELP training_throughput_samples_per_sec Training throughput -# TYPE training_throughput_samples_per_sec gauge -training_throughput_samples_per_sec 549228.55 -# HELP training_gpu_memory_used_bytes GPU memory used -# TYPE training_gpu_memory_used_bytes gauge -training_gpu_memory_used_bytes 79213568 -# HELP training_gpu_memory_total_bytes GPU memory total -# TYPE training_gpu_memory_total_bytes gauge -training_gpu_memory_total_bytes 85017624576 -``` - -### ServiceMonitor - -**Training ServiceMonitor** -``` -$ kubectl get servicemonitor pytorch-training -n trainer-metrics-test -o yaml -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - release: kube-prometheus-stack - name: pytorch-training - namespace: trainer-metrics-test -spec: - endpoints: - - interval: 15s - path: /metrics - port: metrics - selector: - matchLabels: - app: pytorch-training -``` - -### Prometheus Target Discovery - -**Prometheus scrape target (active)** -``` -{ - "job": "pytorch-training-metrics", - "endpoint": "http://10.0.212.201:8080/metrics", - "health": "up", - "lastScrape": "2026-03-25T11:03:49.310258779Z" -} -``` - -### Training Metrics in Prometheus - -**Training metrics queried from Prometheus** -``` -training_step_total = 100 -training_loss = 1.334257 -training_throughput_samples_per_sec = 549228.55 -training_gpu_memory_used_bytes = 79213568 -training_gpu_memory_total_bytes = 85017624576 -``` - -**Result: PASS** — Prometheus discovers the PyTorch training workload via 
ServiceMonitor and actively scrapes its Prometheus-format metrics endpoint. Training-level metrics (step count, loss, throughput, GPU memory) are collected and queryable. - ---- - -## Summary - -| Workload | Discovery | Metrics Port | Metrics Type | Result | -|----------|-----------|-------------|--------------|--------| -| **Dynamo vLLM** (inference) | PodMonitor (auto-created) | 9090 (HTTP) | `dynamo_component_*`, `dynamo_frontend_*`, `vllm:*` | **PASS** | -| **PyTorch training** (training) | ServiceMonitor | 8080 (HTTP) | `training_step_total`, `training_loss`, `training_throughput_*`, `training_gpu_memory_*` | **PASS** | - -## Cleanup - -**Delete inference workload** -``` -$ kubectl delete ns dynamo-workload -``` - -**Delete training workload** -``` -$ kubectl delete ns trainer-metrics-test -``` diff --git a/docs/conformance/cncf/evidence/robust-operator.md b/docs/conformance/cncf/evidence/robust-operator.md deleted file mode 100644 index 917222560..000000000 --- a/docs/conformance/cncf/evidence/robust-operator.md +++ /dev/null @@ -1,184 +0,0 @@ -# Robust AI Operator - -**Kubernetes Version:** v1.35 -**Platform:** linux/amd64 -**Validated on:** Kubernetes v1.35 clusters with NVIDIA H100 80GB HBM3 - ---- - -Demonstrates CNCF AI Conformance requirement that at least one complex AI operator -with a CRD can be installed and functions reliably, including operator pods running, -webhooks operational, and custom resources reconciled. 
- -## Summary - -Two operators validated across inference and training intents: - -| Operator | Intent | CRDs | Webhooks | CR Reconciled | Result | -|----------|--------|------|----------|---------------|--------| -| **Dynamo Platform** | Inference | 6 CRDs | 4 validating webhooks | DynamoGraphDeployment → PodCliques | **PASS** | -| **Kubeflow Trainer** | Training | 3 CRDs | 3 validating webhooks | TrainJob → distributed training pods | **PASS** | - ---- - -## Inference: Dynamo Platform - -**Generated:** 2026-03-10 03:41:48 UTC - -### Dynamo Operator Health - -**Dynamo operator deployments** -``` -$ kubectl get deploy -n dynamo-system -NAME READY UP-TO-DATE AVAILABLE AGE -dynamo-platform-dynamo-operator-controller-manager 1/1 1 1 13m -grove-operator 1/1 1 1 13m -``` - -**Dynamo operator pods** -``` -$ kubectl get pods -n dynamo-system -NAME READY STATUS RESTARTS AGE -dynamo-platform-dynamo-operator-controller-manager-59f6dc6gs7tt 2/2 Running 0 13m -dynamo-platform-dynamo-operator-webhook-ca-inject-1-6t95h 0/1 Completed 0 13m -dynamo-platform-dynamo-operator-webhook-cert-gen-1-bnqwh 0/1 Completed 0 13m -grove-operator-7c69b46ddf-mxgtz 1/1 Running 1 (13m ago) 13m -``` - -### Custom Resource Definitions - -**Dynamo CRDs** -``` -dynamocomponentdeployments.nvidia.com 2026-03-10T03:20:42Z -dynamographdeploymentrequests.nvidia.com 2026-03-10T03:20:42Z -dynamographdeployments.nvidia.com 2026-03-10T03:20:42Z -dynamographdeploymentscalingadapters.nvidia.com 2026-03-10T03:20:42Z -dynamomodels.nvidia.com 2026-03-10T03:20:42Z -dynamoworkermetadatas.nvidia.com 2026-03-10T03:20:42Z -``` - -### Webhooks - -**Validating webhooks** -``` -$ kubectl get validatingwebhookconfigurations -l app.kubernetes.io/instance=dynamo-platform -NAME WEBHOOKS AGE -dynamo-platform-dynamo-operator-validating 4 13m -``` - -### Custom Resource Reconciliation - -A `DynamoGraphDeployment` defines an inference serving graph. The operator reconciles -it into workload pods managed via PodCliques. 
- -**DynamoGraphDeployments** -``` -$ kubectl get dynamographdeployments -A -NAMESPACE NAME AGE -dynamo-workload vllm-agg 5m33s -``` - -**Workload Pods Created by Operator** -``` -$ kubectl get pods -n dynamo-workload -l nvidia.com/dynamo-graph-deployment-name -o wide -NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -vllm-agg-0-frontend-kkmpd 1/1 Running 0 5m35s 10.0.222.55 system-node-2 -vllm-agg-0-vllmdecodeworker-s65j5 1/1 Running 0 5m35s 10.0.235.180 gpu-node-1 -``` - -**PodCliques** -``` -$ kubectl get podcliques -n dynamo-workload -NAME AGE -vllm-agg-0-frontend 5m36s -vllm-agg-0-vllmdecodeworker 5m36s -``` - -### Webhook Rejection Test - -Submit an invalid DynamoGraphDeployment to verify the validating webhook -actively rejects malformed resources. - -**Invalid CR rejection** -``` -Error from server (Forbidden): error when creating "STDIN": admission webhook "vdynamographdeployment.kb.io" denied the request: spec.services must have at least one service -``` - -Webhook correctly rejected the invalid resource. - -**Result: PASS** — Dynamo operator running, webhooks operational (rejection verified), CRDs registered, DynamoGraphDeployment reconciled with 2 healthy workload pod(s). 
- ---- - -## Training: Kubeflow Trainer - -**Generated:** 2026-03-16 21:48:55 UTC - -### Kubeflow Trainer Health - -**Kubeflow Trainer deployments** -``` -$ kubectl get deploy -n kubeflow -NAME READY UP-TO-DATE AVAILABLE AGE -jobset-controller 1/1 1 1 13m -kubeflow-trainer-controller-manager 1/1 1 1 13m -``` - -**Kubeflow Trainer pods** -``` -$ kubectl get pods -n kubeflow -o wide -NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -jobset-controller-75f94fdfb7-r7lqd 1/1 Running 1 (13m ago) 13m 10.100.1.52 system-node-1 -kubeflow-trainer-controller-manager-677b98f74f-8dvgj 1/1 Running 1 (13m ago) 13m 10.100.5.60 system-node-2 -pytorch-mnist-node-0-0-9wkj5 0/1 Completed 0 12m 10.100.2.169 gpu-node-1 -``` - -### Custom Resource Definitions - -**Kubeflow Trainer CRDs** -``` -clustertrainingruntimes.trainer.kubeflow.org 2026-03-16T20:45:34Z -trainingruntimes.trainer.kubeflow.org 2026-03-16T20:45:36Z -trainjobs.trainer.kubeflow.org 2026-03-16T20:45:36Z -``` - -### Webhooks - -**Validating webhooks** -``` -$ kubectl get validatingwebhookconfigurations validator.trainer.kubeflow.org -NAME WEBHOOKS AGE -validator.trainer.kubeflow.org 3 13m -``` - -**Webhook endpoint verification** -``` -NAME ENDPOINTS AGE -jobset-metrics-service 10.100.1.52:8443 13m -jobset-webhook-service 10.100.1.52:9443 13m -kubeflow-trainer-controller-manager 10.100.5.60:8080,10.100.5.60:9443 13m -pytorch-mnist 10.100.2.169 12m -``` - -### ClusterTrainingRuntimes - -**ClusterTrainingRuntimes** -``` -$ kubectl get clustertrainingruntimes -NAME AGE -torch-distributed 13m -``` - -### Webhook Rejection Test - -Submit an invalid TrainJob (referencing a non-existent runtime) to verify the -validating webhook actively rejects malformed resources. 
- -**Invalid TrainJob rejection** -``` -Error from server (Forbidden): error when creating "STDIN": admission webhook "validator.trainjob.trainer.kubeflow.org" denied the request: spec.RuntimeRef: Invalid value: {"name":"nonexistent-runtime","apiGroup":"trainer.kubeflow.org","kind":"ClusterTrainingRuntime"}: ClusterTrainingRuntime.trainer.kubeflow.org "nonexistent-runtime" not found: specified clusterTrainingRuntime must be created before the TrainJob is created -``` - -Webhook correctly rejected the invalid resource. - -**Result: PASS** — Kubeflow Trainer running, webhooks operational (rejection verified), 3 CRDs registered. diff --git a/docs/conformance/cncf/index.md b/docs/conformance/cncf/index.md index bb20c9980..bee8027e2 100644 --- a/docs/conformance/cncf/index.md +++ b/docs/conformance/cncf/index.md @@ -1,43 +1,43 @@ -# CNCF AI Conformance Evidence +# CNCF AI Conformance ## Overview This directory contains evidence for [CNCF Kubernetes AI Conformance](https://github.com/cncf/k8s-ai-conformance) -certification. The evidence demonstrates that a cluster configured with a specific -recipe meets the Must-have requirements for Kubernetes v1.35. +certification. Each submission certifies a specific product on a specific Kubernetes +distribution, with evidence collected using AICR as the validation tooling. -> **Note:** It is the **cluster configured by a recipe** that is conformant, not the -> tool itself. The recipe determines which components are deployed and how they are -> configured. Different recipes may produce clusters with different conformance profiles. +> **Note:** It is the **product deployed on a Kubernetes platform** that is conformant. +> AICR serves as the deployment and validation tooling (similar to sonobuoy for K8s +> conformance), while the certified product is the AI inference/training platform. 
-**Kubernetes:** v1.35 -**Product:** Kubernetes clusters with NVIDIA AI Cluster Runtime (AICR) +## Submissions -AICR deploys the runtime components that make a Kubernetes cluster AI conformant. -All conformance requirements are platform-agnostic except cluster autoscaling, -which relies on the underlying platform's node group scaling mechanism. +| Version | Product | Platform | Status | Evidence | +|---------|---------|----------|--------|----------| +| v1.35 | [NVIDIA NIM](https://developer.nvidia.com/nim) | EKS | 9/9 PASS | [v1.35/nim-eks/](v1.35/nim-eks/) | ## Directory Structure ``` docs/conformance/cncf/ -├── README.md -├── submission/ -│ ├── PRODUCT.yaml -│ └── README.md -└── evidence/ - ├── index.md - ├── dra-support.md - ├── gang-scheduling.md - ├── secure-accelerator-access.md - ├── accelerator-metrics.md - ├── ai-service-metrics.md - ├── inference-gateway.md - ├── robust-operator.md - ├── pod-autoscaling.md - └── cluster-autoscaling.md - -pkg/evidence/scripts/ # Evidence collection script + test manifests +├── index.md # This file +└── v1.35/ # Kubernetes version + └── nim-eks/ # Product + platform (mirrors CNCF repo) + ├── PRODUCT.yaml # CNCF submission metadata + ├── README.md # Submission overview + results table + └── evidence/ # Behavioral evidence files + ├── index.md + ├── dra-support.md + ├── gang-scheduling.md + ├── secure-accelerator-access.md + ├── accelerator-metrics.md + ├── ai-service-metrics.md + ├── inference-gateway.md + ├── robust-operator.md + ├── pod-autoscaling.md + └── cluster-autoscaling.md + +pkg/evidence/scripts/ # Evidence collection script + test manifests ├── collect-evidence.sh └── manifests/ ├── dra-gpu-test.yaml @@ -82,9 +82,9 @@ Alternatively, run the evidence collection script directly: ``` > **Note:** The `--cncf-submission` flag deploys GPU workloads and takes ~5-10 -> minutes. The evidence collection script uses polling with early exit on both -> success and failure, minimizing wait times. 
The HPA test uses CUDA N-Body -> Simulation to stress GPUs and verifies scale-up. +> minutes. The evidence collection script automatically detects the AI workload +> type (NIM inference, Dynamo inference, or Kubeflow training) and collects +> appropriate metrics and operator evidence. ### Two Modes @@ -101,21 +101,3 @@ Alternatively, run the evidence collection script directly: | **Gateway** | Condition verification (Accepted, Programmed) | Same | | **Webhook test** | Rejection test with invalid CR | Same | | **Cluster autoscaling** | Cloud node group validation | Cloud-provider autoscaler API | - -## Evidence - -See [evidence/index.md](evidence/index.md) for a summary of all collected evidence and results. - -## Feature Areas - -| # | Feature | Requirement | Evidence File | -|---|---------|-------------|---------------| -| 1 | DRA Support | `dra_support` | [evidence/dra-support.md](evidence/dra-support.md) | -| 2 | Gang Scheduling | `gang_scheduling` | [evidence/gang-scheduling.md](evidence/gang-scheduling.md) | -| 3 | Secure Accelerator Access | `secure_accelerator_access` | [evidence/secure-accelerator-access.md](evidence/secure-accelerator-access.md) | -| 4 | Accelerator Metrics | `accelerator_metrics` | [evidence/accelerator-metrics.md](evidence/accelerator-metrics.md) | -| 5 | AI Service Metrics | `ai_service_metrics` | [evidence/ai-service-metrics.md](evidence/ai-service-metrics.md) | -| 6 | Inference API Gateway | `ai_inference` | [evidence/inference-gateway.md](evidence/inference-gateway.md) | -| 7 | Robust AI Operator | `robust_controller` | [evidence/robust-operator.md](evidence/robust-operator.md) | -| 8 | Pod Autoscaling | `pod_autoscaling` | [evidence/pod-autoscaling.md](evidence/pod-autoscaling.md) | -| 9 | Cluster Autoscaling | `cluster_autoscaling` | [evidence/cluster-autoscaling.md](evidence/cluster-autoscaling.md) | diff --git a/docs/conformance/cncf/submission/README.md b/docs/conformance/cncf/submission/README.md deleted file mode 100644 index 
3da12ef75..000000000 --- a/docs/conformance/cncf/submission/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# NVIDIA AI Cluster Runtime - -[NVIDIA AI Cluster Runtime (AICR)](https://github.com/NVIDIA/aicr) generates validated, GPU-accelerated Kubernetes configurations and deploys runtime components that satisfy all CNCF AI Conformance requirements for accelerator management, scheduling, observability, security, and inference networking. - -## Conformance Submission - -- [PRODUCT.yaml](PRODUCT.yaml) - -## Evidence - -Evidence was collected on Kubernetes v1.35 clusters with NVIDIA H100 80GB HBM3 GPUs using AICR-deployed runtime components. - -| # | Requirement | Feature | Result | Evidence | -|---|-------------|---------|--------|----------| -| 1 | `dra_support` | Dynamic Resource Allocation | PASS | [dra-support.md](../evidence/dra-support.md) | -| 2 | `gang_scheduling` | Gang Scheduling (KAI Scheduler) | PASS | [gang-scheduling.md](../evidence/gang-scheduling.md) | -| 3 | `secure_accelerator_access` | Secure Accelerator Access | PASS | [secure-accelerator-access.md](../evidence/secure-accelerator-access.md) | -| 4 | `accelerator_metrics` | Accelerator Metrics (DCGM Exporter) | PASS | [accelerator-metrics.md](../evidence/accelerator-metrics.md) | -| 5 | `ai_service_metrics` | AI Service Metrics (Prometheus ServiceMonitor) | PASS | [ai-service-metrics.md](../evidence/ai-service-metrics.md) | -| 6 | `ai_inference` | Inference API Gateway (kgateway) | PASS | [inference-gateway.md](../evidence/inference-gateway.md) | -| 7 | `robust_controller` | Robust AI Operator (Dynamo + Kubeflow Trainer) | PASS | [robust-operator.md](../evidence/robust-operator.md) | -| 8 | `pod_autoscaling` | Pod Autoscaling (HPA + GPU Metrics) | PASS | [pod-autoscaling.md](../evidence/pod-autoscaling.md) | -| 9 | `cluster_autoscaling` | Cluster Autoscaling | PASS | [cluster-autoscaling.md](../evidence/cluster-autoscaling.md) | - -All 9 MUST conformance requirement IDs across 9 evidence files are 
**Implemented**. 3 SHOULD requirements (`driver_runtime_management`, `gpu_sharing`, `virtualized_accelerator`) are also Implemented. diff --git a/docs/conformance/cncf/submission/PRODUCT.yaml b/docs/conformance/cncf/v1.35/nim-eks/PRODUCT.yaml similarity index 83% rename from docs/conformance/cncf/submission/PRODUCT.yaml rename to docs/conformance/cncf/v1.35/nim-eks/PRODUCT.yaml index 49888769b..16af204d0 100644 --- a/docs/conformance/cncf/submission/PRODUCT.yaml +++ b/docs/conformance/cncf/v1.35/nim-eks/PRODUCT.yaml @@ -14,23 +14,24 @@ metadata: kubernetesVersion: v1.35 - platformName: "NVIDIA AI Cluster Runtime" - platformVersion: "0.8.0" + platformName: "NVIDIA NIM on EKS" + platformVersion: "1.8.3" vendorName: "NVIDIA" - websiteUrl: "https://github.com/NVIDIA/aicr" - repoUrl: "https://github.com/NVIDIA/aicr" - documentationUrl: "https://github.com/NVIDIA/aicr/blob/main/README.md" + websiteUrl: "https://developer.nvidia.com/nim" + repoUrl: "https://github.com/NVIDIA/k8s-nim-operator" + documentationUrl: "https://docs.nvidia.com/nim/large-language-models/latest/deploy-helm.html" productLogoUrl: "https://raw.githubusercontent.com/cncf/landscape/master/hosted_logos/nvidia-member.svg" description: >- - NVIDIA AI Cluster Runtime (AICR) generates validated, GPU-accelerated - Kubernetes configurations and deploys runtime components that satisfy all - CNCF AI Conformance requirements. + NVIDIA NIM on EKS is a Kubernetes-based AI inference platform that deploys + and manages NVIDIA NIM microservices on Amazon EKS with GPU scheduling, + autoscaling, and Gateway API integration. Configured and validated using + NVIDIA AI Cluster Runtime (AICR). contactEmailAddress: "aicr-maintainers@nvidia.com" - # AICR is not a Kubernetes distribution — it deploys AI runtime components on - # existing conformant platforms. We reference EKS's k8s-conformance entry - # because evidence was collected on a conformant EKS cluster. AICR is - # validated on multiple conformant platforms. 
- # Also validated on GKE: https://github.com/cncf/k8s-conformance/tree/master/v1.35/gke + # NVIDIA NIM on EKS is not a Kubernetes distribution — it is an AI inference + # platform deployed on top of conformant Amazon EKS. Per CNCF AI Conformance + # guidelines, we reference the underlying Kubernetes distribution's conformance + # entry to establish that the base platform is already K8s conformant. + # This submission certifies the AI capabilities layered on top of EKS. k8sConformanceUrl: "https://github.com/cncf/k8s-conformance/tree/master/v1.35/eks" spec: @@ -40,7 +41,7 @@ spec: level: MUST status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/dra-support.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/dra-support.md" notes: >- DRA API (resource.k8s.io/v1) is enabled with DeviceClass, ResourceClaim, ResourceClaimTemplate, and ResourceSlice resources available. The NVIDIA @@ -58,7 +59,7 @@ spec: level: SHOULD status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/dra-support.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/dra-support.md" notes: >- GPU Operator manages the full driver and runtime lifecycle: driver installation, container toolkit configuration, device plugin, and DRA @@ -115,7 +116,7 @@ spec: level: MUST status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/inference-gateway.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/inference-gateway.md" notes: >- kgateway controller is deployed with full Gateway API CRD support (GatewayClass, Gateway, HTTPRoute, GRPCRoute, ReferenceGrant). 
Inference @@ -134,7 +135,7 @@ spec: level: MUST status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/gang-scheduling.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/gang-scheduling.md" notes: >- KAI Scheduler is deployed with operator, scheduler, admission controller, pod-grouper, and queue-controller components. PodGroup CRD @@ -150,12 +151,11 @@ spec: level: MUST status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/cluster-autoscaling.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/cluster-autoscaling.md" notes: >- Demonstrated on EKS with a GPU Auto Scaling Group (p5.48xlarge, 8x H100 - per node) tagged for Cluster Autoscaler discovery, and on GKE with the - built-in cluster autoscaler managing a3-megagpu-8g node pools. Both - platforms support scaling GPU nodes based on pending pod demand. + per node) tagged for Cluster Autoscaler discovery. The platform supports + scaling GPU nodes based on pending pod demand. - id: pod_autoscaling description: >- If the platform supports the HorizontalPodAutoscaler, it must function @@ -164,7 +164,7 @@ spec: level: MUST status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/pod-autoscaling.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/pod-autoscaling.md" notes: >- Prometheus adapter exposes GPU custom metrics (gpu_utilization, gpu_memory_used, gpu_power_usage) via the Kubernetes custom metrics API. 
@@ -189,7 +189,7 @@ spec: level: MUST status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/accelerator-metrics.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/accelerator-metrics.md" notes: >- DCGM Exporter runs on GPU nodes exposing metrics at :9400/metrics in Prometheus format. Per-GPU metrics include utilization, memory usage, @@ -205,13 +205,14 @@ spec: level: MUST status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/accelerator-metrics.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/ai-service-metrics.md" notes: >- - Prometheus and Grafana are deployed as the monitoring stack. Prometheus - discovers and scrapes workloads exposing metrics in Prometheus - exposition format via ServiceMonitors. The prometheus-adapter bridges - these metrics into the Kubernetes custom metrics API for consumption by - HPA and other controllers. + NVIDIA NIM inference microservice exposes Prometheus-format metrics at + /v1/metrics including token throughput (prompt_tokens_total, + generation_tokens_total), request latency (time_to_first_token_seconds, + time_per_output_token_seconds), and model request counts. Prometheus + and prometheus-adapter are deployed for metrics collection and bridging + to the Kubernetes custom metrics API. security: - id: secure_accelerator_access description: >- @@ -222,7 +223,7 @@ spec: level: MUST status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/secure-accelerator-access.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/secure-accelerator-access.md" notes: >- GPU Operator manages all GPU lifecycle components (driver, device-plugin, DCGM, toolkit, validator, MIG manager). 
8x H100 GPUs are individually @@ -240,11 +241,9 @@ spec: level: MUST status: "Implemented" evidence: - - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/evidence/robust-operator.md" + - "https://github.com/NVIDIA/aicr/blob/main/docs/conformance/cncf/v1.35/nim-eks/evidence/robust-operator.md" notes: >- - Two operators validated: (1) NVIDIA Dynamo for inference — 6 CRDs, - 4 validating webhooks, DynamoGraphDeployment reconciled into running - workload pods; (2) Kubeflow Trainer for training — 3 CRDs, 3 validating - webhooks, TrainJob reconciled into distributed training pods. Both - operators verified via webhook rejection tests (invalid CRs correctly - denied). + NVIDIA NIM Operator validated: 4 CRDs (NIMService, NIMCache, NIMPipeline, + NIMBuild), admission controller with webhook rejection test (invalid + NIMService correctly denied), NIMService CR reconciled into running + inference pod serving Llama 3.2 1B on H100 GPU. diff --git a/docs/conformance/cncf/v1.35/nim-eks/README.md b/docs/conformance/cncf/v1.35/nim-eks/README.md new file mode 100644 index 000000000..b275e6f6e --- /dev/null +++ b/docs/conformance/cncf/v1.35/nim-eks/README.md @@ -0,0 +1,25 @@ +# NVIDIA NIM on EKS + +[NVIDIA NIM](https://developer.nvidia.com/nim) on EKS is a Kubernetes-based AI inference platform that deploys and manages NVIDIA NIM microservices on Amazon EKS with GPU scheduling, autoscaling, and Gateway API integration. NIM microservice lifecycle is managed by the [NIM Operator](https://github.com/NVIDIA/k8s-nim-operator). The platform is configured and validated using [NVIDIA AI Cluster Runtime (AICR)](https://github.com/NVIDIA/aicr). + +## Conformance Submission + +- [PRODUCT.yaml](PRODUCT.yaml) + +## Evidence + +Evidence was collected on an EKS v1.35 cluster with NVIDIA H100 80GB HBM3 GPUs running NIM inference workloads, validated by AICR. 
+ +| # | Requirement | Feature | Result | Evidence | +|---|-------------|---------|--------|----------| +| 1 | `dra_support` | Dynamic Resource Allocation | PASS | [dra-support.md](evidence/dra-support.md) | +| 2 | `gang_scheduling` | Gang Scheduling (KAI Scheduler) | PASS | [gang-scheduling.md](evidence/gang-scheduling.md) | +| 3 | `secure_accelerator_access` | Secure Accelerator Access | PASS | [secure-accelerator-access.md](evidence/secure-accelerator-access.md) | +| 4 | `accelerator_metrics` | Accelerator Metrics (DCGM Exporter) | PASS | [accelerator-metrics.md](evidence/accelerator-metrics.md) | +| 5 | `ai_service_metrics` | AI Service Metrics (NIM Inference) | PASS | [ai-service-metrics.md](evidence/ai-service-metrics.md) | +| 6 | `ai_inference` | Inference API Gateway (kgateway) | PASS | [inference-gateway.md](evidence/inference-gateway.md) | +| 7 | `robust_controller` | Robust AI Operator (NIM Operator) | PASS | [robust-operator.md](evidence/robust-operator.md) | +| 8 | `pod_autoscaling` | Pod Autoscaling (HPA + GPU Metrics) | PASS | [pod-autoscaling.md](evidence/pod-autoscaling.md) | +| 9 | `cluster_autoscaling` | Cluster Autoscaling | PASS | [cluster-autoscaling.md](evidence/cluster-autoscaling.md) | + +All 9 MUST conformance requirement IDs across 9 evidence files are **Implemented**. 3 SHOULD requirements (`driver_runtime_management`, `gpu_sharing`, `virtualized_accelerator`) are also Implemented. 
diff --git a/docs/conformance/cncf/evidence/accelerator-metrics.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/accelerator-metrics.md similarity index 59% rename from docs/conformance/cncf/evidence/accelerator-metrics.md rename to docs/conformance/cncf/v1.35/nim-eks/evidence/accelerator-metrics.md index 278ad1329..b98f8844d 100644 --- a/docs/conformance/cncf/evidence/accelerator-metrics.md +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/accelerator-metrics.md @@ -1,18 +1,14 @@ -# Accelerator & AI Service Metrics +# Accelerator Metrics (DCGM Exporter) +**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3` +**Generated:** 2026-04-01 23:15:23 UTC **Kubernetes Version:** v1.35 **Platform:** linux/amd64 -**Validated on:** Kubernetes v1.35 clusters with NVIDIA H100 80GB HBM3 -**Generated:** 2026-03-10 03:41:11 UTC --- -Demonstrates two CNCF AI Conformance observability requirements: - -1. **accelerator_metrics** — Fine-grained GPU performance metrics (utilization, memory, - temperature, power) exposed via standardized Prometheus endpoint -2. **ai_service_metrics** — Monitoring system that discovers and collects metrics from - workloads exposing Prometheus exposition format +Demonstrates that the DCGM exporter exposes per-GPU metrics (utilization, memory, +temperature, power) in Prometheus format via a standardized metrics endpoint. 
## Monitoring Stack Health @@ -22,14 +18,14 @@ Demonstrates two CNCF AI Conformance observability requirements: ``` $ kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus NAME READY STATUS RESTARTS AGE -prometheus-kube-prometheus-prometheus-0 2/2 Running 0 18m +prometheus-kube-prometheus-prometheus-0 2/2 Running 0 64m ``` **Prometheus service** ``` $ kubectl get svc kube-prometheus-prometheus -n monitoring -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -kube-prometheus-prometheus ClusterIP 172.20.135.224 9090/TCP,8080/TCP 18m +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +kube-prometheus-prometheus ClusterIP 172.20.72.172 9090/TCP,8080/TCP 64m ``` ### Prometheus Adapter (Custom Metrics API) @@ -38,14 +34,14 @@ kube-prometheus-prometheus ClusterIP 172.20.135.224 9090/TCP ``` $ kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus-adapter NAME READY STATUS RESTARTS AGE -prometheus-adapter-78b8b8d75c-fh4cf 1/1 Running 0 17m +prometheus-adapter-78b8b8d75c-wv9h2 1/1 Running 0 64m ``` **Prometheus adapter service** ``` $ kubectl get svc prometheus-adapter -n monitoring -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -prometheus-adapter ClusterIP 172.20.178.141 443/TCP 17m +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +prometheus-adapter ClusterIP 172.20.38.130 443/TCP 64m ``` ### Grafana @@ -54,7 +50,7 @@ prometheus-adapter ClusterIP 172.20.178.141 443/TCP 17m ``` $ kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana NAME READY STATUS RESTARTS AGE -grafana-56fbffd7d7-r2htr 3/3 Running 0 18m +grafana-56fbffd7d7-8rnr6 3/3 Running 0 64m ``` ## Accelerator Metrics (DCGM Exporter) @@ -68,15 +64,15 @@ temperature, power draw, and more in Prometheus exposition format. 
``` $ kubectl get pods -n gpu-operator -l app=nvidia-dcgm-exporter -o wide NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -nvidia-dcgm-exporter-g2fjs 1/1 Running 0 15m 10.0.247.52 gpu-node-2 -nvidia-dcgm-exporter-wqqqn 1/1 Running 0 15m 10.0.172.246 gpu-node-1 +nvidia-dcgm-exporter-2xrln 1/1 Running 0 62m 10.0.187.45 ip-10-0-180-136.ec2.internal +nvidia-dcgm-exporter-sscnw 1/1 Running 0 62m 10.0.147.205 ip-10-0-251-220.ec2.internal ``` **DCGM exporter service** ``` $ kubectl get svc -n gpu-operator -l app=nvidia-dcgm-exporter NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -nvidia-dcgm-exporter ClusterIP 172.20.181.11 9400/TCP 15m +nvidia-dcgm-exporter ClusterIP 172.20.93.244 9400/TCP 62m ``` ### DCGM Metrics Endpoint @@ -85,36 +81,36 @@ Query DCGM exporter directly to show raw GPU metrics in Prometheus format. **Key GPU metrics from DCGM exporter (sampled)** ``` -DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-c4529c8d-69c4-b61d-e0bc-7b2460096005",pci_bus_id="00000000:53:00.0",device="nvidia0",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08",container="main",namespace="dynamo-workload",pod="vllm-agg-0-vllmdecodeworker-s65j5",pod_uid=""} 30 -DCGM_FI_DEV_GPU_TEMP{gpu="1",UUID="GPU-bc5610b9-79c8-fedd-8899-07539c7f868a",pci_bus_id="00000000:64:00.0",device="nvidia1",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 29 -DCGM_FI_DEV_GPU_TEMP{gpu="2",UUID="GPU-fbc2c554-4d37-8938-0032-f923bad0f716",pci_bus_id="00000000:75:00.0",device="nvidia2",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 26 -DCGM_FI_DEV_GPU_TEMP{gpu="3",UUID="GPU-a65a773d-52bb-bcc1-a8ee-f78c3faa2e2d",pci_bus_id="00000000:86:00.0",device="nvidia3",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 29 
-DCGM_FI_DEV_GPU_TEMP{gpu="4",UUID="GPU-82e45d1b-1618-559f-144c-eab51545030b",pci_bus_id="00000000:97:00.0",device="nvidia4",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 28 -DCGM_FI_DEV_GPU_TEMP{gpu="5",UUID="GPU-39e28159-8c62-ee71-64db-b748edd61e15",pci_bus_id="00000000:A8:00.0",device="nvidia5",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 26 -DCGM_FI_DEV_GPU_TEMP{gpu="6",UUID="GPU-e64d69ca-b4b3-59b2-e78c-94f26c4db365",pci_bus_id="00000000:B9:00.0",device="nvidia6",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 28 -DCGM_FI_DEV_GPU_TEMP{gpu="7",UUID="GPU-04d228d3-3b5a-3534-f5cf-969706647d56",pci_bus_id="00000000:CA:00.0",device="nvidia7",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 26 -DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-c4529c8d-69c4-b61d-e0bc-7b2460096005",pci_bus_id="00000000:53:00.0",device="nvidia0",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08",container="main",namespace="dynamo-workload",pod="vllm-agg-0-vllmdecodeworker-s65j5",pod_uid=""} 113.611000 -DCGM_FI_DEV_POWER_USAGE{gpu="1",UUID="GPU-bc5610b9-79c8-fedd-8899-07539c7f868a",pci_bus_id="00000000:64:00.0",device="nvidia1",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 68.347000 -DCGM_FI_DEV_POWER_USAGE{gpu="2",UUID="GPU-fbc2c554-4d37-8938-0032-f923bad0f716",pci_bus_id="00000000:75:00.0",device="nvidia2",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 65.709000 -DCGM_FI_DEV_POWER_USAGE{gpu="3",UUID="GPU-a65a773d-52bb-bcc1-a8ee-f78c3faa2e2d",pci_bus_id="00000000:86:00.0",device="nvidia3",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 67.316000 
-DCGM_FI_DEV_POWER_USAGE{gpu="4",UUID="GPU-82e45d1b-1618-559f-144c-eab51545030b",pci_bus_id="00000000:97:00.0",device="nvidia4",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 68.717000 -DCGM_FI_DEV_POWER_USAGE{gpu="5",UUID="GPU-39e28159-8c62-ee71-64db-b748edd61e15",pci_bus_id="00000000:A8:00.0",device="nvidia5",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 65.742000 -DCGM_FI_DEV_POWER_USAGE{gpu="6",UUID="GPU-e64d69ca-b4b3-59b2-e78c-94f26c4db365",pci_bus_id="00000000:B9:00.0",device="nvidia6",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 67.328000 -DCGM_FI_DEV_POWER_USAGE{gpu="7",UUID="GPU-04d228d3-3b5a-3534-f5cf-969706647d56",pci_bus_id="00000000:CA:00.0",device="nvidia7",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 66.997000 -DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-c4529c8d-69c4-b61d-e0bc-7b2460096005",pci_bus_id="00000000:53:00.0",device="nvidia0",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08",container="main",namespace="dynamo-workload",pod="vllm-agg-0-vllmdecodeworker-s65j5",pod_uid=""} 0 -DCGM_FI_DEV_GPU_UTIL{gpu="1",UUID="GPU-bc5610b9-79c8-fedd-8899-07539c7f868a",pci_bus_id="00000000:64:00.0",device="nvidia1",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 -DCGM_FI_DEV_GPU_UTIL{gpu="2",UUID="GPU-fbc2c554-4d37-8938-0032-f923bad0f716",pci_bus_id="00000000:75:00.0",device="nvidia2",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 -DCGM_FI_DEV_GPU_UTIL{gpu="3",UUID="GPU-a65a773d-52bb-bcc1-a8ee-f78c3faa2e2d",pci_bus_id="00000000:86:00.0",device="nvidia3",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 
-DCGM_FI_DEV_GPU_UTIL{gpu="4",UUID="GPU-82e45d1b-1618-559f-144c-eab51545030b",pci_bus_id="00000000:97:00.0",device="nvidia4",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 -DCGM_FI_DEV_GPU_UTIL{gpu="5",UUID="GPU-39e28159-8c62-ee71-64db-b748edd61e15",pci_bus_id="00000000:A8:00.0",device="nvidia5",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 -DCGM_FI_DEV_GPU_UTIL{gpu="6",UUID="GPU-e64d69ca-b4b3-59b2-e78c-94f26c4db365",pci_bus_id="00000000:B9:00.0",device="nvidia6",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 -DCGM_FI_DEV_GPU_UTIL{gpu="7",UUID="GPU-04d228d3-3b5a-3534-f5cf-969706647d56",pci_bus_id="00000000:CA:00.0",device="nvidia7",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 -DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-c4529c8d-69c4-b61d-e0bc-7b2460096005",pci_bus_id="00000000:53:00.0",device="nvidia0",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08",container="main",namespace="dynamo-workload",pod="vllm-agg-0-vllmdecodeworker-s65j5",pod_uid=""} 0 -DCGM_FI_DEV_MEM_COPY_UTIL{gpu="1",UUID="GPU-bc5610b9-79c8-fedd-8899-07539c7f868a",pci_bus_id="00000000:64:00.0",device="nvidia1",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 -DCGM_FI_DEV_MEM_COPY_UTIL{gpu="2",UUID="GPU-fbc2c554-4d37-8938-0032-f923bad0f716",pci_bus_id="00000000:75:00.0",device="nvidia2",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 -DCGM_FI_DEV_MEM_COPY_UTIL{gpu="3",UUID="GPU-a65a773d-52bb-bcc1-a8ee-f78c3faa2e2d",pci_bus_id="00000000:86:00.0",device="nvidia3",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 
-DCGM_FI_DEV_MEM_COPY_UTIL{gpu="4",UUID="GPU-82e45d1b-1618-559f-144c-eab51545030b",pci_bus_id="00000000:97:00.0",device="nvidia4",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 -DCGM_FI_DEV_MEM_COPY_UTIL{gpu="5",UUID="GPU-39e28159-8c62-ee71-64db-b748edd61e15",pci_bus_id="00000000:A8:00.0",device="nvidia5",modelName="NVIDIA H100 80GB HBM3",Hostname="gpu-node-1",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-15704b32-f531-14ce-0530-1ac21e4b68e6",pci_bus_id="00000000:53:00.0",device="nvidia0",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 31 +DCGM_FI_DEV_GPU_TEMP{gpu="1",UUID="GPU-edc718f8-e593-6468-b9f9-563d508366ed",pci_bus_id="00000000:64:00.0",device="nvidia1",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 33 +DCGM_FI_DEV_GPU_TEMP{gpu="2",UUID="GPU-e2d9b65e-98cb-5b7a-90f0-e0336573f9e2",pci_bus_id="00000000:75:00.0",device="nvidia2",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 31 +DCGM_FI_DEV_GPU_TEMP{gpu="3",UUID="GPU-3a325419-de5f-778f-cf4e-fe7290362ac5",pci_bus_id="00000000:86:00.0",device="nvidia3",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 34 +DCGM_FI_DEV_GPU_TEMP{gpu="4",UUID="GPU-275ad37d-ebd6-4cf6-3867-0499ba033a12",pci_bus_id="00000000:97:00.0",device="nvidia4",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 34 +DCGM_FI_DEV_GPU_TEMP{gpu="5",UUID="GPU-3cab564d-1f63-674b-a831-024600bf985c",pci_bus_id="00000000:A8:00.0",device="nvidia5",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 32 
+DCGM_FI_DEV_GPU_TEMP{gpu="6",UUID="GPU-d0f25a6f-9a3f-61b9-c128-3d14759651d7",pci_bus_id="00000000:B9:00.0",device="nvidia6",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08",container="llama-3-2-1b-ctr",namespace="nim-workload",pod="llama-3-2-1b-7577f87fc7-dhb97",pod_uid=""} 37 +DCGM_FI_DEV_GPU_TEMP{gpu="7",UUID="GPU-9bc10e9a-e27e-652b-9a1e-e84f7e446206",pci_bus_id="00000000:CA:00.0",device="nvidia7",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 31 +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-15704b32-f531-14ce-0530-1ac21e4b68e6",pci_bus_id="00000000:53:00.0",device="nvidia0",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 67.692000 +DCGM_FI_DEV_POWER_USAGE{gpu="1",UUID="GPU-edc718f8-e593-6468-b9f9-563d508366ed",pci_bus_id="00000000:64:00.0",device="nvidia1",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 67.219000 +DCGM_FI_DEV_POWER_USAGE{gpu="2",UUID="GPU-e2d9b65e-98cb-5b7a-90f0-e0336573f9e2",pci_bus_id="00000000:75:00.0",device="nvidia2",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 67.899000 +DCGM_FI_DEV_POWER_USAGE{gpu="3",UUID="GPU-3a325419-de5f-778f-cf4e-fe7290362ac5",pci_bus_id="00000000:86:00.0",device="nvidia3",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 66.711000 +DCGM_FI_DEV_POWER_USAGE{gpu="4",UUID="GPU-275ad37d-ebd6-4cf6-3867-0499ba033a12",pci_bus_id="00000000:97:00.0",device="nvidia4",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 67.875000 +DCGM_FI_DEV_POWER_USAGE{gpu="5",UUID="GPU-3cab564d-1f63-674b-a831-024600bf985c",pci_bus_id="00000000:A8:00.0",device="nvidia5",modelName="NVIDIA H100 80GB 
HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 67.664000 +DCGM_FI_DEV_POWER_USAGE{gpu="6",UUID="GPU-d0f25a6f-9a3f-61b9-c128-3d14759651d7",pci_bus_id="00000000:B9:00.0",device="nvidia6",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08",container="llama-3-2-1b-ctr",namespace="nim-workload",pod="llama-3-2-1b-7577f87fc7-dhb97",pod_uid=""} 112.670000 +DCGM_FI_DEV_POWER_USAGE{gpu="7",UUID="GPU-9bc10e9a-e27e-652b-9a1e-e84f7e446206",pci_bus_id="00000000:CA:00.0",device="nvidia7",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 65.061000 +DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-15704b32-f531-14ce-0530-1ac21e4b68e6",pci_bus_id="00000000:53:00.0",device="nvidia0",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_GPU_UTIL{gpu="1",UUID="GPU-edc718f8-e593-6468-b9f9-563d508366ed",pci_bus_id="00000000:64:00.0",device="nvidia1",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_GPU_UTIL{gpu="2",UUID="GPU-e2d9b65e-98cb-5b7a-90f0-e0336573f9e2",pci_bus_id="00000000:75:00.0",device="nvidia2",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_GPU_UTIL{gpu="3",UUID="GPU-3a325419-de5f-778f-cf4e-fe7290362ac5",pci_bus_id="00000000:86:00.0",device="nvidia3",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_GPU_UTIL{gpu="4",UUID="GPU-275ad37d-ebd6-4cf6-3867-0499ba033a12",pci_bus_id="00000000:97:00.0",device="nvidia4",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 
+DCGM_FI_DEV_GPU_UTIL{gpu="5",UUID="GPU-3cab564d-1f63-674b-a831-024600bf985c",pci_bus_id="00000000:A8:00.0",device="nvidia5",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_GPU_UTIL{gpu="6",UUID="GPU-d0f25a6f-9a3f-61b9-c128-3d14759651d7",pci_bus_id="00000000:B9:00.0",device="nvidia6",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08",container="llama-3-2-1b-ctr",namespace="nim-workload",pod="llama-3-2-1b-7577f87fc7-dhb97",pod_uid=""} 0 +DCGM_FI_DEV_GPU_UTIL{gpu="7",UUID="GPU-9bc10e9a-e27e-652b-9a1e-e84f7e446206",pci_bus_id="00000000:CA:00.0",device="nvidia7",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-15704b32-f531-14ce-0530-1ac21e4b68e6",pci_bus_id="00000000:53:00.0",device="nvidia0",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="1",UUID="GPU-edc718f8-e593-6468-b9f9-563d508366ed",pci_bus_id="00000000:64:00.0",device="nvidia1",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="2",UUID="GPU-e2d9b65e-98cb-5b7a-90f0-e0336573f9e2",pci_bus_id="00000000:75:00.0",device="nvidia2",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="3",UUID="GPU-3a325419-de5f-778f-cf4e-fe7290362ac5",pci_bus_id="00000000:86:00.0",device="nvidia3",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="4",UUID="GPU-275ad37d-ebd6-4cf6-3867-0499ba033a12",pci_bus_id="00000000:97:00.0",device="nvidia4",modelName="NVIDIA H100 80GB 
HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="5",UUID="GPU-3cab564d-1f63-674b-a831-024600bf985c",pci_bus_id="00000000:A8:00.0",device="nvidia5",modelName="NVIDIA H100 80GB HBM3",Hostname="ip-10-0-180-136.ec2.internal",DCGM_FI_DRIVER_VERSION="580.105.08"} 0 ``` ### Prometheus Querying GPU Metrics @@ -131,368 +127,368 @@ Query Prometheus to verify it is actively scraping and storing DCGM metrics. { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-bc5610b9-79c8-fedd-8899-07539c7f868a", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-15704b32-f531-14ce-0530-1ac21e4b68e6", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", - "device": "nvidia1", + "device": "nvidia0", "endpoint": "gpu-metrics", - "gpu": "1", - "instance": "10.0.172.246:9400", + "gpu": "0", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:64:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:53:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-fbc2c554-4d37-8938-0032-f923bad0f716", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-edc718f8-e593-6468-b9f9-563d508366ed", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", - "device": "nvidia2", + "device": "nvidia1", "endpoint": "gpu-metrics", - "gpu": "2", - "instance": "10.0.172.246:9400", + "gpu": "1", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:75:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:64:00.0", + "pod": 
"nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-a65a773d-52bb-bcc1-a8ee-f78c3faa2e2d", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-e2d9b65e-98cb-5b7a-90f0-e0336573f9e2", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", - "device": "nvidia3", + "device": "nvidia2", "endpoint": "gpu-metrics", - "gpu": "3", - "instance": "10.0.172.246:9400", + "gpu": "2", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:86:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:75:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-82e45d1b-1618-559f-144c-eab51545030b", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-3a325419-de5f-778f-cf4e-fe7290362ac5", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", - "device": "nvidia4", + "device": "nvidia3", "endpoint": "gpu-metrics", - "gpu": "4", - "instance": "10.0.172.246:9400", + "gpu": "3", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:97:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:86:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-39e28159-8c62-ee71-64db-b748edd61e15", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-275ad37d-ebd6-4cf6-3867-0499ba033a12", 
"__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", - "device": "nvidia5", + "device": "nvidia4", "endpoint": "gpu-metrics", - "gpu": "5", - "instance": "10.0.172.246:9400", + "gpu": "4", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:A8:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:97:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-e64d69ca-b4b3-59b2-e78c-94f26c4db365", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-3cab564d-1f63-674b-a831-024600bf985c", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", - "device": "nvidia6", + "device": "nvidia5", "endpoint": "gpu-metrics", - "gpu": "6", - "instance": "10.0.172.246:9400", + "gpu": "5", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:B9:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:A8:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-04d228d3-3b5a-3534-f5cf-969706647d56", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-9bc10e9a-e27e-652b-9a1e-e84f7e446206", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", "device": "nvidia7", "endpoint": "gpu-metrics", "gpu": "7", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:CA:00.0", - "pod": 
"nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-92da0328-2f33-b563-d577-9d2b9f21f280", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-3f048793-8751-030e-5870-ebbd2b10cef2", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", "device": "nvidia0", "endpoint": "gpu-metrics", "gpu": "0", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:53:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-184dab49-47ce-eeec-2239-3e03fbd4c002", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-cc644abe-17e4-7cb7-500d-ed8c09aea2fb", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", "device": "nvidia1", "endpoint": "gpu-metrics", "gpu": "1", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:64:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-dbabb552-a092-0ca9-0580-8d4fe378eb02", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-8d0b1081-9549-2b14-7e01-b4a725873c21", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", "device": "nvidia2", "endpoint": 
"gpu-metrics", "gpu": "2", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:75:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-5342927e-e180-84f1-55ba-257f1cbd3ba4", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-38bbfee9-dc95-ffb5-4034-f9a6c82a45bb", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", "device": "nvidia3", "endpoint": "gpu-metrics", "gpu": "3", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:86:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-95085215-739e-e7c6-4011-8dbe004af8c3", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-24087b69-8889-6b23-feeb-2905664fbcbf", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", "device": "nvidia4", "endpoint": "gpu-metrics", "gpu": "4", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:97:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": 
"GPU-a7b658ad-f23e-cea9-2523-569d521700bf", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-d2f75162-e86d-0da0-0af4-3fa0b80038cd", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", "device": "nvidia5", "endpoint": "gpu-metrics", "gpu": "5", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:A8:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-1e9a0e94-769a-b1e6-36f7-9296e286ef90", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-b00fe5f9-5832-19d6-0276-28d8630f0f4b", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", "device": "nvidia6", "endpoint": "gpu-metrics", "gpu": "6", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:B9:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-16b2cd36-9dbe-3ee7-0810-07b330e36e04", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-530bd4b0-238b-f0c2-b496-63595812bca8", "__name__": "DCGM_FI_DEV_GPU_UTIL", "container": "nvidia-dcgm-exporter", "device": "nvidia7", "endpoint": "gpu-metrics", "gpu": "7", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:CA:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", 
+ "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-c4529c8d-69c4-b61d-e0bc-7b2460096005", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-d0f25a6f-9a3f-61b9-c128-3d14759651d7", "__name__": "DCGM_FI_DEV_GPU_UTIL", - "container": "main", - "device": "nvidia0", + "container": "llama-3-2-1b-ctr", + "device": "nvidia6", "endpoint": "gpu-metrics", - "gpu": "0", - "instance": "10.0.172.246:9400", + "gpu": "6", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", - "namespace": "dynamo-workload", - "pci_bus_id": "00000000:53:00.0", - "pod": "vllm-agg-0-vllmdecodeworker-s65j5", + "namespace": "nim-workload", + "pci_bus_id": "00000000:B9:00.0", + "pod": "llama-3-2-1b-7577f87fc7-dhb97", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.184, + 1775085339.885, "0" ] } @@ -511,369 +507,369 @@ Query Prometheus to verify it is actively scraping and storing DCGM metrics. 
{ "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-bc5610b9-79c8-fedd-8899-07539c7f868a", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-15704b32-f531-14ce-0530-1ac21e4b68e6", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", - "device": "nvidia1", + "device": "nvidia0", "endpoint": "gpu-metrics", - "gpu": "1", - "instance": "10.0.172.246:9400", + "gpu": "0", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:64:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:53:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-fbc2c554-4d37-8938-0032-f923bad0f716", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-edc718f8-e593-6468-b9f9-563d508366ed", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", - "device": "nvidia2", + "device": "nvidia1", "endpoint": "gpu-metrics", - "gpu": "2", - "instance": "10.0.172.246:9400", + "gpu": "1", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:75:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:64:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-a65a773d-52bb-bcc1-a8ee-f78c3faa2e2d", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-e2d9b65e-98cb-5b7a-90f0-e0336573f9e2", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", - "device": "nvidia3", + "device": "nvidia2", 
"endpoint": "gpu-metrics", - "gpu": "3", - "instance": "10.0.172.246:9400", + "gpu": "2", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:86:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:75:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-82e45d1b-1618-559f-144c-eab51545030b", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-3a325419-de5f-778f-cf4e-fe7290362ac5", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", - "device": "nvidia4", + "device": "nvidia3", "endpoint": "gpu-metrics", - "gpu": "4", - "instance": "10.0.172.246:9400", + "gpu": "3", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:97:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:86:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-39e28159-8c62-ee71-64db-b748edd61e15", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-275ad37d-ebd6-4cf6-3867-0499ba033a12", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", - "device": "nvidia5", + "device": "nvidia4", "endpoint": "gpu-metrics", - "gpu": "5", - "instance": "10.0.172.246:9400", + "gpu": "4", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:A8:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:97:00.0", + "pod": 
"nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-e64d69ca-b4b3-59b2-e78c-94f26c4db365", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-3cab564d-1f63-674b-a831-024600bf985c", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", - "device": "nvidia6", + "device": "nvidia5", "endpoint": "gpu-metrics", - "gpu": "6", - "instance": "10.0.172.246:9400", + "gpu": "5", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", - "pci_bus_id": "00000000:B9:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pci_bus_id": "00000000:A8:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-04d228d3-3b5a-3534-f5cf-969706647d56", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-9bc10e9a-e27e-652b-9a1e-e84f7e446206", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", "device": "nvidia7", "endpoint": "gpu-metrics", "gpu": "7", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:CA:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-92da0328-2f33-b563-d577-9d2b9f21f280", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-3f048793-8751-030e-5870-ebbd2b10cef2", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", "device": 
"nvidia0", "endpoint": "gpu-metrics", "gpu": "0", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:53:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-184dab49-47ce-eeec-2239-3e03fbd4c002", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-cc644abe-17e4-7cb7-500d-ed8c09aea2fb", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", "device": "nvidia1", "endpoint": "gpu-metrics", "gpu": "1", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:64:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-dbabb552-a092-0ca9-0580-8d4fe378eb02", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-8d0b1081-9549-2b14-7e01-b4a725873c21", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", "device": "nvidia2", "endpoint": "gpu-metrics", "gpu": "2", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:75:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": 
"GPU-5342927e-e180-84f1-55ba-257f1cbd3ba4", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-38bbfee9-dc95-ffb5-4034-f9a6c82a45bb", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", "device": "nvidia3", "endpoint": "gpu-metrics", "gpu": "3", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:86:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-95085215-739e-e7c6-4011-8dbe004af8c3", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-24087b69-8889-6b23-feeb-2905664fbcbf", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", "device": "nvidia4", "endpoint": "gpu-metrics", "gpu": "4", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:97:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-a7b658ad-f23e-cea9-2523-569d521700bf", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-d2f75162-e86d-0da0-0af4-3fa0b80038cd", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", "device": "nvidia5", "endpoint": "gpu-metrics", "gpu": "5", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:A8:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + 
"pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-1e9a0e94-769a-b1e6-36f7-9296e286ef90", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-b00fe5f9-5832-19d6-0276-28d8630f0f4b", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", "device": "nvidia6", "endpoint": "gpu-metrics", "gpu": "6", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:B9:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-16b2cd36-9dbe-3ee7-0810-07b330e36e04", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-530bd4b0-238b-f0c2-b496-63595812bca8", "__name__": "DCGM_FI_DEV_FB_USED", "container": "nvidia-dcgm-exporter", "device": "nvidia7", "endpoint": "gpu-metrics", "gpu": "7", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:CA:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, + 1775085340.205, "0" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-c4529c8d-69c4-b61d-e0bc-7b2460096005", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-d0f25a6f-9a3f-61b9-c128-3d14759651d7", "__name__": "DCGM_FI_DEV_FB_USED", - "container": "main", - "device": "nvidia0", + "container": "llama-3-2-1b-ctr", + "device": "nvidia6", "endpoint": 
"gpu-metrics", - "gpu": "0", - "instance": "10.0.172.246:9400", + "gpu": "6", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", - "namespace": "dynamo-workload", - "pci_bus_id": "00000000:53:00.0", - "pod": "vllm-agg-0-vllmdecodeworker-s65j5", + "namespace": "nim-workload", + "pci_bus_id": "00000000:B9:00.0", + "pod": "llama-3-2-1b-7577f87fc7-dhb97", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.444, - "74166" + 1775085340.205, + "75050" ] } ] @@ -891,369 +887,369 @@ Query Prometheus to verify it is actively scraping and storing DCGM metrics. { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-bc5610b9-79c8-fedd-8899-07539c7f868a", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-15704b32-f531-14ce-0530-1ac21e4b68e6", + "__name__": "DCGM_FI_DEV_GPU_TEMP", + "container": "nvidia-dcgm-exporter", + "device": "nvidia0", + "endpoint": "gpu-metrics", + "gpu": "0", + "instance": "10.0.187.45:9400", + "job": "nvidia-dcgm-exporter", + "modelName": "NVIDIA H100 80GB HBM3", + "namespace": "gpu-operator", + "pci_bus_id": "00000000:53:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", + "service": "nvidia-dcgm-exporter" + }, + "value": [ + 1775085340.554, + "31" + ] + }, + { + "metric": { + "DCGM_FI_DRIVER_VERSION": "580.105.08", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-edc718f8-e593-6468-b9f9-563d508366ed", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia1", "endpoint": "gpu-metrics", "gpu": "1", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:64:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "29" + 1775085340.554, + "33" ] }, { "metric": { 
"DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-fbc2c554-4d37-8938-0032-f923bad0f716", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-e2d9b65e-98cb-5b7a-90f0-e0336573f9e2", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia2", "endpoint": "gpu-metrics", "gpu": "2", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:75:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "26" + 1775085340.554, + "31" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-a65a773d-52bb-bcc1-a8ee-f78c3faa2e2d", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-3a325419-de5f-778f-cf4e-fe7290362ac5", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia3", "endpoint": "gpu-metrics", "gpu": "3", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:86:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "29" + 1775085340.554, + "34" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-82e45d1b-1618-559f-144c-eab51545030b", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-275ad37d-ebd6-4cf6-3867-0499ba033a12", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia4", "endpoint": "gpu-metrics", "gpu": "4", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", 
"namespace": "gpu-operator", "pci_bus_id": "00000000:97:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "28" + 1775085340.554, + "34" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-39e28159-8c62-ee71-64db-b748edd61e15", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-3cab564d-1f63-674b-a831-024600bf985c", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia5", "endpoint": "gpu-metrics", "gpu": "5", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:A8:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "26" + 1775085340.554, + "32" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-e64d69ca-b4b3-59b2-e78c-94f26c4db365", - "__name__": "DCGM_FI_DEV_GPU_TEMP", - "container": "nvidia-dcgm-exporter", - "device": "nvidia6", - "endpoint": "gpu-metrics", - "gpu": "6", - "instance": "10.0.172.246:9400", - "job": "nvidia-dcgm-exporter", - "modelName": "NVIDIA H100 80GB HBM3", - "namespace": "gpu-operator", - "pci_bus_id": "00000000:B9:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", - "service": "nvidia-dcgm-exporter" - }, - "value": [ - 1773114089.702, - "28" - ] - }, - { - "metric": { - "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-04d228d3-3b5a-3534-f5cf-969706647d56", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-9bc10e9a-e27e-652b-9a1e-e84f7e446206", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia7", "endpoint": "gpu-metrics", "gpu": "7", - "instance": "10.0.172.246:9400", + "instance": 
"10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:CA:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "26" + 1775085340.554, + "31" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-92da0328-2f33-b563-d577-9d2b9f21f280", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-3f048793-8751-030e-5870-ebbd2b10cef2", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia0", "endpoint": "gpu-metrics", "gpu": "0", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:53:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "27" + 1775085340.554, + "31" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-184dab49-47ce-eeec-2239-3e03fbd4c002", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-cc644abe-17e4-7cb7-500d-ed8c09aea2fb", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia1", "endpoint": "gpu-metrics", "gpu": "1", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:64:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "29" + 1775085340.554, + "33" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-dbabb552-a092-0ca9-0580-8d4fe378eb02", + "Hostname": 
"ip-10-0-251-220.ec2.internal", + "UUID": "GPU-8d0b1081-9549-2b14-7e01-b4a725873c21", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia2", "endpoint": "gpu-metrics", "gpu": "2", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:75:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "28" + 1775085340.554, + "31" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-5342927e-e180-84f1-55ba-257f1cbd3ba4", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-38bbfee9-dc95-ffb5-4034-f9a6c82a45bb", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia3", "endpoint": "gpu-metrics", "gpu": "3", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:86:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "29" + 1775085340.554, + "32" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-95085215-739e-e7c6-4011-8dbe004af8c3", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-24087b69-8889-6b23-feeb-2905664fbcbf", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia4", "endpoint": "gpu-metrics", "gpu": "4", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:97:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", 
"service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "29" + 1775085340.554, + "33" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-a7b658ad-f23e-cea9-2523-569d521700bf", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-d2f75162-e86d-0da0-0af4-3fa0b80038cd", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia5", "endpoint": "gpu-metrics", "gpu": "5", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:A8:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "27" + 1775085340.554, + "31" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-1e9a0e94-769a-b1e6-36f7-9296e286ef90", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-b00fe5f9-5832-19d6-0276-28d8630f0f4b", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia6", "endpoint": "gpu-metrics", "gpu": "6", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:B9:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "30" + 1775085340.554, + "32" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-16b2cd36-9dbe-3ee7-0810-07b330e36e04", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-530bd4b0-238b-f0c2-b496-63595812bca8", "__name__": "DCGM_FI_DEV_GPU_TEMP", "container": "nvidia-dcgm-exporter", "device": "nvidia7", "endpoint": "gpu-metrics", "gpu": "7", - "instance": 
"10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:CA:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "27" + 1775085340.554, + "31" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-c4529c8d-69c4-b61d-e0bc-7b2460096005", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-d0f25a6f-9a3f-61b9-c128-3d14759651d7", "__name__": "DCGM_FI_DEV_GPU_TEMP", - "container": "main", - "device": "nvidia0", + "container": "llama-3-2-1b-ctr", + "device": "nvidia6", "endpoint": "gpu-metrics", - "gpu": "0", - "instance": "10.0.172.246:9400", + "gpu": "6", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", - "namespace": "dynamo-workload", - "pci_bus_id": "00000000:53:00.0", - "pod": "vllm-agg-0-vllmdecodeworker-s65j5", + "namespace": "nim-workload", + "pci_bus_id": "00000000:B9:00.0", + "pod": "llama-3-2-1b-7577f87fc7-dhb97", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.702, - "30" + 1775085340.554, + "37" ] } ] @@ -1271,369 +1267,369 @@ Query Prometheus to verify it is actively scraping and storing DCGM metrics. 
{ "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-bc5610b9-79c8-fedd-8899-07539c7f868a", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-15704b32-f531-14ce-0530-1ac21e4b68e6", + "__name__": "DCGM_FI_DEV_POWER_USAGE", + "container": "nvidia-dcgm-exporter", + "device": "nvidia0", + "endpoint": "gpu-metrics", + "gpu": "0", + "instance": "10.0.187.45:9400", + "job": "nvidia-dcgm-exporter", + "modelName": "NVIDIA H100 80GB HBM3", + "namespace": "gpu-operator", + "pci_bus_id": "00000000:53:00.0", + "pod": "nvidia-dcgm-exporter-2xrln", + "service": "nvidia-dcgm-exporter" + }, + "value": [ + 1775085340.891, + "67.692" + ] + }, + { + "metric": { + "DCGM_FI_DRIVER_VERSION": "580.105.08", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-edc718f8-e593-6468-b9f9-563d508366ed", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia1", "endpoint": "gpu-metrics", "gpu": "1", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:64:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "68.347" + 1775085340.891, + "67.219" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-fbc2c554-4d37-8938-0032-f923bad0f716", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-e2d9b65e-98cb-5b7a-90f0-e0336573f9e2", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia2", "endpoint": "gpu-metrics", "gpu": "2", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:75:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": 
"nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "65.709" + 1775085340.891, + "67.899" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-a65a773d-52bb-bcc1-a8ee-f78c3faa2e2d", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-3a325419-de5f-778f-cf4e-fe7290362ac5", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia3", "endpoint": "gpu-metrics", "gpu": "3", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:86:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "67.316" + 1775085340.891, + "66.711" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-82e45d1b-1618-559f-144c-eab51545030b", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-275ad37d-ebd6-4cf6-3867-0499ba033a12", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia4", "endpoint": "gpu-metrics", "gpu": "4", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:97:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "68.717" + 1775085340.891, + "67.875" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-39e28159-8c62-ee71-64db-b748edd61e15", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-3cab564d-1f63-674b-a831-024600bf985c", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia5", 
"endpoint": "gpu-metrics", "gpu": "5", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:A8:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", - "service": "nvidia-dcgm-exporter" - }, - "value": [ - 1773114089.943, - "65.742" - ] - }, - { - "metric": { - "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-e64d69ca-b4b3-59b2-e78c-94f26c4db365", - "__name__": "DCGM_FI_DEV_POWER_USAGE", - "container": "nvidia-dcgm-exporter", - "device": "nvidia6", - "endpoint": "gpu-metrics", - "gpu": "6", - "instance": "10.0.172.246:9400", - "job": "nvidia-dcgm-exporter", - "modelName": "NVIDIA H100 80GB HBM3", - "namespace": "gpu-operator", - "pci_bus_id": "00000000:B9:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "67.328" + 1775085340.891, + "67.664" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - "UUID": "GPU-04d228d3-3b5a-3534-f5cf-969706647d56", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-9bc10e9a-e27e-652b-9a1e-e84f7e446206", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia7", "endpoint": "gpu-metrics", "gpu": "7", - "instance": "10.0.172.246:9400", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:CA:00.0", - "pod": "nvidia-dcgm-exporter-wqqqn", + "pod": "nvidia-dcgm-exporter-2xrln", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "66.997" + 1775085340.891, + "65.061" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-92da0328-2f33-b563-d577-9d2b9f21f280", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": 
"GPU-3f048793-8751-030e-5870-ebbd2b10cef2", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia0", "endpoint": "gpu-metrics", "gpu": "0", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:53:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "69.339" + 1775085340.891, + "68.284" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-184dab49-47ce-eeec-2239-3e03fbd4c002", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-cc644abe-17e4-7cb7-500d-ed8c09aea2fb", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia1", "endpoint": "gpu-metrics", "gpu": "1", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:64:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "68.754" + 1775085340.891, + "70.963" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-dbabb552-a092-0ca9-0580-8d4fe378eb02", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-8d0b1081-9549-2b14-7e01-b4a725873c21", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia2", "endpoint": "gpu-metrics", "gpu": "2", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:75:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": 
"nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "68.61" + 1775085340.891, + "67.535" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-5342927e-e180-84f1-55ba-257f1cbd3ba4", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-38bbfee9-dc95-ffb5-4034-f9a6c82a45bb", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia3", "endpoint": "gpu-metrics", "gpu": "3", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:86:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "66.499" + 1775085340.891, + "68.419" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-95085215-739e-e7c6-4011-8dbe004af8c3", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-24087b69-8889-6b23-feeb-2905664fbcbf", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia4", "endpoint": "gpu-metrics", "gpu": "4", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:97:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "67.645" + 1775085340.891, + "69.498" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-a7b658ad-f23e-cea9-2523-569d521700bf", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-d2f75162-e86d-0da0-0af4-3fa0b80038cd", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia5", "endpoint": "gpu-metrics", "gpu": "5", - 
"instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:A8:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "66.68" + 1775085340.891, + "69.66" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-1e9a0e94-769a-b1e6-36f7-9296e286ef90", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-b00fe5f9-5832-19d6-0276-28d8630f0f4b", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia6", "endpoint": "gpu-metrics", "gpu": "6", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:B9:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "68.395" + 1775085340.891, + "66.98" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-2", - "UUID": "GPU-16b2cd36-9dbe-3ee7-0810-07b330e36e04", + "Hostname": "ip-10-0-251-220.ec2.internal", + "UUID": "GPU-530bd4b0-238b-f0c2-b496-63595812bca8", "__name__": "DCGM_FI_DEV_POWER_USAGE", "container": "nvidia-dcgm-exporter", "device": "nvidia7", "endpoint": "gpu-metrics", "gpu": "7", - "instance": "10.0.247.52:9400", + "instance": "10.0.147.205:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", "namespace": "gpu-operator", "pci_bus_id": "00000000:CA:00.0", - "pod": "nvidia-dcgm-exporter-g2fjs", + "pod": "nvidia-dcgm-exporter-sscnw", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "69.523" + 1775085340.891, + "68.367" ] }, { "metric": { "DCGM_FI_DRIVER_VERSION": "580.105.08", - "Hostname": "gpu-node-1", - 
"UUID": "GPU-c4529c8d-69c4-b61d-e0bc-7b2460096005", + "Hostname": "ip-10-0-180-136.ec2.internal", + "UUID": "GPU-d0f25a6f-9a3f-61b9-c128-3d14759651d7", "__name__": "DCGM_FI_DEV_POWER_USAGE", - "container": "main", - "device": "nvidia0", + "container": "llama-3-2-1b-ctr", + "device": "nvidia6", "endpoint": "gpu-metrics", - "gpu": "0", - "instance": "10.0.172.246:9400", + "gpu": "6", + "instance": "10.0.187.45:9400", "job": "nvidia-dcgm-exporter", "modelName": "NVIDIA H100 80GB HBM3", - "namespace": "dynamo-workload", - "pci_bus_id": "00000000:53:00.0", - "pod": "vllm-agg-0-vllmdecodeworker-s65j5", + "namespace": "nim-workload", + "pci_bus_id": "00000000:B9:00.0", + "pod": "llama-3-2-1b-7577f87fc7-dhb97", "service": "nvidia-dcgm-exporter" }, "value": [ - 1773114089.943, - "113.611" + 1775085340.891, + "112.67" ] } ] @@ -1641,20 +1637,4 @@ Query Prometheus to verify it is actively scraping and storing DCGM metrics. } ``` -## AI Service Metrics (Custom Metrics API) - -Prometheus adapter exposes custom metrics via the Kubernetes custom metrics API, -enabling HPA and other consumers to act on workload-specific metrics. - -**Custom metrics API available resources** -``` -$ kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | python3 -c "..." # extract resource names -namespaces/gpu_utilization -pods/gpu_utilization -namespaces/gpu_memory_used -pods/gpu_memory_used -namespaces/gpu_power_usage -pods/gpu_power_usage -``` - -**Result: PASS** — DCGM exporter provides per-GPU metrics (utilization, memory, temperature, power). Prometheus actively scrapes and stores metrics. Custom metrics API available via prometheus-adapter. +**Result: PASS** — DCGM exporter provides per-GPU metrics (utilization, memory, temperature, power). Prometheus actively scrapes and stores metrics. 
diff --git a/docs/conformance/cncf/v1.35/nim-eks/evidence/ai-service-metrics.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/ai-service-metrics.md new file mode 100644 index 000000000..855926886 --- /dev/null +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/ai-service-metrics.md @@ -0,0 +1,114 @@ +# AI Service Metrics (NIM Inference) + +**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3` +**Generated:** 2026-04-01 23:15:43 UTC +**Kubernetes Version:** v1.35 +**Platform:** linux/amd64 + +--- + +Demonstrates that NVIDIA NIM inference microservices expose Prometheus-format +metrics that can be discovered and collected by the monitoring stack. + +## NIM Inference Workload + +**NIMService** +``` +$ kubectl get nimservice -n nim-workload +NAME STATUS AGE +llama-3-2-1b Ready 58m +``` + +**NIM workload pods** +``` +$ kubectl get pods -n nim-workload -o wide +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +llama-3-2-1b-7577f87fc7-dhb97 1/1 Running 0 58m 10.0.158.63 ip-10-0-180-136.ec2.internal +``` + +**NIM models endpoint** +``` +Model: meta/llama-3.2-1b-instruct +``` + +**NIM inference metrics endpoint (sampled after generating inference traffic)** +``` +num_requests_waiting{model_name="meta/llama-3.2-1b-instruct"} 1.0 +num_request_max{model_name="meta/llama-3.2-1b-instruct"} 2048.0 +prompt_tokens_total{model_name="meta/llama-3.2-1b-instruct"} 603.0 +generation_tokens_total{model_name="meta/llama-3.2-1b-instruct"} 997.0 +time_to_first_token_seconds_count{model_name="meta/llama-3.2-1b-instruct"} 34.0 +time_to_first_token_seconds_sum{model_name="meta/llama-3.2-1b-instruct"} 3.781902551651001 +time_per_output_token_seconds_count{model_name="meta/llama-3.2-1b-instruct"} 963.0 +time_per_output_token_seconds_sum{model_name="meta/llama-3.2-1b-instruct"} 1.705470085144043 +e2e_request_latency_seconds_count{model_name="meta/llama-3.2-1b-instruct"} 34.0 +e2e_request_latency_seconds_sum{model_name="meta/llama-3.2-1b-instruct"} 5.490677356719971 
+request_prompt_tokens_count{model_name="meta/llama-3.2-1b-instruct"} 34.0 +request_prompt_tokens_sum{model_name="meta/llama-3.2-1b-instruct"} 603.0 +request_generation_tokens_count{model_name="meta/llama-3.2-1b-instruct"} 34.0 +request_generation_tokens_sum{model_name="meta/llama-3.2-1b-instruct"} 997.0 +request_success_total{model_name="meta/llama-3.2-1b-instruct"} 34.0 +``` + +## Prometheus Metrics Discovery + +A ServiceMonitor is created to enable Prometheus auto-discovery of NIM inference +metrics. NIM exposes metrics at `/v1/metrics` in Prometheus exposition format. + +**NIM ServiceMonitor** +``` +$ kubectl get servicemonitor nim-inference -n monitoring -o yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + annotations: + kubectl.kubernetes.io/last-applied-configuration: | + {"apiVersion":"monitoring.coreos.com/v1","kind":"ServiceMonitor","metadata":{"annotations":{},"labels":{"release":"kube-prometheus"},"name":"nim-inference","namespace":"monitoring"},"spec":{"endpoints":[{"interval":"15s","path":"/v1/metrics","port":"api"}],"namespaceSelector":{"matchNames":["nim-workload"]},"selector":{"matchLabels":{"app.kubernetes.io/managed-by":"k8s-nim-operator"}}}} + creationTimestamp: "2026-04-01T23:16:15Z" + generation: 1 + labels: + release: kube-prometheus + name: nim-inference + namespace: monitoring + resourceVersion: "102073064" + uid: e29b3536-c76d-410c-a236-a3ac5d745822 +spec: + endpoints: + - interval: 15s + path: /v1/metrics + port: api + namespaceSelector: + matchNames: + - nim-workload + selector: + matchLabels: + app.kubernetes.io/managed-by: k8s-nim-operator +``` + +**Prometheus scrape targets (active)** +``` +{ + "job": "llama-3-2-1b", + "endpoint": "http://10.0.158.63:8000/v1/metrics", + "health": "up", + "lastScrape": "2026-04-01T23:18:42.378844773Z" +} +``` + +**NIM metrics queried from Prometheus** +``` +prompt_tokens_total{model_name="meta/llama-3.2-1b-instruct"} = 603 
+generation_tokens_total{model_name="meta/llama-3.2-1b-instruct"} = 997 +time_to_first_token_seconds_sum{model_name="meta/llama-3.2-1b-instruct"} = 3.781902551651001 +time_per_output_token_seconds_sum{model_name="meta/llama-3.2-1b-instruct"} = 1.705470085144043 +e2e_request_latency_seconds_sum{model_name="meta/llama-3.2-1b-instruct"} = 5.490677356719971 +``` + +**Result: PASS** — Prometheus discovers NIM inference workloads via ServiceMonitor and actively scrapes application-level AI inference metrics (token throughput, request latency, time-to-first-token) from the /v1/metrics endpoint. + +## Cleanup + +**Delete workload namespace** +``` +$ kubectl delete ns nim-workload +``` diff --git a/docs/conformance/cncf/evidence/cluster-autoscaling.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/cluster-autoscaling.md similarity index 54% rename from docs/conformance/cncf/evidence/cluster-autoscaling.md rename to docs/conformance/cncf/v1.35/nim-eks/evidence/cluster-autoscaling.md index 4f71c4b8f..a00bc7d74 100644 --- a/docs/conformance/cncf/evidence/cluster-autoscaling.md +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/cluster-autoscaling.md @@ -1,49 +1,48 @@ # Cluster Autoscaling +**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3` +**Generated:** 2026-04-01 23:20:45 UTC **Kubernetes Version:** v1.35 **Platform:** linux/amd64 -**Validated on:** EKS (p5.48xlarge, 8x H100) and GKE (a3-megagpu-8g, 8x H100) --- Demonstrates CNCF AI Conformance requirement that the platform has GPU-aware -cluster autoscaling infrastructure configured, capable of scaling GPU node -groups based on workload demand. +cluster autoscaling infrastructure configured, with Auto Scaling Groups capable +of scaling GPU node groups based on workload demand. 
## Summary -| Platform | Autoscaler | GPU Instances | Nodes | Result | -|----------|-----------|---------------|-------|--------| -| **EKS** | AWS Auto Scaling Group | p5.48xlarge (8x H100) | 2 | **PASS** | -| **GKE** | GKE built-in cluster autoscaler | a3-megagpu-8g (8x H100) | 2 | **PASS** | +1. **GPU Node Group (ASG)** — EKS Auto Scaling Group configured with GPU instances +2. **Capacity Reservation** — Dedicated GPU capacity available for scale-up +3. **Scalable Configuration** — ASG min/max configurable for demand-based scaling +4. **Kubernetes Integration** — ASG nodes auto-join the EKS cluster with GPU labels +5. **Autoscaler Compatibility** — Cluster Autoscaler supported via ASG tag discovery --- -## EKS: Auto Scaling Groups - -**Generated:** 2026-03-10 03:44:07 UTC +## GPU Node Auto Scaling Group The cluster uses an AWS Auto Scaling Group (ASG) for GPU nodes, which can scale -up/down based on workload demand. The ASG is configured with p5.48xlarge instances -(8x NVIDIA H100 80GB HBM3 each) backed by a capacity reservation. +up/down based on workload demand. 
-### EKS Cluster Details +## EKS Cluster Details - **Region:** us-east-1 - **Cluster:** aws-us-east-1-aicr-cuj2 - **GPU Node Group:** gpu-worker -### GPU Nodes +## GPU Nodes **GPU nodes** ``` $ kubectl get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,GPUS:.metadata.labels.nvidia\.com/gpu\.count,PRODUCT:.metadata.labels.nvidia\.com/gpu\.product,NODE-GROUP:.metadata.labels.nodeGroup,ZONE:.metadata.labels.topology\.kubernetes\.io/zone NAME INSTANCE-TYPE GPUS PRODUCT NODE-GROUP ZONE -ip-10-0-171-111.ec2.internal p5.48xlarge 8 NVIDIA-H100-80GB-HBM3 gpu-worker us-east-1e -ip-10-0-206-2.ec2.internal p5.48xlarge 8 NVIDIA-H100-80GB-HBM3 gpu-worker us-east-1e +ip-10-0-180-136.ec2.internal p5.48xlarge 8 NVIDIA-H100-80GB-HBM3 gpu-worker us-east-1e +ip-10-0-251-220.ec2.internal p5.48xlarge 8 NVIDIA-H100-80GB-HBM3 gpu-worker us-east-1e ``` -### Auto Scaling Group (AWS) +## Auto Scaling Group (AWS) **GPU ASG details** ``` @@ -65,7 +64,7 @@ $ aws autoscaling describe-auto-scaling-groups --region us-east-1 --auto-scaling **GPU launch template** ``` -$ aws ec2 describe-launch-template-versions --region us-east-1 --launch-template-id lt-038186420dd139467 --versions $Latest --query LaunchTemplateVersions[0].LaunchTemplateData.{InstanceType:InstanceType,ImageId:ImageId} --output table +$ aws ec2 describe-launch-template-versions --region us-east-1 --launch-template-id lt-043af36be99f4f76b --versions $Latest --query LaunchTemplateVersions[0].LaunchTemplateData.{InstanceType:InstanceType,ImageId:ImageId} --output table ------------------------------------------- | DescribeLaunchTemplateVersions | +------------------------+----------------+ @@ -91,7 +90,7 @@ $ aws autoscaling describe-tags --region us-east-1 --filters Name=auto-scaling-g +--------------------------------------+------------------------+ ``` -### Capacity Reservation +## Capacity Reservation **GPU capacity reservation** ``` @@ 
-100,7 +99,7 @@ $ aws ec2 describe-capacity-reservations --region us-east-1 --query CapacityRese | DescribeCapacityReservations | +------------+------------------------+ | AZ | us-east-1e | -| Available | 2 | +| Available | 1 | | ID | cr-0cbe491320188dfa6 | | State | active | | Total | 10 | @@ -108,85 +107,4 @@ $ aws ec2 describe-capacity-reservations --region us-east-1 --query CapacityRese +------------+------------------------+ ``` -**Result: PASS** — EKS cluster with GPU nodes managed by Auto Scaling Group, ASG configuration verified via AWS API. - ---- - -## GKE: Built-in Cluster Autoscaler - -**Generated:** 2026-03-16 21:50:46 UTC - -GKE includes a built-in cluster autoscaler that manages node pool scaling based -on workload demand. The autoscaler is configured per node pool. - -### GKE Cluster Details - -- **Project:** eidosx -- **Zone:** us-central1-c - -### GPU Nodes - -**GPU nodes** -``` -$ kubectl get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,GPUS:.status.capacity.nvidia\.com/gpu,ACCELERATOR:.metadata.labels.cloud\.google\.com/gke-accelerator,NODE-POOL:.metadata.labels.cloud\.google\.com/gke-nodepool -NAME INSTANCE-TYPE GPUS ACCELERATOR NODE-POOL -gke-aicr-demo2-aicr-demo2-gpu-worker-8de6040c-h2d0 a3-megagpu-8g 8 nvidia-h100-mega-80gb aicr-demo2-gpu-worker -gke-aicr-demo2-aicr-demo2-gpu-worker-8de6040c-t81x a3-megagpu-8g 8 nvidia-h100-mega-80gb aicr-demo2-gpu-worker -``` - -### GKE Cluster Autoscaler Status - -**Cluster Autoscaler Status** -``` -autoscalerStatus: Running -clusterWide: - health: - lastProbeTime: "2026-03-16T21:50:43Z" - lastTransitionTime: "2026-03-12T21:28:08Z" - nodeCounts: - registered: - ready: 6 - total: 6 - status: Healthy - scaleDown: - status: NoCandidates - scaleUp: - status: NoActivity -nodeGroups: -- health: - cloudProviderTarget: 1 - maxSize: 1 - minSize: 1 - status: Healthy - name: 
.../gke-aicr-demo2-aicr-demo2-cpu-worker-cd95cf64-grp -- health: - cloudProviderTarget: 2 - maxSize: 2 - minSize: 2 - status: Healthy - name: .../gke-aicr-demo2-aicr-demo2-gpu-worker-8de6040c-grp -- health: - cloudProviderTarget: 1 - maxSize: 3 - minSize: 1 - status: Healthy - name: .../gke-aicr-demo2-aicr-demo2-system-f5af1da6-grp -- health: - cloudProviderTarget: 1 - maxSize: 3 - minSize: 1 - status: Healthy - name: .../gke-aicr-demo2-aicr-demo2-system-358b1ae8-grp -- health: - cloudProviderTarget: 1 - maxSize: 3 - minSize: 1 - status: Healthy - name: .../gke-aicr-demo2-aicr-demo2-system-b313be0b-grp -``` - -**Result: PASS** — GKE cluster with 2 GPU nodes and built-in cluster autoscaler active, all node groups healthy. - ---- - -Evidence is configuration-level; a live scale event is not triggered to avoid disrupting the cluster. +**Result: PASS** — EKS cluster with GPU nodes managed by Auto Scaling Group, ASG configuration verified via AWS API. Evidence is configuration-level; a live scale event is not triggered to avoid disrupting the cluster. 
diff --git a/docs/conformance/cncf/evidence/dra-support.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/dra-support.md similarity index 70% rename from docs/conformance/cncf/evidence/dra-support.md rename to docs/conformance/cncf/v1.35/nim-eks/evidence/dra-support.md index 38993b745..1d5b9f724 100644 --- a/docs/conformance/cncf/evidence/dra-support.md +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/dra-support.md @@ -1,9 +1,9 @@ # DRA Support (Dynamic Resource Allocation) +**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3` +**Generated:** 2026-04-01 23:13:30 UTC **Kubernetes Version:** v1.35 **Platform:** linux/amd64 -**Validated on:** Kubernetes v1.35 clusters with NVIDIA H100 80GB HBM3 -**Generated:** 2026-03-10 03:39:16 UTC --- @@ -29,11 +29,11 @@ resourceslices resource.k8s.io/v1 false Resource ``` $ kubectl get deviceclass NAME AGE -compute-domain-daemon.nvidia.com 10m -compute-domain-default-channel.nvidia.com 10m -gpu.nvidia.com 10m -mig.nvidia.com 10m -vfio.gpu.nvidia.com 10m +compute-domain-daemon.nvidia.com 58m +compute-domain-default-channel.nvidia.com 58m +gpu.nvidia.com 58m +mig.nvidia.com 58m +vfio.gpu.nvidia.com 58m ``` ## DRA Driver Health @@ -41,10 +41,10 @@ vfio.gpu.nvidia.com 10m **DRA driver pods** ``` $ kubectl get pods -n nvidia-dra-driver -o wide -NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -nvidia-dra-driver-gpu-controller-68966c79bb-zj7lf 1/1 Running 0 10m 10.0.4.122 system-node-1 -nvidia-dra-driver-gpu-kubelet-plugin-4kfhk 2/2 Running 0 9m54s 10.0.143.178 gpu-node-1 -nvidia-dra-driver-gpu-kubelet-plugin-grg2l 2/2 Running 0 9m54s 10.0.216.98 gpu-node-2 +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +nvidia-dra-driver-gpu-controller-68966c79bb-xvh7f 1/1 Running 0 58m 10.0.7.228 ip-10-0-6-154.ec2.internal +nvidia-dra-driver-gpu-kubelet-plugin-px7p8 2/2 Running 0 58m 10.0.136.3 ip-10-0-251-220.ec2.internal +nvidia-dra-driver-gpu-kubelet-plugin-smkl9 2/2 Running 0 58m 10.0.136.235 
ip-10-0-180-136.ec2.internal ``` ## Device Advertisement (ResourceSlices) @@ -53,10 +53,10 @@ nvidia-dra-driver-gpu-kubelet-plugin-grg2l 2/2 Running 0 ``` $ kubectl get resourceslices NAME NODE DRIVER POOL AGE -gpu-node-1-compute-domain.nvidia.com-q9xqc gpu-node-1 compute-domain.nvidia.com gpu-node-1 10m -gpu-node-1-gpu.nvidia.com-7cbz2 gpu-node-1 gpu.nvidia.com gpu-node-1 10m -gpu-node-2-compute-domain.nvidia.com-2n2cq gpu-node-2 compute-domain.nvidia.com gpu-node-2 10m -gpu-node-2-gpu.nvidia.com-79gvw gpu-node-2 gpu.nvidia.com gpu-node-2 10m +ip-10-0-180-136.ec2.internal-compute-domain.nvidia.com-kfxd7 ip-10-0-180-136.ec2.internal compute-domain.nvidia.com ip-10-0-180-136.ec2.internal 58m +ip-10-0-180-136.ec2.internal-gpu.nvidia.com-8w29z ip-10-0-180-136.ec2.internal gpu.nvidia.com ip-10-0-180-136.ec2.internal 58m +ip-10-0-251-220.ec2.internal-compute-domain.nvidia.com-btqsj ip-10-0-251-220.ec2.internal compute-domain.nvidia.com ip-10-0-251-220.ec2.internal 58m +ip-10-0-251-220.ec2.internal-gpu.nvidia.com-qwdqr ip-10-0-251-220.ec2.internal gpu.nvidia.com ip-10-0-251-220.ec2.internal 58m ``` ## GPU Allocation Test @@ -140,7 +140,7 @@ pod/dra-gpu-test created ``` $ kubectl get resourceclaim -n dra-test -o wide NAME STATE AGE -gpu-claim pending 11s +gpu-claim pending 10s ``` > **Note:** ResourceClaim shows `pending` because the DRA controller deallocates the claim after pod completion. The pod logs below confirm the GPU was successfully allocated and visible during execution. 
@@ -148,8 +148,8 @@ gpu-claim pending 11s **Pod status** ``` $ kubectl get pod dra-gpu-test -n dra-test -o wide -NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -dra-gpu-test 0/1 Completed 0 13s 10.0.177.19 gpu-node-2 +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +dra-gpu-test 0/1 Completed 0 12s 10.0.142.150 ip-10-0-251-220.ec2.internal ``` **Pod logs** @@ -158,7 +158,7 @@ $ kubectl logs dra-gpu-test -n dra-test /dev/nvidia-modeset /dev/nvidia-uvm /dev/nvidia-uvm-tools -/dev/nvidia2 +/dev/nvidia7 /dev/nvidiactl DRA GPU allocation successful ``` diff --git a/docs/conformance/cncf/evidence/gang-scheduling.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/gang-scheduling.md similarity index 82% rename from docs/conformance/cncf/evidence/gang-scheduling.md rename to docs/conformance/cncf/v1.35/nim-eks/evidence/gang-scheduling.md index 53a00fa9e..f1e8888e9 100644 --- a/docs/conformance/cncf/evidence/gang-scheduling.md +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/gang-scheduling.md @@ -1,7 +1,7 @@ # Gang Scheduling (KAI Scheduler) **Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3` -**Generated:** 2026-03-20 20:09:13 UTC +**Generated:** 2026-04-01 23:14:07 UTC **Kubernetes Version:** v1.35 **Platform:** linux/amd64 @@ -16,26 +16,26 @@ scheduler with PodGroups. 
Both pods in the group must be scheduled together or n ``` $ kubectl get deploy -n kai-scheduler NAME READY UP-TO-DATE AVAILABLE AGE -admission 1/1 1 1 20m -binder 1/1 1 1 20m -kai-operator 1/1 1 1 20m -kai-scheduler-default 1/1 1 1 6d22h -pod-grouper 1/1 1 1 20m -podgroup-controller 1/1 1 1 20m -queue-controller 1/1 1 1 20m +admission 1/1 1 1 59m +binder 1/1 1 1 59m +kai-operator 1/1 1 1 59m +kai-scheduler-default 1/1 1 1 59m +pod-grouper 1/1 1 1 59m +podgroup-controller 1/1 1 1 59m +queue-controller 1/1 1 1 59m ``` **KAI scheduler pods** ``` $ kubectl get pods -n kai-scheduler NAME READY STATUS RESTARTS AGE -admission-6d48656c78-vsf22 1/1 Running 0 20m -binder-8cfb98496-79hwx 1/1 Running 0 20m -kai-operator-558c46545b-tth97 1/1 Running 0 20m -kai-scheduler-default-7945d65d9c-5w4bb 1/1 Running 0 20m -pod-grouper-7bd4c7488c-wlfds 1/1 Running 0 20m -podgroup-controller-798798fb5f-mjht6 1/1 Running 0 20m -queue-controller-5b45bb74c9-b75vg 1/1 Running 0 20m +admission-6d48656c78-wshnq 1/1 Running 0 59m +binder-8cfb98496-sdg2h 1/1 Running 0 59m +kai-operator-558c46545b-qz2rx 1/1 Running 0 59m +kai-scheduler-default-57bdcb878c-fpkl2 1/1 Running 0 59m +pod-grouper-7bd4c7488c-mpbsh 1/1 Running 0 59m +podgroup-controller-798798fb5f-pjwkm 1/1 Running 0 59m +queue-controller-5b45bb74c9-knjc9 1/1 Running 0 59m ``` ## PodGroup CRD @@ -44,7 +44,7 @@ queue-controller-5b45bb74c9-b75vg 1/1 Running 0 20m ``` $ kubectl get crd podgroups.scheduling.run.ai NAME CREATED AT -podgroups.scheduling.run.ai 2026-03-10T20:53:06Z +podgroups.scheduling.run.ai 2026-04-01T22:13:48Z ``` ## Gang Scheduling Test @@ -195,23 +195,23 @@ pod/gang-worker-1 created ``` $ kubectl get podgroups -n gang-scheduling-test -o wide NAME AGE -gang-test-group 12s -pg-gang-worker-0-0f1259e1-c344-4964-a1fb-b1ae14e25859 10s -pg-gang-worker-1-af882f6e-316a-49b2-95f6-189b1a20b5c3 10s +gang-test-group 13s +pg-gang-worker-0-bb3f5b6f-080d-4cf3-8625-8be214e2032b 11s +pg-gang-worker-1-f9c72e1a-f7e9-427f-8127-42bb50491402 11s 
``` **Pod status** ``` $ kubectl get pods -n gang-scheduling-test -o wide -NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -gang-worker-0 0/1 Completed 0 13s 10.0.214.229 ip-10-0-180-136.ec2.internal -gang-worker-1 0/1 Completed 0 13s 10.0.238.183 ip-10-0-180-136.ec2.internal +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +gang-worker-0 0/1 Completed 0 13s 10.0.190.56 ip-10-0-180-136.ec2.internal +gang-worker-1 0/1 Completed 0 13s 10.0.153.74 ip-10-0-180-136.ec2.internal ``` **gang-worker-0 logs** ``` $ kubectl logs gang-worker-0 -n gang-scheduling-test -Fri Mar 20 20:09:24 2026 +Wed Apr 1 23:14:19 2026 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ @@ -219,8 +219,8 @@ Fri Mar 20 20:09:24 2026 | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+========================+======================| -| 0 NVIDIA H100 80GB HBM3 On | 00000000:86:00.0 Off | 0 | -| N/A 32C P0 66W / 700W | 0MiB / 81559MiB | 0% Default | +| 0 NVIDIA H100 80GB HBM3 On | 00000000:53:00.0 Off | 0 | +| N/A 31C P0 67W / 700W | 0MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ @@ -237,7 +237,7 @@ Gang worker 0 completed successfully **gang-worker-1 logs** ``` $ kubectl logs gang-worker-1 -n gang-scheduling-test -Fri Mar 20 20:09:24 2026 +Wed Apr 1 23:14:19 2026 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ @@ -245,7 +245,7 @@ Fri Mar 20 20:09:24 2026 | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+========================+======================| -| 0 NVIDIA H100 80GB HBM3 On | 00000000:97:00.0 Off | 0 | +| 0 NVIDIA H100 80GB HBM3 On | 00000000:64:00.0 Off | 0 | | N/A 33C P0 67W / 700W | 0MiB / 81559MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ diff --git a/docs/conformance/cncf/evidence/index.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/index.md similarity index 54% rename from docs/conformance/cncf/evidence/index.md rename to docs/conformance/cncf/v1.35/nim-eks/evidence/index.md index 8334ae517..782a73bff 100644 --- a/docs/conformance/cncf/evidence/index.md +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/index.md @@ -2,12 +2,13 @@ **Kubernetes Version:** v1.35 **Platform:** linux/amd64 -**Product:** Kubernetes clusters with NVIDIA AI Cluster Runtime (AICR) +**Product:** [NVIDIA NIM](https://developer.nvidia.com/nim) on EKS — A Kubernetes-based AI inference platform that deploys and manages NVIDIA NIM microservices on Amazon EKS with GPU scheduling, autoscaling, and Gateway API integration. +**Validation Tooling:** NVIDIA AI Cluster Runtime (AICR) -AICR deploys the runtime components (GPU Operator, KAI Scheduler, DCGM Exporter, -kgateway, Kubeflow Trainer, Dynamo, etc.) that make a Kubernetes cluster AI conformant. -Evidence was collected on AICR-enabled Kubernetes v1.35 clusters with NVIDIA H100 80GB HBM3 accelerators. -Cluster autoscaling evidence covers the underlying platform's node group scaling mechanism. +AICR deploys the runtime components (GPU Operator, NIM Operator, KAI Scheduler, +DCGM Exporter, kgateway, etc.) and validates that the platform meets CNCF AI +Conformance requirements. Evidence was collected on an EKS v1.35 cluster with +NVIDIA H100 80GB HBM3 accelerators running NIM inference workloads. 
## Results @@ -17,8 +18,8 @@ Cluster autoscaling evidence covers the underlying platform's node group scaling | 2 | `gang_scheduling` | Gang Scheduling (KAI Scheduler) | PASS | [gang-scheduling.md](gang-scheduling.md) | | 3 | `secure_accelerator_access` | Secure Accelerator Access | PASS | [secure-accelerator-access.md](secure-accelerator-access.md) | | 4 | `accelerator_metrics` | Accelerator Metrics (DCGM Exporter) | PASS | [accelerator-metrics.md](accelerator-metrics.md) | -| 5 | `ai_service_metrics` | AI Service Metrics (Prometheus ServiceMonitor) | PASS | [ai-service-metrics.md](ai-service-metrics.md) | +| 5 | `ai_service_metrics` | AI Service Metrics (NIM Inference) | PASS | [ai-service-metrics.md](ai-service-metrics.md) | | 6 | `ai_inference` | Inference API Gateway (kgateway) | PASS | [inference-gateway.md](inference-gateway.md) | -| 7 | `robust_controller` | Robust AI Operator (Dynamo + Kubeflow Trainer) | PASS | [robust-operator.md](robust-operator.md) | +| 7 | `robust_controller` | Robust AI Operator (NIM Operator) | PASS | [robust-operator.md](robust-operator.md) | | 8 | `pod_autoscaling` | Pod Autoscaling (HPA + GPU metrics) | PASS | [pod-autoscaling.md](pod-autoscaling.md) | | 9 | `cluster_autoscaling` | Cluster Autoscaling | PASS | [cluster-autoscaling.md](cluster-autoscaling.md) | diff --git a/docs/conformance/cncf/evidence/inference-gateway.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/inference-gateway.md similarity index 67% rename from docs/conformance/cncf/evidence/inference-gateway.md rename to docs/conformance/cncf/v1.35/nim-eks/evidence/inference-gateway.md index 2c3ddd992..26e910b36 100644 --- a/docs/conformance/cncf/evidence/inference-gateway.md +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/inference-gateway.md @@ -1,9 +1,9 @@ # Inference API Gateway (kgateway) +**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3` +**Generated:** 2026-04-01 23:18:52 UTC **Kubernetes Version:** v1.35 **Platform:** linux/amd64 -**Validated on:** 
Kubernetes v1.35 clusters with NVIDIA H100 80GB HBM3 -**Generated:** 2026-03-10 03:49:45 UTC --- @@ -15,7 +15,7 @@ with an implementation for advanced traffic management for inference services. 1. **kgateway controller** — Running in `kgateway-system` 2. **inference-gateway deployment** — Running (the inference extension controller) 3. **Gateway API CRDs** — All present (GatewayClass, Gateway, HTTPRoute, GRPCRoute, ReferenceGrant) -4. **Active Gateway** — `inference-gateway` with class `kgateway`, programmed with a load balancer address +4. **Active Gateway** — `inference-gateway` with class `kgateway`, programmed with an AWS ELB address 5. **Inference Extension CRDs** — InferencePool, InferenceModelRewrite, InferenceObjective installed 6. **Result: PASS** @@ -27,16 +27,16 @@ with an implementation for advanced traffic management for inference services. ``` $ kubectl get deploy -n kgateway-system NAME READY UP-TO-DATE AVAILABLE AGE -inference-gateway 1/1 1 1 28m -kgateway 1/1 1 1 28m +inference-gateway 1/1 1 1 69m +kgateway 1/1 1 1 69m ``` **kgateway pods** ``` $ kubectl get pods -n kgateway-system NAME READY STATUS RESTARTS AGE -inference-gateway-6f55d54bd8-gj9t8 1/1 Running 0 28m -kgateway-7d6dfdc5dc-s6lwc 1/1 Running 0 28m +inference-gateway-6f55d54bd8-rxt9g 1/1 Running 0 69m +kgateway-7d6dfdc5dc-5wtw2 1/1 Running 0 69m ``` ## GatewayClass @@ -45,8 +45,8 @@ kgateway-7d6dfdc5dc-s6lwc 1/1 Running 0 28m ``` $ kubectl get gatewayclass NAME CONTROLLER ACCEPTED AGE -kgateway kgateway.dev/kgateway True 28m -kgateway-waypoint kgateway.dev/kgateway True 28m +kgateway kgateway.dev/kgateway True 69m +kgateway-waypoint kgateway.dev/kgateway True 69m ``` ## Gateway API CRDs @@ -54,11 +54,11 @@ kgateway-waypoint kgateway.dev/kgateway True 28m **Gateway API CRDs** ``` $ kubectl get crds | grep gateway.networking.k8s.io -gatewayclasses.gateway.networking.k8s.io 2026-03-10T03:21:04Z -gateways.gateway.networking.k8s.io 2026-03-10T03:21:05Z -grpcroutes.gateway.networking.k8s.io 
2026-03-10T03:21:05Z -httproutes.gateway.networking.k8s.io 2026-03-10T03:21:06Z -referencegrants.gateway.networking.k8s.io 2026-03-10T03:21:06Z +gatewayclasses.gateway.networking.k8s.io 2026-04-01T22:09:22Z +gateways.gateway.networking.k8s.io 2026-04-01T22:09:22Z +grpcroutes.gateway.networking.k8s.io 2026-04-01T22:09:23Z +httproutes.gateway.networking.k8s.io 2026-04-01T22:09:23Z +referencegrants.gateway.networking.k8s.io 2026-04-01T22:09:24Z ``` ## Active Gateway @@ -66,8 +66,8 @@ referencegrants.gateway.networking.k8s.io 2026-03-10T03:21:06Z **Gateways** ``` $ kubectl get gateways -A -NAMESPACE NAME CLASS ADDRESS PROGRAMMED AGE -kgateway-system inference-gateway kgateway True 28m +NAMESPACE NAME CLASS ADDRESS PROGRAMMED AGE +kgateway-system inference-gateway kgateway .elb.amazonaws.com True 69m ``` **Gateway details** @@ -82,12 +82,12 @@ metadata: helm.sh/hook-weight: "10" kubectl.kubernetes.io/last-applied-configuration: | {"apiVersion":"gateway.networking.k8s.io/v1","kind":"Gateway","metadata":{"annotations":{"helm.sh/hook":"post-install,post-upgrade","helm.sh/hook-delete-policy":"before-hook-creation","helm.sh/hook-weight":"10"},"name":"inference-gateway","namespace":"kgateway-system"},"spec":{"gatewayClassName":"kgateway","infrastructure":{"parametersRef":{"group":"gateway.kgateway.dev","kind":"GatewayParameters","name":"system-proxy"}},"listeners":[{"allowedRoutes":{"namespaces":{"from":"All"}},"name":"http","port":80,"protocol":"HTTP"}]}} - creationTimestamp: "2026-03-10T03:21:34Z" + creationTimestamp: "2026-04-01T22:09:39Z" generation: 1 name: inference-gateway namespace: kgateway-system - resourceVersion: "1158803" - uid: 4dac636a-d90d-431c-9397-4baf2c81a150 + resourceVersion: "101860353" + uid: 1b8b3a2a-dd47-4ac0-b18b-b5da8c25cff6 spec: gatewayClassName: kgateway infrastructure: @@ -105,15 +105,15 @@ spec: status: addresses: - type: Hostname - value: + value: .elb.amazonaws.com conditions: - - lastTransitionTime: "2026-03-10T03:21:40Z" + - 
lastTransitionTime: "2026-04-01T22:09:45Z" message: "" observedGeneration: 1 reason: Accepted status: "True" type: Accepted - - lastTransitionTime: "2026-03-10T03:21:40Z" + - lastTransitionTime: "2026-04-01T22:09:45Z" message: "" observedGeneration: 1 reason: Programmed @@ -122,25 +122,25 @@ status: listeners: - attachedRoutes: 0 conditions: - - lastTransitionTime: "2026-03-10T03:21:40Z" + - lastTransitionTime: "2026-04-01T22:09:45Z" message: "" observedGeneration: 1 reason: Accepted status: "True" type: Accepted - - lastTransitionTime: "2026-03-10T03:21:40Z" + - lastTransitionTime: "2026-04-01T22:09:45Z" message: "" observedGeneration: 1 reason: NoConflicts status: "False" type: Conflicted - - lastTransitionTime: "2026-03-10T03:21:40Z" + - lastTransitionTime: "2026-04-01T22:09:45Z" message: "" observedGeneration: 1 reason: ResolvedRefs status: "True" type: ResolvedRefs - - lastTransitionTime: "2026-03-10T03:21:40Z" + - lastTransitionTime: "2026-04-01T22:09:45Z" message: "" observedGeneration: 1 reason: Programmed @@ -173,11 +173,11 @@ Programmed: True (Programmed) **Inference extension CRDs installed** ``` $ kubectl get crds | grep inference -inferencemodelrewrites.inference.networking.x-k8s.io 2026-03-10T03:21:06Z -inferenceobjectives.inference.networking.x-k8s.io 2026-03-10T03:21:06Z -inferencepoolimports.inference.networking.x-k8s.io 2026-03-10T03:21:07Z -inferencepools.inference.networking.k8s.io 2026-03-10T03:21:07Z -inferencepools.inference.networking.x-k8s.io 2026-03-10T03:21:07Z +inferencemodelrewrites.inference.networking.x-k8s.io 2026-04-01T22:09:24Z +inferenceobjectives.inference.networking.x-k8s.io 2026-04-01T22:09:24Z +inferencepoolimports.inference.networking.x-k8s.io 2026-04-01T22:09:24Z +inferencepools.inference.networking.k8s.io 2026-04-01T22:09:24Z +inferencepools.inference.networking.x-k8s.io 2026-04-01T22:09:25Z ``` **Result: PASS** — kgateway controller running, GatewayClass Accepted, Gateway Programmed, inference CRDs installed. 
diff --git a/docs/conformance/cncf/evidence/pod-autoscaling.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/pod-autoscaling.md similarity index 84% rename from docs/conformance/cncf/evidence/pod-autoscaling.md rename to docs/conformance/cncf/v1.35/nim-eks/evidence/pod-autoscaling.md index f78b1d97a..74994f5ba 100644 --- a/docs/conformance/cncf/evidence/pod-autoscaling.md +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/pod-autoscaling.md @@ -1,9 +1,9 @@ # Pod Autoscaling (HPA with GPU Metrics) +**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3` +**Generated:** 2026-04-01 23:19:27 UTC **Kubernetes Version:** v1.35 **Platform:** linux/amd64 -**Validated on:** Kubernetes v1.35 clusters with NVIDIA H100 80GB HBM3 -**Generated:** 2026-03-10 03:42:06 UTC --- @@ -27,14 +27,14 @@ utilizing accelerators, including the ability to scale based on custom GPU metri ``` $ kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus-adapter NAME READY STATUS RESTARTS AGE -prometheus-adapter-78b8b8d75c-fh4cf 1/1 Running 0 18m +prometheus-adapter-78b8b8d75c-wv9h2 1/1 Running 0 68m ``` **Prometheus adapter service** ``` $ kubectl get svc prometheus-adapter -n monitoring -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -prometheus-adapter ClusterIP 172.20.178.141 443/TCP 18m +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +prometheus-adapter ClusterIP 172.20.38.130 443/TCP 68m ``` ## Custom Metrics API @@ -42,12 +42,12 @@ prometheus-adapter ClusterIP 172.20.178.141 443/TCP 18m **Available custom metrics** ``` $ kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | python3 -c "..." 
# extract resource names -namespaces/gpu_memory_used namespaces/gpu_power_usage pods/gpu_power_usage pods/gpu_utilization namespaces/gpu_utilization pods/gpu_memory_used +namespaces/gpu_memory_used ``` ## GPU Stress Test Deployment @@ -166,8 +166,8 @@ horizontalpodautoscaler.autoscaling/gpu-workload-hpa created **GPU workload pod** ``` $ kubectl get pods -n hpa-test -o wide -NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -gpu-workload-86c75dcd97-2wk4f 1/1 Running 0 3s 10.0.254.75 gpu-node-2 +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +gpu-workload-86c75dcd97-qbc7g 1/1 Running 0 4s 10.0.222.136 ip-10-0-251-220.ec2.internal ``` ## HPA Status @@ -176,7 +176,7 @@ gpu-workload-86c75dcd97-2wk4f 1/1 Running 0 3s 10.0.254.75 ``` $ kubectl get hpa -n hpa-test NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE -gpu-workload-hpa Deployment/gpu-workload 100/50 1 2 2 90s +gpu-workload-hpa Deployment/gpu-workload 100/50 1 2 2 49s ``` **HPA details** @@ -186,10 +186,10 @@ Name: gpu-workload-hpa Namespace: hpa-test Labels: Annotations: -CreationTimestamp: Mon, 09 Mar 2026 20:42:14 -0700 +CreationTimestamp: Wed, 01 Apr 2026 16:19:34 -0700 Reference: Deployment/gpu-workload Metrics: ( current / target ) - "gpu_utilization" on pods: 50 / 50 + "gpu_utilization" on pods: 100 / 50 Min replicas: 1 Max replicas: 2 Behavior: @@ -214,18 +214,18 @@ Conditions: Events: Type Reason Age From Message ---- ------ ---- ---- ------- - Warning FailedGetPodsMetric 76s horizontal-pod-autoscaler unable to get metric gpu_utilization: no metrics returned from custom metrics API - Warning FailedComputeMetricsReplicas 76s horizontal-pod-autoscaler invalid metrics (1 invalid out of 1), first error is: failed to get pods metric value: unable to get metric gpu_utilization: no metrics returned from custom metrics API - Normal SuccessfulRescale 31s horizontal-pod-autoscaler New size: 2; reason: pods metric gpu_utilization above target + Warning 
FailedGetPodsMetric 35s horizontal-pod-autoscaler unable to get metric gpu_utilization: no metrics returned from custom metrics API + Warning FailedComputeMetricsReplicas 35s horizontal-pod-autoscaler invalid metrics (1 invalid out of 1), first error is: failed to get pods metric value: unable to get metric gpu_utilization: no metrics returned from custom metrics API + Normal SuccessfulRescale 20s horizontal-pod-autoscaler New size: 2; reason: pods metric gpu_utilization above target ``` ## GPU Utilization Evidence **GPU utilization (nvidia-smi)** ``` -$ kubectl exec -n hpa-test gpu-workload-86c75dcd97-2wk4f -- nvidia-smi --query-gpu=utilization.gpu,utilization.memory,power.draw --format=csv +$ kubectl exec -n hpa-test gpu-workload-86c75dcd97-qbc7g -- nvidia-smi --query-gpu=utilization.gpu,utilization.memory,power.draw --format=csv utilization.gpu [%], utilization.memory [%], power.draw [W] -100 %, 0 %, 290.28 W +100 %, 0 %, 297.05 W ``` ## Pods After Scale-Up @@ -233,9 +233,9 @@ utilization.gpu [%], utilization.memory [%], power.draw [W] **Pods after scale-up** ``` $ kubectl get pods -n hpa-test -o wide -NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -gpu-workload-86c75dcd97-2wk4f 1/1 Running 0 96s 10.0.254.75 gpu-node-2 -gpu-workload-86c75dcd97-4gbn8 1/1 Running 0 36s 10.0.219.76 gpu-node-2 +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +gpu-workload-86c75dcd97-qbc7g 1/1 Running 0 55s 10.0.222.136 ip-10-0-251-220.ec2.internal +gpu-workload-86c75dcd97-zvnlg 1/1 Running 0 25s 10.0.228.202 ip-10-0-251-220.ec2.internal ``` **Result: PASS** — HPA successfully read gpu_utilization metric and scaled replicas when utilization exceeded target threshold. 
diff --git a/docs/conformance/cncf/v1.35/nim-eks/evidence/robust-operator.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/robust-operator.md new file mode 100644 index 000000000..eb9cb5e7c --- /dev/null +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/robust-operator.md @@ -0,0 +1,179 @@ +# Robust AI Operator (NIM Operator) + +**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3` +**Generated:** 2026-04-01 23:19:10 UTC +**Kubernetes Version:** v1.35 +**Platform:** linux/amd64 + +--- + +Demonstrates CNCF AI Conformance requirement that at least one complex AI operator +with a CRD can be installed and functions reliably, including operator pods running, +webhooks operational, and custom resources reconciled. + +## Summary + +1. **NIM Operator** — Controller manager running in `nvidia-nim` +2. **Custom Resource Definitions** — NIMService, NIMCache, NIMPipeline, NIMBuild CRDs registered +3. **Admission Controller** — Validating/mutating webhooks configured and active +4. **Custom Resource Reconciled** — `NIMService` reconciled into running inference pod(s) +5. 
**Result: PASS** + +--- + +## NIM Operator Health + +**NIM operator deployment** +``` +$ kubectl get deploy -n nvidia-nim +NAME READY UP-TO-DATE AVAILABLE AGE +k8s-nim-operator 1/1 1 1 65m +``` + +**NIM operator pods** +``` +$ kubectl get pods -n nvidia-nim +NAME READY STATUS RESTARTS AGE +k8s-nim-operator-64fb4b7cc6-5ktwg 1/1 Running 0 65m +``` + +## Custom Resource Definitions + +**NIM CRDs** +``` +nemocustomizers.apps.nvidia.com 2026-04-01T22:13:10Z +nemodatastores.apps.nvidia.com 2026-04-01T22:13:11Z +nemoentitystores.apps.nvidia.com 2026-04-01T22:13:12Z +nemoevaluators.apps.nvidia.com 2026-04-01T22:13:13Z +nemoguardrails.apps.nvidia.com 2026-04-01T22:13:13Z +nimbuilds.apps.nvidia.com 2026-04-01T22:13:14Z +nimcaches.apps.nvidia.com 2026-04-01T22:13:14Z +nimpipelines.apps.nvidia.com 2026-04-01T22:13:15Z +nimservices.apps.nvidia.com 2026-04-01T22:13:16Z +``` + +## Webhooks + +**NIM Operator webhooks** +``` +validatingwebhookconfiguration.admissionregistration.k8s.io/k8s-nim-operator-validating-webhook-configuration 2 65m +``` + +## Custom Resource Reconciliation + +A `NIMService` defines an inference microservice. The operator reconciles it into +a Deployment with GPU resources, a Service, and health monitoring. 
+ +**NIMServices** +``` +$ kubectl get nimservices -A +NAMESPACE NAME STATUS AGE +nim-workload llama-3-2-1b Ready 61m +``` + +**NIMService details** +``` +$ kubectl get nimservice llama-3-2-1b -n nim-workload -o yaml +apiVersion: apps.nvidia.com/v1alpha1 +kind: NIMService +metadata: + annotations: + kubectl.kubernetes.io/last-applied-configuration: | + {"apiVersion":"apps.nvidia.com/v1alpha1","kind":"NIMService","metadata":{"annotations":{},"name":"llama-3-2-1b","namespace":"nim-workload"},"spec":{"authSecret":"ngc-api-secret","expose":{"service":{"port":8000,"type":"ClusterIP"}},"image":{"pullPolicy":"IfNotPresent","pullSecrets":["ngc-pull-secret"],"repository":"nvcr.io/nim/meta/llama-3.2-1b-instruct","tag":"1.8.3"},"replicas":1,"resources":{"limits":{"nvidia.com/gpu":"1"},"requests":{"nvidia.com/gpu":"1"}},"storage":{"pvc":{"name":"nim-model-store"}},"tolerations":[{"effect":"NoSchedule","key":"dedicated","operator":"Equal","value":"worker-workload"},{"effect":"NoExecute","key":"dedicated","operator":"Equal","value":"worker-workload"}]}} + creationTimestamp: "2026-04-01T22:17:39Z" + finalizers: + - finalizer.nimservice.apps.nvidia.com + generation: 2 + name: llama-3-2-1b + namespace: nim-workload + resourceVersion: "101880642" + uid: 27ab2169-5913-4c98-a39d-635ce99af343 +spec: + authSecret: ngc-api-secret + expose: + ingress: + spec: {} + router: {} + service: + port: 8000 + type: ClusterIP + image: + pullPolicy: IfNotPresent + pullSecrets: + - ngc-pull-secret + repository: nvcr.io/nim/meta/llama-3.2-1b-instruct + tag: 1.8.3 + inferencePlatform: standalone + livenessProbe: {} + metrics: + serviceMonitor: {} + readinessProbe: {} + replicas: 1 + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + scale: + hpa: + maxReplicas: 0 + minReplicas: 1 + startupProbe: {} + storage: + nimCache: {} + pvc: + name: nim-model-store + tolerations: + - effect: NoSchedule + key: dedicated + operator: Equal + value: worker-workload + - effect: NoExecute + 
key: dedicated + operator: Equal + value: worker-workload +status: + conditions: + - lastTransitionTime: "2026-04-01T22:19:34Z" + message: | + deployment "llama-3-2-1b" successfully rolled out + reason: Ready + status: "True" + type: Ready + - lastTransitionTime: "2026-04-01T22:17:39Z" + message: "" + reason: Ready + status: "False" + type: Failed + model: + clusterEndpoint: 172.20.99.16:8000 + externalEndpoint: "" + name: meta/llama-3.2-1b-instruct + state: Ready +``` + +### Workload Pods Created by Operator + +**NIM workload pods** +``` +$ kubectl get pods -n nim-workload -l app.kubernetes.io/managed-by=k8s-nim-operator -o wide +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +llama-3-2-1b-7577f87fc7-dhb97 1/1 Running 0 61m 10.0.158.63 ip-10-0-180-136.ec2.internal +``` + +## Webhook Rejection Test + +Submit an invalid NIMService to verify the admission controller actively +rejects malformed resources. + +**Invalid CR rejection** +``` +The NIMService "webhook-test-invalid" is invalid: +* spec.authSecret: Required value +* spec.image: Required value +* : Invalid value: null: some validation rules were not checked because the object was invalid; correct the existing errors to complete validation +``` + +Webhook correctly rejected the invalid resource. + +**Result: PASS** — NIM operator running, webhooks operational (rejection verified), 9 CRDs registered, NIMService reconciled with 1 healthy inference pod(s). 
diff --git a/docs/conformance/cncf/evidence/secure-accelerator-access.md b/docs/conformance/cncf/v1.35/nim-eks/evidence/secure-accelerator-access.md similarity index 66% rename from docs/conformance/cncf/evidence/secure-accelerator-access.md rename to docs/conformance/cncf/v1.35/nim-eks/evidence/secure-accelerator-access.md index 093ceffdb..235d0e38b 100644 --- a/docs/conformance/cncf/evidence/secure-accelerator-access.md +++ b/docs/conformance/cncf/v1.35/nim-eks/evidence/secure-accelerator-access.md @@ -1,9 +1,9 @@ # Secure Accelerator Access +**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3` +**Generated:** 2026-04-01 23:14:45 UTC **Kubernetes Version:** v1.35 **Platform:** linux/amd64 -**Validated on:** Kubernetes v1.35 clusters with NVIDIA H100 80GB HBM3 -**Generated:** 2026-03-10 03:40:33 UTC --- @@ -19,7 +19,7 @@ access control, and auditability of accelerator usage. ``` $ kubectl get clusterpolicy -o wide NAME STATUS AGE -cluster-policy ready 2026-03-10T03:25:45Z +cluster-policy ready 2026-04-01T22:12:51Z ``` ### GPU Operator Pods @@ -28,30 +28,30 @@ cluster-policy ready 2026-03-10T03:25:45Z ``` $ kubectl get pods -n gpu-operator -o wide NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -gpu-feature-discovery-6rcxf 1/1 Running 0 14m 10.0.224.30 gpu-node-2 -gpu-feature-discovery-8jhh7 1/1 Running 0 14m 10.0.224.179 gpu-node-1 -gpu-operator-6bf99d6478-r55t5 1/1 Running 0 14m 10.0.6.44 system-node-1 -node-feature-discovery-gc-5495c9b5c9-5jhtb 1/1 Running 0 14m 10.0.4.105 system-node-1 -node-feature-discovery-master-6f876b9c85-97zcw 1/1 Running 0 14m 10.0.6.62 system-node-1 -node-feature-discovery-worker-7z8fm 1/1 Running 0 14m 10.0.230.31 system-node-2 -node-feature-discovery-worker-9s5tc 1/1 Running 0 14m 10.0.154.69 gpu-node-1 -node-feature-discovery-worker-vb62k 1/1 Running 0 14m 10.0.189.91 gpu-node-2 -nvidia-container-toolkit-daemonset-c49gs 1/1 Running 0 14m 10.0.201.217 gpu-node-1 -nvidia-container-toolkit-daemonset-lr895 1/1 
Running 0 14m 10.0.182.110 gpu-node-2 -nvidia-cuda-validator-9866n 0/1 Completed 0 12m 10.0.247.169 gpu-node-2 -nvidia-cuda-validator-f42hd 0/1 Completed 0 12m 10.0.143.223 gpu-node-1 -nvidia-dcgm-4bq8l 1/1 Running 0 14m 10.0.145.214 gpu-node-1 -nvidia-dcgm-exporter-g2fjs 1/1 Running 0 14m 10.0.247.52 gpu-node-2 -nvidia-dcgm-exporter-wqqqn 1/1 Running 0 14m 10.0.172.246 gpu-node-1 -nvidia-dcgm-xjsqq 1/1 Running 0 14m 10.0.159.246 gpu-node-2 -nvidia-device-plugin-daemonset-5884b 1/1 Running 0 14m 10.0.255.120 gpu-node-1 -nvidia-device-plugin-daemonset-kx2zg 1/1 Running 0 14m 10.0.185.249 gpu-node-2 -nvidia-driver-daemonset-qc7cg 3/3 Running 0 14m 10.0.198.38 gpu-node-1 -nvidia-driver-daemonset-vvlsc 3/3 Running 0 14m 10.0.166.43 gpu-node-2 -nvidia-mig-manager-4gn76 1/1 Running 0 14m 10.0.135.89 gpu-node-1 -nvidia-mig-manager-8s9wj 1/1 Running 0 14m 10.0.253.166 gpu-node-2 -nvidia-operator-validator-twprm 1/1 Running 0 14m 10.0.231.53 gpu-node-1 -nvidia-operator-validator-vwnsb 1/1 Running 0 14m 10.0.194.119 gpu-node-2 +gpu-feature-discovery-bvjjh 1/1 Running 0 61m 10.0.218.175 ip-10-0-251-220.ec2.internal +gpu-feature-discovery-q4k8g 1/1 Running 0 61m 10.0.133.127 ip-10-0-180-136.ec2.internal +gpu-operator-6bf99d6478-lpll4 1/1 Running 0 61m 10.0.4.84 ip-10-0-7-209.ec2.internal +node-feature-discovery-gc-5495c9b5c9-5lv2g 1/1 Running 0 61m 10.0.6.61 ip-10-0-7-209.ec2.internal +node-feature-discovery-master-6f876b9c85-b7wlm 1/1 Running 0 61m 10.0.6.161 ip-10-0-7-209.ec2.internal +node-feature-discovery-worker-lrn2p 1/1 Running 0 61m 10.0.212.66 ip-10-0-251-220.ec2.internal +node-feature-discovery-worker-srp76 1/1 Running 0 61m 10.0.231.205 ip-10-0-180-136.ec2.internal +node-feature-discovery-worker-svrbw 1/1 Running 0 61m 10.0.201.87 ip-10-0-184-187.ec2.internal +nvidia-container-toolkit-daemonset-2kj4m 1/1 Running 0 61m 10.0.236.177 ip-10-0-180-136.ec2.internal +nvidia-container-toolkit-daemonset-98f25 1/1 Running 0 61m 10.0.157.16 ip-10-0-251-220.ec2.internal 
+nvidia-cuda-validator-cpnk4 0/1 Completed 0 59m 10.0.146.2 ip-10-0-180-136.ec2.internal +nvidia-cuda-validator-l665p 0/1 Completed 0 59m 10.0.247.132 ip-10-0-251-220.ec2.internal +nvidia-dcgm-bwb6w 1/1 Running 0 61m 10.0.129.30 ip-10-0-251-220.ec2.internal +nvidia-dcgm-exporter-2xrln 1/1 Running 0 61m 10.0.187.45 ip-10-0-180-136.ec2.internal +nvidia-dcgm-exporter-sscnw 1/1 Running 0 61m 10.0.147.205 ip-10-0-251-220.ec2.internal +nvidia-dcgm-gdm9j 1/1 Running 0 61m 10.0.130.151 ip-10-0-180-136.ec2.internal +nvidia-device-plugin-daemonset-5dmkr 1/1 Running 0 61m 10.0.170.117 ip-10-0-180-136.ec2.internal +nvidia-device-plugin-daemonset-tg9x2 1/1 Running 0 61m 10.0.169.151 ip-10-0-251-220.ec2.internal +nvidia-driver-daemonset-9xv78 3/3 Running 0 61m 10.0.163.144 ip-10-0-251-220.ec2.internal +nvidia-driver-daemonset-fbvmz 3/3 Running 0 61m 10.0.147.204 ip-10-0-180-136.ec2.internal +nvidia-mig-manager-6565z 1/1 Running 0 58m 10.0.243.110 ip-10-0-180-136.ec2.internal +nvidia-mig-manager-jm8tl 1/1 Running 0 58m 10.0.191.228 ip-10-0-251-220.ec2.internal +nvidia-operator-validator-bpg4w 1/1 Running 0 61m 10.0.160.53 ip-10-0-251-220.ec2.internal +nvidia-operator-validator-mws7n 1/1 Running 0 61m 10.0.247.220 ip-10-0-180-136.ec2.internal ``` ### GPU Operator DaemonSets @@ -60,16 +60,16 @@ nvidia-operator-validator-vwnsb 1/1 Running 0 ``` $ kubectl get ds -n gpu-operator NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE -gpu-feature-discovery 2 2 2 2 2 nvidia.com/gpu.deploy.gpu-feature-discovery=true 14m -node-feature-discovery-worker 3 3 3 3 3 14m -nvidia-container-toolkit-daemonset 2 2 2 2 2 nvidia.com/gpu.deploy.container-toolkit=true 14m -nvidia-dcgm 2 2 2 2 2 nvidia.com/gpu.deploy.dcgm=true 14m -nvidia-dcgm-exporter 2 2 2 2 2 nvidia.com/gpu.deploy.dcgm-exporter=true 14m -nvidia-device-plugin-daemonset 2 2 2 2 2 nvidia.com/gpu.deploy.device-plugin=true 14m -nvidia-device-plugin-mps-control-daemon 0 0 0 0 0 
nvidia.com/gpu.deploy.device-plugin=true,nvidia.com/mps.capable=true 14m -nvidia-driver-daemonset 2 2 2 2 2 nvidia.com/gpu.deploy.driver=true 14m -nvidia-mig-manager 2 2 2 2 2 nvidia.com/gpu.deploy.mig-manager=true 14m -nvidia-operator-validator 2 2 2 2 2 nvidia.com/gpu.deploy.operator-validator=true 14m +gpu-feature-discovery 2 2 2 2 2 nvidia.com/gpu.deploy.gpu-feature-discovery=true 61m +node-feature-discovery-worker 3 3 3 3 3 61m +nvidia-container-toolkit-daemonset 2 2 2 2 2 nvidia.com/gpu.deploy.container-toolkit=true 61m +nvidia-dcgm 2 2 2 2 2 nvidia.com/gpu.deploy.dcgm=true 61m +nvidia-dcgm-exporter 2 2 2 2 2 nvidia.com/gpu.deploy.dcgm-exporter=true 61m +nvidia-device-plugin-daemonset 2 2 2 2 2 nvidia.com/gpu.deploy.device-plugin=true 61m +nvidia-device-plugin-mps-control-daemon 0 0 0 0 0 nvidia.com/gpu.deploy.device-plugin=true,nvidia.com/mps.capable=true 61m +nvidia-driver-daemonset 2 2 2 2 2 nvidia.com/gpu.deploy.driver=true 61m +nvidia-mig-manager 2 2 2 2 2 nvidia.com/gpu.deploy.mig-manager=true 61m +nvidia-operator-validator 2 2 2 2 2 nvidia.com/gpu.deploy.operator-validator=true 61m ``` ## DRA-Mediated GPU Access @@ -84,10 +84,10 @@ GPU devices via ResourceSlices, and pods request access through ResourceClaims. 
``` $ kubectl get resourceslices -o wide NAME NODE DRIVER POOL AGE -gpu-node-1-compute-domain.nvidia.com-q9xqc gpu-node-1 compute-domain.nvidia.com gpu-node-1 11m -gpu-node-1-gpu.nvidia.com-7cbz2 gpu-node-1 gpu.nvidia.com gpu-node-1 11m -gpu-node-2-compute-domain.nvidia.com-2n2cq gpu-node-2 compute-domain.nvidia.com gpu-node-2 11m -gpu-node-2-gpu.nvidia.com-79gvw gpu-node-2 gpu.nvidia.com gpu-node-2 11m +ip-10-0-180-136.ec2.internal-compute-domain.nvidia.com-kfxd7 ip-10-0-180-136.ec2.internal compute-domain.nvidia.com ip-10-0-180-136.ec2.internal 60m +ip-10-0-180-136.ec2.internal-gpu.nvidia.com-8w29z ip-10-0-180-136.ec2.internal gpu.nvidia.com ip-10-0-180-136.ec2.internal 59m +ip-10-0-251-220.ec2.internal-compute-domain.nvidia.com-btqsj ip-10-0-251-220.ec2.internal compute-domain.nvidia.com ip-10-0-251-220.ec2.internal 60m +ip-10-0-251-220.ec2.internal-gpu.nvidia.com-qwdqr ip-10-0-251-220.ec2.internal gpu.nvidia.com ip-10-0-251-220.ec2.internal 59m ``` ### GPU Device Details @@ -100,18 +100,18 @@ items: - apiVersion: resource.k8s.io/v1 kind: ResourceSlice metadata: - creationTimestamp: "2026-03-10T03:29:20Z" - generateName: gpu-node-1-compute-domain.nvidia.com- - generation: 2 - name: gpu-node-1-compute-domain.nvidia.com-q9xqc + creationTimestamp: "2026-04-01T22:14:50Z" + generateName: ip-10-0-180-136.ec2.internal-compute-domain.nvidia.com- + generation: 1 + name: ip-10-0-180-136.ec2.internal-compute-domain.nvidia.com-kfxd7 ownerReferences: - apiVersion: v1 controller: true kind: Node - name: gpu-node-1 - uid: fef55be3-f566-47c8-8bb8-52c117cb3855 - resourceVersion: "1169500" - uid: 8087c1b4-71e0-42c3-9f74-12629e2ee5b5 + name: ip-10-0-180-136.ec2.internal + uid: c01459a2-a385-4843-bc1f-582d283ea94e + resourceVersion: "101864746" + uid: 84642059-2fb9-484f-bb98-7e5ae1802eba spec: devices: - attributes: @@ -127,26 +127,26 @@ items: string: channel name: channel-0 driver: compute-domain.nvidia.com - nodeName: gpu-node-1 + nodeName: ip-10-0-180-136.ec2.internal pool: 
generation: 1 - name: gpu-node-1 + name: ip-10-0-180-136.ec2.internal resourceSliceCount: 1 - apiVersion: resource.k8s.io/v1 kind: ResourceSlice metadata: - creationTimestamp: "2026-03-10T03:29:22Z" - generateName: gpu-node-1-gpu.nvidia.com- + creationTimestamp: "2026-04-01T22:14:52Z" + generateName: ip-10-0-180-136.ec2.internal-gpu.nvidia.com- generation: 2 - name: gpu-node-1-gpu.nvidia.com-7cbz2 + name: ip-10-0-180-136.ec2.internal-gpu.nvidia.com-8w29z ownerReferences: - apiVersion: v1 controller: true kind: Node - name: gpu-node-1 - uid: fef55be3-f566-47c8-8bb8-52c117cb3855 - resourceVersion: "1169562" - uid: 3441669c-08c4-43ff-9b83-42c5f3dddcff + name: ip-10-0-180-136.ec2.internal + uid: c01459a2-a385-4843-bc1f-582d283ea94e + resourceVersion: "101865710" + uid: 89a1966f-5c3f-4664-a5b7-b348a122db07 spec: devices: - attributes: @@ -165,17 +165,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:64:00.0 + string: "0000:53:00.0" resource.kubernetes.io/pcieRoot: - string: pci0000:55 + string: pci0000:44 type: string: gpu uuid: - string: GPU-bc5610b9-79c8-fedd-8899-07539c7f868a + string: GPU-15704b32-f531-14ce-0530-1ac21e4b68e6 capacity: memory: value: 81559Mi - name: gpu-1 + name: gpu-0 - attributes: addressingMode: string: HMM @@ -192,17 +192,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:75:00.0 + string: 0000:64:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:66 + string: pci0000:55 type: string: gpu uuid: - string: GPU-fbc2c554-4d37-8938-0032-f923bad0f716 + string: GPU-edc718f8-e593-6468-b9f9-563d508366ed capacity: memory: value: 81559Mi - name: gpu-2 + name: gpu-1 - attributes: addressingMode: string: HMM @@ -219,17 +219,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:86:00.0 + string: 0000:75:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:77 + string: pci0000:66 type: string: gpu 
uuid: - string: GPU-a65a773d-52bb-bcc1-a8ee-f78c3faa2e2d + string: GPU-e2d9b65e-98cb-5b7a-90f0-e0336573f9e2 capacity: memory: value: 81559Mi - name: gpu-3 + name: gpu-2 - attributes: addressingMode: string: HMM @@ -246,17 +246,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:97:00.0 + string: 0000:86:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:88 + string: pci0000:77 type: string: gpu uuid: - string: GPU-82e45d1b-1618-559f-144c-eab51545030b + string: GPU-3a325419-de5f-778f-cf4e-fe7290362ac5 capacity: memory: value: 81559Mi - name: gpu-4 + name: gpu-3 - attributes: addressingMode: string: HMM @@ -273,17 +273,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:a8:00.0 + string: 0000:97:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:99 + string: pci0000:88 type: string: gpu uuid: - string: GPU-39e28159-8c62-ee71-64db-b748edd61e15 + string: GPU-275ad37d-ebd6-4cf6-3867-0499ba033a12 capacity: memory: value: 81559Mi - name: gpu-5 + name: gpu-4 - attributes: addressingMode: string: HMM @@ -300,17 +300,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:b9:00.0 + string: 0000:a8:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:aa + string: pci0000:99 type: string: gpu uuid: - string: GPU-e64d69ca-b4b3-59b2-e78c-94f26c4db365 + string: GPU-3cab564d-1f63-674b-a831-024600bf985c capacity: memory: value: 81559Mi - name: gpu-6 + name: gpu-5 - attributes: addressingMode: string: HMM @@ -327,17 +327,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:ca:00.0 + string: 0000:b9:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:bb + string: pci0000:aa type: string: gpu uuid: - string: GPU-04d228d3-3b5a-3534-f5cf-969706647d56 + string: GPU-d0f25a6f-9a3f-61b9-c128-3d14759651d7 capacity: memory: value: 81559Mi - name: gpu-7 + name: gpu-6 - 
attributes: addressingMode: string: HMM @@ -354,38 +354,38 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: "0000:53:00.0" + string: 0000:ca:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:44 + string: pci0000:bb type: string: gpu uuid: - string: GPU-c4529c8d-69c4-b61d-e0bc-7b2460096005 + string: GPU-9bc10e9a-e27e-652b-9a1e-e84f7e446206 capacity: memory: value: 81559Mi - name: gpu-0 + name: gpu-7 driver: gpu.nvidia.com - nodeName: gpu-node-1 + nodeName: ip-10-0-180-136.ec2.internal pool: generation: 1 - name: gpu-node-1 + name: ip-10-0-180-136.ec2.internal resourceSliceCount: 1 - apiVersion: resource.k8s.io/v1 kind: ResourceSlice metadata: - creationTimestamp: "2026-03-10T03:29:19Z" - generateName: gpu-node-2-compute-domain.nvidia.com- + creationTimestamp: "2026-04-01T22:14:51Z" + generateName: ip-10-0-251-220.ec2.internal-compute-domain.nvidia.com- generation: 1 - name: gpu-node-2-compute-domain.nvidia.com-2n2cq + name: ip-10-0-251-220.ec2.internal-compute-domain.nvidia.com-btqsj ownerReferences: - apiVersion: v1 controller: true kind: Node - name: gpu-node-2 - uid: b171b90a-eb8f-4662-bd0d-2055b634dc98 - resourceVersion: "1168846" - uid: 3eca27ae-5231-4845-8407-1e24fd9b5683 + name: ip-10-0-251-220.ec2.internal + uid: d55d06fd-ee55-4525-b7da-393b71669e8f + resourceVersion: "101864753" + uid: af18d2bf-b15f-43cb-8d2b-a49098f4f5bd spec: devices: - attributes: @@ -401,26 +401,26 @@ items: string: daemon name: daemon-0 driver: compute-domain.nvidia.com - nodeName: gpu-node-2 + nodeName: ip-10-0-251-220.ec2.internal pool: generation: 1 - name: gpu-node-2 + name: ip-10-0-251-220.ec2.internal resourceSliceCount: 1 - apiVersion: resource.k8s.io/v1 kind: ResourceSlice metadata: - creationTimestamp: "2026-03-10T03:29:21Z" - generateName: gpu-node-2-gpu.nvidia.com- + creationTimestamp: "2026-04-01T22:14:52Z" + generateName: ip-10-0-251-220.ec2.internal-gpu.nvidia.com- generation: 2 - name: gpu-node-2-gpu.nvidia.com-79gvw + 
name: ip-10-0-251-220.ec2.internal-gpu.nvidia.com-qwdqr ownerReferences: - apiVersion: v1 controller: true kind: Node - name: gpu-node-2 - uid: b171b90a-eb8f-4662-bd0d-2055b634dc98 - resourceVersion: "1169576" - uid: 0b3dc1d8-a1ba-4fae-894b-cb90e62ed783 + name: ip-10-0-251-220.ec2.internal + uid: d55d06fd-ee55-4525-b7da-393b71669e8f + resourceVersion: "101865689" + uid: 48e7fc88-8ff6-4c50-9e74-8755d19ede37 spec: devices: - attributes: @@ -439,17 +439,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:75:00.0 + string: 0000:ca:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:66 + string: pci0000:bb type: string: gpu uuid: - string: GPU-dbabb552-a092-0ca9-0580-8d4fe378eb02 + string: GPU-530bd4b0-238b-f0c2-b496-63595812bca8 capacity: memory: value: 81559Mi - name: gpu-2 + name: gpu-7 - attributes: addressingMode: string: HMM @@ -466,17 +466,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:86:00.0 + string: "0000:53:00.0" resource.kubernetes.io/pcieRoot: - string: pci0000:77 + string: pci0000:44 type: string: gpu uuid: - string: GPU-5342927e-e180-84f1-55ba-257f1cbd3ba4 + string: GPU-3f048793-8751-030e-5870-ebbd2b10cef2 capacity: memory: value: 81559Mi - name: gpu-3 + name: gpu-0 - attributes: addressingMode: string: HMM @@ -493,17 +493,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:97:00.0 + string: 0000:64:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:88 + string: pci0000:55 type: string: gpu uuid: - string: GPU-95085215-739e-e7c6-4011-8dbe004af8c3 + string: GPU-cc644abe-17e4-7cb7-500d-ed8c09aea2fb capacity: memory: value: 81559Mi - name: gpu-4 + name: gpu-1 - attributes: addressingMode: string: HMM @@ -520,17 +520,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:a8:00.0 + string: 0000:75:00.0 resource.kubernetes.io/pcieRoot: - string: 
pci0000:99 + string: pci0000:66 type: string: gpu uuid: - string: GPU-a7b658ad-f23e-cea9-2523-569d521700bf + string: GPU-8d0b1081-9549-2b14-7e01-b4a725873c21 capacity: memory: value: 81559Mi - name: gpu-5 + name: gpu-2 - attributes: addressingMode: string: HMM @@ -547,17 +547,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:b9:00.0 + string: 0000:86:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:aa + string: pci0000:77 type: string: gpu uuid: - string: GPU-1e9a0e94-769a-b1e6-36f7-9296e286ef90 + string: GPU-38bbfee9-dc95-ffb5-4034-f9a6c82a45bb capacity: memory: value: 81559Mi - name: gpu-6 + name: gpu-3 - attributes: addressingMode: string: HMM @@ -574,17 +574,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:ca:00.0 + string: 0000:97:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:bb + string: pci0000:88 type: string: gpu uuid: - string: GPU-16b2cd36-9dbe-3ee7-0810-07b330e36e04 + string: GPU-24087b69-8889-6b23-feeb-2905664fbcbf capacity: memory: value: 81559Mi - name: gpu-7 + name: gpu-4 - attributes: addressingMode: string: HMM @@ -601,17 +601,17 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: "0000:53:00.0" + string: 0000:a8:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:44 + string: pci0000:99 type: string: gpu uuid: - string: GPU-92da0328-2f33-b563-d577-9d2b9f21f280 + string: GPU-d2f75162-e86d-0da0-0af4-3fa0b80038cd capacity: memory: value: 81559Mi - name: gpu-0 + name: gpu-5 - attributes: addressingMode: string: HMM @@ -628,22 +628,22 @@ items: productName: string: NVIDIA H100 80GB HBM3 resource.kubernetes.io/pciBusID: - string: 0000:64:00.0 + string: 0000:b9:00.0 resource.kubernetes.io/pcieRoot: - string: pci0000:55 + string: pci0000:aa type: string: gpu uuid: - string: GPU-184dab49-47ce-eeec-2239-3e03fbd4c002 + string: GPU-b00fe5f9-5832-19d6-0276-28d8630f0f4b capacity: memory: 
value: 81559Mi - name: gpu-1 + name: gpu-6 driver: gpu.nvidia.com - nodeName: gpu-node-2 + nodeName: ip-10-0-251-220.ec2.internal pool: generation: 1 - name: gpu-node-2 + name: ip-10-0-251-220.ec2.internal resourceSliceCount: 1 kind: List metadata: @@ -668,14 +668,14 @@ $ kubectl get pod isolation-test -n secure-access-test -o jsonpath={.spec.resour **Pod volumes (no hostPath)** ``` $ kubectl get pod isolation-test -n secure-access-test -o jsonpath={.spec.volumes} -[{"name":"kube-api-access-dl259","projected":{"defaultMode":420,"sources":[{"serviceAccountToken":{"expirationSeconds":3607,"path":"token"}},{"configMap":{"items":[{"key":"ca.crt","path":"ca.crt"}],"name":"kube-root-ca.crt"}},{"downwardAPI":{"items":[{"fieldRef":{"apiVersion":"v1","fieldPath":"metadata.namespace"},"path":"namespace"}]}}]}}] +[{"name":"kube-api-access-vk49g","projected":{"defaultMode":420,"sources":[{"serviceAccountToken":{"expirationSeconds":3607,"path":"token"}},{"configMap":{"items":[{"key":"ca.crt","path":"ca.crt"}],"name":"kube-root-ca.crt"}},{"downwardAPI":{"items":[{"fieldRef":{"apiVersion":"v1","fieldPath":"metadata.namespace"},"path":"namespace"}]}}]}}] ``` **ResourceClaim allocation** ``` $ kubectl get resourceclaim isolated-gpu -n secure-access-test -o wide NAME STATE AGE -isolated-gpu pending 12s +isolated-gpu pending 13s ``` > **Note:** ResourceClaim may show `pending` after pod completion because the DRA controller deallocates claims when the consuming pod terminates. The pod logs below confirm GPU isolation was enforced during execution. 
@@ -686,17 +686,17 @@ isolated-gpu pending 12s ``` $ kubectl logs isolation-test -n secure-access-test === Visible NVIDIA devices === -crw-rw-rw- 1 root root 195, 254 Mar 10 03:40 /dev/nvidia-modeset -crw-rw-rw- 1 root root 507, 0 Mar 10 03:40 /dev/nvidia-uvm -crw-rw-rw- 1 root root 507, 1 Mar 10 03:40 /dev/nvidia-uvm-tools -crw-rw-rw- 1 root root 195, 1 Mar 10 03:40 /dev/nvidia1 -crw-rw-rw- 1 root root 195, 255 Mar 10 03:40 /dev/nvidiactl +crw-rw-rw- 1 root root 195, 254 Apr 1 23:14 /dev/nvidia-modeset +crw-rw-rw- 1 root root 507, 0 Apr 1 23:14 /dev/nvidia-uvm +crw-rw-rw- 1 root root 507, 1 Apr 1 23:14 /dev/nvidia-uvm-tools +crw-rw-rw- 1 root root 195, 7 Apr 1 23:14 /dev/nvidia7 +crw-rw-rw- 1 root root 195, 255 Apr 1 23:14 /dev/nvidiactl === nvidia-smi output === -GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-bc5610b9-79c8-fedd-8899-07539c7f868a) +GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-530bd4b0-238b-f0c2-b496-63595812bca8) === GPU count === -0, NVIDIA H100 80GB HBM3, GPU-bc5610b9-79c8-fedd-8899-07539c7f868a +0, NVIDIA H100 80GB HBM3, GPU-530bd4b0-238b-f0c2-b496-63595812bca8 Secure accelerator access test completed ``` diff --git a/pkg/evidence/scripts/collect-evidence.sh b/pkg/evidence/scripts/collect-evidence.sh index da9d66a35..13116300c 100755 --- a/pkg/evidence/scripts/collect-evidence.sh +++ b/pkg/evidence/scripts/collect-evidence.sh @@ -657,11 +657,14 @@ collect_service_metrics() { EVIDENCE_FILE="${EVIDENCE_DIR}/ai-service-metrics.md" log_info "Collecting AI Service Metrics evidence → ${EVIDENCE_FILE}" - # Detect workload type: prefer Dynamo if running, otherwise use training path + # Detect workload type: Dynamo inference > NIM inference > PyTorch training local dynamo_ns="dynamo-workload" + local nim_ns="nim-workload" if kubectl get pods -n "${dynamo_ns}" -l nvidia.com/dynamo-component-type=worker --no-headers 2>/dev/null | grep -q .; then collect_service_metrics_dynamo + elif kubectl get pods -n "${nim_ns}" -l app.kubernetes.io/managed-by=k8s-nim-operator 
--no-headers 2>/dev/null | grep -q .; then + collect_service_metrics_nim else # Training path: deploys a standalone PyTorch pod with Prometheus metrics. # Only requires GPU nodes + Prometheus — no Kubeflow Trainer dependency. @@ -900,6 +903,222 @@ EOF log_info "AI service metrics (Dynamo) evidence collection complete." } +# --- NIM inference metrics collection --- +# Collects metrics from a running NIMService deployment. NIM exposes OpenAI-compatible +# inference metrics at /v1/metrics in Prometheus exposition format. +collect_service_metrics_nim() { + write_section_header "AI Service Metrics (NIM Inference)" + + cat >> "${EVIDENCE_FILE}" <<'EOF' +Demonstrates that NVIDIA NIM inference microservices expose Prometheus-format +metrics that can be discovered and collected by the monitoring stack. + +## NIM Inference Workload +EOF + + local NS="nim-workload" + + # Find the NIM service pod + local nim_pod="" + nim_pod=$(kubectl get pods -n "${NS}" -l app.kubernetes.io/managed-by=k8s-nim-operator \ + --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + + if [ -z "${nim_pod}" ]; then + log_warn "No running NIM pod found in ${NS}" + echo "**Result: SKIP** — No running NIM pod found in ${NS}." >> "${EVIDENCE_FILE}" + return + fi + + # Get the NIMService name from pod labels + local nim_service="" + nim_service=$(kubectl get pod "${nim_pod}" -n "${NS}" -o jsonpath='{.metadata.labels.app\.kubernetes\.io/name}' 2>/dev/null) + + capture "NIMService" kubectl get nimservice -n "${NS}" + capture "NIM workload pods" kubectl get pods -n "${NS}" -o wide + + # Wait for NIM to be serving + log_info "Checking NIM readiness..." + local serving_ready=false + for i in $(seq 1 12); do + if kubectl exec -n "${NS}" "${nim_pod}" -- python3 -c " +import urllib.request +urllib.request.urlopen('http://localhost:8000/v1/health/ready')" &>/dev/null; then + serving_ready=true + break + fi + log_info "NIM not serving yet (attempt ${i}/12), retrying in 15s..." 
+ sleep 15 + done + + if [ "${serving_ready}" != "true" ]; then + log_warn "NIM service not serving after 3 minutes" + echo "**Result: FAIL** — NIM service did not become ready." >> "${EVIDENCE_FILE}" + return + fi + + # Show available models + echo "" >> "${EVIDENCE_FILE}" + echo "**NIM models endpoint**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + kubectl exec -n "${NS}" "${nim_pod}" -- python3 -c " +import urllib.request, json +data = json.loads(urllib.request.urlopen('http://localhost:8000/v1/models').read()) +for m in data['data']: + print(f\"Model: {m['id']}\")" >> "${EVIDENCE_FILE}" 2>&1 + echo '```' >> "${EVIDENCE_FILE}" + + # Get model name for requests + local model_name="" + model_name=$(kubectl exec -n "${NS}" "${nim_pod}" -- python3 -c " +import urllib.request, json +data = json.loads(urllib.request.urlopen('http://localhost:8000/v1/models').read()) +print(data['data'][0]['id'])" 2>/dev/null) + + # Send inference requests to generate non-zero metrics + log_info "Sending 10 inference requests via NIM..." 
+ for i in $(seq 1 10); do + kubectl exec -n "${NS}" "${nim_pod}" -- python3 -c " +import urllib.request, json +req = urllib.request.Request('http://localhost:8000/v1/chat/completions', + data=json.dumps({'model': '${model_name}', 'messages': [{'role': 'user', 'content': 'Explain GPU computing in one sentence.'}], 'max_tokens': 30}).encode(), + headers={'Content-Type': 'application/json'}) +urllib.request.urlopen(req)" &>/dev/null || true + done + + # Collect NIM metrics from /v1/metrics + echo "" >> "${EVIDENCE_FILE}" + echo "**NIM inference metrics endpoint (sampled after generating inference traffic)**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + kubectl exec -n "${NS}" "${nim_pod}" -- python3 -c " +import urllib.request +data = urllib.request.urlopen('http://localhost:8000/v1/metrics').read().decode() +for l in data.split('\n'): + if not l or l.startswith('#') or '_bucket' in l or '_created' in l: + continue + parts = l.rsplit(' ', 1) + if len(parts) == 2 and parts[1] not in ('0', '0.0'): + # Show key inference metrics + if any(k in l for k in ['prompt_tokens', 'generation_tokens', 'time_to_first_token', + 'time_per_output_token', 'request_success', 'num_request', + 'e2e_request_latency', 'request_prompt_tokens', 'request_generation_tokens']): + print(l)" 2>&1 | head -20 >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + + # Create a ServiceMonitor so Prometheus can discover and scrape NIM metrics. + # NIM exposes metrics at /v1/metrics (not /metrics), so we need a custom path. + log_info "Creating ServiceMonitor for NIM metrics discovery..." 
+ kubectl apply -f - <<'SM_EOF' +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: nim-inference + namespace: monitoring + labels: + release: kube-prometheus +spec: + namespaceSelector: + matchNames: + - nim-workload + selector: + matchLabels: + app.kubernetes.io/managed-by: k8s-nim-operator + endpoints: + - port: api + path: /v1/metrics + interval: 15s +SM_EOF + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Prometheus Metrics Discovery + +A ServiceMonitor is created to enable Prometheus auto-discovery of NIM inference +metrics. NIM exposes metrics at `/v1/metrics` in Prometheus exposition format. +EOF + + capture "NIM ServiceMonitor" kubectl get servicemonitor nim-inference -n monitoring -o yaml + + log_info "Waiting for Prometheus to discover and scrape NIM targets (up to 3m)..." + kubectl port-forward svc/kube-prometheus-prometheus -n monitoring 9090:9090 &>/dev/null & + local pf_pid=$! + + if wait_for_port 9090 30 "${pf_pid}"; then + # Wait for NIM targets with health=up (at least one successful scrape). + # Match by namespace since the job name comes from the service name. + local target_found=false + for i in $(seq 1 18); do + if curl -sf 'http://localhost:9090/api/v1/targets?state=active' 2>/dev/null | \ + python3 -c "import sys,json; data=json.load(sys.stdin); exit(0 if any(t['labels'].get('namespace','')=='${NS}' and t.get('health')=='up' for t in data['data']['activeTargets']) else 1)" 2>/dev/null; then + target_found=true + break + fi + log_info "NIM target not yet healthy (attempt ${i}/18), retrying in 10s..." 
+ sleep 10 + done + + if [ "${target_found}" = "true" ]; then + echo "" >> "${EVIDENCE_FILE}" + echo "**Prometheus scrape targets (active)**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + curl -sf 'http://localhost:9090/api/v1/targets?state=active' 2>/dev/null | \ + python3 -c " +import sys,json +data=json.load(sys.stdin) +for t in data['data']['activeTargets']: + ns = t['labels'].get('namespace','') + if ns == '${NS}': + print(json.dumps({'job':t['labels'].get('job',''),'endpoint':t['scrapeUrl'],'health':t['health'],'lastScrape':t['lastScrape']},indent=2))" >> "${EVIDENCE_FILE}" 2>&1 + echo '```' >> "${EVIDENCE_FILE}" + + # Query NIM-specific metrics from Prometheus + local prom_response + prom_response=$(curl -sf --data-urlencode "query={__name__=~\"prompt_tokens_total|generation_tokens_total|time_to_first_token_seconds_sum|time_per_output_token_seconds_sum|e2e_request_latency_seconds_sum\",model_name=~\".*\"}" 'http://localhost:9090/api/v1/query' 2>/dev/null) + + if [ -n "${prom_response}" ] && echo "${prom_response}" | python3 -c "import sys,json; data=json.load(sys.stdin); exit(0 if data['data']['result'] else 1)" 2>/dev/null; then + echo "" >> "${EVIDENCE_FILE}" + echo "**NIM metrics queried from Prometheus**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + echo "${prom_response}" | python3 -c " +import sys,json +data=json.load(sys.stdin) +for r in data['data']['result']: + name=r['metric']['__name__'] + model=r['metric'].get('model_name','') + val=r['value'][1] + print(f'{name}{{model_name=\"{model}\"}} = {val}')" 2>&1 | head -15 >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + fi + + echo "" >> "${EVIDENCE_FILE}" + echo "**Result: PASS** — Prometheus discovers NIM inference workloads via ServiceMonitor and actively scrapes application-level AI inference metrics (token throughput, request latency, time-to-first-token) from the /v1/metrics endpoint." 
>> "${EVIDENCE_FILE}" + else + echo "" >> "${EVIDENCE_FILE}" + echo "**Result: FAIL** — Prometheus did not discover NIM targets within 3 minutes." >> "${EVIDENCE_FILE}" + fi + else + echo "" >> "${EVIDENCE_FILE}" + echo "**Result: FAIL** — Could not connect to Prometheus." >> "${EVIDENCE_FILE}" + fi + kill "${pf_pid}" 2>/dev/null || true + + # Clean up ServiceMonitor + if [ "${NO_CLEANUP}" != "true" ]; then + kubectl delete servicemonitor nim-inference -n monitoring --ignore-not-found 2>/dev/null || true + fi + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Cleanup + +**Delete workload namespace** +``` +$ kubectl delete ns nim-workload +``` +EOF + + log_info "AI service metrics (NIM) evidence collection complete." +} + # --- PyTorch training workload metrics collection --- # Deploys a PyTorch training pod that exposes training metrics (loss, throughput, # GPU memory) on :8080/metrics in Prometheus format via a ServiceMonitor. @@ -1186,8 +1405,11 @@ collect_operator() { log_info "Collecting Robust AI Operator evidence → ${EVIDENCE_FILE}" # Detect which AI operator is present and route to the appropriate collector. + # Priority: Dynamo > NIM Operator > Kubeflow Trainer if kubectl get deploy -n dynamo-system dynamo-platform-dynamo-operator-controller-manager --no-headers 2>/dev/null | grep -q .; then collect_operator_dynamo + elif kubectl get deploy -n nvidia-nim -l app.kubernetes.io/name=k8s-nim-operator --no-headers 2>/dev/null | grep -q .; then + collect_operator_nim elif kubectl get deploy -n kubeflow kubeflow-trainer-controller-manager --no-headers 2>/dev/null | grep -q .; then collect_operator_kubeflow else @@ -1310,6 +1532,130 @@ INVALID_CR log_info "Robust operator (Kubeflow Trainer) evidence collection complete."
} +# --- NIM Operator evidence --- +collect_operator_nim() { + write_section_header "Robust AI Operator (NIM Operator)" + + cat >> "${EVIDENCE_FILE}" <<'EOF' +Demonstrates CNCF AI Conformance requirement that at least one complex AI operator +with a CRD can be installed and functions reliably, including operator pods running, +webhooks operational, and custom resources reconciled. + +## Summary + +1. **NIM Operator** — Controller manager running in `nvidia-nim` +2. **Custom Resource Definitions** — NIMService, NIMCache, NIMPipeline, NIMBuild CRDs registered +3. **Admission Controller** — Validating/mutating webhooks configured and active +4. **Custom Resource Reconciled** — `NIMService` reconciled into running inference pod(s) +5. **Result: PASS** + +--- + +## NIM Operator Health +EOF + capture "NIM operator deployment" kubectl get deploy -n nvidia-nim + capture "NIM operator pods" kubectl get pods -n nvidia-nim + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Custom Resource Definitions +EOF + echo "" >> "${EVIDENCE_FILE}" + echo "**NIM CRDs**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + kubectl get crds 2>/dev/null | grep "apps\.nvidia\.com" >> "${EVIDENCE_FILE}" 2>&1 + echo '```' >> "${EVIDENCE_FILE}" + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Webhooks +EOF + echo "" >> "${EVIDENCE_FILE}" + echo "**NIM Operator webhooks**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + # Match webhooks by name or by backing service in the nvidia-nim namespace + if [[ "${HAS_JQ}" == "true" ]]; then + kubectl get validatingwebhookconfigurations,mutatingwebhookconfigurations -o json 2>/dev/null | \ + jq -r '.items[] | select(.webhooks[]?.clientConfig.service.namespace == "nvidia-nim") | "\(.kind)/\(.metadata.name)"' 2>/dev/null >> "${EVIDENCE_FILE}" 2>&1 || true + else + kubectl get validatingwebhookconfigurations,mutatingwebhookconfigurations 2>/dev/null | grep -iE 'nim|apps\.nvidia\.com' >> "${EVIDENCE_FILE}" 2>&1 || true + fi + echo '```' >> 
"${EVIDENCE_FILE}" + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Custom Resource Reconciliation + +A `NIMService` defines an inference microservice. The operator reconciles it into +a Deployment with GPU resources, a Service, and health monitoring. +EOF + capture "NIMServices" kubectl get nimservices -A + local nim_ns="nim-workload" + local nim_service="" + nim_service=$(kubectl get nimservices -n "${nim_ns}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [ -n "${nim_service}" ]; then + capture "NIMService details" kubectl get nimservice "${nim_service}" -n "${nim_ns}" -o yaml + fi + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +### Workload Pods Created by Operator +EOF + capture "NIM workload pods" kubectl get pods -n "${nim_ns}" -l app.kubernetes.io/managed-by=k8s-nim-operator -o wide + + cat >> "${EVIDENCE_FILE}" <<'EOF' + +## Webhook Rejection Test + +Submit an invalid NIMService to verify the admission controller actively +rejects malformed resources. +EOF + echo "" >> "${EVIDENCE_FILE}" + echo "**Invalid CR rejection**" >> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + local webhook_result + webhook_result=$(kubectl apply -f - 2>&1 <> "${EVIDENCE_FILE}" + echo '```' >> "${EVIDENCE_FILE}" + + echo "" >> "${EVIDENCE_FILE}" + if echo "${webhook_result}" | grep -qi "denied\|forbidden\|invalid\|error"; then + echo "Webhook correctly rejected the invalid resource." >> "${EVIDENCE_FILE}" + else + echo "WARNING: Webhook did not reject the invalid resource." 
>> "${EVIDENCE_FILE}" + kubectl delete nimservice webhook-test-invalid -n default --ignore-not-found 2>/dev/null + fi + + # Verdict + echo "" >> "${EVIDENCE_FILE}" + local crd_count + crd_count=$(kubectl get crds 2>/dev/null | grep -c "apps\.nvidia\.com" || true) + local running_pods + running_pods=$(kubectl get pods -n "${nim_ns}" -l app.kubernetes.io/managed-by=k8s-nim-operator --no-headers 2>/dev/null | grep -c "Running" || true) + local webhook_ok + webhook_ok=$(echo "${webhook_result}" | grep -ci "denied\|forbidden\|invalid\|error" || true) + + if [ "${crd_count}" -gt 0 ] && [ "${running_pods}" -gt 0 ] && [ "${webhook_ok}" -gt 0 ]; then + echo "**Result: PASS** — NIM operator running, webhooks operational (rejection verified), ${crd_count} CRDs registered, NIMService reconciled with ${running_pods} healthy inference pod(s)." >> "${EVIDENCE_FILE}" + elif [ "${crd_count}" -gt 0 ] && [ "${running_pods}" -gt 0 ]; then + echo "**Result: PASS** — NIM operator running, ${crd_count} CRDs registered, NIMService reconciled with ${running_pods} healthy inference pod(s)." >> "${EVIDENCE_FILE}" + elif [ "${crd_count}" -gt 0 ]; then + echo "**Result: FAIL** — NIMService found but no healthy inference pods." >> "${EVIDENCE_FILE}" + else + echo "**Result: FAIL** — No NIM CRDs found." >> "${EVIDENCE_FILE}" + fi + + log_info "Robust operator (NIM) evidence collection complete." +} + # --- Dynamo evidence --- collect_operator_dynamo() { write_section_header "Robust AI Operator (Dynamo Platform)"