From d922277e26cbc989e20d7cb6c4bb10dbea6a7a7c Mon Sep 17 00:00:00 2001 From: Jont828 Date: Tue, 31 Mar 2026 18:24:06 -0400 Subject: [PATCH] Add H200 accelerator support with AKS recipe overlays Register h200 as a supported accelerator type in the criteria system and add AKS-specific recipe overlays mirroring the existing H100 AKS set. The H200 (Standard_ND96isr_H200_v5) uses NVIDIA H200 Tensor Core GPUs with 141 GB HBM3e and 900 GB/s NVLink interconnect. - Add CriteriaAcceleratorH200 constant, parser case, and model matcher - Create 6 AKS overlay files (training, inference, ubuntu variants, kubeflow, dynamo) inheriting from aks-training/aks-inference bases - Add H200 UAT chainsaw tests for training and inference CUJs - Update unit tests for accelerator parsing and GPU model matching Signed-off-by: Jont828 --- pkg/recipe/criteria.go | 7 +- pkg/recipe/criteria_test.go | 4 +- pkg/recipe/snapshot.go | 2 + pkg/recipe/snapshot_test.go | 25 +++ recipes/overlays/h200-aks-inference.yaml | 44 +++++ recipes/overlays/h200-aks-training.yaml | 48 +++++ .../h200-aks-ubuntu-inference-dynamo.yaml | 92 ++++++++++ .../overlays/h200-aks-ubuntu-inference.yaml | 43 +++++ .../h200-aks-ubuntu-training-kubeflow.yaml | 54 ++++++ .../overlays/h200-aks-ubuntu-training.yaml | 72 ++++++++ .../cuj1-training-h200/assert-recipe.yaml | 24 +++ .../cuj1-training-h200/chainsaw-test.yaml | 170 ++++++++++++++++++ .../cuj2-inference-h200/assert-recipe.yaml | 24 +++ .../cuj2-inference-h200/chainsaw-test.yaml | 170 ++++++++++++++++++ 14 files changed, 776 insertions(+), 3 deletions(-) create mode 100644 recipes/overlays/h200-aks-inference.yaml create mode 100644 recipes/overlays/h200-aks-training.yaml create mode 100644 recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml create mode 100644 recipes/overlays/h200-aks-ubuntu-inference.yaml create mode 100644 recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml create mode 100644 recipes/overlays/h200-aks-ubuntu-training.yaml create mode 100644 
tests/uat/azure/tests/cuj1-training-h200/assert-recipe.yaml create mode 100644 tests/uat/azure/tests/cuj1-training-h200/chainsaw-test.yaml create mode 100644 tests/uat/azure/tests/cuj2-inference-h200/assert-recipe.yaml create mode 100644 tests/uat/azure/tests/cuj2-inference-h200/chainsaw-test.yaml diff --git a/pkg/recipe/criteria.go b/pkg/recipe/criteria.go index 00997461e..9d6e3a811 100644 --- a/pkg/recipe/criteria.go +++ b/pkg/recipe/criteria.go @@ -77,6 +77,7 @@ type CriteriaAcceleratorType string const ( CriteriaAcceleratorAny CriteriaAcceleratorType = "any" CriteriaAcceleratorH100 CriteriaAcceleratorType = "h100" + CriteriaAcceleratorH200 CriteriaAcceleratorType = "h200" CriteriaAcceleratorGB200 CriteriaAcceleratorType = "gb200" CriteriaAcceleratorB200 CriteriaAcceleratorType = "b200" CriteriaAcceleratorA100 CriteriaAcceleratorType = "a100" @@ -90,6 +91,8 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) { return CriteriaAcceleratorAny, nil case "h100": return CriteriaAcceleratorH100, nil + case "h200": + return CriteriaAcceleratorH200, nil case "gb200": return CriteriaAcceleratorGB200, nil case "b200": @@ -105,7 +108,7 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) { // GetCriteriaAcceleratorTypes returns all supported accelerator types sorted alphabetically. func GetCriteriaAcceleratorTypes() []string { - return []string{"a100", "b200", "gb200", "h100", "l40"} + return []string{"a100", "b200", "gb200", "h100", "h200", "l40"} } // CriteriaIntentType represents the workload intent. @@ -210,7 +213,7 @@ type Criteria struct { // Service is the Kubernetes service type (eks, gke, aks, oke, self-managed). Service CriteriaServiceType `json:"service,omitempty" yaml:"service,omitempty"` - // Accelerator is the GPU/accelerator type (h100, gb200, b200, a100, l40). + // Accelerator is the GPU/accelerator type (h100, h200, gb200, b200, a100, l40). 
Accelerator CriteriaAcceleratorType `json:"accelerator,omitempty" yaml:"accelerator,omitempty"` // Intent is the workload intent (training, inference). diff --git a/pkg/recipe/criteria_test.go b/pkg/recipe/criteria_test.go index 1358ec0ed..a88c967db 100644 --- a/pkg/recipe/criteria_test.go +++ b/pkg/recipe/criteria_test.go @@ -68,6 +68,8 @@ func TestParseCriteriaAcceleratorType(t *testing.T) { {"any", "any", CriteriaAcceleratorAny, false}, {"h100", "h100", CriteriaAcceleratorH100, false}, {"H100 uppercase", "H100", CriteriaAcceleratorH100, false}, + {"h200", "h200", CriteriaAcceleratorH200, false}, + {"H200 uppercase", "H200", CriteriaAcceleratorH200, false}, {"gb200", "gb200", CriteriaAcceleratorGB200, false}, {"b200", "b200", CriteriaAcceleratorB200, false}, {"a100", "a100", CriteriaAcceleratorA100, false}, @@ -687,7 +689,7 @@ func TestGetCriteriaAcceleratorTypes(t *testing.T) { types := GetCriteriaAcceleratorTypes() // Should return sorted list - expected := []string{"a100", "b200", "gb200", "h100", "l40"} + expected := []string{"a100", "b200", "gb200", "h100", "h200", "l40"} if len(types) != len(expected) { t.Errorf("GetCriteriaAcceleratorTypes() returned %d types, want %d", len(types), len(expected)) } diff --git a/pkg/recipe/snapshot.go b/pkg/recipe/snapshot.go index 9cac7c7ef..37b71c94d 100644 --- a/pkg/recipe/snapshot.go +++ b/pkg/recipe/snapshot.go @@ -105,6 +105,8 @@ func matchAccelerator(model string) CriteriaAcceleratorType { // Follow this pattern when adding future Blackwell variants (e.g., check "gb300" before "b300"). 
case strings.Contains(lower, "b200"): return CriteriaAcceleratorB200 + case strings.Contains(lower, "h200"): + return CriteriaAcceleratorH200 case strings.Contains(lower, "h100"): return CriteriaAcceleratorH100 case strings.Contains(lower, "a100"): diff --git a/pkg/recipe/snapshot_test.go b/pkg/recipe/snapshot_test.go index 3d0d1a087..47a42d23b 100644 --- a/pkg/recipe/snapshot_test.go +++ b/pkg/recipe/snapshot_test.go @@ -150,6 +150,29 @@ func TestExtractCriteriaFromSnapshot(t *testing.T) { } }, }, + { + name: "GPU H200 from model field", + snapshot: &snapshotter.Snapshot{ + Measurements: []*measurement.Measurement{ + { + Type: measurement.TypeGPU, + Subtypes: []measurement.Subtype{ + { + Name: "device", + Data: map[string]measurement.Reading{ + "model": measurement.Str("NVIDIA H200 141GB HBM3e"), + }, + }, + }, + }, + }, + }, + validate: func(t *testing.T, c *Criteria) { + if c.Accelerator != CriteriaAcceleratorH200 { + t.Errorf("Accelerator = %v, want %v", c.Accelerator, CriteriaAcceleratorH200) + } + }, + }, { name: "GPU H100 from model field", snapshot: &snapshotter.Snapshot{ @@ -405,6 +428,8 @@ func TestMatchAccelerator(t *testing.T) { }{ {"H100 uppercase", "NVIDIA H100 80GB HBM3", CriteriaAcceleratorH100}, {"H100 lowercase", "h100-sxm", CriteriaAcceleratorH100}, + {"H200 uppercase", "NVIDIA H200 141GB HBM3e", CriteriaAcceleratorH200}, + {"H200 lowercase", "h200-sxm", CriteriaAcceleratorH200}, {"A100", "A100-SXM4-80GB", CriteriaAcceleratorA100}, {"GB200", "NVIDIA GB200", CriteriaAcceleratorGB200}, {"B200", "NVIDIA-B200", CriteriaAcceleratorB200}, diff --git a/recipes/overlays/h200-aks-inference.yaml b/recipes/overlays/h200-aks-inference.yaml new file mode 100644 index 000000000..959b43e82 --- /dev/null +++ b/recipes/overlays/h200-aks-inference.yaml @@ -0,0 +1,44 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: h200-aks-inference + +spec: + # Inherits from aks-inference recipe (AKS + inference settings) + base: aks-inference + + criteria: + service: aks + accelerator: h200 + intent: inference + + # Specific constraints for H200 on AKS inference workloads + # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key} + constraints: + - name: K8s.server.version + value: ">= 1.32.4" + + # Skyhook customizations omitted — Skyhook packages do not support + # service: aks. The skyhook-operator itself is inherited from base + # and still deploys. This follows the same pattern as Kind overlays + # (h100-kind-inference.yaml) which also omit Skyhook tuning. + componentRefs: + - name: gpu-operator + type: Helm + dependencyRefs: + - cert-manager + - kube-prometheus-stack diff --git a/recipes/overlays/h200-aks-training.yaml b/recipes/overlays/h200-aks-training.yaml new file mode 100644 index 000000000..8a8756f71 --- /dev/null +++ b/recipes/overlays/h200-aks-training.yaml @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: h200-aks-training + +spec: + # Inherits from aks-training recipe (AKS + training settings) + base: aks-training + + criteria: + service: aks + accelerator: h200 + intent: training + + # Specific constraints for H200 on AKS training workloads + # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key} + constraints: + - name: K8s.server.version + value: ">= 1.32.4" + + # Skyhook customizations omitted — Skyhook packages do not support + # service: aks. The skyhook-operator itself is inherited from base + # and still deploys. This follows the same pattern as Kind overlays + # (h100-kind-training.yaml) which also omit Skyhook tuning. + componentRefs: + # H200-specific GPU Operator overrides (inherits valuesFile from aks-training) + - name: gpu-operator + type: Helm + dependencyRefs: + - cert-manager + - kube-prometheus-stack + overrides: + gdrcopy: + enabled: true diff --git a/recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml b/recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml new file mode 100644 index 000000000..d7783bef9 --- /dev/null +++ b/recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml @@ -0,0 +1,92 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: h200-aks-ubuntu-inference-dynamo + +spec: + # Inherits from h200-aks-ubuntu-inference (H200 + Ubuntu inference settings) + # Adds Dynamo inference platform components. + base: h200-aks-ubuntu-inference + + criteria: + service: aks + accelerator: h200 + os: ubuntu + intent: inference + platform: dynamo + + # DRA requires Kubernetes 1.34+ (GA) + constraints: + - name: K8s.server.version + value: ">= 1.34" + + componentRefs: + - name: nvidia-dra-driver-gpu + type: Helm + overrides: + gpuResourcesEnabledOverride: true + + - name: dynamo-crds + type: Helm + source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo + version: "0.9.0" + valuesFile: components/dynamo-crds/values.yaml + + - name: dynamo-platform + type: Helm + source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo + version: "0.9.0" + valuesFile: components/dynamo-platform/values.yaml + dependencyRefs: + - dynamo-crds + - cert-manager + - kube-prometheus-stack + - kai-scheduler + overrides: + etcd: + persistence: + storageClass: managed-csi + nats: + config: + jetstream: + fileStore: + pvc: + storageClassName: managed-csi + + validation: + deployment: + checks: + - operator-health + - expected-resources + - gpu-operator-version + - check-nvidia-smi + constraints: + - name: Deployment.gpu-operator.version + value: ">= v24.6.0" + conformance: + checks: + - platform-health + - gpu-operator-health + - dra-support + - accelerator-metrics + - ai-service-metrics + - inference-gateway + - gang-scheduling + - pod-autoscaling + - 
cluster-autoscaling + - robust-controller + - secure-accelerator-access diff --git a/recipes/overlays/h200-aks-ubuntu-inference.yaml b/recipes/overlays/h200-aks-ubuntu-inference.yaml new file mode 100644 index 000000000..d6a08dd94 --- /dev/null +++ b/recipes/overlays/h200-aks-ubuntu-inference.yaml @@ -0,0 +1,43 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: h200-aks-ubuntu-inference + +spec: + # Inherits from h200-aks-inference recipe (H200 + AKS + inference settings) + # This overlay adds Ubuntu-specific configurations + base: h200-aks-inference + + criteria: + service: aks + accelerator: h200 + os: ubuntu + intent: inference + + # H200 + Ubuntu specific constraints for inference workloads + # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key} + constraints: + - name: K8s.server.version + value: ">= 1.32.4" + - name: OS.release.ID + value: ubuntu + - name: OS.release.VERSION_ID + value: "24.04" + - name: OS.sysctl./proc/sys/kernel/osrelease + value: ">= 6.8" + + componentRefs: [] diff --git a/recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml b/recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml new file mode 100644 index 000000000..833f38a0d --- /dev/null +++ b/recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml @@ -0,0 +1,54 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: h200-aks-ubuntu-training-kubeflow + +spec: + # Inherits from h200-aks-ubuntu-training recipe (H200 + AKS + Ubuntu + training settings) + # This overlay adds Kubeflow Training Operator for distributed training with TrainJob + base: h200-aks-ubuntu-training + + criteria: + service: aks + accelerator: h200 + os: ubuntu + intent: training + platform: kubeflow + + # Constraints for H200 on AKS with Ubuntu for Kubeflow training workloads + # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key} + constraints: + - name: K8s.server.version + value: ">= 1.32.4" + - name: OS.release.ID + value: ubuntu + - name: OS.release.VERSION_ID + value: "24.04" + - name: OS.sysctl./proc/sys/kernel/osrelease + value: ">= 6.8" + + # Kubeflow Training Operator for TrainJob support + componentRefs: + - name: kubeflow-trainer + type: Helm + valuesFile: components/kubeflow-trainer/values.yaml + manifestFiles: + - components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml + dependencyRefs: + - cert-manager + - kube-prometheus-stack + - gpu-operator diff --git a/recipes/overlays/h200-aks-ubuntu-training.yaml b/recipes/overlays/h200-aks-ubuntu-training.yaml new file mode 100644 index 000000000..f732e8a49 --- /dev/null +++ b/recipes/overlays/h200-aks-ubuntu-training.yaml @@ -0,0 +1,72 @@ +# Copyright 
(c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: h200-aks-ubuntu-training + +spec: + # Inherits from h200-aks-training recipe (H200 + AKS + training settings) + # This overlay adds Ubuntu-specific configurations + base: h200-aks-training + + criteria: + service: aks + accelerator: h200 + os: ubuntu + intent: training + + # Constraints for H200 on AKS with Ubuntu for training workloads + # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key} + constraints: + - name: K8s.server.version + value: ">= 1.32.4" + - name: OS.release.ID + value: ubuntu + - name: OS.release.VERSION_ID + value: "24.04" + - name: OS.sysctl./proc/sys/kernel/osrelease + value: ">= 6.8" + + componentRefs: [] + + validation: + deployment: + checks: + - operator-health + - expected-resources + - gpu-operator-version + - check-nvidia-smi + constraints: + - name: Deployment.gpu-operator.version + value: ">= v24.6.0" + performance: + checks: + - nccl-all-reduce-bw + constraints: + - name: nccl-all-reduce-bw + value: ">= 100" + conformance: + checks: + - platform-health + - gpu-operator-health + - dra-support + - accelerator-metrics + - ai-service-metrics + - gang-scheduling + - pod-autoscaling + - cluster-autoscaling + - robust-controller + - secure-accelerator-access diff --git a/tests/uat/azure/tests/cuj1-training-h200/assert-recipe.yaml 
b/tests/uat/azure/tests/cuj1-training-h200/assert-recipe.yaml new file mode 100644 index 000000000..148785813 --- /dev/null +++ b/tests/uat/azure/tests/cuj1-training-h200/assert-recipe.yaml @@ -0,0 +1,24 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert the CUJ1 H200 recipe has the expected structure. +kind: RecipeResult +apiVersion: aicr.nvidia.com/v1alpha1 +criteria: + service: aks + accelerator: h200 + intent: training + os: ubuntu + platform: kubeflow +(length(componentRefs) > `0`): true diff --git a/tests/uat/azure/tests/cuj1-training-h200/chainsaw-test.yaml b/tests/uat/azure/tests/cuj1-training-h200/chainsaw-test.yaml new file mode 100644 index 000000000..f5a7deb99 --- /dev/null +++ b/tests/uat/azure/tests/cuj1-training-h200/chainsaw-test.yaml @@ -0,0 +1,170 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: uat-cuj1-training-aks-h200 +spec: + description: | + UAT CUJ1: Training workload on live AKS cluster with H200 GPU nodes. + Tests the aicr workflow against a real cluster: + Step 1: Snapshot the live cluster + Step 2: Generate recipe (AKS/H200/training/kubeflow) + Step 3: Validate deployment against live snapshot + Step 4: Generate bundle with node scheduling + Step 5: Validate bundle structure + Step 6: Multi-phase validation + timeouts: + exec: 300s + steps: + + # ── Step 1: Snapshot the live cluster ────────────────────────────── + - name: snapshot-cluster + description: Capture live cluster state for validation. + try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj1-training-aks-h200" + rm -rf "${WORK}" && mkdir -p "${WORK}" + ${AICR_BIN} snapshot --output "${WORK}/snapshot.yaml" + test -f "${WORK}/snapshot.yaml" + + # ── Step 2: Generate recipe ──────────────────────────────────────── + - name: generate-recipe + description: Generate an AKS H200 training recipe with kubeflow platform. + try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj1-training-aks-h200" + ${AICR_BIN} recipe \ + --service aks \ + --accelerator h200 \ + --intent training \ + --os ubuntu \ + --platform kubeflow \ + --output "${WORK}/recipe.yaml" + test -f "${WORK}/recipe.yaml" + + - name: assert-recipe + description: Verify recipe has correct criteria and components. 
+ try: + - script: + content: | + set -eu + WORK="/tmp/uat-cuj1-training-aks-h200" + chainsaw assert \ + --resource "${WORK}/recipe.yaml" \ + --file ./assert-recipe.yaml \ + --no-color --timeout 10s + + # ── Step 3: Validate deployment against live snapshot ─────────────── + - name: validate-deployment + description: Run deployment validation with live cluster snapshot. + try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj1-training-aks-h200" + ${AICR_BIN} validate \ + --recipe "${WORK}/recipe.yaml" \ + --snapshot "${WORK}/snapshot.yaml" \ + --phase deployment \ + --no-cluster \ + --output "${WORK}/validate-deployment.json" || true + test -f "${WORK}/validate-deployment.json" + grep -q 'reportformat: CTRF' "${WORK}/validate-deployment.json" || grep -q '\"reportFormat\"' "${WORK}/validate-deployment.json" + + - name: assert-validate-deployment + description: Verify deployment validation produces CTRF output. + try: + - script: + content: | + set -eu + WORK="/tmp/uat-cuj1-training-aks-h200" + test -f "${WORK}/validate-deployment.json" + grep -q 'reportformat: CTRF' "${WORK}/validate-deployment.json" || grep -q '\"reportFormat\"' "${WORK}/validate-deployment.json" + check: + ($error == null): true + + # ── Step 4: Generate bundle with node scheduling ─────────────────── + - name: generate-bundle + description: Generate bundle with system and GPU node scheduling. + try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj1-training-aks-h200" + ${AICR_BIN} bundle \ + --recipe "${WORK}/recipe.yaml" \ + --output "${WORK}/bundle" \ + --system-node-selector nodeGroup=system-pool \ + --accelerated-node-selector nodeGroup=gpu-worker \ + --accelerated-node-toleration nvidia.com/gpu=present:NoSchedule + + - name: assert-bundle-structure + description: Verify bundle contains expected files. 
+ try: + - script: + content: | + set -eu + WORK="/tmp/uat-cuj1-training-aks-h200" + test -f "${WORK}/bundle/README.md" + test -f "${WORK}/bundle/deploy.sh" + test -x "${WORK}/bundle/deploy.sh" + test -f "${WORK}/bundle/recipe.yaml" + ls "${WORK}"/bundle/*/values.yaml >/dev/null 2>&1 + check: + ($error == null): true + + # ── Step 6: Multi-phase validation ───────────────────────────────── + - name: validate-multiphase + description: Run all three validation phases. + try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj1-training-aks-h200" + ${AICR_BIN} validate \ + --recipe "${WORK}/recipe.yaml" \ + --snapshot "${WORK}/snapshot.yaml" \ + --phase deployment \ + --phase performance \ + --phase conformance \ + --no-cluster \ + --output "${WORK}/validate-multiphase.json" || true + + - name: assert-validate-multiphase + description: Verify multiphase validation produces CTRF output. + try: + - script: + content: | + set -eu + WORK="/tmp/uat-cuj1-training-aks-h200" + test -f "${WORK}/validate-multiphase.json" + grep -q 'reportformat: CTRF' "${WORK}/validate-multiphase.json" || grep -q '\"reportFormat\"' "${WORK}/validate-multiphase.json" + check: + ($error == null): true + cleanup: + - script: + content: | + rm -rf /tmp/uat-cuj1-training-aks-h200 diff --git a/tests/uat/azure/tests/cuj2-inference-h200/assert-recipe.yaml b/tests/uat/azure/tests/cuj2-inference-h200/assert-recipe.yaml new file mode 100644 index 000000000..4d2ead710 --- /dev/null +++ b/tests/uat/azure/tests/cuj2-inference-h200/assert-recipe.yaml @@ -0,0 +1,24 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert the CUJ2 H200 recipe has the expected structure. +kind: RecipeResult +apiVersion: aicr.nvidia.com/v1alpha1 +criteria: + service: aks + accelerator: h200 + intent: inference + os: ubuntu + platform: dynamo +(length(componentRefs) > `0`): true diff --git a/tests/uat/azure/tests/cuj2-inference-h200/chainsaw-test.yaml b/tests/uat/azure/tests/cuj2-inference-h200/chainsaw-test.yaml new file mode 100644 index 000000000..d8cac3e47 --- /dev/null +++ b/tests/uat/azure/tests/cuj2-inference-h200/chainsaw-test.yaml @@ -0,0 +1,170 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: uat-cuj2-inference-aks-h200 +spec: + description: | + UAT CUJ2: Inference workload on live AKS cluster with H200 GPU nodes. 
+ Tests the aicr workflow against a real cluster: + Step 1: Snapshot the live cluster + Step 2: Generate recipe (AKS/H200/inference/dynamo) + Step 3: Validate deployment against live snapshot + Step 4: Generate bundle with node scheduling + Step 5: Validate bundle structure + Step 6: Multi-phase validation + timeouts: + exec: 300s + steps: + + # ── Step 1: Snapshot the live cluster ────────────────────────────── + - name: snapshot-cluster + description: Capture live cluster state for validation. + try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj2-inference-aks-h200" + rm -rf "${WORK}" && mkdir -p "${WORK}" + ${AICR_BIN} snapshot --output "${WORK}/snapshot.yaml" + test -f "${WORK}/snapshot.yaml" + + # ── Step 2: Generate recipe ──────────────────────────────────────── + - name: generate-recipe + description: Generate an AKS H200 inference recipe with dynamo platform. + try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj2-inference-aks-h200" + ${AICR_BIN} recipe \ + --service aks \ + --accelerator h200 \ + --intent inference \ + --os ubuntu \ + --platform dynamo \ + --output "${WORK}/recipe.yaml" + test -f "${WORK}/recipe.yaml" + + - name: assert-recipe + description: Verify recipe has correct criteria and components. + try: + - script: + content: | + set -eu + WORK="/tmp/uat-cuj2-inference-aks-h200" + chainsaw assert \ + --resource "${WORK}/recipe.yaml" \ + --file ./assert-recipe.yaml \ + --no-color --timeout 10s + + # ── Step 3: Validate deployment against live snapshot ─────────────── + - name: validate-deployment + description: Run deployment validation with live cluster snapshot. 
+ try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj2-inference-aks-h200" + ${AICR_BIN} validate \ + --recipe "${WORK}/recipe.yaml" \ + --snapshot "${WORK}/snapshot.yaml" \ + --phase deployment \ + --no-cluster \ + --output "${WORK}/validate-deployment.json" || true + test -f "${WORK}/validate-deployment.json" + grep -q 'reportformat: CTRF' "${WORK}/validate-deployment.json" || grep -q '\"reportFormat\"' "${WORK}/validate-deployment.json" + + - name: assert-validate-deployment + description: Verify deployment validation produces CTRF output. + try: + - script: + content: | + set -eu + WORK="/tmp/uat-cuj2-inference-aks-h200" + test -f "${WORK}/validate-deployment.json" + grep -q 'reportformat: CTRF' "${WORK}/validate-deployment.json" || grep -q '\"reportFormat\"' "${WORK}/validate-deployment.json" + check: + ($error == null): true + + # ── Step 4: Generate bundle with node scheduling ─────────────────── + - name: generate-bundle + description: Generate bundle with system and GPU node scheduling. + try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj2-inference-aks-h200" + ${AICR_BIN} bundle \ + --recipe "${WORK}/recipe.yaml" \ + --output "${WORK}/bundle" \ + --system-node-selector nodeGroup=system-pool \ + --accelerated-node-selector nodeGroup=gpu-worker \ + --accelerated-node-toleration nvidia.com/gpu=present:NoSchedule + + - name: assert-bundle-structure + description: Verify bundle contains expected files. 
+ try: + - script: + content: | + set -eu + WORK="/tmp/uat-cuj2-inference-aks-h200" + test -f "${WORK}/bundle/README.md" + test -f "${WORK}/bundle/deploy.sh" + test -x "${WORK}/bundle/deploy.sh" + test -f "${WORK}/bundle/recipe.yaml" + ls "${WORK}"/bundle/*/values.yaml >/dev/null 2>&1 + check: + ($error == null): true + + # ── Step 6: Multi-phase validation ───────────────────────────────── + - name: validate-multiphase + description: Run all three validation phases. + try: + - script: + content: | + set -eu + AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" + WORK="/tmp/uat-cuj2-inference-aks-h200" + ${AICR_BIN} validate \ + --recipe "${WORK}/recipe.yaml" \ + --snapshot "${WORK}/snapshot.yaml" \ + --phase deployment \ + --phase performance \ + --phase conformance \ + --no-cluster \ + --output "${WORK}/validate-multiphase.json" || true + + - name: assert-validate-multiphase + description: Verify multiphase validation produces CTRF output. + try: + - script: + content: | + set -eu + WORK="/tmp/uat-cuj2-inference-aks-h200" + test -f "${WORK}/validate-multiphase.json" + grep -q 'reportformat: CTRF' "${WORK}/validate-multiphase.json" || grep -q '\"reportFormat\"' "${WORK}/validate-multiphase.json" + check: + ($error == null): true + cleanup: + - script: + content: | + rm -rf /tmp/uat-cuj2-inference-aks-h200