From d922277e26cbc989e20d7cb6c4bb10dbea6a7a7c Mon Sep 17 00:00:00 2001
From: Jont828 <jt572@cornell.edu>
Date: Tue, 31 Mar 2026 18:24:06 -0400
Subject: [PATCH] Add H200 accelerator support with AKS recipe overlays

Register h200 as a supported accelerator type in the criteria system
and add AKS-specific recipe overlays mirroring the existing H100 AKS
set. The Azure H200 VM size (Standard_ND96isr_H200_v5) uses NVIDIA H200
Tensor Core GPUs with 141 GB HBM3e and 900 GB/s NVLink interconnect.

- Add CriteriaAcceleratorH200 constant, parser case, and model matcher
- Create 6 AKS overlay files (training, inference, ubuntu variants,
  kubeflow, dynamo) inheriting from aks-training/aks-inference bases
- Add H200 UAT chainsaw tests for training and inference CUJs
- Update unit tests for accelerator parsing and GPU model matching

Signed-off-by: Jont828 <jt572@cornell.edu>
---
 pkg/recipe/criteria.go                        |   7 +-
 pkg/recipe/criteria_test.go                   |   4 +-
 pkg/recipe/snapshot.go                        |   2 +
 pkg/recipe/snapshot_test.go                   |  25 +++
 recipes/overlays/h200-aks-inference.yaml      |  44 +++++
 recipes/overlays/h200-aks-training.yaml       |  48 +++++
 .../h200-aks-ubuntu-inference-dynamo.yaml     |  92 ++++++++++
 .../overlays/h200-aks-ubuntu-inference.yaml   |  43 +++++
 .../h200-aks-ubuntu-training-kubeflow.yaml    |  54 ++++++
 .../overlays/h200-aks-ubuntu-training.yaml    |  72 ++++++++
 .../cuj1-training-h200/assert-recipe.yaml     |  24 +++
 .../cuj1-training-h200/chainsaw-test.yaml     | 170 ++++++++++++++++++
 .../cuj2-inference-h200/assert-recipe.yaml    |  24 +++
 .../cuj2-inference-h200/chainsaw-test.yaml    | 170 ++++++++++++++++++
 14 files changed, 776 insertions(+), 3 deletions(-)
 create mode 100644 recipes/overlays/h200-aks-inference.yaml
 create mode 100644 recipes/overlays/h200-aks-training.yaml
 create mode 100644 recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml
 create mode 100644 recipes/overlays/h200-aks-ubuntu-inference.yaml
 create mode 100644 recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml
 create mode 100644 recipes/overlays/h200-aks-ubuntu-training.yaml
 create mode 100644 tests/uat/azure/tests/cuj1-training-h200/assert-recipe.yaml
 create mode 100644 tests/uat/azure/tests/cuj1-training-h200/chainsaw-test.yaml
 create mode 100644 tests/uat/azure/tests/cuj2-inference-h200/assert-recipe.yaml
 create mode 100644 tests/uat/azure/tests/cuj2-inference-h200/chainsaw-test.yaml

diff --git a/pkg/recipe/criteria.go b/pkg/recipe/criteria.go
index 00997461e..9d6e3a811 100644
--- a/pkg/recipe/criteria.go
+++ b/pkg/recipe/criteria.go
@@ -77,6 +77,7 @@ type CriteriaAcceleratorType string
 const (
 	CriteriaAcceleratorAny   CriteriaAcceleratorType = "any"
 	CriteriaAcceleratorH100  CriteriaAcceleratorType = "h100"
+	CriteriaAcceleratorH200  CriteriaAcceleratorType = "h200"
 	CriteriaAcceleratorGB200 CriteriaAcceleratorType = "gb200"
 	CriteriaAcceleratorB200  CriteriaAcceleratorType = "b200"
 	CriteriaAcceleratorA100  CriteriaAcceleratorType = "a100"
@@ -90,6 +91,8 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) {
 		return CriteriaAcceleratorAny, nil
 	case "h100":
 		return CriteriaAcceleratorH100, nil
+	case "h200":
+		return CriteriaAcceleratorH200, nil
 	case "gb200":
 		return CriteriaAcceleratorGB200, nil
 	case "b200":
@@ -105,7 +108,7 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) {
 
 // GetCriteriaAcceleratorTypes returns all supported accelerator types sorted alphabetically.
 func GetCriteriaAcceleratorTypes() []string {
-	return []string{"a100", "b200", "gb200", "h100", "l40"}
+	return []string{"a100", "b200", "gb200", "h100", "h200", "l40"}
 }
 
 // CriteriaIntentType represents the workload intent.
@@ -210,7 +213,7 @@ type Criteria struct {
 	// Service is the Kubernetes service type (eks, gke, aks, oke, self-managed).
 	Service CriteriaServiceType `json:"service,omitempty" yaml:"service,omitempty"`
 
-	// Accelerator is the GPU/accelerator type (h100, gb200, b200, a100, l40).
+	// Accelerator is the GPU/accelerator type (h100, h200, gb200, b200, a100, l40).
 	Accelerator CriteriaAcceleratorType `json:"accelerator,omitempty" yaml:"accelerator,omitempty"`
 
 	// Intent is the workload intent (training, inference).
diff --git a/pkg/recipe/criteria_test.go b/pkg/recipe/criteria_test.go
index 1358ec0ed..a88c967db 100644
--- a/pkg/recipe/criteria_test.go
+++ b/pkg/recipe/criteria_test.go
@@ -68,6 +68,8 @@ func TestParseCriteriaAcceleratorType(t *testing.T) {
 		{"any", "any", CriteriaAcceleratorAny, false},
 		{"h100", "h100", CriteriaAcceleratorH100, false},
 		{"H100 uppercase", "H100", CriteriaAcceleratorH100, false},
+		{"h200", "h200", CriteriaAcceleratorH200, false},
+		{"H200 uppercase", "H200", CriteriaAcceleratorH200, false},
 		{"gb200", "gb200", CriteriaAcceleratorGB200, false},
 		{"b200", "b200", CriteriaAcceleratorB200, false},
 		{"a100", "a100", CriteriaAcceleratorA100, false},
@@ -687,7 +689,7 @@ func TestGetCriteriaAcceleratorTypes(t *testing.T) {
 	types := GetCriteriaAcceleratorTypes()
 
 	// Should return sorted list
-	expected := []string{"a100", "b200", "gb200", "h100", "l40"}
+	expected := []string{"a100", "b200", "gb200", "h100", "h200", "l40"}
 	if len(types) != len(expected) {
 		t.Errorf("GetCriteriaAcceleratorTypes() returned %d types, want %d", len(types), len(expected))
 	}
diff --git a/pkg/recipe/snapshot.go b/pkg/recipe/snapshot.go
index 9cac7c7ef..37b71c94d 100644
--- a/pkg/recipe/snapshot.go
+++ b/pkg/recipe/snapshot.go
@@ -105,6 +105,8 @@ func matchAccelerator(model string) CriteriaAcceleratorType {
 	// Follow this pattern when adding future Blackwell variants (e.g., check "gb300" before "b300").
 	case strings.Contains(lower, "b200"):
 		return CriteriaAcceleratorB200
+	case strings.Contains(lower, "h200"):
+		return CriteriaAcceleratorH200
 	case strings.Contains(lower, "h100"):
 		return CriteriaAcceleratorH100
 	case strings.Contains(lower, "a100"):
diff --git a/pkg/recipe/snapshot_test.go b/pkg/recipe/snapshot_test.go
index 3d0d1a087..47a42d23b 100644
--- a/pkg/recipe/snapshot_test.go
+++ b/pkg/recipe/snapshot_test.go
@@ -150,6 +150,29 @@ func TestExtractCriteriaFromSnapshot(t *testing.T) {
 				}
 			},
 		},
+		{
+			name: "GPU H200 from model field",
+			snapshot: &snapshotter.Snapshot{
+				Measurements: []*measurement.Measurement{
+					{
+						Type: measurement.TypeGPU,
+						Subtypes: []measurement.Subtype{
+							{
+								Name: "device",
+								Data: map[string]measurement.Reading{
+									"model": measurement.Str("NVIDIA H200 141GB HBM3e"),
+								},
+							},
+						},
+					},
+				},
+			},
+			validate: func(t *testing.T, c *Criteria) {
+				if c.Accelerator != CriteriaAcceleratorH200 {
+					t.Errorf("Accelerator = %v, want %v", c.Accelerator, CriteriaAcceleratorH200)
+				}
+			},
+		},
 		{
 			name: "GPU H100 from model field",
 			snapshot: &snapshotter.Snapshot{
@@ -405,6 +428,8 @@ func TestMatchAccelerator(t *testing.T) {
 	}{
 		{"H100 uppercase", "NVIDIA H100 80GB HBM3", CriteriaAcceleratorH100},
 		{"H100 lowercase", "h100-sxm", CriteriaAcceleratorH100},
+		{"H200 uppercase", "NVIDIA H200 141GB HBM3e", CriteriaAcceleratorH200},
+		{"H200 lowercase", "h200-sxm", CriteriaAcceleratorH200},
 		{"A100", "A100-SXM4-80GB", CriteriaAcceleratorA100},
 		{"GB200", "NVIDIA GB200", CriteriaAcceleratorGB200},
 		{"B200", "NVIDIA-B200", CriteriaAcceleratorB200},
diff --git a/recipes/overlays/h200-aks-inference.yaml b/recipes/overlays/h200-aks-inference.yaml
new file mode 100644
index 000000000..959b43e82
--- /dev/null
+++ b/recipes/overlays/h200-aks-inference.yaml
@@ -0,0 +1,44 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-inference
+
+spec:
+  # Inherits from aks-inference recipe (AKS + inference settings)
+  base: aks-inference
+
+  criteria:
+    service: aks
+    accelerator: h200
+    intent: inference
+
+  # Specific constraints for H200 on AKS inference workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+
+  # Skyhook customizations omitted — Skyhook packages do not support
+  # service: aks. The skyhook-operator itself is inherited from base
+  # and still deploys. This follows the same pattern as Kind overlays
+  # (h100-kind-inference.yaml) which also omit Skyhook tuning.
+  componentRefs:
+    - name: gpu-operator
+      type: Helm
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
diff --git a/recipes/overlays/h200-aks-training.yaml b/recipes/overlays/h200-aks-training.yaml
new file mode 100644
index 000000000..8a8756f71
--- /dev/null
+++ b/recipes/overlays/h200-aks-training.yaml
@@ -0,0 +1,48 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-training
+
+spec:
+  # Inherits from aks-training recipe (AKS + training settings)
+  base: aks-training
+
+  criteria:
+    service: aks
+    accelerator: h200
+    intent: training
+
+  # Specific constraints for H200 on AKS training workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+
+  # Skyhook customizations omitted — Skyhook packages do not support
+  # service: aks. The skyhook-operator itself is inherited from base
+  # and still deploys. This follows the same pattern as Kind overlays
+  # (h100-kind-training.yaml) which also omit Skyhook tuning.
+  componentRefs:
+    # H200-specific GPU Operator overrides (inherits valuesFile from aks-training)
+    - name: gpu-operator
+      type: Helm
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
+      overrides:
+        gdrcopy:
+          enabled: true
diff --git a/recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml b/recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml
new file mode 100644
index 000000000..d7783bef9
--- /dev/null
+++ b/recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml
@@ -0,0 +1,92 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-inference-dynamo
+
+spec:
+  # Inherits from h200-aks-ubuntu-inference (H200 + Ubuntu inference settings)
+  # Adds Dynamo inference platform components.
+  base: h200-aks-ubuntu-inference
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: inference
+    platform: dynamo
+
+  # DRA requires Kubernetes 1.34+ (GA)
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.34"
+
+  componentRefs:
+    - name: nvidia-dra-driver-gpu
+      type: Helm
+      overrides:
+        gpuResourcesEnabledOverride: true
+
+    - name: dynamo-crds
+      type: Helm
+      source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
+      version: "0.9.0"
+      valuesFile: components/dynamo-crds/values.yaml
+
+    - name: dynamo-platform
+      type: Helm
+      source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
+      version: "0.9.0"
+      valuesFile: components/dynamo-platform/values.yaml
+      dependencyRefs:
+        - dynamo-crds
+        - cert-manager
+        - kube-prometheus-stack
+        - kai-scheduler
+      overrides:
+        etcd:
+          persistence:
+            storageClass: managed-csi
+        nats:
+          config:
+            jetstream:
+              fileStore:
+                pvc:
+                  storageClassName: managed-csi
+
+  validation:
+    deployment:
+      checks:
+        - operator-health
+        - expected-resources
+        - gpu-operator-version
+        - check-nvidia-smi
+      constraints:
+        - name: Deployment.gpu-operator.version
+          value: ">= v24.6.0"
+    conformance:
+      checks:
+        - platform-health
+        - gpu-operator-health
+        - dra-support
+        - accelerator-metrics
+        - ai-service-metrics
+        - inference-gateway
+        - gang-scheduling
+        - pod-autoscaling
+        - cluster-autoscaling
+        - robust-controller
+        - secure-accelerator-access
diff --git a/recipes/overlays/h200-aks-ubuntu-inference.yaml b/recipes/overlays/h200-aks-ubuntu-inference.yaml
new file mode 100644
index 000000000..d6a08dd94
--- /dev/null
+++ b/recipes/overlays/h200-aks-ubuntu-inference.yaml
@@ -0,0 +1,43 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-inference
+
+spec:
+  # Inherits from h200-aks-inference recipe (H200 + AKS + inference settings)
+  # This overlay adds Ubuntu-specific configurations
+  base: h200-aks-inference
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: inference
+
+  # H200 + Ubuntu specific constraints for inference workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+    - name: OS.release.ID
+      value: ubuntu
+    - name: OS.release.VERSION_ID
+      value: "24.04"
+    - name: OS.sysctl./proc/sys/kernel/osrelease
+      value: ">= 6.8"
+
+  componentRefs: []
diff --git a/recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml b/recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml
new file mode 100644
index 000000000..833f38a0d
--- /dev/null
+++ b/recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml
@@ -0,0 +1,54 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-training-kubeflow
+
+spec:
+  # Inherits from h200-aks-ubuntu-training recipe (H200 + AKS + Ubuntu + training settings)
+  # This overlay adds Kubeflow Training Operator for distributed training with TrainJob
+  base: h200-aks-ubuntu-training
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: training
+    platform: kubeflow
+
+  # Constraints for H200 on AKS with Ubuntu for Kubeflow training workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+    - name: OS.release.ID
+      value: ubuntu
+    - name: OS.release.VERSION_ID
+      value: "24.04"
+    - name: OS.sysctl./proc/sys/kernel/osrelease
+      value: ">= 6.8"
+
+  # Kubeflow Training Operator for TrainJob support
+  componentRefs:
+    - name: kubeflow-trainer
+      type: Helm
+      valuesFile: components/kubeflow-trainer/values.yaml
+      manifestFiles:
+        - components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
+        - gpu-operator
diff --git a/recipes/overlays/h200-aks-ubuntu-training.yaml b/recipes/overlays/h200-aks-ubuntu-training.yaml
new file mode 100644
index 000000000..f732e8a49
--- /dev/null
+++ b/recipes/overlays/h200-aks-ubuntu-training.yaml
@@ -0,0 +1,72 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-training
+
+spec:
+  # Inherits from h200-aks-training recipe (H200 + AKS + training settings)
+  # This overlay adds Ubuntu-specific configurations
+  base: h200-aks-training
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: training
+
+  # Constraints for H200 on AKS with Ubuntu for training workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+    - name: OS.release.ID
+      value: ubuntu
+    - name: OS.release.VERSION_ID
+      value: "24.04"
+    - name: OS.sysctl./proc/sys/kernel/osrelease
+      value: ">= 6.8"
+
+  componentRefs: []
+
+  validation:
+    deployment:
+      checks:
+        - operator-health
+        - expected-resources
+        - gpu-operator-version
+        - check-nvidia-smi
+      constraints:
+        - name: Deployment.gpu-operator.version
+          value: ">= v24.6.0"
+    performance:
+      checks:
+        - nccl-all-reduce-bw
+      constraints:
+        - name: nccl-all-reduce-bw
+          value: ">= 100"
+    conformance:
+      checks:
+        - platform-health
+        - gpu-operator-health
+        - dra-support
+        - accelerator-metrics
+        - ai-service-metrics
+        - gang-scheduling
+        - pod-autoscaling
+        - cluster-autoscaling
+        - robust-controller
+        - secure-accelerator-access
diff --git a/tests/uat/azure/tests/cuj1-training-h200/assert-recipe.yaml b/tests/uat/azure/tests/cuj1-training-h200/assert-recipe.yaml
new file mode 100644
index 000000000..148785813
--- /dev/null
+++ b/tests/uat/azure/tests/cuj1-training-h200/assert-recipe.yaml
@@ -0,0 +1,24 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Assert the CUJ1 H200 recipe has the expected structure.
+kind: RecipeResult
+apiVersion: aicr.nvidia.com/v1alpha1
+criteria:
+  service: aks
+  accelerator: h200
+  intent: training
+  os: ubuntu
+  platform: kubeflow
+(length(componentRefs) > `0`): true
diff --git a/tests/uat/azure/tests/cuj1-training-h200/chainsaw-test.yaml b/tests/uat/azure/tests/cuj1-training-h200/chainsaw-test.yaml
new file mode 100644
index 000000000..f5a7deb99
--- /dev/null
+++ b/tests/uat/azure/tests/cuj1-training-h200/chainsaw-test.yaml
@@ -0,0 +1,170 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+  name: uat-cuj1-training-aks-h200
+spec:
+  description: |
+    UAT CUJ1: Training workload on live AKS cluster with H200 GPU nodes.
+    Tests the aicr workflow against a real cluster:
+      Step 1: Snapshot the live cluster
+      Step 2: Generate recipe (AKS/H200/training/kubeflow)
+      Step 3: Validate deployment against live snapshot
+      Step 4: Generate bundle with node scheduling
+      Step 5: Validate bundle structure
+      Step 6: Multi-phase validation
+  timeouts:
+    exec: 300s
+  steps:
+
+    # ── Step 1: Snapshot the live cluster ──────────────────────────────
+    - name: snapshot-cluster
+      description: Capture live cluster state for validation.
+      try:
+        - script:
+            content: |
+              set -eu
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj1-training-aks-h200"
+              rm -rf "${WORK}" && mkdir -p "${WORK}"
+              ${AICR_BIN} snapshot --output "${WORK}/snapshot.yaml"
+              test -f "${WORK}/snapshot.yaml"
+
+    # ── Step 2: Generate recipe ────────────────────────────────────────
+    - name: generate-recipe
+      description: Generate an AKS H200 training recipe with kubeflow platform.
+      try:
+        - script:
+            content: |
+              set -eu
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj1-training-aks-h200"
+              ${AICR_BIN} recipe \
+                --service aks \
+                --accelerator h200 \
+                --intent training \
+                --os ubuntu \
+                --platform kubeflow \
+                --output "${WORK}/recipe.yaml"
+              test -f "${WORK}/recipe.yaml"
+
+    - name: assert-recipe
+      description: Verify recipe has correct criteria and components.
+      try:
+        - script:
+            content: |
+              set -eu
+              WORK="/tmp/uat-cuj1-training-aks-h200"
+              chainsaw assert \
+                --resource "${WORK}/recipe.yaml" \
+                --file ./assert-recipe.yaml \
+                --no-color --timeout 10s
+
+    # ── Step 3: Validate deployment against live snapshot ───────────────
+    - name: validate-deployment
+      description: Run deployment validation with live cluster snapshot.
+      try:
+        - script:
+            content: |
+              set -eu
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj1-training-aks-h200"
+              ${AICR_BIN} validate \
+                --recipe "${WORK}/recipe.yaml" \
+                --snapshot "${WORK}/snapshot.yaml" \
+                --phase deployment \
+                --no-cluster \
+                --output "${WORK}/validate-deployment.json" || true
+              test -f "${WORK}/validate-deployment.json"
+              grep -q 'reportformat: CTRF' "${WORK}/validate-deployment.json" || grep -q '\"reportFormat\"' "${WORK}/validate-deployment.json"
+
+    - name: assert-validate-deployment
+      description: Verify deployment validation produces CTRF output.
+      try:
+        - script:
+            content: |
+              set -eu
+              WORK="/tmp/uat-cuj1-training-aks-h200"
+              test -f "${WORK}/validate-deployment.json"
+              grep -q 'reportformat: CTRF' "${WORK}/validate-deployment.json" || grep -q '\"reportFormat\"' "${WORK}/validate-deployment.json"
+            check:
+              ($error == null): true
+
+    # ── Step 4: Generate bundle with node scheduling ───────────────────
+    - name: generate-bundle
+      description: Generate bundle with system and GPU node scheduling.
+      try:
+        - script:
+            content: |
+              set -eu
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj1-training-aks-h200"
+              ${AICR_BIN} bundle \
+                --recipe "${WORK}/recipe.yaml" \
+                --output "${WORK}/bundle" \
+                --system-node-selector nodeGroup=system-pool \
+                --accelerated-node-selector nodeGroup=gpu-worker \
+                --accelerated-node-toleration nvidia.com/gpu=present:NoSchedule
+
+    - name: assert-bundle-structure
+      description: Verify bundle contains expected files.
+      try:
+        - script:
+            content: |
+              set -eu
+              WORK="/tmp/uat-cuj1-training-aks-h200"
+              test -f "${WORK}/bundle/README.md"
+              test -f "${WORK}/bundle/deploy.sh"
+              test -x "${WORK}/bundle/deploy.sh"
+              test -f "${WORK}/bundle/recipe.yaml"
+              ls "${WORK}"/bundle/*/values.yaml >/dev/null 2>&1
+            check:
+              ($error == null): true
+
+    # ── Step 6: Multi-phase validation ─────────────────────────────────
+    - name: validate-multiphase
+      description: Run all three validation phases.
+      try:
+        - script:
+            content: |
+              set -eu
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj1-training-aks-h200"
+              ${AICR_BIN} validate \
+                --recipe "${WORK}/recipe.yaml" \
+                --snapshot "${WORK}/snapshot.yaml" \
+                --phase deployment \
+                --phase performance \
+                --phase conformance \
+                --no-cluster \
+                --output "${WORK}/validate-multiphase.json" || true
+
+    - name: assert-validate-multiphase
+      description: Verify multiphase validation produces CTRF output.
+      try:
+        - script:
+            content: |
+              set -eu
+              WORK="/tmp/uat-cuj1-training-aks-h200"
+              test -f "${WORK}/validate-multiphase.json"
+              grep -q 'reportformat: CTRF' "${WORK}/validate-multiphase.json" || grep -q '\"reportFormat\"' "${WORK}/validate-multiphase.json"
+            check:
+              ($error == null): true
+      cleanup:
+        - script:
+            content: |
+              rm -rf /tmp/uat-cuj1-training-aks-h200
diff --git a/tests/uat/azure/tests/cuj2-inference-h200/assert-recipe.yaml b/tests/uat/azure/tests/cuj2-inference-h200/assert-recipe.yaml
new file mode 100644
index 000000000..4d2ead710
--- /dev/null
+++ b/tests/uat/azure/tests/cuj2-inference-h200/assert-recipe.yaml
@@ -0,0 +1,24 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Assert the CUJ2 H200 recipe has the expected structure.
+kind: RecipeResult
+apiVersion: aicr.nvidia.com/v1alpha1
+criteria:
+  service: aks
+  accelerator: h200
+  intent: inference
+  os: ubuntu
+  platform: dynamo
+(length(componentRefs) > `0`): true
diff --git a/tests/uat/azure/tests/cuj2-inference-h200/chainsaw-test.yaml b/tests/uat/azure/tests/cuj2-inference-h200/chainsaw-test.yaml
new file mode 100644
index 000000000..d8cac3e47
--- /dev/null
+++ b/tests/uat/azure/tests/cuj2-inference-h200/chainsaw-test.yaml
@@ -0,0 +1,170 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+  name: uat-cuj2-inference-aks-h200
+spec:
+  description: |
+    UAT CUJ2: Inference workload on live AKS cluster with H200 GPU nodes.
+    Tests the aicr workflow against a real cluster:
+      Step 1: Snapshot the live cluster
+      Step 2: Generate recipe (AKS/H200/inference/dynamo)
+      Step 3: Validate deployment against live snapshot
+      Step 4: Generate bundle with node scheduling
+      Step 5: Validate bundle structure
+      Step 6: Multi-phase validation
+  timeouts:
+    exec: 300s
+  steps:
+
+    # ── Step 1: Snapshot the live cluster ──────────────────────────────
+    - name: snapshot-cluster
+      description: Capture live cluster state for validation.
+      try:
+        - script:
+            content: |
+              set -eu  # abort on a failing command or an unset variable
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj2-inference-aks-h200"
+              rm -rf "${WORK}" && mkdir -p "${WORK}"  # start from an empty work directory
+              "${AICR_BIN}" snapshot --output "${WORK}/snapshot.yaml"  # quoted so a path with spaces is not word-split
+              test -f "${WORK}/snapshot.yaml"  # the snapshot file must have been written
+
+    # ── Step 2: Generate recipe ────────────────────────────────────────
+    - name: generate-recipe
+      description: Generate an AKS H200 inference recipe with dynamo platform.
+      try:
+        - script:
+            content: |
+              set -eu  # abort on a failing command or an unset variable
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj2-inference-aks-h200"
+              "${AICR_BIN}" recipe \
+                --service aks \
+                --accelerator h200 \
+                --intent inference \
+                --os ubuntu \
+                --platform dynamo \
+                --output "${WORK}/recipe.yaml"
+              test -f "${WORK}/recipe.yaml"  # the recipe file must have been written
+
+    - name: assert-recipe
+      description: Verify recipe has correct criteria and components.
+      try:
+        - script:
+            content: |
+              set -eu
+              recipe_file="/tmp/uat-cuj2-inference-aks-h200/recipe.yaml"
+              chainsaw assert \
+                --file ./assert-recipe.yaml \
+                --resource "${recipe_file}" \
+                --no-color --timeout 10s
+
+    # ── Step 3: Validate deployment against live snapshot ───────────────
+    - name: validate-deployment
+      description: Run deployment validation with live cluster snapshot.
+      try:
+        - script:
+            content: |
+              set -eu  # abort on a failing command or an unset variable
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj2-inference-aks-h200"
+              "${AICR_BIN}" validate \
+                --recipe "${WORK}/recipe.yaml" \
+                --snapshot "${WORK}/snapshot.yaml" \
+                --phase deployment \
+                --no-cluster \
+                --output "${WORK}/validate-deployment.json" || true
+              test -f "${WORK}/validate-deployment.json"  # exit status is ignored above; the report file is what gets checked
+              grep -q 'reportformat: CTRF' "${WORK}/validate-deployment.json" || grep -q '"reportFormat"' "${WORK}/validate-deployment.json"
+
+    - name: assert-validate-deployment
+      description: Verify deployment validation produces CTRF output.
+      try:
+        - script:
+            content: |
+              set -eu  # abort on a failing command or an unset variable
+              WORK="/tmp/uat-cuj2-inference-aks-h200"
+              test -f "${WORK}/validate-deployment.json"  # the report written by validate-deployment must exist
+              grep -q 'reportformat: CTRF' "${WORK}/validate-deployment.json" || grep -q '"reportFormat"' "${WORK}/validate-deployment.json"
+            check:
+              ($error == null): true
+
+    # ── Step 4: Generate bundle with node scheduling ───────────────────
+    - name: generate-bundle
+      description: Generate bundle with system and GPU node scheduling.
+      try:
+        - script:
+            content: |
+              set -eu  # abort on a failing command or an unset variable
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj2-inference-aks-h200"
+              "${AICR_BIN}" bundle \
+                --recipe "${WORK}/recipe.yaml" \
+                --output "${WORK}/bundle" \
+                --system-node-selector nodeGroup=system-pool \
+                --accelerated-node-selector nodeGroup=gpu-worker \
+                --accelerated-node-toleration nvidia.com/gpu=present:NoSchedule
+
+    - name: assert-bundle-structure
+      description: Verify bundle contains expected files.
+      try:
+        - script:
+            content: |
+              set -eu
+              bundle_dir="/tmp/uat-cuj2-inference-aks-h200/bundle"
+              for required in README.md deploy.sh recipe.yaml; do
+                [ -f "${bundle_dir}/${required}" ]
+              done
+              [ -x "${bundle_dir}/deploy.sh" ]
+              ls "${bundle_dir}"/*/values.yaml >/dev/null 2>&1
+            check:
+              ($error == null): true
+
+    # ── Step 6: Multi-phase validation ─────────────────────────────────
+    - name: validate-multiphase
+      description: Run all three validation phases.
+      try:
+        - script:
+            content: |
+              set -eu  # abort on a failing command or an unset variable
+              AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}"
+              WORK="/tmp/uat-cuj2-inference-aks-h200"
+              "${AICR_BIN}" validate \
+                --recipe "${WORK}/recipe.yaml" \
+                --snapshot "${WORK}/snapshot.yaml" \
+                --phase deployment \
+                --phase performance \
+                --phase conformance \
+                --no-cluster \
+                --output "${WORK}/validate-multiphase.json" || true
+
+    - name: assert-validate-multiphase
+      description: Verify multiphase validation produces CTRF output.
+      try:
+        - script:
+            content: |
+              set -eu  # abort on a failing command or an unset variable
+              WORK="/tmp/uat-cuj2-inference-aks-h200"
+              test -f "${WORK}/validate-multiphase.json"  # the report written by validate-multiphase must exist
+              grep -q 'reportformat: CTRF' "${WORK}/validate-multiphase.json" || grep -q '"reportFormat"' "${WORK}/validate-multiphase.json"
+            check:
+              ($error == null): true
+      cleanup:
+        - script:
+            content: |
+              rm -rf /tmp/uat-cuj2-inference-aks-h200