NVIDIA · Jont828 · Mar 31, 2026 · Apr 3, 2026
@@ -77,6 +77,7 @@ type CriteriaAcceleratorType string
 const (
 	CriteriaAcceleratorAny   CriteriaAcceleratorType = "any"
 	CriteriaAcceleratorH100  CriteriaAcceleratorType = "h100"
+	CriteriaAcceleratorH200  CriteriaAcceleratorType = "h200"
 	CriteriaAcceleratorGB200 CriteriaAcceleratorType = "gb200"
 	CriteriaAcceleratorB200  CriteriaAcceleratorType = "b200"
 	CriteriaAcceleratorA100  CriteriaAcceleratorType = "a100"
@@ -90,6 +91,8 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) {
 		return CriteriaAcceleratorAny, nil
 	case "h100":
 		return CriteriaAcceleratorH100, nil
+	case "h200":
+		return CriteriaAcceleratorH200, nil
 	case "gb200":
 		return CriteriaAcceleratorGB200, nil
 	case "b200":
@@ -105,7 +108,7 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) {
 
 // GetCriteriaAcceleratorTypes returns the supported accelerator type names, sorted alphabetically; "any" is not included.
 func GetCriteriaAcceleratorTypes() []string {
-	return []string{"a100", "b200", "gb200", "h100", "l40"}
+	return []string{"a100", "b200", "gb200", "h100", "h200", "l40"}
 }
 
 // CriteriaIntentType represents the workload intent.
@@ -210,7 +213,7 @@ type Criteria struct {
 	// Service is the Kubernetes service type (eks, gke, aks, oke, self-managed).
 	Service CriteriaServiceType `json:"service,omitempty" yaml:"service,omitempty"`
 
-	// Accelerator is the GPU/accelerator type (h100, gb200, b200, a100, l40).
+	// Accelerator is the GPU/accelerator type (h100, h200, gb200, b200, a100, l40).
 	Accelerator CriteriaAcceleratorType `json:"accelerator,omitempty" yaml:"accelerator,omitempty"`
 
 	// Intent is the workload intent (training, inference).

@@ -68,6 +68,8 @@ func TestParseCriteriaAcceleratorType(t *testing.T) {
 		{"any", "any", CriteriaAcceleratorAny, false},
 		{"h100", "h100", CriteriaAcceleratorH100, false},
 		{"H100 uppercase", "H100", CriteriaAcceleratorH100, false},
+		{"h200", "h200", CriteriaAcceleratorH200, false},
+		{"H200 uppercase", "H200", CriteriaAcceleratorH200, false},
 		{"gb200", "gb200", CriteriaAcceleratorGB200, false},
 		{"b200", "b200", CriteriaAcceleratorB200, false},
 		{"a100", "a100", CriteriaAcceleratorA100, false},
@@ -687,7 +689,7 @@ func TestGetCriteriaAcceleratorTypes(t *testing.T) {
 	types := GetCriteriaAcceleratorTypes()
 
 	// Should return sorted list
-	expected := []string{"a100", "b200", "gb200", "h100", "l40"}
+	expected := []string{"a100", "b200", "gb200", "h100", "h200", "l40"}
 	if len(types) != len(expected) {
 		t.Errorf("GetCriteriaAcceleratorTypes() returned %d types, want %d", len(types), len(expected))
 	}

@@ -105,6 +105,8 @@ func matchAccelerator(model string) CriteriaAcceleratorType {
 	// Follow this pattern when adding future Blackwell variants (e.g., check "gb300" before "b300").
 	case strings.Contains(lower, "b200"):
 		return CriteriaAcceleratorB200
+	case strings.Contains(lower, "h200"):
+		return CriteriaAcceleratorH200
 	case strings.Contains(lower, "h100"):
 		return CriteriaAcceleratorH100
 	case strings.Contains(lower, "a100"):

@@ -150,6 +150,29 @@ func TestExtractCriteriaFromSnapshot(t *testing.T) {
 				}
 			},
 		},
+		{
+			name: "GPU H200 from model field",
+			snapshot: &snapshotter.Snapshot{
+				Measurements: []*measurement.Measurement{
+					{
+						Type: measurement.TypeGPU,
+						Subtypes: []measurement.Subtype{
+							{
+								Name: "device",
+								Data: map[string]measurement.Reading{
+									"model": measurement.Str("NVIDIA H200 141GB HBM3e"),
+								},
+							},
+						},
+					},
+				},
+			},
+			validate: func(t *testing.T, c *Criteria) {
+				if c.Accelerator != CriteriaAcceleratorH200 {
+					t.Errorf("Accelerator = %v, want %v", c.Accelerator, CriteriaAcceleratorH200)
+				}
+			},
+		},
 		{
 			name: "GPU H100 from model field",
 			snapshot: &snapshotter.Snapshot{
@@ -405,6 +428,8 @@ func TestMatchAccelerator(t *testing.T) {
 	}{
 		{"H100 uppercase", "NVIDIA H100 80GB HBM3", CriteriaAcceleratorH100},
 		{"H100 lowercase", "h100-sxm", CriteriaAcceleratorH100},
+		{"H200 uppercase", "NVIDIA H200 141GB HBM3e", CriteriaAcceleratorH200},
+		{"H200 lowercase", "h200-sxm", CriteriaAcceleratorH200},
 		{"A100", "A100-SXM4-80GB", CriteriaAcceleratorA100},
 		{"GB200", "NVIDIA GB200", CriteriaAcceleratorGB200},
 		{"B200", "NVIDIA-B200", CriteriaAcceleratorB200},

@@ -0,0 +1,44 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-inference
+
+spec:
+  # Inherits from aks-inference recipe (AKS + inference settings)
+  base: aks-inference
+
+  criteria:
+    service: aks
+    accelerator: h200
+    intent: inference
+
+  # Specific constraints for H200 on AKS inference workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+
+  # Skyhook customizations are omitted because Skyhook packages do not
+  # support service: aks. The skyhook-operator itself is inherited from
+  # the base recipe and still deploys. This mirrors the Kind overlays
+  # (e.g., h100-kind-inference.yaml), which also omit Skyhook tuning.
+  componentRefs:
+    - name: gpu-operator
+      type: Helm
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
@@ -0,0 +1,48 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-training
+
+spec:
+  # Inherits from aks-training recipe (AKS + training settings)
+  base: aks-training
+
+  criteria:
+    service: aks
+    accelerator: h200
+    intent: training
+
+  # Specific constraints for H200 on AKS training workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+
+  # Skyhook customizations are omitted because Skyhook packages do not
+  # support service: aks. The skyhook-operator itself is inherited from
+  # the base recipe and still deploys. This mirrors the Kind overlays
+  # (e.g., h100-kind-training.yaml), which also omit Skyhook tuning.
+  componentRefs:
+    # H200-specific GPU Operator overrides (inherits valuesFile from aks-training)
+    - name: gpu-operator
+      type: Helm
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
+      overrides:
+        gdrcopy:
+          enabled: true
@@ -0,0 +1,92 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-inference-dynamo
+
+spec:
+  # Inherits from h200-aks-ubuntu-inference (H200 + Ubuntu inference settings)
+  # Adds Dynamo inference platform components.
+  base: h200-aks-ubuntu-inference
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: inference
+    platform: dynamo
+
+  # Dynamic Resource Allocation (DRA) is GA as of Kubernetes 1.34, so require 1.34+
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.34"
+
+  componentRefs:
+    - name: nvidia-dra-driver-gpu
+      type: Helm
+      overrides:
+        gpuResourcesEnabledOverride: true
+
+    - name: dynamo-crds
+      type: Helm
+      source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
+      version: "0.9.0"
+      valuesFile: components/dynamo-crds/values.yaml
+
+    - name: dynamo-platform
+      type: Helm
+      source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
+      version: "0.9.0"
+      valuesFile: components/dynamo-platform/values.yaml
+      dependencyRefs:
+        - dynamo-crds
+        - cert-manager
+        - kube-prometheus-stack
+        - kai-scheduler
+      overrides:
+        etcd:
+          persistence:
+            storageClass: managed-csi
+        nats:
+          config:
+            jetstream:
+              fileStore:
+                pvc:
+                  storageClassName: managed-csi
+
+  validation:
+    deployment:
+      checks:
+        - operator-health
+        - expected-resources
+        - gpu-operator-version
+        - check-nvidia-smi
+      constraints:
+        - name: Deployment.gpu-operator.version
+          value: ">= v24.6.0"
+    conformance:
+      checks:
+        - platform-health
+        - gpu-operator-health
+        - dra-support
+        - accelerator-metrics
+        - ai-service-metrics
+        - inference-gateway
+        - gang-scheduling
+        - pod-autoscaling
+        - cluster-autoscaling
+        - robust-controller
+        - secure-accelerator-access
@@ -0,0 +1,43 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-inference
+
+spec:
+  # Inherits from h200-aks-inference recipe (H200 + AKS + inference settings)
+  # This overlay adds Ubuntu-specific configurations
+  base: h200-aks-inference
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: inference
+
+  # H200 + Ubuntu specific constraints for inference workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+    - name: OS.release.ID
+      value: ubuntu
+    - name: OS.release.VERSION_ID
+      value: "24.04"
+    - name: OS.sysctl./proc/sys/kernel/osrelease
+      value: ">= 6.8"
+
+  componentRefs: []
@@ -0,0 +1,54 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-training-kubeflow
+
+spec:
+  # Inherits from h200-aks-ubuntu-training recipe (H200 + AKS + Ubuntu + training settings)
+  # This overlay adds Kubeflow Training Operator for distributed training with TrainJob
+  base: h200-aks-ubuntu-training
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: training
+    platform: kubeflow
+
+  # Constraints for H200 on AKS with Ubuntu for Kubeflow training workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+    - name: OS.release.ID
+      value: ubuntu
+    - name: OS.release.VERSION_ID
+      value: "24.04"
+    - name: OS.sysctl./proc/sys/kernel/osrelease
+      value: ">= 6.8"
+
+  # Kubeflow Training Operator for TrainJob support
+  componentRefs:
+    - name: kubeflow-trainer
+      type: Helm
+      valuesFile: components/kubeflow-trainer/values.yaml
+      manifestFiles:
+        - components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
+        - gpu-operator