Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions pkg/recipe/criteria.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ type CriteriaAcceleratorType string
const (
CriteriaAcceleratorAny CriteriaAcceleratorType = "any"
CriteriaAcceleratorH100 CriteriaAcceleratorType = "h100"
CriteriaAcceleratorH200 CriteriaAcceleratorType = "h200"
CriteriaAcceleratorGB200 CriteriaAcceleratorType = "gb200"
CriteriaAcceleratorB200 CriteriaAcceleratorType = "b200"
CriteriaAcceleratorA100 CriteriaAcceleratorType = "a100"
Expand All @@ -90,6 +91,8 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) {
return CriteriaAcceleratorAny, nil
case "h100":
return CriteriaAcceleratorH100, nil
case "h200":
return CriteriaAcceleratorH200, nil
case "gb200":
return CriteriaAcceleratorGB200, nil
case "b200":
Expand All @@ -105,7 +108,7 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) {

// GetCriteriaAcceleratorTypes returns all supported accelerator types sorted alphabetically.
//
// The list contains the concrete accelerator identifiers, including "h200";
// the "any" value is not part of it. A freshly allocated slice is returned on
// every call, so callers may modify the result without affecting later calls.
func GetCriteriaAcceleratorTypes() []string {
	// Only the current list is returned; the earlier list without "h200" was
	// left in front of it and made this return unreachable.
	return []string{"a100", "b200", "gb200", "h100", "h200", "l40"}
}

// CriteriaIntentType represents the workload intent.
Expand Down Expand Up @@ -210,7 +213,7 @@ type Criteria struct {
// Service is the Kubernetes service type (eks, gke, aks, oke, self-managed).
Service CriteriaServiceType `json:"service,omitempty" yaml:"service,omitempty"`

// Accelerator is the GPU/accelerator type (h100, gb200, b200, a100, l40).
// Accelerator is the GPU/accelerator type (h100, h200, gb200, b200, a100, l40).
Accelerator CriteriaAcceleratorType `json:"accelerator,omitempty" yaml:"accelerator,omitempty"`

// Intent is the workload intent (training, inference).
Expand Down
4 changes: 3 additions & 1 deletion pkg/recipe/criteria_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ func TestParseCriteriaAcceleratorType(t *testing.T) {
{"any", "any", CriteriaAcceleratorAny, false},
{"h100", "h100", CriteriaAcceleratorH100, false},
{"H100 uppercase", "H100", CriteriaAcceleratorH100, false},
{"h200", "h200", CriteriaAcceleratorH200, false},
{"H200 uppercase", "H200", CriteriaAcceleratorH200, false},
{"gb200", "gb200", CriteriaAcceleratorGB200, false},
{"b200", "b200", CriteriaAcceleratorB200, false},
{"a100", "a100", CriteriaAcceleratorA100, false},
Expand Down Expand Up @@ -687,7 +689,7 @@ func TestGetCriteriaAcceleratorTypes(t *testing.T) {
types := GetCriteriaAcceleratorTypes()

// Should return sorted list
expected := []string{"a100", "b200", "gb200", "h100", "l40"}
expected := []string{"a100", "b200", "gb200", "h100", "h200", "l40"}
if len(types) != len(expected) {
t.Errorf("GetCriteriaAcceleratorTypes() returned %d types, want %d", len(types), len(expected))
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/recipe/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ func matchAccelerator(model string) CriteriaAcceleratorType {
// Follow this pattern when adding future Blackwell variants (e.g., check "gb300" before "b300").
case strings.Contains(lower, "b200"):
return CriteriaAcceleratorB200
case strings.Contains(lower, "h200"):
return CriteriaAcceleratorH200
case strings.Contains(lower, "h100"):
return CriteriaAcceleratorH100
case strings.Contains(lower, "a100"):
Expand Down
25 changes: 25 additions & 0 deletions pkg/recipe/snapshot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,29 @@ func TestExtractCriteriaFromSnapshot(t *testing.T) {
}
},
},
{
name: "GPU H200 from model field",
snapshot: &snapshotter.Snapshot{
Measurements: []*measurement.Measurement{
{
Type: measurement.TypeGPU,
Subtypes: []measurement.Subtype{
{
Name: "device",
Data: map[string]measurement.Reading{
"model": measurement.Str("NVIDIA H200 141GB HBM3e"),
},
},
},
},
},
},
validate: func(t *testing.T, c *Criteria) {
if c.Accelerator != CriteriaAcceleratorH200 {
t.Errorf("Accelerator = %v, want %v", c.Accelerator, CriteriaAcceleratorH200)
}
},
},
{
name: "GPU H100 from model field",
snapshot: &snapshotter.Snapshot{
Expand Down Expand Up @@ -405,6 +428,8 @@ func TestMatchAccelerator(t *testing.T) {
}{
{"H100 uppercase", "NVIDIA H100 80GB HBM3", CriteriaAcceleratorH100},
{"H100 lowercase", "h100-sxm", CriteriaAcceleratorH100},
{"H200 uppercase", "NVIDIA H200 141GB HBM3e", CriteriaAcceleratorH200},
{"H200 lowercase", "h200-sxm", CriteriaAcceleratorH200},
{"A100", "A100-SXM4-80GB", CriteriaAcceleratorA100},
{"GB200", "NVIDIA GB200", CriteriaAcceleratorGB200},
{"B200", "NVIDIA-B200", CriteriaAcceleratorB200},
Expand Down
44 changes: 44 additions & 0 deletions recipes/overlays/h200-aks-inference.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-inference

spec:
# Inherits from aks-inference recipe (AKS + inference settings)
base: aks-inference

criteria:
service: aks
accelerator: h200
intent: inference

# Specific constraints for H200 on AKS inference workloads
# Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
constraints:
- name: K8s.server.version
value: ">= 1.32.4"

# Skyhook customizations omitted — Skyhook packages do not support
# service: aks. The skyhook-operator itself is inherited from base
# and still deploys. This follows the same pattern as Kind overlays
# (h100-kind-inference.yaml) which also omit Skyhook tuning.
componentRefs:
- name: gpu-operator
type: Helm
dependencyRefs:
- cert-manager
- kube-prometheus-stack
48 changes: 48 additions & 0 deletions recipes/overlays/h200-aks-training.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-training

spec:
# Inherits from aks-training recipe (AKS + training settings)
base: aks-training

criteria:
service: aks
accelerator: h200
intent: training

# Specific constraints for H200 on AKS training workloads
# Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
constraints:
- name: K8s.server.version
value: ">= 1.32.4"

# Skyhook customizations omitted — Skyhook packages do not support
# service: aks. The skyhook-operator itself is inherited from base
# and still deploys. This follows the same pattern as Kind overlays
# (h100-kind-training.yaml) which also omit Skyhook tuning.
componentRefs:
# H200-specific GPU Operator overrides (inherits valuesFile from aks-training)
- name: gpu-operator
type: Helm
dependencyRefs:
- cert-manager
- kube-prometheus-stack
overrides:
gdrcopy:
enabled: true
92 changes: 92 additions & 0 deletions recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-ubuntu-inference-dynamo

spec:
# Inherits from h200-aks-ubuntu-inference recipe (H200 + AKS + Ubuntu + inference settings)
# Adds Dynamo inference platform components.
base: h200-aks-ubuntu-inference

criteria:
service: aks
accelerator: h200
os: ubuntu
intent: inference
platform: dynamo

# DRA (Dynamic Resource Allocation) is GA as of Kubernetes 1.34, so require 1.34 or newer
constraints:
- name: K8s.server.version
value: ">= 1.34"

componentRefs:
- name: nvidia-dra-driver-gpu
type: Helm
overrides:
gpuResourcesEnabledOverride: true

- name: dynamo-crds
type: Helm
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
version: "0.9.0"
valuesFile: components/dynamo-crds/values.yaml

- name: dynamo-platform
type: Helm
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
version: "0.9.0"
valuesFile: components/dynamo-platform/values.yaml
dependencyRefs:
- dynamo-crds
- cert-manager
- kube-prometheus-stack
- kai-scheduler
overrides:
etcd:
persistence:
storageClass: managed-csi
nats:
config:
jetstream:
fileStore:
pvc:
storageClassName: managed-csi

validation:
deployment:
checks:
- operator-health
- expected-resources
- gpu-operator-version
- check-nvidia-smi
constraints:
- name: Deployment.gpu-operator.version
value: ">= v24.6.0"
conformance:
checks:
- platform-health
- gpu-operator-health
- dra-support
- accelerator-metrics
- ai-service-metrics
- inference-gateway
- gang-scheduling
- pod-autoscaling
- cluster-autoscaling
- robust-controller
- secure-accelerator-access
43 changes: 43 additions & 0 deletions recipes/overlays/h200-aks-ubuntu-inference.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-ubuntu-inference

spec:
# Inherits from h200-aks-inference recipe (H200 + AKS + inference settings)
# This overlay adds Ubuntu-specific configurations
base: h200-aks-inference

criteria:
service: aks
accelerator: h200
os: ubuntu
intent: inference

# H200 + Ubuntu-specific constraints for inference workloads
# Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
constraints:
- name: K8s.server.version
value: ">= 1.32.4"
- name: OS.release.ID
value: ubuntu
- name: OS.release.VERSION_ID
value: "24.04"
- name: OS.sysctl./proc/sys/kernel/osrelease
value: ">= 6.8"

componentRefs: []
54 changes: 54 additions & 0 deletions recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-ubuntu-training-kubeflow

spec:
# Inherits from h200-aks-ubuntu-training recipe (H200 + AKS + Ubuntu + training settings)
# This overlay adds Kubeflow Training Operator for distributed training with TrainJob
base: h200-aks-ubuntu-training

criteria:
service: aks
accelerator: h200
os: ubuntu
intent: training
platform: kubeflow

# Constraints for H200 on AKS with Ubuntu for Kubeflow training workloads
# Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
constraints:
- name: K8s.server.version
value: ">= 1.32.4"
- name: OS.release.ID
value: ubuntu
- name: OS.release.VERSION_ID
value: "24.04"
- name: OS.sysctl./proc/sys/kernel/osrelease
value: ">= 6.8"

# Kubeflow Training Operator for TrainJob support
componentRefs:
- name: kubeflow-trainer
type: Helm
valuesFile: components/kubeflow-trainer/values.yaml
manifestFiles:
- components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
dependencyRefs:
- cert-manager
- kube-prometheus-stack
- gpu-operator
Loading
Loading