From 1ca8a584879198974266b82753accec04abe6745 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Tue, 31 Mar 2026 15:49:35 -0700 Subject: [PATCH] feat(recipes): add NIM Operator recipe for CNCF AI Conformance Add k8s-nim-operator as a new AICR component and create an H100/EKS/Ubuntu inference recipe for NIM. This supports the CNCF AI Conformance submission where NIM on EKS is the certified product and AICR is the validation tooling. - Add `nim` platform type to recipe criteria with tests - Register k8s-nim-operator v3.1.0 in component registry with health check - Create h100-eks-ubuntu-inference-nim overlay with DRA support - Add NIMService workload manifest (Llama 3.2 1B) - Add NIM chat demo UI (nim-chat-server.sh, nim-chat.html) - Fix: load healthCheck.assertFile content in ApplyRegistryDefaults so deployment validation actually executes Chainsaw health checks Closes #473 --- demos/workloads/inference/nim-chat-server.sh | 108 ++++++++ demos/workloads/inference/nim-chat.html | 239 ++++++++++++++++++ .../inference/nimservice-llama-3-2-1b.yaml | 93 +++++++ pkg/recipe/criteria.go | 5 +- pkg/recipe/criteria_test.go | 4 +- pkg/recipe/metadata.go | 14 + pkg/recipe/metadata_test.go | 90 +++++++ .../checks/k8s-nim-operator/health-check.yaml | 68 +++++ .../components/k8s-nim-operator/values.yaml | 34 +++ .../h100-eks-ubuntu-inference-nim.yaml | 74 ++++++ recipes/registry.yaml | 19 ++ 11 files changed, 746 insertions(+), 2 deletions(-) create mode 100755 demos/workloads/inference/nim-chat-server.sh create mode 100644 demos/workloads/inference/nim-chat.html create mode 100644 demos/workloads/inference/nimservice-llama-3-2-1b.yaml create mode 100644 recipes/checks/k8s-nim-operator/health-check.yaml create mode 100644 recipes/components/k8s-nim-operator/values.yaml create mode 100644 recipes/overlays/h100-eks-ubuntu-inference-nim.yaml diff --git a/demos/workloads/inference/nim-chat-server.sh b/demos/workloads/inference/nim-chat-server.sh new file mode 100755 index 
000000000..4b59b58a0
--- /dev/null
+++ b/demos/workloads/inference/nim-chat-server.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NIM Chat UI — single script to launch everything
+# Usage: ./nim-chat-server.sh
+# Then open: http://127.0.0.1:9090/chat.html
+
+set -e
+
+NAMESPACE="${NAMESPACE:-nim-workload}"
+SERVICE="${SERVICE:-svc/llama-3-2-1b}"
+API_PORT=8000
+UI_PORT=9090
+
+# Stop both background children. Installed for EXIT, INT and TERM.
+cleanup() {
+  # Disarm the traps first so the exit below does not re-enter this handler.
+  trap - EXIT INT TERM
+  echo "Shutting down..."
+  # set -e is still active inside a trap: without "|| true" a failed kill
+  # (child already gone, or PID never assigned) would abort the handler
+  # before the second child is stopped.
+  kill "${PF_PID:-}" 2>/dev/null || true
+  kill "${PY_PID:-}" 2>/dev/null || true
+  exit 0
+}
+trap cleanup EXIT INT TERM
+
+# Kill anything already on our ports
+for port in $API_PORT $UI_PORT; do
+  pids=$(lsof -ti :$port 2>/dev/null || true)
+  if [ -n "$pids" ]; then
+    echo "Killing existing processes on port $port"
+    echo "$pids" | xargs kill 2>/dev/null || true
+    sleep 1
+  fi
+done
+
+# Start port-forward to NIM service
+echo "Starting port-forward to $SERVICE on :$API_PORT..."
+kubectl port-forward -n "$NAMESPACE" "$SERVICE" "$API_PORT":8000 &
+PF_PID=$!
+sleep 2
+
+# Start chat UI + API proxy on UI_PORT
+echo "Starting chat UI on :$UI_PORT..."
+python3 -c "
+import http.server, urllib.error, urllib.request
+
+API = 'http://127.0.0.1:${API_PORT}'
+HTML_PATH = '$(dirname "$0")/nim-chat.html'
+
+class H(http.server.BaseHTTPRequestHandler):
+    # Serves the chat page and relays /v1/ requests to the port-forwarded API.
+
+    def _send(self, status, ctype, data):
+        self.send_response(status)
+        self.send_header('Content-Type', ctype)
+        self.send_header('Content-Length', str(len(data)))
+        self.end_headers()
+        self.wfile.write(data)
+
+    def do_GET(self):
+        if self.path == '/' or self.path == '/chat.html':
+            try:
+                with open(HTML_PATH, 'rb') as f:
+                    html = f.read()
+            except OSError:
+                # A missing or unreadable page is an error, not an empty 200.
+                self.send_error(404, 'chat page not found')
+                return
+            self._send(200, 'text/html', html)
+        elif self.path.startswith('/v1/'):
+            self._proxy()
+        else:
+            self.send_error(404)
+
+    def do_POST(self):
+        if self.path.startswith('/v1/'):
+            self._proxy()
+        else:
+            self.send_error(404)
+
+    def _proxy(self):
+        length = int(self.headers.get('Content-Length', 0))
+        body = self.rfile.read(length) if length else None
+        req = urllib.request.Request(
+            API + self.path, data=body,
+            headers={'Content-Type': self.headers.get('Content-Type', 'application/json')},
+            method=self.command)
+        try:
+            with urllib.request.urlopen(req) as r:
+                status, data = r.status, r.read()
+                ctype = r.headers.get('Content-Type', 'application/json')
+        except urllib.error.HTTPError as e:
+            # HTTPError subclasses URLError and is itself the response: relay
+            # the upstream status and body instead of masking it as a 502.
+            status, data = e.code, e.read()
+            ctype = e.headers.get('Content-Type', 'application/json')
+        except urllib.error.URLError as e:
+            # Upstream unreachable (e.g. the port-forward is down).
+            self.send_error(502, str(e))
+            return
+        self._send(status, ctype, data)
+
+    def log_message(self, fmt, *args): pass
+
+http.server.HTTPServer(('127.0.0.1', ${UI_PORT}), H).serve_forever()
+" &
+PY_PID=$!
+
+echo ""
+echo "Ready! Open http://127.0.0.1:${UI_PORT}/chat.html"
+echo "Press Ctrl+C to stop."
+echo ""
+
+wait
diff --git a/demos/workloads/inference/nim-chat.html b/demos/workloads/inference/nim-chat.html
new file mode 100644
index 000000000..eee6384d4
--- /dev/null
+++ b/demos/workloads/inference/nim-chat.html
@@ -0,0 +1,239 @@
+ + + + + + + +NIM Chat + + + +
+

Llama 3.2 1B Chat

+ NVIDIA NIM on EKS +
+
+
Send a message to start chatting
+
+
+
+ + +
+
+ + + diff --git a/demos/workloads/inference/nimservice-llama-3-2-1b.yaml b/demos/workloads/inference/nimservice-llama-3-2-1b.yaml new file mode 100644 index 000000000..27dc56f4b --- /dev/null +++ b/demos/workloads/inference/nimservice-llama-3-2-1b.yaml @@ -0,0 +1,93 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NIM Service — Llama 3.2 1B inference deployment. +# Deploys a single-GPU NIM microservice serving meta/llama-3.2-1b-instruct +# via an OpenAI-compatible API (/v1/chat/completions, /v1/models). 
+# +# Prerequisites: +# - k8s-nim-operator deployed (via AICR NIM recipe) +# - NGC pull secret and API key in the target namespace +# +# Setup: +# kubectl create ns nim-workload +# kubectl create secret docker-registry ngc-pull-secret \ +# --docker-server=nvcr.io --docker-username='$oauthtoken' \ +# --docker-password="$NGC_CLI_API_KEY" -n nim-workload +# kubectl create secret generic ngc-api-secret \ +# --from-literal=NGC_API_KEY="$NGC_CLI_API_KEY" -n nim-workload +# +# Deploy: +# kubectl apply -f nimservice-llama-3-2-1b.yaml +# +# Test: +# kubectl port-forward svc/llama-3-2-1b 8000:8000 -n nim-workload +# curl http://localhost:8000/v1/models +# curl http://localhost:8000/v1/chat/completions \ +# -H "Content-Type: application/json" \ +# -d '{"model":"meta/llama-3.2-1b-instruct","messages":[{"role":"user","content":"Hello!"}],"max_tokens":30}' + +apiVersion: v1 +kind: Namespace +metadata: + name: nim-workload +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: nim-model-store + namespace: nim-workload +spec: + accessModes: + - ReadWriteOnce + storageClassName: gp2 + resources: + requests: + storage: 20Gi +--- +apiVersion: apps.nvidia.com/v1alpha1 +kind: NIMService +metadata: + name: llama-3-2-1b + namespace: nim-workload +spec: + image: + repository: nvcr.io/nim/meta/llama-3.2-1b-instruct + tag: "1.8.3" + pullPolicy: IfNotPresent + pullSecrets: + - ngc-pull-secret + authSecret: ngc-api-secret + replicas: 1 + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + tolerations: + - key: dedicated + value: worker-workload + operator: Equal + effect: NoSchedule + - key: dedicated + value: worker-workload + operator: Equal + effect: NoExecute + expose: + service: + type: ClusterIP + port: 8000 + storage: + pvc: + name: nim-model-store diff --git a/pkg/recipe/criteria.go b/pkg/recipe/criteria.go index c9f370bf4..00997461e 100644 --- a/pkg/recipe/criteria.go +++ b/pkg/recipe/criteria.go @@ -180,6 +180,7 @@ const ( CriteriaPlatformAny 
CriteriaPlatformType = "any" CriteriaPlatformDynamo CriteriaPlatformType = "dynamo" CriteriaPlatformKubeflow CriteriaPlatformType = "kubeflow" + CriteriaPlatformNIM CriteriaPlatformType = "nim" ) // ParseCriteriaPlatformType parses a string into a CriteriaPlatformType. @@ -191,6 +192,8 @@ func ParseCriteriaPlatformType(s string) (CriteriaPlatformType, error) { return CriteriaPlatformDynamo, nil case "kubeflow": return CriteriaPlatformKubeflow, nil + case "nim": + return CriteriaPlatformNIM, nil default: return CriteriaPlatformAny, errors.New(errors.ErrCodeInvalidRequest, fmt.Sprintf("invalid platform type: %s", s)) } @@ -198,7 +201,7 @@ func ParseCriteriaPlatformType(s string) (CriteriaPlatformType, error) { // GetCriteriaPlatformTypes returns all supported platform types sorted alphabetically. func GetCriteriaPlatformTypes() []string { - return []string{"dynamo", "kubeflow"} + return []string{"dynamo", "kubeflow", "nim"} } // Criteria represents the input parameters for recipe matching. 
diff --git a/pkg/recipe/criteria_test.go b/pkg/recipe/criteria_test.go index 4e96ac541..1358ec0ed 100644 --- a/pkg/recipe/criteria_test.go +++ b/pkg/recipe/criteria_test.go @@ -768,6 +768,8 @@ func TestParseCriteriaPlatformType(t *testing.T) { {"Dynamo uppercase", "Dynamo", CriteriaPlatformDynamo, false}, {"kubeflow", "kubeflow", CriteriaPlatformKubeflow, false}, {"Kubeflow uppercase", "Kubeflow", CriteriaPlatformKubeflow, false}, + {"nim", "nim", CriteriaPlatformNIM, false}, + {"NIM uppercase", "NIM", CriteriaPlatformNIM, false}, {"invalid", "invalid", CriteriaPlatformAny, true}, } @@ -789,7 +791,7 @@ func TestGetCriteriaPlatformTypes(t *testing.T) { types := GetCriteriaPlatformTypes() // Should return sorted list - expected := []string{"dynamo", "kubeflow"} + expected := []string{"dynamo", "kubeflow", "nim"} if len(types) != len(expected) { t.Errorf("GetCriteriaPlatformTypes() returned %d types, want %d", len(types), len(expected)) } diff --git a/pkg/recipe/metadata.go b/pkg/recipe/metadata.go index 4d9885b51..46ac7eeba 100644 --- a/pkg/recipe/metadata.go +++ b/pkg/recipe/metadata.go @@ -238,6 +238,20 @@ func (ref *ComponentRef) ApplyRegistryDefaults(config *ComponentConfig) { ref.Path = config.Kustomize.DefaultPath } } + + // Load health check assert file content if not already set + if ref.HealthCheckAsserts == "" && config.HealthCheck.AssertFile != "" { + provider := GetDataProvider() + if provider != nil { + data, err := provider.ReadFile(config.HealthCheck.AssertFile) + if err != nil { + slog.Debug("failed to read health check assert file", + "component", ref.Name, "file", config.HealthCheck.AssertFile, "error", err) + } else { + ref.HealthCheckAsserts = string(data) + } + } + } } // RecipeMetadataSpec contains the specification for a recipe. 
diff --git a/pkg/recipe/metadata_test.go b/pkg/recipe/metadata_test.go index d37c1aad4..a2cbd8710 100644 --- a/pkg/recipe/metadata_test.go +++ b/pkg/recipe/metadata_test.go @@ -35,8 +35,10 @@ package recipe import ( "context" + "io/fs" "strings" "testing" + "testing/fstest" ) func TestRecipeMetadataSpecValidateDependencies(t *testing.T) { @@ -1242,6 +1244,94 @@ func TestComponentRefApplyRegistryDefaults_NamespaceAndChart(t *testing.T) { }) } +// TestComponentRefApplyRegistryDefaults_HealthCheckAsserts verifies that +// ApplyRegistryDefaults loads healthCheck.assertFile content into HealthCheckAsserts. +func TestComponentRefApplyRegistryDefaults_HealthCheckAsserts(t *testing.T) { + t.Run("loads assert file from data provider", func(t *testing.T) { + // Set up a test data provider with a health check file + fs := fstest.MapFS{ + "checks/test-component/health-check.yaml": &fstest.MapFile{ + Data: []byte("apiVersion: chainsaw.kyverno.io/v1alpha1\nkind: Test\n"), + }, + } + old := GetDataProvider() + SetDataProvider(&testFSProvider{fs: fs}) + defer SetDataProvider(old) + + config := &ComponentConfig{ + Name: "test-component", + HealthCheck: HealthCheckConfig{ + AssertFile: "checks/test-component/health-check.yaml", + }, + Helm: HelmConfig{DefaultRepository: "https://example.com"}, + } + ref := &ComponentRef{Name: "test-component"} + ref.ApplyRegistryDefaults(config) + + if ref.HealthCheckAsserts == "" { + t.Fatal("HealthCheckAsserts should be populated from assertFile") + } + if !strings.Contains(ref.HealthCheckAsserts, "chainsaw.kyverno.io") { + t.Errorf("HealthCheckAsserts = %q, want content containing chainsaw.kyverno.io", ref.HealthCheckAsserts) + } + }) + + t.Run("does not overwrite existing HealthCheckAsserts", func(t *testing.T) { + config := &ComponentConfig{ + Name: "test-component", + HealthCheck: HealthCheckConfig{ + AssertFile: "checks/test-component/health-check.yaml", + }, + } + ref := &ComponentRef{ + Name: "test-component", + HealthCheckAsserts: 
"existing-content", + } + ref.ApplyRegistryDefaults(config) + + if ref.HealthCheckAsserts != "existing-content" { + t.Errorf("HealthCheckAsserts = %q, want %q (should not overwrite)", ref.HealthCheckAsserts, "existing-content") + } + }) + + t.Run("handles missing assert file gracefully", func(t *testing.T) { + fs := fstest.MapFS{} + old := GetDataProvider() + SetDataProvider(&testFSProvider{fs: fs}) + defer SetDataProvider(old) + + config := &ComponentConfig{ + Name: "test-component", + HealthCheck: HealthCheckConfig{ + AssertFile: "checks/nonexistent/health-check.yaml", + }, + } + ref := &ComponentRef{Name: "test-component"} + ref.ApplyRegistryDefaults(config) + + if ref.HealthCheckAsserts != "" { + t.Errorf("HealthCheckAsserts = %q, want empty for missing file", ref.HealthCheckAsserts) + } + }) +} + +// testFSProvider wraps fstest.MapFS to implement DataProvider for testing. +type testFSProvider struct { + fs fstest.MapFS +} + +func (p *testFSProvider) ReadFile(path string) ([]byte, error) { + return p.fs.ReadFile(path) +} + +func (p *testFSProvider) WalkDir(root string, fn fs.WalkDirFunc) error { + return fs.WalkDir(p.fs, root, fn) +} + +func (p *testFSProvider) Source(path string) string { + return path +} + // TestComponentRefMergeWithPath verifies that the Path field is correctly merged // when merging ComponentRefs (overlay into base). func TestComponentRefMergeWithPath(t *testing.T) { diff --git a/recipes/checks/k8s-nim-operator/health-check.yaml b/recipes/checks/k8s-nim-operator/health-check.yaml new file mode 100644 index 000000000..f60b3a3da --- /dev/null +++ b/recipes/checks/k8s-nim-operator/health-check.yaml @@ -0,0 +1,68 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NIM Operator Health Check +# +# Validates that the NVIDIA NIM Operator is running and healthy in the +# nvidia-nim namespace. Checks that the k8s-nim-operator deployment has +# at least one available replica and that no pods in the namespace are +# stuck in Pending, Failed, or Unknown phases. +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: k8s-nim-operator-health-check +spec: + timeouts: + assert: 5m + steps: + - name: validate-deployment-exists + try: + # Guard against vacuous pass on empty namespace: verify the + # k8s-nim-operator deployment exists and has at least one available replica. + - assert: + resource: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: k8s-nim-operator + namespace: nvidia-nim + status: + (availableReplicas > `0`): true + - name: validate-all-pods-healthy + try: + # Assert no pods are in unhealthy phases. + - error: + resource: + apiVersion: v1 + kind: Pod + metadata: + namespace: nvidia-nim + status: + phase: Pending + - error: + resource: + apiVersion: v1 + kind: Pod + metadata: + namespace: nvidia-nim + status: + phase: Failed + - error: + resource: + apiVersion: v1 + kind: Pod + metadata: + namespace: nvidia-nim + status: + phase: Unknown diff --git a/recipes/components/k8s-nim-operator/values.yaml b/recipes/components/k8s-nim-operator/values.yaml new file mode 100644 index 000000000..117f6ae2a --- /dev/null +++ b/recipes/components/k8s-nim-operator/values.yaml @@ -0,0 +1,34 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NVIDIA NIM Operator Helm values (v3.1.0) +# Manages NIM microservice lifecycle: deployment, scaling, health monitoring, +# and model caching via NIMService, NIMPipeline, and NIMCache CRDs. +# +# Requires: cert-manager (for admission webhooks), gpu-operator (for GPU scheduling) + +operator: + # Enable admission controller with cert-manager for TLS + admissionController: + enabled: true + tls: + mode: cert-manager + certManager: + issuerType: selfsigned + + # Remove default control-plane affinity — EKS has no control-plane nodes + affinity: {} + +dynamo: + enabled: false diff --git a/recipes/overlays/h100-eks-ubuntu-inference-nim.yaml b/recipes/overlays/h100-eks-ubuntu-inference-nim.yaml new file mode 100644 index 000000000..d5f9ceba2 --- /dev/null +++ b/recipes/overlays/h100-eks-ubuntu-inference-nim.yaml @@ -0,0 +1,74 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: h100-eks-ubuntu-inference-nim + +spec: + # Inherits from h100-eks-ubuntu-inference (H100 + Ubuntu inference settings) + # Adds NVIDIA NIM Operator for managing NIM microservice deployments. + base: h100-eks-ubuntu-inference + + criteria: + service: eks + accelerator: h100 + os: ubuntu + intent: inference + platform: nim + + # DRA requires Kubernetes 1.34+ (GA) + constraints: + - name: K8s.server.version + value: ">= 1.34" + + componentRefs: + - name: nvidia-dra-driver-gpu + type: Helm + overrides: + gpuResourcesEnabledOverride: true + + - name: k8s-nim-operator + type: Helm + source: https://helm.ngc.nvidia.com/nvidia + version: "3.1.0" + valuesFile: components/k8s-nim-operator/values.yaml + dependencyRefs: + - cert-manager + - gpu-operator + + validation: + deployment: + checks: + - operator-health + - expected-resources + - gpu-operator-version + - check-nvidia-smi + constraints: + - name: Deployment.gpu-operator.version + value: ">= v24.6.0" + conformance: + checks: + - platform-health + - gpu-operator-health + - dra-support + - accelerator-metrics + - ai-service-metrics + - inference-gateway + - gang-scheduling + - pod-autoscaling + - cluster-autoscaling + - robust-controller + - secure-accelerator-access diff --git a/recipes/registry.yaml b/recipes/registry.yaml index 66d747550..cc86eb282 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -406,6 +406,25 @@ components: tolerationPaths: - tolerations + - name: k8s-nim-operator + displayName: k8s-nim-operator + valueOverrideKeys: + - nimoperator + - nim + healthCheck: + assertFile: checks/k8s-nim-operator/health-check.yaml + helm: + defaultRepository: https://helm.ngc.nvidia.com/nvidia + defaultChart: k8s-nim-operator + defaultVersion: "3.1.0" + defaultNamespace: nvidia-nim + nodeScheduling: + system: + 
nodeSelectorPaths: + - operator.nodeSelector + tolerationPaths: + - operator.tolerations + - name: kubeflow-trainer displayName: kubeflow-trainer valueOverrideKeys: