From 4015da76635f7364f2f2706d2a1b14d3b80b429b Mon Sep 17 00:00:00 2001 From: yansun1996 Date: Thu, 20 Mar 2025 02:29:26 +0000 Subject: [PATCH 01/12] Remove old dcgpu link and update to instinct doc link --- Makefile | 4 ++-- docs/fulldeviceconfig.rst | 4 ++-- docs/knownlimitations.md | 2 +- helm-charts/README.md | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 19a3ba0c..99907539 100644 --- a/Makefile +++ b/Makefile @@ -417,12 +417,12 @@ kustomize: # go-get-tool will 'go install' any package $2 and install it to $1. PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))) define go-get-tool - @if [ -z "$$GOBIN" ]; then \ + if [ -z "$$GOBIN" ]; then \ GOBIN=$$(go env GOPATH)/bin; \ else \ GOBIN=$$GOBIN; \ fi; \ - @[ -f $(1) ] || { \ + [ -f $(1) ] || { \ set -e; \ echo "Downloading $(2)"; \ echo "Running: GOBIN=$(PROJECT_DIR)/bin go install $(2)"; \ diff --git a/docs/fulldeviceconfig.rst b/docs/fulldeviceconfig.rst index 45d3f3de..8d8c1d95 100644 --- a/docs/fulldeviceconfig.rst +++ b/docs/fulldeviceconfig.rst @@ -131,7 +131,7 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM upgradeStrategy: RollingUpdate, # (Optional) Can be either `RollingUpdate` or `OnDelete` maxUnavailable: 1 # (Optional) Number of pods that can be unavailable during the upgrade process. 1 is the default value # If specifying a node selector here, the metrics exporter will only be deployed on nodes that match the selector - # See Item #6 on https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage + # See Item #6 on https://instinct.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage tolerations: key: "key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, # operator must be "Exists"; this combination means to match all values and all keys. @@ -174,7 +174,7 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM upgradeStrategy: RollingUpdate, # (Optional) Can be either `RollingUpdate` or `OnDelete` maxUnavailable: 1 # (Optional) Number of pods that can be unavailable during the upgrade process. 1 is the default value # If specifying a node selector here, the metrics exporter will only be deployed on nodes that match the selector - # See Item #6 on https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage + # See Item #6 on https://instinct.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage tolerations: key: "key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, # operator must be "Exists"; this combination means to match all values and all keys. diff --git a/docs/knownlimitations.md b/docs/knownlimitations.md index 9702009a..051b48cd 100644 --- a/docs/knownlimitations.md +++ b/docs/knownlimitations.md @@ -15,7 +15,7 @@ 3. **GPU Operator unable to install amdgpu driver if existing driver is already installed** - ***Impact:*** Driver install will fail if amdgpu in-box Driver is present/already installed - ***Affected Configurations:*** All configurations - - ***Workaround:*** When installing the amdgpu drivers using the GPU Operator, worker nodes should have amdgpu blacklisted or amdgpu drivers should not be pre-installed on the node. [Blacklist in-box driver](https://dcgpu.docs.amd.com/projects/gpu-operator/en/release-v1.0.0/drivers/installation.html#blacklist-inbox-driver) so that it is not loaded or remove the pre-installed driver + - ***Workaround:*** When installing the amdgpu drivers using the GPU Operator, worker nodes should have amdgpu blacklisted or amdgpu drivers should not be pre-installed on the node. [Blacklist in-box driver](https://instinct.docs.amd.com/projects/gpu-operator/en/release-v1.0.0/drivers/installation.html#blacklist-inbox-driver) so that it is not loaded or remove the pre-installed driver

4. **When GPU Operator is used in SKIP driver install mode, if amdgpu module is removed with device plugin installed it will not reflect active GPU available on the server** diff --git a/helm-charts/README.md b/helm-charts/README.md index 7e7ec9c4..72bb01ce 100644 --- a/helm-charts/README.md +++ b/helm-charts/README.md @@ -80,7 +80,7 @@ Installation Options ### 3. Install Custom Resource -After the installation of AMD GPU Operator, you need to create the `DeviceConfig` custom resource in order to trigger the operator to start to work. By preparing the `DeviceConfig` in the YAML file, you can create the resouce by running ```kubectl apply -f deviceconfigs.yaml```. For custom resource definition and more detailed information, please refer to [Custom Resource Installation Guide](https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/installation/kubernetes-helm.html#install-custom-resource). +After the installation of AMD GPU Operator, you need to create the `DeviceConfig` custom resource in order to trigger the operator to start to work. By preparing the `DeviceConfig` in the YAML file, you can create the resouce by running ```kubectl apply -f deviceconfigs.yaml```. For custom resource definition and more detailed information, please refer to [Custom Resource Installation Guide](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/kubernetes-helm.html#install-custom-resource). ### Grafana Dashboards From 618e8cd3aeec2f2a6b7a3c6020fe4f7a355516fb Mon Sep 17 00:00:00 2001 From: praveen Date: Thu, 20 Mar 2025 10:38:00 -0700 Subject: [PATCH 02/12] update examples --- example/metricsExporter/config.json | 37 +++++++++++++++++++++--- example/metricsExporter/configmap.yaml | 39 ++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/example/metricsExporter/config.json b/example/metricsExporter/config.json index 689852ab..53e02eb0 100644 --- a/example/metricsExporter/config.json +++ b/example/metricsExporter/config.json @@ -27,7 +27,7 @@ "PCIE_NAC_RECEIVED_COUNT", "GPU_CLOCK", "GPU_POWER_USAGE", - "GPU_TOTAL_MEMORY", + "GPU_TOTAL_VRAM", "GPU_ECC_CORRECT_TOTAL", "GPU_ECC_UNCORRECT_TOTAL", "GPU_ECC_CORRECT_SDMA", @@ -89,7 +89,10 @@ "GPU_ECC_CORRECT_IH", "GPU_ECC_UNCORRECT_IH", "GPU_ECC_CORRECT_MPIO", - "GPU_ECC_UNCORRECT_MPIO" + "GPU_ECC_UNCORRECT_MPIO", + "GPU_HEALTH", + "GPU_XGMI_LINK_RX", + "GPU_XGMI_LINK_TX" ], "Labels": [ "GPU_UUID", @@ -107,7 +110,33 @@ "CARD_VENDOR", "DRIVER_VERSION", "VBIOS_VERSION", - "HOSTNAME" - ] + "HOSTNAME", + "GPU_PARTITION_ID", + "GPU_COMPUTE_PARTITION_TYPE" + ], + "HealthThresholds" : { + "GPU_ECC_UNCORRECT_SDMA" : 0, + "GPU_ECC_UNCORRECT_GFX" : 0, + "GPU_ECC_UNCORRECT_MMHUB" : 0, + "GPU_ECC_UNCORRECT_ATHUB" : 0, + "GPU_ECC_UNCORRECT_BIF" : 0, + "GPU_ECC_UNCORRECT_HDP" : 0, + "GPU_ECC_UNCORRECT_XGMI_WAFL" : 0, + "GPU_ECC_UNCORRECT_DF" : 0, + "GPU_ECC_UNCORRECT_SMN" : 0, + "GPU_ECC_UNCORRECT_SEM" : 0, + "GPU_ECC_UNCORRECT_MP0" : 0, + "GPU_ECC_UNCORRECT_MP1" : 0, + "GPU_ECC_UNCORRECT_FUSE" : 0, + "GPU_ECC_UNCORRECT_UMC" : 0, + "GPU_ECC_UNCORRECT_MCA" : 0, + "GPU_ECC_UNCORRECT_VCN" : 0, + "GPU_ECC_UNCORRECT_JPEG" : 0, + "GPU_ECC_UNCORRECT_IH" : 0, + "GPU_ECC_UNCORRECT_MPIO" : 0 + }, + "CustomLabels" : { + "CLUSTER_NAME" : "amdgpu-k8s-metrics-exporter" + } } } diff --git a/example/metricsExporter/configmap.yaml b/example/metricsExporter/configmap.yaml index 6bf81824..e21cac40 100644 --- a/example/metricsExporter/configmap.yaml +++ b/example/metricsExporter/configmap.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: gpu-config - namespace: kube-amd-gpu + namespace: mynamespace data: config.json: | { @@ -34,7 +34,7 @@ data: "PCIE_NAC_RECEIVED_COUNT", "GPU_CLOCK", "GPU_POWER_USAGE", - "GPU_TOTAL_VRAM", + "GPU_TOTAL_MEMORY", "GPU_ECC_CORRECT_TOTAL", "GPU_ECC_UNCORRECT_TOTAL", "GPU_ECC_CORRECT_SDMA", @@ -96,7 +96,10 @@ data: "GPU_ECC_CORRECT_IH", "GPU_ECC_UNCORRECT_IH", "GPU_ECC_CORRECT_MPIO", - "GPU_ECC_UNCORRECT_MPIO" + "GPU_ECC_UNCORRECT_MPIO", + "GPU_HEALTH", + "GPU_XGMI_LINK_RX", + "GPU_XGMI_LINK_TX" ], "Labels": [ "GPU_UUID", @@ -114,8 +117,34 @@ data: "CARD_VENDOR", "DRIVER_VERSION", "VBIOS_VERSION", - "HOSTNAME" - ] + "HOSTNAME", + "GPU_PARTITION_ID", + "GPU_COMPUTE_PARTITION_TYPE" + ], + "HealthThresholds" : { + "GPU_ECC_UNCORRECT_SDMA" : 0, + "GPU_ECC_UNCORRECT_GFX" : 0, + "GPU_ECC_UNCORRECT_MMHUB" : 0, + "GPU_ECC_UNCORRECT_ATHUB" : 0, + "GPU_ECC_UNCORRECT_BIF" : 0, + "GPU_ECC_UNCORRECT_HDP" : 0, + "GPU_ECC_UNCORRECT_XGMI_WAFL" : 0, + "GPU_ECC_UNCORRECT_DF" : 0, + "GPU_ECC_UNCORRECT_SMN" : 0, + "GPU_ECC_UNCORRECT_SEM" : 0, + "GPU_ECC_UNCORRECT_MP0" : 0, + "GPU_ECC_UNCORRECT_MP1" : 0, + "GPU_ECC_UNCORRECT_FUSE" : 0, + "GPU_ECC_UNCORRECT_UMC" : 0, + "GPU_ECC_UNCORRECT_MCA" : 0, + "GPU_ECC_UNCORRECT_VCN" : 0, + "GPU_ECC_UNCORRECT_JPEG" : 0, + "GPU_ECC_UNCORRECT_IH" : 0, + "GPU_ECC_UNCORRECT_MPIO" : 0 + }, + "CustomLabels" : { + "CLUSTER_NAME" : "amdgpu-k8s-metrics-exporter" + } } } From c257a6c2aa07562c78d89c5a9fd8d95b6b912e89 Mon Sep 17 00:00:00 2001 From: vm Date: Thu, 20 Mar 2025 06:05:15 +0000 Subject: [PATCH 03/12] Utils Container Openshift Security Context Constraint access --- Makefile | 2 +- ...md-gpu-operator.clusterserviceversion.yaml | 12 ++++++- config/rbac/kustomization.yaml | 3 ++ config/rbac/utils_container_role.yaml | 13 +++++++ config/rbac/utils_container_role_binding.yaml | 11 ++++++ .../rbac/utils_container_service_account.yaml | 4 +++ hack/k8s-patch/metadata-patch/values.yaml | 3 ++ .../template-patch/serviceaccount.yaml | 11 ++++++ .../template-patch/utils-container-rbac.yaml | 34 +++++++++++++++++++ .../template-patch/utils-container-rbac.yaml | 34 +++++++++++++++++++ helm-charts-k8s/Chart.lock | 2 +- helm-charts-k8s/templates/serviceaccount.yaml | 11 ++++++ .../templates/utils-container-rbac.yaml | 34 +++++++++++++++++++ helm-charts-k8s/values.yaml | 3 ++ helm-charts-openshift/Chart.lock | 2 +- .../templates/utils-container-rbac.yaml | 34 +++++++++++++++++++ internal/controllers/upgrademgr.go | 13 ++++--- 17 files changed, 217 insertions(+), 9 deletions(-) create mode 100644 config/rbac/utils_container_role.yaml create mode 100644 config/rbac/utils_container_role_binding.yaml create mode 100644 config/rbac/utils_container_service_account.yaml create mode 100644 hack/k8s-patch/template-patch/utils-container-rbac.yaml create mode 100644 hack/openshift-patch/template-patch/utils-container-rbac.yaml create mode 100644 helm-charts-k8s/templates/utils-container-rbac.yaml create mode 100644 helm-charts-openshift/templates/utils-container-rbac.yaml diff --git a/Makefile b/Makefile index 99907539..e40cbf01 100644 --- a/Makefile +++ b/Makefile @@ -353,7 +353,7 @@ bundle-build: operator-sdk manifests kustomize ## OpenShift Build OLM bundle. cd config/manager && $(KUSTOMIZE) edit set image controller=$(IMG) cd config/manager-base && $(KUSTOMIZE) edit set image controller=$(IMG) OPERATOR_SDK="${OPERATOR_SDK}" \ - BUNDLE_GEN_FLAGS="${BUNDLE_GEN_FLAGS} --extra-service-accounts amd-gpu-operator-kmm-device-plugin,amd-gpu-operator-kmm-module-loader,amd-gpu-operator-node-labeller,amd-gpu-operator-metrics-exporter,amd-gpu-operator-metrics-exporter-rbac-proxy,amd-gpu-operator-test-runner,amd-gpu-operator-config-manager" \ + BUNDLE_GEN_FLAGS="${BUNDLE_GEN_FLAGS} --extra-service-accounts amd-gpu-operator-kmm-device-plugin,amd-gpu-operator-kmm-module-loader,amd-gpu-operator-node-labeller,amd-gpu-operator-metrics-exporter,amd-gpu-operator-metrics-exporter-rbac-proxy,amd-gpu-operator-test-runner,amd-gpu-operator-config-manager,amd-gpu-operator-utils-container" \ PKG=amd-gpu-operator \ SOURCE_DIR=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) \ KUBECTL_CMD=${KUBECTL_CMD} ./hack/generate-bundle diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 3c6bd39f..495f3670 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -30,7 +30,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2025-03-18T22:58:17Z" + createdAt: "2025-03-20T06:06:57Z" operatorframework.io/suggested-namespace: openshift-amd-gpu operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 @@ -955,6 +955,16 @@ spec: verbs: - use serviceAccountName: amd-gpu-operator-test-runner + - rules: + - apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use + serviceAccountName: amd-gpu-operator-utils-container deployments: - label: app.kubernetes.io/component: amd-gpu diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index ac367d92..332d351c 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -22,3 +22,6 @@ resources: - test_runner_service_account.yaml - test_runner_role.yaml - test_runner_role_binding.yaml + - utils_container_service_account.yaml + - utils_container_role.yaml + - utils_container_role_binding.yaml \ No newline at end of file diff --git a/config/rbac/utils_container_role.yaml b/config/rbac/utils_container_role.yaml new file mode 100644 index 00000000..5a5579fa --- /dev/null +++ b/config/rbac/utils_container_role.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: utils-container +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use \ No newline at end of file diff --git a/config/rbac/utils_container_role_binding.yaml b/config/rbac/utils_container_role_binding.yaml new file mode 100644 index 00000000..1f9ceb76 --- /dev/null +++ b/config/rbac/utils_container_role_binding.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: utils-container +subjects: +- kind: ServiceAccount + name: utils-container +roleRef: + kind: ClusterRole + name: utils-container + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/config/rbac/utils_container_service_account.yaml b/config/rbac/utils_container_service_account.yaml new file mode 100644 index 00000000..7d92dcd6 --- /dev/null +++ b/config/rbac/utils_container_service_account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: utils-container \ No newline at end of file diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml index 59caef95..71bfd56c 100644 --- a/hack/k8s-patch/metadata-patch/values.yaml +++ b/hack/k8s-patch/metadata-patch/values.yaml @@ -101,6 +101,9 @@ testRunner: configManager: serviceAccount: annotations: {} +utilsContainer: + serviceAccount: + annotations: {} global: proxy: env: {} diff --git a/hack/k8s-patch/template-patch/serviceaccount.yaml b/hack/k8s-patch/template-patch/serviceaccount.yaml index 57fe7999..ae6c0be4 100644 --- a/hack/k8s-patch/template-patch/serviceaccount.yaml +++ b/hack/k8s-patch/template-patch/serviceaccount.yaml @@ -85,3 +85,14 @@ metadata: {{- include "helm-charts-k8s.labels" . | nindent 4 }} annotations: {{- toYaml .Values.configManager.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.utilsContainer.serviceAccount.annotations | nindent 4 }} \ No newline at end of file diff --git a/hack/k8s-patch/template-patch/utils-container-rbac.yaml b/hack/k8s-patch/template-patch/utils-container-rbac.yaml new file mode 100644 index 00000000..65f1d965 --- /dev/null +++ b/hack/k8s-patch/template-patch/utils-container-rbac.yaml @@ -0,0 +1,34 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . }}-utils-container' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-utils-container + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/hack/openshift-patch/template-patch/utils-container-rbac.yaml b/hack/openshift-patch/template-patch/utils-container-rbac.yaml new file mode 100644 index 00000000..36ac4ad8 --- /dev/null +++ b/hack/openshift-patch/template-patch/utils-container-rbac.yaml @@ -0,0 +1,34 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-openshift.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-openshift.labels" . | nindent 4 }} +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-openshift.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-openshift.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-openshift.fullname" . }}-utils-container' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-utils-container + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 54d3ca29..6ad80130 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 -generated: "2025-03-14T23:56:25.183742256Z" +generated: "2025-03-20T06:06:33.9562362Z" diff --git a/helm-charts-k8s/templates/serviceaccount.yaml b/helm-charts-k8s/templates/serviceaccount.yaml index 57fe7999..ae6c0be4 100644 --- a/helm-charts-k8s/templates/serviceaccount.yaml +++ b/helm-charts-k8s/templates/serviceaccount.yaml @@ -85,3 +85,14 @@ metadata: {{- include "helm-charts-k8s.labels" . | nindent 4 }} annotations: {{- toYaml .Values.configManager.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.utilsContainer.serviceAccount.annotations | nindent 4 }} \ No newline at end of file diff --git a/helm-charts-k8s/templates/utils-container-rbac.yaml b/helm-charts-k8s/templates/utils-container-rbac.yaml new file mode 100644 index 00000000..65f1d965 --- /dev/null +++ b/helm-charts-k8s/templates/utils-container-rbac.yaml @@ -0,0 +1,34 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . }}-utils-container' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-utils-container + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml index 59caef95..71bfd56c 100644 --- a/helm-charts-k8s/values.yaml +++ b/helm-charts-k8s/values.yaml @@ -101,6 +101,9 @@ testRunner: configManager: serviceAccount: annotations: {} +utilsContainer: + serviceAccount: + annotations: {} global: proxy: env: {} diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index 697fba37..6e3c4ccc 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-18T22:58:15.38295759Z" +generated: "2025-03-20T06:06:55.80187139Z" diff --git a/helm-charts-openshift/templates/utils-container-rbac.yaml b/helm-charts-openshift/templates/utils-container-rbac.yaml new file mode 100644 index 00000000..36ac4ad8 --- /dev/null +++ b/helm-charts-openshift/templates/utils-container-rbac.yaml @@ -0,0 +1,34 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-openshift.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-openshift.labels" . | nindent 4 }} +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-openshift.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-openshift.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-openshift.fullname" . }}-utils-container' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-utils-container + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/internal/controllers/upgrademgr.go b/internal/controllers/upgrademgr.go index 071a0f82..c5f3fd6e 100644 --- a/internal/controllers/upgrademgr.go +++ b/internal/controllers/upgrademgr.go @@ -61,6 +61,7 @@ import ( const ( defaultUtilsImage = "docker.io/rocm/gpu-operator-utils:latest" + defaultSAName = "amd-gpu-operator-utils-container" ) type upgradeMgr struct { @@ -976,6 +977,7 @@ func (h *upgradeMgrHelper) getRebootPod(nodeName string, dc *amdv1alpha1.DeviceC nodeSelector := map[string]string{} nodeSelector["kubernetes.io/hostname"] = nodeName utilsImage := defaultUtilsImage + serviceaccount := defaultSAName if dc.Spec.CommonConfig.UtilsContainer.Image != "" { utilsImage = dc.Spec.CommonConfig.UtilsContainer.Image } @@ -989,11 +991,12 @@ func (h *upgradeMgrHelper) getRebootPod(nodeName string, dc *amdv1alpha1.DeviceC Namespace: dc.Namespace, }, Spec: v1.PodSpec{ - HostPID: true, - HostNetwork: true, - RestartPolicy: v1.RestartPolicyNever, - NodeSelector: nodeSelector, - ImagePullSecrets: imagePullSecrets, + ServiceAccountName: serviceaccount, + HostPID: true, + HostNetwork: true, + RestartPolicy: v1.RestartPolicyNever, + NodeSelector: nodeSelector, + ImagePullSecrets: imagePullSecrets, Containers: []v1.Container{ { Name: "reboot-container", From 0a08e9919b21a9133757e11cba14a54c9137589b Mon Sep 17 00:00:00 2001 From: vm Date: Fri, 21 Mar 2025 05:08:08 +0000 Subject: [PATCH 04/12] GPUOP-210 --- api/v1alpha1/deviceconfig_types.go | 2 +- .../manifests/amd-gpu-operator.clusterserviceversion.yaml | 2 +- bundle/manifests/amd.com_deviceconfigs.yaml | 2 +- config/crd/bases/amd.com_deviceconfigs.yaml | 2 +- docs/installation/openshift-olm.md | 7 +++++++ helm-charts-k8s/Chart.lock | 2 +- helm-charts-k8s/crds/deviceconfig-crd.yaml | 2 +- helm-charts-openshift/Chart.lock | 2 +- helm-charts-openshift/crds/deviceconfig-crd.yaml | 2 +- 9 files changed, 15 insertions(+), 8 deletions(-) diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 94a4d6a2..7c187780 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -117,7 +117,7 @@ type DriverSpec struct { // example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"} // +optional - // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` + // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` Image string `json:"image,omitempty"` // driver image registry TLS setting for the container image diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 495f3670..86d76210 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -30,7 +30,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2025-03-20T06:06:57Z" + createdAt: "2025-03-21T05:09:41Z" operatorframework.io/suggested-namespace: openshift-amd-gpu operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index a0476c71..898220ac 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -357,7 +357,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 5f7a02bd..05e8b815 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -353,7 +353,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/docs/installation/openshift-olm.md b/docs/installation/openshift-olm.md index 89625fc1..489644d9 100644 --- a/docs/installation/openshift-olm.md +++ b/docs/installation/openshift-olm.md @@ -204,6 +204,13 @@ spec: "feature.node.kubernetes.io/amd-gpu": "true" ``` +Things to note: +1. By default, there is no need to specify the image field in CR for Openshift. Default will be used which is: image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod + +2. If users specify image, $MOD_NAMESPACE can be a place holder , KMM Operator can automatically translate it to the namespace + +3. Openshift internal registry has image url restriction, OpenShift users cannot use image like `/` , it requires the image URL to be `//`. However, if any other registry is being used by the user, the image URL can be of either form. + The operator will: 1. Collect worker node system specifications diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 6ad80130..477dc578 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 -generated: "2025-03-20T06:06:33.9562362Z" +generated: "2025-03-21T05:09:30.645342377Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 6058c151..707eb1ce 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -361,7 +361,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index 6e3c4ccc..f39ad5ac 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-20T06:06:55.80187139Z" +generated: "2025-03-21T05:09:40.013067636Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 6058c151..707eb1ce 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -361,7 +361,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry From f08d3a75b68ed1d6b9c8d5b5e9f18a3e3794ecf7 Mon Sep 17 00:00:00 2001 From: yansun1996 Date: Mon, 17 Mar 2025 21:29:38 +0000 Subject: [PATCH 05/12] Ignore vendored docs in sanity --- docs/.markdownlint.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/.markdownlint.yaml b/docs/.markdownlint.yaml index 21ea9629..3d17a138 100644 --- a/docs/.markdownlint.yaml +++ b/docs/.markdownlint.yaml @@ -1,6 +1,3 @@ -ignores: - - CHANGELOG.md - - "vendor/**" default: true MD013: false MD024: @@ -12,3 +9,6 @@ MD029: MD033: false MD034: false MD041: false +ignores: + - CHANGELOG.md + - "vendor/**" \ No newline at end of file From 18465227372ce590945d5b7c9232c13f9da1365b Mon Sep 17 00:00:00 2001 From: yansun1996 Date: Mon, 17 Mar 2025 21:51:01 +0000 Subject: [PATCH 06/12] Fix docs sanity to ignore vendor folder --- .markdownlint-cli2.yaml | 2 ++ docs/.markdownlint.yaml => .markdownlint.yaml | 3 +-- .spellcheck.yml => .spellcheck.local.yaml | 11 ++++++----- docs/.markdownlint-cli2.yaml | 3 --- 4 files changed, 9 insertions(+), 10 deletions(-) create mode 100644 .markdownlint-cli2.yaml rename docs/.markdownlint.yaml => .markdownlint.yaml (83%) rename .spellcheck.yml => .spellcheck.local.yaml (95%) delete mode 100644 docs/.markdownlint-cli2.yaml diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml new file mode 100644 index 00000000..9d2459dc --- /dev/null +++ b/.markdownlint-cli2.yaml @@ -0,0 +1,2 @@ +ignores: + - "vendor/**/*.md" \ No newline at end of file diff --git a/docs/.markdownlint.yaml b/.markdownlint.yaml similarity index 83% rename from docs/.markdownlint.yaml rename to .markdownlint.yaml index 3d17a138..3ffda0de 100644 --- a/docs/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -10,5 +10,4 @@ MD033: false MD034: false MD041: false ignores: - - CHANGELOG.md - - "vendor/**" \ No newline at end of file + - "vendor/**/*.md" \ No newline at end of file diff --git a/.spellcheck.yml b/.spellcheck.local.yaml similarity index 95% rename from .spellcheck.yml rename to .spellcheck.local.yaml index 9291279a..da2912e5 100644 --- a/.spellcheck.yml +++ b/.spellcheck.local.yaml @@ -33,9 +33,9 @@ matrix: - name: Markdown sources: - - ['docs/**/*.md', '!docs/doxygen/mainpage.md', '!docs/contributing/documentation-standards.md'] - - ['tools/autotag/templates/**/*.md', '!tools/autotag/templates/**/5*.md', '!tools/autotag/templates/**/6.0*.md', '!tools/autotag/templates/**/6.1*.md'] - expect_match: false + - '!vendor/**|docs/**/*.md|!docs/doxygen/mainpage.md|!docs/contributing/documentation-standards.md' + expect_match: true + jobs: 4 aspell: lang: en dictionary: @@ -115,8 +115,9 @@ matrix: - pyspelling.filters.url: - name: reST sources: - - 'docs/**/*.rst' - expect_match: false + - '!vendor/**|docs/**/*.rst' + expect_match: true + jobs: 4 aspell: lang: en dictionary: diff --git a/docs/.markdownlint-cli2.yaml b/docs/.markdownlint-cli2.yaml deleted file mode 100644 index 74870cb6..00000000 --- a/docs/.markdownlint-cli2.yaml +++ /dev/null @@ -1,3 +0,0 @@ -ignores: - - CHANGELOG.md - - "vendor/**" \ No newline at end of file From 70a556a821671352d02ad705703586a0c15aa515 Mon Sep 17 00:00:00 2001 From: vm Date: Fri, 21 Mar 2025 05:08:08 +0000 Subject: [PATCH 07/12] GPUOP-210 --- api/v1alpha1/deviceconfig_types.go | 2 +- .../manifests/amd-gpu-operator.clusterserviceversion.yaml | 2 +- bundle/manifests/amd.com_deviceconfigs.yaml | 2 +- config/crd/bases/amd.com_deviceconfigs.yaml | 2 +- docs/installation/openshift-olm.md | 7 +++++++ helm-charts-k8s/Chart.lock | 2 +- helm-charts-k8s/crds/deviceconfig-crd.yaml | 2 +- helm-charts-openshift/Chart.lock | 2 +- helm-charts-openshift/crds/deviceconfig-crd.yaml | 2 +- 9 files changed, 15 insertions(+), 8 deletions(-) diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 94a4d6a2..7c187780 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -117,7 +117,7 @@ type DriverSpec struct { // example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"} // +optional - // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` + // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` Image string `json:"image,omitempty"` // driver image registry TLS setting for the container image diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 495f3670..86d76210 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -30,7 +30,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2025-03-20T06:06:57Z" + createdAt: "2025-03-21T05:09:41Z" operatorframework.io/suggested-namespace: openshift-amd-gpu operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index a0476c71..898220ac 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -357,7 +357,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 5f7a02bd..05e8b815 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -353,7 +353,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/docs/installation/openshift-olm.md b/docs/installation/openshift-olm.md index 89625fc1..489644d9 100644 --- a/docs/installation/openshift-olm.md +++ b/docs/installation/openshift-olm.md @@ -204,6 +204,13 @@ spec: "feature.node.kubernetes.io/amd-gpu": "true" ``` +Things to note: +1. By default, there is no need to specify the image field in CR for Openshift. Default will be used which is: image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod + +2. If users specify image, $MOD_NAMESPACE can be a place holder , KMM Operator can automatically translate it to the namespace + +3. Openshift internal registry has image url restriction, OpenShift users cannot use image like `/` , it requires the image URL to be `//`. However, if any other registry is being used by the user, the image URL can be of either form. + The operator will: 1. Collect worker node system specifications diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 6ad80130..477dc578 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 -generated: "2025-03-20T06:06:33.9562362Z" +generated: "2025-03-21T05:09:30.645342377Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 6058c151..707eb1ce 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -361,7 +361,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index 6e3c4ccc..f39ad5ac 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-20T06:06:55.80187139Z" +generated: "2025-03-21T05:09:40.013067636Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 6058c151..707eb1ce 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -361,7 +361,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry From 40bf349916f10e219840cd119e282f91106063d6 Mon Sep 17 00:00:00 2001 From: yansun1996 Date: Mon, 24 Mar 2025 22:31:12 +0000 Subject: [PATCH 08/12] Refactor IsOpenshift function based on RedHat suggestion --- cmd/main.go | 12 +++++---- internal/kmmmodule/kmmmodule.go | 19 ++------------ internal/nodelabeller/nodelabeller.go | 19 ++------------ internal/utils.go | 37 ++++++++++++++++++++++++++- 4 files changed, 47 insertions(+), 40 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index b3c985ff..168a730d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -35,9 +35,6 @@ package main import ( "flag" - "github.com/ROCm/gpu-operator/internal/configmanager" - "github.com/ROCm/gpu-operator/internal/metricsexporter" - "github.com/ROCm/gpu-operator/internal/testrunner" kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -51,11 +48,15 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" gpuev1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + utils "github.com/ROCm/gpu-operator/internal" "github.com/ROCm/gpu-operator/internal/cmd" "github.com/ROCm/gpu-operator/internal/config" + "github.com/ROCm/gpu-operator/internal/configmanager" "github.com/ROCm/gpu-operator/internal/controllers" "github.com/ROCm/gpu-operator/internal/kmmmodule" + "github.com/ROCm/gpu-operator/internal/metricsexporter" "github.com/ROCm/gpu-operator/internal/nodelabeller" + "github.com/ROCm/gpu-operator/internal/testrunner" //+kubebuilder:scaffold:imports ) @@ -107,8 +108,9 @@ func main() { } client := mgr.GetClient() - kmmHandler := kmmmodule.NewKMMModule(client, scheme) - nlHandler := nodelabeller.NewNodeLabeller(scheme) + isOpenShift := utils.IsOpenShift(setupLogger) + kmmHandler := kmmmodule.NewKMMModule(client, scheme, isOpenShift) + nlHandler := nodelabeller.NewNodeLabeller(scheme, isOpenShift) metricsHandler := metricsexporter.NewMetricsExporter(scheme) testrunnerHandler := testrunner.NewTestRunner(scheme) configmanagerHandler := configmanager.NewConfigManager(scheme) diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go index 9aa6632b..8c753604 100644 --- a/internal/kmmmodule/kmmmodule.go +++ b/internal/kmmmodule/kmmmodule.go @@ -55,10 +55,8 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/discovery" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" @@ -107,27 +105,14 @@ type kmmModule struct { isOpenShift bool } -func NewKMMModule(client client.Client, scheme *runtime.Scheme) KMMModuleAPI { +func NewKMMModule(client client.Client, scheme *runtime.Scheme, isOpenShift bool) KMMModuleAPI { return &kmmModule{ client: client, scheme: scheme, - isOpenShift: isOpenshift(), + isOpenShift: isOpenShift, } } -func isOpenshift() bool { - if dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil { - if gplist, err := dc.ServerGroups(); err == nil { - for _, gp := range gplist.Groups { - if gp.Name == "route.openshift.io" { - return true - } - } - } - } - return false -} - func (km *kmmModule) SetNodeVersionLabelAsDesired(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { // for each selected node // put the KMM version label given by CR's driver version diff --git a/internal/nodelabeller/nodelabeller.go b/internal/nodelabeller/nodelabeller.go index 8f60805b..959bf39f 100644 --- a/internal/nodelabeller/nodelabeller.go +++ b/internal/nodelabeller/nodelabeller.go @@ -42,9 +42,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/discovery" "k8s.io/utils/ptr" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) @@ -66,26 +64,13 @@ type nodeLabeller struct { isOpenShift bool } -func NewNodeLabeller(scheme *runtime.Scheme) NodeLabeller { +func NewNodeLabeller(scheme *runtime.Scheme, isOpenshift bool) NodeLabeller { return &nodeLabeller{ scheme: scheme, - isOpenShift: isOpenshift(), + isOpenShift: isOpenshift, } } -func isOpenshift() bool { - if dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil { - if gplist, err := dc.ServerGroups(); err == nil { - for _, gp := range gplist.Groups { - if gp.Name == "route.openshift.io" { - return true - } - } - } - } - return false -} - func (nl *nodeLabeller) SetNodeLabellerAsDesired(ds *appsv1.DaemonSet, devConfig *amdv1alpha1.DeviceConfig) error { if ds == nil { return fmt.Errorf("daemon set is not initialized, zero pointer") diff --git a/internal/utils.go b/internal/utils.go index 9c67f1d3..b34dd031 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -17,15 +17,23 @@ limitations under the License. package utils import ( + "context" "fmt" "strings" - amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + "github.com/go-logr/logr" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + + amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + "github.com/ROCm/gpu-operator/internal/cmd" ) const ( defaultOcDriversVersion = "6.2.2" + openShiftNodeLabel = "node.openshift.io/os_id" NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" ) @@ -88,3 +96,30 @@ func HasNodeLabelKey(node v1.Node, labelKey string) bool { } return false } + +func IsOpenShift(logger logr.Logger) bool { + config, err := rest.InClusterConfig() + if err != nil { + cmd.FatalError(logger, err, "unable to get cluster config") + } + // creates the clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + cmd.FatalError(logger, err, "unable to create cluster clientset") + } + // Check for OpenShift-specific labels on nodes + nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + cmd.FatalError(logger, err, "unable to list nodes") + } + + isOpenShift := false + for _, node := range nodes.Items { + if _, exists := node.Labels[openShiftNodeLabel]; exists { + isOpenShift = true + break + } + } + logger.Info(fmt.Sprintf("IsOpenShift: %+v", isOpenShift)) + return isOpenShift +} From 044c6b4794046150131e7d153607ad077f52d012 Mon Sep 17 00:00:00 2001 From: vm Date: Tue, 25 Mar 2025 06:30:37 +0000 Subject: [PATCH 09/12] Device Plugin Args option --- api/v1alpha1/deviceconfig_types.go | 9 +++---- api/v1alpha1/zz_generated.deepcopy.go | 7 ++++++ ...md-gpu-operator.clusterserviceversion.yaml | 14 ++++++----- bundle/manifests/amd.com_deviceconfigs.yaml | 14 +++++------ config/crd/bases/amd.com_deviceconfigs.yaml | 14 +++++------ ...md-gpu-operator.clusterserviceversion.yaml | 12 +++++---- helm-charts-k8s/Chart.lock | 2 +- helm-charts-k8s/crds/deviceconfig-crd.yaml | 14 +++++------ helm-charts-openshift/Chart.lock | 2 +- .../crds/deviceconfig-crd.yaml | 14 +++++------ internal/kmmmodule/kmmmodule.go | 10 ++++++-- internal/utils.go | 11 +++++--- internal/validator/specValidators.go | 25 +++++++++++++++++++ tests/e2e/Makefile | 6 +++++ tests/e2e/cluster_test.go | 6 ++--- tests/e2e/init.go | 15 +++++++++++ 16 files changed, 120 insertions(+), 55 deletions(-) diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 7c187780..503c0939 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -251,12 +251,11 @@ type DevicePluginSpec struct { // +optional DevicePluginTolerations []v1.Toleration `json:"devicePluginTolerations,omitempty"` - // resource naming strategy for device plugin - //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ResourceNamingStrategy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy"} - // +kubebuilder:validation:Enum=single;mixed - // +kubebuilder:default:="single" + // device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + // supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="DevicePluginArguments",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments"} // +optional - ResourceNamingStrategy string `json:"resourceNamingStrategy,omitempty"` + DevicePluginArguments map[string]string `json:"devicePluginArguments,omitempty"` // node labeller image //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeLabellerImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerImage"} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index bbbd03c0..c2be36c9 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -267,6 +267,13 @@ func (in *DevicePluginSpec) DeepCopyInto(out *DevicePluginSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DevicePluginArguments != nil { + in, out := &in.DevicePluginArguments, &out.DevicePluginArguments + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } if in.NodeLabellerTolerations != nil { in, out := &in.NodeLabellerTolerations, &out.NodeLabellerTolerations *out = make([]v1.Toleration, len(*in)) diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 86d76210..45078acb 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -30,7 +30,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2025-03-21T05:09:41Z" + createdAt: "2025-03-25T06:19:27Z" operatorframework.io/suggested-namespace: openshift-amd-gpu operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 @@ -152,6 +152,13 @@ spec: path: devicePlugin x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin + - description: 'device plugin arguments is used to pass supported flags and + their values while starting device plugin daemonset supported flag values: + {"resource_naming_strategy": {"single", "mixed"}}' + displayName: DevicePluginArguments + path: devicePlugin.devicePluginArguments + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments - description: device plugin image displayName: DevicePluginImage path: devicePlugin.devicePluginImage @@ -192,11 +199,6 @@ spec: path: devicePlugin.nodeLabellerTolerations x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations - - description: resource naming strategy for device plugin - displayName: ResourceNamingStrategy - path: devicePlugin.resourceNamingStrategy - x-descriptors: - - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy - description: upgrade policy for device plugin and node labeller daemons displayName: UpgradePolicy path: devicePlugin.upgradePolicy diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index 898220ac..c9123ffe 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -190,6 +190,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -306,13 +313,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 05e8b815..24c2b053 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -186,6 +186,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -302,13 +309,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index a5b6cd65..a9f4d685 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -123,6 +123,13 @@ spec: path: devicePlugin x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin + - description: 'device plugin arguments is used to pass supported flags and + their values while starting device plugin daemonset supported flag values: + {"resource_naming_strategy": {"single", "mixed"}}' + displayName: DevicePluginArguments + path: devicePlugin.devicePluginArguments + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments - description: device plugin image displayName: DevicePluginImage path: devicePlugin.devicePluginImage @@ -163,11 +170,6 @@ spec: path: devicePlugin.nodeLabellerTolerations x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations - - description: resource naming strategy for device plugin - displayName: ResourceNamingStrategy - path: devicePlugin.resourceNamingStrategy - x-descriptors: - - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy - description: upgrade policy for device plugin and node labeller daemons displayName: UpgradePolicy path: devicePlugin.upgradePolicy diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 477dc578..54b4cb8c 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 -generated: "2025-03-21T05:09:30.645342377Z" +generated: "2025-03-25T06:19:17.248998622Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 707eb1ce..502f4b89 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -194,6 +194,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -310,13 +317,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index f39ad5ac..6e9b718d 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-21T05:09:40.013067636Z" +generated: "2025-03-25T06:19:26.060856628Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 707eb1ce..502f4b89 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -194,6 +194,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -310,13 +317,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go index 8c753604..9ca34383 100644 --- a/internal/kmmmodule/kmmmodule.go +++ b/internal/kmmmodule/kmmmodule.go @@ -257,8 +257,14 @@ func (km *kmmModule) SetDevicePluginAsDesired(ds *appsv1.DaemonSet, devConfig *a return fmt.Errorf("daemon set is not initialized, zero pointer") } - resourceNamingStrategy := devConfig.Spec.DevicePlugin.ResourceNamingStrategy - command := []string{"sh", "-c", fmt.Sprintf("./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30 -resource_naming_strategy=%s", resourceNamingStrategy)} + commandArgs := "./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30" + + devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments + for key, val := range devicePluginArguments { + commandArgs += " -" + key + "=" + val + } + + command := []string{"sh", "-c", commandArgs} nodeSelector := map[string]string{} for key, val := range devConfig.Spec.Selector { nodeSelector[key] = val diff --git a/internal/utils.go b/internal/utils.go index b34dd031..bc642343 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -32,10 +32,13 @@ import ( ) const ( - defaultOcDriversVersion = "6.2.2" - openShiftNodeLabel = "node.openshift.io/os_id" - NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" - NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" + defaultOcDriversVersion = "6.2.2" + openShiftNodeLabel = "node.openshift.io/os_id" + NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" + NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" + ResourceNamingStrategyFlag = "resource_naming_strategy" + SingleStrategy = "single" + MixedStrategy = "mixed" ) func GetDriverVersion(node v1.Node, deviceConfig amdv1alpha1.DeviceConfig) (string, error) { diff --git a/internal/validator/specValidators.go b/internal/validator/specValidators.go index f6d87ca7..b804c488 100644 --- a/internal/validator/specValidators.go +++ b/internal/validator/specValidators.go @@ -21,6 +21,7 @@ import ( "fmt" amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + utils "github.com/ROCm/gpu-operator/internal" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -86,5 +87,29 @@ func ValidateDevicePluginSpec(ctx context.Context, client client.Client, devConf } } + supportedFlagValues := map[string][]string{ + utils.ResourceNamingStrategyFlag: {utils.SingleStrategy, utils.MixedStrategy}, + } + + devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments + for key, val := range devicePluginArguments { + validValues, validKey := supportedFlagValues[key] + if !validKey { + return fmt.Errorf("Invalid flag: %s", key) + } + validKeyValue := false + + for _, validVal := range validValues { + if val == validVal { + validKeyValue = true + break + } + } + + if !validKeyValue { + return fmt.Errorf("Invalid flag value: %s=%s. Supported values: %v", key, val, supportedFlagValues[key]) + } + } + return nil } diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 90b1e791..e00185c7 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -7,8 +7,11 @@ E2E_KUBE_RBAC_PROXY_CURL_IMAGE ?= curlimages/curl:7.78.0 E2E_UBUNTU_BASE_IMAGE ?= ubuntu:22.04 E2E_MINIO_IMAGE ?= minio/minio:latest E2E_EXPORTER_IMAGE ?= rocm/device-metrics-exporter:v1.2.0 +E2E_EXPORTER_IMAGE_2 ?= rocm/device-metrics-exporter:v1.1.1-beta.0 E2E_DEVICE_PLUGIN_IMAGE ?= rocm/k8s-device-plugin:latest E2E_NODE_LABELLER_IMAGE ?= rocm/k8s-device-plugin:labeller-latest +E2E_DEVICE_PLUGIN_IMAGE_2 ?= rocm/k8s-device-plugin:1.31.0.6 +E2E_NODE_LABELLER_IMAGE_2 ?= rocm/k8s-device-plugin:labeller-1.31.0.6 E2E_TEST_RUNNER_IMAGE ?= rocm/test-runner:v1.2.0-beta.0 export E2E_INIT_CONTAINER_IMAGE @@ -16,8 +19,11 @@ export E2E_KUBE_RBAC_PROXY_CURL_IMAGE export E2E_UBUNTU_BASE_IMAGE export E2E_MINIO_IMAGE export E2E_EXPORTER_IMAGE +export E2E_EXPORTER_IMAGE_2 export E2E_DEVICE_PLUGIN_IMAGE export E2E_NODE_LABELLER_IMAGE +export E2E_DEVICE_PLUGIN_IMAGE_2 +export E2E_NODE_LABELLER_IMAGE_2 export E2E_TEST_RUNNER_IMAGE export E2E_DCM_IMAGE diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go index ed22eb99..5c10af8f 100644 --- a/tests/e2e/cluster_test.go +++ b/tests/e2e/cluster_test.go @@ -1877,8 +1877,8 @@ func (s *E2ESuite) TestDevicePluginNodeLabellerDaemonSetUpgrade(c *C) { // upgrade // update the CR's device plugin with image - devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage - devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage + devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage2 + devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage2 s.patchDevicePluginImage(devCfg, c) s.patchNodeLabellerImage(devCfg, c) s.verifyDevicePluginStatus(s.ns, c, devCfg) @@ -1911,7 +1911,7 @@ func (s *E2ESuite) TestMetricsExporterDaemonSetUpgrade(c *C) { // upgrade // update the CR's device plugin with image - devCfg.Spec.MetricsExporter.Image = exporterImage + devCfg.Spec.MetricsExporter.Image = exporterImage2 s.patchMetricsExporterImage(devCfg, c) s.verifyDeviceConfigStatus(devCfg, c) s.checkMetricsExporterStatus(devCfg, s.ns, v1.ServiceTypeClusterIP, c) diff --git a/tests/e2e/init.go b/tests/e2e/init.go index d7a863eb..973cedb6 100644 --- a/tests/e2e/init.go +++ b/tests/e2e/init.go @@ -25,8 +25,11 @@ var ( initContainerImage string kubeRbacProxyCurlImage string exporterImage string + exporterImage2 string devicePluginImage string nodeLabellerImage string + devicePluginImage2 string + nodeLabellerImage2 string testRunnerImage string driverImageRepo string ) @@ -46,6 +49,10 @@ func init() { if !ok { log.Fatalf("E2E_EXPORTER_IMAGE is not defined") } + exporterImage2, ok = os.LookupEnv("E2E_EXPORTER_IMAGE_2") + if !ok { + log.Fatalf("E2E_EXPORTER_IMAGE_2 is not defined") + } devicePluginImage, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE") if !ok { log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE is not defined") @@ -54,6 +61,14 @@ func init() { if !ok { log.Fatalf("E2E_NODE_LABELLER_IMAGE is not defined") } + devicePluginImage2, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE_2") + if !ok { + log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE_2 is not defined") + } + nodeLabellerImage2, ok = os.LookupEnv("E2E_NODE_LABELLER_IMAGE_2") + if !ok { + log.Fatalf("E2E_NODE_LABELLER_IMAGE_2 is not defined") + } testRunnerImage, ok = os.LookupEnv("E2E_TEST_RUNNER_IMAGE") if !ok { log.Fatalf("E2E_TEST_RUNNER_IMAGE is not defined") From 5aaffe5fe36a3c7346b91ee34b38d64893c15dca Mon Sep 17 00:00:00 2001 From: vm Date: Wed, 26 Mar 2025 06:18:58 +0000 Subject: [PATCH 10/12] Regression caused from previous commit --- internal/controllers/upgrademgr.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/controllers/upgrademgr.go b/internal/controllers/upgrademgr.go index c5f3fd6e..a5e519b2 100644 --- a/internal/controllers/upgrademgr.go +++ b/internal/controllers/upgrademgr.go @@ -151,11 +151,6 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha // 1. Set init status for unprocessed nodes n.helper.handleInitStatus(ctx, &nodeList.Items[i]) - if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) { - res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20} - continue - } - // 2. Handle failed nodes if n.helper.isNodeStateUpgradeFailed(ctx, &nodeList.Items[i], deviceConfig) { n.helper.clearUpgradeStartTime(nodeList.Items[i].Name) @@ -193,6 +188,11 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha continue } + if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) { + res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20} + continue + } + //This node is a candidate for selection candidateNodes = append(candidateNodes, nodeList.Items[i]) } From 42692bd60d92e67b330023c22870dfb85662803a Mon Sep 17 00:00:00 2001 From: vm Date: Wed, 26 Mar 2025 03:14:38 +0000 Subject: [PATCH 11/12] Device Plugin e2e for homogeneous/heterogeneous with single/mixed strategy --- tests/e2e/cluster_test.go | 242 ++++++++++++++++++++++++++++++++++- tests/e2e/dcm_e2e_test.go | 3 +- tests/e2e/testrunner_test.go | 6 +- tests/e2e/utils/utils.go | 18 +-- 4 files changed, 251 insertions(+), 18 deletions(-) diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go index 5c10af8f..40fd52e5 100644 --- a/tests/e2e/cluster_test.go +++ b/tests/e2e/cluster_test.go @@ -1051,7 +1051,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { s.verifyDeviceConfigStatus(devCfg, c) s.verifyNodeGPULabel(devCfg, c) - ret, err := utils.GetAMDGPUCount(ctx, s.clientSet) + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu") if err != nil { logger.Errorf("error: %v", err) } @@ -1078,7 +1078,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) assert.NoError(c, err, "failed to deploy pods") s.verifyROCMPOD(true, c) - err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount) + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu") assert.NoError(c, err, fmt.Sprintf("%v", err)) // delete @@ -1092,6 +1092,244 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { assert.NoError(c, err, "failed to reboot nodes") } +func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousSingle(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile2") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogenous default partitioning") + } else { + logger.Errorf("Failure test homogenous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + + time.Sleep(60 * time.Second) + + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + s.verifyNodeGPULabel(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") +} + +func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousMixed(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile2") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogeneous partitioning") + } else { + logger.Errorf("Failure test homogeneous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + time.Sleep(60 * time.Second) + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"} + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps4") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps4") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") + +} + +func (s *E2ESuite) TestWorkloadRequestedGPUsHeterogeneousMixed(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile1") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogeneous partitioning") + } else { + logger.Errorf("Failure test heterogenous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + time.Sleep(60 * time.Second) + + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"} + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps1") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps1") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") +} + func (s *E2ESuite) TestKubeRbacProxyClusterIP(c *C) { _, err := s.dClient.DeviceConfigs(s.ns).Get("deviceconfig-kuberbac-clusterip", metav1.GetOptions{}) assert.Errorf(c, err, "config deviceconfig-kuberbac-clusterip exists") diff --git a/tests/e2e/dcm_e2e_test.go b/tests/e2e/dcm_e2e_test.go index f3f8b9df..cd11bd3c 100644 --- a/tests/e2e/dcm_e2e_test.go +++ b/tests/e2e/dcm_e2e_test.go @@ -72,7 +72,7 @@ func (s *E2ESuite) addRemoveNodeLabels(nodeName string, selectedProfile string) logger.Infof("Error adding node lbels: %s\n", err.Error()) return } - time.Sleep(15 * time.Second) + time.Sleep(45 * time.Second) // Allow partition to happen err = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/gpu-config-profile") _ = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile") @@ -269,6 +269,7 @@ func (s *E2ESuite) createConfigMap() GPUConfigProfiles { { ComputePartition: "CPX", MemoryPartition: "NPS4", + NumGPUsAssigned: 1, }, } diff --git a/tests/e2e/testrunner_test.go b/tests/e2e/testrunner_test.go index 305b6f59..aa7b37a7 100644 --- a/tests/e2e/testrunner_test.go +++ b/tests/e2e/testrunner_test.go @@ -200,7 +200,7 @@ func (s *E2ESuite) createTestRunnerConfigmap(valid bool, devCfg *v1alpha1.Device } func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string { - ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet) + ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu") if err != nil { logger.Errorf("error: %v", err) } @@ -228,7 +228,7 @@ func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string { err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) assert.NoError(c, err, "failed to deploy pods") - err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount) + err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount, "gpu") assert.NoError(c, err, fmt.Sprintf("%v", err)) return nodeWithMaxGPU @@ -730,7 +730,7 @@ func (s *E2ESuite) TestTestRunnerLogsExport(c *C) { func (s *E2ESuite) getGPUNodeName() (nodeWithMaxGPU string) { var maxPerNodeGPU int = 0 - ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet) + ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu") if err != nil { logger.Printf("Unable to fetch gpu nodes. Error %v", err) return diff --git a/tests/e2e/utils/utils.go b/tests/e2e/utils/utils.go index 9c9dcf9f..5813ccf5 100644 --- a/tests/e2e/utils/utils.go +++ b/tests/e2e/utils/utils.go @@ -598,14 +598,6 @@ func GetWorkerNodes(cl *kubernetes.Clientset) []*v1.Node { func GetAMDGpuWorker(cl *kubernetes.Clientset, isOpenshift bool) []v1.Node { ret := make([]v1.Node, 0) labelSelector := labels.NewSelector() - if !isOpenshift { - r, _ := labels.NewRequirement( - "node-role.kubernetes.io/control-plane", - selection.DoesNotExist, - nil, - ) - labelSelector = labelSelector.Add(*r) - } r, _ := labels.NewRequirement( "feature.node.kubernetes.io/amd-gpu", selection.Equals, @@ -766,7 +758,7 @@ func DelRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset, } -func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]int, error) { +func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset, resourceType string) (map[string]int, error) { ret := make(map[string]int) // Get the list of nodes @@ -777,7 +769,8 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i // Iterate over the nodes and count AMD GPUs for _, node := range nodes.Items { - if val, ok := node.Status.Capacity["amd.com/gpu"]; ok { + resourceKey := v1.ResourceName("amd.com/" + resourceType) + if val, ok := node.Status.Capacity[resourceKey]; ok { num, err := strconv.ParseInt(val.String(), 10, 64) if err != nil { log.Infof("error: %v", err) @@ -790,7 +783,7 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i } func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset, - gpuReqCount int) error { + gpuReqCount int, resourceType string) error { its, err := cl.CoreV1().Pods("").List(ctx, metav1.ListOptions{ @@ -805,7 +798,8 @@ func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset, continue } - if gpu, ok := cntr.Resources.Requests["amd.com/gpu"]; ok { + resourceKey := v1.ResourceName("amd.com/" + resourceType) + if gpu, ok := cntr.Resources.Requests[resourceKey]; ok { gpuAssignedCount := int(gpu.Value()) if gpuReqCount < gpuAssignedCount { return fmt.Errorf("gpu requested %d got %d", From 996d8c82c8aa2433532e755aa0bca646bbdb8e3e Mon Sep 17 00:00:00 2001 From: "Saad Rahim (AMD)" <44449863+saadrahim@users.noreply.github.com> Date: Thu, 27 Mar 2025 15:49:54 -0600 Subject: [PATCH 12/12] Fix markdown linting errors --- docs/installation/openshift-olm.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/installation/openshift-olm.md b/docs/installation/openshift-olm.md index 489644d9..889ada17 100644 --- a/docs/installation/openshift-olm.md +++ b/docs/installation/openshift-olm.md @@ -205,6 +205,7 @@ spec: ``` Things to note: + 1. By default, there is no need to specify the image field in CR for Openshift. Default will be used which is: image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod 2. If users specify image, $MOD_NAMESPACE can be a place holder , KMM Operator can automatically translate it to the namespace