From 4015da76635f7364f2f2706d2a1b14d3b80b429b Mon Sep 17 00:00:00 2001
From: yansun1996 <yan@pensando.io>
Date: Thu, 20 Mar 2025 02:29:26 +0000
Subject: [PATCH 01/12] Remove old dcgpu link and update to instinct doc link

---
 Makefile                  | 4 ++--
 docs/fulldeviceconfig.rst | 4 ++--
 docs/knownlimitations.md  | 2 +-
 helm-charts/README.md     | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index 19a3ba0c..99907539 100644
--- a/Makefile
+++ b/Makefile
@@ -417,12 +417,12 @@ kustomize:
 # go-get-tool will 'go install' any package $2 and install it to $1.
 PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
 define go-get-tool
-    @if [ -z "$$GOBIN" ]; then \
+    if [ -z "$$GOBIN" ]; then \
         GOBIN=$$(go env GOPATH)/bin; \
     else \
         GOBIN=$$GOBIN; \
     fi; \
-    @[ -f $(1) ] || { \
+    [ -f $(1) ] || { \
     set -e; \
     echo "Downloading $(2)"; \
     echo "Running: GOBIN=$(PROJECT_DIR)/bin go install $(2)"; \
diff --git a/docs/fulldeviceconfig.rst b/docs/fulldeviceconfig.rst
index 45d3f3de..8d8c1d95 100644
--- a/docs/fulldeviceconfig.rst
+++ b/docs/fulldeviceconfig.rst
@@ -131,7 +131,7 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
           upgradeStrategy: RollingUpdate, # (Optional) Can be either `RollingUpdate` or `OnDelete`
           maxUnavailable: 1 # (Optional) Number of pods that can be unavailable during the upgrade process. 1 is the default value
         # If specifying a node selector here, the metrics exporter will only be deployed on nodes that match the selector
-        # See Item #6 on https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage
+        # See Item #6 on https://instinct.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage
         tolerations:
           key: "key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
           # operator must be "Exists"; this combination means to match all values and all keys.
@@ -174,7 +174,7 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
           upgradeStrategy: RollingUpdate, # (Optional) Can be either `RollingUpdate` or `OnDelete`
           maxUnavailable: 1 # (Optional) Number of pods that can be unavailable during the upgrade process. 1 is the default value
         # If specifying a node selector here, the metrics exporter will only be deployed on nodes that match the selector
-        # See Item #6 on https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage
+        # See Item #6 on https://instinct.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage
         tolerations:
           key: "key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
           # operator must be "Exists"; this combination means to match all values and all keys.
diff --git a/docs/knownlimitations.md b/docs/knownlimitations.md
index 9702009a..051b48cd 100644
--- a/docs/knownlimitations.md
+++ b/docs/knownlimitations.md
@@ -15,7 +15,7 @@
 3. **GPU Operator unable to install amdgpu driver if existing driver is already installed**
    - ***Impact:*** Driver install will fail if amdgpu in-box Driver is present/already installed
    - ***Affected Configurations:*** All configurations
-   - ***Workaround:*** When installing the amdgpu drivers using the GPU Operator, worker nodes should have amdgpu blacklisted or amdgpu drivers should not be pre-installed on the node. [Blacklist in-box driver](https://dcgpu.docs.amd.com/projects/gpu-operator/en/release-v1.0.0/drivers/installation.html#blacklist-inbox-driver) so that it is not loaded or remove the pre-installed driver
+   - ***Workaround:*** When installing the amdgpu drivers using the GPU Operator, worker nodes should have amdgpu blacklisted or amdgpu drivers should not be pre-installed on the node. [Blacklist in-box driver](https://instinct.docs.amd.com/projects/gpu-operator/en/release-v1.0.0/drivers/installation.html#blacklist-inbox-driver) so that it is not loaded or remove the pre-installed driver
 </br></br>
 
 4. **When GPU Operator is used in SKIP driver install mode, if amdgpu module is removed with device plugin installed it will not reflect active GPU available on the server**
diff --git a/helm-charts/README.md b/helm-charts/README.md
index 7e7ec9c4..72bb01ce 100644
--- a/helm-charts/README.md
+++ b/helm-charts/README.md
@@ -80,7 +80,7 @@ Installation Options
 
 ### 3. Install Custom Resource
 
-After the installation of AMD GPU Operator, you need to create the `DeviceConfig` custom resource in order to trigger the operator to start to work. By preparing the `DeviceConfig` in the YAML file, you can create the resouce by running ```kubectl apply -f deviceconfigs.yaml```. For custom resource definition and more detailed information, please refer to [Custom Resource Installation Guide](https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/installation/kubernetes-helm.html#install-custom-resource).
+After the installation of AMD GPU Operator, you need to create the `DeviceConfig` custom resource in order to trigger the operator to start to work. By preparing the `DeviceConfig` in the YAML file, you can create the resouce by running ```kubectl apply -f deviceconfigs.yaml```. For custom resource definition and more detailed information, please refer to [Custom Resource Installation Guide](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/kubernetes-helm.html#install-custom-resource).
 
 ### Grafana Dashboards
 

From 618e8cd3aeec2f2a6b7a3c6020fe4f7a355516fb Mon Sep 17 00:00:00 2001
From: praveen <praveenkumar.shanmugam@amd.com>
Date: Thu, 20 Mar 2025 10:38:00 -0700
Subject: [PATCH 02/12] update examples

---
 example/metricsExporter/config.json    | 37 +++++++++++++++++++++---
 example/metricsExporter/configmap.yaml | 39 ++++++++++++++++++++++----
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/example/metricsExporter/config.json b/example/metricsExporter/config.json
index 689852ab..53e02eb0 100644
--- a/example/metricsExporter/config.json
+++ b/example/metricsExporter/config.json
@@ -27,7 +27,7 @@
       "PCIE_NAC_RECEIVED_COUNT",
       "GPU_CLOCK",
       "GPU_POWER_USAGE",
-      "GPU_TOTAL_MEMORY",
+      "GPU_TOTAL_VRAM",
       "GPU_ECC_CORRECT_TOTAL",
       "GPU_ECC_UNCORRECT_TOTAL",
       "GPU_ECC_CORRECT_SDMA",
@@ -89,7 +89,10 @@
       "GPU_ECC_CORRECT_IH",
       "GPU_ECC_UNCORRECT_IH",
       "GPU_ECC_CORRECT_MPIO",
-      "GPU_ECC_UNCORRECT_MPIO"
+      "GPU_ECC_UNCORRECT_MPIO",
+      "GPU_HEALTH",
+      "GPU_XGMI_LINK_RX",
+      "GPU_XGMI_LINK_TX"
     ],
     "Labels": [
       "GPU_UUID",
@@ -107,7 +110,33 @@
       "CARD_VENDOR",
       "DRIVER_VERSION",
       "VBIOS_VERSION",
-      "HOSTNAME"
-    ]
+      "HOSTNAME",
+      "GPU_PARTITION_ID",
+      "GPU_COMPUTE_PARTITION_TYPE"
+    ],
+    "HealthThresholds" : {
+      "GPU_ECC_UNCORRECT_SDMA" : 0,
+      "GPU_ECC_UNCORRECT_GFX" : 0,
+      "GPU_ECC_UNCORRECT_MMHUB" : 0,
+      "GPU_ECC_UNCORRECT_ATHUB" : 0,
+      "GPU_ECC_UNCORRECT_BIF" : 0,
+      "GPU_ECC_UNCORRECT_HDP" : 0,
+      "GPU_ECC_UNCORRECT_XGMI_WAFL" : 0,
+      "GPU_ECC_UNCORRECT_DF" : 0,
+      "GPU_ECC_UNCORRECT_SMN" : 0,
+      "GPU_ECC_UNCORRECT_SEM" : 0,
+      "GPU_ECC_UNCORRECT_MP0" : 0,
+      "GPU_ECC_UNCORRECT_MP1" : 0,
+      "GPU_ECC_UNCORRECT_FUSE" : 0,
+      "GPU_ECC_UNCORRECT_UMC" : 0,
+      "GPU_ECC_UNCORRECT_MCA" : 0,
+      "GPU_ECC_UNCORRECT_VCN" : 0,
+      "GPU_ECC_UNCORRECT_JPEG" : 0,
+      "GPU_ECC_UNCORRECT_IH" : 0,
+      "GPU_ECC_UNCORRECT_MPIO" : 0
+    },
+    "CustomLabels" : {
+      "CLUSTER_NAME" : "amdgpu-k8s-metrics-exporter"
+    }
   }
 }
diff --git a/example/metricsExporter/configmap.yaml b/example/metricsExporter/configmap.yaml
index 6bf81824..e21cac40 100644
--- a/example/metricsExporter/configmap.yaml
+++ b/example/metricsExporter/configmap.yaml
@@ -2,7 +2,7 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
   name: gpu-config
-  namespace: kube-amd-gpu
+  namespace: mynamespace
 data:
   config.json: |
     {
@@ -34,7 +34,7 @@ data:
           "PCIE_NAC_RECEIVED_COUNT",
           "GPU_CLOCK",
           "GPU_POWER_USAGE",
-          "GPU_TOTAL_VRAM",
+          "GPU_TOTAL_MEMORY",
           "GPU_ECC_CORRECT_TOTAL",
           "GPU_ECC_UNCORRECT_TOTAL",
           "GPU_ECC_CORRECT_SDMA",
@@ -96,7 +96,10 @@ data:
           "GPU_ECC_CORRECT_IH",
           "GPU_ECC_UNCORRECT_IH",
           "GPU_ECC_CORRECT_MPIO",
-          "GPU_ECC_UNCORRECT_MPIO"
+          "GPU_ECC_UNCORRECT_MPIO",
+          "GPU_HEALTH",
+          "GPU_XGMI_LINK_RX",
+          "GPU_XGMI_LINK_TX"
         ],
         "Labels": [
           "GPU_UUID",
@@ -114,8 +117,34 @@ data:
           "CARD_VENDOR",
           "DRIVER_VERSION",
           "VBIOS_VERSION",
-          "HOSTNAME"
-        ]
+          "HOSTNAME",
+          "GPU_PARTITION_ID",
+          "GPU_COMPUTE_PARTITION_TYPE"
+        ],
+        "HealthThresholds" : {
+          "GPU_ECC_UNCORRECT_SDMA" : 0,
+          "GPU_ECC_UNCORRECT_GFX" : 0,
+          "GPU_ECC_UNCORRECT_MMHUB" : 0,
+          "GPU_ECC_UNCORRECT_ATHUB" : 0,
+          "GPU_ECC_UNCORRECT_BIF" : 0,
+          "GPU_ECC_UNCORRECT_HDP" : 0,
+          "GPU_ECC_UNCORRECT_XGMI_WAFL" : 0,
+          "GPU_ECC_UNCORRECT_DF" : 0,
+          "GPU_ECC_UNCORRECT_SMN" : 0,
+          "GPU_ECC_UNCORRECT_SEM" : 0,
+          "GPU_ECC_UNCORRECT_MP0" : 0,
+          "GPU_ECC_UNCORRECT_MP1" : 0,
+          "GPU_ECC_UNCORRECT_FUSE" : 0,
+          "GPU_ECC_UNCORRECT_UMC" : 0,
+          "GPU_ECC_UNCORRECT_MCA" : 0,
+          "GPU_ECC_UNCORRECT_VCN" : 0,
+          "GPU_ECC_UNCORRECT_JPEG" : 0,
+          "GPU_ECC_UNCORRECT_IH" : 0,
+          "GPU_ECC_UNCORRECT_MPIO" : 0
+        },
+        "CustomLabels" : {
+          "CLUSTER_NAME" : "amdgpu-k8s-metrics-exporter"
+        }
       }
     }
 

From c257a6c2aa07562c78d89c5a9fd8d95b6b912e89 Mon Sep 17 00:00:00 2001
From: vm <sriramr2230@gmail.com>
Date: Thu, 20 Mar 2025 06:05:15 +0000
Subject: [PATCH 03/12] Utils Container Openshift Security Context Constraint
 access

---
 Makefile                                      |  2 +-
 ...md-gpu-operator.clusterserviceversion.yaml | 12 ++++++-
 config/rbac/kustomization.yaml                |  3 ++
 config/rbac/utils_container_role.yaml         | 13 +++++++
 config/rbac/utils_container_role_binding.yaml | 11 ++++++
 .../rbac/utils_container_service_account.yaml |  4 +++
 hack/k8s-patch/metadata-patch/values.yaml     |  3 ++
 .../template-patch/serviceaccount.yaml        | 11 ++++++
 .../template-patch/utils-container-rbac.yaml  | 34 +++++++++++++++++++
 .../template-patch/utils-container-rbac.yaml  | 34 +++++++++++++++++++
 helm-charts-k8s/Chart.lock                    |  2 +-
 helm-charts-k8s/templates/serviceaccount.yaml | 11 ++++++
 .../templates/utils-container-rbac.yaml       | 34 +++++++++++++++++++
 helm-charts-k8s/values.yaml                   |  3 ++
 helm-charts-openshift/Chart.lock              |  2 +-
 .../templates/utils-container-rbac.yaml       | 34 +++++++++++++++++++
 internal/controllers/upgrademgr.go            | 13 ++++---
 17 files changed, 217 insertions(+), 9 deletions(-)
 create mode 100644 config/rbac/utils_container_role.yaml
 create mode 100644 config/rbac/utils_container_role_binding.yaml
 create mode 100644 config/rbac/utils_container_service_account.yaml
 create mode 100644 hack/k8s-patch/template-patch/utils-container-rbac.yaml
 create mode 100644 hack/openshift-patch/template-patch/utils-container-rbac.yaml
 create mode 100644 helm-charts-k8s/templates/utils-container-rbac.yaml
 create mode 100644 helm-charts-openshift/templates/utils-container-rbac.yaml

diff --git a/Makefile b/Makefile
index 99907539..e40cbf01 100644
--- a/Makefile
+++ b/Makefile
@@ -353,7 +353,7 @@ bundle-build: operator-sdk manifests kustomize ## OpenShift Build OLM bundle.
 	cd config/manager && $(KUSTOMIZE) edit set image controller=$(IMG)
 	cd config/manager-base && $(KUSTOMIZE) edit set image controller=$(IMG)
 	OPERATOR_SDK="${OPERATOR_SDK}" \
-		     BUNDLE_GEN_FLAGS="${BUNDLE_GEN_FLAGS} --extra-service-accounts amd-gpu-operator-kmm-device-plugin,amd-gpu-operator-kmm-module-loader,amd-gpu-operator-node-labeller,amd-gpu-operator-metrics-exporter,amd-gpu-operator-metrics-exporter-rbac-proxy,amd-gpu-operator-test-runner,amd-gpu-operator-config-manager" \
+		     BUNDLE_GEN_FLAGS="${BUNDLE_GEN_FLAGS} --extra-service-accounts amd-gpu-operator-kmm-device-plugin,amd-gpu-operator-kmm-module-loader,amd-gpu-operator-node-labeller,amd-gpu-operator-metrics-exporter,amd-gpu-operator-metrics-exporter-rbac-proxy,amd-gpu-operator-test-runner,amd-gpu-operator-config-manager,amd-gpu-operator-utils-container" \
 		     PKG=amd-gpu-operator \
 		     SOURCE_DIR=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) \
 		     KUBECTL_CMD=${KUBECTL_CMD} ./hack/generate-bundle
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
index 3c6bd39f..495f3670 100644
--- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -30,7 +30,7 @@ metadata:
         }
       ]
     capabilities: Basic Install
-    createdAt: "2025-03-18T22:58:17Z"
+    createdAt: "2025-03-20T06:06:57Z"
     operatorframework.io/suggested-namespace: openshift-amd-gpu
     operators.operatorframework.io/builder: operator-sdk-v1.32.0
     operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
@@ -955,6 +955,16 @@ spec:
           verbs:
           - use
         serviceAccountName: amd-gpu-operator-test-runner
+      - rules:
+        - apiGroups:
+          - security.openshift.io
+          resourceNames:
+          - privileged
+          resources:
+          - securitycontextconstraints
+          verbs:
+          - use
+        serviceAccountName: amd-gpu-operator-utils-container
       deployments:
       - label:
           app.kubernetes.io/component: amd-gpu
diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml
index ac367d92..332d351c 100644
--- a/config/rbac/kustomization.yaml
+++ b/config/rbac/kustomization.yaml
@@ -22,3 +22,6 @@ resources:
   - test_runner_service_account.yaml
   - test_runner_role.yaml
   - test_runner_role_binding.yaml
+  - utils_container_service_account.yaml
+  - utils_container_role.yaml
+  - utils_container_role_binding.yaml
\ No newline at end of file
diff --git a/config/rbac/utils_container_role.yaml b/config/rbac/utils_container_role.yaml
new file mode 100644
index 00000000..5a5579fa
--- /dev/null
+++ b/config/rbac/utils_container_role.yaml
@@ -0,0 +1,13 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: utils-container
+rules:
+- apiGroups:
+  - security.openshift.io
+  resourceNames:
+  - privileged
+  resources:
+  - securitycontextconstraints
+  verbs:
+  - use
\ No newline at end of file
diff --git a/config/rbac/utils_container_role_binding.yaml b/config/rbac/utils_container_role_binding.yaml
new file mode 100644
index 00000000..1f9ceb76
--- /dev/null
+++ b/config/rbac/utils_container_role_binding.yaml
@@ -0,0 +1,11 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: utils-container
+subjects:
+- kind: ServiceAccount
+  name: utils-container
+roleRef:
+  kind: ClusterRole
+  name: utils-container
+  apiGroup: rbac.authorization.k8s.io
\ No newline at end of file
diff --git a/config/rbac/utils_container_service_account.yaml b/config/rbac/utils_container_service_account.yaml
new file mode 100644
index 00000000..7d92dcd6
--- /dev/null
+++ b/config/rbac/utils_container_service_account.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: utils-container
\ No newline at end of file
diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml
index 59caef95..71bfd56c 100644
--- a/hack/k8s-patch/metadata-patch/values.yaml
+++ b/hack/k8s-patch/metadata-patch/values.yaml
@@ -101,6 +101,9 @@ testRunner:
 configManager:
   serviceAccount:
     annotations: {}
+utilsContainer:
+  serviceAccount:
+    annotations: {}
 global:
   proxy:
     env: {}
diff --git a/hack/k8s-patch/template-patch/serviceaccount.yaml b/hack/k8s-patch/template-patch/serviceaccount.yaml
index 57fe7999..ae6c0be4 100644
--- a/hack/k8s-patch/template-patch/serviceaccount.yaml
+++ b/hack/k8s-patch/template-patch/serviceaccount.yaml
@@ -85,3 +85,14 @@ metadata:
   {{- include "helm-charts-k8s.labels" . | nindent 4 }}
   annotations:
     {{- toYaml .Values.configManager.serviceAccount.annotations | nindent 4 }}
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: amd-gpu-operator-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-k8s.labels" . | nindent 4 }}
+  annotations:
+    {{- toYaml .Values.utilsContainer.serviceAccount.annotations | nindent 4 }}
\ No newline at end of file
diff --git a/hack/k8s-patch/template-patch/utils-container-rbac.yaml b/hack/k8s-patch/template-patch/utils-container-rbac.yaml
new file mode 100644
index 00000000..65f1d965
--- /dev/null
+++ b/hack/k8s-patch/template-patch/utils-container-rbac.yaml
@@ -0,0 +1,34 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "helm-charts-k8s.fullname" . }}-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-k8s.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - security.openshift.io
+  resourceNames:
+  - privileged
+  resources:
+  - securitycontextconstraints
+  verbs:
+  - use
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "helm-charts-k8s.fullname" . }}-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-k8s.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: '{{ include "helm-charts-k8s.fullname" . }}-utils-container'
+subjects:
+- kind: ServiceAccount
+  name: amd-gpu-operator-utils-container
+  namespace: '{{ .Release.Namespace }}'
\ No newline at end of file
diff --git a/hack/openshift-patch/template-patch/utils-container-rbac.yaml b/hack/openshift-patch/template-patch/utils-container-rbac.yaml
new file mode 100644
index 00000000..36ac4ad8
--- /dev/null
+++ b/hack/openshift-patch/template-patch/utils-container-rbac.yaml
@@ -0,0 +1,34 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "helm-charts-openshift.fullname" . }}-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-openshift.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - security.openshift.io
+  resourceNames:
+  - privileged
+  resources:
+  - securitycontextconstraints
+  verbs:
+  - use
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "helm-charts-openshift.fullname" . }}-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-openshift.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: '{{ include "helm-charts-openshift.fullname" . }}-utils-container'
+subjects:
+- kind: ServiceAccount
+  name: amd-gpu-operator-utils-container
+  namespace: '{{ .Release.Namespace }}'
\ No newline at end of file
diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
index 54d3ca29..6ad80130 100644
--- a/helm-charts-k8s/Chart.lock
+++ b/helm-charts-k8s/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597
-generated: "2025-03-14T23:56:25.183742256Z"
+generated: "2025-03-20T06:06:33.9562362Z"
diff --git a/helm-charts-k8s/templates/serviceaccount.yaml b/helm-charts-k8s/templates/serviceaccount.yaml
index 57fe7999..ae6c0be4 100644
--- a/helm-charts-k8s/templates/serviceaccount.yaml
+++ b/helm-charts-k8s/templates/serviceaccount.yaml
@@ -85,3 +85,14 @@ metadata:
   {{- include "helm-charts-k8s.labels" . | nindent 4 }}
   annotations:
     {{- toYaml .Values.configManager.serviceAccount.annotations | nindent 4 }}
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: amd-gpu-operator-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-k8s.labels" . | nindent 4 }}
+  annotations:
+    {{- toYaml .Values.utilsContainer.serviceAccount.annotations | nindent 4 }}
\ No newline at end of file
diff --git a/helm-charts-k8s/templates/utils-container-rbac.yaml b/helm-charts-k8s/templates/utils-container-rbac.yaml
new file mode 100644
index 00000000..65f1d965
--- /dev/null
+++ b/helm-charts-k8s/templates/utils-container-rbac.yaml
@@ -0,0 +1,34 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "helm-charts-k8s.fullname" . }}-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-k8s.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - security.openshift.io
+  resourceNames:
+  - privileged
+  resources:
+  - securitycontextconstraints
+  verbs:
+  - use
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "helm-charts-k8s.fullname" . }}-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-k8s.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: '{{ include "helm-charts-k8s.fullname" . }}-utils-container'
+subjects:
+- kind: ServiceAccount
+  name: amd-gpu-operator-utils-container
+  namespace: '{{ .Release.Namespace }}'
\ No newline at end of file
diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml
index 59caef95..71bfd56c 100644
--- a/helm-charts-k8s/values.yaml
+++ b/helm-charts-k8s/values.yaml
@@ -101,6 +101,9 @@ testRunner:
 configManager:
   serviceAccount:
     annotations: {}
+utilsContainer:
+  serviceAccount:
+    annotations: {}
 global:
   proxy:
     env: {}
diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
index 697fba37..6e3c4ccc 100644
--- a/helm-charts-openshift/Chart.lock
+++ b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-03-18T22:58:15.38295759Z"
+generated: "2025-03-20T06:06:55.80187139Z"
diff --git a/helm-charts-openshift/templates/utils-container-rbac.yaml b/helm-charts-openshift/templates/utils-container-rbac.yaml
new file mode 100644
index 00000000..36ac4ad8
--- /dev/null
+++ b/helm-charts-openshift/templates/utils-container-rbac.yaml
@@ -0,0 +1,34 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "helm-charts-openshift.fullname" . }}-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-openshift.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - security.openshift.io
+  resourceNames:
+  - privileged
+  resources:
+  - securitycontextconstraints
+  verbs:
+  - use
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "helm-charts-openshift.fullname" . }}-utils-container
+  labels:
+    app.kubernetes.io/component: amd-gpu
+    app.kubernetes.io/part-of: amd-gpu
+  {{- include "helm-charts-openshift.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: '{{ include "helm-charts-openshift.fullname" . }}-utils-container'
+subjects:
+- kind: ServiceAccount
+  name: amd-gpu-operator-utils-container
+  namespace: '{{ .Release.Namespace }}'
\ No newline at end of file
diff --git a/internal/controllers/upgrademgr.go b/internal/controllers/upgrademgr.go
index 071a0f82..c5f3fd6e 100644
--- a/internal/controllers/upgrademgr.go
+++ b/internal/controllers/upgrademgr.go
@@ -61,6 +61,7 @@ import (
 
 const (
 	defaultUtilsImage = "docker.io/rocm/gpu-operator-utils:latest"
+	defaultSAName     = "amd-gpu-operator-utils-container"
 )
 
 type upgradeMgr struct {
@@ -976,6 +977,7 @@ func (h *upgradeMgrHelper) getRebootPod(nodeName string, dc *amdv1alpha1.DeviceC
 	nodeSelector := map[string]string{}
 	nodeSelector["kubernetes.io/hostname"] = nodeName
 	utilsImage := defaultUtilsImage
+	serviceaccount := defaultSAName
 	if dc.Spec.CommonConfig.UtilsContainer.Image != "" {
 		utilsImage = dc.Spec.CommonConfig.UtilsContainer.Image
 	}
@@ -989,11 +991,12 @@ func (h *upgradeMgrHelper) getRebootPod(nodeName string, dc *amdv1alpha1.DeviceC
 			Namespace: dc.Namespace,
 		},
 		Spec: v1.PodSpec{
-			HostPID:          true,
-			HostNetwork:      true,
-			RestartPolicy:    v1.RestartPolicyNever,
-			NodeSelector:     nodeSelector,
-			ImagePullSecrets: imagePullSecrets,
+			ServiceAccountName: serviceaccount,
+			HostPID:            true,
+			HostNetwork:        true,
+			RestartPolicy:      v1.RestartPolicyNever,
+			NodeSelector:       nodeSelector,
+			ImagePullSecrets:   imagePullSecrets,
 			Containers: []v1.Container{
 				{
 					Name:            "reboot-container",

From 0a08e9919b21a9133757e11cba14a54c9137589b Mon Sep 17 00:00:00 2001
From: vm <sriramr2230@gmail.com>
Date: Fri, 21 Mar 2025 05:08:08 +0000
Subject: [PATCH 04/12] GPUOP-210

---
 api/v1alpha1/deviceconfig_types.go                         | 2 +-
 .../manifests/amd-gpu-operator.clusterserviceversion.yaml  | 2 +-
 bundle/manifests/amd.com_deviceconfigs.yaml                | 2 +-
 config/crd/bases/amd.com_deviceconfigs.yaml                | 2 +-
 docs/installation/openshift-olm.md                         | 7 +++++++
 helm-charts-k8s/Chart.lock                                 | 2 +-
 helm-charts-k8s/crds/deviceconfig-crd.yaml                 | 2 +-
 helm-charts-openshift/Chart.lock                           | 2 +-
 helm-charts-openshift/crds/deviceconfig-crd.yaml           | 2 +-
 9 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go
index 94a4d6a2..7c187780 100644
--- a/api/v1alpha1/deviceconfig_types.go
+++ b/api/v1alpha1/deviceconfig_types.go
@@ -117,7 +117,7 @@ type DriverSpec struct {
 	// example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
 	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"}
 	// +optional
-	// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
+	// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
 	Image string `json:"image,omitempty"`
 
 	// driver image registry TLS setting for the container image
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
index 495f3670..86d76210 100644
--- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -30,7 +30,7 @@ metadata:
         }
       ]
     capabilities: Basic Install
-    createdAt: "2025-03-20T06:06:57Z"
+    createdAt: "2025-03-21T05:09:41Z"
     operatorframework.io/suggested-namespace: openshift-amd-gpu
     operators.operatorframework.io/builder: operator-sdk-v1.32.0
     operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml
index a0476c71..898220ac 100644
--- a/bundle/manifests/amd.com_deviceconfigs.yaml
+++ b/bundle/manifests/amd.com_deviceconfigs.yaml
@@ -357,7 +357,7 @@ spec:
                       for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
                       image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
                       example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
-                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
+                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   imageRegistrySecret:
                     description: secrets used for pull/push images from/to private
diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml
index 5f7a02bd..05e8b815 100644
--- a/config/crd/bases/amd.com_deviceconfigs.yaml
+++ b/config/crd/bases/amd.com_deviceconfigs.yaml
@@ -353,7 +353,7 @@ spec:
                       for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
                       image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
                       example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
-                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
+                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   imageRegistrySecret:
                     description: secrets used for pull/push images from/to private
diff --git a/docs/installation/openshift-olm.md b/docs/installation/openshift-olm.md
index 89625fc1..489644d9 100644
--- a/docs/installation/openshift-olm.md
+++ b/docs/installation/openshift-olm.md
@@ -204,6 +204,13 @@ spec:
     "feature.node.kubernetes.io/amd-gpu": "true"
 ```
 
+Things to note:
+1. By default, there is no need to specify the image field in CR for Openshift. Default will be used which is: image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
+
+2. If users specify image, $MOD_NAMESPACE can be a place holder , KMM Operator can automatically translate it to the namespace
+
+3. Openshift internal registry has image url restriction, OpenShift users cannot use image like `<registry URL>/<repo name>` , it requires the image URL to be `<registry URL>/<project name or namespace>/<repo name>`. However, if any other registry is being used by the user, the image URL can be of either form.
+
 The operator will:
 
 1. Collect worker node system specifications
diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
index 6ad80130..477dc578 100644
--- a/helm-charts-k8s/Chart.lock
+++ b/helm-charts-k8s/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597
-generated: "2025-03-20T06:06:33.9562362Z"
+generated: "2025-03-21T05:09:30.645342377Z"
diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml
index 6058c151..707eb1ce 100644
--- a/helm-charts-k8s/crds/deviceconfig-crd.yaml
+++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml
@@ -361,7 +361,7 @@ spec:
                       for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
                       image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
                       example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
-                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
+                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   imageRegistrySecret:
                     description: secrets used for pull/push images from/to private registry
diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
index 6e3c4ccc..f39ad5ac 100644
--- a/helm-charts-openshift/Chart.lock
+++ b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-03-20T06:06:55.80187139Z"
+generated: "2025-03-21T05:09:40.013067636Z"
diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml
index 6058c151..707eb1ce 100644
--- a/helm-charts-openshift/crds/deviceconfig-crd.yaml
+++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml
@@ -361,7 +361,7 @@ spec:
                       for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
                       image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
                       example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
-                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
+                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   imageRegistrySecret:
                     description: secrets used for pull/push images from/to private registry

From f08d3a75b68ed1d6b9c8d5b5e9f18a3e3794ecf7 Mon Sep 17 00:00:00 2001
From: yansun1996 <yan@pensando.io>
Date: Mon, 17 Mar 2025 21:29:38 +0000
Subject: [PATCH 05/12] Ignore vendored docs in sanity

---
 docs/.markdownlint.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/.markdownlint.yaml b/docs/.markdownlint.yaml
index 21ea9629..3d17a138 100644
--- a/docs/.markdownlint.yaml
+++ b/docs/.markdownlint.yaml
@@ -1,6 +1,3 @@
-ignores:
-  - CHANGELOG.md
-  - "vendor/**"
 default: true
 MD013: false
 MD024:
@@ -12,3 +9,6 @@ MD029:
 MD033: false
 MD034: false
 MD041: false
+ignores:
+  - CHANGELOG.md
+  - "vendor/**"
\ No newline at end of file

From 18465227372ce590945d5b7c9232c13f9da1365b Mon Sep 17 00:00:00 2001
From: yansun1996 <yan@pensando.io>
Date: Mon, 17 Mar 2025 21:51:01 +0000
Subject: [PATCH 06/12] Fix docs sanity to ignore vendor folder

---
 .markdownlint-cli2.yaml                       |  2 ++
 docs/.markdownlint.yaml => .markdownlint.yaml |  3 +--
 .spellcheck.yml => .spellcheck.local.yaml     | 11 ++++++-----
 docs/.markdownlint-cli2.yaml                  |  3 ---
 4 files changed, 9 insertions(+), 10 deletions(-)
 create mode 100644 .markdownlint-cli2.yaml
 rename docs/.markdownlint.yaml => .markdownlint.yaml (83%)
 rename .spellcheck.yml => .spellcheck.local.yaml (95%)
 delete mode 100644 docs/.markdownlint-cli2.yaml

diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml
new file mode 100644
index 00000000..9d2459dc
--- /dev/null
+++ b/.markdownlint-cli2.yaml
@@ -0,0 +1,2 @@
+ignores:
+  - "vendor/**/*.md"
\ No newline at end of file
diff --git a/docs/.markdownlint.yaml b/.markdownlint.yaml
similarity index 83%
rename from docs/.markdownlint.yaml
rename to .markdownlint.yaml
index 3d17a138..3ffda0de 100644
--- a/docs/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -10,5 +10,4 @@ MD033: false
 MD034: false
 MD041: false
 ignores:
-  - CHANGELOG.md
-  - "vendor/**"
\ No newline at end of file
+  - "vendor/**/*.md"
\ No newline at end of file
diff --git a/.spellcheck.yml b/.spellcheck.local.yaml
similarity index 95%
rename from .spellcheck.yml
rename to .spellcheck.local.yaml
index 9291279a..da2912e5 100644
--- a/.spellcheck.yml
+++ b/.spellcheck.local.yaml
@@ -33,9 +33,9 @@
 matrix:
 - name: Markdown
   sources:
-  - ['docs/**/*.md', '!docs/doxygen/mainpage.md', '!docs/contributing/documentation-standards.md']
-  - ['tools/autotag/templates/**/*.md', '!tools/autotag/templates/**/5*.md', '!tools/autotag/templates/**/6.0*.md', '!tools/autotag/templates/**/6.1*.md']
-  expect_match: false
+  - '!vendor/**|docs/**/*.md|!docs/doxygen/mainpage.md|!docs/contributing/documentation-standards.md'
+  expect_match: true
+  jobs: 4
   aspell:
     lang: en
   dictionary:
@@ -115,8 +115,9 @@ matrix:
   - pyspelling.filters.url:
 - name: reST
   sources:
-  - 'docs/**/*.rst'
-  expect_match: false
+  - '!vendor/**|docs/**/*.rst'
+  expect_match: true
+  jobs: 4
   aspell:
     lang: en
   dictionary:
diff --git a/docs/.markdownlint-cli2.yaml b/docs/.markdownlint-cli2.yaml
deleted file mode 100644
index 74870cb6..00000000
--- a/docs/.markdownlint-cli2.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-ignores:
-  - CHANGELOG.md
-  - "vendor/**"
\ No newline at end of file

From 70a556a821671352d02ad705703586a0c15aa515 Mon Sep 17 00:00:00 2001
From: vm <sriramr2230@gmail.com>
Date: Fri, 21 Mar 2025 05:08:08 +0000
Subject: [PATCH 07/12] GPUOP-210

---
 api/v1alpha1/deviceconfig_types.go                         | 2 +-
 .../manifests/amd-gpu-operator.clusterserviceversion.yaml  | 2 +-
 bundle/manifests/amd.com_deviceconfigs.yaml                | 2 +-
 config/crd/bases/amd.com_deviceconfigs.yaml                | 2 +-
 docs/installation/openshift-olm.md                         | 7 +++++++
 helm-charts-k8s/Chart.lock                                 | 2 +-
 helm-charts-k8s/crds/deviceconfig-crd.yaml                 | 2 +-
 helm-charts-openshift/Chart.lock                           | 2 +-
 helm-charts-openshift/crds/deviceconfig-crd.yaml           | 2 +-
 9 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go
index 94a4d6a2..7c187780 100644
--- a/api/v1alpha1/deviceconfig_types.go
+++ b/api/v1alpha1/deviceconfig_types.go
@@ -117,7 +117,7 @@ type DriverSpec struct {
 	// example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
 	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"}
 	// +optional
-	// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
+	// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
 	Image string `json:"image,omitempty"`
 
 	// driver image registry TLS setting for the container image
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
index 495f3670..86d76210 100644
--- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -30,7 +30,7 @@ metadata:
         }
       ]
     capabilities: Basic Install
-    createdAt: "2025-03-20T06:06:57Z"
+    createdAt: "2025-03-21T05:09:41Z"
     operatorframework.io/suggested-namespace: openshift-amd-gpu
     operators.operatorframework.io/builder: operator-sdk-v1.32.0
     operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml
index a0476c71..898220ac 100644
--- a/bundle/manifests/amd.com_deviceconfigs.yaml
+++ b/bundle/manifests/amd.com_deviceconfigs.yaml
@@ -357,7 +357,7 @@ spec:
                       for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
                       image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
                       example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
-                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
+                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   imageRegistrySecret:
                     description: secrets used for pull/push images from/to private
diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml
index 5f7a02bd..05e8b815 100644
--- a/config/crd/bases/amd.com_deviceconfigs.yaml
+++ b/config/crd/bases/amd.com_deviceconfigs.yaml
@@ -353,7 +353,7 @@ spec:
                       for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
                       image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
                       example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
-                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
+                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   imageRegistrySecret:
                     description: secrets used for pull/push images from/to private
diff --git a/docs/installation/openshift-olm.md b/docs/installation/openshift-olm.md
index 89625fc1..489644d9 100644
--- a/docs/installation/openshift-olm.md
+++ b/docs/installation/openshift-olm.md
@@ -204,6 +204,13 @@ spec:
     "feature.node.kubernetes.io/amd-gpu": "true"
 ```
 
+Things to note:
+1. By default, there is no need to specify the image field in CR for Openshift. Default will be used which is: image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
+
+2. If users specify image, $MOD_NAMESPACE can be a place holder , KMM Operator can automatically translate it to the namespace
+
+3. Openshift internal registry has image url restriction, OpenShift users cannot use image like `<registry URL>/<repo name>` , it requires the image URL to be `<registry URL>/<project name or namespace>/<repo name>`. However, if any other registry is being used by the user, the image URL can be of either form.
+
 The operator will:
 
 1. Collect worker node system specifications
diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
index 6ad80130..477dc578 100644
--- a/helm-charts-k8s/Chart.lock
+++ b/helm-charts-k8s/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597
-generated: "2025-03-20T06:06:33.9562362Z"
+generated: "2025-03-21T05:09:30.645342377Z"
diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml
index 6058c151..707eb1ce 100644
--- a/helm-charts-k8s/crds/deviceconfig-crd.yaml
+++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml
@@ -361,7 +361,7 @@ spec:
                       for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
                       image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
                       example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
-                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
+                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   imageRegistrySecret:
                     description: secrets used for pull/push images from/to private registry
diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
index 6e3c4ccc..f39ad5ac 100644
--- a/helm-charts-openshift/Chart.lock
+++ b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-03-20T06:06:55.80187139Z"
+generated: "2025-03-21T05:09:40.013067636Z"
diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml
index 6058c151..707eb1ce 100644
--- a/helm-charts-openshift/crds/deviceconfig-crd.yaml
+++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml
@@ -361,7 +361,7 @@ spec:
                       for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
                       image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
                       example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
-                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
+                    pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   imageRegistrySecret:
                     description: secrets used for pull/push images from/to private registry

From 40bf349916f10e219840cd119e282f91106063d6 Mon Sep 17 00:00:00 2001
From: yansun1996 <yan@pensando.io>
Date: Mon, 24 Mar 2025 22:31:12 +0000
Subject: [PATCH 08/12] Refactor IsOpenshift function based on RedHat
 suggestion

---
 cmd/main.go                           | 12 +++++----
 internal/kmmmodule/kmmmodule.go       | 19 ++------------
 internal/nodelabeller/nodelabeller.go | 19 ++------------
 internal/utils.go                     | 37 ++++++++++++++++++++++++++-
 4 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/cmd/main.go b/cmd/main.go
index b3c985ff..168a730d 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -35,9 +35,6 @@ package main
 import (
 	"flag"
 
-	"github.com/ROCm/gpu-operator/internal/configmanager"
-	"github.com/ROCm/gpu-operator/internal/metricsexporter"
-	"github.com/ROCm/gpu-operator/internal/testrunner"
 	kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1"
 	"k8s.io/apimachinery/pkg/runtime"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
@@ -51,11 +48,15 @@ import (
 	_ "k8s.io/client-go/plugin/pkg/client/auth"
 
 	gpuev1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1"
+	utils "github.com/ROCm/gpu-operator/internal"
 	"github.com/ROCm/gpu-operator/internal/cmd"
 	"github.com/ROCm/gpu-operator/internal/config"
+	"github.com/ROCm/gpu-operator/internal/configmanager"
 	"github.com/ROCm/gpu-operator/internal/controllers"
 	"github.com/ROCm/gpu-operator/internal/kmmmodule"
+	"github.com/ROCm/gpu-operator/internal/metricsexporter"
 	"github.com/ROCm/gpu-operator/internal/nodelabeller"
+	"github.com/ROCm/gpu-operator/internal/testrunner"
 	//+kubebuilder:scaffold:imports
 )
 
@@ -107,8 +108,9 @@ func main() {
 	}
 
 	client := mgr.GetClient()
-	kmmHandler := kmmmodule.NewKMMModule(client, scheme)
-	nlHandler := nodelabeller.NewNodeLabeller(scheme)
+	isOpenShift := utils.IsOpenShift(setupLogger)
+	kmmHandler := kmmmodule.NewKMMModule(client, scheme, isOpenShift)
+	nlHandler := nodelabeller.NewNodeLabeller(scheme, isOpenShift)
 	metricsHandler := metricsexporter.NewMetricsExporter(scheme)
 	testrunnerHandler := testrunner.NewTestRunner(scheme)
 	configmanagerHandler := configmanager.NewConfigManager(scheme)
diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go
index 9aa6632b..8c753604 100644
--- a/internal/kmmmodule/kmmmodule.go
+++ b/internal/kmmmodule/kmmmodule.go
@@ -55,10 +55,8 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/intstr"
-	"k8s.io/client-go/discovery"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
-	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 	"sigs.k8s.io/controller-runtime/pkg/log"
@@ -107,27 +105,14 @@ type kmmModule struct {
 	isOpenShift bool
 }
 
-func NewKMMModule(client client.Client, scheme *runtime.Scheme) KMMModuleAPI {
+func NewKMMModule(client client.Client, scheme *runtime.Scheme, isOpenShift bool) KMMModuleAPI {
 	return &kmmModule{
 		client:      client,
 		scheme:      scheme,
-		isOpenShift: isOpenshift(),
+		isOpenShift: isOpenShift,
 	}
 }
 
-func isOpenshift() bool {
-	if dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil {
-		if gplist, err := dc.ServerGroups(); err == nil {
-			for _, gp := range gplist.Groups {
-				if gp.Name == "route.openshift.io" {
-					return true
-				}
-			}
-		}
-	}
-	return false
-}
-
 func (km *kmmModule) SetNodeVersionLabelAsDesired(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error {
 	// for each selected node
 	// put the KMM version label given by CR's driver version
diff --git a/internal/nodelabeller/nodelabeller.go b/internal/nodelabeller/nodelabeller.go
index 8f60805b..959bf39f 100644
--- a/internal/nodelabeller/nodelabeller.go
+++ b/internal/nodelabeller/nodelabeller.go
@@ -42,9 +42,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/util/intstr"
-	"k8s.io/client-go/discovery"
 	"k8s.io/utils/ptr"
-	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 )
 
@@ -66,26 +64,13 @@ type nodeLabeller struct {
 	isOpenShift bool
 }
 
-func NewNodeLabeller(scheme *runtime.Scheme) NodeLabeller {
+func NewNodeLabeller(scheme *runtime.Scheme, isOpenshift bool) NodeLabeller {
 	return &nodeLabeller{
 		scheme:      scheme,
-		isOpenShift: isOpenshift(),
+		isOpenShift: isOpenshift,
 	}
 }
 
-func isOpenshift() bool {
-	if dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil {
-		if gplist, err := dc.ServerGroups(); err == nil {
-			for _, gp := range gplist.Groups {
-				if gp.Name == "route.openshift.io" {
-					return true
-				}
-			}
-		}
-	}
-	return false
-}
-
 func (nl *nodeLabeller) SetNodeLabellerAsDesired(ds *appsv1.DaemonSet, devConfig *amdv1alpha1.DeviceConfig) error {
 	if ds == nil {
 		return fmt.Errorf("daemon set is not initialized, zero pointer")
diff --git a/internal/utils.go b/internal/utils.go
index 9c67f1d3..b34dd031 100644
--- a/internal/utils.go
+++ b/internal/utils.go
@@ -17,15 +17,23 @@ limitations under the License.
 package utils
 
 import (
+	"context"
 	"fmt"
 	"strings"
 
-	amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1"
+	"github.com/go-logr/logr"
 	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+
+	amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1"
+	"github.com/ROCm/gpu-operator/internal/cmd"
 )
 
 const (
 	defaultOcDriversVersion = "6.2.2"
+	openShiftNodeLabel      = "node.openshift.io/os_id"
 	NodeFeatureLabelAmdGpu  = "feature.node.kubernetes.io/amd-gpu"
 	NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu"
 )
@@ -88,3 +96,30 @@ func HasNodeLabelKey(node v1.Node, labelKey string) bool {
 	}
 	return false
 }
+
+func IsOpenShift(logger logr.Logger) bool {
+	config, err := rest.InClusterConfig()
+	if err != nil {
+		cmd.FatalError(logger, err, "unable to get cluster config")
+	}
+	// creates the clientset
+	clientset, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		cmd.FatalError(logger, err, "unable to create cluster clientset")
+	}
+	// Check for OpenShift-specific labels on nodes
+	nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
+	if err != nil {
+		cmd.FatalError(logger, err, "unable to list nodes")
+	}
+
+	isOpenShift := false
+	for _, node := range nodes.Items {
+		if _, exists := node.Labels[openShiftNodeLabel]; exists {
+			isOpenShift = true
+			break
+		}
+	}
+	logger.Info(fmt.Sprintf("IsOpenShift: %+v", isOpenShift))
+	return isOpenShift
+}

From 044c6b4794046150131e7d153607ad077f52d012 Mon Sep 17 00:00:00 2001
From: vm <sriramr2230@gmail.com>
Date: Tue, 25 Mar 2025 06:30:37 +0000
Subject: [PATCH 09/12] Device Plugin Args option

---
 api/v1alpha1/deviceconfig_types.go            |  9 +++----
 api/v1alpha1/zz_generated.deepcopy.go         |  7 ++++++
 ...md-gpu-operator.clusterserviceversion.yaml | 14 ++++++-----
 bundle/manifests/amd.com_deviceconfigs.yaml   | 14 +++++------
 config/crd/bases/amd.com_deviceconfigs.yaml   | 14 +++++------
 ...md-gpu-operator.clusterserviceversion.yaml | 12 +++++----
 helm-charts-k8s/Chart.lock                    |  2 +-
 helm-charts-k8s/crds/deviceconfig-crd.yaml    | 14 +++++------
 helm-charts-openshift/Chart.lock              |  2 +-
 .../crds/deviceconfig-crd.yaml                | 14 +++++------
 internal/kmmmodule/kmmmodule.go               | 10 ++++++--
 internal/utils.go                             | 11 +++++---
 internal/validator/specValidators.go          | 25 +++++++++++++++++++
 tests/e2e/Makefile                            |  6 +++++
 tests/e2e/cluster_test.go                     |  6 ++---
 tests/e2e/init.go                             | 15 +++++++++++
 16 files changed, 120 insertions(+), 55 deletions(-)

diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go
index 7c187780..503c0939 100644
--- a/api/v1alpha1/deviceconfig_types.go
+++ b/api/v1alpha1/deviceconfig_types.go
@@ -251,12 +251,11 @@ type DevicePluginSpec struct {
 	// +optional
 	DevicePluginTolerations []v1.Toleration `json:"devicePluginTolerations,omitempty"`
 
-	// resource naming strategy for device plugin
-	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ResourceNamingStrategy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy"}
-	// +kubebuilder:validation:Enum=single;mixed
-	// +kubebuilder:default:="single"
+	// device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
+	// supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="DevicePluginArguments",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments"}
 	// +optional
-	ResourceNamingStrategy string `json:"resourceNamingStrategy,omitempty"`
+	DevicePluginArguments map[string]string `json:"devicePluginArguments,omitempty"`
 
 	// node labeller image
 	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeLabellerImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerImage"}
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index bbbd03c0..c2be36c9 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -267,6 +267,13 @@ func (in *DevicePluginSpec) DeepCopyInto(out *DevicePluginSpec) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
+	if in.DevicePluginArguments != nil {
+		in, out := &in.DevicePluginArguments, &out.DevicePluginArguments
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
 	if in.NodeLabellerTolerations != nil {
 		in, out := &in.NodeLabellerTolerations, &out.NodeLabellerTolerations
 		*out = make([]v1.Toleration, len(*in))
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
index 86d76210..45078acb 100644
--- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -30,7 +30,7 @@ metadata:
         }
       ]
     capabilities: Basic Install
-    createdAt: "2025-03-21T05:09:41Z"
+    createdAt: "2025-03-25T06:19:27Z"
     operatorframework.io/suggested-namespace: openshift-amd-gpu
     operators.operatorframework.io/builder: operator-sdk-v1.32.0
     operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
@@ -152,6 +152,13 @@ spec:
         path: devicePlugin
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin
+      - description: 'device plugin arguments is used to pass supported flags and
+          their values while starting device plugin daemonset supported flag values:
+          {"resource_naming_strategy": {"single", "mixed"}}'
+        displayName: DevicePluginArguments
+        path: devicePlugin.devicePluginArguments
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments
       - description: device plugin image
         displayName: DevicePluginImage
         path: devicePlugin.devicePluginImage
@@ -192,11 +199,6 @@ spec:
         path: devicePlugin.nodeLabellerTolerations
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations
-      - description: resource naming strategy for device plugin
-        displayName: ResourceNamingStrategy
-        path: devicePlugin.resourceNamingStrategy
-        x-descriptors:
-        - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy
       - description: upgrade policy for device plugin and node labeller daemons
         displayName: UpgradePolicy
         path: devicePlugin.upgradePolicy
diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml
index 898220ac..c9123ffe 100644
--- a/bundle/manifests/amd.com_deviceconfigs.yaml
+++ b/bundle/manifests/amd.com_deviceconfigs.yaml
@@ -190,6 +190,13 @@ spec:
               devicePlugin:
                 description: device plugin
                 properties:
+                  devicePluginArguments:
+                    additionalProperties:
+                      type: string
+                    description: |-
+                      device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
+                      supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
+                    type: object
                   devicePluginImage:
                     description: device plugin image
                     pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
@@ -306,13 +313,6 @@ spec:
                           type: string
                       type: object
                     type: array
-                  resourceNamingStrategy:
-                    default: single
-                    description: resource naming strategy for device plugin
-                    enum:
-                    - single
-                    - mixed
-                    type: string
                   upgradePolicy:
                     description: upgrade policy for device plugin and node labeller
                       daemons
diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml
index 05e8b815..24c2b053 100644
--- a/config/crd/bases/amd.com_deviceconfigs.yaml
+++ b/config/crd/bases/amd.com_deviceconfigs.yaml
@@ -186,6 +186,13 @@ spec:
               devicePlugin:
                 description: device plugin
                 properties:
+                  devicePluginArguments:
+                    additionalProperties:
+                      type: string
+                    description: |-
+                      device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
+                      supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
+                    type: object
                   devicePluginImage:
                     description: device plugin image
                     pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
@@ -302,13 +309,6 @@ spec:
                           type: string
                       type: object
                     type: array
-                  resourceNamingStrategy:
-                    default: single
-                    description: resource naming strategy for device plugin
-                    enum:
-                    - single
-                    - mixed
-                    type: string
                   upgradePolicy:
                     description: upgrade policy for device plugin and node labeller
                       daemons
diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
index a5b6cd65..a9f4d685 100644
--- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
+++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
@@ -123,6 +123,13 @@ spec:
         path: devicePlugin
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin
+      - description: 'device plugin arguments is used to pass supported flags and
+          their values while starting device plugin daemonset supported flag values:
+          {"resource_naming_strategy": {"single", "mixed"}}'
+        displayName: DevicePluginArguments
+        path: devicePlugin.devicePluginArguments
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments
       - description: device plugin image
         displayName: DevicePluginImage
         path: devicePlugin.devicePluginImage
@@ -163,11 +170,6 @@ spec:
         path: devicePlugin.nodeLabellerTolerations
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations
-      - description: resource naming strategy for device plugin
-        displayName: ResourceNamingStrategy
-        path: devicePlugin.resourceNamingStrategy
-        x-descriptors:
-        - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy
       - description: upgrade policy for device plugin and node labeller daemons
         displayName: UpgradePolicy
         path: devicePlugin.upgradePolicy
diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
index 477dc578..54b4cb8c 100644
--- a/helm-charts-k8s/Chart.lock
+++ b/helm-charts-k8s/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597
-generated: "2025-03-21T05:09:30.645342377Z"
+generated: "2025-03-25T06:19:17.248998622Z"
diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml
index 707eb1ce..502f4b89 100644
--- a/helm-charts-k8s/crds/deviceconfig-crd.yaml
+++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml
@@ -194,6 +194,13 @@ spec:
               devicePlugin:
                 description: device plugin
                 properties:
+                  devicePluginArguments:
+                    additionalProperties:
+                      type: string
+                    description: |-
+                      device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
+                      supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
+                    type: object
                   devicePluginImage:
                     description: device plugin image
                     pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
@@ -310,13 +317,6 @@ spec:
                           type: string
                       type: object
                     type: array
-                  resourceNamingStrategy:
-                    default: single
-                    description: resource naming strategy for device plugin
-                    enum:
-                    - single
-                    - mixed
-                    type: string
                   upgradePolicy:
                     description: upgrade policy for device plugin and node labeller
                       daemons
diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
index f39ad5ac..6e9b718d 100644
--- a/helm-charts-openshift/Chart.lock
+++ b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-03-21T05:09:40.013067636Z"
+generated: "2025-03-25T06:19:26.060856628Z"
diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml
index 707eb1ce..502f4b89 100644
--- a/helm-charts-openshift/crds/deviceconfig-crd.yaml
+++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml
@@ -194,6 +194,13 @@ spec:
               devicePlugin:
                 description: device plugin
                 properties:
+                  devicePluginArguments:
+                    additionalProperties:
+                      type: string
+                    description: |-
+                      device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
+                      supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
+                    type: object
                   devicePluginImage:
                     description: device plugin image
                     pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
@@ -310,13 +317,6 @@ spec:
                           type: string
                       type: object
                     type: array
-                  resourceNamingStrategy:
-                    default: single
-                    description: resource naming strategy for device plugin
-                    enum:
-                    - single
-                    - mixed
-                    type: string
                   upgradePolicy:
                     description: upgrade policy for device plugin and node labeller
                       daemons
diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go
index 8c753604..9ca34383 100644
--- a/internal/kmmmodule/kmmmodule.go
+++ b/internal/kmmmodule/kmmmodule.go
@@ -257,8 +257,14 @@ func (km *kmmModule) SetDevicePluginAsDesired(ds *appsv1.DaemonSet, devConfig *a
 		return fmt.Errorf("daemon set is not initialized, zero pointer")
 	}
 
-	resourceNamingStrategy := devConfig.Spec.DevicePlugin.ResourceNamingStrategy
-	command := []string{"sh", "-c", fmt.Sprintf("./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30 -resource_naming_strategy=%s", resourceNamingStrategy)}
+	commandArgs := "./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30"
+
+	devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments
+	for key, val := range devicePluginArguments {
+		commandArgs += " -" + key + "=" + val
+	}
+
+	command := []string{"sh", "-c", commandArgs}
 	nodeSelector := map[string]string{}
 	for key, val := range devConfig.Spec.Selector {
 		nodeSelector[key] = val
diff --git a/internal/utils.go b/internal/utils.go
index b34dd031..bc642343 100644
--- a/internal/utils.go
+++ b/internal/utils.go
@@ -32,10 +32,13 @@ import (
 )
 
 const (
-	defaultOcDriversVersion = "6.2.2"
-	openShiftNodeLabel      = "node.openshift.io/os_id"
-	NodeFeatureLabelAmdGpu  = "feature.node.kubernetes.io/amd-gpu"
-	NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu"
+	defaultOcDriversVersion    = "6.2.2"
+	openShiftNodeLabel         = "node.openshift.io/os_id"
+	NodeFeatureLabelAmdGpu     = "feature.node.kubernetes.io/amd-gpu"
+	NodeFeatureLabelAmdVGpu    = "feature.node.kubernetes.io/amd-vgpu"
+	ResourceNamingStrategyFlag = "resource_naming_strategy"
+	SingleStrategy             = "single"
+	MixedStrategy              = "mixed"
 )
 
 func GetDriverVersion(node v1.Node, deviceConfig amdv1alpha1.DeviceConfig) (string, error) {
diff --git a/internal/validator/specValidators.go b/internal/validator/specValidators.go
index f6d87ca7..b804c488 100644
--- a/internal/validator/specValidators.go
+++ b/internal/validator/specValidators.go
@@ -21,6 +21,7 @@ import (
 	"fmt"
 
 	amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1"
+	utils "github.com/ROCm/gpu-operator/internal"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
@@ -86,5 +87,29 @@ func ValidateDevicePluginSpec(ctx context.Context, client client.Client, devConf
 		}
 	}
 
+	supportedFlagValues := map[string][]string{
+		utils.ResourceNamingStrategyFlag: {utils.SingleStrategy, utils.MixedStrategy},
+	}
+
+	devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments
+	for key, val := range devicePluginArguments {
+		validValues, validKey := supportedFlagValues[key]
+		if !validKey {
+			return fmt.Errorf("Invalid flag: %s", key)
+		}
+		validKeyValue := false
+
+		for _, validVal := range validValues {
+			if val == validVal {
+				validKeyValue = true
+				break
+			}
+		}
+
+		if !validKeyValue {
+			return fmt.Errorf("Invalid flag value: %s=%s. Supported values: %v", key, val, supportedFlagValues[key])
+		}
+	}
+
 	return nil
 }
diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile
index 90b1e791..e00185c7 100644
--- a/tests/e2e/Makefile
+++ b/tests/e2e/Makefile
@@ -7,8 +7,11 @@ E2E_KUBE_RBAC_PROXY_CURL_IMAGE ?= curlimages/curl:7.78.0
 E2E_UBUNTU_BASE_IMAGE ?= ubuntu:22.04
 E2E_MINIO_IMAGE ?= minio/minio:latest
 E2E_EXPORTER_IMAGE ?= rocm/device-metrics-exporter:v1.2.0
+E2E_EXPORTER_IMAGE_2 ?= rocm/device-metrics-exporter:v1.1.1-beta.0
 E2E_DEVICE_PLUGIN_IMAGE ?= rocm/k8s-device-plugin:latest
 E2E_NODE_LABELLER_IMAGE ?= rocm/k8s-device-plugin:labeller-latest
+E2E_DEVICE_PLUGIN_IMAGE_2 ?= rocm/k8s-device-plugin:1.31.0.6
+E2E_NODE_LABELLER_IMAGE_2 ?= rocm/k8s-device-plugin:labeller-1.31.0.6
 E2E_TEST_RUNNER_IMAGE ?= rocm/test-runner:v1.2.0-beta.0
 
 export E2E_INIT_CONTAINER_IMAGE
@@ -16,8 +19,11 @@ export E2E_KUBE_RBAC_PROXY_CURL_IMAGE
 export E2E_UBUNTU_BASE_IMAGE
 export E2E_MINIO_IMAGE
 export E2E_EXPORTER_IMAGE
+export E2E_EXPORTER_IMAGE_2
 export E2E_DEVICE_PLUGIN_IMAGE
 export E2E_NODE_LABELLER_IMAGE
+export E2E_DEVICE_PLUGIN_IMAGE_2
+export E2E_NODE_LABELLER_IMAGE_2
 export E2E_TEST_RUNNER_IMAGE
 
 export E2E_DCM_IMAGE
diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go
index ed22eb99..5c10af8f 100644
--- a/tests/e2e/cluster_test.go
+++ b/tests/e2e/cluster_test.go
@@ -1877,8 +1877,8 @@ func (s *E2ESuite) TestDevicePluginNodeLabellerDaemonSetUpgrade(c *C) {
 
 	// upgrade
 	// update the CR's device plugin with image
-	devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage
-	devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage
+	devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage2
+	devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage2
 	s.patchDevicePluginImage(devCfg, c)
 	s.patchNodeLabellerImage(devCfg, c)
 	s.verifyDevicePluginStatus(s.ns, c, devCfg)
@@ -1911,7 +1911,7 @@ func (s *E2ESuite) TestMetricsExporterDaemonSetUpgrade(c *C) {
 
 	// upgrade
 	// update the CR's device plugin with image
-	devCfg.Spec.MetricsExporter.Image = exporterImage
+	devCfg.Spec.MetricsExporter.Image = exporterImage2
 	s.patchMetricsExporterImage(devCfg, c)
 	s.verifyDeviceConfigStatus(devCfg, c)
 	s.checkMetricsExporterStatus(devCfg, s.ns, v1.ServiceTypeClusterIP, c)
diff --git a/tests/e2e/init.go b/tests/e2e/init.go
index d7a863eb..973cedb6 100644
--- a/tests/e2e/init.go
+++ b/tests/e2e/init.go
@@ -25,8 +25,11 @@ var (
 	initContainerImage     string
 	kubeRbacProxyCurlImage string
 	exporterImage          string
+	exporterImage2         string
 	devicePluginImage      string
 	nodeLabellerImage      string
+	devicePluginImage2     string
+	nodeLabellerImage2     string
 	testRunnerImage        string
 	driverImageRepo        string
 )
@@ -46,6 +49,10 @@ func init() {
 	if !ok {
 		log.Fatalf("E2E_EXPORTER_IMAGE is not defined")
 	}
+	exporterImage2, ok = os.LookupEnv("E2E_EXPORTER_IMAGE_2")
+	if !ok {
+		log.Fatalf("E2E_EXPORTER_IMAGE_2 is not defined")
+	}
 	devicePluginImage, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE")
 	if !ok {
 		log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE is not defined")
@@ -54,6 +61,14 @@ func init() {
 	if !ok {
 		log.Fatalf("E2E_NODE_LABELLER_IMAGE is not defined")
 	}
+	devicePluginImage2, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE_2")
+	if !ok {
+		log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE_2 is not defined")
+	}
+	nodeLabellerImage2, ok = os.LookupEnv("E2E_NODE_LABELLER_IMAGE_2")
+	if !ok {
+		log.Fatalf("E2E_NODE_LABELLER_IMAGE_2 is not defined")
+	}
 	testRunnerImage, ok = os.LookupEnv("E2E_TEST_RUNNER_IMAGE")
 	if !ok {
 		log.Fatalf("E2E_TEST_RUNNER_IMAGE is not defined")

From 5aaffe5fe36a3c7346b91ee34b38d64893c15dca Mon Sep 17 00:00:00 2001
From: vm <sriramr2230@gmail.com>
Date: Wed, 26 Mar 2025 06:18:58 +0000
Subject: [PATCH 10/12] Regression caused from previous commit

---
 internal/controllers/upgrademgr.go | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/internal/controllers/upgrademgr.go b/internal/controllers/upgrademgr.go
index c5f3fd6e..a5e519b2 100644
--- a/internal/controllers/upgrademgr.go
+++ b/internal/controllers/upgrademgr.go
@@ -151,11 +151,6 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha
 		// 1. Set init status for unprocessed nodes
 		n.helper.handleInitStatus(ctx, &nodeList.Items[i])
 
-		if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) {
-			res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20}
-			continue
-		}
-
 		// 2. Handle failed nodes
 		if n.helper.isNodeStateUpgradeFailed(ctx, &nodeList.Items[i], deviceConfig) {
 			n.helper.clearUpgradeStartTime(nodeList.Items[i].Name)
@@ -193,6 +188,11 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha
 			continue
 		}
 
+		if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) {
+			res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20}
+			continue
+		}
+
 		//This node is a candidate for selection
 		candidateNodes = append(candidateNodes, nodeList.Items[i])
 	}

From 42692bd60d92e67b330023c22870dfb85662803a Mon Sep 17 00:00:00 2001
From: vm <sriramr2230@gmail.com>
Date: Wed, 26 Mar 2025 03:14:38 +0000
Subject: [PATCH 11/12] Device Plugin e2e for homogeneous/heterogeneous with
 single/mixed strategy

---
 tests/e2e/cluster_test.go    | 242 ++++++++++++++++++++++++++++++++++-
 tests/e2e/dcm_e2e_test.go    |   3 +-
 tests/e2e/testrunner_test.go |   6 +-
 tests/e2e/utils/utils.go     |  18 +--
 4 files changed, 251 insertions(+), 18 deletions(-)

diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go
index 5c10af8f..40fd52e5 100644
--- a/tests/e2e/cluster_test.go
+++ b/tests/e2e/cluster_test.go
@@ -1051,7 +1051,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
 	s.verifyDeviceConfigStatus(devCfg, c)
 	s.verifyNodeGPULabel(devCfg, c)
 
-	ret, err := utils.GetAMDGPUCount(ctx, s.clientSet)
+	ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu")
 	if err != nil {
 		logger.Errorf("error: %v", err)
 	}
@@ -1078,7 +1078,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
 	err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
 	assert.NoError(c, err, "failed to deploy pods")
 	s.verifyROCMPOD(true, c)
-	err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount)
+	err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu")
 	assert.NoError(c, err, fmt.Sprintf("%v", err))
 
 	// delete
@@ -1092,6 +1092,244 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
 	assert.NoError(c, err, "failed to reboot nodes")
 }
 
+func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousSingle(c *C) {
+	if s.simEnable {
+		c.Skip("Skipping for non amd gpu testbed")
+	}
+	if !dcmImageDefined {
+		c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
+	}
+
+	s.configMapHelper(c)
+
+	logger.Infof("Add node label after pod comes up")
+	time.Sleep(30 * time.Second)
+
+	nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift)
+	nodeNames := make([]string, 0)
+	for _, node := range nodes {
+		nodeNames = append(nodeNames, node.Name)
+	}
+	for _, nodeName := range nodeNames {
+		s.addRemoveNodeLabels(nodeName, "e2e_profile2")
+	}
+
+	logs := s.getLogs()
+	if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
+		logger.Infof("Successfully tested homogenous default partitioning")
+	} else {
+		logger.Errorf("Failure test homogenous partitioning")
+	}
+	devCfgDcm := s.getDeviceConfigForDCM(c)
+	s.deleteDeviceConfig(devCfgDcm, c)
+
+	time.Sleep(60 * time.Second)
+
+	ctx := context.TODO()
+	logger.Infof("create %v", s.cfgName)
+	devCfg := s.getDeviceConfig(c)
+	driverEnable := false
+	devCfg.Spec.Driver.Enable = &driverEnable
+	s.createDeviceConfig(devCfg, c)
+	s.checkNFDWorkerStatus(s.ns, c, "")
+	s.checkNodeLabellerStatus(s.ns, c, devCfg)
+	s.verifyDeviceConfigStatus(devCfg, c)
+	s.verifyNodeGPULabel(devCfg, c)
+
+	ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu")
+	if err != nil {
+		logger.Errorf("error: %v", err)
+	}
+	var minGPU int = 10000
+	for _, v := range ret {
+		if v < minGPU {
+			minGPU = v
+		}
+	}
+	assert.Greater(c, minGPU, 0, "did not find any server with amd gpu")
+
+	gpuLimitCount := minGPU
+	gpuReqCount := minGPU
+
+	res := &v1.ResourceRequirements{
+		Limits: v1.ResourceList{
+			"amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)),
+		},
+		Requests: v1.ResourceList{
+			"amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)),
+		},
+	}
+
+	err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
+	assert.NoError(c, err, "failed to deploy pods")
+	err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu")
+	assert.NoError(c, err, fmt.Sprintf("%v", err))
+
+	// delete
+	s.deleteDeviceConfig(devCfg, c)
+
+	err = utils.DelRocmPods(context.TODO(), s.clientSet)
+	assert.NoError(c, err, "failed to remove rocm pods")
+}
+
+func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousMixed(c *C) {
+	if s.simEnable {
+		c.Skip("Skipping for non amd gpu testbed")
+	}
+	if !dcmImageDefined {
+		c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
+	}
+
+	s.configMapHelper(c)
+
+	logger.Infof("Add node label after pod comes up")
+	time.Sleep(30 * time.Second)
+
+	nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift)
+	nodeNames := make([]string, 0)
+	for _, node := range nodes {
+		nodeNames = append(nodeNames, node.Name)
+	}
+	for _, nodeName := range nodeNames {
+		s.addRemoveNodeLabels(nodeName, "e2e_profile2")
+	}
+
+	logs := s.getLogs()
+	if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
+		logger.Infof("Successfully tested homogeneous partitioning")
+	} else {
+		logger.Errorf("Failure test homogeneous partitioning")
+	}
+	devCfgDcm := s.getDeviceConfigForDCM(c)
+	s.deleteDeviceConfig(devCfgDcm, c)
+	time.Sleep(60 * time.Second)
+	ctx := context.TODO()
+	logger.Infof("create %v", s.cfgName)
+	devCfg := s.getDeviceConfig(c)
+	driverEnable := false
+	devCfg.Spec.Driver.Enable = &driverEnable
+	devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"}
+	s.createDeviceConfig(devCfg, c)
+	s.checkNFDWorkerStatus(s.ns, c, "")
+	s.checkNodeLabellerStatus(s.ns, c, devCfg)
+	s.verifyDeviceConfigStatus(devCfg, c)
+
+	ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps4")
+	if err != nil {
+		logger.Errorf("error: %v", err)
+	}
+	var minGPU int = 10000
+	for _, v := range ret {
+		if v < minGPU {
+			minGPU = v
+		}
+	}
+	assert.Greater(c, minGPU, 0, "did not find any server with amd gpu")
+
+	gpuLimitCount := minGPU
+	gpuReqCount := minGPU
+
+	res := &v1.ResourceRequirements{
+		Limits: v1.ResourceList{
+			"amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)),
+		},
+		Requests: v1.ResourceList{
+			"amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)),
+		},
+	}
+
+	err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
+	assert.NoError(c, err, "failed to deploy pods")
+	err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps4")
+	assert.NoError(c, err, fmt.Sprintf("%v", err))
+
+	// delete
+	s.deleteDeviceConfig(devCfg, c)
+
+	err = utils.DelRocmPods(context.TODO(), s.clientSet)
+	assert.NoError(c, err, "failed to remove rocm pods")
+
+}
+
+func (s *E2ESuite) TestWorkloadRequestedGPUsHeterogeneousMixed(c *C) {
+	if s.simEnable {
+		c.Skip("Skipping for non amd gpu testbed")
+	}
+	if !dcmImageDefined {
+		c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
+	}
+
+	s.configMapHelper(c)
+
+	logger.Infof("Add node label after pod comes up")
+	time.Sleep(30 * time.Second)
+
+	nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift)
+	nodeNames := make([]string, 0)
+	for _, node := range nodes {
+		nodeNames = append(nodeNames, node.Name)
+	}
+	for _, nodeName := range nodeNames {
+		s.addRemoveNodeLabels(nodeName, "e2e_profile1")
+	}
+
+	logs := s.getLogs()
+	if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
+		logger.Infof("Successfully tested homogeneous partitioning")
+	} else {
+		logger.Errorf("Failure test heterogenous partitioning")
+	}
+	devCfgDcm := s.getDeviceConfigForDCM(c)
+	s.deleteDeviceConfig(devCfgDcm, c)
+	time.Sleep(60 * time.Second)
+
+	ctx := context.TODO()
+	logger.Infof("create %v", s.cfgName)
+	devCfg := s.getDeviceConfig(c)
+	driverEnable := false
+	devCfg.Spec.Driver.Enable = &driverEnable
+	devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"}
+	s.createDeviceConfig(devCfg, c)
+	s.checkNFDWorkerStatus(s.ns, c, "")
+	s.checkNodeLabellerStatus(s.ns, c, devCfg)
+	s.verifyDeviceConfigStatus(devCfg, c)
+
+	ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps1")
+	if err != nil {
+		logger.Errorf("error: %v", err)
+	}
+	var minGPU int = 10000
+	for _, v := range ret {
+		if v < minGPU {
+			minGPU = v
+		}
+	}
+	assert.Greater(c, minGPU, 0, "did not find any server with amd gpu")
+
+	gpuLimitCount := minGPU
+	gpuReqCount := minGPU
+
+	res := &v1.ResourceRequirements{
+		Limits: v1.ResourceList{
+			"amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)),
+		},
+		Requests: v1.ResourceList{
+			"amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)),
+		},
+	}
+
+	err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
+	assert.NoError(c, err, "failed to deploy pods")
+	err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps1")
+	assert.NoError(c, err, fmt.Sprintf("%v", err))
+
+	// delete
+	s.deleteDeviceConfig(devCfg, c)
+
+	err = utils.DelRocmPods(context.TODO(), s.clientSet)
+	assert.NoError(c, err, "failed to remove rocm pods")
+}
+
 func (s *E2ESuite) TestKubeRbacProxyClusterIP(c *C) {
 	_, err := s.dClient.DeviceConfigs(s.ns).Get("deviceconfig-kuberbac-clusterip", metav1.GetOptions{})
 	assert.Errorf(c, err, "config deviceconfig-kuberbac-clusterip exists")
diff --git a/tests/e2e/dcm_e2e_test.go b/tests/e2e/dcm_e2e_test.go
index f3f8b9df..cd11bd3c 100644
--- a/tests/e2e/dcm_e2e_test.go
+++ b/tests/e2e/dcm_e2e_test.go
@@ -72,7 +72,7 @@ func (s *E2ESuite) addRemoveNodeLabels(nodeName string, selectedProfile string)
 		logger.Infof("Error adding node lbels: %s\n", err.Error())
 		return
 	}
-	time.Sleep(15 * time.Second)
+	time.Sleep(45 * time.Second)
 	// Allow partition to happen
 	err = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/gpu-config-profile")
 	_ = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile")
@@ -269,6 +269,7 @@ func (s *E2ESuite) createConfigMap() GPUConfigProfiles {
 		{
 			ComputePartition: "CPX",
 			MemoryPartition:  "NPS4",
+			NumGPUsAssigned:  1,
 		},
 	}
 
diff --git a/tests/e2e/testrunner_test.go b/tests/e2e/testrunner_test.go
index 305b6f59..aa7b37a7 100644
--- a/tests/e2e/testrunner_test.go
+++ b/tests/e2e/testrunner_test.go
@@ -200,7 +200,7 @@ func (s *E2ESuite) createTestRunnerConfigmap(valid bool, devCfg *v1alpha1.Device
 }
 
 func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string {
-	ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet)
+	ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu")
 	if err != nil {
 		logger.Errorf("error: %v", err)
 	}
@@ -228,7 +228,7 @@ func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string {
 
 	err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
 	assert.NoError(c, err, "failed to deploy pods")
-	err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount)
+	err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount, "gpu")
 	assert.NoError(c, err, fmt.Sprintf("%v", err))
 
 	return nodeWithMaxGPU
@@ -730,7 +730,7 @@ func (s *E2ESuite) TestTestRunnerLogsExport(c *C) {
 
 func (s *E2ESuite) getGPUNodeName() (nodeWithMaxGPU string) {
 	var maxPerNodeGPU int = 0
-	ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet)
+	ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu")
 	if err != nil {
 		logger.Printf("Unable to fetch gpu nodes. Error %v", err)
 		return
diff --git a/tests/e2e/utils/utils.go b/tests/e2e/utils/utils.go
index 9c9dcf9f..5813ccf5 100644
--- a/tests/e2e/utils/utils.go
+++ b/tests/e2e/utils/utils.go
@@ -598,14 +598,6 @@ func GetWorkerNodes(cl *kubernetes.Clientset) []*v1.Node {
 func GetAMDGpuWorker(cl *kubernetes.Clientset, isOpenshift bool) []v1.Node {
 	ret := make([]v1.Node, 0)
 	labelSelector := labels.NewSelector()
-	if !isOpenshift {
-		r, _ := labels.NewRequirement(
-			"node-role.kubernetes.io/control-plane",
-			selection.DoesNotExist,
-			nil,
-		)
-		labelSelector = labelSelector.Add(*r)
-	}
 	r, _ := labels.NewRequirement(
 		"feature.node.kubernetes.io/amd-gpu",
 		selection.Equals,
@@ -766,7 +758,7 @@ func DelRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset,
 
 }
 
-func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]int, error) {
+func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset, resourceType string) (map[string]int, error) {
 
 	ret := make(map[string]int)
 	// Get the list of nodes
@@ -777,7 +769,8 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i
 
 	// Iterate over the nodes and count AMD GPUs
 	for _, node := range nodes.Items {
-		if val, ok := node.Status.Capacity["amd.com/gpu"]; ok {
+		resourceKey := v1.ResourceName("amd.com/" + resourceType)
+		if val, ok := node.Status.Capacity[resourceKey]; ok {
 			num, err := strconv.ParseInt(val.String(), 10, 64)
 			if err != nil {
 				log.Infof("error: %v", err)
@@ -790,7 +783,7 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i
 }
 
 func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset,
-	gpuReqCount int) error {
+	gpuReqCount int, resourceType string) error {
 
 	its, err := cl.CoreV1().Pods("").List(ctx,
 		metav1.ListOptions{
@@ -805,7 +798,8 @@ func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset,
 				continue
 			}
 
-			if gpu, ok := cntr.Resources.Requests["amd.com/gpu"]; ok {
+			resourceKey := v1.ResourceName("amd.com/" + resourceType)
+			if gpu, ok := cntr.Resources.Requests[resourceKey]; ok {
 				gpuAssignedCount := int(gpu.Value())
 				if gpuReqCount < gpuAssignedCount {
 					return fmt.Errorf("gpu requested %d got %d",

From 996d8c82c8aa2433532e755aa0bca646bbdb8e3e Mon Sep 17 00:00:00 2001
From: "Saad Rahim (AMD)" <44449863+saadrahim@users.noreply.github.com>
Date: Thu, 27 Mar 2025 15:49:54 -0600
Subject: [PATCH 12/12] Fix markdown linting errors

---
 docs/installation/openshift-olm.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/installation/openshift-olm.md b/docs/installation/openshift-olm.md
index 489644d9..889ada17 100644
--- a/docs/installation/openshift-olm.md
+++ b/docs/installation/openshift-olm.md
@@ -205,6 +205,7 @@ spec:
 ```
 
 Things to note:
+
 1. By default, there is no need to specify the image field in CR for Openshift. Default will be used which is: image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
 
 2. If users specify image, $MOD_NAMESPACE can be a place holder , KMM Operator can automatically translate it to the namespace