diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index bc7e0a73..fac57f01 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -21,7 +21,7 @@ metadata: "imageRegistrySecret": { "name": "docker-auth" }, - "version": "7.0" + "version": "30.20.1" }, "selector": { "feature.node.kubernetes.io/amd-gpu": "true" diff --git a/hack/k8s-patch/template-patch/pre-upgrade-hook.yaml b/hack/k8s-patch/template-patch/pre-upgrade-hook.yaml index e93b101b..600edc47 100644 --- a/hack/k8s-patch/template-patch/pre-upgrade-hook.yaml +++ b/hack/k8s-patch/template-patch/pre-upgrade-hook.yaml @@ -93,7 +93,7 @@ spec: echo "$deviceconfigs" | jq . # Check if any UpgradeState is in the blocked states - blocked_states='["Upgrade-Not-Started", "Upgrade-Started", "Install-In-Progress", "Upgrade-In-Progress"]' + blocked_states='["Upgrade-Not-Started", "Upgrade-Started", "Upgrade-In-Progress"]' if echo "$deviceconfigs" | jq --argjson blocked_states "$blocked_states" -e ' .items[] | .status.nodeModuleStatus // {} | diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 058c7ecf..be08b5b4 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -9,4 +9,4 @@ dependencies: repository: file://./charts/remediation version: v1.0.0 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07 -generated: "2025-12-09T09:27:36.511662862Z" +generated: "2026-01-05T19:49:07.882445585Z" diff --git a/helm-charts-k8s/templates/pre-upgrade-hook.yaml b/helm-charts-k8s/templates/pre-upgrade-hook.yaml index e93b101b..600edc47 100644 --- a/helm-charts-k8s/templates/pre-upgrade-hook.yaml +++ b/helm-charts-k8s/templates/pre-upgrade-hook.yaml @@ -93,7 +93,7 @@ spec: echo "$deviceconfigs" | jq . 
# Check if any UpgradeState is in the blocked states - blocked_states='["Upgrade-Not-Started", "Upgrade-Started", "Install-In-Progress", "Upgrade-In-Progress"]' + blocked_states='["Upgrade-Not-Started", "Upgrade-Started", "Upgrade-In-Progress"]' if echo "$deviceconfigs" | jq --argjson blocked_states "$blocked_states" -e ' .items[] | .status.nodeModuleStatus // {} | diff --git a/internal/controllers/device_config_reconciler.go b/internal/controllers/device_config_reconciler.go index 90ef9e26..aa69b65b 100644 --- a/internal/controllers/device_config_reconciler.go +++ b/internal/controllers/device_config_reconciler.go @@ -605,11 +605,19 @@ func (dcrh *deviceConfigReconcilerHelper) buildDeviceConfigNodeStatus(ctx contex // push their status back to DeviceConfig if module.Namespace == devConfig.Namespace && module.Name == devConfig.Name { + // if the DeviceConfig is not managing drivers, remove the driver status information + // this redundant info must be removed to work around a known issue: https://github.com/ROCm/gpu-operator/issues/403 + // even when driver management is disabled, the driver status can get stuck in Install-In-Progress + nodeStatus := amdv1alpha1.UpgradeStateEmpty + if utils.ShouldUseKMM(devConfig) { + // only assign node driver status value when DeviceConfig is managing drivers + nodeStatus = dcrh.upgradeMgrHandler.GetNodeStatus(node.Name) + } devConfig.Status.NodeModuleStatus[node.Name] = amdv1alpha1.ModuleStatus{ ContainerImage: module.Config.ContainerImage, KernelVersion: module.Config.KernelVersion, LastTransitionTime: module.LastTransitionTime.String(), - Status: dcrh.upgradeMgrHandler.GetNodeStatus(node.Name), + Status: nodeStatus, UpgradeStartTime: upgradeStartTime, BootId: bootId, } diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go index 2ea9cb68..c8801b8b 100644 --- a/tests/e2e/cluster_test.go +++ b/tests/e2e/cluster_test.go @@ -3140,9 +3140,9 @@ func (s *E2ESuite) TestPreUpgradeHookFailure(c *C) { } if 
s.openshift { - devCfg.Spec.Driver.Version = "el9-6.1.0" + devCfg.Spec.Driver.Version = "30.20.1" } else { - devCfg.Spec.Driver.Version = "6.1.0" + devCfg.Spec.Driver.Version = "30.20.1" } s.patchDriversVersion(devCfg, c) s.verifyDeviceConfigStatus(devCfg, c)