Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ metadata:
"imageRegistrySecret": {
"name": "docker-auth"
},
"version": "7.0"
"version": "30.20.1"
},
"selector": {
"feature.node.kubernetes.io/amd-gpu": "true"
Expand Down
2 changes: 1 addition & 1 deletion hack/k8s-patch/template-patch/pre-upgrade-hook.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ spec:
echo "$deviceconfigs" | jq .

# Check if any UpgradeState is in the blocked states
blocked_states='["Upgrade-Not-Started", "Upgrade-Started", "Install-In-Progress", "Upgrade-In-Progress"]'
blocked_states='["Upgrade-Not-Started", "Upgrade-Started", "Upgrade-In-Progress"]'
if echo "$deviceconfigs" | jq --argjson blocked_states "$blocked_states" -e '
.items[] |
.status.nodeModuleStatus // {} |
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-k8s/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ dependencies:
repository: file://./charts/remediation
version: v1.0.0
digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
generated: "2025-12-09T09:27:36.511662862Z"
generated: "2026-01-05T19:49:07.882445585Z"
2 changes: 1 addition & 1 deletion helm-charts-k8s/templates/pre-upgrade-hook.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ spec:
echo "$deviceconfigs" | jq .

# Check if any UpgradeState is in the blocked states
blocked_states='["Upgrade-Not-Started", "Upgrade-Started", "Install-In-Progress", "Upgrade-In-Progress"]'
blocked_states='["Upgrade-Not-Started", "Upgrade-Started", "Upgrade-In-Progress"]'
if echo "$deviceconfigs" | jq --argjson blocked_states "$blocked_states" -e '
.items[] |
.status.nodeModuleStatus // {} |
Expand Down
10 changes: 9 additions & 1 deletion internal/controllers/device_config_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -605,11 +605,19 @@ func (dcrh *deviceConfigReconcilerHelper) buildDeviceConfigNodeStatus(ctx contex
// push their status back to DeviceConfig
if module.Namespace == devConfig.Namespace &&
module.Name == devConfig.Name {
// If the DeviceConfig is not managing drivers, leave the driver status information empty.
// This redundant info is removed to work around a known issue: https://github.com/ROCm/gpu-operator/issues/403
// Even when driver management is disabled, the driver status can get stuck in Install-In-Progress.
nodeStatus := amdv1alpha1.UpgradeStateEmpty
if utils.ShouldUseKMM(devConfig) {
// only assign node driver status value when DeviceConfig is managing drivers
nodeStatus = dcrh.upgradeMgrHandler.GetNodeStatus(node.Name)
}
devConfig.Status.NodeModuleStatus[node.Name] = amdv1alpha1.ModuleStatus{
ContainerImage: module.Config.ContainerImage,
KernelVersion: module.Config.KernelVersion,
LastTransitionTime: module.LastTransitionTime.String(),
Status: dcrh.upgradeMgrHandler.GetNodeStatus(node.Name),
Status: nodeStatus,
UpgradeStartTime: upgradeStartTime,
BootId: bootId,
}
Expand Down
4 changes: 2 additions & 2 deletions tests/e2e/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3140,9 +3140,9 @@ func (s *E2ESuite) TestPreUpgradeHookFailure(c *C) {
}

if s.openshift {
devCfg.Spec.Driver.Version = "el9-6.1.0"
devCfg.Spec.Driver.Version = "30.20.1"
} else {
devCfg.Spec.Driver.Version = "6.1.0"
devCfg.Spec.Driver.Version = "30.20.1"
}
s.patchDriversVersion(devCfg, c)
s.verifyDeviceConfigStatus(devCfg, c)
Expand Down
Loading