Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Fixed admission webhook to skip runtimeClassName injection when gpuPodRuntimeClassName is empty [#1035](https://github.com/NVIDIA/KAI-Scheduler/pull/1035)
- Fixed topology-migration helm hook failing on OpenShift due to missing `kai-topology-migration` service account in the `kai-system` SCC [#1050](https://github.com/NVIDIA/KAI-Scheduler/pull/1050)
- Fixed a bug where queue status did not reflect its podgroups resources correctly [#1049](https://github.com/NVIDIA/KAI-Scheduler/pull/1049)
- Fixed `spread` node placement strategy ignoring fractional GPU usage on shared GPUs, causing nondeterministic pod placement instead of even spreading
- Fixed helm uninstall not removing webhooks [#959](https://github.com/NVIDIA/KAI-Scheduler/pull/959) [faizan-exe](https://github.com/faizan-exe)
- Fixed security vulnerability where PodGang could reference pods in other namespaces, preventing cross-namespace manipulation
- Fixed pod controller logging to use request namespace/name instead of empty pod object fields when pod is not found
Expand Down
151 changes: 151 additions & 0 deletions pkg/scheduler/actions/allocate/allocateFractionalGpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1642,3 +1642,154 @@ func getFractionalGPUTestsMetadata() []integration_tests_utils.TestTopologyMetad
},
}
}

// TestFillNodeSpreadStrategyPackGPUs reproduces the E2E fill_node_test scenario:
// 2 nodes (2 GPUs each), 4 pending pods (0.5 GPU each), spread across nodes + pack GPUs.
// With spread strategy, pods should alternate across nodes. With gpupack, pods on the
// same node should share a single GPU. Expected: 2 pods per node, 1 reservation per node.
func TestFillNodeSpreadStrategyPackGPUs(t *testing.T) {
test_utils.InitTestingInfrastructure()
controller := NewController(t)
defer controller.Finish()
defer gock.Off()

testsMetadata := getFillNodeSpreadPackTestsMetadata()
for testNumber, testMetadata := range testsMetadata {
t.Logf("Running test %d: %s", testNumber, testMetadata.TestTopologyBasic.Name)

ssn := test_utils.BuildSession(testMetadata.TestTopologyBasic, controller)
allocateAction := allocate.New()
allocateAction.Execute(ssn)

test_utils.MatchExpectedAndRealTasks(t, testNumber, testMetadata.TestTopologyBasic, ssn)

// Additionally verify GPU packing: count distinct GPU groups per node
reservationsByNode := map[string]map[string]bool{}
for _, job := range ssn.ClusterInfo.PodGroupInfos {
for _, task := range job.GetAllPodsMap() {
if task.NodeName == "" {
continue
}
if reservationsByNode[task.NodeName] == nil {
reservationsByNode[task.NodeName] = map[string]bool{}
}
for _, gpuGroup := range task.GPUGroups {
reservationsByNode[task.NodeName][gpuGroup] = true
}
}
}

for nodeName, gpuGroups := range reservationsByNode {
if len(gpuGroups) != 1 {
t.Errorf("Test %d: %s - expected 1 GPU group on node %s (pack behavior), got %d: %v",
testNumber, testMetadata.TestTopologyBasic.Name, nodeName, len(gpuGroups), gpuGroups)
}
}
Comment thread
enoodle marked this conversation as resolved.
}
}

// getFillNodeSpreadPackTestsMetadata builds the topology for the fill-node
// reproduction test: two 2-GPU nodes and four pending 0.5-GPU single-task
// jobs in one queue, scheduled with the spread node-placement strategy plus
// the gpupack plugin.
func getFillNodeSpreadPackTestsMetadata() []integration_tests_utils.TestTopologyMetadata {
	jobNames := []string{"pending_job0", "pending_job1", "pending_job2", "pending_job3"}

	// All four pending jobs are identical apart from their name, so build
	// the job list and the matching expectations in a single loop.
	pendingJobs := make([]*jobs_fake.TestJobBasic, 0, len(jobNames))
	expectedResults := make(map[string]test_utils.TestExpectedResultBasic, len(jobNames))
	for _, jobName := range jobNames {
		pendingJobs = append(pendingJobs, &jobs_fake.TestJobBasic{
			Name:                jobName,
			Priority:            constants.PriorityBuildNumber,
			QueueName:           "queue0",
			RequiredGPUsPerTask: 0.5,
			Tasks: []*tasks_fake.TestTaskBasic{
				{State: pod_status.Pending},
			},
		})
		expectedResults[jobName] = test_utils.TestExpectedResultBasic{
			GPUsRequired:         0.5,
			Status:               pod_status.Binding,
			DontValidateGPUGroup: true,
		}
	}

	return []integration_tests_utils.TestTopologyMetadata{
		{
			TestTopologyBasic: test_utils.TestTopologyBasic{
				Name: "E2E fill_node reproduction: 4 pending pods, spread nodes, pack GPUs - all from empty state",
				Jobs: pendingJobs,
				Nodes: map[string]nodes_fake.TestNodeBasic{
					"node0": {GPUs: 2},
					"node1": {GPUs: 2},
				},
				Queues: []test_utils.TestQueueBasic{
					{
						Name:         "queue0",
						DeservedGPUs: 4,
					},
				},
				JobExpectedResults: expectedResults,
				Mocks: &test_utils.TestMock{
					CacheRequirements: &test_utils.CacheMocking{
						NumberOfCacheBinds: 5,
					},
					SchedulerConf: &conf.SchedulerConfiguration{
						Actions: "allocate",
						Tiers: []conf.Tier{
							{
								Plugins: []conf.PluginOption{
									{
										Name: "nodeplacement",
										Arguments: map[string]string{
											constants.GPUResource: constants.SpreadStrategy,
											constants.CPUResource: constants.SpreadStrategy,
										},
									},
									{Name: "gpupack"},
									{Name: "proportion"},
									{Name: "priority"},
									{Name: "nodeavailability"},
									{Name: "resourcetype"},
								},
							},
						},
					},
				},
			},
		},
	}
}
73 changes: 73 additions & 0 deletions pkg/scheduler/plugins/nodeplacement/nodespread_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,78 @@ var _ = Describe("NodeSpread", func() {
Expect(actual).To(Equal(c.expected))
}
})

// Regression spec for the spread scorer: fractional usage on a shared GPU
// must lower a node's score relative to a fully idle node.
It("should account for shared GPU consumption when scoring", func() {
	// Node with 2 GPUs, one GPU has 0.5 allocated (50 of 100 memory)
	// Expected: (1 idle whole GPU + 0.5 available on shared) / 2 total = 0.75
	nodeWithShared := &node_info.NodeInfo{
		Node: &corev1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name: "node-with-shared",
				Labels: map[string]string{
					node_info.GpuCountLabel: "2",
				},
			},
		},
		Idle:                   resource_info.NewResource(0, 0, 1), // 1 whole idle GPU
		Releasing:              resource_info.EmptyResource(),
		MemoryOfEveryGpuOnNode: 100,
		// gpu-0 has half of its 100 memory units used and allocated,
		// i.e. 0.5 of that GPU remains available for sharing.
		GpuSharingNodeInfo: node_info.GpuSharingNodeInfo{
			ReleasingSharedGPUs:       map[string]bool{},
			UsedSharedGPUsMemory:      map[string]int64{"gpu-0": 50},
			ReleasingSharedGPUsMemory: map[string]int64{},
			AllocatedSharedGPUsMemory: map[string]int64{"gpu-0": 50},
		},
	}

	// Node with 2 GPUs, completely empty
	// Expected: 2 / 2 = 1.0
	nodeEmpty := &node_info.NodeInfo{
		Node: &corev1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name: "node-empty",
				Labels: map[string]string{
					node_info.GpuCountLabel: "2",
				},
			},
		},
		Idle:                   resource_info.NewResource(0, 0, 2), // 2 whole idle GPUs
		Releasing:              resource_info.EmptyResource(),
		MemoryOfEveryGpuOnNode: 100,
		GpuSharingNodeInfo: node_info.GpuSharingNodeInfo{
			ReleasingSharedGPUs:       map[string]bool{},
			UsedSharedGPUsMemory:      map[string]int64{},
			ReleasingSharedGPUsMemory: map[string]int64{},
			AllocatedSharedGPUsMemory: map[string]int64{},
		},
	}

	// Task requesting 1 whole GPU; only the node's availability ratio
	// should drive the score.
	task := &pod_info.PodInfo{
		ResReq: resource_info.NewResourceRequirementsWithGpus(1),
	}

	nodes := map[string]*node_info.NodeInfo{
		nodeWithShared.Name: nodeWithShared,
		nodeEmpty.Name:      nodeEmpty,
	}
	// Spread strategy for both GPU and CPU resources.
	plugin := nodeplacement.New(map[string]string{
		constants.GPUResource: constants.SpreadStrategy,
		constants.CPUResource: constants.SpreadStrategy,
	})
	ssn := createFakeTestSession(nodes)
	plugin.OnSessionOpen(ssn)
	// Take the most recently registered node-order fn — assumes
	// OnSessionOpen appended this plugin's scorer last (NOTE(review):
	// confirm if the plugin ever registers more than one fn).
	nof := ssn.NodeOrderFns[len(ssn.NodeOrderFns)-1]

	sharedScore, err := nof(task, nodeWithShared)
	Expect(err).NotTo(HaveOccurred())
	Expect(sharedScore).To(Equal(0.75))

	emptyScore, err := nof(task, nodeEmpty)
	Expect(err).NotTo(HaveOccurred())
	Expect(emptyScore).To(Equal(1.0))

	// The empty node should score higher (more available), ensuring spread
	Expect(emptyScore).To(BeNumerically(">", sharedScore))
})
})
})
17 changes: 12 additions & 5 deletions pkg/scheduler/plugins/nodeplacement/spread.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,24 @@ import (
func nodeResourceSpread(resourceName v1.ResourceName) api.NodeOrderFn {
return func(task *pod_info.PodInfo, node *node_info.NodeInfo) (float64, error) {
var resourceCount float64
var nonAllocated float64
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the same logic be applied to pack.go?


if resourceName == resource_info.GPUResourceName {
resourceCount = float64(node.GetNumberOfGPUsInNode())
if resourceCount == 0 {
return 0, nil
}
idleGPUs, _ := node.GetSumOfIdleGPUs()
releasingGPUs, _ := node.GetSumOfReleasingGPUs()
nonAllocated = idleGPUs + releasingGPUs
Comment on lines +26 to +28
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider exporting that on the node

} else {
resourceCount = node.Allocatable.Get(resourceName)
if resourceCount == 0 {
return 0, nil
}
nonAllocated = node.NonAllocatedResource(resourceName)
}

if resourceCount == 0 {
return 0, nil
}

nonAllocated := node.NonAllocatedResource(resourceName)
score := nonAllocated / resourceCount
log.InfraLogger.V(7).Infof("Estimating Task: <%v/%v> Job: <%v> for node: <%s> "+
"that has <%.2f/%f> non allocated %v. Score: %f",
Expand Down
55 changes: 55 additions & 0 deletions test/e2e/modules/configurations/feature_flags/plugins.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
Copyright 2025 NVIDIA CORPORATION
SPDX-License-Identifier: Apache-2.0
*/
package feature_flags

import (
"context"

"k8s.io/utils/ptr"

kaiv1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1"
"github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
"github.com/NVIDIA/KAI-scheduler/test/e2e/modules/configurations"
"github.com/NVIDIA/KAI-scheduler/test/e2e/modules/constant"
testContext "github.com/NVIDIA/KAI-scheduler/test/e2e/modules/context"
"github.com/NVIDIA/KAI-scheduler/test/e2e/modules/wait"
)

// SetPluginEnabled patches the default scheduling shard so the named plugin
// is explicitly enabled or disabled, then waits for the scheduler deployment
// to come back up with the new configuration.
func SetPluginEnabled(
	ctx context.Context, testCtx *testContext.TestContext, pluginName string, enabled bool,
) error {
	setEnabled := func(shard *kaiv1.SchedulingShard) {
		plugins := shard.Spec.Plugins
		if plugins == nil {
			// Writing to a nil map would panic; lazily initialize it.
			plugins = make(map[string]kaiv1.PluginConfig)
			shard.Spec.Plugins = plugins
		}
		// PluginConfig is a value type: read-modify-write the map entry.
		pluginCfg := plugins[pluginName]
		pluginCfg.Enabled = ptr.To(enabled)
		plugins[pluginName] = pluginCfg
	}
	return patchPlugin(ctx, testCtx, setEnabled)
}

// UnsetPlugin removes any explicit configuration for the named plugin from
// the default scheduling shard, then waits for the scheduler deployment to
// come back up without it.
func UnsetPlugin(
	ctx context.Context, testCtx *testContext.TestContext, pluginName string,
) error {
	removePlugin := func(shard *kaiv1.SchedulingShard) {
		// delete on a nil map is a no-op, so no nil check is needed.
		delete(shard.Spec.Plugins, pluginName)
	}
	return patchPlugin(ctx, testCtx, removePlugin)
}

// patchPlugin applies mutateFn to the "default" SchedulingShard and then
// blocks until the scheduler deployment's pods are running again, so callers
// observe the patched plugin configuration in effect before proceeding.
func patchPlugin(
	ctx context.Context, testCtx *testContext.TestContext,
	mutateFn func(shard *kaiv1.SchedulingShard),
) error {
	err := configurations.PatchSchedulingShard(ctx, testCtx, "default", mutateFn)
	if err != nil {
		return err
	}
	// The scheduler restarts to pick up the patched shard; wait for it.
	wait.WaitForDeploymentPodsRunning(
		ctx, testCtx.ControllerClient, constant.SchedulerDeploymentName, constants.DefaultKAINamespace,
	)
	return nil
}
Loading
Loading