Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Fixed admission webhook to skip runtimeClassName injection when gpuPodRuntimeClassName is empty [#1035](https://github.com/NVIDIA/KAI-Scheduler/pull/1035)
- Fixed topology-migration helm hook failing on OpenShift due to missing `kai-topology-migration` service account in the `kai-system` SCC [#1050](https://github.com/NVIDIA/KAI-Scheduler/pull/1050)
- Fixed a bug where queue status did not reflect its podgroups resources correctly [#1049](https://github.com/NVIDIA/KAI-Scheduler/pull/1049)
- Fixed `spread` node placement strategy ignoring fractional GPU usage on shared GPUs, causing nondeterministic pod placement instead of even spreading
- Fixed helm uninstall not removing webhooks [#959](https://github.com/NVIDIA/KAI-Scheduler/pull/959) [faizan-exe](https://github.com/faizan-exe)
- Fixed security vulnerability where PodGang could reference pods in other namespaces, preventing cross-namespace manipulation
- Fixed pod controller logging to use request namespace/name instead of empty pod object fields when pod is not found
Expand Down
151 changes: 151 additions & 0 deletions pkg/scheduler/actions/allocate/allocateFractionalGpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1642,3 +1642,154 @@ func getFractionalGPUTestsMetadata() []integration_tests_utils.TestTopologyMetad
},
}
}

// TestFillNodeSpreadStrategyPackGPUs reproduces the E2E fill_node_test scenario:
// 2 nodes (2 GPUs each), 4 pending pods (0.5 GPU each), spread across nodes + pack GPUs.
// With spread strategy, pods should alternate across nodes. With gpupack, pods on the
// same node should share a single GPU. Expected: 2 pods per node, 1 reservation per node.
func TestFillNodeSpreadStrategyPackGPUs(t *testing.T) {
test_utils.InitTestingInfrastructure()
controller := NewController(t)
defer controller.Finish()
defer gock.Off()

testsMetadata := getFillNodeSpreadPackTestsMetadata()
for testNumber, testMetadata := range testsMetadata {
t.Logf("Running test %d: %s", testNumber, testMetadata.TestTopologyBasic.Name)

ssn := test_utils.BuildSession(testMetadata.TestTopologyBasic, controller)
allocateAction := allocate.New()
allocateAction.Execute(ssn)

test_utils.MatchExpectedAndRealTasks(t, testNumber, testMetadata.TestTopologyBasic, ssn)

// Additionally verify GPU packing: count distinct GPU groups per node
reservationsByNode := map[string]map[string]bool{}
for _, job := range ssn.ClusterInfo.PodGroupInfos {
for _, task := range job.GetAllPodsMap() {
if task.NodeName == "" {
continue
}
if reservationsByNode[task.NodeName] == nil {
reservationsByNode[task.NodeName] = map[string]bool{}
}
for _, gpuGroup := range task.GPUGroups {
reservationsByNode[task.NodeName][gpuGroup] = true
}
}
}

for nodeName, gpuGroups := range reservationsByNode {
if len(gpuGroups) != 1 {
t.Errorf("Test %d: %s - expected 1 GPU group on node %s (pack behavior), got %d: %v",
testNumber, testMetadata.TestTopologyBasic.Name, nodeName, len(gpuGroups), gpuGroups)
}
}
Comment thread
enoodle marked this conversation as resolved.
}
}

// getFillNodeSpreadPackTestsMetadata builds the topology for the fill-node
// reproduction test: two 2-GPU nodes and four pending 0.5-GPU single-task
// jobs in one queue, scheduled with the spread node-placement strategy plus
// the gpupack plugin.
func getFillNodeSpreadPackTestsMetadata() []integration_tests_utils.TestTopologyMetadata {
	jobNames := []string{"pending_job0", "pending_job1", "pending_job2", "pending_job3"}

	// All four pending jobs are identical apart from their name, so build
	// the job list and the matching expectations in a single loop.
	pendingJobs := make([]*jobs_fake.TestJobBasic, 0, len(jobNames))
	expectedResults := make(map[string]test_utils.TestExpectedResultBasic, len(jobNames))
	for _, jobName := range jobNames {
		pendingJobs = append(pendingJobs, &jobs_fake.TestJobBasic{
			Name:                jobName,
			Priority:            constants.PriorityBuildNumber,
			QueueName:           "queue0",
			RequiredGPUsPerTask: 0.5,
			Tasks: []*tasks_fake.TestTaskBasic{
				{State: pod_status.Pending},
			},
		})
		expectedResults[jobName] = test_utils.TestExpectedResultBasic{
			GPUsRequired:         0.5,
			Status:               pod_status.Binding,
			DontValidateGPUGroup: true,
		}
	}

	return []integration_tests_utils.TestTopologyMetadata{
		{
			TestTopologyBasic: test_utils.TestTopologyBasic{
				Name: "E2E fill_node reproduction: 4 pending pods, spread nodes, pack GPUs - all from empty state",
				Jobs: pendingJobs,
				Nodes: map[string]nodes_fake.TestNodeBasic{
					"node0": {GPUs: 2},
					"node1": {GPUs: 2},
				},
				Queues: []test_utils.TestQueueBasic{
					{
						Name:         "queue0",
						DeservedGPUs: 4,
					},
				},
				JobExpectedResults: expectedResults,
				Mocks: &test_utils.TestMock{
					CacheRequirements: &test_utils.CacheMocking{
						NumberOfCacheBinds: 5,
					},
					SchedulerConf: &conf.SchedulerConfiguration{
						Actions: "allocate",
						Tiers: []conf.Tier{
							{
								Plugins: []conf.PluginOption{
									{
										Name: "nodeplacement",
										Arguments: map[string]string{
											constants.GPUResource: constants.SpreadStrategy,
											constants.CPUResource: constants.SpreadStrategy,
										},
									},
									{Name: "gpupack"},
									{Name: "proportion"},
									{Name: "priority"},
									{Name: "nodeavailability"},
									{Name: "resourcetype"},
								},
							},
						},
					},
				},
			},
		},
	}
}
73 changes: 73 additions & 0 deletions pkg/scheduler/plugins/nodeplacement/nodespread_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,78 @@ var _ = Describe("NodeSpread", func() {
Expect(actual).To(Equal(c.expected))
}
})

// Regression spec for the spread scorer: fractional usage on a shared GPU
// must lower a node's score relative to a fully idle node.
It("should account for shared GPU consumption when scoring", func() {
	// Node with 2 GPUs, one GPU has 0.5 allocated (50 of 100 memory)
	// Expected: (1 idle whole GPU + 0.5 available on shared) / 2 total = 0.75
	nodeWithShared := &node_info.NodeInfo{
		Node: &corev1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name: "node-with-shared",
				Labels: map[string]string{
					node_info.GpuCountLabel: "2",
				},
			},
		},
		Idle:                   resource_info.NewResource(0, 0, 1), // 1 whole idle GPU
		Releasing:              resource_info.EmptyResource(),
		MemoryOfEveryGpuOnNode: 100,
		// gpu-0 has half of its 100 memory units used and allocated,
		// i.e. 0.5 of that GPU remains available for sharing.
		GpuSharingNodeInfo: node_info.GpuSharingNodeInfo{
			ReleasingSharedGPUs:       map[string]bool{},
			UsedSharedGPUsMemory:      map[string]int64{"gpu-0": 50},
			ReleasingSharedGPUsMemory: map[string]int64{},
			AllocatedSharedGPUsMemory: map[string]int64{"gpu-0": 50},
		},
	}

	// Node with 2 GPUs, completely empty
	// Expected: 2 / 2 = 1.0
	nodeEmpty := &node_info.NodeInfo{
		Node: &corev1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name: "node-empty",
				Labels: map[string]string{
					node_info.GpuCountLabel: "2",
				},
			},
		},
		Idle:                   resource_info.NewResource(0, 0, 2), // 2 whole idle GPUs
		Releasing:              resource_info.EmptyResource(),
		MemoryOfEveryGpuOnNode: 100,
		GpuSharingNodeInfo: node_info.GpuSharingNodeInfo{
			ReleasingSharedGPUs:       map[string]bool{},
			UsedSharedGPUsMemory:      map[string]int64{},
			ReleasingSharedGPUsMemory: map[string]int64{},
			AllocatedSharedGPUsMemory: map[string]int64{},
		},
	}

	// Task requesting 1 whole GPU; only the node's availability ratio
	// should drive the score.
	task := &pod_info.PodInfo{
		ResReq: resource_info.NewResourceRequirementsWithGpus(1),
	}

	nodes := map[string]*node_info.NodeInfo{
		nodeWithShared.Name: nodeWithShared,
		nodeEmpty.Name:      nodeEmpty,
	}
	// Spread strategy for both GPU and CPU resources.
	plugin := nodeplacement.New(map[string]string{
		constants.GPUResource: constants.SpreadStrategy,
		constants.CPUResource: constants.SpreadStrategy,
	})
	ssn := createFakeTestSession(nodes)
	plugin.OnSessionOpen(ssn)
	// Take the most recently registered node-order fn — assumes
	// OnSessionOpen appended this plugin's scorer last (NOTE(review):
	// confirm if the plugin ever registers more than one fn).
	nof := ssn.NodeOrderFns[len(ssn.NodeOrderFns)-1]

	sharedScore, err := nof(task, nodeWithShared)
	Expect(err).NotTo(HaveOccurred())
	Expect(sharedScore).To(Equal(0.75))

	emptyScore, err := nof(task, nodeEmpty)
	Expect(err).NotTo(HaveOccurred())
	Expect(emptyScore).To(Equal(1.0))

	// The empty node should score higher (more available), ensuring spread
	Expect(emptyScore).To(BeNumerically(">", sharedScore))
})
})
})
17 changes: 12 additions & 5 deletions pkg/scheduler/plugins/nodeplacement/spread.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,24 @@ import (
func nodeResourceSpread(resourceName v1.ResourceName) api.NodeOrderFn {
return func(task *pod_info.PodInfo, node *node_info.NodeInfo) (float64, error) {
var resourceCount float64
var nonAllocated float64
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the same logic be applied to pack.go?


if resourceName == resource_info.GPUResourceName {
resourceCount = float64(node.GetNumberOfGPUsInNode())
if resourceCount == 0 {
return 0, nil
}
idleGPUs, _ := node.GetSumOfIdleGPUs()
releasingGPUs, _ := node.GetSumOfReleasingGPUs()
nonAllocated = idleGPUs + releasingGPUs
Comment on lines +26 to +28
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider exporting that on the node

} else {
resourceCount = node.Allocatable.Get(resourceName)
if resourceCount == 0 {
return 0, nil
}
nonAllocated = node.NonAllocatedResource(resourceName)
}

if resourceCount == 0 {
return 0, nil
}

nonAllocated := node.NonAllocatedResource(resourceName)
score := nonAllocated / resourceCount
log.InfraLogger.V(7).Infof("Estimating Task: <%v/%v> Job: <%v> for node: <%s> "+
"that has <%.2f/%f> non allocated %v. Score: %f",
Expand Down
55 changes: 55 additions & 0 deletions test/e2e/modules/configurations/feature_flags/plugins.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
Copyright 2025 NVIDIA CORPORATION
SPDX-License-Identifier: Apache-2.0
*/
package feature_flags

import (
"context"

"k8s.io/utils/ptr"

kaiv1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1"
"github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
"github.com/NVIDIA/KAI-scheduler/test/e2e/modules/configurations"
"github.com/NVIDIA/KAI-scheduler/test/e2e/modules/constant"
testContext "github.com/NVIDIA/KAI-scheduler/test/e2e/modules/context"
"github.com/NVIDIA/KAI-scheduler/test/e2e/modules/wait"
)

// SetPluginEnabled patches the default scheduling shard so the named plugin
// is explicitly enabled or disabled, then waits for the scheduler deployment
// to come back up with the new configuration.
func SetPluginEnabled(
	ctx context.Context, testCtx *testContext.TestContext, pluginName string, enabled bool,
) error {
	setEnabled := func(shard *kaiv1.SchedulingShard) {
		plugins := shard.Spec.Plugins
		if plugins == nil {
			// Writing to a nil map would panic; lazily initialize it.
			plugins = make(map[string]kaiv1.PluginConfig)
			shard.Spec.Plugins = plugins
		}
		// PluginConfig is a value type: read-modify-write the map entry.
		pluginCfg := plugins[pluginName]
		pluginCfg.Enabled = ptr.To(enabled)
		plugins[pluginName] = pluginCfg
	}
	return patchPlugin(ctx, testCtx, setEnabled)
}

// UnsetPlugin removes any explicit configuration for the named plugin from
// the default scheduling shard, then waits for the scheduler deployment to
// come back up without it.
func UnsetPlugin(
	ctx context.Context, testCtx *testContext.TestContext, pluginName string,
) error {
	removePlugin := func(shard *kaiv1.SchedulingShard) {
		// delete on a nil map is a no-op, so no nil check is needed.
		delete(shard.Spec.Plugins, pluginName)
	}
	return patchPlugin(ctx, testCtx, removePlugin)
}

// patchPlugin applies mutateFn to the "default" SchedulingShard and then
// blocks until the scheduler deployment's pods are running again, so callers
// observe the patched plugin configuration in effect before proceeding.
func patchPlugin(
	ctx context.Context, testCtx *testContext.TestContext,
	mutateFn func(shard *kaiv1.SchedulingShard),
) error {
	err := configurations.PatchSchedulingShard(ctx, testCtx, "default", mutateFn)
	if err != nil {
		return err
	}
	// The scheduler restarts to pick up the patched shard; wait for it.
	wait.WaitForDeploymentPodsRunning(
		ctx, testCtx.ControllerClient, constant.SchedulerDeploymentName, constants.DefaultKAINamespace,
	)
	return nil
}
Loading
Loading