From 4a7b05d6338212f861d5352cc7bea36794332494 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Sat, 7 Feb 2026 00:01:55 +0100 Subject: [PATCH] refactor(scheduler): migrate NodeInfo, PodInfo, and plugin resource reads to vectors Convert all read-path methods from Resource to ResourceVector operations: - NodeInfo: IsTaskAllocatable, FittingError, GetSumOfIdleGPUs, IsCPUOnlyNode - External plugins: proportion, topology, nodeplacement, nodeavailability, resourcetype - Framework: session logging, statement references - Error handling: pod_errors, job_errors Add AcceptedResourceVector to PodInfo. Add QuantifyVector util to proportion plugin. Rewrite topology calcNodeAccommodation from iterative pod probing to division-based vector approach. Resource fields still maintained via dual-write for backward compatibility until removal in subsequent commits. Co-Authored-By: Claude Opus 4.6 --- pkg/scheduler/actions/common/allocate.go | 6 +- .../actions/common/feasible_nodes_test.go | 10 + .../idle_gpus/idle_gpus.go | 4 +- .../idle_gpus/idle_gpus_test.go | 8 + .../solvers/pod_scenario_builder_test.go | 3 + .../actions/utils/job_order_by_queue_test.go | 54 ++-- pkg/scheduler/api/common_info/job_errors.go | 88 +++---- .../api/common_info/job_errors_test.go | 44 ++-- pkg/scheduler/api/common_info/pod_errors.go | 54 ++-- .../api/common_info/pod_errors_test.go | 5 +- .../api/node_info/gpu_sharing_node_info.go | 14 +- pkg/scheduler/api/node_info/node_info.go | 111 +++++--- pkg/scheduler/api/node_info/node_info_test.go | 42 ++- pkg/scheduler/api/pod_info/pod_info.go | 54 ++-- .../cache/cluster_info/cluster_info.go | 2 +- pkg/scheduler/framework/session.go | 12 +- pkg/scheduler/framework/statement.go | 4 +- .../nodeavailability/nodeavailability.go | 6 +- .../plugins/nodeplacement/nodepack_test.go | 1 + .../plugins/nodeplacement/nodespread_test.go | 28 +- pkg/scheduler/plugins/nodeplacement/pack.go | 4 +- pkg/scheduler/plugins/nodeplacement/spread.go | 2 +- 
.../capacity_policy/capacity_policy.go | 8 +- .../capacity_policy/capacity_policy_test.go | 199 ++++++++------ .../plugins/proportion/proportion.go | 52 ++-- .../plugins/proportion/proportion_test.go | 112 +++++--- .../proportion/reclaimable/reclaimable.go | 53 ++-- .../reclaimable/reclaimable_test.go | 72 +++-- .../proportion/reclaimable/reclaimer_info.go | 3 +- .../reclaimable/strategies/strategies.go | 20 +- .../reclaimable/strategies/strategies_test.go | 26 +- .../plugins/proportion/utils/utils.go | 25 ++ .../plugins/resourcetype/resourcetype.go | 4 +- .../plugins/topology/job_filtering.go | 156 +++++------ .../plugins/topology/job_filtering_test.go | 246 ++++++++++-------- .../plugins/topology/node_scoring_test.go | 120 ++++----- .../plugins/topology/topology_plugin.go | 9 + .../plugins/topology/topology_structs.go | 16 +- 38 files changed, 982 insertions(+), 695 deletions(-) diff --git a/pkg/scheduler/actions/common/allocate.go b/pkg/scheduler/actions/common/allocate.go index 30f07ed3a..39140af44 100644 --- a/pkg/scheduler/actions/common/allocate.go +++ b/pkg/scheduler/actions/common/allocate.go @@ -137,7 +137,7 @@ func allocateTask(ssn *framework.Session, stmt *framework.Statement, nodes []*no } log.InfraLogger.V(6).Infof("Looking for best node for task - Task: <%s/%s>, init requested: <%v>.", - task.Namespace, task.Name, task.ResReq) + task.Namespace, task.Name, task.ResReqVector) orderedNodes := ssn.OrderedNodesByTask(nodes, task) for _, node := range orderedNodes { @@ -175,7 +175,7 @@ func allocateTaskToNode(ssn *framework.Session, stmt *framework.Statement, task func bindTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo) bool { log.InfraLogger.V(6).Infof("Binding Task <%v/%v> to node <%v>, requires: %v GPUs", - task.Namespace, task.Name, node.Name, task.ResReq) + task.Namespace, task.Name, node.Name, task.ResReqVector) if err := stmt.Allocate(task, node.Name); err != nil { 
log.InfraLogger.Errorf("Failed to bind Task %v on %v in Session %v, err: %v", task.UID, node.Name, ssn.ID, err) @@ -186,7 +186,7 @@ func bindTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod func pipelineTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo, updateTasksIfExistsOnNode bool) bool { log.InfraLogger.V(6).Infof("Pipelining Task <%v/%v> to node <%v> requires: %v GPUs", - task.Namespace, task.Name, node.Name, task.ResReq) + task.Namespace, task.Name, node.Name, task.ResReqVector) if err := stmt.Pipeline(task, node.Name, updateTasksIfExistsOnNode); err != nil { log.InfraLogger.V(6).Infof("Failed to pipeline Task %v on %v in Session %v", task.UID, node.Name, ssn.ID) diff --git a/pkg/scheduler/actions/common/feasible_nodes_test.go b/pkg/scheduler/actions/common/feasible_nodes_test.go index 68283c07c..b45466b16 100644 --- a/pkg/scheduler/actions/common/feasible_nodes_test.go +++ b/pkg/scheduler/actions/common/feasible_nodes_test.go @@ -78,6 +78,16 @@ var ( allNodeNames = append(gpuNodeNames, cpuNode.Name) ) +func init() { + vectorMap := resource_info.NewResourceVectorMap() + vectorMap.AddResource("nvidia.com/mig-1g.10gb") + for _, node := range allNodes { + node.VectorMap = vectorMap + node.IdleVector = node.Idle.ToVector(vectorMap) + node.ReleasingVector = node.Releasing.ToVector(vectorMap) + } +} + func TestFeasibleNodes(t *testing.T) { tests := []struct { name string diff --git a/pkg/scheduler/actions/common/solvers/accumulated_scenario_filters/idle_gpus/idle_gpus.go b/pkg/scheduler/actions/common/solvers/accumulated_scenario_filters/idle_gpus/idle_gpus.go index 656f9295f..899c380f8 100644 --- a/pkg/scheduler/actions/common/solvers/accumulated_scenario_filters/idle_gpus/idle_gpus.go +++ b/pkg/scheduler/actions/common/solvers/accumulated_scenario_filters/idle_gpus/idle_gpus.go @@ -151,7 +151,7 @@ func (ig *AccumulatedIdleGpus) updateRequiredResources(scenario *scenario.ByNode var 
requiredResources []float64 for _, pod := range scenario.PendingTasks() { - requiredResources = append(requiredResources, pod.ResReq.GPUs()+float64(pod.ResReq.GetDraGpusCount())) + requiredResources = append(requiredResources, pod.ResReqVector.Get(pod.VectorMap.GetIndex("gpu"))) ig.pendingTasksInState[pod.UID] = true } sort.Sort(sort.Reverse(sort.Float64Slice(requiredResources))) @@ -174,7 +174,7 @@ func (ig *AccumulatedIdleGpus) updateWithVictim(task *pod_info.PodInfo, minIdleG } prevMinRelevantValue := ig.nodesNameToIdleGpus[minIdleGpusRelevant] - ig.nodesNameToIdleGpus[task.NodeName] += task.AcceptedResource.GPUs() + float64(task.AcceptedResource.GetDraGpusCount()) + ig.nodesNameToIdleGpus[task.NodeName] += task.AcceptedResourceVector.Get(task.VectorMap.GetIndex("gpu")) if ig.nodesNameToIdleGpus[task.NodeName] > prevMinRelevantValue { ig.maxFreeGpuNodesSorted = orderedInsert(ig.maxFreeGpuNodesSorted, task.NodeName, diff --git a/pkg/scheduler/actions/common/solvers/accumulated_scenario_filters/idle_gpus/idle_gpus_test.go b/pkg/scheduler/actions/common/solvers/accumulated_scenario_filters/idle_gpus/idle_gpus_test.go index 43262b196..63f6b849b 100644 --- a/pkg/scheduler/actions/common/solvers/accumulated_scenario_filters/idle_gpus/idle_gpus_test.go +++ b/pkg/scheduler/actions/common/solvers/accumulated_scenario_filters/idle_gpus/idle_gpus_test.go @@ -25,6 +25,8 @@ import ( "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/framework" ) +var testVectorMap = resource_info.NewResourceVectorMap() + func Test_orderedInsert(t *testing.T) { type args[T cmp.Ordered] struct { array []T @@ -232,6 +234,8 @@ func TestAccumulatedIdleGpus_updateWithVictim(t *testing.T) { 2, 0, ), }, + AcceptedResourceVector: resource_info.NewResourceVectorWithValues(0, 0, 2, testVectorMap), + VectorMap: testVectorMap, }, minIdleGpusRelevant: "n2", }, @@ -255,6 +259,8 @@ func TestAccumulatedIdleGpus_updateWithVictim(t *testing.T) { 2, 0, ), }, + AcceptedResourceVector: 
resource_info.NewResourceVectorWithValues(0, 0, 2, testVectorMap), + VectorMap: testVectorMap, }, minIdleGpusRelevant: "n2", }, @@ -278,6 +284,8 @@ func TestAccumulatedIdleGpus_updateWithVictim(t *testing.T) { 2, 0, ), }, + AcceptedResourceVector: resource_info.NewResourceVectorWithValues(0, 0, 2, testVectorMap), + VectorMap: testVectorMap, }, minIdleGpusRelevant: "n4", }, diff --git a/pkg/scheduler/actions/common/solvers/pod_scenario_builder_test.go b/pkg/scheduler/actions/common/solvers/pod_scenario_builder_test.go index 327823415..d278b934b 100644 --- a/pkg/scheduler/actions/common/solvers/pod_scenario_builder_test.go +++ b/pkg/scheduler/actions/common/solvers/pod_scenario_builder_test.go @@ -309,8 +309,11 @@ func initializeSession(jobsCount, tasksPerJob int) (*framework.Session, []*pod_i queueName := fmt.Sprintf("team-%d", jobID) newJob, jobTasks := createJobWithTasks(tasksPerJob, jobID, queueName, v1.PodRunning, []v1.ResourceRequirements{requireOneGPU()}) jobs = append(jobs, newJob) + allocatedVector := newJob.Allocated.ToVector(vectorMap) node.Allocatable.Add(newJob.Allocated) + node.AllocatableVector.Add(allocatedVector) node.Idle.Add(newJob.Allocated) + node.IdleVector.Add(allocatedVector) _ = node.AddTasksToNode(jobTasks, map[common_info.PodID]*pod_info.PodInfo{}) tasks = append(tasks, jobTasks...) 
queues = append(queues, createQueue(queueName)) diff --git a/pkg/scheduler/actions/utils/job_order_by_queue_test.go b/pkg/scheduler/actions/utils/job_order_by_queue_test.go index 8cbe6d7d1..572c4a458 100644 --- a/pkg/scheduler/actions/utils/job_order_by_queue_test.go +++ b/pkg/scheduler/actions/utils/job_order_by_queue_test.go @@ -39,6 +39,8 @@ const ( testPod = "p1" ) +var testVectorMap = resource_info.NewResourceVectorMap() + func TestNumericalPriorityWithinSameQueue(t *testing.T) { ssn := newPrioritySession(t) @@ -210,16 +212,16 @@ func TestVictimQueue_PopNextJob(t *testing.T) { PodStatusIndex: map[pod_status.PodStatus]pod_info.PodsMap{ pod_status.Allocated: { "p1": { - UID: "p1", - AcceptedResource: resource_info.NewResourceRequirements( - 1, - 1000, - 1024, - ), + UID: "p1", + VectorMap: testVectorMap, + AcceptedResource: resource_info.NewResourceRequirements(1, 1000, 1024), + AcceptedResourceVector: resource_info.NewResourceRequirements(1, 1000, 1024).ToVector(testVectorMap), }, }, }, - Allocated: resource_info.NewResource(1000, 1024, 1), + Allocated: resource_info.NewResource(1000, 1024, 1), + VectorMap: testVectorMap, + AllocatedVector: resource_info.NewResource(1000, 1024, 1).ToVector(testVectorMap), }, "q1j2": { Name: "q1j2", @@ -228,16 +230,16 @@ func TestVictimQueue_PopNextJob(t *testing.T) { PodStatusIndex: map[pod_status.PodStatus]pod_info.PodsMap{ pod_status.Allocated: { "p1": { - UID: "p1", - AcceptedResource: resource_info.NewResourceRequirements( - 1, - 1000, - 1024, - ), + UID: "p1", + VectorMap: testVectorMap, + AcceptedResource: resource_info.NewResourceRequirements(1, 1000, 1024), + AcceptedResourceVector: resource_info.NewResourceRequirements(1, 1000, 1024).ToVector(testVectorMap), }, }, }, - Allocated: resource_info.NewResource(1000, 1024, 1), + Allocated: resource_info.NewResource(1000, 1024, 1), + VectorMap: testVectorMap, + AllocatedVector: resource_info.NewResource(1000, 1024, 1).ToVector(testVectorMap), }, "q1j3": { Name: "q1j3", @@ 
-246,7 +248,9 @@ func TestVictimQueue_PopNextJob(t *testing.T) { PodStatusIndex: map[pod_status.PodStatus]pod_info.PodsMap{ pod_status.Allocated: { "p1": { - UID: "p1", + UID: "p1", + VectorMap: testVectorMap, + AcceptedResourceVector: resource_info.NewResourceRequirements(1, 1000, 1024).ToVector(testVectorMap), AcceptedResource: resource_info.NewResourceRequirements( 1, 1000, @@ -264,7 +268,9 @@ func TestVictimQueue_PopNextJob(t *testing.T) { PodStatusIndex: map[pod_status.PodStatus]pod_info.PodsMap{ pod_status.Allocated: { "p1": { - UID: "p1", + UID: "p1", + VectorMap: testVectorMap, + AcceptedResourceVector: resource_info.NewResourceRequirements(1, 1000, 1024).ToVector(testVectorMap), AcceptedResource: resource_info.NewResourceRequirements( 1, 1000, @@ -282,7 +288,9 @@ func TestVictimQueue_PopNextJob(t *testing.T) { PodStatusIndex: map[pod_status.PodStatus]pod_info.PodsMap{ pod_status.Allocated: { "p1": { - UID: "p1", + UID: "p1", + VectorMap: testVectorMap, + AcceptedResourceVector: resource_info.NewResourceRequirements(1, 1000, 1024).ToVector(testVectorMap), AcceptedResource: resource_info.NewResourceRequirements( 1, 1000, @@ -291,7 +299,9 @@ func TestVictimQueue_PopNextJob(t *testing.T) { }, }, }, - Allocated: resource_info.NewResource(1000, 1024, 1), + Allocated: resource_info.NewResource(1000, 1024, 1), + VectorMap: testVectorMap, + AllocatedVector: resource_info.NewResource(1000, 1024, 1).ToVector(testVectorMap), }, "q2j3": { Name: "q2j3", @@ -300,7 +310,9 @@ func TestVictimQueue_PopNextJob(t *testing.T) { PodStatusIndex: map[pod_status.PodStatus]pod_info.PodsMap{ pod_status.Allocated: { "p1": { - UID: "p1", + UID: "p1", + VectorMap: testVectorMap, + AcceptedResourceVector: resource_info.NewResourceRequirements(1, 1000, 1024).ToVector(testVectorMap), AcceptedResource: resource_info.NewResourceRequirements( 1, 1000, @@ -309,7 +321,9 @@ func TestVictimQueue_PopNextJob(t *testing.T) { }, }, }, - Allocated: resource_info.NewResource(1000, 1024, 1), + 
Allocated: resource_info.NewResource(1000, 1024, 1), + VectorMap: testVectorMap, + AllocatedVector: resource_info.NewResource(1000, 1024, 1).ToVector(testVectorMap), }, }, expectedJobNames: []string{"q1j3", "q2j3", "q1j2", "q2j2", "q1j1", "q2j1"}, diff --git a/pkg/scheduler/api/common_info/job_errors.go b/pkg/scheduler/api/common_info/job_errors.go index 8d9f13617..1493ca5a1 100644 --- a/pkg/scheduler/api/common_info/job_errors.go +++ b/pkg/scheduler/api/common_info/job_errors.go @@ -10,8 +10,10 @@ import ( "strings" enginev2alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2alpha2" + "github.com/NVIDIA/KAI-scheduler/pkg/common/constants" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/resource_info" "github.com/dustin/go-humanize" + v1 "k8s.io/api/core/v1" ) const ( @@ -141,68 +143,52 @@ func (f *TopologyFitError) DetailedMessage() string { func NewTopologyInsufficientResourcesError( jobName, subGroupName, namespace, domainID string, - resourceRequested *resource_info.Resource, availableResource *resource_info.Resource, + resourceRequested resource_info.ResourceVector, availableResource resource_info.ResourceVector, + vectorMap *resource_info.ResourceVectorMap, ) *TopologyFitError { var shortMessages []string var detailedMessages []string - if len(resourceRequested.MigResources()) > 0 { - for migProfile, quant := range resourceRequested.MigResources() { - availableMigProfilesQuant := int64(0) - if _, found := availableResource.ScalarResources()[migProfile]; found { - availableMigProfilesQuant = availableResource.ScalarResources()[migProfile] - } - if availableMigProfilesQuant < quant { - detailedMessages = append(detailedMessages, - fmt.Sprintf("%s didn't have enough resource: %s, requested: %d, available: %d", - domainID, migProfile, quant, availableMigProfilesQuant)) - shortMessages = append(shortMessages, fmt.Sprintf("node-group(s) didn't have enough of mig profile: %s", - migProfile)) - } + for i := 0; i < vectorMap.Len(); i++ { + resourceName := 
vectorMap.ResourceAt(i) + requested := resourceRequested.Get(i) + available := availableResource.Get(i) + if requested <= available { + continue } - } else { - requestedGPUs := resourceRequested.GPUs() - availableGPUs := availableResource.GPUs() - if requestedGPUs > availableGPUs { + + if resource_info.IsMigResource(v1.ResourceName(resourceName)) { + detailedMessages = append(detailedMessages, + fmt.Sprintf("%s didn't have enough resource: %s, requested: %d, available: %d", + domainID, resourceName, int64(requested), int64(available))) + shortMessages = append(shortMessages, fmt.Sprintf("node-group(s) didn't have enough of mig profile: %s", + resourceName)) + } else if resourceName == constants.GpuResource { detailedMessages = append(detailedMessages, fmt.Sprintf("%s didn't have enough resource: GPUs, requested: %s, available: %s", domainID, - strconv.FormatFloat(requestedGPUs, 'g', 3, 64), - strconv.FormatFloat(availableGPUs, 'g', 3, 64), + strconv.FormatFloat(requested, 'g', 3, 64), + strconv.FormatFloat(available, 'g', 3, 64), )) shortMessages = append(shortMessages, "node-group(s) didn't have enough resources: GPUs") - } - } - - requestedCPUs := int64(resourceRequested.Cpu()) - availableCPUs := int64(availableResource.Cpu()) - if requestedCPUs > availableCPUs { - detailedMessages = append(detailedMessages, fmt.Sprintf("%s didn't have enough resources: CPU cores, requested: %s, available: %s", - domainID, - humanize.FtoaWithDigits(resourceRequested.Cpu()/resource_info.MilliCPUToCores, 3), - humanize.FtoaWithDigits(availableResource.Cpu()/resource_info.MilliCPUToCores, 3), - )) - shortMessages = append(shortMessages, "node-group(s) didn't have enough resources: CPU cores") - } - - if resourceRequested.Memory() > availableResource.Memory() { - detailedMessages = append(detailedMessages, fmt.Sprintf("%s didn't have enough resources: memory, requested: %s, available: %s", - domainID, - humanize.FtoaWithDigits(resourceRequested.Memory()/resource_info.MemoryToGB, 3), 
- humanize.FtoaWithDigits(availableResource.Memory()/resource_info.MemoryToGB, 3), - )) - shortMessages = append(shortMessages, "node-group(s) didn't have enough resources: memory") - } - - for requestedResourceName, requestedResourceQuant := range resourceRequested.ScalarResources() { - availableResourceQuant := int64(0) - if _, found := availableResource.ScalarResources()[requestedResourceName]; found { - availableResourceQuant = availableResource.ScalarResources()[requestedResourceName] - } - if availableResourceQuant < requestedResourceQuant { + } else if resourceName == string(v1.ResourceCPU) { + detailedMessages = append(detailedMessages, fmt.Sprintf("%s didn't have enough resources: CPU cores, requested: %s, available: %s", + domainID, + humanize.FtoaWithDigits(requested/resource_info.MilliCPUToCores, 3), + humanize.FtoaWithDigits(available/resource_info.MilliCPUToCores, 3), + )) + shortMessages = append(shortMessages, "node-group(s) didn't have enough resources: CPU cores") + } else if resourceName == string(v1.ResourceMemory) { + detailedMessages = append(detailedMessages, fmt.Sprintf("%s didn't have enough resources: memory, requested: %s, available: %s", + domainID, + humanize.FtoaWithDigits(requested/resource_info.MemoryToGB, 3), + humanize.FtoaWithDigits(available/resource_info.MemoryToGB, 3), + )) + shortMessages = append(shortMessages, "node-group(s) didn't have enough resources: memory") + } else { detailedMessages = append(detailedMessages, fmt.Sprintf("%s didn't have enough resource: %s, requested: %d, available: %d", - domainID, requestedResourceName, requestedResourceQuant, availableResourceQuant)) + domainID, resourceName, int64(requested), int64(available))) shortMessages = append(shortMessages, fmt.Sprintf("node-group(s) didn't have enough resources: %s", - requestedResourceName)) + resourceName)) } } diff --git a/pkg/scheduler/api/common_info/job_errors_test.go b/pkg/scheduler/api/common_info/job_errors_test.go index 5e8b3a34b..20f93a189 
100644 --- a/pkg/scheduler/api/common_info/job_errors_test.go +++ b/pkg/scheduler/api/common_info/job_errors_test.go @@ -14,6 +14,14 @@ import ( "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/resource_info" ) +var errorsTestVectorMap *resource_info.ResourceVectorMap + +func init() { + errorsTestVectorMap = resource_info.NewResourceVectorMap() + errorsTestVectorMap.AddResource("nvidia.com/mig-1g.5gb") + errorsTestVectorMap.AddResource("custom.io/res") +} + func TestJobFitErrorsToDetailedMessage(t *testing.T) { tests := []struct { name string @@ -304,8 +312,8 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { subGroupName string namespace string domainID string - resourceRequested *resource_info.Resource - availableResource *resource_info.Resource + resourceRequested resource_info.ResourceVector + availableResource resource_info.ResourceVector } tests := []struct { name string @@ -319,8 +327,8 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { subGroupName: "subgroup1", namespace: "namespace1", domainID: "domain1", - resourceRequested: BuildResource("1500m", "1M"), - availableResource: BuildResource("1000m", "2M"), + resourceRequested: BuildResource("1500m", "1M").ToVector(errorsTestVectorMap), + availableResource: BuildResource("1000m", "2M").ToVector(errorsTestVectorMap), }, want: &TopologyFitError{ JobFitErrorBase: JobFitErrorBase{ @@ -341,8 +349,8 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { subGroupName: "subgroup1", namespace: "namespace1", domainID: "domain1", - resourceRequested: BuildResource("1000m", "3M"), - availableResource: BuildResource("2000m", "2M"), + resourceRequested: BuildResource("1000m", "3M").ToVector(errorsTestVectorMap), + availableResource: BuildResource("2000m", "2M").ToVector(errorsTestVectorMap), }, want: &TopologyFitError{ JobFitErrorBase: JobFitErrorBase{ @@ -363,8 +371,8 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { subGroupName: "subgroup1", namespace: 
"namespace1", domainID: "domain1", - resourceRequested: BuildResourceWithGpu("1000m", "1M", "2", "1"), - availableResource: BuildResourceWithGpu("2000m", "2M", "1", "110"), + resourceRequested: BuildResourceWithGpu("1000m", "1M", "2", "1").ToVector(errorsTestVectorMap), + availableResource: BuildResourceWithGpu("2000m", "2M", "1", "110").ToVector(errorsTestVectorMap), }, want: &TopologyFitError{ JobFitErrorBase: JobFitErrorBase{ @@ -387,10 +395,10 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { domainID: "domain1", resourceRequested: resource_info.ResourceFromResourceList( BuildResourceListWithMig("1000m", "1M", "nvidia.com/mig-1g.5gb", "nvidia.com/mig-1g.5gb"), - ), + ).ToVector(errorsTestVectorMap), availableResource: resource_info.ResourceFromResourceList( BuildResourceListWithMig("2000m", "2M", "nvidia.com/mig-1g.5gb"), - ), + ).ToVector(errorsTestVectorMap), }, want: &TopologyFitError{ JobFitErrorBase: JobFitErrorBase{ @@ -398,14 +406,11 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { jobName: "job1", subGroupName: "subgroup1", reason: UnschedulableWorkloadReason, - // MIG resources appear twice: once in MigResources() and once in ScalarResources() messages: []string{ "node-group(s) didn't have enough of mig profile: nvidia.com/mig-1g.5gb", - "node-group(s) didn't have enough resources: nvidia.com/mig-1g.5gb", }, detailedMessages: []string{ "domain1 didn't have enough resource: nvidia.com/mig-1g.5gb, requested: 2, available: 1", - "domain1 didn't have enough resource: nvidia.com/mig-1g.5gb, requested: 2, available: 1", }, }, nodesGroupName: "domain1", @@ -424,14 +429,14 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { v1.ResourceMemory: resource.MustParse("1M"), v1.ResourceName("custom.io/res"): resource.MustParse("5"), }, - ), + ).ToVector(errorsTestVectorMap), availableResource: resource_info.ResourceFromResourceList( v1.ResourceList{ v1.ResourceCPU: resource.MustParse("2000m"), v1.ResourceMemory: 
resource.MustParse("2M"), v1.ResourceName("custom.io/res"): resource.MustParse("3"), }, - ), + ).ToVector(errorsTestVectorMap), }, want: &TopologyFitError{ JobFitErrorBase: JobFitErrorBase{ @@ -452,8 +457,8 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { subGroupName: "subgroup1", namespace: "namespace1", domainID: "domain1", - resourceRequested: BuildResourceWithGpu("2000m", "3M", "2", "1"), - availableResource: BuildResourceWithGpu("1000m", "2M", "1", "110"), + resourceRequested: BuildResourceWithGpu("2000m", "3M", "2", "1").ToVector(errorsTestVectorMap), + availableResource: BuildResourceWithGpu("1000m", "2M", "1", "110").ToVector(errorsTestVectorMap), }, want: &TopologyFitError{ JobFitErrorBase: JobFitErrorBase{ @@ -462,14 +467,14 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { subGroupName: "subgroup1", reason: UnschedulableWorkloadReason, messages: []string{ - "node-group(s) didn't have enough resources: GPUs", "node-group(s) didn't have enough resources: CPU cores", "node-group(s) didn't have enough resources: memory", + "node-group(s) didn't have enough resources: GPUs", }, detailedMessages: []string{ - "domain1 didn't have enough resource: GPUs, requested: 2, available: 1", "domain1 didn't have enough resources: CPU cores, requested: 2, available: 1", "domain1 didn't have enough resources: memory, requested: 0.003, available: 0.002", + "domain1 didn't have enough resource: GPUs, requested: 2, available: 1", }, }, nodesGroupName: "domain1", @@ -485,6 +490,7 @@ func TestNewTopologyInsufficientResourcesError(t *testing.T) { tt.args.domainID, tt.args.resourceRequested, tt.args.availableResource, + errorsTestVectorMap, ) if !reflect.DeepEqual(got, tt.want) { t.Errorf("NewTopologyInsufficientResourcesError() = %v, want %v", got, tt.want) diff --git a/pkg/scheduler/api/common_info/pod_errors.go b/pkg/scheduler/api/common_info/pod_errors.go index ac95cf794..05357a838 100644 --- a/pkg/scheduler/api/common_info/pod_errors.go +++ 
b/pkg/scheduler/api/common_info/pod_errors.go @@ -10,6 +10,7 @@ import ( "strings" "github.com/dustin/go-humanize" + v1 "k8s.io/api/core/v1" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/resource_info" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/k8s_internal" @@ -59,27 +60,31 @@ func NewFitErrorByReasons(name, namespace, nodeName string, err error, reasons . func NewFitErrorInsufficientResource( name, namespace, nodeName string, - resourceRequested *resource_info.ResourceRequirements, usedResource, capacityResource *resource_info.Resource, + resourceRequested *resource_info.ResourceRequirements, + usedVector, capacityVector resource_info.ResourceVector, + vectorMap *resource_info.ResourceVectorMap, capacityGpuMemory int64, gangSchedulingJob bool, messageSuffix string, ) *TasksFitError { - availableResource := capacityResource.Clone() - availableResource.Sub(usedResource) + availableVector := capacityVector.Clone() + availableVector.Sub(usedVector) + + gpuIdx := vectorMap.GetIndex(resource_info.GPUResourceName) + cpuIdx := vectorMap.GetIndex(string(v1.ResourceCPU)) + memIdx := vectorMap.GetIndex(string(v1.ResourceMemory)) + var shortMessages []string var detailedMessages []string if len(resourceRequested.MigResources()) > 0 { for migProfile, quant := range resourceRequested.MigResources() { - availableMigProfilesQuant := int64(0) - capacityMigProfilesQuant := int64(0) - if _, found := availableResource.ScalarResources()[migProfile]; found { - availableMigProfilesQuant = availableResource.ScalarResources()[migProfile] - capacityMigProfilesQuant = capacityResource.ScalarResources()[migProfile] - } + migIdx := vectorMap.GetIndex(string(migProfile)) + availableMigProfilesQuant := int64(availableVector.Get(migIdx)) + capacityMigProfilesQuant := int64(capacityVector.Get(migIdx)) if availableMigProfilesQuant < quant { detailedMessages = append(detailedMessages, k8s_internal.NewInsufficientResourceErrorScalarResources( migProfile, quant, - 
usedResource.ScalarResources()[migProfile], + int64(usedVector.Get(migIdx)), capacityMigProfilesQuant, gangSchedulingJob)) shortMessages = append(shortMessages, fmt.Sprintf("node(s) didn't have enough of mig profile: %s", @@ -88,13 +93,13 @@ func NewFitErrorInsufficientResource( } } else { requestedGPUs := resourceRequested.GPUs() - availableGPUs := availableResource.GPUs() + availableGPUs := availableVector.Get(gpuIdx) if requestedGPUs > availableGPUs { detailedMessages = append(detailedMessages, k8s_internal.NewInsufficientResourceError( "GPUs", resourceRequested.GpusAsString(), - strconv.FormatFloat(usedResource.GPUs(), 'g', 3, 64), - strconv.FormatFloat(capacityResource.GPUs(), 'g', 3, 64), + strconv.FormatFloat(usedVector.Get(gpuIdx), 'g', 3, 64), + strconv.FormatFloat(capacityVector.Get(gpuIdx), 'g', 3, 64), gangSchedulingJob)) shortMessages = append(shortMessages, "node(s) didn't have enough resources: GPUs") } @@ -107,39 +112,36 @@ func NewFitErrorInsufficientResource( } requestedCPUs := int64(resourceRequested.Cpu()) - availableCPUs := int64(availableResource.Cpu()) + availableCPUs := int64(availableVector.Get(cpuIdx)) if requestedCPUs > availableCPUs { detailedMessages = append(detailedMessages, k8s_internal.NewInsufficientResourceError( "CPU cores", humanize.FtoaWithDigits(resourceRequested.Cpu()/resource_info.MilliCPUToCores, 3), - humanize.FtoaWithDigits(usedResource.Cpu()/resource_info.MilliCPUToCores, 3), - humanize.FtoaWithDigits(capacityResource.Cpu()/resource_info.MilliCPUToCores, 3), + humanize.FtoaWithDigits(usedVector.Get(cpuIdx)/resource_info.MilliCPUToCores, 3), + humanize.FtoaWithDigits(capacityVector.Get(cpuIdx)/resource_info.MilliCPUToCores, 3), gangSchedulingJob)) shortMessages = append(shortMessages, "node(s) didn't have enough resources: CPU cores") } - if resourceRequested.Memory() > availableResource.Memory() { + if resourceRequested.Memory() > availableVector.Get(memIdx) { detailedMessages = append(detailedMessages, 
k8s_internal.NewInsufficientResourceError( "memory", humanize.FtoaWithDigits(resourceRequested.Memory()/resource_info.MemoryToGB, 3), - humanize.FtoaWithDigits(usedResource.Memory()/resource_info.MemoryToGB, 3), - humanize.FtoaWithDigits(capacityResource.Memory()/resource_info.MemoryToGB, 3), + humanize.FtoaWithDigits(usedVector.Get(memIdx)/resource_info.MemoryToGB, 3), + humanize.FtoaWithDigits(capacityVector.Get(memIdx)/resource_info.MemoryToGB, 3), gangSchedulingJob)) shortMessages = append(shortMessages, "node(s) didn't have enough resources: memory") } for requestedResourceName, requestedResourceQuant := range resourceRequested.ScalarResources() { - availableResourceQuant := int64(0) - capacityResourceQuant := int64(0) - if _, found := availableResource.ScalarResources()[requestedResourceName]; found { - availableResourceQuant = availableResource.ScalarResources()[requestedResourceName] - capacityResourceQuant = capacityResource.ScalarResources()[requestedResourceName] - } + scalarIdx := vectorMap.GetIndex(string(requestedResourceName)) + availableResourceQuant := int64(availableVector.Get(scalarIdx)) + capacityResourceQuant := int64(capacityVector.Get(scalarIdx)) if availableResourceQuant < requestedResourceQuant { detailedMessages = append(detailedMessages, k8s_internal.NewInsufficientResourceErrorScalarResources( requestedResourceName, requestedResourceQuant, - usedResource.ScalarResources()[requestedResourceName], capacityResourceQuant, + int64(usedVector.Get(scalarIdx)), capacityResourceQuant, gangSchedulingJob)) shortMessages = append(shortMessages, fmt.Sprintf("node(s) didn't have enough resources: %s", requestedResourceName)) diff --git a/pkg/scheduler/api/common_info/pod_errors_test.go b/pkg/scheduler/api/common_info/pod_errors_test.go index 67f751282..fa630de03 100644 --- a/pkg/scheduler/api/common_info/pod_errors_test.go +++ b/pkg/scheduler/api/common_info/pod_errors_test.go @@ -42,6 +42,7 @@ func TestFitErrors_Error(t *testing.T) { } func 
TestNewFitErrorInsufficientResource(t *testing.T) { + vectorMap := resource_info.NewResourceVectorMap() type args struct { name string namespace string @@ -188,8 +189,10 @@ func TestNewFitErrorInsufficientResource(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + usedVector := tt.args.usedResource.ToVector(vectorMap) + capacityVector := tt.args.capacityResource.ToVector(vectorMap) if got := NewFitErrorInsufficientResource(tt.args.name, tt.args.namespace, tt.args.nodeName, - tt.args.resourceRequested, tt.args.usedResource, tt.args.capacityResource, tt.args.capacityGpuMemory, + tt.args.resourceRequested, usedVector, capacityVector, vectorMap, tt.args.capacityGpuMemory, tt.args.gangSchedulingJob, tt.args.suffix); !reflect.DeepEqual(got, tt.want) { t.Errorf("NewFitErrorInsufficientResource() = %v, want %v", got, tt.want) } diff --git a/pkg/scheduler/api/node_info/gpu_sharing_node_info.go b/pkg/scheduler/api/node_info/gpu_sharing_node_info.go index 84e0b0f4f..e94674936 100644 --- a/pkg/scheduler/api/node_info/gpu_sharing_node_info.go +++ b/pkg/scheduler/api/node_info/gpu_sharing_node_info.go @@ -9,6 +9,7 @@ import ( "golang.org/x/exp/maps" + commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_info" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_status" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/resource_info" @@ -96,6 +97,7 @@ func (ni *NodeInfo) addSharedTaskResourcesPerPodGroup(task *pod_info.PodInfo, gp ni.UsedSharedGPUsMemory[gpuGroup] += ni.GetResourceGpuMemory(task.ResReq) singleGpu := resource_info.NewSingleGpuVector(ni.VectorMap) + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) switch task.Status { case pod_status.Releasing: @@ -109,7 +111,7 @@ func (ni *NodeInfo) addSharedTaskResourcesPerPodGroup(task *pod_info.PodInfo, gp ni.ReleasingVector.Add(singleGpu) ni.markSharedGpuAsReleasing(gpuGroup) } - if 
int(ni.GetNumberOfGPUsInNode()) < int(ni.Idle.GPUs())+ni.getNumberOfUsedGPUs() { + if int(ni.GetNumberOfGPUsInNode()) < int(ni.IdleVector.Get(gpuIdx))+ni.getNumberOfUsedGPUs() { ni.Idle.SubGPUs(1) ni.IdleVector.Sub(singleGpu) } @@ -127,7 +129,7 @@ func (ni *NodeInfo) addSharedTaskResourcesPerPodGroup(task *pod_info.PodInfo, gp if ni.UsedSharedGPUsMemory[gpuGroup] <= ni.GetResourceGpuMemory(task.ResReq) { // no other fractional was allocated here yet - if int(ni.GetNumberOfGPUsInNode()) < int(ni.Idle.GPUs())+ni.getNumberOfUsedGPUs() { + if int(ni.GetNumberOfGPUsInNode()) < int(ni.IdleVector.Get(gpuIdx))+ni.getNumberOfUsedGPUs() { ni.Idle.SubGPUs(1) ni.IdleVector.Sub(singleGpu) } @@ -176,6 +178,7 @@ func (ni *NodeInfo) removeSharedTaskResourcesPerPodGroup(task *pod_info.PodInfo, ni.UsedSharedGPUsMemory[gpuGroup] -= ni.GetResourceGpuMemory(task.ResReq) singleGpu := resource_info.NewSingleGpuVector(ni.VectorMap) + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) switch task.Status { case pod_status.Releasing: @@ -189,7 +192,7 @@ func (ni *NodeInfo) removeSharedTaskResourcesPerPodGroup(task *pod_info.PodInfo, if ni.UsedSharedGPUsMemory[gpuGroup] <= 0 { // is this the last releasing task for this gpu - if int(ni.GetNumberOfGPUsInNode()) >= int(ni.Idle.GPUs())+ni.getNumberOfUsedGPUs() { + if int(ni.GetNumberOfGPUsInNode()) >= int(ni.IdleVector.Get(gpuIdx))+ni.getNumberOfUsedGPUs() { ni.Idle.AddGPUs(1) ni.IdleVector.Add(singleGpu) } @@ -222,7 +225,7 @@ func (ni *NodeInfo) removeSharedTaskResourcesPerPodGroup(task *pod_info.PodInfo, if ni.UsedSharedGPUsMemory[gpuGroup] <= 0 { // no other fractional was allocated here yet - if int(ni.GetNumberOfGPUsInNode()) >= int(ni.Idle.GPUs())+ni.getNumberOfUsedGPUs() { + if int(ni.GetNumberOfGPUsInNode()) >= int(ni.IdleVector.Get(gpuIdx))+ni.getNumberOfUsedGPUs() { ni.Idle.AddGPUs(1) ni.IdleVector.Add(singleGpu) } @@ -284,7 +287,8 @@ func (ni *NodeInfo) getNumberOfUsedSharedGPUs() int { } func (ni *NodeInfo) 
getNumberOfUsedGPUs() int { - return int(ni.Used.GPUs()) + ni.getNumberOfUsedSharedGPUs() + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) + return int(ni.UsedVector.Get(gpuIdx)) + ni.getNumberOfUsedSharedGPUs() } func (ni *NodeInfo) GetNumberOfAllocatedSharedGPUs() int { diff --git a/pkg/scheduler/api/node_info/node_info.go b/pkg/scheduler/api/node_info/node_info.go index 53f746465..e2cbdece1 100644 --- a/pkg/scheduler/api/node_info/node_info.go +++ b/pkg/scheduler/api/node_info/node_info.go @@ -107,9 +107,7 @@ type NodeInfo struct { func NewNodeInfo(node *v1.Node, podAffinityInfo pod_affinity.NodePodAffinityInfo, vectorMap *resource_info.ResourceVectorMap) *NodeInfo { gpuMemory, exists := getNodeGpuMemory(node) - allocatableVector := resource_info.NewResourceVectorFromResourceList( - node.Status.Allocatable, vectorMap, - ) + allocatableVector := resource_info.ResourceFromResourceList(node.Status.Allocatable).ToVector(vectorMap) idleVector := allocatableVector.Clone() usedVector := resource_info.NewResourceVector(vectorMap) releasingVector := resource_info.NewResourceVector(vectorMap) @@ -161,8 +159,18 @@ func (ni *NodeInfo) NonAllocatedResources() *resource_info.Resource { return nonAllocatedResource } +func (ni *NodeInfo) nonAllocatedVector() resource_info.ResourceVector { + v := ni.IdleVector.Clone() + v.Add(ni.ReleasingVector) + return v +} + func (ni *NodeInfo) NonAllocatedResource(resourceType v1.ResourceName) float64 { - return ni.Idle.Get(resourceType) + ni.Releasing.Get(resourceType) + idx := ni.VectorMap.GetIndex(string(resourceType)) + if idx < 0 { + return 0 + } + return ni.IdleVector.Get(idx) + ni.ReleasingVector.Get(idx) } func (ni *NodeInfo) IsTaskAllocatable(task *pod_info.PodInfo) bool { @@ -171,7 +179,7 @@ func (ni *NodeInfo) IsTaskAllocatable(task *pod_info.PodInfo) bool { return true } - if allocatable := ni.isTaskAllocatableOnNonAllocatedResources(task, ni.Idle); !allocatable { + if allocatable := 
ni.isTaskAllocatableOnNonAllocatedResources(task, ni.IdleVector); !allocatable { log.InfraLogger.V(7).Infof("Task GPU %s/%s is not allocatable on node %s", task.Namespace, task.Name, ni.Name) return false @@ -188,9 +196,9 @@ func (ni *NodeInfo) IsTaskAllocatable(task *pod_info.PodInfo) bool { } func (ni *NodeInfo) IsTaskAllocatableOnReleasingOrIdle(task *pod_info.PodInfo) bool { - nodeNonAllocatedResources := ni.NonAllocatedResources() + nodeNonAllocatedVector := ni.nonAllocatedVector() - if allocatable := ni.isTaskAllocatableOnNonAllocatedResources(task, nodeNonAllocatedResources); !allocatable { + if allocatable := ni.isTaskAllocatableOnNonAllocatedResources(task, nodeNonAllocatedVector); !allocatable { log.InfraLogger.V(7).Infof("Task GPU %s/%s is not allocatable on node %s", task.Namespace, task.Name, ni.Name) return false @@ -272,11 +280,12 @@ func (ni *NodeInfo) isTaskStorageAllocatableOnReleasingOrIdle(task *pod_info.Pod } func (ni *NodeInfo) FittingError(task *pod_info.PodInfo, isGangTask bool) *common_info.TasksFitError { - enoughResources := ni.lessEqualTaskToNodeResources(task.ResReq, ni.Idle) + enoughResources := ni.lessEqualTaskToNodeResources(task, ni.IdleVector) if !enoughResources { - totalUsed := ni.Used.Clone() - totalUsed.AddGPUs(float64(ni.getNumberOfUsedSharedGPUs())) - totalCapability := ni.Allocatable.Clone() + totalUsedVector := ni.UsedVector.Clone() + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) + totalUsedVector.Set(gpuIdx, totalUsedVector.Get(gpuIdx)+float64(ni.getNumberOfUsedSharedGPUs())) + totalCapabilityVector := ni.AllocatableVector.Clone() requestedResources := task.ResReq.Clone() if requestedResources.GpuMemory() > 0 { @@ -288,18 +297,18 @@ func (ni *NodeInfo) FittingError(task *pod_info.PodInfo, isGangTask bool) *commo messageSuffix := "" if len(task.Pod.Spec.Overhead) > 0 { // Adding to node idle instead of subtracting from pod requested resources - idleResourcesWithOverhead := ni.Idle.Clone() - 
idleResourcesWithOverhead.Add(resource_info.ResourceFromResourceList(task.Pod.Spec.Overhead)) - enoughResourcesWithoutOverhead := ni.lessEqualTaskToNodeResources(task.ResReq, idleResourcesWithOverhead) - if enoughResourcesWithoutOverhead { + overheadVector := resource_info.NewResourceVectorFromResourceList(task.Pod.Spec.Overhead, ni.VectorMap) + idleWithOverhead := ni.IdleVector.Clone() + idleWithOverhead.Add(overheadVector) + if ni.lessEqualTaskToNodeResources(task, idleWithOverhead) { messageSuffix = fmt.Sprintf("%s. The overhead resources are %v", common_info.OverheadMessage, k8s_utils.StringResourceList(task.Pod.Spec.Overhead)) } } fitError := common_info.NewFitErrorInsufficientResource( - task.Name, task.Namespace, ni.Name, task.ResReq, totalUsed, totalCapability, ni.MemoryOfEveryGpuOnNode, - isGangTask, messageSuffix) + task.Name, task.Namespace, ni.Name, task.ResReq, totalUsedVector, totalCapabilityVector, ni.VectorMap, + ni.MemoryOfEveryGpuOnNode, isGangTask, messageSuffix) return fitError } @@ -359,20 +368,21 @@ func (ni *NodeInfo) PredicateByNodeResourcesType(task *pod_info.PodInfo) error { } func (ni *NodeInfo) isTaskAllocatableOnNonAllocatedResources( - task *pod_info.PodInfo, nodeNonAllocatedResources *resource_info.Resource, + task *pod_info.PodInfo, nodeNonAllocatedVector resource_info.ResourceVector, ) bool { if task.IsRegularGPURequest() || task.IsMigProfileRequest() { - return ni.lessEqualTaskToNodeResources(task.ResReq, nodeNonAllocatedResources) + return ni.lessEqualTaskToNodeResources(task, nodeNonAllocatedVector) } - if !task.ResReq.BaseResource.LessEqual(&nodeNonAllocatedResources.BaseResource) { + if !ni.lessEqualVectorsExcludingGPU(task.ResReqVector, nodeNonAllocatedVector) { return false } if !ni.isValidGpuPortion(task.ResReq) { return false } - nodeIdleOrReleasingWholeGpus := int64(math.Floor(nodeNonAllocatedResources.GPUs())) + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) + nodeIdleOrReleasingWholeGpus := 
int64(math.Floor(nodeNonAllocatedVector.Get(gpuIdx))) nodeNonAllocatedResourcesMatchingSharedGpus := ni.fractionTaskGpusAllocatableDeviceCount(task) if nodeIdleOrReleasingWholeGpus+nodeNonAllocatedResourcesMatchingSharedGpus >= task.ResReq.GetNumOfGpuDevices() { return true @@ -381,6 +391,18 @@ func (ni *NodeInfo) isTaskAllocatableOnNonAllocatedResources( return false } +func (ni *NodeInfo) lessEqualVectorsExcludingGPU(a, b resource_info.ResourceVector) bool { + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) + savedA := a.Get(gpuIdx) + savedB := b.Get(gpuIdx) + a.Set(gpuIdx, 0) + b.Set(gpuIdx, 0) + result := a.LessEqual(b) + a.Set(gpuIdx, savedA) + b.Set(gpuIdx, savedB) + return result +} + func (ni *NodeInfo) AddTask(task *pod_info.PodInfo) error { return ni.addTask(task, false) } @@ -591,18 +613,20 @@ func (ni *NodeInfo) String() string { func (ni *NodeInfo) GetSumOfIdleGPUs() (float64, int64) { sumOfSharedGPUs, sumOfSharedGPUsMemory := ni.getSumOfAvailableSharedGPUs() - idleGPUs := ni.Idle.GPUs() + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) + idleGPUs := ni.IdleVector.Get(gpuIdx) - for resourceName, qty := range ni.Idle.ScalarResources() { - if !isMigResource(resourceName.String()) { + for i := range ni.VectorMap.Len() { + name := ni.VectorMap.ResourceAt(i) + if !isMigResource(name) { continue } - gpuPortion, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(resourceName.String()) + gpuPortion, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(name) if err != nil { - log.InfraLogger.Errorf("failed to evaluate device portion for resource %v: %v", resourceName, err) + log.InfraLogger.Errorf("failed to evaluate device portion for resource %v: %v", name, err) continue } - idleGPUs += float64(int64(gpuPortion) * qty) + idleGPUs += float64(int64(gpuPortion) * int64(ni.IdleVector.Get(i))) } return sumOfSharedGPUs + idleGPUs, sumOfSharedGPUsMemory + (int64(idleGPUs) * ni.MemoryOfEveryGpuOnNode) @@ -610,18 +634,20 @@ func 
(ni *NodeInfo) GetSumOfIdleGPUs() (float64, int64) { func (ni *NodeInfo) GetSumOfReleasingGPUs() (float64, int64) { sumOfSharedGPUs, sumOfSharedGPUsMemory := ni.getSumOfReleasingSharedGPUs() - releasingGPUs := ni.Releasing.GPUs() + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) + releasingGPUs := ni.ReleasingVector.Get(gpuIdx) - for resourceName, qty := range ni.Releasing.ScalarResources() { - if !isMigResource(resourceName.String()) { + for i := range ni.VectorMap.Len() { + name := ni.VectorMap.ResourceAt(i) + if !isMigResource(name) { continue } - gpuPortion, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(resourceName.String()) + gpuPortion, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(name) if err != nil { - log.InfraLogger.Errorf("failed to evaluate device portion for resource %v: %v", resourceName, err) + log.InfraLogger.Errorf("failed to evaluate device portion for resource %v: %v", name, err) continue } - releasingGPUs += float64(int64(gpuPortion) * qty) + releasingGPUs += float64(int64(gpuPortion) * int64(ni.ReleasingVector.Get(i))) } return sumOfSharedGPUs + releasingGPUs, sumOfSharedGPUsMemory + (int64(releasingGPUs) * ni.MemoryOfEveryGpuOnNode) @@ -645,7 +671,8 @@ func (ni *NodeInfo) GetNumberOfGPUsInNode() int64 { numberOfGPUs, err := ni.getNodeGpuCountLabelValue() if err != nil { log.InfraLogger.V(6).Infof("Node: <%v> had no annotations of nvidia.com/gpu.count", ni.Name) - return int64(ni.Allocatable.GPUs()) + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) + return int64(ni.AllocatableVector.Get(gpuIdx)) } return int64(numberOfGPUs) } @@ -698,7 +725,8 @@ func (ni *NodeInfo) IsCPUOnlyNode() bool { if ni.IsMIGEnabled() { return false } - return ni.Allocatable.GPUs() <= 0 && !ni.HasDRAGPUs + gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource) + return ni.AllocatableVector.Get(gpuIdx) <= 0 && !ni.HasDRAGPUs } func (ni *NodeInfo) IsMIGEnabled() bool { @@ -708,8 +736,9 @@ func (ni *NodeInfo) 
IsMIGEnabled() bool { isMig, err := strconv.ParseBool(enabled) return err == nil && isMig } - for nodeResource := range ni.Allocatable.ScalarResources() { - if isMigResource(nodeResource.String()) { + for i := range ni.VectorMap.Len() { + name := ni.VectorMap.ResourceAt(i) + if isMigResource(name) && ni.AllocatableVector.Get(i) > 0 { return true } } @@ -765,16 +794,16 @@ func (ni *NodeInfo) setAcceptedResources(pi *pod_info.PodInfo) { // TODO: improve by getting claims actual status. This approach doesn't support FirstAvailable requests. pi.AcceptedResource.SetDraGpus(pi.ResReq.DraGpuCounts()) } - + pi.AcceptedResourceVector = pi.AcceptedResource.ToVector(pi.VectorMap) } func (ni *NodeInfo) lessEqualTaskToNodeResources( - taskResources *resource_info.ResourceRequirements, nodeResources *resource_info.Resource, + task *pod_info.PodInfo, nodeResourcesVector resource_info.ResourceVector, ) bool { - if !ni.isValidGpuPortion(taskResources) { + if !ni.isValidGpuPortion(task.ResReq) { return false } - return taskResources.LessEqualResource(nodeResources) + return task.ResReqVector.LessEqual(nodeResourcesVector) } func isMigResource(rName string) bool { diff --git a/pkg/scheduler/api/node_info/node_info_test.go b/pkg/scheduler/api/node_info/node_info_test.go index 3cf0174c7..fffd05a60 100644 --- a/pkg/scheduler/api/node_info/node_info_test.go +++ b/pkg/scheduler/api/node_info/node_info_test.go @@ -837,13 +837,22 @@ func runAllocatableTest( nodePodAffinityInfo := pod_affinity.NewMockNodePodAffinityInfo(controller) nodePodAffinityInfo.EXPECT().AddPod(Any()).Times(len(testData.podsResources)) - ni := NewNodeInfo(testData.node, nodePodAffinityInfo, testVectorMapFromNode(testData.node)) + vectorMap := testVectorMapFromNode(testData.node) + for _, podResources := range testData.podsResources { + for resourceName := range podResources { + vectorMap.AddResource(string(resourceName)) + } + } + for resourceName := range testData.podResourcesToAllocate { + 
vectorMap.AddResource(string(resourceName)) + } + ni := NewNodeInfo(testData.node, nodePodAffinityInfo, vectorMap) for ind, podResouces := range testData.podsResources { pod := common_info.BuildPod( fmt.Sprintf("p%d", ind), "p1", "n1", v1.PodRunning, podResouces, []metav1.OwnerReference{}, make(map[string]string), map[string]string{}) addJobAnnotation(pod) - pi := pod_info.NewTaskInfo(pod, nil, resource_info.NewResourceVectorMap()) + pi := pod_info.NewTaskInfo(pod, nil, vectorMap) if err := ni.AddTask(pi); err != nil { t.Errorf("%s: failed to add pod %v, index: %d", testName, pi, ind) } @@ -857,7 +866,7 @@ func runAllocatableTest( pod.Spec.Overhead = testData.podOverhead } - task := pod_info.NewTaskInfo(pod, nil, resource_info.NewResourceVectorMap()) + task := pod_info.NewTaskInfo(pod, nil, vectorMap) allocatable, fitErr := testedFunction(ni, task) if allocatable != testData.expected { t.Errorf("%s: is pod allocatable: expected %v, got %v", testName, testData.expected, allocatable) @@ -1028,6 +1037,7 @@ func TestNodeInfo_isTaskAllocatableOnNonAllocatedResources(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + vectorMap := resource_info.NewResourceVectorMap() ni := &NodeInfo{ Name: tt.fields.Name, Node: tt.fields.Node, @@ -1042,8 +1052,10 @@ func TestNodeInfo_isTaskAllocatableOnNonAllocatedResources(t *testing.T) { PodAffinityInfo: tt.fields.PodAffinityInfo, GpuSharingNodeInfo: tt.fields.GpuSharingNodeInfo, } + setNodeInfoVectors(ni, vectorMap) + nodeNonAllocatedVector := tt.args.nodeNonAllocatedResources.ToVector(vectorMap) assert.Equalf(t, tt.want, - ni.isTaskAllocatableOnNonAllocatedResources(tt.args.task, tt.args.nodeNonAllocatedResources), + ni.isTaskAllocatableOnNonAllocatedResources(tt.args.task, nodeNonAllocatedVector), "isTaskAllocatableOnNonAllocatedResources(%v, %v)", tt.args.task, tt.args.nodeNonAllocatedResources) }) } @@ -1357,6 +1369,7 @@ func TestPredicateByNodeResourcesType_DRA(t *testing.T) { for testName, testData 
:= range tests { t.Run(testName, func(t *testing.T) { + setNodeInfoVectors(testData.nodeInfo, resource_info.NewResourceVectorMap()) err := testData.nodeInfo.PredicateByNodeResourcesType(testData.task) if testData.expectError { assert.Error(t, err, "Should reject request") @@ -1369,19 +1382,24 @@ func TestPredicateByNodeResourcesType_DRA(t *testing.T) { } func TestIsCPUOnlyNode_DRA(t *testing.T) { + vectorMap := resource_info.NewResourceVectorMap() nodeWithDRA := &NodeInfo{ - Name: "dra-node", - HasDRAGPUs: true, - Allocatable: common_info.BuildResourceWithGpu("1000m", "1G", "4", "110"), - Node: &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "dra-node", Labels: map[string]string{}}}, + Name: "dra-node", + HasDRAGPUs: true, + Allocatable: common_info.BuildResourceWithGpu("1000m", "1G", "4", "110"), + AllocatableVector: common_info.BuildResourceWithGpu("1000m", "1G", "4", "110").ToVector(vectorMap), + VectorMap: vectorMap, + Node: &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "dra-node", Labels: map[string]string{}}}, } assert.False(t, nodeWithDRA.IsCPUOnlyNode(), "node with HasDRAGPUs should not be CPU-only") cpuOnlyNode := &NodeInfo{ - Name: "cpu-node", - HasDRAGPUs: false, - Allocatable: common_info.BuildResource("1000m", "1G"), - Node: &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "cpu-node", Labels: map[string]string{}}}, + Name: "cpu-node", + HasDRAGPUs: false, + Allocatable: common_info.BuildResource("1000m", "1G"), + AllocatableVector: common_info.BuildResource("1000m", "1G").ToVector(vectorMap), + VectorMap: vectorMap, + Node: &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "cpu-node", Labels: map[string]string{}}}, } assert.True(t, cpuOnlyNode.IsCPUOnlyNode(), "node without GPUs and without HasDRAGPUs should be CPU-only") } diff --git a/pkg/scheduler/api/pod_info/pod_info.go b/pkg/scheduler/api/pod_info/pod_info.go index bfb42f18f..bd8eafe90 100644 --- a/pkg/scheduler/api/pod_info/pod_info.go +++ b/pkg/scheduler/api/pod_info/pod_info.go @@ -83,9 +83,10 @@ type 
PodInfo struct { ResReq *resource_info.ResourceRequirements AcceptedResource *resource_info.ResourceRequirements - // Vector representation of ResReq - ResReqVector resource_info.ResourceVector - VectorMap *resource_info.ResourceVectorMap + // Vector representation of ResReq and AcceptedResource + ResReqVector resource_info.ResourceVector + AcceptedResourceVector resource_info.ResourceVector + VectorMap *resource_info.ResourceVectorMap schedulingConstraintsSignature common_info.SchedulingConstraintsSignature @@ -198,6 +199,7 @@ func NewTaskInfoWithBindRequest(pod *v1.Pod, bindRequest *bindrequest_info.BindR ResReq: initResreq, AcceptedResource: resource_info.EmptyResourceRequirements(), ResReqVector: initResreq.ToVector(vectorMap), + AcceptedResourceVector: resource_info.NewResourceVector(vectorMap), VectorMap: vectorMap, GPUGroups: []string{}, ResourceRequestType: RequestTypeRegular, @@ -259,6 +261,7 @@ func resourceClaimInfoFromPodClaims(draPodClaims []*resourceapi.ResourceClaim, p func (pi *PodInfo) SetVectorMap(vectorMap *resource_info.ResourceVectorMap) { pi.VectorMap = vectorMap pi.ResReqVector = pi.ResReq.ToVector(vectorMap) + pi.AcceptedResourceVector = pi.AcceptedResource.ToVector(vectorMap) } func (pi *PodInfo) Clone() *PodInfo { @@ -267,28 +270,33 @@ func (pi *PodInfo) Clone() *PodInfo { if pi.ResReqVector != nil { resReqVectorClone = pi.ResReqVector.Clone() } + var acceptedResourceVectorClone resource_info.ResourceVector + if pi.AcceptedResourceVector != nil { + acceptedResourceVectorClone = pi.AcceptedResourceVector.Clone() + } return &PodInfo{ - UID: pi.UID, - Job: pi.Job, - Name: pi.Name, - Namespace: pi.Namespace, - SubGroupName: pi.SubGroupName, - NodeName: pi.NodeName, - Status: pi.Status, - Pod: pi.Pod, - ResReq: pi.ResReq.Clone(), - AcceptedResource: pi.AcceptedResource.Clone(), - ResReqVector: resReqVectorClone, - VectorMap: pi.VectorMap, - GPUGroups: pi.GPUGroups, - ResourceClaimInfo: pi.ResourceClaimInfo.Clone(), - ResourceRequestType: 
pi.ResourceRequestType, - ResourceReceivedType: pi.ResourceReceivedType, - IsVirtualStatus: pi.IsVirtualStatus, - IsLegacyMIGtask: pi.IsLegacyMIGtask, - storageClaims: pi.storageClaims, - ownedStorageClaims: pi.ownedStorageClaims, + UID: pi.UID, + Job: pi.Job, + Name: pi.Name, + Namespace: pi.Namespace, + SubGroupName: pi.SubGroupName, + NodeName: pi.NodeName, + Status: pi.Status, + Pod: pi.Pod, + ResReq: pi.ResReq.Clone(), + AcceptedResource: pi.AcceptedResource.Clone(), + ResReqVector: resReqVectorClone, + AcceptedResourceVector: acceptedResourceVectorClone, + VectorMap: pi.VectorMap, + GPUGroups: pi.GPUGroups, + ResourceClaimInfo: pi.ResourceClaimInfo.Clone(), + ResourceRequestType: pi.ResourceRequestType, + ResourceReceivedType: pi.ResourceReceivedType, + IsVirtualStatus: pi.IsVirtualStatus, + IsLegacyMIGtask: pi.IsLegacyMIGtask, + storageClaims: pi.storageClaims, + ownedStorageClaims: pi.ownedStorageClaims, } } diff --git a/pkg/scheduler/cache/cluster_info/cluster_info.go b/pkg/scheduler/cache/cluster_info/cluster_info.go index eb3bcfcd0..eab3d7cc4 100644 --- a/pkg/scheduler/cache/cluster_info/cluster_info.go +++ b/pkg/scheduler/cache/cluster_info/cluster_info.go @@ -282,7 +282,7 @@ func (c *ClusterInfo) populateDRAGPUs(nodes map[string]*node_info.NodeInfo) { if draGPUCount > 0 { log.InfraLogger.V(6).Infof("Node %s has %d DRA GPUs from ResourceSlices", nodeName, draGPUCount) - if nodeInfo.Allocatable.GPUs() > 0 { + if nodeInfo.AllocatableVector.Get(nodeInfo.VectorMap.GetIndex("gpu")) > 0 { log.InfraLogger.Warningf("Node %s has both device-plugin GPUs and DRA GPUs", nodeName) } nodeInfo.AddDRAGPUs(float64(draGPUCount)) diff --git a/pkg/scheduler/framework/session.go b/pkg/scheduler/framework/session.go index 8a81381bb..1be578c24 100644 --- a/pkg/scheduler/framework/session.go +++ b/pkg/scheduler/framework/session.go @@ -29,6 +29,7 @@ import ( "k8s.io/apimachinery/pkg/types" ksf "k8s.io/kube-scheduler/framework" + commonconstants 
"github.com/NVIDIA/KAI-scheduler/pkg/common/constants" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/eviction_info" @@ -174,8 +175,11 @@ func filterGpusByEnoughResources(node *node_info.NodeInfo, pod *pod_info.PodInfo filteredGPUs = append(filteredGPUs, gpuIdx) } } - if node.Idle.GPUs() > 0 || node.Releasing.GPUs() > 0 { - for range int(node.Idle.GPUs()) + int(node.Releasing.GPUs()) { + gpuIdx := node.VectorMap.GetIndex(commonconstants.GpuResource) + idleGPUs := node.IdleVector.Get(gpuIdx) + releasingGPUs := node.ReleasingVector.Get(gpuIdx) + if idleGPUs > 0 || releasingGPUs > 0 { + for range int(idleGPUs) + int(releasingGPUs) { filteredGPUs = append(filteredGPUs, pod_info.WholeGpuIndicator) } } @@ -207,7 +211,7 @@ func (ssn *Session) FittingNode(task *pod_info.PodInfo, node *node_info.NodeInfo job := ssn.ClusterInfo.PodGroupInfos[task.Job] log.InfraLogger.V(6).Infof("Checking if task <%v/%v> is allocatable on node <%v>: <%v> vs. <%v>", - task.Namespace, task.Name, node.Name, task.ResReq, node.Idle) + task.Namespace, task.Name, node.Name, task.ResReqVector, node.IdleVector) allocatable, fitError := ssn.isTaskAllocatableOnNode(task, job, node, writeFittingDelta) if !allocatable { if fitError != nil && writeFittingDelta { @@ -272,7 +276,7 @@ func (ssn *Session) isTaskAllocatableOnNode(task *pod_info.PodInfo, job *podgrou allocatable = false log.InfraLogger.V(6).Infof("Not enough resources for task: <%s/%s>, init requested: <%v>. 
"+ "Node <%s> with limited resources, releasing: <%v>, idle: <%v>", - task.Namespace, task.Name, task.ResReq, node.Name, node.Releasing, node.Idle) + task.Namespace, task.Name, task.ResReqVector, node.Name, node.ReleasingVector, node.IdleVector) if writeFittingDelta { if taskAllocatable := node.IsTaskAllocatable(task); !taskAllocatable { fitError = node.FittingError(task, len(job.GetAllPodsMap()) > 1) diff --git a/pkg/scheduler/framework/statement.go b/pkg/scheduler/framework/statement.go index c1334bda4..5cbf1020c 100644 --- a/pkg/scheduler/framework/statement.go +++ b/pkg/scheduler/framework/statement.go @@ -263,7 +263,7 @@ func (s *Statement) Pipeline(task *pod_info.PodInfo, hostname string, updateTask } log.InfraLogger.V(6).Infof("After pipelined Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>", - task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing) + task.Namespace, task.Name, node.Name, node.IdleVector, node.UsedVector, node.ReleasingVector) for _, eh := range s.ssn.eventHandlers { if eh.AllocateFunc != nil { @@ -321,7 +321,7 @@ func (s *Statement) Allocate(task *pod_info.PodInfo, hostname string) error { } log.InfraLogger.V(5).Infof( "After allocated Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>", - task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing) + task.Namespace, task.Name, node.Name, node.IdleVector, node.UsedVector, node.ReleasingVector) } else { log.InfraLogger.Errorf("Failed to find Node <%s> in Session <%s> index when binding.", hostname, s.sessionID) diff --git a/pkg/scheduler/plugins/nodeavailability/nodeavailability.go b/pkg/scheduler/plugins/nodeavailability/nodeavailability.go index 32ca808f0..8ad78fdc2 100644 --- a/pkg/scheduler/plugins/nodeavailability/nodeavailability.go +++ b/pkg/scheduler/plugins/nodeavailability/nodeavailability.go @@ -4,6 +4,7 @@ package nodeavailability import ( + commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants" 
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/node_info" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_info" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/framework" @@ -32,10 +33,11 @@ func (pp *nodeAvailabilityPlugin) nodeOrderFn(task *pod_info.PodInfo, node *node score = scores.Availability } + gpuIdx := node.VectorMap.GetIndex(commonconstants.GpuResource) log.InfraLogger.V(7).Infof( "Estimating Task: <%v/%v> Job: <%v> for node: <%s> that has <%f> idle GPUs and <%f> releasing GPUs and <%f> allocated GPUs. Score: %f", - task.Namespace, task.Name, task.Job, node.Name, node.Idle.GPUs(), node.Releasing.GPUs(), - node.Used.GPUs(), score) + task.Namespace, task.Name, task.Job, node.Name, node.IdleVector.Get(gpuIdx), node.ReleasingVector.Get(gpuIdx), + node.UsedVector.Get(gpuIdx), score) return score, nil } diff --git a/pkg/scheduler/plugins/nodeplacement/nodepack_test.go b/pkg/scheduler/plugins/nodeplacement/nodepack_test.go index 91d69a542..3c0240752 100644 --- a/pkg/scheduler/plugins/nodeplacement/nodepack_test.go +++ b/pkg/scheduler/plugins/nodeplacement/nodepack_test.go @@ -221,6 +221,7 @@ func buildSingleTestParams(testMetadata testTopologyMetadata) (*framework.Sessio idleResources := resources_fake.BuildResourceList(nil, nil, &nodeMetadata.nodeIdleGPUs, nil) nodeInfo.Idle = resource_info.ResourceFromResourceList(*idleResources) + nodeInfo.IdleVector = nodeInfo.Idle.ToVector(vectorMap) nodeInfoMap[nodeName] = nodeInfo } diff --git a/pkg/scheduler/plugins/nodeplacement/nodespread_test.go b/pkg/scheduler/plugins/nodeplacement/nodespread_test.go index 6dfeb8492..eacfa86ef 100644 --- a/pkg/scheduler/plugins/nodeplacement/nodespread_test.go +++ b/pkg/scheduler/plugins/nodeplacement/nodespread_test.go @@ -65,11 +65,16 @@ var _ = Describe("NodeSpread", func() { }, } + vectorMap := resource_info.NewResourceVectorMap() + for _, c := range cases { task := &pod_info.PodInfo{ ResReq: resource_info.NewResourceRequirementsWithGpus(1), } + idle := 
resource_info.NewResource(0, 0, c.nonAllocated) + releasing := resource_info.EmptyResource() + node := &node_info.NodeInfo{ Node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -78,8 +83,11 @@ var _ = Describe("NodeSpread", func() { }, }, }, - Idle: resource_info.NewResource(0, 0, c.nonAllocated), - Releasing: resource_info.EmptyResource(), + Idle: idle, + IdleVector: idle.ToVector(vectorMap), + Releasing: releasing, + ReleasingVector: releasing.ToVector(vectorMap), + VectorMap: vectorMap, } plugin := nodeplacement.New(map[string]string{ @@ -100,11 +108,19 @@ var _ = Describe("NodeSpread", func() { ResReq: resource_info.NewResourceRequirements(0, 1, 0), } + idle2 := resource_info.NewResource(c.nonAllocated, 0, 0) + allocatable := resource_info.NewResource(float64(c.gpuCount), 0, 0) + releasing2 := resource_info.EmptyResource() + node = &node_info.NodeInfo{ - Node: &corev1.Node{}, - Idle: resource_info.NewResource(c.nonAllocated, 0, 0), - Allocatable: resource_info.NewResource(float64(c.gpuCount), 0, 0), - Releasing: resource_info.EmptyResource(), + Node: &corev1.Node{}, + Idle: idle2, + IdleVector: idle2.ToVector(vectorMap), + Allocatable: allocatable, + AllocatableVector: allocatable.ToVector(vectorMap), + Releasing: releasing2, + ReleasingVector: releasing2.ToVector(vectorMap), + VectorMap: vectorMap, } actual, err = nof(task, node) diff --git a/pkg/scheduler/plugins/nodeplacement/pack.go b/pkg/scheduler/plugins/nodeplacement/pack.go index d2bc8975b..551d6ee42 100644 --- a/pkg/scheduler/plugins/nodeplacement/pack.go +++ b/pkg/scheduler/plugins/nodeplacement/pack.go @@ -22,7 +22,7 @@ func (pp *nodePlacementPlugin) nodeResourcePack(resourceName v1.ResourceName) ap podAllocationRange := pp.podAllocatableRange[string(task.UID)] currentNodeNonAllocated := node.NonAllocatedResource(resourceName) - nodeOverall := node.Allocatable.Get(resourceName) + nodeOverall := node.AllocatableVector.Get(node.VectorMap.GetIndex(string(resourceName))) score := 
getScoreOfCurrentNode(podAllocationRange.minAllocatable, podAllocationRange.maxAllocatable, currentNodeNonAllocated, nodeOverall) log.InfraLogger.V(7).Infof("Estimating Task: <%v/%v> Job: <%v> for node: <%s> "+ @@ -69,7 +69,7 @@ func getMinMaxPerNode(resourceName v1.ResourceName, nodes []*node_info.NodeInfo) for _, node := range nodes { current := node.NonAllocatedResource(resourceName) // We don't want to consider nodes with none of that resource type - if node.Allocatable.Get(resourceName) == 0 { + if node.AllocatableVector.Get(node.VectorMap.GetIndex(string(resourceName))) == 0 { continue } diff --git a/pkg/scheduler/plugins/nodeplacement/spread.go b/pkg/scheduler/plugins/nodeplacement/spread.go index 3d9bf1cbd..e0b77636d 100644 --- a/pkg/scheduler/plugins/nodeplacement/spread.go +++ b/pkg/scheduler/plugins/nodeplacement/spread.go @@ -19,7 +19,7 @@ func nodeResourceSpread(resourceName v1.ResourceName) api.NodeOrderFn { if resourceName == resource_info.GPUResourceName { resourceCount = float64(node.GetNumberOfGPUsInNode()) } else { - resourceCount = node.Allocatable.Get(resourceName) + resourceCount = node.AllocatableVector.Get(node.VectorMap.GetIndex(string(resourceName))) } if resourceCount == 0 { diff --git a/pkg/scheduler/plugins/proportion/capacity_policy/capacity_policy.go b/pkg/scheduler/plugins/proportion/capacity_policy/capacity_policy.go index 02fa0f24d..a07fccddd 100644 --- a/pkg/scheduler/plugins/proportion/capacity_policy/capacity_policy.go +++ b/pkg/scheduler/plugins/proportion/capacity_policy/capacity_policy.go @@ -11,6 +11,7 @@ import ( "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/podgroup_info" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/log" rs "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/proportion/resource_share" + "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/proportion/utils" ) type capacityCheckFn func(requestedShare rs.ResourceQuantities, job *podgroup_info.PodGroupInfo) *api.SchedulableResult @@ -76,9 +77,10 @@ 
func (cp *CapacityPolicy) isJobOverCapacity(requestedShare rs.ResourceQuantities func getRequiredQuota(tasksToAllocate []*pod_info.PodInfo) *podgroup_info.JobRequirement { quota := podgroup_info.JobRequirement{} for _, pod := range tasksToAllocate { - quota.GPU += pod.ResReq.GetGpusQuota() - quota.MilliCPU += pod.ResReq.Cpu() - quota.Memory += pod.ResReq.Memory() + quantities := utils.QuantifyVector(pod.ResReqVector, pod.VectorMap) + quota.GPU += quantities[rs.GpuResource] + quota.MilliCPU += quantities[rs.CpuResource] + quota.Memory += quantities[rs.MemoryResource] } return &quota } diff --git a/pkg/scheduler/plugins/proportion/capacity_policy/capacity_policy_test.go b/pkg/scheduler/plugins/proportion/capacity_policy/capacity_policy_test.go index b8be9de9c..81de6d31e 100644 --- a/pkg/scheduler/plugins/proportion/capacity_policy/capacity_policy_test.go +++ b/pkg/scheduler/plugins/proportion/capacity_policy/capacity_policy_test.go @@ -21,6 +21,9 @@ import ( ) var _ = Describe("Capacity Policy Check", func() { + var ( + testVectorMap = resource_info.NewResourceVectorMap() + ) Describe("IsJobOverQueueCapacity", func() { Context("max allowed", func() { tests := map[string]struct { @@ -80,12 +83,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). 
WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirementsWithGpus(1), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirementsWithGpus(1), + ResReqVector: resource_info.NewResourceRequirementsWithGpus(1).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -143,12 +148,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirementsWithGpus(1), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirementsWithGpus(1), + ResReqVector: resource_info.NewResourceRequirementsWithGpus(1).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -231,12 +238,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). 
WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirementsWithGpus(1), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirementsWithGpus(1), + ResReqVector: resource_info.NewResourceRequirementsWithGpus(1).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -298,12 +307,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirementsWithGpus(1), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirementsWithGpus(1), + ResReqVector: resource_info.NewResourceRequirementsWithGpus(1).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -389,12 +400,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). 
WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirementsWithGpus(1), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirementsWithGpus(1), + ResReqVector: resource_info.NewResourceRequirementsWithGpus(1).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -456,12 +469,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirementsWithGpus(1), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirementsWithGpus(1), + ResReqVector: resource_info.NewResourceRequirementsWithGpus(1).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -548,12 +563,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). 
WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirements(0, 1000, 0), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirements(0, 1000, 0), + ResReqVector: resource_info.NewResourceRequirements(0, 1000, 0).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -618,12 +635,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirements(0, 1000, 0), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirements(0, 1000, 0), + ResReqVector: resource_info.NewResourceRequirements(0, 1000, 0).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -688,12 +707,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). 
WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirements(0, 1000, 0), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirements(0, 1000, 0), + ResReqVector: resource_info.NewResourceRequirements(0, 1000, 0).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -758,12 +779,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirements(0, 1000, 0), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirements(0, 1000, 0), + ResReqVector: resource_info.NewResourceRequirements(0, 1000, 0).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -831,12 +854,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). 
WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirements(0, 500, 0), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirements(0, 500, 0), + ResReqVector: resource_info.NewResourceRequirements(0, 500, 0).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -904,12 +929,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirements(0, 1100, 0), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirements(0, 1100, 0), + ResReqVector: resource_info.NewResourceRequirements(0, 1100, 0).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -977,12 +1004,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). 
WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirements(0, 500, 0), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirements(0, 500, 0), + ResReqVector: resource_info.NewResourceRequirements(0, 500, 0).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, @@ -1050,12 +1079,14 @@ var _ = Describe("Capacity Policy Check", func() { podgroup_info.DefaultSubGroup: subgroup_info.NewPodSet(podgroup_info.DefaultSubGroup, 1, nil). WithPodInfos(map[common_info.PodID]*pod_info.PodInfo{ "task-a": { - UID: "task-a", - Job: "job-a", - Name: "task-a", - Namespace: "team-a", - Status: pod_status.Pending, - ResReq: resource_info.NewResourceRequirements(0, 1100, 0), + UID: "task-a", + Job: "job-a", + Name: "task-a", + Namespace: "team-a", + Status: pod_status.Pending, + ResReq: resource_info.NewResourceRequirements(0, 1100, 0), + ResReqVector: resource_info.NewResourceRequirements(0, 1100, 0).ToVector(testVectorMap), + VectorMap: testVectorMap, }, }), }, diff --git a/pkg/scheduler/plugins/proportion/proportion.go b/pkg/scheduler/plugins/proportion/proportion.go index 46af19209..815fd2ba8 100644 --- a/pkg/scheduler/plugins/proportion/proportion.go +++ b/pkg/scheduler/plugins/proportion/proportion.go @@ -144,7 +144,7 @@ func (pp *proportionPlugin) reclaimableFn( scenario api.ScenarioInfo, ) bool { reclaimerInfo := pp.buildReclaimerInfo(scenario.GetPreemptor(), pp.minNodeGPUMemory) - totalVictimsResources := make(map[common_info.QueueID][]*resource_info.Resource) + totalVictimsResources := make(map[common_info.QueueID][]resource_info.ResourceVector) victims := scenario.GetVictims() for _, victim := range victims { totalJobResources := pp.getVictimResources(victim) @@ -161,8 +161,8 @@ func (pp *proportionPlugin) 
reclaimableFn( return pp.reclaimablePlugin.Reclaimable(pp.jobSimulationQueues, reclaimerInfo, totalVictimsResources) } -func (pp *proportionPlugin) getVictimResources(victim *api.VictimInfo) []*resource_info.Resource { - var victimResources []*resource_info.Resource +func (pp *proportionPlugin) getVictimResources(victim *api.VictimInfo) []resource_info.ResourceVector { + var victimResources []resource_info.ResourceVector elasticTasks, coreTasks := splitVictimTasks(victim.Tasks, victim.Job.GetSubGroups()) @@ -218,25 +218,25 @@ func splitVictimTasks(tasks []*pod_info.PodInfo, subGroups map[string]*subgroup_ return elasticTasks, coreTasks } -func getResources(ignoreReallocatedTasks bool, pods ...*pod_info.PodInfo) *resource_info.Resource { - resources := make([]*resource_info.ResourceRequirements, 0, len(pods)) +func getResources(ignoreReallocatedTasks bool, pods ...*pod_info.PodInfo) resource_info.ResourceVector { + var vectors []resource_info.ResourceVector for _, task := range pods { if ignoreReallocatedTasks && pod_status.IsActiveAllocatedStatus(task.Status) { continue } - resources = append(resources, task.AcceptedResource) + vectors = append(vectors, task.AcceptedResourceVector) } - if len(resources) == 0 { + if len(vectors) == 0 { return nil } - totalResources := resource_info.EmptyResource() - for _, resource := range resources { - totalResources.AddResourceRequirements(resource) + total := vectors[0].Clone() + for _, vec := range vectors[1:] { + total.Add(vec) } - return totalResources + return total } func (pp *proportionPlugin) calculateResourcesProportion(ssn *framework.Session) { @@ -267,9 +267,11 @@ func getNodeResources(ssn *framework.Session, node *node_info.NodeInfo) rs.Resou _, found := node.Node.Labels[gpuWorkerLabelKey] shouldIgnoreGPUs := ssn.IsRestrictNodeSchedulingEnabled() && !found if shouldIgnoreGPUs { - nodeResource.Add(rs.NewResourceQuantities(node.Allocatable.Cpu(), node.Allocatable.Memory(), 0)) + alloc := 
utils.QuantifyVector(node.AllocatableVector, node.VectorMap) + alloc[rs.GpuResource] = 0 + nodeResource.Add(alloc) } else { - nodeResource.Add(utils.QuantifyResource(node.Allocatable)) + nodeResource.Add(utils.QuantifyVector(node.AllocatableVector, node.VectorMap)) } // Subtract resources of non-related pods @@ -280,7 +282,7 @@ func getNodeResources(ssn *framework.Session, node *node_info.NodeInfo) rs.Resou !pod_info.IsKaiUtilityPod(podInfo.Pod) { log.InfraLogger.V(7).Infof("Pod %s/%s is scheduled by a different scheduler, marking resources as unallocatable "+ "on node %s", podInfo.Namespace, podInfo.Name, node.Name) - nodeResource.Sub(utils.QuantifyResourceRequirements(podInfo.ResReq)) + nodeResource.Sub(utils.QuantifyVector(podInfo.ResReqVector, podInfo.VectorMap)) } } @@ -294,13 +296,15 @@ func (pp *proportionPlugin) createQueueAttributes(ssn *framework.Session) { } func (pp *proportionPlugin) buildReclaimerInfo(reclaimer *podgroup_info.PodGroupInfo, minNodeGPUMemory int64) *rec.ReclaimerInfo { + initResource := podgroup_info.GetTasksToAllocateInitResource( + reclaimer, pp.subGroupOrderFn, pp.taskOrderFunc, false, minNodeGPUMemory) return &rec.ReclaimerInfo{ - Name: reclaimer.Name, - Namespace: reclaimer.Namespace, - Queue: reclaimer.Queue, - IsPreemptable: reclaimer.IsPreemptibleJob(), - RequiredResources: podgroup_info.GetTasksToAllocateInitResource( - reclaimer, pp.subGroupOrderFn, pp.taskOrderFunc, false, minNodeGPUMemory), + Name: reclaimer.Name, + Namespace: reclaimer.Namespace, + Queue: reclaimer.Queue, + IsPreemptable: reclaimer.IsPreemptibleJob(), + RequiredResources: initResource.ToVector(reclaimer.VectorMap), + VectorMap: reclaimer.VectorMap, } } @@ -352,13 +356,13 @@ func (pp *proportionPlugin) updateQueuesCurrentResourceUsage(ssn *framework.Sess for status, tasks := range job.PodStatusIndex { if pod_status.AllocatedStatus(status) { for _, t := range tasks { - resources := utils.QuantifyResourceRequirements(t.AcceptedResource) + resources := 
utils.QuantifyVector(t.AcceptedResourceVector, t.VectorMap) isPreemptible := job.IsPreemptibleJob() pp.updateQueuesResourceUsageForAllocatedJob(job.Queue, resources, isPreemptible) } } else if status == pod_status.Pending { for _, t := range tasks { - resources := utils.QuantifyResourceRequirements(t.ResReq) + resources := utils.QuantifyVector(t.ResReqVector, t.VectorMap) if t.IsMemoryRequest() { resources.Add(rs.ResourceQuantities{ rs.GpuResource: float64(t.ResReq.GpuResourceRequirement.GetNumOfGpuDevices()) * (float64(t.ResReq.GpuMemory()) / float64(ssn.ClusterInfo.MinNodeGPUMemory)), @@ -444,7 +448,7 @@ func (pp *proportionPlugin) allocateHandlerFn(ssn *framework.Session) func(event return func(event *framework.Event) { job := ssn.ClusterInfo.PodGroupInfos[event.Task.Job] isPreemptibleJob := job.IsPreemptibleJob() - taskResources := utils.QuantifyResourceRequirements(event.Task.AcceptedResource) + taskResources := utils.QuantifyVector(event.Task.AcceptedResourceVector, event.Task.VectorMap) for queue, ok := pp.queues[job.Queue]; ok; queue, ok = pp.queues[queue.ParentQueue] { for _, resource := range rs.AllResources { @@ -468,7 +472,7 @@ func (pp *proportionPlugin) deallocateHandlerFn(ssn *framework.Session) func(eve return func(event *framework.Event) { job := ssn.ClusterInfo.PodGroupInfos[event.Task.Job] isPreemptibleJob := job.IsPreemptibleJob() - taskResources := utils.QuantifyResourceRequirements(event.Task.AcceptedResource) + taskResources := utils.QuantifyVector(event.Task.AcceptedResourceVector, event.Task.VectorMap) for queue, ok := pp.queues[job.Queue]; ok; queue, ok = pp.queues[queue.ParentQueue] { for _, resource := range rs.AllResources { diff --git a/pkg/scheduler/plugins/proportion/proportion_test.go b/pkg/scheduler/plugins/proportion/proportion_test.go index 1dab9ff01..a099a99c4 100644 --- a/pkg/scheduler/plugins/proportion/proportion_test.go +++ b/pkg/scheduler/plugins/proportion/proportion_test.go @@ -34,6 +34,8 @@ import ( const schedulerName = 
"kai-scheduler" +var testVectorMap = resource_info.NewResourceVectorMap() + func TestSetFairShare(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "Proportion Suite") @@ -618,8 +620,10 @@ var _ = Describe("Set Fair Share in Proportion", func() { SchedulerName: schedulerName, }, }, - Status: pod_status.Running, - ResReq: common_info.BuildResourceRequirements("2", "2G"), + Status: pod_status.Running, + ResReq: common_info.BuildResourceRequirements("2", "2G"), + ResReqVector: common_info.BuildResourceRequirements("2", "2G").ToVector(testVectorMap), + VectorMap: testVectorMap, }, "2": { Pod: &v1.Pod{ @@ -627,8 +631,10 @@ var _ = Describe("Set Fair Share in Proportion", func() { SchedulerName: "default-scheduler", }, }, - Status: pod_status.Running, - ResReq: common_info.BuildResourceRequirements("1", "1G"), + Status: pod_status.Running, + ResReq: common_info.BuildResourceRequirements("1", "1G"), + ResReqVector: common_info.BuildResourceRequirements("1", "1G").ToVector(testVectorMap), + VectorMap: testVectorMap, }, }, }, @@ -652,8 +658,10 @@ var _ = Describe("Set Fair Share in Proportion", func() { SchedulerName: schedulerName, }, }, - Status: pod_status.Running, - ResReq: common_info.BuildResourceRequirements("2", "2G"), + Status: pod_status.Running, + ResReq: common_info.BuildResourceRequirements("2", "2G"), + ResReqVector: common_info.BuildResourceRequirements("2", "2G").ToVector(testVectorMap), + VectorMap: testVectorMap, }, "2": { Pod: &v1.Pod{ @@ -661,8 +669,10 @@ var _ = Describe("Set Fair Share in Proportion", func() { SchedulerName: "default-scheduler", }, }, - Status: pod_status.Running, - ResReq: common_info.BuildResourceRequirements("1", "1G"), + Status: pod_status.Running, + ResReq: common_info.BuildResourceRequirements("1", "1G"), + ResReqVector: common_info.BuildResourceRequirements("1", "1G").ToVector(testVectorMap), + VectorMap: testVectorMap, }, "reservation": { Pod: &v1.Pod{ @@ -673,8 +683,10 @@ var _ = Describe("Set Fair Share in Proportion", 
func() { SchedulerName: "default-scheduler", }, }, - Status: pod_status.Running, - ResReq: common_info.BuildResourceRequirements("1", "1G"), + Status: pod_status.Running, + ResReq: common_info.BuildResourceRequirements("1", "1G"), + ResReqVector: common_info.BuildResourceRequirements("1", "1G").ToVector(testVectorMap), + VectorMap: testVectorMap, }, }, }, @@ -698,8 +710,10 @@ var _ = Describe("Set Fair Share in Proportion", func() { SchedulerName: schedulerName, }, }, - Status: pod_status.Running, - ResReq: common_info.BuildResourceRequirements("2", "2G"), + Status: pod_status.Running, + ResReq: common_info.BuildResourceRequirements("2", "2G"), + ResReqVector: common_info.BuildResourceRequirements("2", "2G").ToVector(testVectorMap), + VectorMap: testVectorMap, }, "2": { Pod: &v1.Pod{ @@ -707,8 +721,10 @@ var _ = Describe("Set Fair Share in Proportion", func() { SchedulerName: "default-scheduler", }, }, - Status: pod_status.Running, - ResReq: common_info.BuildResourceRequirements("1", "1G"), + Status: pod_status.Running, + ResReq: common_info.BuildResourceRequirements("1", "1G"), + ResReqVector: common_info.BuildResourceRequirements("1", "1G").ToVector(testVectorMap), + VectorMap: testVectorMap, }, "scaler": { Pod: &v1.Pod{ @@ -719,8 +735,10 @@ var _ = Describe("Set Fair Share in Proportion", func() { SchedulerName: "default-scheduler", }, }, - Status: pod_status.Running, - ResReq: common_info.BuildResourceRequirements("1", "1G"), + Status: pod_status.Running, + ResReq: common_info.BuildResourceRequirements("1", "1G"), + ResReqVector: common_info.BuildResourceRequirements("1", "1G").ToVector(testVectorMap), + VectorMap: testVectorMap, }, }, }, @@ -790,6 +808,12 @@ var _ = Describe("Set Fair Share in Proportion", func() { SchedulerName: schedulerName, }, "1", nil) + vectorMap := resource_info.NewResourceVectorMap() + for rName := range testData.node.Allocatable.ScalarResources() { + vectorMap.AddResource(string(rName)) + } + testData.node.VectorMap = vectorMap + 
testData.node.AllocatableVector = testData.node.Allocatable.ToVector(vectorMap) if got := getNodeResources(session, testData.node); !reflect.DeepEqual(got, testData.want) { Fail(fmt.Sprintf("getNodeResources() = %v, want %v", got, testData.want)) } @@ -816,8 +840,10 @@ var _ = Describe("Set Fair Share in Proportion", func() { }, Tasks: []*pod_info.PodInfo{ { - Status: pod_status.Pending, - AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + Status: pod_status.Pending, + AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + AcceptedResourceVector: common_info.BuildResourceRequirements("1", "1Gi").ToVector(testVectorMap), + VectorMap: testVectorMap, }, }, } @@ -827,7 +853,7 @@ var _ = Describe("Set Fair Share in Proportion", func() { // Should return resources for the single task that exists Expect(len(result)).To(Equal(1)) Expect(result[0]).ToNot(BeNil()) - Expect(result[0].Cpu()).To(Equal(1000.0)) + Expect(result[0].Get(testVectorMap.GetIndex("cpu"))).To(Equal(1000.0)) }) It("should correctly split elastic and core tasks when MinAvailable is less than task count", func() { @@ -846,16 +872,22 @@ var _ = Describe("Set Fair Share in Proportion", func() { }, Tasks: []*pod_info.PodInfo{ { - Status: pod_status.Pending, - AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + Status: pod_status.Pending, + AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + AcceptedResourceVector: common_info.BuildResourceRequirements("1", "1Gi").ToVector(testVectorMap), + VectorMap: testVectorMap, }, { - Status: pod_status.Pending, - AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + Status: pod_status.Pending, + AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + AcceptedResourceVector: common_info.BuildResourceRequirements("1", "1Gi").ToVector(testVectorMap), + VectorMap: testVectorMap, }, { - Status: pod_status.Pending, - AcceptedResource: 
common_info.BuildResourceRequirements("1", "1Gi"), + Status: pod_status.Pending, + AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + AcceptedResourceVector: common_info.BuildResourceRequirements("1", "1Gi").ToVector(testVectorMap), + VectorMap: testVectorMap, }, }, } @@ -866,7 +898,7 @@ var _ = Describe("Set Fair Share in Proportion", func() { Expect(len(result)).To(Equal(3)) for _, res := range result { Expect(res).ToNot(BeNil()) - Expect(res.Cpu()).To(Equal(1000.0)) + Expect(res.Get(testVectorMap.GetIndex("cpu"))).To(Equal(1000.0)) } }) @@ -886,12 +918,16 @@ var _ = Describe("Set Fair Share in Proportion", func() { }, Tasks: []*pod_info.PodInfo{ { - Status: pod_status.Pending, - AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + Status: pod_status.Pending, + AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + AcceptedResourceVector: common_info.BuildResourceRequirements("1", "1Gi").ToVector(testVectorMap), + VectorMap: testVectorMap, }, { - Status: pod_status.Pending, - AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + Status: pod_status.Pending, + AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + AcceptedResourceVector: common_info.BuildResourceRequirements("1", "1Gi").ToVector(testVectorMap), + VectorMap: testVectorMap, }, }, } @@ -901,7 +937,7 @@ var _ = Describe("Set Fair Share in Proportion", func() { // Should return 1 resource for all core tasks (no elastic tasks) Expect(len(result)).To(Equal(1)) Expect(result[0]).ToNot(BeNil()) - Expect(result[0].Cpu()).To(Equal(2000.0)) // Combined resources + Expect(result[0].Get(testVectorMap.GetIndex("cpu"))).To(Equal(2000.0)) // Combined resources }) It("should handle zero MinAvailable", func() { @@ -919,12 +955,16 @@ var _ = Describe("Set Fair Share in Proportion", func() { }, Tasks: []*pod_info.PodInfo{ { - Status: pod_status.Pending, - AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + Status: 
pod_status.Pending, + AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + AcceptedResourceVector: common_info.BuildResourceRequirements("1", "1Gi").ToVector(testVectorMap), + VectorMap: testVectorMap, }, { - Status: pod_status.Pending, - AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + Status: pod_status.Pending, + AcceptedResource: common_info.BuildResourceRequirements("1", "1Gi"), + AcceptedResourceVector: common_info.BuildResourceRequirements("1", "1Gi").ToVector(testVectorMap), + VectorMap: testVectorMap, }, }, } @@ -935,7 +975,7 @@ var _ = Describe("Set Fair Share in Proportion", func() { Expect(len(result)).To(Equal(2)) for _, res := range result { Expect(res).ToNot(BeNil()) - Expect(res.Cpu()).To(Equal(1000.0)) + Expect(res.Get(testVectorMap.GetIndex("cpu"))).To(Equal(1000.0)) } }) }) diff --git a/pkg/scheduler/plugins/proportion/reclaimable/reclaimable.go b/pkg/scheduler/plugins/proportion/reclaimable/reclaimable.go index 71596159d..523cc1ac4 100644 --- a/pkg/scheduler/plugins/proportion/reclaimable/reclaimable.go +++ b/pkg/scheduler/plugins/proportion/reclaimable/reclaimable.go @@ -4,8 +4,10 @@ package reclaimable import ( + "fmt" "maps" "math" + "strings" commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info" @@ -14,6 +16,7 @@ import ( "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/proportion/reclaimable/strategies" rs "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/proportion/resource_share" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/proportion/utils" + v1 "k8s.io/api/core/v1" ) type Reclaimable struct { @@ -31,7 +34,7 @@ func (r *Reclaimable) CanReclaimResources( reclaimer *ReclaimerInfo, ) bool { reclaimerQueue := queues[reclaimer.Queue] - requestedResources := utils.QuantifyResource(reclaimer.RequiredResources) + requestedResources := utils.QuantifyVector(reclaimer.RequiredResources, reclaimer.VectorMap) 
allocatedResources := reclaimerQueue.GetAllocatedShare() allocatedResources.Add(requestedResources) @@ -55,7 +58,7 @@ func (r *Reclaimable) CanReclaimResources( func (r *Reclaimable) Reclaimable( queues map[common_info.QueueID]*rs.QueueAttributes, reclaimer *ReclaimerInfo, - reclaimeeResourcesByQueue map[common_info.QueueID][]*resource_info.Resource, + reclaimeeResourcesByQueue map[common_info.QueueID][]resource_info.ResourceVector, ) bool { reclaimable, reclaimedQueuesRemainingResources, involvedResources := r.reclaimResourcesFromReclaimees(queues, reclaimer, reclaimeeResourcesByQueue) @@ -68,7 +71,7 @@ func (r *Reclaimable) Reclaimable( func (r *Reclaimable) reclaimResourcesFromReclaimees( queues map[common_info.QueueID]*rs.QueueAttributes, reclaimer *ReclaimerInfo, - reclaimeesResourcesByQueue map[common_info.QueueID][]*resource_info.Resource, + reclaimeesResourcesByQueue map[common_info.QueueID][]resource_info.ResourceVector, ) ( bool, map[common_info.QueueID]rs.ResourceQuantities, map[common_info.QueueID]map[rs.ResourceName]any, ) { @@ -77,7 +80,7 @@ func (r *Reclaimable) reclaimResourcesFromReclaimees( for reclaimeeQueueID, reclaimeeQueueReclaimedResources := range reclaimeesResourcesByQueue { reclaimerQueue, reclaimeeQueue := r.getLeveledQueues(queues, reclaimer.Queue, reclaimeeQueueID) - involvedResourcesByQueue[reclaimeeQueueID] = getInvolvedResourcesNames(reclaimeeQueueReclaimedResources) + involvedResourcesByQueue[reclaimeeQueueID] = getInvolvedResourcesNames(reclaimeeQueueReclaimedResources, reclaimer.VectorMap) if _, found := remainingResourcesMap[reclaimeeQueue.UID]; !found { remainingResourcesMap[reclaimeeQueue.UID] = queues[reclaimeeQueue.UID].GetAllocatedShare() @@ -85,17 +88,17 @@ func (r *Reclaimable) reclaimResourcesFromReclaimees( remainingResources := remainingResourcesMap[reclaimeeQueue.UID] for _, reclaimeeResources := range reclaimeeQueueReclaimedResources { - if !strategies.FitsReclaimStrategy(reclaimer.RequiredResources, reclaimerQueue, 
reclaimeeQueue, + if !strategies.FitsReclaimStrategy(reclaimer.RequiredResources, reclaimer.VectorMap, reclaimerQueue, reclaimeeQueue, remainingResources) { log.InfraLogger.V(7).Infof("queue <%s>,shouldn't be reclaimed, for %s resources"+ " remaining reosurces: <%s>, deserved: <%s>, fairShare: <%s>", - reclaimeeQueue.Name, resource_info.StringResourceArray(reclaimeeQueueReclaimedResources), + reclaimeeQueue.Name, stringVectorArray(reclaimeeQueueReclaimedResources, reclaimer.VectorMap), remainingResources, reclaimeeQueue.GetDeservedShare(), reclaimeeQueue.GetFairShare()) return false, nil, nil } - r.subtractReclaimedResources(queues, remainingResourcesMap, reclaimeeQueueID, reclaimeeResources, involvedResourcesByQueue) + r.subtractReclaimedResources(queues, remainingResourcesMap, reclaimeeQueueID, reclaimeeResources, reclaimer.VectorMap, involvedResourcesByQueue) } } @@ -106,7 +109,8 @@ func (r *Reclaimable) subtractReclaimedResources( queues map[common_info.QueueID]*rs.QueueAttributes, remainingResourcesMap map[common_info.QueueID]rs.ResourceQuantities, reclaimeeQueueID common_info.QueueID, - reclaimedResources *resource_info.Resource, + reclaimedResources resource_info.ResourceVector, + vectorMap *resource_info.ResourceVectorMap, involvedResourcesByQueue map[common_info.QueueID]map[rs.ResourceName]any, ) { for queue, ok := queues[reclaimeeQueueID]; ok; queue, ok = queues[queue.ParentQueue] { @@ -115,7 +119,7 @@ func (r *Reclaimable) subtractReclaimedResources( } remainingResources := remainingResourcesMap[queue.UID] - activeAllocatedQuota := utils.QuantifyResource(reclaimedResources) + activeAllocatedQuota := utils.QuantifyVector(reclaimedResources, vectorMap) remainingResources.Sub(activeAllocatedQuota) _, found := involvedResourcesByQueue[queue.UID] @@ -134,8 +138,8 @@ func (r *Reclaimable) reclaimingQueuesRemainWithinBoundaries( involvedResourcesByQueue map[common_info.QueueID]map[rs.ResourceName]any, ) bool { - requestedQuota := 
utils.QuantifyResource(reclaimer.RequiredResources) - reclaimerInvolvedResources := getInvolvedResourcesNames([]*resource_info.Resource{reclaimer.RequiredResources}) + requestedQuota := utils.QuantifyVector(reclaimer.RequiredResources, reclaimer.VectorMap) + reclaimerInvolvedResources := getInvolvedResourcesNames([]resource_info.ResourceVector{reclaimer.RequiredResources}, reclaimer.VectorMap) for reclaimingQueue, found := queues[reclaimer.Queue]; found; reclaimingQueue, found = queues[reclaimingQueue.ParentQueue] { remainingResources, foundRemaining := remainingResourcesMap[reclaimingQueue.UID] @@ -262,25 +266,40 @@ func (r *Reclaimable) getHierarchyPath( return hierarchyPath } -func getInvolvedResourcesNames(resources []*resource_info.Resource) map[rs.ResourceName]any { +func getInvolvedResourcesNames(resources []resource_info.ResourceVector, vectorMap *resource_info.ResourceVectorMap) map[rs.ResourceName]any { involvedResources := map[rs.ResourceName]any{} - for _, resource := range resources { - if resource == nil { + cpuIdx := vectorMap.GetIndex(string(v1.ResourceCPU)) + memIdx := vectorMap.GetIndex(string(v1.ResourceMemory)) + gpuIdx := vectorMap.GetIndex(commonconstants.GpuResource) + + for _, vec := range resources { + if vec == nil { continue } - if resource.Cpu() > 0 { + if vec.Get(cpuIdx) > 0 { involvedResources[rs.CpuResource] = struct{}{} } - if resource.Memory() > 0 { + if vec.Get(memIdx) > 0 { involvedResources[rs.MemoryResource] = struct{}{} } - if resource.GPUs() > 0 { + if vec.Get(gpuIdx) > 0 { involvedResources[rs.GpuResource] = struct{}{} } } return involvedResources } + +func stringVectorArray(vectors []resource_info.ResourceVector, vectorMap *resource_info.ResourceVectorMap) string { + if len(vectors) == 0 { + return "" + } + parts := make([]string, 0, len(vectors)) + for _, vec := range vectors { + parts = append(parts, fmt.Sprintf("%v", utils.QuantifyVector(vec, vectorMap))) + } + return strings.Join(parts, ",") +} diff --git 
a/pkg/scheduler/plugins/proportion/reclaimable/reclaimable_test.go b/pkg/scheduler/plugins/proportion/reclaimable/reclaimable_test.go index ddf10e1d5..5b5cc0ca0 100644 --- a/pkg/scheduler/plugins/proportion/reclaimable/reclaimable_test.go +++ b/pkg/scheduler/plugins/proportion/reclaimable/reclaimable_test.go @@ -19,6 +19,8 @@ import ( . "github.com/onsi/gomega" ) +var testVectorMap = resource_info.NewResourceVectorMap() + type queuesTestData struct { parentQueue common_info.QueueID deserved float64 @@ -43,7 +45,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "No allocated resources, below fair share", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1000, 1000, 1), + RequiredResources: resource_info.NewResource(1000, 1000, 1).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: true, }, queue: &rs.QueueAttributes{ @@ -70,7 +73,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Some resources allocated, below fair share", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1000, 1000, 1), + RequiredResources: resource_info.NewResource(1000, 1000, 1).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: true, }, queue: &rs.QueueAttributes{ @@ -97,7 +101,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Some resources at fair share with job, other stay below", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(500, 1000, 0), + RequiredResources: resource_info.NewResource(500, 1000, 0).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: true, }, queue: &rs.QueueAttributes{ @@ -124,7 +129,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Exactly at fair share with job", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1000, 1000, 1), + RequiredResources: resource_info.NewResource(1000, 1000, 
1).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: true, }, queue: &rs.QueueAttributes{ @@ -151,7 +157,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Partially above fair share", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1, 1000, 1000), + RequiredResources: resource_info.NewResource(1, 1000, 1000).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: true, }, queue: &rs.QueueAttributes{ @@ -178,7 +185,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Fully above fair share", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1, 1000, 1000), + RequiredResources: resource_info.NewResource(1, 1000, 1000).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: true, }, queue: &rs.QueueAttributes{ @@ -205,7 +213,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Queue partially above fair share without job resources", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1, 1000, 1000), + RequiredResources: resource_info.NewResource(1, 1000, 1000).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: true, }, queue: &rs.QueueAttributes{ @@ -254,7 +263,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "No allocated resources, below quota", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1000, 1000, 1), + RequiredResources: resource_info.NewResource(1000, 1000, 1).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: false, }, queue: &rs.QueueAttributes{ @@ -287,7 +297,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "No allocated resources, exactly at quota", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1000, 1000, 1), + RequiredResources: resource_info.NewResource(1000, 1000, 
1).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: false, }, queue: &rs.QueueAttributes{ @@ -320,7 +331,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "No allocated resources, partially above quota", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1, 1000, 1000), + RequiredResources: resource_info.NewResource(1, 1000, 1000).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: false, }, queue: &rs.QueueAttributes{ @@ -353,7 +365,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Some resources allocated, below quota", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1000, 1000, 1), + RequiredResources: resource_info.NewResource(1000, 1000, 1).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: false, }, queue: &rs.QueueAttributes{ @@ -386,7 +399,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Some preemptible resources allocated, zero quota", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1, 1000, 1000), + RequiredResources: resource_info.NewResource(1, 1000, 1000).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: false, }, queue: &rs.QueueAttributes{ @@ -419,7 +433,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Partially above quota", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1, 1000, 1000), + RequiredResources: resource_info.NewResource(1, 1000, 1000).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: false, }, queue: &rs.QueueAttributes{ @@ -452,7 +467,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Fully above quota", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1, 1000, 1000), + RequiredResources: resource_info.NewResource(1, 1000, 1000).ToVector(testVectorMap), + 
VectorMap: testVectorMap, IsPreemptable: false, }, queue: &rs.QueueAttributes{ @@ -485,7 +501,8 @@ var _ = Describe("Can Reclaim Resources", func() { name: "Queue partially over quota without job resources", reclaimerInfo: &ReclaimerInfo{ Queue: "queue1", - RequiredResources: resource_info.NewResource(1, 1000, 1000), + RequiredResources: resource_info.NewResource(1, 1000, 1000).ToVector(testVectorMap), + VectorMap: testVectorMap, IsPreemptable: false, }, queue: &rs.QueueAttributes{ @@ -543,7 +560,8 @@ var _ = Describe("Reclaimable - Single department", func() { Namespace: "n1", Queue: "p1", IsPreemptable: true, - RequiredResources: resource_info.NewResource(0, 0, 1), + RequiredResources: resource_info.NewResource(0, 0, 1).ToVector(testVectorMap), + VectorMap: testVectorMap, } reclaimeePods := pod_info.PodsMap{ @@ -695,7 +713,8 @@ var _ = Describe("Reclaimable - Multiple departments", func() { Namespace: "n1", Queue: "p1", IsPreemptable: true, - RequiredResources: resource_info.NewResource(0, 0, 1), + RequiredResources: resource_info.NewResource(0, 0, 1).ToVector(testVectorMap), + VectorMap: testVectorMap, } reclaimeePods := pod_info.PodsMap{ @@ -809,7 +828,8 @@ var _ = Describe("Reclaimable - Multiple hierarchy levels", func() { Namespace: "n1", Queue: "left-leaf", IsPreemptable: true, - RequiredResources: resource_info.NewResource(0, 0, 1), + RequiredResources: resource_info.NewResource(0, 0, 1).ToVector(testVectorMap), + VectorMap: testVectorMap, } reclaimeePods := pod_info.PodsMap{ @@ -969,7 +989,7 @@ var _ = Describe("Reclaimable - Multiple hierarchy levels", func() { reclaimee.GetAllPodsMap()["1"].ResReq.GpuResourceRequirement = *resource_info.NewGpuResourceRequirementWithGpus(1.5, 0) - reclaimerInfo.RequiredResources = resource_info.NewResource(0, 0, 1) + reclaimerInfo.RequiredResources = resource_info.NewResource(0, 0, 1).ToVector(testVectorMap) reclaimerInfo.Queue = "left-leaf1" reclaimees := []*podgroup_info.PodGroupInfo{reclaimee} @@ -1018,7 +1038,7 @@ 
var _ = Describe("Reclaimable - Multiple hierarchy levels", func() { queues := buildQueues(queuesData) reclaimable = New(1.0) - reclaimerInfo.RequiredResources = resource_info.NewResource(0, 0, 1) + reclaimerInfo.RequiredResources = resource_info.NewResource(0, 0, 1).ToVector(testVectorMap) reclaimerInfo.Queue = "left-leaf1" reclaimee.GetAllPodsMap()["1"].ResReq.GpuResourceRequirement = *resource_info.NewGpuResourceRequirementWithGpus(1.5, 0) @@ -1078,7 +1098,7 @@ var _ = Describe("Reclaimable - Multiple hierarchy levels", func() { queues := buildQueues(queuesData) reclaimable = New(1.0) - reclaimerInfo.RequiredResources = resource_info.NewResource(0, 0, 1) + reclaimerInfo.RequiredResources = resource_info.NewResource(0, 0, 1).ToVector(testVectorMap) reclaimerInfo.Queue = "d1-project-1" reclaimee2 := &podgroup_info.PodGroupInfo{ Name: "reclaimee2", @@ -1137,7 +1157,7 @@ var _ = Describe("Reclaimable - Multiple hierarchy levels", func() { reclaimable = New(1.0) - reclaimerInfo.RequiredResources = resource_info.NewResource(0, 0, 1) // Only requests GPU + reclaimerInfo.RequiredResources = resource_info.NewResource(0, 0, 1).ToVector(testVectorMap) // Only requests GPU reclaimerInfo.Queue = "d1-project-1" reclaimee2 := &podgroup_info.PodGroupInfo{ Name: "reclaimee2", @@ -1184,14 +1204,14 @@ func buildQueues(queuesData map[common_info.QueueID]queuesTestData) map[common_i return queues } -func reclaimeeResourcesByQueue(reclaimees []*podgroup_info.PodGroupInfo) map[common_info.QueueID][]*resource_info.Resource { - resources := make(map[common_info.QueueID][]*resource_info.Resource) +func reclaimeeResourcesByQueue(reclaimees []*podgroup_info.PodGroupInfo) map[common_info.QueueID][]resource_info.ResourceVector { + resources := make(map[common_info.QueueID][]resource_info.ResourceVector) for _, reclaimee := range reclaimees { if _, found := resources[reclaimee.Queue]; !found { - resources[reclaimee.Queue] = make([]*resource_info.Resource, 0) + resources[reclaimee.Queue] = 
make([]resource_info.ResourceVector, 0) } - resources[reclaimee.Queue] = append(resources[reclaimee.Queue], reclaimee.GetTasksActiveAllocatedReqResource()) + resources[reclaimee.Queue] = append(resources[reclaimee.Queue], reclaimee.GetTasksActiveAllocatedReqResource().ToVector(testVectorMap)) } return resources diff --git a/pkg/scheduler/plugins/proportion/reclaimable/reclaimer_info.go b/pkg/scheduler/plugins/proportion/reclaimable/reclaimer_info.go index 8345eea20..3fc878ea9 100644 --- a/pkg/scheduler/plugins/proportion/reclaimable/reclaimer_info.go +++ b/pkg/scheduler/plugins/proportion/reclaimable/reclaimer_info.go @@ -12,6 +12,7 @@ type ReclaimerInfo struct { Name string Namespace string Queue common_info.QueueID - RequiredResources *resource_info.Resource + RequiredResources resource_info.ResourceVector + VectorMap *resource_info.ResourceVectorMap IsPreemptable bool } diff --git a/pkg/scheduler/plugins/proportion/reclaimable/strategies/strategies.go b/pkg/scheduler/plugins/proportion/reclaimable/strategies/strategies.go index cb1f8eb15..19cf0f324 100644 --- a/pkg/scheduler/plugins/proportion/reclaimable/strategies/strategies.go +++ b/pkg/scheduler/plugins/proportion/reclaimable/strategies/strategies.go @@ -16,14 +16,15 @@ type GuaranteeDeservedQuotaStrategy struct{} var strategies = []ReclaimStrategy{&MaintainFairShareStrategy{}, &GuaranteeDeservedQuotaStrategy{}} func FitsReclaimStrategy( - reclaimerResources *resource_info.Resource, + reclaimerResources resource_info.ResourceVector, + vectorMap *resource_info.ResourceVectorMap, reclaimerQueue *rs.QueueAttributes, reclaimeeQueue *rs.QueueAttributes, reclaimeeRemainingShare rs.ResourceQuantities, ) bool { for _, strategy := range strategies { if strategy.Reclaimable( - reclaimerResources, reclaimerQueue, reclaimeeQueue, + reclaimerResources, vectorMap, reclaimerQueue, reclaimeeQueue, reclaimeeRemainingShare, ) { return true @@ -34,13 +35,15 @@ func FitsReclaimStrategy( type ReclaimStrategy interface { 
Reclaimable( - reclaimerResources *resource_info.Resource, reclaimerQueue *rs.QueueAttributes, + reclaimerResources resource_info.ResourceVector, vectorMap *resource_info.ResourceVectorMap, + reclaimerQueue *rs.QueueAttributes, reclaimeeQueue *rs.QueueAttributes, reclaimeeRemainingShare rs.ResourceQuantities, ) bool } func (mfss *MaintainFairShareStrategy) Reclaimable( - _ *resource_info.Resource, + _ resource_info.ResourceVector, + _ *resource_info.ResourceVectorMap, reclaimerQueue *rs.QueueAttributes, reclaimeeQueue *rs.QueueAttributes, reclaimeeRemainingShare rs.ResourceQuantities) bool { @@ -56,7 +59,8 @@ func (mfss *MaintainFairShareStrategy) Reclaimable( } func (gdqs *GuaranteeDeservedQuotaStrategy) Reclaimable( - reclaimerResources *resource_info.Resource, + reclaimerResources resource_info.ResourceVector, + vectorMap *resource_info.ResourceVectorMap, reclaimerQueue *rs.QueueAttributes, reclaimeeQueue *rs.QueueAttributes, reclaimeeRemainingShare rs.ResourceQuantities) bool { @@ -71,7 +75,7 @@ func (gdqs *GuaranteeDeservedQuotaStrategy) Reclaimable( reclaimerQueue.GetDeservedShare(), reclaimerQueue.GetFairShare()) // reclaimer has to be under (or equal) deserved quota in all resources (cpu, mem, gpu) - if reclaimerWillGoOverQuota(reclaimerResources, reclaimerQueue) { + if reclaimerWillGoOverQuota(reclaimerResources, vectorMap, reclaimerQueue) { return false } @@ -83,9 +87,9 @@ func (gdqs *GuaranteeDeservedQuotaStrategy) Reclaimable( return true } -func reclaimerWillGoOverQuota(reclaimerResources *resource_info.Resource, reclaimerQueue *rs.QueueAttributes) bool { +func reclaimerWillGoOverQuota(reclaimerResources resource_info.ResourceVector, vectorMap *resource_info.ResourceVectorMap, reclaimerQueue *rs.QueueAttributes) bool { reclaimerRequestedQuota := reclaimerQueue.GetAllocatedShare() - reclaimerRequestedQuota.Add(utils.QuantifyResource(reclaimerResources)) + reclaimerRequestedQuota.Add(utils.QuantifyVector(reclaimerResources, vectorMap)) return 
!reclaimerRequestedQuota.LessEqual(reclaimerQueue.GetDeservedShare()) } diff --git a/pkg/scheduler/plugins/proportion/reclaimable/strategies/strategies_test.go b/pkg/scheduler/plugins/proportion/reclaimable/strategies/strategies_test.go index b230edac1..6d8c7f94d 100644 --- a/pkg/scheduler/plugins/proportion/reclaimable/strategies/strategies_test.go +++ b/pkg/scheduler/plugins/proportion/reclaimable/strategies/strategies_test.go @@ -19,6 +19,8 @@ func TestReclaimStrategies(t *testing.T) { RunSpecs(t, "Reclaim strategies tests") } +var testVectorMap = resource_info.NewResourceVectorMap() + var _ = Describe("Reclaim strategies", func() { Context("Maintain Fair Share Strategy", func() { tests := map[string]struct { @@ -180,7 +182,7 @@ var _ = Describe("Reclaim strategies", func() { testData := testData It(testName, func() { remainingResourceShare := testData.reclaimeeQueue.GetAllocatedShare() - reclaimable := strategy.Reclaimable(nil, testData.reclaimerQueue, testData.reclaimeeQueue, + reclaimable := strategy.Reclaimable(nil, testVectorMap, testData.reclaimerQueue, testData.reclaimeeQueue, remainingResourceShare) Expect(testData.expected).To(Equal(reclaimable)) }) @@ -542,7 +544,7 @@ var _ = Describe("Reclaim strategies", func() { testName := testName testData := testData It(testName, func() { - reclaimable := strategy.Reclaimable(nil, testData.reclaimerQueue, testData.reclaimeeQueue, + reclaimable := strategy.Reclaimable(nil, testVectorMap, testData.reclaimerQueue, testData.reclaimeeQueue, testData.remainingResourceShare) Expect(reclaimable).To(Equal(testData.expected)) }) @@ -551,13 +553,13 @@ var _ = Describe("Reclaim strategies", func() { Context("Guarantee Deserved Quota Strategy", func() { tests := map[string]struct { - reclaimerResources *resource_info.Resource + reclaimerResources resource_info.ResourceVector reclaimerQueue *rs.QueueAttributes reclaimeeQueue *rs.QueueAttributes expected bool }{ "Reclaimer is above deserved quota and reclaimee above deserved 
quota": { - reclaimerResources: resource_info.NewResource(0, 0, 2), + reclaimerResources: resource_info.NewResource(0, 0, 2).ToVector(testVectorMap), reclaimerQueue: &rs.QueueAttributes{ Name: "p1", QueueResourceShare: rs.QueueResourceShare{ @@ -585,7 +587,7 @@ var _ = Describe("Reclaim strategies", func() { expected: false, }, "Reclaimer reaches exactly deserved quota and reclaimee above deserved quota": { - reclaimerResources: resource_info.NewResource(0, 0, 2), + reclaimerResources: resource_info.NewResource(0, 0, 2).ToVector(testVectorMap), reclaimerQueue: &rs.QueueAttributes{ Name: "p1", QueueResourceShare: rs.QueueResourceShare{ @@ -613,7 +615,7 @@ var _ = Describe("Reclaim strategies", func() { expected: true, }, "Reclaimer gets exactly deserved quota and reclaimee remains with exactly deserved quota": { - reclaimerResources: resource_info.NewResource(0, 0, 2), + reclaimerResources: resource_info.NewResource(0, 0, 2).ToVector(testVectorMap), reclaimerQueue: &rs.QueueAttributes{ Name: "p1", QueueResourceShare: rs.QueueResourceShare{ @@ -641,7 +643,7 @@ var _ = Describe("Reclaim strategies", func() { expected: true, }, "Reclaimer gets exactly deserved quota and reclaimee goes below deserved quota": { - reclaimerResources: resource_info.NewResource(0, 0, 2), + reclaimerResources: resource_info.NewResource(0, 0, 2).ToVector(testVectorMap), reclaimerQueue: &rs.QueueAttributes{ Name: "p1", QueueResourceShare: rs.QueueResourceShare{ @@ -677,7 +679,7 @@ var _ = Describe("Reclaim strategies", func() { expected: true, }, "Reclaimer is below deserved quota and reclaimee is above deserved quota in *only one* of the resources": { - reclaimerResources: resource_info.NewResource(0, 0, 2), + reclaimerResources: resource_info.NewResource(0, 0, 2).ToVector(testVectorMap), reclaimerQueue: &rs.QueueAttributes{ Name: "p1", QueueResourceShare: rs.QueueResourceShare{ @@ -708,7 +710,7 @@ var _ = Describe("Reclaim strategies", func() { expected: true, }, "Reclaimer is below deserved 
quota and reclaimee has exactly deserved quota": { - reclaimerResources: resource_info.NewResource(0, 0, 2), + reclaimerResources: resource_info.NewResource(0, 0, 2).ToVector(testVectorMap), reclaimerQueue: &rs.QueueAttributes{ Name: "p1", QueueResourceShare: rs.QueueResourceShare{ @@ -736,7 +738,7 @@ var _ = Describe("Reclaim strategies", func() { expected: false, }, "Reclaimer is below deserved quota and reclaimee is below deserved quota": { - reclaimerResources: resource_info.NewResource(0, 0, 2), + reclaimerResources: resource_info.NewResource(0, 0, 2).ToVector(testVectorMap), reclaimerQueue: &rs.QueueAttributes{ Name: "p1", QueueResourceShare: rs.QueueResourceShare{ @@ -764,7 +766,7 @@ var _ = Describe("Reclaim strategies", func() { expected: false, }, "Zero quota queue gives up on all resources in favor of a starved queue": { - reclaimerResources: resource_info.NewResource(0, 0, 2), + reclaimerResources: resource_info.NewResource(0, 0, 2).ToVector(testVectorMap), reclaimerQueue: &rs.QueueAttributes{ Name: "p1", QueueResourceShare: rs.QueueResourceShare{ @@ -798,7 +800,7 @@ var _ = Describe("Reclaim strategies", func() { testName := testName testData := testData It(testName, func() { - reclaimable := strategy.Reclaimable(testData.reclaimerResources, testData.reclaimerQueue, + reclaimable := strategy.Reclaimable(testData.reclaimerResources, testVectorMap, testData.reclaimerQueue, testData.reclaimeeQueue, testData.reclaimeeQueue.GetAllocatedShare()) Expect(testData.expected).To(Equal(reclaimable)) }) diff --git a/pkg/scheduler/plugins/proportion/utils/utils.go b/pkg/scheduler/plugins/proportion/utils/utils.go index f61e53c29..32ea26fa3 100644 --- a/pkg/scheduler/plugins/proportion/utils/utils.go +++ b/pkg/scheduler/plugins/proportion/utils/utils.go @@ -4,7 +4,12 @@ package utils import ( + v1 "k8s.io/api/core/v1" + + commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants" + "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info/resources" 
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/resource_info" + "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/log" rs "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/proportion/resource_share" ) @@ -16,6 +21,26 @@ func QuantifyResourceRequirements(resource *resource_info.ResourceRequirements) return rs.NewResourceQuantities(resource.Cpu(), resource.Memory(), resource.GetGpusQuota()) } +func QuantifyVector(vec resource_info.ResourceVector, vectorMap *resource_info.ResourceVectorMap) rs.ResourceQuantities { + cpuIdx := vectorMap.GetIndex(string(v1.ResourceCPU)) + memIdx := vectorMap.GetIndex(string(v1.ResourceMemory)) + gpuIdx := vectorMap.GetIndex(commonconstants.GpuResource) + totalGPUs := vec.Get(gpuIdx) + for i := range vectorMap.Len() { + name := vectorMap.ResourceAt(i) + if !resource_info.IsMigResource(v1.ResourceName(name)) { + continue + } + gpuPortion, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(name) + if err != nil { + log.InfraLogger.Errorf("Failed to get device portion from %v", name) + continue + } + totalGPUs += float64(gpuPortion) * vec.Get(i) + } + return rs.NewResourceQuantities(vec.Get(cpuIdx), vec.Get(memIdx), totalGPUs) +} + func ResourceRequirementsFromQuantities(quantities rs.ResourceQuantities) *resource_info.ResourceRequirements { return resource_info.NewResourceRequirements( quantities[rs.GpuResource], diff --git a/pkg/scheduler/plugins/resourcetype/resourcetype.go b/pkg/scheduler/plugins/resourcetype/resourcetype.go index c0195ab15..d9e2d3e49 100644 --- a/pkg/scheduler/plugins/resourcetype/resourcetype.go +++ b/pkg/scheduler/plugins/resourcetype/resourcetype.go @@ -4,6 +4,7 @@ package resourcetype import ( + commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/node_info" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_info" @@ -33,9 +34,10 @@ func (pp *resourceType) nodeOrderFn() 
api.NodeOrderFn { if isCPUOnlyTask && node.IsCPUOnlyNode() { score = scores.ResourceType } + gpuIdx := node.VectorMap.GetIndex(commonconstants.GpuResource) log.InfraLogger.V(7).Infof( "Task %s requests GPU: %t. On node with %f total allocatable GPU. Score: %f", - task.Name, !isCPUOnlyTask, node.Allocatable.GPUs(), score) + task.Name, !isCPUOnlyTask, node.AllocatableVector.Get(gpuIdx), score) return score, nil } } diff --git a/pkg/scheduler/plugins/topology/job_filtering.go b/pkg/scheduler/plugins/topology/job_filtering.go index b4cef91d2..906647e52 100644 --- a/pkg/scheduler/plugins/topology/job_filtering.go +++ b/pkg/scheduler/plugins/topology/job_filtering.go @@ -26,9 +26,8 @@ const ( ) type jobAllocationMetaData struct { - maxPodResources *resource_info.ResourceRequirements - allocationTestPods []*pod_info.PodInfo - tasksToAllocate []*pod_info.PodInfo + maxPodResources *resource_info.ResourceRequirements + tasksToAllocate []*pod_info.PodInfo } func (t *topologyPlugin) subSetNodesFn( @@ -62,7 +61,7 @@ func (t *topologyPlugin) subSetNodesFn( tasksResources, tasksCount := getTasksAllocationMetadata(tasks) - if err := checkJobDomainFit(job, subGroup, tasksResources, tasksCount, domain); err != nil { + if err := checkJobDomainFit(job, subGroup, tasksResources, tasksCount, domain, topologyTree.VectorMap); err != nil { if domain.ID == rootDomainId { job.AddSimpleJobFitError( podgroup_info.PodSchedulingErrors, @@ -111,10 +110,14 @@ func (t *topologyPlugin) subSetNodesFn( return domainNodeSets, nil } -func getTasksAllocationMetadata(tasks []*pod_info.PodInfo) (*resource_info.Resource, int) { - tasksResources := resource_info.NewResource(0, 0, 0) +func getTasksAllocationMetadata(tasks []*pod_info.PodInfo) (resource_info.ResourceVector, int) { + var tasksResources resource_info.ResourceVector for _, task := range tasks { - tasksResources.AddResourceRequirements(task.ResReq) + if tasksResources == nil { + tasksResources = task.ResReqVector.Clone() + } else { + 
tasksResources.Add(task.ResReqVector) + } } tasksCount := len(tasks) return tasksResources, tasksCount @@ -153,15 +156,10 @@ func initTasksRepresentorMetadataStruct(tasksToAllocate []*pod_info.PodInfo) (*j return nil, err } } - initialAllocationTestPods := []*pod_info.PodInfo{ - {Name: "1-pods-resources", ResReq: maxPodResources}, - } - jobAllocationData := &jobAllocationMetaData{ - maxPodResources: maxPodResources, - allocationTestPods: initialAllocationTestPods, - tasksToAllocate: tasksToAllocate, - } - return jobAllocationData, nil + return &jobAllocationMetaData{ + maxPodResources: maxPodResources, + tasksToAllocate: tasksToAllocate, + }, nil } func (t *topologyPlugin) calcSubTreeAllocatable( @@ -189,82 +187,66 @@ func (t *topologyPlugin) calcSubTreeAllocatable( return domain.AllocatablePods, nil } -func calcSubTreeFreeResources(domain *DomainInfo) *resource_info.Resource { +func calcSubTreeFreeResources(domain *DomainInfo) resource_info.ResourceVector { if domain == nil { return nil } if len(domain.Children) == 0 { for _, node := range domain.Nodes { - domain.IdleOrReleasingResources.Add(node.Idle) - domain.IdleOrReleasingResources.Add(node.Releasing) + domain.IdleOrReleasingVector.Add(node.IdleVector) + domain.IdleOrReleasingVector.Add(node.ReleasingVector) // Ignore fractions of GPUs for now } - return domain.IdleOrReleasingResources + return domain.IdleOrReleasingVector } for _, child := range domain.Children { subdomainFreeResources := calcSubTreeFreeResources(child) - domain.IdleOrReleasingResources.Add(subdomainFreeResources) + domain.IdleOrReleasingVector.Add(subdomainFreeResources) } - return domain.IdleOrReleasingResources + return domain.IdleOrReleasingVector } func calcNodeAccommodation(jobAllocationMetaData *jobAllocationMetaData, node *node_info.NodeInfo) int { - onePodOnly := resource_info.EmptyResourceRequirements() - onePodOnly.ScalarResources()[resource_info.PodsResourceName] = 1 - if jobAllocationMetaData.maxPodResources.LessEqual(onePodOnly) 
{ + maxPodVector := jobAllocationMetaData.maxPodResources.ToVector(node.VectorMap) + + podsIdx := node.VectorMap.GetIndex(string(v1.ResourcePods)) + onePodOnlyVector := resource_info.NewResourceVector(node.VectorMap) + if podsIdx >= 0 { + onePodOnlyVector.Set(podsIdx, 1) + } + if maxPodVector.LessEqual(onePodOnlyVector) { return len(jobAllocationMetaData.tasksToAllocate) } - allocatablePodsCount := 0 - for _, resourceRepresentorPod := range jobAllocationMetaData.allocationTestPods { - if node.IsTaskAllocatableOnReleasingOrIdle(resourceRepresentorPod) { - allocatablePodsCount++ - } else { - break - } + nonAllocated := node.IdleVector.Clone() + nonAllocated.Add(node.ReleasingVector) + + if !maxPodVector.LessEqual(nonAllocated) { + return 0 } - // Add more to jobResourcesAllocationsRepresenters until the node cannot accommodate any more pods - if allocatablePodsCount == len(jobAllocationMetaData.allocationTestPods) { - for i := allocatablePodsCount; ; i++ { - latestTestPod := jobAllocationMetaData.allocationTestPods[len(jobAllocationMetaData.allocationTestPods)-1] - iAllocationsTestPod := &pod_info.PodInfo{ - Name: fmt.Sprintf("%d-pods-resources", allocatablePodsCount+1), - ResReq: calcNextAllocationTestPodResources(latestTestPod.ResReq, jobAllocationMetaData.maxPodResources), - } - jobAllocationMetaData.allocationTestPods = append(jobAllocationMetaData.allocationTestPods, iAllocationsTestPod) - if node.IsTaskAllocatableOnReleasingOrIdle(iAllocationsTestPod) { - allocatablePodsCount++ - } else { - break - } + minPods := math.MaxInt + for i := 0; i < len(maxPodVector); i++ { + if maxPodVector[i] <= 0 { + continue + } + pods := int(nonAllocated[i] / maxPodVector[i]) + if pods < minPods { + minPods = pods } } - return allocatablePodsCount -} -func calcNextAllocationTestPodResources(previousTestResources, maxPodResources *resource_info.ResourceRequirements) *resource_info.ResourceRequirements { - nPlus1Resources := previousTestResources.Clone() - 
nPlus1Resources.BaseResource.Add(&maxPodResources.BaseResource) - if len(nPlus1Resources.GpuResourceRequirement.MigResources()) > 0 { - for migResource, quant := range maxPodResources.GpuResourceRequirement.MigResources() { - nPlus1Resources.GpuResourceRequirement.MigResources()[migResource] += quant - } - } else { - updatedGpuResource := resource_info.NewGpuResourceRequirementWithMultiFraction( - nPlus1Resources.GetNumOfGpuDevices()+maxPodResources.GetNumOfGpuDevices(), - nPlus1Resources.GpuFractionalPortion(), - nPlus1Resources.GpuMemory()) - nPlus1Resources.GpuResourceRequirement = *updatedGpuResource + if minPods == math.MaxInt { + return len(jobAllocationMetaData.tasksToAllocate) } - return nPlus1Resources + return minPods } func (t *topologyPlugin) getJobAllocatableDomains( job *podgroup_info.PodGroupInfo, subGroup *subgroup_info.SubGroupInfo, podSets map[string]*subgroup_info.PodSet, - tasksResources *resource_info.Resource, tasksCount int, topologyTree *Info, + tasksResources resource_info.ResourceVector, tasksCount int, topologyTree *Info, ) ([]*DomainInfo, error) { relevantLevels, err := t.calculateRelevantDomainLevels(subGroup, topologyTree) if err != nil { @@ -284,7 +266,7 @@ func (t *topologyPlugin) getJobAllocatableDomains( var topLevelFitErrors []common_info.JobFitError for levelIndex, level := range relevantLevels { for _, domain := range relevantDomainsByLevel[level] { - err := checkJobDomainFit(job, subGroup, tasksResources, tasksCount, domain) + err := checkJobDomainFit(job, subGroup, tasksResources, tasksCount, domain, topologyTree.VectorMap) if err != nil { // Filter domains that cannot allocate the job if levelIndex == len(relevantLevels)-1 { topLevelFitErrors = append(topLevelFitErrors, err) @@ -360,7 +342,8 @@ func hasTopologyRequiredConstraint(subGroup *subgroup_info.SubGroupInfo) bool { } func checkJobDomainFit(job *podgroup_info.PodGroupInfo, subGroup *subgroup_info.SubGroupInfo, - tasksResources *resource_info.Resource, tasksCount int, 
domain *DomainInfo) *common_info.TopologyFitError { + tasksResources resource_info.ResourceVector, tasksCount int, domain *DomainInfo, + vectorMap *resource_info.ResourceVectorMap) *common_info.TopologyFitError { if domain.AllocatablePods != allocatablePodsNotSet { if domain.AllocatablePods < tasksCount { return common_info.NewTopologyFitError( @@ -372,7 +355,7 @@ func checkJobDomainFit(job *podgroup_info.PodGroupInfo, subGroup *subgroup_info. if getJobRatioToFreeResources(tasksResources, domain) > maxAllocatableTasksRatio { err := common_info.NewTopologyInsufficientResourcesError( - job.Name, subGroup.GetName(), job.Namespace, string(domain.ID), tasksResources, domain.IdleOrReleasingResources) + job.Name, subGroup.GetName(), job.Namespace, string(domain.ID), tasksResources, domain.IdleOrReleasingVector, vectorMap) return err } return nil @@ -439,15 +422,19 @@ func (*topologyPlugin) treeAllocatableCleanup(topologyTree *Info) { for _, levelDomains := range topologyTree.DomainsByLevel { for _, domain := range levelDomains { domain.AllocatablePods = allocatablePodsNotSet - domain.IdleOrReleasingResources = resource_info.EmptyResource() + domain.IdleOrReleasingVector = resource_info.NewResourceVector(topologyTree.VectorMap) } } } func sortTreeFromRoot(tasks []*pod_info.PodInfo, root *DomainInfo, maxDepthLevel DomainLevel) { - tasksResources := resource_info.NewResource(0, 0, 0) + var tasksResources resource_info.ResourceVector for _, task := range tasks { - tasksResources.AddResourceRequirements(task.ResReq) + if tasksResources == nil { + tasksResources = task.ResReqVector.Clone() + } else { + tasksResources.Add(task.ResReqVector) + } } sortTree(tasksResources, root, maxDepthLevel) @@ -457,7 +444,7 @@ func sortTreeFromRoot(tasks []*pod_info.PodInfo, root *DomainInfo, maxDepthLevel // Domains are sorted by AllocatablePods (ascending) to prioritize filling domains // with fewer available resources first, implementing a bin-packing strategy. 
// Within domains with equal AllocatablePods, sorts by ID for deterministic ordering. -func sortTree(tasksResources *resource_info.Resource, root *DomainInfo, maxDepthLevel DomainLevel) { +func sortTree(tasksResources resource_info.ResourceVector, root *DomainInfo, maxDepthLevel DomainLevel) { if root == nil || maxDepthLevel == "" { return } @@ -488,34 +475,29 @@ func sortTree(tasksResources *resource_info.Resource, root *DomainInfo, maxDepth // Returns a max ratio for all tasks resources to the free resources in the domain. // The higher the ratio, the more "packed" the domain will be after the job is allocated. // If the ratio is higher then 1, the domain will not be able to allocate the job. -func getJobRatioToFreeResources(tasksResources *resource_info.Resource, domain *DomainInfo) float64 { +func getJobRatioToFreeResources(tasksResources resource_info.ResourceVector, domain *DomainInfo) float64 { dominantResourceRatio := 0.0 - if tasksResources.LessEqual(resource_info.EmptyResource()) { + emptyVec := make(resource_info.ResourceVector, len(tasksResources)) + if tasksResources.LessEqual(emptyVec) { return dominantResourceRatio } - if tasksResources.GPUs() > 0 { - dominantResourceRatio = math.Max(dominantResourceRatio, - tasksResources.GPUs()/domain.IdleOrReleasingResources.GPUs()) - } - - tasksResourcesList := tasksResources.ToResourceList() - freeDomainResourcesList := domain.IdleOrReleasingResources.ToResourceList() - for rName, taskResourceQuantity := range tasksResourcesList { - if taskResourceQuantity.Value() == 0 { + podsIdx := resource_info.NewResourceVectorMap().GetIndex(string(v1.ResourcePods)) + for i := 0; i < len(tasksResources); i++ { + taskVal := tasksResources.Get(i) + if taskVal <= 0 { continue } - // Ignore pods resource for bin-packing behavior - if rName == v1.ResourcePods { + if i == podsIdx { continue } var resourceRatio float64 - freeDomainResourceQuantity := freeDomainResourcesList[rName] - if freeDomainResourceQuantity.Value() == 0 { - 
resourceRatio = requiredResourceNotInDomainRatio // required resource doesn't exist in the domain + freeVal := domain.IdleOrReleasingVector.Get(i) + if freeVal <= 0 { + resourceRatio = requiredResourceNotInDomainRatio } else { - resourceRatio = float64(taskResourceQuantity.Value()) / float64(freeDomainResourceQuantity.Value()) + resourceRatio = taskVal / freeVal } dominantResourceRatio = math.Max(dominantResourceRatio, resourceRatio) } diff --git a/pkg/scheduler/plugins/topology/job_filtering_test.go b/pkg/scheduler/plugins/topology/job_filtering_test.go index 646ae6992..d600a003e 100644 --- a/pkg/scheduler/plugins/topology/job_filtering_test.go +++ b/pkg/scheduler/plugins/topology/job_filtering_test.go @@ -28,6 +28,8 @@ import ( "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/test_utils/tasks_fake" ) +var testVectorMap = resource_info.NewResourceVectorMap() + func TestTopologyPlugin_subsetNodesFn(t *testing.T) { tests := []struct { name string @@ -86,7 +88,8 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { }, setupTopologyTree: func() *Info { tree := &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -98,24 +101,24 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { DomainsByLevel: map[DomainLevel]LevelDomainInfos{ "rack": { "rack1.zone1": { - ID: "rack1.zone1", - Level: "rack", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "rack1.zone1", + Level: "rack", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, "rack2.zone1": { - ID: "rack2.zone1", - Level: "rack", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "rack2.zone1", + Level: "rack", + Nodes: 
map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, "zone": { "zone1": { - ID: "zone1", - Level: "zone", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "zone1", + Level: "zone", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, }, @@ -188,7 +191,8 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { }, setupTopologyTree: func() *Info { tree := &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -200,24 +204,24 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { DomainsByLevel: map[DomainLevel]LevelDomainInfos{ "rack": { "rack1.zone1": { - ID: "rack1.zone1", - Level: "rack", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "rack1.zone1", + Level: "rack", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, "rack2.zone1": { - ID: "rack2.zone1", - Level: "rack", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "rack2.zone1", + Level: "rack", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, "zone": { "zone1": { - ID: "zone1", - Level: "zone", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "zone1", + Level: "zone", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, }, @@ -268,7 +272,8 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { }, 
setupTopologyTree: func() *Info { return &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -304,7 +309,8 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { }, setupTopologyTree: func() *Info { return &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -346,7 +352,8 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { }, setupTopologyTree: func() *Info { tree := &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -357,10 +364,10 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { DomainsByLevel: map[DomainLevel]LevelDomainInfos{ "zone": { "zone1": { - ID: "zone1", - Level: "zone", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "zone1", + Level: "zone", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, }, @@ -417,7 +424,8 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { }, setupTopologyTree: func() *Info { tree := &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -429,27 +437,27 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { DomainsByLevel: map[DomainLevel]LevelDomainInfos{ "rack": { "rack1.zone1": { - ID: "rack1.zone1", - Level: "rack", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: 
resource_info.NewResource(0, 0, 0), - AllocatablePods: allocatablePodsNotSet, + ID: "rack1.zone1", + Level: "rack", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), + AllocatablePods: allocatablePodsNotSet, }, "rack2.zone1": { - ID: "rack2.zone1", - Level: "rack", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), - AllocatablePods: allocatablePodsNotSet, + ID: "rack2.zone1", + Level: "rack", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), + AllocatablePods: allocatablePodsNotSet, }, }, "zone": { "zone1": { - ID: "zone1", - Level: "zone", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), - AllocatablePods: allocatablePodsNotSet, + ID: "zone1", + Level: "zone", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), + AllocatablePods: allocatablePodsNotSet, }, }, }, @@ -508,7 +516,8 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { }, setupTopologyTree: func() *Info { tree := &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -520,18 +529,18 @@ func TestTopologyPlugin_subsetNodesFn(t *testing.T) { DomainsByLevel: map[DomainLevel]LevelDomainInfos{ "rack": { "rack1.zone1": { - ID: "rack1.zone1", - Level: "rack", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "rack1.zone1", + Level: "rack", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, "zone": { "zone1": { - ID: "zone1", - Level: "zone", - Nodes: 
map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "zone1", + Level: "zone", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, }, @@ -659,7 +668,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -685,7 +695,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -710,7 +721,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -737,7 +749,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -759,7 +772,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -781,7 +795,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, 
), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -803,7 +818,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -828,7 +844,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -856,7 +873,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -883,7 +901,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -908,7 +927,8 @@ func TestTopologyPlugin_calculateRelevantDomainLevels(t *testing.T) { }, ), topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -988,27 +1008,28 @@ func TestTopologyPlugin_calcTreeAllocatable(t *testing.T) { }, }, }, + VectorMap: testVectorMap, 
DomainsByLevel: map[DomainLevel]LevelDomainInfos{ "rack": { "rack1.zone1": { - ID: "rack1.zone1", - Level: "rack", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "rack1.zone1", + Level: "rack", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, "rack2.zone1": { - ID: "rack2.zone1", - Level: "rack", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "rack2.zone1", + Level: "rack", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, "zone": { "zone1": { - ID: "zone1", - Level: "zone", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "zone1", + Level: "zone", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, }, @@ -1236,10 +1257,10 @@ func TestTopologyPlugin_calcTreeAllocatable(t *testing.T) { DomainsByLevel: map[DomainLevel]LevelDomainInfos{ "zone": { "zone1": { - ID: "zone1", - Level: "zone", - Nodes: map[string]*node_info.NodeInfo{}, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 0), + ID: "zone1", + Level: "zone", + Nodes: map[string]*node_info.NodeInfo{}, + IdleOrReleasingVector: resource_info.NewResource(0, 0, 0).ToVector(testVectorMap), }, }, }, @@ -1540,7 +1561,8 @@ func TestTopologyPlugin_getJobAllocatableDomains(t *testing.T) { RequiredLevel: "zone", }, topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -1604,7 +1626,8 @@ func TestTopologyPlugin_getJobAllocatableDomains(t *testing.T) { RequiredLevel: "zone", }, topologyTree: &Info{ - 
Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -1616,30 +1639,30 @@ func TestTopologyPlugin_getJobAllocatableDomains(t *testing.T) { DomainsByLevel: map[DomainLevel]LevelDomainInfos{ "rack": { "rack1.zone1": { - ID: "rack1.zone1", - Level: "rack", - IdleOrReleasingResources: resource_info.NewResource(500, 0, 0), // Insufficient resources - AllocatablePods: -1, + ID: "rack1.zone1", + Level: "rack", + IdleOrReleasingVector: resource_info.NewResource(500, 0, 0).ToVector(testVectorMap), // Insufficient resources + AllocatablePods: -1, }, "rack2.zone2": { - ID: "rack2.zone2", - Level: "rack", - IdleOrReleasingResources: resource_info.NewResource(600, 0, 0), // Insufficient resources - AllocatablePods: -1, + ID: "rack2.zone2", + Level: "rack", + IdleOrReleasingVector: resource_info.NewResource(600, 0, 0).ToVector(testVectorMap), // Insufficient resources + AllocatablePods: -1, }, }, "zone": { "zone1": { - ID: "zone1", - Level: "zone", - IdleOrReleasingResources: resource_info.NewResource(500, 0, 0), // Insufficient resources - AllocatablePods: -1, + ID: "zone1", + Level: "zone", + IdleOrReleasingVector: resource_info.NewResource(500, 0, 0).ToVector(testVectorMap), // Insufficient resources + AllocatablePods: -1, }, "zone2": { - ID: "zone2", - Level: "zone", - IdleOrReleasingResources: resource_info.NewResource(600, 0, 0), // Insufficient resources - AllocatablePods: -1, + ID: "zone2", + Level: "zone", + IdleOrReleasingVector: resource_info.NewResource(600, 0, 0).ToVector(testVectorMap), // Insufficient resources + AllocatablePods: -1, }, }, }, @@ -1700,7 +1723,8 @@ func TestTopologyPlugin_getJobAllocatableDomains(t *testing.T) { PreferredLevel: "rack", }, topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: 
&kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -1743,7 +1767,8 @@ func TestTopologyPlugin_getJobAllocatableDomains(t *testing.T) { PreferredLevel: "zone", }, topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -1817,7 +1842,8 @@ func TestTopologyPlugin_getJobAllocatableDomains(t *testing.T) { RequiredLevel: "zone", }, topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -1873,7 +1899,8 @@ func TestTopologyPlugin_getJobAllocatableDomains(t *testing.T) { RequiredLevel: "zone", }, topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -1945,7 +1972,8 @@ func TestTopologyPlugin_getJobAllocatableDomains(t *testing.T) { PreferredLevel: "rack", }, topologyTree: &Info{ - Name: "test-topology", + Name: "test-topology", + VectorMap: resource_info.NewResourceVectorMap(), TopologyResource: &kaiv1alpha1.Topology{ Spec: kaiv1alpha1.TopologySpec{ Levels: []kaiv1alpha1.TopologyLevel{ @@ -2095,7 +2123,7 @@ func TestTopologyPlugin_getJobAllocatableDomains(t *testing.T) { tasksCount := len(tasks) result, err := plugin.getJobAllocatableDomains(tt.job, &tt.job.RootSubGroupSet.SubGroupInfo, - tt.job.RootSubGroupSet.GetAllPodSets(), tasksResources, tasksCount, + tt.job.RootSubGroupSet.GetAllPodSets(), tasksResources.ToVector(testVectorMap), tasksCount, tt.topologyTree) // Check error diff --git a/pkg/scheduler/plugins/topology/node_scoring_test.go b/pkg/scheduler/plugins/topology/node_scoring_test.go index 
557f7b8c5..03e6b89a3 100644 --- a/pkg/scheduler/plugins/topology/node_scoring_test.go +++ b/pkg/scheduler/plugins/topology/node_scoring_test.go @@ -257,9 +257,10 @@ func TestCalculateNodeScores(t *testing.T) { } func TestSortTree(t *testing.T) { + vectorMap := resource_info.NewResourceVectorMap() tests := []struct { name string - tasksResources *resource_info.Resource + tasksResources resource_info.ResourceVector setupTree func() *DomainInfo maxDepthLevel DomainLevel expectedOrder []DomainID @@ -276,14 +277,14 @@ func TestSortTree(t *testing.T) { }, { name: "empty max depth level", - tasksResources: resource_info.NewResource(0, 0, 1), + tasksResources: resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap), setupTree: func() *DomainInfo { return &DomainInfo{ ID: "zone1", Level: "zone", Children: []*DomainInfo{ - {ID: "rack1", Level: "rack", IdleOrReleasingResources: resource_info.NewResource(0, 0, 1)}, - {ID: "rack2", Level: "rack", IdleOrReleasingResources: resource_info.NewResource(0, 0, 1)}, + {ID: "rack1", Level: "rack", IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap)}, + {ID: "rack2", Level: "rack", IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap)}, }, } }, @@ -292,15 +293,15 @@ func TestSortTree(t *testing.T) { }, { name: "sort single level - gpus", - tasksResources: resource_info.NewResource(0, 0, 1), + tasksResources: resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap), setupTree: func() *DomainInfo { return &DomainInfo{ ID: "zone1", Level: "zone", Children: []*DomainInfo{ - {ID: "rack3", Level: "rack", IdleOrReleasingResources: resource_info.NewResource(0, 0, 5)}, - {ID: "rack1", Level: "rack", IdleOrReleasingResources: resource_info.NewResource(0, 0, 2)}, - {ID: "rack2", Level: "rack", IdleOrReleasingResources: resource_info.NewResource(0, 0, 8)}, + {ID: "rack3", Level: "rack", IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 5, vectorMap)}, + 
{ID: "rack1", Level: "rack", IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 2, vectorMap)}, + {ID: "rack2", Level: "rack", IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 8, vectorMap)}, }, } }, @@ -310,15 +311,15 @@ func TestSortTree(t *testing.T) { }, { name: "sort single level - cpu", - tasksResources: resource_info.NewResource(1000, 0, 0), + tasksResources: resource_info.NewResourceVectorWithValues(1000, 0, 0, vectorMap), setupTree: func() *DomainInfo { return &DomainInfo{ ID: "zone1", Level: "zone", Children: []*DomainInfo{ - {ID: "rack3", Level: "rack", IdleOrReleasingResources: resource_info.NewResource(5000, 0, 0)}, - {ID: "rack1", Level: "rack", IdleOrReleasingResources: resource_info.NewResource(2000, 0, 0)}, - {ID: "rack2", Level: "rack", IdleOrReleasingResources: resource_info.NewResource(8000, 0, 0)}, + {ID: "rack3", Level: "rack", IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(5000, 0, 0, vectorMap)}, + {ID: "rack1", Level: "rack", IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(2000, 0, 0, vectorMap)}, + {ID: "rack2", Level: "rack", IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(8000, 0, 0, vectorMap)}, }, } }, @@ -328,15 +329,15 @@ func TestSortTree(t *testing.T) { }, { name: "sort single level - several dominant resources", - tasksResources: resource_info.NewResource(1000, 0, 1), + tasksResources: resource_info.NewResourceVectorWithValues(1000, 0, 1, vectorMap), setupTree: func() *DomainInfo { return &DomainInfo{ ID: "zone1", Level: "zone", Children: []*DomainInfo{ - {ID: "rack3", Level: "rack", AllocatablePods: 5, IdleOrReleasingResources: resource_info.NewResource(100000, 0, 5)}, - {ID: "rack1", Level: "rack", AllocatablePods: 2, IdleOrReleasingResources: resource_info.NewResource(2000, 0, 4)}, - {ID: "rack2", Level: "rack", AllocatablePods: 8, IdleOrReleasingResources: resource_info.NewResource(100000, 0, 8)}, + {ID: "rack3", Level: "rack", 
AllocatablePods: 5, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(100000, 0, 5, vectorMap)}, + {ID: "rack1", Level: "rack", AllocatablePods: 2, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(2000, 0, 4, vectorMap)}, + {ID: "rack2", Level: "rack", AllocatablePods: 8, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(100000, 0, 8, vectorMap)}, }, } }, @@ -346,30 +347,30 @@ func TestSortTree(t *testing.T) { }, { name: "sort stops at max depth level", - tasksResources: resource_info.NewResource(0, 0, 1), + tasksResources: resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap), setupTree: func() *DomainInfo { return &DomainInfo{ ID: "region1", Level: "region", Children: []*DomainInfo{ { - ID: "zone2", - Level: "zone", - AllocatablePods: 10, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 10), + ID: "zone2", + Level: "zone", + AllocatablePods: 10, + IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 10, vectorMap), Children: []*DomainInfo{ - {ID: "rack4", Level: "rack", AllocatablePods: 3, IdleOrReleasingResources: resource_info.NewResource(0, 0, 3)}, - {ID: "rack3", Level: "rack", AllocatablePods: 7, IdleOrReleasingResources: resource_info.NewResource(0, 0, 7)}, + {ID: "rack4", Level: "rack", AllocatablePods: 3, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 3, vectorMap)}, + {ID: "rack3", Level: "rack", AllocatablePods: 7, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 7, vectorMap)}, }, }, { - ID: "zone1", - Level: "zone", - AllocatablePods: 5, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 5), + ID: "zone1", + Level: "zone", + AllocatablePods: 5, + IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 5, vectorMap), Children: []*DomainInfo{ - {ID: "rack2", Level: "rack", AllocatablePods: 1, IdleOrReleasingResources: resource_info.NewResource(0, 0, 1)}, - {ID: "rack1", Level: "rack", AllocatablePods: 
4, IdleOrReleasingResources: resource_info.NewResource(0, 0, 4)}, + {ID: "rack2", Level: "rack", AllocatablePods: 1, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap)}, + {ID: "rack1", Level: "rack", AllocatablePods: 4, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 4, vectorMap)}, }, }, }, @@ -381,30 +382,30 @@ func TestSortTree(t *testing.T) { }, { name: "sort nested hierarchy to rack level", - tasksResources: resource_info.NewResource(0, 0, 1), + tasksResources: resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap), setupTree: func() *DomainInfo { return &DomainInfo{ ID: "region1", Level: "region", Children: []*DomainInfo{ { - ID: "zone1", - Level: "zone", - AllocatablePods: 15, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 15), + ID: "zone1", + Level: "zone", + AllocatablePods: 15, + IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 15, vectorMap), Children: []*DomainInfo{ - {ID: "rack2", Level: "rack", AllocatablePods: 10, IdleOrReleasingResources: resource_info.NewResource(0, 0, 10)}, - {ID: "rack1", Level: "rack", AllocatablePods: 5, IdleOrReleasingResources: resource_info.NewResource(0, 0, 5)}, + {ID: "rack2", Level: "rack", AllocatablePods: 10, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 10, vectorMap)}, + {ID: "rack1", Level: "rack", AllocatablePods: 5, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 5, vectorMap)}, }, }, { - ID: "zone2", - Level: "zone", - AllocatablePods: 8, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 8), + ID: "zone2", + Level: "zone", + AllocatablePods: 8, + IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 8, vectorMap), Children: []*DomainInfo{ - {ID: "rack4", Level: "rack", AllocatablePods: 3, IdleOrReleasingResources: resource_info.NewResource(0, 0, 3)}, - {ID: "rack3", Level: "rack", AllocatablePods: 5, IdleOrReleasingResources: 
resource_info.NewResource(0, 0, 5)}, + {ID: "rack4", Level: "rack", AllocatablePods: 3, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 3, vectorMap)}, + {ID: "rack3", Level: "rack", AllocatablePods: 5, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 5, vectorMap)}, }, }, }, @@ -416,15 +417,15 @@ func TestSortTree(t *testing.T) { }, { name: "stable sort - same allocatable pods maintain order", - tasksResources: resource_info.NewResource(0, 0, 1), + tasksResources: resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap), setupTree: func() *DomainInfo { return &DomainInfo{ ID: "zone1", Level: "zone", Children: []*DomainInfo{ - {ID: "rack1", Level: "rack", AllocatablePods: 5, IdleOrReleasingResources: resource_info.NewResource(0, 0, 5)}, - {ID: "rack2", Level: "rack", AllocatablePods: 5, IdleOrReleasingResources: resource_info.NewResource(0, 0, 5)}, - {ID: "rack3", Level: "rack", AllocatablePods: 5, IdleOrReleasingResources: resource_info.NewResource(0, 0, 5)}, + {ID: "rack1", Level: "rack", AllocatablePods: 5, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 5, vectorMap)}, + {ID: "rack2", Level: "rack", AllocatablePods: 5, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 5, vectorMap)}, + {ID: "rack3", Level: "rack", AllocatablePods: 5, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 5, vectorMap)}, }, } }, @@ -461,29 +462,30 @@ func TestSortTree(t *testing.T) { func TestSortTree_NestedSorting(t *testing.T) { // Test that both parent and child levels are sorted when maxDepth is deep enough - tasksResources := resource_info.NewResource(0, 0, 1) + vectorMap := resource_info.NewResourceVectorMap() + tasksResources := resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap) root := &DomainInfo{ ID: "region1", Level: "region", Children: []*DomainInfo{ { - ID: "zone2", - Level: "zone", - AllocatablePods: 10, - IdleOrReleasingResources: 
resource_info.NewResource(0, 0, 10), + ID: "zone2", + Level: "zone", + AllocatablePods: 10, + IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 10, vectorMap), Children: []*DomainInfo{ - {ID: "rack4", Level: "rack", AllocatablePods: 7, IdleOrReleasingResources: resource_info.NewResource(0, 0, 7)}, - {ID: "rack3", Level: "rack", AllocatablePods: 3, IdleOrReleasingResources: resource_info.NewResource(0, 0, 3)}, + {ID: "rack4", Level: "rack", AllocatablePods: 7, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 7, vectorMap)}, + {ID: "rack3", Level: "rack", AllocatablePods: 3, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 3, vectorMap)}, }, }, { - ID: "zone1", - Level: "zone", - AllocatablePods: 5, - IdleOrReleasingResources: resource_info.NewResource(0, 0, 5), + ID: "zone1", + Level: "zone", + AllocatablePods: 5, + IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 5, vectorMap), Children: []*DomainInfo{ - {ID: "rack2", Level: "rack", AllocatablePods: 4, IdleOrReleasingResources: resource_info.NewResource(0, 0, 4)}, - {ID: "rack1", Level: "rack", AllocatablePods: 1, IdleOrReleasingResources: resource_info.NewResource(0, 0, 1)}, + {ID: "rack2", Level: "rack", AllocatablePods: 4, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 4, vectorMap)}, + {ID: "rack1", Level: "rack", AllocatablePods: 1, IdleOrReleasingVector: resource_info.NewResourceVectorWithValues(0, 0, 1, vectorMap)}, }, }, }, diff --git a/pkg/scheduler/plugins/topology/topology_plugin.go b/pkg/scheduler/plugins/topology/topology_plugin.go index ff27a3eda..d3dc10279 100644 --- a/pkg/scheduler/plugins/topology/topology_plugin.go +++ b/pkg/scheduler/plugins/topology/topology_plugin.go @@ -8,6 +8,7 @@ import ( "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/node_info" "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/podgroup_info" + "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/resource_info" 
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/framework" ) @@ -55,6 +56,13 @@ func (t *topologyPlugin) preJobAllocationFn(_ *podgroup_info.PodGroupInfo) { } func (t *topologyPlugin) initializeTopologyTree(topologies []*kaiv1alpha1.Topology, nodes map[string]*node_info.NodeInfo) { + // Get VectorMap from any node (all share the same map) + var sharedVectorMap *resource_info.ResourceVectorMap + for _, nodeInfo := range nodes { + sharedVectorMap = nodeInfo.VectorMap + break + } + for _, topology := range topologies { topologyTree := &Info{ Name: topology.Name, @@ -64,6 +72,7 @@ func (t *topologyPlugin) initializeTopologyTree(topologies []*kaiv1alpha1.Topolo }, }, TopologyResource: topology, + VectorMap: sharedVectorMap, } for _, nodeInfo := range nodes { diff --git a/pkg/scheduler/plugins/topology/topology_structs.go b/pkg/scheduler/plugins/topology/topology_structs.go index 02cc8ca7d..5432efffc 100644 --- a/pkg/scheduler/plugins/topology/topology_structs.go +++ b/pkg/scheduler/plugins/topology/topology_structs.go @@ -35,6 +35,9 @@ type Info struct { // Topology resource TopologyResource *kaiv1alpha1.Topology + + // VectorMap shared across all nodes + VectorMap *resource_info.ResourceVectorMap } // DomainInfo represents a node in the topology tree @@ -55,17 +58,16 @@ type DomainInfo struct { AllocatablePods int // Total available resources in this domain - IdleOrReleasingResources *resource_info.Resource + IdleOrReleasingVector resource_info.ResourceVector } func NewDomainInfo(id DomainID, level DomainLevel) *DomainInfo { return &DomainInfo{ - ID: id, - Level: level, - Children: []*DomainInfo{}, - Nodes: map[string]*node_info.NodeInfo{}, - AllocatablePods: allocatablePodsNotSet, - IdleOrReleasingResources: resource_info.EmptyResource(), + ID: id, + Level: level, + Children: []*DomainInfo{}, + Nodes: map[string]*node_info.NodeInfo{}, + AllocatablePods: allocatablePodsNotSet, } }