diff --git a/pkg/scheduler/plugins/deviceshare/device_allocator_test.go b/pkg/scheduler/plugins/deviceshare/device_allocator_test.go
index c5924af5f..8efd31078 100644
--- a/pkg/scheduler/plugins/deviceshare/device_allocator_test.go
+++ b/pkg/scheduler/plugins/deviceshare/device_allocator_test.go
@@ -973,6 +973,7 @@ func TestAutopilotAllocator(t *testing.T) {
 			}
 			sortDeviceAllocations(allocations)
 			sortDeviceAllocations(tt.want)
+			fillGPUTotalMem(allocations, nodeDevice)
 			assert.Equal(t, tt.want, allocations)
 		})
 	}
@@ -1931,6 +1932,7 @@ func TestAutopilotAllocatorWithExclusivePolicyAndRequiredScope(t *testing.T) {
 			}
 			sortDeviceAllocations(allocations)
 			sortDeviceAllocations(tt.want)
+			fillGPUTotalMem(allocations, nodeDevice)
 			assert.Equal(t, tt.want, allocations)
 		})
 	}
@@ -2117,6 +2119,8 @@ func Test_allocateGPUWithLeastAllocatedScorer(t *testing.T) {
 		scorer: allocationScorer,
 	}
 	allocateResult, status := allocator.Allocate(nil, nil, nil, nil)
+	err := fillGPUTotalMem(allocateResult, nd)
+	assert.NoError(t, err)
 	assert.True(t, status.IsSuccess())
 	expectAllocations := []*apiext.DeviceAllocation{
 		{
@@ -2226,6 +2230,7 @@ func Test_nodeDevice_allocateGPUWithMostAllocatedScorer(t *testing.T) {
 			},
 		},
 	}
+	fillGPUTotalMem(allocateResult, nd)
 	assert.True(t, equality.Semantic.DeepEqual(expectAllocations, allocateResult[schedulingv1alpha1.GPU]))
 }
diff --git a/pkg/scheduler/plugins/deviceshare/devicehandler_gpu.go b/pkg/scheduler/plugins/deviceshare/devicehandler_gpu.go
index f322da3e0..41cecf658 100644
--- a/pkg/scheduler/plugins/deviceshare/devicehandler_gpu.go
+++ b/pkg/scheduler/plugins/deviceshare/devicehandler_gpu.go
@@ -44,55 +44,58 @@ func (h *GPUHandler) CalcDesiredRequestsAndCount(node *corev1.Node, pod *corev1.
 	}
 	podRequests = podRequests.DeepCopy()
-	if err := fillGPUTotalMem(totalDevice, podRequests); err != nil {
-		return nil, 0, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
-	}
 	requests := podRequests
 	desiredCount := int64(1)
 
 	gpuShare, ok := podRequests[apiext.ResourceGPUShared]
-	gpuCore, gpuMem, gpuMemoryRatio := podRequests[apiext.ResourceGPUCore], podRequests[apiext.ResourceGPUMemory], podRequests[apiext.ResourceGPUMemoryRatio]
+	gpuCore, coreExists := podRequests[apiext.ResourceGPUCore]
+	gpuMemoryRatio, memoryRatioExists := podRequests[apiext.ResourceGPUMemoryRatio]
 	// gpu share mode
 	if ok && gpuShare.Value() > 0 {
 		desiredCount = gpuShare.Value()
 	} else {
-		if gpuMemoryRatio.Value() > 100 && gpuMemoryRatio.Value()%100 == 0 {
+		if memoryRatioExists && gpuMemoryRatio.Value() > 100 && gpuMemoryRatio.Value()%100 == 0 {
 			desiredCount = gpuMemoryRatio.Value() / 100
 		}
 	}
 
 	if desiredCount > 1 {
-		requests = corev1.ResourceList{
-			apiext.ResourceGPUCore:        *resource.NewQuantity(gpuCore.Value()/desiredCount, resource.DecimalSI),
-			apiext.ResourceGPUMemory:      *resource.NewQuantity(gpuMem.Value()/desiredCount, resource.BinarySI),
-			apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(gpuMemoryRatio.Value()/desiredCount, resource.DecimalSI),
+		requests = corev1.ResourceList{}
+		if coreExists {
+			requests[apiext.ResourceGPUCore] = *resource.NewQuantity(gpuCore.Value()/desiredCount, resource.DecimalSI)
+		}
+		if memoryRatioExists {
+			requests[apiext.ResourceGPUMemoryRatio] = *resource.NewQuantity(gpuMemoryRatio.Value()/desiredCount, resource.DecimalSI)
+		} else if gpuMem, memExists := podRequests[apiext.ResourceGPUMemory]; memExists {
+			requests[apiext.ResourceGPUMemory] = *resource.NewQuantity(gpuMem.Value()/desiredCount, resource.BinarySI)
+		}
 		}
 	}
 	return requests, int(desiredCount), nil
 }
 
-func fillGPUTotalMem(nodeDeviceTotal deviceResources, podRequest corev1.ResourceList) error {
-	// nodeDeviceTotal uses the minor of GPU as key. However, under certain circumstances,
-	// minor 0 might not exist. We need to iterate the cache once to find the active minor.
-	var total corev1.ResourceList
-	for _, resources := range nodeDeviceTotal {
-		if len(resources) > 0 && !quotav1.IsZero(resources) {
-			total = resources
-			break
-		}
+func fillGPUTotalMem(allocations apiext.DeviceAllocations, nodeDeviceInfo *nodeDevice) error {
+	gpuAllocations, ok := allocations[schedulingv1alpha1.GPU]
+	if !ok {
+		return nil
 	}
-	if total == nil {
-		return fmt.Errorf("no healthy GPU Devices")
+	gpuTotalDevices, ok := nodeDeviceInfo.deviceTotal[schedulingv1alpha1.GPU]
+	if !ok {
+		return nil
 	}
-	// a node can only contain one type of GPU, so each of them has the same total memory.
-	if gpuMem, ok := podRequest[apiext.ResourceGPUMemory]; ok {
-		podRequest[apiext.ResourceGPUMemoryRatio] = memoryBytesToRatio(gpuMem, total[apiext.ResourceGPUMemory])
-	} else {
-		gpuMemRatio := podRequest[apiext.ResourceGPUMemoryRatio]
-		podRequest[apiext.ResourceGPUMemory] = memoryRatioToBytes(gpuMemRatio, total[apiext.ResourceGPUMemory])
+	for i, allocation := range gpuAllocations {
+		gpuDevice, ok := gpuTotalDevices[int(allocation.Minor)]
+		if !ok || gpuDevice == nil || quotav1.IsZero(gpuDevice) {
+			return fmt.Errorf("no healthy gpu device with minor %d of allocation", allocation.Minor)
+		}
+		if gpuMem, ok := allocation.Resources[apiext.ResourceGPUMemory]; ok {
+			gpuAllocations[i].Resources[apiext.ResourceGPUMemoryRatio] = memoryBytesToRatio(gpuMem, gpuDevice[apiext.ResourceGPUMemory])
+		} else {
+			gpuMemRatio := allocation.Resources[apiext.ResourceGPUMemoryRatio]
+			gpuAllocations[i].Resources[apiext.ResourceGPUMemory] = memoryRatioToBytes(gpuMemRatio, gpuDevice[apiext.ResourceGPUMemory])
+		}
 	}
 	return nil
 }
diff --git a/pkg/scheduler/plugins/deviceshare/devicehandler_gpu_test.go b/pkg/scheduler/plugins/deviceshare/devicehandler_gpu_test.go
index 4816a0187..025dc75c2 100644
--- a/pkg/scheduler/plugins/deviceshare/devicehandler_gpu_test.go
+++ b/pkg/scheduler/plugins/deviceshare/devicehandler_gpu_test.go
@@ -24,77 +24,132 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 
 	apiext "github.com/koordinator-sh/koordinator/apis/extension"
+	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
 )
 
 func Test_fillGPUTotalMem(t *testing.T) {
 	tests := []struct {
-		name           string
-		gpuTotal       deviceResources
-		podRequest     corev1.ResourceList
-		wantPodRequest corev1.ResourceList
-		wantErr        bool
+		name            string
+		allocations     apiext.DeviceAllocations
+		nodeDeviceInfo  *nodeDevice
+		wantAllocations apiext.DeviceAllocations
+		wantErr         bool
 	}{
 		{
 			name: "ratio to mem",
-			gpuTotal: deviceResources{
-				0: corev1.ResourceList{
-					apiext.ResourceGPUCore:        resource.MustParse("100"),
-					apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
-					apiext.ResourceGPUMemory:      resource.MustParse("32Gi"),
+			allocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
+				schedulingv1alpha1.GPU: {
+					{
+						Minor: 0,
+						Resources: corev1.ResourceList{
+							apiext.ResourceGPUCore:        resource.MustParse("50"),
+							apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
+						},
+					},
 				},
 			},
-			podRequest: corev1.ResourceList{
-				apiext.ResourceGPUCore:        resource.MustParse("50"),
-				apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
+			nodeDeviceInfo: &nodeDevice{
+				deviceTotal: map[schedulingv1alpha1.DeviceType]deviceResources{
+					schedulingv1alpha1.GPU: {
+						0: corev1.ResourceList{
+							apiext.ResourceGPUCore:        resource.MustParse("100"),
+							apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
+							apiext.ResourceGPUMemory:      resource.MustParse("32Gi"),
+						},
+					},
+				},
 			},
-			wantPodRequest: corev1.ResourceList{
-				apiext.ResourceGPUCore:        resource.MustParse("50"),
-				apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
-				apiext.ResourceGPUMemory:      resource.MustParse("16Gi"),
+			wantAllocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
+				schedulingv1alpha1.GPU: {
+					{
+						Minor: 0,
+						Resources: corev1.ResourceList{
+							apiext.ResourceGPUCore:        resource.MustParse("50"),
+							apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
+							apiext.ResourceGPUMemory:      resource.MustParse("16Gi"),
+						},
+					},
+				},
 			},
 		},
 		{
 			name: "mem to ratio",
-			gpuTotal: deviceResources{
-				0: corev1.ResourceList{
-					apiext.ResourceGPUCore:        resource.MustParse("100"),
-					apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
-					apiext.ResourceGPUMemory:      resource.MustParse("32Gi"),
+			allocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
+				schedulingv1alpha1.GPU: {
+					{
+						Minor: 0,
+						Resources: corev1.ResourceList{
+							apiext.ResourceGPUCore:   resource.MustParse("50"),
+							apiext.ResourceGPUMemory: resource.MustParse("16Gi"),
+						},
+					},
 				},
 			},
-			podRequest: corev1.ResourceList{
-				apiext.ResourceGPUCore:   resource.MustParse("50"),
-				apiext.ResourceGPUMemory: resource.MustParse("16Gi"),
+			nodeDeviceInfo: &nodeDevice{
+				deviceTotal: map[schedulingv1alpha1.DeviceType]deviceResources{
+					schedulingv1alpha1.GPU: {
+						0: corev1.ResourceList{
+							apiext.ResourceGPUCore:        resource.MustParse("100"),
+							apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
+							apiext.ResourceGPUMemory:      resource.MustParse("32Gi"),
+						},
+					},
+				},
 			},
-			wantPodRequest: corev1.ResourceList{
-				apiext.ResourceGPUCore:        resource.MustParse("50"),
-				apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(50, resource.DecimalSI),
-				apiext.ResourceGPUMemory:      resource.MustParse("16Gi"),
+			wantAllocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
+				schedulingv1alpha1.GPU: {
+					{
+						Minor: 0,
+						Resources: corev1.ResourceList{
+							apiext.ResourceGPUCore:        resource.MustParse("50"),
+							apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(50, resource.DecimalSI),
+							apiext.ResourceGPUMemory:      resource.MustParse("16Gi"),
+						},
+					},
+				},
 			},
 		},
 		{
 			name: "missing total",
-			gpuTotal: deviceResources{
-				0: corev1.ResourceList{},
+			allocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
+				schedulingv1alpha1.GPU: {
+					{
+						Minor: 0,
+						Resources: corev1.ResourceList{
+							apiext.ResourceGPUCore:        resource.MustParse("50"),
+							apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
+						},
+					},
+				},
 			},
-			podRequest: corev1.ResourceList{
-				apiext.ResourceGPUCore:        resource.MustParse("50"),
-				apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
+			nodeDeviceInfo: &nodeDevice{
+				deviceTotal: map[schedulingv1alpha1.DeviceType]deviceResources{
+					schedulingv1alpha1.GPU: {
+						0: corev1.ResourceList{},
+					},
+				},
 			},
-			wantPodRequest: corev1.ResourceList{
-				apiext.ResourceGPUCore:        resource.MustParse("50"),
-				apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
+			wantAllocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
+				schedulingv1alpha1.GPU: {
+					{
+						Minor: 0,
+						Resources: corev1.ResourceList{
+							apiext.ResourceGPUCore:        resource.MustParse("50"),
+							apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
+						},
+					},
+				},
 			},
 			wantErr: true,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			err := fillGPUTotalMem(tt.gpuTotal, tt.podRequest)
+			err := fillGPUTotalMem(tt.allocations, tt.nodeDeviceInfo)
 			if tt.wantErr != (err != nil) {
 				t.Errorf("wantErr %v but got %v", tt.wantErr, err != nil)
 			}
-			assert.Equal(t, tt.wantPodRequest, tt.podRequest)
+			assert.Equal(t, tt.wantAllocations, tt.allocations)
 		})
 	}
 }
diff --git a/pkg/scheduler/plugins/deviceshare/plugin.go b/pkg/scheduler/plugins/deviceshare/plugin.go
index 15d775e8f..7c82fa1e7 100644
--- a/pkg/scheduler/plugins/deviceshare/plugin.go
+++ b/pkg/scheduler/plugins/deviceshare/plugin.go
@@ -449,6 +449,10 @@ func (p *Plugin) Reserve(ctx context.Context, cycleState *framework.CycleState,
 			return status
 		}
 	}
+	err = fillGPUTotalMem(result, nodeDeviceInfo)
+	if err != nil {
+		return framework.AsStatus(err)
+	}
 	nodeDeviceInfo.updateCacheUsed(result, pod, true)
 	state.allocationResult = result
 	return nil
diff --git a/pkg/scheduler/plugins/deviceshare/reservation_test.go b/pkg/scheduler/plugins/deviceshare/reservation_test.go
index 0082ed665..dc2c637ca 100644
--- a/pkg/scheduler/plugins/deviceshare/reservation_test.go
+++ b/pkg/scheduler/plugins/deviceshare/reservation_test.go
@@ -687,6 +687,8 @@ func Test_tryAllocateFromReservation(t *testing.T) {
 				basicPreemptible,
 				tt.requiredFromReservation,
 			)
+			err := fillGPUTotalMem(result, nodeDeviceInfo)
+			assert.NoError(t, err)
 			assert.Equal(t, tt.wantStatus, status)
 			assert.Equal(t, tt.wantResult, result)
 		})
diff --git a/pkg/scheduler/plugins/deviceshare/topology_hint_test.go b/pkg/scheduler/plugins/deviceshare/topology_hint_test.go
index ff5f3d619..d85848dc7 100644
--- a/pkg/scheduler/plugins/deviceshare/topology_hint_test.go
+++ b/pkg/scheduler/plugins/deviceshare/topology_hint_test.go
@@ -72,11 +72,6 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
 					{NUMANodeAffinity: newBitMask(1), Preferred: true},
 					{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
 				},
-				string(apiext.ResourceGPUMemory): {
-					{NUMANodeAffinity: newBitMask(0), Preferred: true},
-					{NUMANodeAffinity: newBitMask(1), Preferred: true},
-					{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
-				},
 				string(apiext.ResourceGPUMemoryRatio): {
 					{NUMANodeAffinity: newBitMask(0), Preferred: true},
 					{NUMANodeAffinity: newBitMask(1), Preferred: true},
 					{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
@@ -110,10 +105,6 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
 					{NUMANodeAffinity: newBitMask(1), Preferred: true},
 					{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
 				},
-				string(apiext.ResourceGPUMemory): {
-					{NUMANodeAffinity: newBitMask(1), Preferred: true},
-					{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
-				},
 				string(apiext.ResourceGPUMemoryRatio): {
 					{NUMANodeAffinity: newBitMask(1), Preferred: true},
 					{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
@@ -205,11 +196,6 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
 					{NUMANodeAffinity: newBitMask(1), Preferred: true},
 					{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
 				},
-				string(apiext.ResourceGPUMemory): {
-					{NUMANodeAffinity: newBitMask(0), Preferred: true},
-					{NUMANodeAffinity: newBitMask(1), Preferred: true},
-					{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
-				},
 				string(apiext.ResourceGPUMemoryRatio): {
 					{NUMANodeAffinity: newBitMask(0), Preferred: true},
 					{NUMANodeAffinity: newBitMask(1), Preferred: true},
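Note (outside the patch): the value that fillGPUTotalMem now fills in per allocated GPU minor is just the ratio <-> bytes conversion against that device's total memory. The sketch below illustrates the arithmetic only; ratioToBytes/bytesToRatio and their bodies are assumptions standing in for the repo's memoryRatioToBytes/memoryBytesToRatio helpers, which this diff calls but does not show.

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// ratioToBytes converts a GPU memory ratio (percent of one device) into bytes,
// given that device's total memory. Hypothetical stand-in for memoryRatioToBytes.
func ratioToBytes(ratio, totalMemory resource.Quantity) resource.Quantity {
	return *resource.NewQuantity(ratio.Value()*totalMemory.Value()/100, resource.BinarySI)
}

// bytesToRatio converts an absolute GPU memory quantity into a percentage of
// that device's total memory. Hypothetical stand-in for memoryBytesToRatio.
func bytesToRatio(memory, totalMemory resource.Quantity) resource.Quantity {
	return *resource.NewQuantity(memory.Value()*100/totalMemory.Value(), resource.DecimalSI)
}

func main() {
	// Total memory of the GPU minor the allocation landed on, matching the
	// "ratio to mem" / "mem to ratio" test cases above.
	total := resource.MustParse("32Gi")

	// An allocation carrying only gpu-memory-ratio: 50 gets gpu-memory: 16Gi filled in.
	mem := ratioToBytes(resource.MustParse("50"), total)
	fmt.Println(mem.String()) // 16Gi

	// An allocation carrying only gpu-memory: 16Gi gets gpu-memory-ratio: 50 filled in.
	ratio := bytesToRatio(resource.MustParse("16Gi"), total)
	fmt.Println(ratio.String()) // 50
}
```

Because the lookup is now keyed by the allocation's actual minor at Reserve time, the old assumptions that minor 0 exists and that every GPU on a node has the same total memory are no longer needed, and gpu-memory is derived after allocation rather than during request calculation, which is consistent with the topology-hint test dropping its expected hints for koordinator.sh/gpu-memory.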