Skip to content

Commit

Permalink
scheduler: support devices of the same node gpuMem not equal (#2199)
Browse files (browse the repository at this point in the history)
Signed-off-by: wangjianyu.wjy <wangjianyu.wjy@alibaba-inc.com>
Co-authored-by: wangjianyu.wjy <wangjianyu.wjy@alibaba-inc.com>
  • Loading branch information
ZiMengSheng and wangjianyu.wjy authored Sep 14, 2024
1 parent 4f16162 commit beab44e
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 79 deletions.
5 changes: 5 additions & 0 deletions pkg/scheduler/plugins/deviceshare/device_allocator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,7 @@ func TestAutopilotAllocator(t *testing.T) {
}
sortDeviceAllocations(allocations)
sortDeviceAllocations(tt.want)
fillGPUTotalMem(allocations, nodeDevice)
assert.Equal(t, tt.want, allocations)
})
}
Expand Down Expand Up @@ -1931,6 +1932,7 @@ func TestAutopilotAllocatorWithExclusivePolicyAndRequiredScope(t *testing.T) {
}
sortDeviceAllocations(allocations)
sortDeviceAllocations(tt.want)
fillGPUTotalMem(allocations, nodeDevice)
assert.Equal(t, tt.want, allocations)
})
}
Expand Down Expand Up @@ -2117,6 +2119,8 @@ func Test_allocateGPUWithLeastAllocatedScorer(t *testing.T) {
scorer: allocationScorer,
}
allocateResult, status := allocator.Allocate(nil, nil, nil, nil)
err := fillGPUTotalMem(allocateResult, nd)
assert.NoError(t, err)
assert.True(t, status.IsSuccess())
expectAllocations := []*apiext.DeviceAllocation{
{
Expand Down Expand Up @@ -2226,6 +2230,7 @@ func Test_nodeDevice_allocateGPUWithMostAllocatedScorer(t *testing.T) {
},
},
}
fillGPUTotalMem(allocateResult, nd)
assert.True(t, equality.Semantic.DeepEqual(expectAllocations, allocateResult[schedulingv1alpha1.GPU]))
}

Expand Down
55 changes: 29 additions & 26 deletions pkg/scheduler/plugins/deviceshare/devicehandler_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,55 +44,58 @@ func (h *GPUHandler) CalcDesiredRequestsAndCount(node *corev1.Node, pod *corev1.
}

podRequests = podRequests.DeepCopy()
if err := fillGPUTotalMem(totalDevice, podRequests); err != nil {
return nil, 0, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
}

requests := podRequests
desiredCount := int64(1)

gpuShare, ok := podRequests[apiext.ResourceGPUShared]
gpuCore, gpuMem, gpuMemoryRatio := podRequests[apiext.ResourceGPUCore], podRequests[apiext.ResourceGPUMemory], podRequests[apiext.ResourceGPUMemoryRatio]
gpuCore, coreExists := podRequests[apiext.ResourceGPUCore]
gpuMemoryRatio, memoryRatioExists := podRequests[apiext.ResourceGPUMemoryRatio]
// gpu share mode
if ok && gpuShare.Value() > 0 {
desiredCount = gpuShare.Value()
} else {
if gpuMemoryRatio.Value() > 100 && gpuMemoryRatio.Value()%100 == 0 {
if memoryRatioExists && gpuMemoryRatio.Value() > 100 && gpuMemoryRatio.Value()%100 == 0 {
desiredCount = gpuMemoryRatio.Value() / 100
}
}

if desiredCount > 1 {
requests = corev1.ResourceList{
apiext.ResourceGPUCore: *resource.NewQuantity(gpuCore.Value()/desiredCount, resource.DecimalSI),
apiext.ResourceGPUMemory: *resource.NewQuantity(gpuMem.Value()/desiredCount, resource.BinarySI),
apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(gpuMemoryRatio.Value()/desiredCount, resource.DecimalSI),
requests = corev1.ResourceList{}
if coreExists {
requests[apiext.ResourceGPUCore] = *resource.NewQuantity(gpuCore.Value()/desiredCount, resource.DecimalSI)
}
if memoryRatioExists {
requests[apiext.ResourceGPUMemoryRatio] = *resource.NewQuantity(gpuMemoryRatio.Value()/desiredCount, resource.DecimalSI)
} else if gpuMem, memExists := podRequests[apiext.ResourceGPUMemory]; memExists {
requests[apiext.ResourceGPUMemory] = *resource.NewQuantity(gpuMem.Value()/desiredCount, resource.BinarySI)
}
}

return requests, int(desiredCount), nil
}

func fillGPUTotalMem(nodeDeviceTotal deviceResources, podRequest corev1.ResourceList) error {
// nodeDeviceTotal uses the minor of GPU as key. However, under certain circumstances,
// minor 0 might not exist. We need to iterate the cache once to find the active minor.
var total corev1.ResourceList
for _, resources := range nodeDeviceTotal {
if len(resources) > 0 && !quotav1.IsZero(resources) {
total = resources
break
}
func fillGPUTotalMem(allocations apiext.DeviceAllocations, nodeDeviceInfo *nodeDevice) error {
gpuAllocations, ok := allocations[schedulingv1alpha1.GPU]
if !ok {
return nil
}
if total == nil {
return fmt.Errorf("no healthy GPU Devices")
gpuTotalDevices, ok := nodeDeviceInfo.deviceTotal[schedulingv1alpha1.GPU]
if !ok {
return nil
}

// a node can only contain one type of GPU, so each of them has the same total memory.
if gpuMem, ok := podRequest[apiext.ResourceGPUMemory]; ok {
podRequest[apiext.ResourceGPUMemoryRatio] = memoryBytesToRatio(gpuMem, total[apiext.ResourceGPUMemory])
} else {
gpuMemRatio := podRequest[apiext.ResourceGPUMemoryRatio]
podRequest[apiext.ResourceGPUMemory] = memoryRatioToBytes(gpuMemRatio, total[apiext.ResourceGPUMemory])
for i, allocation := range gpuAllocations {
gpuDevice, ok := gpuTotalDevices[int(allocation.Minor)]
if !ok || gpuDevice == nil || quotav1.IsZero(gpuDevice) {
return fmt.Errorf("no healthy gpu device with minor %d of allocation", allocation.Minor)
}
if gpuMem, ok := allocation.Resources[apiext.ResourceGPUMemory]; ok {
gpuAllocations[i].Resources[apiext.ResourceGPUMemoryRatio] = memoryBytesToRatio(gpuMem, gpuDevice[apiext.ResourceGPUMemory])
} else {
gpuMemRatio := allocation.Resources[apiext.ResourceGPUMemoryRatio]
gpuAllocations[i].Resources[apiext.ResourceGPUMemory] = memoryRatioToBytes(gpuMemRatio, gpuDevice[apiext.ResourceGPUMemory])
}
}
return nil
}
Expand Down
133 changes: 94 additions & 39 deletions pkg/scheduler/plugins/deviceshare/devicehandler_gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,77 +24,132 @@ import (
"k8s.io/apimachinery/pkg/api/resource"

apiext "github.com/koordinator-sh/koordinator/apis/extension"
schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
)

func Test_fillGPUTotalMem(t *testing.T) {
tests := []struct {
name string
gpuTotal deviceResources
podRequest corev1.ResourceList
wantPodRequest corev1.ResourceList
wantErr bool
name string
allocations apiext.DeviceAllocations
nodeDeviceInfo *nodeDevice
wantAllocations apiext.DeviceAllocations
wantErr bool
}{
{
name: "ratio to mem",
gpuTotal: deviceResources{
0: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("32Gi"),
allocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
schedulingv1alpha1.GPU: {
{
Minor: 0,
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
},
},
},
},
podRequest: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
nodeDeviceInfo: &nodeDevice{
deviceTotal: map[schedulingv1alpha1.DeviceType]deviceResources{
schedulingv1alpha1.GPU: {
0: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("32Gi"),
},
},
},
},
wantPodRequest: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
apiext.ResourceGPUMemory: resource.MustParse("16Gi"),
wantAllocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
schedulingv1alpha1.GPU: {
{
Minor: 0,
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
apiext.ResourceGPUMemory: resource.MustParse("16Gi"),
},
},
},
},
},
{
name: "mem to ratio",
gpuTotal: deviceResources{
0: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("32Gi"),
allocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
schedulingv1alpha1.GPU: {
{
Minor: 0,
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemory: resource.MustParse("16Gi"),
},
},
},
},
podRequest: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemory: resource.MustParse("16Gi"),
nodeDeviceInfo: &nodeDevice{
deviceTotal: map[schedulingv1alpha1.DeviceType]deviceResources{
schedulingv1alpha1.GPU: {
0: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("32Gi"),
},
},
},
},
wantPodRequest: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(50, resource.DecimalSI),
apiext.ResourceGPUMemory: resource.MustParse("16Gi"),
wantAllocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
schedulingv1alpha1.GPU: {
{
Minor: 0,
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(50, resource.DecimalSI),
apiext.ResourceGPUMemory: resource.MustParse("16Gi"),
},
},
},
},
},
{
name: "missing total",
gpuTotal: deviceResources{
0: corev1.ResourceList{},
allocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
schedulingv1alpha1.GPU: {
{
Minor: 0,
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
},
},
},
},
podRequest: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
nodeDeviceInfo: &nodeDevice{
deviceTotal: map[schedulingv1alpha1.DeviceType]deviceResources{
schedulingv1alpha1.GPU: {
0: corev1.ResourceList{},
},
},
},
wantPodRequest: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
wantAllocations: map[schedulingv1alpha1.DeviceType][]*apiext.DeviceAllocation{
schedulingv1alpha1.GPU: {
{
Minor: 0,
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("50"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("50"),
},
},
},
},
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := fillGPUTotalMem(tt.gpuTotal, tt.podRequest)
err := fillGPUTotalMem(tt.allocations, tt.nodeDeviceInfo)
if tt.wantErr != (err != nil) {
t.Errorf("wantErr %v but got %v", tt.wantErr, err != nil)
}
assert.Equal(t, tt.wantPodRequest, tt.podRequest)
assert.Equal(t, tt.wantAllocations, tt.allocations)
})
}
}
4 changes: 4 additions & 0 deletions pkg/scheduler/plugins/deviceshare/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,10 @@ func (p *Plugin) Reserve(ctx context.Context, cycleState *framework.CycleState,
return status
}
}
err = fillGPUTotalMem(result, nodeDeviceInfo)
if err != nil {
return framework.AsStatus(err)
}
nodeDeviceInfo.updateCacheUsed(result, pod, true)
state.allocationResult = result
return nil
Expand Down
2 changes: 2 additions & 0 deletions pkg/scheduler/plugins/deviceshare/reservation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -687,6 +687,8 @@ func Test_tryAllocateFromReservation(t *testing.T) {
basicPreemptible,
tt.requiredFromReservation,
)
err := fillGPUTotalMem(result, nodeDeviceInfo)
assert.NoError(t, err)
assert.Equal(t, tt.wantStatus, status)
assert.Equal(t, tt.wantResult, result)
})
Expand Down
14 changes: 0 additions & 14 deletions pkg/scheduler/plugins/deviceshare/topology_hint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,6 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
},
string(apiext.ResourceGPUMemory): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
},
string(apiext.ResourceGPUMemoryRatio): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
Expand Down Expand Up @@ -110,10 +105,6 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
},
string(apiext.ResourceGPUMemory): {
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
},
string(apiext.ResourceGPUMemoryRatio): {
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
Expand Down Expand Up @@ -205,11 +196,6 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
},
string(apiext.ResourceGPUMemory): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
},
string(apiext.ResourceGPUMemoryRatio): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
Expand Down

0 comments on commit beab44e

Please sign in to comment.