Skip to content

Commit

Permalink
scheduler: make topology manager aware of device preference (#2316)
Browse files Browse the repository at this point in the history
Signed-off-by: wangjianyu.wjy <wangjianyu.wjy@alibaba-inc.com>
Co-authored-by: wangjianyu.wjy <wangjianyu.wjy@alibaba-inc.com>
  • Loading branch information
ZiMengSheng and wangjianyu.wjy authored Jan 13, 2025
1 parent 07c81e2 commit 6f6ef82
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 44 deletions.
4 changes: 1 addition & 3 deletions pkg/scheduler/frameworkext/topologymanager/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,9 +222,7 @@ func mergeFilteredHints(numaNodes []int, filteredHints [][]NUMATopologyHint, exc

for _, v := range permutation {
if v.NUMANodeAffinity != nil && mergedHint.NUMANodeAffinity.IsEqual(v.NUMANodeAffinity) {
if v.Score > mergedHint.Score {
mergedHint.Score = v.Score
}
mergedHint.Score += v.Score
}
}

Expand Down
70 changes: 46 additions & 24 deletions pkg/scheduler/plugins/deviceshare/topology_hint.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,18 @@ import (
"sort"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
quotav1 "k8s.io/apiserver/pkg/quota/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"

apiext "github.com/koordinator-sh/koordinator/apis/extension"
schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
"github.com/koordinator-sh/koordinator/pkg/scheduler/frameworkext/topologymanager"
"github.com/koordinator-sh/koordinator/pkg/util/bitmask"
)

const (
// ErrInsufficientNUMAScopedDevices is a message string; its use is outside this view.
// NOTE(review): presumably the reason reported when NUMA-scoped devices cannot satisfy
// the request — confirm against callers.
ErrInsufficientNUMAScopedDevices = "Insufficient NUMA Scoped Devices"

// defaultNUMAScore is the hint score given to a feasible NUMA mask whose allocation
// hashes the same as the allocation recorded for the mask that covers all NUMA nodes;
// every other feasible mask gets a score of 0. It is deliberately larger than 100 so
// that the device NUMA preference takes precedence over CPU.
defaultNUMAScore = 500
)

func (p *Plugin) GetPodTopologyHints(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeName string) (map[string][]topologymanager.NUMATopologyHint, *framework.Status) {
Expand Down Expand Up @@ -137,17 +138,21 @@ func (p *Plugin) generateTopologyHints(cycleState *framework.CycleState, state *
sort.Ints(numaNodes)

var minAffinitySize map[corev1.ResourceName]int
hints := map[string][]topologymanager.NUMATopologyHint{}

var statusUnsatisfied *framework.Status
var bestAllocationResult apiext.DeviceAllocations
var feasibleAllocationResults []*numaScopedAllocation

bitmask.IterateBitMasks(numaNodes, func(mask bitmask.BitMask) {
nodeDevice.lock.RLock()
defer nodeDevice.lock.RUnlock()

var status *framework.Status
var allocateResult apiext.DeviceAllocations
if mask.Count() == len(numaNodes) {
defer func() { statusUnsatisfied = status }()
defer func() {
statusUnsatisfied = status
bestAllocationResult = allocateResult
}()
}

allocator.numaNodes = mask
Expand All @@ -163,48 +168,51 @@ func (p *Plugin) generateTopologyHints(cycleState *framework.CycleState, state *
return
}
}

nodeCount := len(maskNodes)
if minAffinitySize == nil {
minAffinitySize = map[corev1.ResourceName]int{}
for deviceType := range allocator.requestsPerInstance {
minAffinitySize[corev1.ResourceName(deviceType)] = len(numaNodes)
}
}

allocateResult, status := p.tryAllocateFromReservation(allocator, state, restoreState, restoreState.matched, pod, node, preemptible, state.hasReservationAffinity)
allocateResult, status = p.tryAllocateFromReservation(allocator, state, restoreState, restoreState.matched, pod, node, preemptible, state.hasReservationAffinity)
if !status.IsSuccess() {
return
}
if len(allocateResult) == 0 {
preemptible := appendAllocated(preemptible, restoreState.mergedMatchedAllocatable)
_, status = allocator.Allocate(nil, nil, nil, preemptible)
if !status.IsSuccess() {
allocateResult, status = allocator.Allocate(nil, nil, nil, preemptible)
if !status.IsSuccess() || len(allocateResult) == 0 {
return
}
}

nodeCount := mask.Count()
for resourceName, affinitySize := range minAffinitySize {
if nodeCount < affinitySize {
minAffinitySize[resourceName] = nodeCount
}
if _, ok := hints[string(resourceName)]; !ok {
hints[string(resourceName)] = []topologymanager.NUMATopologyHint{}
}
hints[string(resourceName)] = append(hints[string(resourceName)], topologymanager.NUMATopologyHint{
NUMANodeAffinity: mask,
Preferred: false,
Score: 0,
})
}
feasibleAllocationResults = append(feasibleAllocationResults, &numaScopedAllocation{
mask: mask,
allocationResult: allocateResult,
})
})

totalResourceNames := sets.NewString()
for _, deviceInfos := range nodeDevice.deviceInfos {
if len(deviceInfos) > 0 {
for _, name := range quotav1.ResourceNames(deviceInfos[0].Resources) {
totalResourceNames.Insert(string(name))
}
bestAllocationHash := hashAllocateResult(bestAllocationResult)
hints := map[string][]topologymanager.NUMATopologyHint{}

for _, feasibleAllocationResult := range feasibleAllocationResults {
score := 0
if hashAllocateResult(feasibleAllocationResult.allocationResult) == bestAllocationHash {
// Use a score larger than 100 so that the device NUMA preference takes precedence over CPU.
score = defaultNUMAScore
}
for resourceName := range minAffinitySize {
hints[string(resourceName)] = append(hints[string(resourceName)], topologymanager.NUMATopologyHint{
NUMANodeAffinity: feasibleAllocationResult.mask,
Score: int64(score),
})
}
}

Expand All @@ -227,6 +235,20 @@ func (p *Plugin) generateTopologyHints(cycleState *framework.CycleState, state *
return hints, nil
}

// numaScopedAllocation pairs a NUMA node mask with the device allocation result
// obtained while the allocator was restricted to that mask.
type numaScopedAllocation struct {
// mask is the set of NUMA nodes the allocation was attempted on.
mask bitmask.BitMask
// allocationResult is the device allocation produced for mask.
allocationResult apiext.DeviceAllocations
}

// hashAllocateResult reduces a device allocation result to a single int by
// passing the minors of its GPU allocations to hashMinors.
//
// Only the entries stored under schedulingv1alpha1.GPU are read, so allocations
// of other device types do not influence the returned value. When there are no
// GPU allocations, hashMinors receives a nil slice.
func hashAllocateResult(allocations apiext.DeviceAllocations) int {
	var gpuMinors []int
	for _, allocated := range allocations[schedulingv1alpha1.GPU] {
		gpuMinors = append(gpuMinors, int(allocated.Minor))
	}
	return hashMinors(gpuMinors)
}

func calcTotalDevicesByNUMA(nd *nodeDevice, numaNodes []int) map[schedulingv1alpha1.DeviceType]int {
m := map[schedulingv1alpha1.DeviceType]int{}
for _, node := range numaNodes {
Expand Down
34 changes: 17 additions & 17 deletions pkg/scheduler/plugins/deviceshare/topology_hint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,14 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
},
want: map[string][]topologymanager.NUMATopologyHint{
string(schedulingv1alpha1.GPU): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(0), Preferred: true, Score: defaultNUMAScore},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false, Score: defaultNUMAScore},
},
string(schedulingv1alpha1.RDMA): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(0), Preferred: true, Score: defaultNUMAScore},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false, Score: defaultNUMAScore},
},
},
},
Expand Down Expand Up @@ -110,8 +110,8 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
},
want: map[string][]topologymanager.NUMATopologyHint{
string(schedulingv1alpha1.GPU): {
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
{NUMANodeAffinity: newBitMask(1), Preferred: true, Score: defaultNUMAScore},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false, Score: defaultNUMAScore},
},
},
},
Expand All @@ -137,9 +137,9 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
},
want: map[string][]topologymanager.NUMATopologyHint{
string(schedulingv1alpha1.RDMA): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
{NUMANodeAffinity: newBitMask(0), Preferred: true, Score: defaultNUMAScore},
{NUMANodeAffinity: newBitMask(1), Preferred: true, Score: defaultNUMAScore},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false, Score: defaultNUMAScore},
},
},
},
Expand All @@ -156,9 +156,9 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
},
want: map[string][]topologymanager.NUMATopologyHint{
string(schedulingv1alpha1.RDMA): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
{NUMANodeAffinity: newBitMask(0), Preferred: true, Score: defaultNUMAScore},
{NUMANodeAffinity: newBitMask(1), Preferred: true, Score: defaultNUMAScore},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false, Score: defaultNUMAScore},
},
},
},
Expand All @@ -177,7 +177,7 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
},
want: map[string][]topologymanager.NUMATopologyHint{
string(schedulingv1alpha1.RDMA): {
{NUMANodeAffinity: newBitMask(0, 1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: true, Score: defaultNUMAScore},
},
},
},
Expand All @@ -197,14 +197,14 @@ func TestPlugin_GetPodTopologyHints(t *testing.T) {
},
want: map[string][]topologymanager.NUMATopologyHint{
string(schedulingv1alpha1.RDMA): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(0), Preferred: true, Score: defaultNUMAScore},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false, Score: defaultNUMAScore},
},
string(schedulingv1alpha1.GPU): {
{NUMANodeAffinity: newBitMask(0), Preferred: true},
{NUMANodeAffinity: newBitMask(0), Preferred: true, Score: defaultNUMAScore},
{NUMANodeAffinity: newBitMask(1), Preferred: true},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false},
{NUMANodeAffinity: newBitMask(0, 1), Preferred: false, Score: defaultNUMAScore},
},
},
},
Expand Down

0 comments on commit 6f6ef82

Please sign in to comment.