diff --git a/cmd/plugins/balloons/policy/balloons-policy.go b/cmd/plugins/balloons/policy/balloons-policy.go
index 49062b00a..d0e1d97c4 100644
--- a/cmd/plugins/balloons/policy/balloons-policy.go
+++ b/cmd/plugins/balloons/policy/balloons-policy.go
@@ -568,10 +568,12 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro
 	// are type specific allocator options, otherwise use policy
 	// default allocator.
 	cpuTreeAllocator := p.cpuTreeAllocator
-	if blnDef.AllocatorTopologyBalancing != nil || blnDef.PreferSpreadOnPhysicalCores != nil {
+	if blnDef.AllocatorTopologyBalancing != nil || blnDef.PreferSpreadOnPhysicalCores != nil || len(blnDef.PreferCloseToDevices) > 0 || len(blnDef.PreferFarFromDevices) > 0 {
 		allocatorOptions := cpuTreeAllocatorOptions{
 			topologyBalancing:           p.bpoptions.AllocatorTopologyBalancing,
 			preferSpreadOnPhysicalCores: p.bpoptions.PreferSpreadOnPhysicalCores,
+			preferCloseToDevices:        blnDef.PreferCloseToDevices,
+			preferFarFromDevices:        blnDef.PreferFarFromDevices,
 		}
 		if blnDef.AllocatorTopologyBalancing != nil {
 			allocatorOptions.topologyBalancing = *blnDef.AllocatorTopologyBalancing
@@ -1091,6 +1093,8 @@ func (p *balloons) setConfig(bpoptions *BalloonsOptions) error {
 	p.balloons = []*Balloon{}
 	p.freeCpus = p.allowed.Clone()
 	p.freeCpus = p.freeCpus.Difference(p.reserved)
+	p.fillFarFromDevices(bpoptions.BalloonDefs)
+
 	p.cpuTreeAllocator = p.cpuTree.NewAllocator(cpuTreeAllocatorOptions{
 		topologyBalancing:           bpoptions.AllocatorTopologyBalancing,
 		preferSpreadOnPhysicalCores: bpoptions.PreferSpreadOnPhysicalCores,
@@ -1146,6 +1150,39 @@ func (p *balloons) setConfig(bpoptions *BalloonsOptions) error {
 	return nil
 }
 
+// fillFarFromDevices adds to BalloonDefs implicit device anti-affinities
+// towards devices that other BalloonDefs prefer to be close to.
+func (p *balloons) fillFarFromDevices(blnDefs []*BalloonDef) {
+	// devDefClose[device][blnDef.Name] equals true if and
+	// only if the blnDef prefers to be close to the device.
+	devDefClose := map[string]map[string]bool{}
+	// avoidDevs is a list of devices that at least one balloon
+	// type prefers to be close to. The order of devices in the
+	// avoidDevs list is significant: devices in the beginning of
+	// the list will be more effectively avoided than devices
+	// later in the list.
+	avoidDevs := []string{}
+	for _, blnDef := range blnDefs {
+		for _, closeDev := range blnDef.PreferCloseToDevices {
+			if _, ok := devDefClose[closeDev]; !ok {
+				avoidDevs = append(avoidDevs, closeDev)
+				devDefClose[closeDev] = map[string]bool{}
+			}
+			devDefClose[closeDev][blnDef.Name] = true
+		}
+	}
+	// Add every device in avoidDevs to the PreferFarFromDevices
+	// lists of those balloon types that do not prefer to be
+	// close to the device.
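+	// For example: if balloon type A lists a device in its
+	// PreferCloseToDevices and balloon type B does not, the
+	// device is appended to B's PreferFarFromDevices below.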
+	for _, avoidDev := range avoidDevs {
+		for _, blnDef := range blnDefs {
+			if !devDefClose[avoidDev][blnDef.Name] {
+				blnDef.PreferFarFromDevices = append(blnDef.PreferFarFromDevices, avoidDev)
+			}
+		}
+	}
+}
+
 // closestMems returns memory node IDs good for pinning containers
 // that run on given CPUs
 func (p *balloons) closestMems(cpus cpuset.CPUSet) idset.IDSet {
diff --git a/cmd/plugins/balloons/policy/cputree.go b/cmd/plugins/balloons/policy/cputree.go
index 568778cc9..3cfb2cb35 100644
--- a/cmd/plugins/balloons/policy/cputree.go
+++ b/cmd/plugins/balloons/policy/cputree.go
@@ -21,6 +21,7 @@ import (
 	"strings"
 
 	system "github.com/containers/nri-plugins/pkg/sysfs"
+	"github.com/containers/nri-plugins/pkg/topology"
 	"github.com/containers/nri-plugins/pkg/utils/cpuset"
 )
 
@@ -55,8 +56,9 @@ type cpuTreeNodeAttributes struct {
 // cpuTreeAllocator allocates CPUs from the branch of a CPU tree
 // where the "root" node is the topmost CPU of the branch.
 type cpuTreeAllocator struct {
-	options cpuTreeAllocatorOptions
-	root    *cpuTreeNode
+	options           cpuTreeAllocatorOptions
+	root              *cpuTreeNode
+	cacheCloseCpuSets map[string][]cpuset.CPUSet
 }
 
 // cpuTreeAllocatorOptions contains parameters for the CPU allocator
@@ -67,8 +69,12 @@ type cpuTreeAllocatorOptions struct {
 	// the opposite (packed allocations).
 	topologyBalancing           bool
 	preferSpreadOnPhysicalCores bool
+	preferCloseToDevices        []string
+	preferFarFromDevices        []string
 }
 
+var emptyCpuSet = cpuset.New()
+
 // String returns string representation of a CPU tree node.
 func (t *cpuTreeNode) String() string {
 	if len(t.children) == 0 {
@@ -395,8 +401,9 @@ func (t *cpuTreeNode) SplitLevel(splitLevel CPUTopologyLevel, cpuClassifier func
 // CPU tree branch.
 func (t *cpuTreeNode) NewAllocator(options cpuTreeAllocatorOptions) *cpuTreeAllocator {
 	ta := &cpuTreeAllocator{
-		root:    t,
-		options: options,
+		root:              t,
+		options:           options,
+		cacheCloseCpuSets: map[string][]cpuset.CPUSet{},
 	}
 	if options.preferSpreadOnPhysicalCores {
 		newTree := t.SplitLevel(CPUTopologyLevelNuma,
@@ -502,8 +509,172 @@ func (ta *cpuTreeAllocator) sorterRelease(tnas []cpuTreeNodeAttributes) func(int
 // - removeFromCpus contains CPUs in currentCpus set from which
 //   abs(delta) CPUs can be freed.
 func (ta *cpuTreeAllocator) ResizeCpus(currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
+	resizers := []cpuResizerFunc{
+		ta.resizeCpusOnlyIfNecessary,
+		ta.resizeCpusWithDevices,
+		ta.resizeCpusOneAtATime,
+		ta.resizeCpusMaxLocalSet,
+		ta.resizeCpusNow}
+	return ta.nextCpuResizer(resizers, currentCpus, freeCpus, delta)
+}
+
+type cpuResizerFunc func(resizers []cpuResizerFunc, currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error)
+
+func (ta *cpuTreeAllocator) nextCpuResizer(resizers []cpuResizerFunc, currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
+	if len(resizers) == 0 {
+		return freeCpus, currentCpus, fmt.Errorf("internal error: a CPU resizer consulted next resizer but there was no one left")
+	}
+	remainingResizers := resizers[1:]
+	log.Debugf("- resizer-%d(%q, %q, %d)", len(remainingResizers), currentCpus, freeCpus, delta)
+	addFrom, removeFrom, err := resizers[0](remainingResizers, currentCpus, freeCpus, delta)
+	return addFrom, removeFrom, err
+}
+
+// resizeCpusNow does not call the next resizer. Instead, it treats all
+// CPU allocations from freeCpus and all CPU releases from currentCpus
+// as equally good. This is the terminal element of the resizer chain.
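+// In the chain built by ResizeCpus above, it is reached only when the
+// preceding resizers have delegated the final choice all the way down.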
+func (ta *cpuTreeAllocator) resizeCpusNow(resizers []cpuResizerFunc, currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
+	return freeCpus, currentCpus, nil
+}
+
+// resizeCpusOnlyIfNecessary is the fast path for making trivial
+// reservations and for failing fast if resizing is not possible.
+func (ta *cpuTreeAllocator) resizeCpusOnlyIfNecessary(resizers []cpuResizerFunc, currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
+	switch {
+	case delta == 0:
+		// Nothing to do.
+		return emptyCpuSet, emptyCpuSet, nil
+	case delta > 0:
+		if freeCpus.Size() < delta {
+			return freeCpus, emptyCpuSet, fmt.Errorf("not enough free CPUs (%d) to resize current CPU set from %d to %d CPUs", freeCpus.Size(), currentCpus.Size(), currentCpus.Size()+delta)
+		} else if freeCpus.Size() == delta {
+			// Allocate all the remaining free CPUs.
+			return freeCpus, emptyCpuSet, nil
+		}
+	case delta < 0:
+		if currentCpus.Size() < -delta {
+			return emptyCpuSet, currentCpus, fmt.Errorf("not enough current CPUs (%d) to release %d CPUs", currentCpus.Size(), -delta)
+		} else if currentCpus.Size() == -delta {
+			// Free all allocated CPUs.
+			return emptyCpuSet, currentCpus, nil
+		}
+	}
+	return ta.nextCpuResizer(resizers, currentCpus, freeCpus, delta)
+}
+
+// resizeCpusWithDevices prefers allocating CPUs from those freeCpus
+// that are topologically close to preferred devices, and releasing
+// those currentCpus that are not.
+func (ta *cpuTreeAllocator) resizeCpusWithDevices(resizers []cpuResizerFunc, currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
+	// allCloseCpuSets contains cpusets in priority order: the
+	// first cpusets in it are prioritized over the ones after
+	// them.
+	allCloseCpuSets := [][]cpuset.CPUSet{}
+	for _, devPath := range ta.options.preferCloseToDevices {
+		if closeCpuSets := ta.topologyHintCpus(devPath); len(closeCpuSets) > 0 {
+			allCloseCpuSets = append(allCloseCpuSets, closeCpuSets)
+		}
+	}
+	for _, devPath := range ta.options.preferFarFromDevices {
+		for _, farCpuSet := range ta.topologyHintCpus(devPath) {
+			allCloseCpuSets = append(allCloseCpuSets, []cpuset.CPUSet{freeCpus.Difference(farCpuSet)})
+		}
+	}
+	if len(allCloseCpuSets) == 0 {
+		return ta.nextCpuResizer(resizers, currentCpus, freeCpus, delta)
+	}
+	if delta > 0 {
+		// Allocate N=delta CPUs from freeCpus based on topology hints.
+		// Build a new set of freeCpus with at least N CPUs based on
+		// intersection with CPU hints.
+		// In case of conflicting topology hints, the first
+		// hints in the list are the most important.
+		remainingFreeCpus := freeCpus
+		appliedHints := 0
+		totalHints := 0
+		for _, closeCpuSets := range allCloseCpuSets {
+			for _, cpus := range closeCpuSets {
+				totalHints++
+				newRemainingFreeCpus := remainingFreeCpus.Intersection(cpus)
+				if newRemainingFreeCpus.Size() >= delta {
+					appliedHints++
+					log.Debugf(" - take hinted cpus %q, common free %q", cpus, newRemainingFreeCpus)
+					remainingFreeCpus = newRemainingFreeCpus
+				} else {
+					log.Debugf(" - drop hinted cpus %q, not enough common free in %q", cpus, newRemainingFreeCpus)
+				}
+			}
+		}
+		log.Debugf(" - original free cpus %q, took %d/%d hints, remaining free: %q",
+			freeCpus, appliedHints, totalHints, remainingFreeCpus)
+		return ta.nextCpuResizer(resizers, currentCpus, remainingFreeCpus, delta)
+	} else if delta < 0 {
+		// Free N=-delta CPUs from currentCpus based on topology hints.
+		// 1. Sort currentCpus based on topology hints (leastHintedCpus).
+		// 2. Pick the largest hint value that has to be released (maxHints).
+		// 3. Free all CPUs that have a hint value smaller than maxHints.
+		// 4. Let the next CPU resizer choose the CPUs to be freed
+		//    among the CPUs with hint value maxHints.
+		currentCpuHints := map[int]uint64{}
+		for hintPriority, closeCpuSets := range allCloseCpuSets {
+			for _, cpus := range closeCpuSets {
+				for _, cpu := range cpus.Intersection(currentCpus).UnsortedList() {
+					currentCpuHints[cpu] += 1 << (len(allCloseCpuSets) - 1 - hintPriority)
+				}
+			}
+		}
+		leastHintedCpus := currentCpus.UnsortedList()
+		sort.Slice(leastHintedCpus, func(i, j int) bool {
+			return currentCpuHints[leastHintedCpus[i]] < currentCpuHints[leastHintedCpus[j]]
+		})
+		maxHints := currentCpuHints[leastHintedCpus[-delta]]
+		currentToFreeForSure := cpuset.New()
+		currentToFreeMaybe := cpuset.New()
+		for i := 0; i < len(leastHintedCpus) && currentCpuHints[leastHintedCpus[i]] <= maxHints; i++ {
+			if currentCpuHints[leastHintedCpus[i]] < maxHints {
+				currentToFreeForSure = currentToFreeForSure.Union(cpuset.New(leastHintedCpus[i]))
+			} else {
+				currentToFreeMaybe = currentToFreeMaybe.Union(cpuset.New(leastHintedCpus[i]))
+			}
+		}
+		remainingDelta := delta + currentToFreeForSure.Size()
+		log.Debugf(" - device hints: from cpus %q: free for sure: %q and %d more from: %q",
+			currentCpus, currentToFreeForSure, -remainingDelta, currentToFreeMaybe)
+		_, freeFromMaybe, err := ta.nextCpuResizer(resizers, currentToFreeMaybe, freeCpus, remainingDelta)
+		// Do not include possible extra CPUs from
+		// freeFromMaybe, to make sure that all CPUs with the
+		// least hints will be freed.
+		for _, cpu := range freeFromMaybe.UnsortedList() {
+			if currentToFreeForSure.Size() >= -delta {
+				break
+			}
+			currentToFreeForSure = currentToFreeForSure.Union(cpuset.New(cpu))
+		}
+		return freeCpus, currentToFreeForSure, err
+	}
+	return freeCpus, currentCpus, nil
+}
+
+// topologyHintCpus returns the cached topology hint cpusets of a device; an error is logged only once per bad device.
+func (ta *cpuTreeAllocator) topologyHintCpus(dev string) []cpuset.CPUSet {
+	if closeCpuSets, ok := ta.cacheCloseCpuSets[dev]; ok {
+		return closeCpuSets
+	}
+	topologyHints, err := topology.NewTopologyHints(dev)
+	if err != nil {
+		log.Errorf("failed to find topology of device %q: %v", dev, err)
+		ta.cacheCloseCpuSets[dev] = []cpuset.CPUSet{}
+	} else {
+		for _, topologyHint := range topologyHints {
+			ta.cacheCloseCpuSets[dev] = append(ta.cacheCloseCpuSets[dev], cpuset.MustParse(topologyHint.CPUs))
+		}
+	}
+	return ta.cacheCloseCpuSets[dev]
+}
+
+func (ta *cpuTreeAllocator) resizeCpusOneAtATime(resizers []cpuResizerFunc, currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
 	if delta > 0 {
-		addFromSuperset, removeFromSuperset, err := ta.resizeCpus(currentCpus, freeCpus, delta)
+		addFromSuperset, removeFromSuperset, err := ta.nextCpuResizer(resizers, currentCpus, freeCpus, delta)
 		if !ta.options.preferSpreadOnPhysicalCores || addFromSuperset.Size() == delta {
 			return addFromSuperset, removeFromSuperset, err
 		}
@@ -515,7 +686,7 @@ func (ta *cpuTreeAllocator) ResizeCpus(currentCpus, freeCpus cpuset.CPUSet, delt
 		// set by adding one CPU at a time.
 		addFrom := cpuset.New()
 		for n := 0; n < delta; n++ {
-			addSingleFrom, _, err := ta.resizeCpus(currentCpus, freeCpus, 1)
+			addSingleFrom, _, err := ta.nextCpuResizer(resizers, currentCpus, freeCpus, 1)
 			if err != nil {
 				return addFromSuperset, removeFromSuperset, err
 			}
@@ -540,7 +711,7 @@ func (ta *cpuTreeAllocator) ResizeCpus(currentCpus, freeCpus cpuset.CPUSet, delt
 		removeFrom := cpuset.New()
 		addFrom := cpuset.New()
 		for n := 0; n < -delta; n++ {
-			_, removeSingleFrom, err := ta.resizeCpus(currentCpus, freeCpus, -1)
+			_, removeSingleFrom, err := ta.nextCpuResizer(resizers, currentCpus, freeCpus, -1)
 			if err != nil {
 				return addFrom, removeFrom, err
 			}
@@ -563,7 +734,7 @@ func (ta *cpuTreeAllocator) ResizeCpus(currentCpus, freeCpus cpuset.CPUSet, delt
 	return addFrom, removeFrom, nil
 }
 
-func (ta *cpuTreeAllocator) resizeCpus(currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
+func (ta *cpuTreeAllocator) resizeCpusMaxLocalSet(resizers []cpuResizerFunc, currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
 	tnas := ta.root.ToAttributedSlice(currentCpus, freeCpus,
 		func(tna *cpuTreeNodeAttributes) bool {
 			// filter out branches with insufficient cpus
@@ -587,5 +758,5 @@ func (ta *cpuTreeAllocator) resizeCpus(currentCpus, freeCpus cpuset.CPUSet, delt
 	if len(tnas) == 0 {
 		return freeCpus, currentCpus, fmt.Errorf("not enough free CPUs")
 	}
-	return tnas[0].freeCpus, tnas[0].currentCpus, nil
+	return ta.nextCpuResizer(resizers, tnas[0].currentCpus, tnas[0].freeCpus, delta)
 }
diff --git a/cmd/plugins/balloons/policy/cputree_test.go b/cmd/plugins/balloons/policy/cputree_test.go
index cad1847a3..05c1e9152 100644
--- a/cmd/plugins/balloons/policy/cputree_test.go
+++ b/cmd/plugins/balloons/policy/cputree_test.go
@@ -110,6 +110,15 @@ func newCpuTreeFromInt5(pdnct [5]int) (*cpuTreeNode, cpusInTopology) {
 	return sysTree, csit
 }
 
+func verifyOn(t *testing.T, nameContents string, cpus cpuset.CPUSet, csit cpusInTopology) {
+	for _, cpuID := range cpus.List() {
+		name := csit[cpuID].threadName
+		if !strings.Contains(name, nameContents) {
+			t.Errorf("cpu%d (%s) not in expected region %s", cpuID, name, nameContents)
+		}
+	}
+}
+
 func verifyNotOn(t *testing.T, nameContents string, cpus cpuset.CPUSet, csit cpusInTopology) {
 	for _, cpuID := range cpus.List() {
 		name := csit[cpuID].threadName
@@ -230,9 +239,11 @@ func TestResizeCpus(t *testing.T) {
 	}
 	tcases := []struct {
 		name                   string
-		topology               [5]int // package, die, numa, core, thread count
-		allocatorTB            bool   // allocator topologyBalancing
-		allocatorPSoPC         bool   // allocator preferSpreadOnPhysicalCores
+		topology               [5]int   // package, die, numa, core, thread count
+		allocatorTB            bool     // allocator topologyBalancing
+		allocatorPSoPC         bool     // allocator preferSpreadOnPhysicalCores
+		allocatorPCtD          []string // allocator preferCloseToDevices
+		allocatorPFfD          []string // allocator preferFarFromDevices
 		allocations            []int
 		deltas                 []int
 		allocate               bool
@@ -240,6 +251,7 @@ func TestResizeCpus(t *testing.T) {
 		expectCurrentOnSame    []string
 		expectCurrentNotOnSame []string
 		expectAllOnSame        []string
+		expectCurrentOn        []string
 		expectCurrentNotOn     []string
 		expectAddSizes         []int
 		expectDisjoint         []TopoCcids // which ccids should be disjoint
@@ -471,6 +483,51 @@ func TestResizeCpus(t *testing.T) {
 				{"package", []int{1, 2}}, {"package", []int{1, 2, 3}}, {"package", []int{1, 2, 3, 4}},
 			},
 		},
+		{
+			name:           "prefer close to devices",
+			topology:       [5]int{2, 1, 2, 2, 2},
+			allocatorTB:    true,
+			allocatorPSoPC: true,
+			allocatorPCtD: []string{
"/sys/cpus:4-7", // close to p0d0n1c* + "/sys/cpus:3", // close to p0d0n0c01t1 + "/sys/cpus:2-7", // close to p0d0n* + }, + allocatorPFfD: []string{ + "/sys/cpus:0-1", // far from p0d0n0c00t* + }, + deltas: []int{ + 1, 3, 1, -1, + 4, -3, 1, -1, + }, + allocate: true, + operateOnCcid: []int{ + 1, 1, 1, 1, // container 1 allocates cpus 4-7 + 2, 2, 2, 2, // container 2 cannot get enough cpus from 2-7 + }, + expectCurrentOn: []string{ + "p0d0n1", // cpus:4-7 + "p0d0n1", // cpus:4-7 + "p0d0", // cpus:0-7 + "p0d0n1", // cpus:4-7 + // container 2 + "p1", // cpus:8-15 + "p1", // cpus:8-15 + "", // cpus:any + "p0d0n0c01t1", // cpus:3 + }, + expectCurrentNotOn: []string{ + "p0d0n0", // cpus:0-3 + "p0d0n0", // cpus:0-3 + "p0d0n0c00", // cpus:0-1 + "p0d0n0", // cpus:0-3 + // container 2 + "p0d0n0c00", // cpus:0-1 + "p0d0n0c00", // cpus:0-1 + "p0d0n0c00", // cpus:0-1 + "p0d0n0c00", // cpus:0-1 + }, + }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { @@ -478,7 +535,14 @@ func TestResizeCpus(t *testing.T) { treeA := tree.NewAllocator(cpuTreeAllocatorOptions{ topologyBalancing: tc.allocatorTB, preferSpreadOnPhysicalCores: tc.allocatorPSoPC, + preferCloseToDevices: tc.allocatorPCtD, + preferFarFromDevices: tc.allocatorPFfD, }) + for _, dev := range append(tc.allocatorPCtD, tc.allocatorPFfD...) { + treeA.cacheCloseCpuSets[dev] = []cpuset.CPUSet{ + cpuset.MustParse(dev[len("/sys/cpus:"):]), + } + } currentCpus := cpuset.New() freeCpus := tree.Cpus() if len(tc.allocations) > 0 { @@ -546,6 +610,9 @@ func TestResizeCpus(t *testing.T) { if i < len(tc.expectCurrentNotOnSame) && tc.expectCurrentNotOnSame[i] != "" { verifyNotSame(t, tc.expectCurrentNotOnSame[i], currentCpus, csit) } + if i < len(tc.expectCurrentOn) && tc.expectCurrentOn[i] != "" { + verifyOn(t, tc.expectCurrentOn[i], currentCpus, csit) + } if i < len(tc.expectCurrentNotOn) && tc.expectCurrentNotOn[i] != "" { verifyNotOn(t, tc.expectCurrentNotOn[i], currentCpus, csit) } diff --git a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml index 178c7d280..07121dd55 100644 --- a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml +++ b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml @@ -106,6 +106,11 @@ spec: items: type: string type: array + preferCloseToDevices: + description: prefer creating new balloons of this type close to listed devices. + items: + type: string + type: array preferNewBalloons: description: 'PreferNewBalloons: prefer creating new balloons over adding containers to existing balloons. The default is diff --git a/docs/resource-policy/policy/balloons.md b/docs/resource-policy/policy/balloons.md index b5a1a7643..e64760049 100644 --- a/docs/resource-policy/policy/balloons.md +++ b/docs/resource-policy/policy/balloons.md @@ -116,6 +116,8 @@ Balloons policy parameters: request less. - `CpuClass` specifies the name of the CPU class according to which CPUs of balloons are configured. + - `PreferCloseToDevices`: prefer creating new balloons close to + listed devices. List of strings - `PreferSpreadingPods`: if `true`, containers of the same pod should be spread to different balloons of this type. The default is `false`: prefer placing containers of the same pod to the same @@ -145,6 +147,18 @@ Balloons policy parameters: the balloon. - `PreferSpreadOnPhysicalCores` overrides the policy level option with the same name in the scope of this balloon type. 
+  - `PreferCloseToDevices` prefers creating new balloons close to
+    listed devices. If all preferences cannot be fulfilled,
+    preferences for the first devices in the list override
+    preferences for devices after them. Adding this preference to
+    any balloon type automatically adds a corresponding anti-affinity
+    to other balloon types that do not prefer to be close to the same
+    device: they prefer being created away from the device. Example:
+    ```
+    PreferCloseToDevices:
+      - /sys/class/net/eth0
+      - /sys/class/block/sda
+    ```
   - `AllocatorPriority` (0: High, 1: Normal, 2: Low, 3: None). CPU
     allocator parameter, used when creating new or resizing existing
     balloons. If there are balloon types with pre-created balloons
diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go
index 754e2ac2a..3b1c4f7e7 100644
--- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go
+++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go
@@ -183,6 +183,12 @@ type BalloonDef struct {
 	// +kubebuilder:validation:Enum="";system;package;die;numa;core;thread
 	// +kubebuilder:validation:Format:string
 	ShareIdleCpusInSame CPUTopologyLevel `json:"shareIdleCPUsInSame,omitempty"`
+	// PreferCloseToDevices: prefer creating new balloons of this
+	// type close to listed devices.
+	PreferCloseToDevices []string `json:"preferCloseToDevices,omitempty"`
+	// PreferFarFromDevices: prefer creating new balloons of this
+	// type far from listed devices.
+	PreferFarFromDevices []string `json:"preferFarFromDevices,omitempty"`
 }
 
 // String stringifies a BalloonDef
diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg
index 9be442e42..dad244d5e 100644
--- a/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg
+++ b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg
@@ -10,7 +10,7 @@ config:
       minBalloons: 2
     - name: topo1cores0
       minCPUs: 2
-      minBalloons: 2
+      minBalloons: 1
       preferSpreadOnPhysicalCores: false
    - name: topo0cores1
      allocatorTopologyBalancing: false
@@ -21,6 +21,11 @@ config:
    - name: topo1cores1
      allocatorTopologyBalancing: true
      preferSpreadOnPhysicalCores: true
+    - name: device-node2
+      preferNewBalloons: true
+      preferSpreadOnPhysicalCores: true
+      preferCloseToDevices:
+        - "/sys/devices/system/node/node2"
 
 instrumentation:
   httpEndpoint: :8891
diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh
index dc5dd2e96..c82b4072c 100644
--- a/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh
+++ b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh
@@ -15,7 +15,8 @@ CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M"
 POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: policydefaults" CONTCOUNT=1 create balloons-busybox
 report allowed
 verify 'len(cores["pod0c0"]) == 2' \
-       'len(cpus["pod0c0"]) == 2'
+       'len(cpus["pod0c0"]) == 2' \
+       '"node2" not in nodes["pod0c0"]'
 
 # pod1 in a 2-CPU balloon
 
@@ -23,7 +24,8 @@ CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M"
 POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: topo1cores0" CONTCOUNT=1 create balloons-busybox
 report allowed
 verify 'len(cores["pod1c0"]) == 1' \
-       'len(cpus["pod1c0"]) == 2'
+       'len(cpus["pod1c0"]) == 2' \
'len(cpus["pod1c0"]) == 2' \ + '"node2" not in nodes["pod1c0"]' # pod2: container 0 resizes first from 0 to 1, container 2 from 1 to 2 CPUs, # use more cores @@ -32,7 +34,12 @@ POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: topo1cores1" CONTCOUNT= report allowed verify 'len(cores["pod2c0"]) == 2' \ 'len(cpus["pod2c0"]) == 2' \ - 'cpus["pod2c0"] == cpus["pod2c1"]' + 'cpus["pod2c0"] == cpus["pod2c1"]' \ + '"node2" not in nodes["pod2c0"]' + +# make room for pod3, because now only node2 should be empty and we +# would not be able to pack tightly elsewhere. +vm-command "kubectl delete pods pod0 pod1 pod2 --now" # pod3: container 0 resizes first from 0 to 1, container 2 from 1 to 2 CPUs, # pack tightly @@ -41,7 +48,26 @@ POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: topo0cores0" CONTCOUNT= report allowed verify 'len(cores["pod3c0"]) == 1' \ 'len(cpus["pod3c0"]) == 2' \ - 'cpus["pod3c0"] == cpus["pod3c1"]' + 'cpus["pod3c0"] == cpus["pod3c1"]' \ + '"node2" not in nodes["pod3c0"]' + +# pod4 in new balloon for which node2 should have been kept free +CPUREQ="3" MEMREQ="100M" CPULIM="6" MEMLIM="100M" +POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: device-node2" CONTCOUNT=1 create balloons-busybox +report allowed +verify '{"node2"} == nodes["pod4c0"]' \ + 'len(cores["pod4c0"]) == 2' \ + 'len(cpus["pod4c0"]) == 3' + +vm-command "kubectl delete pods pod0 pod1 pod2 --now" + +# pod5 in new balloon that will not fit on node2, ignore device hint and allocate from elsewhere +CPUREQ="2" MEMREQ="100M" CPULIM="6" MEMLIM="100M" +POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: device-node2" CONTCOUNT=1 create balloons-busybox +report allowed +verify '"node2" not in nodes["pod5c0"]' \ + 'len(cores["pod5c0"]) == 2' \ + 'len(cpus["pod5c0"]) == 2' cleanup helm-terminate