Skip to content

Commit

Permalink
feat(nodeResources): add GPU support (#1708)
Browse files: browse the repository at this point in the history
* feat(nodeResources): add GPU support

* add resourceCapacity and sum test

* update with make schemas

* Correct test names

Signed-off-by: Evans Mungai <evans@replicated.com>

---------

Signed-off-by: Evans Mungai <evans@replicated.com>
Co-authored-by: Evans Mungai <evans@replicated.com>
  • Loading branch information
DexterYan and banjoh authored Jan 3, 2025
1 parent 2772722 commit 64ee9e5
Show file tree
Hide file tree
Showing 17 changed files with 784 additions and 17 deletions.
6 changes: 6 additions & 0 deletions config/crds/troubleshoot.replicated.com_analyzers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,12 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceCapacity:
type: string
resourceName:
type: string
selector:
properties:
matchLabel:
Expand Down
6 changes: 6 additions & 0 deletions config/crds/troubleshoot.replicated.com_preflights.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,12 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceCapacity:
type: string
resourceName:
type: string
selector:
properties:
matchLabel:
Expand Down
6 changes: 6 additions & 0 deletions config/crds/troubleshoot.replicated.com_supportbundles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,12 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceCapacity:
type: string
resourceName:
type: string
selector:
properties:
matchLabel:
Expand Down
6 changes: 6 additions & 0 deletions config/crds/troubleshoot.sh_analyzers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1239,6 +1239,12 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceCapacity:
type: string
resourceName:
type: string
selector:
properties:
matchExpressions:
Expand Down
6 changes: 6 additions & 0 deletions config/crds/troubleshoot.sh_preflights.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1239,6 +1239,12 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceCapacity:
type: string
resourceName:
type: string
selector:
properties:
matchExpressions:
Expand Down
6 changes: 6 additions & 0 deletions config/crds/troubleshoot.sh_supportbundles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1270,6 +1270,12 @@ spec:
type: string
podCapacity:
type: string
resourceAllocatable:
type: string
resourceCapacity:
type: string
resourceName:
type: string
selector:
properties:
matchExpressions:
Expand Down
389 changes: 387 additions & 2 deletions pkg/analyze/files/nodes.json

Large diffs are not rendered by default.

79 changes: 65 additions & 14 deletions pkg/analyze/node_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ func (a *AnalyzeNodeResources) analyzeNodeResources(analyzer *troubleshootv1beta

for _, outcome := range analyzer.Outcomes {
if outcome.Fail != nil {
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Fail.When, matchingNodes)
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Fail.When, matchingNodes, analyzer.Filters)

if err != nil {
return nil, errors.Wrap(err, "failed to parse when")
}
Expand All @@ -100,7 +101,7 @@ func (a *AnalyzeNodeResources) analyzeNodeResources(analyzer *troubleshootv1beta
return result, nil
}
} else if outcome.Warn != nil {
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Warn.When, matchingNodes)
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Warn.When, matchingNodes, analyzer.Filters)
if err != nil {
return nil, errors.Wrap(err, "failed to parse when")
}
Expand All @@ -116,7 +117,7 @@ func (a *AnalyzeNodeResources) analyzeNodeResources(analyzer *troubleshootv1beta
return result, nil
}
} else if outcome.Pass != nil {
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Pass.When, matchingNodes)
isWhenMatch, err := compareNodeResourceConditionalToActual(outcome.Pass.When, matchingNodes, analyzer.Filters)
if err != nil {
return nil, errors.Wrap(err, "failed to parse when")
}
Expand All @@ -137,7 +138,7 @@ func (a *AnalyzeNodeResources) analyzeNodeResources(analyzer *troubleshootv1beta
return result, nil
}

func compareNodeResourceConditionalToActual(conditional string, matchingNodes []corev1.Node) (res bool, err error) {
func compareNodeResourceConditionalToActual(conditional string, matchingNodes []corev1.Node, filters *troubleshootv1beta2.NodeResourceFilters) (res bool, err error) {
res = false
err = nil

Expand Down Expand Up @@ -190,18 +191,23 @@ func compareNodeResourceConditionalToActual(conditional string, matchingNodes []

function := match[1]
property := match[2]
resourceName := ""

if filters != nil {
resourceName = filters.ResourceName
}

var actualValue interface{}

switch function {
case "count":
actualValue = len(matchingNodes)
case "min":
actualValue = findMin(matchingNodes, property)
actualValue = findMin(matchingNodes, property, resourceName)
case "max":
actualValue = findMax(matchingNodes, property)
actualValue = findMax(matchingNodes, property, resourceName)
case "sum":
actualValue = findSum(matchingNodes, property)
actualValue = findSum(matchingNodes, property, resourceName)
case "nodeCondition":
operatorChecker := regexp.MustCompile(`={1,3}`)
if !operatorChecker.MatchString(operator) {
Expand Down Expand Up @@ -311,7 +317,7 @@ func compareNodeResourceConditionalToActual(conditional string, matchingNodes []
return
}

func getQuantity(node corev1.Node, property string) *resource.Quantity {
func getQuantity(node corev1.Node, property string, resourceName string) *resource.Quantity {
switch property {
case "cpuCapacity":
return node.Status.Capacity.Cpu()
Expand All @@ -329,27 +335,39 @@ func getQuantity(node corev1.Node, property string) *resource.Quantity {
return node.Status.Capacity.StorageEphemeral()
case "ephemeralStorageAllocatable":
return node.Status.Allocatable.StorageEphemeral()
case "resourceCapacity":
capacity, ok := node.Status.Capacity[corev1.ResourceName(resourceName)]
if !ok {
return nil
}
return &capacity
case "resourceAllocatable":
allocatable, ok := node.Status.Allocatable[corev1.ResourceName(resourceName)]
if !ok {
return nil
}
return &allocatable
}
return nil
}

func findSum(nodes []corev1.Node, property string) *resource.Quantity {
func findSum(nodes []corev1.Node, property string, resourceName string) *resource.Quantity {
sum := resource.Quantity{}

for _, node := range nodes {
if quant := getQuantity(node, property); quant != nil {
if quant := getQuantity(node, property, resourceName); quant != nil {
sum.Add(*quant)
}
}

return &sum
}

func findMin(nodes []corev1.Node, property string) *resource.Quantity {
func findMin(nodes []corev1.Node, property string, resourceName string) *resource.Quantity {
var min *resource.Quantity

for _, node := range nodes {
if quant := getQuantity(node, property); quant != nil {
if quant := getQuantity(node, property, resourceName); quant != nil {
if min == nil {
min = quant
} else if quant.Cmp(*min) == -1 {
Expand All @@ -361,11 +379,11 @@ func findMin(nodes []corev1.Node, property string) *resource.Quantity {
return min
}

func findMax(nodes []corev1.Node, property string) *resource.Quantity {
func findMax(nodes []corev1.Node, property string, resourceName string) *resource.Quantity {
var max *resource.Quantity

for _, node := range nodes {
if quant := getQuantity(node, property); quant != nil {
if quant := getQuantity(node, property, resourceName); quant != nil {
if max == nil {
max = quant
} else if quant.Cmp(*max) == 1 {
Expand All @@ -382,6 +400,39 @@ func nodeMatchesFilters(node corev1.Node, filters *troubleshootv1beta2.NodeResou
return true, nil
}

if filters.ResourceName != "" {
capacity, capacityExists := node.Status.Capacity[corev1.ResourceName(filters.ResourceName)]
allocatable, allocatableExists := node.Status.Allocatable[corev1.ResourceName(filters.ResourceName)]

if !capacityExists && !allocatableExists {
return false, nil
}

if filters.ResourceCapacity != "" {
parsed, err := resource.ParseQuantity(filters.ResourceCapacity)
if err != nil {
return false, errors.Wrap(err, "failed to parse resource capacity")
}

// Compare the capacity value with the parsed value
if capacity.Cmp(parsed) == -1 {
return false, nil
}
}

if filters.ResourceAllocatable != "" {
parsed, err := resource.ParseQuantity(filters.ResourceAllocatable)
if err != nil {
return false, errors.Wrap(err, "failed to parse resource allocatable")
}

// Compare the allocatable value with the parsed value
if allocatable.Cmp(parsed) == -1 {
return false, nil
}
}
}

// all filters must pass for this to pass
if filters.Selector != nil {
selector, err := metav1.LabelSelectorAsSelector(
Expand Down
Loading

0 comments on commit 64ee9e5

Please sign in to comment.