PWX-30276: Do not delete pods running on storage down nodes. (#1385)
* PWX-30276: Do not delete pods running on storage down nodes.

- Introduce a new state in Stork for storage down nodes.
- Remap Portworx's StorageDown state to Stork's StorageDown state.
- For the degraded node state:
  - filter out such nodes from extender requests.
  - the health monitor should delete pods running on them.
- For the storage down state (see the sketch below):
  - do not filter out such nodes.
  - give a lower score to nodes that are storage down.
  - the health monitor should not delete pods running on them.

Signed-off-by: Aditya Dani <aditya@portworx.com>

* PWX-30276: Handle review comments

- Added integration tests for the extender and health-monitor.
- Simulate a StorageDown state by using the Portworx driver's Pool Maintenance operation.

---------

Signed-off-by: Aditya Dani <aditya@portworx.com>
adityadani authored May 9, 2023
1 parent 2bebe98 commit 07ea4ff
Showing 9 changed files with 229 additions and 68 deletions.
7 changes: 4 additions & 3 deletions drivers/volume/portworx/portworx.go
@@ -530,9 +530,12 @@ func (p *portworx) inspectVolume(volDriver volume.VolumeDriver, volumeID string)

return info, nil
}

func (p *portworx) mapNodeStatus(status api.Status) storkvolume.NodeStatus {
switch status {
case api.Status_STATUS_POOLMAINTENANCE:
fallthrough
case api.Status_STATUS_STORAGE_DOWN:
return storkvolume.NodeStorageDown
case api.Status_STATUS_INIT:
fallthrough
case api.Status_STATUS_OFFLINE:
@@ -551,8 +554,6 @@ func (p *portworx) mapNodeStatus(status api.Status) storkvolume.NodeStatus {
fallthrough
case api.Status_STATUS_OK:
return storkvolume.NodeOnline
case api.Status_STATUS_STORAGE_DOWN:
fallthrough
case api.Status_STATUS_STORAGE_DEGRADED:
fallthrough
case api.Status_STATUS_STORAGE_REBALANCE:
4 changes: 4 additions & 0 deletions drivers/volume/volume.go
@@ -288,6 +288,10 @@ const (
NodeOnline NodeStatus = "Online"
// NodeOffline Node is Offline
NodeOffline NodeStatus = "Offline"
// NodeStorageDown Node is Online but its storage is down.
// The expectation from the driver is that applications can continue their IO
// since the data is replicated to other nodes.
NodeStorageDown NodeStatus = "StorageDown"
// NodeDegraded Node is in degraded state
NodeDegraded NodeStatus = "Degraded"
)
34 changes: 18 additions & 16 deletions pkg/extender/extender.go
@@ -39,10 +39,10 @@ const (
regionPriorityScore float64 = 10
// defaultScore Score assigned to a node which doesn't have data for any volume
defaultScore float64 = 5
// degradedNodeScorePenaltyPercentage is the percentage by which a node's score
// will take a hit if the node's status is degraded
degradedNodeScorePenaltyPercentage float64 = 50
schedulingFailureEventReason = "FailedScheduling"
// storageDownNodeScorePenaltyPercentage is the percentage by which a node's score
// will take a hit if the node's status is StorageDown
storageDownNodeScorePenaltyPercentage float64 = 50
schedulingFailureEventReason = "FailedScheduling"
// Pod annotation to check if only local nodes should be used to schedule a pod
preferLocalNodeOnlyAnnotation = "stork.libopenstorage.org/preferLocalNodeOnly"
// StorageCluster parameter to check if only remote nodes should be used to schedule a pod
@@ -280,7 +280,7 @@ func (e *Extender) processFilterRequest(w http.ResponseWriter, req *http.Request
for _, node := range args.Nodes.Items {
for _, driverNode := range driverNodes {
storklog.PodLog(pod).Debugf("nodeInfo: %v", driverNode)
if (driverNode.Status == volume.NodeOnline || driverNode.Status == volume.NodeDegraded) &&
if (driverNode.Status == volume.NodeOnline || driverNode.Status == volume.NodeStorageDown) &&
volume.IsNodeMatch(&node, driverNode) {
// If only nodes with replicas are to be preferred,
// filter out all nodes that don't have a replica
@@ -459,36 +459,36 @@ func (e *Extender) getNodeScore(
if rack == nodeRack || nodeRack == "" {
for _, datanodeID := range volumeInfo.DataNodes {
if storageNode.StorageID == datanodeID {
if storageNode.Status == volume.NodeDegraded {
if storageNode.Status == volume.NodeStorageDown {
// Even if the volume data is local to this node, the node's storage
// is down, so the app won't benefit from hyperconvergence on it.
// We therefore do not use the nodePriorityScore but instead take the
// rackPriorityScore and penalize based on that.
return rackPriorityScore * (degradedNodeScorePenaltyPercentage / 100)
return rackPriorityScore * (storageDownNodeScorePenaltyPercentage / 100)
}
return nodePriorityScore
}
}
if nodeRack != "" {
if storageNode.Status == volume.NodeDegraded {
return rackPriorityScore * (degradedNodeScorePenaltyPercentage / 100)
if storageNode.Status == volume.NodeStorageDown {
return rackPriorityScore * (storageDownNodeScorePenaltyPercentage / 100)
}
return rackPriorityScore
}
}
}
if nodeZone != "" {
if storageNode.Status == volume.NodeDegraded {
return zonePriorityScore * (degradedNodeScorePenaltyPercentage / 100)
if storageNode.Status == volume.NodeStorageDown {
return zonePriorityScore * (storageDownNodeScorePenaltyPercentage / 100)
}
return zonePriorityScore
}
}
}
if nodeRegion != "" {
if storageNode.Status == volume.NodeDegraded {
return regionPriorityScore * (degradedNodeScorePenaltyPercentage / 100)
if storageNode.Status == volume.NodeStorageDown {
return regionPriorityScore * (storageDownNodeScorePenaltyPercentage / 100)
}
return regionPriorityScore
}
@@ -599,7 +599,7 @@ func (e *Extender) processPrioritizeRequest(w http.ResponseWriter, req *http.Req
storklog.PodLog(pod).Debugf("nodeInfo: %v", dnode)
// For any node that is offline remove the locality info so that we
// don't prioritize nodes close to it
if dnode.Status == volume.NodeOnline || dnode.Status == volume.NodeDegraded {
if dnode.Status == volume.NodeOnline || dnode.Status == volume.NodeStorageDown {
// Add region info into zone and zone info into rack so that we can
// differentiate same names in different localities
regionInfo.HostnameMap[dnode.Hostname] = dnode.Region
@@ -749,7 +749,9 @@ func (e *Extender) processCSIExtPodFilterRequest(
for _, knode := range args.Nodes.Items {
for _, dnode := range driverNodes {
storklog.PodLog(pod).Debugf("nodeInfo: %v", dnode)
if (dnode.Status == volume.NodeOnline || dnode.Status == volume.NodeDegraded) &&
// Only nodes that are Online or in the StorageDown state can have pods scheduled on them.
// Nodes in the Offline or Degraded state are skipped.
if (dnode.Status == volume.NodeOnline || dnode.Status == volume.NodeStorageDown) &&
volume.IsNodeMatch(&knode, dnode) {
filteredNodes = append(filteredNodes, knode)
break
@@ -800,7 +802,7 @@ func (e *Extender) processCSIExtPodPrioritizeRequest(
} else if dnode.Status == volume.NodeOffline {
score = 0
} else {
score = int64(nodePriorityScore * (degradedNodeScorePenaltyPercentage / 100))
score = int64(nodePriorityScore * (storageDownNodeScorePenaltyPercentage / 100))
}
hostPriority := schedulerapi.HostPriority{Host: knode.Name, Score: int64(score)}
respList = append(respList, hostPriority)
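A quick worked example of the new penalty, using the constants visible in this diff (regionPriorityScore = 10, defaultScore = 5, and the 50% penalty): a StorageDown node that only matches a pod's volume at the region level now scores 10 * (50 / 100) = 5, no better than a node holding none of the volume's data, while an Online node in the same position keeps the full score of 10. The same penalty applies at the node, rack, and zone levels, so healthy replica nodes are still preferred for hyperconvergence, but StorageDown nodes remain schedulable instead of being filtered out.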