Skip to content

Commit

Permalink
Fall back to a running instance-manager if a default is not available
Browse files Browse the repository at this point in the history
Some operations in controllers, such as deleting disks and reconciling
backup targets, initially fetch the default instance-manager. However,
these operations are not restricted to the default instance-manager;
they can use any running instance-manager. To enhance resilience, if
the default instance-manager is unable to start — for example, before an
upgrade of the v2 data engine, or for any other reason on either the v1
or v2 data engine — the controller can switch to any running
instance-manager as a fallback.

Longhorn 8464

Signed-off-by: Derek Su <derek.su@suse.com>
  • Loading branch information
derekbit authored and innobead committed May 2, 2024
1 parent 6b98e0b commit 4e7f281
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 14 deletions.
12 changes: 7 additions & 5 deletions controller/backup_target_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,9 +237,9 @@ func getBackupTarget(controllerID string, backupTarget *longhorn.BackupTarget, d
return nil, nil, errors.Wrap(err, "failed to get available data engine for getting backup target")
}

instanceManager, err := ds.GetDefaultInstanceManagerByNodeRO(controllerID, dataEngine)
instanceManager, err := ds.GetRunningInstanceManagerByNodeRO(controllerID, dataEngine)
if err != nil {
return nil, nil, errors.Wrap(err, "failed to get default engine instance manager for proxy client")
return nil, nil, errors.Wrap(err, "failed to get running instance manager for proxy client")
}

engineClientProxy, err = engineapi.NewEngineClientProxy(instanceManager, log, proxyConnCounter)
Expand Down Expand Up @@ -635,10 +635,12 @@ func (btc *BackupTargetController) isResponsibleFor(bt *longhorn.BackupTarget, d
return false, err
}

if instanceManager, err := btc.ds.GetDefaultInstanceManagerByNodeRO(btc.controllerID, ""); err != nil {
instanceManager, err := btc.ds.GetRunningInstanceManagerByNodeRO(btc.controllerID, "")
if err != nil {
return false, err
} else if instanceManager == nil || instanceManager.Status.CurrentState != longhorn.InstanceManagerStateRunning {
return false, errors.New("failed to get default running instance manager")
}
if instanceManager == nil {
return false, errors.New("failed to get running instance manager")
}

isPreferredOwner := currentNodeEngineAvailable && isResponsible
Expand Down
8 changes: 4 additions & 4 deletions controller/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1500,25 +1500,25 @@ func (nc *NodeController) alignDiskSpecAndStatus(node *longhorn.Node) {
if diskInstanceName == "" {
diskInstanceName = diskName
}
if err := nc.deleteDisk(node, diskStatus.Type, diskInstanceName, diskStatus.DiskUUID, diskStatus.DiskPath, string(diskStatus.DiskDriver)); err != nil {
if err := nc.deleteDisk(diskStatus.Type, diskInstanceName, diskStatus.DiskUUID, diskStatus.DiskPath, string(diskStatus.DiskDriver)); err != nil {
nc.logger.WithError(err).Warnf("Failed to delete disk %v", diskInstanceName)
}
delete(node.Status.DiskStatus, diskName)
}
}
}

func (nc *NodeController) deleteDisk(node *longhorn.Node, diskType longhorn.DiskType, diskName, diskUUID, diskPath, diskDriver string) error {
func (nc *NodeController) deleteDisk(diskType longhorn.DiskType, diskName, diskUUID, diskPath, diskDriver string) error {
if diskUUID == "" {
log.Infof("Disk %v has no diskUUID, skip deleting", diskName)
return nil
}

dataEngine := util.GetDataEngineForDiskType(diskType)

im, err := nc.ds.GetDefaultInstanceManagerByNodeRO(nc.controllerID, dataEngine)
im, err := nc.ds.GetRunningInstanceManagerByNodeRO(nc.controllerID, dataEngine)
if err != nil {
return errors.Wrapf(err, "failed to get default instance manager")
return errors.Wrapf(err, "failed to get running instance manager for data engine %v", dataEngine)
}

diskServiceClient, err := engineapi.NewDiskServiceClient(im, nc.logger)
Expand Down
10 changes: 5 additions & 5 deletions controller/orphan_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -327,22 +327,22 @@ func (oc *OrphanController) deleteOrphanedReplica(orphan *longhorn.Orphan) error
err := lhns.DeletePath(filepath.Join(diskPath, "replicas", replicaDirectoryName))
return errors.Wrapf(err, "failed to delete orphan replica directory %v in disk %v", replicaDirectoryName, diskPath)
case longhorn.DiskTypeBlock:
return oc.DeleteSpdkReplicaInstance(orphan.Spec.Parameters[longhorn.OrphanDiskName], orphan.Spec.Parameters[longhorn.OrphanDiskUUID], "", orphan.Spec.Parameters[longhorn.OrphanDataName])
return oc.DeleteV2ReplicaInstance(orphan.Spec.Parameters[longhorn.OrphanDiskName], orphan.Spec.Parameters[longhorn.OrphanDiskUUID], "", orphan.Spec.Parameters[longhorn.OrphanDataName])
default:
return fmt.Errorf("unknown disk type %v for orphan %v", diskType, orphan.Name)
}
}

func (oc *OrphanController) DeleteSpdkReplicaInstance(diskName, diskUUID, diskDriver, replicaInstanceName string) (err error) {
func (oc *OrphanController) DeleteV2ReplicaInstance(diskName, diskUUID, diskDriver, replicaInstanceName string) (err error) {
logrus.Infof("Deleting SPDK replica instance %v on disk %v on node %v", replicaInstanceName, diskUUID, oc.controllerID)

defer func() {
err = errors.Wrapf(err, "cannot delete SPDK replica instance %v", replicaInstanceName)
err = errors.Wrapf(err, "cannot delete v2 replica instance %v", replicaInstanceName)
}()

im, err := oc.ds.GetDefaultInstanceManagerByNodeRO(oc.controllerID, longhorn.DataEngineTypeV2)
im, err := oc.ds.GetRunningInstanceManagerByNodeRO(oc.controllerID, longhorn.DataEngineTypeV2)
if err != nil {
return errors.Wrapf(err, "failed to get instance manager for node %v for deleting SPDK replica instance %v", oc.controllerID, replicaInstanceName)
return errors.Wrapf(err, "failed to get running instance manager for node %v for deleting v2 replica instance %v", oc.controllerID, replicaInstanceName)
}

c, err := engineapi.NewDiskServiceClient(im, oc.logger)
Expand Down
27 changes: 27 additions & 0 deletions datastore/longhorn.go
Original file line number Diff line number Diff line change
Expand Up @@ -5061,3 +5061,30 @@ func (s *DataStore) ListBackupBackingImages() (map[string]*longhorn.BackupBackin
func (s *DataStore) ListBackupBackingImagesRO() ([]*longhorn.BackupBackingImage, error) {
return s.backupBackingImageLister.BackupBackingImages(s.namespace).List(labels.Everything())
}

// GetRunningInstanceManagerByNodeRO returns a running instance manager for the
// given node and data engine. It prefers the default instance manager, but if
// the default one is missing or not in the running state, it falls back to any
// other running all-in-one instance manager on the node so callers that only
// need a proxy or disk-service client stay functional (e.g. during upgrades).
func (s *DataStore) GetRunningInstanceManagerByNodeRO(node string, dataEngine longhorn.DataEngineType) (*longhorn.InstanceManager, error) {
	// Prefer the default instance manager when it is already running.
	defaultIM, err := s.GetDefaultInstanceManagerByNodeRO(node, dataEngine)
	if err == nil && defaultIM.Status.CurrentState == longhorn.InstanceManagerStateRunning {
		return defaultIM, nil
	}

	// Log why the default instance manager is unusable before falling back.
	// Note: err is nil when the default instance manager exists but is simply
	// not running, so avoid logrus.WithError(nil) and a misleading message.
	if err != nil {
		logrus.WithError(err).Warnf("Failed to get the default instance manager for node %v and data engine %v, trying to get another running instance manager", node, dataEngine)
	} else {
		logrus.Warnf("Default instance manager %v for node %v and data engine %v is in state %v, trying to get another running instance manager", defaultIM.Name, node, dataEngine, defaultIM.Status.CurrentState)
	}

	ims, err := s.ListInstanceManagersByNodeRO(node, longhorn.InstanceManagerTypeAllInOne, dataEngine)
	if err != nil {
		return nil, errors.Wrapf(err, "failed to list instance managers for node %v", node)
	}

	// Return the first instance manager found in the running state.
	for _, im := range ims {
		if im.Status.CurrentState == longhorn.InstanceManagerStateRunning {
			return im, nil
		}
	}

	return nil, fmt.Errorf("failed to find a running instance manager for node %v", node)
}

0 comments on commit 4e7f281

Please sign in to comment.