Skip to content
This repository has been archived by the owner on Oct 22, 2024. It is now read-only.

Commit

Permalink
pmem-csi-driver: fix rescheduling after CreateVolume failure
Browse files Browse the repository at this point in the history
If CreateVolume fails for a volume with late binding, the Kubernetes
scheduler is supposed to be told that it needs to reschedule the
pod. This didn't work with PMEM-CSI because it returned an "internal"
error code instead of the "resource exhausted" one that the
external-provisioner knows about.

Instead of having different well-defined errors at different levels of
the PMEM-CSI software stack, a common error package is used. These
errors get wrapped when it is useful to add more information. We still
add the gRPC error code, though. That could be avoided by making
the internal errors based on gRPC but it is uncertain whether we want
to go that far.

(cherry picked from commit 34323ff)
  • Loading branch information
pohly committed Nov 11, 2020
1 parent 05d17ef commit c7ea8c9
Show file tree
Hide file tree
Showing 12 changed files with 99 additions and 68 deletions.
32 changes: 32 additions & 0 deletions pkg/errors/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
Copyright 2020 Intel Coporation.
SPDX-License-Identifier: Apache-2.0
*/

// Package errors contain some well-defined errors that may have to be
// passed up from low-level layers in the PMEM-CSI stack up to the
// gRPC interface.
//
// These errors must be wrapped (for example, with %w) so that the
// upper layers can use errors.Is to recognize these special errors if
// needed.
package errors

import (
"errors"
)

var (
// DeviceExists device with given id already exists
DeviceExists = errors.New("device exists")

// ErrDeviceNotFound device does not exists
DeviceNotFound = errors.New("device not found")

// ErrDeviceInUse device is in use
DeviceInUse = errors.New("device in use")

// ErrNotEnoughSpace no space to create the device
NotEnoughSpace = errors.New("not enough space")
)
9 changes: 3 additions & 6 deletions pkg/ndctl/ndctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ package ndctl
import "C"

import (
"errors"
"fmt"

"k8s.io/klog/v2"

pmemerr "github.com/intel/pmem-csi/pkg/errors"
)

const (
Expand All @@ -23,10 +24,6 @@ const (
tib uint64 = gib * 1024
)

var (
ErrNotExist = errors.New("namespace not found")
)

//CreateNamespaceOpts options to create a namespace
type CreateNamespaceOpts struct {
Name string
Expand Down Expand Up @@ -109,7 +106,7 @@ func (ctx *Context) GetNamespaceByName(name string) (*Namespace, error) {
}
}
}
return nil, ErrNotExist
return nil, pmemerr.DeviceNotFound
}

//GetActiveNamespaces returns list of all active namespaces in all regions
Expand Down
4 changes: 3 additions & 1 deletion pkg/ndctl/region.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import (

"github.com/google/uuid"
"k8s.io/klog/v2"

pmemerr "github.com/intel/pmem-csi/pkg/errors"
)

type RegionType string
Expand Down Expand Up @@ -170,7 +172,7 @@ func (r *Region) CreateNamespace(opts CreateNamespaceOpts) (*Namespace, error) {
available = r.AvailableSize()
}
if opts.Size > available {
return nil, fmt.Errorf("Not enough space to create namespace with size %v", opts.Size)
return nil, fmt.Errorf("create namespace with size %v: %w", opts.Size, pmemerr.NotEnoughSpace)
}
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/pmem-csi-driver/controllerserver-master.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ func (cs *masterController) CreateVolume(ctx context.Context, req *csi.CreateVol
}

if len(chosenNodes) == 0 {
return nil, status.Error(codes.Unavailable, fmt.Sprintf("No node found with %v capacity", asked))
return nil, status.Error(codes.ResourceExhausted, fmt.Sprintf("No node found with %v capacity", asked))
}

klog.V(3).Infof("Chosen nodes: %v", chosenNodes)
Expand Down
11 changes: 8 additions & 3 deletions pkg/pmem-csi-driver/controllerserver-node.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (

"github.com/container-storage-interface/spec/lib/go/csi"

pmemerr "github.com/intel/pmem-csi/pkg/errors"
grpcserver "github.com/intel/pmem-csi/pkg/grpc-server"
"github.com/intel/pmem-csi/pkg/pmem-csi-driver/parameters"
pmdmanager "github.com/intel/pmem-csi/pkg/pmem-device-manager"
Expand Down Expand Up @@ -97,7 +98,7 @@ func NewNodeControllerServer(nodeID string, dm pmdmanager.PmemDeviceManager, sm

if _, err := dm.GetDevice(id); err == nil {
found = true
} else if !errors.Is(err, pmdmanager.ErrDeviceNotFound) {
} else if !errors.Is(err, pmemerr.DeviceNotFound) {
klog.Warningf("Failed to fetch device for state volume '%s'(volume mode: '%s'): %v", id, v.GetDeviceMode(), err)
// Let's ignore this volume
continue
Expand Down Expand Up @@ -263,7 +264,11 @@ func (cs *nodeControllerServer) createVolumeInternal(ctx context.Context,
asked = 1
}
if err := cs.dm.CreateDevice(volumeID, uint64(asked)); err != nil {
statusErr = status.Errorf(codes.Internal, "Node CreateVolume: device creation failed: %v", err)
code := codes.Internal
if errors.Is(err, pmemerr.NotEnoughSpace) {
code = codes.ResourceExhausted
}
statusErr = status.Errorf(code, "Node CreateVolume: device creation failed: %v", err)
return
}
// TODO(?): determine and return actual size here?
Expand Down Expand Up @@ -316,7 +321,7 @@ func (cs *nodeControllerServer) DeleteVolume(ctx context.Context, req *csi.Delet
}

if err := dm.DeleteDevice(req.VolumeId, p.GetEraseAfter()); err != nil {
if errors.Is(err, pmdmanager.ErrDeviceInUse) {
if errors.Is(err, pmemerr.DeviceInUse) {
return nil, status.Errorf(codes.FailedPrecondition, err.Error())
}
return nil, status.Errorf(codes.Internal, "Failed to delete volume: %s", err.Error())
Expand Down
9 changes: 5 additions & 4 deletions pkg/pmem-csi-driver/nodeserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"k8s.io/klog/v2"
"k8s.io/utils/mount"

pmemerr "github.com/intel/pmem-csi/pkg/errors"
pmemexec "github.com/intel/pmem-csi/pkg/exec"
grpcserver "github.com/intel/pmem-csi/pkg/grpc-server"
"github.com/intel/pmem-csi/pkg/imagefile"
Expand Down Expand Up @@ -150,7 +151,7 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
// 1) No Volume found with given volume id
// 2) No provisioner info found in VolumeContext "storage.kubernetes.io/csiProvisionerIdentity"
// 3) No StagingPath in the request
if device, err = ns.cs.dm.GetDevice(req.VolumeId); err != nil && !errors.Is(err, pmdmanager.ErrDeviceNotFound) {
if device, err = ns.cs.dm.GetDevice(req.VolumeId); err != nil && !errors.Is(err, pmemerr.DeviceNotFound) {
return nil, status.Errorf(codes.Internal, "failed to get device details for volume id '%s': %v", req.VolumeId, err)
}
_, ok := req.GetVolumeContext()[volumeProvisionerIdentity]
Expand Down Expand Up @@ -181,7 +182,7 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
volumeParameters = v

if device, err = ns.cs.dm.GetDevice(req.VolumeId); err != nil {
if errors.Is(err, pmdmanager.ErrDeviceNotFound) {
if errors.Is(err, pmemerr.DeviceNotFound) {
return nil, status.Errorf(codes.NotFound, "no device found with volume id %q: %v", req.VolumeId, err)
}
return nil, status.Errorf(codes.Internal, "failed to get device details for volume id %q: %v", req.VolumeId, err)
Expand Down Expand Up @@ -487,7 +488,7 @@ func (ns *nodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol

device, err := ns.cs.dm.GetDevice(req.VolumeId)
if err != nil {
if errors.Is(err, pmdmanager.ErrDeviceNotFound) {
if errors.Is(err, pmemerr.DeviceNotFound) {
return nil, status.Errorf(codes.NotFound, "no device found with volume id %q: %v", req.VolumeId, err)
}
return nil, status.Errorf(codes.Internal, "failed to get device details for volume id %q: %v", req.VolumeId, err)
Expand Down Expand Up @@ -544,7 +545,7 @@ func (ns *nodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
// so we look up the current device by volumeID and see is that device
// mounted on staging target path
if _, err := ns.cs.dm.GetDevice(req.VolumeId); err != nil {
if errors.Is(err, pmdmanager.ErrDeviceNotFound) {
if errors.Is(err, pmemerr.DeviceNotFound) {
return nil, status.Errorf(codes.NotFound, "no device found with volume id '%s': %s", req.VolumeId, err.Error())
}
return nil, status.Errorf(codes.Internal, "failed to get device details for volume id '%s': %s", req.VolumeId, err.Error())
Expand Down
13 changes: 7 additions & 6 deletions pkg/pmem-device-manager/pmd-lvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"sync"

api "github.com/intel/pmem-csi/pkg/apis/pmemcsi/v1alpha1"
pmemerr "github.com/intel/pmem-csi/pkg/errors"
pmemexec "github.com/intel/pmem-csi/pkg/exec"
"github.com/intel/pmem-csi/pkg/ndctl"
pmemcommon "github.com/intel/pmem-csi/pkg/pmem-common"
Expand Down Expand Up @@ -132,7 +133,7 @@ func (lvm *pmemLvm) CreateDevice(volumeId string, size uint64) error {
// Avoid device filling with garbage entries by returning error.
// Overall, no point having more than one namespace with same volumeId.
if _, err := lvm.getDevice(volumeId); err == nil {
return ErrDeviceExists
return pmemerr.DeviceExists
}
vgs, err := getVolumeGroups(lvm.volumeGroups)
if err != nil {
Expand Down Expand Up @@ -177,7 +178,7 @@ func (lvm *pmemLvm) CreateDevice(volumeId string, size uint64) error {
}
}
}
return ErrNotEnoughSpace
return pmemerr.NotEnoughSpace
}

func (lvm *pmemLvm) DeleteDevice(volumeId string, flush bool) error {
Expand All @@ -188,13 +189,13 @@ func (lvm *pmemLvm) DeleteDevice(volumeId string, flush bool) error {
var device *PmemDeviceInfo

if device, err = lvm.getDevice(volumeId); err != nil {
if errors.Is(err, ErrDeviceNotFound) {
if errors.Is(err, pmemerr.DeviceNotFound) {
return nil
}
return err
}
if err := clearDevice(device, flush); err != nil {
if errors.Is(err, ErrDeviceNotFound) {
if errors.Is(err, pmemerr.DeviceNotFound) {
// Remove device from cache
delete(lvm.devices, volumeId)
return nil
Expand Down Expand Up @@ -236,7 +237,7 @@ func (lvm *pmemLvm) getDevice(volumeId string) (*PmemDeviceInfo, error) {
return dev, nil
}

return nil, ErrDeviceNotFound
return nil, pmemerr.DeviceNotFound
}

func getUncachedDevice(volumeId string, volumeGroup string) (*PmemDeviceInfo, error) {
Expand All @@ -249,7 +250,7 @@ func getUncachedDevice(volumeId string, volumeGroup string) (*PmemDeviceInfo, er
return dev, nil
}

return nil, ErrDeviceNotFound
return nil, pmemerr.DeviceNotFound
}

// listDevices Lists available logical devices in given volume groups
Expand Down
30 changes: 2 additions & 28 deletions pkg/pmem-device-manager/pmd-manager.go
Original file line number Diff line number Diff line change
@@ -1,35 +1,9 @@
package pmdmanager

import (
"errors"
"os"

api "github.com/intel/pmem-csi/pkg/apis/pmemcsi/v1alpha1"
)

var (
// ErrInvalid invalid argument passed
ErrInvalid = os.ErrInvalid

// ErrPermission no permission to complete the task
ErrPermission = os.ErrPermission

// ErrDeviceExists device with given id already exists
ErrDeviceExists = errors.New("device exists")

// ErrDeviceNotFound device does not exists
ErrDeviceNotFound = errors.New("device not found")

// ErrDeviceInUse device is in use
ErrDeviceInUse = errors.New("device in use")

// ErrDeviceNotReady device not ready yet
ErrDeviceNotReady = errors.New("device not ready")

// ErrNotEnoughSpace no space to create the device
ErrNotEnoughSpace = errors.New("not enough space")
)

//PmemDeviceInfo represents a block device
type PmemDeviceInfo struct {
//VolumeId is name of the block device
Expand Down Expand Up @@ -64,7 +38,7 @@ type PmemDeviceManager interface {
GetCapacity() (Capacity, error)

// CreateDevice creates a new block device with give name, size and namespace mode
// Possible errors: ErrNotEnoughSpace, ErrInvalid, ErrDeviceExists
// Possible errors: ErrNotEnoughSpace, ErrDeviceExists
CreateDevice(name string, size uint64) error

// GetDevice returns the block device information for given name
Expand All @@ -73,7 +47,7 @@ type PmemDeviceManager interface {

// DeleteDevice deletes an existing block device with give name.
// If 'flush' is 'true', then the device data is zeroed before deleting the device
// Possible errors: ErrDeviceInUse, ErrPermission
// Possible errors: ErrDeviceInUse
DeleteDevice(name string, flush bool) error

// ListDevices returns all the block devices information that was created by this device manager
Expand Down
7 changes: 4 additions & 3 deletions pkg/pmem-device-manager/pmd-manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"strings"
"testing"

pmemerr "github.com/intel/pmem-csi/pkg/errors"
pmemexec "github.com/intel/pmem-csi/pkg/exec"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
Expand Down Expand Up @@ -125,7 +126,7 @@ func runTests(mode string) {
It("Should fail to retrieve non-existent device", func() {
dev, err := dm.GetDevice("unknown")
Expect(err).ShouldNot(BeNil(), "Error expected")
Expect(errors.Is(err, ErrDeviceNotFound)).Should(BeTrue(), "expected error is device not found error")
Expect(errors.Is(err, pmemerr.DeviceNotFound)).Should(BeTrue(), "expected error is device not found error")
Expect(dev).Should(BeNil(), "returned device should be nil")
})

Expand Down Expand Up @@ -206,7 +207,7 @@ func runTests(mode string) {
// Delete should fail as the device is in use
err = dm.DeleteDevice(name, true)
Expect(err).ShouldNot(BeNil(), "Error expected when deleting device in use: %s", dev.VolumeId)
Expect(errors.Is(err, ErrDeviceInUse)).Should(BeTrue(), "Expected device busy error: %s", dev.VolumeId)
Expect(errors.Is(err, pmemerr.DeviceInUse)).Should(BeTrue(), "Expected device busy error: %s", dev.VolumeId)
cleanupList[name] = false

err = unmount(mountPath)
Expand All @@ -218,7 +219,7 @@ func runTests(mode string) {

dev, err = dm.GetDevice(name)
Expect(err).ShouldNot(BeNil(), "GetDevice() should fail on deleted device")
Expect(errors.Is(err, ErrDeviceNotFound)).Should(BeTrue(), "expected error is os.ErrNotExist")
Expect(errors.Is(err, pmemerr.DeviceNotFound)).Should(BeTrue(), "expected error is DeviceNodeFound")
Expect(dev).Should(BeNil(), "returned device should be nil")

// Delete call should not return any error on non-existing device
Expand Down
12 changes: 5 additions & 7 deletions pkg/pmem-device-manager/pmd-ndctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"sync"

api "github.com/intel/pmem-csi/pkg/apis/pmemcsi/v1alpha1"
pmemerr "github.com/intel/pmem-csi/pkg/errors"
"github.com/intel/pmem-csi/pkg/ndctl"
"k8s.io/klog/v2"
"k8s.io/utils/mount"
Expand Down Expand Up @@ -112,7 +113,7 @@ func (pmem *pmemNdctl) CreateDevice(volumeId string, size uint64) error {
// Avoid device filling with garbage entries by returning error.
// Overall, no point having more than one namespace with same name.
if _, err := getDevice(ndctx, volumeId); err == nil {
return ErrDeviceExists
return pmemerr.DeviceExists
}

// libndctl needs to store meta data and will use some of the allocated
Expand Down Expand Up @@ -158,13 +159,13 @@ func (pmem *pmemNdctl) DeleteDevice(volumeId string, flush bool) error {

device, err := getDevice(ndctx, volumeId)
if err != nil {
if errors.Is(err, ErrDeviceNotFound) {
if errors.Is(err, pmemerr.DeviceNotFound) {
return nil
}
return err
}
if err := clearDevice(device, flush); err != nil {
if errors.Is(err, ErrDeviceNotFound) {
if errors.Is(err, pmemerr.DeviceNotFound) {
return nil
}
return err
Expand Down Expand Up @@ -205,10 +206,7 @@ func (pmem *pmemNdctl) ListDevices() ([]*PmemDeviceInfo, error) {
func getDevice(ndctx *ndctl.Context, volumeId string) (*PmemDeviceInfo, error) {
ns, err := ndctx.GetNamespaceByName(volumeId)
if err != nil {
if errors.Is(err, ndctl.ErrNotExist) {
return nil, ErrDeviceNotFound
}
return nil, fmt.Errorf("error getting device %q: %v", volumeId, err)
return nil, fmt.Errorf("error getting device %q: %w", volumeId, err)
}

return namespaceToPmemInfo(ns), nil
Expand Down
5 changes: 3 additions & 2 deletions pkg/pmem-device-manager/pmd-util.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strconv"
"time"

pmemerr "github.com/intel/pmem-csi/pkg/errors"
pmemexec "github.com/intel/pmem-csi/pkg/exec"
"golang.org/x/sys/unix"
"k8s.io/klog/v2"
Expand Down Expand Up @@ -44,7 +45,7 @@ func clearDevice(dev *PmemDeviceInfo, flush bool) error {
defer unix.Close(fd)

if err != nil {
return fmt.Errorf("failed to clear device %q: %w", dev.Path, ErrDeviceInUse)
return fmt.Errorf("failed to clear device %q: %w", dev.Path, pmemerr.DeviceInUse)
}

if blocks == 0 {
Expand Down Expand Up @@ -78,5 +79,5 @@ func waitDeviceAppears(dev *PmemDeviceInfo) error {
i, dev.Path, retryStatTimeout)
time.Sleep(retryStatTimeout)
}
return fmt.Errorf("%s: %w", dev.Path, ErrDeviceNotReady)
return fmt.Errorf("%s: device not ready", dev.Path)
}
Loading

0 comments on commit c7ea8c9

Please sign in to comment.