From 8fe08383e1f6ee44293b47f27fb64f95b212e3cc Mon Sep 17 00:00:00 2001 From: Praveen raj Mani Date: Fri, 15 Sep 2023 12:13:29 +0530 Subject: [PATCH] Support drive anti-affinity for volumes (#823) Some setups require volumes to be allocated on unique disks, i.e. not more than one volume per disk. By default, the volume scheduling algorithm chooses the drive with the most free capacity. This can end up allocating more than one volume per disk. This PR supports such setups through storage class parameters. A storage class with the `directpv.min.io/volume-claim-id: XXX` parameter enables unique allocation for its PVCs; the claim ID enforces one-to-one cardinality between drives and volumes. Co-authored-by: Bala FA --- docs/tools/create-storage-class.sh | 25 +++-- docs/volume-scheduling.md | 101 +++++++++++++++++++-- pkg/apis/directpv.min.io/types/label.go | 9 ++ pkg/apis/directpv.min.io/v1beta1/drive.go | 25 +++++ pkg/apis/directpv.min.io/v1beta1/volume.go | 13 +++ pkg/csi/controller/server.go | 14 ++- pkg/csi/controller/utils.go | 5 + pkg/csi/controller/utils_test.go | 72 +++++++++++---- pkg/volume/event.go | 2 +- 9 files changed, 232 insertions(+), 34 deletions(-) diff --git a/docs/tools/create-storage-class.sh b/docs/tools/create-storage-class.sh index fc4739c96..61f72c68a 100755 --- a/docs/tools/create-storage-class.sh +++ b/docs/tools/create-storage-class.sh @@ -18,27 +18,40 @@ set -e -C -o pipefail -declare NAME DRIVE_LABEL +declare NAME +declare -a DRIVE_LABELS function init() { - if [[ $# -ne 2 ]]; then + if [[ $# -lt 2 ]]; then cat <<EOF USAGE: - create-storage-class.sh <NAME> <DRIVE-LABEL> + create-storage-class.sh <NAME> <DRIVE-LABELS> ... ARGUMENTS: NAME new storage class name. - DRIVE-LABEL drive labels to be attached. + DRIVE-LABELS drive labels to be attached. EXAMPLE: # Create new storage class 'fast-tier-storage' with drive labels 'directpv.min.io/tier: fast' $ create-storage-class.sh fast-tier-storage 'directpv.min.io/tier: fast' + + # Create new storage class with more than one drive label + $ create-storage-class.sh fast-tier-unique 'directpv.min.io/tier: fast' 'directpv.min.io/volume-claim-id: bcea279a-df70-4d23-be41-9490f9933004' EOF exit 255 fi NAME="$1" - DRIVE_LABEL="$2" + shift + DRIVE_LABELS=( "$@" ) + + for val in "${DRIVE_LABELS[@]}"; do + if [[ "$val" =~ ^directpv.min.io/volume-claim-id:.* ]] && ! [[ "${val#directpv.min.io/volume-claim-id: }" =~ ^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$ ]]; + then + echo "invalid UUID value set for volume-claim-id; please set a valid UUID as per RFC 4122" + exit 255 + fi + done if ! which kubectl >/dev/null 2>&1; then echo "kubectl not found; please install" @@ -67,7 +80,7 @@ metadata: name: ${NAME} parameters: fstype: xfs - ${DRIVE_LABEL} +$(printf ' %s\n' "${DRIVE_LABELS[@]}") provisioner: directpv-min-io reclaimPolicy: Delete volumeBindingMode: WaitForFirstConsumer diff --git a/docs/volume-scheduling.md b/docs/volume-scheduling.md index 3b2936b7d..ae4af635a 100644 --- a/docs/volume-scheduling.md +++ b/docs/volume-scheduling.md @@ -11,6 +11,7 @@ DirectPV CSI controller selects suitable drive for `CreateVolume` request like b a. By requested capacity b. By access-tier if requested c. By topology constraints if requested + d. By volume claim ID if requested 4. In the process of step (3), if more than one drive is selected, the maximum free capacity drive is picked. 5. If step (4) picks up more than one drive, a drive is randomly selected. 6. Finally the selected drive is updated with requested volume information.
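To make the selection flow above concrete, here is a simplified, self-contained Go sketch of steps (3) to (5). The `drive` struct and both functions are illustrative stand-ins, not the actual `pkg/csi/controller` code; only the claim-ID label convention `directpv.min.io/volume-claim-id-<id>: "true"` is taken from this patch, and the topology filter is elided.

```go
package main

import (
	"fmt"
	"math/rand"
)

// drive is an illustrative stand-in for the DirectPV drive resource.
type drive struct {
	FreeCapacity int64
	AccessTier   string
	Labels       map[string]string
}

// matches applies the documented filters: requested capacity, access tier
// if requested, and volume claim ID if requested.
func matches(d drive, size int64, tier, claimID string) bool {
	if d.FreeCapacity < size {
		return false
	}
	if tier != "" && d.AccessTier != tier {
		return false
	}
	// A drive already holding a volume for this claim ID is skipped.
	if claimID != "" && d.Labels["directpv.min.io/volume-claim-id-"+claimID] == "true" {
		return false
	}
	return true
}

// selectDrive keeps only the matched drives with the maximum free capacity
// and picks one of them at random, as in steps (4) and (5).
func selectDrive(drives []drive, size int64, tier, claimID string) *drive {
	var matched []drive
	var maxFree int64
	for _, d := range drives {
		if !matches(d, size, tier, claimID) {
			continue
		}
		switch {
		case d.FreeCapacity > maxFree:
			matched, maxFree = []drive{d}, d.FreeCapacity
		case d.FreeCapacity == maxFree:
			matched = append(matched, d)
		}
	}
	if len(matched) == 0 {
		return nil
	}
	return &matched[rand.Intn(len(matched))]
}

func main() {
	claimID := "555e99eb-e255-4407-83e3-fc443bf20f86"
	drives := []drive{
		{FreeCapacity: 64 << 30},
		{FreeCapacity: 64 << 30, Labels: map[string]string{
			"directpv.min.io/volume-claim-id-" + claimID: "true",
		}},
	}
	// Only the first drive matches: the second already has this claim ID.
	if d := selectDrive(drives, 16<<20, "", claimID); d != nil {
		fmt.Println("selected drive with free capacity", d.FreeCapacity)
	}
}
```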
@@ -51,14 +52,17 @@ DirectPV CSI controller selects suitable drive for `CreateVolume` request like b | ╭╌╌╌╌╌╌╌V╌╌╌╌╌╌╌╮ | │ if requested? │ | No │ Is more than │ | ╰╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╯ +-----------│ one drive │ | | Yes - │ matched? │ | ┌───────V───────┐ - ╰╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╯ | │ Append to │ - | Yes +<----│ matched drives│ - ┌───────V───────┐ └───────────────┘ - │ Return │ - │ Randomly │ - │ selected drive│ - └───────────────┘ + │ matched? │ | ╭╌╌╌╌╌╌╌V╌╌╌╌╌╌╌╮ + ╰╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╯ | │ Match by │ + | Yes | Yes │ volume claim │ + ┌───────V───────┐ |<----│ ID │ + │ Return │ | │ if requested? │ + │ Randomly │ | ╰╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╯ + │ selected drive│ | | No + └───────────────┘ | ┌───────V───────┐ + | │ Append to │ + +<----│ matched drives│ + └───────────────┘ ``` ## Customizing drive selection @@ -92,3 +96,84 @@ spec: storage: 8Mi EOF ``` + +### Unique drive selection + +The default free-capacity-based drive selection can place more than one volume of a StatefulSet deployment on a single drive, which hurts performance and high availability for applications like MinIO object storage. To overcome this behavior, DirectPV provides a way to allocate at most one volume per drive. Enable it by creating a custom storage class with the parameter 'directpv.min.io/volume-claim-id'. Below is an example of creating such a storage class using the [create-storage-class.sh script](./tools/create-storage-class.sh): + +```sh +create-storage-class.sh tenant-1-storage 'directpv.min.io/volume-claim-id: 555e99eb-e255-4407-83e3-fc443bf20f86' +``` + +Use this custom storage class in your StatefulSet deployment. Below is an example of deploying MinIO object storage: + +```yaml +kind: Service +apiVersion: v1 +metadata: + name: minio + labels: + app: minio +spec: + selector: + app: minio + ports: + - name: minio + port: 9000 + +--- + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: minio + labels: + app: minio +spec: + serviceName: "minio" + replicas: 2 + selector: + matchLabels: + app: minio + template: + metadata: + labels: + app: minio + directpv.min.io/organization: minio + directpv.min.io/app: minio-example + directpv.min.io/tenant: tenant-1 + spec: + containers: + - name: minio + image: minio/minio + env: + - name: MINIO_ACCESS_KEY + value: minio + - name: MINIO_SECRET_KEY + value: minio123 + volumeMounts: + - name: minio-data-1 + mountPath: /data1 + - name: minio-data-2 + mountPath: /data2 + args: + - "server" + - "http://minio-{0...1}.minio.default.svc.cluster.local:9000/data{1...2}" + volumeClaimTemplates: + - metadata: + name: minio-data-1 + spec: + storageClassName: tenant-1-storage + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 16Mi + - metadata: + name: minio-data-2 + spec: + storageClassName: tenant-1-storage + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 16Mi +``` diff --git a/pkg/apis/directpv.min.io/types/label.go b/pkg/apis/directpv.min.io/types/label.go index a1da58902..c4388b5ff 100644 --- a/pkg/apis/directpv.min.io/types/label.go +++ b/pkg/apis/directpv.min.io/types/label.go @@ -79,6 +79,15 @@ const ( // SuspendLabelKey denotes if the volume is suspended.
SuspendLabelKey LabelKey = consts.GroupName + "/suspend" + + // VolumeClaimIDLabelKey denotes the volume claim ID label key used for unique drive allocation + VolumeClaimIDLabelKey LabelKey = consts.GroupName + "/volume-claim-id" + + // VolumeClaimIDLabelKeyPrefix is the prefix of the volume claim ID label keys set on drives + VolumeClaimIDLabelKeyPrefix = consts.GroupName + "/volume-claim-id-" + + // ClaimIDLabelKey denotes the claim ID label key of a volume + ClaimIDLabelKey LabelKey = consts.GroupName + "/claim-id" ) // LabelValue is a type definition for label value diff --git a/pkg/apis/directpv.min.io/v1beta1/drive.go b/pkg/apis/directpv.min.io/v1beta1/drive.go index 543ee3fbc..e9fab227b 100644 --- a/pkg/apis/directpv.min.io/v1beta1/drive.go +++ b/pkg/apis/directpv.min.io/v1beta1/drive.go @@ -17,6 +17,7 @@ package v1beta1 import ( + "strconv" "strings" "github.com/minio/directpv/pkg/apis/directpv.min.io/types" @@ -221,6 +222,30 @@ func (drive DirectPVDrive) GetNodeID() types.NodeID { return types.NodeID(drive.getLabel(types.NodeLabelKey)) } +// HasVolumeClaimID checks if the provided volume claim id is set on the drive. +func (drive *DirectPVDrive) HasVolumeClaimID(claimID string) bool { + if claimID == "" { + return false + } + return drive.GetLabels()[types.VolumeClaimIDLabelKeyPrefix+claimID] == strconv.FormatBool(true) +} + +// SetVolumeClaimID sets the provided claim id on the drive. +func (drive *DirectPVDrive) SetVolumeClaimID(claimID string) { + if claimID == "" { + return + } + drive.SetLabel(types.LabelKey(types.VolumeClaimIDLabelKeyPrefix+claimID), types.LabelValue(strconv.FormatBool(true))) +} + +// RemoveVolumeClaimID removes the volume claim id label. +func (drive *DirectPVDrive) RemoveVolumeClaimID(claimID string) { + if claimID == "" { + return + } + drive.RemoveLabel(types.LabelKey(types.VolumeClaimIDLabelKeyPrefix + claimID)) +} + // SetLabel sets label to this drive. func (drive *DirectPVDrive) SetLabel(key types.LabelKey, value types.LabelValue) bool { values := drive.GetLabels() diff --git a/pkg/apis/directpv.min.io/v1beta1/volume.go b/pkg/apis/directpv.min.io/v1beta1/volume.go index 628f424ec..ed271849a 100644 --- a/pkg/apis/directpv.min.io/v1beta1/volume.go +++ b/pkg/apis/directpv.min.io/v1beta1/volume.go @@ -293,6 +293,19 @@ func (volume DirectPVVolume) IsSuspended() bool { return string(volume.getLabel(types.SuspendLabelKey)) == strconv.FormatBool(true) } +// SetClaimID sets the provided claim id on the volume. +func (volume *DirectPVVolume) SetClaimID(claimID string) { + if claimID == "" { + return + } + volume.SetLabel(types.ClaimIDLabelKey, types.LabelValue(claimID)) +} + +// GetClaimID gets the claim id set on the volume. +func (volume *DirectPVVolume) GetClaimID() string { + return string(volume.getLabel(types.ClaimIDLabelKey)) +} + // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // DirectPVVolumeList denotes list of volumes.
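Taken together, these helpers are plain label bookkeeping: the drive carries one boolean label per claim ID (so a drive can serve many claim IDs, one volume each), while the volume records its claim ID as a single label value. A minimal runnable sketch of that layout, using ordinary maps in place of the `DirectPVDrive` and `DirectPVVolume` types:

```go
package main

import (
	"fmt"
	"strconv"
)

const groupName = "directpv.min.io"

func main() {
	claimID := "bcea279a-df70-4d23-be41-9490f9933004"

	// SetVolumeClaimID adds a per-claim boolean label to the drive. Keying
	// by claim ID lets one drive carry labels for several claim IDs at once.
	driveLabels := map[string]string{}
	driveLabels[groupName+"/volume-claim-id-"+claimID] = strconv.FormatBool(true)

	// SetClaimID records the claim ID as a plain label value on the volume.
	volumeLabels := map[string]string{}
	volumeLabels[groupName+"/claim-id"] = claimID

	// HasVolumeClaimID then reduces to a single map lookup.
	fmt.Println(driveLabels[groupName+"/volume-claim-id-"+claimID] == "true") // true
	fmt.Println(volumeLabels[groupName+"/claim-id"])                          // the claim ID
}
```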
diff --git a/pkg/csi/controller/server.go b/pkg/csi/controller/server.go index 32056f00f..12bfd7bb0 100644 --- a/pkg/csi/controller/server.go +++ b/pkg/csi/controller/server.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "regexp" "github.com/container-storage-interface/spec/lib/go/csi" "github.com/dustin/go-humanize" @@ -32,6 +33,8 @@ import ( "k8s.io/klog/v2" ) +var volumeClaimIDRegex = regexp.MustCompile("^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$") + /* Volume Lifecycle * * Creation @@ -145,11 +148,18 @@ func (c *Server) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) return nil, status.Errorf(codes.InvalidArgument, "unsupported filesystem type %v for volume %v", req.GetVolumeCapabilities()[0].GetMount().GetFsType(), name) } + var volumeClaimID string for key, value := range req.GetParameters() { - if key == string(directpvtypes.AccessTierLabelKey) { + switch key { + case string(directpvtypes.AccessTierLabelKey): if _, err := directpvtypes.StringsToAccessTiers(value); err != nil { return nil, status.Errorf(codes.InvalidArgument, "unknown access-tier %v for volume %v; %v", value, name, err) } + case string(directpvtypes.VolumeClaimIDLabelKey): + if !volumeClaimIDRegex.MatchString(value) { + return nil, status.Errorf(codes.InvalidArgument, "invalid volume claim ID %v", value) + } + volumeClaimID = value } } @@ -177,6 +187,7 @@ func (c *Server) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) drive.GetDriveName(), size, ) + newVolume.SetClaimID(volumeClaimID) if _, err := client.VolumeClient().Create(ctx, newVolume, metav1.CreateOptions{}); err != nil { if !errors.IsAlreadyExists(err) { @@ -206,6 +217,7 @@ func (c *Server) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) } if drive.AddVolumeFinalizer(req.GetName()) { + drive.SetVolumeClaimID(volumeClaimID) drive.Status.FreeCapacity -= size drive.Status.AllocatedCapacity += size diff --git a/pkg/csi/controller/utils.go b/pkg/csi/controller/utils.go index ad2bc4d3c..688e6f85a 100644 --- a/pkg/csi/controller/utils.go +++ b/pkg/csi/controller/utils.go @@ -65,6 +65,11 @@ func matchDrive(drive *types.Drive, req *csi.CreateVolumeRequest) bool { if len(accessTiers) > 0 && drive.GetAccessTier() != accessTiers[0] { return false } + case string(directpvtypes.VolumeClaimIDLabelKey): + if drive.HasVolumeClaimID(value) { + // Do not allocate another volume with this claim id + return false + } default: if labels[key] != value { return false diff --git a/pkg/csi/controller/utils_test.go b/pkg/csi/controller/utils_test.go index af5f2d4df..9dcb9e5cf 100644 --- a/pkg/csi/controller/utils_test.go +++ b/pkg/csi/controller/utils_test.go @@ -34,21 +34,21 @@ import ( const GiB = 1024 * 1024 * 1024 -func TestGetFilteredDrives(t *testing.T) { - newDriveWithLabels := func(driveID directpvtypes.DriveID, status types.DriveStatus, nodeID directpvtypes.NodeID, driveName directpvtypes.DriveName, labels map[directpvtypes.LabelKey]directpvtypes.LabelValue) *types.Drive { - drive := types.NewDrive( - driveID, - status, - nodeID, - driveName, - directpvtypes.AccessTierDefault, - ) - for k, v := range labels { - drive.SetLabel(k, v) - } - return drive +func newDriveWithLabels(driveID directpvtypes.DriveID, status types.DriveStatus, nodeID directpvtypes.NodeID, driveName directpvtypes.DriveName, labels map[directpvtypes.LabelKey]directpvtypes.LabelValue) *types.Drive { + drive := types.NewDrive( + driveID, + status, + nodeID, + driveName, + directpvtypes.AccessTierDefault, +
) + for k, v := range labels { + drive.SetLabel(k, v) } + return drive +} +func TestGetFilteredDrives(t *testing.T) { case2Result := []types.Drive{ *types.NewDrive( "drive-1", @@ -493,16 +493,49 @@ func TestGetFilteredDrives(t *testing.T) { } func TestGetDrive(t *testing.T) { - case2Result := types.NewDrive( + case1Result := types.NewDrive( "drive-1", types.DriveStatus{}, "node-1", directpvtypes.DriveName("sda"), directpvtypes.AccessTierDefault, ) - case2Result.AddVolumeFinalizer("volume-1") - case2Objects := []runtime.Object{case2Result} - case2Request := &csi.CreateVolumeRequest{Name: "volume-1"} + case1Result.AddVolumeFinalizer("volume-1") + case1Objects := []runtime.Object{case1Result} + case1Request := &csi.CreateVolumeRequest{Name: "volume-1"} + + case2Objects := []runtime.Object{ + types.NewDrive( + "drive-1", + types.DriveStatus{Status: directpvtypes.DriveStatusReady}, + "node-1", + directpvtypes.DriveName("sda"), + directpvtypes.AccessTierDefault, + ), + newDriveWithLabels( + "drive-2", + types.DriveStatus{ + Status: directpvtypes.DriveStatusReady, + Topology: map[string]string{"node": "node1", "rack": "rack1", "zone": "zone1", "region": "region1"}, + }, + "node-1", + directpvtypes.DriveName("sdd"), + map[directpvtypes.LabelKey]directpvtypes.LabelValue{ + consts.GroupName + "/volume-claim-id-xxx": "true", + }, + ), + } + case2Request := &csi.CreateVolumeRequest{ + Name: "volume-1", + Parameters: map[string]string{consts.GroupName + "/volume-claim-id": "xxx"}, + } + case2Result := types.NewDrive( + "drive-1", + types.DriveStatus{Status: directpvtypes.DriveStatusReady}, + "node-1", + directpvtypes.DriveName("sda"), + directpvtypes.AccessTierDefault, + ) testCases := []struct { objects []runtime.Object @@ -511,6 +544,7 @@ func TestGetDrive(t *testing.T) { expectErr bool }{ {[]runtime.Object{}, nil, nil, true}, + {case1Objects, case1Request, case1Result, false}, {case2Objects, case2Request, case2Result, false}, } @@ -520,7 +554,9 @@ func TestGetDrive(t *testing.T) { client.SetVolumeInterface(clientset.DirectpvLatest().DirectPVVolumes()) result, err := selectDrive(context.TODO(), testCase.request) - + if err != nil && !testCase.expectErr { + t.Fatalf("case %v: unable to select drive; %v", i+1, err) + } if testCase.expectErr { if err == nil { t.Fatalf("case %v: expected error, but succeeded", i+1) diff --git a/pkg/volume/event.go b/pkg/volume/event.go index 1ac523b16..f24b1d49b 100644 --- a/pkg/volume/event.go +++ b/pkg/volume/event.go @@ -187,7 +187,7 @@ func (handler *volumeEventHandler) releaseVolume(ctx context.Context, volume *ty drive.Status.FreeCapacity += volume.Status.TotalCapacity drive.Status.AllocatedCapacity = drive.Status.TotalCapacity - drive.Status.FreeCapacity - + drive.RemoveVolumeClaimID(volume.GetClaimID()) _, err = client.DriveClient().Update( ctx, drive, metav1.UpdateOptions{TypeMeta: types.NewDriveTypeMeta()}, )
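For completeness, the `releaseVolume` change above is the inverse of allocation: the volume's capacity is returned to the drive and the per-claim label is dropped, making the drive eligible again for the same claim ID. Below is a minimal sketch of that accounting, with plain structs standing in for the DirectPV types; only the label convention and the capacity arithmetic are taken from this patch.

```go
package main

import "fmt"

// drive is a plain stand-in for the DirectPV drive resource.
type drive struct {
	TotalCapacity, FreeCapacity, AllocatedCapacity int64
	Labels                                         map[string]string
}

// releaseVolume returns the volume's capacity to the drive and drops the
// per-claim label, mirroring drive.RemoveVolumeClaimID(volume.GetClaimID()).
func releaseVolume(d *drive, volumeCapacity int64, claimID string) {
	d.FreeCapacity += volumeCapacity
	d.AllocatedCapacity = d.TotalCapacity - d.FreeCapacity
	if claimID != "" {
		delete(d.Labels, "directpv.min.io/volume-claim-id-"+claimID)
	}
}

func main() {
	claimID := "555e99eb-e255-4407-83e3-fc443bf20f86"
	d := &drive{
		TotalCapacity:     100 << 30,
		FreeCapacity:      84 << 30,
		AllocatedCapacity: 16 << 30,
		Labels: map[string]string{
			"directpv.min.io/volume-claim-id-" + claimID: "true",
		},
	}
	releaseVolume(d, 16<<30, claimID)
	fmt.Println(d.AllocatedCapacity, len(d.Labels)) // 0 0
}
```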