Merge #119906
119906: roachprod: RAID0 only network disks if both local and network are present r=srosenberg a=nameisbhaskar

Today, if a machine has both local and network disks, disks of both types are selected for RAID'ing.

But RAID'ing disks of different types together leads to uneven performance, since a striped array is constrained by its slowest member.

To address this, local disks are now ignored for RAID'ing whenever network disks are present.
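
The resulting selection rule, sketched in shell (the array names here are illustrative, not the exact ones used in the startup scripts below):

    # Classify every unmounted data disk (boot disk excluded) into one of two
    # buckets, then RAID0/mount only one bucket so disk types are never mixed.
    if [ "${#network_disks[@]}" -gt 0 ]; then
      disks=("${network_disks[@]}")   # network disks present: local SSDs are ignored
    else
      disks=("${local_disks[@]}")     # no network disks: fall back to the local SSDs
    fi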

Fixes: #98783
Epic: none

Co-authored-by: Bhaskarjyoti Bora <bhaskar.bora@cockroachlabs.com>
craig[bot] and nameisbhaskar committed Mar 13, 2024
2 parents 2ac5657 + 6fd7ff8 commit aa755fd
Showing 3 changed files with 44 additions and 16 deletions.
13 changes: 0 additions & 13 deletions pkg/cmd/roachtest/tests/restore.go
@@ -33,7 +33,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/ts/tspb"
@@ -537,18 +536,6 @@ func (hw hardwareSpecs) makeClusterSpecs(r registry.Registry, backupCloud string
}
s := r.MakeClusterSpec(hw.nodes+addWorkloadNode, clusterOpts...)

if backupCloud == spec.AWS && s.VolumeSize != 0 {
// Work around an issue that RAID0s local NVMe and GP3 storage together:
// https://github.com/cockroachdb/cockroach/issues/98783.
//
// TODO(srosenberg): Remove this workaround when 98783 is addressed.
// TODO(miral): This now returns an error instead of panicking, so even though
// we haven't panicked here before, we should handle the error. Moot if this is
// removed as per TODO above.
s.AWS.MachineType, _, _ = spec.SelectAWSMachineType(s.CPUs, s.Mem, false /* shouldSupportLocalSSD */, vm.ArchAMD64)
s.AWS.MachineType = strings.Replace(s.AWS.MachineType, "d.", ".", 1)
s.Arch = vm.ArchAMD64
}
return s
}

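For context, the workaround removed above forced the restore tests onto an AWS machine type without local NVMe by dropping the "d" from the instance family, so that only EBS volumes were attached. In shell terms the rename amounted to the following (the machine type is illustrative):

    # the first "d." becomes ".", mapping a local-NVMe family to its EBS-only sibling
    echo "c5d.4xlarge" | sed 's/d\./\./'   # -> c5.4xlarge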
29 changes: 27 additions & 2 deletions pkg/roachprod/vm/aws/support.go
@@ -50,19 +50,44 @@ mount_opts="defaults"
use_multiple_disks='{{if .UseMultipleDisks}}true{{end}}'
disks=()
mount_prefix="/mnt/data"
# If use_multiple_disks is not set and there is more than one disk (excluding the boot disk),
# the disks are selected for RAID'ing. If both EC2 NVMe Instance Storage and EBS volumes are
# present, RAID'ing them together can cause performance differences, so in that case the
# EC2 NVMe Instance Storage disks are ignored.
# Scenarios:
# (local SSD = 0, network disk = 1) - no RAID'ing; mount the network disk
# (local SSD = 1, network disk = 0) - no RAID'ing; mount the local SSD
# (local SSD >= 1, network disk = 1) - no RAID'ing; mount the network disk
# (local SSD > 1, network disk = 0) - local SSDs selected for RAID'ing
# (local SSD >= 0, network disk > 1) - network disks selected for RAID'ing
# Keep track of the Local SSDs and EBS volumes for RAID'ing
local_disks=()
ebs_volumes=()
# On different machine types, the drives are either called nvme... or xvdd.
for d in $(ls /dev/nvme?n1 /dev/xvdd); do
if ! mount | grep ${d}; then
disks+=("${d}")
if udevadm info --query=property --name=${d} | grep "ID_MODEL=Amazon Elastic Block Store"; then
echo "EBS Volume ${d} identified!"
ebs_volumes+=("${d}")
else
local_disks+=("${d}")
fi
echo "Disk ${d} not mounted, need to mount..."
else
echo "Disk ${d} already mounted, skipping..."
fi
done
# use only EBS volumes if available and ignore EC2 NVMe Instance Storage
disks=()
if [ "${#ebs_volumes[@]}" -gt "0" ]; then
disks=("${ebs_volumes[@]}")
else
disks=("${local_disks[@]}")
fi
if [ "${#disks[@]}" -eq "0" ]; then
mountpoint="${mount_prefix}1"
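For reference, the EBS detection above keys off the device model string that udev reports for the volume. A hypothetical invocation (the device name is illustrative) that would satisfy the grep in the script:

    $ udevadm info --query=property --name=/dev/nvme1n1 | grep "ID_MODEL="
    ID_MODEL=Amazon Elastic Block Store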
18 changes: 17 additions & 1 deletion pkg/roachprod/vm/gce/utils.go
@@ -72,7 +72,23 @@ for d in $(ls /dev/nvme?n? /dev/disk/by-id/google-persistent-disk-[1-9]); do
zpool list -v -P | grep ${d} > /dev/null
if [ $? -ne 0 ]; then
{{ else }}
for d in $(ls /dev/disk/by-id/google-local-* /dev/disk/by-id/google-persistent-disk-[1-9]); do
# If use_multiple_disks is not set and there is more than one disk (excluding the boot disk),
# the disks are selected for RAID'ing. If both local SSDs and persistent disks are present,
# RAID'ing them together can cause performance differences, so in that case the local SSDs
# are ignored.
# Scenarios:
# (local SSD = 0, persistent disk = 1) - no RAID'ing; mount the persistent disk
# (local SSD = 1, persistent disk = 0) - no RAID'ing; mount the local SSD
# (local SSD >= 1, persistent disk = 1) - no RAID'ing; mount the persistent disk
# (local SSD > 1, persistent disk = 0) - local SSDs selected for RAID'ing
# (local SSD >= 0, persistent disk > 1) - persistent disks selected for RAID'ing
disk_list=()
if [ "$(ls /dev/disk/by-id/google-persistent-disk-[1-9]|wc -l)" -eq "0" ]; then
disk_list=$(ls /dev/disk/by-id/google-local-*)
else
echo "Only persistent disks are selected."
disk_list=$(ls /dev/disk/by-id/google-persistent-disk-[1-9])
fi
for d in ${disk_list}; do
if ! mount | grep ${d}; then
{{ end }}
disks+=("${d}")
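The GCE branch keys entirely off whether any google-persistent-disk-[1-9] symlinks exist. A quick way to see which bucket a VM falls into, mirroring the test in the script (output is illustrative):

    $ ls /dev/disk/by-id/google-persistent-disk-[1-9] | wc -l
    1
    # any non-zero count means only persistent disks are mounted/RAID'ed; local SSDs are ignored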
