From e45e3569fa5abcd89508ea7616ba133ce0256c56 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Jan 2025 02:06:35 +0800 Subject: [PATCH] btrfs: introduce RAID1 round-robin read balancing Add round-robin read policy that balances reads over available devices (all RAID1 block group profiles). Switch to the next devices is done after a number of blocks is read, which is 256K by default and is configurable in sysfs. The format is "round-robin:" and can be set in file /sys/fs/btrfs/FSID/read_policy Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 48 +++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 13 ++++++++++ 3 files changed, 124 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 2880407d0dd30b..e155b7ce1ee584 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { + "pid", +#ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", +#endif +}; static int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { @@ -1355,6 +1360,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%u", + READ_ONCE(fs_devices->rr_min_contig_read)); +#endif + if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } @@ -1376,6 +1387,41 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, if (index < 0) return -EINVAL; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* If moving from RR then disable collecting fs stats. */ + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = false; + + if (index == BTRFS_READ_POLICY_RR) { + if (value != -1) { + const u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_debug(fs_devices->fs_info, +"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; + } + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contig_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contig_read, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); + } + + fs_devices->collect_fs_stats = true; + + return len; + } +#endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a96d84b92a1b91..a31ba441b752ac 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1335,6 +1335,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; +#endif return 0; } @@ -5967,6 +5970,62 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +struct stripe_mirror { + u64 devid; + int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ + const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; + const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + + if (s1->devid < s2->devid) + return -1; + if (s1->devid > s2->devid) + return 1; + return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + * 1. Compute the read cycle as the total sectors read divided by the minimum + * sectors per device. + * 2. Determine the stripe number for the current read by taking the modulus + * of the read cycle with the total number of stripes: + * + * stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. + */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; + struct btrfs_device *device = map->stripes[first].dev; + struct btrfs_fs_devices *fs_devices = device->fs_devices; + unsigned int read_cycle; + unsigned int total_reads; + unsigned int min_reads_per_dev; + + total_reads = percpu_counter_sum(&fs_devices->stats_read_blocks); + min_reads_per_dev = READ_ONCE(fs_devices->rr_min_contig_read) >> + fs_devices->fs_info->sectorsize_bits; + + for (int index = 0, i = first; i < first + num_stripes; i++) { + stripes[index].devid = map->stripes[i].dev->devid; + stripes[index].num = i; + index++; + } + sort(stripes, num_stripes, sizeof(struct stripe_mirror), btrfs_cmp_devid, NULL); + + read_cycle = total_reads / min_reads_per_dev; + return stripes[read_cycle % num_stripes].num; +} +#endif + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) @@ -5996,6 +6055,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; +#endif } if (dev_replace_is_ongoing && diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 244a07c523162e..26034c29ac87fc 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -296,6 +296,9 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_ZONED, }; +#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K) +/* Keep in sync with raid_attr table, current maximum is RAID1C4. */ +#define BTRFS_RAID1_MAX_MIRRORS (4) /* * Read policies for mirrored block group profiles, read picks the stripe based * on these policies. @@ -303,6 +306,10 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing RAID1 reads across all striped devices (round-robin). */ + BTRFS_READ_POLICY_RR, +#endif BTRFS_NR_READ_POLICY, }; @@ -436,6 +443,12 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * Minimum contiguous reads before switching to next device, the unit + * is one block/sectorsize. + */ + u32 rr_min_contig_read; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif