Skip to content

Commit

Permalink
Set aside a metaslab for ZIL blocks
Browse files Browse the repository at this point in the history
Mixing ZIL and normal allocations has several problems:

1. The ZIL allocations are allocated, written to disk, and then a few
seconds later freed.  This leaves behind holes (free segments) where the
ZIL blocks used to be, which increases fragmentation, which negatively
impacts performance.

2. When under moderate load, ZIL allocations are of 128KB.  If the pool
is fairly fragmented, there may not be many free chunks of that size.
This causes ZFS to load more metaslabs to locate free segments of 128KB
or more.  The loading happens synchronously (from zil_commit()), and can
take around a second even if the metaslab's spacemap is cached in the
ARC.  All concurrent synchronous operations on this filesystem must wait
while the metaslab is loading.  This can cause a significant performance
impact.

3. If the pool is very fragmented, there may be zero free chunks of
128KB or more.  In this case, the ZIL falls back to txg_wait_synced(),
which has an enormous performance impact.

These problems can be eliminated by using a dedicated log device
("slog"), even one with the same performance characteristics as the
normal devices.

This change sets aside one metaslab from each top-level vdev that is
preferentially used for ZIL allocations (vdev_log_mg,
spa_embedded_log_class).  From an allocation perspective, this is
similar to having a dedicated log device, and it eliminates the
above-mentioned performance problems.

Log (ZIL) blocks can be allocated from the following locations.  Each
one is tried in order until the allocation succeeds:
1. dedicated log vdevs, aka "slog" (spa_log_class)
2. embedded slog metaslabs (spa_embedded_log_class)
3. other metaslabs in normal vdevs (spa_normal_class)

The space required for the embedded slog metaslabs is usually between
0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop"
space that is not available for user data.

On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity,
and recordsize=8k, testing shows a ~50% performance increase on random
8k sync writes.  On even more fragmented systems (which hit problem #3
above and call txg_wait_synced()), the performance improvement can be
arbitrarily large (>100x).

Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes openzfs#11389
  • Loading branch information
ahrens authored Jan 21, 2021
1 parent 984362a commit aa755b3
Show file tree
Hide file tree
Showing 14 changed files with 358 additions and 104 deletions.
21 changes: 19 additions & 2 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -5844,6 +5844,7 @@ zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
* metaslabs. We want to set them up for
* zio_claim().
*/
vdev_metaslab_group_create(vd);
VERIFY0(vdev_metaslab_init(vd, 0));

vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
Expand Down Expand Up @@ -5882,6 +5883,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;

zcb->zcb_vd_obsolete_counts =
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
Expand Down Expand Up @@ -6015,15 +6017,16 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
vdev_t *rvd = spa->spa_root_vdev;
for (unsigned c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
metaslab_group_t *mg __maybe_unused = vd->vdev_mg;

if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
}

for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
ASSERT3P(mg, ==, msp->ms_group);
ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
spa_embedded_log_class(spa)) ?
vd->vdev_log_mg : vd->vdev_mg);

/*
* ms_allocatable has been overloaded
Expand Down Expand Up @@ -6230,6 +6233,8 @@ dump_block_stats(spa_t *spa)
zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
zcb.zcb_totalasize +=
metaslab_class_get_alloc(spa_embedded_log_class(spa));
zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);

Expand Down Expand Up @@ -6277,6 +6282,7 @@ dump_block_stats(spa_t *spa)

total_alloc = norm_alloc +
metaslab_class_get_alloc(spa_log_class(spa)) +
metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
metaslab_class_get_alloc(spa_special_class(spa)) +
metaslab_class_get_alloc(spa_dedup_class(spa)) +
get_unflushed_alloc_space(spa);
Expand Down Expand Up @@ -6344,6 +6350,17 @@ dump_block_stats(spa_t *spa)
100.0 * alloc / space);
}

if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
uint64_t alloc = metaslab_class_get_alloc(
spa_embedded_log_class(spa));
uint64_t space = metaslab_class_get_space(
spa_embedded_log_class(spa));

(void) printf("\t%-16s %14llu used: %5.2f%%\n",
"Embedded log class", (u_longlong_t)alloc,
100.0 * alloc / space);
}

for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
if (zcb.zcb_embedded_blocks[i] == 0)
continue;
Expand Down
3 changes: 0 additions & 3 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,6 @@ typedef enum dmu_object_byteswap {
#define DMU_OT_IS_DDT(ot) \
((ot) == DMU_OT_DDT_ZAP)

#define DMU_OT_IS_ZIL(ot) \
((ot) == DMU_OT_INTENT_LOG)

/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */
#define DMU_OT_IS_FILE(ot) \
((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
Expand Down
1 change: 1 addition & 0 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,7 @@ extern uint64_t spa_version(spa_t *spa);
extern boolean_t spa_deflate(spa_t *spa);
extern metaslab_class_t *spa_normal_class(spa_t *spa);
extern metaslab_class_t *spa_log_class(spa_t *spa);
extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
extern metaslab_class_t *spa_special_class(spa_t *spa);
extern metaslab_class_t *spa_dedup_class(spa_t *spa);
extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
Expand Down
1 change: 1 addition & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ struct spa {
boolean_t spa_is_exporting; /* true while exporting pool */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */
metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */
metaslab_class_t *spa_special_class; /* special allocation class */
metaslab_class_t *spa_dedup_class; /* dedup allocation class */
uint64_t spa_first_txg; /* first txg after spa_open() */
Expand Down
4 changes: 4 additions & 0 deletions include/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <sys/zio.h>
#include <sys/dmu.h>
#include <sys/space_map.h>
#include <sys/metaslab.h>
#include <sys/fs/zfs.h>

#ifdef __cplusplus
Expand Down Expand Up @@ -113,6 +114,9 @@ extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
vdev_xlate_func_t *func, void *arg);

extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);

extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc);

extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ struct vdev {
uint64_t vdev_ms_shift; /* metaslab size shift */
uint64_t vdev_ms_count; /* number of metaslabs */
metaslab_group_t *vdev_mg; /* metaslab group */
metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */
metaslab_t **vdev_ms; /* metaslab array */
uint64_t vdev_pending_fastwrite; /* allocated fastwrites */
txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
Expand Down Expand Up @@ -636,6 +637,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
* Other miscellaneous functions
*/
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
void vdev_metaslab_group_create(vdev_t *vd);

/*
* Vdev ashift optimization tunables
Expand Down
1 change: 1 addition & 0 deletions include/sys/zfs_debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ extern int zfs_dbgmsg_enable;
#define ZFS_DEBUG_INDIRECT_REMAP (1 << 10)
#define ZFS_DEBUG_TRIM (1 << 11)
#define ZFS_DEBUG_LOG_SPACEMAP (1 << 12)
#define ZFS_DEBUG_METASLAB_ALLOC (1 << 13)

extern void __set_error(const char *file, const char *func, int line, int err);
extern void __zfs_dbgmsg(char *buf);
Expand Down
16 changes: 16 additions & 0 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -3936,6 +3936,22 @@ to limit potential SLOG device abuse by single active ZIL writer.
Default value: \fB786,432\fR.
.RE

.sp
.ne 2
.na
\fBzfs_embedded_slog_min_ms\fR (int)
.ad
.RS 12n
Usually, one metaslab from each (normal-class) vdev is dedicated for use by
the ZIL (to log synchronous writes).
However, if there are fewer than zfs_embedded_slog_min_ms metaslabs in the
vdev, this functionality is disabled.
This ensures that we don't set aside an unreasonable amount of space for the
ZIL.
.sp
Default value: \fB64\fR.
.RE

.sp
.ne 2
.na
Expand Down
94 changes: 61 additions & 33 deletions module/zfs/metaslab.c
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)

for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
metaslab_group_t *mg = vdev_get_mg(tvd, mc);

/*
* Skip any holes, uninitialized top-levels, or
Expand All @@ -535,12 +535,16 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
continue;
}

IMPLY(mg == mg->mg_vd->vdev_log_mg,
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));

for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
mc_hist[i] += mg->mg_histogram[i];
}

for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
}

kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}
Expand Down Expand Up @@ -1004,16 +1008,22 @@ metaslab_group_initialized(metaslab_group_t *mg)
uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
/*
* Note that the number of nodes in mg_metaslab_tree may be one less
* than vdev_ms_count, due to the embedded log metaslab.
*/
mutex_enter(&mg->mg_lock);
uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
mutex_exit(&mg->mg_lock);
return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
uint64_t *mg_hist;
vdev_t *vd = mg->mg_vd;
uint64_t ashift = vd->vdev_ashift;
int i;
avl_tree_t *t = &mg->mg_metaslab_tree;
uint64_t ashift = mg->mg_vd->vdev_ashift;

if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
return;
Expand All @@ -1024,21 +1034,25 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
SPACE_MAP_HISTOGRAM_SIZE + ashift);

for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];

/* skip if not active or not a member */
if (msp->ms_sm == NULL || msp->ms_group != mg)
mutex_enter(&mg->mg_lock);
for (metaslab_t *msp = avl_first(t);
msp != NULL; msp = AVL_NEXT(t, msp)) {
VERIFY3P(msp->ms_group, ==, mg);
/* skip if not active */
if (msp->ms_sm == NULL)
continue;

for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
mg_hist[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
}
}

for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

mutex_exit(&mg->mg_lock);

kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

Expand All @@ -1054,6 +1068,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)

mutex_enter(&mg->mg_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
IMPLY(mg == mg->mg_vd->vdev_log_mg,
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
mg->mg_histogram[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
mc->mc_histogram[i + ashift] +=
Expand All @@ -1078,6 +1094,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
msp->ms_sm->sm_phys->smp_histogram[i]);
ASSERT3U(mc->mc_histogram[i + ashift], >=,
msp->ms_sm->sm_phys->smp_histogram[i]);
IMPLY(mg == mg->mg_vd->vdev_log_mg,
mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));

mg->mg_histogram[i + ashift] -=
msp->ms_sm->sm_phys->smp_histogram[i];
Expand Down Expand Up @@ -2741,37 +2759,47 @@ metaslab_fini(metaslab_t *msp)

mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
metaslab_space_update(vd, mg->mg_class,
-metaslab_allocated_space(msp), 0, -msp->ms_size);
/*
* If the range trees haven't been allocated, this metaslab hasn't
* been through metaslab_sync_done() for the first time yet, so its
* space hasn't been accounted for in its vdev and doesn't need to be
* subtracted.
*/
if (msp->ms_freed != NULL) {
metaslab_space_update(vd, mg->mg_class,
-metaslab_allocated_space(msp), 0, -msp->ms_size);

}
space_map_close(msp->ms_sm);
msp->ms_sm = NULL;

metaslab_unload(msp);

range_tree_destroy(msp->ms_allocatable);
range_tree_destroy(msp->ms_freeing);
range_tree_destroy(msp->ms_freed);

ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
metaslab_unflushed_changes_memused(msp));
spa->spa_unflushed_stats.sus_memused -=
metaslab_unflushed_changes_memused(msp);
range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
range_tree_destroy(msp->ms_unflushed_allocs);
range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
range_tree_destroy(msp->ms_unflushed_frees);
if (msp->ms_freed != NULL) {
range_tree_destroy(msp->ms_freeing);
range_tree_destroy(msp->ms_freed);

for (int t = 0; t < TXG_SIZE; t++) {
range_tree_destroy(msp->ms_allocating[t]);
}
ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
metaslab_unflushed_changes_memused(msp));
spa->spa_unflushed_stats.sus_memused -=
metaslab_unflushed_changes_memused(msp);
range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
range_tree_destroy(msp->ms_unflushed_allocs);
range_tree_destroy(msp->ms_checkpointing);
range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
range_tree_destroy(msp->ms_unflushed_frees);

for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_destroy(msp->ms_defer[t]);
for (int t = 0; t < TXG_SIZE; t++) {
range_tree_destroy(msp->ms_allocating[t]);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_destroy(msp->ms_defer[t]);
}
}
ASSERT0(msp->ms_deferspace);

range_tree_destroy(msp->ms_checkpointing);

for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));

Expand Down Expand Up @@ -5113,7 +5141,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* all else fails.
*/
if (vd != NULL && vd->vdev_mg != NULL) {
mg = vd->vdev_mg;
mg = vdev_get_mg(vd, mc);

if (flags & METASLAB_HINTBP_AVOID &&
mg->mg_next != NULL)
Expand Down
Loading

0 comments on commit aa755b3

Please sign in to comment.