Skip to content

Commit

Permalink
Improve log spacemap load time after unclean export.
Browse files Browse the repository at this point in the history
Previous flushing algorithm limited only total number of log blocks to
the minimum of 256K and 4x number of metaslabs in the pool.  As result,
system with 1500 disks with 200 metaslabs each, touching several new
metaslabs each TXG could grow spacemap log to huge size without much
benefits.  We've observed one of such systems importing pool after
unclean export for about 45 minutes.

This patch improves the situation from four sides:
 - By limiting maximum period for each metaslab to be flushed to 1000
TXGs, that effectively limits maximum number of per-TXG spacemap logs
to load after unclean export to the same number.
 - By making flushing more smooth via accounting number of metaslabs
that were touched after the last flush and actually need another flush,
not just ms_unflushed_txg bump.
 - By applying zfs_unflushed_log_block_pct to the number of metaslabs
that were touched after the last flush, not all metaslabs in the pool.
 - By aggressively prefetching per-TXG spacemap logs up to 16 TXGs in
advance, making log spacemap load process for wide HDD pool CPU-bound,
accelerating it by many times.

As further optimization we could skip bumping ms_unflushed_txg for
metaslabs not touched since the last flush, but that would be and an
incompatible change, requiring new pool feature.

Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
  • Loading branch information
amotin committed Nov 24, 2021
1 parent ded851b commit cce917e
Show file tree
Hide file tree
Showing 10 changed files with 252 additions and 142 deletions.
2 changes: 2 additions & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -1069,6 +1069,8 @@ int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
extern uint64_t zfs_crc64_table[256];

extern int dmu_prefetch_max;

#ifdef __cplusplus
}
#endif
Expand Down
3 changes: 3 additions & 0 deletions include/sys/metaslab.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,14 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
metaslab_t **);
void metaslab_fini(metaslab_t *);

void metaslab_set_unflushed_dirty(metaslab_t *, boolean_t);
void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *);
void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *);
boolean_t metaslab_unflushed_dirty(metaslab_t *);
uint64_t metaslab_unflushed_txg(metaslab_t *);
uint64_t metaslab_estimated_condensed_size(metaslab_t *);
int metaslab_sort_by_flushed(const void *, const void *);
void metaslab_unflushed_bump(metaslab_t *, dmu_tx_t *, boolean_t);
uint64_t metaslab_unflushed_changes_memused(metaslab_t *);

int metaslab_load(metaslab_t *);
Expand Down
1 change: 1 addition & 0 deletions include/sys/metaslab_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ struct metaslab {
* log space maps.
*/
uint64_t ms_unflushed_txg;
boolean_t ms_unflushed_dirty;

/* updated every time we are done syncing the metaslab's space map */
uint64_t ms_synced_length;
Expand Down
9 changes: 7 additions & 2 deletions include/sys/spa_log_spacemap.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@

typedef struct log_summary_entry {
uint64_t lse_start; /* start TXG */
uint64_t lse_end; /* last TXG */
uint64_t lse_txgcount; /* # of TXGs */
uint64_t lse_mscount; /* # of metaslabs needed to be flushed */
uint64_t lse_msdcount; /* # of dirty metaslabs needed to be flushed */
uint64_t lse_blkcount; /* blocks held by this entry */
list_node_t lse_node;
} log_summary_entry_t;
Expand All @@ -50,6 +53,7 @@ typedef struct spa_log_sm {
uint64_t sls_nblocks; /* number of blocks in this log */
uint64_t sls_mscount; /* # of metaslabs flushed in the log's txg */
avl_node_t sls_node; /* node in spa_sm_logs_by_txg */
space_map_t *sls_sm; /* space map pointer, if open */
} spa_log_sm_t;

int spa_ld_log_spacemaps(spa_t *);
Expand All @@ -68,8 +72,9 @@ uint64_t spa_log_sm_memused(spa_t *);
void spa_log_sm_decrement_mscount(spa_t *, uint64_t);
void spa_log_sm_increment_current_mscount(spa_t *);

void spa_log_summary_add_flushed_metaslab(spa_t *);
void spa_log_summary_decrement_mscount(spa_t *, uint64_t);
void spa_log_summary_add_flushed_metaslab(spa_t *, boolean_t);
void spa_log_summary_dirty_flushed_metaslab(spa_t *, uint64_t);
void spa_log_summary_decrement_mscount(spa_t *, uint64_t, boolean_t);
void spa_log_summary_decrement_blkcount(spa_t *, uint64_t);

boolean_t spa_flush_all_logs_requested(spa_t *);
Expand Down
7 changes: 6 additions & 1 deletion man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -1000,7 +1000,12 @@ Thus we always allow at least this many log blocks.
.It Sy zfs_unflushed_log_block_pct Ns = Ns Sy 400 Ns % Pq ulong
Tunable used to determine the number of blocks that can be used for
the spacemap log, expressed as a percentage of the total number of
metaslabs in the pool.
unflushed metaslabs in the pool.
.
.It Sy zfs_unflushed_log_txg_max Ns = Ns Sy 1000 Pq ulong
Tunable limiting maximum time in TXGs any metaslab may remain unflushed.
It effectively limits maximum number of unflushed per-TXG spacemap logs
that need to be read after unclean pool export.
.
.It Sy zfs_unlink_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint
When enabled, files will not be asynchronously removed from the list of pending
Expand Down
132 changes: 76 additions & 56 deletions module/zfs/metaslab.c
Original file line number Diff line number Diff line change
Expand Up @@ -2758,7 +2758,8 @@ metaslab_fini_flush_data(metaslab_t *msp)
mutex_exit(&spa->spa_flushed_ms_lock);

spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
metaslab_unflushed_dirty(msp));
}

uint64_t
Expand Down Expand Up @@ -3736,68 +3737,93 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
metaslab_flush_update(msp, tx);
}

/*
* Called when the metaslab has been flushed (its own spacemap now reflects
* all the contents of the pool-wide spacemap log). Updates the metaslab's
* metadata and any pool-wide related log space map data (e.g. summary,
* obsolete logs, etc..) to reflect that.
*/
static void
metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
{
metaslab_group_t *mg = msp->ms_group;
spa_t *spa = mg->mg_vd->vdev_spa;

ASSERT(MUTEX_HELD(&msp->ms_lock));

ASSERT3U(spa_sync_pass(spa), ==, 1);
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
ASSERT(spa_syncing_log_sm(spa) != NULL);
ASSERT(msp->ms_sm != NULL);
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));

/*
* Just because a metaslab got flushed, that doesn't mean that
* it will pass through metaslab_sync_done(). Thus, make sure to
* update ms_synced_length here in case it doesn't.
*/
msp->ms_synced_length = space_map_length(msp->ms_sm);
mutex_enter(&spa->spa_flushed_ms_lock);
metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
metaslab_set_unflushed_dirty(msp, B_TRUE);
avl_add(&spa->spa_metaslabs_by_flushed, msp);
mutex_exit(&spa->spa_flushed_ms_lock);

/*
* We may end up here from metaslab_condense() without the
* feature being active. In that case this is a no-op.
*/
if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
return;
spa_log_sm_increment_current_mscount(spa);
spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
}

void
metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
ASSERT(spa_syncing_log_sm(spa) != NULL);
ASSERT(msp->ms_sm != NULL);
ASSERT(metaslab_unflushed_txg(msp) != 0);
ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));

VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));

/* update metaslab's position in our flushing tree */
uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
mutex_enter(&spa->spa_flushed_ms_lock);
avl_remove(&spa->spa_metaslabs_by_flushed, msp);
metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
metaslab_set_unflushed_dirty(msp, dirty);
avl_add(&spa->spa_metaslabs_by_flushed, msp);
mutex_exit(&spa->spa_flushed_ms_lock);

/* update metaslab counts of spa_log_sm_t nodes */
spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
spa_log_sm_increment_current_mscount(spa);

/* update log space map summary */
spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
ms_prev_flushed_dirty);
spa_log_summary_add_flushed_metaslab(spa, dirty);

/* cleanup obsolete logs if any */
uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
spa_cleanup_old_sm_logs(spa, tx);
uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
VERIFY3U(log_blocks_after, <=, log_blocks_before);
}

/* update log space map summary */
uint64_t blocks_gone = log_blocks_before - log_blocks_after;
spa_log_summary_add_flushed_metaslab(spa);
spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
spa_log_summary_decrement_blkcount(spa, blocks_gone);
/*
* Called when the metaslab has been flushed (its own spacemap now reflects
* all the contents of the pool-wide spacemap log). Updates the metaslab's
* metadata and any pool-wide related log space map data (e.g. summary,
* obsolete logs, etc..) to reflect that.
*/
static void
metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
{
metaslab_group_t *mg = msp->ms_group;
spa_t *spa = mg->mg_vd->vdev_spa;

ASSERT(MUTEX_HELD(&msp->ms_lock));

ASSERT3U(spa_sync_pass(spa), ==, 1);

/*
* Just because a metaslab got flushed, that doesn't mean that
* it will pass through metaslab_sync_done(). Thus, make sure to
* update ms_synced_length here in case it doesn't.
*/
msp->ms_synced_length = space_map_length(msp->ms_sm);

/*
* We may end up here from metaslab_condense() without the
* feature being active. In that case this is a no-op.
*/
if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
metaslab_unflushed_txg(msp) == 0)
return;

metaslab_unflushed_bump(msp, tx, B_FALSE);
}

boolean_t
Expand Down Expand Up @@ -4013,23 +4039,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
ASSERT0(metaslab_allocated_space(msp));
}

if (metaslab_unflushed_txg(msp) == 0 &&
spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
ASSERT(spa_syncing_log_sm(spa) != NULL);

metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
spa_log_sm_increment_current_mscount(spa);
spa_log_summary_add_flushed_metaslab(spa);

ASSERT(msp->ms_sm != NULL);
mutex_enter(&spa->spa_flushed_ms_lock);
avl_add(&spa->spa_metaslabs_by_flushed, msp);
mutex_exit(&spa->spa_flushed_ms_lock);

ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
}

if (!range_tree_is_empty(msp->ms_checkpointing) &&
vd->vdev_checkpoint_sm == NULL) {
ASSERT(spa_has_checkpoint(spa));
Expand Down Expand Up @@ -4077,6 +4086,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_t *log_sm = spa_syncing_log_sm(spa);
if (log_sm != NULL) {
ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
if (metaslab_unflushed_txg(msp) == 0)
metaslab_unflushed_add(msp, tx);
else if (!metaslab_unflushed_dirty(msp))
metaslab_unflushed_bump(msp, tx, B_TRUE);

space_map_write(log_sm, alloctree, SM_ALLOC,
vd->vdev_id, tx);
Expand Down Expand Up @@ -6139,6 +6152,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
mutex_exit(&mg->mg_ms_disabled_lock);
}

void
metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
{
ms->ms_unflushed_dirty = dirty;
}

static void
metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -6175,15 +6194,16 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
void
metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
{
spa_t *spa = ms->ms_group->mg_vd->vdev_spa;

if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
return;

ms->ms_unflushed_txg = txg;
metaslab_update_ondisk_flush_data(ms, tx);
}

boolean_t
metaslab_unflushed_dirty(metaslab_t *ms)
{
return (ms->ms_unflushed_dirty);
}

uint64_t
metaslab_unflushed_txg(metaslab_t *ms)
{
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -4323,7 +4323,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)

error = spa_ld_log_spacemaps(spa);
if (error != 0) {
spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]",
spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
error);
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
}
Expand Down
Loading

0 comments on commit cce917e

Please sign in to comment.