diff --git a/include/sys/zil.h b/include/sys/zil.h index 384678b223d5..0f0f9327af0f 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -604,7 +604,7 @@ extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); extern int zil_suspend(const char *osname, void **cookiep); extern void zil_resume(void *cookie); -extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp); +extern void zil_lwb_add_flush(struct lwb *lwb, zio_t *zio); extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg); extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 9a34bafc1c77..74c205a8f651 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -172,15 +172,6 @@ typedef struct itx_async_node { avl_node_t ia_node; /* AVL tree linkage */ } itx_async_node_t; -/* - * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs - * we've touched so we know which ones need a write cache flush at the end. - */ -typedef struct zil_vdev_node { - uint64_t zv_vdev; /* vdev to be flushed */ - avl_node_t zv_node; /* AVL tree linkage */ -} zil_vdev_node_t; - #define ZIL_BURSTS 8 /* diff --git a/include/sys/zio.h b/include/sys/zio.h index 446b64ccd8ab..1aa6bd533795 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -180,16 +180,17 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_SCRUB (1ULL << 4) #define ZIO_FLAG_SCAN_THREAD (1ULL << 5) #define ZIO_FLAG_PHYSICAL (1ULL << 6) +#define ZIO_FLAG_VDEV_TRACE (1ULL << 7) #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. 
*/ -#define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */ -#define ZIO_FLAG_SPECULATIVE (1ULL << 8) -#define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) -#define ZIO_FLAG_DONT_RETRY (1ULL << 10) +#define ZIO_FLAG_CANFAIL (1ULL << 8) /* must be first for INHERIT */ +#define ZIO_FLAG_SPECULATIVE (1ULL << 9) +#define ZIO_FLAG_CONFIG_WRITER (1ULL << 10) +#define ZIO_FLAG_DONT_RETRY (1ULL << 11) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) @@ -445,6 +446,11 @@ enum zio_qstate { ZIO_QS_ACTIVE, }; +typedef struct zio_vdev_trace { + uint64_t zvt_guid; + avl_node_t zvt_node; +} zio_vdev_trace_t; + struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -513,6 +519,7 @@ struct zio { int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + avl_tree_t io_vdev_trace_tree; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; @@ -595,7 +602,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog); -extern void zio_flush(zio_t *zio, vdev_t *vd); +extern void zio_flush(zio_t *zio, vdev_t *vd, boolean_t propagate); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); @@ -636,6 +643,13 @@ extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); +extern void zio_vdev_trace_init(avl_tree_t *t); +extern void zio_vdev_trace_fini(avl_tree_t *t); +extern void zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst); +extern void zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst); +extern void zio_vdev_trace_flush(zio_t *zio, avl_tree_t *t); +extern void zio_vdev_trace_empty(avl_tree_t *t); + extern void zio_change_priority(zio_t *pio, zio_priority_t priority); extern void 
zio_checksum_verified(zio_t *zio); diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index b7ff1063b089..7aaa42bfb1a8 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1014,21 +1014,6 @@ vdev_geom_io_intr(struct bio *bp) zio->io_error = SET_ERROR(EIO); switch (zio->io_error) { - case ENOTSUP: - /* - * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know - * that future attempts will never succeed. In this case - * we set a persistent flag so that we don't bother with - * requests in the future. - */ - switch (bp->bio_cmd) { - case BIO_FLUSH: - vd->vdev_nowritecache = B_TRUE; - break; - case BIO_DELETE: - break; - } - break; case ENXIO: if (!vd->vdev_remove_wanted) { /* diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index e69c5f3841ec..2c963bb05c80 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1232,9 +1232,6 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) zio->io_error = -error; #endif - if (zio->io_error && (zio->io_error == EOPNOTSUPP)) - zio->io_vd->vdev_nowritecache = B_TRUE; - bio_put(bio); ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index b3eda8ea5097..85e3a549e616 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1806,12 +1806,11 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) zgd_t *zgd = dsa->dsa_zgd; /* - * Record the vdev(s) backing this blkptr so they can be flushed after - * the writes for the lwb have completed. + * Capture the trace records for this zio so the vdevs can be flushed + * after the writes for the lwb have completed. 
*/ - if (zio->io_error == 0) { - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); - } + if (zio->io_error == 0) + zil_lwb_add_flush(zgd->zgd_lwb, zio); mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); @@ -1865,10 +1864,10 @@ dmu_sync_late_arrival_done(zio_t *zio) if (zio->io_error == 0) { /* - * Record the vdev(s) backing this blkptr so they can be + * Capture the trace records for this zio so the vdevs can be * flushed after the writes for the lwb have completed. */ - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + zil_lwb_add_flush(zgd->zgd_lwb, zio); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; @@ -1955,7 +1954,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, - dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + dsa, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, zb)); return (0); } @@ -2122,7 +2122,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, + &zb)); return (0); } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 47346dd5acff..468390b9acd3 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1830,19 +1830,21 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) for (int v = 0; v < svdcount; v++) { if (vdev_writeable(svd[v])) { - zio_flush(zio, svd[v]); + zio_flush(zio, svd[v], B_FALSE); } } if (spa->spa_aux_sync_uber) { spa->spa_aux_sync_uber = B_FALSE; for (int v = 0; v < 
spa->spa_spares.sav_count; v++) { if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) { - zio_flush(zio, spa->spa_spares.sav_vdevs[v]); + zio_flush(zio, spa->spa_spares.sav_vdevs[v], + B_FALSE); } } for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) { - zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]); + zio_flush(zio, spa->spa_l2cache.sav_vdevs[v], + B_FALSE); } } } @@ -2007,13 +2009,13 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) - zio_flush(zio, vd); + zio_flush(zio, vd, B_FALSE); for (int i = 0; i < 2; i++) { if (!sav[i]->sav_label_sync) continue; for (int v = 0; v < sav[i]->sav_count; v++) - zio_flush(zio, sav[i]->sav_vdevs[v]); + zio_flush(zio, sav[i]->sav_vdevs[v], B_FALSE); if (l == 1) sav[i]->sav_label_sync = B_FALSE; } @@ -2091,7 +2093,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) for (vdev_t *vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) - zio_flush(zio, vd); + zio_flush(zio, vd, B_FALSE); (void) zio_wait(zio); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 15c8b8ca6016..187d3908ff50 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4172,7 +4172,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); - zio_flush(pio, raidvd); + zio_flush(pio, raidvd, B_FALSE); zio_wait(pio); zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", @@ -4231,7 +4231,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); - zio_flush(pio, raidvd); + zio_flush(pio, raidvd, B_FALSE); zio_wait(pio); zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", @@ -4339,7 +4339,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) } 
zio_wait(pio); pio = zio_root(spa, NULL, NULL, 0); - zio_flush(pio, raidvd); + zio_flush(pio, raidvd, B_FALSE); zio_wait(pio); zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 3983da6aa424..ae1a4f7ec01d 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. + * Copyright (c) 2024, Klara, Inc. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -797,15 +798,6 @@ zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, } } -static int -zil_lwb_vdev_compare(const void *x1, const void *x2) -{ - const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; - const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - - return (TREE_CMP(v1, v2)); -} - /* * Allocate a new lwb. We may already have a block pointer for it, in which * case we get size and version from there. Or we may not yet, in which case @@ -1368,14 +1360,8 @@ zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) } void -zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) +zil_lwb_add_flush(struct lwb *lwb, zio_t *zio) { - avl_tree_t *t = &lwb->lwb_vdev_tree; - avl_index_t where; - zil_vdev_node_t *zv, zvsearch; - int ndvas = BP_GET_NDVAS(bp); - int i; - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); @@ -1383,53 +1369,10 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) return; mutex_enter(&lwb->lwb_vdev_lock); - for (i = 0; i < ndvas; i++) { - zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); - if (avl_find(t, &zvsearch, &where) == NULL) { - zv = kmem_alloc(sizeof (*zv), KM_SLEEP); - zv->zv_vdev = zvsearch.zv_vdev; - avl_insert(t, zv, where); - } - } + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree); mutex_exit(&lwb->lwb_vdev_lock); } -static void -zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) -{ - 
avl_tree_t *src = &lwb->lwb_vdev_tree; - avl_tree_t *dst = &nlwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - - /* - * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does - * not need the protection of lwb_vdev_lock (it will only be modified - * while holding zilog->zl_lock) as its writes and those of its - * children have all completed. The younger 'nlwb' may be waiting on - * future writes to additional vdevs. - */ - mutex_enter(&nlwb->lwb_vdev_lock); - /* - * Tear down the 'lwb' vdev tree, ensuring that entries which do not - * exist in 'nlwb' are moved to it, freeing any would-be duplicates. - */ - while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { - avl_index_t where; - - if (avl_find(dst, zv, &where) == NULL) { - avl_insert(dst, zv, where); - } else { - kmem_free(zv, sizeof (*zv)); - } - } - mutex_exit(&nlwb->lwb_vdev_lock); -} - void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { @@ -1495,12 +1438,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) * includes ZIO errors from either this LWB's write or * flush, as well as any errors from other dependent LWBs * (e.g. a root LWB ZIO that might be a child of this LWB). - * - * With that said, it's important to note that LWB flush - * errors are not propagated up to the LWB root ZIO. - * This is incorrect behavior, and results in VDEV flush - * errors not being handled correctly here. See the - * comment above the call to "zio_flush" for details. 
*/ zcw->zcw_zio_error = zio->io_error; @@ -1578,9 +1515,6 @@ zil_lwb_write_done(zio_t *zio) lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; - avl_tree_t *t = &lwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); @@ -1606,13 +1540,9 @@ zil_lwb_write_done(zio_t *zio) nlwb = NULL; mutex_exit(&zilog->zl_lock); - if (avl_numnodes(t) == 0) - return; - /* - * If there was an IO error, we're not going to call zio_flush() - * on these vdevs, so we simply empty the tree and free the - * nodes. We avoid calling zio_flush() since there isn't any + * If there was an IO error, there's no point continuing. We + * avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. * @@ -1622,47 +1552,78 @@ zil_lwb_write_done(zio_t *zio) * we expect any error seen here, to have been propagated to * that function). */ - if (zio->io_error != 0) { - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zv, sizeof (*zv)); + if (zio->io_error != 0 || zil_nocacheflush) { + /* + * Trace records may have been passed on to us from a + * previously-deferred lwb. Since we're not going to use them, + * we clean them up now. + */ + zio_vdev_trace_empty(&lwb->lwb_vdev_tree); return; } /* - * If this lwb does not have any threads waiting for it to complete, we - * want to defer issuing the flush command to the vdevs written to by - * "this" lwb, and instead rely on the "next" lwb to handle the flush - * command for those vdevs. Thus, we merge the vdev tree of "this" lwb - * with the vdev tree of the "next" lwb in the list, and assume the - * "next" lwb will handle flushing the vdevs (or deferring the flush(s) - * again). + * This zio was issued with ZIO_FLAG_VDEV_TRACE, and so its + * io_vdev_trace_tree has the set of zio_vdev_trace_t records for vdevs + * that were written to by "this" lwb. 
+ * + * To complete the commit, we need to issue a flush to those vdevs. If + * this lwb does not have any threads waiting for it to complete, we + * want to defer issuing that flush, and instead rely on the "next" lwb + * to handle the flush command for those vdevs. * - * This is a useful performance optimization, especially for workloads + * (This is a useful performance optimization, especially for workloads * with lots of async write activity and few sync write and/or fsync * activity, as it has the potential to coalesce multiple flush - * commands to a vdev into one. + * commands to a vdev into one). + * + * However, if this lwb does have threads waiting for it, we must issue + * the flush now. + * + * Regardless of whether or not we issue the flush now or defer it to + * the "next" lwb, our lwb_vdev_tree may also have entries on it that + * were deferred to us. + * + * So, our job here is to assemble a single tree of vdev trace records + * including all those that were written by this zio, plus any that + * were deferred to us, and either issue the flush now, or pass them on + * to the "next" lwb. + * + * Because the zio owns the entries on its trace tree, and it is no + * longer available to us after this function returns, we make our + * own copies instead of just stealing them. */ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { - zil_lwb_flush_defer(lwb, nlwb); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); + ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + + /* + * While 'lwb' is at a point in its lifetime where + * lwb_vdev_tree does not need the protection of lwb_vdev_lock + * (it will only be modified while holding zilog->zl_lock) as + * its writes and those of its children have all completed. + * The younger 'nlwb' may be waiting on future writes to + * additional vdevs. 
+ */ + mutex_enter(&nlwb->lwb_vdev_lock); + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, + &nlwb->lwb_vdev_tree); + + /* + * Also move any trace records from this lwb to the next. + */ + zio_vdev_trace_move(&lwb->lwb_vdev_tree, &nlwb->lwb_vdev_tree); + mutex_exit(&nlwb->lwb_vdev_lock); return; } - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { - vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL) { - /* - * The "ZIO_FLAG_DONT_PROPAGATE" is currently - * always used within "zio_flush". This means, - * any errors when flushing the vdev(s), will - * (unfortunately) not be handled correctly, - * since these "zio_flush" errors will not be - * propagated up to "zil_lwb_flush_vdevs_done". - */ - zio_flush(lwb->lwb_root_zio, vd); - } - kmem_free(zv, sizeof (*zv)); - } + /* + * Fill out the trace tree with records, then issue the flush, + * delivering errors to zil_lwb_flush_vdevs_done(). + */ + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree); + zio_vdev_trace_flush(lwb->lwb_root_zio, &lwb->lwb_vdev_tree); + zio_vdev_trace_empty(&lwb->lwb_vdev_tree); } /* @@ -1945,8 +1906,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, - lwb, prio, ZIO_FLAG_CANFAIL, &zb); - zil_lwb_add_block(lwb, &lwb->lwb_blk); + lwb, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, &zb); if (lwb->lwb_slim) { /* For Slim ZIL only write what is used. 
*/ @@ -3787,8 +3747,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag) list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); - avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, - sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); + zio_vdev_trace_init(&lwb->lwb_vdev_tree); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } @@ -3799,7 +3758,7 @@ zil_lwb_dest(void *vbuf, void *unused) (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); - avl_destroy(&lwb->lwb_vdev_tree); + zio_vdev_trace_fini(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); } @@ -4387,7 +4346,6 @@ EXPORT_SYMBOL(zil_sync); EXPORT_SYMBOL(zil_clean); EXPORT_SYMBOL(zil_suspend); EXPORT_SYMBOL(zil_resume); -EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1f3acb9b921e..5dbd8386a71b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -76,6 +76,7 @@ static int zio_deadman_log_all = B_FALSE; */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; +static kmem_cache_t *zio_vdev_trace_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) @@ -157,6 +158,8 @@ zio_init(void) sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_vdev_trace_cache = kmem_cache_create("zio_vdev_trace_cache", + sizeof (zio_vdev_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; @@ -285,6 +288,7 @@ zio_fini(void) 
VERIFY3P(zio_data_buf_cache[i], ==, NULL); } + kmem_cache_destroy(zio_vdev_trace_cache); kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); @@ -796,6 +800,59 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); + /* + * If all of the following are true: + * + * - the parent has requested vdev tracing + * - the child has just completed + * - the child was for a real vdev + * - the child is "interesting" for tracing purposes (see below) + * + * then we can stash some information about the vdev on the trace tree. + * + * "Interesting" means a vdev whose response was a direct contributor + * to the success of the overall zio; that is, we only consider zios + * that succeeded, and weren't something that was allowed or expected + * to fail (eg an aggregation padding write). + * + * This is important for the initial use case (knowing which vdevs were + * written to but not flushed), and is arguably correct for all cases + * (a vdev that returned an error, by definition, did not participate + * in completing the zio). It's necessary in practice because an + * error from a leaf does not necessarily mean its parent will error + * out too (eg raidz can sometimes compensate for failed writes). If + * some future case requires more complex filtering we can look at + * stashing more info into zio_vdev_trace_t. 
+ * + */ + if (pio->io_flags & ZIO_FLAG_VDEV_TRACE && + wait == ZIO_WAIT_DONE && zio->io_vd != NULL && + ((zio->io_flags & (ZIO_FLAG_OPTIONAL | ZIO_FLAG_IO_REPAIR)) == 0)) { + avl_tree_t *t = &pio->io_vdev_trace_tree; + zio_vdev_trace_t *zvt, zvt_search; + avl_index_t where; + + if (zio->io_error == 0) { + zvt_search.zvt_guid = zio->io_vd->vdev_guid; + if (avl_find(t, &zvt_search, &where) == NULL) { + zvt = kmem_cache_alloc( + zio_vdev_trace_cache, KM_SLEEP); + zvt->zvt_guid = zio->io_vd->vdev_guid; + avl_insert(t, zvt, where); + } + } + + /* + * If the child has itself collected trace records, copy them + * to ours. Note that we can't steal them, as there may be + * multiple parents. + */ + if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) { + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, + &pio->io_vdev_trace_tree); + } + } + (*countp)--; if (*countp == 0 && pio->io_stall == countp) { @@ -882,6 +939,92 @@ zio_bookmark_compare(const void *x1, const void *x2) return (0); } +static int +zio_vdev_trace_compare(const void *x1, const void *x2) +{ + const uint64_t v1 = ((zio_vdev_trace_t *)x1)->zvt_guid; + const uint64_t v2 = ((zio_vdev_trace_t *)x2)->zvt_guid; + + return (TREE_CMP(v1, v2)); +} + +void +zio_vdev_trace_init(avl_tree_t *t) +{ + avl_create(t, zio_vdev_trace_compare, sizeof (zio_vdev_trace_t), + offsetof(zio_vdev_trace_t, zvt_node)); +} + +void +zio_vdev_trace_fini(avl_tree_t *t) +{ + ASSERT(avl_is_empty(t)); + avl_destroy(t); +} + +/* + * Copy trace records on src and add them to dst, skipping any that are already + * on dst. + */ +void +zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst) +{ + zio_vdev_trace_t *zvt, *nzvt; + avl_index_t where; + + for (zvt = avl_first(src); zvt != NULL; zvt = AVL_NEXT(src, zvt)) { + if (avl_find(dst, zvt, &where) == NULL) { + nzvt = kmem_cache_alloc(zio_vdev_trace_cache, KM_SLEEP); + nzvt->zvt_guid = zvt->zvt_guid; + avl_insert(dst, nzvt, where); + } + } +} + +/* Move trace records from src to dst. src will be empty upon return. 
*/ +void +zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst) +{ + zio_vdev_trace_t *zvt; + avl_index_t where; + void *cookie; + + cookie = NULL; + while ((zvt = avl_destroy_nodes(src, &cookie)) != NULL) { + if (avl_find(dst, zvt, &where) == NULL) + avl_insert(dst, zvt, where); + else + kmem_cache_free(zio_vdev_trace_cache, zvt); + } + + ASSERT(avl_is_empty(src)); +} + +void +zio_vdev_trace_flush(zio_t *pio, avl_tree_t *t) +{ + spa_t *spa = pio->io_spa; + zio_vdev_trace_t *zvt; + vdev_t *vd; + + for (zvt = avl_first(t); zvt != NULL; zvt = AVL_NEXT(t, zvt)) { + vd = vdev_lookup_by_guid(spa->spa_root_vdev, zvt->zvt_guid); + if (vd != NULL && vd->vdev_children == 0) + zio_flush(pio, vd, B_TRUE); + } +} + +void +zio_vdev_trace_empty(avl_tree_t *t) +{ + zio_vdev_trace_t *zvt; + void *cookie = NULL; + while ((zvt = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_cache_free(zio_vdev_trace_cache, zvt); + ASSERT(avl_is_empty(t)); +} + + /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) @@ -919,6 +1062,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); + zio_vdev_trace_init(&zio->io_vdev_trace_tree); + if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) @@ -984,6 +1129,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void zio_destroy(zio_t *zio) { + zio_vdev_trace_empty(&zio->io_vdev_trace_tree); + zio_vdev_trace_fini(&zio->io_vdev_trace_tree); metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); @@ -1633,10 +1780,10 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, * the flushes complete. 
*/ void -zio_flush(zio_t *pio, vdev_t *vd) +zio_flush(zio_t *pio, vdev_t *vd, boolean_t propagate) { - const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY; + const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | + (propagate ? 0 : ZIO_FLAG_DONT_PROPAGATE); if (vd->vdev_nowritecache) return; @@ -1647,7 +1794,7 @@ zio_flush(zio_t *pio, vdev_t *vd) NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE)); } else { for (uint64_t c = 0; c < vd->vdev_children; c++) - zio_flush(pio, vd->vdev_child[c]); + zio_flush(pio, vd->vdev_child[c], propagate); } } @@ -4532,11 +4679,14 @@ zio_vdev_io_assess(zio_t *zio) /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent - * boolean flag so that we don't bother with it in the future. + * boolean flag so that we don't bother with it in the future, and + * then we act like the flush succeeded. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) + zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) { vd->vdev_nowritecache = B_TRUE; + zio->io_error = 0; + } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 5817e649003c..f5485093e0a9 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -124,6 +124,10 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'scrub_after_resilver', 'suspend_resume_single', 'zpool_status_-s'] tags = ['functional', 'fault'] +[tests/functional/flush:Linux] +tests = ['zil_flush_error', 'zil_flush_fallback'] +tags = ['functional', 'flush'] + [tests/functional/features/large_dnode:Linux] tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos'] tags = ['functional', 'features', 'large_dnode'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 
1177e80e1a75..98e6c994f348 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -379,6 +379,7 @@ if os.environ.get('CI') == 'true': 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], 'fault/suspend_resume_single': ['SKIP', ci_reason], + 'flush/zil_flush_error': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], }) diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index 51eff3023e73..bd8557c94b34 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -462,13 +462,16 @@ function unload_scsi_debug # Get scsi_debug device name. # Returns basename of scsi_debug device (for example "sdb"). # -function get_debug_device +# $1 (optional): Return the first $1 number of SCSI debug device names. +function get_debug_device #num { + typeset num=${1:-1} + for i in {1..10} ; do - val=$(lsscsi | awk '/scsi_debug/ {print $6; exit}' | cut -d/ -f3) + val=$(lsscsi | awk '/scsi_debug/ {print $6}' | cut -d/ -f3 | head -n$num) # lsscsi can take time to settle - if [ "$val" != "-" ] ; then + if [[ ! 
"$val" =~ "-" ]] ; then break fi sleep 1 diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bbeabc6dfb42..d23316abfbe4 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1516,6 +1516,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ + functional/flush/cleanup.ksh \ + functional/flush/zil_flush_error.ksh \ + functional/flush/zil_flush_fallback.ksh \ + functional/flush/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/flush/cleanup.ksh b/tests/zfs-tests/tests/functional/flush/cleanup.ksh new file mode 100755 index 000000000000..4eb59574e4ec --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/cleanup.ksh @@ -0,0 +1,28 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2024, Klara, Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/flush/setup.ksh b/tests/zfs-tests/tests/functional/flush/setup.ksh new file mode 100755 index 000000000000..94a3936ce2cd --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/setup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2024, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_pass diff --git a/tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh b/tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh new file mode 100755 index 000000000000..e053c5d3bac6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh @@ -0,0 +1,259 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara, Inc. +# + +# +# This tests that if the ZIL write sequence fails, it correctly falls back and +# waits until the transaction has fully committed before returning. +# +# When this test was written, the ZIL has a flaw - it assumes that if its +# writes succeed, then the data is definitely on disk and available for replay +# if the pool fails. It issues a flush immediately after the write, but does +# not check its result. If a disk fails after the data has been accepted into +# the disk cache, but before it can be written to permanent storage, then +# fsync() will have returned success even though the data is not stored in the +# ZIL for replay. +# +# If the main pool then fails before the transaction can be written, then data +# is lost, and fsync() returning success was premature. +# +# To prove this, we create a pool with a separate log device. We inject two +# faults: +# +# - ZIL writes appear to succeed, but never make it to disk +# - ZIL flushes fail, and return error +# +# We then remove the main pool device, and do a write+fsync. This goes to the +# ZIL, and appears to succeed. When the txg closes, the write will fail, and +# the pool suspends. +# +# Then, we simulate a reboot by copying the content of the pool devices aside. +# We restore the pool devices, bring it back online, and export it - we don't +# need it anymore, but we have to clean up properly. Then we restore the copied +# content and import the pool, in whatever state it was in when it suspended. 
+# +# Finally, we check the content of the file we wrote to. If it matches what we +# wrote, then the fsync() was correct, and all is well. If it doesn't match, +# then the flaw is present, and the test fails. +# +# We run the test twice: once without the log device injections, once with. The +# first confirms the expected behaviour of the ZIL - when the pool is imported, +# the log is replayed. The second fails as above. When the flaw is corrected, +# both will succeed, and this overall test succeeds. +# + +. $STF_SUITE/include/libtest.shlib + +TMPDIR=${TMPDIR:-$TEST_BASE_DIR} + +BACKUP_MAIN="$TMPDIR/backup_main" +BACKUP_LOG="$TMPDIR/backup_log" + +LOOP_LOG="$TMPDIR/loop_log" + +DATA_FILE="$TMPDIR/data_file" + +verify_runnable "global" + +function cleanup +{ + zinject -c all + destroy_pool $TESTPOOL + unload_scsi_debug + rm -f $BACKUP_MAIN $BACKUP_LOG $DATA_FILE +} + +log_onexit cleanup + +log_assert "verify fsync() waits if the ZIL commit fails" + +# create 128K of random data, and take its checksum. we do this up front to +# ensure we don't get messed up by any latency from reading /dev/random or +# checksumming the file on the pool +log_must dd if=/dev/random of=$DATA_FILE bs=128K count=1 +typeset sum=$(sha256digest $DATA_FILE) + +# create a virtual scsi device with two device nodes. these are backed by the +# same memory. we do this because we need to be able to take the device offline +# properly in order to get the pool to suspend; fault injection on a loop +# device can't do it. once offline, we can use the second node to take a copy +# of its state. +load_scsi_debug 100 1 2 1 '512b' +set -A sd $(get_debug_device 2) + +# create a loop device for the log. +truncate -s 100M $LOOP_LOG +typeset ld=$(basename $(losetup -f)) +log_must losetup /dev/$ld $LOOP_LOG + +# this function runs the entire test sequence. 
the option decides if faults +# are injected on the slog device, mimicking the trigger situation that causes +# the fsync() bug to occur +function test_fsync +{ + typeset -i do_fault_log="$1" + + log_note "setting up test" + + # create the pool. the main data store is on the scsi device, with the + # log on a loopback. we bias the ZIL towards the log device to try + # to ensure that fsync() never involves the main device + log_must zpool create -f -O logbias=latency $TESTPOOL ${sd[0]} log $ld + + # create the file ahead of time. the ZIL head structure is created on + # first use, and does a full txg wait, which we need to avoid + log_must dd if=/dev/zero of=/$TESTPOOL/data_file \ + bs=128k count=1 conv=fsync + log_must zpool sync + + # arrange for writes to the log device to appear to succeed, and + # flushes to fail. this simulates a loss of the device between it + # accepting the write into its cache, but before it can be written + # out + if [[ $do_fault_log != 0 ]] ; then + log_note "injecting log device faults" + log_must zinject -d $ld -e noop -T write $TESTPOOL + log_must zinject -d $ld -e io -T flush $TESTPOOL + fi + + # take the main device offline. there is no IO in flight, so ZFS won't + # notice immediately + log_note "taking main pool offline" + log_must eval "echo offline > /sys/block/${sd[0]}/device/state" + + # write out some data, then call fsync(). there are three possible + # results: + # + # - if the bug is present, fsync() will return success, and dd will + # succeed "immediately"; before the pool suspends + # - if the bug is fixed, fsync() will block, the pool will suspend, and + # dd will return success after the pool returns to service + # - if something else goes wrong, dd will fail; this may happen before + # or after the pool suspends or returns. this shouldn't happen, and + # should abort the test + # + # we have to put dd in the background, otherwise if it blocks we will + # block with it. 
what we're interested in is whether or not it succeeds + # before the pool is suspended. if it does, then we expect that after + # the suspended pool is reimported, the data will have been written + log_note "running dd in background to write data and call fsync()" + dd if=$DATA_FILE of=/$TESTPOOL/data_file bs=128k count=1 conv=fsync & + fsync_pid=$! + + # wait for the pool to suspend. this should happen within ~5s, when the + # txg sync tries to write the change to the main device + log_note "waiting for pool to suspend" + typeset -i tries=10 + until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool didn't suspend" + fi + sleep 1 + done + + # the pool is suspended. see if dd is still present; if it is, then + # it's blocked in fsync(), and we have no expectation that the write + # made it to disk. if dd has exited, then its return code will tell + # us whether fsync() returned success, or it failed for some other + # reason + typeset -i fsync_success=0 + if kill -0 $fsync_pid ; then + log_note "dd is blocked; fsync() has not returned" + else + log_note "dd has finished, ensuring it was successful" + log_must wait $fsync_pid + fsync_success=1 + fi + + # pool is suspended. if we online the main device right now, it will + # retry writing the transaction, which will succeed, and everything will + # continue as it's supposed to. that's the opposite of what we want; we + # want to do an import, as if after reboot, to force the pool to try to + # replay the ZIL, so we can compare the final result against what + # fsync() told us + # + # however, right now the pool is wedged. we need to get it back online + # so we can export it, so we can do the import. so we need to copy the + # entire pool state away. for the scsi device, we can do this through + # the second device node. 
for the loopback, we can copy it directly + log_note "taking copy of suspended pool" + log_must cp /dev/${sd[1]} $BACKUP_MAIN + log_must cp /dev/$ld $BACKUP_LOG + + # bring the entire pool back online, by clearing error injections and + # restoring the main device. this will unblock anything still waiting + # on it, and tidy up all the internals so we can reset it + log_note "bringing pool back online" + if [[ $do_fault_log != 0 ]] ; then + log_must zinject -c all + fi + log_must eval "echo running > /sys/block/${sd[0]}/device/state" + log_must zpool clear $TESTPOOL + + # now the pool is back online. if dd was blocked, it should now + # complete successfully. make sure that's true + if [[ $fsync_success == 0 ]] ; then + log_note "ensuring blocked dd has now finished" + log_must wait $fsync_pid + fi + + log_note "exporting pool" + + # pool now clean, export it + log_must zpool export $TESTPOOL + + log_note "reverting pool to suspended state" + + # restore the pool to the suspended state, mimicking a reboot + log_must cp $BACKUP_MAIN /dev/${sd[0]} + log_must cp $BACKUP_LOG /dev/$ld + + # import the crashed pool + log_must zpool import $TESTPOOL + + # if fsync() succeeded before the pool suspended, then the ZIL should + # have replayed properly and the data is now available on the pool + # + # note that we don't check the alternative; fsync() blocking does not + # mean that data _didn't_ make it to disk, just that ZFS never claimed + # that it did. 
in that case we can't know what _should_ be on disk + # right now, so can't check + if [[ $fsync_success == 1 ]] ; then + log_note "fsync() succeeded earlier; checking data was written correctly" + typeset newsum=$(sha256digest /$TESTPOOL/data_file) + log_must test "$sum" = "$newsum" + fi + + log_note "test finished, cleaning up" + log_must zpool destroy -f $TESTPOOL +} + +log_note "first run: ZIL succeeds, and repairs the pool at import" +test_fsync 0 + +log_note "second run: ZIL commit fails, and falls back to txg sync" +test_fsync 1 + +log_pass "fsync() waits if the ZIL commit fails" diff --git a/tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh b/tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh new file mode 100755 index 000000000000..37f4dbcecaab --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh @@ -0,0 +1,89 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara, Inc. +# + +# +# This tests that when a pool is degraded, ZIL writes/flushes are still done +# directly, rather than falling back to a full txg flush. +# + +. 
$STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +function cleanup { + default_cleanup_noexit +} + +log_onexit cleanup + +log_assert "ZIL writes go direct to the pool even when the pool is degraded" + +function zil_commit_count { + kstat zil | grep ^zil_commit_count | awk '{ print $3 }' +} +function zil_commit_error_count { + kstat zil | grep ^zil_commit_error_count | awk '{ print $3 }' +} + +DISK1=${DISKS%% *} +log_must default_mirror_setup_noexit $DISKS + +# get the current count of commits vs errors +typeset -i c1=$(zil_commit_count) +typeset -i e1=$(zil_commit_error_count) + +# force a single ZIL commit +log_must dd if=/dev/zero of=/$TESTPOOL/file bs=128k count=1 conv=fsync + +# get the updated count of commits vs errors +typeset -i c2=$(zil_commit_count) +typeset -i e2=$(zil_commit_error_count) + +# degrade the pool +log_must zpool offline -f $TESTPOOL $DISK1 +log_must wait_for_degraded $TESTPOOL + +# force another ZIL commit +log_must dd if=/dev/zero of=/$TESTPOOL/file bs=128k count=1 conv=fsync + +# get counts again +typeset -i c3=$(zil_commit_count) +typeset -i e3=$(zil_commit_error_count) + +# repair the pool +log_must zpool online $TESTPOOL $DISK1 + +# when pool is in good health, a ZIL commit should go direct to the +# pool and not fall back to a txg sync +log_must test $(( $c2 - $c1 )) -eq 1 +log_must test $(( $e2 - $e1 )) -eq 0 + +# when pool is degraded but still writeable, ZIL should still go direct +# to the pool and not fall back +log_must test $(( $c3 - $c2 )) -eq 1 +log_must test $(( $e3 - $e2 )) -eq 0 + +log_pass "ZIL writes go direct to the pool even when the pool is degraded"