From 789f6dfdacd99642c75ecc2782962a0ff01d694c Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Tue, 29 Aug 2023 18:24:00 +1000
Subject: [PATCH] ZIO: add "vdev tracing" facility; use it for ZIL flushing

A problem with zio_flush() is that it issues a flush ZIO to a top-level
vdev, which then recursively issues child flush ZIOs until the real leaf
devices are flushed. As usual, an error in a child ZIO results in the
parent ZIO erroring too, so if a leaf device has failed, its flush ZIO
will fail, and so will the entire flush operation.

This didn't matter when we used to ignore flush errors, but now that we
propagate them, the flush error propagates into the ZIL write ZIO. This
causes the ZIL to believe its write failed, and fall back to a full txg
wait. This still provides correct behaviour for zil_commit() callers
(eg fsync()) but it ruins performance.

We cannot simply skip flushing failed vdevs, because the associated
write may have succeeded before the vdev failed, which would give the
appearance that the write is fully flushed when it is not. Neither can
we issue a "syncing write" to the device (eg SCSI FUA), as this also
degrades performance.

The answer is that we must bind writes and flushes together in such a
way that we only flush the physical devices that we wrote to.

This adds a "vdev tracing" facility to ZIOs, zio_vdev_trace. When
enabled on a ZIO with ZIO_FLAG_VDEV_TRACE, upon successful completion
(in the _done handler) zio->io_vdev_trace_tree will hold a set of
zio_vdev_trace_t objects, each describing a vdev that was involved in
the successful completion of the ZIO.

A companion function, zio_vdev_trace_flush(), is included; it issues a
flush ZIO to the vdevs on the given trace tree. zil_lwb_write_done() is
updated to use this to bind ZIL writes and flushes together.

The tracing facility is similar in many ways to the "deferred flushing"
facility inside the ZIL, to the point where it can replace it. Now, if
the flush should be deferred, the trace records from the writing ZIO
are captured and combined with any captured from previous writes. When
it's finally time to issue the flush, we issue it to the entire
accumulated set of traced vdevs.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
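The intended usage pattern, as a minimal sketch (the example_* names and
the local variables in the zio_write() call are hypothetical; the zio_*
symbols and flags are the ones this patch adds or uses): set
ZIO_FLAG_VDEV_TRACE on the write, then flush the traced vdevs from its
done handler.

static void
example_write_done(zio_t *zio)
{
	zio_t *root = zio->io_private;	/* hypothetical parent/root zio */

	/* A failed write proves nothing about the device; don't flush. */
	if (zio->io_error != 0)
		return;

	/*
	 * On success, io_vdev_trace_tree holds one zio_vdev_trace_t per
	 * leaf vdev that contributed to this write. Flush exactly those
	 * vdevs, as children of 'root', the same way the updated
	 * zil_lwb_write_done() below does.
	 */
	zio_vdev_trace_flush(root, &zio->io_vdev_trace_tree);
}

	/* The write itself only needs the new flag, mirroring dmu_sync(): */
	zio_nowait(zio_write(root, spa, txg, bp, data, size, size, &zp,
	    NULL, NULL, example_write_done, root, ZIO_PRIORITY_SYNC_WRITE,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, &zb));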
Signed-off-by: Rob Norris --- include/sys/zil.h | 2 +- include/sys/zil_impl.h | 9 --- include/sys/zio.h | 22 +++++- module/zfs/dmu.c | 19 ++--- module/zfs/zil.c | 162 +++++++++++++++++------------------------ module/zfs/zio.c | 147 +++++++++++++++++++++++++++++++++++++ 6 files changed, 243 insertions(+), 118 deletions(-) diff --git a/include/sys/zil.h b/include/sys/zil.h index 384678b223d5..0f0f9327af0f 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -604,7 +604,7 @@ extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); extern int zil_suspend(const char *osname, void **cookiep); extern void zil_resume(void *cookie); -extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp); +extern void zil_lwb_add_flush(struct lwb *lwb, zio_t *zio); extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg); extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 9a34bafc1c77..74c205a8f651 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -172,15 +172,6 @@ typedef struct itx_async_node { avl_node_t ia_node; /* AVL tree linkage */ } itx_async_node_t; -/* - * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs - * we've touched so we know which ones need a write cache flush at the end. - */ -typedef struct zil_vdev_node { - uint64_t zv_vdev; /* vdev to be flushed */ - avl_node_t zv_node; /* AVL tree linkage */ -} zil_vdev_node_t; - #define ZIL_BURSTS 8 /* diff --git a/include/sys/zio.h b/include/sys/zio.h index c5a9fb6c8824..70d98ea29be7 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -180,16 +180,17 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_SCRUB (1ULL << 4) #define ZIO_FLAG_SCAN_THREAD (1ULL << 5) #define ZIO_FLAG_PHYSICAL (1ULL << 6) +#define ZIO_FLAG_VDEV_TRACE (1ULL << 7) #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. 
*/ -#define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */ -#define ZIO_FLAG_SPECULATIVE (1ULL << 8) -#define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) -#define ZIO_FLAG_DONT_RETRY (1ULL << 10) +#define ZIO_FLAG_CANFAIL (1ULL << 8) /* must be first for INHERIT */ +#define ZIO_FLAG_SPECULATIVE (1ULL << 9) +#define ZIO_FLAG_CONFIG_WRITER (1ULL << 10) +#define ZIO_FLAG_DONT_RETRY (1ULL << 11) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) @@ -444,6 +445,11 @@ enum zio_qstate { ZIO_QS_ACTIVE, }; +typedef struct zio_vdev_trace { + uint64_t zvt_guid; + avl_node_t zvt_node; +} zio_vdev_trace_t; + struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -512,6 +518,7 @@ struct zio { int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + avl_tree_t io_vdev_trace_tree; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; @@ -635,6 +642,13 @@ extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); +extern void zio_vdev_trace_init(avl_tree_t *t); +extern void zio_vdev_trace_fini(avl_tree_t *t); +extern void zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst); +extern void zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst); +extern void zio_vdev_trace_flush(zio_t *zio, avl_tree_t *t); +extern void zio_vdev_trace_empty(avl_tree_t *t); + extern void zio_change_priority(zio_t *pio, zio_priority_t priority); extern void zio_checksum_verified(zio_t *zio); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 0dc6e48b9c85..89d0bc4258d4 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1600,12 +1600,11 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) zgd_t *zgd = dsa->dsa_zgd; /* - * Record the vdev(s) backing this blkptr so they can be flushed after - * the writes for the lwb have completed. + * Capture the trace records for this zio so the vdevs can be flushed + * after the writes for the lwb have completed. */ - if (zio->io_error == 0) { - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); - } + if (zio->io_error == 0) + zil_lwb_add_flush(zgd->zgd_lwb, zio); mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); @@ -1659,10 +1658,10 @@ dmu_sync_late_arrival_done(zio_t *zio) if (zio->io_error == 0) { /* - * Record the vdev(s) backing this blkptr so they can be + * Capture the trace records for this zio so the vdevs can be * flushed after the writes for the lwb have completed. 
*/ - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + zil_lwb_add_flush(zgd->zgd_lwb, zio); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; @@ -1749,7 +1748,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, - dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + dsa, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, zb)); return (0); } @@ -1915,7 +1915,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, + &zb)); return (0); } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index dec5730ff64d..d9c5493f9918 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -780,15 +780,6 @@ zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, } } -static int -zil_lwb_vdev_compare(const void *x1, const void *x2) -{ - const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; - const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - - return (TREE_CMP(v1, v2)); -} - /* * Allocate a new lwb. We may already have a block pointer for it, in which * case we get size and version from there. Or we may not yet, in which case @@ -1351,14 +1342,8 @@ zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) } void -zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) +zil_lwb_add_flush(struct lwb *lwb, zio_t *zio) { - avl_tree_t *t = &lwb->lwb_vdev_tree; - avl_index_t where; - zil_vdev_node_t *zv, zvsearch; - int ndvas = BP_GET_NDVAS(bp); - int i; - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); @@ -1366,53 +1351,10 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) return; mutex_enter(&lwb->lwb_vdev_lock); - for (i = 0; i < ndvas; i++) { - zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); - if (avl_find(t, &zvsearch, &where) == NULL) { - zv = kmem_alloc(sizeof (*zv), KM_SLEEP); - zv->zv_vdev = zvsearch.zv_vdev; - avl_insert(t, zv, where); - } - } + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree); mutex_exit(&lwb->lwb_vdev_lock); } -static void -zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) -{ - avl_tree_t *src = &lwb->lwb_vdev_tree; - avl_tree_t *dst = &nlwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - - /* - * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does - * not need the protection of lwb_vdev_lock (it will only be modified - * while holding zilog->zl_lock) as its writes and those of its - * children have all completed. The younger 'nlwb' may be waiting on - * future writes to additional vdevs. - */ - mutex_enter(&nlwb->lwb_vdev_lock); - /* - * Tear down the 'lwb' vdev tree, ensuring that entries which do not - * exist in 'nlwb' are moved to it, freeing any would-be duplicates. 
- */ - while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { - avl_index_t where; - - if (avl_find(dst, zv, &where) == NULL) { - avl_insert(dst, zv, where); - } else { - kmem_free(zv, sizeof (*zv)); - } - } - mutex_exit(&nlwb->lwb_vdev_lock); -} - void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { @@ -1555,9 +1497,6 @@ zil_lwb_write_done(zio_t *zio) lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; - avl_tree_t *t = &lwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); @@ -1583,13 +1522,9 @@ zil_lwb_write_done(zio_t *zio) nlwb = NULL; mutex_exit(&zilog->zl_lock); - if (avl_numnodes(t) == 0) - return; - /* - * If there was an IO error, we're not going to call zio_flush() - * on these vdevs, so we simply empty the tree and free the - * nodes. We avoid calling zio_flush() since there isn't any + * If there was an IO error, there's no point continuing. We + * avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. * @@ -1599,38 +1534,78 @@ zil_lwb_write_done(zio_t *zio) * we expect any error seen here, to have been propagated to * that function). */ - if (zio->io_error != 0) { - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zv, sizeof (*zv)); + if (zio->io_error != 0 || zil_nocacheflush) { + /* + * Trace records may have been passed on to us from a + * previously-deferred lwb. Since we're not going to use them, + * we clean them up now. + */ + zio_vdev_trace_empty(&lwb->lwb_vdev_tree); return; } /* - * If this lwb does not have any threads waiting for it to complete, we - * want to defer issuing the flush command to the vdevs written to by - * "this" lwb, and instead rely on the "next" lwb to handle the flush - * command for those vdevs. Thus, we merge the vdev tree of "this" lwb - * with the vdev tree of the "next" lwb in the list, and assume the - * "next" lwb will handle flushing the vdevs (or deferring the flush(s) - * again). + * This zio was issued with ZIO_FLAG_VDEV_TRACE, and so its + * io_vdev_trace_tree has the set of zio_vdev_trace_t records for vdevs + * that were written to by "this" lwb. * - * This is a useful performance optimization, especially for workloads + * To complete the commit, we need to issue a flush to those vdevs. If + * this lwb does not have any threads waiting for it to complete, we + * want to defer issuing that flush, and instead rely on the "next" lwb + * to handle the flush command for those vdevs. + * + * (This is a useful performance optimization, especially for workloads * with lots of async write activity and few sync write and/or fsync * activity, as it has the potential to coalesce multiple flush - * commands to a vdev into one. + * commands to a vdev into one). + * + * However, if this lwb does have threads waiting for it, we must issue + * the flush now. + * + * Regardless of whether we issue the flush now or defer it to + * the "next" lwb, our lwb_vdev_tree may also have entries on it that + * were deferred to us. + * + * So, our job here is to assemble a single tree of vdev trace records + * including all those that were written by this zio, plus any that + * were deferred to us, and either issue the flush now, or pass them on + * to the "next" lwb.
+ * + * Because the zio owns the entries on its trace tree, and it is no + * longer available to us after this function returns, we make our + * own copies instead of just stealing them. */ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { - zil_lwb_flush_defer(lwb, nlwb); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); + ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + + /* + * At this point in its lifetime, 'lwb' does not need the + * protection of lwb_vdev_lock for lwb_vdev_tree (it will only + * be modified while holding zilog->zl_lock), as its writes and + * those of its children have all completed. + * The younger 'nlwb' may be waiting on future writes to + * additional vdevs. + */ + mutex_enter(&nlwb->lwb_vdev_lock); + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, + &nlwb->lwb_vdev_tree); + + /* + * Also move any trace records from this lwb to the next. + */ + zio_vdev_trace_move(&lwb->lwb_vdev_tree, &nlwb->lwb_vdev_tree); + mutex_exit(&nlwb->lwb_vdev_lock); return; } - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { - vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL) - zio_flush(lwb->lwb_root_zio, vd, B_TRUE); - kmem_free(zv, sizeof (*zv)); - } + /* + * Fill out the trace tree with records, then issue the flush, + * delivering errors to zil_lwb_flush_vdevs_done(). + */ + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree); + zio_vdev_trace_flush(lwb->lwb_root_zio, &lwb->lwb_vdev_tree); + zio_vdev_trace_empty(&lwb->lwb_vdev_tree); } /* @@ -1913,8 +1888,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, - lwb, prio, ZIO_FLAG_CANFAIL, &zb); - zil_lwb_add_block(lwb, &lwb->lwb_blk); + lwb, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, &zb); if (lwb->lwb_slim) { /* For Slim ZIL only write what is used.
 */ @@ -3755,8 +3729,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag) list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); - avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, - sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); + zio_vdev_trace_init(&lwb->lwb_vdev_tree); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } @@ -3767,7 +3740,7 @@ zil_lwb_dest(void *vbuf, void *unused) (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); - avl_destroy(&lwb->lwb_vdev_tree); + zio_vdev_trace_fini(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); } @@ -4355,7 +4328,6 @@ EXPORT_SYMBOL(zil_sync); EXPORT_SYMBOL(zil_clean); EXPORT_SYMBOL(zil_suspend); EXPORT_SYMBOL(zil_resume); -EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 503ad2b88d7a..d9abfec35612 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -76,6 +76,7 @@ static int zio_deadman_log_all = B_FALSE; */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; +static kmem_cache_t *zio_vdev_trace_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) @@ -157,6 +158,8 @@ zio_init(void) sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_vdev_trace_cache = kmem_cache_create("zio_vdev_trace_cache", + sizeof (zio_vdev_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; @@ -281,6 +284,7 @@ zio_fini(void) VERIFY3P(zio_data_buf_cache[i], ==, NULL); } + kmem_cache_destroy(zio_vdev_trace_cache); kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); @@ -792,6 +796,59 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); + /* + * If all of the following are true: + * + * - the parent has requested vdev tracing + * - the child has just completed + * - the child was for a real vdev + * - the child is "interesting" for tracing purposes (see below) + * + * then we can stash some information about the vdev on the trace tree. + * + * "Interesting" means a vdev whose response was a direct contributor + * to the success of the overall zio; that is, we only consider zios + * that succeeded, and weren't something that was allowed or expected + * to fail (eg an aggregation padding write). + * + * This is important for the initial use case (knowing which vdevs were + * written to but not flushed), and is arguably correct for all cases + * (a vdev that returned an error, by definition, did not participate + * in completing the zio). It's necessary in practice because an + * error from a leaf does not necessarily mean its parent will error + * out too (eg raidz can sometimes compensate for failed writes). If + * some future case requires more complex filtering we can look at + * stashing more info into zio_vdev_trace_t.
+ * + */ + if (pio->io_flags & ZIO_FLAG_VDEV_TRACE && + wait == ZIO_WAIT_DONE && zio->io_vd != NULL && + ((zio->io_flags & (ZIO_FLAG_OPTIONAL | ZIO_FLAG_IO_REPAIR)) == 0)) { + avl_tree_t *t = &pio->io_vdev_trace_tree; + zio_vdev_trace_t *zvt, zvt_search; + avl_index_t where; + + if (zio->io_error == 0) { + zvt_search.zvt_guid = zio->io_vd->vdev_guid; + if (avl_find(t, &zvt_search, &where) == NULL) { + zvt = kmem_cache_alloc( + zio_vdev_trace_cache, KM_SLEEP); + zvt->zvt_guid = zio->io_vd->vdev_guid; + avl_insert(t, zvt, where); + } + } + + /* + * If the child has itself collected trace records, copy them + * to ours. Note that we can't steal them, as there may be + * multiple parents. + */ + if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) { + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, + &pio->io_vdev_trace_tree); + } + } + (*countp)--; if (*countp == 0 && pio->io_stall == countp) { @@ -878,6 +935,92 @@ zio_bookmark_compare(const void *x1, const void *x2) return (0); } +static int +zio_vdev_trace_compare(const void *x1, const void *x2) +{ + const uint64_t v1 = ((zio_vdev_trace_t *)x1)->zvt_guid; + const uint64_t v2 = ((zio_vdev_trace_t *)x2)->zvt_guid; + + return (TREE_CMP(v1, v2)); +} + +void +zio_vdev_trace_init(avl_tree_t *t) +{ + avl_create(t, zio_vdev_trace_compare, sizeof (zio_vdev_trace_t), + offsetof(zio_vdev_trace_t, zvt_node)); +} + +void +zio_vdev_trace_fini(avl_tree_t *t) +{ + ASSERT(avl_is_empty(t)); + avl_destroy(t); +} + +/* + * Copy trace records from src and add them to dst, skipping any that are + * already in dst. + */ +void +zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst) +{ + zio_vdev_trace_t *zvt, *nzvt; + avl_index_t where; + + for (zvt = avl_first(src); zvt != NULL; zvt = AVL_NEXT(src, zvt)) { + if (avl_find(dst, zvt, &where) == NULL) { + nzvt = kmem_cache_alloc(zio_vdev_trace_cache, KM_SLEEP); + nzvt->zvt_guid = zvt->zvt_guid; + avl_insert(dst, nzvt, where); + } + } +} + +/* Move trace records from src to dst. src will be empty upon return.
*/ +void +zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst) +{ + zio_vdev_trace_t *zvt; + avl_index_t where; + void *cookie; + + cookie = NULL; + while ((zvt = avl_destroy_nodes(src, &cookie)) != NULL) { + if (avl_find(dst, zvt, &where) == NULL) + avl_insert(dst, zvt, where); + else + kmem_cache_free(zio_vdev_trace_cache, zvt); + } + + ASSERT(avl_is_empty(src)); +} + +void +zio_vdev_trace_flush(zio_t *pio, avl_tree_t *t) +{ + spa_t *spa = pio->io_spa; + zio_vdev_trace_t *zvt; + vdev_t *vd; + + for (zvt = avl_first(t); zvt != NULL; zvt = AVL_NEXT(t, zvt)) { + vd = vdev_lookup_by_guid(spa->spa_root_vdev, zvt->zvt_guid); + if (vd != NULL && vd->vdev_children == 0) + zio_flush(pio, vd, B_TRUE); + } +} + +void +zio_vdev_trace_empty(avl_tree_t *t) +{ + zio_vdev_trace_t *zvt; + void *cookie = NULL; + while ((zvt = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_cache_free(zio_vdev_trace_cache, zvt); + ASSERT(avl_is_empty(t)); +} + + /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) @@ -915,6 +1058,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); + zio_vdev_trace_init(&zio->io_vdev_trace_tree); + if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) @@ -980,6 +1125,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void zio_destroy(zio_t *zio) { + zio_vdev_trace_empty(&zio->io_vdev_trace_tree); + zio_vdev_trace_fini(&zio->io_vdev_trace_tree); metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list);
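Finally, the expected lifecycle of a trace tree held outside a zio,
such as lwb_vdev_tree, as a sketch; 'deferred', 'other', 'wzio' and
'root' are hypothetical stand-ins and locking is elided, but the
zio_vdev_trace_* calls are exactly those added above.

	avl_tree_t deferred;

	/* Once, at construction (as zil_lwb_cons() now does). */
	zio_vdev_trace_init(&deferred);

	/* As each traced write zio completes, capture its records... */
	zio_vdev_trace_copy(&wzio->io_vdev_trace_tree, &deferred);

	/* ...or absorb another accumulated tree, leaving it empty. */
	zio_vdev_trace_move(&other, &deferred);

	/*
	 * When it's finally time: issue one flush zio per traced leaf
	 * vdev, as children of 'root', then free the records.
	 */
	zio_vdev_trace_flush(root, &deferred);
	zio_vdev_trace_empty(&deferred);

	/* Once, at destruction (as zil_lwb_dest() now does); the tree
	   must already be empty. */
	zio_vdev_trace_fini(&deferred);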