Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ZIO: add "vdev tracing" facility; use it for ZIL flushing #16375

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/sys/zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
extern int zil_suspend(const char *osname, void **cookiep);
extern void zil_resume(void *cookie);

extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
extern void zil_lwb_add_flush(struct lwb *lwb, zio_t *zio);
extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);

Expand Down
9 changes: 0 additions & 9 deletions include/sys/zil_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,15 +172,6 @@ typedef struct itx_async_node {
avl_node_t ia_node; /* AVL tree linkage */
} itx_async_node_t;

/*
 * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
 * we've touched so we know which ones need a write cache flush at the end.
 * Each zil_vdev_node_t is one entry in that tree and records a single vdev.
 */
typedef struct zil_vdev_node {
uint64_t zv_vdev; /* vdev to be flushed */
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;

#define ZIL_BURSTS 8

/*
Expand Down
24 changes: 19 additions & 5 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,16 +180,17 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_SCRUB (1ULL << 4)
#define ZIO_FLAG_SCAN_THREAD (1ULL << 5)
#define ZIO_FLAG_PHYSICAL (1ULL << 6)
#define ZIO_FLAG_VDEV_TRACE (1ULL << 7)

#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)

/*
* Flags inherited by ddt, gang, and vdev children.
*/
#define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */
#define ZIO_FLAG_SPECULATIVE (1ULL << 8)
#define ZIO_FLAG_CONFIG_WRITER (1ULL << 9)
#define ZIO_FLAG_DONT_RETRY (1ULL << 10)
#define ZIO_FLAG_CANFAIL (1ULL << 8) /* must be first for INHERIT */
#define ZIO_FLAG_SPECULATIVE (1ULL << 9)
#define ZIO_FLAG_CONFIG_WRITER (1ULL << 10)
#define ZIO_FLAG_DONT_RETRY (1ULL << 11)
#define ZIO_FLAG_NODATA (1ULL << 12)
#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13)
#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14)
Expand Down Expand Up @@ -445,6 +446,11 @@ enum zio_qstate {
ZIO_QS_ACTIVE,
};

/*
 * Vdev trace record: a guid paired with AVL linkage, so records can be
 * kept in an avl_tree_t.
 * NOTE(review): presumably these are the entries stored in a zio's
 * io_vdev_trace_tree and in the trees handled by the zio_vdev_trace_*()
 * functions -- confirm against the implementation.
 */
typedef struct zio_vdev_trace {
uint64_t zvt_guid; /* NOTE(review): presumably the traced vdev's guid; confirm */
avl_node_t zvt_node; /* AVL tree linkage */
} zio_vdev_trace_t;

struct zio {
/* Core information about this I/O */
zbookmark_phys_t io_bookmark;
Expand Down Expand Up @@ -513,6 +519,7 @@ struct zio {
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
avl_tree_t io_vdev_trace_tree;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
Expand Down Expand Up @@ -595,7 +602,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,

extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
blkptr_t *new_bp, uint64_t size, boolean_t *slog);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_flush(zio_t *zio, vdev_t *vd, boolean_t propagate);
extern void zio_shrink(zio_t *zio, uint64_t size);

extern int zio_wait(zio_t *zio);
Expand Down Expand Up @@ -636,6 +643,13 @@ extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);

/*
 * Vdev trace tree interface. Every function operates on caller-supplied
 * AVL tree(s): init/fini/empty take a single tree, copy/move take a source
 * and a destination tree, and flush takes a zio together with a tree.
 * NOTE(review): the definitions are not visible from this header; confirm
 * there how copy differs from move (presumably move leaves src empty) and
 * what flush issues against the zio.
 */
extern void zio_vdev_trace_init(avl_tree_t *t);
extern void zio_vdev_trace_fini(avl_tree_t *t);
extern void zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst);
extern void zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst);
extern void zio_vdev_trace_flush(zio_t *zio, avl_tree_t *t);
extern void zio_vdev_trace_empty(avl_tree_t *t);

extern void zio_change_priority(zio_t *pio, zio_priority_t priority);

extern void zio_checksum_verified(zio_t *zio);
Expand Down
15 changes: 0 additions & 15 deletions module/os/freebsd/zfs/vdev_geom.c
Original file line number Diff line number Diff line change
Expand Up @@ -1014,21 +1014,6 @@ vdev_geom_io_intr(struct bio *bp)
zio->io_error = SET_ERROR(EIO);

switch (zio->io_error) {
case ENOTSUP:
/*
* If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
* that future attempts will never succeed. In this case
* we set a persistent flag so that we don't bother with
* requests in the future.
*/
switch (bp->bio_cmd) {
case BIO_FLUSH:
vd->vdev_nowritecache = B_TRUE;
break;
case BIO_DELETE:
break;
}
break;
case ENXIO:
if (!vd->vdev_remove_wanted) {
/*
Expand Down
3 changes: 0 additions & 3 deletions module/os/linux/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -1232,9 +1232,6 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
zio->io_error = -error;
#endif

if (zio->io_error && (zio->io_error == EOPNOTSUPP))
zio->io_vd->vdev_nowritecache = B_TRUE;

bio_put(bio);
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
Expand Down
19 changes: 10 additions & 9 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1806,12 +1806,11 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
zgd_t *zgd = dsa->dsa_zgd;

/*
* Record the vdev(s) backing this blkptr so they can be flushed after
* the writes for the lwb have completed.
* Capture the trace records for this zio so the vdevs can be flushed
* after the writes for the lwb have completed.
*/
if (zio->io_error == 0) {
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
}
if (zio->io_error == 0)
zil_lwb_add_flush(zgd->zgd_lwb, zio);

mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
Expand Down Expand Up @@ -1865,10 +1864,10 @@ dmu_sync_late_arrival_done(zio_t *zio)

if (zio->io_error == 0) {
/*
* Record the vdev(s) backing this blkptr so they can be
* Capture the trace records for this zio so the vdevs can be
* flushed after the writes for the lwb have completed.
*/
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
zil_lwb_add_flush(zgd->zgd_lwb, zio);

if (!BP_IS_HOLE(bp)) {
blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
Expand Down Expand Up @@ -1955,7 +1954,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
dsa, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, zb));

return (0);
}
Expand Down Expand Up @@ -2122,7 +2122,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
&zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE,
&zb));

return (0);
}
Expand Down
14 changes: 8 additions & 6 deletions module/zfs/vdev_label.c
Original file line number Diff line number Diff line change
Expand Up @@ -1830,19 +1830,21 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)

for (int v = 0; v < svdcount; v++) {
if (vdev_writeable(svd[v])) {
zio_flush(zio, svd[v]);
zio_flush(zio, svd[v], B_FALSE);
}
}
if (spa->spa_aux_sync_uber) {
spa->spa_aux_sync_uber = B_FALSE;
for (int v = 0; v < spa->spa_spares.sav_count; v++) {
if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) {
zio_flush(zio, spa->spa_spares.sav_vdevs[v]);
zio_flush(zio, spa->spa_spares.sav_vdevs[v],
B_FALSE);
}
}
for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) {
zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]);
zio_flush(zio, spa->spa_l2cache.sav_vdevs[v],
B_FALSE);
}
}
}
Expand Down Expand Up @@ -2007,13 +2009,13 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
zio = zio_root(spa, NULL, NULL, flags);

for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
zio_flush(zio, vd);
zio_flush(zio, vd, B_FALSE);

for (int i = 0; i < 2; i++) {
if (!sav[i]->sav_label_sync)
continue;
for (int v = 0; v < sav[i]->sav_count; v++)
zio_flush(zio, sav[i]->sav_vdevs[v]);
zio_flush(zio, sav[i]->sav_vdevs[v], B_FALSE);
if (l == 1)
sav[i]->sav_label_sync = B_FALSE;
}
Expand Down Expand Up @@ -2091,7 +2093,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
for (vdev_t *vd =
txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
zio_flush(zio, vd);
zio_flush(zio, vd, B_FALSE);

(void) zio_wait(zio);

Expand Down
6 changes: 3 additions & 3 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -4172,7 +4172,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
goto io_error_exit;
}
pio = zio_root(spa, NULL, NULL, 0);
zio_flush(pio, raidvd);
zio_flush(pio, raidvd, B_FALSE);
zio_wait(pio);

zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
Expand Down Expand Up @@ -4231,7 +4231,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
goto io_error_exit;
}
pio = zio_root(spa, NULL, NULL, 0);
zio_flush(pio, raidvd);
zio_flush(pio, raidvd, B_FALSE);
zio_wait(pio);

zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
Expand Down Expand Up @@ -4339,7 +4339,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
}
zio_wait(pio);
pio = zio_root(spa, NULL, NULL, 0);
zio_flush(pio, raidvd);
zio_flush(pio, raidvd, B_FALSE);
zio_wait(pio);

zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
Expand Down
Loading
Loading