diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
index d2f4018653a6..8409ce864e90 100644
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -99,7 +99,7 @@ typedef struct lwb {
 	char		*lwb_buf;	/* log write buffer */
 	zio_t		*lwb_write_zio;	/* zio for the lwb buffer */
 	zio_t		*lwb_root_zio;	/* root zio for lwb write and flushes */
-	dmu_tx_t	*lwb_tx;	/* tx for log block allocation */
+	uint64_t	lwb_issued_txg;	/* the txg when the write is issued */
 	uint64_t	lwb_max_txg;	/* highest txg in this lwb */
 	list_node_t	lwb_node;	/* zilog->zl_lwb_list linkage */
 	list_t		lwb_itxs;	/* list of itx's */
@@ -209,6 +209,12 @@ struct zilog {
 	uint_t		zl_prev_rotor;	/* rotor for zl_prev[] */
 	txg_node_t	zl_dirty_link;	/* protected by dp_dirty_zilogs list */
 	uint64_t	zl_dirty_max_txg; /* highest txg used to dirty zilog */
+
+	kmutex_t	zl_lwb_io_lock; /* protect following members */
+	uint64_t	zl_lwb_inflight[TXG_SIZE]; /* io issued, but not done */
+	kcondvar_t	zl_lwb_io_cv;	/* signal when the flush is done */
+	uint64_t	zl_lwb_max_issued_txg; /* max txg when lwb io issued */
+
 	/*
 	 * Max block size for this ZIL. Note that this can not be changed
 	 * while the ZIL is in use because consumers (ZPL/zvol) need to take
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 640e805d093a..78786be02e91 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -560,8 +560,8 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
 	lwb->lwb_max_txg = txg;
 	lwb->lwb_write_zio = NULL;
 	lwb->lwb_root_zio = NULL;
-	lwb->lwb_tx = NULL;
 	lwb->lwb_issued_timestamp = 0;
+	lwb->lwb_issued_txg = 0;
 	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
 		lwb->lwb_nused = sizeof (zil_chain_t);
 		lwb->lwb_sz = BP_GET_LSIZE(bp);
@@ -1126,9 +1126,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	zilog_t *zilog = lwb->lwb_zilog;
-	dmu_tx_t *tx = lwb->lwb_tx;
 	zil_commit_waiter_t *zcw;
 	itx_t *itx;
+	uint64_t txg = lwb->lwb_issued_txg;
 
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
@@ -1137,15 +1137,13 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 	mutex_enter(&zilog->zl_lock);
 
 	/*
-	 * Ensure the lwb buffer pointer is cleared before releasing the
-	 * txg. If we have had an allocation failure and the txg is
+	 * If we have had an allocation failure and the txg is
 	 * waiting to sync then we want zil_sync() to remove the lwb so
 	 * that it's not picked up as the next new one in
 	 * zil_process_commit_list(). zil_sync() will only remove the
 	 * lwb if lwb_buf is null.
 	 */
 	lwb->lwb_buf = NULL;
-	lwb->lwb_tx = NULL;
 
 	ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
 	zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
@@ -1204,12 +1202,44 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 
 	mutex_exit(&zilog->zl_lock);
 
-	/*
-	 * Now that we've written this log block, we have a stable pointer
-	 * to the next block in the chain, so it's OK to let the txg in
-	 * which we allocated the next block sync.
-	 */
-	dmu_tx_commit(tx);
+	mutex_enter(&zilog->zl_lwb_io_lock);
+	ASSERT(zilog->zl_lwb_inflight[txg & TXG_MASK] > 0);
+	zilog->zl_lwb_inflight[txg & TXG_MASK]--;
+	if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
+		cv_broadcast(&zilog->zl_lwb_io_cv);
+	mutex_exit(&zilog->zl_lwb_io_lock);
+}
+
+/*
+ * Wait for all issued write/flush I/Os of the given txg to complete. It
+ * guarantees that zil_lwb_flush_vdevs_done() has been called and returned.
+ */
+static void
+zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
+{
+	ASSERT(txg == spa_syncing_txg(zilog->zl_spa));
+
+	mutex_enter(&zilog->zl_lwb_io_lock);
+	while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0)
+		cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock);
+	mutex_exit(&zilog->zl_lwb_io_lock);
+
+#ifdef ZFS_DEBUG
+	mutex_enter(&zilog->zl_lock);
+	lwb_t *lwb = list_head(&zilog->zl_lwb_list);
+	while (lwb != NULL && lwb->lwb_max_txg <= txg) {
+		if (lwb->lwb_issued_txg <= txg) {
+			ASSERT(lwb->lwb_state != LWB_STATE_ISSUED);
+			ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE);
+			IMPLY(lwb->lwb_issued_txg > 0,
+			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+		}
+		IMPLY(lwb->lwb_state == LWB_STATE_FLUSH_DONE,
+		    lwb->lwb_buf == NULL);
+		lwb = list_next(&zilog->zl_lwb_list, lwb);
+	}
+	mutex_exit(&zilog->zl_lock);
+#endif
 }
 
 /*
@@ -1524,8 +1554,12 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
+	lwb->lwb_issued_txg = txg;
 
-	lwb->lwb_tx = tx;
+	mutex_enter(&zilog->zl_lwb_io_lock);
+	zilog->zl_lwb_inflight[txg & TXG_MASK]++;
+	zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
+	mutex_exit(&zilog->zl_lwb_io_lock);
 
 	/*
 	 * Log blocks are pre-allocated. Here we select the size of the next
@@ -1600,6 +1634,8 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 	zio_nowait(lwb->lwb_root_zio);
 	zio_nowait(lwb->lwb_write_zio);
 
+	dmu_tx_commit(tx);
+
 	/*
 	 * If there was an allocation failure then nlwb will be null which
 	 * forces a txg_wait_synced().
@@ -3062,6 +3098,8 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 	if (spa_sync_pass(spa) != 1)
 		return;
 
+	zil_lwb_flush_wait_all(zilog, txg);
+
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT(zilog->zl_stop_sync == 0);
@@ -3216,6 +3254,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
 
 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
@@ -3229,6 +3268,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
 	    offsetof(itx_t, itx_node));
 
 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+	cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
 
 	return (zilog);
 }
@@ -3264,8 +3304,10 @@ zil_free(zilog_t *zilog)
 
 	mutex_destroy(&zilog->zl_issuer_lock);
 	mutex_destroy(&zilog->zl_lock);
+	mutex_destroy(&zilog->zl_lwb_io_lock);
 
 	cv_destroy(&zilog->zl_cv_suspend);
+	cv_destroy(&zilog->zl_lwb_io_cv);
 
 	kmem_free(zilog, sizeof (zilog_t));
 }
@@ -3313,9 +3355,18 @@ zil_close(zilog_t *zilog)
 	mutex_exit(&zilog->zl_lock);
 
 	/*
-	 * We need to use txg_wait_synced() to wait long enough for the
-	 * ZIL to be clean, and to wait for all pending lwbs to be
-	 * written out.
+	 * zl_lwb_max_issued_txg may be larger than lwb_max_txg, depending
+	 * on when the dmu_tx transaction is assigned in
+	 * zil_lwb_write_issue().
+	 */
+	mutex_enter(&zilog->zl_lwb_io_lock);
+	txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
+	mutex_exit(&zilog->zl_lwb_io_lock);
+
+	/*
+	 * We need to use txg_wait_synced() to wait until that txg is synced.
+	 * zil_sync() guarantees that all lwbs up to that txg have been
+	 * written out, flushed, and cleaned.
 	 */
 	if (txg != 0)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
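---

The patch replaces "hold a dmu_tx open across each lwb write" with a small per-txg in-flight counter: the issuer bumps zl_lwb_inflight[txg & TXG_MASK] before handing off the zios, the flush-done callback decrements it and broadcasts, and zil_sync() blocks until the counter for the syncing txg drains to zero. Below is a minimal user-space sketch of that pattern, not ZFS code: pthreads stands in for kmutex_t/kcondvar_t, and issue_start(), issue_done(), and wait_all() are hypothetical analogs of the accounting added to zil_lwb_write_issue(), zil_lwb_flush_vdevs_done(), and zil_lwb_flush_wait_all().

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define TXG_SIZE 4			/* same power-of-two ring as ZFS */
#define TXG_MASK (TXG_SIZE - 1)

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
static uint64_t inflight[TXG_SIZE];	/* I/Os issued but not yet done */

/* Issuer side: count the I/O before it is handed off. */
static void
issue_start(uint64_t txg)
{
	pthread_mutex_lock(&io_lock);
	inflight[txg & TXG_MASK]++;
	pthread_mutex_unlock(&io_lock);
}

/* Completion side: drop the count and wake the waiting thread. */
static void
issue_done(uint64_t txg)
{
	pthread_mutex_lock(&io_lock);
	if (--inflight[txg & TXG_MASK] == 0)
		pthread_cond_broadcast(&io_cv);
	pthread_mutex_unlock(&io_lock);
}

/* Sync side: block until every I/O counted against txg has completed. */
static void
wait_all(uint64_t txg)
{
	pthread_mutex_lock(&io_lock);
	while (inflight[txg & TXG_MASK] > 0)
		pthread_cond_wait(&io_cv, &io_lock);
	pthread_mutex_unlock(&io_lock);
}

static void *
completion(void *arg)
{
	uint64_t txg = (uint64_t)(uintptr_t)arg;

	usleep(1000);		/* stand-in for the lwb write + flush zios */
	issue_done(txg);
	return (NULL);
}

int
main(void)
{
	pthread_t tid[8];
	uint64_t txg = 42;

	/* Count each I/O before it is in flight, as the issuer does. */
	for (int i = 0; i < 8; i++) {
		issue_start(txg);
		pthread_create(&tid[i], NULL, completion,
		    (void *)(uintptr_t)txg);
	}

	wait_all(txg);		/* as zil_sync() now does before cleaning */
	printf("txg %llu drained\n", (unsigned long long)txg);

	for (int i = 0; i < 8; i++)
		pthread_join(tid[i], NULL);
	return (0);
}

The ordering is the load-bearing part: the counter is incremented before the I/O is issued, so the waiter can never observe a transient zero for work that has already been committed, and cv_broadcast (rather than signal) is safe if several threads ever wait on the same drained txg.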