From 6ad07c0dc6717c8d0b0bc64ff7f39292c87beaab Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 1 Mar 2024 15:48:53 +1100 Subject: [PATCH 1/5] zts: test for correct fsync() response to ZIL flush failure If fsync() (zil_commit()) writes successfully, but then the flush fails, fsync() should not return success, but instead should fall into a full transaction wait. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- tests/runfiles/linux.run | 4 + tests/test-runner/bin/zts-report.py.in | 1 + tests/zfs-tests/include/blkdev.shlib | 9 +- tests/zfs-tests/tests/Makefile.am | 3 + .../tests/functional/flush/cleanup.ksh | 28 ++ .../tests/functional/flush/setup.ksh | 30 ++ .../functional/flush/zil_flush_error.ksh | 259 ++++++++++++++++++ 7 files changed, 331 insertions(+), 3 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/flush/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/flush/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 5817e649003c..4a3fcd2cbec8 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -124,6 +124,10 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'scrub_after_resilver', 'suspend_resume_single', 'zpool_status_-s'] tags = ['functional', 'fault'] +[tests/functional/flush:Linux] +tests = ['zil_flush_error'] +tags = ['functional', 'flush'] + [tests/functional/features/large_dnode:Linux] tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos'] tags = ['functional', 'features', 'large_dnode'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 1177e80e1a75..98e6c994f348 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -379,6 +379,7 @@ if os.environ.get('CI') == 'true': 'fault/auto_spare_ashift': ['SKIP', 
ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], 'fault/suspend_resume_single': ['SKIP', ci_reason], + 'flush/zil_flush_error': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], }) diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index 51eff3023e73..bd8557c94b34 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -462,13 +462,16 @@ function unload_scsi_debug # Get scsi_debug device name. # Returns basename of scsi_debug device (for example "sdb"). # -function get_debug_device +# $1 (optional): Return the first $1 number of SCSI debug device names. +function get_debug_device #num { + typeset num=${1:-1} + for i in {1..10} ; do - val=$(lsscsi | awk '/scsi_debug/ {print $6; exit}' | cut -d/ -f3) + val=$(lsscsi | awk '/scsi_debug/ {print $6}' | cut -d/ -f3 | head -n$num) # lsscsi can take time to settle - if [ "$val" != "-" ] ; then + if [[ ! "$val" =~ "-" ]] ; then break fi sleep 1 diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bbeabc6dfb42..34a5a0f0756f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1516,6 +1516,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ + functional/flush/cleanup.ksh \ + functional/flush/zil_flush_error.ksh \ + functional/flush/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/flush/cleanup.ksh b/tests/zfs-tests/tests/functional/flush/cleanup.ksh new file mode 100755 index 000000000000..4eb59574e4ec --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/cleanup.ksh @@ -0,0 +1,28 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject 
to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2024, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/flush/setup.ksh b/tests/zfs-tests/tests/functional/flush/setup.ksh new file mode 100755 index 000000000000..94a3936ce2cd --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/setup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2024, Klara, Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_pass diff --git a/tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh b/tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh new file mode 100755 index 000000000000..e053c5d3bac6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh @@ -0,0 +1,259 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara, Inc. +# + +# +# This tests that if the ZIL write sequence fails, it correctly falls back and +# waits until the transaction has fully committed before returning. +# +# When this test was written, the ZIL has a flaw - it assumes that if its +# writes succeed, then the data is definitely on disk and available for replay +# if the pool fails. It issues a flush immediately after the write, but does +# not check its result. If a disk fails after the data has been accepted into +# the disk cache, but before it can be written to permanent storage, then +# fsync() will have returned success even though the data is not stored in the +# ZIL for replay. 
+# +# If the main pool then fails before the transaction can be written, then data +# is lost, and fsync() returning success was premature. +# +# To prove this, we create a pool with a separate log device. We inject two +# faults: +# +# - ZIL writes appear to succeed, but never make it disk +# - ZIL flushes fail, and return error +# +# We then remove the main pool device, and do a write+fsync. This goes to the +# ZIL, and appears to succeed. When the txg closes, the write will fail, and +# the pool suspends. +# +# Then, we simulate a reboot by copying the content of the pool devices aside. +# We restore the pool devices, bring it back online, and export it - we don't +# need it anymore, but we have to clean up properly. Then we restore the copied +# content and import the pool, in whatever state it was in when it suspended. +# +# Finally, we check the content of the file we wrote to. If it matches what we +# wrote, then the fsync() was correct, and all is well. If it doesn't match, +# then the flaw is present, and the test fails. +# +# We run the test twice: once without the log device injections, one with. The +# first confirms the expected behaviour of the ZIL - when the pool is imported, +# the log is replayed. The second fails as above. When the flaw is corrected, +# both will succeed, and this overall test succeeds. +# + +. $STF_SUITE/include/libtest.shlib + +TMPDIR=${TMPDIR:-$TEST_BASE_DIR} + +BACKUP_MAIN="$TMPDIR/backup_main" +BACKUP_LOG="$TMPDIR/backup_log" + +LOOP_LOG="$TMPDIR/loop_log" + +DATA_FILE="$TMPDIR/data_file" + +verify_runnable "global" + +function cleanup +{ + zinject -c all + destroy_pool $TESTPOOL + unload_scsi_debug + rm -f $BACKUP_MAIN $BACKUP_LOG $DATA_FILE +} + +log_onexit cleanup + +log_assert "verify fsync() waits if the ZIL commit fails" + +# create 128K of random data, and take its checksum. 
we do this up front to +# ensure we don't get messed up by any latency from reading /dev/random or +# checksumming the file on the pool +log_must dd if=/dev/random of=$DATA_FILE bs=128K count=1 +typeset sum=$(sha256digest $DATA_FILE) + +# create a virtual scsi device with two device nodes. these are backed by the +# same memory. we do this because we need to be able to take the device offline +# properly in order to get the pool to suspend; fault injection on a loop +# device can't do it. once offline, we can use the second node to take a copy +# of its state. +load_scsi_debug 100 1 2 1 '512b' +set -A sd $(get_debug_device 2) + +# create a loop device for the log. +truncate -s 100M $LOOP_LOG +typeset ld=$(basename $(losetup -f)) +log_must losetup /dev/$ld $LOOP_LOG + +# this function runs the entire test sequence. the option decides if faults +# are injected on the slog device, mimicking the trigger situation that causes +# the fsync() bug to occur +function test_fsync +{ + typeset -i do_fault_log="$1" + + log_note "setting up test" + + # create the pool. the main data store is on the scsi device, with the + # log on a loopback. we bias the ZIL towards the log device to try + # to ensure that fsync() never involves the main device + log_must zpool create -f -O logbias=latency $TESTPOOL ${sd[0]} log $ld + + # create the file ahead of time. the ZIL head structure is created on + # first use, and does a full txg wait, which we need to avoid + log_must dd if=/dev/zero of=/$TESTPOOL/data_file \ + bs=128k count=1 conv=fsync + log_must zpool sync + + # arrange for writes to the log device to appear to succeed, and + # flushes to fail. 
this simulates a loss of the device between it + # accepting the write into its cache, but before it can be written + # out + if [[ $do_fault_log != 0 ]] ; then + log_note "injecting log device faults" + log_must zinject -d $ld -e noop -T write $TESTPOOL + log_must zinject -d $ld -e io -T flush $TESTPOOL + fi + + # take the main device offline. there is no IO in flight, so ZFS won't + # notice immediately + log_note "taking main pool offline" + log_must eval "echo offline > /sys/block/${sd[0]}/device/state" + + # write out some data, then call fsync(). there are three possible + # results: + # + # - if the bug is present, fsync() will return success, and dd will + # succeed "immediately"; before the pool suspends + # - if the bug is fixed, fsync() will block, the pool will suspend, and + # dd will return success after the pool returns to service + # - if something else goes wrong, dd will fail; this may happen before + # or after the pool suspends or returns. this shouldn't happen, and + # should abort the test + # + # we have to put dd in the background, otherwise if it blocks we will + # block with it. what we're interested in is whether or not it succeeds + # before the pool is suspended. if it does, then we expect that after + # the suspended pool is reimported, the data will have been written + log_note "running dd in background to write data and call fsync()" + dd if=$DATA_FILE of=/$TESTPOOL/data_file bs=128k count=1 conv=fsync & + fsync_pid=$! + + # wait for the pool to suspend. this should happen within ~5s, when the + # txg sync tries to write the change to the main device + log_note "waiting for pool to suspend" + typeset -i tries=10 + until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool didn't suspend" + fi + sleep 1 + done + + # the pool is suspended. see if dd is still present; if it is, then + # it's blocked in fsync(), and we have no expectation that the write + # made it to disk. 
if dd has exited, then its return code will tell + # us whether fsync() returned success, or it failed for some other + # reason + typeset -i fsync_success=0 + if kill -0 $fsync_pid ; then + log_note "dd is blocked; fsync() has not returned" + else + log_note "dd has finished, ensuring it was successful" + log_must wait $fsync_pid + fsync_success=1 + fi + + # pool is suspended. if we online the main device right now, it will + # retry writing the transaction, which will succeed, and everything will + # continue as it's supposed to. that's the opposite of what we want; we + # want to do an import, as if after reboot, to force the pool to try to + # replay the ZIL, so we can compare the final result against what + # fsync() told us + # + # however, right now the pool is wedged. we need to get it back online + # so we can export it, so we can do the import. so we need to copy the + # entire pool state away. for the scsi device, we can do this through + # the second device node. for the loopback, we can copy it directly + log_note "taking copy of suspended pool" + log_must cp /dev/${sd[1]} $BACKUP_MAIN + log_must cp /dev/$ld $BACKUP_LOG + + # bring the entire pool back online, by clearing error injections and + # restoring the main device. this will unblock anything still waiting + # on it, and tidy up all the internals so we can reset it + log_note "bringing pool back online" + if [[ $do_fault_log != 0 ]] ; then + log_must zinject -c all + fi + log_must eval "echo running > /sys/block/${sd[0]}/device/state" + log_must zpool clear $TESTPOOL + + # now the pool is back online. if dd was blocked, it should now + # complete successfully. 
make sure that's true + if [[ $fsync_success == 0 ]] ; then + log_note "ensuring blocked dd has now finished" + log_must wait $fsync_pid + fi + + log_note "exporting pool" + + # pool now clean, export it + log_must zpool export $TESTPOOL + + log_note "reverting pool to suspended state" + + # restore the pool to the suspended state, mimicking a reboot + log_must cp $BACKUP_MAIN /dev/${sd[0]} + log_must cp $BACKUP_LOG /dev/$ld + + # import the crashed pool + log_must zpool import $TESTPOOL + + # if fsync() succeeded before the pool suspended, then the ZIL should + # have replayed properly and the data is now available on the pool + # + # note that we don't check the alternative; fsync() blocking does not + # mean that data _didn't_ make it to disk, just the ZFS never claimed + # that it did. in that case we can't know what _should_ be on disk + # right now, so can't check + if [[ $fsync_success == 1 ]] ; then + log_note "fsync() succeeded earlier; checking data was written correctly" + typeset newsum=$(sha256digest /$TESTPOOL/data_file) + log_must test "$sum" = "$newsum" + fi + + log_note "test finished, cleaning up" + log_must zpool destroy -f $TESTPOOL +} + +log_note "first run: ZIL succeeds, and repairs the pool at import" +test_fsync 0 + +log_note "second run: ZIL commit fails, and falls back to txg sync" +test_fsync 1 + +log_pass "fsync() waits if the ZIL commit fails" From e7966e581a4ed8462900cd549c4ee626f9e8c8f0 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 9 May 2024 11:50:58 +1000 Subject: [PATCH 2/5] zio_flush: propagate flush errors to the ZIL Since the beginning, ZFS' "flush" operation has always ignored errors[1]. Write errors are captured and dealt with, but if a write succeeds but the subsequent flush fails, the operation as a whole will appear to succeed[2]. In the end-of-transaction uberblock+label write+flush ceremony, it's very difficult for this situation to occur. 
Since all devices are written to, typically the first write will succeed, the first flush will fail unobserved, but then the second write will fail, and the entire transaction is aborted. It's difficult to imagine a real-world scenario where all the writes in that sequence could succeed even as the flushes are failing (understanding that the OS is still seeing hardware problems and taking devices offline). In the ZIL however, it's another story. Since only the write response is checked, if that write succeeds but the flush then fails, the ZIL will believe that it succeeds, and zil_commit() (and thus fsync()) will return success rather than the "correct" behaviour of falling back into txg_wait_synced()[3]. This commit fixes this by adding a simple flag to zio_flush() to indicate whether or not the caller wants to receive flush errors. This flag is enabled for ZIL calls. The existing zio chaining inside the ZIL and the flush handler zil_lwb_flush_vdevs_done() already has all the necessary support to properly handle a flush failure and fail the entire zio chain. This causes zil_commit() to correctly fall back to txg_wait_synced() rather than returning success prematurely. 1. The ZFS birth commit (illumos/illumos-gate@fa9e4066f0) had support for flushing devices with write caches with the DKIOCFLUSHWRITECACHE ioctl. No errors are checked. The comment in `zil_flush_vdevs()` from the time shows the thinking: /* * Wait for all the flushes to complete. Not all devices actually * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. */ 2. It's not entirely clear from the code history why this was acceptable for devices that _do_ have write caches. Our best guess is that this was an oversight: between the combination of hardware, pool topology and application behaviour required to hit this, it basically didn't come up. 3. Somewhat frustratingly, zil.c contains comments describing this exact behaviour, and further discussion in #12443 (September 2021). 
It appears that those involved saw the potential, but were looking at a different problem and so didn't have the context to recognise it for what it was. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- include/sys/zio.h | 2 +- module/zfs/vdev_label.c | 14 ++++++++------ module/zfs/vdev_raidz.c | 6 +++--- module/zfs/zil.c | 20 +++----------------- module/zfs/zio.c | 8 ++++---- 5 files changed, 19 insertions(+), 31 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 446b64ccd8ab..6492d17360f6 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -595,7 +595,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog); -extern void zio_flush(zio_t *zio, vdev_t *vd); +extern void zio_flush(zio_t *zio, vdev_t *vd, boolean_t propagate); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 47346dd5acff..468390b9acd3 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1830,19 +1830,21 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) for (int v = 0; v < svdcount; v++) { if (vdev_writeable(svd[v])) { - zio_flush(zio, svd[v]); + zio_flush(zio, svd[v], B_FALSE); } } if (spa->spa_aux_sync_uber) { spa->spa_aux_sync_uber = B_FALSE; for (int v = 0; v < spa->spa_spares.sav_count; v++) { if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) { - zio_flush(zio, spa->spa_spares.sav_vdevs[v]); + zio_flush(zio, spa->spa_spares.sav_vdevs[v], + B_FALSE); } } for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) { - zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]); + zio_flush(zio, spa->spa_l2cache.sav_vdevs[v], + B_FALSE); } } } @@ -2007,13 +2009,13 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, 
int flags) zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) - zio_flush(zio, vd); + zio_flush(zio, vd, B_FALSE); for (int i = 0; i < 2; i++) { if (!sav[i]->sav_label_sync) continue; for (int v = 0; v < sav[i]->sav_count; v++) - zio_flush(zio, sav[i]->sav_vdevs[v]); + zio_flush(zio, sav[i]->sav_vdevs[v], B_FALSE); if (l == 1) sav[i]->sav_label_sync = B_FALSE; } @@ -2091,7 +2093,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) for (vdev_t *vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) - zio_flush(zio, vd); + zio_flush(zio, vd, B_FALSE); (void) zio_wait(zio); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 15c8b8ca6016..187d3908ff50 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4172,7 +4172,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); - zio_flush(pio, raidvd); + zio_flush(pio, raidvd, B_FALSE); zio_wait(pio); zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", @@ -4231,7 +4231,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); - zio_flush(pio, raidvd); + zio_flush(pio, raidvd, B_FALSE); zio_wait(pio); zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", @@ -4339,7 +4339,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) } zio_wait(pio); pio = zio_root(spa, NULL, NULL, 0); - zio_flush(pio, raidvd); + zio_flush(pio, raidvd, B_FALSE); zio_wait(pio); zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 3983da6aa424..f451c170fcec 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. 
+ * Copyright (c) 2024, Klara, Inc. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1495,12 +1496,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) * includes ZIO errors from either this LWB's write or * flush, as well as any errors from other dependent LWBs * (e.g. a root LWB ZIO that might be a child of this LWB). - * - * With that said, it's important to note that LWB flush - * errors are not propagated up to the LWB root ZIO. - * This is incorrect behavior, and results in VDEV flush - * errors not being handled correctly here. See the - * comment above the call to "zio_flush" for details. */ zcw->zcw_zio_error = zio->io_error; @@ -1650,17 +1645,8 @@ zil_lwb_write_done(zio_t *zio) while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL) { - /* - * The "ZIO_FLAG_DONT_PROPAGATE" is currently - * always used within "zio_flush". This means, - * any errors when flushing the vdev(s), will - * (unfortunately) not be handled correctly, - * since these "zio_flush" errors will not be - * propagated up to "zil_lwb_flush_vdevs_done". - */ - zio_flush(lwb->lwb_root_zio, vd); - } + if (vd != NULL) + zio_flush(lwb->lwb_root_zio, vd, B_TRUE); kmem_free(zv, sizeof (*zv)); } } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1f3acb9b921e..43e4e7ff15c8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1633,10 +1633,10 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, * the flushes complete. */ void -zio_flush(zio_t *pio, vdev_t *vd) +zio_flush(zio_t *pio, vdev_t *vd, boolean_t propagate) { - const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY; + const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | + (propagate ? 
0 : ZIO_FLAG_DONT_PROPAGATE); if (vd->vdev_nowritecache) return; @@ -1647,7 +1647,7 @@ zio_flush(zio_t *pio, vdev_t *vd) NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE)); } else { for (uint64_t c = 0; c < vd->vdev_children; c++) - zio_flush(pio, vd->vdev_child[c]); + zio_flush(pio, vd->vdev_child[c], propagate); } } From 2331d19dab7c0ea9bd8f895f9a652bf02535a8d7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 1 Jul 2024 11:19:16 +1000 Subject: [PATCH 3/5] flush: don't report flush error when disabling flush support The first time a device returns ENOTSUP in response to a flush request, we set vdev_nowritecache so we don't issue flushes in the future and instead just pretend they succeeded. However, we still return an error for the initial flush, even though we just decided such errors are meaningless! So, when setting vdev_nowritecache in response to a flush error, also reset the error code to assume success. Along the way, it seems there's no good reason for vdev_disk & vdev_geom to explicitly detect no support for flush and set vdev_nowritecache; just letting the error through to zio_vdev_io_assess() will cause it all to fall out nicely. So remove those checks. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- module/os/freebsd/zfs/vdev_geom.c | 15 --------------- module/os/linux/zfs/vdev_disk.c | 3 --- module/zfs/zio.c | 7 +++++-- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index b7ff1063b089..7aaa42bfb1a8 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1014,21 +1014,6 @@ vdev_geom_io_intr(struct bio *bp) zio->io_error = SET_ERROR(EIO); switch (zio->io_error) { - case ENOTSUP: - /* - * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know - * that future attempts will never succeed. In this case - * we set a persistent flag so that we don't bother with - * requests in the future. 
- */ - switch (bp->bio_cmd) { - case BIO_FLUSH: - vd->vdev_nowritecache = B_TRUE; - break; - case BIO_DELETE: - break; - } - break; case ENXIO: if (!vd->vdev_remove_wanted) { /* diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index e69c5f3841ec..2c963bb05c80 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1232,9 +1232,6 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) zio->io_error = -error; #endif - if (zio->io_error && (zio->io_error == EOPNOTSUPP)) - zio->io_vd->vdev_nowritecache = B_TRUE; - bio_put(bio); ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 43e4e7ff15c8..7af3eb922d7b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4532,11 +4532,14 @@ zio_vdev_io_assess(zio_t *zio) /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent - * boolean flag so that we don't bother with it in the future. + * boolean flag so that we don't bother with it in the future, and + * then we act like the flush succeeded. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) + zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) { vd->vdev_nowritecache = B_TRUE; + zio->io_error = 0; + } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; From 8a09462bce67c2f7bce398fb846ee2780559a648 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 27 Jun 2024 09:47:57 +1000 Subject: [PATCH 4/5] ZTS: ZIL txg sync fallback test If the pool is degraded, ZIL writes should still succeed without falling back to a full txg sync. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Rob Norris --- tests/runfiles/linux.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/flush/zil_flush_fallback.ksh | 89 +++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) create mode 100755 tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4a3fcd2cbec8..f5485093e0a9 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -125,7 +125,7 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', tags = ['functional', 'fault'] [tests/functional/flush:Linux] -tests = ['zil_flush_error'] +tests = ['zil_flush_error', 'zil_flush_fallback'] tags = ['functional', 'flush'] [tests/functional/features/large_dnode:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 34a5a0f0756f..d23316abfbe4 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1518,6 +1518,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/features/large_dnode/setup.ksh \ functional/flush/cleanup.ksh \ functional/flush/zil_flush_error.ksh \ + functional/flush/zil_flush_fallback.ksh \ functional/flush/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh b/tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh new file mode 100755 index 000000000000..37f4dbcecaab --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh @@ -0,0 +1,89 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara, Inc. +# + +# +# This tests that when a pool is degraded, ZIL writes/flushes are still done +# directly, rather than falling back to a full txg flush. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +function cleanup { + default_cleanup_noexit +} + +log_onexit cleanup + +log_assert "ZIL writes go direct to the pool even when the pool is degraded" + +function zil_commit_count { + kstat zil | grep ^zil_commit_count | awk '{ print $3 }' +} +function zil_commit_error_count { + kstat zil | grep ^zil_commit_error_count | awk '{ print $3 }' +} + +DISK1=${DISKS%% *} +log_must default_mirror_setup_noexit $DISKS + +# get the current count of commits vs errors +typeset -i c1=$(zil_commit_count) +typeset -i e1=$(zil_commit_error_count) + +# force a single ZIL commit +log_must dd if=/dev/zero of=/$TESTPOOL/file bs=128k count=1 conv=fsync + +# get the updated count of commits vs errors +typeset -i c2=$(zil_commit_count) +typeset -i e2=$(zil_commit_error_count) + +# degrade the pool +log_must zpool offline -f $TESTPOOL $DISK1 +log_must wait_for_degraded $TESTPOOL + +# force another ZIL commit +log_must dd if=/dev/zero of=/$TESTPOOL/file bs=128k count=1 conv=fsync + +# get counts again +typeset -i c3=$(zil_commit_count) +typeset -i e3=$(zil_commit_error_count) + +# repair the pool +log_must zpool online $TESTPOOL $DISK1 + +# when pool is in good health, a ZIL commit should go direct to the +# pool and not fall back to a txg sync +log_must test $(( $c2 - 
$c1 )) -eq 1 +log_must test $(( $e2 - $e1 )) -eq 0 + +# when pool is degraded but still writeable, ZIL should still go direct +# to the pool and not fall back +log_must test $(( $c3 - $c2 )) -eq 1 +log_must test $(( $e3 - $e2 )) -eq 0 + +log_pass "ZIL writes go direct to the pool even when the pool is degraded" From e35088c93f9fab2cedb383aafbc46277a8a619ae Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 29 Aug 2023 18:24:00 +1000 Subject: [PATCH 5/5] ZIO: add "vdev tracing" facility; use it for ZIL flushing A problem with zio_flush() is that it issues a flush ZIO to a top-level vdev, which then recursively issues child flush ZIOs until the real leaf devices are flushed. As usual, an error in a child ZIO results in the parent ZIO erroring too, so if a leaf device has failed, its flush ZIO will fail, and so will the entire flush operation. This didn't matter when we used to ignore flush errors, but now that we propagate them, the flush error propagates into the ZIL write ZIO. This causes the ZIL to believe its write failed, and fall back to a full txg wait. This still provides correct behaviour for zil_commit() callers (eg fsync()) but it ruins performance. We cannot simply skip flushing failed vdevs, because the associated write may have succeeded before the vdev failed, which would give the appearance the write is fully flushed when it is not. Neither can we issue a "syncing write" to the device (eg SCSI FUA), as this also degrades performance. The answer is that we must bind writes and flushes together in a way such that we only flush the physical devices that we wrote to. This adds a "vdev tracing" facility to ZIOs, zio_vdev_trace. When enabled on a ZIO with ZIO_FLAG_VDEV_TRACE, then upon successful completion (in the _done handler), zio->io_vdev_trace_tree will have a list of zio_vdev_trace_t objects that each describe a vdev that was involved in the successful completion of the ZIO.
A companion function, zio_vdev_trace_flush(), is included, that issues a flush ZIO to the child vdevs on the given trace tree. zil_lwb_write_done() is updated to use this to bind ZIL writes and flushes together. The tracing facility is similar in many ways to the "deferred flushing" facility inside the ZIL, to the point where it can replace it. Now, if the flush should be deferred, the trace records from the writing ZIO are captured and combined with any captured from previous writes. When its finally time to issue the flush, we issue it to the entire accumulated set of traced vdevs. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- include/sys/zil.h | 2 +- include/sys/zil_impl.h | 9 --- include/sys/zio.h | 22 +++++- module/zfs/dmu.c | 19 ++--- module/zfs/zil.c | 162 +++++++++++++++++------------------------ module/zfs/zio.c | 147 +++++++++++++++++++++++++++++++++++++ 6 files changed, 243 insertions(+), 118 deletions(-) diff --git a/include/sys/zil.h b/include/sys/zil.h index 384678b223d5..0f0f9327af0f 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -604,7 +604,7 @@ extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); extern int zil_suspend(const char *osname, void **cookiep); extern void zil_resume(void *cookie); -extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp); +extern void zil_lwb_add_flush(struct lwb *lwb, zio_t *zio); extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg); extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 9a34bafc1c77..74c205a8f651 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -172,15 +172,6 @@ typedef struct itx_async_node { avl_node_t ia_node; /* AVL tree linkage */ } itx_async_node_t; -/* - * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs - * we've touched so we know which ones need a write cache flush at the end. 
- */ -typedef struct zil_vdev_node { - uint64_t zv_vdev; /* vdev to be flushed */ - avl_node_t zv_node; /* AVL tree linkage */ -} zil_vdev_node_t; - #define ZIL_BURSTS 8 /* diff --git a/include/sys/zio.h b/include/sys/zio.h index 6492d17360f6..1aa6bd533795 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -180,16 +180,17 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_SCRUB (1ULL << 4) #define ZIO_FLAG_SCAN_THREAD (1ULL << 5) #define ZIO_FLAG_PHYSICAL (1ULL << 6) +#define ZIO_FLAG_VDEV_TRACE (1ULL << 7) #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ -#define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */ -#define ZIO_FLAG_SPECULATIVE (1ULL << 8) -#define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) -#define ZIO_FLAG_DONT_RETRY (1ULL << 10) +#define ZIO_FLAG_CANFAIL (1ULL << 8) /* must be first for INHERIT */ +#define ZIO_FLAG_SPECULATIVE (1ULL << 9) +#define ZIO_FLAG_CONFIG_WRITER (1ULL << 10) +#define ZIO_FLAG_DONT_RETRY (1ULL << 11) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) @@ -445,6 +446,11 @@ enum zio_qstate { ZIO_QS_ACTIVE, }; +typedef struct zio_vdev_trace { + uint64_t zvt_guid; + avl_node_t zvt_node; +} zio_vdev_trace_t; + struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -513,6 +519,7 @@ struct zio { int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + avl_tree_t io_vdev_trace_tree; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; @@ -636,6 +643,13 @@ extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); +extern void zio_vdev_trace_init(avl_tree_t *t); +extern void zio_vdev_trace_fini(avl_tree_t *t); +extern void zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst); +extern void zio_vdev_trace_move(avl_tree_t 
*src, avl_tree_t *dst); +extern void zio_vdev_trace_flush(zio_t *zio, avl_tree_t *t); +extern void zio_vdev_trace_empty(avl_tree_t *t); + extern void zio_change_priority(zio_t *pio, zio_priority_t priority); extern void zio_checksum_verified(zio_t *zio); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index b3eda8ea5097..85e3a549e616 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1806,12 +1806,11 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) zgd_t *zgd = dsa->dsa_zgd; /* - * Record the vdev(s) backing this blkptr so they can be flushed after - * the writes for the lwb have completed. + * Capture the trace records for this zio so the vdevs can be flushed + * after the writes for the lwb have completed. */ - if (zio->io_error == 0) { - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); - } + if (zio->io_error == 0) + zil_lwb_add_flush(zgd->zgd_lwb, zio); mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); @@ -1865,10 +1864,10 @@ dmu_sync_late_arrival_done(zio_t *zio) if (zio->io_error == 0) { /* - * Record the vdev(s) backing this blkptr so they can be + * Capture the trace records for this zio so the vdevs can be * flushed after the writes for the lwb have completed. 
*/ - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + zil_lwb_add_flush(zgd->zgd_lwb, zio); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; @@ -1955,7 +1954,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, - dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + dsa, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, zb)); return (0); } @@ -2122,7 +2122,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, + &zb)); return (0); } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index f451c170fcec..ae1a4f7ec01d 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -798,15 +798,6 @@ zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, } } -static int -zil_lwb_vdev_compare(const void *x1, const void *x2) -{ - const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; - const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - - return (TREE_CMP(v1, v2)); -} - /* * Allocate a new lwb. We may already have a block pointer for it, in which * case we get size and version from there. 
Or we may not yet, in which case @@ -1369,14 +1360,8 @@ zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) } void -zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) +zil_lwb_add_flush(struct lwb *lwb, zio_t *zio) { - avl_tree_t *t = &lwb->lwb_vdev_tree; - avl_index_t where; - zil_vdev_node_t *zv, zvsearch; - int ndvas = BP_GET_NDVAS(bp); - int i; - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); @@ -1384,53 +1369,10 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) return; mutex_enter(&lwb->lwb_vdev_lock); - for (i = 0; i < ndvas; i++) { - zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); - if (avl_find(t, &zvsearch, &where) == NULL) { - zv = kmem_alloc(sizeof (*zv), KM_SLEEP); - zv->zv_vdev = zvsearch.zv_vdev; - avl_insert(t, zv, where); - } - } + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree); mutex_exit(&lwb->lwb_vdev_lock); } -static void -zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) -{ - avl_tree_t *src = &lwb->lwb_vdev_tree; - avl_tree_t *dst = &nlwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - - /* - * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does - * not need the protection of lwb_vdev_lock (it will only be modified - * while holding zilog->zl_lock) as its writes and those of its - * children have all completed. The younger 'nlwb' may be waiting on - * future writes to additional vdevs. - */ - mutex_enter(&nlwb->lwb_vdev_lock); - /* - * Tear down the 'lwb' vdev tree, ensuring that entries which do not - * exist in 'nlwb' are moved to it, freeing any would-be duplicates. 
- */ - while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { - avl_index_t where; - - if (avl_find(dst, zv, &where) == NULL) { - avl_insert(dst, zv, where); - } else { - kmem_free(zv, sizeof (*zv)); - } - } - mutex_exit(&nlwb->lwb_vdev_lock); -} - void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { @@ -1573,9 +1515,6 @@ zil_lwb_write_done(zio_t *zio) lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; - avl_tree_t *t = &lwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); @@ -1601,13 +1540,9 @@ zil_lwb_write_done(zio_t *zio) nlwb = NULL; mutex_exit(&zilog->zl_lock); - if (avl_numnodes(t) == 0) - return; - /* - * If there was an IO error, we're not going to call zio_flush() - * on these vdevs, so we simply empty the tree and free the - * nodes. We avoid calling zio_flush() since there isn't any + * If there was an IO error, there's no point continuing. We + * avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. * @@ -1617,38 +1552,78 @@ zil_lwb_write_done(zio_t *zio) * we expect any error seen here, to have been propagated to * that function). */ - if (zio->io_error != 0) { - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zv, sizeof (*zv)); + if (zio->io_error != 0 || zil_nocacheflush) { + /* + * Trace records may have been passed on to us from a + * previously-deferred lwb. Since we're not going to use them, + * we clean them up now. + */ + zio_vdev_trace_empty(&lwb->lwb_vdev_tree); return; } /* - * If this lwb does not have any threads waiting for it to complete, we - * want to defer issuing the flush command to the vdevs written to by - * "this" lwb, and instead rely on the "next" lwb to handle the flush - * command for those vdevs. 
Thus, we merge the vdev tree of "this" lwb - with the vdev tree of the "next" lwb in the list, and assume the - "next" lwb will handle flushing the vdevs (or deferring the flush(s) - again). + * This zio was issued with ZIO_FLAG_VDEV_TRACE, and so its + * io_vdev_trace_tree has the set of zio_vdev_trace_t records for vdevs + * that were written to by "this" lwb. * - * This is a useful performance optimization, especially for workloads + * To complete the commit, we need to issue a flush to those vdevs. If + * this lwb does not have any threads waiting for it to complete, we + * want to defer issuing that flush, and instead rely on the "next" lwb + * to handle the flush command for those vdevs. + * + * (This is a useful performance optimization, especially for workloads * with lots of async write activity and few sync write and/or fsync * activity, as it has the potential to coalesce multiple flush - * commands to a vdev into one. + * commands to a vdev into one). + * + * However, if this lwb does have threads waiting for it, we must issue + * the flush now. + * + * Regardless of whether or not we issue the flush now or defer it to + * the "next" lwb, our lwb_vdev_tree may also have entries on it that + * were deferred to us. + * + * So, our job here is to assemble a single tree of vdev trace records + * including all those that were written by this zio, plus any that + * were deferred to us, and either issue the flush now, or pass them on + * to the "next" lwb. + * + * The zio owns the entries on its trace tree, and it is no + * longer available to us after this function returns, so we make our + * own copies instead of just stealing them.
*/ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { - zil_lwb_flush_defer(lwb, nlwb); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); + ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + + /* + * While 'lwb' is at a point in its lifetime where + * lwb_vdev_tree does not need the protection of lwb_vdev_lock + * (it will only be modified while holding zilog->zl_lock) as + * its writes and those of its children have all completed. + * The younger 'nlwb' may be waiting on future writes to + * additional vdevs. + */ + mutex_enter(&nlwb->lwb_vdev_lock); + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, + &nlwb->lwb_vdev_tree); + + /* + * Also move any trace records from this lwb to the next. + */ + zio_vdev_trace_move(&lwb->lwb_vdev_tree, &nlwb->lwb_vdev_tree); + mutex_exit(&nlwb->lwb_vdev_lock); return; } - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { - vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL) - zio_flush(lwb->lwb_root_zio, vd, B_TRUE); - kmem_free(zv, sizeof (*zv)); - } + /* + * Fill out the trace tree with records, then issue the flush, + * delivering errors to zil_lwb_flush_vdevs_done(). + */ + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree); + zio_vdev_trace_flush(lwb->lwb_root_zio, &lwb->lwb_vdev_tree); + zio_vdev_trace_empty(&lwb->lwb_vdev_tree); } /* @@ -1931,8 +1906,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, - lwb, prio, ZIO_FLAG_CANFAIL, &zb); - zil_lwb_add_block(lwb, &lwb->lwb_blk); + lwb, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, &zb); if (lwb->lwb_slim) { /* For Slim ZIL only write what is used. 
*/ @@ -3773,8 +3747,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag) list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); - avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, - sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); + zio_vdev_trace_init(&lwb->lwb_vdev_tree); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } @@ -3785,7 +3758,7 @@ zil_lwb_dest(void *vbuf, void *unused) (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); - avl_destroy(&lwb->lwb_vdev_tree); + zio_vdev_trace_fini(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); } @@ -4373,7 +4346,6 @@ EXPORT_SYMBOL(zil_sync); EXPORT_SYMBOL(zil_clean); EXPORT_SYMBOL(zil_suspend); EXPORT_SYMBOL(zil_resume); -EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 7af3eb922d7b..5dbd8386a71b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -76,6 +76,7 @@ static int zio_deadman_log_all = B_FALSE; */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; +static kmem_cache_t *zio_vdev_trace_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) @@ -157,6 +158,8 @@ zio_init(void) sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_vdev_trace_cache = kmem_cache_create("zio_vdev_trace_cache", + sizeof (zio_vdev_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; @@ -285,6 +288,7 @@ zio_fini(void) 
VERIFY3P(zio_data_buf_cache[i], ==, NULL); } + kmem_cache_destroy(zio_vdev_trace_cache); kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); @@ -796,6 +800,59 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); + /* + * If all of the following are true: + * + * - the parent has requested vdev tracing + * - the child has just completed + * - the child was for a real vdev + * - the child is "interesting" for tracing purposes (see below) + * + * then we can stash some information about the vdev on the trace tree. + * + * "Interesting" means a vdev whose response was a direct contributor + * to the success of the overall zio; that is, we only consider zios + * that succeeded, and weren't something that was allowed or expected + * to fail (eg an aggregation padding write). + * + * This is important for the initial use case (knowing which vdevs were + * written to but not flushed), and is arguably correct for all cases + * (a vdev that returned an error, by definition, did not participate + * in completing the zio). It's necessary in practice because an + * error from a leaf does not necessarily mean its parent will error + * out too (eg raidz can sometimes compensate for failed writes). If + * some future case requires more complex filtering we can look at + * stashing more info into zio_vdev_trace_t.
+ * + */ + if (pio->io_flags & ZIO_FLAG_VDEV_TRACE && + wait == ZIO_WAIT_DONE && zio->io_vd != NULL && + ((zio->io_flags & (ZIO_FLAG_OPTIONAL | ZIO_FLAG_IO_REPAIR)) == 0)) { + avl_tree_t *t = &pio->io_vdev_trace_tree; + zio_vdev_trace_t *zvt, zvt_search; + avl_index_t where; + + if (zio->io_error == 0) { + zvt_search.zvt_guid = zio->io_vd->vdev_guid; + if (avl_find(t, &zvt_search, &where) == NULL) { + zvt = kmem_cache_alloc( + zio_vdev_trace_cache, KM_SLEEP); + zvt->zvt_guid = zio->io_vd->vdev_guid; + avl_insert(t, zvt, where); + } + } + + /* + * If the child has itself collected trace records, copy them + * to ours. Note that we can't steal them, as there may be + * multiple parents. + */ + if (zio->io_flags & ZIO_FLAG_VDEV_TRACE) { + zio_vdev_trace_copy(&zio->io_vdev_trace_tree, + &pio->io_vdev_trace_tree); + } + } + (*countp)--; if (*countp == 0 && pio->io_stall == countp) { @@ -882,6 +939,92 @@ zio_bookmark_compare(const void *x1, const void *x2) return (0); } +static int +zio_vdev_trace_compare(const void *x1, const void *x2) +{ + const uint64_t v1 = ((zio_vdev_trace_t *)x1)->zvt_guid; + const uint64_t v2 = ((zio_vdev_trace_t *)x2)->zvt_guid; + + return (TREE_CMP(v1, v2)); +} + +void +zio_vdev_trace_init(avl_tree_t *t) +{ + avl_create(t, zio_vdev_trace_compare, sizeof (zio_vdev_trace_t), + offsetof(zio_vdev_trace_t, zvt_node)); +} + +void +zio_vdev_trace_fini(avl_tree_t *t) +{ + ASSERT(avl_is_empty(t)); + avl_destroy(t); +} + +/* + * Copy trace records on src and add them to dst, skipping any that are already + * on dst. + */ +void +zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst) +{ + zio_vdev_trace_t *zvt, *nzvt; + avl_index_t where; + + for (zvt = avl_first(src); zvt != NULL; zvt = AVL_NEXT(src, zvt)) { + if (avl_find(dst, zvt, &where) == NULL) { + nzvt = kmem_cache_alloc(zio_vdev_trace_cache, KM_SLEEP); + nzvt->zvt_guid = zvt->zvt_guid; + avl_insert(dst, nzvt, where); + } + } +} + +/* Move trace records from src to dst. src will be empty upon return. 
*/ +void +zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst) +{ + zio_vdev_trace_t *zvt; + avl_index_t where; + void *cookie; + + cookie = NULL; + while ((zvt = avl_destroy_nodes(src, &cookie)) != NULL) { + if (avl_find(dst, zvt, &where) == NULL) + avl_insert(dst, zvt, where); + else + kmem_cache_free(zio_vdev_trace_cache, zvt); + } + + ASSERT(avl_is_empty(src)); +} + +void +zio_vdev_trace_flush(zio_t *pio, avl_tree_t *t) +{ + spa_t *spa = pio->io_spa; + zio_vdev_trace_t *zvt; + vdev_t *vd; + + for (zvt = avl_first(t); zvt != NULL; zvt = AVL_NEXT(t, zvt)) { + vd = vdev_lookup_by_guid(spa->spa_root_vdev, zvt->zvt_guid); + if (vd != NULL && vd->vdev_children == 0) + zio_flush(pio, vd, B_TRUE); + } +} + +void +zio_vdev_trace_empty(avl_tree_t *t) +{ + zio_vdev_trace_t *zvt; + void *cookie = NULL; + while ((zvt = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_cache_free(zio_vdev_trace_cache, zvt); + ASSERT(avl_is_empty(t)); +} + + /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) @@ -919,6 +1062,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); + zio_vdev_trace_init(&zio->io_vdev_trace_tree); + if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) @@ -984,6 +1129,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void zio_destroy(zio_t *zio) { + zio_vdev_trace_empty(&zio->io_vdev_trace_tree); + zio_vdev_trace_fini(&zio->io_vdev_trace_tree); metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list);