From 0ec67e81d52f3e06feb9f81702c1b117be82f128 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 18 Jul 2024 12:37:43 +1000 Subject: [PATCH 1/4] zvol_impl: document and tidy flags ZVOL_DUMPIFIED is a vestigial Solaris leftover, and not used anywhere. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- include/sys/zvol_impl.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 6c15c84b6bf4..777b1b93ee03 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -24,16 +24,9 @@ #include -#define ZVOL_RDONLY 0x1 -/* - * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which - * specifies whether or not the zvol _can_ be written to) - */ -#define ZVOL_WRITTEN_TO 0x2 - -#define ZVOL_DUMPIFIED 0x4 - -#define ZVOL_EXCL 0x8 +#define ZVOL_RDONLY (1<<0) /* zvol is readonly (writes rejected) */ +#define ZVOL_WRITTEN_TO (1<<1) /* zvol has been written to (needs flush) */ +#define ZVOL_EXCL (1<<2) /* zvol has O_EXCL client right now */ /* * The in-core state of each volume. From 57ba54285523e8c4d2f7d0560ac2c1ba9c6ad9a9 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 18 Jul 2024 13:13:44 +1000 Subject: [PATCH 2/4] linux/zvol_os: fix SET_ERROR with negative return codes SET_ERROR is our facility for tracking errors internally. The negation is to match the what the kernel expects from us. Thus, the negation should happen outside of the SET_ERROR. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- module/os/linux/zfs/zvol_os.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index c01caa6da8b4..ba6a24f312aa 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -730,7 +730,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) #endif if (zv == NULL) { rw_exit(&zvol_state_lock); - return (SET_ERROR(-ENXIO)); + return (-SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); @@ -795,10 +795,10 @@ zvol_open(struct block_device *bdev, fmode_t flag) #ifdef HAVE_BLKDEV_GET_ERESTARTSYS schedule(); - return (SET_ERROR(-ERESTARTSYS)); + return (-SET_ERROR(ERESTARTSYS)); #else if ((gethrtime() - start) > timeout) - return (SET_ERROR(-ERESTARTSYS)); + return (-SET_ERROR(ERESTARTSYS)); schedule_timeout_interruptible( MSEC_TO_TICK(10)); @@ -821,7 +821,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) if (zv->zv_open_count == 0) zvol_last_close(zv); - error = SET_ERROR(-EROFS); + error = -SET_ERROR(EROFS); } else { zv->zv_open_count++; } From 1fa03c6d2b6225e0f52c4688545c9d568b6893ac Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 18 Jul 2024 13:24:05 +1000 Subject: [PATCH 3/4] zvol: ensure device minors are properly cleaned up Currently, if a minor is in use when we try to remove it, we'll skip it and never come back to it again. Since the zvol state is hung off the minor in the kernel, this can get us into weird situations if something tries to use it after the removal fails. It's even worse at pool export, as there's now a vestigial zvol state with no pool under it. It's weirder again if the pool is subsequently reimported, as the zvol code (reasonably) assumes the zvol state has been properly setup, when it's actually left over from the previous import of the pool. This commit attempts to tackle that by setting a flag on the zvol if its minor can't be removed, and then checking that flag when a request is made and rejecting it, thus stopping new work coming in. The flag also causes a condvar to be signaled when the last client finishes. For the case where a single minor is being removed (eg changing volmode), it will wait for this signal before proceeding. Meanwhile, when removing all minors, a background task is created for each minor that couldn't be removed on the spot, and those tasks then wake and clean up. Since any new tasks are queued on to the pool's spa_zvol_taskq, spa_export_common() will continue to wait at export until all minors are removed. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- include/sys/zvol_impl.h | 5 ++ module/os/freebsd/zfs/zvol_os.c | 10 ++- module/os/linux/zfs/zvol_os.c | 15 +++++ module/zfs/zvol.c | 116 +++++++++++++++++++++++++------- 4 files changed, 121 insertions(+), 25 deletions(-) diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 777b1b93ee03..3cd0d78c353d 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -18,6 +18,9 @@ * * CDDL HEADER END */ +/* + * Copyright (c) 2024, Klara, Inc. + */ #ifndef _SYS_ZVOL_IMPL_H #define _SYS_ZVOL_IMPL_H @@ -27,6 +30,7 @@ #define ZVOL_RDONLY (1<<0) /* zvol is readonly (writes rejected) */ #define ZVOL_WRITTEN_TO (1<<1) /* zvol has been written to (needs flush) */ #define ZVOL_EXCL (1<<2) /* zvol has O_EXCL client right now */ +#define ZVOL_REMOVING (1<<3) /* zvol waiting to remove minor */ /* * The in-core state of each volume. @@ -50,6 +54,7 @@ typedef struct zvol_state { kmutex_t zv_state_lock; /* protects zvol_state_t */ atomic_t zv_suspend_ref; /* refcount for suspend */ krwlock_t zv_suspend_lock; /* suspend lock */ + kcondvar_t zv_removing_cv; /* ready to remove minor */ struct zvol_state_os *zv_zso; /* private platform state */ boolean_t zv_threading; /* volthreading property */ } zvol_state_t; diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 38e9debbe877..0d526484cec3 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -30,6 +30,7 @@ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2024, Klara, Inc. */ /* Portions Copyright 2011 Martin Matuska */ @@ -250,7 +251,7 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) } mutex_enter(&zv->zv_state_lock); - if (zv->zv_zso->zso_dying) { + if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); goto out_zv_locked; @@ -683,6 +684,11 @@ zvol_geom_bio_strategy(struct bio *bp) rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + if (zv->zv_flags & ZVOL_REMOVING) { + error = SET_ERROR(ENXIO); + goto resume; + } + switch (bp->bio_cmd) { case BIO_READ: doread = B_TRUE; @@ -1362,6 +1368,7 @@ zvol_os_free(zvol_state_t *zv) } mutex_destroy(&zv->zv_state_lock); + cv_destroy(&zv->zv_removing_cv); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); @@ -1419,6 +1426,7 @@ zvol_os_create_minor(const char *name) zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); zv->zv_hash = hash; mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_volmode = volmode; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index ba6a24f312aa..83f80f62aee7 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. */ #include @@ -526,6 +527,11 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, uint64_t size = io_size(bio, rq); int rw = io_data_dir(bio, rq); + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + END_IO(zv, bio, rq, -SET_ERROR(ENXIO)); + goto out; + } + if (zvol_request_sync || zv->zv_threading == B_FALSE) force_sync = 1; @@ -734,6 +740,13 @@ zvol_open(struct block_device *bdev, fmode_t flag) } mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zvol_state_lock); + return (-SET_ERROR(ENXIO)); + } + /* * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition @@ -1313,6 +1326,7 @@ zvol_alloc(dev_t dev, const char *name) list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); #ifdef HAVE_BLK_MQ zv->zv_zso->use_blk_mq = zvol_use_blk_mq; @@ -1438,6 +1452,7 @@ zvol_os_free(zvol_state_t *zv) ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); + cv_destroy(&zv->zv_removing_cv); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 5b6a3f5cb410..001f774a6d16 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -37,6 +37,7 @@ * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. */ /* @@ -894,6 +895,9 @@ zvol_resume(zvol_state_t *zv) */ atomic_dec(&zv->zv_suspend_ref); + if (zv->zv_flags & ZVOL_REMOVING) + cv_broadcast(&zv->zv_removing_cv); + return (SET_ERROR(error)); } @@ -929,6 +933,9 @@ zvol_last_close(zvol_state_t *zv) ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + if (zv->zv_flags & ZVOL_REMOVING) + cv_broadcast(&zv->zv_removing_cv); + zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); @@ -1221,6 +1228,41 @@ zvol_create_minor(const char *name) * Remove minors for specified dataset including children and snapshots. */ +/* + * Remove the minor for a given zvol. This will do it all: + * - flag the zvol for removal, so new requests are rejected + * - wait until outstanding requests are completed + * - remove it from lists + * - free it + * It's also usable as a taskq task, and smells nice too. + */ +static void +zvol_remove_minor_task(void *arg) +{ + zvol_state_t *zv = (zvol_state_t *)arg; + + ASSERT(!RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + + mutex_enter(&zv->zv_state_lock); + while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { + zv->zv_flags |= ZVOL_REMOVING; + cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock); + } + mutex_exit(&zv->zv_state_lock); + + rw_enter(&zvol_state_lock, RW_WRITER); + mutex_enter(&zv->zv_state_lock); + + zvol_remove(zv); + zvol_os_clear_private(zv); + + mutex_exit(&zv->zv_state_lock); + rw_exit(&zvol_state_lock); + + zvol_os_free(zv); +} + static void zvol_free_task(void *arg) { @@ -1233,11 +1275,13 @@ zvol_remove_minors_impl(const char *name) zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); taskqid_t t; - list_t free_list; + list_t delay_list, free_list; if (zvol_inhibit_dev) return; + list_create(&delay_list, sizeof (zvol_state_t), + offsetof(zvol_state_t, zv_next)); list_create(&free_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); @@ -1256,9 +1300,24 @@ zvol_remove_minors_impl(const char *name) * one is currently using this zv */ - /* If in use, leave alone */ + /* + * If in use, try to throw everyone off and try again + * later. + */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { + zv->zv_flags |= ZVOL_REMOVING; + t = taskq_dispatch( + zv->zv_objset->os_spa->spa_zvol_taskq, + zvol_remove_minor_task, zv, TQ_SLEEP); + if (t == TASKQID_INVALID) { + /* + * Couldn't create the task, so we'll + * do it in place once the loop is + * finished. + */ + list_insert_head(&delay_list, zv); + } mutex_exit(&zv->zv_state_lock); continue; } @@ -1285,7 +1344,11 @@ zvol_remove_minors_impl(const char *name) } rw_exit(&zvol_state_lock); - /* Drop zvol_state_lock before calling zvol_free() */ + /* Wait for zvols that we couldn't create a remove task for */ + while ((zv = list_remove_head(&delay_list)) != NULL) + zvol_remove_minor_task(zv); + + /* Free any that we couldn't free in parallel earlier */ while ((zv = list_remove_head(&free_list)) != NULL) zvol_os_free(zv); } @@ -1305,33 +1368,38 @@ zvol_remove_minor_impl(const char *name) zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); - if (strcmp(zv->zv_name, name) == 0) { - /* - * By holding zv_state_lock here, we guarantee that no - * one is currently using this zv - */ + if (strcmp(zv->zv_name, name) == 0) + /* Found, leave the the loop with zv_lock held */ + break; + mutex_exit(&zv->zv_state_lock); + } - /* If in use, leave alone */ - if (zv->zv_open_count > 0 || - atomic_read(&zv->zv_suspend_ref)) { - mutex_exit(&zv->zv_state_lock); - continue; - } - zvol_remove(zv); + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return; + } - zvol_os_clear_private(zv); - mutex_exit(&zv->zv_state_lock); - break; - } else { - mutex_exit(&zv->zv_state_lock); - } + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { + /* + * In use, so try to throw everyone off, then wait + * until finished. + */ + zv->zv_flags |= ZVOL_REMOVING; + mutex_exit(&zv->zv_state_lock); + rw_exit(&zvol_state_lock); + zvol_remove_minor_task(zv); + return; } - /* Drop zvol_state_lock before calling zvol_free() */ + zvol_remove(zv); + zvol_os_clear_private(zv); + + mutex_exit(&zv->zv_state_lock); rw_exit(&zvol_state_lock); - if (zv != NULL) - zvol_os_free(zv); + zvol_os_free(zv); } /* From a4d17566c2ed837816bd6c846e13aa009215b88c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 18 Jul 2024 19:53:35 +1000 Subject: [PATCH 4/4] ZTS: remove skips for zvol_misc tests Last commit should fix the underlying problem, so these should be passing reliably again. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- tests/test-runner/bin/zts-report.py.in | 4 ---- .../tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh | 9 --------- .../tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh | 9 --------- 3 files changed, 22 deletions(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index de06c7c6e2c1..422c750e2bc6 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -350,10 +350,6 @@ elif sys.platform.startswith('linux'): 'mmp/mmp_active_import': ['FAIL', known_reason], 'mmp/mmp_exported_import': ['FAIL', known_reason], 'mmp/mmp_inactive_import': ['FAIL', known_reason], - 'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872], - 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621], - 'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872], - 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], }) # Not all Github actions runners have scsi_debug module, so we may skip diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh index 619d8d0e8f07..9ebd5b149118 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh @@ -45,15 +45,6 @@ fi if ! is_linux ; then log_unsupported "Only linux supports dd with oflag=dsync for FUA writes" -else - if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then - log_unsupported "Disabled while issue #14872 is being worked" - fi - - # Disabled for the CentOS 9 kernel - if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then - log_unsupported "Disabled while issue #14872 is being worked" - fi fi typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)" diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh index c0b191aafd45..47cc42b9be7d 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh @@ -44,15 +44,6 @@ verify_runnable "global" if is_linux ; then - if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then - log_unsupported "Disabled while issue #14872 is being worked" - fi - - # Disabled for the CentOS 9 kernel - if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then - log_unsupported "Disabled while issue #14872 is being worked" - fi - # We need '--force' here since the prior tests may leave a filesystem # on the zvol, and blkdiscard will see that filesystem and print a # warning unless you force it.