From a0bd735adb1b1eb81fef10b4db102ee051c4d4ff Mon Sep 17 00:00:00 2001 From: Boris Protopopov Date: Sat, 22 Mar 2014 05:07:14 -0400 Subject: [PATCH] Add support for asynchronous zvol minor operations zfsonlinux issue #2217 - zvol minor operations: check snapdev property before traversing snapshots of a dataset zfsonlinux issue #3681 - lock order inversion between zvol_open() and dsl_pool_sync()...zvol_rename_minors() Create a per-pool zvol taskq for asynchronous zvol tasks. There are a few key design decisions to be aware of. * Each taskq must be single threaded to ensure tasks are always processed in the order in which they were dispatched. * There is a taskq per-pool in order to keep the pools independent. This way if one pool is suspended it will not impact another. * The preferred location to dispatch a zvol minor task is a sync task. In this context there is easy access to the spa_t and minimal error handling is required because the sync task must succeed. Support for asynchronous zvol minor operations addresses issue #3681. Signed-off-by: Boris Protopopov Signed-off-by: Brian Behlendorf Closes #2217 Closes #3678 Closes #3681 --- include/sys/spa_impl.h | 2 + include/sys/zvol.h | 15 +- lib/libzpool/kernel.c | 22 ++ module/zfs/dmu_objset.c | 4 + module/zfs/dmu_send.c | 3 + module/zfs/dsl_dataset.c | 36 +-- module/zfs/dsl_destroy.c | 8 +- module/zfs/dsl_dir.c | 6 +- module/zfs/spa.c | 37 ++- module/zfs/zfs_ioctl.c | 50 +--- module/zfs/zvol.c | 494 +++++++++++++++++++++++++++++---------- scripts/zconfig.sh | 19 +- 12 files changed, 482 insertions(+), 214 deletions(-) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 0bb6dccdc2f9..759c3472f548 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2016 Actifio, Inc. 
All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -253,6 +254,7 @@ struct spa { uint64_t spa_errata; /* errata issues detected */ spa_stats_t spa_stats; /* assorted spa statistics */ hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ + taskq_t *spa_zvol_taskq; /* Taskq for minor management */ /* * spa_refcount & spa_config_lock must be the last elements diff --git a/include/sys/zvol.h b/include/sys/zvol.h index 898e2352156b..c3e386f0b79e 100644 --- a/include/sys/zvol.h +++ b/include/sys/zvol.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #ifndef _SYS_ZVOL_H @@ -31,24 +32,22 @@ #define ZVOL_OBJ 1ULL #define ZVOL_ZAP_OBJ 2ULL -#ifdef _KERNEL +extern void zvol_create_minors(spa_t *spa, const char *name, boolean_t async); +extern void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async); +extern void zvol_rename_minors(spa_t *spa, const char *oldname, + const char *newname, boolean_t async); +#ifdef _KERNEL extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); extern boolean_t zvol_is_zvol(const char *); extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_create_minor(const char *name); -extern int zvol_create_minors(const char *name); -extern int zvol_remove_minor(const char *name); -extern void zvol_remove_minors(const char *name); -extern void zvol_rename_minors(const char *oldname, const char *newname); extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_set_volblocksize(const char *, uint64_t); -extern int zvol_set_snapdev(const char *, uint64_t); +extern int zvol_set_snapdev(const char *, zprop_source_t, uint64_t); extern int zvol_init(void); extern void zvol_fini(void); - #endif /* _KERNEL */ #endif /* _SYS_ZVOL_H */ diff 
--git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index a69a8da3aeeb..49d17ece3273 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include @@ -1354,3 +1355,24 @@ spl_fstrans_check(void) { return (0); } + +void +zvol_create_minors(spa_t *spa, const char *name, boolean_t async) +{ +} + +void +zvol_remove_minor(spa_t *spa, const char *name, boolean_t async) +{ +} + +void +zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) +{ +} + +void +zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, + boolean_t async) +{ +} diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index acfc7f048479..f9c534eb5736 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -26,6 +26,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
*/ /* Portions Copyright 2010 Robert Milkowski */ @@ -868,6 +869,8 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) } spa_history_log_internal_ds(ds, "create", tx, ""); + zvol_create_minors(dp->dp_spa, doca->doca_name, B_TRUE); + dsl_dataset_rele(ds, FTAG); dsl_dir_rele(pdd, FTAG); } @@ -961,6 +964,7 @@ dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) dsl_dataset_name(origin, namebuf); spa_history_log_internal_ds(ds, "clone", tx, "origin=%s (%llu)", namebuf, origin->ds_object); + zvol_create_minors(dp->dp_spa, doca->doca_clone, B_TRUE); dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 6585e4778de8..613770e10d33 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -24,6 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include @@ -54,6 +55,7 @@ #include #include #include +#include /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; @@ -2646,6 +2648,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; } drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; + zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE); /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode, diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index a5a9694fc14a..230027daf995 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -24,6 +24,7 @@ * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2016 Actifio, Inc. 
All rights reserved. */ #include @@ -1424,6 +1425,7 @@ dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) dsl_props_set_sync_impl(ds->ds_prev, ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); } + zvol_create_minors(dp->dp_spa, nvpair_name(pair), B_TRUE); dsl_dataset_rele(ds, FTAG); } } @@ -1498,16 +1500,6 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) fnvlist_free(suspended); } -#ifdef _KERNEL - if (error == 0) { - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - char *snapname = nvpair_name(pair); - zvol_create_minors(snapname); - } - } -#endif - return (error); } @@ -1930,6 +1922,8 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, VERIFY0(zap_add(dp->dp_meta_objset, dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname, + ddrsa->ddrsa_newsnapname, B_TRUE); dsl_dataset_rele(ds, FTAG); return (0); @@ -1958,11 +1952,6 @@ int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive) { -#ifdef _KERNEL - char *oldname, *newname; -#endif - int error; - dsl_dataset_rename_snapshot_arg_t ddrsa; ddrsa.ddrsa_fsname = fsname; @@ -1970,22 +1959,9 @@ dsl_dataset_rename_snapshot(const char *fsname, ddrsa.ddrsa_newsnapname = newsnapname; ddrsa.ddrsa_recursive = recursive; - error = dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, + return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, dsl_dataset_rename_snapshot_sync, &ddrsa, - 1, ZFS_SPACE_CHECK_RESERVED); - - if (error) - return (SET_ERROR(error)); - -#ifdef _KERNEL - oldname = kmem_asprintf("%s@%s", fsname, oldsnapname); - newname = kmem_asprintf("%s@%s", fsname, newsnapname); - zvol_rename_minors(oldname, newname); - strfree(newname); - strfree(oldname); -#endif - - return (0); + 1, ZFS_SPACE_CHECK_RESERVED)); } /* diff --git a/module/zfs/dsl_destroy.c 
b/module/zfs/dsl_destroy.c index d0015d1bd97b..d7c34c9a403e 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -23,6 +23,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2013 by Joyent, Inc. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include @@ -40,6 +41,7 @@ #include #include #include +#include typedef struct dmu_snapshots_destroy_arg { nvlist_t *dsda_snaps; @@ -243,9 +245,6 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) void dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) { -#ifdef ZFS_DEBUG - int err; -#endif spa_feature_t f; int after_branch_point = FALSE; dsl_pool_t *dp = ds->ds_dir->dd_pool; @@ -441,6 +440,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) #ifdef ZFS_DEBUG { uint64_t val; + int err; err = dsl_dataset_snap_lookup(ds_head, ds->ds_snapname, &val); @@ -490,6 +490,7 @@ dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx); + zvol_remove_minors(dp->dp_spa, nvpair_name(pair), B_TRUE); dsl_dataset_rele(ds, FTAG); } } @@ -889,6 +890,7 @@ dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); dsl_destroy_head_sync_impl(ds, tx); + zvol_remove_minors(dp->dp_spa, ddha->ddha_name, B_TRUE); dsl_dataset_rele(ds, FTAG); } diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 2c521e285baf..8983e0793f23 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -24,6 +24,7 @@ * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
*/ #include @@ -1909,9 +1910,8 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); -#ifdef _KERNEL - zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); -#endif + zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, + ddra->ddra_newname, B_TRUE); dsl_prop_notify_all(dd); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 34a317fbed3a..01048bfe9027 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -24,6 +24,7 @@ * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ /* @@ -1136,6 +1137,24 @@ spa_activate(spa_t *spa, int mode) avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); + + /* + * This taskq is used to perform zvol-minor-related tasks + * asynchronously. This has several advantages, including easy + * resolution of various deadlocks (zfsonlinux bug #3681). + * + * The taskq must be single threaded to ensure tasks are always + * processed in the order in which they were dispatched. + * + * A taskq per pool allows one to keep the pools independent. + * This way if one pool is suspended, it will not impact another. + * + * The preferred location to dispatch a zvol minor task is a sync + * task. In this context, there is easy access to the spa_t and minimal + * error handling is required because the sync task must succeed. 
+ */ + spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, + 1, INT_MAX, 0); } /* @@ -1154,6 +1173,11 @@ spa_deactivate(spa_t *spa) spa_evicting_os_wait(spa); + if (spa->spa_zvol_taskq) { + taskq_destroy(spa->spa_zvol_taskq); + spa->spa_zvol_taskq = NULL; + } + txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); @@ -3088,10 +3112,8 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, mutex_exit(&spa_namespace_lock); } -#ifdef _KERNEL if (firstopen) - zvol_create_minors(spa->spa_name); -#endif + zvol_create_minors(spa, spa_name(spa), B_TRUE); *spapp = spa; @@ -4211,10 +4233,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) mutex_exit(&spa_namespace_lock); spa_history_log_version(spa, "import"); - -#ifdef _KERNEL - zvol_create_minors(pool); -#endif + zvol_create_minors(spa, pool, B_TRUE); return (0); } @@ -4349,6 +4368,10 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); + if (spa->spa_zvol_taskq) { + zvol_remove_minors(spa, spa_name(spa), B_TRUE); + taskq_wait(spa->spa_zvol_taskq); + } mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 746a3f0fcb2d..5c84d238f245 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -29,6 +29,7 @@ * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
*/ /* @@ -1499,8 +1500,7 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc) int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); - if (error == 0) - zvol_remove_minors(zc->zc_name); + return (error); } @@ -1552,8 +1552,7 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); - if (error == 0) - zvol_remove_minors(zc->zc_name); + return (error); } @@ -2394,7 +2393,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_SNAPDEV: - err = zvol_set_snapdev(dsname, intval); + err = zvol_set_snapdev(dsname, source, intval); break; case ZFS_PROP_VERSION: { @@ -3188,12 +3187,6 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (error != 0) (void) dsl_destroy_head(fsname); } - -#ifdef _KERNEL - if (error == 0 && type == DMU_OST_ZVOL) - zvol_create_minors(fsname); -#endif - return (error); } @@ -3236,12 +3229,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (error != 0) (void) dsl_destroy_head(fsname); } - -#ifdef _KERNEL - if (error == 0) - zvol_create_minors(fsname); -#endif - return (error); } @@ -3304,11 +3291,6 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) error = dsl_dataset_snapshot(snaps, props, outnvl); -#ifdef _KERNEL - if (error == 0) - zvol_create_minors(poolname); -#endif - return (error); } @@ -3434,7 +3416,6 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { (void) zfs_unmount_snap(nvpair_name(pair)); - (void) zvol_remove_minor(nvpair_name(pair)); } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); @@ -3560,8 +3541,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc) err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); else err = dsl_destroy_head(zc->zc_name); - if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) - 
(void) zvol_remove_minor(zc->zc_name); + return (err); } @@ -4127,11 +4107,6 @@ zfs_ioc_recv(zfs_cmd_t *zc) } #endif -#ifdef _KERNEL - if (error == 0) - zvol_create_minors(tofs); -#endif - /* * On error, restore the original props. */ @@ -6032,16 +6007,16 @@ _init(void) return (error); } + if ((error = -zvol_init()) != 0) + return (error); + spa_init(FREAD | FWRITE); zfs_init(); - if ((error = -zvol_init()) != 0) - goto out1; - zfs_ioctl_init(); if ((error = zfs_attach()) != 0) - goto out2; + goto out; tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); @@ -6057,11 +6032,10 @@ _init(void) return (0); -out2: - (void) zvol_fini(); -out1: +out: zfs_fini(); spa_fini(); + (void) zvol_fini(); printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s" ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, error); @@ -6073,9 +6047,9 @@ static void __exit _fini(void) { zfs_detach(); - zvol_fini(); zfs_fini(); spa_fini(); + zvol_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 034cf6a6a95d..ab4d3ceb74a2 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include #include #include +#include #include #include @@ -81,6 +83,23 @@ typedef struct zvol_state { list_node_t zv_next; /* next zvol_state_t linkage */ } zvol_state_t; +typedef enum { + ZVOL_ASYNC_CREATE_MINORS, + ZVOL_ASYNC_REMOVE_MINORS, + ZVOL_ASYNC_RENAME_MINORS, + ZVOL_ASYNC_SET_SNAPDEV, + ZVOL_ASYNC_MAX +} zvol_async_op_t; + +typedef struct { + zvol_async_op_t op; + char pool[MAXNAMELEN]; + char name1[MAXNAMELEN]; + char name2[MAXNAMELEN]; + zprop_source_t source; + uint64_t snapdev; +} zvol_task_t; + #define ZVOL_RDONLY 0x1 /* @@ -977,6 +996,7 @@ zvol_first_open(zvol_state_t *zv) error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) { dmu_objset_disown(os, zvol_tag); + 
zv->zv_objset = NULL; goto out_mutex; } @@ -984,6 +1004,7 @@ zvol_first_open(zvol_state_t *zv) error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); if (error) { dmu_objset_disown(os, zvol_tag); + zv->zv_objset = NULL; goto out_mutex; } @@ -1036,7 +1057,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) /* * If the caller is already holding the mutex do not take it - * again, this will happen as part of zvol_create_minor(). + * again, this will happen as part of zvol_create_minor_impl(). * Once add_disk() is called the device is live and the kernel * will attempt to open it to read the partition information. */ @@ -1355,31 +1376,13 @@ zvol_free(zvol_state_t *zv) kmem_free(zv, sizeof (zvol_state_t)); } +/* + * Create a block device minor node and setup the linkage between it + * and the specified volume. Once this function returns the block + * device is live and ready for use. + */ static int -__zvol_snapdev_hidden(const char *name) -{ - uint64_t snapdev; - char *parent; - char *atp; - int error = 0; - - parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strlcpy(parent, name, MAXPATHLEN); - - if ((atp = strrchr(parent, '@')) != NULL) { - *atp = '\0'; - error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL); - if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN)) - error = SET_ERROR(ENODEV); - } - - kmem_free(parent, MAXPATHLEN); - - return (SET_ERROR(error)); -} - -static int -__zvol_create_minor(const char *name, boolean_t ignore_snapdev) +zvol_create_minor_impl(const char *name) { zvol_state_t *zv; objset_t *os; @@ -1389,7 +1392,7 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) unsigned minor = 0; int error = 0; - ASSERT(MUTEX_HELD(&zvol_state_lock)); + mutex_enter(&zvol_state_lock); zv = zvol_find_by_name(name); if (zv) { @@ -1397,12 +1400,6 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) goto out; } - if (ignore_snapdev == B_FALSE) { - error = __zvol_snapdev_hidden(name); - if (error) - goto out; - } - 
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); @@ -1489,69 +1486,18 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) */ mutex_exit(&zvol_state_lock); add_disk(zv->zv_disk); - mutex_enter(&zvol_state_lock); + } else { + mutex_exit(&zvol_state_lock); } return (SET_ERROR(error)); } -/* - * Create a block device minor node and setup the linkage between it - * and the specified volume. Once this function returns the block - * device is live and ready for use. - */ -int -zvol_create_minor(const char *name) -{ - int error; - - mutex_enter(&zvol_state_lock); - error = __zvol_create_minor(name, B_FALSE); - mutex_exit(&zvol_state_lock); - - return (SET_ERROR(error)); -} - -static int -__zvol_remove_minor(const char *name) -{ - zvol_state_t *zv; - - ASSERT(MUTEX_HELD(&zvol_state_lock)); - - zv = zvol_find_by_name(name); - if (zv == NULL) - return (SET_ERROR(ENXIO)); - - if (zv->zv_open_count > 0) - return (SET_ERROR(EBUSY)); - - zvol_remove(zv); - zvol_free(zv); - - return (0); -} - -/* - * Remove a block device minor node for the specified volume. - */ -int -zvol_remove_minor(const char *name) -{ - int error; - - mutex_enter(&zvol_state_lock); - error = __zvol_remove_minor(name); - mutex_exit(&zvol_state_lock); - - return (SET_ERROR(error)); -} - /* * Rename a block device minor mode for the specified volume. 
*/ static void -__zvol_rename_minor(zvol_state_t *zv, const char *newname) +zvol_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_disk); @@ -1571,30 +1517,120 @@ __zvol_rename_minor(zvol_state_t *zv, const char *newname) set_disk_ro(zv->zv_disk, readonly); } + +/* + * Mask errors to continue dmu_objset_find() traversal + */ +static int +zvol_create_snap_minor_cb(const char *dsname, void *arg) +{ + const char *name = (const char *)arg; + + /* skip the designated dataset */ + if (name && strcmp(dsname, name) == 0) + return (0); + + /* at this point, the dsname should name a snapshot */ + if (strchr(dsname, '@') == 0) { + dprintf("zvol_create_snap_minor_cb(): " + "%s is not a shapshot name\n", dsname); + } else { + (void) zvol_create_minor_impl(dsname); + } + + return (0); +} + +/* + * Mask errors to continue dmu_objset_find() traversal + */ static int zvol_create_minors_cb(const char *dsname, void *arg) { - (void) zvol_create_minor(dsname); + uint64_t snapdev; + int error; + + error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); + if (error) + return (0); + + /* + * Given the name and the 'snapdev' property, create device minor nodes + * with the linkages to zvols/snapshots as needed. + * If the name represents a zvol, create a minor node for the zvol, then + * check if its snapshots are 'visible', and if so, iterate over the + * snapshots and create device minor nodes for those. 
+ */ + if (strchr(dsname, '@') == 0) { + /* create minor for the 'dsname' explicitly */ + error = zvol_create_minor_impl(dsname); + if ((error == 0 || error == EEXIST) && + (snapdev == ZFS_SNAPDEV_VISIBLE)) { + fstrans_cookie_t cookie = spl_fstrans_mark(); + /* + * traverse snapshots only, do not traverse children, + * and skip the 'dsname' + */ + error = dmu_objset_find((char *)dsname, + zvol_create_snap_minor_cb, (void *)dsname, + DS_FIND_SNAPSHOTS); + spl_fstrans_unmark(cookie); + } + } else { + dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", + dsname); + } return (0); } /* - * Create minors for specified dataset including children and snapshots. + * Create minors for the specified dataset, including children and snapshots. + * Pay attention to the 'snapdev' property and iterate over the snapshots + * only if they are 'visible'. This approach allows one to assure that the + * snapshot metadata is read from disk only if it is needed. + * + * The name can represent a dataset to be recursively scanned for zvols and + * their snapshots, or a single zvol snapshot. If the name represents a + * dataset, the scan is performed in two nested stages: + * - scan the dataset for zvols, and + * - for each zvol, create a minor node, then check if the zvol's snapshots + * are 'visible', and only then iterate over the snapshots if needed + * + * If the name represents a snapshot, a check is performed if the snapshot is + * 'visible' (which also verifies that the parent is a zvol), and if so, + * a minor node for that snapshot is created. 
*/ -int -zvol_create_minors(const char *name) +static int +zvol_create_minors_impl(const char *name) { int error = 0; fstrans_cookie_t cookie; + char *atp, *parent; if (zvol_inhibit_dev) return (0); - cookie = spl_fstrans_mark(); - error = dmu_objset_find((char *)name, zvol_create_minors_cb, - NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); - spl_fstrans_unmark(cookie); + parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) strlcpy(parent, name, MAXPATHLEN); + + if ((atp = strrchr(parent, '@')) != NULL) { + uint64_t snapdev; + + *atp = '\0'; + error = dsl_prop_get_integer(parent, "snapdev", + &snapdev, NULL); + + if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) + error = zvol_create_minor_impl(name); + } else { + cookie = spl_fstrans_mark(); + error = dmu_objset_find(parent, zvol_create_minors_cb, + NULL, DS_FIND_CHILDREN); + spl_fstrans_unmark(cookie); + } + + kmem_free(parent, MAXPATHLEN); return (SET_ERROR(error)); } @@ -1602,8 +1638,8 @@ zvol_create_minors(const char *name) /* * Remove minors for specified dataset including children and snapshots. */ -void -zvol_remove_minors(const char *name) +static void +zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); @@ -1633,11 +1669,41 @@ zvol_remove_minors(const char *name) mutex_exit(&zvol_state_lock); } +/* Remove minor for this specific snapshot only */ +static void +zvol_remove_minor_impl(const char *name) +{ + zvol_state_t *zv, *zv_next; + + if (zvol_inhibit_dev) + return; + + if (strchr(name, '@') == NULL) + return; + + mutex_enter(&zvol_state_lock); + + for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { + zv_next = list_next(&zvol_state_list, zv); + + if (strcmp(zv->zv_name, name) == 0) { + /* If in use, leave alone */ + if (zv->zv_open_count > 0) + continue; + zvol_remove(zv); + zvol_free(zv); + break; + } + } + + mutex_exit(&zvol_state_lock); +} + /* * Rename minors for specified dataset including children and snapshots. 
*/ -void -zvol_rename_minors(const char *oldname, const char *newname) +static void +zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; int oldnamelen, newnamelen; @@ -1660,14 +1726,14 @@ zvol_rename_minors(const char *oldname, const char *newname) continue; if (strcmp(zv->zv_name, oldname) == 0) { - __zvol_rename_minor(zv, newname); + zvol_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { snprintf(name, MAXNAMELEN, "%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); - __zvol_rename_minor(zv, name); + zvol_rename_minor(zv, name); } } @@ -1676,42 +1742,227 @@ zvol_rename_minors(const char *oldname, const char *newname) kmem_free(name, MAXNAMELEN); } +typedef struct zvol_snapdev_cb_arg { + uint64_t snapdev; +} zvol_snapdev_cb_arg_t; + static int -snapdev_snapshot_changed_cb(const char *dsname, void *arg) { - uint64_t snapdev = *(uint64_t *) arg; +zvol_set_snapdev_cb(const char *dsname, void *param) { + zvol_snapdev_cb_arg_t *arg = param; if (strchr(dsname, '@') == NULL) return (0); - switch (snapdev) { + switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: - mutex_enter(&zvol_state_lock); - (void) __zvol_create_minor(dsname, B_TRUE); - mutex_exit(&zvol_state_lock); + (void) zvol_create_minor_impl(dsname); break; case ZFS_SNAPDEV_HIDDEN: - (void) zvol_remove_minor(dsname); + (void) zvol_remove_minor_impl(dsname); break; } return (0); } +static void +zvol_set_snapdev_impl(char *name, uint64_t snapdev) +{ + zvol_snapdev_cb_arg_t arg = {snapdev}; + fstrans_cookie_t cookie = spl_fstrans_mark(); + /* + * The zvol_set_snapdev_sync() sets snapdev appropriately + * in the dataset hierarchy. Here, we only scan snapshots. 
+ */ + dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); + spl_fstrans_unmark(cookie); +} + +static zvol_task_t * +zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, + uint64_t snapdev) +{ + zvol_task_t *task; + char *delim; + + /* Never allow tasks on hidden names. */ + if (name1[0] == '$') + return (NULL); + + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + task->op = op; + task->snapdev = snapdev; + delim = strchr(name1, '/'); + strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); + + strlcpy(task->name1, name1, MAXNAMELEN); + if (name2 != NULL) + strlcpy(task->name2, name2, MAXNAMELEN); + + return (task); +} + +static void +zvol_task_free(zvol_task_t *task) +{ + kmem_free(task, sizeof (zvol_task_t)); +} + +/* + * The worker thread function performed asynchronously. + */ +static void +zvol_task_cb(void *param) +{ + zvol_task_t *task = (zvol_task_t *)param; + + switch (task->op) { + case ZVOL_ASYNC_CREATE_MINORS: + (void) zvol_create_minors_impl(task->name1); + break; + case ZVOL_ASYNC_REMOVE_MINORS: + zvol_remove_minors_impl(task->name1); + break; + case ZVOL_ASYNC_RENAME_MINORS: + zvol_rename_minors_impl(task->name1, task->name2); + break; + case ZVOL_ASYNC_SET_SNAPDEV: + zvol_set_snapdev_impl(task->name1, task->snapdev); + break; + default: + VERIFY(0); + break; + } + + zvol_task_free(task); +} + +typedef struct zvol_set_snapdev_arg { + const char *zsda_name; + uint64_t zsda_value; + zprop_source_t zsda_source; + dmu_tx_t *zsda_tx; +} zvol_set_snapdev_arg_t; + +/* + * Sanity check the dataset for safe use by the sync task. No additional + * conditions are imposed. 
+ */ +static int +zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) +{ + zvol_set_snapdev_arg_t *zsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd; + int error; + + error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); + if (error != 0) + return (error); + + dsl_dir_rele(dd, FTAG); + + return (error); +} + +static int +zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + zvol_set_snapdev_arg_t *zsda = arg; + char dsname[MAXNAMELEN]; + zvol_task_t *task; + + dsl_dataset_name(ds, dsname); + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV), + zsda->zsda_source, sizeof (zsda->zsda_value), 1, + &zsda->zsda_value, zsda->zsda_tx); + + task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, + NULL, zsda->zsda_value); + if (task == NULL) + return (0); + + (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, + task, TQ_SLEEP); + return (0); +} + +/* + * Traverse all child snapshot datasets and apply snapdev appropriately. + */ +static void +zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx) +{ + zvol_set_snapdev_arg_t *zsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd; + + VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); + zsda->zsda_tx = tx; + + dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb, + zsda, DS_FIND_CHILDREN); + + dsl_dir_rele(dd, FTAG); +} + int -zvol_set_snapdev(const char *dsname, uint64_t snapdev) { - fstrans_cookie_t cookie; +zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev) +{ + zvol_set_snapdev_arg_t zsda; - if (zvol_inhibit_dev) - /* caller should continue to modify snapdev property */ - return (-1); + zsda.zsda_name = ddname; + zsda.zsda_source = source; + zsda.zsda_value = snapdev; - cookie = spl_fstrans_mark(); - (void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb, - &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); - spl_fstrans_unmark(cookie); + return (dsl_sync_task(ddname, zvol_set_snapdev_check, + 
zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); +} + +void +zvol_create_minors(spa_t *spa, const char *name, boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL); + if (task == NULL) + return; + + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if ((async == B_FALSE) && (id != 0)) + taskq_wait_id(spa->spa_zvol_taskq, id); +} + +void +zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); + if (task == NULL) + return; - /* caller should continue to modify snapdev property */ - return (-1); + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if ((async == B_FALSE) && (id != 0)) + taskq_wait_id(spa->spa_zvol_taskq, id); +} + +void +zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, + boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); + if (task == NULL) + return; + + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if ((async == B_FALSE) && (id != 0)) + taskq_wait_id(spa->spa_zvol_taskq, id); } int @@ -1721,7 +1972,6 @@ zvol_init(void) list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); - mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); error = register_blkdev(zvol_major, ZVOL_DRIVER); @@ -1745,11 +1995,13 @@ zvol_init(void) void zvol_fini(void) { - zvol_remove_minors(NULL); + zvol_remove_minors_impl(NULL); + blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); - mutex_destroy(&zvol_state_lock); + list_destroy(&zvol_state_list); + mutex_destroy(&zvol_state_lock); } module_param(zvol_inhibit_dev, uint, 0644); diff --git a/scripts/zconfig.sh b/scripts/zconfig.sh index 45ccf62ed02b..1908dc1d6919 
100755 --- a/scripts/zconfig.sh +++ b/scripts/zconfig.sh @@ -217,15 +217,26 @@ test_3() { zconfig_zvol_device_stat 10 ${POOL_NAME} ${FULL_ZVOL_NAME} \ ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 11 + # Toggle the snapdev and observe snapshot device links toggled + ${ZFS} set snapdev=hidden ${FULL_ZVOL_NAME} || fail 12 + + zconfig_zvol_device_stat 7 ${POOL_NAME} ${FULL_ZVOL_NAME} \ + "invalid" ${FULL_CLONE_NAME} || fail 13 + + ${ZFS} set snapdev=visible ${FULL_ZVOL_NAME} || fail 14 + + zconfig_zvol_device_stat 10 ${POOL_NAME} ${FULL_ZVOL_NAME} \ + ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 15 + # Destroy the pool and consequently the devices - ${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c lo-raidz2 -d || fail 12 + ${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c lo-raidz2 -d || fail 16 # verify the devices were removed zconfig_zvol_device_stat 0 ${POOL_NAME} ${FULL_ZVOL_NAME} \ - ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 13 + ${FULL_SNAP_NAME} ${FULL_CLONE_NAME} || fail 17 - ${ZFS_SH} -u || fail 14 - rm -f ${TMP_CACHE} || fail 15 + ${ZFS_SH} -u || fail 18 + rm -f ${TMP_CACHE} || fail 19 pass }