diff --git a/include/os/linux/spl/sys/kstat.h b/include/os/linux/spl/sys/kstat.h index 305c411ddfa0..7809fda5209f 100644 --- a/include/os/linux/spl/sys/kstat.h +++ b/include/os/linux/spl/sys/kstat.h @@ -20,6 +20,10 @@ * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ +/* + * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, Syneto + */ #ifndef _SPL_KSTAT_H #define _SPL_KSTAT_H @@ -89,6 +93,8 @@ typedef struct kstat_module { struct list_head ksm_module_list; /* module linkage */ struct list_head ksm_kstat_list; /* list of kstat entries */ struct proc_dir_entry *ksm_proc; /* proc entry */ + struct kstat_module *ksm_parent; /* parent module in hierarchy */ + uint_t ksm_nchildren; /* number of child modules */ } kstat_module_t; typedef struct kstat_raw_ops { diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 57ff31e89eb9..3be295c564a0 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -22,7 +22,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, 2024, Klara, Inc. 
+ * Copyright (c) 2024, Syneto */ #ifndef _SYS_VDEV_IMPL_H @@ -41,6 +42,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -138,6 +140,25 @@ typedef union vdev_queue_class { avl_tree_t vqc_tree; } vdev_queue_class_t; +typedef struct vdev_queue_sums { + /* gauges (inc/dec counters, current value) */ + wmsum_t vqs_io_queued; + wmsum_t vqs_io_class_queued[ZIO_PRIORITY_NUM_QUEUEABLE]; + wmsum_t vqs_io_active; + wmsum_t vqs_io_class_active[ZIO_PRIORITY_NUM_QUEUEABLE]; + + /* counters (inc only, since queue creation ) */ + wmsum_t vqs_io_enqueued_total; + wmsum_t vqs_io_class_enqueued_total[ZIO_PRIORITY_NUM_QUEUEABLE]; + wmsum_t vqs_io_dequeued_total; + wmsum_t vqs_io_class_dequeued_total[ZIO_PRIORITY_NUM_QUEUEABLE]; + wmsum_t vqs_io_aggregated_total; + wmsum_t vqs_io_aggregated_data_total; + wmsum_t vqs_io_aggregated_read_gap_total; + wmsum_t vqs_io_aggregated_write_gap_total; + wmsum_t vqs_io_aggregated_shrunk_total; +} vdev_queue_sums_t; + struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; @@ -155,6 +176,8 @@ struct vdev_queue { hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ kmutex_t vq_lock; + vdev_queue_sums_t vq_sums; + kstat_t *vq_ksp; }; typedef enum vdev_alloc_bias { diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c index f657ef2a3acb..4ab6e0514406 100644 --- a/module/os/freebsd/spl/spl_kstat.c +++ b/module/os/freebsd/spl/spl_kstat.c @@ -27,6 +27,10 @@ * [1] https://illumos.org/man/1M/kstat * [2] https://illumos.org/man/9f/kstat_create */ +/* + * Copyright (c) 2024, Klara, Inc. 
+ * Copyright (c) 2024, Syneto + */ #include #include @@ -287,7 +291,7 @@ __kstat_create(const char *module, int instance, const char *name, char buf[KSTAT_STRLEN]; struct sysctl_oid *root; kstat_t *ksp; - char *pool; + char *p, *frag; KASSERT(instance == 0, ("instance=%d", instance)); if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) @@ -345,74 +349,54 @@ __kstat_create(const char *module, int instance, const char *name, else ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); - /* - * Some kstats use a module name like "zfs/poolname" to distinguish a - * set of kstats belonging to a specific pool. Split on '/' to add an - * extra node for the pool name if needed. - */ + sysctl_ctx_init(&ksp->ks_sysctl_ctx); + (void) strlcpy(buf, module, KSTAT_STRLEN); - module = buf; - pool = strchr(module, '/'); - if (pool != NULL) - *pool++ = '\0'; /* - * Create sysctl tree for those statistics: - * - * kstat.[.].. + * Walk over the module name, splitting on '/', and create the + * intermediate nodes. */ - sysctl_ctx_init(&ksp->ks_sysctl_ctx); - root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, - SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0, - ""); - if (root == NULL) { - printf("%s: Cannot create kstat.%s tree!\n", __func__, module); - sysctl_ctx_free(&ksp->ks_sysctl_ctx); - free(ksp, M_KSTAT); - return (NULL); - } - if (pool != NULL) { - root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, - SYSCTL_CHILDREN(root), OID_AUTO, pool, CTLFLAG_RW, 0, ""); + root = NULL; + p = buf; + while ((frag = strsep(&p, "/")) != NULL) { + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, root ? 
+ SYSCTL_CHILDREN(root) : SYSCTL_STATIC_CHILDREN(_kstat), + OID_AUTO, frag, CTLFLAG_RW, 0, ""); if (root == NULL) { - printf("%s: Cannot create kstat.%s.%s tree!\n", - __func__, module, pool); + printf("%s: Cannot create kstat.%s tree!\n", + __func__, buf); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } + if (p != NULL && p > frag) + p[-1] = '.'; } + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, class, CTLFLAG_RW, 0, ""); if (root == NULL) { - if (pool != NULL) - printf("%s: Cannot create kstat.%s.%s.%s tree!\n", - __func__, module, pool, class); - else - printf("%s: Cannot create kstat.%s.%s tree!\n", - __func__, module, class); + printf("%s: Cannot create kstat.%s.%s tree!\n", + __func__, buf, class); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } + if (ksp->ks_type == KSTAT_TYPE_NAMED) { root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, name, CTLFLAG_RW, 0, ""); if (root == NULL) { - if (pool != NULL) - printf("%s: Cannot create kstat.%s.%s.%s.%s " - "tree!\n", __func__, module, pool, class, - name); - else - printf("%s: Cannot create kstat.%s.%s.%s " - "tree!\n", __func__, module, class, name); + printf("%s: Cannot create kstat.%s.%s.%s tree!\n", - __func__, buf, class, name); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } - } + ksp->ks_sysctl_root = root; return (ksp); @@ -436,7 +420,26 @@ kstat_install_named(kstat_t *ksp) if (ksent->data_type != 0) { typelast = ksent->data_type; namelast = ksent->name; + + /* + * If a sysctl with this name already exists on + * this root, first remove it by deleting it from its + * old context, and then destroying it.
+ */ + struct sysctl_oid *oid = NULL; + SYSCTL_FOREACH(oid, + SYSCTL_CHILDREN(ksp->ks_sysctl_root)) { + if (strcmp(oid->oid_name, namelast) == 0) { + kstat_t *oldksp = + (kstat_t *)oid->oid_arg1; + sysctl_ctx_entry_del( + &oldksp->ks_sysctl_ctx, oid); + sysctl_remove_oid(oid, 1, 0); + break; + } + } } + switch (typelast) { case KSTAT_DATA_CHAR: /* Not Implemented */ diff --git a/module/os/linux/spl/spl-kstat.c b/module/os/linux/spl/spl-kstat.c index ad553a73a69e..cac2a6d469bf 100644 --- a/module/os/linux/spl/spl-kstat.c +++ b/module/os/linux/spl/spl-kstat.c @@ -26,6 +26,10 @@ * [1] https://illumos.org/man/1M/kstat * [2] https://illumos.org/man/9f/kstat_create */ +/* + * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, Syneto + */ #include #include @@ -379,33 +383,72 @@ kstat_find_module(char *name) return (NULL); } -static kstat_module_t * -kstat_create_module(char *name) +static void +kstat_delete_module(kstat_module_t *module) { - kstat_module_t *module; - struct proc_dir_entry *pde; + ASSERT(list_empty(&module->ksm_kstat_list)); + ASSERT0(module->ksm_nchildren); - pde = proc_mkdir(name, proc_spl_kstat); - if (pde == NULL) - return (NULL); + kstat_module_t *parent = module->ksm_parent; - module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); - module->ksm_proc = pde; - strlcpy(module->ksm_name, name, KSTAT_STRLEN); - INIT_LIST_HEAD(&module->ksm_kstat_list); - list_add_tail(&module->ksm_module_list, &kstat_module_list); + char *p = module->ksm_name, *frag; + while (p != NULL && (frag = strsep(&p, "/"))) {} - return (module); + remove_proc_entry(frag, parent ? 
parent->ksm_proc : proc_spl_kstat); + list_del(&module->ksm_module_list); + kmem_free(module, sizeof (kstat_module_t)); + if (parent) { + parent->ksm_nchildren--; + if (parent->ksm_nchildren == 0 && + list_empty(&parent->ksm_kstat_list)) + kstat_delete_module(parent); + } } -static void -kstat_delete_module(kstat_module_t *module) +static kstat_module_t * +kstat_create_module(char *name) { - ASSERT(list_empty(&module->ksm_kstat_list)); - remove_proc_entry(module->ksm_name, proc_spl_kstat); - list_del(&module->ksm_module_list); - kmem_free(module, sizeof (kstat_module_t)); + char buf[KSTAT_STRLEN]; + kstat_module_t *module, *parent; + + (void) strlcpy(buf, name, KSTAT_STRLEN); + + parent = NULL; + char *p = buf, *frag; + while ((frag = strsep(&p, "/")) != NULL) { + module = kstat_find_module(buf); + if (module == NULL) { + struct proc_dir_entry *pde = proc_mkdir(frag, + parent ? parent->ksm_proc : proc_spl_kstat); + if (pde == NULL) { + cmn_err(CE_WARN, "kstat_create('%s'): " + "module dir create failed", buf); + if (parent) + kstat_delete_module(parent); + return (NULL); + } + + module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); + module->ksm_proc = pde; + strlcpy(module->ksm_name, buf, KSTAT_STRLEN); + INIT_LIST_HEAD(&module->ksm_kstat_list); + list_add_tail(&module->ksm_module_list, + &kstat_module_list); + + if (parent != NULL) { + module->ksm_parent = parent; + parent->ksm_nchildren++; + } + } + + parent = module; + if (p != NULL && p > frag) + p[-1] = '/'; + } + + return (module); + } static int @@ -624,12 +667,20 @@ kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, } /* - * Only one entry by this name per-module, on failure the module - * shouldn't be deleted because we know it has at least one entry. + * We can only have one entry of this name per module. If one already + * exists, replace it by first removing the proc entry, then removing + * it from the list. 
The kstat itself lives on; it just can't be + * inspected through the filesystem. */ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { - if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0) - goto out; + if (tmp->kpe_proc != NULL && + strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0) { + ASSERT3P(tmp->kpe_owner, ==, module); + remove_proc_entry(tmp->kpe_name, module->ksm_proc); + tmp->kpe_proc = NULL; + list_del_init(&tmp->kpe_list); + break; + } } list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 092b3f375be0..ad678ee538e6 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -25,6 +25,8 @@ /* * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, Syneto */ #include @@ -228,6 +230,81 @@ uint_t zfs_vdev_queue_depth_pct = 300; */ uint_t zfs_vdev_def_queue_depth = 32; +typedef struct vdev_queue_kstats { + kstat_named_t vqks_io_queued; + kstat_named_t vqks_io_class_queued[ZIO_PRIORITY_NUM_QUEUEABLE]; + kstat_named_t vqks_io_active; + kstat_named_t vqks_io_class_active[ZIO_PRIORITY_NUM_QUEUEABLE]; + kstat_named_t vqks_io_enqueued_total; + kstat_named_t vqks_io_class_enqueued_total[ZIO_PRIORITY_NUM_QUEUEABLE]; + kstat_named_t vqks_io_dequeued_total; + kstat_named_t vqks_io_class_dequeued_total[ZIO_PRIORITY_NUM_QUEUEABLE]; + kstat_named_t vqks_io_aggregated_total; + kstat_named_t vqks_io_aggregated_data_total; + kstat_named_t vqks_io_aggregated_read_gap_total; + kstat_named_t vqks_io_aggregated_write_gap_total; + kstat_named_t vqks_io_aggregated_shrunk_total; +} vdev_queue_kstats_t; + +static vdev_queue_kstats_t vdev_queue_kstats_template = { + { "io_queued", KSTAT_DATA_UINT64 }, + { + { "io_syncread_queued", KSTAT_DATA_UINT64 }, + { "io_syncwrite_queued", KSTAT_DATA_UINT64 }, + { "io_asyncread_queued", KSTAT_DATA_UINT64 }, + { "io_asyncwrite_queued", KSTAT_DATA_UINT64 }, + { 
"io_scrub_queued", KSTAT_DATA_UINT64 }, + { "io_removal_queued", KSTAT_DATA_UINT64 }, + { "io_initializing_queued", KSTAT_DATA_UINT64 }, + { "io_trim_queued", KSTAT_DATA_UINT64 }, + { "io_rebuild_queued", KSTAT_DATA_UINT64 }, + }, + { "io_active", KSTAT_DATA_UINT64 }, + { + { "io_syncread_active", KSTAT_DATA_UINT64 }, + { "io_syncwrite_active", KSTAT_DATA_UINT64 }, + { "io_asyncread_active", KSTAT_DATA_UINT64 }, + { "io_asyncwrite_active", KSTAT_DATA_UINT64 }, + { "io_scrub_active", KSTAT_DATA_UINT64 }, + { "io_removal_active", KSTAT_DATA_UINT64 }, + { "io_initializing_active", KSTAT_DATA_UINT64 }, + { "io_trim_active", KSTAT_DATA_UINT64 }, + { "io_rebuild_active", KSTAT_DATA_UINT64 }, + }, + { "io_enqueued_total", KSTAT_DATA_UINT64 }, + { + { "io_syncread_enqueued_total", KSTAT_DATA_UINT64 }, + { "io_syncwrite_enqueued_total", KSTAT_DATA_UINT64 }, + { "io_asyncread_enqueued_total", KSTAT_DATA_UINT64 }, + { "io_asyncwrite_enqueued_total", KSTAT_DATA_UINT64 }, + { "io_scrub_enqueued_total", KSTAT_DATA_UINT64 }, + { "io_removal_enqueued_total", KSTAT_DATA_UINT64 }, + { "io_initializing_enqueued_total", KSTAT_DATA_UINT64 }, + { "io_trim_enqueued_total", KSTAT_DATA_UINT64 }, + { "io_rebuild_enqueued_total", KSTAT_DATA_UINT64 }, + }, + { "io_dequeued_total", KSTAT_DATA_UINT64 }, + { + { "io_syncread_dequeued_total", KSTAT_DATA_UINT64 }, + { "io_syncwrite_dequeued_total", KSTAT_DATA_UINT64 }, + { "io_asyncread_dequeued_total", KSTAT_DATA_UINT64 }, + { "io_asyncwrite_dequeued_total", KSTAT_DATA_UINT64 }, + { "io_scrub_dequeued_total", KSTAT_DATA_UINT64 }, + { "io_removal_dequeued_total", KSTAT_DATA_UINT64 }, + { "io_initializing_dequeued_total", KSTAT_DATA_UINT64 }, + { "io_trim_dequeued_total", KSTAT_DATA_UINT64 }, + { "io_rebuild_dequeued_total", KSTAT_DATA_UINT64 }, + }, + { "io_aggregated_total", KSTAT_DATA_UINT64 }, + { "io_aggregated_data_total", KSTAT_DATA_UINT64 }, + { "io_aggregated_read_gap_total", KSTAT_DATA_UINT64 }, + { "io_aggregated_write_gap_total", 
KSTAT_DATA_UINT64 }, + { "io_aggregated_shrunk_total", KSTAT_DATA_UINT64 }, +}; + +#define VQSTAT_INC(vq, stat) wmsum_add(&vq->vq_sums.vqs_##stat, 1) +#define VQSTAT_DEC(vq, stat) wmsum_add(&vq->vq_sums.vqs_##stat, -1) + static int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -279,6 +356,10 @@ vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) } else avl_add(&vq->vq_class[p].vqc_tree, zio); + VQSTAT_INC(vq, io_queued); + VQSTAT_INC(vq, io_class_queued[p]); + VQSTAT_INC(vq, io_enqueued_total); + VQSTAT_INC(vq, io_class_enqueued_total[p]); } static void @@ -297,6 +378,10 @@ vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) empty = avl_is_empty(tree); } vq->vq_cqueued &= ~(empty << p); + VQSTAT_DEC(vq, io_queued); + VQSTAT_DEC(vq, io_class_queued[p]); + VQSTAT_INC(vq, io_dequeued_total); + VQSTAT_INC(vq, io_class_dequeued_total[p]); } static uint_t @@ -472,6 +557,129 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) return (p); } +static void +vdev_queue_sums_init(vdev_queue_t *vq) +{ + vdev_queue_sums_t *vqs = &vq->vq_sums; + wmsum_init(&vqs->vqs_io_queued, 0); + wmsum_init(&vqs->vqs_io_active, 0); + wmsum_init(&vqs->vqs_io_enqueued_total, 0); + wmsum_init(&vqs->vqs_io_dequeued_total, 0); + wmsum_init(&vqs->vqs_io_aggregated_total, 0); + wmsum_init(&vqs->vqs_io_aggregated_data_total, 0); + wmsum_init(&vqs->vqs_io_aggregated_read_gap_total, 0); + wmsum_init(&vqs->vqs_io_aggregated_write_gap_total, 0); + wmsum_init(&vqs->vqs_io_aggregated_shrunk_total, 0); + for (int i = 0; i < ZIO_PRIORITY_NUM_QUEUEABLE; i++) { + wmsum_init(&vqs->vqs_io_class_queued[i], 0); + wmsum_init(&vqs->vqs_io_class_active[i], 0); + wmsum_init(&vqs->vqs_io_class_enqueued_total[i], 0); + wmsum_init(&vqs->vqs_io_class_dequeued_total[i], 0); + } +} + +static void +vdev_queue_sums_fini(vdev_queue_t *vq) +{ + vdev_queue_sums_t *vqs = &vq->vq_sums; + wmsum_fini(&vqs->vqs_io_queued); + wmsum_fini(&vqs->vqs_io_active); + wmsum_fini(&vqs->vqs_io_enqueued_total); + 
wmsum_fini(&vqs->vqs_io_dequeued_total); + wmsum_fini(&vqs->vqs_io_aggregated_total); + wmsum_fini(&vqs->vqs_io_aggregated_data_total); + wmsum_fini(&vqs->vqs_io_aggregated_read_gap_total); + wmsum_fini(&vqs->vqs_io_aggregated_write_gap_total); + wmsum_fini(&vqs->vqs_io_aggregated_shrunk_total); + for (int i = 0; i < ZIO_PRIORITY_NUM_QUEUEABLE; i++) { + wmsum_fini(&vqs->vqs_io_class_queued[i]); + wmsum_fini(&vqs->vqs_io_class_active[i]); + wmsum_fini(&vqs->vqs_io_class_enqueued_total[i]); + wmsum_fini(&vqs->vqs_io_class_dequeued_total[i]); + } +} + +static int +vdev_queue_kstats_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) + return (EACCES); + + vdev_queue_t *vq = ksp->ks_private; + vdev_queue_kstats_t *vqks = ksp->ks_data; + vdev_queue_sums_t *vqs = &vq->vq_sums; + + vqks->vqks_io_queued.value.ui64 = + wmsum_value(&vqs->vqs_io_queued); + vqks->vqks_io_active.value.ui64 = + wmsum_value(&vqs->vqs_io_active); + vqks->vqks_io_enqueued_total.value.ui64 = + wmsum_value(&vqs->vqs_io_enqueued_total); + vqks->vqks_io_dequeued_total.value.ui64 = + wmsum_value(&vqs->vqs_io_dequeued_total); + vqks->vqks_io_aggregated_total.value.ui64 = + wmsum_value(&vqs->vqs_io_aggregated_total); + vqks->vqks_io_aggregated_data_total.value.ui64 = + wmsum_value(&vqs->vqs_io_aggregated_data_total); + vqks->vqks_io_aggregated_read_gap_total.value.ui64 = + wmsum_value(&vqs->vqs_io_aggregated_read_gap_total); + vqks->vqks_io_aggregated_write_gap_total.value.ui64 = + wmsum_value(&vqs->vqs_io_aggregated_write_gap_total); + vqks->vqks_io_aggregated_shrunk_total.value.ui64 = + wmsum_value(&vqs->vqs_io_aggregated_shrunk_total); + for (int i = 0; i < ZIO_PRIORITY_NUM_QUEUEABLE; i++) { + vqks->vqks_io_class_queued[i].value.ui64 = + wmsum_value(&vqs->vqs_io_class_queued[i]); + vqks->vqks_io_class_active[i].value.ui64 = + wmsum_value(&vqs->vqs_io_class_active[i]); + vqks->vqks_io_class_enqueued_total[i].value.ui64 = + wmsum_value(&vqs->vqs_io_class_enqueued_total[i]); + 
vqks->vqks_io_class_dequeued_total[i].value.ui64 = + wmsum_value(&vqs->vqs_io_class_dequeued_total[i]); + } + + return (0); +} + +static void +vdev_queue_kstats_init(vdev_queue_t *vq) +{ + char *module = + kmem_asprintf("zfs/%s/vdev/%llu", spa_name(vq->vq_vdev->vdev_spa), + (u_longlong_t)vq->vq_vdev->vdev_guid); + + kstat_t *ksp = kstat_create(module, 0, "queue", "misc", + KSTAT_TYPE_NAMED, + sizeof (vdev_queue_kstats_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + kmem_strfree(module); + + if (ksp == NULL) + return; + + ksp->ks_private = vq; + ksp->ks_update = vdev_queue_kstats_update; + ksp->ks_data = kmem_alloc(sizeof (vdev_queue_kstats_t), KM_SLEEP); + memcpy(ksp->ks_data, &vdev_queue_kstats_template, + sizeof (vdev_queue_kstats_t)); + kstat_install(ksp); + + vq->vq_ksp = ksp; +} + +static void +vdev_queue_kstats_fini(vdev_queue_t *vq) +{ + if (vq->vq_ksp == NULL) + return; + + kmem_free(vq->vq_ksp->ks_data, sizeof (vdev_queue_kstats_t)); + kstat_delete(vq->vq_ksp); + + vq->vq_ksp = NULL; +} + void vdev_queue_init(vdev_t *vd) { @@ -502,6 +710,19 @@ vdev_queue_init(vdev_t *vd) list_create(&vq->vq_active_list, sizeof (struct zio), offsetof(struct zio, io_queue_node.l)); mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); + + vdev_queue_sums_init(vq); + + /* + * IO for interior vdevs and distributed spares never go through the + * queue, so do not create kstat nodes for them. + * See zio_vdev_io_start(). 
+ */ + if (spa_load_state(vd->vdev_spa) != SPA_LOAD_TRYIMPORT && + vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { + vdev_queue_kstats_init(vq); + } } void @@ -509,6 +730,9 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; + vdev_queue_kstats_fini(vq); + vdev_queue_sums_fini(vq); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (vdev_queue_class_fifo(p)) list_destroy(&vq->vq_class[p].vqc_list); @@ -563,9 +787,12 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_cactive[zio->io_priority]++; + zio_priority_t p = zio->io_priority; + vq->vq_cactive[p]++; vq->vq_active++; - if (vdev_queue_is_interactive(zio->io_priority)) { + VQSTAT_INC(vq, io_active); + VQSTAT_INC(vq, io_class_active[p]); + if (vdev_queue_is_interactive(p)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { @@ -580,9 +807,12 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_cactive[zio->io_priority]--; + zio_priority_t p = zio->io_priority; + vq->vq_cactive[p]--; vq->vq_active--; - if (vdev_queue_is_interactive(zio->io_priority)) { + VQSTAT_DEC(vq, io_active); + VQSTAT_DEC(vq, io_class_active[p]); + if (vdev_queue_is_interactive(p)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; else @@ -777,6 +1007,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; + VQSTAT_INC(vq, io_aggregated_total); + nio = first; next_offset = first->io_offset; do { @@ -785,6 +1017,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) ASSERT3P(dio, !=, NULL); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); + VQSTAT_INC(vq, io_aggregated_data_total); if (dio->io_offset != next_offset) { /* 
allocate a buffer for a read gap */ @@ -793,6 +1026,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) abd = abd_alloc_for_io( dio->io_offset - next_offset, B_TRUE); abd_gang_add(aio->io_abd, abd, B_TRUE); + VQSTAT_INC(vq, io_aggregated_read_gap_total); } if (dio->io_abd && (dio->io_size != abd_get_size(dio->io_abd))) { @@ -800,6 +1034,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size); abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size); abd_gang_add(aio->io_abd, abd, B_TRUE); + VQSTAT_INC(vq, io_aggregated_shrunk_total); } else { if (dio->io_flags & ZIO_FLAG_NODATA) { /* allocate a buffer for a write gap */ @@ -807,6 +1042,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) ASSERT3P(dio->io_abd, ==, NULL); abd_gang_add(aio->io_abd, abd_get_zeros(dio->io_size), B_TRUE); + VQSTAT_INC(vq, io_aggregated_write_gap_total); } else { /* * We pass B_FALSE to abd_gang_add()