Skip to content

Commit

Permalink
deadlock between spa_errlog_lock and dp_config_rwlock
Browse files Browse the repository at this point in the history
There is a lock order inversion deadlock between `spa_errlog_lock` and
`dp_config_rwlock`:

A thread in `spa_delete_dataset_errlog()` is running from a sync task.
It is holding the `dp_config_rwlock` for writer (see
`dsl_sync_task_sync()`), and waiting for the `spa_errlog_lock`.

A thread in `dsl_pool_config_enter()` is holding the `spa_errlog_lock`
(see `spa_get_errlog_size()`) and waiting for the `dp_config_rwlock` (as
reader).

Note that this was introduced by openzfs#12812.

This commit address this by defining the lock ordering to be
dp_config_rwlock first, then spa_errlog_lock / spa_errlist_lock.
spa_get_errlog() and spa_get_errlog_size() can acquire the locks in this
order, and then process_error_block() and get_head_and_birth_txg() can
verify that the dp_config_rwlock is already held.

Additionally, a buffer overrun in `spa_get_errlog()` is corrected.  Many
code paths didn't check if `*count` got to zero, instead continuing to
overwrite past the beginning of the userspace buffer at `uaddr`.

Tested by having some errors in the pool (via `zinject -t data
/path/to/file`), one thread running `zpool iostat 0.001`, and another
thread runs `zfs destroy` (in a loop, although it hits the first time).
This reproduces the problem easily without the fix, and works with the
fix.

Closes openzfs#14239
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
  • Loading branch information
ahrens committed Dec 14, 2022
1 parent 5f73bbb commit abef0c5
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 204 deletions.
2 changes: 1 addition & 1 deletion cmd/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -6313,7 +6313,7 @@ ztest_scrub_impl(spa_t *spa)
while (dsl_scan_scrubbing(spa_get_dsl(spa)))
txg_wait_synced(spa_get_dsl(spa), 0);

if (spa_get_errlog_size(spa) > 0)
if (spa_approx_errlog_size(spa) > 0)
return (ECKSUM);

ztest_pool_scrubbed = B_TRUE;
Expand Down
2 changes: 1 addition & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,7 @@ extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
extern uint64_t spa_get_errlog_size(spa_t *spa);
extern uint64_t spa_approx_errlog_size(spa_t *spa);
extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
extern void spa_errlog_rotate(spa_t *spa);
extern void spa_errlog_drain(spa_t *spa);
Expand Down
46 changes: 20 additions & 26 deletions lib/libzfs/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -4133,33 +4133,28 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
{
zfs_cmd_t zc = {"\0"};
libzfs_handle_t *hdl = zhp->zpool_hdl;
uint64_t count;
zbookmark_phys_t *zb = NULL;
int i;
zbookmark_phys_t *buf;
uint64_t buflen = 10000; /* approx. 1MB of RAM */

if (fnvlist_lookup_uint64(zhp->zpool_config,
ZPOOL_CONFIG_ERRCOUNT) == 0)
return (0);

/*
* Retrieve the raw error list from the kernel. If the number of errors
* has increased, allocate more space and continue until we get the
* entire list.
* Retrieve the raw error list from the kernel. If it doesn't fit,
* allocate a larger buffer and retry.
*/
count = fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT);
if (count == 0)
return (0);
zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
count * sizeof (zbookmark_phys_t));
zc.zc_nvlist_dst_size = count;
(void) strcpy(zc.zc_name, zhp->zpool_name);
for (;;) {
buf = zfs_alloc(zhp->zpool_hdl,
buflen * sizeof (zbookmark_phys_t));
zc.zc_nvlist_dst = (uintptr_t)buf;
zc.zc_nvlist_dst_size = buflen;
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG,
&zc) != 0) {
free((void *)(uintptr_t)zc.zc_nvlist_dst);
free(buf);
if (errno == ENOMEM) {
void *dst;

count = zc.zc_nvlist_dst_size;
dst = zfs_alloc(zhp->zpool_hdl, count *
sizeof (zbookmark_phys_t));
zc.zc_nvlist_dst = (uintptr_t)dst;
buflen *= 2;
} else {
return (zpool_standard_error_fmt(hdl, errno,
dgettext(TEXT_DOMAIN, "errors: List of "
Expand All @@ -4177,18 +4172,17 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
* _not_ copied as part of the process. So we point the start of our
* array appropriate and decrement the total number of elements.
*/
zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
zc.zc_nvlist_dst_size;
count -= zc.zc_nvlist_dst_size;
zbookmark_phys_t *zb = buf + zc.zc_nvlist_dst_size;
uint64_t zblen = buflen - zc.zc_nvlist_dst_size;

qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
qsort(zb, zblen, sizeof (zbookmark_phys_t), zbookmark_mem_compare);

verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);

/*
* Fill in the nverrlistp with nvlist's of dataset and object numbers.
*/
for (i = 0; i < count; i++) {
for (uint64_t i = 0; i < zblen; i++) {
nvlist_t *nv;

/* ignoring zb_blkid and zb_level for now */
Expand All @@ -4215,11 +4209,11 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
nvlist_free(nv);
}

free((void *)(uintptr_t)zc.zc_nvlist_dst);
free(buf);
return (0);

nomem:
free((void *)(uintptr_t)zc.zc_nvlist_dst);
free(buf);
return (no_memory(zhp->zpool_hdl));
}

Expand Down
2 changes: 1 addition & 1 deletion lib/libzfs/libzfs_status.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,6 @@ check_status(nvlist_t *config, boolean_t isimport,
{
pool_scan_stat_t *ps = NULL;
uint_t vsc, psc;
uint64_t nerr;
uint64_t suspended;
uint64_t hostid = 0;
uint64_t errata = 0;
Expand Down Expand Up @@ -392,6 +391,7 @@ check_status(nvlist_t *config, boolean_t isimport,
* Persistent data errors.
*/
if (!isimport) {
uint64_t nerr;
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
&nerr) == 0 && nerr != 0)
return (ZPOOL_STATUS_CORRUPT_DATA);
Expand Down
8 changes: 4 additions & 4 deletions module/zfs/dsl_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -944,13 +944,13 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)

if (dsl_scan_restarting(scn, tx))
spa_history_log_internal(spa, "scan aborted, restarting", tx,
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
else if (!complete)
spa_history_log_internal(spa, "scan cancelled", tx,
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
else
spa_history_log_internal(spa, "scan done", tx,
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));

if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
spa->spa_scrub_active = B_FALSE;
Expand Down Expand Up @@ -1013,7 +1013,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
spa_history_log_internal(spa,
"starting deferred resilver", tx, "errors=%llu",
(u_longlong_t)spa_get_errlog_size(spa));
(u_longlong_t)spa_approx_errlog_size(spa));
spa_async_request(spa, SPA_ASYNC_RESILVER);
}

Expand Down
2 changes: 1 addition & 1 deletion module/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -5543,7 +5543,7 @@ spa_get_stats(const char *name, nvlist_t **config,

fnvlist_add_uint64(*config,
ZPOOL_CONFIG_ERRCOUNT,
spa_get_errlog_size(spa));
spa_approx_errlog_size(spa));

if (spa_suspended(spa)) {
fnvlist_add_uint64(*config,
Expand Down
Loading

0 comments on commit abef0c5

Please sign in to comment.