Skip to content

Commit

Permalink
Add slow disk diagnosis to ZED
Browse files Browse the repository at this point in the history
Slow disk response times can be indicative of a failing drive. ZFS
currently tracks slow I/Os (slower than zio_slow_io_ms) and generates
events (ereport.fs.zfs.delay).  However, no action is taken by ZED,
like is done for checksum or I/O errors.  This change adds slow disk
diagnosis to ZED which is opt-in using new VDEV properties:
  VDEV_PROP_SLOW_IO_N
  VDEV_PROP_SLOW_IO_T

If multiple VDEVs in a pool are undergoing slow I/Os, then it skips
the zpool_vdev_degrade().

Sponsored-By: OpenDrives Inc.
Sponsored-By: Klara Inc.
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
  • Loading branch information
don-brady and Rob Wing committed Dec 9, 2023
1 parent 014265f commit 258c87e
Show file tree
Hide file tree
Showing 26 changed files with 662 additions and 84 deletions.
57 changes: 26 additions & 31 deletions cmd/zed/agents/fmd_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*
* Copyright (c) 2016, Intel Corporation.
* Copyright (c) 2023, Klara Inc.
*/

/*
Expand Down Expand Up @@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
if (strcmp(name, "spare_on_remove") == 0)
return (1);

if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
return (10); /* N = 10 events */

return (0);
}

int64_t
fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
{
(void) hdl;

/*
* These can be looked up in mp->modinfo->fmdi_props
* For now we just hard code for phase 2. In the
* future, there can be a ZED based override.
*/
if (strcmp(name, "remove_timeout") == 0)
return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */

if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */

return (0);
}

Expand Down Expand Up @@ -535,37 +514,53 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
}

void
fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
int
fmd_serd_active(fmd_hdl_t *hdl, const char *name)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_eng_t *sgp;

if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
return;
return (0);
}
return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp));
}

fmd_serd_eng_reset(sgp);
void
fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_eng_t *sgp;

fmd_hdl_debug(hdl, "serd_reset %s", name);
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
} else {
fmd_serd_eng_reset(sgp);
fmd_hdl_debug(hdl, "serd_reset %s", name);
}
}

int
fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_eng_t *sgp;
int err;

if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
name);
return (0);
}
err = fmd_serd_eng_record(sgp, ep->ev_hrt);
return (fmd_serd_eng_record(sgp, ep->ev_hrt));
}

void
fmd_serd_gc(fmd_hdl_t *hdl)
{
fmd_module_t *mp = (fmd_module_t *)hdl;

return (err);
fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL);
}

/* FMD Timers */
Expand All @@ -579,7 +574,7 @@ _timer_notify(union sigval sv)
const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
struct itimerspec its;

fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid);

/* disarm the timer */
memset(&its, 0, sizeof (struct itimerspec));
Expand Down
3 changes: 2 additions & 1 deletion cmd/zed/agents/fmd_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);

extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);

#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */
#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */
Expand Down Expand Up @@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
extern int fmd_serd_exists(fmd_hdl_t *, const char *);
extern int fmd_serd_active(fmd_hdl_t *, const char *);
extern void fmd_serd_reset(fmd_hdl_t *, const char *);
extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
extern int fmd_serd_fired(fmd_hdl_t *, const char *);
extern int fmd_serd_empty(fmd_hdl_t *, const char *);
extern void fmd_serd_gc(fmd_hdl_t *);

extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
extern void fmd_timer_remove(fmd_hdl_t *, id_t);
Expand Down
3 changes: 2 additions & 1 deletion cmd/zed/agents/fmd_serd.c
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
}

void
fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg)
{
(void) arg;
fmd_serd_elem_t *sep, *nep;
hrtime_t hrt;

Expand Down
2 changes: 1 addition & 1 deletion cmd/zed/agents/fmd_serd.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
extern int fmd_serd_eng_empty(fmd_serd_eng_t *);

extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *);

#ifdef __cplusplus
}
Expand Down
Loading

0 comments on commit 258c87e

Please sign in to comment.