Skip to content

Commit 41f5993

Browse files
author
Paul Dagnelie
committed
Detect a slow raidz child during reads
A single slow-responding disk can affect the overall read performance of a raidz group. When a raidz child disk is determined to be a persistent slow outlier, then have it sit out during reads for a period of time. The raidz group can use parity to reconstruct the data that was skipped. Each time a slow disk is placed into a sit out period, its `vdev_stat.vs_slow_ios` count is incremented and a zevent class `ereport.fs.zfs.delay` is posted. The length of the sit out period can be changed using the `raid_read_sit_out_secs` module parameter. Setting it to zero disables slow outlier detection. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Paul Dagnelie <[email protected]> Contributions-by: Don Brady <[email protected]>
1 parent b227a5d commit 41f5993

File tree

28 files changed

+1382
-13
lines changed

28 files changed

+1382
-13
lines changed

include/os/freebsd/spl/sys/time.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,17 @@ typedef longlong_t hrtime_t;
6262
#define SEC_TO_TICK(sec) ((sec) * hz)
6363
#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))
6464

65+
/*
 * Return the time reported by getnanouptime(), converted to nanoseconds.
 *
 * The seconds field is cast to hrtime_t before being multiplied by NANOSEC
 * so the multiplication is carried out in hrtime_t, then the nanosecond
 * remainder is added.
 *
 * NOTE(review): "lr" presumably stands for "low resolution", i.e. a cheaper
 * counterpart to gethrtime() below -- confirm against the callers.
 */
static __inline hrtime_t
66+
getlrtime(void)
67+
{
68+
struct timespec ts;
69+
hrtime_t nsec;
70+
71+
getnanouptime(&ts);
72+
nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec;
73+
return (nsec);
74+
}
75+
6576
static __inline hrtime_t
6677
gethrtime(void)
6778
{

include/os/linux/spl/sys/time.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ gethrestime_sec(void)
7979
return (ts.tv_sec);
8080
}
8181

82+
/*
 * Return the timestamp filled in by ktime_get_coarse_ts64(), converted to
 * nanoseconds.
 *
 * The seconds field is cast to hrtime_t before being multiplied by
 * NSEC_PER_SEC so the multiplication is carried out in hrtime_t, then the
 * nanosecond remainder is added.
 *
 * NOTE(review): "lr" presumably stands for "low resolution", i.e. a cheaper
 * counterpart to gethrtime() below -- confirm against the callers.
 */
static inline hrtime_t
83+
getlrtime(void)
84+
{
85+
inode_timespec_t ts;
86+
ktime_get_coarse_ts64(&ts);
87+
return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec);
88+
}
89+
8290
static inline hrtime_t
8391
gethrtime(void)
8492
{

include/sys/fm/fs/zfs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ extern "C" {
5858
#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure"
5959
#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay"
6060
#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write"
61+
#define FM_EREPORT_ZFS_SITOUT "sitout"
6162

6263
#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
6364
#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode"

include/sys/fs/zfs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,8 @@ typedef enum {
385385
VDEV_PROP_TRIM_SUPPORT,
386386
VDEV_PROP_TRIM_ERRORS,
387387
VDEV_PROP_SLOW_IOS,
388+
VDEV_PROP_SIT_OUT,
389+
VDEV_PROP_AUTOSIT,
388390
VDEV_NUM_PROPS
389391
} vdev_prop_t;
390392

@@ -1673,6 +1675,7 @@ typedef enum {
16731675
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
16741676
ZFS_ERR_ASHIFT_MISMATCH,
16751677
ZFS_ERR_STREAM_LARGE_MICROZAP,
1678+
ZFS_ERR_TOO_MANY_SITOUTS,
16761679
} zfs_errno_t;
16771680

16781681
/*

include/sys/vdev_impl.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,10 +279,12 @@ struct vdev {
279279
uint64_t vdev_noalloc; /* device is passivated? */
280280
uint64_t vdev_removing; /* device is being removed? */
281281
uint64_t vdev_failfast; /* device failfast setting */
282+
boolean_t vdev_autosit; /* automatic sitout management */
282283
boolean_t vdev_rz_expanding; /* raidz is being expanded? */
283284
boolean_t vdev_ishole; /* is a hole in the namespace */
284285
uint64_t vdev_top_zap;
285286
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
287+
uint64_t vdev_last_latency_check;
286288

287289
/* pool checkpoint related */
288290
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
@@ -431,6 +433,10 @@ struct vdev {
431433
hrtime_t vdev_mmp_pending; /* 0 if write finished */
432434
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
433435
uint64_t vdev_expansion_time; /* vdev's last expansion time */
436+
/* used to calculate average read latency */
437+
uint64_t *vdev_prev_histo;
438+
int64_t vdev_outlier_count; /* read outlier amongst peers */
439+
hrtime_t vdev_read_sit_out_expire; /* end of sit out period */
434440
list_node_t vdev_leaf_node; /* leaf vdev list */
435441

436442
/*

include/sys/vdev_raidz.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
6161
struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
6262
void vdev_raidz_reflow_copy_scratch(spa_t *);
6363
void raidz_dtl_reassessed(vdev_t *);
64+
boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);
65+
void vdev_raidz_sit_child(vdev_t *svd, uint64_t secs);
66+
void vdev_raidz_unsit_child(vdev_t *vd);
6467

6568
extern const zio_vsd_ops_t vdev_raidz_vsd_ops;
6669

include/sys/vdev_raidz_impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ typedef struct raidz_col {
119119
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
120120
uint8_t rc_force_repair:1; /* Write good data to this column */
121121
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
122+
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
122123
int rc_shadow_devidx; /* for double write during expansion */
123124
int rc_shadow_error; /* for double write during expansion */
124125
uint64_t rc_shadow_offset; /* for double write during expansion */
@@ -133,6 +134,7 @@ typedef struct raidz_row {
133134
int rr_firstdatacol; /* First data column/parity count */
134135
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
135136
int rr_nempty; /* empty sectors included in parity */
137+
int rr_outlier_cnt; /* Count of latency outlier devices */
136138
#ifdef ZFS_DEBUG
137139
uint64_t rr_offset; /* Logical offset for *_io_verify() */
138140
uint64_t rr_size; /* Physical size for *_io_verify() */

lib/libspl/include/sys/time.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,15 @@ gethrestime_sec(void)
9797
return (tv.tv_sec);
9898
}
9999

100+
/*
 * Userland getlrtime(): return the time reported by gettimeofday(),
 * converted to nanoseconds.
 *
 * Seconds are scaled by NANOSEC and microseconds by NSEC_PER_USEC; both are
 * cast to uint64_t first so the multiplications are done in 64 bits. The
 * return value of gettimeofday() is deliberately discarded.
 *
 * NOTE(review): gettimeofday() reports wall-clock time, which can be
 * stepped; the kernel variants of this function appear to read an
 * uptime/coarse clock instead -- confirm callers only need rough interval
 * timing in userland.
 */
static inline hrtime_t
101+
getlrtime(void)
102+
{
103+
struct timeval tv;
104+
(void) gettimeofday(&tv, NULL);
105+
return ((((uint64_t)tv.tv_sec) * NANOSEC) +
106+
((uint64_t)tv.tv_usec * NSEC_PER_USEC));
107+
}
108+
100109
static inline hrtime_t
101110
gethrtime(void)
102111
{

lib/libzfs/libzfs.abi

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6117,7 +6117,9 @@
61176117
<enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/>
61186118
<enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/>
61196119
<enumerator name='VDEV_PROP_SLOW_IOS' value='51'/>
6120-
<enumerator name='VDEV_NUM_PROPS' value='52'/>
6120+
<enumerator name='VDEV_PROP_SIT_OUT' value='52'/>
6121+
<enumerator name='VDEV_PROP_AUTOSIT' value='53'/>
6122+
<enumerator name='VDEV_NUM_PROPS' value='54'/>
61216123
</enum-decl>
61226124
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
61236125
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>

lib/libzfs/libzfs_pool.c

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5549,6 +5549,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
55495549
/* Only use if provided by the RAIDZ VDEV above */
55505550
if (prop == VDEV_PROP_RAIDZ_EXPANDING)
55515551
return (ENOENT);
5552+
if (prop == VDEV_PROP_SIT_OUT)
5553+
return (ENOENT);
55525554
}
55535555
if (vdev_prop_index_to_string(prop, intval,
55545556
(const char **)&strval) != 0)
@@ -5718,8 +5720,16 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
57185720
nvlist_free(nvl);
57195721
nvlist_free(outnvl);
57205722

5721-
if (ret)
5722-
(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
5723+
if (ret) {
5724+
if (errno == ENOTSUP) {
5725+
zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
5726+
"property not supported for this vdev"));
5727+
(void) zfs_error(zhp->zpool_hdl, EZFS_PROPTYPE, errbuf);
5728+
} else {
5729+
(void) zpool_standard_error(zhp->zpool_hdl, errno,
5730+
errbuf);
5731+
}
5732+
}
57235733

57245734
return (ret);
57255735
}

0 commit comments

Comments
 (0)