Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions include/os/freebsd/spl/sys/time.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ typedef longlong_t hrtime_t;
#define SEC_TO_TICK(sec) ((sec) * hz)
#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))

static __inline hrtime_t
getlrtime(void)
{
struct timespec ts;
hrtime_t nsec;

getnanouptime(&ts);
nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec;
return (nsec);
}

static __inline hrtime_t
gethrtime(void)
{
Expand Down
8 changes: 8 additions & 0 deletions include/os/linux/spl/sys/time.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ gethrestime_sec(void)
return (ts.tv_sec);
}

static inline hrtime_t
getlrtime(void)
{
inode_timespec_t ts;
ktime_get_coarse_ts64(&ts);
return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec);
}

static inline hrtime_t
gethrtime(void)
{
Expand Down
1 change: 1 addition & 0 deletions include/sys/fm/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ extern "C" {
#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure"
#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay"
#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write"
#define FM_EREPORT_ZFS_SITOUT "sitout"

#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode"
Expand Down
3 changes: 3 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,8 @@ typedef enum {
VDEV_PROP_TRIM_SUPPORT,
VDEV_PROP_TRIM_ERRORS,
VDEV_PROP_SLOW_IOS,
VDEV_PROP_SIT_OUT,
VDEV_PROP_AUTOSIT,
VDEV_NUM_PROPS
} vdev_prop_t;

Expand Down Expand Up @@ -1673,6 +1675,7 @@ typedef enum {
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH,
ZFS_ERR_STREAM_LARGE_MICROZAP,
ZFS_ERR_TOO_MANY_SITOUTS,
} zfs_errno_t;

/*
Expand Down
6 changes: 6 additions & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,10 +279,12 @@ struct vdev {
uint64_t vdev_noalloc; /* device is passivated? */
uint64_t vdev_removing; /* device is being removed? */
uint64_t vdev_failfast; /* device failfast setting */
boolean_t vdev_autosit; /* automatic sitout management */
boolean_t vdev_rz_expanding; /* raidz is being expanded? */
boolean_t vdev_ishole; /* is a hole in the namespace */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
uint64_t vdev_last_latency_check;

/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
Expand Down Expand Up @@ -431,6 +433,10 @@ struct vdev {
hrtime_t vdev_mmp_pending; /* 0 if write finished */
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
uint64_t vdev_expansion_time; /* vdev's last expansion time */
/* used to calculate average read latency */
uint64_t *vdev_prev_histo;
int64_t vdev_outlier_count; /* read outlier amongst peers */
hrtime_t vdev_read_sit_out_expire; /* end of sit out period */
list_node_t vdev_leaf_node; /* leaf vdev list */

/*
Expand Down
3 changes: 3 additions & 0 deletions include/sys/vdev_raidz.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
void vdev_raidz_reflow_copy_scratch(spa_t *);
void raidz_dtl_reassessed(vdev_t *);
boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);
void vdev_raidz_sit_child(vdev_t *, uint64_t);
void vdev_raidz_unsit_child(vdev_t *);

extern const zio_vsd_ops_t vdev_raidz_vsd_ops;

Expand Down
2 changes: 2 additions & 0 deletions include/sys/vdev_raidz_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ typedef struct raidz_col {
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
uint8_t rc_force_repair:1; /* Write good data to this column */
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
int rc_shadow_devidx; /* for double write during expansion */
int rc_shadow_error; /* for double write during expansion */
uint64_t rc_shadow_offset; /* for double write during expansion */
Expand All @@ -133,6 +134,7 @@ typedef struct raidz_row {
int rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
int rr_outlier_cnt; /* Count of latency outlier devices */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */
Expand Down
9 changes: 9 additions & 0 deletions lib/libspl/include/sys/time.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,15 @@ gethrestime_sec(void)
return (tv.tv_sec);
}

/*
 * Return the current gettimeofday() reading as a nanosecond count.
 * The seconds are scaled by NANOSEC and the microseconds by NSEC_PER_USEC
 * before the two are summed.
 * NOTE(review): gettimeofday() is wall-clock time and may step if the
 * system clock is adjusted -- confirm callers tolerate that.
 */
static inline hrtime_t
getlrtime(void)
{
	struct timeval now;
	uint64_t sec_ns, usec_ns;

	/* The return value of gettimeofday() is deliberately ignored. */
	(void) gettimeofday(&now, NULL);

	sec_ns = (uint64_t)now.tv_sec * NANOSEC;
	usec_ns = (uint64_t)now.tv_usec * NSEC_PER_USEC;

	return (sec_ns + usec_ns);
}

static inline hrtime_t
gethrtime(void)
{
Expand Down
4 changes: 3 additions & 1 deletion lib/libzfs/libzfs.abi
Original file line number Diff line number Diff line change
Expand Up @@ -6117,7 +6117,9 @@
<enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/>
<enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/>
<enumerator name='VDEV_PROP_SLOW_IOS' value='51'/>
<enumerator name='VDEV_NUM_PROPS' value='52'/>
<enumerator name='VDEV_PROP_SIT_OUT' value='52'/>
<enumerator name='VDEV_PROP_AUTOSIT' value='53'/>
<enumerator name='VDEV_NUM_PROPS' value='54'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
Expand Down
14 changes: 12 additions & 2 deletions lib/libzfs/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -5549,6 +5549,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
/* Only use if provided by the RAIDZ VDEV above */
if (prop == VDEV_PROP_RAIDZ_EXPANDING)
return (ENOENT);
if (prop == VDEV_PROP_SIT_OUT)
return (ENOENT);
}
if (vdev_prop_index_to_string(prop, intval,
(const char **)&strval) != 0)
Expand Down Expand Up @@ -5718,8 +5720,16 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
nvlist_free(nvl);
nvlist_free(outnvl);

if (ret)
(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
if (ret) {
if (errno == ENOTSUP) {
zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
"property not supported for this vdev"));
(void) zfs_error(zhp->zpool_hdl, EZFS_PROPTYPE, errbuf);
} else {
(void) zpool_standard_error(zhp->zpool_hdl, errno,
errbuf);
}
}

return (ret);
}
Expand Down
5 changes: 5 additions & 0 deletions lib/libzfs/libzfs_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,11 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ZFS_ERR_ASHIFT_MISMATCH:
zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap);
break;
case ZFS_ERR_TOO_MANY_SITOUTS:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "too many disks "
"already sitting out"));
zfs_verror(hdl, EZFS_BUSY, fmt, ap);
break;
default:
zfs_error_aux(hdl, "%s", zfs_strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
Expand Down
37 changes: 37 additions & 0 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
.\"
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
Expand Down Expand Up @@ -601,6 +602,42 @@ new format when enabling the
feature.
The default is to convert all log entries.
.
.It Sy vdev_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
When a slow disk outlier is detected it is placed in a sit out state.
While sitting out the disk will not participate in normal reads, instead its
data will be reconstructed as needed from parity.
Scrub operations will always read from a disk, even if it's sitting out.
A number of disks in a RAID-Z or dRAID vdev may sit out at the same time, up
to the number of parity devices.
Writes will still be issued to a disk which is sitting out to maintain full
redundancy.
Defaults to 600 seconds.
A value of zero disables disk sit-outs in general, including slow disk outlier
detection.
.
.It Sy vdev_raidz_outlier_check_interval_ms Ns = Ns Sy 1000 Ns ms Po 1 sec Pc Pq ulong
How often each RAID-Z and dRAID vdev will check for slow disk outliers.
Increasing this interval will reduce the sensitivity of detection (since all
I/Os since the last check are included in the statistics), but will slow the
response to a disk developing a problem.
Defaults to once per second; setting extremely small values may cause negative
performance effects.
.
.It Sy vdev_raidz_outlier_insensitivity Ns = Ns Sy 50 Pq uint
When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is
used to determine how far out an outlier must be before it counts as an event
worth considering.
This is phrased as "insensitivity" because larger values result in fewer
detections.
Smaller values will result in more aggressive sitting out of disks that may have
problems, but may significantly increase the rate of spurious sit-outs.
.Pp
To provide a more technical definition of this parameter, this is the multiple
of the inter-quartile range (IQR) that is being used in a Tukey's Fence
detection algorithm.
This is much higher than a normal Tukey's Fence k-value, because the
distribution under consideration is probably an extreme-value distribution,
rather than a more typical Gaussian distribution.
.
.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
During top-level vdev removal, chunks of data are copied from the vdev
which may include free space in order to trade bandwidth for IOPS.
Expand Down
39 changes: 35 additions & 4 deletions man/man7/vdevprops.7
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
.\"
.\" CDDL HEADER END
.\"
.\" Copyright (c) 2021 Klara, Inc.
.\" Copyright (c) 2021, 2025, Klara, Inc.
.\"
.Dd July 23, 2024
.Dt VDEVPROPS 7
Expand Down Expand Up @@ -106,11 +106,17 @@ The number of children belonging to this vdev
.It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
The number of errors of each type encountered by this vdev
.It Sy slow_ios
The number of slow I/Os encountered by this vdev,
These represent I/O operations that didn't complete in
This indicates the number of slow I/O operations encountered by this vdev.
A slow I/O is defined as an operation that did not complete within the
.Sy zio_slow_io_ms
milliseconds
threshold in milliseconds
.Pq Sy 30000 No by default .
For
.Sy RAIDZ
and
.Sy DRAID
configurations, this value also represents the number of times the vdev was
identified as an outlier and excluded from participating in read I/O operations.
.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
The number of I/O operations of each type performed by this vdev
.It Xo
Expand Down Expand Up @@ -150,6 +156,31 @@ The amount of space to reserve for the EFI system partition
.It Sy failfast
If this device should propagate BIO errors back to ZFS, used to disable
failfast.
.It Sy sit_out
Only valid for
.Sy RAIDZ
and
.Sy DRAID
vdevs.
True when a slow disk outlier was detected and the vdev is currently in a sit
out state.
This property can be manually set to cause vdevs to sit out.
It will also be automatically set by the
.Sy autosit
logic if that is enabled.
While sitting out, the vdev will not participate in normal reads, instead its
data will be reconstructed as needed from parity.
.It Sy autosit
Only valid for
.Sy RAIDZ
and
.Sy DRAID
vdevs.
If set, this enables the kernel-level slow disk detection logic.
This logic automatically causes any vdevs that are significant negative
performance outliers to sit out, as described in the
.Sy sit_out
property.
.It Sy path
The path to the device for this vdev
.It Sy allocating
Expand Down
10 changes: 10 additions & 0 deletions man/man8/zpool-events.8
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,16 @@ Issued when a scrub is resumed on a pool.
.It Sy scrub.paused
Issued when a scrub is paused on a pool.
.It Sy bootfs.vdev.attach
.It Sy sitout
Issued when a
.Sy RAIDZ
or
.Sy DRAID
vdev triggers the
.Sy autosit
logic.
This logic detects when a disk in such a vdev is significantly slower than its
peers, and sits it out temporarily to preserve the performance of the pool.
.El
.
.Sh PAYLOADS
Expand Down
6 changes: 6 additions & 0 deletions module/zcommon/zpool_prop.c
Original file line number Diff line number Diff line change
Expand Up @@ -467,9 +467,15 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0,
PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table,
sfeatures);
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_AUTOSIT, "autosit", 0,
PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "AUTOSIT", boolean_table,
sfeatures);

/* default index properties */
zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,
Expand Down
6 changes: 3 additions & 3 deletions module/zfs/spa_misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -251,11 +251,11 @@ spa_mode_t spa_mode_global = SPA_MODE_UNINIT;

#ifdef ZFS_DEBUG
/*
* Everything except dprintf, set_error, spa, and indirect_remap is on
* by default in debug builds.
* Everything except dprintf, set_error, indirect_remap, and raidz_reconstruct
* is on by default in debug builds.
*/
int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR |
ZFS_DEBUG_INDIRECT_REMAP);
ZFS_DEBUG_INDIRECT_REMAP | ZFS_DEBUG_RAIDZ_RECONSTRUCT);
#else
int zfs_flags = 0;
#endif
Expand Down
Loading
Loading