Skip to content

Commit 13f5a08

Browse files
committed
zpool: Allow lockless zpool status
Add a new `ZPOOL_LOCK_BEHAVIOR` envvar to control `zpool status` lock behavior. `ZPOOL_LOCK_BEHAVIOR` can have one of these values: "lockless": Try for a short amount of time to get the spa_namespace lock. If that doesn't work, then do the zpool status locklessly. This is dangerous and can crash your system if the pool configs are being modified while zpool status is running. This setting requires `zpool status` to be run as root. "trylock": Try for a short amount of time to get the spa_namespace lock. If that doesn't work then simply abort 'zpool status'. "wait": Wait forever for the lock. This is the default. These options allow users to view the zpool status when the pool gets stuck while holding the spa_namespace lock. Signed-off-by: Tony Hutter <[email protected]>
1 parent b2196fb commit 13f5a08

File tree

27 files changed

+697
-64
lines changed

27 files changed

+697
-64
lines changed

cmd/zdb/zdb.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7815,7 +7815,8 @@ import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa,
78157815

78167816
if (cfg == NULL) {
78177817
zdb_set_skip_mmp(poolname);
7818-
error = spa_get_stats(poolname, &cfg, NULL, 0);
7818+
error = spa_get_stats(poolname, &cfg, NULL, 0,
7819+
ZPOOL_LOCK_BEHAVIOR_DEFAULT);
78197820
if (error != 0) {
78207821
fatal("Tried to read config of pool \"%s\" but "
78217822
"spa_get_stats() failed with error %d\n",

cmd/zpool/zpool_iter.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,
118118
boolean_t literal, int *err)
119119
{
120120
zpool_list_t *zlp;
121+
int rc;
121122

122123
zlp = safe_malloc(sizeof (zpool_list_t));
123124

@@ -137,7 +138,11 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,
137138
zlp->zl_literal = literal;
138139

139140
if (argc == 0) {
140-
(void) zpool_iter(g_zfs, add_pool, zlp);
141+
rc = zpool_iter(g_zfs, add_pool, zlp);
142+
if (rc != 0) {
143+
free(zlp);
144+
return (NULL);
145+
}
141146
zlp->zl_findall = B_TRUE;
142147
} else {
143148
int i;

cmd/zpool/zpool_main.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11048,6 +11048,32 @@ status_callback(zpool_handle_t *zhp, void *data)
1104811048
return (0);
1104911049
}
1105011050

11051+
/*
11052+
* Set the zpool status lock behavior based off of the ZPOOL_LOCK_BEHAVIOR
11053+
* envvar. If the var is not set, or an unknown value, then set the lock
11054+
* behavior to ZPOOL_LOCK_BEHAVIOR_DEFAULT.
11055+
*/
11056+
static void
11057+
zpool_set_lock_behavior(void)
11058+
{
11059+
char *str;
11060+
zpool_lock_behavior_t zpool_lock_behavior;
11061+
11062+
str = getenv("ZPOOL_LOCK_BEHAVIOR");
11063+
if (str == NULL)
11064+
zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT;
11065+
else if (strcmp(str, "wait") == 0)
11066+
zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_WAIT;
11067+
else if (strcmp(str, "trylock") == 0)
11068+
zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_TRYLOCK;
11069+
else if (strcmp(str, "lockless") == 0)
11070+
zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_LOCKLESS;
11071+
else
11072+
zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT;
11073+
11074+
libzfs_set_lock_behavior(g_zfs, zpool_lock_behavior);
11075+
}
11076+
1105111077
/*
1105211078
* zpool status [-dDegiLpPstvx] [-c [script1,script2,...]] ...
1105311079
* [-j|--json [--json-flat-vdevs] [--json-int] ...
@@ -11223,6 +11249,8 @@ zpool_do_status(int argc, char **argv)
1122311249
usage(B_FALSE);
1122411250
}
1122511251

11252+
zpool_set_lock_behavior();
11253+
1122611254
for (;;) {
1122711255
if (cb.cb_json) {
1122811256
cb.cb_jsobj = zpool_json_schema(0, 1);

include/libzfs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,8 @@ _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
265265
nvlist_t *, nvlist_t *);
266266
_LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *);
267267
_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift);
268+
_LIBZFS_H void libzfs_set_lock_behavior(libzfs_handle_t *,
269+
zpool_lock_behavior_t);
268270

269271
typedef struct splitflags {
270272
/* do not split, but return the config that would be split off */

include/os/freebsd/spl/sys/mutex.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,19 @@ typedef enum {
7272
#define mutex_owned(lock) sx_xlocked(lock)
7373
#define mutex_owner(lock) sx_xholder(lock)
7474

75+
/*
76+
* Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex
77+
* for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ETIME
78+
* if timeout occurred.
79+
*/
80+
static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns)
81+
{
82+
hrtime_t end = gethrtime() + ns;
83+
while (gethrtime() < end) {
84+
if (mutex_tryenter(mutex))
85+
return (0); /* success */
86+
}
87+
return (ETIME);
88+
}
89+
7590
#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */

include/os/linux/spl/sys/mutex.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#define _SPL_MUTEX_H
2727

2828
#include <sys/types.h>
29+
#include <sys/time.h>
2930
#include <linux/sched.h>
3031
#include <linux/mutex.h>
3132
#include <linux/lockdep.h>
@@ -187,4 +188,19 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \
187188
/* NOTE: do not dereference mp after this point */ \
188189
}
189190

191+
/*
192+
* Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex
193+
* for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ETIME
194+
* if timeout occurred.
195+
*/
196+
static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns)
197+
{
198+
hrtime_t end = gethrtime() + ns;
199+
while (gethrtime() < end) {
200+
if (mutex_tryenter(mutex))
201+
return (0); /* success */
202+
}
203+
return (ETIME);
204+
}
205+
190206
#endif /* _SPL_MUTEX_H */

include/sys/fs/zfs.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,9 @@ typedef struct zpool_load_policy {
874874
#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats"
875875
#define ZPOOL_CONFIG_COMPATIBILITY "compatibility"
876876

877+
/* ZFS_IOC_POOL_STATS argument for spa_namespace locking behavior */
878+
#define ZPOOL_CONFIG_LOCK_BEHAVIOR "lock_behavior" /* not stored on disk */
879+
877880
/*
878881
* The persistent vdev state is stored as separate values rather than a single
879882
* 'vdev_state' entry. This is because a device can be in multiple states, such
@@ -1999,6 +2002,30 @@ enum zio_encrypt {
19992002
ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \
20002003
ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name))
20012004

2005+
/*
 * Locking behavior selectable for zpool commands.
 */
typedef enum {
	/* Wait to acquire the lock on the zpool config. */
	ZPOOL_LOCK_BEHAVIOR_WAIT = 0,

	/*
	 * Return an error if acquiring the lock on the pool config is
	 * taking an unnecessarily long time (default 100ms).
	 */
	ZPOOL_LOCK_BEHAVIOR_TRYLOCK = 1,

	/*
	 * DANGER: THIS CAN CRASH YOUR SYSTEM
	 *
	 * If the pool config lock can't be acquired after 100ms then do a
	 * lockless lookup.  This should only be done in emergencies, as it
	 * can crash the kernel module!
	 */
	ZPOOL_LOCK_BEHAVIOR_LOCKLESS = 2,

	/* Last entry marker. */
	ZPOOL_LOCK_BEHAVIOR_END = 3,

	/* The default behavior is an alias for ZPOOL_LOCK_BEHAVIOR_WAIT. */
	ZPOOL_LOCK_BEHAVIOR_DEFAULT = ZPOOL_LOCK_BEHAVIOR_WAIT
} zpool_lock_behavior_t;
2028+
20022029
#ifdef __cplusplus
20032030
}
20042031
#endif

include/sys/spa.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -772,10 +772,13 @@ typedef enum trim_type {
772772

773773
/* state manipulation functions */
774774
extern int spa_open(const char *pool, spa_t **, const void *tag);
775+
extern int spa_open_common_lock_behavior(const char *pool, spa_t **spapp,
776+
const void *tag, nvlist_t *nvpolicy, nvlist_t **config,
777+
zpool_lock_behavior_t zpool_lock_behavior);
775778
extern int spa_open_rewind(const char *pool, spa_t **, const void *tag,
776779
nvlist_t *policy, nvlist_t **config);
777780
extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
778-
size_t buflen);
781+
size_t buflen, zpool_lock_behavior_t zpool_lock_behavior);
779782
extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
780783
nvlist_t *zplprops, struct dsl_crypto_params *dcp);
781784
extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
@@ -880,10 +883,13 @@ extern kcondvar_t spa_namespace_cv;
880883
#define SPA_CONFIG_UPDATE_VDEVS 1
881884

882885
extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t);
883-
extern int spa_all_configs(uint64_t *generation, nvlist_t **pools);
886+
extern int spa_all_configs(uint64_t *generation, nvlist_t **pools,
887+
zpool_lock_behavior_t zpool_lock_behavior);
884888
extern void spa_config_set(spa_t *spa, nvlist_t *config);
885889
extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
886890
int getstats);
891+
extern nvlist_t *spa_config_generate_lock_behavior(spa_t *spa, vdev_t *vd,
892+
uint64_t txg, int getstats, zpool_lock_behavior_t zpool_lock_behavior);
887893
extern void spa_config_update(spa_t *spa, int what);
888894
extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,
889895
vdev_t *parent, uint_t id, int atype);
@@ -895,9 +901,11 @@ extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,
895901

896902
/* Namespace manipulation */
897903
extern spa_t *spa_lookup(const char *name);
904+
extern spa_t *spa_lookup_lockless(const char *name);
898905
extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
899906
extern void spa_remove(spa_t *spa);
900907
extern spa_t *spa_next(spa_t *prev);
908+
extern spa_t *spa_next_lockless(spa_t *prev);
901909

902910
/* Refcount functions */
903911
extern void spa_open_ref(spa_t *spa, const void *tag);

include/sys/spa_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,7 @@ extern void spa_set_deadman_synctime(hrtime_t ns);
500500
extern void spa_set_deadman_ziotime(hrtime_t ns);
501501
extern const char *spa_history_zone(void);
502502
extern const char *zfs_active_allocator;
503+
extern unsigned int spa_namespace_trylock_ms;
503504
extern int param_set_active_allocator_common(const char *val);
504505

505506
#ifdef __cplusplus

include/sys/zfs_context.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ extern void mutex_enter(kmutex_t *mp);
270270
extern int mutex_enter_check_return(kmutex_t *mp);
271271
extern void mutex_exit(kmutex_t *mp);
272272
extern int mutex_tryenter(kmutex_t *mp);
273+
extern int mutex_enter_timeout(kmutex_t *mp, uint64_t ns);
273274

274275
#define NESTED_SINGLE 1
275276
#define mutex_enter_nested(mp, class) mutex_enter(mp)

0 commit comments

Comments
 (0)