Skip to content

Commit a7a144e

Browse files
authored
enforce arc_dnode_limit
The Linux kernel shrinker, when invoked in the context of the null/root memcg, does not scan dentry and inode caches added by a task running in a non-root memcg. For ZFS this means that the dnode cache routinely overflows, evicting valuable meta/data and putting additional memory pressure on the system. This patch restores zfs_prune_aliases as a fallback for when the kernel shrinker does nothing, enabling ZFS to actually free dnodes. Moreover, it (indirectly) calls arc_evict when dnode_size > dnode_limit. Reviewed-by: Rob Norris <[email protected]> Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Gionatan Danti <[email protected]> Closes #17487 Closes #17542
1 parent be1e991 commit a7a144e

File tree

3 files changed

+78
-11
lines changed

3 files changed

+78
-11
lines changed

include/sys/arc_impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -954,7 +954,7 @@ typedef struct arc_sums {
954954
wmsum_t arcstat_data_size;
955955
wmsum_t arcstat_metadata_size;
956956
wmsum_t arcstat_dbuf_size;
957-
wmsum_t arcstat_dnode_size;
957+
aggsum_t arcstat_dnode_size;
958958
wmsum_t arcstat_bonus_size;
959959
wmsum_t arcstat_l2_hits;
960960
wmsum_t arcstat_l2_misses;

module/os/linux/zfs/zfs_vfsops.c

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1216,6 +1216,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
12161216
return (error);
12171217
}
12181218

1219+
/*
1220+
* Dentry and inode caches referenced by a task in non-root memcg are
1221+
* not going to be scanned by the kernel-provided shrinker. So, if
1222+
* kernel prunes nothing, fall back to this manual walk to free dnodes.
1223+
* To avoid scanning the same znodes multiple times they are always rotated
1224+
* to the end of the z_all_znodes list. New znodes are inserted at the
1225+
* end of the list so we're always scanning the oldest znodes first.
1226+
*/
1227+
static int
1228+
zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
1229+
{
1230+
znode_t **zp_array, *zp;
1231+
int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
1232+
int objects = 0;
1233+
int i = 0, j = 0;
1234+
1235+
zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
1236+
1237+
mutex_enter(&zfsvfs->z_znodes_lock);
1238+
while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
1239+
1240+
if ((i++ > nr_to_scan) || (j >= max_array))
1241+
break;
1242+
1243+
ASSERT(list_link_active(&zp->z_link_node));
1244+
list_remove(&zfsvfs->z_all_znodes, zp);
1245+
list_insert_tail(&zfsvfs->z_all_znodes, zp);
1246+
1247+
/* Skip active znodes and .zfs entries */
1248+
if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
1249+
continue;
1250+
1251+
if (igrab(ZTOI(zp)) == NULL)
1252+
continue;
1253+
1254+
zp_array[j] = zp;
1255+
j++;
1256+
}
1257+
mutex_exit(&zfsvfs->z_znodes_lock);
1258+
1259+
for (i = 0; i < j; i++) {
1260+
zp = zp_array[i];
1261+
1262+
ASSERT3P(zp, !=, NULL);
1263+
d_prune_aliases(ZTOI(zp));
1264+
1265+
if (atomic_read(&ZTOI(zp)->i_count) == 1)
1266+
objects++;
1267+
1268+
zrele(zp);
1269+
}
1270+
1271+
vmem_free(zp_array, max_array * sizeof (znode_t *));
1272+
1273+
return (objects);
1274+
}
1275+
12191276
/*
12201277
* The ARC has requested that the filesystem drop entries from the dentry
12211278
* and inode caches. This can occur when the ARC needs to free meta data
@@ -1267,6 +1324,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
12671324
*objects = (*shrinker->scan_objects)(shrinker, &sc);
12681325
#endif
12691326

1327+
/*
1328+
* Fall back to zfs_prune_aliases if kernel's shrinker did nothing
1329+
* due to dentry and inode caches being referenced by a task running
1330+
* in non-root memcg.
1331+
*/
1332+
if (*objects == 0)
1333+
*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
1334+
12701335
zfs_exit(zfsvfs, FTAG);
12711336

12721337
dprintf_ds(zfsvfs->z_os->os_dsl_dataset,

module/zfs/arc.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
26312631
ARCSTAT_INCR(arcstat_bonus_size, space);
26322632
break;
26332633
case ARC_SPACE_DNODE:
2634-
ARCSTAT_INCR(arcstat_dnode_size, space);
2634+
aggsum_add(&arc_sums.arcstat_dnode_size, space);
26352635
break;
26362636
case ARC_SPACE_DBUF:
26372637
ARCSTAT_INCR(arcstat_dbuf_size, space);
@@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
26772677
ARCSTAT_INCR(arcstat_bonus_size, -space);
26782678
break;
26792679
case ARC_SPACE_DNODE:
2680-
ARCSTAT_INCR(arcstat_dnode_size, -space);
2680+
aggsum_add(&arc_sums.arcstat_dnode_size, -space);
26812681
break;
26822682
case ARC_SPACE_DBUF:
26832683
ARCSTAT_INCR(arcstat_dbuf_size, -space);
@@ -4490,7 +4490,7 @@ arc_evict(void)
44904490
* target is not evictable or if they go over arc_dnode_limit.
44914491
*/
44924492
int64_t prune = 0;
4493-
int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
4493+
int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
44944494
int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
44954495
+ zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
44964496
- zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
@@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
50825082
* in the ARC. In practice, that's in the tens of MB, which is low
50835083
* enough to be safe.
50845084
*/
5085-
int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
5085+
int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
50865086
zfs_max_recordsize;
5087+
int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
5088+
arc_dnode_limit;
50875089

50885090
/* Always allow at least one block of overflow. */
5089-
if (over < 0)
5091+
if (arc_over < 0 && dn_over <= 0)
50905092
return (ARC_OVF_NONE);
50915093

50925094
/* If we are under memory pressure, report severe overflow. */
@@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
50975099
int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
50985100
if (use_reserve)
50995101
overflow *= 3;
5100-
return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
5102+
return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
51015103
}
51025104

51035105
static abd_t *
@@ -7326,7 +7328,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
73267328
#if defined(COMPAT_FREEBSD11)
73277329
as->arcstat_other_size.value.ui64 =
73287330
wmsum_value(&arc_sums.arcstat_bonus_size) +
7329-
wmsum_value(&arc_sums.arcstat_dnode_size) +
7331+
aggsum_value(&arc_sums.arcstat_dnode_size) +
73307332
wmsum_value(&arc_sums.arcstat_dbuf_size);
73317333
#endif
73327334

@@ -7368,7 +7370,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
73687370
&as->arcstat_uncached_evictable_metadata);
73697371

73707372
as->arcstat_dnode_size.value.ui64 =
7371-
wmsum_value(&arc_sums.arcstat_dnode_size);
7373+
aggsum_value(&arc_sums.arcstat_dnode_size);
73727374
as->arcstat_bonus_size.value.ui64 =
73737375
wmsum_value(&arc_sums.arcstat_bonus_size);
73747376
as->arcstat_l2_hits.value.ui64 =
@@ -7738,7 +7740,7 @@ arc_state_init(void)
77387740
wmsum_init(&arc_sums.arcstat_data_size, 0);
77397741
wmsum_init(&arc_sums.arcstat_metadata_size, 0);
77407742
wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
7741-
wmsum_init(&arc_sums.arcstat_dnode_size, 0);
7743+
aggsum_init(&arc_sums.arcstat_dnode_size, 0);
77427744
wmsum_init(&arc_sums.arcstat_bonus_size, 0);
77437745
wmsum_init(&arc_sums.arcstat_l2_hits, 0);
77447746
wmsum_init(&arc_sums.arcstat_l2_misses, 0);
@@ -7897,7 +7899,7 @@ arc_state_fini(void)
78977899
wmsum_fini(&arc_sums.arcstat_data_size);
78987900
wmsum_fini(&arc_sums.arcstat_metadata_size);
78997901
wmsum_fini(&arc_sums.arcstat_dbuf_size);
7900-
wmsum_fini(&arc_sums.arcstat_dnode_size);
7902+
aggsum_fini(&arc_sums.arcstat_dnode_size);
79017903
wmsum_fini(&arc_sums.arcstat_bonus_size);
79027904
wmsum_fini(&arc_sums.arcstat_l2_hits);
79037905
wmsum_fini(&arc_sums.arcstat_l2_misses);

0 commit comments

Comments
 (0)