From a7333bda4273200a294a20954a197e7a6298a425 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Thu, 9 Oct 2025 01:00:44 -0400 Subject: [PATCH] Update dnode_next_offset_level to accept blkid instead of offset Currently this function uses L0 offsets which: 1. is hard to read since it maps offsets to blkid and back each call 2. necessitates dnode_next_block to handle edge cases at limits 3. makes it hard to tell if the traversal can loop infinitely Instead, update this and dnode_next_offset to work in (blkid, index). This way the blkid manipulations are clear, and it's also clear that the traversal always terminates since blkid goes one direction. I've also considered updating dnode_next_offset to operate on blkid. Callers use both patterns, so maybe another PR can split the cases? While here tidy up dnode_next_offset_level comments. Signed-off-by: Robert Evans --- module/zfs/dnode.c | 156 ++++++++++++++++++++------------------------- 1 file changed, 68 insertions(+), 88 deletions(-) diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index e88d394b5229..e0cc4a7e13e0 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2496,26 +2496,27 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) } /* - * Scans a block at the indicated "level" looking for a hole or data, - * depending on 'flags'. + * Scans the block at the indicated "level" looking for a hole or data, + * depending on 'flags' starting from array position given by *index. * - * If level > 0, then we are scanning an indirect block looking at its - * pointers. If level == 0, then we are looking at a block of dnodes. + * If lvl > 0, then we are scanning an indirect block looking at its + * pointers. If lvl == 0, then we are looking at a block of dnodes. * * If we don't find what we are looking for in the block, we return ESRCH. - * Otherwise, return with *offset pointing to the beginning (if searching - * forwards) or end (if searching backwards) of the range covered by the - * block pointer we matched on (or dnode). + * Otherwise, return with *index set to the matching array position. * - * The basic search algorithm used below by dnode_next_offset() is to - * use this function to search up the block tree (widen the search) until - * we find something (i.e., we don't return ESRCH) and then search back - * down the tree (narrow the search) until we reach our original search - * level. + * In both cases, *offset is updated to point at the matched BP/dnode or + * the next offset to search (unless at the limit of possible offsets). + * + * The basic search algorithm used below by dnode_next_offset() uses this + * function to perform a block-order tree traversal. We search up the block + * tree (widen the search) until we find something (i.e., we don't return + * ESRCH) and then search back down the tree (narrow the search) until we + * reach our original search level or backtrack up because nothing matches. */ static int -dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - int lvl, uint64_t blkfill, uint64_t txg) +dnode_next_offset_level(dnode_t *dn, int flags, int lvl, uint64_t blkid, + int *index, uint64_t blkfill, uint64_t txg, uint64_t *offset) { dmu_buf_impl_t *db = NULL; void *data = NULL; @@ -2541,20 +2542,12 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock, RW_READER, FTAG); } else { - uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); if (hole) return (0); - /* - * This can only happen when we are searching up - * the block tree for data. We don't really need to - * adjust the offset, as we will just end up looking - * at the pointer to this block in its parent, and its - * going to be unallocated, so we will skip over it. - */ return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, @@ -2582,8 +2575,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, ASSERT(dn->dn_type == DMU_OT_DNODE); ASSERT(!(flags & DNODE_FIND_BACKWARDS)); - for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); - i < blkfill; i += dnp[i].dn_extra_slots + 1) { + for (i = *index; i < blkfill; i += dnp[i].dn_extra_slots + 1) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; } @@ -2591,11 +2583,11 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, if (i == blkfill) error = SET_ERROR(ESRCH); + *index = i; *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + (i << DNODE_SHIFT); } else { blkptr_t *bp = data; - uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); @@ -2605,40 +2597,27 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, else minfill++; - if (span >= 8 * sizeof (*offset)) { - /* This only happens on the highest indirection level */ - ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1); - *offset = 0; - } else { - *offset = *offset >> span; - } - - for (i = BF64_GET(*offset, 0, epbs); - i >= 0 && i < epb; i += inc) { + for (i = *index; i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg)) break; - if (inc > 0 || *offset > 0) - *offset += inc; } - if (span >= 8 * sizeof (*offset)) { - *offset = start; - } else { - *offset = *offset << span; - } - - if (inc < 0) { - /* traversing backwards; position offset at the end */ - if (span < 8 * sizeof (*offset)) - *offset = MIN(*offset + (1ULL << span) - 1, - start); - } else if (*offset < start) { - *offset = start; - } if (i < 0 || i >= epb) error = SET_ERROR(ESRCH); + + *index = i; + if (span < 8 * sizeof (*offset)) { + uint64_t nblk = blkid << epbs; + if (i >= 0 || blkid != 0) + nblk += i; + if ((nblk >> (8 * sizeof (*offset) - span)) == 0) + *offset = (flags & DNODE_FIND_BACKWARDS) ? + /* backwards: position offset at the end */ + MIN(*offset, ((nblk + 1) << span) - 1) : + MAX(*offset, nblk << span); + } } if (db != NULL) { @@ -2655,32 +2634,6 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, return (error); } -/* - * Adjust *offset to the next (or previous) block byte offset at lvl. - * Returns FALSE if *offset would overflow or underflow. - */ -static boolean_t -dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl) -{ - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - int span = lvl * epbs + dn->dn_datablkshift; - uint64_t blkid, maxblkid; - - if (span >= 8 * sizeof (uint64_t)) - return (B_FALSE); - - blkid = *offset >> span; - maxblkid = 1ULL << (8 * sizeof (*offset) - span); - if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid) - *offset = (blkid + 1) << span; - else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0) - *offset = (blkid << span) - 1; - else - return (B_FALSE); - - return (B_TRUE); -} - /* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find @@ -2688,6 +2641,18 @@ dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl) * DNODES_PER_BLOCK for the meta dnode, and some fraction of * DNODES_PER_BLOCK when searching for sparse regions thereof. * + * If minlvl == 0, this searches for dnodes or unallocated dnodes. + * If found, *offset points to the first offset of the matched dnode. + * Backwards search is not allowed for dnodes. + * + * If minlvl > 0, this searches for blocks at the given level. + * If found, *offset points to the first L0 offset of the block + * (or for backwards search, the last offset, inclusive). + * + * If not found, in both cases, *offset is set to the first (or last) + * offset of the unallocated indirect block where the search ended or + * the initial offset if no such block was encountered. + * * Examples: * * dnode_next_offset(dn, flags, offset, 1, 1, 0); @@ -2708,7 +2673,8 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { - uint64_t matched = *offset; + uint64_t blkid; + int index, epbs; int lvl, maxlvl; int error = 0; @@ -2730,18 +2696,31 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, goto out; } + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; maxlvl = dn->dn_phys->dn_nlevels; + if (minlvl > 0) { + uint64_t n = dbuf_whichblock(dn, minlvl - 1, *offset); + blkid = n >> epbs; + index = BF64_GET(n, 0, epbs); + } else { + blkid = dbuf_whichblock(dn, 0, *offset); + index = (*offset >> DNODE_SHIFT) & (blkfill - 1); + ASSERT3U(BF64_GET(*offset, 0, DNODE_SHIFT), ==, 0); + } + for (lvl = minlvl; lvl <= maxlvl; ) { error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); + flags, lvl, blkid, &index, blkfill, txg, offset); + if (error == 0 && lvl > minlvl) { + /* Continue search at matched block in lvl-1. */ + blkid = (blkid << epbs) + index; + index = 0; --lvl; - matched = *offset; - } else if (error == ESRCH && lvl < maxlvl && - dnode_next_block(dn, flags, &matched, lvl)) { + } else if (error == ESRCH && lvl < maxlvl) { /* - * Continue search at next/prev offset in lvl+1 block. + * Continue search at next/prev index in lvl+1 block. * * Usually we only search upwards at the start of the * search as higher level blocks point at a matching @@ -2752,13 +2731,14 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, * happens if we are still syncing out the tree, and * some BP's at higher levels are not updated yet. * - * We must adjust offset to avoid coming back to the - * same offset and getting stuck looping forever. This - * also deals with the case where offset is already at - * the beginning or end of the object. + * We must adjust index to avoid coming back to the + * same offset and getting stuck looping forever. The + * next loop goes up again if index is -1 or (1<> epbs; ++lvl; - *offset = matched; } else { break; }