Skip to content

Commit 6b27138

Browse files
committed
Update dnode_next_offset_level to accept blkid instead of offset
Currently this function uses L0 offsets, which:
1. is hard to read since it maps offsets to blkid and back each call
2. necessitates dnode_next_block to handle edge cases at limits
3. makes it hard to tell if the traversal can loop infinitely

Instead, update this and dnode_next_offset to work in (blkid, index). This way the blkid manipulations are clear, and it's also clear that the traversal always terminates since blkid goes one direction.

I've also considered updating dnode_next_offset to operate on blkid. Callers use both patterns, so maybe another PR can split the cases?

While here, tidy up dnode_next_offset_level comments.

Signed-off-by: Robert Evans <[email protected]>
1 parent aecd6de commit 6b27138

File tree

1 file changed

+66
-95
lines changed

1 file changed

+66
-95
lines changed

module/zfs/dnode.c

Lines changed: 66 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -2496,34 +2496,32 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
24962496
}
24972497

24982498
/*
2499-
* Scans a block at the indicated "level" looking for a hole or data,
2500-
* depending on 'flags'.
2499+
* Scans the block at the indicated "level" looking for a hole or data,
2500+
* depending on 'flags', starting from the array position given by *index.
25012501
*
2502-
* If level > 0, then we are scanning an indirect block looking at its
2503-
* pointers. If level == 0, then we are looking at a block of dnodes.
2502+
* If lvl > 0, then we are scanning an indirect block looking at its
2503+
* pointers. If lvl == 0, then we are looking at a block of dnodes.
25042504
*
25052505
* If we don't find what we are looking for in the block, we return ESRCH.
2506-
* Otherwise, return with *offset pointing to the beginning (if searching
2507-
* forwards) or end (if searching backwards) of the range covered by the
2508-
* block pointer we matched on (or dnode).
2506+
* Otherwise, return with *index set to the matching array position.
25092507
*
2510-
* The basic search algorithm used below by dnode_next_offset() is to
2511-
* use this function to search up the block tree (widen the search) until
2512-
* we find something (i.e., we don't return ESRCH) and then search back
2513-
* down the tree (narrow the search) until we reach our original search
2514-
* level.
2508+
* The basic search algorithm used below by dnode_next_offset() uses this
2509+
* function to perform a block-order tree traversal. We search up the block
2510+
* tree (widen the search) until we find something (i.e., we don't return
2511+
* ESRCH) and then search back down the tree (narrow the search) until we
2512+
* reach our original search level or backtrack up because nothing matches.
25152513
*/
25162514
static int
2517-
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
2518-
int lvl, uint64_t blkfill, uint64_t txg)
2515+
dnode_next_offset_level(dnode_t *dn, int flags, int lvl, uint64_t blkid,
2516+
int *index, uint64_t blkfill, uint64_t txg)
25192517
{
25202518
dmu_buf_impl_t *db = NULL;
25212519
void *data = NULL;
25222520
uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
25232521
uint64_t epb = 1ULL << epbs;
25242522
uint64_t minfill, maxfill;
25252523
boolean_t hole;
2526-
int i, inc, error, span;
2524+
int i = *index, inc, error;
25272525

25282526
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
25292527

@@ -2541,20 +2539,13 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
25412539
rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
25422540
RW_READER, FTAG);
25432541
} else {
2544-
uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
25452542
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
25462543
if (error) {
25472544
if (error != ENOENT)
25482545
return (error);
25492546
if (hole)
25502547
return (0);
2551-
/*
2552-
* This can only happen when we are searching up
2553-
* the block tree for data. We don't really need to
2554-
* adjust the offset, as we will just end up looking
2555-
* at the pointer to this block in its parent, and its
2556-
* going to be unallocated, so we will skip over it.
2557-
*/
2548+
/* Unallocated; see comment in dnode_next_offset. */
25582549
return (SET_ERROR(ESRCH));
25592550
}
25602551
error = dbuf_read(db, NULL,
@@ -2582,21 +2573,15 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
25822573
ASSERT(dn->dn_type == DMU_OT_DNODE);
25832574
ASSERT(!(flags & DNODE_FIND_BACKWARDS));
25842575

2585-
for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
2586-
i < blkfill; i += dnp[i].dn_extra_slots + 1) {
2576+
for (; i < blkfill; i += dnp[i].dn_extra_slots + 1) {
25872577
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
25882578
break;
25892579
}
25902580

2591-
if (i == blkfill)
2581+
if (i >= blkfill)
25922582
error = SET_ERROR(ESRCH);
2593-
2594-
*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
2595-
(i << DNODE_SHIFT);
25962583
} else {
25972584
blkptr_t *bp = data;
2598-
uint64_t start = *offset;
2599-
span = (lvl - 1) * epbs + dn->dn_datablkshift;
26002585
minfill = 0;
26012586
maxfill = blkfill << ((lvl - 1) * epbs);
26022587

@@ -2605,38 +2590,13 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
26052590
else
26062591
minfill++;
26072592

2608-
if (span >= 8 * sizeof (*offset)) {
2609-
/* This only happens on the highest indirection level */
2610-
ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
2611-
*offset = 0;
2612-
} else {
2613-
*offset = *offset >> span;
2614-
}
2615-
2616-
for (i = BF64_GET(*offset, 0, epbs);
2617-
i >= 0 && i < epb; i += inc) {
2593+
for (; i >= 0 && i < epb; i += inc) {
26182594
if (BP_GET_FILL(&bp[i]) >= minfill &&
26192595
BP_GET_FILL(&bp[i]) <= maxfill &&
26202596
(hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
26212597
break;
2622-
if (inc > 0 || *offset > 0)
2623-
*offset += inc;
26242598
}
26252599

2626-
if (span >= 8 * sizeof (*offset)) {
2627-
*offset = start;
2628-
} else {
2629-
*offset = *offset << span;
2630-
}
2631-
2632-
if (inc < 0) {
2633-
/* traversing backwards; position offset at the end */
2634-
if (span < 8 * sizeof (*offset))
2635-
*offset = MIN(*offset + (1ULL << span) - 1,
2636-
start);
2637-
} else if (*offset < start) {
2638-
*offset = start;
2639-
}
26402600
if (i < 0 || i >= epb)
26412601
error = SET_ERROR(ESRCH);
26422602
}
@@ -2652,35 +2612,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
26522612
FTAG);
26532613
}
26542614

2615+
*index = i;
26552616
return (error);
26562617
}
26572618

2658-
/*
2659-
* Adjust *offset to the next (or previous) block byte offset at lvl.
2660-
* Returns FALSE if *offset would overflow or underflow.
2661-
*/
2662-
static boolean_t
2663-
dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
2664-
{
2665-
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2666-
int span = lvl * epbs + dn->dn_datablkshift;
2667-
uint64_t blkid, maxblkid;
2668-
2669-
if (span >= 8 * sizeof (uint64_t))
2670-
return (B_FALSE);
2671-
2672-
blkid = *offset >> span;
2673-
maxblkid = 1ULL << (8 * sizeof (*offset) - span);
2674-
if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid)
2675-
*offset = (blkid + 1) << span;
2676-
else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
2677-
*offset = (blkid << span) - 1;
2678-
else
2679-
return (B_FALSE);
2680-
2681-
return (B_TRUE);
2682-
}
2683-
26842619
/*
26852620
* Find the next hole, data, or sparse region at or after *offset.
26862621
* The value 'blkfill' tells us how many items we expect to find
@@ -2708,9 +2643,11 @@ int
27082643
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
27092644
int minlvl, uint64_t blkfill, uint64_t txg)
27102645
{
2711-
uint64_t matched = *offset;
2646+
uint64_t blkid;
2647+
int index;
27122648
int lvl, maxlvl;
27132649
int error = 0;
2650+
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
27142651

27152652
if (!(flags & DNODE_FIND_HAVELOCK))
27162653
rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -2730,18 +2667,29 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
27302667
goto out;
27312668
}
27322669

2670+
if (minlvl > 0) {
2671+
uint64_t n = dbuf_whichblock(dn, minlvl - 1, *offset);
2672+
blkid = n >> epbs;
2673+
index = BF64_GET(n, 0, epbs);
2674+
} else {
2675+
blkid = dbuf_whichblock(dn, 0, *offset);
2676+
index = (*offset >> DNODE_SHIFT) & (blkfill - 1);
2677+
}
2678+
27332679
maxlvl = dn->dn_phys->dn_nlevels;
27342680

27352681
for (lvl = minlvl; lvl <= maxlvl; ) {
27362682
error = dnode_next_offset_level(dn,
2737-
flags, offset, lvl, blkfill, txg);
2683+
flags, lvl, blkid, &index, blkfill, txg);
27382684
if (error == 0 && lvl > minlvl) {
2685+
/* Continue search at matched block in lvl-1. */
2686+
blkid = (blkid << epbs) + index;
2687+
index = 0;
27392688
--lvl;
2740-
matched = *offset;
2741-
} else if (error == ESRCH && lvl < maxlvl &&
2742-
dnode_next_block(dn, flags, &matched, lvl)) {
2689+
} else if (error == ESRCH && lvl < maxlvl) {
27432690
/*
2744-
* Continue search at next/prev offset in lvl+1 block.
2691+
* Continue search at next/prev blkid in lvl+1 block,
2692+
* but stop if blkid would underflow or overflow.
27452693
*
27462694
* Usually we only search upwards at the start of the
27472695
* search as higher level blocks point at a matching
@@ -2751,14 +2699,19 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
27512699
* contains only BPs/dnodes freed at that txg. It also
27522700
* happens if we are still syncing out the tree, and
27532701
* some BP's at higher levels are not updated yet.
2754-
*
2755-
* We must adjust offset to avoid coming back to the
2756-
* same offset and getting stuck looping forever. This
2757-
* also deals with the case where offset is already at
2758-
* the beginning or end of the object.
27592702
*/
2703+
if (flags & DNODE_FIND_BACKWARDS) {
2704+
if (blkid == 0)
2705+
break;
2706+
--blkid;
2707+
} else {
2708+
if (blkid == UINT64_MAX)
2709+
break;
2710+
++blkid;
2711+
}
2712+
index = BF64_GET(blkid, 0, epbs);
2713+
blkid = blkid >> epbs;
27602714
++lvl;
2761-
*offset = matched;
27622715
} else {
27632716
break;
27642717
}
@@ -2773,6 +2726,24 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
27732726
error = 0;
27742727
}
27752728

2729+
if (lvl > 0) {
2730+
uint64_t n = blkid << epbs;
2731+
if (index > 0 || n > 0)
2732+
n += index; /* -1 <= index <= 1<<epbs */
2733+
2734+
int span = (lvl - 1) * epbs + dn->dn_datablkshift;
2735+
if (span >= 8 * sizeof (uint64_t))
2736+
*offset = 0;
2737+
else if (flags & DNODE_FIND_BACKWARDS)
2738+
/* traversing backwards; position at block end */
2739+
*offset = MIN(*offset, ((n + 1) << span) - 1);
2740+
else
2741+
*offset = MAX(*offset, n << span);
2742+
} else {
2743+
*offset = (blkid << dn->dn_datablkshift) +
2744+
(index << DNODE_SHIFT); /* 0 <= index <= blkfill */
2745+
}
2746+
27762747
out:
27772748
if (!(flags & DNODE_FIND_HAVELOCK))
27782749
rw_exit(&dn->dn_struct_rwlock);

0 commit comments

Comments
 (0)