@@ -269,11 +269,17 @@ type IterConfig struct {
269
269
TombstoneElision TombstoneElision
270
270
RangeKeyElision TombstoneElision
271
271
272
- // AllowZeroSeqNum allows the sequence number of KVs in the bottom snapshot
273
- // stripe to be simplified to 0 (which improves compression and enables an
274
- // optimization during forward iteration). This can be enabled if there are no
275
- // tables overlapping the output at lower levels (than the output) in the LSM.
276
- AllowZeroSeqNum bool
272
+ // IsBottommostDataLayer indicates that the compaction inputs form the
273
+ // bottommost layer of data for the compaction's key range. This allows the
274
+ // sequence number of KVs in the bottom snapshot stripe to be simplified to
275
+ // 0 (which improves compression and enables an optimization during forward
276
+ // iteration). This can be enabled if there are no tables overlapping the
277
+ // output at levels lower than the output level in the LSM.
278
+ //
279
+ // This field may be false even when nothing is overlapping in lower levels.
280
+ // At the time of writing, flushes always set this to false (because flushes
281
+ // almost never form the bottommost layer of data).
282
+ IsBottommostDataLayer bool
277
283
278
284
// IneffectualPointDeleteCallback is called if a SINGLEDEL is being elided
279
285
// without deleting a point set/merge. False positives are rare but possible
@@ -613,8 +619,24 @@ func (i *Iter) Next() *base.InternalKV {
613
619
}
614
620
var needDelete bool
615
621
if i .err == nil {
616
- // includesBase is true whenever we've transformed the MERGE record
617
- // into a SET.
622
+ // If this is the oldest version of this key (the bottommost
623
+ // snapshot stripe), we can transform the sequence number to
624
+ // zero. This can improve compression and enables an
625
+ // optimization during forward iteration to skip some key
626
+ // comparisons. Additionally, we can transform the key kind to
627
+ // SET so that iteration and future compactions do not need to
628
+ // invoke the user's Merge operator.
629
+ if i .isBottommostSnapshotStripe (origSnapshotIdx ) {
630
+ i .kv .K .SetSeqNum (base .SeqNumZero )
631
+ // During the merge (see mergeNext), we may have already
632
+ // transformed the key kind to SET or SETWITHDEL, in which case we want to preserve the existing key kind.
633
+ if i .kv .K .Kind () == base .InternalKeyKindMerge {
634
+ i .kv .K .SetKind (base .InternalKeyKindSet )
635
+ }
636
+ }
637
+
638
+ // includesBase is true when we've merged the oldest operand in
639
+ // the LSM.
618
640
var includesBase bool
619
641
switch i .kv .K .Kind () {
620
642
case base .InternalKeyKindSet , base .InternalKeyKindSetWithDelete :
@@ -633,8 +655,6 @@ func (i *Iter) Next() *base.InternalKV {
633
655
}
634
656
continue
635
657
}
636
-
637
- i .maybeZeroSeqnum (origSnapshotIdx )
638
658
return & i .kv
639
659
}
640
660
if i .err != nil {
@@ -799,7 +819,14 @@ func (i *Iter) setNext() {
799
819
// Save the current key.
800
820
i .saveKey ()
801
821
i .kv .V = i .iterKV .V
802
- i .maybeZeroSeqnum (i .curSnapshotIdx )
822
+
823
+ // If this is the oldest version of this key (the bottommost snapshot
824
+ // stripe), we can transform the sequence number to zero. This can improve
825
+ // compression and enables an optimization during forward iteration to skip
826
+ // some key comparisons.
827
+ if i .isBottommostSnapshotStripe (i .curSnapshotIdx ) {
828
+ i .kv .K .SetSeqNum (base .SeqNumZero )
829
+ }
803
830
804
831
// If this key is already a SETWITHDEL we can early return and skip the remaining
805
832
// records in the stripe:
@@ -1415,23 +1442,21 @@ func (i *Iter) lastRangeDelSpanFrontierReached(key []byte) []byte {
1415
1442
return nil
1416
1443
}
1417
1444
1418
- // maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing
1419
- // so improves compression and enables an optimization during forward iteration
1420
- // to skip some key comparisons. The seqnum for an entry can be zeroed if the
1421
- // entry is on the bottom snapshot stripe and on the bottom level of the LSM.
1422
- func (i * Iter ) maybeZeroSeqnum (snapshotIdx int ) {
1423
- if ! i .cfg .AllowZeroSeqNum {
1424
- // TODO(peter): allowZeroSeqNum applies to the entire compaction. We could
1425
- // make the determination on a key by key basis, similar to what is done
1426
- // for elideTombstone. Need to add a benchmark for Iter to verify
1427
- // that isn't too expensive.
1428
- return
1429
- }
1430
- if snapshotIdx > 0 {
1431
- // This is not the last snapshot
1432
- return
1433
- }
1434
- i .kv .K .SetSeqNum (base .SeqNumZero )
1445
+ // isBottommostSnapshotStripe returns true if the compaction's inputs form the
1446
+ // bottommost layer of the LSM for the compaction's key range and the provided
1447
+ // snapshot stripe is the last (oldest) stripe, i.e. snapshotIdx == 0.
1448
+ //
1449
+ // When isBottommostSnapshotStripe returns true, it is guaranteed there do not
1450
+ // exist any overlapping keys with lower sequence numbers than the keys in the
1451
+ // provided snapshot stripe. However isBottommostSnapshotStripe is permitted to
1452
+ // return false even when there is no overlapping data in lower levels (e.g.,
1453
+ // flushes).
1454
+ func (i * Iter ) isBottommostSnapshotStripe (snapshotIdx int ) bool {
1455
+ // TODO(peter): This determination applies to the entire compaction. We
1456
+ // could make the determination on a key by key basis, similar to what is
1457
+ // done for elideTombstone. Need to add a benchmark for Iter to verify that
1458
+ // isn't too expensive.
1459
+ return i .cfg .IsBottommostDataLayer && snapshotIdx == 0
1435
1460
}
1436
1461
1437
1462
func finishValueMerger (
0 commit comments