Skip to content

Commit d143b0d

Browse files
jbowens and xinhaoz committed
compact: convert MERGE keys to SET when bottommost
This commit refactors the compaction iterator to perform an additional transformation on MERGE keys that are known to be the oldest version of their key in the LSM, transforming their key kind to SET. This ensures that iterators or compactions stepping over the key avoid invoking the merge operator when unnecessary. Fix #5178. Co-authored-by: Xin Hao Zhang <[email protected]>
1 parent 9be0446 commit d143b0d

File tree

8 files changed

+105
-75
lines changed

8 files changed

+105
-75
lines changed

compaction.go

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -964,11 +964,16 @@ func (c *tableCompaction) errorOnUserKeyOverlap(ve *manifest.VersionEdit) error
964964
return nil
965965
}
966966

967-
// allowZeroSeqNum returns true if seqnum's can be zeroed if there are no
968-
// snapshots requiring them to be kept. It performs this determination by
969-
// looking at the TombstoneElision values which are set up based on sstables
970-
// which overlap the bounds of the compaction at a lower level in the LSM.
971-
func (c *tableCompaction) allowZeroSeqNum() bool {
967+
// isBottommostDataLayer returns true if the compaction's inputs are known to be
968+
// the bottommost layer of data for the compaction's key range. If true, this
969+
// allows the compaction iterator to perform transformations to keys such as
970+
// setting a key's sequence number to zero.
971+
//
972+
// This function performs this determination by looking at the TombstoneElision
973+
// values which are set up based on sstables which overlap the bounds of the
974+
// compaction at a lower level in the LSM. This function always returns false
975+
// for flushes.
976+
func (c *tableCompaction) isBottommostDataLayer() bool {
972977
// TODO(peter): we disable zeroing of seqnums during flushing to match
973978
// RocksDB behavior and to avoid generating overlapping sstables during
974979
// DB.replayWAL. When replaying WAL files at startup, we flush after each
@@ -3335,12 +3340,12 @@ func (d *DB) compactAndWrite(
33353340
return compact.Result{Err: err}
33363341
}
33373342
cfg := compact.IterConfig{
3338-
Comparer: c.comparer,
3339-
Merge: d.merge,
3340-
TombstoneElision: c.delElision,
3341-
RangeKeyElision: c.rangeKeyElision,
3342-
Snapshots: snapshots,
3343-
AllowZeroSeqNum: c.allowZeroSeqNum(),
3343+
Comparer: c.comparer,
3344+
Merge: d.merge,
3345+
TombstoneElision: c.delElision,
3346+
RangeKeyElision: c.rangeKeyElision,
3347+
Snapshots: snapshots,
3348+
IsBottommostDataLayer: c.isBottommostDataLayer(),
33443349
IneffectualSingleDeleteCallback: func(userKey []byte) {
33453350
d.opts.EventListener.PossibleAPIMisuse(PossibleAPIMisuseInfo{
33463351
Kind: IneffectualSingleDelete,

compaction_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2283,7 +2283,7 @@ func TestCompactionAllowZeroSeqNum(t *testing.T) {
22832283
c.delElision, c.rangeKeyElision = compact.SetupTombstoneElision(
22842284
c.comparer.Compare, c.version, d.mu.versions.latest.l0Organizer,
22852285
c.outputLevel.level, c.bounds)
2286-
fmt.Fprintf(&buf, "%t\n", c.allowZeroSeqNum())
2286+
fmt.Fprintf(&buf, "%t\n", c.isBottommostDataLayer())
22872287
}
22882288
return buf.String()
22892289

internal/compact/iterator.go

Lines changed: 52 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -269,11 +269,17 @@ type IterConfig struct {
269269
TombstoneElision TombstoneElision
270270
RangeKeyElision TombstoneElision
271271

272-
// AllowZeroSeqNum allows the sequence number of KVs in the bottom snapshot
273-
// stripe to be simplified to 0 (which improves compression and enables an
274-
// optimization during forward iteration). This can be enabled if there are no
275-
// tables overlapping the output at lower levels (than the output) in the LSM.
276-
AllowZeroSeqNum bool
272+
// IsBottommostDataLayer indicates that the compaction inputs form the
273+
// bottommost layer of data for the compaction's key range. This allows the
274+
// sequence number of KVs in the bottom snapshot stripe to be simplified to
275+
// 0 (which improves compression and enables an optimization during forward
276+
// iteration). This can be enabled if there are no tables overlapping the
277+
// output at lower levels (than the output) in the LSM.
278+
//
279+
// This field may be false even when nothing is overlapping in lower levels.
280+
// At the time of writing, flushes always set this to false (because flushes
281+
// almost never form the bottommost layer of data).
282+
IsBottommostDataLayer bool
277283

278284
// IneffectualPointDeleteCallback is called if a SINGLEDEL is being elided
279285
// without deleting a point set/merge. False positives are rare but possible
@@ -613,8 +619,24 @@ func (i *Iter) Next() *base.InternalKV {
613619
}
614620
var needDelete bool
615621
if i.err == nil {
616-
// includesBase is true whenever we've transformed the MERGE record
617-
// into a SET.
622+
// If this is the oldest version of this key (the bottommost
623+
// snapshot stripe), we can transform the sequence number to
624+
// zero. This can improve compression and enables an
625+
// optimization during forward iteration to skip some key
626+
// comparisons. Additionally, we can transform the key kind to
627+
// SET so that iteration and future compactions do not need to
628+
// invoke the user's Merge operator.
629+
if i.isBottommostSnapshotStripe(origSnapshotIdx) {
630+
i.kv.K.SetSeqNum(base.SeqNumZero)
631+
// During the merge (see mergeNext), we may have already
632+
// transformed the key kind to SET or SETWITHDEL, in which case we want to preserve the existing key kind.
633+
if i.kv.K.Kind() == base.InternalKeyKindMerge {
634+
i.kv.K.SetKind(base.InternalKeyKindSet)
635+
}
636+
}
637+
638+
// includesBase is true when we've merged the oldest operand in
639+
// the LSM.
618640
var includesBase bool
619641
switch i.kv.K.Kind() {
620642
case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
@@ -633,8 +655,6 @@ func (i *Iter) Next() *base.InternalKV {
633655
}
634656
continue
635657
}
636-
637-
i.maybeZeroSeqnum(origSnapshotIdx)
638658
return &i.kv
639659
}
640660
if i.err != nil {
@@ -799,7 +819,14 @@ func (i *Iter) setNext() {
799819
// Save the current key.
800820
i.saveKey()
801821
i.kv.V = i.iterKV.V
802-
i.maybeZeroSeqnum(i.curSnapshotIdx)
822+
823+
// If this is the oldest version of this key (the bottommost snapshot
824+
// stripe), we can transform the sequence number to zero. This can improve
825+
// compression and enables an optimization during forward iteration to skip
826+
// some key comparisons.
827+
if i.isBottommostSnapshotStripe(i.curSnapshotIdx) {
828+
i.kv.K.SetSeqNum(base.SeqNumZero)
829+
}
803830

804831
// If this key is already a SETWITHDEL we can early return and skip the remaining
805832
// records in the stripe:
@@ -1415,23 +1442,21 @@ func (i *Iter) lastRangeDelSpanFrontierReached(key []byte) []byte {
14151442
return nil
14161443
}
14171444

1418-
// maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing
1419-
// so improves compression and enables an optimization during forward iteration
1420-
// to skip some key comparisons. The seqnum for an entry can be zeroed if the
1421-
// entry is on the bottom snapshot stripe and on the bottom level of the LSM.
1422-
func (i *Iter) maybeZeroSeqnum(snapshotIdx int) {
1423-
if !i.cfg.AllowZeroSeqNum {
1424-
// TODO(peter): allowZeroSeqNum applies to the entire compaction. We could
1425-
// make the determination on a key by key basis, similar to what is done
1426-
// for elideTombstone. Need to add a benchmark for Iter to verify
1427-
// that isn't too expensive.
1428-
return
1429-
}
1430-
if snapshotIdx > 0 {
1431-
// This is not the last snapshot
1432-
return
1433-
}
1434-
i.kv.K.SetSeqNum(base.SeqNumZero)
1445+
// isBottommostSnapshotStripe returns true if the compaction's inputs form the
1446+
// bottommost layer of the LSM for the compaction's key range and the provided
1447+
// snapshot stripe is the last stripe.
1448+
//
1449+
// When isBottommostSnapshotStripe returns true, it is guaranteed there does not
1450+
// exist any overlapping keys with lower sequence numbers than the keys in the
1451+
// provided snapshot stripe. However isBottommostSnapshotStripe is permitted to
1452+
// return false even when there is no overlapping data in lower levels (eg,
1453+
// flushes).
1454+
func (i *Iter) isBottommostSnapshotStripe(snapshotIdx int) bool {
1455+
// TODO(peter): This determination applies to the entire compaction. We
1456+
// could make the determination on a key by key basis, similar to what is
1457+
// done for elideTombstone. Need to add a benchmark for Iter to verify that
1458+
// isn't too expensive.
1459+
return i.cfg.IsBottommostDataLayer && snapshotIdx == 0
14351460
}
14361461

14371462
func finishValueMerger(

internal/compact/iterator_test.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ func TestCompactionIter(t *testing.T) {
5555
var rangeDels []keyspan.Span
5656
var snapshots Snapshots
5757
var elideTombstones bool
58-
var allowZeroSeqnum bool
58+
var isBottommostDataLayer bool
5959
var ineffectualSingleDeleteKeys []string
6060
var invariantViolationSingleDeleteKeys []string
6161
var missizedDeleteInfo []string
@@ -79,12 +79,12 @@ func TestCompactionIter(t *testing.T) {
7979
elision = ElideTombstonesOutsideOf(nil)
8080
}
8181
cfg := IterConfig{
82-
Comparer: base.DefaultComparer,
83-
Merge: merge,
84-
Snapshots: snapshots,
85-
TombstoneElision: elision,
86-
RangeKeyElision: elision,
87-
AllowZeroSeqNum: allowZeroSeqnum,
82+
Comparer: base.DefaultComparer,
83+
Merge: merge,
84+
Snapshots: snapshots,
85+
TombstoneElision: elision,
86+
RangeKeyElision: elision,
87+
IsBottommostDataLayer: isBottommostDataLayer,
8888
IneffectualSingleDeleteCallback: func(userKey []byte) {
8989
ineffectualSingleDeleteKeys = append(ineffectualSingleDeleteKeys, string(userKey))
9090
},
@@ -160,7 +160,7 @@ func TestCompactionIter(t *testing.T) {
160160
case "iter":
161161
snapshots = snapshots[:0]
162162
elideTombstones = false
163-
allowZeroSeqnum = false
163+
isBottommostDataLayer = false
164164
printSnapshotPinned := false
165165
printMissizedDels := false
166166
printForceObsolete := false
@@ -177,9 +177,9 @@ func TestCompactionIter(t *testing.T) {
177177
if err != nil {
178178
return err.Error()
179179
}
180-
case "allow-zero-seqnum":
180+
case "is-bottommost-layer":
181181
var err error
182-
allowZeroSeqnum, err = strconv.ParseBool(arg.Vals[0])
182+
isBottommostDataLayer, err = strconv.ParseBool(arg.Vals[0])
183183
if err != nil {
184184
return err.Error()
185185
}

internal/compact/testdata/iter

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -947,14 +947,14 @@ a.RANGEDEL.1:b
947947
a.MERGE.1:v1
948948
----
949949

950-
iter allow-zero-seqnum=true
950+
iter is-bottommost-layer=true
951951
first
952952
next
953953
next
954954
next
955955
----
956956
a#inf,RANGEDEL:; Span() = a-b:{(#1,RANGEDEL)}
957-
a#0,MERGE:v1v2
957+
a#0,SET:v1v2[base]
958958
.
959959
.
960960

@@ -973,7 +973,7 @@ next
973973
a#5,SETWITHDEL:5[base]
974974
.
975975

976-
iter allow-zero-seqnum=true
976+
iter is-bottommost-layer=true
977977
first
978978
next
979979
----
@@ -1023,13 +1023,13 @@ a#inf,RANGEDEL:; Span() = a-c:{(#3,RANGEDEL)}
10231023
b#5,MERGE:5
10241024
.
10251025

1026-
iter allow-zero-seqnum=true
1026+
iter is-bottommost-layer=true
10271027
first
10281028
next
10291029
next
10301030
----
10311031
a#inf,RANGEDEL:; Span() = a-c:{(#3,RANGEDEL)}
1032-
b#0,MERGE:5
1032+
b#0,SET:5[base]
10331033
.
10341034

10351035
iter snapshots=2

internal/compact/testdata/iter_delete_sized

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -974,13 +974,13 @@ a.RANGEDEL.1:b
974974
a.MERGE.1:v1
975975
----
976976

977-
iter allow-zero-seqnum=true
977+
iter is-bottommost-layer=true
978978
first
979979
next
980980
next
981981
----
982982
a#inf,RANGEDEL:; Span() = a-b:{(#1,RANGEDEL)}
983-
a#0,MERGE:v1v2
983+
a#0,SET:v1v2[base]
984984
.
985985

986986
# Verify that we transform merge+del -> set.
@@ -998,7 +998,7 @@ next
998998
a#5,SETWITHDEL:5[base]
999999
.
10001000

1001-
iter allow-zero-seqnum=true
1001+
iter is-bottommost-layer=true
10021002
first
10031003
next
10041004
----
@@ -1048,13 +1048,13 @@ a#inf,RANGEDEL:; Span() = a-c:{(#3,RANGEDEL)}
10481048
b#5,MERGE:5
10491049
.
10501050

1051-
iter allow-zero-seqnum=true
1051+
iter is-bottommost-layer=true
10521052
first
10531053
next
10541054
next
10551055
----
10561056
a#inf,RANGEDEL:; Span() = a-c:{(#3,RANGEDEL)}
1057-
b#0,MERGE:5
1057+
b#0,SET:5[base]
10581058
.
10591059

10601060
iter snapshots=2
@@ -1211,7 +1211,7 @@ a.SET.2:b
12111211
a.DEL.1:
12121212
----
12131213

1214-
iter allow-zero-seqnum=true
1214+
iter is-bottommost-layer=true
12151215
first
12161216
next
12171217
next
@@ -1220,7 +1220,7 @@ a#inf,RANGEDEL:; Span() = a-z:{(#2,RANGEDEL)}
12201220
a#0,SET:c
12211221
.
12221222

1223-
iter allow-zero-seqnum=true snapshots=3
1223+
iter is-bottommost-layer=true snapshots=3
12241224
first
12251225
next
12261226
next
@@ -1231,7 +1231,7 @@ a#3,SET:c
12311231
a#0,SET:b
12321232
.
12331233

1234-
iter allow-zero-seqnum=true snapshots=2
1234+
iter is-bottommost-layer=true snapshots=2
12351235
first
12361236
next
12371237
next
@@ -1573,7 +1573,7 @@ next
15731573

15741574
# Try the same test as above, but with allowing sequence number zeroing as well.
15751575

1576-
iter elide-tombstones=t allow-zero-seqnum=t
1576+
iter elide-tombstones=t is-bottommost-layer=t
15771577
first
15781578
next
15791579
----
@@ -1670,7 +1670,7 @@ next
16701670
a#inf,RANGEDEL:; Span() = a-d:{(#5,RANGEDEL)}
16711671
.
16721672

1673-
iter elide-tombstones=t allow-zero-seqnum=t
1673+
iter elide-tombstones=t is-bottommost-layer=t
16741674
first
16751675
next
16761676
----

0 commit comments

Comments
 (0)