Skip to content

Commit 994e81b

Browse files
FrankRehmangalaman93
authored andcommitted
fix(cmSketch test) and add better cmSketch test
One Sketch test was intended to check that the seeds had caused each sketch row to be unique but the way the test was iterating, the failure wasn't going to be triggered. With this fix, the test passes seeming to indicate the seeds are working as intended on the row creation. But there is a problem. A new Sketch test is added that goes to the trouble of sorting the counters of each row to ensure that each row isn't just essentially a copy of another row, where the only difference is which position the counters occupy. And the new Sketch test is performed twice, once with high-entropy hashes, because they come from a PRNG, and once with low-entropy hashes, which in many cases will be normal, because they are all small numbers, good for indexing, but not good for getting a spread when simply applied to a bitwise XOR operation. These new tests show a problem with the counter increment logic within the cmSketch.Increment method which was most likely introduced by commit f823dc4. A subsequent commit addresses the problems surfaced. But as the discussion from issue #108 shows (discussion later moved to https://discuss.dgraph.io/t/cmsketch-not-benefitting-from-four-rows/8712 ), other ideas for incrementing the counters were considered by previous authors as well. Fix existing cmSketch test and add improved cmSketch test (Marked as draft because this introduces a test that fails for now. I can commit a fix to the cmSketch increment method to get the new test to pass - if a maintainer agrees there is a problem to be fixed. See #108. I tried a few years ago.) One Sketch test was intended to check that the seeds had caused each sketch row to be unique but the way the test was iterating, the failure wasn't going to be triggered. With this fix, the test passes seeming to indicate the seeds are working as intended on the row creation. But there is a problem with the actual increment method. A new Sketch test is added that goes further and sorts the counters of each row to ensure that each row isn't just essentially a copy of another row, where the only difference is which position the counters occupy. And the new Sketch test is performed twice, once with high-entropy hashes, because they come from a PRNG, and once with low-entropy hashes, which in many cases will be normal, because they are all small numbers, good for indexing, but not good for getting a spread when simply applied with a bitwise XOR operation. These new tests show a problem with the counter increment logic within the cmSketch.Increment method which was most likely introduced by commit f823dc4. A subsequent commit addresses the problems surfaced. But as the discussion from issue #108 shows (discussion later moved to https://discuss.dgraph.io/t/cmsketch-not-benefitting-from-four-rows/8712), other ideas for incrementing the counters were considered by previous authors of the original Java code as well.
1 parent e4edd4c commit 994e81b

File tree

1 file changed

+87
-6
lines changed

1 file changed

+87
-6
lines changed

sketch_test.go

Lines changed: 87 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package ristretto
22

33
import (
4+
"math/rand"
5+
"sort"
6+
"strings"
47
"testing"
58

69
"github.com/stretchr/testify/require"
@@ -16,17 +19,95 @@ func TestSketch(t *testing.T) {
1619
newCmSketch(0)
1720
}
1821

22+
// Temporary comment. This test is fixed, kind of.
23+
// It will now report when two or more rows are identical but that isn't what
24+
// should be tested. What should be tested is that the rows end up having unique
25+
// values from each other. If the rows have identical counters, but they are in
26+
// different positions, that doesn't help; no reason to have multiple rows if
27+
// the increment for one row always changes counters like to does for the next
28+
// row. So the actual test that shows whether row counters are being incremented
29+
// uniquely is the new one below: TestSketchRowUniqueness. There the counters in
30+
// a row are sorted before being compared.
1931
func TestSketchIncrement(t *testing.T) {
2032
s := newCmSketch(16)
21-
s.Increment(1)
22-
s.Increment(5)
23-
s.Increment(9)
33+
pseudoRandomIncrements(s, anyseed, anycount)
2434
for i := 0; i < cmDepth; i++ {
25-
if s.rows[i].string() != s.rows[0].string() {
26-
break
35+
rowi := s.rows[i].string()
36+
for j := 0; j < i; j++ {
37+
rowj := s.rows[j].string()
38+
require.False(t, rowi == rowj, "identical rows, bad hashing")
39+
}
40+
}
41+
}
42+
43+
const anyseed = int64(990099)
44+
const anycount = int(100) // Hopefully not large enough to max out a counter
45+
46+
func pseudoRandomIncrements(s *cmSketch, seed int64, count int) {
47+
r := rand.New(rand.NewSource(anyseed))
48+
for n := 0; n < count; n++ {
49+
s.Increment(r.Uint64())
50+
}
51+
}
52+
53+
// Bad hashing increments because there is very little entropy
54+
// between the values. This is used to test how well the sketch
55+
// uses multiple rows when there is little entropy between the hashes
56+
// being incremented with.
57+
func badHashingIncrements(s *cmSketch, count int) {
58+
for hashed := 0; hashed < count; hashed++ {
59+
s.Increment(uint64(hashed + 1))
60+
}
61+
}
62+
63+
func TestSketchRowUniqueness(t *testing.T) {
64+
// Test the row uniqueness twice.
65+
// Once when the hashes being added are pretty random
66+
// which we would normally expect.
67+
// And once when the hashes added are not normal, maybe
68+
// they are all low numbers for example.
69+
t.Run("WithGoodHashing", func(t *testing.T) {
70+
s := newCmSketch(16)
71+
pseudoRandomIncrements(s, anyseed, anycount)
72+
testSketchRowUniqueness1(t, s)
73+
})
74+
t.Run("WithBadHashing", func(t *testing.T) {
75+
s := newCmSketch(16)
76+
badHashingIncrements(s, anycount)
77+
testSketchRowUniqueness1(t, s)
78+
})
79+
}
80+
81+
func testSketchRowUniqueness1(t *testing.T, s *cmSketch) {
82+
// Perform test like the one above, TestSketchIncrement, but
83+
// compare the rows counters, once the counters are sorted.
84+
// If after several insertions, the rows have the same counters
85+
// in them, the hashing scheme is likely not actually
86+
// creating uniqueness between rows.
87+
88+
var unswiv [cmDepth]string
89+
for i := 0; i < cmDepth; i++ {
90+
unswiv[i] = s.rows[i].string()
91+
}
92+
// Now perform a kind of unswivel on each row, so counters are in ascending order.
93+
for i := 0; i < cmDepth; i++ {
94+
fields := strings.Split(unswiv[i], " ")
95+
sort.Strings(fields)
96+
unswiv[i] = strings.Join(fields, " ")
97+
}
98+
identical := 0
99+
for i := 0; i < cmDepth; i++ {
100+
t.Logf("s.rows[%d] %s, unswiv[%d] %s", i, s.rows[i].string(), i, unswiv[i])
101+
102+
for j := 0; j < i; j++ {
103+
if unswiv[i] == unswiv[j] {
104+
// Even one would be suspect. But count here so we can see how many rows look the same.
105+
identical++
106+
break // break out of j loop, knowing i looks like any earlier is enough, don't want to double count
107+
}
27108
}
28-
require.False(t, i == cmDepth-1, "identical rows, bad seeding")
29109
}
110+
require.Zero(t, identical, "%d unswiveled rows looked like earlier unswiveled rows", identical)
30111
}
31112

32113
func TestSketchEstimate(t *testing.T) {

0 commit comments

Comments
 (0)