Skip to content

Commit 8d22c74

Browse files
authored
Decrease size of _tsids created by TsidBuilder (#133631)
Based on testing, this has marginal impact on storage but decreases the size of the _tsids from 21B-36B to 18B-21B.
1 parent d837e2e commit 8d22c74

File tree

2 files changed

+41
-20
lines changed

2 files changed

+41
-20
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/TsidBuilder.java

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@
3131
*/
3232
public class TsidBuilder {
3333

34-
private static final int MAX_TSID_VALUE_FIELDS = 16;
34+
/**
35+
* The maximum number of fields to use for the value similarity part of the TSID.
36+
* This is a trade-off between clustering similar time series together and the size of the TSID.
37+
* More fields improve clustering but also increase the size of the TSID.
38+
*/
39+
private static final int MAX_TSID_VALUE_SIMILARITY_FIELDS = 4;
3540
private final BufferedMurmur3Hasher murmur3Hasher = new BufferedMurmur3Hasher(0L);
3641

3742
private final List<Dimension> dimensions;
@@ -217,11 +222,11 @@ public MurmurHash3.Hash128 hash() {
217222
* The TSID is a hash that includes:
218223
* <ul>
219224
* <li>
220-
* A hash of the dimension field names (4 bytes).
225+
* A hash of the dimension field names (1 byte).
221226
* This is to cluster time series that are using the same dimensions together, which makes the encodings more effective.
222227
* </li>
223228
* <li>
224-
* A hash of the dimension field values (1 byte each, up to a maximum of 16 fields).
229+
* A hash of the dimension field values (1 byte each, up to a maximum of 4 fields).
225230
* This is to cluster time series with similar values together, also helping with making encodings more effective.
226231
* </li>
227232
* <li>
@@ -235,24 +240,24 @@ public MurmurHash3.Hash128 hash() {
235240
*/
236241
public BytesRef buildTsid() {
237242
throwIfEmpty();
238-
int numberOfValues = Math.min(MAX_TSID_VALUE_FIELDS, dimensions.size());
239-
byte[] hash = new byte[4 + numberOfValues + 16];
243+
int numberOfValues = Math.min(MAX_TSID_VALUE_SIMILARITY_FIELDS, dimensions.size());
244+
byte[] hash = new byte[1 + numberOfValues + 16];
240245
int index = 0;
241246

242247
Collections.sort(dimensions);
243248

244249
MurmurHash3.Hash128 hashBuffer = new MurmurHash3.Hash128();
245250
murmur3Hasher.reset();
251+
// similarity hash for dimension names
246252
for (int i = 0; i < dimensions.size(); i++) {
247253
Dimension dim = dimensions.get(i);
248254
murmur3Hasher.addLong(dim.pathHash.h1 ^ dim.pathHash.h2);
249255
}
250-
ByteUtils.writeIntLE((int) murmur3Hasher.digestHash(hashBuffer).h1, hash, index);
251-
index += 4;
256+
hash[index++] = (byte) murmur3Hasher.digestHash(hashBuffer).h1;
252257

253-
// similarity hash for values
258+
// similarity hash for dimension values
254259
String previousPath = null;
255-
for (int i = 0; i < numberOfValues; i++) {
260+
for (int i = 0; index < numberOfValues + 1 && i < dimensions.size(); i++) {
256261
Dimension dim = dimensions.get(i);
257262
String path = dim.path();
258263
if (path.equals(previousPath)) {
@@ -267,6 +272,7 @@ public BytesRef buildTsid() {
267272
}
268273

269274
murmur3Hasher.reset();
275+
// full hash for all dimension names and values for uniqueness
270276
for (int i = 0; i < dimensions.size(); i++) {
271277
Dimension dim = dimensions.get(i);
272278
murmur3Hasher.addLongs(dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);

server/src/test/java/org/elasticsearch/cluster/routing/TsidBuilderTests.java

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,31 @@ public void testAddDimensions() {
3737

3838
// if these change, we'll need a new index version
3939
// because it means existing time series will get a new _tsid and will be routed to a different shard
40-
assertThat(builder.hash().toString(), equalTo("0xd4de1356065d297a2be489781e15d256")); // used to make shard routing decisions
40+
assertThat(builder.hash().toString(), equalTo("0xd4de1356065d297a2be489781e15d256"));
4141
BytesRef bytesRef = builder.buildTsid();
4242
assertThat(bytesRef, notNullValue());
43-
// 4 bytes for path hash + 1 byte per value (up to 16, only first value for arrays) + 16 bytes for hash
44-
assertThat(bytesRef.length, equalTo(26));
43+
// 1 byte for path hash + 1 byte per value (up to 4, only first value for arrays) + 16 bytes for hash
44+
assertThat(bytesRef.length, equalTo(21));
4545
assertThat(
4646
HexFormat.of().formatHex(bytesRef.bytes, bytesRef.offset, bytesRef.length),
47-
equalTo("bf438ddaa0a8d663fdbb56d2151e7889e42b7a295d065613ded4") // _tsid in hex format
47+
equalTo("bfa0a8d66356d2151e7889e42b7a295d065613ded4") // _tsid in hex format
4848
);
4949
}
5050

51+
public void testArray() {
52+
TsidBuilder builder = TsidBuilder.newBuilder().addStringDimension("test_non_array", "value");
53+
54+
int arrayValues = randomIntBetween(32, 64);
55+
for (int i = 0; i < arrayValues; i++) {
56+
builder.addStringDimension("_test_large_array", "value_" + i);
57+
}
58+
59+
BytesRef bytesRef = builder.buildTsid();
60+
assertThat(bytesRef, notNullValue());
61+
// 1 byte for path hash + 2 bytes for value hash (1 for the first array value and 1 for the non-array value) + 16 bytes for hash
62+
assertThat(bytesRef.length, equalTo(19));
63+
}
64+
5165
public void testOrderingOfDifferentFieldsDoesNotMatter() {
5266
assertEqualBuilders(
5367
TsidBuilder.newBuilder().addStringDimension("foo", "bar").addStringDimension("baz", "qux"),
@@ -114,19 +128,20 @@ public void testExceptionWhenNoDimensions() {
114128
public void testTsidMinSize() {
115129
BytesRef tsid = TsidBuilder.newBuilder().addIntDimension("test_int", 42).buildTsid();
116130

117-
// The TSID format should be: 4 bytes for path hash + 1 byte per value (up to 16) + 16 bytes for hash
118-
// Since we only added one dimension, we expect: 4 + 1 + 16 = 21 bytes
119-
assertEquals(21, tsid.length);
131+
// The TSID format should be: 1 byte for path hash + 1 byte per value (up to 4) + 16 bytes for hash
132+
// Since we only added one dimension, we expect: 1 + 1 + 16 = 18 bytes
133+
assertEquals(18, tsid.length);
120134
}
121135

122136
public void testTsidMaxSize() {
123137
TsidBuilder tsidBuilder = TsidBuilder.newBuilder();
124-
for (int i = 0; i < 32; i++) {
138+
int dimensions = randomIntBetween(4, 64);
139+
for (int i = 0; i < dimensions; i++) {
125140
tsidBuilder.addStringDimension("dimension_" + i, "value_" + i);
126141
}
127142

128-
// The TSID format should be: 4 bytes for path hash + 1 byte per value (up to 16) + 16 bytes for hash
129-
// Since we added 32 dimensions, we expect: 4 + 16 + 16 = 36 bytes
130-
assertEquals(36, tsidBuilder.buildTsid().length);
143+
// The TSID format should be: 1 byte for path hash + 1 byte per value (up to 4) + 16 bytes for hash
144+
// Since we added at least 4 dimensions, we expect: 1 + 4 + 16 = 21 bytes
145+
assertEquals(21, tsidBuilder.buildTsid().length);
131146
}
132147
}

0 commit comments

Comments
 (0)