Skip to content

Commit fc5bfab

Browse files
committed
NFC umash.c: wrap raw SSE intrinsics in higher level wrappers
These wrappers both make the intention clearer, and will make it easy to slot in ARM vector extensions.
1 parent a6eb107 commit fc5bfab

File tree

1 file changed

+80
-46
lines changed

1 file changed

+80
-46
lines changed

umash.c

Lines changed: 80 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,47 @@
55
#endif
66

77
#include <assert.h>
8-
/* The OH block reduction code is x86-only for now. */
9-
#include <immintrin.h>
108
#include <string.h>
119

10+
#ifdef __PCLMUL__
11+
/* If we have access to x86 PCLMUL (and some basic SSE). */
12+
#include <immintrin.h>
13+
14+
/* We only use 128-bit vector, as pairs of 64-bit integers. */
15+
typedef __m128i v128;
16+
17+
#define V128_ZERO { 0 };
18+
19+
static inline v128
20+
v128_create(uint64_t lo, uint64_t hi)
21+
{
22+
return _mm_set_epi64x(hi, lo);
23+
}
24+
25+
/* Shift each 64-bit lane left by one bit. */
26+
static inline v128
27+
v128_shift(v128 x)
28+
{
29+
return _mm_add_epi64(x, x);
30+
}
31+
32+
/* Computes the 128-bit carryless product of x and y. */
33+
static inline v128
34+
v128_clmul(uint64_t x, uint64_t y)
35+
{
36+
return _mm_clmulepi64_si128(_mm_cvtsi64_si128(x), _mm_cvtsi64_si128(y), 0);
37+
}
38+
39+
/* Computes the 128-bit carryless product of the high and low halves of x. */
40+
static inline v128
41+
v128_clmul_cross(v128 x)
42+
{
43+
return _mm_clmulepi64_si128(x, x, 1);
44+
}
45+
#else
46+
#error "Unsupported platform: umash requires x86's SSE2 and CLMUL (-mpclmul)"
47+
#endif
48+
1249
/*
1350
* #define UMASH_STAP_PROBE=1 to insert probe points in public UMASH
1451
* functions.
@@ -346,18 +383,18 @@ TEST_DEF struct umash_oh
346383
oh_one_block(const uint64_t *params, uint64_t tag, const void *block)
347384
{
348385
struct umash_oh ret;
349-
__m128i acc = { 0 };
386+
v128 acc = V128_ZERO;
350387
size_t i;
351388

352389
for (i = 0; i < UMASH_OH_PARAM_COUNT - 2; i += 2) {
353-
__m128i x, k;
390+
v128 x, k;
354391

355392
memcpy(&x, block, sizeof(x));
356393
block = (const char *)block + sizeof(x);
357394

358395
memcpy(&k, &params[i], sizeof(k));
359396
x ^= k;
360-
acc ^= _mm_clmulepi64_si128(x, x, 1);
397+
acc ^= v128_clmul_cross(x);
361398
}
362399

363400
memcpy(&ret, &acc, sizeof(ret));
@@ -385,16 +422,15 @@ TEST_DEF void
385422
oh_one_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict params,
386423
uint64_t tag, const void *restrict block)
387424
{
388-
__m128i acc = { 0 }; /* Base umash */
389-
__m128i acc_shifted = { 0 }; /* Accumulates shifted values */
390-
__m128i lrc;
391-
__m128i prev = { 0 };
425+
v128 acc = V128_ZERO; /* Base umash */
426+
v128 acc_shifted = V128_ZERO; /* Accumulates shifted values */
427+
v128 lrc;
428+
v128 prev = V128_ZERO;
392429
size_t i;
393430

394-
lrc = _mm_set_epi64x(
395-
params[UMASH_OH_PARAM_COUNT + 1], params[UMASH_OH_PARAM_COUNT]);
431+
lrc = v128_create(params[UMASH_OH_PARAM_COUNT], params[UMASH_OH_PARAM_COUNT + 1]);
396432
for (i = 0; i < UMASH_OH_PARAM_COUNT - 2; i += 2) {
397-
__m128i x, k;
433+
v128 x, k;
398434

399435
memcpy(&x, block, sizeof(x));
400436
block = (const char *)block + sizeof(x);
@@ -404,12 +440,12 @@ oh_one_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict para
404440
x ^= k;
405441
lrc ^= x;
406442

407-
x = _mm_clmulepi64_si128(x, x, 1);
443+
x = v128_clmul_cross(x);
408444

409445
acc ^= x;
410446

411447
acc_shifted ^= prev;
412-
acc_shifted = _mm_add_epi64(acc_shifted, acc_shifted);
448+
acc_shifted = v128_shift(acc_shifted);
413449

414450
prev = x;
415451
}
@@ -419,7 +455,7 @@ oh_one_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict para
419455
* specially.
420456
*/
421457
{
422-
__m128i x, k;
458+
v128 x, k;
423459

424460
memcpy(&x, block, sizeof(x));
425461
memcpy(&k, &params[i], sizeof(k));
@@ -428,9 +464,9 @@ oh_one_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict para
428464
}
429465

430466
acc_shifted ^= acc;
431-
acc_shifted = _mm_add_epi64(acc_shifted, acc_shifted);
467+
acc_shifted = v128_shift(acc_shifted);
432468

433-
acc_shifted ^= _mm_clmulepi64_si128(lrc, lrc, 1);
469+
acc_shifted ^= v128_clmul_cross(lrc);
434470

435471
memcpy(&dst[0], &acc, sizeof(dst[0]));
436472
memcpy(&dst[1], &acc_shifted, sizeof(dst[0]));
@@ -463,23 +499,23 @@ TEST_DEF struct umash_oh
463499
oh_last_block(const uint64_t *params, uint64_t tag, const void *block, size_t n_bytes)
464500
{
465501
struct umash_oh ret;
466-
__m128i acc = { 0 };
502+
v128 acc = V128_ZERO;
467503

468504
/* The final block processes `remaining > 0` bytes. */
469-
size_t remaining = 1 + ((n_bytes - 1) % sizeof(__m128i));
505+
size_t remaining = 1 + ((n_bytes - 1) % sizeof(v128));
470506
size_t end_full_pairs = (n_bytes - remaining) / sizeof(uint64_t);
471-
const void *last_ptr = (const char *)block + n_bytes - sizeof(__m128i);
507+
const void *last_ptr = (const char *)block + n_bytes - sizeof(v128);
472508
size_t i;
473509

474510
for (i = 0; i < end_full_pairs; i += 2) {
475-
__m128i x, k;
511+
v128 x, k;
476512

477513
memcpy(&x, block, sizeof(x));
478514
block = (const char *)block + sizeof(x);
479515

480516
memcpy(&k, &params[i], sizeof(k));
481517
x ^= k;
482-
acc ^= _mm_clmulepi64_si128(x, x, 1);
518+
acc ^= v128_clmul_cross(x);
483519
}
484520

485521
memcpy(&ret, &acc, sizeof(ret));
@@ -508,20 +544,19 @@ TEST_DEF void
508544
oh_last_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict params,
509545
uint64_t tag, const void *restrict block, size_t n_bytes)
510546
{
511-
__m128i acc = { 0 }; /* Base umash */
512-
__m128i acc_shifted = { 0 }; /* Accumulates shifted values */
513-
__m128i lrc;
514-
__m128i prev = { 0 };
547+
v128 acc = V128_ZERO; /* Base umash */
548+
v128 acc_shifted = V128_ZERO; /* Accumulates shifted values */
549+
v128 lrc;
550+
v128 prev = V128_ZERO;
515551
/* The final block processes `remaining > 0` bytes. */
516-
size_t remaining = 1 + ((n_bytes - 1) % sizeof(__m128i));
552+
size_t remaining = 1 + ((n_bytes - 1) % sizeof(v128));
517553
size_t end_full_pairs = (n_bytes - remaining) / sizeof(uint64_t);
518-
const void *last_ptr = (const char *)block + n_bytes - sizeof(__m128i);
554+
const void *last_ptr = (const char *)block + n_bytes - sizeof(v128);
519555
size_t i;
520556

521-
lrc = _mm_set_epi64x(
522-
params[UMASH_OH_PARAM_COUNT + 1], params[UMASH_OH_PARAM_COUNT]);
557+
lrc = v128_create(params[UMASH_OH_PARAM_COUNT], params[UMASH_OH_PARAM_COUNT + 1]);
523558
for (i = 0; i < end_full_pairs; i += 2) {
524-
__m128i x, k;
559+
v128 x, k;
525560

526561
memcpy(&x, block, sizeof(x));
527562
block = (const char *)block + sizeof(x);
@@ -531,12 +566,12 @@ oh_last_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict par
531566
x ^= k;
532567
lrc ^= x;
533568

534-
x = _mm_clmulepi64_si128(x, x, 1);
569+
x = v128_clmul_cross(x);
535570

536571
acc ^= x;
537572

538573
acc_shifted ^= prev;
539-
acc_shifted = _mm_add_epi64(acc_shifted, acc_shifted);
574+
acc_shifted = v128_shift(acc_shifted);
540575

541576
prev = x;
542577
}
@@ -546,7 +581,7 @@ oh_last_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict par
546581
* specially.
547582
*/
548583
{
549-
__m128i x, k;
584+
v128 x, k;
550585

551586
memcpy(&x, last_ptr, sizeof(x));
552587
memcpy(&k, &params[end_full_pairs], sizeof(k));
@@ -555,9 +590,9 @@ oh_last_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict par
555590
}
556591

557592
acc_shifted ^= acc;
558-
acc_shifted = _mm_add_epi64(acc_shifted, acc_shifted);
593+
acc_shifted = v128_shift(acc_shifted);
559594

560-
acc_shifted ^= _mm_clmulepi64_si128(lrc, lrc, 1);
595+
acc_shifted ^= v128_clmul_cross(lrc);
561596

562597
memcpy(&dst[0], &acc, sizeof(dst[0]));
563598
memcpy(&dst[1], &acc_shifted, sizeof(dst[0]));
@@ -755,7 +790,7 @@ umash_fp_medium(const uint64_t multipliers[static 2][2], const uint64_t *oh,
755790
uint64_t u64[2];
756791
} hash;
757792
union {
758-
__m128i v;
793+
v128 v;
759794
uint64_t u64[2];
760795
} mixed_lrc;
761796
uint64_t lrc[2] = { oh[UMASH_OH_PARAM_COUNT], oh[UMASH_OH_PARAM_COUNT + 1] };
@@ -771,7 +806,7 @@ umash_fp_medium(const uint64_t multipliers[static 2][2], const uint64_t *oh,
771806

772807
lrc[0] ^= x ^ a;
773808
lrc[1] ^= y ^ b;
774-
mixed_lrc.v = _mm_clmulepi64_si128((__m128i) { lrc[0] }, (__m128i) { lrc[1] }, 0);
809+
mixed_lrc.v = v128_clmul(lrc[0], lrc[1]);
775810

776811
hash.h = (__uint128_t)offset << 64;
777812
a += x;
@@ -997,14 +1032,14 @@ sink_consume_buf(
9971032

9981033
/* All but the last 16-byte chunk of each block goes through PH. */
9991034
if (sink->oh_iter < UMASH_OH_PARAM_COUNT - 2 && !final) {
1000-
__m128i acc, h, twisted_acc, prev;
1035+
v128 acc, h, twisted_acc, prev;
10011036
uint64_t m0, m1;
10021037

10031038
m0 = x ^ k0;
10041039
m1 = y ^ k1;
10051040

10061041
memcpy(&acc, &sink->oh_acc, sizeof(acc));
1007-
h = _mm_clmulepi64_si128(_mm_cvtsi64_si128(m0), _mm_cvtsi64_si128(m1), 0);
1042+
h = v128_clmul(m0, m1);
10081043
acc ^= h;
10091044
memcpy(&sink->oh_acc, &acc, sizeof(acc));
10101045

@@ -1018,7 +1053,7 @@ sink_consume_buf(
10181053
memcpy(&prev, sink->oh_twisted.prev, sizeof(prev));
10191054

10201055
twisted_acc ^= prev;
1021-
twisted_acc = _mm_add_epi64(twisted_acc, twisted_acc);
1056+
twisted_acc = v128_shift(twisted_acc);
10221057
memcpy(&sink->oh_twisted.acc, &twisted_acc, sizeof(twisted_acc));
10231058
memcpy(&sink->oh_twisted.prev, &h, sizeof(h));
10241059
} else {
@@ -1032,7 +1067,7 @@ sink_consume_buf(
10321067

10331068
if (sink->hash_wanted != 0) {
10341069
union {
1035-
__m128i vec;
1070+
v128 vec;
10361071
uint64_t h[2];
10371072
} lrc_hash;
10381073
uint64_t lrc0, lrc1;
@@ -1041,8 +1076,7 @@ sink_consume_buf(
10411076

10421077
lrc0 = sink->oh_twisted.lrc[0] ^ x ^ k0;
10431078
lrc1 = sink->oh_twisted.lrc[1] ^ y ^ k1;
1044-
lrc_hash.vec = _mm_clmulepi64_si128(
1045-
_mm_cvtsi64_si128(lrc0), _mm_cvtsi64_si128(lrc1), 0);
1079+
lrc_hash.vec = v128_clmul(lrc0, lrc1);
10461080

10471081
oh_twisted0 = sink->oh_twisted.acc.bits[0];
10481082
oh_twisted1 = sink->oh_twisted.acc.bits[1];
@@ -1210,7 +1244,7 @@ umash_full(const struct umash_params *params, uint64_t seed, int which, const vo
12101244
* we want to make sure they fall through correctly to
12111245
* minimise latency.
12121246
*/
1213-
if (LIKELY(n_bytes <= sizeof(__m128i))) {
1247+
if (LIKELY(n_bytes <= sizeof(v128))) {
12141248
if (LIKELY(n_bytes <= sizeof(uint64_t)))
12151249
return umash_short(params->oh, seed, data, n_bytes);
12161250

@@ -1226,7 +1260,7 @@ umash_fprint(
12261260
{
12271261

12281262
DTRACE_PROBE3(libumash, umash_fprint, params, data, n_bytes);
1229-
if (LIKELY(n_bytes <= sizeof(__m128i))) {
1263+
if (LIKELY(n_bytes <= sizeof(v128))) {
12301264
if (LIKELY(n_bytes <= sizeof(uint64_t)))
12311265
return umash_fp_short(params->oh, seed, data, n_bytes);
12321266

0 commit comments

Comments
 (0)