55#endif
66
77#include <assert.h>
8- /* The OH block reduction code is x86-only for now. */
9- #include <immintrin.h>
108#include <string.h>
119
10+ #ifdef __PCLMUL__
11+ /* If we have access to x86 PCLMUL (and some basic SSE). */
12+ #include <immintrin.h>
13+
14+ /* We only use 128-bit vector, as pairs of 64-bit integers. */
15+ typedef __m128i v128 ;
16+
17+ #define V128_ZERO { 0 };
18+
19+ static inline v128
20+ v128_create (uint64_t lo , uint64_t hi )
21+ {
22+ return _mm_set_epi64x (hi , lo );
23+ }
24+
25+ /* Shift each 64-bit lane left by one bit. */
26+ static inline v128
27+ v128_shift (v128 x )
28+ {
29+ return _mm_add_epi64 (x , x );
30+ }
31+
32+ /* Computes the 128-bit carryless product of x and y. */
33+ static inline v128
34+ v128_clmul (uint64_t x , uint64_t y )
35+ {
36+ return _mm_clmulepi64_si128 (_mm_cvtsi64_si128 (x ), _mm_cvtsi64_si128 (y ), 0 );
37+ }
38+
39+ /* Computes the 128-bit carryless product of the high and low halves of x. */
40+ static inline v128
41+ v128_clmul_cross (v128 x )
42+ {
43+ return _mm_clmulepi64_si128 (x , x , 1 );
44+ }
45+ #else
46+ #error "Unsupported platform: umash requires x86's SSE2 and CLMUL (-mpclmul)"
47+ #endif
48+
1249/*
1350 * #define UMASH_STAP_PROBE=1 to insert probe points in public UMASH
1451 * functions.
@@ -346,18 +383,18 @@ TEST_DEF struct umash_oh
346383oh_one_block (const uint64_t * params , uint64_t tag , const void * block )
347384{
348385 struct umash_oh ret ;
349- __m128i acc = { 0 } ;
386+ v128 acc = V128_ZERO ;
350387 size_t i ;
351388
352389 for (i = 0 ; i < UMASH_OH_PARAM_COUNT - 2 ; i += 2 ) {
353- __m128i x , k ;
390+ v128 x , k ;
354391
355392 memcpy (& x , block , sizeof (x ));
356393 block = (const char * )block + sizeof (x );
357394
358395 memcpy (& k , & params [i ], sizeof (k ));
359396 x ^= k ;
360- acc ^= _mm_clmulepi64_si128 ( x , x , 1 );
397+ acc ^= v128_clmul_cross ( x );
361398 }
362399
363400 memcpy (& ret , & acc , sizeof (ret ));
@@ -385,16 +422,15 @@ TEST_DEF void
385422oh_one_block_fprint (struct umash_oh dst [static 2 ], const uint64_t * restrict params ,
386423 uint64_t tag , const void * restrict block )
387424{
388- __m128i acc = { 0 } ; /* Base umash */
389- __m128i acc_shifted = { 0 } ; /* Accumulates shifted values */
390- __m128i lrc ;
391- __m128i prev = { 0 } ;
425+ v128 acc = V128_ZERO ; /* Base umash */
426+ v128 acc_shifted = V128_ZERO ; /* Accumulates shifted values */
427+ v128 lrc ;
428+ v128 prev = V128_ZERO ;
392429 size_t i ;
393430
394- lrc = _mm_set_epi64x (
395- params [UMASH_OH_PARAM_COUNT + 1 ], params [UMASH_OH_PARAM_COUNT ]);
431+ lrc = v128_create (params [UMASH_OH_PARAM_COUNT ], params [UMASH_OH_PARAM_COUNT + 1 ]);
396432 for (i = 0 ; i < UMASH_OH_PARAM_COUNT - 2 ; i += 2 ) {
397- __m128i x , k ;
433+ v128 x , k ;
398434
399435 memcpy (& x , block , sizeof (x ));
400436 block = (const char * )block + sizeof (x );
@@ -404,12 +440,12 @@ oh_one_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict para
404440 x ^= k ;
405441 lrc ^= x ;
406442
407- x = _mm_clmulepi64_si128 ( x , x , 1 );
443+ x = v128_clmul_cross ( x );
408444
409445 acc ^= x ;
410446
411447 acc_shifted ^= prev ;
412- acc_shifted = _mm_add_epi64 ( acc_shifted , acc_shifted );
448+ acc_shifted = v128_shift ( acc_shifted );
413449
414450 prev = x ;
415451 }
@@ -419,7 +455,7 @@ oh_one_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict para
419455 * specially.
420456 */
421457 {
422- __m128i x , k ;
458+ v128 x , k ;
423459
424460 memcpy (& x , block , sizeof (x ));
425461 memcpy (& k , & params [i ], sizeof (k ));
@@ -428,9 +464,9 @@ oh_one_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict para
428464 }
429465
430466 acc_shifted ^= acc ;
431- acc_shifted = _mm_add_epi64 ( acc_shifted , acc_shifted );
467+ acc_shifted = v128_shift ( acc_shifted );
432468
433- acc_shifted ^= _mm_clmulepi64_si128 (lrc , lrc , 1 );
469+ acc_shifted ^= v128_clmul_cross (lrc );
434470
435471 memcpy (& dst [0 ], & acc , sizeof (dst [0 ]));
436472 memcpy (& dst [1 ], & acc_shifted , sizeof (dst [0 ]));
@@ -463,23 +499,23 @@ TEST_DEF struct umash_oh
463499oh_last_block (const uint64_t * params , uint64_t tag , const void * block , size_t n_bytes )
464500{
465501 struct umash_oh ret ;
466- __m128i acc = { 0 } ;
502+ v128 acc = V128_ZERO ;
467503
468504 /* The final block processes `remaining > 0` bytes. */
469- size_t remaining = 1 + ((n_bytes - 1 ) % sizeof (__m128i ));
505+ size_t remaining = 1 + ((n_bytes - 1 ) % sizeof (v128 ));
470506 size_t end_full_pairs = (n_bytes - remaining ) / sizeof (uint64_t );
471- const void * last_ptr = (const char * )block + n_bytes - sizeof (__m128i );
507+ const void * last_ptr = (const char * )block + n_bytes - sizeof (v128 );
472508 size_t i ;
473509
474510 for (i = 0 ; i < end_full_pairs ; i += 2 ) {
475- __m128i x , k ;
511+ v128 x , k ;
476512
477513 memcpy (& x , block , sizeof (x ));
478514 block = (const char * )block + sizeof (x );
479515
480516 memcpy (& k , & params [i ], sizeof (k ));
481517 x ^= k ;
482- acc ^= _mm_clmulepi64_si128 ( x , x , 1 );
518+ acc ^= v128_clmul_cross ( x );
483519 }
484520
485521 memcpy (& ret , & acc , sizeof (ret ));
@@ -508,20 +544,19 @@ TEST_DEF void
508544oh_last_block_fprint (struct umash_oh dst [static 2 ], const uint64_t * restrict params ,
509545 uint64_t tag , const void * restrict block , size_t n_bytes )
510546{
511- __m128i acc = { 0 } ; /* Base umash */
512- __m128i acc_shifted = { 0 } ; /* Accumulates shifted values */
513- __m128i lrc ;
514- __m128i prev = { 0 } ;
547+ v128 acc = V128_ZERO ; /* Base umash */
548+ v128 acc_shifted = V128_ZERO ; /* Accumulates shifted values */
549+ v128 lrc ;
550+ v128 prev = V128_ZERO ;
515551 /* The final block processes `remaining > 0` bytes. */
516- size_t remaining = 1 + ((n_bytes - 1 ) % sizeof (__m128i ));
552+ size_t remaining = 1 + ((n_bytes - 1 ) % sizeof (v128 ));
517553 size_t end_full_pairs = (n_bytes - remaining ) / sizeof (uint64_t );
518- const void * last_ptr = (const char * )block + n_bytes - sizeof (__m128i );
554+ const void * last_ptr = (const char * )block + n_bytes - sizeof (v128 );
519555 size_t i ;
520556
521- lrc = _mm_set_epi64x (
522- params [UMASH_OH_PARAM_COUNT + 1 ], params [UMASH_OH_PARAM_COUNT ]);
557+ lrc = v128_create (params [UMASH_OH_PARAM_COUNT ], params [UMASH_OH_PARAM_COUNT + 1 ]);
523558 for (i = 0 ; i < end_full_pairs ; i += 2 ) {
524- __m128i x , k ;
559+ v128 x , k ;
525560
526561 memcpy (& x , block , sizeof (x ));
527562 block = (const char * )block + sizeof (x );
@@ -531,12 +566,12 @@ oh_last_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict par
531566 x ^= k ;
532567 lrc ^= x ;
533568
534- x = _mm_clmulepi64_si128 ( x , x , 1 );
569+ x = v128_clmul_cross ( x );
535570
536571 acc ^= x ;
537572
538573 acc_shifted ^= prev ;
539- acc_shifted = _mm_add_epi64 ( acc_shifted , acc_shifted );
574+ acc_shifted = v128_shift ( acc_shifted );
540575
541576 prev = x ;
542577 }
@@ -546,7 +581,7 @@ oh_last_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict par
546581 * specially.
547582 */
548583 {
549- __m128i x , k ;
584+ v128 x , k ;
550585
551586 memcpy (& x , last_ptr , sizeof (x ));
552587 memcpy (& k , & params [end_full_pairs ], sizeof (k ));
@@ -555,9 +590,9 @@ oh_last_block_fprint(struct umash_oh dst[static 2], const uint64_t *restrict par
555590 }
556591
557592 acc_shifted ^= acc ;
558- acc_shifted = _mm_add_epi64 ( acc_shifted , acc_shifted );
593+ acc_shifted = v128_shift ( acc_shifted );
559594
560- acc_shifted ^= _mm_clmulepi64_si128 (lrc , lrc , 1 );
595+ acc_shifted ^= v128_clmul_cross (lrc );
561596
562597 memcpy (& dst [0 ], & acc , sizeof (dst [0 ]));
563598 memcpy (& dst [1 ], & acc_shifted , sizeof (dst [0 ]));
@@ -755,7 +790,7 @@ umash_fp_medium(const uint64_t multipliers[static 2][2], const uint64_t *oh,
755790 uint64_t u64 [2 ];
756791 } hash ;
757792 union {
758- __m128i v ;
793+ v128 v ;
759794 uint64_t u64 [2 ];
760795 } mixed_lrc ;
761796 uint64_t lrc [2 ] = { oh [UMASH_OH_PARAM_COUNT ], oh [UMASH_OH_PARAM_COUNT + 1 ] };
@@ -771,7 +806,7 @@ umash_fp_medium(const uint64_t multipliers[static 2][2], const uint64_t *oh,
771806
772807 lrc [0 ] ^= x ^ a ;
773808 lrc [1 ] ^= y ^ b ;
774- mixed_lrc .v = _mm_clmulepi64_si128 (( __m128i ) { lrc [0 ] }, ( __m128i ) { lrc [1 ] }, 0 );
809+ mixed_lrc .v = v128_clmul ( lrc [0 ], lrc [1 ]);
775810
776811 hash .h = (__uint128_t )offset << 64 ;
777812 a += x ;
@@ -997,14 +1032,14 @@ sink_consume_buf(
9971032
9981033 /* All but the last 16-byte chunk of each block goes through PH. */
9991034 if (sink -> oh_iter < UMASH_OH_PARAM_COUNT - 2 && !final ) {
1000- __m128i acc , h , twisted_acc , prev ;
1035+ v128 acc , h , twisted_acc , prev ;
10011036 uint64_t m0 , m1 ;
10021037
10031038 m0 = x ^ k0 ;
10041039 m1 = y ^ k1 ;
10051040
10061041 memcpy (& acc , & sink -> oh_acc , sizeof (acc ));
1007- h = _mm_clmulepi64_si128 ( _mm_cvtsi64_si128 ( m0 ), _mm_cvtsi64_si128 ( m1 ), 0 );
1042+ h = v128_clmul ( m0 , m1 );
10081043 acc ^= h ;
10091044 memcpy (& sink -> oh_acc , & acc , sizeof (acc ));
10101045
@@ -1018,7 +1053,7 @@ sink_consume_buf(
10181053 memcpy (& prev , sink -> oh_twisted .prev , sizeof (prev ));
10191054
10201055 twisted_acc ^= prev ;
1021- twisted_acc = _mm_add_epi64 ( twisted_acc , twisted_acc );
1056+ twisted_acc = v128_shift ( twisted_acc );
10221057 memcpy (& sink -> oh_twisted .acc , & twisted_acc , sizeof (twisted_acc ));
10231058 memcpy (& sink -> oh_twisted .prev , & h , sizeof (h ));
10241059 } else {
@@ -1032,7 +1067,7 @@ sink_consume_buf(
10321067
10331068 if (sink -> hash_wanted != 0 ) {
10341069 union {
1035- __m128i vec ;
1070+ v128 vec ;
10361071 uint64_t h [2 ];
10371072 } lrc_hash ;
10381073 uint64_t lrc0 , lrc1 ;
@@ -1041,8 +1076,7 @@ sink_consume_buf(
10411076
10421077 lrc0 = sink -> oh_twisted .lrc [0 ] ^ x ^ k0 ;
10431078 lrc1 = sink -> oh_twisted .lrc [1 ] ^ y ^ k1 ;
1044- lrc_hash .vec = _mm_clmulepi64_si128 (
1045- _mm_cvtsi64_si128 (lrc0 ), _mm_cvtsi64_si128 (lrc1 ), 0 );
1079+ lrc_hash .vec = v128_clmul (lrc0 , lrc1 );
10461080
10471081 oh_twisted0 = sink -> oh_twisted .acc .bits [0 ];
10481082 oh_twisted1 = sink -> oh_twisted .acc .bits [1 ];
@@ -1210,7 +1244,7 @@ umash_full(const struct umash_params *params, uint64_t seed, int which, const vo
12101244 * we want to make sure they fall through correctly to
12111245 * minimise latency.
12121246 */
1213- if (LIKELY (n_bytes <= sizeof (__m128i ))) {
1247+ if (LIKELY (n_bytes <= sizeof (v128 ))) {
12141248 if (LIKELY (n_bytes <= sizeof (uint64_t )))
12151249 return umash_short (params -> oh , seed , data , n_bytes );
12161250
@@ -1226,7 +1260,7 @@ umash_fprint(
12261260{
12271261
12281262 DTRACE_PROBE3 (libumash , umash_fprint , params , data , n_bytes );
1229- if (LIKELY (n_bytes <= sizeof (__m128i ))) {
1263+ if (LIKELY (n_bytes <= sizeof (v128 ))) {
12301264 if (LIKELY (n_bytes <= sizeof (uint64_t )))
12311265 return umash_fp_short (params -> oh , seed , data , n_bytes );
12321266
0 commit comments