From 88a8a0c0d41061a18d8c94bda5117765613272cb Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Fri, 29 Aug 2014 14:13:07 +0200 Subject: [PATCH 01/11] AVX2 support for popcount function. To be tested. --- include/sdsl/bits.hpp | 30 ++++++++++++++++++++++++++++++ include/sdsl/uint256_t.hpp | 9 +++++++++ include/sdsl/ymm_union.hpp | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 include/sdsl/ymm_union.hpp diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index b13fe791f..0bc593e30 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -24,6 +24,8 @@ #include // for uint64_t uint32_t declaration #include // for cerr #include +#include // SSE/AVX +#include // convenient YMM register wrapper #ifdef __SSE4_2__ #include #endif @@ -237,6 +239,34 @@ struct bits { // ============= inline - implementations ================ +#ifdef __AVX2__ +inline uint64_t bits::cnt256(__m256i x){ + // 4-bit universal table + static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4); + + __m256i low, high, bwcount; + + // byte halves stored in separate YMM registers + low = _mm256_and_si256(MASK4_256, buffer.avx); + high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(buffer.avx, 4)); + + // bytewise population count + bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low), + _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high)); + + // Computes sum of absolute differences and stores intermediate results at positions 0,4,8 and 12 + __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256()); + + // Use union to access individual bytes (unsigned integers) + sdsl::YMM_Union ymm_union; + ymm_union.ymm = fourSums; + return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12]; +} +#endif + // see page 11, Knuth TAOCP Vol 4 F1A inline uint64_t bits::cnt(uint64_t x) { diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp index b93a42e31..da5d8ef5e 100644 --- a/include/sdsl/uint256_t.hpp +++ b/include/sdsl/uint256_t.hpp @@ -62,8 +62,17 @@ class uint256_t } inline uint16_t popcount() { +#ifdef __AVX2__ + sdsl::YMM_Union ymm_union; + ymm_union[0] = m_lo; + ymm_union[1] = m_mid; + ymm_union[2] = m_high >> 64; + ymm_union[3] = m_high; + return bits::cnt256(ymm_union.ymm); +#else return ((uint16_t)bits::cnt(m_lo)) + bits::cnt(m_mid) + bits::cnt(m_high>>64) + bits::cnt(m_high); +#endif } inline uint16_t hi() { diff --git a/include/sdsl/ymm_union.hpp b/include/sdsl/ymm_union.hpp new file mode 100644 index 000000000..ae1dca0df --- /dev/null +++ b/include/sdsl/ymm_union.hpp @@ -0,0 +1,37 @@ +/* sdsl - succinct data structures library + Copyright (C) 2012 Simon Gog + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/ . +*/ +/*! \file uint256_t.hpp + \brief uint256_t.hpp contains a convenientunion for YMM registers (256-bits). + \author Diego Havenstein +*/ +#ifndef INCLUDED_SDSL_YMMUNION +#define INCLUDED_SDSL_YMMUNION + +namespace sdsl +{ + +#ifdef __AVX2__ +template +union YMM_union { + __m256i ymm; + T values[32/sizeof(T)]; +}; +#endif + +} // end namespace + +#endif From a65f3f45a2161a356c74a7b8cb5fa9c1a68b4607 Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Fri, 29 Aug 2014 14:51:09 +0200 Subject: [PATCH 02/11] Added support for SSE based popcount --- include/sdsl/bits.hpp | 19 +++++++++++++++++++ include/sdsl/uint256_t.hpp | 17 +++++++++++++++-- include/sdsl/xmm_union.hpp | 37 +++++++++++++++++++++++++++++++++++++ include/sdsl/ymm_union.hpp | 4 ++-- 4 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 include/sdsl/xmm_union.hpp diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index 0bc593e30..a6ae19ab0 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -267,6 +267,25 @@ inline uint64_t bits::cnt256(__m256i x){ } #endif +#ifdef __SSE4_2__ +inline uint64_t bits::cnt128(__m128i x){ + // 4-bit universal table + static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); + + __m128i low, high, count; + + low = _mm_and_si128(MASK4, buffer); + high = _mm_and_si128(MASK4, _mm_srli_epi16(buffer, 4)); + count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low), + _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high)); + + // Use union to access individual bytes (unsigned integers) + sdsl::XMM_Union xmm_union; + xmm_union.sse = _mm_sad_epu8(x, _mm_setzero_si128()); + return xmm_union.values[0] + xmm_union.values[4]; +} +#endif + // see page 11, Knuth TAOCP Vol 4 F1A inline uint64_t bits::cnt(uint64_t x) { diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp index da5d8ef5e..ec84d9f85 100644 --- a/include/sdsl/uint256_t.hpp +++ b/include/sdsl/uint256_t.hpp @@ -62,14 +62,27 @@ class uint256_t } inline uint16_t popcount() { -#ifdef __AVX2__ +#ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle sdsl::YMM_Union ymm_union; ymm_union[0] = m_lo; ymm_union[1] = m_mid; ymm_union[2] = m_high >> 64; ymm_union[3] = m_high; return bits::cnt256(ymm_union.ymm); -#else +#endif + +#ifdef __SSE4_2__ // 16 table lookups per clock cycle + sdsl::XMM_Union xmm_union1; + sdsl::XMM_Union xmm_union2; + xmm_union1[0] = m_lo; + xmm_union1[1] = m_mid; + xmm_union2[0] = m_high >> 64; + xmm_union2[1] = m_high; + + return bits::cnt128(xmm_union1.xmm) + bits::cnt128(xmm_union2.xmm); + + +#else // byte after byte return ((uint16_t)bits::cnt(m_lo)) + bits::cnt(m_mid) + bits::cnt(m_high>>64) + bits::cnt(m_high); #endif diff --git a/include/sdsl/xmm_union.hpp b/include/sdsl/xmm_union.hpp new file mode 100644 index 000000000..021851ec4 --- /dev/null +++ b/include/sdsl/xmm_union.hpp @@ -0,0 +1,37 @@ +/* sdsl - succinct data structures library + Copyright (C) 2012 Simon Gog + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/ . +*/ +/*! \file xmm_union.hpp + \brief xmm_union.hpp contains a convenientunion for XMM registers (128-bits). + \author Diego Havenstein +*/ +#ifndef INCLUDED_SDSL_XMMUNION +#define INCLUDED_SDSL_XMMUNION + +namespace sdsl +{ X + +#ifdef __AVX2__ +template +union XMM_union { + __m128i xmm; + T values[16/sizeof(T)]; +}; +#endif + +} // end namespace + +#endif diff --git a/include/sdsl/ymm_union.hpp b/include/sdsl/ymm_union.hpp index ae1dca0df..809b050b3 100644 --- a/include/sdsl/ymm_union.hpp +++ b/include/sdsl/ymm_union.hpp @@ -14,8 +14,8 @@ You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ . */ -/*! \file uint256_t.hpp - \brief uint256_t.hpp contains a convenientunion for YMM registers (256-bits). +/*! \file ymm_union.hpp + \brief ymm_union.hpp contains a convenientunion for YMM registers (256-bits). \author Diego Havenstein */ #ifndef INCLUDED_SDSL_YMMUNION From 8983625a9451e3642af104a7206dc4c94a68c6c7 Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Fri, 29 Aug 2014 14:59:07 +0200 Subject: [PATCH 03/11] Bugfixes, now compiles and "make test" is running --- include/sdsl/bits.hpp | 3 ++- include/sdsl/xmm_union.hpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index a6ae19ab0..aaf044bbb 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -25,7 +25,8 @@ #include // for cerr #include #include // SSE/AVX -#include // convenient YMM register wrapper +#include "ymm_union.hpp" // convenient YMM register wrapper +#include "xmm_union.hpp" // convenient XMM register wrapper #ifdef __SSE4_2__ #include #endif diff --git a/include/sdsl/xmm_union.hpp b/include/sdsl/xmm_union.hpp index 021851ec4..a28f4402c 100644 --- a/include/sdsl/xmm_union.hpp +++ b/include/sdsl/xmm_union.hpp @@ -22,9 +22,9 @@ #define INCLUDED_SDSL_XMMUNION namespace sdsl -{ X +{ -#ifdef __AVX2__ +#ifdef __SSE4_2__ template union XMM_union { __m128i xmm; From 40409bceacfbb72ac8a99c77bc9f28d31d656581 Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Fri, 29 Aug 2014 15:22:07 +0200 Subject: [PATCH 04/11] CheckAVX2.cmake --- CMakeLists.txt | 9 +++++++++ CMakeModules/CheckAVX2.cmake | 24 ++++++++++++++++++++++++ include/sdsl/bits.hpp | 2 ++ 3 files changed, 35 insertions(+) create mode 100644 CMakeModules/CheckAVX2.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 20b137765..bbbbd4387 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,15 @@ if( BUILTIN_POPCNT ) endif() endif() +include(CheckAVX2) +if( AVX2 ) + if( CMAKE_COMPILER_IS_GNUCXX ) + append_cxx_compiler_flags("-mavx2" "GCC" CMAKE_CXX_OPT_FLAGS) + else() + append_cxx_compiler_flags("-mavx2" "CLANG" CMAKE_CXX_OPT_FLAGS) + endif() +endif() + add_subdirectory(external) add_subdirectory(include) add_subdirectory(lib) diff --git a/CMakeModules/CheckAVX2.cmake b/CMakeModules/CheckAVX2.cmake new file mode 100644 index 000000000..dcd8815f8 --- /dev/null +++ b/CMakeModules/CheckAVX2.cmake @@ -0,0 +1,24 @@ +# Check if the CPU provides fast operations +# for popcount, leftmost and rightmost bit + +set(AVX2 0) +# Check if we are on a Linux system +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + # Use /proc/cpuinfo to get the information + file(STRINGS "/proc/cpuinfo" _cpuinfo) + if(_cpuinfo MATCHES "(avx2)") + set(AVX2 1) + endif() +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") +# handle windows +# get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) +# get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) +elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") +# handle MacOs +execute_process(COMMAND sysctl -n machdep.cpu.features + OUTPUT_VARIABLE _cpuinfo OUTPUT_STRIP_TRAILING_WHITESPACE) + if(_cpuinfo MATCHES "AVX2") + set(AVX2 1) + endif() +endif() + diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index aaf044bbb..9cf981945 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -242,6 +242,7 @@ struct bits { #ifdef __AVX2__ inline uint64_t bits::cnt256(__m256i x){ + // 4-bit universal table static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, @@ -270,6 +271,7 @@ inline uint64_t bits::cnt256(__m256i x){ #ifdef __SSE4_2__ inline uint64_t bits::cnt128(__m128i x){ + // 4-bit universal table static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); From 9748a346d3d366668e79506a54e93276ee53a03d Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Fri, 29 Aug 2014 16:05:17 +0200 Subject: [PATCH 05/11] Bug fixes --- include/sdsl/bits.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index 9cf981945..d8c454dea 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -243,7 +243,8 @@ struct bits { #ifdef __AVX2__ inline uint64_t bits::cnt256(__m256i x){ - // 4-bit universal table + // 4-bit universal table, 4-bit mask + static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F); static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, @@ -252,8 +253,8 @@ inline uint64_t bits::cnt256(__m256i x){ __m256i low, high, bwcount; // byte halves stored in separate YMM registers - low = _mm256_and_si256(MASK4_256, buffer.avx); - high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(buffer.avx, 4)); + low = _mm256_and_si256(MASK4_256, x); + high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4)); // bytewise population count bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low), @@ -272,13 +273,14 @@ inline uint64_t bits::cnt256(__m256i x){ #ifdef __SSE4_2__ inline uint64_t bits::cnt128(__m128i x){ - // 4-bit universal table + // 4-bit universal table, 4-bit mask static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); + static const __m128i MASK4 = _mm_set1_epi8(0x0F); __m128i low, high, count; - low = _mm_and_si128(MASK4, buffer); - high = _mm_and_si128(MASK4, _mm_srli_epi16(buffer, 4)); + low = _mm_and_si128(MASK4, x); + high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4)); count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low), _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high)); From 7f72dc4101a16d2a9d24c0fc2ebb5050620574aa Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Mon, 1 Sep 2014 15:59:18 +0200 Subject: [PATCH 06/11] Some of the fixes suggested on GitHub by Simon --- CMakeLists.txt | 3 ++- include/sdsl/bits.hpp | 22 +++++++++++++++++++--- include/sdsl/uint256_t.hpp | 6 +++--- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bbbbd4387..8bbae670d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,9 +65,10 @@ include(CheckAVX2) if( AVX2 ) if( CMAKE_COMPILER_IS_GNUCXX ) append_cxx_compiler_flags("-mavx2" "GCC" CMAKE_CXX_OPT_FLAGS) - else() + elseif( CMAKE_COMPILER_IS_CLANGXX ) append_cxx_compiler_flags("-mavx2" "CLANG" CMAKE_CXX_OPT_FLAGS) endif() + message(STATUS "Your compiler is not supported yet!") endif() add_subdirectory(external) diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index d8c454dea..51027e427 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -105,6 +105,22 @@ struct bits { */ static uint64_t cnt(uint64_t x); + //! Counts the number of set bits in YMM register x. + /*! \param YMM register + \return Number of set bits. + */ +#ifdef __AVX2__ + static uint64_t cnt256(__m256i x); +#endif + + //! Counts the number of set bits in XMM register x. + /*! \param XMM register + \return Number of set bits. + */ +#ifdef __AVX2__ + static uint64_t cnt128(__m128i x); +#endif + //! Position of the most significant set bit the 64-bit word x /*! \param x 64-bit word \return The position (in 0..63) of the least significant set bit @@ -264,7 +280,7 @@ inline uint64_t bits::cnt256(__m256i x){ __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256()); // Use union to access individual bytes (unsigned integers) - sdsl::YMM_Union ymm_union; + sdsl::YMM_union ymm_union; ymm_union.ymm = fourSums; return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12]; } @@ -285,8 +301,8 @@ inline uint64_t bits::cnt128(__m128i x){ _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high)); // Use union to access individual bytes (unsigned integers) - sdsl::XMM_Union xmm_union; - xmm_union.sse = _mm_sad_epu8(x, _mm_setzero_si128()); + sdsl::XMM_union xmm_union; + xmm_union.xmm = _mm_sad_epu8(x, _mm_setzero_si128()); return xmm_union.values[0] + xmm_union.values[4]; } #endif diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp index ec84d9f85..200e6f587 100644 --- a/include/sdsl/uint256_t.hpp +++ b/include/sdsl/uint256_t.hpp @@ -63,7 +63,7 @@ class uint256_t inline uint16_t popcount() { #ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle - sdsl::YMM_Union ymm_union; + sdsl::YMM_union ymm_union; ymm_union[0] = m_lo; ymm_union[1] = m_mid; ymm_union[2] = m_high >> 64; @@ -72,8 +72,8 @@ class uint256_t #endif #ifdef __SSE4_2__ // 16 table lookups per clock cycle - sdsl::XMM_Union xmm_union1; - sdsl::XMM_Union xmm_union2; + sdsl::XMM_union xmm_union1; + sdsl::XMM_union xmm_union2; xmm_union1[0] = m_lo; xmm_union1[1] = m_mid; xmm_union2[0] = m_high >> 64; From bd7ad14a0ee9b8ee731490cd39bb03151c36c7cc Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Mon, 1 Sep 2014 16:02:23 +0200 Subject: [PATCH 07/11] __AVX2__ -> __SSE4_2__ --- build/include/sdsl/bits.hpp | 694 ++++++++++++++++++++++++++++++++++++ 1 file changed, 694 insertions(+) create mode 100644 build/include/sdsl/bits.hpp diff --git a/build/include/sdsl/bits.hpp b/build/include/sdsl/bits.hpp new file mode 100644 index 000000000..88802f924 --- /dev/null +++ b/build/include/sdsl/bits.hpp @@ -0,0 +1,694 @@ +/* sdsl - succinct data structures library + Copyright (C) 2008 Simon Gog + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/ . +*/ +/*! \file bits.hpp + \brief bits.hpp contains the sdsl::bits class. + \author Simon Gog +*/ +#ifndef INCLUDED_SDSL_BITS +#define INCLUDED_SDSL_BITS + +#include // for uint64_t uint32_t declaration +#include // for cerr +#include +#include // SSE/AVX +#include "ymm_union.hpp" // convenient YMM register wrapper +#include "xmm_union.hpp" // convenient XMM register wrapper +#ifdef __SSE4_2__ +#include +#endif + +//! Namespace for the succinct data structure library. +namespace sdsl +{ + +//! A helper class for bitwise tricks on 64 bit words. +/*! + bits is a helper class for bitwise tricks and + techniques. For the basic tricks and techiques we refer to Donald E. Knuth's + "The Art of Computer Programming", Volume 4A, Chapter 7.1.3 and + the informative website of Sean E. Anderson about the topic: + http://www-graphics.stanford.edu/~seander/bithacks.html . + + We have added new functions like: cnt11 and sel11. + + All members of this class are static variables or methods. + This class cannot be instantiated. + + \author Simon Gog + */ +struct bits { + bits() = delete; + //! 64bit mask with all bits set to 1. + constexpr static int64_t all_set {-1LL}; + + //! This constant represents a de Bruijn sequence B(k,n) for k=2 and n=6. + /*! Details for de Bruijn sequences see + http://en.wikipedia.org/wiki/De_bruijn_sequence + deBruijn64 is used in combination with the + array lt_deBruijn_to_idx. + */ + constexpr static uint64_t deBruijn64 {0x0218A392CD3D5DBFULL}; + + //! This table maps a 6-bit subsequence S[idx...idx+5] of constant deBruijn64 to idx. + /*! \sa deBruijn64 + */ + static const uint32_t lt_deBruijn_to_idx[64]; + + //! Array containing Fibonacci numbers less than \f$2^64\f$. + static const uint64_t lt_fib[92]; + + //! Lookup table for byte popcounts. + static const uint8_t lt_cnt[256]; + + //! Lookup table for most significant set bit in a byte. + static const uint32_t lt_hi[256]; + + //! lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set. + /*! lo_set[0] = 0ULL, lo_set[1]=1ULL, lo_set[2]=3ULL... + */ + static const uint64_t lo_set[65]; + + //! lo_unset[i] is a 64-bit word with the i least significant bits not set and the high bits set. + /*! lo_unset[0] = FFFFFFFFFFFFFFFFULL, lo_unset_set[1]=FFFFFFFFFFFFFFFEULL, ... + */ + static const uint64_t lo_unset[65]; + + //! Lookup table for least significant set bit in a byte. + static const uint8_t lt_lo[256]; + + //! Lookup table for select on bytes. + /*! Entry at idx = 256*j + i equals the position of the + (j+1)-th set bit in byte i. Positions lie in the range \f$[0..7]\f$. + */ + static const uint8_t lt_sel[256*8]; + + //! Use to help to decide if a prefix sum stored in a byte overflows. + static const uint64_t ps_overflow[65]; + + //! Counts the number of set bits in x. + /*! \param x 64-bit word + \return Number of set bits. + */ + static uint64_t cnt(uint64_t x); + + //! Counts the number of set bits in YMM register x. + /*! \param YMM register + \return Number of set bits. + */ +#ifdef __AVX2__ + static uint64_t cnt256(__m256i x); +#endif + + //! Counts the number of set bits in XMM register x. + /*! \param XMM register + \return Number of set bits. + */ +#ifdef __SSE4_2__ + static uint64_t cnt128(__m128i x); +#endif + + //! Position of the most significant set bit the 64-bit word x + /*! \param x 64-bit word + \return The position (in 0..63) of the least significant set bit + in `x` or 0 if x equals 0. + \sa sel, lo + */ + static uint32_t hi(uint64_t x); + + //! Calculates the position of the rightmost 1-bit in the 64bit integer x if it exists + /*! \param x 64 bit integer. + \return The position (in 0..63) of the rightmost 1-bit in the 64bit integer x if + x>0 and 0 if x equals 0. + \sa sel, hi + */ + static uint32_t lo(uint64_t x); + + //! Counts the number of 1-bits in the 32bit integer x. + /*! This function is a variant of the method cnt. If + 32bit multiplication is fast, this method beats the cnt. + for 32bit integers. + \param x 64bit integer to count the bits. + \return The number of 1-bits in x. + */ + static uint32_t cnt32(uint32_t x); + + //! Count the number of consecutive and distinct 11 in the 64bit integer x. + /*! + \param x 64bit integer to count the terminating sequence 11 of a Fibonacci code. + \param c Carry equals msb of the previous 64bit integer. + */ + static uint32_t cnt11(uint64_t x, uint64_t& c); + + //! Count the number of consecutive and distinct 11 in the 64bit integer x. + /*! + \param x 64bit integer to count the terminating sequence 11 of a Fibonacci code. + */ + static uint32_t cnt11(uint64_t x); + + //! Count 10 bit pairs in the word x. + /*! + * \param x 64bit integer to count the 10 bit pairs. + * \param c Carry equals msb of the previous 64bit integer. + */ + static uint32_t cnt10(uint64_t x, uint64_t& c); + + //! Count 01 bit pairs in the word x. + /*! + * \param x 64bit integer to count the 01 bit pairs. + * \param c Carry equals msb of the previous 64bit integer. + */ + static uint32_t cnt01(uint64_t x, uint64_t& c); + + //! Map all 10 bit pairs to 01 or 1 if c=1 and the lsb=0. All other pairs are mapped to 00. + static uint64_t map10(uint64_t x, uint64_t c=0); + + //! Map all 01 bit pairs to 01 of 1 if c=1 and the lsb=0. All other pairs are mapped to 00. + static uint64_t map01(uint64_t x, uint64_t c=1); + + //! Calculate the position of the i-th rightmost 1 bit in the 64bit integer x + /*! + \param x 64bit integer. + \param i Argument i must be in the range \f$[1..cnt(x)]\f$. + \pre Argument i must be in the range \f$[1..cnt(x)]\f$. + \sa hi, lo + */ + static uint32_t sel(uint64_t x, uint32_t i); + static uint32_t _sel(uint64_t x, uint32_t i); + + //! Calculates the position of the i-th rightmost 11-bit-pattern which terminates a Fibonacci coded integer in x. + /*! \param x 64 bit integer. + \param i Index of 11-bit-pattern. \f$i \in [1..cnt11(x)]\f$ + \param c Carry bit from word before + \return The position (in 1..63) of the i-th 11-bit-pattern which terminates a Fibonacci coded integer in x if + x contains at least i 11-bit-patterns and a undefined value otherwise. + \sa cnt11, hi11, sel + + */ + static uint32_t sel11(uint64_t x, uint32_t i, uint32_t c=0); + + //! Calculates the position of the leftmost 11-bit-pattern which terminates a Fibonacci coded integer in x. + /*! \param x 64 bit integer. + \return The position (in 1..63) of the leftmost 1 of the leftmost 11-bit-pattern which + terminates a Fibonacci coded integer in x if x contains a 11-bit-pattern + and 0 otherwise. + \sa cnt11, sel11 + */ + static uint32_t hi11(uint64_t x); + + //! Writes value x to an bit position in an array. + static void write_int(uint64_t* word, uint64_t x, const uint8_t offset=0, const uint8_t len=64); + + //! Writes value x to an bit position in an array and moves the bit-pointer. + static void write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len); + + //! Reads a value from a bit position in an array. + static uint64_t read_int(const uint64_t* word, uint8_t offset=0, const uint8_t len=64); + + //! Reads a value from a bit position in an array and moved the bit-pointer. + static uint64_t read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len=64); + + //! Reads an unary decoded value from a bit position in an array. + static uint64_t read_unary(const uint64_t* word, uint8_t offset=0); + + //! Reads an unary decoded value from a bit position in an array and moves the bit-pointer. + static uint64_t read_unary_and_move(const uint64_t*& word, uint8_t& offset); + + //! Move the bit-pointer (=uint64_t word and offset) `len` to the right. + /*!\param word 64-bit word part of the bit pointer + * \param offset Offset part of the bit pointer + * \param len Move distance. \f$ len \in [0..64] \f$ + * \sa move_left + */ + static void move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len); + + //! Move the bit-pointer (=uint64_t word and offset) `len` to the left. + /*!\param word 64-bit word part of the bit pointer + * \param offset Offset part of the bit pointer + * \param len Move distance. \f$ len \in [0..64] \f$ + * \sa move_right + */ + static void move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len); + + //! Get the first one bit in the interval \f$[idx..\infty )\f$ + static uint64_t next(const uint64_t* word, uint64_t idx); + + //! Get the one bit with the greatest position in the interval \f$[0..idx]\f$ + static uint64_t prev(const uint64_t* word, uint64_t idx); + + //! reverses a given 64 bit word + static uint64_t rev(uint64_t x); +}; + + +// ============= inline - implementations ================ + +#ifdef __AVX2__ +inline uint64_t bits::cnt256(__m256i x){ + + // 4-bit universal table, 4-bit mask + static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F); + static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4); + + __m256i low, high, bwcount; + + // byte halves stored in separate YMM registers + low = _mm256_and_si256(MASK4_256, x); + high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4)); + + // bytewise population count + bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low), + _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high)); + + // Computes sum of absolute differences and stores intermediate results at positions 0,4,8 and 12 + __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256()); + + // Use union to access individual bytes (unsigned integers) + sdsl::YMM_union ymm_union; + ymm_union.ymm = fourSums; + return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12]; +} +#endif + +#ifdef __SSE4_2__ +inline uint64_t bits::cnt128(__m128i x){ + + // 4-bit universal table, 4-bit mask + static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); + static const __m128i MASK4 = _mm_set1_epi8(0x0F); + + __m128i low, high, count; + + low = _mm_and_si128(MASK4, x); + high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4)); + count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low), + _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high)); + + // Use union to access individual bytes (unsigned integers) + sdsl::XMM_union xmm_union; + xmm_union.xmm = _mm_sad_epu8(x, _mm_setzero_si128()); + return xmm_union.values[0] + xmm_union.values[4]; +} +#endif + +// see page 11, Knuth TAOCP Vol 4 F1A +inline uint64_t bits::cnt(uint64_t x) +{ +#ifdef __SSE4_2__ + return __builtin_popcountll(x); +#else +#ifdef POPCOUNT_TL + return lt_cnt[x&0xFFULL] + lt_cnt[(x>>8)&0xFFULL] + + lt_cnt[(x>>16)&0xFFULL] + lt_cnt[(x>>24)&0xFFULL] + + lt_cnt[(x>>32)&0xFFULL] + lt_cnt[(x>>40)&0xFFULL] + + lt_cnt[(x>>48)&0xFFULL] + lt_cnt[(x>>56)&0xFFULL]; +#else + x = x-((x>>1) & 0x5555555555555555ull); + x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full; + return (0x0101010101010101ull*x >> 56); +#endif +#endif +} + +inline uint32_t bits::cnt32(uint32_t x) +{ + x = x-((x>>1) & 0x55555555); + x = (x & 0x33333333) + ((x>>2) & 0x33333333); + return (0x10101010*x >>28)+(0x01010101*x >>28); +} + + +inline uint32_t bits::cnt11(uint64_t x, uint64_t& c) +{ + // extract "11" 2bit blocks + uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL, t; + // extract "10" 2bit blocks + uint64_t ex10or01 = (ex11|(ex11<<1))^x; + + x = ex11 | ((t=(ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c))&(ex10or01&0x5555555555555555ULL)); + c = (ex10or01>>63) or(t < (ex11|(ex11<<1))); + + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); + x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return (0x0101010101010101ULL*x >> 56); +} + +inline uint32_t bits::cnt11(uint64_t x) +{ + // extract "11" 2bit blocks + uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL; + // extract "10" 2bit blocks + uint64_t ex10or01 = (ex11|(ex11<<1))^x; + + x = ex11 | (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL))&(ex10or01&0x5555555555555555ULL)); + + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); + x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return (0x0101010101010101ULL*x >> 56); +} + +inline uint32_t bits::cnt10(uint64_t x, uint64_t& c) +{ + uint32_t res = cnt((x ^((x<<1) | c)) & (~x)); + c = (x >> 63); + return res; +} + +inline uint64_t bits::map10(uint64_t x, uint64_t c) +{ + return ((x ^((x << 1) | c)) & (~x)); +} + +inline uint32_t bits::cnt01(uint64_t x, uint64_t& c) +{ + uint32_t res = cnt((x ^((x<<1) | c)) & x); + c = (x >> 63); + return res; +} +inline uint64_t bits::map01(uint64_t x, uint64_t c) +{ + return ((x ^((x << 1) | c)) & x); +} + +inline uint32_t bits::sel(uint64_t x, uint32_t i) +{ +#ifdef __SSE4_2__ + uint64_t s = x, b; + s = s-((s>>1) & 0x5555555555555555ULL); + s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL); + s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + s = 0x0101010101010101ULL*s; +// now s contains 8 bytes s[7],...,s[0]; s[j] contains the cumulative sum +// of (j+1)*8 least significant bits of s + b = (s+ps_overflow[i]) & 0x8080808080808080ULL; +// ps_overflow contains a bit mask x consisting of 8 bytes +// x[7],...,x[0] and x[j] is set to 128-j +// => a byte b[j] in b is >= 128 if cum sum >= j + +// __builtin_ctzll returns the number of trailing zeros, if b!=0 + int byte_nr = __builtin_ctzll(b) >> 3; // byte nr in [0..7] + s <<= 8; + i -= (s >> (byte_nr<<3)) & 0xFFULL; + return (byte_nr << 3) + lt_sel[((i-1) << 8) + ((x>>(byte_nr<<3))&0xFFULL) ]; +#endif + return _sel(x, i); +} + +inline uint32_t bits::_sel(uint64_t x, uint32_t i) +{ + uint64_t s = x, b; // s = sum + s = s-((s>>1) & 0x5555555555555555ULL); + s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL); + s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + s = 0x0101010101010101ULL*s; + b = (s+ps_overflow[i]);//&0x8080808080808080ULL;// add something to the partial sums to cause overflow + i = (i-1)<<8; + if (b&0x0000000080000000ULL) // byte <=3 + if (b&0x0000000000008000ULL) //byte <= 1 + if (b&0x0000000000000080ULL) + return lt_sel[(x&0xFFULL) + i]; + else + return 8 +lt_sel[(((x>>8)&0xFFULL) + i - ((s&0xFFULL)<<8))&0x7FFULL];//byte 1; + else//byte >1 + if (b&0x0000000000800000ULL) //byte <=2 + return 16+lt_sel[(((x>>16)&0xFFULL) + i - (s&0xFF00ULL))&0x7FFULL];//byte 2; + else + return 24+lt_sel[(((x>>24)&0xFFULL) + i - ((s>>8)&0xFF00ULL))&0x7FFULL];//byte 3; + else// byte > 3 + if (b&0x0000800000000000ULL) // byte <=5 + if (b&0x0000008000000000ULL) //byte <=4 + return 32+lt_sel[(((x>>32)&0xFFULL) + i - ((s>>16)&0xFF00ULL))&0x7FFULL];//byte 4; + else + return 40+lt_sel[(((x>>40)&0xFFULL) + i - ((s>>24)&0xFF00ULL))&0x7FFULL];//byte 5; + else// byte >5 + if (b&0x0080000000000000ULL) //byte<=6 + return 48+lt_sel[(((x>>48)&0xFFULL) + i - ((s>>32)&0xFF00ULL))&0x7FFULL];//byte 6; + else + return 56+lt_sel[(((x>>56)&0xFFULL) + i - ((s>>40)&0xFF00ULL))&0x7FFULL];//byte 7; + return 0; +} + +// using built-in method or +// 64-bit version of 32-bit proposal of +// http://www-graphics.stanford.edu/~seander/bithacks.html +inline uint32_t bits::hi(uint64_t x) +{ +#ifdef __SSE4_2__ + if (x == 0) + return 0; + return 63 - __builtin_clzll(x); +#else + uint64_t t,tt; // temporaries + if ((tt = x >> 32)) { // hi >= 32 + if ((t = tt >> 16)) { // hi >= 48 + return (tt = t >> 8) ? 56 + lt_hi[tt] : 48 + lt_hi[t]; + } else { // hi < 48 + return (t = tt >> 8) ? 40 + lt_hi[t] : 32 + lt_hi[tt]; + } + } else { // hi < 32 + if ((t = x >> 16)) { // hi >= 16 + return (tt = t >> 8) ? 24 + lt_hi[tt] : 16 + lt_hi[t]; + } else { // hi < 16 + return (tt = x >> 8) ? 8 + lt_hi[tt] : lt_hi[x]; + } + } +#endif +} + +// details see: http://citeseer.ist.psu.edu/leiserson98using.html +// or page 10, Knuth TAOCP Vol 4 F1A +inline uint32_t bits::lo(uint64_t x) +{ +#ifdef __SSE4_2__ + if (x==0) + return 0; + return __builtin_ctzll(x); +#else + if (x&1) return 0; + if (x&3) return 1; + if (x&7) return 2; + if (x&0x7F) { // in average every second random number x can be answered this way + return lt_lo[(x&0x7F)>>3]+3; + } + // x&-x equals x with only the lsb set + return lt_deBruijn_to_idx[((x&-x)*deBruijn64)>>58]; +#endif +} + +inline uint32_t bits::hi11(uint64_t x) +{ + // extract "11" 2bit blocks + uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL; + // extract "10" 2bit blocks + uint64_t ex10or01 = (ex11|(ex11<<1))^x; + // extract "10" 2bit blocks + ex11 += (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL)) & ((ex10or01&0x5555555555555555ULL)|ex11)); + return hi(ex11); +} + + +inline uint32_t bits::sel11(uint64_t x, uint32_t i, uint32_t c) +{ + uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL; + uint64_t ex10or01 = (ex11|(ex11<<1))^x; + ex11 += (((ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c)) & ((ex10or01&0x5555555555555555ULL)|ex11)); + return sel(ex11,i); +} + +inline void bits::write_int(uint64_t* word, uint64_t x, uint8_t offset, const uint8_t len) +{ + x &= bits::lo_set[len]; + if (offset + len < 64) { + *word &= + ((bits::all_set << (offset+len)) | bits::lo_set[offset]); // mask 1..10..01..1 + *word |= (x << offset); +// *word ^= ((*word ^ x) & (bits::lo_set[len] << offset) ); +// surprisingly the above line is slower than the lines above + } else { + *word &= + ((bits::lo_set[offset])); // mask 0....01..1 + *word |= (x << offset); + if ((offset = (offset+len)&0x3F)) { // offset+len > 64 + *(word+1) &= (~bits::lo_set[offset]); // mask 1...10..0 +// *(word+1) &= bits::lo_unset[offset]; // mask 1...10..0 +// surprisingly the above line is slower than the line above + *(word+1) |= (x >> (len-offset)); + } + } +} + +inline void bits::write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len) +{ + x &= bits::lo_set[len]; + if (offset + len < 64) { + *word &= + ((bits::all_set << (offset+len)) | bits::lo_set[offset]); // mask 1..10..01..1 + *word |= (x << offset); + offset += len; + } else { + *word &= + ((bits::lo_set[offset])); // mask 0....01..1 + *word |= (x << offset); + if ((offset= (offset+len))>64) {// offset+len >= 64 + offset &= 0x3F; + *(++word) &= (~bits::lo_set[offset]); // mask 1...10..0 + *word |= (x >> (len-offset)); + } else { + offset = 0; + ++word; + } + } +} + +inline uint64_t bits::read_int(const uint64_t* word, uint8_t offset, const uint8_t len) +{ + uint64_t w1 = (*word)>>offset; + if ((offset+len) > 64) { // if offset+len > 64 + return w1 | // w1 or w2 adepted: + ((*(word+1) & bits::lo_set[(offset+len)&0x3F]) // set higher bits zero + << (64-offset)); // move bits to the left + } else { + return w1 & bits::lo_set[len]; + } +} + +inline uint64_t bits::read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len) +{ + uint64_t w1 = (*word)>>offset; + if ((offset = (offset+len))>=64) { // if offset+len > 64 + if (offset==64) { + offset &= 0x3F; + ++word; + return w1; + } else { + offset &= 0x3F; + return w1 | + (((*(++word)) & bits::lo_set[offset]) << (len-offset)); + } + } else { + return w1 & bits::lo_set[len]; + } +} + +inline uint64_t bits::read_unary(const uint64_t* word, uint8_t offset) +{ + uint64_t w = *word >> offset; + if (w) { + return bits::lo(w); + } else { + if (0!=(w=*(++word))) + return bits::lo(w)+64-offset; + uint64_t cnt=2; + while (0==(w=*(++word))) + ++cnt; + return bits::lo(w)+(cnt<<6)-offset; + } + return 0; +} + +inline uint64_t bits::read_unary_and_move(const uint64_t*& word, uint8_t& offset) +{ + uint64_t w = (*word) >> offset; // temporary variable is good for the performance + if (w) { + uint8_t r = bits::lo(w); + offset = (offset + r+1)&0x3F; + // we know that offset + r +1 <= 64, so if the new offset equals 0 increase word + word += (offset==0); + return r; + } else { + uint8_t rr=0; + if (0!=(w=*(++word))) { + rr = bits::lo(w)+64-offset; + offset = (offset+rr+1)&0x3F; + word += (offset==0); + return rr; + } else { + uint64_t cnt_1=1; + while (0==(w=*(++word))) + ++cnt_1; + rr = bits::lo(w)+64-offset; + offset = (offset+rr+1)&0x3F; + word += (offset==0); + return ((cnt_1)<<6) + rr; + } + } + return 0; +} + +inline void bits::move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len) +{ + if ((offset+=len)&0xC0) { // if offset >= 65 + offset&=0x3F; + ++word; + } +} + +inline void bits::move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len) +{ + if ((offset-=len)&0xC0) { // if offset-len<0 + offset&=0x3F; + --word; + } +} + +inline uint64_t bits::next(const uint64_t* word, uint64_t idx) +{ + word += (idx>>6); + if (*word & ~lo_set[idx&0x3F]) { + return (idx & ~((size_t)0x3F)) + lo(*word & ~lo_set[idx&0x3F]); + } + idx = (idx & ~((size_t)0x3F)) + 64; + ++word; + while (*word==0) { + idx += 64; + ++word; + } + return idx + lo(*word); +} + +inline uint64_t bits::prev(const uint64_t* word, uint64_t idx) +{ + word += (idx>>6); + if (*word & lo_set[(idx&0x3F)+1]) { + return (idx & ~((size_t)0x3F)) + hi(*word & lo_set[(idx&0x3F)+1]); + } + idx = (idx & ~((size_t)0x3F)) - 64; + --word; + while (*word==0) { + idx -= 64; + --word; + } + return idx + hi(*word); +} + +inline uint64_t bits::rev(uint64_t x) +{ + x = ((x & 0x5555555555555555ULL) << 1) | ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1); + x = ((x & 0x3333333333333333ULL) << 2) | ((x & 0xCCCCCCCCCCCCCCCCULL) >> 2); + x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x & 0xF0F0F0F0F0F0F0F0ULL) >> 4); + x = ((x & 0x00FF00FF00FF00FFULL) << 8) | ((x & 0xFF00FF00FF00FF00ULL) >> 8); + x = ((x & 0x0000FFFF0000FFFFULL) <<16) | ((x & 0xFFFF0000FFFF0000ULL) >>16); + x = ((x & 0x00000000FFFFFFFFULL) <<32) | ((x & 0xFFFFFFFF00000000ULL) >>32); + return x; +} + +} // end namespace sdsl + +#endif From 91362e69b971d3fa218e94d72e15a2d86e7597c3 Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Mon, 1 Sep 2014 16:06:50 +0200 Subject: [PATCH 08/11] Little fix --- include/sdsl/bits.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index 51027e427..88802f924 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -117,7 +117,7 @@ struct bits { /*! \param XMM register \return Number of set bits. */ -#ifdef __AVX2__ +#ifdef __SSE4_2__ static uint64_t cnt128(__m128i x); #endif From 24ad2ce57f3e41219fee2b901c9baf7493c15200 Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Fri, 5 Sep 2014 16:05:02 +0200 Subject: [PATCH 09/11] bug in uint256_t fixed (wrong accessing pattern to array elements) --- build/include/sdsl/bits.hpp | 694 ------------------------------------ include/sdsl/uint256_t.hpp | 16 +- 2 files changed, 8 insertions(+), 702 deletions(-) delete mode 100644 build/include/sdsl/bits.hpp diff --git a/build/include/sdsl/bits.hpp b/build/include/sdsl/bits.hpp deleted file mode 100644 index 88802f924..000000000 --- a/build/include/sdsl/bits.hpp +++ /dev/null @@ -1,694 +0,0 @@ -/* sdsl - succinct data structures library - Copyright (C) 2008 Simon Gog - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/ . -*/ -/*! \file bits.hpp - \brief bits.hpp contains the sdsl::bits class. - \author Simon Gog -*/ -#ifndef INCLUDED_SDSL_BITS -#define INCLUDED_SDSL_BITS - -#include // for uint64_t uint32_t declaration -#include // for cerr -#include -#include // SSE/AVX -#include "ymm_union.hpp" // convenient YMM register wrapper -#include "xmm_union.hpp" // convenient XMM register wrapper -#ifdef __SSE4_2__ -#include -#endif - -//! Namespace for the succinct data structure library. -namespace sdsl -{ - -//! A helper class for bitwise tricks on 64 bit words. -/*! - bits is a helper class for bitwise tricks and - techniques. For the basic tricks and techiques we refer to Donald E. Knuth's - "The Art of Computer Programming", Volume 4A, Chapter 7.1.3 and - the informative website of Sean E. Anderson about the topic: - http://www-graphics.stanford.edu/~seander/bithacks.html . - - We have added new functions like: cnt11 and sel11. - - All members of this class are static variables or methods. - This class cannot be instantiated. - - \author Simon Gog - */ -struct bits { - bits() = delete; - //! 64bit mask with all bits set to 1. - constexpr static int64_t all_set {-1LL}; - - //! This constant represents a de Bruijn sequence B(k,n) for k=2 and n=6. - /*! Details for de Bruijn sequences see - http://en.wikipedia.org/wiki/De_bruijn_sequence - deBruijn64 is used in combination with the - array lt_deBruijn_to_idx. - */ - constexpr static uint64_t deBruijn64 {0x0218A392CD3D5DBFULL}; - - //! This table maps a 6-bit subsequence S[idx...idx+5] of constant deBruijn64 to idx. - /*! \sa deBruijn64 - */ - static const uint32_t lt_deBruijn_to_idx[64]; - - //! Array containing Fibonacci numbers less than \f$2^64\f$. - static const uint64_t lt_fib[92]; - - //! Lookup table for byte popcounts. - static const uint8_t lt_cnt[256]; - - //! Lookup table for most significant set bit in a byte. - static const uint32_t lt_hi[256]; - - //! lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set. - /*! lo_set[0] = 0ULL, lo_set[1]=1ULL, lo_set[2]=3ULL... - */ - static const uint64_t lo_set[65]; - - //! lo_unset[i] is a 64-bit word with the i least significant bits not set and the high bits set. - /*! lo_unset[0] = FFFFFFFFFFFFFFFFULL, lo_unset_set[1]=FFFFFFFFFFFFFFFEULL, ... - */ - static const uint64_t lo_unset[65]; - - //! Lookup table for least significant set bit in a byte. - static const uint8_t lt_lo[256]; - - //! Lookup table for select on bytes. - /*! Entry at idx = 256*j + i equals the position of the - (j+1)-th set bit in byte i. Positions lie in the range \f$[0..7]\f$. - */ - static const uint8_t lt_sel[256*8]; - - //! Use to help to decide if a prefix sum stored in a byte overflows. - static const uint64_t ps_overflow[65]; - - //! Counts the number of set bits in x. - /*! \param x 64-bit word - \return Number of set bits. - */ - static uint64_t cnt(uint64_t x); - - //! Counts the number of set bits in YMM register x. - /*! \param YMM register - \return Number of set bits. - */ -#ifdef __AVX2__ - static uint64_t cnt256(__m256i x); -#endif - - //! Counts the number of set bits in XMM register x. - /*! \param XMM register - \return Number of set bits. - */ -#ifdef __SSE4_2__ - static uint64_t cnt128(__m128i x); -#endif - - //! Position of the most significant set bit the 64-bit word x - /*! \param x 64-bit word - \return The position (in 0..63) of the least significant set bit - in `x` or 0 if x equals 0. - \sa sel, lo - */ - static uint32_t hi(uint64_t x); - - //! Calculates the position of the rightmost 1-bit in the 64bit integer x if it exists - /*! \param x 64 bit integer. - \return The position (in 0..63) of the rightmost 1-bit in the 64bit integer x if - x>0 and 0 if x equals 0. - \sa sel, hi - */ - static uint32_t lo(uint64_t x); - - //! Counts the number of 1-bits in the 32bit integer x. - /*! This function is a variant of the method cnt. If - 32bit multiplication is fast, this method beats the cnt. - for 32bit integers. - \param x 64bit integer to count the bits. - \return The number of 1-bits in x. - */ - static uint32_t cnt32(uint32_t x); - - //! Count the number of consecutive and distinct 11 in the 64bit integer x. - /*! - \param x 64bit integer to count the terminating sequence 11 of a Fibonacci code. - \param c Carry equals msb of the previous 64bit integer. - */ - static uint32_t cnt11(uint64_t x, uint64_t& c); - - //! Count the number of consecutive and distinct 11 in the 64bit integer x. - /*! - \param x 64bit integer to count the terminating sequence 11 of a Fibonacci code. - */ - static uint32_t cnt11(uint64_t x); - - //! Count 10 bit pairs in the word x. - /*! - * \param x 64bit integer to count the 10 bit pairs. - * \param c Carry equals msb of the previous 64bit integer. - */ - static uint32_t cnt10(uint64_t x, uint64_t& c); - - //! Count 01 bit pairs in the word x. - /*! - * \param x 64bit integer to count the 01 bit pairs. - * \param c Carry equals msb of the previous 64bit integer. - */ - static uint32_t cnt01(uint64_t x, uint64_t& c); - - //! Map all 10 bit pairs to 01 or 1 if c=1 and the lsb=0. All other pairs are mapped to 00. - static uint64_t map10(uint64_t x, uint64_t c=0); - - //! Map all 01 bit pairs to 01 of 1 if c=1 and the lsb=0. All other pairs are mapped to 00. - static uint64_t map01(uint64_t x, uint64_t c=1); - - //! Calculate the position of the i-th rightmost 1 bit in the 64bit integer x - /*! - \param x 64bit integer. - \param i Argument i must be in the range \f$[1..cnt(x)]\f$. - \pre Argument i must be in the range \f$[1..cnt(x)]\f$. - \sa hi, lo - */ - static uint32_t sel(uint64_t x, uint32_t i); - static uint32_t _sel(uint64_t x, uint32_t i); - - //! Calculates the position of the i-th rightmost 11-bit-pattern which terminates a Fibonacci coded integer in x. - /*! \param x 64 bit integer. - \param i Index of 11-bit-pattern. \f$i \in [1..cnt11(x)]\f$ - \param c Carry bit from word before - \return The position (in 1..63) of the i-th 11-bit-pattern which terminates a Fibonacci coded integer in x if - x contains at least i 11-bit-patterns and a undefined value otherwise. - \sa cnt11, hi11, sel - - */ - static uint32_t sel11(uint64_t x, uint32_t i, uint32_t c=0); - - //! Calculates the position of the leftmost 11-bit-pattern which terminates a Fibonacci coded integer in x. - /*! \param x 64 bit integer. - \return The position (in 1..63) of the leftmost 1 of the leftmost 11-bit-pattern which - terminates a Fibonacci coded integer in x if x contains a 11-bit-pattern - and 0 otherwise. - \sa cnt11, sel11 - */ - static uint32_t hi11(uint64_t x); - - //! Writes value x to an bit position in an array. - static void write_int(uint64_t* word, uint64_t x, const uint8_t offset=0, const uint8_t len=64); - - //! Writes value x to an bit position in an array and moves the bit-pointer. - static void write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len); - - //! Reads a value from a bit position in an array. - static uint64_t read_int(const uint64_t* word, uint8_t offset=0, const uint8_t len=64); - - //! Reads a value from a bit position in an array and moved the bit-pointer. - static uint64_t read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len=64); - - //! Reads an unary decoded value from a bit position in an array. - static uint64_t read_unary(const uint64_t* word, uint8_t offset=0); - - //! Reads an unary decoded value from a bit position in an array and moves the bit-pointer. - static uint64_t read_unary_and_move(const uint64_t*& word, uint8_t& offset); - - //! Move the bit-pointer (=uint64_t word and offset) `len` to the right. - /*!\param word 64-bit word part of the bit pointer - * \param offset Offset part of the bit pointer - * \param len Move distance. \f$ len \in [0..64] \f$ - * \sa move_left - */ - static void move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len); - - //! Move the bit-pointer (=uint64_t word and offset) `len` to the left. - /*!\param word 64-bit word part of the bit pointer - * \param offset Offset part of the bit pointer - * \param len Move distance. \f$ len \in [0..64] \f$ - * \sa move_right - */ - static void move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len); - - //! Get the first one bit in the interval \f$[idx..\infty )\f$ - static uint64_t next(const uint64_t* word, uint64_t idx); - - //! Get the one bit with the greatest position in the interval \f$[0..idx]\f$ - static uint64_t prev(const uint64_t* word, uint64_t idx); - - //! reverses a given 64 bit word - static uint64_t rev(uint64_t x); -}; - - -// ============= inline - implementations ================ - -#ifdef __AVX2__ -inline uint64_t bits::cnt256(__m256i x){ - - // 4-bit universal table, 4-bit mask - static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F); - static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, - 1, 2, 2, 3, 2, 3, 3, 4, - 0, 1, 1, 2, 1, 2, 2, 3, - 1, 2, 2, 3, 2, 3, 3, 4); - - __m256i low, high, bwcount; - - // byte halves stored in separate YMM registers - low = _mm256_and_si256(MASK4_256, x); - high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4)); - - // bytewise population count - bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low), - _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high)); - - // Computes sum of absolute differences and stores intermediate results at positions 0,4,8 and 12 - __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256()); - - // Use union to access individual bytes (unsigned integers) - sdsl::YMM_union ymm_union; - ymm_union.ymm = fourSums; - return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12]; -} -#endif - -#ifdef __SSE4_2__ -inline uint64_t bits::cnt128(__m128i x){ - - // 4-bit universal table, 4-bit mask - static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); - static const __m128i MASK4 = _mm_set1_epi8(0x0F); - - __m128i low, high, count; - - low = _mm_and_si128(MASK4, x); - high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4)); - count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low), - _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high)); - - // Use union to access individual bytes (unsigned integers) - sdsl::XMM_union xmm_union; - xmm_union.xmm = _mm_sad_epu8(x, _mm_setzero_si128()); - return xmm_union.values[0] + xmm_union.values[4]; -} -#endif - -// see page 11, Knuth TAOCP Vol 4 F1A -inline uint64_t bits::cnt(uint64_t x) -{ -#ifdef __SSE4_2__ - return __builtin_popcountll(x); -#else -#ifdef POPCOUNT_TL - return lt_cnt[x&0xFFULL] + lt_cnt[(x>>8)&0xFFULL] + - lt_cnt[(x>>16)&0xFFULL] + lt_cnt[(x>>24)&0xFFULL] + - lt_cnt[(x>>32)&0xFFULL] + lt_cnt[(x>>40)&0xFFULL] + - lt_cnt[(x>>48)&0xFFULL] + lt_cnt[(x>>56)&0xFFULL]; -#else - x = x-((x>>1) & 0x5555555555555555ull); - x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full; - return (0x0101010101010101ull*x >> 56); -#endif -#endif -} - -inline uint32_t bits::cnt32(uint32_t x) -{ - x = x-((x>>1) & 0x55555555); - x = (x & 0x33333333) + ((x>>2) & 0x33333333); - return (0x10101010*x >>28)+(0x01010101*x >>28); -} - - -inline uint32_t bits::cnt11(uint64_t x, uint64_t& c) -{ - // extract "11" 2bit blocks - uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL, t; - // extract "10" 2bit blocks - uint64_t ex10or01 = (ex11|(ex11<<1))^x; - - x = ex11 | ((t=(ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c))&(ex10or01&0x5555555555555555ULL)); - c = (ex10or01>>63) or(t < (ex11|(ex11<<1))); - - x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); - x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - return (0x0101010101010101ULL*x >> 56); -} - -inline uint32_t bits::cnt11(uint64_t x) -{ - // extract "11" 2bit blocks - uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL; - // extract "10" 2bit blocks - uint64_t ex10or01 = (ex11|(ex11<<1))^x; - - x = ex11 | (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL))&(ex10or01&0x5555555555555555ULL)); - - x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); - x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - return (0x0101010101010101ULL*x >> 56); -} - -inline uint32_t bits::cnt10(uint64_t x, uint64_t& c) -{ - uint32_t res = cnt((x ^((x<<1) | c)) & (~x)); - c = (x >> 63); - return res; -} - -inline uint64_t bits::map10(uint64_t x, uint64_t c) -{ - return ((x ^((x << 1) | c)) & (~x)); -} - -inline uint32_t bits::cnt01(uint64_t x, uint64_t& c) -{ - uint32_t res = cnt((x ^((x<<1) | c)) & x); - c = (x >> 63); - return res; -} -inline uint64_t bits::map01(uint64_t x, uint64_t c) -{ - return ((x ^((x << 1) | c)) & x); -} - -inline uint32_t bits::sel(uint64_t x, uint32_t i) -{ -#ifdef __SSE4_2__ - uint64_t s = x, b; - s = s-((s>>1) & 0x5555555555555555ULL); - s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL); - s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - s = 0x0101010101010101ULL*s; -// now s contains 8 bytes s[7],...,s[0]; s[j] contains the cumulative sum -// of (j+1)*8 least significant bits of s - b = (s+ps_overflow[i]) & 0x8080808080808080ULL; -// ps_overflow contains a bit mask x consisting of 8 bytes -// x[7],...,x[0] and x[j] is set to 128-j -// => a byte b[j] in b is >= 128 if cum sum >= j - -// __builtin_ctzll returns the number of trailing zeros, if b!=0 - int byte_nr = __builtin_ctzll(b) >> 3; // byte nr in [0..7] - s <<= 8; - i -= (s >> (byte_nr<<3)) & 0xFFULL; - return (byte_nr << 3) + lt_sel[((i-1) << 8) + ((x>>(byte_nr<<3))&0xFFULL) ]; -#endif - return _sel(x, i); -} - -inline uint32_t bits::_sel(uint64_t x, uint32_t i) -{ - uint64_t s = x, b; // s = sum - s = s-((s>>1) & 0x5555555555555555ULL); - s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL); - s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - s = 0x0101010101010101ULL*s; - b = (s+ps_overflow[i]);//&0x8080808080808080ULL;// add something to the partial sums to cause overflow - i = (i-1)<<8; - if (b&0x0000000080000000ULL) // byte <=3 - if (b&0x0000000000008000ULL) //byte <= 1 - if (b&0x0000000000000080ULL) - return lt_sel[(x&0xFFULL) + i]; - else - return 8 +lt_sel[(((x>>8)&0xFFULL) + i - ((s&0xFFULL)<<8))&0x7FFULL];//byte 1; - else//byte >1 - if (b&0x0000000000800000ULL) //byte <=2 - return 16+lt_sel[(((x>>16)&0xFFULL) + i - (s&0xFF00ULL))&0x7FFULL];//byte 2; - else - return 24+lt_sel[(((x>>24)&0xFFULL) + i - ((s>>8)&0xFF00ULL))&0x7FFULL];//byte 3; - else// byte > 3 - if (b&0x0000800000000000ULL) // byte <=5 - if (b&0x0000008000000000ULL) //byte <=4 - return 32+lt_sel[(((x>>32)&0xFFULL) + i - ((s>>16)&0xFF00ULL))&0x7FFULL];//byte 4; - else - return 40+lt_sel[(((x>>40)&0xFFULL) + i - ((s>>24)&0xFF00ULL))&0x7FFULL];//byte 5; - else// byte >5 - if (b&0x0080000000000000ULL) //byte<=6 - return 48+lt_sel[(((x>>48)&0xFFULL) + i - ((s>>32)&0xFF00ULL))&0x7FFULL];//byte 6; - else - return 56+lt_sel[(((x>>56)&0xFFULL) + i - ((s>>40)&0xFF00ULL))&0x7FFULL];//byte 7; - return 0; -} - -// using built-in method or -// 64-bit version of 32-bit proposal of -// http://www-graphics.stanford.edu/~seander/bithacks.html -inline uint32_t bits::hi(uint64_t x) -{ -#ifdef __SSE4_2__ - if (x == 0) - return 0; - return 63 - __builtin_clzll(x); -#else - uint64_t t,tt; // temporaries - if ((tt = x >> 32)) { // hi >= 32 - if ((t = tt >> 16)) { // hi >= 48 - return (tt = t >> 8) ? 56 + lt_hi[tt] : 48 + lt_hi[t]; - } else { // hi < 48 - return (t = tt >> 8) ? 40 + lt_hi[t] : 32 + lt_hi[tt]; - } - } else { // hi < 32 - if ((t = x >> 16)) { // hi >= 16 - return (tt = t >> 8) ? 24 + lt_hi[tt] : 16 + lt_hi[t]; - } else { // hi < 16 - return (tt = x >> 8) ? 8 + lt_hi[tt] : lt_hi[x]; - } - } -#endif -} - -// details see: http://citeseer.ist.psu.edu/leiserson98using.html -// or page 10, Knuth TAOCP Vol 4 F1A -inline uint32_t bits::lo(uint64_t x) -{ -#ifdef __SSE4_2__ - if (x==0) - return 0; - return __builtin_ctzll(x); -#else - if (x&1) return 0; - if (x&3) return 1; - if (x&7) return 2; - if (x&0x7F) { // in average every second random number x can be answered this way - return lt_lo[(x&0x7F)>>3]+3; - } - // x&-x equals x with only the lsb set - return lt_deBruijn_to_idx[((x&-x)*deBruijn64)>>58]; -#endif -} - -inline uint32_t bits::hi11(uint64_t x) -{ - // extract "11" 2bit blocks - uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL; - // extract "10" 2bit blocks - uint64_t ex10or01 = (ex11|(ex11<<1))^x; - // extract "10" 2bit blocks - ex11 += (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL)) & ((ex10or01&0x5555555555555555ULL)|ex11)); - return hi(ex11); -} - - -inline uint32_t bits::sel11(uint64_t x, uint32_t i, uint32_t c) -{ - uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL; - uint64_t ex10or01 = (ex11|(ex11<<1))^x; - ex11 += (((ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c)) & ((ex10or01&0x5555555555555555ULL)|ex11)); - return sel(ex11,i); -} - -inline void bits::write_int(uint64_t* word, uint64_t x, uint8_t offset, const uint8_t len) -{ - x &= bits::lo_set[len]; - if (offset + len < 64) { - *word &= - ((bits::all_set << (offset+len)) | bits::lo_set[offset]); // mask 1..10..01..1 - *word |= (x << offset); -// *word ^= ((*word ^ x) & (bits::lo_set[len] << offset) ); -// surprisingly the above line is slower than the lines above - } else { - *word &= - ((bits::lo_set[offset])); // mask 0....01..1 - *word |= (x << offset); - if ((offset = (offset+len)&0x3F)) { // offset+len > 64 - *(word+1) &= (~bits::lo_set[offset]); // mask 1...10..0 -// *(word+1) &= bits::lo_unset[offset]; // mask 1...10..0 -// surprisingly the above line is slower than the line above - *(word+1) |= (x >> (len-offset)); - } - } -} - -inline void bits::write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len) -{ - x &= bits::lo_set[len]; - if (offset + len < 64) { - *word &= - ((bits::all_set << (offset+len)) | bits::lo_set[offset]); // mask 1..10..01..1 - *word |= (x << offset); - offset += len; - } else { - *word &= - ((bits::lo_set[offset])); // mask 0....01..1 - *word |= (x << offset); - if ((offset= (offset+len))>64) {// offset+len >= 64 - offset &= 0x3F; - *(++word) &= (~bits::lo_set[offset]); // mask 1...10..0 - *word |= (x >> (len-offset)); - } else { - offset = 0; - ++word; - } - } -} - -inline uint64_t bits::read_int(const uint64_t* word, uint8_t offset, const uint8_t len) -{ - uint64_t w1 = (*word)>>offset; - if ((offset+len) > 64) { // if offset+len > 64 - return w1 | // w1 or w2 adepted: - ((*(word+1) & bits::lo_set[(offset+len)&0x3F]) // set higher bits zero - << (64-offset)); // move bits to the left - } else { - return w1 & bits::lo_set[len]; - } -} - -inline uint64_t bits::read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len) -{ - uint64_t w1 = (*word)>>offset; - if ((offset = (offset+len))>=64) { // if offset+len > 64 - if (offset==64) { - offset &= 0x3F; - ++word; - return w1; - } else { - offset &= 0x3F; - return w1 | - (((*(++word)) & bits::lo_set[offset]) << (len-offset)); - } - } else { - return w1 & bits::lo_set[len]; - } -} - -inline uint64_t bits::read_unary(const uint64_t* word, uint8_t offset) -{ - uint64_t w = *word >> offset; - if (w) { - return bits::lo(w); - } else { - if (0!=(w=*(++word))) - return bits::lo(w)+64-offset; - uint64_t cnt=2; - while (0==(w=*(++word))) - ++cnt; - return bits::lo(w)+(cnt<<6)-offset; - } - return 0; -} - -inline uint64_t bits::read_unary_and_move(const uint64_t*& word, uint8_t& offset) -{ - uint64_t w = (*word) >> offset; // temporary variable is good for the performance - if (w) { - uint8_t r = bits::lo(w); - offset = (offset + r+1)&0x3F; - // we know that offset + r +1 <= 64, so if the new offset equals 0 increase word - word += (offset==0); - return r; - } else { - uint8_t rr=0; - if (0!=(w=*(++word))) { - rr = bits::lo(w)+64-offset; - offset = (offset+rr+1)&0x3F; - word += (offset==0); - return rr; - } else { - uint64_t cnt_1=1; - while (0==(w=*(++word))) - ++cnt_1; - rr = bits::lo(w)+64-offset; - offset = (offset+rr+1)&0x3F; - word += (offset==0); - return ((cnt_1)<<6) + rr; - } - } - return 0; -} - -inline void bits::move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len) -{ - if ((offset+=len)&0xC0) { // if offset >= 65 - offset&=0x3F; - ++word; - } -} - -inline void bits::move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len) -{ - if ((offset-=len)&0xC0) { // if offset-len<0 - offset&=0x3F; - --word; - } -} - -inline uint64_t bits::next(const uint64_t* word, uint64_t idx) -{ - word += (idx>>6); - if (*word & ~lo_set[idx&0x3F]) { - return (idx & ~((size_t)0x3F)) + lo(*word & ~lo_set[idx&0x3F]); - } - idx = (idx & ~((size_t)0x3F)) + 64; - ++word; - while (*word==0) { - idx += 64; - ++word; - } - return idx + lo(*word); -} - -inline uint64_t bits::prev(const uint64_t* word, uint64_t idx) -{ - word += (idx>>6); - if (*word & lo_set[(idx&0x3F)+1]) { - return (idx & ~((size_t)0x3F)) + hi(*word & lo_set[(idx&0x3F)+1]); - } - idx = (idx & ~((size_t)0x3F)) - 64; - --word; - while (*word==0) { - idx -= 64; - --word; - } - return idx + hi(*word); -} - -inline uint64_t bits::rev(uint64_t x) -{ - x = ((x & 0x5555555555555555ULL) << 1) | ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1); - x = ((x & 0x3333333333333333ULL) << 2) | ((x & 0xCCCCCCCCCCCCCCCCULL) >> 2); - x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x & 0xF0F0F0F0F0F0F0F0ULL) >> 4); - x = ((x & 0x00FF00FF00FF00FFULL) << 8) | ((x & 0xFF00FF00FF00FF00ULL) >> 8); - x = ((x & 0x0000FFFF0000FFFFULL) <<16) | ((x & 0xFFFF0000FFFF0000ULL) >>16); - x = ((x & 0x00000000FFFFFFFFULL) <<32) | ((x & 0xFFFFFFFF00000000ULL) >>32); - return x; -} - -} // end namespace sdsl - -#endif diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp index 200e6f587..c522da121 100644 --- a/include/sdsl/uint256_t.hpp +++ b/include/sdsl/uint256_t.hpp @@ -64,20 +64,20 @@ class uint256_t inline uint16_t popcount() { #ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle sdsl::YMM_union ymm_union; - ymm_union[0] = m_lo; - ymm_union[1] = m_mid; - ymm_union[2] = m_high >> 64; - ymm_union[3] = m_high; + ymm_union.values[0] = m_lo; + ymm_union.values[1] = m_mid; + ymm_union.values[2] = m_high >> 64; + ymm_union.values[3] = m_high; return bits::cnt256(ymm_union.ymm); #endif #ifdef __SSE4_2__ // 16 table lookups per clock cycle sdsl::XMM_union xmm_union1; sdsl::XMM_union xmm_union2; - xmm_union1[0] = m_lo; - xmm_union1[1] = m_mid; - xmm_union2[0] = m_high >> 64; - xmm_union2[1] = m_high; + xmm_union1.values[0] = m_lo; + xmm_union1.values[1] = m_mid; + xmm_union2.values[0] = m_high >> 64; + xmm_union2.values[1] = m_high; return bits::cnt128(xmm_union1.xmm) + bits::cnt128(xmm_union2.xmm); From db534bb99b4ff67ab80a9df802c9b7cdf022d41a Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Fri, 5 Sep 2014 18:53:17 +0200 Subject: [PATCH 10/11] values instead of ymm to access data in ymm_union --- include/sdsl/bits.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index 88802f924..306d32ee9 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -276,13 +276,10 @@ inline uint64_t bits::cnt256(__m256i x){ bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low), _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high)); - // Computes sum of absolute differences and stores intermediate results at positions 0,4,8 and 12 - __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256()); - // Use union to access individual bytes (unsigned integers) sdsl::YMM_union ymm_union; - ymm_union.ymm = fourSums; - return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12]; + ymm_union.ymm = _mm256_sad_epu8(bwcount, _mm256_setzero_si256()); + return ymm_union.values[0] + ymm_union.values[4] + ymm_union.values[8] + ymm_union.values[12]; } #endif From 7fe66dcb1d63c2ad820292dda92cd2b9a4fa1ff0 Mon Sep 17 00:00:00 2001 From: Diego Havenstein Date: Fri, 5 Sep 2014 19:00:22 +0200 Subject: [PATCH 11/11] count variable being used where it needs to be used now --- include/sdsl/bits.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index 306d32ee9..4f7436020 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -299,7 +299,7 @@ inline uint64_t bits::cnt128(__m128i x){ // Use union to access individual bytes (unsigned integers) sdsl::XMM_union xmm_union; - xmm_union.xmm = _mm_sad_epu8(x, _mm_setzero_si128()); + xmm_union.xmm = _mm_sad_epu8(count, _mm_setzero_si128()); return xmm_union.values[0] + xmm_union.values[4]; } #endif