From 88a8a0c0d41061a18d8c94bda5117765613272cb Mon Sep 17 00:00:00 2001
From: Diego Havenstein <diego.havenstein@tum.de>
Date: Fri, 29 Aug 2014 14:13:07 +0200
Subject: [PATCH 01/11] AVX2 support for popcount function. To be tested.

---
 include/sdsl/bits.hpp      | 30 ++++++++++++++++++++++++++++++
 include/sdsl/uint256_t.hpp |  9 +++++++++
 include/sdsl/ymm_union.hpp | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+)
 create mode 100644 include/sdsl/ymm_union.hpp
diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
index b13fe791f..0bc593e30 100644
--- a/include/sdsl/bits.hpp
+++ b/include/sdsl/bits.hpp
@@ -24,6 +24,8 @@
 #include <stdint.h> // for uint64_t uint32_t declaration
 #include <iostream>// for cerr
 #include <cassert>
+#include <x86intrin.h> // SSE/AVX
+#include <ymm_union.hpp> // convenient YMM register wrapper
 #ifdef __SSE4_2__
 #include <xmmintrin.h>
 #endif
@@ -237,6 +239,34 @@ struct bits {
 
 // ============= inline - implementations ================
 
+#ifdef __AVX2__
+inline uint64_t bits::cnt256(__m256i x){
+  // 4-bit universal table
+  static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
+                                                                    1, 2, 2, 3, 2, 3, 3, 4, 
+                                                                    0, 1, 1, 2, 1, 2, 2, 3, 
+                                                                    1, 2, 2, 3, 2, 3, 3, 4);
+ 
+  __m256i low, high, bwcount;
+ 
+  // byte halves stored in separate YMM registers
+  low = _mm256_and_si256(MASK4_256, buffer.avx);
+  high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(buffer.avx, 4));
+ 
+  // bytewise population count
+  bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low),
+                          _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high));
+ 
+  // Computes sum of absolute differences and stores intermediate results at positions 0,4,8 and 12
+  __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256());
+ 
+  // Use union to access individual bytes (unsigned integers)
+  sdsl::YMM_Union<uint8_t> ymm_union;
+  ymm_union.ymm = fourSums;
+  return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12];
+}
+#endif
+
 // see page 11, Knuth TAOCP Vol 4 F1A
 inline uint64_t bits::cnt(uint64_t x)
 {
diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp
index b93a42e31..da5d8ef5e 100644
--- a/include/sdsl/uint256_t.hpp
+++ b/include/sdsl/uint256_t.hpp
@@ -62,8 +62,17 @@ class uint256_t
         }
 
         inline uint16_t popcount() {
+#ifdef __AVX2__
+          sdsl::YMM_Union<uint64_t> ymm_union;
+          ymm_union[0] = m_lo;
+          ymm_union[1] = m_mid;
+          ymm_union[2] = m_high >> 64;
+          ymm_union[3] = m_high;
+          return bits::cnt256(ymm_union.ymm);
+#else
             return ((uint16_t)bits::cnt(m_lo)) + bits::cnt(m_mid)
                    + bits::cnt(m_high>>64) + bits::cnt(m_high);
+#endif
         }
 
         inline uint16_t hi() {
diff --git a/include/sdsl/ymm_union.hpp b/include/sdsl/ymm_union.hpp
new file mode 100644
index 000000000..ae1dca0df
--- /dev/null
+++ b/include/sdsl/ymm_union.hpp
@@ -0,0 +1,37 @@
+/* sdsl - succinct data structures library
+    Copyright (C) 2012 Simon Gog
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see http://www.gnu.org/licenses/ .
+*/
+/*! \file uint256_t.hpp
+   \brief uint256_t.hpp contains a convenientunion for YMM registers (256-bits).
+   \author Diego Havenstein
+*/
+#ifndef INCLUDED_SDSL_YMMUNION
+#define INCLUDED_SDSL_YMMUNION
+
+namespace sdsl
+{
+
+#ifdef __AVX2__
+template<typename T>
+union YMM_union {
+  __m256i ymm;
+  T values[32/sizeof(T)];
+};
+#endif
+
+} // end namespace
+
+#endif

From a65f3f45a2161a356c74a7b8cb5fa9c1a68b4607 Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Fri, 29 Aug 2014 14:51:09 +0200
Subject: [PATCH 02/11] Added support for SSE based popcount

---
 include/sdsl/bits.hpp      | 19 +++++++++++++++++++
 include/sdsl/uint256_t.hpp | 17 +++++++++++++++--
 include/sdsl/xmm_union.hpp | 37 +++++++++++++++++++++++++++++++++++++
 include/sdsl/ymm_union.hpp |  4 ++--
 4 files changed, 73 insertions(+), 4 deletions(-)
 create mode 100644 include/sdsl/xmm_union.hpp

diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
index 0bc593e30..a6ae19ab0 100644
--- a/include/sdsl/bits.hpp
+++ b/include/sdsl/bits.hpp
@@ -267,6 +267,25 @@ inline uint64_t bits::cnt256(__m256i x){
 }
 #endif
 
+#ifdef __SSE4_2__
+inline uint64_t bits::cnt128(__m128i x){
+  // 4-bit universal table
+  static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+ 
+  __m128i low, high, count;
+
+  low = _mm_and_si128(MASK4, buffer);
+  high = _mm_and_si128(MASK4, _mm_srli_epi16(buffer, 4));
+  count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low),
+                       _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high));
+  
+  // Use union to access individual bytes (unsigned integers)
+  sdsl::XMM_Union<uint8_t> xmm_union;
+  xmm_union.sse = _mm_sad_epu8(x, _mm_setzero_si128());
+  return xmm_union.values[0] + xmm_union.values[4];
+}
+#endif
+
 // see page 11, Knuth TAOCP Vol 4 F1A
 inline uint64_t bits::cnt(uint64_t x)
 {
diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp
index da5d8ef5e..ec84d9f85 100644
--- a/include/sdsl/uint256_t.hpp
+++ b/include/sdsl/uint256_t.hpp
@@ -62,14 +62,27 @@ class uint256_t
         }
 
         inline uint16_t popcount() {
-#ifdef __AVX2__
+#ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle
           sdsl::YMM_Union<uint64_t> ymm_union;
           ymm_union[0] = m_lo;
           ymm_union[1] = m_mid;
           ymm_union[2] = m_high >> 64;
           ymm_union[3] = m_high;
           return bits::cnt256(ymm_union.ymm);
-#else
+#endif
+          
+#ifdef __SSE4_2__ // 16 table lookups per clock cycle
+          sdsl::XMM_Union<uint64_t> xmm_union1;
+          sdsl::XMM_Union<uint64_t> xmm_union2;
+          xmm_union1[0] = m_lo;
+          xmm_union1[1] = m_mid;
+          xmm_union2[0] = m_high >> 64;
+          xmm_union2[1] = m_high;
+
+          return bits::cnt128(xmm_union1.xmm) + bits::cnt128(xmm_union2.xmm);
+          
+          
+#else  // byte after byte
             return ((uint16_t)bits::cnt(m_lo)) + bits::cnt(m_mid)
                    + bits::cnt(m_high>>64) + bits::cnt(m_high);
 #endif
diff --git a/include/sdsl/xmm_union.hpp b/include/sdsl/xmm_union.hpp
new file mode 100644
index 000000000..021851ec4
--- /dev/null
+++ b/include/sdsl/xmm_union.hpp
@@ -0,0 +1,37 @@
+/* sdsl - succinct data structures library
+    Copyright (C) 2012 Simon Gog
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see http://www.gnu.org/licenses/ .
+*/
+/*! \file xmm_union.hpp
+   \brief xmm_union.hpp contains a convenientunion for XMM registers (128-bits).
+   \author Diego Havenstein
+*/
+#ifndef INCLUDED_SDSL_XMMUNION
+#define INCLUDED_SDSL_XMMUNION
+
+namespace sdsl
+{                     X
+
+#ifdef __AVX2__
+template<typename T>
+union XMM_union {
+  __m128i xmm;
+  T values[16/sizeof(T)];
+};
+#endif
+
+} // end namespace
+
+#endif
diff --git a/include/sdsl/ymm_union.hpp b/include/sdsl/ymm_union.hpp
index ae1dca0df..809b050b3 100644
--- a/include/sdsl/ymm_union.hpp
+++ b/include/sdsl/ymm_union.hpp
@@ -14,8 +14,8 @@
     You should have received a copy of the GNU General Public License
     along with this program.  If not, see http://www.gnu.org/licenses/ .
 */
-/*! \file uint256_t.hpp
-   \brief uint256_t.hpp contains a convenientunion for YMM registers (256-bits).
+/*! \file ymm_union.hpp
+   \brief ymm_union.hpp contains a convenientunion for YMM registers (256-bits).
    \author Diego Havenstein
 */
 #ifndef INCLUDED_SDSL_YMMUNION

From 8983625a9451e3642af104a7206dc4c94a68c6c7 Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Fri, 29 Aug 2014 14:59:07 +0200
Subject: [PATCH 03/11] Bugfixes, now compiles and "make test" is running

---
 include/sdsl/bits.hpp      | 3 ++-
 include/sdsl/xmm_union.hpp | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
index a6ae19ab0..aaf044bbb 100644
--- a/include/sdsl/bits.hpp
+++ b/include/sdsl/bits.hpp
@@ -25,7 +25,8 @@
 #include <iostream>// for cerr
 #include <cassert>
 #include <x86intrin.h> // SSE/AVX
-#include <ymm_union.hpp> // convenient YMM register wrapper
+#include "ymm_union.hpp" // convenient YMM register wrapper
+#include "xmm_union.hpp" // convenient XMM register wrapper
 #ifdef __SSE4_2__
 #include <xmmintrin.h>
 #endif
diff --git a/include/sdsl/xmm_union.hpp b/include/sdsl/xmm_union.hpp
index 021851ec4..a28f4402c 100644
--- a/include/sdsl/xmm_union.hpp
+++ b/include/sdsl/xmm_union.hpp
@@ -22,9 +22,9 @@
 #define INCLUDED_SDSL_XMMUNION
 
 namespace sdsl
-{                     X
+{
 
-#ifdef __AVX2__
+#ifdef __SSE4_2__
 template<typename T>
 union XMM_union {
   __m128i xmm;

From 40409bceacfbb72ac8a99c77bc9f28d31d656581 Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Fri, 29 Aug 2014 15:22:07 +0200
Subject: [PATCH 04/11] CheckAVX2.cmake

---
 CMakeLists.txt               |  9 +++++++++
 CMakeModules/CheckAVX2.cmake | 24 ++++++++++++++++++++++++
 include/sdsl/bits.hpp        |  2 ++
 3 files changed, 35 insertions(+)
 create mode 100644 CMakeModules/CheckAVX2.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 20b137765..bbbbd4387 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,15 @@ if( BUILTIN_POPCNT )
     endif()
 endif()
 
+include(CheckAVX2)
+if( AVX2 )
+    if( CMAKE_COMPILER_IS_GNUCXX )
+        append_cxx_compiler_flags("-mavx2" "GCC" CMAKE_CXX_OPT_FLAGS)
+    else()
+        append_cxx_compiler_flags("-mavx2" "CLANG" CMAKE_CXX_OPT_FLAGS)
+    endif()
+endif()
+
 add_subdirectory(external)
 add_subdirectory(include)
 add_subdirectory(lib)
diff --git a/CMakeModules/CheckAVX2.cmake b/CMakeModules/CheckAVX2.cmake
new file mode 100644
index 000000000..dcd8815f8
--- /dev/null
+++ b/CMakeModules/CheckAVX2.cmake
@@ -0,0 +1,24 @@
+# Check if the CPU provides fast operations
+# for popcount, leftmost and rightmost bit
+
+set(AVX2 0)
+# Check if we are on a Linux system
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+	# Use /proc/cpuinfo to get the information
+	file(STRINGS "/proc/cpuinfo" _cpuinfo)
+	if(_cpuinfo MATCHES "(avx2)")
+    set(AVX2 1)
+	endif()
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
+#  handle windows
+#	get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE)
+#	get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE)	
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+#  handle MacOs
+execute_process(COMMAND sysctl -n machdep.cpu.features
+                OUTPUT_VARIABLE _cpuinfo OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(_cpuinfo MATCHES "AVX2")
+    set(AVX2 1)
+	endif()
+endif()	
+	
diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
index aaf044bbb..9cf981945 100644
--- a/include/sdsl/bits.hpp
+++ b/include/sdsl/bits.hpp
@@ -242,6 +242,7 @@ struct bits {
 
 #ifdef __AVX2__
 inline uint64_t bits::cnt256(__m256i x){
+
   // 4-bit universal table
   static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                                                     1, 2, 2, 3, 2, 3, 3, 4, 
@@ -270,6 +271,7 @@ inline uint64_t bits::cnt256(__m256i x){
 
 #ifdef __SSE4_2__
 inline uint64_t bits::cnt128(__m128i x){
+
   // 4-bit universal table
   static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
  

From 9748a346d3d366668e79506a54e93276ee53a03d Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Fri, 29 Aug 2014 16:05:17 +0200
Subject: [PATCH 05/11] Bug fixes

---
 include/sdsl/bits.hpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
index 9cf981945..d8c454dea 100644
--- a/include/sdsl/bits.hpp
+++ b/include/sdsl/bits.hpp
@@ -243,7 +243,8 @@ struct bits {
 #ifdef __AVX2__
 inline uint64_t bits::cnt256(__m256i x){
 
-  // 4-bit universal table
+  // 4-bit universal table, 4-bit mask
+  static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F);
   static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                                                     1, 2, 2, 3, 2, 3, 3, 4, 
                                                                     0, 1, 1, 2, 1, 2, 2, 3, 
@@ -252,8 +253,8 @@ inline uint64_t bits::cnt256(__m256i x){
   __m256i low, high, bwcount;
  
   // byte halves stored in separate YMM registers
-  low = _mm256_and_si256(MASK4_256, buffer.avx);
-  high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(buffer.avx, 4));
+  low = _mm256_and_si256(MASK4_256, x);
+  high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4));
  
   // bytewise population count
   bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low),
@@ -272,13 +273,14 @@ inline uint64_t bits::cnt256(__m256i x){
 #ifdef __SSE4_2__
 inline uint64_t bits::cnt128(__m128i x){
 
-  // 4-bit universal table
+  // 4-bit universal table, 4-bit mask
   static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+  static const __m128i MASK4 = _mm_set1_epi8(0x0F);
  
   __m128i low, high, count;
 
-  low = _mm_and_si128(MASK4, buffer);
-  high = _mm_and_si128(MASK4, _mm_srli_epi16(buffer, 4));
+  low = _mm_and_si128(MASK4, x);
+  high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4));
   count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low),
                        _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high));
   

From 7f72dc4101a16d2a9d24c0fc2ebb5050620574aa Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Mon, 1 Sep 2014 15:59:18 +0200
Subject: [PATCH 06/11] Some of the fixes suggested on GitHub by Simon

---
 CMakeLists.txt             |  3 ++-
 include/sdsl/bits.hpp      | 22 +++++++++++++++++++---
 include/sdsl/uint256_t.hpp |  6 +++---
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bbbbd4387..8bbae670d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,9 +65,10 @@ include(CheckAVX2)
 if( AVX2 )
     if( CMAKE_COMPILER_IS_GNUCXX )
         append_cxx_compiler_flags("-mavx2" "GCC" CMAKE_CXX_OPT_FLAGS)
-    else()
+    elseif( CMAKE_COMPILER_IS_CLANGXX )
         append_cxx_compiler_flags("-mavx2" "CLANG" CMAKE_CXX_OPT_FLAGS)
     endif()
+        message(STATUS "Your compiler is not supported yet!")
 endif()
 
 add_subdirectory(external)
diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
index d8c454dea..51027e427 100644
--- a/include/sdsl/bits.hpp
+++ b/include/sdsl/bits.hpp
@@ -105,6 +105,22 @@ struct bits {
      */
     static uint64_t cnt(uint64_t x);
 
+    //! Counts the number of set bits in YMM register x.
+    /*! \param  YMM register
+        \return Number of set bits.
+     */
+#ifdef __AVX2__
+    static uint64_t cnt256(__m256i x);
+#endif
+
+    //! Counts the number of set bits in XMM register x.
+    /*! \param  XMM register
+        \return Number of set bits.
+     */ 
+#ifdef __AVX2__
+    static uint64_t cnt128(__m128i x);
+#endif
+
     //! Position of the most significant set bit the 64-bit word x
     /*! \param x 64-bit word
         \return The position (in 0..63) of the least significant set bit
@@ -264,7 +280,7 @@ inline uint64_t bits::cnt256(__m256i x){
   __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256());
  
   // Use union to access individual bytes (unsigned integers)
-  sdsl::YMM_Union<uint8_t> ymm_union;
+  sdsl::YMM_union<uint8_t> ymm_union;
   ymm_union.ymm = fourSums;
   return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12];
 }
@@ -285,8 +301,8 @@ inline uint64_t bits::cnt128(__m128i x){
                        _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high));
   
   // Use union to access individual bytes (unsigned integers)
-  sdsl::XMM_Union<uint8_t> xmm_union;
-  xmm_union.sse = _mm_sad_epu8(x, _mm_setzero_si128());
+  sdsl::XMM_union<uint8_t> xmm_union;
+  xmm_union.xmm = _mm_sad_epu8(x, _mm_setzero_si128());
   return xmm_union.values[0] + xmm_union.values[4];
 }
 #endif
diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp
index ec84d9f85..200e6f587 100644
--- a/include/sdsl/uint256_t.hpp
+++ b/include/sdsl/uint256_t.hpp
@@ -63,7 +63,7 @@ class uint256_t
 
         inline uint16_t popcount() {
 #ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle
-          sdsl::YMM_Union<uint64_t> ymm_union;
+          sdsl::YMM_union<uint64_t> ymm_union;
           ymm_union[0] = m_lo;
           ymm_union[1] = m_mid;
           ymm_union[2] = m_high >> 64;
@@ -72,8 +72,8 @@ class uint256_t
 #endif
           
 #ifdef __SSE4_2__ // 16 table lookups per clock cycle
-          sdsl::XMM_Union<uint64_t> xmm_union1;
-          sdsl::XMM_Union<uint64_t> xmm_union2;
+          sdsl::XMM_union<uint64_t> xmm_union1;
+          sdsl::XMM_union<uint64_t> xmm_union2;
           xmm_union1[0] = m_lo;
           xmm_union1[1] = m_mid;
           xmm_union2[0] = m_high >> 64;

From bd7ad14a0ee9b8ee731490cd39bb03151c36c7cc Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Mon, 1 Sep 2014 16:02:23 +0200
Subject: [PATCH 07/11] __AVX2__ -> __SSE4_2__

---
 build/include/sdsl/bits.hpp | 694 ++++++++++++++++++++++++++++++++++++
 1 file changed, 694 insertions(+)
 create mode 100644 build/include/sdsl/bits.hpp

diff --git a/build/include/sdsl/bits.hpp b/build/include/sdsl/bits.hpp
new file mode 100644
index 000000000..88802f924
--- /dev/null
+++ b/build/include/sdsl/bits.hpp
@@ -0,0 +1,694 @@
+/* sdsl - succinct data structures library
+    Copyright (C) 2008 Simon Gog
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see http://www.gnu.org/licenses/ .
+*/
+/*! \file bits.hpp
+    \brief bits.hpp contains the sdsl::bits class.
+	\author Simon Gog
+*/
+#ifndef INCLUDED_SDSL_BITS
+#define INCLUDED_SDSL_BITS
+
+#include <stdint.h> // for uint64_t uint32_t declaration
+#include <iostream>// for cerr
+#include <cassert>
+#include <x86intrin.h> // SSE/AVX
+#include "ymm_union.hpp" // convenient YMM register wrapper
+#include "xmm_union.hpp" // convenient XMM register wrapper
+#ifdef __SSE4_2__
+#include <xmmintrin.h>
+#endif
+
+//! Namespace for the succinct data structure library.
+namespace sdsl
+{
+
+//! A helper class for bitwise tricks on 64 bit words.
+/*!
+	bits is a helper class for bitwise tricks and
+	techniques. For the basic tricks and techiques we refer to Donald E. Knuth's
+	"The Art of Computer Programming", Volume 4A, Chapter 7.1.3 and
+	the informative website of Sean E. Anderson about the topic:
+	http://www-graphics.stanford.edu/~seander/bithacks.html .
+
+	We have added new functions like: cnt11 and sel11.
+
+	All members of this class are static variables or methods.
+	This class cannot be instantiated.
+
+	\author Simon Gog
+ */
+struct bits {
+    bits() = delete;
+    //! 64bit mask with all bits set to 1.
+    constexpr static int64_t  all_set {-1LL};
+
+    //! This constant represents a de Bruijn sequence B(k,n) for k=2 and n=6.
+    /*! Details for de Bruijn sequences see
+       http://en.wikipedia.org/wiki/De_bruijn_sequence
+       deBruijn64 is used in combination with the
+       array lt_deBruijn_to_idx.
+    */
+    constexpr static uint64_t deBruijn64 {0x0218A392CD3D5DBFULL};
+
+    //! This table maps a 6-bit subsequence S[idx...idx+5] of constant deBruijn64 to idx.
+    /*! \sa deBruijn64
+    */
+    static const uint32_t lt_deBruijn_to_idx[64];
+
+    //! Array containing Fibonacci numbers less than \f$2^64\f$.
+    static const uint64_t lt_fib[92];
+
+    //! Lookup table for byte popcounts.
+    static const uint8_t lt_cnt[256];
+
+    //! Lookup table for most significant set bit in a byte.
+    static const uint32_t lt_hi[256];
+
+    //! lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
+    /*! lo_set[0] = 0ULL, lo_set[1]=1ULL, lo_set[2]=3ULL...
+     */
+    static const uint64_t lo_set[65];
+
+    //! lo_unset[i] is a 64-bit word with the i least significant bits not set and the high bits set.
+    /*! lo_unset[0] = FFFFFFFFFFFFFFFFULL, lo_unset_set[1]=FFFFFFFFFFFFFFFEULL, ...
+     */
+    static const uint64_t lo_unset[65];
+
+    //! Lookup table for least significant set bit in a byte.
+    static const uint8_t lt_lo[256];
+
+    //! Lookup table for select on bytes.
+    /*! Entry at idx = 256*j + i equals the position of the
+        (j+1)-th set bit in byte i. Positions lie in the range \f$[0..7]\f$.
+     */
+    static const uint8_t lt_sel[256*8];
+
+    //! Use to help to decide if a prefix sum stored in a byte overflows.
+    static const uint64_t ps_overflow[65];
+
+    //! Counts the number of set bits in x.
+    /*! \param  x 64-bit word
+        \return Number of set bits.
+     */
+    static uint64_t cnt(uint64_t x);
+
+    //! Counts the number of set bits in YMM register x.
+    /*! \param  YMM register
+        \return Number of set bits.
+     */
+#ifdef __AVX2__
+    static uint64_t cnt256(__m256i x);
+#endif
+
+    //! Counts the number of set bits in XMM register x.
+    /*! \param  XMM register
+        \return Number of set bits.
+     */ 
+#ifdef __SSE4_2__
+    static uint64_t cnt128(__m128i x);
+#endif
+
+    //! Position of the most significant set bit the 64-bit word x
+    /*! \param x 64-bit word
+        \return The position (in 0..63) of the least significant set bit
+                in `x` or 0 if x equals 0.
+    	\sa sel, lo
+    */
+    static uint32_t hi(uint64_t x);
+
+    //! Calculates the position of the rightmost 1-bit in the 64bit integer x if it exists
+    /*! \param x 64 bit integer.
+    	\return The position (in 0..63) of the rightmost 1-bit in the 64bit integer x if
+    	        x>0 and 0 if x equals 0.
+    	\sa sel, hi
+    */
+    static uint32_t lo(uint64_t x);
+
+    //! Counts the number of 1-bits in the 32bit integer x.
+    /*! This function is a variant of the method cnt. If
+    	32bit multiplication is fast, this method beats the cnt.
+    	for 32bit integers.
+    	\param x 64bit integer to count the bits.
+    	\return The number of 1-bits in x.
+     */
+    static uint32_t cnt32(uint32_t x);
+
+    //! Count the number of consecutive and distinct 11 in the 64bit integer x.
+    /*!
+      	\param x 64bit integer to count the terminating sequence 11 of a Fibonacci code.
+    	\param c Carry equals msb of the previous 64bit integer.
+     */
+    static uint32_t cnt11(uint64_t x, uint64_t& c);
+
+    //! Count the number of consecutive and distinct 11 in the 64bit integer x.
+    /*!
+      	\param x 64bit integer to count the terminating sequence 11 of a Fibonacci code.
+     */
+    static uint32_t cnt11(uint64_t x);
+
+    //! Count 10 bit pairs in the word x.
+    /*!
+     * \param x 64bit integer to count the 10 bit pairs.
+     * \param c Carry equals msb of the previous 64bit integer.
+     */
+    static uint32_t cnt10(uint64_t x, uint64_t& c);
+
+    //! Count 01 bit pairs in the word x.
+    /*!
+     * \param x 64bit integer to count the 01 bit pairs.
+     * \param c Carry equals msb of the previous 64bit integer.
+     */
+    static uint32_t cnt01(uint64_t x, uint64_t& c);
+
+    //! Map all 10 bit pairs to 01 or 1 if c=1 and the lsb=0. All other pairs are mapped to 00.
+    static uint64_t map10(uint64_t x, uint64_t c=0);
+
+    //! Map all 01 bit pairs to 01 of 1 if c=1 and the lsb=0. All other pairs are mapped to 00.
+    static uint64_t map01(uint64_t x, uint64_t c=1);
+
+    //! Calculate the position of the i-th rightmost 1 bit in the 64bit integer x
+    /*!
+      	\param x 64bit integer.
+    	\param i Argument i must be in the range \f$[1..cnt(x)]\f$.
+    	\pre Argument i must be in the range \f$[1..cnt(x)]\f$.
+      	\sa hi, lo
+     */
+    static uint32_t sel(uint64_t x, uint32_t i);
+    static uint32_t _sel(uint64_t x, uint32_t i);
+
+    //! Calculates the position of the i-th rightmost 11-bit-pattern which terminates a Fibonacci coded integer in x.
+    /*!	\param x 64 bit integer.
+        \param i Index of 11-bit-pattern. \f$i \in [1..cnt11(x)]\f$
+    	\param c Carry bit from word before
+     	\return The position (in 1..63) of the i-th 11-bit-pattern which terminates a Fibonacci coded integer in x if
+    	        x contains at least i 11-bit-patterns and a undefined value otherwise.
+        \sa cnt11, hi11, sel
+
+     */
+    static uint32_t sel11(uint64_t x, uint32_t i, uint32_t c=0);
+
+    //! Calculates the position of the leftmost 11-bit-pattern which terminates a Fibonacci coded integer in x.
+    /*! \param x 64 bit integer.
+        \return The position (in 1..63) of the leftmost 1 of the leftmost 11-bit-pattern which
+    	        terminates a Fibonacci coded integer in x if x contains a 11-bit-pattern
+    			and 0 otherwise.
+    	\sa cnt11, sel11
+    */
+    static uint32_t hi11(uint64_t x);
+
+    //! Writes value x to an bit position in an array.
+    static void write_int(uint64_t* word, uint64_t x, const uint8_t offset=0, const uint8_t len=64);
+
+    //! Writes value x to an bit position in an array and moves the bit-pointer.
+    static void write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len);
+
+    //! Reads a value from a bit position in an array.
+    static uint64_t read_int(const uint64_t* word, uint8_t offset=0, const uint8_t len=64);
+
+    //! Reads a value from a bit position in an array and moved the bit-pointer.
+    static uint64_t read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len=64);
+
+    //! Reads an unary decoded value from a bit position in an array.
+    static uint64_t read_unary(const uint64_t* word, uint8_t offset=0);
+
+    //! Reads an unary decoded value from a bit position in an array and moves the bit-pointer.
+    static uint64_t read_unary_and_move(const uint64_t*& word, uint8_t& offset);
+
+    //! Move the bit-pointer (=uint64_t word and offset) `len` to the right.
+    /*!\param word   64-bit word part of the bit pointer
+     * \param offset Offset part of the bit pointer
+     * \param len    Move distance. \f$ len \in [0..64] \f$
+     * \sa move_left
+     */
+    static void move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len);
+
+    //! Move the bit-pointer (=uint64_t word and offset) `len` to the left.
+    /*!\param word   64-bit word part of the bit pointer
+     * \param offset Offset part of the bit pointer
+     * \param len    Move distance. \f$ len \in [0..64] \f$
+     * \sa move_right
+     */
+    static void move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len);
+
+    //! Get the first one bit in the interval \f$[idx..\infty )\f$
+    static uint64_t next(const uint64_t* word, uint64_t idx);
+
+    //! Get the one bit with the greatest position in the interval \f$[0..idx]\f$
+    static uint64_t prev(const uint64_t* word, uint64_t idx);
+
+    //! reverses a given 64 bit word
+    static uint64_t rev(uint64_t x);
+};
+
+
+// ============= inline - implementations ================
+
+#ifdef __AVX2__
+inline uint64_t bits::cnt256(__m256i x){
+
+  // 4-bit universal table, 4-bit mask
+  static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F);
+  static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
+                                                                    1, 2, 2, 3, 2, 3, 3, 4, 
+                                                                    0, 1, 1, 2, 1, 2, 2, 3, 
+                                                                    1, 2, 2, 3, 2, 3, 3, 4);
+ 
+  __m256i low, high, bwcount;
+ 
+  // byte halves stored in separate YMM registers
+  low = _mm256_and_si256(MASK4_256, x);
+  high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4));
+ 
+  // bytewise population count
+  bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low),
+                          _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high));
+ 
+  // Computes sum of absolute differences and stores intermediate results at positions 0,4,8 and 12
+  __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256());
+ 
+  // Use union to access individual bytes (unsigned integers)
+  sdsl::YMM_union<uint8_t> ymm_union;
+  ymm_union.ymm = fourSums;
+  return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12];
+}
+#endif
+
+#ifdef __SSE4_2__
+inline uint64_t bits::cnt128(__m128i x){
+
+  // 4-bit universal table, 4-bit mask
+  static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+  static const __m128i MASK4 = _mm_set1_epi8(0x0F);
+ 
+  __m128i low, high, count;
+
+  low = _mm_and_si128(MASK4, x);
+  high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4));
+  count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low),
+                       _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high));
+  
+  // Use union to access individual bytes (unsigned integers)
+  sdsl::XMM_union<uint8_t> xmm_union;
+  xmm_union.xmm = _mm_sad_epu8(x, _mm_setzero_si128());
+  return xmm_union.values[0] + xmm_union.values[4];
+}
+#endif
+
+// see page 11, Knuth TAOCP Vol 4 F1A
+inline uint64_t bits::cnt(uint64_t x)
+{
+#ifdef __SSE4_2__
+    return __builtin_popcountll(x);
+#else
+#ifdef POPCOUNT_TL
+    return lt_cnt[x&0xFFULL] + lt_cnt[(x>>8)&0xFFULL] +
+           lt_cnt[(x>>16)&0xFFULL] + lt_cnt[(x>>24)&0xFFULL] +
+           lt_cnt[(x>>32)&0xFFULL] + lt_cnt[(x>>40)&0xFFULL] +
+           lt_cnt[(x>>48)&0xFFULL] + lt_cnt[(x>>56)&0xFFULL];
+#else
+    x = x-((x>>1) & 0x5555555555555555ull);
+    x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull);
+    x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full;
+    return (0x0101010101010101ull*x >> 56);
+#endif
+#endif
+}
+
+inline uint32_t bits::cnt32(uint32_t x)
+{
+    x = x-((x>>1) & 0x55555555);
+    x = (x & 0x33333333) + ((x>>2) & 0x33333333);
+    return (0x10101010*x >>28)+(0x01010101*x >>28);
+}
+
+
+inline uint32_t bits::cnt11(uint64_t x, uint64_t& c)
+{
+    // extract "11" 2bit blocks
+    uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL, t;
+    // extract "10" 2bit blocks
+    uint64_t ex10or01 = (ex11|(ex11<<1))^x;
+
+    x = ex11 | ((t=(ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c))&(ex10or01&0x5555555555555555ULL));
+    c = (ex10or01>>63) or(t < (ex11|(ex11<<1)));
+
+    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+    return (0x0101010101010101ULL*x >> 56);
+}
+
+inline uint32_t bits::cnt11(uint64_t x)
+{
+    // extract "11" 2bit blocks
+    uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
+    // extract "10" 2bit blocks
+    uint64_t ex10or01 = (ex11|(ex11<<1))^x;
+
+    x = ex11 | (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL))&(ex10or01&0x5555555555555555ULL));
+
+    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+    return (0x0101010101010101ULL*x >> 56);
+}
+
+inline uint32_t bits::cnt10(uint64_t x, uint64_t& c)
+{
+    uint32_t res = cnt((x ^((x<<1) | c)) & (~x));
+    c = (x >> 63);
+    return res;
+}
+
+inline uint64_t bits::map10(uint64_t x, uint64_t c)
+{
+    return ((x ^((x << 1) | c)) & (~x));
+}
+
+inline uint32_t bits::cnt01(uint64_t x, uint64_t& c)
+{
+    uint32_t res = cnt((x ^((x<<1) | c)) & x);
+    c = (x >> 63);
+    return res;
+}
+inline uint64_t bits::map01(uint64_t x, uint64_t c)
+{
+    return ((x ^((x << 1) | c)) &  x);
+}
+
+inline uint32_t bits::sel(uint64_t x, uint32_t i)
+{
+#ifdef __SSE4_2__
+    uint64_t s = x, b;
+    s = s-((s>>1) & 0x5555555555555555ULL);
+    s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL);
+    s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+    s = 0x0101010101010101ULL*s;
+// now s contains 8 bytes s[7],...,s[0]; s[j] contains the cumulative sum
+// of (j+1)*8 least significant bits of s
+    b = (s+ps_overflow[i]) & 0x8080808080808080ULL;
+// ps_overflow contains a bit mask x consisting of 8 bytes
+// x[7],...,x[0] and x[j] is set to 128-j
+// => a byte b[j] in b is >= 128 if cum sum >= j
+
+// __builtin_ctzll returns the number of trailing zeros, if b!=0
+    int  byte_nr = __builtin_ctzll(b) >> 3;   // byte nr in [0..7]
+    s <<= 8;
+    i -= (s >> (byte_nr<<3)) & 0xFFULL;
+    return (byte_nr << 3) + lt_sel[((i-1) << 8) + ((x>>(byte_nr<<3))&0xFFULL) ];
+#endif
+    return _sel(x, i);
+}
+
+inline uint32_t bits::_sel(uint64_t x, uint32_t i)
+{
+    uint64_t s = x, b;  // s = sum
+    s = s-((s>>1) & 0x5555555555555555ULL);
+    s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL);
+    s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+    s = 0x0101010101010101ULL*s;
+    b = (s+ps_overflow[i]);//&0x8080808080808080ULL;// add something to the partial sums to cause overflow
+    i = (i-1)<<8;
+    if (b&0x0000000080000000ULL) // byte <=3
+        if (b&0x0000000000008000ULL) //byte <= 1
+            if (b&0x0000000000000080ULL)
+                return    lt_sel[(x&0xFFULL) + i];
+            else
+                return 8 +lt_sel[(((x>>8)&0xFFULL)  + i - ((s&0xFFULL)<<8))&0x7FFULL];//byte 1;
+        else//byte >1
+            if (b&0x0000000000800000ULL) //byte <=2
+                return 16+lt_sel[(((x>>16)&0xFFULL) + i - (s&0xFF00ULL))&0x7FFULL];//byte 2;
+            else
+                return 24+lt_sel[(((x>>24)&0xFFULL) + i - ((s>>8)&0xFF00ULL))&0x7FFULL];//byte 3;
+    else//  byte > 3
+        if (b&0x0000800000000000ULL) // byte <=5
+            if (b&0x0000008000000000ULL) //byte <=4
+                return 32+lt_sel[(((x>>32)&0xFFULL) + i - ((s>>16)&0xFF00ULL))&0x7FFULL];//byte 4;
+            else
+                return 40+lt_sel[(((x>>40)&0xFFULL) + i - ((s>>24)&0xFF00ULL))&0x7FFULL];//byte 5;
+        else// byte >5
+            if (b&0x0080000000000000ULL) //byte<=6
+                return 48+lt_sel[(((x>>48)&0xFFULL) + i - ((s>>32)&0xFF00ULL))&0x7FFULL];//byte 6;
+            else
+                return 56+lt_sel[(((x>>56)&0xFFULL) + i - ((s>>40)&0xFF00ULL))&0x7FFULL];//byte 7;
+    return 0;
+}
+
+// using built-in method or
+// 64-bit version of 32-bit proposal of
+// http://www-graphics.stanford.edu/~seander/bithacks.html
+inline uint32_t bits::hi(uint64_t x)
+{
+#ifdef __SSE4_2__
+    if (x == 0)
+        return 0;
+    return 63 - __builtin_clzll(x);
+#else
+    uint64_t t,tt; // temporaries
+    if ((tt = x >> 32)) { // hi >= 32
+        if ((t = tt >> 16)) { // hi >= 48
+            return (tt = t >> 8) ? 56 + lt_hi[tt] : 48 + lt_hi[t];
+        } else { // hi < 48
+            return (t = tt >> 8) ? 40 + lt_hi[t] : 32 + lt_hi[tt];
+        }
+    } else { // hi < 32
+        if ((t = x >> 16)) { // hi >= 16
+            return (tt = t >> 8) ? 24 + lt_hi[tt] : 16 + lt_hi[t];
+        } else { // hi < 16
+            return (tt = x >> 8) ?  8 + lt_hi[tt] : lt_hi[x];
+        }
+    }
+#endif
+}
+
+// details see: http://citeseer.ist.psu.edu/leiserson98using.html
+// or page 10, Knuth TAOCP Vol 4 F1A
+inline uint32_t bits::lo(uint64_t x)
+{
+#ifdef __SSE4_2__
+    if (x==0)
+        return 0;
+    return __builtin_ctzll(x);
+#else
+    if (x&1) return 0;
+    if (x&3) return 1;
+    if (x&7) return 2;
+    if (x&0x7F) { // in average every second random number x can be answered this way
+        return lt_lo[(x&0x7F)>>3]+3;
+    }
+    // x&-x equals x with only the lsb set
+    return lt_deBruijn_to_idx[((x&-x)*deBruijn64)>>58];
+#endif
+}
+
+inline uint32_t bits::hi11(uint64_t x)
+{
+    // extract "11" 2bit blocks
+    uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
+    // extract "10" 2bit blocks
+    uint64_t ex10or01 = (ex11|(ex11<<1))^x;
+    // extract "10" 2bit blocks
+    ex11 += (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL)) & ((ex10or01&0x5555555555555555ULL)|ex11));
+    return hi(ex11);
+}
+
+
+inline uint32_t bits::sel11(uint64_t x, uint32_t i, uint32_t c)
+{
+    uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
+    uint64_t ex10or01 = (ex11|(ex11<<1))^x;
+    ex11 += (((ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c)) & ((ex10or01&0x5555555555555555ULL)|ex11));
+    return sel(ex11,i);
+}
+
+inline void bits::write_int(uint64_t* word, uint64_t x, uint8_t offset, const uint8_t len)
+{
+    x &= bits::lo_set[len];
+    if (offset + len < 64) {
+        *word &=
+            ((bits::all_set << (offset+len)) | bits::lo_set[offset]); // mask 1..10..01..1
+        *word |= (x << offset);
+//		*word ^= ((*word ^ x) & (bits::lo_set[len] << offset) );
+//      surprisingly the above line is slower than the lines above
+    } else {
+        *word &=
+            ((bits::lo_set[offset]));  // mask 0....01..1
+        *word |= (x << offset);
+        if ((offset = (offset+len)&0x3F)) { // offset+len > 64
+            *(word+1) &= (~bits::lo_set[offset]); // mask 1...10..0
+//			*(word+1) &= bits::lo_unset[offset]; // mask 1...10..0
+//          surprisingly the above line is slower than the line above
+            *(word+1) |= (x >> (len-offset));
+        }
+    }
+}
+
+inline void bits::write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len)
+{
+    x &= bits::lo_set[len];
+    if (offset + len < 64) {
+        *word &=
+            ((bits::all_set << (offset+len)) | bits::lo_set[offset]); // mask 1..10..01..1
+        *word |= (x << offset);
+        offset += len;
+    } else {
+        *word &=
+            ((bits::lo_set[offset]));  // mask 0....01..1
+        *word |= (x << offset);
+        if ((offset= (offset+len))>64) {// offset+len >= 64
+            offset &= 0x3F;
+            *(++word) &= (~bits::lo_set[offset]); // mask 1...10..0
+            *word |= (x >> (len-offset));
+        } else {
+            offset = 0;
+            ++word;
+        }
+    }
+}
+
+inline uint64_t bits::read_int(const uint64_t* word, uint8_t offset, const uint8_t len)
+{
+    uint64_t w1 = (*word)>>offset;
+    if ((offset+len) > 64) { // if offset+len > 64
+        return w1 |  // w1 or w2 adepted:
+               ((*(word+1) & bits::lo_set[(offset+len)&0x3F])   // set higher bits zero
+                << (64-offset));  // move bits to the left
+    } else {
+        return w1 & bits::lo_set[len];
+    }
+}
+
+inline uint64_t bits::read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len)
+{
+    uint64_t w1 = (*word)>>offset;
+    if ((offset = (offset+len))>=64) {  // if offset+len > 64
+        if (offset==64) {
+            offset &= 0x3F;
+            ++word;
+            return w1;
+        } else {
+            offset &= 0x3F;
+            return w1 |
+                   (((*(++word)) & bits::lo_set[offset]) << (len-offset));
+        }
+    } else {
+        return w1 & bits::lo_set[len];
+    }
+}
+
+inline uint64_t bits::read_unary(const uint64_t* word, uint8_t offset)
+{
+    uint64_t w = *word >> offset;
+    if (w) {
+        return bits::lo(w);
+    } else {
+        if (0!=(w=*(++word)))
+            return bits::lo(w)+64-offset;
+        uint64_t cnt=2;
+        while (0==(w=*(++word)))
+            ++cnt;
+        return bits::lo(w)+(cnt<<6)-offset;
+    }
+    return 0;
+}
+
+inline uint64_t bits::read_unary_and_move(const uint64_t*& word, uint8_t& offset)
+{
+    uint64_t w = (*word) >> offset; // temporary variable is good for the performance
+    if (w) {
+        uint8_t r = bits::lo(w);
+        offset = (offset + r+1)&0x3F;
+        // we know that offset + r +1 <= 64, so if the new offset equals 0 increase word
+        word += (offset==0);
+        return r;
+    } else {
+        uint8_t rr=0;
+        if (0!=(w=*(++word))) {
+            rr = bits::lo(w)+64-offset;
+            offset = (offset+rr+1)&0x3F;
+            word += (offset==0);
+            return rr;
+        } else {
+            uint64_t cnt_1=1;
+            while (0==(w=*(++word)))
+                ++cnt_1;
+            rr = bits::lo(w)+64-offset;
+            offset = (offset+rr+1)&0x3F;
+            word += (offset==0);
+            return ((cnt_1)<<6) + rr;
+        }
+    }
+    return 0;
+}
+
+inline void bits::move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len)
+{
+    if ((offset+=len)&0xC0) { // if offset >= 65
+        offset&=0x3F;
+        ++word;
+    }
+}
+
+inline void bits::move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len)
+{
+    if ((offset-=len)&0xC0) {  // if offset-len<0
+        offset&=0x3F;
+        --word;
+    }
+}
+
+inline uint64_t bits::next(const uint64_t* word, uint64_t idx)
+{
+    word += (idx>>6);
+    if (*word & ~lo_set[idx&0x3F]) {
+        return (idx & ~((size_t)0x3F)) + lo(*word & ~lo_set[idx&0x3F]);
+    }
+    idx = (idx & ~((size_t)0x3F)) + 64;
+    ++word;
+    while (*word==0) {
+        idx += 64;
+        ++word;
+    }
+    return idx + lo(*word);
+}
+
+inline uint64_t bits::prev(const uint64_t* word, uint64_t idx)
+{
+    word += (idx>>6);
+    if (*word & lo_set[(idx&0x3F)+1]) {
+        return (idx & ~((size_t)0x3F)) + hi(*word & lo_set[(idx&0x3F)+1]);
+    }
+    idx = (idx & ~((size_t)0x3F)) - 64;
+    --word;
+    while (*word==0) {
+        idx -= 64;
+        --word;
+    }
+    return idx + hi(*word);
+}
+
+inline uint64_t bits::rev(uint64_t x)
+{
+    x = ((x & 0x5555555555555555ULL) << 1) | ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1);
+    x = ((x & 0x3333333333333333ULL) << 2) | ((x & 0xCCCCCCCCCCCCCCCCULL) >> 2);
+    x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x & 0xF0F0F0F0F0F0F0F0ULL) >> 4);
+    x = ((x & 0x00FF00FF00FF00FFULL) << 8) | ((x & 0xFF00FF00FF00FF00ULL) >> 8);
+    x = ((x & 0x0000FFFF0000FFFFULL) <<16) | ((x & 0xFFFF0000FFFF0000ULL) >>16);
+    x = ((x & 0x00000000FFFFFFFFULL) <<32) | ((x & 0xFFFFFFFF00000000ULL) >>32);
+    return x;
+}
+
+} // end namespace sdsl
+
+#endif

From 91362e69b971d3fa218e94d72e15a2d86e7597c3 Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Mon, 1 Sep 2014 16:06:50 +0200
Subject: [PATCH 08/11] Little fix

---
 include/sdsl/bits.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
index 51027e427..88802f924 100644
--- a/include/sdsl/bits.hpp
+++ b/include/sdsl/bits.hpp
@@ -117,7 +117,7 @@ struct bits {
     /*! \param  XMM register
         \return Number of set bits.
      */ 
-#ifdef __AVX2__
+#ifdef __SSE4_2__
     static uint64_t cnt128(__m128i x);
 #endif
 

From 24ad2ce57f3e41219fee2b901c9baf7493c15200 Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Fri, 5 Sep 2014 16:05:02 +0200
Subject: [PATCH 09/11] bug in uint256_t fixed (wrong accessing pattern to
 array elements)

---
 build/include/sdsl/bits.hpp | 694 ------------------------------------
 include/sdsl/uint256_t.hpp  |  16 +-
 2 files changed, 8 insertions(+), 702 deletions(-)
 delete mode 100644 build/include/sdsl/bits.hpp

diff --git a/build/include/sdsl/bits.hpp b/build/include/sdsl/bits.hpp
deleted file mode 100644
index 88802f924..000000000
--- a/build/include/sdsl/bits.hpp
+++ /dev/null
@@ -1,694 +0,0 @@
-/* sdsl - succinct data structures library
-    Copyright (C) 2008 Simon Gog
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see http://www.gnu.org/licenses/ .
-*/
-/*! \file bits.hpp
-    \brief bits.hpp contains the sdsl::bits class.
-	\author Simon Gog
-*/
-#ifndef INCLUDED_SDSL_BITS
-#define INCLUDED_SDSL_BITS
-
-#include <stdint.h> // for uint64_t uint32_t declaration
-#include <iostream>// for cerr
-#include <cassert>
-#include <x86intrin.h> // SSE/AVX
-#include "ymm_union.hpp" // convenient YMM register wrapper
-#include "xmm_union.hpp" // convenient XMM register wrapper
-#ifdef __SSE4_2__
-#include <xmmintrin.h>
-#endif
-
-//! Namespace for the succinct data structure library.
-namespace sdsl
-{
-
-//! A helper class for bitwise tricks on 64 bit words.
-/*!
-	bits is a helper class for bitwise tricks and
-	techniques. For the basic tricks and techiques we refer to Donald E. Knuth's
-	"The Art of Computer Programming", Volume 4A, Chapter 7.1.3 and
-	the informative website of Sean E. Anderson about the topic:
-	http://www-graphics.stanford.edu/~seander/bithacks.html .
-
-	We have added new functions like: cnt11 and sel11.
-
-	All members of this class are static variables or methods.
-	This class cannot be instantiated.
-
-	\author Simon Gog
- */
-struct bits {
-    bits() = delete;
-    //! 64bit mask with all bits set to 1.
-    constexpr static int64_t  all_set {-1LL};
-
-    //! This constant represents a de Bruijn sequence B(k,n) for k=2 and n=6.
-    /*! Details for de Bruijn sequences see
-       http://en.wikipedia.org/wiki/De_bruijn_sequence
-       deBruijn64 is used in combination with the
-       array lt_deBruijn_to_idx.
-    */
-    constexpr static uint64_t deBruijn64 {0x0218A392CD3D5DBFULL};
-
-    //! This table maps a 6-bit subsequence S[idx...idx+5] of constant deBruijn64 to idx.
-    /*! \sa deBruijn64
-    */
-    static const uint32_t lt_deBruijn_to_idx[64];
-
-    //! Array containing Fibonacci numbers less than \f$2^64\f$.
-    static const uint64_t lt_fib[92];
-
-    //! Lookup table for byte popcounts.
-    static const uint8_t lt_cnt[256];
-
-    //! Lookup table for most significant set bit in a byte.
-    static const uint32_t lt_hi[256];
-
-    //! lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
-    /*! lo_set[0] = 0ULL, lo_set[1]=1ULL, lo_set[2]=3ULL...
-     */
-    static const uint64_t lo_set[65];
-
-    //! lo_unset[i] is a 64-bit word with the i least significant bits not set and the high bits set.
-    /*! lo_unset[0] = FFFFFFFFFFFFFFFFULL, lo_unset_set[1]=FFFFFFFFFFFFFFFEULL, ...
-     */
-    static const uint64_t lo_unset[65];
-
-    //! Lookup table for least significant set bit in a byte.
-    static const uint8_t lt_lo[256];
-
-    //! Lookup table for select on bytes.
-    /*! Entry at idx = 256*j + i equals the position of the
-        (j+1)-th set bit in byte i. Positions lie in the range \f$[0..7]\f$.
-     */
-    static const uint8_t lt_sel[256*8];
-
-    //! Use to help to decide if a prefix sum stored in a byte overflows.
-    static const uint64_t ps_overflow[65];
-
-    //! Counts the number of set bits in x.
-    /*! \param  x 64-bit word
-        \return Number of set bits.
-     */
-    static uint64_t cnt(uint64_t x);
-
-    //! Counts the number of set bits in YMM register x.
-    /*! \param  YMM register
-        \return Number of set bits.
-     */
-#ifdef __AVX2__
-    static uint64_t cnt256(__m256i x);
-#endif
-
-    //! Counts the number of set bits in XMM register x.
-    /*! \param  XMM register
-        \return Number of set bits.
-     */ 
-#ifdef __SSE4_2__
-    static uint64_t cnt128(__m128i x);
-#endif
-
-    //! Position of the most significant set bit the 64-bit word x
-    /*! \param x 64-bit word
-        \return The position (in 0..63) of the least significant set bit
-                in `x` or 0 if x equals 0.
-    	\sa sel, lo
-    */
-    static uint32_t hi(uint64_t x);
-
-    //! Calculates the position of the rightmost 1-bit in the 64bit integer x if it exists
-    /*! \param x 64 bit integer.
-    	\return The position (in 0..63) of the rightmost 1-bit in the 64bit integer x if
-    	        x>0 and 0 if x equals 0.
-    	\sa sel, hi
-    */
-    static uint32_t lo(uint64_t x);
-
-    //! Counts the number of 1-bits in the 32bit integer x.
-    /*! This function is a variant of the method cnt. If
-    	32bit multiplication is fast, this method beats the cnt.
-    	for 32bit integers.
-    	\param x 64bit integer to count the bits.
-    	\return The number of 1-bits in x.
-     */
-    static uint32_t cnt32(uint32_t x);
-
-    //! Count the number of consecutive and distinct 11 in the 64bit integer x.
-    /*!
-      	\param x 64bit integer to count the terminating sequence 11 of a Fibonacci code.
-    	\param c Carry equals msb of the previous 64bit integer.
-     */
-    static uint32_t cnt11(uint64_t x, uint64_t& c);
-
-    //! Count the number of consecutive and distinct 11 in the 64bit integer x.
-    /*!
-      	\param x 64bit integer to count the terminating sequence 11 of a Fibonacci code.
-     */
-    static uint32_t cnt11(uint64_t x);
-
-    //! Count 10 bit pairs in the word x.
-    /*!
-     * \param x 64bit integer to count the 10 bit pairs.
-     * \param c Carry equals msb of the previous 64bit integer.
-     */
-    static uint32_t cnt10(uint64_t x, uint64_t& c);
-
-    //! Count 01 bit pairs in the word x.
-    /*!
-     * \param x 64bit integer to count the 01 bit pairs.
-     * \param c Carry equals msb of the previous 64bit integer.
-     */
-    static uint32_t cnt01(uint64_t x, uint64_t& c);
-
-    //! Map all 10 bit pairs to 01 or 1 if c=1 and the lsb=0. All other pairs are mapped to 00.
-    static uint64_t map10(uint64_t x, uint64_t c=0);
-
-    //! Map all 01 bit pairs to 01 of 1 if c=1 and the lsb=0. All other pairs are mapped to 00.
-    static uint64_t map01(uint64_t x, uint64_t c=1);
-
-    //! Calculate the position of the i-th rightmost 1 bit in the 64bit integer x
-    /*!
-      	\param x 64bit integer.
-    	\param i Argument i must be in the range \f$[1..cnt(x)]\f$.
-    	\pre Argument i must be in the range \f$[1..cnt(x)]\f$.
-      	\sa hi, lo
-     */
-    static uint32_t sel(uint64_t x, uint32_t i);
-    static uint32_t _sel(uint64_t x, uint32_t i);
-
-    //! Calculates the position of the i-th rightmost 11-bit-pattern which terminates a Fibonacci coded integer in x.
-    /*!	\param x 64 bit integer.
-        \param i Index of 11-bit-pattern. \f$i \in [1..cnt11(x)]\f$
-    	\param c Carry bit from word before
-     	\return The position (in 1..63) of the i-th 11-bit-pattern which terminates a Fibonacci coded integer in x if
-    	        x contains at least i 11-bit-patterns and a undefined value otherwise.
-        \sa cnt11, hi11, sel
-
-     */
-    static uint32_t sel11(uint64_t x, uint32_t i, uint32_t c=0);
-
-    //! Calculates the position of the leftmost 11-bit-pattern which terminates a Fibonacci coded integer in x.
-    /*! \param x 64 bit integer.
-        \return The position (in 1..63) of the leftmost 1 of the leftmost 11-bit-pattern which
-    	        terminates a Fibonacci coded integer in x if x contains a 11-bit-pattern
-    			and 0 otherwise.
-    	\sa cnt11, sel11
-    */
-    static uint32_t hi11(uint64_t x);
-
-    //! Writes value x to an bit position in an array.
-    static void write_int(uint64_t* word, uint64_t x, const uint8_t offset=0, const uint8_t len=64);
-
-    //! Writes value x to an bit position in an array and moves the bit-pointer.
-    static void write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len);
-
-    //! Reads a value from a bit position in an array.
-    static uint64_t read_int(const uint64_t* word, uint8_t offset=0, const uint8_t len=64);
-
-    //! Reads a value from a bit position in an array and moved the bit-pointer.
-    static uint64_t read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len=64);
-
-    //! Reads an unary decoded value from a bit position in an array.
-    static uint64_t read_unary(const uint64_t* word, uint8_t offset=0);
-
-    //! Reads an unary decoded value from a bit position in an array and moves the bit-pointer.
-    static uint64_t read_unary_and_move(const uint64_t*& word, uint8_t& offset);
-
-    //! Move the bit-pointer (=uint64_t word and offset) `len` to the right.
-    /*!\param word   64-bit word part of the bit pointer
-     * \param offset Offset part of the bit pointer
-     * \param len    Move distance. \f$ len \in [0..64] \f$
-     * \sa move_left
-     */
-    static void move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len);
-
-    //! Move the bit-pointer (=uint64_t word and offset) `len` to the left.
-    /*!\param word   64-bit word part of the bit pointer
-     * \param offset Offset part of the bit pointer
-     * \param len    Move distance. \f$ len \in [0..64] \f$
-     * \sa move_right
-     */
-    static void move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len);
-
-    //! Get the first one bit in the interval \f$[idx..\infty )\f$
-    static uint64_t next(const uint64_t* word, uint64_t idx);
-
-    //! Get the one bit with the greatest position in the interval \f$[0..idx]\f$
-    static uint64_t prev(const uint64_t* word, uint64_t idx);
-
-    //! reverses a given 64 bit word
-    static uint64_t rev(uint64_t x);
-};
-
-
-// ============= inline - implementations ================
-
-#ifdef __AVX2__
-inline uint64_t bits::cnt256(__m256i x){
-
-  // 4-bit universal table, 4-bit mask
-  static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F);
-  static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
-                                                                    1, 2, 2, 3, 2, 3, 3, 4, 
-                                                                    0, 1, 1, 2, 1, 2, 2, 3, 
-                                                                    1, 2, 2, 3, 2, 3, 3, 4);
- 
-  __m256i low, high, bwcount;
- 
-  // byte halves stored in separate YMM registers
-  low = _mm256_and_si256(MASK4_256, x);
-  high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4));
- 
-  // bytewise population count
-  bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low),
-                          _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high));
- 
-  // Computes sum of absolute differences and stores intermediate results at positions 0,4,8 and 12
-  __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256());
- 
-  // Use union to access individual bytes (unsigned integers)
-  sdsl::YMM_union<uint8_t> ymm_union;
-  ymm_union.ymm = fourSums;
-  return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12];
-}
-#endif
-
-#ifdef __SSE4_2__
-inline uint64_t bits::cnt128(__m128i x){
-
-  // 4-bit universal table, 4-bit mask
-  static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
-  static const __m128i MASK4 = _mm_set1_epi8(0x0F);
- 
-  __m128i low, high, count;
-
-  low = _mm_and_si128(MASK4, x);
-  high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4));
-  count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low),
-                       _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high));
-  
-  // Use union to access individual bytes (unsigned integers)
-  sdsl::XMM_union<uint8_t> xmm_union;
-  xmm_union.xmm = _mm_sad_epu8(x, _mm_setzero_si128());
-  return xmm_union.values[0] + xmm_union.values[4];
-}
-#endif
-
-// see page 11, Knuth TAOCP Vol 4 F1A
-inline uint64_t bits::cnt(uint64_t x)
-{
-#ifdef __SSE4_2__
-    return __builtin_popcountll(x);
-#else
-#ifdef POPCOUNT_TL
-    return lt_cnt[x&0xFFULL] + lt_cnt[(x>>8)&0xFFULL] +
-           lt_cnt[(x>>16)&0xFFULL] + lt_cnt[(x>>24)&0xFFULL] +
-           lt_cnt[(x>>32)&0xFFULL] + lt_cnt[(x>>40)&0xFFULL] +
-           lt_cnt[(x>>48)&0xFFULL] + lt_cnt[(x>>56)&0xFFULL];
-#else
-    x = x-((x>>1) & 0x5555555555555555ull);
-    x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull);
-    x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full;
-    return (0x0101010101010101ull*x >> 56);
-#endif
-#endif
-}
-
-inline uint32_t bits::cnt32(uint32_t x)
-{
-    x = x-((x>>1) & 0x55555555);
-    x = (x & 0x33333333) + ((x>>2) & 0x33333333);
-    return (0x10101010*x >>28)+(0x01010101*x >>28);
-}
-
-
-inline uint32_t bits::cnt11(uint64_t x, uint64_t& c)
-{
-    // extract "11" 2bit blocks
-    uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL, t;
-    // extract "10" 2bit blocks
-    uint64_t ex10or01 = (ex11|(ex11<<1))^x;
-
-    x = ex11 | ((t=(ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c))&(ex10or01&0x5555555555555555ULL));
-    c = (ex10or01>>63) or(t < (ex11|(ex11<<1)));
-
-    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
-    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
-    return (0x0101010101010101ULL*x >> 56);
-}
-
-inline uint32_t bits::cnt11(uint64_t x)
-{
-    // extract "11" 2bit blocks
-    uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
-    // extract "10" 2bit blocks
-    uint64_t ex10or01 = (ex11|(ex11<<1))^x;
-
-    x = ex11 | (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL))&(ex10or01&0x5555555555555555ULL));
-
-    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
-    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
-    return (0x0101010101010101ULL*x >> 56);
-}
-
-inline uint32_t bits::cnt10(uint64_t x, uint64_t& c)
-{
-    uint32_t res = cnt((x ^((x<<1) | c)) & (~x));
-    c = (x >> 63);
-    return res;
-}
-
-inline uint64_t bits::map10(uint64_t x, uint64_t c)
-{
-    return ((x ^((x << 1) | c)) & (~x));
-}
-
-inline uint32_t bits::cnt01(uint64_t x, uint64_t& c)
-{
-    uint32_t res = cnt((x ^((x<<1) | c)) & x);
-    c = (x >> 63);
-    return res;
-}
-inline uint64_t bits::map01(uint64_t x, uint64_t c)
-{
-    return ((x ^((x << 1) | c)) &  x);
-}
-
-inline uint32_t bits::sel(uint64_t x, uint32_t i)
-{
-#ifdef __SSE4_2__
-    uint64_t s = x, b;
-    s = s-((s>>1) & 0x5555555555555555ULL);
-    s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL);
-    s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
-    s = 0x0101010101010101ULL*s;
-// now s contains 8 bytes s[7],...,s[0]; s[j] contains the cumulative sum
-// of (j+1)*8 least significant bits of s
-    b = (s+ps_overflow[i]) & 0x8080808080808080ULL;
-// ps_overflow contains a bit mask x consisting of 8 bytes
-// x[7],...,x[0] and x[j] is set to 128-j
-// => a byte b[j] in b is >= 128 if cum sum >= j
-
-// __builtin_ctzll returns the number of trailing zeros, if b!=0
-    int  byte_nr = __builtin_ctzll(b) >> 3;   // byte nr in [0..7]
-    s <<= 8;
-    i -= (s >> (byte_nr<<3)) & 0xFFULL;
-    return (byte_nr << 3) + lt_sel[((i-1) << 8) + ((x>>(byte_nr<<3))&0xFFULL) ];
-#endif
-    return _sel(x, i);
-}
-
-inline uint32_t bits::_sel(uint64_t x, uint32_t i)
-{
-    uint64_t s = x, b;  // s = sum
-    s = s-((s>>1) & 0x5555555555555555ULL);
-    s = (s & 0x3333333333333333ULL) + ((s >> 2) & 0x3333333333333333ULL);
-    s = (s + (s >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
-    s = 0x0101010101010101ULL*s;
-    b = (s+ps_overflow[i]);//&0x8080808080808080ULL;// add something to the partial sums to cause overflow
-    i = (i-1)<<8;
-    if (b&0x0000000080000000ULL) // byte <=3
-        if (b&0x0000000000008000ULL) //byte <= 1
-            if (b&0x0000000000000080ULL)
-                return    lt_sel[(x&0xFFULL) + i];
-            else
-                return 8 +lt_sel[(((x>>8)&0xFFULL)  + i - ((s&0xFFULL)<<8))&0x7FFULL];//byte 1;
-        else//byte >1
-            if (b&0x0000000000800000ULL) //byte <=2
-                return 16+lt_sel[(((x>>16)&0xFFULL) + i - (s&0xFF00ULL))&0x7FFULL];//byte 2;
-            else
-                return 24+lt_sel[(((x>>24)&0xFFULL) + i - ((s>>8)&0xFF00ULL))&0x7FFULL];//byte 3;
-    else//  byte > 3
-        if (b&0x0000800000000000ULL) // byte <=5
-            if (b&0x0000008000000000ULL) //byte <=4
-                return 32+lt_sel[(((x>>32)&0xFFULL) + i - ((s>>16)&0xFF00ULL))&0x7FFULL];//byte 4;
-            else
-                return 40+lt_sel[(((x>>40)&0xFFULL) + i - ((s>>24)&0xFF00ULL))&0x7FFULL];//byte 5;
-        else// byte >5
-            if (b&0x0080000000000000ULL) //byte<=6
-                return 48+lt_sel[(((x>>48)&0xFFULL) + i - ((s>>32)&0xFF00ULL))&0x7FFULL];//byte 6;
-            else
-                return 56+lt_sel[(((x>>56)&0xFFULL) + i - ((s>>40)&0xFF00ULL))&0x7FFULL];//byte 7;
-    return 0;
-}
-
-// using built-in method or
-// 64-bit version of 32-bit proposal of
-// http://www-graphics.stanford.edu/~seander/bithacks.html
-inline uint32_t bits::hi(uint64_t x)
-{
-#ifdef __SSE4_2__
-    if (x == 0)
-        return 0;
-    return 63 - __builtin_clzll(x);
-#else
-    uint64_t t,tt; // temporaries
-    if ((tt = x >> 32)) { // hi >= 32
-        if ((t = tt >> 16)) { // hi >= 48
-            return (tt = t >> 8) ? 56 + lt_hi[tt] : 48 + lt_hi[t];
-        } else { // hi < 48
-            return (t = tt >> 8) ? 40 + lt_hi[t] : 32 + lt_hi[tt];
-        }
-    } else { // hi < 32
-        if ((t = x >> 16)) { // hi >= 16
-            return (tt = t >> 8) ? 24 + lt_hi[tt] : 16 + lt_hi[t];
-        } else { // hi < 16
-            return (tt = x >> 8) ?  8 + lt_hi[tt] : lt_hi[x];
-        }
-    }
-#endif
-}
-
-// details see: http://citeseer.ist.psu.edu/leiserson98using.html
-// or page 10, Knuth TAOCP Vol 4 F1A
-inline uint32_t bits::lo(uint64_t x)
-{
-#ifdef __SSE4_2__
-    if (x==0)
-        return 0;
-    return __builtin_ctzll(x);
-#else
-    if (x&1) return 0;
-    if (x&3) return 1;
-    if (x&7) return 2;
-    if (x&0x7F) { // in average every second random number x can be answered this way
-        return lt_lo[(x&0x7F)>>3]+3;
-    }
-    // x&-x equals x with only the lsb set
-    return lt_deBruijn_to_idx[((x&-x)*deBruijn64)>>58];
-#endif
-}
-
-inline uint32_t bits::hi11(uint64_t x)
-{
-    // extract "11" 2bit blocks
-    uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
-    // extract "10" 2bit blocks
-    uint64_t ex10or01 = (ex11|(ex11<<1))^x;
-    // extract "10" 2bit blocks
-    ex11 += (((ex11|(ex11<<1))+((ex10or01<<1)&0x5555555555555555ULL)) & ((ex10or01&0x5555555555555555ULL)|ex11));
-    return hi(ex11);
-}
-
-
-inline uint32_t bits::sel11(uint64_t x, uint32_t i, uint32_t c)
-{
-    uint64_t ex11 = (x&(x>>1))&0x5555555555555555ULL;
-    uint64_t ex10or01 = (ex11|(ex11<<1))^x;
-    ex11 += (((ex11|(ex11<<1))+(((ex10or01<<1)&0x5555555555555555ULL)|c)) & ((ex10or01&0x5555555555555555ULL)|ex11));
-    return sel(ex11,i);
-}
-
-inline void bits::write_int(uint64_t* word, uint64_t x, uint8_t offset, const uint8_t len)
-{
-    x &= bits::lo_set[len];
-    if (offset + len < 64) {
-        *word &=
-            ((bits::all_set << (offset+len)) | bits::lo_set[offset]); // mask 1..10..01..1
-        *word |= (x << offset);
-//		*word ^= ((*word ^ x) & (bits::lo_set[len] << offset) );
-//      surprisingly the above line is slower than the lines above
-    } else {
-        *word &=
-            ((bits::lo_set[offset]));  // mask 0....01..1
-        *word |= (x << offset);
-        if ((offset = (offset+len)&0x3F)) { // offset+len > 64
-            *(word+1) &= (~bits::lo_set[offset]); // mask 1...10..0
-//			*(word+1) &= bits::lo_unset[offset]; // mask 1...10..0
-//          surprisingly the above line is slower than the line above
-            *(word+1) |= (x >> (len-offset));
-        }
-    }
-}
-
-inline void bits::write_int_and_move(uint64_t*& word, uint64_t x, uint8_t& offset, const uint8_t len)
-{
-    x &= bits::lo_set[len];
-    if (offset + len < 64) {
-        *word &=
-            ((bits::all_set << (offset+len)) | bits::lo_set[offset]); // mask 1..10..01..1
-        *word |= (x << offset);
-        offset += len;
-    } else {
-        *word &=
-            ((bits::lo_set[offset]));  // mask 0....01..1
-        *word |= (x << offset);
-        if ((offset= (offset+len))>64) {// offset+len >= 64
-            offset &= 0x3F;
-            *(++word) &= (~bits::lo_set[offset]); // mask 1...10..0
-            *word |= (x >> (len-offset));
-        } else {
-            offset = 0;
-            ++word;
-        }
-    }
-}
-
-inline uint64_t bits::read_int(const uint64_t* word, uint8_t offset, const uint8_t len)
-{
-    uint64_t w1 = (*word)>>offset;
-    if ((offset+len) > 64) { // if offset+len > 64
-        return w1 |  // w1 or w2 adepted:
-               ((*(word+1) & bits::lo_set[(offset+len)&0x3F])   // set higher bits zero
-                << (64-offset));  // move bits to the left
-    } else {
-        return w1 & bits::lo_set[len];
-    }
-}
-
-inline uint64_t bits::read_int_and_move(const uint64_t*& word, uint8_t& offset, const uint8_t len)
-{
-    uint64_t w1 = (*word)>>offset;
-    if ((offset = (offset+len))>=64) {  // if offset+len > 64
-        if (offset==64) {
-            offset &= 0x3F;
-            ++word;
-            return w1;
-        } else {
-            offset &= 0x3F;
-            return w1 |
-                   (((*(++word)) & bits::lo_set[offset]) << (len-offset));
-        }
-    } else {
-        return w1 & bits::lo_set[len];
-    }
-}
-
-inline uint64_t bits::read_unary(const uint64_t* word, uint8_t offset)
-{
-    uint64_t w = *word >> offset;
-    if (w) {
-        return bits::lo(w);
-    } else {
-        if (0!=(w=*(++word)))
-            return bits::lo(w)+64-offset;
-        uint64_t cnt=2;
-        while (0==(w=*(++word)))
-            ++cnt;
-        return bits::lo(w)+(cnt<<6)-offset;
-    }
-    return 0;
-}
-
-inline uint64_t bits::read_unary_and_move(const uint64_t*& word, uint8_t& offset)
-{
-    uint64_t w = (*word) >> offset; // temporary variable is good for the performance
-    if (w) {
-        uint8_t r = bits::lo(w);
-        offset = (offset + r+1)&0x3F;
-        // we know that offset + r +1 <= 64, so if the new offset equals 0 increase word
-        word += (offset==0);
-        return r;
-    } else {
-        uint8_t rr=0;
-        if (0!=(w=*(++word))) {
-            rr = bits::lo(w)+64-offset;
-            offset = (offset+rr+1)&0x3F;
-            word += (offset==0);
-            return rr;
-        } else {
-            uint64_t cnt_1=1;
-            while (0==(w=*(++word)))
-                ++cnt_1;
-            rr = bits::lo(w)+64-offset;
-            offset = (offset+rr+1)&0x3F;
-            word += (offset==0);
-            return ((cnt_1)<<6) + rr;
-        }
-    }
-    return 0;
-}
-
-inline void bits::move_right(const uint64_t*& word, uint8_t& offset, const uint8_t len)
-{
-    if ((offset+=len)&0xC0) { // if offset >= 65
-        offset&=0x3F;
-        ++word;
-    }
-}
-
-inline void bits::move_left(const uint64_t*& word, uint8_t& offset, const uint8_t len)
-{
-    if ((offset-=len)&0xC0) {  // if offset-len<0
-        offset&=0x3F;
-        --word;
-    }
-}
-
-inline uint64_t bits::next(const uint64_t* word, uint64_t idx)
-{
-    word += (idx>>6);
-    if (*word & ~lo_set[idx&0x3F]) {
-        return (idx & ~((size_t)0x3F)) + lo(*word & ~lo_set[idx&0x3F]);
-    }
-    idx = (idx & ~((size_t)0x3F)) + 64;
-    ++word;
-    while (*word==0) {
-        idx += 64;
-        ++word;
-    }
-    return idx + lo(*word);
-}
-
-inline uint64_t bits::prev(const uint64_t* word, uint64_t idx)
-{
-    word += (idx>>6);
-    if (*word & lo_set[(idx&0x3F)+1]) {
-        return (idx & ~((size_t)0x3F)) + hi(*word & lo_set[(idx&0x3F)+1]);
-    }
-    idx = (idx & ~((size_t)0x3F)) - 64;
-    --word;
-    while (*word==0) {
-        idx -= 64;
-        --word;
-    }
-    return idx + hi(*word);
-}
-
-inline uint64_t bits::rev(uint64_t x)
-{
-    x = ((x & 0x5555555555555555ULL) << 1) | ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1);
-    x = ((x & 0x3333333333333333ULL) << 2) | ((x & 0xCCCCCCCCCCCCCCCCULL) >> 2);
-    x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x & 0xF0F0F0F0F0F0F0F0ULL) >> 4);
-    x = ((x & 0x00FF00FF00FF00FFULL) << 8) | ((x & 0xFF00FF00FF00FF00ULL) >> 8);
-    x = ((x & 0x0000FFFF0000FFFFULL) <<16) | ((x & 0xFFFF0000FFFF0000ULL) >>16);
-    x = ((x & 0x00000000FFFFFFFFULL) <<32) | ((x & 0xFFFFFFFF00000000ULL) >>32);
-    return x;
-}
-
-} // end namespace sdsl
-
-#endif
diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp
index 200e6f587..c522da121 100644
--- a/include/sdsl/uint256_t.hpp
+++ b/include/sdsl/uint256_t.hpp
@@ -64,20 +64,20 @@ class uint256_t
         inline uint16_t popcount() {
 #ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle
           sdsl::YMM_union<uint64_t> ymm_union;
-          ymm_union[0] = m_lo;
-          ymm_union[1] = m_mid;
-          ymm_union[2] = m_high >> 64;
-          ymm_union[3] = m_high;
+          ymm_union.values[0] = m_lo;
+          ymm_union.values[1] = m_mid;
+          ymm_union.values[2] = m_high >> 64;
+          ymm_union.values[3] = m_high;
           return bits::cnt256(ymm_union.ymm);
 #endif
           
 #ifdef __SSE4_2__ // 16 table lookups per clock cycle
           sdsl::XMM_union<uint64_t> xmm_union1;
           sdsl::XMM_union<uint64_t> xmm_union2;
-          xmm_union1[0] = m_lo;
-          xmm_union1[1] = m_mid;
-          xmm_union2[0] = m_high >> 64;
-          xmm_union2[1] = m_high;
+          xmm_union1.values[0] = m_lo;
+          xmm_union1.values[1] = m_mid;
+          xmm_union2.values[0] = m_high >> 64;
+          xmm_union2.values[1] = m_high;
 
           return bits::cnt128(xmm_union1.xmm) + bits::cnt128(xmm_union2.xmm);
           

From db534bb99b4ff67ab80a9df802c9b7cdf022d41a Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Fri, 5 Sep 2014 18:53:17 +0200
Subject: [PATCH 10/11] values instead of ymm to access data in ymm_union

---
 include/sdsl/bits.hpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
index 88802f924..306d32ee9 100644
--- a/include/sdsl/bits.hpp
+++ b/include/sdsl/bits.hpp
@@ -276,13 +276,10 @@ inline uint64_t bits::cnt256(__m256i x){
   bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low),
                           _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high));
  
-  // Computes sum of absolute differences and stores intermediate results at positions 0,4,8 and 12
-  __m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256());
- 
   // Use union to access individual bytes (unsigned integers)
   sdsl::YMM_union<uint8_t> ymm_union;
-  ymm_union.ymm = fourSums;
-  return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12];
+  ymm_union.ymm = _mm256_sad_epu8(bwcount, _mm256_setzero_si256());
+  return ymm_union.values[0] + ymm_union.values[4] + ymm_union.values[8] + ymm_union.values[12];
 }
 #endif
 

From 7fe66dcb1d63c2ad820292dda92cd2b9a4fa1ff0 Mon Sep 17 00:00:00 2001
From: Diego Havenstein <Havenstein@protonmail.ch>
Date: Fri, 5 Sep 2014 19:00:22 +0200
Subject: [PATCH 11/11] count variable being used where it needs to be used now

---
 include/sdsl/bits.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
index 306d32ee9..4f7436020 100644
--- a/include/sdsl/bits.hpp
+++ b/include/sdsl/bits.hpp
@@ -299,7 +299,7 @@ inline uint64_t bits::cnt128(__m128i x){
   
   // Use union to access individual bytes (unsigned integers)
   sdsl::XMM_union<uint8_t> xmm_union;
-  xmm_union.xmm = _mm_sad_epu8(x, _mm_setzero_si128());
+  xmm_union.xmm = _mm_sad_epu8(count, _mm_setzero_si128());
   return xmm_union.values[0] + xmm_union.values[4];
 }
 #endif