diff --git a/Cargo.lock b/Cargo.lock index a2f2c69..cb421e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,7 +29,6 @@ version = "0.1.0" dependencies = [ "byteorder", "googletest", - "mur3", ] [[package]] @@ -61,12 +60,6 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" -[[package]] -name = "mur3" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b" - [[package]] name = "num-traits" version = "0.2.19" diff --git a/Cargo.toml b/Cargo.toml index 90961f0..46b91f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,6 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] byteorder = { version = "1.5.0" } -mur3 = { version = "0.1.0" } [dev-dependencies] googletest = { version = "0.14.2" } diff --git a/src/hash/mod.rs b/src/hash/mod.rs new file mode 100644 index 0000000..7059dbb --- /dev/null +++ b/src/hash/mod.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod murmurhash; + +pub use murmurhash::MurmurHash3X64128; diff --git a/src/hash/murmurhash.rs b/src/hash/murmurhash.rs new file mode 100644 index 0000000..b6d5106 --- /dev/null +++ b/src/hash/murmurhash.rs @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use byteorder::{ByteOrder, LE}; +use std::hash::Hasher; + +const DEFAULT_SEED: u64 = 9001; +const C1: u64 = 0x87c37b91114253d5; +const C2: u64 = 0x4cf5ad432745937f; + +/// The MurmurHash3 is a fast, non-cryptographic, 128-bit hash function that has +/// excellent avalanche and 2-way bit independence properties. +#[derive(Debug)] +pub struct MurmurHash3X64128 { + h1: u64, + h2: u64, + total: u64, + buf: [u8; 16], + buf_len: usize, +} + +impl MurmurHash3X64128 { + pub fn with_seed(seed: u64) -> Self { + MurmurHash3X64128 { + h1: seed, + h2: seed, + total: 0, + buf: [0; 16], + buf_len: 0, + } + } + + pub fn finish128(&self) -> (u64, u64) { + let mut h1 = self.h1; + let mut h2 = self.h2; + + let total = self.total + self.buf_len as u64; + let rem = self.buf_len; + + // tail + if rem > 0 { + if rem > 8 { + // read k2 little endian + let mut buf = [0u8; 8]; + let k2_len = rem - 8; + buf[..k2_len].copy_from_slice(&self.buf[8..rem]); + // mix k2 + let mut k2 = u64::from_le_bytes(buf); + k2 = k2.wrapping_mul(C2); + k2 = k2.rotate_left(33); + k2 = k2.wrapping_mul(C1); + h2 ^= k2; + } + + // read k1 little endian + let mut buf = [0u8; 8]; + let k1_len = rem.min(8); + buf[..k1_len].copy_from_slice(&self.buf[..k1_len]); + // mix k1 + let mut k1 = u64::from_le_bytes(buf); + k1 = k1.wrapping_mul(C1); + k1 = k1.rotate_left(31); + k1 = k1.wrapping_mul(C2); + h1 ^= k1; + } + + h1 ^= total; + h2 ^= total; + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_add(h1); + h1 = fmix64(h1); + h2 = fmix64(h2); + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_add(h1); + (h1, h2) + } + + #[inline] + fn update(&mut self, mut k1: u64, mut k2: u64) { + // k1 *= c1; k1 = MURMUR3_ROTL64(k1, 31); k1 *= c2; out.h1 ^= k1; + k1 = k1.wrapping_mul(C1); + k1 = k1.rotate_left(31); + k1 = k1.wrapping_mul(C2); + self.h1 ^= k1; + + // out.h1 = MURMUR3_ROTL64(out.h1, 27); out.h1 += out.h2; out.h1 = out.h1*5+0x52dce729; + self.h1 = self.h1.rotate_left(27); + self.h1 = self.h1.wrapping_add(self.h2); + self.h1 = self.h1.wrapping_mul(5).wrapping_add(0x52dce729); + + // k2 *= c2; k2 = MURMUR3_ROTL64(k2, 33); k2 *= c1; out.h2 ^= k2; + k2 = k2.wrapping_mul(C2); + k2 = k2.rotate_left(33); + k2 = k2.wrapping_mul(C1); + self.h2 ^= k2; + + // out.h2 = MURMUR3_ROTL64(out.h2,31); out.h2 += out.h1; out.h2 = out.h2*5+c4; + self.h2 = self.h2.rotate_left(31); + self.h2 = self.h2.wrapping_add(self.h1); + self.h2 = self.h2.wrapping_mul(5).wrapping_add(0x38495ab5); + + // accumulate total length + self.total += 16; + } +} + +impl Default for MurmurHash3X64128 { + fn default() -> Self { + Self::with_seed(DEFAULT_SEED) + } +} + +impl Hasher for MurmurHash3X64128 { + fn finish(&self) -> u64 { + self.finish128().0 + } + + fn write(&mut self, mut bytes: &[u8]) { + if self.buf_len + bytes.len() < 16 { + self.buf[self.buf_len..self.buf_len + bytes.len()].copy_from_slice(bytes); + self.buf_len += bytes.len(); + return; + } + + if self.buf_len != 0 { + let wanted = 16 - self.buf_len; + self.buf[self.buf_len..].copy_from_slice(&bytes[..wanted]); + + let k1 = LE::read_u64(&self.buf[0..8]); + let k2 = LE::read_u64(&self.buf[8..16]); + self.update(k1, k2); + + bytes = &bytes[wanted..]; + self.buf_len = 0; + } + + // Number of full 128-bit blocks of 16 bytes. + // Possible exclusion of a remainder of up to 15 bytes. + let blocks = bytes.len() >> 4; // bytes / 16 + + // Process the 128-bit blocks (the body) into the hash + for i in 0..blocks { + let lo = i << 4; + let mi = lo + 8; + let hi = mi + 8; + let k1 = LE::read_u64(&bytes[lo..mi]); + let k2 = LE::read_u64(&bytes[mi..hi]); + self.update(k1, k2); + } + + // remain bytes + let len = bytes.len() % 16; + if len > 0 { + self.buf[0..len].copy_from_slice(&bytes[blocks << 4..]); + self.buf_len = len; + } + } +} + +/// Finalization mix: force all bits of a hash block to avalanche. +#[inline] +fn fmix64(mut k: u64) -> u64 { + k ^= k >> 33; + k = k.wrapping_mul(0xff51afd7ed558ccd); + k ^= k >> 33; + k = k.wrapping_mul(0xc4ceb9fe1a85ec53); + k ^ (k >> 33) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn murmurhash3_x64_128(key: &[u8], seed: u64) -> (u64, u64) { + let mut hasher = MurmurHash3X64128::with_seed(seed); + hasher.write(key); + hasher.finish128() + } + + #[test] + fn test_remainder() { + // remainder > 8 + let key = "The quick brown fox jumps over the lazy dog"; + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0xe34bbc7bbc071b6c); + assert_eq!(h2, 0x7a433ca9c49a9347); + + // change one bit + let key = "The quick brown fox jumps over the lazy eog"; + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0x362108102c62d1c9); + assert_eq!(h2, 0x3285cd100292b305); + + // test a remainder < 8 + let key = "The quick brown fox jumps over the lazy dogdogdog"; + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0x9c8205300e612fc4); + assert_eq!(h2, 0xcbc0af6136aa3df9); + + // test a remainder = 8 + let key = "The quick brown fox jumps over the lazy1"; + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0xe3301a827e5cdfe3); + assert_eq!(h2, 0xbdbf05f8da0f0392); + + // test a remainder = 0 + let key = "The quick brown fox jumps over t"; + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0xdf6af91bb29bdacf); + assert_eq!(h2, 0x91a341c58df1f3a6); + + // test a ones byte and a zeros byte + let key = [ + 0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b, 0x20, 0x62, 0x72, 0x6f, 0x77, + 0x6e, 0x20, 0x66, 0x6f, 0x78, 0x20, 0x6a, 0x75, 0x6d, 0x70, 0x73, 0x20, 0x6f, 0x76, + 0x65, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x7a, 0x79, 0x20, 0x64, 0x6f, + 0x67, 0xff, 0x64, 0x6f, 0x67, 0x00, + ]; + let (h1, h2) = murmurhash3_x64_128(&key, 0); + assert_eq!(h1, 0xe88abda785929c9e); + assert_eq!(h2, 0x96b98587cacc83d6); + } +} diff --git a/src/hll/mod.rs b/src/hll/mod.rs index 40d1b31..d156649 100644 --- a/src/hll/mod.rs +++ b/src/hll/mod.rs @@ -74,6 +74,8 @@ use std::hash::Hash; +use crate::hash::MurmurHash3X64128; + mod array4; mod array6; mod array8; @@ -147,9 +149,7 @@ fn pack_coupon(slot: u32, value: u8) -> u32 { /// Generate a coupon from a hashable value. fn coupon(v: H) -> u32 { - const DEFAULT_SEED: u32 = 9001; - - let mut hasher = mur3::Hasher128::with_seed(DEFAULT_SEED); + let mut hasher = MurmurHash3X64128::default(); v.hash(&mut hasher); let (lo, hi) = hasher.finish128(); diff --git a/src/lib.rs b/src/lib.rs index d49faa0..c566727 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,3 +33,5 @@ compile_error!("datasketches does not support big-endian targets"); pub mod error; pub mod hll; pub mod tdigest; + +mod hash;