From ff3998dd8d21e557d63bcf00fc6a77116bcf6405 Mon Sep 17 00:00:00 2001 From: tison Date: Mon, 22 Dec 2025 14:44:38 +0800 Subject: [PATCH 1/5] add tests for murmurhash3 Signed-off-by: tison --- src/hash/mod.rs | 2 ++ src/hash/murmurhash.rs | 46 ++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 2 ++ 3 files changed, 50 insertions(+) create mode 100644 src/hash/mod.rs create mode 100644 src/hash/murmurhash.rs diff --git a/src/hash/mod.rs b/src/hash/mod.rs new file mode 100644 index 0000000..3801281 --- /dev/null +++ b/src/hash/mod.rs @@ -0,0 +1,2 @@ +mod murmurhash; + diff --git a/src/hash/murmurhash.rs b/src/hash/murmurhash.rs new file mode 100644 index 0000000..c5973fd --- /dev/null +++ b/src/hash/murmurhash.rs @@ -0,0 +1,46 @@ +#[cfg(test)] +mod tests { + #[test] + fn test_remainder() { + // remainder > 8 + let key = "The quick brown fox jumps over the lazy dog"; + let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0xe34bbc7bbc071b6c); + assert_eq!(h2, 0x7a433ca9c49a9347); + + // change one bit + let key = "The quick brown fox jumps over the lazy eog"; + let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0x362108102c62d1c9); + assert_eq!(h2, 0x3285cd100292b305); + + // test a remainder < 8 + let key = "The quick brown fox jumps over the lazy dogdogdog"; + let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0x9c8205300e612fc4); + assert_eq!(h2, 0xcbc0af6136aa3df9); + + // test a remainder = 8 + let key = "The quick brown fox jumps over the lazy1"; + let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0xe3301a827e5cdfe3); + assert_eq!(h2, 0xbdbf05f8da0f0392); + + // test a remainder = 0 + let key = "The quick brown fox jumps over t"; + let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + assert_eq!(h1, 0xdf6af91bb29bdacf); + assert_eq!(h2, 0x91a341c58df1f3a6); + + // test a ones byte and a zeros byte + let key = [ + 0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b, 0x20, 0x62, 0x72, 0x6f, 0x77, + 0x6e, 0x20, 0x66, 0x6f, 0x78, 0x20, 0x6a, 0x75, 0x6d, 0x70, 0x73, 0x20, 0x6f, 0x76, + 0x65, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x7a, 0x79, 0x20, 0x64, 0x6f, + 0x67, 0xff, 0x64, 0x6f, 0x67, 0x00, + ]; + let (h1, h2) = mur3::murmurhash3_x64_128(&key, 0); + assert_eq!(h1, 0xe88abda785929c9e); + assert_eq!(h2, 0x96b98587cacc83d6); + } +} diff --git a/src/lib.rs b/src/lib.rs index d49faa0..c566727 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,3 +33,5 @@ compile_error!("datasketches does not support big-endian targets"); pub mod error; pub mod hll; pub mod tdigest; + +mod hash; From e523bd2bc2dd4a52545dd26c8ac5f65a0abc1140 Mon Sep 17 00:00:00 2001 From: tison Date: Mon, 22 Dec 2025 15:41:59 +0800 Subject: [PATCH 2/5] impl murmurhash3 Signed-off-by: tison --- src/hash/mod.rs | 1 + src/hash/murmurhash.rs | 165 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) diff --git a/src/hash/mod.rs b/src/hash/mod.rs index 3801281..d730639 100644 --- a/src/hash/mod.rs +++ b/src/hash/mod.rs @@ -1,2 +1,3 @@ mod murmurhash; +pub use murmurhash::MurmurHash3X64128; diff --git a/src/hash/murmurhash.rs b/src/hash/murmurhash.rs index c5973fd..bc3f69b 100644 --- a/src/hash/murmurhash.rs +++ b/src/hash/murmurhash.rs @@ -1,3 +1,168 @@ +use byteorder::{ByteOrder, LE}; +use std::hash::Hasher; + +const DEFAULT_SEED: u64 = 9001; +const C1: u64 = 0x87c37b91114253d5; +const C2: u64 = 0x4cf5ad432745937f; +const C3: u64 = 0x52dce729; +const C4: u64 = 0x38495ab5; + +#[derive(Debug)] +pub struct MurmurHash3X64128 { + h1: u64, + h2: u64, + total: u64, + buf: [u8; 16], + buf_len: usize, +} + +impl MurmurHash3X64128 { + pub fn with_seed(seed: u64) -> Self { + MurmurHash3X64128 { + h1: seed, + h2: seed, + total: 0, + buf: [0; 16], + buf_len: 0, + } + } + + pub fn finish128(&self) -> (u64, u64) { + let mut h1 = self.h1; + let mut h2 = self.h2; + + let total = self.total + self.buf_len as u64; + let rem = self.buf_len; + + // tail + if rem > 0 { + if rem > 8 { + // read k2 little endian + let mut buf = [0u8; 8]; + buf[..rem - 8].copy_from_slice(&self.buf[8..]); + // mix k2 + let mut k2 = u64::from_le_bytes(buf); + k2 = k2.wrapping_mul(C2); + k2 = k2.rotate_left(33); + k2 = k2.wrapping_mul(C1); + h2 ^= k2; + } + + // read k1 little endian + let mut buf = [0u8; 8]; + let k1_len = rem.min(8); + buf[..k1_len].copy_from_slice(&self.buf[..k1_len]); + // mix k1 + let mut k1 = u64::from_le_bytes(buf); + k1 = k1.wrapping_mul(C1); + k1 = k1.rotate_left(31); + k1 = k1.wrapping_mul(C2); + h1 ^= k1; + } + + h1 ^= total; + h2 ^= total; + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_add(h1); + h1 = fmix64(h1); + h2 = fmix64(h2); + h1 = h1.wrapping_add(h2); + h2 = h2.wrapping_add(h1); + (h1, h2) + } + + #[inline] + fn update(&mut self, mut k1: u64, mut k2: u64) { + // k1 *= c1; k1 = MURMUR3_ROTL64(k1, 31); k1 *= c2; out.h1 ^= k1; + k1 = k1.wrapping_mul(C1); + k1 = k1.rotate_left(31); + k1 = k1.wrapping_mul(C2); + self.h1 ^= k1; + + // out.h1 = MURMUR3_ROTL64(out.h1, 27); out.h1 += out.h2; out.h1 = out.h1*5+c3; + self.h1 = self.h1.rotate_left(27); + self.h1 = self.h1.wrapping_add(self.h2); + self.h1 = self.h1.wrapping_mul(5).wrapping_add(C3); + + // k2 *= c2; k2 = MURMUR3_ROTL64(k2, 33); k2 *= c1; out.h2 ^= k2; + k2 = k2.wrapping_mul(C2); + k2 = k2.rotate_left(33); + k2 = k2.wrapping_mul(C1); + self.h2 ^= k2; + + // out.h2 = MURMUR3_ROTL64(out.h2,31); out.h2 += out.h1; out.h2 = out.h2*5+c4; + self.h2 = self.h2.rotate_left(31); + self.h2 = self.h2.wrapping_add(self.h1); + self.h2 = self.h2.wrapping_mul(5).wrapping_add(C4); + + // accumulate total length + self.total += 16; + } +} + +impl Default for MurmurHash3X64128 { + fn default() -> Self { + Self::with_seed(DEFAULT_SEED) + } +} + +impl Hasher for MurmurHash3X64128 { + fn finish(&self) -> u64 { + self.finish128().0 + } + + fn write(&mut self, mut bytes: &[u8]) { + if self.buf_len + bytes.len() < 16 { + self.buf[self.buf_len..self.buf_len + bytes.len()].copy_from_slice(bytes); + self.buf_len += bytes.len(); + return; + } + + if self.buf_len != 0 { + let wanted = 16 - self.buf_len; + self.buf[self.buf_len..].copy_from_slice(&bytes[..wanted]); + + let k1 = LE::read_u64(&self.buf[0..8]); + let k2 = LE::read_u64(&self.buf[8..16]); + self.update(k1, k2); + + bytes = &bytes[wanted..]; + self.buf_len = 0; + } + + // Number of full 128-bit blocks of 16 bytes. + // Possible exclusion of a remainder of up to 15 bytes. + let blocks = bytes.len() >> 4; // bytes / 16 + + // Process the 128-bit blocks (the body) into the hash + for i in 0..blocks { + let lo = i << 4; + let mi = lo + 8; + let hi = mi + 8; + let k1 = LE::read_u64(&bytes[lo..mi]); + let k2 = LE::read_u64(&bytes[mi..hi]); + self.update(k1, k2); + } + + // remain bytes + let len = bytes.len() % 16; + if len > 0 { + self.buf[0..len].copy_from_slice(&bytes[blocks << 4..]); + self.buf_len = len; + } + } +} + +/// Finalization mix: force all bits of a hash block to avalanche. +#[inline] +fn fmix64(mut k: u64) -> u64 { + k ^= k >> 33; + k = k.wrapping_mul(0xff51afd7ed558ccd); + k ^= k >> 33; + k = k.wrapping_mul(0xc4ceb9fe1a85ec53); + k ^ (k >> 33) +} + #[cfg(test)] mod tests { #[test] From b45a0c064a24459c79ee2667bf84710e36f86ad2 Mon Sep 17 00:00:00 2001 From: tison Date: Mon, 22 Dec 2025 15:48:52 +0800 Subject: [PATCH 3/5] test and fix up Signed-off-by: tison --- src/hash/murmurhash.rs | 33 +++++++++++++++++++++------------ src/hll/mod.rs | 6 +++--- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/hash/murmurhash.rs b/src/hash/murmurhash.rs index bc3f69b..8fa3d4f 100644 --- a/src/hash/murmurhash.rs +++ b/src/hash/murmurhash.rs @@ -4,9 +4,9 @@ use std::hash::Hasher; const DEFAULT_SEED: u64 = 9001; const C1: u64 = 0x87c37b91114253d5; const C2: u64 = 0x4cf5ad432745937f; -const C3: u64 = 0x52dce729; -const C4: u64 = 0x38495ab5; +/// The MurmurHash3 is a fast, non-cryptographic, 128-bit hash function that has +/// excellent avalanche and 2-way bit independence properties. #[derive(Debug)] pub struct MurmurHash3X64128 { h1: u64, @@ -39,7 +39,8 @@ impl MurmurHash3X64128 { if rem > 8 { // read k2 little endian let mut buf = [0u8; 8]; - buf[..rem - 8].copy_from_slice(&self.buf[8..]); + let k2_len = rem - 8; + buf[..k2_len].copy_from_slice(&self.buf[8..rem]); // mix k2 let mut k2 = u64::from_le_bytes(buf); k2 = k2.wrapping_mul(C2); @@ -79,10 +80,10 @@ impl MurmurHash3X64128 { k1 = k1.wrapping_mul(C2); self.h1 ^= k1; - // out.h1 = MURMUR3_ROTL64(out.h1, 27); out.h1 += out.h2; out.h1 = out.h1*5+c3; + // out.h1 = MURMUR3_ROTL64(out.h1, 27); out.h1 += out.h2; out.h1 = out.h1*5+0x52dce729; self.h1 = self.h1.rotate_left(27); self.h1 = self.h1.wrapping_add(self.h2); - self.h1 = self.h1.wrapping_mul(5).wrapping_add(C3); + self.h1 = self.h1.wrapping_mul(5).wrapping_add(0x52dce729); // k2 *= c2; k2 = MURMUR3_ROTL64(k2, 33); k2 *= c1; out.h2 ^= k2; k2 = k2.wrapping_mul(C2); @@ -93,7 +94,7 @@ impl MurmurHash3X64128 { // out.h2 = MURMUR3_ROTL64(out.h2,31); out.h2 += out.h1; out.h2 = out.h2*5+c4; self.h2 = self.h2.rotate_left(31); self.h2 = self.h2.wrapping_add(self.h1); - self.h2 = self.h2.wrapping_mul(5).wrapping_add(C4); + self.h2 = self.h2.wrapping_mul(5).wrapping_add(0x38495ab5); // accumulate total length self.total += 16; @@ -165,35 +166,43 @@ fn fmix64(mut k: u64) -> u64 { #[cfg(test)] mod tests { + use super::*; + + fn murmurhash3_x64_128(key: &[u8], seed: u64) -> (u64, u64) { + let mut hasher = MurmurHash3X64128::with_seed(seed); + hasher.write(key); + hasher.finish128() + } + #[test] fn test_remainder() { // remainder > 8 let key = "The quick brown fox jumps over the lazy dog"; - let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); assert_eq!(h1, 0xe34bbc7bbc071b6c); assert_eq!(h2, 0x7a433ca9c49a9347); // change one bit let key = "The quick brown fox jumps over the lazy eog"; - let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); assert_eq!(h1, 0x362108102c62d1c9); assert_eq!(h2, 0x3285cd100292b305); // test a remainder < 8 let key = "The quick brown fox jumps over the lazy dogdogdog"; - let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); assert_eq!(h1, 0x9c8205300e612fc4); assert_eq!(h2, 0xcbc0af6136aa3df9); // test a remainder = 8 let key = "The quick brown fox jumps over the lazy1"; - let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); assert_eq!(h1, 0xe3301a827e5cdfe3); assert_eq!(h2, 0xbdbf05f8da0f0392); // test a remainder = 0 let key = "The quick brown fox jumps over t"; - let (h1, h2) = mur3::murmurhash3_x64_128(key.as_bytes(), 0); + let (h1, h2) = murmurhash3_x64_128(key.as_bytes(), 0); assert_eq!(h1, 0xdf6af91bb29bdacf); assert_eq!(h2, 0x91a341c58df1f3a6); @@ -204,7 +213,7 @@ mod tests { 0x65, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x7a, 0x79, 0x20, 0x64, 0x6f, 0x67, 0xff, 0x64, 0x6f, 0x67, 0x00, ]; - let (h1, h2) = mur3::murmurhash3_x64_128(&key, 0); + let (h1, h2) = murmurhash3_x64_128(&key, 0); assert_eq!(h1, 0xe88abda785929c9e); assert_eq!(h2, 0x96b98587cacc83d6); } diff --git a/src/hll/mod.rs b/src/hll/mod.rs index 40d1b31..d156649 100644 --- a/src/hll/mod.rs +++ b/src/hll/mod.rs @@ -74,6 +74,8 @@ use std::hash::Hash; +use crate::hash::MurmurHash3X64128; + mod array4; mod array6; mod array8; @@ -147,9 +149,7 @@ fn pack_coupon(slot: u32, value: u8) -> u32 { /// Generate a coupon from a hashable value. fn coupon(v: H) -> u32 { - const DEFAULT_SEED: u32 = 9001; - - let mut hasher = mur3::Hasher128::with_seed(DEFAULT_SEED); + let mut hasher = MurmurHash3X64128::default(); v.hash(&mut hasher); let (lo, hi) = hasher.finish128(); From c7e800bd07a27cdc9e47b8ebc20d3562f8165806 Mon Sep 17 00:00:00 2001 From: tison Date: Mon, 22 Dec 2025 15:50:01 +0800 Subject: [PATCH 4/5] drop mur3 dep Signed-off-by: tison --- Cargo.lock | 7 ------- Cargo.toml | 1 - 2 files changed, 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a2f2c69..cb421e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,7 +29,6 @@ version = "0.1.0" dependencies = [ "byteorder", "googletest", - "mur3", ] [[package]] @@ -61,12 +60,6 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" -[[package]] -name = "mur3" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b" - [[package]] name = "num-traits" version = "0.2.19" diff --git a/Cargo.toml b/Cargo.toml index 90961f0..46b91f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,6 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] byteorder = { version = "1.5.0" } -mur3 = { version = "0.1.0" } [dev-dependencies] googletest = { version = "0.14.2" } From be29ae44b86e4fef1a49c855ef8bcd4a1802ddf7 Mon Sep 17 00:00:00 2001 From: tison Date: Mon, 22 Dec 2025 16:04:56 +0800 Subject: [PATCH 5/5] lint Signed-off-by: tison --- src/hash/mod.rs | 17 +++++++++++++++++ src/hash/murmurhash.rs | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/hash/mod.rs b/src/hash/mod.rs index d730639..7059dbb 100644 --- a/src/hash/mod.rs +++ b/src/hash/mod.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + mod murmurhash; pub use murmurhash::MurmurHash3X64128; diff --git a/src/hash/murmurhash.rs b/src/hash/murmurhash.rs index 8fa3d4f..b6d5106 100644 --- a/src/hash/murmurhash.rs +++ b/src/hash/murmurhash.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use byteorder::{ByteOrder, LE}; use std::hash::Hasher;