Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions datasketches/src/hash/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@
// under the License.

mod murmurhash;
mod xxhash;

pub use murmurhash::MurmurHash3X64128;
#[allow(unused_imports)]
pub use xxhash::XxHash64;
250 changes: 250 additions & 0 deletions datasketches/src/hash/xxhash.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::hash::Hasher;

use byteorder::ByteOrder;
use byteorder::LE;

const DEFAULT_SEED: u64 = 0;

// Unsigned 64-bit primes from xxhash64.
const P1: u64 = 0x9E3779B185EBCA87;
const P2: u64 = 0xC2B2AE3D27D4EB4F;
const P3: u64 = 0x165667B19E3779F9;
const P4: u64 = 0x85EBCA77C2B2AE63;
const P5: u64 = 0x27D4EB2F165667C5;

/// The XxHash64 is a fast, non-cryptographic, 64-bit hash function that has
/// excellent avalanche and 2-way bit independence properties.
#[derive(Debug)]
pub struct XxHash64 {
seed: u64,
total_len: u64,
v1: u64,
v2: u64,
v3: u64,
v4: u64,
buffer: [u8; 32],
buffer_len: usize,
}

impl XxHash64 {
pub fn with_seed(seed: u64) -> Self {
XxHash64 {
seed,
total_len: 0,
v1: seed.wrapping_add(P1).wrapping_add(P2),
v2: seed.wrapping_add(P2),
v3: seed,
v4: seed.wrapping_sub(P1),
buffer: [0; 32],
buffer_len: 0,
}
}

pub fn finish64(&self) -> u64 {
let mut hash = if self.total_len >= 32 {
let mut acc = self
.v1
.rotate_left(1)
.wrapping_add(self.v2.rotate_left(7))
.wrapping_add(self.v3.rotate_left(12))
.wrapping_add(self.v4.rotate_left(18));
acc = merge_round(acc, self.v1);
acc = merge_round(acc, self.v2);
acc = merge_round(acc, self.v3);
acc = merge_round(acc, self.v4);
acc
} else {
self.seed.wrapping_add(P5)
};

hash = hash.wrapping_add(self.total_len);

let mut idx = 0;
let buf = &self.buffer[..self.buffer_len];
while idx + 8 <= buf.len() {
let mut k1 = LE::read_u64(&buf[idx..idx + 8]);
k1 = k1.wrapping_mul(P2);
k1 = k1.rotate_left(31);
k1 = k1.wrapping_mul(P1);
hash ^= k1;
hash = hash.rotate_left(27).wrapping_mul(P1).wrapping_add(P4);
idx += 8;
}

if idx + 4 <= buf.len() {
let k1 = LE::read_u32(&buf[idx..idx + 4]) as u64;
hash ^= k1.wrapping_mul(P1);
hash = hash.rotate_left(23).wrapping_mul(P2).wrapping_add(P3);
idx += 4;
}

while idx < buf.len() {
let k1 = buf[idx] as u64;
hash ^= k1.wrapping_mul(P5);
hash = hash.rotate_left(11).wrapping_mul(P1);
idx += 1;
}

finalize(hash)
}

#[allow(dead_code)]
pub fn hash_u64(input: u64, seed: u64) -> u64 {
let mut hash = seed.wrapping_add(P5).wrapping_add(8);
let mut k1 = input;
k1 = k1.wrapping_mul(P2);
k1 = k1.rotate_left(31);
k1 = k1.wrapping_mul(P1);
hash ^= k1;
hash = hash.rotate_left(27).wrapping_mul(P1).wrapping_add(P4);
finalize(hash)
}

#[inline]
fn update(&mut self, chunk: &[u8]) {
self.v1 = round(self.v1, LE::read_u64(&chunk[0..8]));
self.v2 = round(self.v2, LE::read_u64(&chunk[8..16]));
self.v3 = round(self.v3, LE::read_u64(&chunk[16..24]));
self.v4 = round(self.v4, LE::read_u64(&chunk[24..32]));
}
}

impl Default for XxHash64 {
fn default() -> Self {
Self::with_seed(DEFAULT_SEED)
}
}

impl Hasher for XxHash64 {
fn finish(&self) -> u64 {
self.finish64()
}

fn write(&mut self, bytes: &[u8]) {
self.total_len = self.total_len.wrapping_add(bytes.len() as u64);

if self.buffer_len + bytes.len() < 32 {
self.buffer[self.buffer_len..self.buffer_len + bytes.len()].copy_from_slice(bytes);
self.buffer_len += bytes.len();
return;
}

let mut bytes = bytes;

if self.buffer_len != 0 {
let needed = 32 - self.buffer_len;
self.buffer[self.buffer_len..].copy_from_slice(&bytes[..needed]);
let chunk = self.buffer;
self.update(&chunk);
self.buffer_len = 0;
bytes = &bytes[needed..];
}

let mut chunks = bytes.chunks_exact(32);
for chunk in &mut chunks {
self.update(chunk);
}

let remainder = chunks.remainder();
if !remainder.is_empty() {
self.buffer[..remainder.len()].copy_from_slice(remainder);
self.buffer_len = remainder.len();
}
}
}

#[inline]
fn round(mut acc: u64, input: u64) -> u64 {
acc = acc.wrapping_add(input.wrapping_mul(P2));
acc = acc.rotate_left(31);
acc.wrapping_mul(P1)
}

#[inline]
fn merge_round(mut acc: u64, val: u64) -> u64 {
let mut v = val;
v = v.wrapping_mul(P2);
v = v.rotate_left(31);
v = v.wrapping_mul(P1);
acc ^= v;
acc.wrapping_mul(P1).wrapping_add(P4)
}

#[inline]
fn finalize(mut hash: u64) -> u64 {
hash ^= hash >> 33;
hash = hash.wrapping_mul(P2);
hash ^= hash >> 29;
hash = hash.wrapping_mul(P3);
hash ^ (hash >> 32)
}

#[cfg(test)]
mod tests {
use super::*;

const PRIME32: u64 = 0x9E3779B1;
const PRIME64: u64 = 0x9E3779B185EBCA8D;

fn fill_test_buffer(len: usize) -> Vec<u8> {
let mut buffer = vec![0u8; len];
let mut byte_gen = PRIME32;
for byte in &mut buffer {
*byte = (byte_gen >> 56) as u8;
byte_gen = byte_gen.wrapping_mul(PRIME64);
}
buffer
}

fn xxhash64(data: &[u8], seed: u64) -> u64 {
let mut hasher = XxHash64::with_seed(seed);
hasher.write(data);
hasher.finish64()
}

#[test]
fn test_vectors_seed_zero() {
let buf = fill_test_buffer(101);
assert_eq!(xxhash64(&buf[..0], 0), 0xEF46DB3751D8E999);
assert_eq!(xxhash64(&buf[..1], 0), 0xE934A84ADB052768);
assert_eq!(xxhash64(&buf[..32], 0), 0x18B216492BB44B70);
assert_eq!(xxhash64(&buf[..33], 0), 0x55C8DC3E578F5B59);
assert_eq!(xxhash64(&buf[..100], 0), 0x4BFE019CD91D9EA4);
}

#[test]
fn test_vectors_seed_prime32() {
let buf = fill_test_buffer(101);
assert_eq!(xxhash64(&buf[..0], PRIME32), 0xAC75FDA2929B17EF);
assert_eq!(xxhash64(&buf[..1], PRIME32), 0x5014607643A9B4C3);
assert_eq!(xxhash64(&buf[..32], PRIME32), 0xB3F33BDF93ADE409);
assert_eq!(xxhash64(&buf[..100], PRIME32), 0x4853706DC9625CAE);
}

#[test]
fn test_long_check() {
let seed = 0;
let hash1 = XxHash64::hash_u64(123, seed);
let mut hasher = XxHash64::with_seed(seed);
hasher.write(&123u64.to_le_bytes());
let hash2 = hasher.finish64();
assert_eq!(hash2, hash1);
}
}