
feat: Implement more efficient version of xxhash64 #575

Merged 18 commits on Jun 25, 2024 · Changes from 12 commits
21 changes: 21 additions & 0 deletions NOTICE.txt
@@ -0,0 +1,21 @@
This product includes software from the twox-hash project
* Copyright https://github.com/shepmaster/twox-hash
* Licensed under the MIT License;
Contributor:
Hmm, so does this mean that Comet would be dual-licensed?

I'm not sure about the legal part, especially the copyright GitHub URL part.

Member Author:
Apache-licensed projects can include MIT-licensed software without becoming MIT-licensed themselves. Apache Arrow already does this, for example.

I copied the copyright URL part from Apache Arrow as well (https://github.com/apache/arrow/blob/main/NOTICE.txt).

Member Author:
It would be good to get another opinion on this, though. Perhaps @alamb could offer some thoughts.


Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
2 changes: 1 addition & 1 deletion core/Cargo.toml
@@ -82,7 +82,6 @@ once_cell = "1.18.0"
regex = "1.9.6"
crc32fast = "1.3.2"
simd-adler32 = "0.3.7"
twox-hash = "1.6.3"

[build-dependencies]
prost-build = "0.9.0"
@@ -94,6 +93,7 @@ jni = { version = "0.21", features = ["invocation"] }
lazy_static = "1.4"
assertables = "7"
hex = "0.4.3"
twox-hash = "1.6.3"

[features]
default = []
170 changes: 164 additions & 6 deletions core/src/execution/datafusion/spark_hash.rs
@@ -21,8 +21,7 @@ use arrow::{
compute::take,
datatypes::{ArrowNativeTypeOp, UInt16Type, UInt32Type, UInt64Type, UInt8Type},
};
use std::{hash::Hasher, sync::Arc};
use twox_hash::XxHash64;
use std::sync::Arc;

use datafusion::{
arrow::{
@@ -99,12 +98,144 @@ pub(crate) fn spark_compatible_murmur3_hash<T: AsRef<[u8]>>(data: T, seed: u32)
}
}

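// xxHash64 consumes the bulk of its input in 32-byte chunks, processed as
// four independent u64 lanes.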
const CHUNK_SIZE: usize = 32;

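// The five 64-bit primes from the reference xxHash64 algorithm; the names
// match those used in the twox-hash crate.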
pub const PRIME_1: u64 = 11_400_714_785_074_694_791;
pub const PRIME_2: u64 = 14_029_467_366_897_019_727;
pub const PRIME_3: u64 = 1_609_587_929_392_839_161;
pub const PRIME_4: u64 = 9_650_029_242_287_828_579;
pub const PRIME_5: u64 = 2_870_177_450_012_600_261;
Contributor:
FYI, Spark also has its own variant of XXHash64, see org.apache.spark.sql.catalyst.expressions.XXH64.

I checked all the steps in your PR; they should be the same as twox-hash's and Spark's XXH64 process, if I'm not mistaken. That's great, and we should be good to go. Of course, it's always good to have more eyes on this.
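One cheap way to get more eyes on it is to pin a published reference vector in a test. A minimal sketch, assuming spark_compatible_xxhash64 is in scope; the expected value is the commonly cited XXH64 digest of the empty input with seed 0, quoted from memory rather than taken from this PR:

#[test]
fn test_xxhash64_empty_reference_vector() {
    // XXH64 of the empty byte string with seed 0, per the xxHash reference implementation.
    assert_eq!(spark_compatible_xxhash64(b"", 0), 0xef46db3751d8e999);
}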

Contributor:
One small nit about this function: it is getting quite big now. It would be better to group the XXHash64-related functions and constants into a separate file and split them into smaller functions. I believe that would help with understanding and maintenance.

Member Author:
Thanks for the review @advancedxy. I moved xxhash64 into its own file under execution.datafusion.expressions, which also makes sense because this is a regular SQL function that users can call.


/// Custom implementation of xxhash64 based on code from https://github.com/shepmaster/twox-hash
/// but optimized for our use case by removing any intermediate buffering, which is
/// not required because we are operating on data that is already in memory.
#[inline]
pub(crate) fn spark_compatible_xxhash64<T: AsRef<[u8]>>(data: T, seed: u64) -> u64 {
// TODO: Rewrite with a stateless hasher to reduce stack allocation?
let mut hasher = XxHash64::with_seed(seed);
hasher.write(data.as_ref());
hasher.finish()
let data: &[u8] = data.as_ref();
let length_bytes = data.len();

// XxCore::with_seed
let mut v1 = seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2);
let mut v2 = seed.wrapping_add(PRIME_2);
let mut v3 = seed;
let mut v4 = seed.wrapping_sub(PRIME_1);

// XxCore::ingest_chunks
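// One xxHash64 accumulation round: fold a little-endian u64 into a lane,
// then rotate and multiply to mix the bits.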
#[inline(always)]
fn ingest_one_number(mut current_value: u64, mut value: u64) -> u64 {
value = value.wrapping_mul(PRIME_2);
current_value = current_value.wrapping_add(value);
current_value = current_value.rotate_left(31);
current_value.wrapping_mul(PRIME_1)
}

// process chunks of 32 bytes
let mut offset_u64_4 = 0;
let ptr_u64 = data.as_ptr() as *const u64;
unsafe {
while offset_u64_4 * CHUNK_SIZE + CHUNK_SIZE <= length_bytes {
v1 = ingest_one_number(v1, ptr_u64.add(offset_u64_4 * 4).read_unaligned().to_le());
v2 = ingest_one_number(
v2,
ptr_u64.add(offset_u64_4 * 4 + 1).read_unaligned().to_le(),
);
v3 = ingest_one_number(
v3,
ptr_u64.add(offset_u64_4 * 4 + 2).read_unaligned().to_le(),
);
v4 = ingest_one_number(
v4,
ptr_u64.add(offset_u64_4 * 4 + 3).read_unaligned().to_le(),
);
offset_u64_4 += 1;
}
}
let total_len = data.len() as u64;
Contributor:
This seems like a duplicate calculation, since length_bytes has already been computed. I think this is purely for the cast to u64?
How about let total_len = length_bytes as u64;?
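Spelled out, the suggested change would be (a sketch using only names already present in this diff):

// Before: calls data.len() a second time purely for the cast.
let total_len = data.len() as u64;
// After: reuse length_bytes, computed at the top of the function.
let total_len = length_bytes as u64;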


let mut hash = if total_len >= CHUNK_SIZE as u64 {
// We have processed at least one full chunk
let mut hash = v1.rotate_left(1);
hash = hash.wrapping_add(v2.rotate_left(7));
hash = hash.wrapping_add(v3.rotate_left(12));
hash = hash.wrapping_add(v4.rotate_left(18));

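// Merge one lane into the converged hash value (xxHash64's merge round).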
#[inline(always)]
fn mix_one(mut hash: u64, mut value: u64) -> u64 {
value = value.wrapping_mul(PRIME_2);
value = value.rotate_left(31);
value = value.wrapping_mul(PRIME_1);
hash ^= value;
hash = hash.wrapping_mul(PRIME_1);
hash.wrapping_add(PRIME_4)
}

hash = mix_one(hash, v1);
hash = mix_one(hash, v2);
hash = mix_one(hash, v3);
hash = mix_one(hash, v4);

hash
} else {
seed.wrapping_add(PRIME_5)
};

hash = hash.wrapping_add(total_len);

// process u64s
let mut offset_u64 = offset_u64_4 * 4;
while offset_u64 * 8 + 8 <= length_bytes {
let mut k1 = unsafe {
ptr_u64
.add(offset_u64)
.read_unaligned()
.to_le()
.wrapping_mul(PRIME_2)
};
k1 = k1.rotate_left(31);
k1 = k1.wrapping_mul(PRIME_1);
hash ^= k1;
hash = hash.rotate_left(27);
hash = hash.wrapping_mul(PRIME_1);
hash = hash.wrapping_add(PRIME_4);
offset_u64 += 1;
}

// process u32s
let data = &data[offset_u64 * 8..];
let ptr_u32 = data.as_ptr() as *const u32;
let length_bytes = length_bytes - offset_u64 * 8;
let mut offset_u32 = 0;
while offset_u32 * 4 + 4 <= length_bytes {
let k1 = unsafe {
u64::from(ptr_u32.add(offset_u32).read_unaligned().to_le()).wrapping_mul(PRIME_1)
};
hash ^= k1;
hash = hash.rotate_left(23);
hash = hash.wrapping_mul(PRIME_2);
hash = hash.wrapping_add(PRIME_3);
offset_u32 += 1;
}

// process u8s
let data = &data[offset_u32 * 4..];
let length_bytes = length_bytes - offset_u32 * 4;
let mut offset_u8 = 0;
while offset_u8 < length_bytes {
let k1 = u64::from(data[offset_u8]).wrapping_mul(PRIME_5);
hash ^= k1;
hash = hash.rotate_left(11);
hash = hash.wrapping_mul(PRIME_1);
offset_u8 += 1;
}

// The final intermixing (the xxHash64 avalanche step): shift-xors and prime
// multiplications diffuse all input bits across the 64-bit result
hash ^= hash >> 33;
hash = hash.wrapping_mul(PRIME_2);
hash ^= hash >> 29;
hash = hash.wrapping_mul(PRIME_3);
hash ^= hash >> 32;

hash
}
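For orientation, a minimal usage sketch (the input bytes are illustrative; 42 is, to my knowledge, the default seed Spark uses for its xxhash64 function):

// One-shot hash of an in-memory byte slice with an explicit seed.
let hash: u64 = spark_compatible_xxhash64(b"hello", 42);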

macro_rules! hash_array {
@@ -504,13 +635,17 @@ pub(crate) fn pmod(hash: u32, n: usize) -> usize {

#[cfg(test)]
mod tests {
use super::spark_compatible_xxhash64;
use arrow::array::{Float32Array, Float64Array};
use std::hash::Hasher;
use std::sync::Arc;

use crate::execution::datafusion::spark_hash::{
create_murmur3_hashes, create_xxhash64_hashes, pmod,
};
use datafusion::arrow::array::{ArrayRef, Int32Array, Int64Array, Int8Array, StringArray};
use rand::Rng;
use twox_hash::XxHash64;

macro_rules! test_hashes_internal {
($hash_method: ident, $input: expr, $initial_seeds: expr, $expected: expr) => {
@@ -564,6 +699,29 @@ mod tests {
test_hashes_with_nulls!(create_xxhash64_hashes, T, values, expected, u64);
}

#[test]
fn test_xxhash64_random() {
let mut rng = rand::thread_rng();
for len in 0..128 {
for _ in 0..10 {
let data: Vec<u8> = (0..len).map(|_| rng.gen()).collect();
let seed = rng.gen();
check_xxhash64(&data, seed);
}
}
}

fn check_xxhash64(data: &[u8], seed: u64) {
let mut hasher = XxHash64::with_seed(seed);
hasher.write(data.as_ref());
let hash1 = hasher.finish();
let hash2 = spark_compatible_xxhash64(data, seed);
if hash1 != hash2 {
panic!("input: {} with seed {seed} produced incorrect hash (comet={hash2}, twox-hash={hash1})",
data.iter().map(|byte| format!("{:02x}", byte)).collect::<String>())
}
}

#[test]
fn test_i8() {
test_murmur3_hash::<i8, Int8Array>(