apache · andygrove · Jun 25, 2024 · Jun 14, 2024 · Jun 14, 2024 · Jun 14, 2024
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -201,6 +201,7 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 
+--------------------------------------------------------------------------------
 
 This project includes code from Apache Aurora.
 
@@ -210,3 +211,30 @@ This project includes code from Apache Aurora.
 Copyright: 2016 The Apache Software Foundation.
 Home page: https://aurora.apache.org/
 License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+This project includes software from the twox-hash project
+https://github.com/shepmaster/twox-hash
+
+The MIT License (MIT)
+
+Copyright (c) 2015 Jake Goulding
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
diff --git a/NOTICE.txt b/NOTICE.txt
@@ -0,0 +1,8 @@
+Apache DataFusion Comet
+Copyright 2024 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+This product includes software from the twox-hash project (MIT License)
+https://github.com/shepmaster/twox-hash
diff --git a/core/Cargo.toml b/core/Cargo.toml
@@ -82,7 +82,6 @@ once_cell = "1.18.0"
 regex = "1.9.6"
 crc32fast = "1.3.2"
 simd-adler32 = "0.3.7"
-twox-hash = "1.6.3"
 
 [build-dependencies]
 prost-build = "0.9.0"
@@ -94,6 +93,7 @@ jni = { version = "0.21", features = ["invocation"] }
 lazy_static = "1.4"
 assertables = "7"
 hex = "0.4.3"
+twox-hash = "1.6.3"
 
 [features]
 default = []

diff --git a/core/src/execution/datafusion/expressions/mod.rs b/core/src/execution/datafusion/expressions/mod.rs
@@ -43,6 +43,7 @@ pub mod temporal;
 pub mod unbound;
 mod utils;
 pub mod variance;
+pub mod xxhash64;
 
 #[derive(Debug, Hash, PartialEq, Clone, Copy)]
 pub enum EvalMode {

diff --git a/core/src/execution/datafusion/expressions/xxhash64.rs b/core/src/execution/datafusion/expressions/xxhash64.rs
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! xxhash64 implementation that hashes in-memory byte slices directly (see `spark_compatible_xxhash64`)
+
+const CHUNK_SIZE: usize = 32;
+
+const PRIME_1: u64 = 11_400_714_785_074_694_791;
+const PRIME_2: u64 = 14_029_467_366_897_019_727;
+const PRIME_3: u64 = 1_609_587_929_392_839_161;
+const PRIME_4: u64 = 9_650_029_242_287_828_579;
+const PRIME_5: u64 = 2_870_177_450_012_600_261;
+
+/// Returns the xxhash64 of `data` for `seed`. Based on code from <https://github.com/shepmaster/twox-hash>
+/// but optimized for our use case by removing any intermediate buffering, which is
+/// not required because we are operating on data that is already in memory.
+#[inline]
+pub(crate) fn spark_compatible_xxhash64<T: AsRef<[u8]>>(data: T, seed: u64) -> u64 {
+    let data: &[u8] = data.as_ref();
+    let length_bytes = data.len();
+
+    let mut v1 = seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2); // v1..v4: four lane accumulators
+    let mut v2 = seed.wrapping_add(PRIME_2);
+    let mut v3 = seed;
+    let mut v4 = seed.wrapping_sub(PRIME_1);
+
+    // process chunks of 32 bytes: each chunk feeds one little-endian u64 into each of v1..v4
+    let mut offset_u64_4 = 0; // index of the next 32-byte chunk (a group of four u64s)
+    let ptr_u64 = data.as_ptr() as *const u64; // only dereferenced through read_unaligned
+    unsafe { // SAFETY: the loop condition keeps all four reads within the first length_bytes bytes
+        while offset_u64_4 * CHUNK_SIZE + CHUNK_SIZE <= length_bytes {
+            v1 = ingest_one_number(v1, ptr_u64.add(offset_u64_4 * 4).read_unaligned().to_le());
+            v2 = ingest_one_number(
+                v2,
+                ptr_u64.add(offset_u64_4 * 4 + 1).read_unaligned().to_le(),
+            );
+            v3 = ingest_one_number(
+                v3,
+                ptr_u64.add(offset_u64_4 * 4 + 2).read_unaligned().to_le(),
+            );
+            v4 = ingest_one_number(
+                v4,
+                ptr_u64.add(offset_u64_4 * 4 + 3).read_unaligned().to_le(),
+            );
+            offset_u64_4 += 1;
+        }
+    }
+
+    let mut hash = if length_bytes >= CHUNK_SIZE {
+        // We have processed at least one full chunk, so fold the four lanes into a single hash
+        let mut hash = v1.rotate_left(1);
+        hash = hash.wrapping_add(v2.rotate_left(7));
+        hash = hash.wrapping_add(v3.rotate_left(12));
+        hash = hash.wrapping_add(v4.rotate_left(18));
+
+        hash = mix_one(hash, v1);
+        hash = mix_one(hash, v2);
+        hash = mix_one(hash, v3);
+        hash = mix_one(hash, v4);
+
+        hash
+    } else {
+        seed.wrapping_add(PRIME_5) // input shorter than one chunk: the lanes were never updated
+    };
+
+    hash = hash.wrapping_add(length_bytes as u64); // mix in the total input length
+
+    // process the remaining whole u64s (fewer than four, since full chunks were consumed above)
+    let mut offset_u64 = offset_u64_4 * 4; // continue right after the last full chunk
+    while offset_u64 * 8 + 8 <= length_bytes {
+        let mut k1 = unsafe { // SAFETY: the loop condition guarantees these 8 bytes lie inside data
+            ptr_u64
+                .add(offset_u64)
+                .read_unaligned()
+                .to_le()
+                .wrapping_mul(PRIME_2)
+        };
+        k1 = k1.rotate_left(31);
+        k1 = k1.wrapping_mul(PRIME_1);
+        hash ^= k1;
+        hash = hash.rotate_left(27);
+        hash = hash.wrapping_mul(PRIME_1);
+        hash = hash.wrapping_add(PRIME_4);
+        offset_u64 += 1;
+    }
+
+    // process a remaining whole u32, if any (fewer than 8 bytes are left at this point)
+    let data = &data[offset_u64 * 8..]; // re-base the slice at the unconsumed tail
+    let ptr_u32 = data.as_ptr() as *const u32;
+    let length_bytes = length_bytes - offset_u64 * 8; // length of that tail
+    let mut offset_u32 = 0;
+    while offset_u32 * 4 + 4 <= length_bytes {
+        let k1 = unsafe { // SAFETY: the loop condition guarantees these 4 bytes lie inside the tail
+            u64::from(ptr_u32.add(offset_u32).read_unaligned().to_le()).wrapping_mul(PRIME_1)
+        };
+        hash ^= k1;
+        hash = hash.rotate_left(23);
+        hash = hash.wrapping_mul(PRIME_2);
+        hash = hash.wrapping_add(PRIME_3);
+        offset_u32 += 1;
+    }
+
+    // process the trailing bytes one at a time (fewer than 4 are left)
+    let data = &data[offset_u32 * 4..];
+    let length_bytes = length_bytes - offset_u32 * 4;
+    let mut offset_u8 = 0;
+    while offset_u8 < length_bytes {
+        let k1 = u64::from(data[offset_u8]).wrapping_mul(PRIME_5);
+        hash ^= k1;
+        hash = hash.rotate_left(11);
+        hash = hash.wrapping_mul(PRIME_1);
+        offset_u8 += 1;
+    }
+
+    // The final intermixing: alternating shift-xor and multiply steps over the accumulated hash
+    hash ^= hash >> 33;
+    hash = hash.wrapping_mul(PRIME_2);
+    hash ^= hash >> 29;
+    hash = hash.wrapping_mul(PRIME_3);
+    hash ^= hash >> 32;
+
+    hash
+}
+
+#[inline(always)] // lane round used by the 32-byte chunk loop; all arithmetic wraps on overflow
+fn ingest_one_number(mut current_value: u64, mut value: u64) -> u64 {
+    value = value.wrapping_mul(PRIME_2); // scale the incoming word
+    current_value = current_value.wrapping_add(value); // add it to the lane accumulator
+    current_value = current_value.rotate_left(31);
+    current_value.wrapping_mul(PRIME_1) // returned as the new lane value
+}
+
+#[inline(always)] // folds one lane value into the combined hash; all arithmetic wraps on overflow
+fn mix_one(mut hash: u64, mut value: u64) -> u64 {
+    value = value.wrapping_mul(PRIME_2); // scramble the lane value: multiply, rotate, multiply
+    value = value.rotate_left(31);
+    value = value.wrapping_mul(PRIME_1);
+    hash ^= value; // merge the scrambled lane into the hash
+    hash = hash.wrapping_mul(PRIME_1);
+    hash.wrapping_add(PRIME_4) // returned as the updated hash
+}
+
+#[cfg(test)]
+mod test { // differential tests: compare against XxHash64 from the twox-hash crate
+    use super::spark_compatible_xxhash64;
+    use rand::Rng;
+    use std::hash::Hasher;
+    use twox_hash::XxHash64;
+
+    #[test]
+    fn test_xxhash64_random() { // ten random inputs for every length in 0..128, each with a random seed
+        let mut rng = rand::thread_rng();
+        for len in 0..128 {
+            for _ in 0..10 {
+                let data: Vec<u8> = (0..len).map(|_| rng.gen()).collect();
+                let seed = rng.gen();
+                check_xxhash64(&data, seed);
+            }
+        }
+    }
+
+    fn check_xxhash64(data: &[u8], seed: u64) { // panics with a hex dump of the input on a mismatch
+        let mut hasher = XxHash64::with_seed(seed); // expected value from twox-hash
+        hasher.write(data.as_ref());
+        let hash1 = hasher.finish();
+        let hash2 = spark_compatible_xxhash64(data, seed); // value from the implementation under test
+        if hash1 != hash2 {
+            panic!("input: {} with seed {seed} produced incorrect hash (comet={hash2}, twox-hash={hash1})",
+                   data.iter().map(|byte| format!("{:02x}", byte)).collect::<String>())
+        }
+    }
+}
diff --git a/core/src/execution/datafusion/spark_hash.rs b/core/src/execution/datafusion/spark_hash.rs
@@ -21,8 +21,7 @@ use arrow::{
     compute::take,
     datatypes::{ArrowNativeTypeOp, UInt16Type, UInt32Type, UInt64Type, UInt8Type},
 };
-use std::{hash::Hasher, sync::Arc};
-use twox_hash::XxHash64;
+use std::sync::Arc;
 
 use datafusion::{
     arrow::{
@@ -35,6 +34,8 @@ use datafusion::{
     error::{DataFusionError, Result},
 };
 
+use crate::execution::datafusion::expressions::xxhash64::spark_compatible_xxhash64;
+
 #[inline]
 pub(crate) fn spark_compatible_murmur3_hash<T: AsRef<[u8]>>(data: T, seed: u32) -> u32 {
     #[inline]
@@ -104,14 +105,6 @@ pub(crate) fn spark_compatible_murmur3_hash<T: AsRef<[u8]>>(data: T, seed: u32)
     }
 }
 
-#[inline]
-pub(crate) fn spark_compatible_xxhash64<T: AsRef<[u8]>>(data: T, seed: u64) -> u64 {
-    // TODO: Rewrite with a stateless hasher to reduce stack allocation?
-    let mut hasher = XxHash64::with_seed(seed);
-    hasher.write(data.as_ref());
-    hasher.finish()
-}
-
 macro_rules! hash_array {
     ($array_type: ident, $column: ident, $hashes: ident, $hash_method: ident) => {
         let array = $column.as_any().downcast_ref::<$array_type>().unwrap();