Skip to content

Commit

Permalink
Add support for decimal and timestamp inputs to hash and xxhash Spark…
Browse files Browse the repository at this point in the history
… functions (facebookincubator#5457)

Summary:
Spark implementation: https://github.com/apache/spark/blob/branch-3.3/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L538

Pull Request resolved: facebookincubator#5457

Reviewed By: kgpai

Differential Revision: D48064156

Pulled By: mbasmanova

fbshipit-source-id: a0e3af64b11f714b22bf1c655524673b66c31337
  • Loading branch information
jinchengchenghh authored and facebook-github-bot committed Aug 4, 2023
1 parent 4780a2e commit 5f28e65
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 4 deletions.
24 changes: 24 additions & 0 deletions velox/functions/sparksql/Hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ void applyWithType(
CASE(VARBINARY, hash.hashBytes, StringView);
CASE(REAL, hash.hashFloat, float);
CASE(DOUBLE, hash.hashDouble, double);
CASE(HUGEINT, hash.hashLongDecimal, int128_t);
CASE(TIMESTAMP, hash.hashTimestamp, Timestamp);
#undef CASE
default:
VELOX_NYI(
Expand Down Expand Up @@ -139,6 +141,17 @@ class Murmur3Hash final {
return fmix(h1, input.size());
}

// Hashes a 128-bit (long) decimal: the value is first serialized into a
// byte array via DecimalUtil::toByteArray, then those bytes are hashed
// with hashBytes, mirroring Spark's decimal hashing.
uint32_t hashLongDecimal(int128_t input, uint32_t seed) {
  char bytes[sizeof(int128_t)];
  int32_t size;
  DecimalUtil::toByteArray(input, bytes, size);
  return hashBytes(StringView(bytes, size), seed);
}

// Hashes a timestamp as its microseconds-since-epoch value using the
// 64-bit integer hash, matching Spark's timestamp hashing.
uint32_t hashTimestamp(Timestamp input, uint32_t seed) {
  const auto micros = input.toMicros();
  return hashInt64(micros, seed);
}

private:
uint32_t mixK1(uint32_t k1) {
k1 *= 0xcc9e2d51;
Expand Down Expand Up @@ -245,6 +258,17 @@ class XxHash64 final {
return fmix(hash);
}

// Hashes a 128-bit (long) decimal: the value is first serialized into a
// byte array via DecimalUtil::toByteArray, then those bytes are hashed
// with hashBytes, mirroring Spark's decimal hashing.
int64_t hashLongDecimal(int128_t input, uint32_t seed) {
  char bytes[sizeof(int128_t)];
  int32_t size;
  DecimalUtil::toByteArray(input, bytes, size);
  return hashBytes(StringView(bytes, size), seed);
}

// Hashes a timestamp as its microseconds-since-epoch value using the
// 64-bit integer hash, matching Spark's timestamp hashing.
int64_t hashTimestamp(Timestamp input, uint32_t seed) {
  const auto micros = input.toMicros();
  return hashInt64(micros, seed);
}

private:
uint64_t fmix(uint64_t hash) {
hash ^= hash >> 33;
Expand Down
9 changes: 5 additions & 4 deletions velox/functions/sparksql/Hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ namespace facebook::velox::functions::sparksql {
// - Integer types (tinyint, smallint, integer, bigint)
// - Varchar, varbinary
// - Real, double
//
// TODO:
// - Decimal
// - Date
// - Timestamp
//
// TODO:
// - Row, Array: hash the elements in order
// - Map: iterate over map, hashing key then value. Since map ordering is
// unspecified, hashing logically equivalent maps may result in
Expand All @@ -51,10 +51,11 @@ std::shared_ptr<exec::VectorFunction> makeHashWithSeed(
// - Integer types (byte, short, int, long)
// - String, Binary
// - Float, Double
// - Decimal
// - Date
// - Timestamp
//
// Unsupported:
// - Decimal
// - Datetime
// - Structs, Arrays: hash the elements in order
// - Maps: iterate over map, hashing key then value. Since map ordering is
// unspecified, hashing logically equivalent maps may result in
Expand Down
15 changes: 15 additions & 0 deletions velox/functions/sparksql/tests/HashTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,21 @@ TEST_F(HashTest, String) {
EXPECT_EQ(hash<std::string>(std::nullopt), 42);
}

// Long-decimal inputs. Expected values are golden constants; presumably
// generated from Spark's hash() of equivalent decimals (see the referenced
// Spark implementation) — verify against Spark if they ever change.
TEST_F(HashTest, longDecimal) {
  // Null input yields 42.
  EXPECT_EQ(hash<int128_t>(std::nullopt), 42);
  // Zero and the long-decimal range boundaries.
  EXPECT_EQ(hash<int128_t>(0), -783713497);
  EXPECT_EQ(hash<int128_t>(DecimalUtil::kLongDecimalMin), 1400911110);
  EXPECT_EQ(hash<int128_t>(DecimalUtil::kLongDecimalMax), -817514053);
  // A positive value and its negation hash differently.
  EXPECT_EQ(hash<int128_t>(12345678), -277285195);
  EXPECT_EQ(hash<int128_t>(-12345678), -1198355617);
}

// Reference value produced with Spark CLI:
//   select timestamp_micros(12345678)             -- builds the Timestamp
//   select hash(Timestamp("1970-01-01 00:00:12.345678"))  -- expected hash
TEST_F(HashTest, Timestamp) {
  const auto ts = Timestamp::fromMicros(12345678);
  EXPECT_EQ(hash<Timestamp>(ts), 1402875301);
}

TEST_F(HashTest, Int64) {
EXPECT_EQ(hash<int64_t>(0xcafecafedeadbeef), -256235155);
EXPECT_EQ(hash<int64_t>(0xdeadbeefcafecafe), 673261790);
Expand Down
19 changes: 19 additions & 0 deletions velox/functions/sparksql/tests/XxHash64Test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,25 @@ TEST_F(XxHash64Test, varchar) {
EXPECT_EQ(xxhash64<std::string>(std::nullopt), 42);
}

// Long-decimal inputs. Expected values are golden constants; presumably
// generated from Spark's xxhash64() of equivalent decimals (see the
// referenced Spark implementation) — verify against Spark if they change.
TEST_F(XxHash64Test, longDecimal) {
  // Null input yields 42.
  EXPECT_EQ(xxhash64<int128_t>(std::nullopt), 42);
  // Zero and the long-decimal range boundaries.
  EXPECT_EQ(xxhash64<int128_t>(0), -8959994473701255385);
  EXPECT_EQ(
      xxhash64<int128_t>(DecimalUtil::kLongDecimalMin), -2254039905620870768);
  EXPECT_EQ(
      xxhash64<int128_t>(DecimalUtil::kLongDecimalMax), -47190729175993179);
  // A positive value and its negation hash differently.
  EXPECT_EQ(xxhash64<int128_t>(12345678), 4541350547708072824);
  EXPECT_EQ(xxhash64<int128_t>(-12345678), -7692719129258511951);
}

// Reference value produced with Spark CLI:
//   select timestamp_micros(12345678)                 -- builds the Timestamp
//   select xxhash64(Timestamp("1970-01-01 00:00:12.345678"))  -- expected hash
TEST_F(XxHash64Test, Timestamp) {
  const auto ts = Timestamp::fromMicros(12345678);
  EXPECT_EQ(xxhash64<Timestamp>(ts), 782671362992292307);
}

TEST_F(XxHash64Test, int64) {
EXPECT_EQ(xxhash64<int64_t>(0xcafecafedeadbeef), -6259772178006417012);
EXPECT_EQ(xxhash64<int64_t>(0xdeadbeefcafecafe), -1700188678616701932);
Expand Down

0 comments on commit 5f28e65

Please sign in to comment.