From 591d2beda0e0c8bd41a39bb9192cee45e6ac347f Mon Sep 17 00:00:00 2001 From: Kapil Singh Date: Fri, 8 Dec 2023 12:13:48 +0530 Subject: [PATCH] Add support to read plain encoded INT96 timestamp from Parquet file (#456) --- velox/dwio/common/DirectDecoder.h | 12 +++++- velox/dwio/common/IntDecoder.h | 37 +++++++++++++++++- velox/dwio/parquet/reader/PageReader.cpp | 14 ++----- .../examples/timestamp_dict_int96.parquet | Bin 0 -> 467 bytes .../examples/timestamp_plain_int96.parquet | Bin 0 -> 429 bytes .../tests/reader/ParquetTableScanTest.cpp | 27 +++++++++++++ .../sparksql/tests/DecimalUtilTest.cpp | 2 +- velox/type/Timestamp.h | 15 +++++++ 8 files changed, 93 insertions(+), 14 deletions(-) create mode 100644 velox/dwio/parquet/tests/examples/timestamp_dict_int96.parquet create mode 100644 velox/dwio/parquet/tests/examples/timestamp_plain_int96.parquet diff --git a/velox/dwio/common/DirectDecoder.h b/velox/dwio/common/DirectDecoder.h index b966a31d70d9..286e4c967b84 100644 --- a/velox/dwio/common/DirectDecoder.h +++ b/velox/dwio/common/DirectDecoder.h @@ -98,7 +98,17 @@ class DirectDecoder : public IntDecoder { } else if constexpr (std::is_same_v< typename Visitor::DataType, int128_t>) { - toSkip = visitor.process(super::template readInt(), atEnd); + if (super::numBytes == 12 /* INT96 */) { + int128_t encoded = super::template readInt(); + int32_t days = encoded & ((1ULL << 32) - 1); + uint64_t nanos = static_cast(encoded >> 32); + + auto timestamp = Timestamp::fromDaysAndNanos(days, nanos); + toSkip = + visitor.process(*reinterpret_cast(×tamp), atEnd); + } else { + toSkip = visitor.process(super::template readInt(), atEnd); + } } else { toSkip = visitor.process(super::template readInt(), atEnd); } diff --git a/velox/dwio/common/IntDecoder.h b/velox/dwio/common/IntDecoder.h index 1a6f6f597a26..a4287b5f22cc 100644 --- a/velox/dwio/common/IntDecoder.h +++ b/velox/dwio/common/IntDecoder.h @@ -167,6 +167,9 @@ class IntDecoder { template T readInt(); + template + T readInt96(); + template T readVInt(); @@ -453,12 +456,44 @@ inline T IntDecoder::readInt() { return readLittleEndianFromBigEndian(); } else { if constexpr (std::is_same_v) { - VELOX_NYI(); + if (numBytes == 12) { + // TODO:: Do we need to handle useVInts case? + return readInt96(); + } else { + VELOX_NYI(); + } } return readLongLE(); } } +template +template +inline T IntDecoder::readInt96() { + int64_t offset = 0; + unsigned char ch; + + // read unsigned byte 64 + uint64_t part1 = 0; + for (uint32_t i = 0; i < 8; ++i) { + ch = readByte(); + part1 |= (ch & BASE_256_MASK) << offset; + offset += 8; + } + + // read signed byte 32 + int32_t part2 = 0; + offset = 0; + for (uint32_t i = 0; i < 4; ++i) { + ch = readByte(); + part2 |= (ch & BASE_256_MASK) << offset; + offset += 8; + } + + int128_t result = part1; + return (result << 32) | part2; +} + template template inline T IntDecoder::readVInt() { diff --git a/velox/dwio/parquet/reader/PageReader.cpp b/velox/dwio/parquet/reader/PageReader.cpp index 79476b1ff9b5..771ecb28c2bf 100644 --- a/velox/dwio/parquet/reader/PageReader.cpp +++ b/velox/dwio/parquet/reader/PageReader.cpp @@ -414,11 +414,7 @@ void PageReader::prepareDictionary(const PageHeader& pageHeader) { // We start from the end to allow in-place expansion. auto values = dictionary_.values->asMutable(); auto parquetValues = dictionary_.values->asMutable(); - static constexpr int64_t kJulianToUnixEpochDays = 2440588LL; - static constexpr int64_t kSecondsPerDay = 86400LL; - static constexpr int64_t kNanosPerSecond = - Timestamp::kNanosecondsInMillisecond * - Timestamp::kMillisecondsInSecond; + for (auto i = dictionary_.numValues - 1; i >= 0; --i) { // Convert the timestamp into seconds and nanos since the Unix epoch, // 00:00:00.000000 on 1 January 1970. @@ -432,12 +428,8 @@ void PageReader::prepareDictionary(const PageHeader& pageHeader) { &days, parquetValues + i * sizeof(Int96Timestamp) + sizeof(uint64_t), sizeof(int32_t)); - int64_t seconds = (days - kJulianToUnixEpochDays) * kSecondsPerDay; - if (nanos > Timestamp::kMaxNanos) { - seconds += nanos / kNanosPerSecond; - nanos -= (nanos / kNanosPerSecond) * kNanosPerSecond; - } - values[i] = Timestamp(seconds, nanos); + + values[i] = Timestamp::fromDaysAndNanos(days, nanos); } break; } diff --git a/velox/dwio/parquet/tests/examples/timestamp_dict_int96.parquet b/velox/dwio/parquet/tests/examples/timestamp_dict_int96.parquet new file mode 100644 index 0000000000000000000000000000000000000000..661cb7a285227f8ead6c89fc829fc7cc99cbf37c GIT binary patch literal 467 zcmZXRze~eF6vtnFSSmP4Uq~QBIB2m@kEUsAsX7P_;#6E!?jTn zx`=;_Aa3Gc;_RiilQ+HZd!P5d@9w$>Cobw($0mN>y}oFBQs@BY8N5I3f;{*8Wq=2N zx{48S6Ak?M{`%4E9L53`0oEN=5}KxHs%0A-4-F|%wQ5Hj>4+uQ7in-MVumVUp{$yE z){g}kx@B8-h1*!L{DNMz{_?0Ay78zO&q{??igpXYsx6#kGsr+X8E*=fAcyoyQG%tM zG0kCeN!f@=E4px{HzG;|G8VFonddW--Hrt5 zkTgpsK}Kq%*B4QklFsRj3|O3(@@vS#63kVav3Mj?gGm%I9!W@ciyB$YC7s!Pem)2D zc3!2>wex^nm5e{bG<{285JF$8Fc&d}S}`{xcux`It)0PIN(02&SWcs)@AJfF4! zUi`)1Y-7_vxv3!1Z1-ZfDtP&LQ$_b8<498`ZOzhUk_j+%8{4>JVAQnzCK&eb9%-g- zKA3sO7zO|~R>8^&z?`ENC_?WTX5LbaYgyCsj`D&lbNZ7*=c25VX>M(M)T0cn?mtB} zOR1_ftA(WHbU`yAdCVoZ#!Dif3PA>>l4Tu9;*v=s(zqgn>m|u~wx-s_6|oVnQ6+g+ zs8wF4DGyU6Qm=(e{(Q+`xmt}^U|nuf53({Timestamp(1, 0), Timestamp(2, 0)}); + auto expected = makeRowVector({"time"}, {a}); + createDuckDbTable("expected", {expected}); + + auto vector = makeArrayVector({{}}); + loadData( + getExampleFilePath("timestamp_dict_int96.parquet"), + ROW({"time"}, {TIMESTAMP()}), + makeRowVector( + {"time"}, + { + vector, + })); + assertSelect({"time"}, "SELECT time from expected"); + + loadData( + getExampleFilePath("timestamp_plain_int96.parquet"), + ROW({"time"}, {TIMESTAMP()}), + makeRowVector( + {"time"}, + { + vector, + })); + assertSelect({"time"}, "SELECT time from expected"); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); folly::init(&argc, &argv, false); diff --git a/velox/functions/sparksql/tests/DecimalUtilTest.cpp b/velox/functions/sparksql/tests/DecimalUtilTest.cpp index 34de356ae7ab..833b88605a20 100644 --- a/velox/functions/sparksql/tests/DecimalUtilTest.cpp +++ b/velox/functions/sparksql/tests/DecimalUtilTest.cpp @@ -30,7 +30,7 @@ class DecimalUtilTest : public testing::Test { R expectedResult, bool expectedOverflow) { R r; - bool overflow; + bool overflow = false; DecimalUtil::divideWithRoundUp(r, a, b, aRescale, overflow); ASSERT_EQ(overflow, expectedOverflow); ASSERT_EQ(r, expectedResult); diff --git a/velox/type/Timestamp.h b/velox/type/Timestamp.h index 5db3bcf30833..e774f9339aeb 100644 --- a/velox/type/Timestamp.h +++ b/velox/type/Timestamp.h @@ -100,6 +100,21 @@ struct Timestamp { VELOX_USER_DCHECK_LE(nanos, kMaxNanos, "Timestamp nanos out of range"); } + static Timestamp fromDaysAndNanos(int32_t days, uint64_t nanos) { + static constexpr int64_t kJulianToUnixEpochDays = 2440588LL; + static constexpr int64_t kSecondsPerDay = 86400LL; + static constexpr int64_t kNanosPerSecond = + Timestamp::kNanosecondsInMillisecond * Timestamp::kMillisecondsInSecond; + + int64_t seconds = (days - kJulianToUnixEpochDays) * kSecondsPerDay; + if (nanos > Timestamp::kMaxNanos) { + seconds += nanos / kNanosPerSecond; + nanos -= (nanos / kNanosPerSecond) * kNanosPerSecond; + } + + return Timestamp(seconds, nanos); + } + // Returns the current unix timestamp (ms precision). static Timestamp now();