From b138d158683c47d257f650b9b574429883e4dc4b Mon Sep 17 00:00:00 2001 From: Kapil Singh Date: Fri, 8 Dec 2023 12:13:48 +0530 Subject: [PATCH] [oap ] Add support to read plain encoded INT96 timestamp from Parquet file --- velox/dwio/common/DirectDecoder.h | 12 +++++- velox/dwio/common/IntDecoder.h | 37 +++++++++++++++++- .../examples/timestamp_dict_int96.parquet | Bin 0 -> 467 bytes .../examples/timestamp_plain_int96.parquet | Bin 0 -> 429 bytes .../tests/reader/ParquetTableScanTest.cpp | 28 +++++++++++++ 5 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 velox/dwio/parquet/tests/examples/timestamp_dict_int96.parquet create mode 100644 velox/dwio/parquet/tests/examples/timestamp_plain_int96.parquet diff --git a/velox/dwio/common/DirectDecoder.h b/velox/dwio/common/DirectDecoder.h index 4cd9396d0936..fd8d62fecda3 100644 --- a/velox/dwio/common/DirectDecoder.h +++ b/velox/dwio/common/DirectDecoder.h @@ -92,7 +92,17 @@ class DirectDecoder : public IntDecoder { } else if constexpr (std::is_same_v< typename Visitor::DataType, int128_t>) { - toSkip = visitor.process(super::template readInt(), atEnd); + if (super::numBytes == 12 /* INT96 */) { + int128_t encoded = super::template readInt(); + int32_t days = encoded & ((1ULL << 32) - 1); + uint64_t nanos = static_cast(encoded >> 32); + + auto timestamp = Timestamp::fromDaysAndNanos(days, nanos); + toSkip = + visitor.process(*reinterpret_cast(×tamp), atEnd); + } else { + toSkip = visitor.process(super::template readInt(), atEnd); + } } else { toSkip = visitor.process(super::template readInt(), atEnd); } diff --git a/velox/dwio/common/IntDecoder.h b/velox/dwio/common/IntDecoder.h index e5e37b429663..6aded93677aa 100644 --- a/velox/dwio/common/IntDecoder.h +++ b/velox/dwio/common/IntDecoder.h @@ -154,6 +154,9 @@ class IntDecoder { template T readInt(); + template + T readInt96(); + template T readVInt(); @@ -438,12 +441,44 @@ inline T IntDecoder::readInt() { return readLittleEndianFromBigEndian(); } else { if constexpr (std::is_same_v) { - VELOX_NYI(); + if (numBytes == 12) { + // TODO:: Do we need to handle useVInts case? + return readInt96(); + } else { + VELOX_NYI(); + } } return readLongLE(); } } +template +template +inline T IntDecoder::readInt96() { + int64_t offset = 0; + unsigned char ch; + + // read unsigned byte 64 + uint64_t part1 = 0; + for (uint32_t i = 0; i < 8; ++i) { + ch = readByte(); + part1 |= (ch & BASE_256_MASK) << offset; + offset += 8; + } + + // read signed byte 32 + int32_t part2 = 0; + offset = 0; + for (uint32_t i = 0; i < 4; ++i) { + ch = readByte(); + part2 |= (ch & BASE_256_MASK) << offset; + offset += 8; + } + + int128_t result = part1; + return (result << 32) | part2; +} + template template inline T IntDecoder::readVInt() { diff --git a/velox/dwio/parquet/tests/examples/timestamp_dict_int96.parquet b/velox/dwio/parquet/tests/examples/timestamp_dict_int96.parquet new file mode 100644 index 0000000000000000000000000000000000000000..661cb7a285227f8ead6c89fc829fc7cc99cbf37c GIT binary patch literal 467 zcmZXRze~eF6vtnFSSmP4Uq~QBIB2m@kEUsAsX7P_;#6E!?jTn zx`=;_Aa3Gc;_RiilQ+HZd!P5d@9w$>Cobw($0mN>y}oFBQs@BY8N5I3f;{*8Wq=2N zx{48S6Ak?M{`%4E9L53`0oEN=5}KxHs%0A-4-F|%wQ5Hj>4+uQ7in-MVumVUp{$yE z){g}kx@B8-h1*!L{DNMz{_?0Ay78zO&q{??igpXYsx6#kGsr+X8E*=fAcyoyQG%tM zG0kCeN!f@=E4px{HzG;|G8VFonddW--Hrt5 zkTgpsK}Kq%*B4QklFsRj3|O3(@@vS#63kVav3Mj?gGm%I9!W@ciyB$YC7s!Pem)2D zc3!2>wex^nm5e{bG<{285JF$8Fc&d}S}`{xcux`It)0PIN(02&SWcs)@AJfF4! zUi`)1Y-7_vxv3!1Z1-ZfDtP&LQ$_b8<498`ZOzhUk_j+%8{4>JVAQnzCK&eb9%-g- zKA3sO7zO|~R>8^&z?`ENC_?WTX5LbaYgyCsj`D&lbNZ7*=c25VX>M(M)T0cn?mtB} zOR1_ftA(WHbU`yAdCVoZ#!Dif3PA>>l4Tu9;*v=s(zqgn>m|u~wx-s_6|oVnQ6+g+ zs8wF4DGyU6Qm=(e{(Q+`xmt}^U|nuf53({Timestamp(1, 0), Timestamp(2, 0)}); + auto expected = makeRowVector({"time"}, {a}); + createDuckDbTable("expected", {expected}); + + auto vector = makeArrayVector({{}}); + loadData( + getExampleFilePath("timestamp_dict_int96.parquet"), + ROW({"time"}, {TIMESTAMP()}), + makeRowVector( + {"time"}, + { + vector, + })); + assertSelect({"time"}, "SELECT time from expected"); + + loadData( + getExampleFilePath("timestamp_plain_int96.parquet"), + ROW({"time"}, {TIMESTAMP()}), + makeRowVector( + {"time"}, + { + vector, + })); + assertSelect({"time"}, "SELECT time from expected"); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); folly::Init init{&argc, &argv, false};