Skip to content

Commit

Permalink
[oap ] Add support to read plain encoded INT96 timestamp from Parquet…
Browse files Browse the repository at this point in the history
… file
  • Loading branch information
mskapilks authored and binwei committed Jul 25, 2024
1 parent 50fb533 commit 4d4f5d2
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 2 deletions.
12 changes: 11 additions & 1 deletion velox/dwio/common/DirectDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,17 @@ class DirectDecoder : public IntDecoder<isSigned> {
} else if constexpr (std::is_same_v<
typename Visitor::DataType,
int128_t>) {
toSkip = visitor.process(super::template readInt<int128_t>(), atEnd);
if (super::numBytes == 12 /* INT96 */) {
int128_t encoded = super::template readInt<int128_t>();
int32_t days = encoded & ((1ULL << 32) - 1);
uint64_t nanos = static_cast<uint64_t>(encoded >> 32);

auto timestamp = Timestamp::fromDaysAndNanos(days, nanos);
toSkip =
visitor.process(*reinterpret_cast<int128_t*>(&timestamp), atEnd);
} else {
toSkip = visitor.process(super::template readInt<int128_t>(), atEnd);
}
} else {
toSkip = visitor.process(super::template readInt<int64_t>(), atEnd);
}
Expand Down
37 changes: 36 additions & 1 deletion velox/dwio/common/IntDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ class IntDecoder {
template <typename T>
T readInt();

/// Reads 12 bytes through readByte() and packs them into a single integer:
/// the first 8 bytes (assembled least-significant byte first) are placed
/// above the low 32 bits, and the low 32 bits hold the last 4 bytes (also
/// assembled least-significant byte first). Called by readInt<int128_t>()
/// when numBytes == 12.
template <typename T>
T readInt96();

template <typename T>
T readVInt();

Expand Down Expand Up @@ -438,12 +441,44 @@ inline T IntDecoder<isSigned>::readInt() {
return readLittleEndianFromBigEndian<T>();
} else {
if constexpr (std::is_same_v<T, int128_t>) {
VELOX_NYI();
if (numBytes == 12) {
// TODO:: Do we need to handle useVInts case?
return readInt96<T>();
} else {
VELOX_NYI();
}
}
return readLongLE();
}
}

/// Reads a 12-byte INT96 value one byte at a time via readByte() and packs it
/// into a 128-bit integer.
///
/// The first 8 bytes are assembled least-significant byte first into an
/// unsigned 64-bit value; the following 4 bytes are assembled the same way
/// into a 32-bit value. The returned integer carries the 64-bit value in bits
/// [32, 96) and the 32-bit value in bits [0, 32), so a caller can recover them
/// with `value >> 32` and `value & 0xFFFFFFFF` respectively.
///
/// @return The packed value, converted to T.
template <bool isSigned>
template <typename T>
inline T IntDecoder<isSigned>::readInt96() {
  // First 8 bytes: unsigned 64-bit part, least-significant byte first.
  uint64_t upper = 0;
  for (uint32_t shift = 0; shift < 64; shift += 8) {
    const unsigned char ch = readByte();
    // Widen before shifting so the shift is always done in 64 bits.
    upper |= static_cast<uint64_t>(ch & BASE_256_MASK) << shift;
  }

  // Last 4 bytes: 32-bit part, least-significant byte first. It is kept
  // unsigned on purpose: OR-ing a negative int32_t into the 128-bit result
  // would sign-extend it and overwrite every bit of 'upper' with ones.
  uint32_t lower = 0;
  for (uint32_t shift = 0; shift < 32; shift += 8) {
    const unsigned char ch = readByte();
    lower |= static_cast<uint32_t>(ch & BASE_256_MASK) << shift;
  }

  const int128_t packed = upper;
  return (packed << 32) | lower;
}

template <bool isSigned>
template <typename T>
inline T IntDecoder<isSigned>::readVInt() {
Expand Down
Binary file not shown.
Binary file not shown.
27 changes: 27 additions & 0 deletions velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,33 @@ TEST_F(ParquetTableScanTest, timestampPrecisionMicrosecond) {
>>>>>>> Support struct column reading with different schemas (5962)
}

// Loads the dictionary-encoded and the plain-encoded INT96 example files in
// turn and, after each load, compares the "time" column against the DuckDB
// table "expected", which holds Timestamp(1, 0) and Timestamp(2, 0).
TEST_F(ParquetTableScanTest, timestampINT96) {
  auto expectedTimes =
      makeFlatVector<Timestamp>({Timestamp(1, 0), Timestamp(2, 0)});
  auto expectedRows = makeRowVector({"time"}, {expectedTimes});
  createDuckDbTable("expected", {expectedRows});

  auto timeColumn = makeArrayVector<Timestamp>({{}});
  for (const char* fileName :
       {"timestamp_dict_int96.parquet", "timestamp_plain_int96.parquet"}) {
    loadData(
        getExampleFilePath(fileName),
        ROW({"time"}, {TIMESTAMP()}),
        makeRowVector({"time"}, {timeColumn}));
    assertSelect({"time"}, "SELECT time from expected");
  }
}

int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
folly::Init init{&argc, &argv, false};
Expand Down

0 comments on commit 4d4f5d2

Please sign in to comment.