From 01d536b62e8ca802cab42b62df47b8e907a29716 Mon Sep 17 00:00:00 2001 From: yingsu00 Date: Sun, 26 May 2024 22:55:40 -0700 Subject: [PATCH] Fix Parquet V2 page reading when max def/rep levels are 0 The existing code did not skip the definition and repetition level arrays if the maxDefine_ and maxRepeat_ are both zero but these arrays were actually more than 0 bytes. This commit fixes this problem and always uses pageHeader.data_page_header_v2.definition_levels_byte_length and pageHeader.data_page_header_v2.repetition_levels_byte_length as the array lengths because these values are guranteed to exist. --- velox/dwio/parquet/reader/PageReader.cpp | 11 ++++---- .../parquet/tests/examples/v2_page.parquet | Bin 0 -> 166 bytes .../tests/reader/ParquetReaderTest.cpp | 25 ++++++++++++++++++ 3 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 velox/dwio/parquet/tests/examples/v2_page.parquet diff --git a/velox/dwio/parquet/reader/PageReader.cpp b/velox/dwio/parquet/reader/PageReader.cpp index 142f1017287f..4153ad49edac 100644 --- a/velox/dwio/parquet/reader/PageReader.cpp +++ b/velox/dwio/parquet/reader/PageReader.cpp @@ -261,12 +261,11 @@ void PageReader::prepareDataPageV2(const PageHeader& pageHeader, int64_t row) { return; } - uint32_t defineLength = maxDefine_ > 0 - ? pageHeader.data_page_header_v2.definition_levels_byte_length - : 0; - uint32_t repeatLength = maxRepeat_ > 0 - ? pageHeader.data_page_header_v2.repetition_levels_byte_length - : 0; + uint32_t defineLength = + pageHeader.data_page_header_v2.definition_levels_byte_length; + uint32_t repeatLength = + pageHeader.data_page_header_v2.repetition_levels_byte_length; + auto bytes = pageHeader.compressed_page_size; pageData_ = readBytes(bytes, pageBuffer_); diff --git a/velox/dwio/parquet/tests/examples/v2_page.parquet b/velox/dwio/parquet/tests/examples/v2_page.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4275539bc1cdd00853209e9bddebdb9b1f2aa14b GIT binary patch literal 166 zcmWG=3^EjD6O|H`iV@`!WdLF>Q5I1q1_owk`36Q7Rt6xGfkBi>Qpba}Brz!`6(}dl zqRJ$}S(KWdnV*-PS{cg3AjTyrBPpZCAS22mDas{@Bq<7%6LS%BQRA?Y;9von1q2ct a5SoEOj6nm7C{Pxthl_!s0>}ygx)1;@UlX+e literal 0 HcmV?d00001 diff --git a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp index 5488c0c7d3d9..1a535a148325 100644 --- a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp @@ -1156,3 +1156,28 @@ TEST_F(ParquetReaderTest, readVarbinaryFromFLBA) { result->as()->childAt(0)->asFlatVector()->valueAt( 0)); } + +TEST_F(ParquetReaderTest, testV2PageWithZeroMaxDefRep) { + // enum_type.parquet contains 1 column (ENUM) with 3 rows. + const std::string sample(getExampleFilePath("v2_page.parquet")); + + facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto reader = createReader(sample, readerOptions); + EXPECT_EQ(reader->numberOfRows(), 5ULL); + + auto rowType = reader->typeWithId(); + EXPECT_EQ(rowType->type()->kind(), TypeKind::ROW); + EXPECT_EQ(rowType->size(), 1ULL); + + EXPECT_EQ(rowType->childAt(0)->type()->kind(), TypeKind::BIGINT); + + auto outputRowType = ROW({"regionkey"}, {BIGINT()}); + auto rowReaderOpts = getReaderOpts(outputRowType); + rowReaderOpts.setScanSpec(makeScanSpec(outputRowType)); + auto rowReader = reader->createRowReader(rowReaderOpts); + + auto expected = makeRowVector({makeFlatVector({0, 1, 2, 3, 4})}); + + assertReadWithReaderAndExpected( + outputRowType, *rowReader, expected, *leafPool_); +}