Skip to content

Commit

Permalink
Fix Parquet Complex type handling
Browse files Browse the repository at this point in the history
  • Loading branch information
jaystarshot committed Mar 26, 2024
1 parent 5186d0b commit 121b376
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 18 deletions.
36 changes: 30 additions & 6 deletions velox/dwio/parquet/reader/ParquetReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
auto& schema = fileMetaData_->schema;
uint32_t curSchemaIdx = schemaIdx;
auto& schemaElement = schema[curSchemaIdx];
bool isRepeated = false;
bool isOptional = false;

if (schemaElement.__isset.repetition_type) {
if (schemaElement.repetition_type !=
Expand All @@ -244,6 +246,11 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
if (schemaElement.repetition_type ==
thrift::FieldRepetitionType::REPEATED) {
maxRepeat++;
isRepeated = true;
}
if (schemaElement.repetition_type ==
thrift::FieldRepetitionType::OPTIONAL) {
isOptional = true;
}
}

Expand Down Expand Up @@ -296,7 +303,9 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine);
maxDefine,
isOptional,
isRepeated);
}

// For backward-compatibility, a group annotated with MAP_KEY_VALUE
Expand All @@ -309,6 +318,9 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
VELOX_CHECK_EQ(children.size(), 1);
const auto& child = children[0];
auto grandChildren = child->getChildren();
// This level will not be repeated since parquet will have a child layer with the repeated info which we are ignoring here
// eg https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
isRepeated = true;
return std::make_shared<const ParquetTypeWithId>(
child->type(),
std::move(grandChildren),
Expand All @@ -319,7 +331,9 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat + 1,
maxDefine);
maxDefine,
isOptional,
isRepeated);
}

default:
Expand All @@ -346,7 +360,9 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine);
maxDefine,
isOptional,
isRepeated)
} else if (children.size() == 2) {
// children of MAP
auto childrenCopy = children;
Expand All @@ -361,7 +377,9 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine);
maxDefine,
isOptional,
isRepeated);
}
} else {
// Row type
Expand All @@ -376,7 +394,9 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine);
maxDefine,
isOptional,
isRepeated);
}
}
} else { // leaf node
Expand All @@ -403,6 +423,8 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
logicalType_,
maxRepeat,
maxDefine,
isOptional,
isRepeated,
precision,
scale,
type_length);
Expand All @@ -423,7 +445,9 @@ std::shared_ptr<const ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine - 1);
maxDefine - 1,
isOptional,
isRepeated);
}
return leafTypePtr;
}
Expand Down
15 changes: 7 additions & 8 deletions velox/dwio/parquet/reader/ParquetTypeWithId.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,14 @@ bool ParquetTypeWithId::hasNonRepeatedLeaf() const {
}

LevelMode ParquetTypeWithId::makeLevelInfo(LevelInfo& info) const {
int16_t repeatedAncestor = 0;
for (auto parent = parquetParent(); parent;
parent = parent->parquetParent()) {
if (parent->type()->kind() == TypeKind::ARRAY ||
parent->type()->kind() == TypeKind::MAP) {
repeatedAncestor = parent->maxDefine_;
break;
int16_t repeatedAncestor = maxDefine_;
auto node = this;
do {
if (node->isOptional_) {
repeatedAncestor--;
}
}
node = node->parquetParent();
} while (node && !node->isRepeated_);
bool isList = type()->kind() == TypeKind::ARRAY;
bool isStruct = type()->kind() == TypeKind::ROW;
bool isMap = type()->kind() == TypeKind::MAP;
Expand Down
6 changes: 6 additions & 0 deletions velox/dwio/parquet/reader/ParquetTypeWithId.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId {
std::optional<thrift::LogicalType> logicalType,
uint32_t maxRepeat,
uint32_t maxDefine,
bool isOptional,
bool isRepeated,
int32_t precision = 0,
int32_t scale = 0,
int32_t typeLength = 0)
Expand All @@ -54,6 +56,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId {
logicalType_(std::move(logicalType)),
maxRepeat_(maxRepeat),
maxDefine_(maxDefine),
isOptional_(isOptional),
isRepeated_(isRepeated),
precision_(precision),
scale_(scale),
typeLength_(typeLength) {}
Expand All @@ -79,6 +83,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId {
const std::optional<thrift::LogicalType> logicalType_;
const uint32_t maxRepeat_;
const uint32_t maxDefine_;
const bool isOptional_;
const bool isRepeated_;
const int32_t precision_;
const int32_t scale_;
const int32_t typeLength_;
Expand Down
8 changes: 4 additions & 4 deletions velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ TEST_F(ParquetTableScanTest, singleRowStruct) {
}

// Core dump and incorrect result are fixed.
TEST_F(ParquetTableScanTest, DISABLED_array) {
TEST_F(ParquetTableScanTest, array) {
auto vector = makeArrayVector<int32_t>({{1, 2, 3}});

loadData(
Expand All @@ -321,7 +321,7 @@ TEST_F(ParquetTableScanTest, DISABLED_array) {

// Optional array with required elements.
// Incorrect result.
TEST_F(ParquetTableScanTest, DISABLED_optArrayReqEle) {
TEST_F(ParquetTableScanTest, optArrayReqEle) {
auto vector = makeArrayVector<StringView>({});

loadData(
Expand All @@ -342,7 +342,7 @@ TEST_F(ParquetTableScanTest, DISABLED_optArrayReqEle) {

// Required array with required elements.
// Core dump is fixed, but the result is incorrect.
TEST_F(ParquetTableScanTest, DISABLED_reqArrayReqEle) {
TEST_F(ParquetTableScanTest, reqArrayReqEle) {
auto vector = makeArrayVector<StringView>({});

loadData(
Expand All @@ -363,7 +363,7 @@ TEST_F(ParquetTableScanTest, DISABLED_reqArrayReqEle) {

// Required array with optional elements.
// Incorrect result.
TEST_F(ParquetTableScanTest, DISABLED_reqArrayOptEle) {
TEST_F(ParquetTableScanTest, reqArrayOptEle) {
auto vector = makeArrayVector<StringView>({});

loadData(
Expand Down

0 comments on commit 121b376

Please sign in to comment.