Skip to content

Commit

Permalink
apacheGH-43382: [C++][Parquet] min-max Statistics doesn't work well w…
Browse files Browse the repository at this point in the history
…hen one of min-max is truncated (apache#43383)

### Rationale for this change

See apache#43382

### What changes are included in this PR?

Change stats has min-max from min || max to &&

### Are these changes tested?

* [x] TODO

### Are there any user-facing changes?

Might affect interface using HasMinMax

**This PR includes breaking changes to public APIs.**

* GitHub Issue: apache#43382

Authored-by: mwish <maplewish117@gmail.com>
Signed-off-by: mwish <maplewish117@gmail.com>
  • Loading branch information
mapleFU authored Aug 5, 2024
1 parent bae2908 commit 66cb749
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 3 deletions.
23 changes: 23 additions & 0 deletions cpp/src/parquet/arrow/arrow_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,27 @@ INSTANTIATE_TEST_SUITE_P(
/*expected_min=*/"z",
/*expected_max=*/"z"}));

TEST(StatisticsTest, TruncateOnlyHalfMinMax) {
// GH-43382: Tests when we only have min or max, the `HasMinMax` should be false.
std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
auto out_stream = std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
auto schema = ::arrow::schema({::arrow::field("a", ::arrow::utf8())});
::parquet::WriterProperties::Builder properties_builder;
properties_builder.max_statistics_size(2);
ASSERT_OK_AND_ASSIGN(
std::unique_ptr<FileWriter> writer,
FileWriter::Open(*schema, default_memory_pool(), out_stream,
properties_builder.build(), default_arrow_writer_properties()));
auto table = Table::Make(schema, {ArrayFromJSON(::arrow::utf8(), R"(["a", "abc"])")});
ASSERT_OK(writer->WriteTable(*table, std::numeric_limits<int64_t>::max()));
ASSERT_OK(writer->Close());
ASSERT_OK(out_stream->Close());

auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(serialized_data);
auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
std::shared_ptr<Statistics> stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
ASSERT_FALSE(stats->HasMinMax());
}

} // namespace parquet::arrow
4 changes: 2 additions & 2 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
descr, metadata.statistics.min_value, metadata.statistics.max_value,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value,
metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value,
metadata.statistics.__isset.null_count,
metadata.statistics.__isset.distinct_count);
}
Expand All @@ -106,7 +106,7 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
descr, metadata.statistics.min, metadata.statistics.max,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max || metadata.statistics.__isset.min,
metadata.statistics.__isset.max && metadata.statistics.__isset.min,
metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count);
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ class PARQUET_EXPORT Statistics {
/// \brief The number of non-null values in the column
virtual int64_t num_values() const = 0;

/// \brief Return true if the min and max statistics are set. Obtain
/// \brief Return true if both min and max statistics are set. Obtain
/// with TypedStatistics<T>::min and max
virtual bool HasMinMax() const = 0;

Expand Down
25 changes: 25 additions & 0 deletions cpp/src/parquet/statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1602,5 +1602,30 @@ TEST(TestEncodedStatistics, CopySafe) {
EXPECT_EQ("abc", encoded_statistics.max());
}

TEST(TestEncodedStatistics, ApplyStatSizeLimits) {
EncodedStatistics encoded_statistics;
encoded_statistics.set_min("a");
encoded_statistics.has_min = true;

encoded_statistics.set_max("abc");
encoded_statistics.has_max = true;

encoded_statistics.ApplyStatSizeLimits(2);

ASSERT_TRUE(encoded_statistics.has_min);
ASSERT_EQ("a", encoded_statistics.min());
ASSERT_FALSE(encoded_statistics.has_max);

NodePtr node =
PrimitiveNode::Make("StringColumn", Repetition::REQUIRED, Type::BYTE_ARRAY);
ColumnDescriptor descr(node, 0, 0);
std::shared_ptr<TypedStatistics<::parquet::ByteArrayType>> statistics =
std::dynamic_pointer_cast<TypedStatistics<::parquet::ByteArrayType>>(
Statistics::Make(&descr, &encoded_statistics,
/*num_values=*/1000));
// GH-43382: HasMinMax should be false if one of min/max is not set.
EXPECT_FALSE(statistics->HasMinMax());
}

} // namespace test
} // namespace parquet

0 comments on commit 66cb749

Please sign in to comment.