From 7aae8e1a4759162b135473d299a6e4ac679dd9d1 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 8 Apr 2024 22:15:05 +0800 Subject: [PATCH] GH-41032: [C++][Parquet] Bugfixes and more tests in boolean arrow decoding (#41037) ### Rationale for this change Previous patches ( https://github.com/apache/arrow/pull/40876 , https://github.com/apache/arrow/pull/40995 ) introduce some bugs in boolean decoding. This patch add tests for DecodeArrow and fix the bug ### What changes are included in this PR? 1. Add tests for DecodeArrow with nulls and without nulls 2. Fix 2 bugs ### Are these changes tested? Yes ### Are there any user-facing changes? no * GitHub Issue: #41032 Lead-authored-by: mwish Co-authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 15 ++-- cpp/src/parquet/encoding_test.cc | 144 +++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 6e93b493392c9..3da5c64ace5dd 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1192,10 +1192,12 @@ int PlainBooleanDecoder::DecodeArrow( int values_decoded = num_values - null_count; if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) { // A too large `num_values` was requested. - ParquetException::EofException(); + ParquetException::EofException( + "A too large `num_values` was requested in PlainBooleanDecoder: remain " + + std::to_string(num_values_) + ", requested: " + std::to_string(values_decoded)); } if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(values_decoded))) { - ParquetException::EofException(); + ParquetException::EofException("PlainDecoder doesn't have enough values in page"); } if (null_count == 0) { @@ -1208,7 +1210,7 @@ int PlainBooleanDecoder::DecodeArrow( BitBlockCounter bit_counter(valid_bits, valid_bits_offset, num_values); int64_t value_position = 0; int64_t valid_bits_offset_position = valid_bits_offset; - int64_t previous_value_offset = 0; + int64_t previous_value_offset = total_num_values_ - num_values_; while (value_position < num_values) { auto block = bit_counter.NextWord(); if (block.AllSet()) { @@ -1224,8 +1226,7 @@ int PlainBooleanDecoder::DecodeArrow( } else { for (int64_t i = 0; i < block.length; ++i) { if (bit_util::GetBit(valid_bits, valid_bits_offset_position + i)) { - bool value = bit_util::GetBit( - data_, total_num_values_ - num_values_ + previous_value_offset); + bool value = bit_util::GetBit(data_, previous_value_offset); builder->UnsafeAppend(value); previous_value_offset += 1; } else { @@ -3177,7 +3178,9 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { PARQUET_THROW_NOT_OK( out->AppendValues(values.begin(), values.begin() + current_batch_size)); num_values -= current_batch_size; - current_index_in_batch = 0; + // set current_index_in_batch to current_batch_size means + // the whole batch is totally consumed. + current_index_in_batch = current_batch_size; } while (num_values > 0); return num_non_null_values; } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index bb5126ce251d4..b91fcb0839cba 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -635,6 +635,150 @@ TEST(BooleanArrayEncoding, AdHocRoundTrip) { } } +class TestBooleanArrowDecoding : public ::testing::Test { + public: + // number of values including nulls + constexpr static int kNumValues = 10000; + + void SetUp() override { + null_probabilities_ = {0.0, 0.001, 0.5, 0.999, 1.0}; + read_batch_sizes_ = {1024, 4096, 10000}; + true_probabilities_ = {0.0, 0.001, 0.5, 0.999, 1.0}; + } + void TearDown() override {} + + void InitTestCase(Encoding::type encoding, double null_probability, + double true_probability) { + GenerateInputData(null_probability, true_probability); + SetupEncoderDecoder(encoding); + } + + void GenerateInputData(double null_probability, double true_probability) { + ::arrow::random::RandomArrayGenerator rag(0); + expected_dense_ = rag.Boolean(kNumValues, true_probability, null_probability); + null_count_ = static_cast(expected_dense_->null_count()); + valid_bits_ = expected_dense_->null_bitmap_data(); + + // Initialize input_data_ for the encoder from the expected_array_ values + const auto& boolean_array = + checked_cast(*expected_dense_); + input_data_.resize(boolean_array.length()); + + for (int64_t i = 0; i < boolean_array.length(); ++i) { + input_data_[i] = boolean_array.Value(i); + } + } + + // Setup encoder/decoder pair for testing with boolean encoding + void SetupEncoderDecoder(Encoding::type encoding) { + encoder_ = MakeTypedEncoder(encoding); + decoder_ = MakeTypedDecoder(encoding); + const auto* data_ptr = reinterpret_cast(input_data_.data()); + if (valid_bits_ != nullptr) { + ASSERT_NO_THROW(encoder_->PutSpaced(data_ptr, kNumValues, valid_bits_, 0)); + } else { + ASSERT_NO_THROW(encoder_->Put(data_ptr, kNumValues)); + } + buffer_ = encoder_->FlushValues(); + ResetTheDecoder(); + } + + void ResetTheDecoder() { + ASSERT_NE(nullptr, buffer_); + decoder_->SetData(kNumValues, buffer_->data(), static_cast(buffer_->size())); + } + + void CheckDense(int actual_num_values, const ::arrow::Array& chunk) { + ASSERT_EQ(actual_num_values, kNumValues - null_count_); + ASSERT_ARRAYS_EQUAL(chunk, *expected_dense_); + } + + void CheckDecodeArrow(Encoding::type encoding) { + for (double np : null_probabilities_) { + for (double true_prob : true_probabilities_) { + InitTestCase(encoding, np, true_prob); + for (int read_batch_size : this->read_batch_sizes_) { + ResetTheDecoder(); + + int num_values_left = kNumValues; + ::arrow::BooleanBuilder acc; + int actual_num_not_null_values = 0; + while (num_values_left > 0) { + int batch_size = std::min(num_values_left, read_batch_size); + ASSERT_NE(0, batch_size); + // Counting nulls + int64_t batch_null_count = 0; + if (null_count_ != 0) { + batch_null_count = + batch_size - ::arrow::internal::CountSetBits( + valid_bits_, kNumValues - num_values_left, batch_size); + } + int batch_num_values = + decoder_->DecodeArrow(batch_size, static_cast(batch_null_count), + valid_bits_, kNumValues - num_values_left, &acc); + actual_num_not_null_values += batch_num_values; + num_values_left -= batch_size; + } + std::shared_ptr<::arrow::Array> chunk; + ASSERT_OK(acc.Finish(&chunk)); + CheckDense(actual_num_not_null_values, *chunk); + } + } + } + } + + void CheckDecodeArrowNonNull(Encoding::type encoding) { + // NonNull skips tests for null_prob != 0. + for (auto true_prob : true_probabilities_) { + InitTestCase(encoding, /*null_probability=*/0, true_prob); + for (int read_batch_size : this->read_batch_sizes_) { + // Resume the decoder + ResetTheDecoder(); + ::arrow::BooleanBuilder acc; + int actual_num_values = 0; + int num_values_left = kNumValues; + while (num_values_left > 0) { + int batch_size = std::min(num_values_left, read_batch_size); + int batch_num_values = decoder_->DecodeArrowNonNull(batch_size, &acc); + actual_num_values += batch_num_values; + num_values_left -= batch_num_values; + } + std::shared_ptr<::arrow::Array> chunk; + ASSERT_OK(acc.Finish(&chunk)); + CheckDense(actual_num_values, *chunk); + } + } + } + + protected: + std::vector null_probabilities_; + std::vector true_probabilities_; + std::vector read_batch_sizes_; + std::shared_ptr<::arrow::Array> expected_dense_; + int null_count_{0}; + std::vector input_data_; + const uint8_t* valid_bits_ = NULLPTR; + std::unique_ptr encoder_; + std::unique_ptr decoder_; + std::shared_ptr buffer_; +}; + +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowUsingPlain) { + this->CheckDecodeArrow(Encoding::PLAIN); +} + +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowNonNullPlain) { + this->CheckDecodeArrowNonNull(Encoding::PLAIN); +} + +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowRle) { + this->CheckDecodeArrow(Encoding::RLE); +} + +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowNonNullRle) { + this->CheckDecodeArrowNonNull(Encoding::RLE); +} + template void GetDictDecoder(DictEncoder* encoder, int64_t num_values, std::shared_ptr* out_values,