Skip to content

Commit

Permalink
apacheGH-40872: [C++][Parquet] Encoding: Optimize DecodeArrow/Decode(…
Browse files Browse the repository at this point in the history
…bitmap) for PlainBooleanDecoder (apache#40876)

### Rationale for this change

This change enhances boolean decoding by optimizing `DecodeArrow` for `PlainBooleanDecoder`.

### What changes are included in this PR?

Optimize DecodeArrow/Decode(bitmap) for PlainBooleanDecoder, and add benchmarks

### Are these changes tested?

Yes

### Are there any user-facing changes?

Minor optimization. The syntax of the boolean `Decode` also changes.

* GitHub Issue: apache#40872

Lead-authored-by: mwish <maplewish117@gmail.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: mwish <maplewish117@gmail.com>
  • Loading branch information
2 people authored and rok committed May 8, 2024
1 parent f98cd58 commit 05ae1e1
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 10 deletions.
11 changes: 5 additions & 6 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1192,12 +1192,10 @@ int PlainBooleanDecoder::DecodeArrow(
int values_decoded = num_values - null_count;
if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) {
// A too large `num_values` was requested.
ParquetException::EofException(
"A too large `num_values` was requested in PlainBooleanDecoder: remain " +
std::to_string(num_values_) + ", requested: " + std::to_string(values_decoded));
ParquetException::EofException();
}
if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(values_decoded))) {
ParquetException::EofException("PlainDecoder doesn't have enough values in page");
ParquetException::EofException();
}

if (null_count == 0) {
Expand All @@ -1210,7 +1208,7 @@ int PlainBooleanDecoder::DecodeArrow(
BitBlockCounter bit_counter(valid_bits, valid_bits_offset, num_values);
int64_t value_position = 0;
int64_t valid_bits_offset_position = valid_bits_offset;
int64_t previous_value_offset = total_num_values_ - num_values_;
int64_t previous_value_offset = 0;
while (value_position < num_values) {
auto block = bit_counter.NextWord();
if (block.AllSet()) {
Expand All @@ -1226,7 +1224,8 @@ int PlainBooleanDecoder::DecodeArrow(
} else {
for (int64_t i = 0; i < block.length; ++i) {
if (bit_util::GetBit(valid_bits, valid_bits_offset_position + i)) {
bool value = bit_util::GetBit(data_, previous_value_offset);
bool value = bit_util::GetBit(
data_, total_num_values_ - num_values_ + previous_value_offset);
builder->UnsafeAppend(value);
previous_value_offset += 1;
} else {
Expand Down
9 changes: 5 additions & 4 deletions cpp/src/parquet/encoding_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1518,10 +1518,11 @@ BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull)
(benchmark::State& state) { DecodeArrowNonNullDenseBenchmark(state); }
BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowNonNull)
->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull)
(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); }
BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull)
->Apply(BooleanWithNullCustomArguments);
// TODO(mwish): RleBoolean does not implement DecodeArrow with null slots yet.
// BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull)
//(benchmark::State& state) { DecodeArrowWithNullDenseBenchmark(state); }
// BENCHMARK_REGISTER_F(BM_DecodeArrowBooleanRle, DecodeArrowWithNull)
// ->Apply(BooleanWithNullCustomArguments);

BENCHMARK_DEFINE_F(BM_DecodeArrowBooleanPlain, DecodeArrow)
(benchmark::State& state) { DecodeArrowDenseBenchmark(state); }
Expand Down

0 comments on commit 05ae1e1

Please sign in to comment.