Skip to content

Commit

Permalink
apacheGH-40872: [C++][Parquet] Encoding: Optimize DecodeArrow/Decode(…
Browse files Browse the repository at this point in the history
…bitmap) for PlainBooleanDecoder (apache#40876)

### Rationale for this change

This is for enhance boolean decoding. I optimized the `DecodeArrow` for PlainBoolean

### What changes are included in this PR?

Optimize DecodeArrow/Decode(bitmap) for PlainBooleanDecoder, and add benchmarks

### Are these changes tested?

Yes

### Are there any user-facing changes?

Minor optimization. And `Decode` boolean will change the syntax

* GitHub Issue: apache#40872

Lead-authored-by: mwish <maplewish117@gmail.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: mwish <maplewish117@gmail.com>
  • Loading branch information
2 people authored and vibhatha committed May 25, 2024
1 parent 16d94de commit a095016
Show file tree
Hide file tree
Showing 4 changed files with 261 additions and 61 deletions.
3 changes: 2 additions & 1 deletion cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1316,7 +1316,8 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
levels_position_ = 0;
levels_capacity_ = 0;
read_dense_for_nullable_ = read_dense_for_nullable;
uses_values_ = !(descr->physical_type() == Type::BYTE_ARRAY);
// BYTE_ARRAY values are not stored in the `values_` buffer.
uses_values_ = descr->physical_type() != Type::BYTE_ARRAY;

if (uses_values_) {
values_ = AllocateBuffer(pool);
Expand Down
79 changes: 56 additions & 23 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ namespace bit_util = arrow::bit_util;
using arrow::Status;
using arrow::VisitNullBitmapInline;
using arrow::internal::AddWithOverflow;
using arrow::internal::BitBlockCounter;
using arrow::internal::checked_cast;
using arrow::internal::MultiplyWithOverflow;
using arrow::internal::SafeSignedSubtract;
Expand Down Expand Up @@ -1173,13 +1174,15 @@ class PlainBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {

private:
std::unique_ptr<::arrow::bit_util::BitReader> bit_reader_;
int total_num_values_{0};
};

PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr)
: DecoderImpl(descr, Encoding::PLAIN) {}

void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) {
num_values_ = num_values;
DecoderImpl::SetData(num_values, data, len);
total_num_values_ = num_values;
bit_reader_ = std::make_unique<bit_util::BitReader>(data, len);
}

Expand All @@ -1188,19 +1191,52 @@ int PlainBooleanDecoder::DecodeArrow(
typename EncodingTraits<BooleanType>::Accumulator* builder) {
int values_decoded = num_values - null_count;
if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) {
// A too large `num_values` was requested.
ParquetException::EofException();
}
if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(values_decoded))) {
ParquetException::EofException();
}

PARQUET_THROW_NOT_OK(builder->Reserve(num_values));

VisitNullBitmapInline(
valid_bits, valid_bits_offset, num_values, null_count,
[&]() {
bool value;
ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value));
builder->UnsafeAppend(value);
},
[&]() { builder->UnsafeAppendNull(); });
if (null_count == 0) {
// FastPath: can copy the data directly
PARQUET_THROW_NOT_OK(builder->AppendValues(data_, values_decoded, NULLPTR,
total_num_values_ - num_values_));
} else {
// Handle nulls by BitBlockCounter
PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
BitBlockCounter bit_counter(valid_bits, valid_bits_offset, num_values);
int64_t value_position = 0;
int64_t valid_bits_offset_position = valid_bits_offset;
int64_t previous_value_offset = 0;
while (value_position < num_values) {
auto block = bit_counter.NextWord();
if (block.AllSet()) {
// GH-40978: We don't have UnsafeAppendValues for booleans currently,
// so using `AppendValues` here.
PARQUET_THROW_NOT_OK(
builder->AppendValues(data_, block.length, NULLPTR, previous_value_offset));
previous_value_offset += block.length;
} else if (block.NoneSet()) {
// GH-40978: We don't have UnsafeAppendNulls for booleans currently,
// so using `AppendNulls` here.
PARQUET_THROW_NOT_OK(builder->AppendNulls(block.length));
} else {
for (int64_t i = 0; i < block.length; ++i) {
if (bit_util::GetBit(valid_bits, valid_bits_offset_position + i)) {
bool value = bit_util::GetBit(
data_, total_num_values_ - num_values_ + previous_value_offset);
builder->UnsafeAppend(value);
previous_value_offset += 1;
} else {
builder->UnsafeAppendNull();
}
}
}
value_position += block.length;
valid_bits_offset_position += block.length;
}
}

num_values_ -= values_decoded;
return values_decoded;
Expand All @@ -1214,18 +1250,15 @@ inline int PlainBooleanDecoder::DecodeArrow(

int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) {
max_values = std::min(max_values, num_values_);
bool val;
::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
for (int i = 0; i < max_values; ++i) {
if (!bit_reader_->GetValue(1, &val)) {
ParquetException::EofException();
}
if (val) {
bit_writer.Set();
}
bit_writer.Next();
if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(max_values))) {
ParquetException::EofException();
}
bit_writer.Finish();
// Copy the data directly
// Parquet's boolean encoding is bit-packed using LSB. So
// we can directly copy the data to the buffer.
::arrow::internal::CopyBitmap(this->data_, /*offset=*/total_num_values_ - num_values_,
/*length=*/max_values, /*dest=*/buffer,
/*dest_offset=*/0);
num_values_ -= max_values;
return max_values;
}
Expand Down Expand Up @@ -1692,7 +1725,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
}

protected:
Status IndexInBounds(int32_t index) {
Status IndexInBounds(int32_t index) const {
if (ARROW_PREDICT_TRUE(0 <= index && index < dictionary_length_)) {
return Status::OK();
}
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/parquet/encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,9 @@ class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
/// \brief Decode and bit-pack values into a buffer
///
/// \param[in] buffer destination for decoded values
/// This buffer will contain bit-packed values.
/// This buffer will contain bit-packed values. If
/// max_values is not a multiple of 8, the trailing bits
/// of the last byte will be undefined.
/// \param[in] max_values max values to decode.
/// \return The number of values decoded. Should be identical to max_values except
/// at the end of the current data page.
Expand Down
Loading

0 comments on commit a095016

Please sign in to comment.