diff --git a/cpp/src/parquet/column/levels.h b/cpp/src/parquet/column/levels.h
index fd84ec9d21c11..20261df37c48d 100644
--- a/cpp/src/parquet/column/levels.h
+++ b/cpp/src/parquet/column/levels.h
@@ -31,6 +31,35 @@ class LevelEncoder {
  public:
   LevelEncoder() {}
 
+  // Returns the number of bytes to reserve for a level buffer, given the
+  // encoding, max_level and num_buffered_values.
+  //
+  // The per-level bit width is BitUtil::Log2(max_level + 1). For RLE the result
+  // is RleEncoder::MaxBufferSize plus RleEncoder::MinBufferSize of head room;
+  // for BIT_PACKED it is num_buffered_values * bit_width bits rounded up to
+  // whole bytes. Throws ParquetException for any other encoding.
+  static int MaxBufferSize(
+      Encoding::type encoding, int16_t max_level, int num_buffered_values) {
+    int bit_width = BitUtil::Log2(max_level + 1);
+    int num_bytes = 0;
+    switch (encoding) {
+      case Encoding::RLE: {
+        // TODO: Due to the way we currently check if the buffer is full enough,
+        // we need to have MinBufferSize as head room.
+        num_bytes = RleEncoder::MaxBufferSize(bit_width, num_buffered_values) +
+                    RleEncoder::MinBufferSize(bit_width);
+        break;
+      }
+      case Encoding::BIT_PACKED: {
+        num_bytes = BitUtil::Ceil(num_buffered_values * bit_width, 8);
+        break;
+      }
+      default:
+        throw ParquetException("Unknown encoding type for levels.");
+    }
+    return num_bytes;
+  }
+
   // Initialize the LevelEncoder.
   void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
       uint8_t* data, int data_size) {
diff --git a/cpp/src/parquet/column/writer.cc b/cpp/src/parquet/column/writer.cc
index 0472adb3400db..a23610c34b8d0 100644
--- a/cpp/src/parquet/column/writer.cc
+++ b/cpp/src/parquet/column/writer.cc
@@ -62,7 +62,9 @@ void ColumnWriter::WriteRepetitionLevels(int64_t num_levels, int16_t* levels) {
 std::shared_ptr ColumnWriter::RleEncodeLevels(
     const std::shared_ptr& buffer, int16_t max_level) {
   // TODO: This only works with due to some RLE specifics
-  int64_t rle_size = 2 * num_buffered_values_ + sizeof(uint32_t);
+  int64_t rle_size =
+      LevelEncoder::MaxBufferSize(Encoding::RLE, max_level, num_buffered_values_) +
+      sizeof(uint32_t);
   auto buffer_rle = std::make_shared(rle_size, allocator_);
   level_encoder_.Init(Encoding::RLE, max_level, num_buffered_values_,
       buffer_rle->mutable_data() + sizeof(uint32_t),