Skip to content

Commit

Permalink
PARQUET-599: Better size estimation for levels
Browse files Browse the repository at this point in the history
This is still not an optimal size estimate, but at least we will always allocate the required amount.

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes apache#96 from xhochy/parquet-599 and squashes the following commits:

e8044b5 [Uwe L. Korn] PARQUET-599: Better size estimation for levels

Change-Id: I93e2b4d2914b50cf24889bcc0c20d669a49284d6
  • Loading branch information
xhochy authored and wesm committed May 9, 2016
1 parent 862a975 commit 37e621e
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 1 deletion.
22 changes: 22 additions & 0 deletions cpp/src/parquet/column/levels.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,28 @@ class LevelEncoder {
public:
LevelEncoder() {}

// Computes how many bytes to reserve for encoding `num_buffered_values`
// levels with the given `encoding`.
//
// The per-level bit width is BitUtil::Log2(max_level + 1).
//
// Throws ParquetException when `encoding` is neither RLE nor BIT_PACKED.
static int MaxBufferSize(
    Encoding::type encoding, int16_t max_level, int num_buffered_values) {
  const int level_bits = BitUtil::Log2(max_level + 1);

  if (encoding == Encoding::RLE) {
    // TODO: Due to the way we currently check if the buffer is full enough,
    // we need to have MinBufferSize as head room.
    return RleEncoder::MaxBufferSize(level_bits, num_buffered_values) +
           RleEncoder::MinBufferSize(level_bits);
  }

  if (encoding == Encoding::BIT_PACKED) {
    // Total number of bits, converted to bytes via BitUtil::Ceil(bits, 8).
    return BitUtil::Ceil(num_buffered_values * level_bits, 8);
  }

  throw ParquetException("Unknown encoding type for levels.");
}

// Initialize the LevelEncoder.
void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
uint8_t* data, int data_size) {
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/parquet/column/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ void ColumnWriter::WriteRepetitionLevels(int64_t num_levels, int16_t* levels) {
std::shared_ptr<Buffer> ColumnWriter::RleEncodeLevels(
const std::shared_ptr<Buffer>& buffer, int16_t max_level) {
// TODO: This only works due to some RLE specifics
int64_t rle_size = 2 * num_buffered_values_ + sizeof(uint32_t);
int64_t rle_size =
LevelEncoder::MaxBufferSize(Encoding::RLE, max_level, num_buffered_values_) +
sizeof(uint32_t);
auto buffer_rle = std::make_shared<OwnedMutableBuffer>(rle_size, allocator_);
level_encoder_.Init(Encoding::RLE, max_level, num_buffered_values_,
buffer_rle->mutable_data() + sizeof(uint32_t),
Expand Down

0 comments on commit 37e621e

Please sign in to comment.