-
Notifications
You must be signed in to change notification settings - Fork 3.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARROW-1712: [C++] Add method to BinaryBuilder to reserve space for value data #1481
Changes from 18 commits
c2f8dc4
232024e
d021c54
5b73c1c
5ebfb32
e0434e6
de318f4
b002e0b
8dd5eaa
9b5e805
5a5593e
15e045c
bbc6527
18f90fb
0b07895
d3c8202
8e4c892
5a5b70e
bc5db7d
77f8f3c
d4bbd15
360e601
707b67b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1154,6 +1154,54 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { | |
} | ||
} | ||
} | ||
|
||
TEST_F(TestBinaryBuilder, TestCapacityReserve) { | ||
vector<string> strings = {"a", "bb", "cc", "ddddd", "eeeee"}; | ||
int64_t N = static_cast<int>(strings.size()); | ||
int64_t length = 0; | ||
int64_t data_length = 0; | ||
int64_t capacity = N; | ||
|
||
ASSERT_OK(builder_->Reserve(capacity)); | ||
ASSERT_OK(builder_->ReserveData(capacity)); | ||
|
||
ASSERT_EQ(builder_->length(), length); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In googletest, the 1st parameter passed to |
||
ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(capacity)); | ||
ASSERT_EQ(builder_->value_data_length(), data_length); | ||
ASSERT_EQ(builder_->value_data_capacity(), capacity); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this not a power of 2? |
||
|
||
for(const string& str : strings) { | ||
ASSERT_OK(builder_->Append(str)); | ||
length++; | ||
data_length += static_cast<int>(str.size()); | ||
|
||
ASSERT_EQ(builder_->length(), length); | ||
ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(capacity)); | ||
ASSERT_EQ(builder_->value_data_length(), data_length); | ||
if (data_length <= capacity) { | ||
ASSERT_EQ(builder_->value_data_capacity(), capacity); | ||
} else { | ||
ASSERT_EQ(builder_->value_data_capacity(), data_length); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not clear to me why these assertions would hold true
I would think that Can you make the strings you are appending much larger, at least 10 length each? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @wesm value_data_capacity() is actually always a multiple of 64 greater than or equal to the amount of data appended so far because the underlying buffer size is set to ensure that the capacity of the buffer is a multiple of 64 bytes as defined in Layout.md, i.e. ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(data_length), builder_->value_data_capacity()) So if you call ReserveData(capacity) at the very beginning, then we have ASSERT_EQ(BitUtil::RoundUpToMultipleOf64(capacity), builder_->value_data_capacity()) |
||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add another call to ReserveData here, like |
||
|
||
int extra_capacity = 20; | ||
|
||
ASSERT_OK(builder_->Reserve(extra_capacity)); | ||
ASSERT_OK(builder_->ReserveData(extra_capacity)); | ||
|
||
ASSERT_EQ(builder_->length(), length); | ||
ASSERT_EQ(builder_->capacity(), BitUtil::NextPower2(length + extra_capacity)); | ||
ASSERT_EQ(builder_->value_data_length(), data_length); | ||
ASSERT_EQ(builder_->value_data_capacity(), data_length + extra_capacity); | ||
|
||
Done(); | ||
|
||
ASSERT_EQ(result_->length(), N); | ||
ASSERT_EQ(result_->null_count(), 0); | ||
ASSERT_EQ(result_->value_data()->size(), data_length); | ||
ASSERT_EQ(result_->value_data()->capacity(), BitUtil::RoundUpToMultipleOf64(data_length + extra_capacity)); | ||
} | ||
|
||
TEST_F(TestBinaryBuilder, TestZeroLength) { | ||
// All buffers are null | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1208,7 +1208,7 @@ ArrayBuilder* ListBuilder::value_builder() const { | |
// String and binary | ||
|
||
BinaryBuilder::BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) | ||
: ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} | ||
: ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool), data_capacity_(0) {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we need this extra member; we can just use |
||
|
||
BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} | ||
|
||
|
@@ -1222,9 +1222,21 @@ Status BinaryBuilder::Init(int64_t elements) { | |
Status BinaryBuilder::Resize(int64_t capacity) { | ||
DCHECK_LT(capacity, std::numeric_limits<int32_t>::max()); | ||
// one more then requested for offsets | ||
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t))); | ||
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); | ||
return ArrayBuilder::Resize(capacity); | ||
} | ||
|
||
Status BinaryBuilder::ReserveData(int64_t capacity) { | ||
if (value_data_length() + capacity > data_capacity_) { | ||
if (value_data_length() + capacity > std::numeric_limits<int32_t>::max()) { | ||
return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 in length for binary data"); | ||
} | ||
|
||
RETURN_NOT_OK(value_data_builder_.Resize(value_data_length() + capacity)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you rebase on master, you can use |
||
data_capacity_ = value_data_length() + capacity; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not needed |
||
} | ||
return Status::OK(); | ||
} | ||
|
||
Status BinaryBuilder::AppendNextOffset() { | ||
const int64_t num_bytes = value_data_builder_.length(); | ||
|
@@ -1241,6 +1253,9 @@ Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { | |
RETURN_NOT_OK(Reserve(1)); | ||
RETURN_NOT_OK(AppendNextOffset()); | ||
RETURN_NOT_OK(value_data_builder_.Append(value, length)); | ||
if (data_capacity_ < value_data_length()) { | ||
data_capacity_ = value_data_length(); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove this |
||
UnsafeAppendToBitmap(true); | ||
return Status::OK(); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -682,10 +682,13 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { | |
|
||
Status Init(int64_t elements) override; | ||
Status Resize(int64_t capacity) override; | ||
Status ReserveData(int64_t capacity); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a doxygen comment, since this is a new API I note that the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we also call the argument to |
||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override; | ||
|
||
/// \return size of values buffer so far | ||
int64_t value_data_length() const { return value_data_builder_.length(); } | ||
/// \return capacity of values buffer | ||
int64_t value_data_capacity() const { return data_capacity_; } | ||
|
||
/// Temporary access to a value. | ||
/// | ||
|
@@ -696,6 +699,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { | |
TypedBufferBuilder<int32_t> offsets_builder_; | ||
TypedBufferBuilder<uint8_t> value_data_builder_; | ||
|
||
int64_t data_capacity_; | ||
static constexpr int64_t kMaximumCapacity = std::numeric_limits<int32_t>::max() - 1; | ||
|
||
Status AppendNextOffset(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These reservations should be viewed as different, because the size of the offsets buffer and the data buffer grow at different rates. The way this unit test should read is:
ReserveData
with enough space for some large-ish amount of data (say 4K bytes or so)ReserveData
made sure that no additional reallocations took place)