Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-13222: [C++] Improve type support for case_when #10806

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8a5545f
ARROW-13222: [C++] Improve type support for case_when kernel
lidavidm Jul 21, 2021
2202b74
ARROW-13222: [C++] Add list type support for case_when kernel
lidavidm Jul 21, 2021
737347a
ARROW-13222: [C++] Add more benchmarks for case_when kernel
lidavidm Jul 22, 2021
6ffb272
ARROW-13222: [C++] Add basic implementations for map, fixed size list
lidavidm Jul 22, 2021
0d42d40
ARROW-13222: [C++] Much faster, but less generic case_when for nested…
lidavidm Jul 23, 2021
3ec727a
ARROW-13222: [C++] Expand type support
lidavidm Jul 26, 2021
d6b491c
ARROW-13222: [C++] Expand type support further
lidavidm Jul 26, 2021
a7a3b45
ARROW-13222: [C++] Fix typos
lidavidm Jul 26, 2021
d6a50ea
ARROW-13222: [C++] Fix types
lidavidm Jul 26, 2021
1f47c6d
ARROW-13222: [C++] casts for msyc
lidavidm Jul 26, 2021
a4336f8
ARROW-13222: [C++] Explicit cast to avoid accidental overflow
lidavidm Jul 27, 2021
2bde163
ARROW-13222: [C++] Add support for unions
lidavidm Aug 3, 2021
5307c5a
ARROW-13222: [C++] Trim unnecessary cases
lidavidm Aug 3, 2021
72811bb
ARROW-13222: [C++] Refactor GetValueAppender into builder methods
lidavidm Aug 18, 2021
d9cb23e
ARROW-13222: [C++] Untemplate common case
lidavidm Aug 18, 2021
c834b5c
ARROW-13222: [C++] Address feedback
lidavidm Aug 18, 2021
2b6dd26
ARROW-13222: [C++] Add tests for AppendArraySliceUnchecked
lidavidm Aug 19, 2021
7390452
ARROW-13222: [C++] Fix lint/MSVC warnings
lidavidm Aug 19, 2021
bd95fc8
ARROW-13222: [C++] Optimize BaseBinaryBuilder::AppendArraySliceUnchecked
lidavidm Aug 19, 2021
bf01084
ARROW-13222: [C++] Optimize StructBuilder::AppendArraySliceUnchecked
lidavidm Aug 19, 2021
b3c12a1
ARROW-13222: [C++] Reorganize union builders
lidavidm Aug 19, 2021
0324fd1
ARROW-13222: [C++] Don't over-allocate
lidavidm Aug 19, 2021
0ac03c1
Revert "ARROW-13222: [C++] Optimize BaseBinaryBuilder::AppendArraySli…
lidavidm Aug 19, 2021
e5581e6
ARROW-13222: [C++] Rename to AppendArraySlice
lidavidm Aug 19, 2021
ee1a6b0
ARROW-13222: [C++] Add forgotten RETURN_NOT_OK
lidavidm Aug 19, 2021
ef62b84
ARROW-13222: [C++] Add another forgotten RETURN_NOT_OK
lidavidm Aug 19, 2021
b52dab8
ARROW-13222: [C++] Fix qualifiers for MSVC2015
lidavidm Aug 19, 2021
814b2f3
ARROW-13222: [C++] Add cast for MSVC
lidavidm Aug 19, 2021
2f67f7f
ARROW-13222: [C++] Try to avoid apparently miscompilation under MinGW…
lidavidm Aug 20, 2021
4cc6387
ARROW-13222: [C++] Try to fix MinGW again
lidavidm Aug 23, 2021
f032791
ARROW-13222: [C++] Update compute.rst
lidavidm Aug 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 108 additions & 7 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,7 @@ TEST_F(TestArray, TestMakeArrayOfNullUnion) {
const int64_t union_length = 10;
auto s_union_ty = sparse_union({field("a", utf8()), field("b", int32())}, {0, 1});
ASSERT_OK_AND_ASSIGN(auto s_union_nulls, MakeArrayOfNull(s_union_ty, union_length));
ASSERT_OK(s_union_nulls->ValidateFull());
ASSERT_EQ(s_union_nulls->null_count(), 0);
{
const auto& typed_union = checked_cast<const SparseUnionArray&>(*s_union_nulls);
Expand All @@ -388,8 +389,23 @@ TEST_F(TestArray, TestMakeArrayOfNullUnion) {
}
}

s_union_ty = sparse_union({field("a", utf8()), field("b", int32())}, {2, 7});
ASSERT_OK_AND_ASSIGN(s_union_nulls, MakeArrayOfNull(s_union_ty, union_length));
ASSERT_OK(s_union_nulls->ValidateFull());
ASSERT_EQ(s_union_nulls->null_count(), 0);
{
const auto& typed_union = checked_cast<const SparseUnionArray&>(*s_union_nulls);
ASSERT_EQ(typed_union.field(0)->null_count(), union_length);

// Check type codes are all 2
for (int i = 0; i < union_length; ++i) {
ASSERT_EQ(typed_union.raw_type_codes()[i], 2);
}
}

auto d_union_ty = dense_union({field("a", utf8()), field("b", int32())}, {0, 1});
ASSERT_OK_AND_ASSIGN(auto d_union_nulls, MakeArrayOfNull(d_union_ty, union_length));
ASSERT_OK(d_union_nulls->ValidateFull());
ASSERT_EQ(d_union_nulls->null_count(), 0);
{
const auto& typed_union = checked_cast<const DenseUnionArray&>(*d_union_nulls);
Expand Down Expand Up @@ -484,23 +500,30 @@ static ScalarVector GetScalars() {
const auto dense_union_ty = ::arrow::dense_union(union_fields, union_type_codes);

return {
std::make_shared<BooleanScalar>(false), std::make_shared<Int8Scalar>(3),
std::make_shared<UInt16Scalar>(3), std::make_shared<Int32Scalar>(3),
std::make_shared<UInt64Scalar>(3), std::make_shared<DoubleScalar>(3.0),
std::make_shared<Date32Scalar>(10), std::make_shared<Date64Scalar>(11),
std::make_shared<BooleanScalar>(false),
std::make_shared<Int8Scalar>(3),
std::make_shared<UInt16Scalar>(3),
std::make_shared<Int32Scalar>(3),
std::make_shared<UInt64Scalar>(3),
std::make_shared<DoubleScalar>(3.0),
std::make_shared<Date32Scalar>(10),
std::make_shared<Date64Scalar>(11),
std::make_shared<Time32Scalar>(1000, time32(TimeUnit::SECOND)),
std::make_shared<Time64Scalar>(1111, time64(TimeUnit::MICRO)),
std::make_shared<TimestampScalar>(1111, timestamp(TimeUnit::MILLI)),
std::make_shared<MonthIntervalScalar>(1),
std::make_shared<DayTimeIntervalScalar>(daytime),
std::make_shared<DurationScalar>(60, duration(TimeUnit::SECOND)),
std::make_shared<BinaryScalar>(hello), std::make_shared<LargeBinaryScalar>(hello),
std::make_shared<BinaryScalar>(hello),
std::make_shared<LargeBinaryScalar>(hello),
std::make_shared<FixedSizeBinaryScalar>(
hello, fixed_size_binary(static_cast<int32_t>(hello->size()))),
std::make_shared<Decimal128Scalar>(Decimal128(10), decimal(16, 4)),
std::make_shared<Decimal256Scalar>(Decimal256(10), decimal(76, 38)),
std::make_shared<StringScalar>(hello), std::make_shared<LargeStringScalar>(hello),
std::make_shared<StringScalar>(hello),
std::make_shared<LargeStringScalar>(hello),
std::make_shared<ListScalar>(ArrayFromJSON(int8(), "[1, 2, 3]")),
ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"),
std::make_shared<LargeListScalar>(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")),
std::make_shared<FixedSizeListScalar>(ArrayFromJSON(int8(), "[1, 2, 3, 4]")),
std::make_shared<StructScalar>(
Expand All @@ -517,7 +540,12 @@ static ScalarVector GetScalars() {
std::make_shared<DenseUnionScalar>(std::make_shared<Int32Scalar>(101), 6,
dense_union_ty),
std::make_shared<DenseUnionScalar>(std::make_shared<Int32Scalar>(101), 42,
dense_union_ty)};
dense_union_ty),
DictionaryScalar::Make(ScalarFromJSON(int8(), "1"),
ArrayFromJSON(utf8(), R"(["foo", "bar"])")),
DictionaryScalar::Make(ScalarFromJSON(uint8(), "1"),
ArrayFromJSON(utf8(), R"(["foo", "bar"])")),
};
}

TEST_F(TestArray, TestMakeArrayFromScalar) {
Expand All @@ -544,6 +572,8 @@ TEST_F(TestArray, TestMakeArrayFromScalar) {
}

for (auto scalar : scalars) {
// TODO(ARROW-13197): appending dictionary scalars not implemented
if (is_dictionary(scalar->type->id())) continue;
AssertAppendScalar(pool_, scalar);
}
}
Expand Down Expand Up @@ -598,6 +628,77 @@ TEST_F(TestArray, TestMakeArrayFromMapScalar) {
AssertAppendScalar(pool_, std::make_shared<MapScalar>(scalar));
}

TEST_F(TestArray, TestAppendArraySlice) {
auto scalars = GetScalars();
for (const auto& scalar : scalars) {
// TODO(ARROW-13573): appending dictionary arrays not implemented
if (is_dictionary(scalar->type->id())) continue;

ARROW_SCOPED_TRACE(*scalar->type);
ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, 16));
ASSERT_OK_AND_ASSIGN(auto nulls, MakeArrayOfNull(scalar->type, 16));

std::unique_ptr<arrow::ArrayBuilder> builder;
ASSERT_OK(MakeBuilder(pool_, scalar->type, &builder));

ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 4));
ASSERT_EQ(4, builder->length());
ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 0));
ASSERT_EQ(4, builder->length());
ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 0));
ASSERT_EQ(4, builder->length());
ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 4));
ASSERT_EQ(8, builder->length());

ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 4));
ASSERT_EQ(12, builder->length());
if (!is_union(scalar->type->id())) {
ASSERT_EQ(4, builder->null_count());
}
ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 0));
ASSERT_EQ(12, builder->length());
if (!is_union(scalar->type->id())) {
ASSERT_EQ(4, builder->null_count());
}
ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 0));
ASSERT_EQ(12, builder->length());
if (!is_union(scalar->type->id())) {
ASSERT_EQ(4, builder->null_count());
}
ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 4));
ASSERT_EQ(16, builder->length());
if (!is_union(scalar->type->id())) {
ASSERT_EQ(8, builder->null_count());
}

std::shared_ptr<Array> result;
ASSERT_OK(builder->Finish(&result));
ASSERT_OK(result->ValidateFull());
ASSERT_EQ(16, result->length());
if (!is_union(scalar->type->id())) {
ASSERT_EQ(8, result->null_count());
}
}

{
ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(null(), 16));
NullBuilder builder(pool_);
ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 4));
ASSERT_EQ(4, builder.length());
ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 0));
ASSERT_EQ(4, builder.length());
ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 0));
ASSERT_EQ(4, builder.length());
ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 4));
ASSERT_EQ(8, builder.length());
std::shared_ptr<Array> result;
ASSERT_OK(builder.Finish(&result));
ASSERT_OK(result->ValidateFull());
ASSERT_EQ(8, result->length());
ASSERT_EQ(8, result->null_count());
}
}

TEST_F(TestArray, ValidateBuffersPrimitive) {
auto empty_buffer = std::make_shared<Buffer>("");
auto null_buffer = Buffer::FromString("\xff");
Expand Down
19 changes: 19 additions & 0 deletions cpp/src/arrow/array/builder_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,14 @@ class ARROW_EXPORT ArrayBuilder {
Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
Status AppendScalars(const ScalarVector& scalars);

/// \brief Append a range of values from an array.
///
/// The given array must be the same type as the builder.
virtual Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) {
return Status::NotImplemented("AppendArraySlice for builder for ", *type());
}

/// For cases where raw data was memcpy'd into the internal buffers, allows us
/// to advance the length of the builder. It is your responsibility to use
/// this function responsibly.
Expand Down Expand Up @@ -189,6 +197,17 @@ class ARROW_EXPORT ArrayBuilder {
null_count_ = null_bitmap_builder_.false_count();
}

// Vector append. Copy from a given bitmap. If bitmap is null assume
// all of length bits are valid.
void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
if (bitmap == NULLPTR) {
return UnsafeSetNotNull(length);
}
null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
length_ += length;
null_count_ = null_bitmap_builder_.false_count();
}

// Append the same validity value a given number of times.
void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
if (value) {
Expand Down
8 changes: 8 additions & 0 deletions cpp/src/arrow/array/builder_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,14 @@ Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length,
return byte_builder_.Append(data, length * byte_width_);
}

Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length,
const uint8_t* validity,
int64_t bitmap_offset) {
RETURN_NOT_OK(Reserve(length));
UnsafeAppendToBitmap(validity, bitmap_offset, length);
return byte_builder_.Append(data, length * byte_width_);
}

Status FixedSizeBinaryBuilder::AppendNull() {
RETURN_NOT_OK(Reserve(1));
UnsafeAppendNull();
Expand Down
27 changes: 27 additions & 0 deletions cpp/src/arrow/array/builder_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,23 @@ class BaseBinaryBuilder : public ArrayBuilder {
return Status::OK();
}

Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
auto bitmap = array.GetValues<uint8_t>(0, 0);
auto offsets = array.GetValues<offset_type>(1);
auto data = array.GetValues<uint8_t>(2, 0);
for (int64_t i = 0; i < length; i++) {
if (!bitmap || BitUtil::GetBit(bitmap, array.offset + offset + i)) {
const offset_type start = offsets[offset + i];
const offset_type end = offsets[offset + i + 1];
ARROW_RETURN_NOT_OK(Append(data + start, end - start));
} else {
ARROW_RETURN_NOT_OK(AppendNull());
}
}
return Status::OK();
}

void Reset() override {
ArrayBuilder::Reset();
offsets_builder_.Reset();
Expand Down Expand Up @@ -486,12 +503,22 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
Status AppendValues(const uint8_t* data, int64_t length,
const uint8_t* valid_bytes = NULLPTR);

Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
int64_t bitmap_offset);

Status AppendNull() final;
Status AppendNulls(int64_t length) final;

Status AppendEmptyValue() final;
Status AppendEmptyValues(int64_t length) final;

Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
return AppendValues(
array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
array.GetValues<uint8_t>(0, 0), array.offset + offset);
}

void UnsafeAppend(const uint8_t* value) {
UnsafeAppendToBitmap(true);
if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
Expand Down
62 changes: 62 additions & 0 deletions cpp/src/arrow/array/builder_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,23 @@ class BaseListBuilder : public ArrayBuilder {
return Status::OK();
}

Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
const offset_type* offsets = array.GetValues<offset_type>(1);
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || BitUtil::GetBit(validity, array.offset + row)) {
ARROW_RETURN_NOT_OK(Append());
int64_t slot_length = offsets[row + 1] - offsets[row];
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0],
offsets[row], slot_length));
} else {
ARROW_RETURN_NOT_OK(AppendNull());
}
}
return Status::OK();
}

Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_RETURN_NOT_OK(AppendNextOffset());

Expand Down Expand Up @@ -275,6 +292,25 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {

Status AppendEmptyValues(int64_t length) final;

Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
const int32_t* offsets = array.GetValues<int32_t>(1);
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || BitUtil::GetBit(validity, array.offset + row)) {
ARROW_RETURN_NOT_OK(Append());
const int64_t slot_length = offsets[row + 1] - offsets[row];
ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(
*array.child_data[0]->child_data[0], offsets[row], slot_length));
ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(
*array.child_data[0]->child_data[1], offsets[row], slot_length));
} else {
ARROW_RETURN_NOT_OK(AppendNull());
}
}
return Status::OK();
}

/// \brief Get builder to append keys.
///
/// Append a key with this builder should be followed by appending
Expand Down Expand Up @@ -374,6 +410,20 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {

Status AppendEmptyValues(int64_t length) final;

Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
if (!validity || BitUtil::GetBit(validity, array.offset + row)) {
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
*array.child_data[0], list_size_ * (array.offset + row), list_size_));
ARROW_RETURN_NOT_OK(Append());
} else {
ARROW_RETURN_NOT_OK(AppendNull());
}
}
return Status::OK();
}

ArrayBuilder* value_builder() const { return value_builder_.get(); }

std::shared_ptr<DataType> type() const override {
Expand Down Expand Up @@ -467,6 +517,18 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder {
return Status::OK();
}

Status AppendArraySlice(const ArrayData& array, int64_t offset,
int64_t length) override {
for (int i = 0; static_cast<size_t>(i) < children_.size(); i++) {
ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i],
array.offset + offset, length));
}
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
ARROW_RETURN_NOT_OK(Reserve(length));
UnsafeAppendToBitmap(validity, array.offset + offset, length);
return Status::OK();
}

void Reset() override;

ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
Expand Down
8 changes: 8 additions & 0 deletions cpp/src/arrow/array/builder_primitive.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@ Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
return Status::OK();
}

Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
const uint8_t* validity, int64_t offset) {
RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(values, offset, length);
ArrayBuilder::UnsafeAppendToBitmap(validity, offset, length);
return Status::OK();
}

Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
const std::vector<bool>& is_valid) {
RETURN_NOT_OK(Reserve(length));
Expand Down
Loading