Skip to content

Commit

Permalink
Random union array generation respects type ids in union type
Browse files Browse the repository at this point in the history
  • Loading branch information
zanmato1984 committed Dec 21, 2023
1 parent edc38a3 commit 7299d99
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 29 deletions.
8 changes: 6 additions & 2 deletions cpp/src/arrow/compare_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,9 @@ static void ArrayRangeEqualsSparseUnion(benchmark::State& state) {
auto values1 = rng.Int32(args.size, 0, 100, args.null_proportion);
auto values2 =
rng.String(args.size, /*min_length=*/0, /*max_length=*/15, args.null_proportion);
auto array = rng.SparseUnion({values1, values2}, args.size);
auto type = sparse_union({field("a", int32()), field("b", utf8())});
auto array = rng.SparseUnion(internal::checked_cast<const UnionType&>(*type),
{values1, values2}, args.size);

BenchmarkArrayRangeEquals(array, state);
}
Expand All @@ -146,7 +148,9 @@ static void ArrayRangeEqualsDenseUnion(benchmark::State& state) {
auto values1 = rng.Int32(args.size, 0, 100, args.null_proportion);
auto values2 =
rng.String(args.size, /*min_length=*/0, /*max_length=*/15, args.null_proportion);
auto array = rng.DenseUnion({values1, values2}, args.size);
auto type = dense_union({field("a", int32()), field("b", utf8())});
auto array = rng.DenseUnion(internal::checked_cast<const UnionType&>(*type),
{values1, values2}, args.size);

BenchmarkArrayRangeEquals(array, state);
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2440,8 +2440,8 @@ TEST(TestCaseWhen, UnionBoolString) {

TEST(TestCaseWhen, UnionBoolStringRandom) {
for (const auto& type : std::vector<std::shared_ptr<DataType>>{
sparse_union({field("a", boolean()), field("b", utf8())}/*, {2, 7}*/),
dense_union({field("a", boolean()), field("b", utf8())}/*, {2, 7}*/)}) {
sparse_union({field("a", boolean()), field("b", utf8())}, {2, 7}),
dense_union({field("a", boolean()), field("b", utf8())}, {2, 7})}) {
ARROW_SCOPED_TRACE(type->ToString());
TestCaseWhenRandom(type);
}
Expand Down
76 changes: 53 additions & 23 deletions cpp/src/arrow/testing/random.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,25 @@ static std::shared_ptr<NumericArray<ArrowType>> GenerateNumericArray(
return std::make_shared<NumericArray<ArrowType>>(array_data);
}

template <typename TypeCodeIndexType>
static std::shared_ptr<Array> GenerateUnionTypeIds(
const std::vector<int8_t>& type_codes, const TypeCodeIndexType* type_code_indices,
size_t size, int64_t alignment, MemoryPool* memory_pool) {
using TypeCode = UnionArray::type_code_t;
BufferVector buffers{2};

buffers[0] = nullptr;
buffers[1] = *AllocateBuffer(sizeof(TypeCode) * size, alignment, memory_pool);
auto type_code_data = reinterpret_cast<TypeCode*>(buffers[1]->mutable_data());
for (size_t i = 0; i < size; ++i) {
type_code_data[i] = type_codes[type_code_indices[i]];
}

auto array_data =
ArrayData::Make(arrow::int8(), size, std::move(buffers), /*null_count*/ 0);
return std::make_shared<Int8Array>(array_data);
}

#define PRIMITIVE_RAND_IMPL(Name, CType, ArrowType, Distribution) \
std::shared_ptr<Array> RandomArrayGenerator::Name( \
int64_t size, CType min, CType max, double probability, int64_t alignment, \
Expand Down Expand Up @@ -657,45 +676,54 @@ std::shared_ptr<Array> RandomArrayGenerator::RunEndEncoded(
return RunEndEncodedArray::Make(logical_size, run_ends, values).ValueOrDie();
}

std::shared_ptr<Array> RandomArrayGenerator::SparseUnion(const ArrayVector& fields,
int64_t size, int64_t alignment,
std::shared_ptr<Array> RandomArrayGenerator::SparseUnion(const UnionType& type,
ArrayVector fields, int64_t size,
int64_t alignment,
MemoryPool* memory_pool) {
DCHECK_GT(fields.size(), 0);
// Trivial type codes map
std::vector<UnionArray::type_code_t> type_codes(fields.size());
std::iota(type_codes.begin(), type_codes.end(), 0);
const auto& type_codes = type.type_codes();
DCHECK_EQ(type_codes.size(), fields.size());

// Generate array of type id indices within type_codes
auto type_id_indices = Int8(size, 0, static_cast<int8_t>(type_codes.size() - 1),
/*null_probability=*/0, alignment, memory_pool);
// Generate array of type ids
auto type_ids = Int8(size, 0, static_cast<int8_t>(fields.size() - 1),
/*null_probability=*/0, alignment, memory_pool);
return *SparseUnionArray::Make(*type_ids, fields, type_codes);
auto type_ids =
GenerateUnionTypeIds(type_codes, type_id_indices->data()->GetValues<int8_t>(1),
size, alignment, memory_pool);

return *SparseUnionArray::Make(*type_ids, std::move(fields), std::move(type_codes));
}

std::shared_ptr<Array> RandomArrayGenerator::DenseUnion(const ArrayVector& fields,
int64_t size, int64_t alignment,
std::shared_ptr<Array> RandomArrayGenerator::DenseUnion(const UnionType& type,
ArrayVector fields, int64_t size,
int64_t alignment,
MemoryPool* memory_pool) {
DCHECK_GT(fields.size(), 0);
// Trivial type codes map
std::vector<UnionArray::type_code_t> type_codes(fields.size());
std::iota(type_codes.begin(), type_codes.end(), 0);
const auto& type_codes = type.type_codes();
DCHECK_EQ(type_codes.size(), fields.size());

// Generate array of type id indices within type_codes
auto type_id_indices = Int8(size, 0, static_cast<int8_t>(type_codes.size() - 1),
/*null_probability=*/0, alignment, memory_pool);
// Generate array of type ids
auto type_ids = Int8(size, 0, static_cast<int8_t>(fields.size() - 1),
/*null_probability=*/0, alignment, memory_pool);
auto type_ids =
GenerateUnionTypeIds(type_codes, type_id_indices->data()->GetValues<int8_t>(1),
size, alignment, memory_pool);

// Generate array of offsets
const auto& concrete_ids = checked_cast<const Int8Array&>(*type_ids);
const auto& logical_type_ids = checked_cast<const Int8Array&>(*type_ids);
const auto& type_id_mapping = type.child_ids();
Int32Builder offsets_builder(memory_pool, alignment);
ABORT_NOT_OK(offsets_builder.Reserve(size));
std::vector<int32_t> last_offsets(fields.size(), 0);
for (int64_t i = 0; i < size; ++i) {
const auto field_id = concrete_ids.Value(i);
const auto field_id = type_id_mapping[logical_type_ids.Value(i)];
offsets_builder.UnsafeAppend(last_offsets[field_id]++);
}
std::shared_ptr<Array> offsets;
ABORT_NOT_OK(offsets_builder.Finish(&offsets));

return *DenseUnionArray::Make(*type_ids, *offsets, fields, type_codes);
return *DenseUnionArray::Make(*type_ids, *offsets, std::move(fields),
std::move(type_codes));
}

namespace {
Expand Down Expand Up @@ -939,9 +967,11 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t
const auto& child_field = field.type()->field(i);
child_arrays[i] = ArrayOf(*child_field, length, alignment, memory_pool);
}
auto array = field.type()->id() == Type::type::SPARSE_UNION
? SparseUnion(child_arrays, length, alignment, memory_pool)
: DenseUnion(child_arrays, length, alignment, memory_pool);
auto union_type = internal::checked_pointer_cast<UnionType>(field.type());
auto array =
field.type()->id() == Type::type::SPARSE_UNION
? SparseUnion(*union_type, child_arrays, length, alignment, memory_pool)
: DenseUnion(*union_type, child_arrays, length, alignment, memory_pool);
return *array->View(field.type());
}

Expand Down
6 changes: 4 additions & 2 deletions cpp/src/arrow/testing/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,8 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size The size of the generated sparse union array
/// \param[in] alignment alignment for memory allocations (in bytes)
/// \param[in] memory_pool memory pool to allocate memory from
std::shared_ptr<Array> SparseUnion(const ArrayVector& fields, int64_t size,
std::shared_ptr<Array> SparseUnion(const UnionType& type, ArrayVector fields,
int64_t size,
int64_t alignment = kDefaultBufferAlignment,
MemoryPool* memory_pool = default_memory_pool());

Expand All @@ -489,7 +490,8 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size The size of the generated dense union array
/// \param[in] alignment alignment for memory allocations (in bytes)
/// \param[in] memory_pool memory pool to allocate memory from
std::shared_ptr<Array> DenseUnion(const ArrayVector& fields, int64_t size,
std::shared_ptr<Array> DenseUnion(const UnionType& type, ArrayVector fields,
int64_t size,
int64_t alignment = kDefaultBufferAlignment,
MemoryPool* memory_pool = default_memory_pool());

Expand Down

0 comments on commit 7299d99

Please sign in to comment.