From 2b4969f853a84993fc52a65ba6106a34d5e400f4 Mon Sep 17 00:00:00 2001
From: Chendi Xue
Date: Fri, 13 Dec 2019 15:59:13 +0800
Subject: [PATCH] [C++] Change to use sort + merge

sort_array_to_indices performance is acceptable for now

Signed-off-by: Chendi Xue
---
Review notes (below the "---" separator, so not part of the commit message):
- This copy of the patch lost every "<...>" span (template arguments,
  #include targets). Tokens that the visible code does not pin down are left
  exactly as received; restore them from upstream before applying. The blob
  ids on the "index" lines were not regenerated either.
- merge() now uses std::inplace_merge. The previous std::set_union call
  emitted only max(m, n) copies of a value present in both halves, dropping
  indices and leaving stale entries at the tail of the buffer.
- merge() no longer dereferences the end iterator when no arrays are given.
- typed_arrays_ is cleared on every call because array_id restarts at 0.
- NOTE(review): both sorts compare GetView() with operator< and no longer
  consult compare_ - confirm that dropping the Comparator is intended.
- NOTE(review): the test inputs share no value between arrays, so they do not
  exercise the duplicate-handling fix; consider adding such a case.

 .../compute/kernels/sort_arrays_to_indices.cc | 63 +++++++++++++++++--
 .../kernels/sort_arrays_to_indices_test.cc    |  3 +
 2 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/sort_arrays_to_indices.cc b/cpp/src/arrow/compute/kernels/sort_arrays_to_indices.cc
index 13b9ba1066940..2e11dddeead62 100644
--- a/cpp/src/arrow/compute/kernels/sort_arrays_to_indices.cc
+++ b/cpp/src/arrow/compute/kernels/sort_arrays_to_indices.cc
@@ -19,6 +19,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -101,6 +102,41 @@ class SortArraysToIndicesKernelImpl : public SortArraysToIndicesKernel {
 
  private:
   Comparator compare_;
+  std::vector> typed_arrays_;
+
+  // Half-open [first, last) range of index entries inside the indices buffer.
+  using IndexRange = std::pair<ArrayItemIndex*, ArrayItemIndex*>;
+
+  // Merges the already-sorted, back-to-back index ranges described by
+  // [arrays_valid_range_begin, arrays_valid_range_end) into one sorted range,
+  // in place, and returns that range. Entries are ordered by the value they
+  // reference in typed_arrays_; entries with equal values keep their relative
+  // order and none are dropped. An empty input yields {nullptr, nullptr}.
+  IndexRange merge(std::vector<IndexRange>::iterator arrays_valid_range_begin,
+                   std::vector<IndexRange>::iterator arrays_valid_range_end) {
+    const auto size = arrays_valid_range_end - arrays_valid_range_begin;
+    if (size == 0) {
+      // Nothing to merge; do not dereference the end iterator.
+      return IndexRange(nullptr, nullptr);
+    }
+    if (size == 1) {
+      return *arrays_valid_range_begin;
+    }
+    const auto arrays_valid_range_middle = arrays_valid_range_begin + size / 2;
+    const IndexRange left = merge(arrays_valid_range_begin, arrays_valid_range_middle);
+    const IndexRange right = merge(arrays_valid_range_middle, arrays_valid_range_end);
+
+    // std::inplace_merge needs the two runs to be adjacent in memory.
+    assert(left.second == right.first);
+    // std::set_union would collapse values present in both runs and leave
+    // stale entries at the tail; a merge keeps every index.
+    std::inplace_merge(left.first, left.second, right.second,
+                       [this](const ArrayItemIndex& lhs, const ArrayItemIndex& rhs) {
+                         return typed_arrays_[lhs.array_id]->GetView(lhs.id) <
+                                typed_arrays_[rhs.array_id]->GetView(rhs.id);
+                       });
+    return IndexRange(left.first, right.second);
+  }
 
   Status SortArraysToIndicesImpl(FunctionContext* ctx,
                                  std::vector> values,
@@ -119,11 +155,19 @@ class SortArraysToIndicesKernelImpl : public SortArraysToIndicesKernel {
     ArrayItemIndex* indices_begin =
         reinterpret_cast(indices_buf->mutable_data());
     ArrayItemIndex* indices_end = indices_begin + items_total;
-    std::vector> typed_arrays;
+    std::vector<IndexRange> arrays_valid_range;
+    // typed_arrays_ is a member: drop arrays left over from an earlier call so
+    // that array_id (which restarts at 0 below) indexes this call's inputs.
+    typed_arrays_.clear();
+
     int64_t array_id = 0;
     int64_t null_count_total = 0;
     int64_t indices_i = 0;
+
     for (auto array : values) {
+      auto typed_array = std::dynamic_pointer_cast(array);
+      typed_arrays_.push_back(typed_array);
+      auto array_begin = indices_begin + indices_i;
       for (int64_t i = 0; i < array->length(); i++) {
         if (!array->IsNull(i)) {
           (indices_begin + indices_i)->array_id = array_id;
@@ -135,14 +179,19 @@ class SortArraysToIndicesKernelImpl : public SortArraysToIndicesKernel {
           null_count_total++;
         }
       }
-      typed_arrays.push_back(std::dynamic_pointer_cast(array));
+      // first round sort
+      auto array_end = indices_begin + indices_i;
+      std::stable_sort(
+          array_begin, array_end,
+          [&typed_array](const ArrayItemIndex& lhs, const ArrayItemIndex& rhs) {
+            return typed_array->GetView(lhs.id) < typed_array->GetView(rhs.id);
+          });
+      arrays_valid_range.push_back(std::make_pair(array_begin, array_end));
       array_id++;
     }
-    auto nulls_begin = indices_begin + items_total - null_count_total;
-    std::stable_sort(indices_begin, nulls_begin,
-                     [typed_arrays, this](ArrayItemIndex left, ArrayItemIndex right) {
-                       return compare_(typed_arrays, left, right);
-                     });
+
+    // merge sort
+    merge(arrays_valid_range.begin(), arrays_valid_range.end());
 
     *offsets = std::make_shared(
         std::make_shared(sizeof(ArrayItemIndex) / sizeof(int32_t)),
diff --git a/cpp/src/arrow/compute/kernels/sort_arrays_to_indices_test.cc b/cpp/src/arrow/compute/kernels/sort_arrays_to_indices_test.cc
index ae26707411d4f..fe4d9408464d1 100644
--- a/cpp/src/arrow/compute/kernels/sort_arrays_to_indices_test.cc
+++ b/cpp/src/arrow/compute/kernels/sort_arrays_to_indices_test.cc
@@ -98,6 +98,9 @@ TYPED_TEST(TestSortToIndicesKernelForIntegral, SortIntegral) {
   std::vector input;
   input.push_back("[10, 12, 4, 50, 50, 32, 11]");
   input.push_back("[1, 14, 43, 42, 6, null, 2]");
+  input.push_back("[3, 64, 15, 7, 9, 19, 33]");
+  input.push_back("[23, 17, 41, 18, 20, 35, 30]");
+  input.push_back("[37, null, 22, 13, 8, 59, 21]");
   this->SortArraysToIndices(input);
 }
 