diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 5d61112518f5e..0f4824ec99daa 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -716,7 +716,8 @@ set(ARROW_COMPUTE_SRCS compute/row/compare_internal.cc compute/row/grouper.cc compute/row/row_internal.cc - compute/util.cc) + compute/util.cc + compute/util_internal.cc) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/key_hash_internal_avx2.cc) append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/key_map_internal_avx2.cc) diff --git a/cpp/src/arrow/compute/key_hash_internal.h b/cpp/src/arrow/compute/key_hash_internal.h index 2b9c945565116..1f25beb0e1622 100644 --- a/cpp/src/arrow/compute/key_hash_internal.h +++ b/cpp/src/arrow/compute/key_hash_internal.h @@ -48,8 +48,12 @@ class ARROW_EXPORT Hashing32 { static void HashMultiColumn(const std::vector& cols, LightContext* ctx, uint32_t* out_hash); - // Clarify the max temp stack usage for HashBatch so the caller could reserve enough - // size in advance. + // Clarify the max temp stack usage for HashBatch, which might be necessary for the + // caller to be aware of at compile time to reserve enough stack size in advance. The + // HashBatch implementation uses one uint32 temp vector as a buffer for hash, one uint16 + // temp vector as a buffer for null indices and one uint32 temp vector as a buffer for + // null hash, all are of size kMiniBatchLength. Plus extra kMiniBatchLength to cope with + // stack padding and aligning. static constexpr auto kHashBatchTempStackUsage = (sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint32_t) + /*extra=*/1) * util::MiniBatch::kMiniBatchLength; @@ -167,8 +171,11 @@ class ARROW_EXPORT Hashing64 { static void HashMultiColumn(const std::vector& cols, LightContext* ctx, uint64_t* hashes); - // Clarify the max temp stack usage for HashBatch so the caller could reserve enough - // size in advance. + // Clarify the max temp stack usage for HashBatch, which might be necessary for the + // caller to be aware of at compile time to reserve enough stack size in advance. The + // HashBatch implementation uses one uint16 temp vector as a buffer for null indices and + // one uint64 temp vector as a buffer for null hash, all are of size kMiniBatchLength. + // Plus extra kMiniBatchLength to cope with stack padding and aligning. static constexpr auto kHashBatchTempStackUsage = (sizeof(uint16_t) + sizeof(uint64_t) + /*extra=*/1) * util::MiniBatch::kMiniBatchLength; diff --git a/cpp/src/arrow/compute/key_map_internal.h b/cpp/src/arrow/compute/key_map_internal.h index 8e06dc83483aa..a5e784a9e4463 100644 --- a/cpp/src/arrow/compute/key_map_internal.h +++ b/cpp/src/arrow/compute/key_map_internal.h @@ -21,6 +21,7 @@ #include #include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" diff --git a/cpp/src/arrow/compute/light_array_internal.h b/cpp/src/arrow/compute/light_array_internal.h index 67de71bf56c92..995c4211998e0 100644 --- a/cpp/src/arrow/compute/light_array_internal.h +++ b/cpp/src/arrow/compute/light_array_internal.h @@ -22,6 +22,7 @@ #include "arrow/array.h" #include "arrow/compute/exec.h" #include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/type.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 08f36ee606025..cc02d489d138f 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -20,6 +20,7 @@ #include #include +#include "arrow/memory_pool.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index e3e3f3c3b6020..a5a109b0b516a 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -32,8 +32,11 @@ namespace compute { class ARROW_EXPORT KeyCompare { public: - // Clarify the max temp stack usage for CompareColumnsToRows so the caller could reserve - // enough size in advance. + // Clarify the max temp stack usage for CompareColumnsToRows, which might be necessary + // for the caller to be aware of (possibly at compile time) to reserve enough stack size + // in advance. The CompareColumnsToRows implementation uses three uint8 temp vectors as + // buffers for match vectors, all are of size num_rows. Plus extra kMiniBatchLength to + // cope with stack padding and aligning. constexpr static int64_t CompareColumnsToRowsTempStackUsage(int64_t num_rows) { return (sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint8_t)) * num_rows + /*extra=*/util::MiniBatch::kMiniBatchLength; diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index cd20fe486e81a..b90b3a64056bd 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -17,11 +17,7 @@ #include "arrow/compute/util.h" -#include "arrow/table.h" -#include "arrow/util/bit_util.h" -#include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" -#include "arrow/util/tracing_internal.h" #include "arrow/util/ubsan.h" namespace arrow { @@ -31,36 +27,6 @@ using internal::CpuInfo; namespace util { -void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { - int64_t estimated_alloc_size = EstimatedAllocationSize(num_bytes); - int64_t new_top = top_ + estimated_alloc_size; - // Stack overflow check (see GH-39582). - // XXX cannot return a regular Status because most consumers do not either. - ARROW_CHECK_LE(new_top, buffer_size_) - << "TempVectorStack::alloc overflow: allocating " << estimated_alloc_size - << " on top of " << top_ << " in stack of size " << buffer_size_; - *data = buffer_->mutable_data() + top_ + sizeof(uint64_t); - // We set 8 bytes before the beginning of the allocated range and - // 8 bytes after the end to check for stack overflow (which would - // result in those known bytes being corrupted). - reinterpret_cast(buffer_->mutable_data() + top_)[0] = kGuard1; - reinterpret_cast(buffer_->mutable_data() + new_top)[-1] = kGuard2; - *id = num_vectors_++; - top_ = new_top; -} - -void TempVectorStack::release(int id, uint32_t num_bytes) { - ARROW_DCHECK(num_vectors_ == id + 1); - int64_t size = EstimatedAllocationSize(num_bytes); - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == - kGuard2); - ARROW_DCHECK(top_ >= size); - top_ -= size; - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == - kGuard1); - --num_vectors_; -} - namespace bit_util { inline uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 88dce160ce936..d56e398667f66 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -24,17 +24,10 @@ #include #include -#include "arrow/buffer.h" #include "arrow/compute/expression.h" #include "arrow/compute/type_fwd.h" -#include "arrow/memory_pool.h" #include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/util/bit_util.h" #include "arrow/util/cpu_info.h" -#include "arrow/util/mutex.h" -#include "arrow/util/thread_pool.h" -#include "arrow/util/type_fwd.h" #if defined(__clang__) || defined(__GNUC__) #define BYTESWAP(x) __builtin_bswap64(x) @@ -77,72 +70,6 @@ class MiniBatch { static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; }; -/// Storage used to allocate temporary vectors of a batch size. -/// Temporary vectors should resemble allocating temporary variables on the stack -/// but in the context of vectorized processing where we need to store a vector of -/// temporaries instead of a single value. -class ARROW_EXPORT TempVectorStack { - template - friend class TempVectorHolder; - - public: - Status Init(MemoryPool* pool, int64_t size) { - num_vectors_ = 0; - top_ = 0; - buffer_size_ = EstimatedAllocationSize(size); - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); - // Ensure later operations don't accidentally read uninitialized memory. - std::memset(buffer->mutable_data(), 0xFF, size); - buffer_ = std::move(buffer); - return Status::OK(); - } - - private: - static int64_t EstimatedAllocationSize(int64_t size) { - return PaddedAllocationSize(size) + 2 * sizeof(uint64_t); - } - - static int64_t PaddedAllocationSize(int64_t num_bytes) { - // Round up allocation size to multiple of 8 bytes - // to avoid returning temp vectors with unaligned address. - // - // Also add padding at the end to facilitate loads and stores - // using SIMD when number of vector elements is not divisible - // by the number of SIMD lanes. - // - return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; - } - void alloc(uint32_t num_bytes, uint8_t** data, int* id); - void release(int id, uint32_t num_bytes); - static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; - static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; - static constexpr int64_t kPadding = 64; - int num_vectors_; - int64_t top_; - std::unique_ptr buffer_; - int64_t buffer_size_; -}; - -template -class TempVectorHolder { - friend class TempVectorStack; - - public: - ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } - T* mutable_data() { return reinterpret_cast(data_); } - TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { - stack_ = stack; - num_elements_ = num_elements; - stack_->alloc(num_elements * sizeof(T), &data_, &id_); - } - - private: - TempVectorStack* stack_; - uint8_t* data_; - int id_; - uint32_t num_elements_; -}; - namespace bit_util { ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags, diff --git a/cpp/src/arrow/compute/util_internal.cc b/cpp/src/arrow/compute/util_internal.cc new file mode 100644 index 0000000000000..cc26982fef110 --- /dev/null +++ b/cpp/src/arrow/compute/util_internal.cc @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/util_internal.h" + +#include "arrow/compute/util.h" +#include "arrow/memory_pool.h" + +namespace arrow { +namespace util { + +Status TempVectorStack::Init(MemoryPool* pool, int64_t size) { + num_vectors_ = 0; + top_ = 0; + buffer_size_ = EstimatedAllocationSize(size); + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); + // Ensure later operations don't accidentally read uninitialized memory. + std::memset(buffer->mutable_data(), 0xFF, size); + buffer_ = std::move(buffer); + return Status::OK(); +} + +int64_t TempVectorStack::PaddedAllocationSize(int64_t num_bytes) { + // Round up allocation size to multiple of 8 bytes + // to avoid returning temp vectors with unaligned address. + // + // Also add padding at the end to facilitate loads and stores + // using SIMD when number of vector elements is not divisible + // by the number of SIMD lanes. + // + return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; +} + +void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { + int64_t estimated_alloc_size = EstimatedAllocationSize(num_bytes); + int64_t new_top = top_ + estimated_alloc_size; + // Stack overflow check (see GH-39582). + // XXX cannot return a regular Status because most consumers do not either. + ARROW_CHECK_LE(new_top, buffer_size_) + << "TempVectorStack::alloc overflow: allocating " << estimated_alloc_size + << " on top of " << top_ << " in stack of size " << buffer_size_; + *data = buffer_->mutable_data() + top_ + sizeof(uint64_t); + // We set 8 bytes before the beginning of the allocated range and + // 8 bytes after the end to check for stack overflow (which would + // result in those known bytes being corrupted). + reinterpret_cast(buffer_->mutable_data() + top_)[0] = kGuard1; + reinterpret_cast(buffer_->mutable_data() + new_top)[-1] = kGuard2; + *id = num_vectors_++; + top_ = new_top; +} + +void TempVectorStack::release(int id, uint32_t num_bytes) { + ARROW_DCHECK(num_vectors_ == id + 1); + int64_t size = EstimatedAllocationSize(num_bytes); + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == + kGuard2); + ARROW_DCHECK(top_ >= size); + top_ -= size; + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == + kGuard1); + --num_vectors_; +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/compute/util_internal.h b/cpp/src/arrow/compute/util_internal.h index 87e89a3350721..b560cddbf3b5a 100644 --- a/cpp/src/arrow/compute/util_internal.h +++ b/cpp/src/arrow/compute/util_internal.h @@ -17,6 +17,8 @@ #pragma once +#include "arrow/status.h" +#include "arrow/type_fwd.h" #include "arrow/util/logging.h" namespace arrow { @@ -27,5 +29,54 @@ void CheckAlignment(const void* ptr) { ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); } +/// Storage used to allocate temporary vectors of a batch size. +/// Temporary vectors should resemble allocating temporary variables on the stack +/// but in the context of vectorized processing where we need to store a vector of +/// temporaries instead of a single value. +class ARROW_EXPORT TempVectorStack { + template + friend class TempVectorHolder; + + public: + Status Init(MemoryPool* pool, int64_t size); + + private: + static int64_t EstimatedAllocationSize(int64_t size) { + return PaddedAllocationSize(size) + 2 * sizeof(uint64_t); + } + + static int64_t PaddedAllocationSize(int64_t num_bytes); + + void alloc(uint32_t num_bytes, uint8_t** data, int* id); + void release(int id, uint32_t num_bytes); + static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; + static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; + static constexpr int64_t kPadding = 64; + int num_vectors_; + int64_t top_; + std::unique_ptr buffer_; + int64_t buffer_size_; +}; + +template +class TempVectorHolder { + friend class TempVectorStack; + + public: + ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } + T* mutable_data() { return reinterpret_cast(data_); } + TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { + stack_ = stack; + num_elements_ = num_elements; + stack_->alloc(num_elements * sizeof(T), &data_, &id_); + } + + private: + TempVectorStack* stack_; + uint8_t* data_; + int id_; + uint32_t num_elements_; +}; + } // namespace util } // namespace arrow