From 5b607208505c1ce3fc05f185bc839c97651362be Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Mon, 18 Sep 2023 05:29:29 -0700 Subject: [PATCH] Optimize FlatVector::copyRanges (#6566) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/6566 FlatVector::copyRanges is slow when there are many small ranges. I noticed this while investigating slowness of array_constructor which used created 1-row ranges: https://github.com/facebookincubator/velox/discussions/5958#discussioncomment-6615155a ``` FlatVector.h void copyRanges( const BaseVector* source, const folly::Range& ranges) override { for (auto& range : ranges) { copy(source, range.targetIndex, range.sourceIndex, range.count); } } ``` This change optimizes FlatVector::copyRanges using code from copy(source, targetIndex, sourceIndex, count) which copies one range. This change also replaces the latter call with a call to copyRanges, hence, the overall amount of code is about the same. An earlier change optimized array_constructor to not use copyRanges for primitive types, but it is still used by Array/MapVector::copyRanges, although, these do not create 1-row ranges. Still the array_constructor benchmark shows some wins for arrays and maps. ``` Before: array_constructor_ARRAY_nullfree##2_const 54.31ms 18.41 array_constructor_ARRAY_nulls##1 33.50ms 29.85 array_constructor_ARRAY_nulls##2 59.05ms 16.93 array_constructor_ARRAY_nulls##3 88.36ms 11.32 array_constructor_ARRAY_nulls##2_null 74.53ms 13.42 array_constructor_ARRAY_nulls##2_const 102.54ms 9.75 After: array_constructor_ARRAY_nullfree##2_const 41.36ms 24.18 array_constructor_ARRAY_nulls##1 30.51ms 32.78 array_constructor_ARRAY_nulls##2 55.13ms 18.14 array_constructor_ARRAY_nulls##3 77.93ms 12.83 array_constructor_ARRAY_nulls##2_null 68.84ms 14.53 array_constructor_ARRAY_nulls##2_const 83.91ms 11.92 Before: array_constructor_MAP_nullfree##2_const 67.44ms 14.83 array_constructor_MAP_nulls##1 37.00ms 27.03 array_constructor_MAP_nulls##2 67.76ms 14.76 array_constructor_MAP_nulls##3 100.88ms 9.91 array_constructor_MAP_nulls##2_null 84.22ms 11.87 array_constructor_MAP_nulls##2_const 122.55ms 8.16 After: array_constructor_MAP_nullfree##2_const 49.94ms 20.03 array_constructor_MAP_nulls##1 34.34ms 29.12 array_constructor_MAP_nulls##2 55.23ms 18.11 array_constructor_MAP_nulls##3 82.64ms 12.10 array_constructor_MAP_nulls##2_null 70.74ms 14.14 array_constructor_MAP_nulls##2_const 88.13ms 11.35 ``` Differential Revision: D49269500 fbshipit-source-id: 8fc304cce31f782aef818e6d8ce052360af8e832 --- velox/vector/BaseVector.cpp | 23 ++++++ velox/vector/BaseVector.h | 51 +++++++++++-- velox/vector/ComplexVector.cpp | 135 +++++++++++++++------------------ velox/vector/FlatVector-inl.h | 130 +++++++++++++++++-------------- velox/vector/FlatVector.cpp | 46 ----------- velox/vector/FlatVector.h | 27 +------ 6 files changed, 203 insertions(+), 209 deletions(-) diff --git a/velox/vector/BaseVector.cpp b/velox/vector/BaseVector.cpp index 0aa8819a2811..beb488f9b88e 100644 --- a/velox/vector/BaseVector.cpp +++ b/velox/vector/BaseVector.cpp @@ -334,6 +334,29 @@ VectorPtr BaseVector::createInternal( } } +// static +void BaseVector::setNulls( + uint64_t* rawNulls, + const folly::Range& ranges, + bool isNull) { + const auto nullBits = isNull ? bits::kNull : bits::kNotNull; + applyToEachRange( + ranges, [&](auto targetIndex, auto /*sourceIndex*/, auto count) { + bits::fillBits(rawNulls, targetIndex, targetIndex + count, nullBits); + }); +} + +// static +void BaseVector::copyNulls( + uint64_t* targetRawNulls, + const uint64_t* sourceRawNulls, + const folly::Range& ranges) { + applyToEachRange(ranges, [&](auto targetIndex, auto sourceIndex, auto count) { + bits::copyBits( + sourceRawNulls, sourceIndex, targetRawNulls, targetIndex, count); + }); +} + void BaseVector::addNulls(const uint64_t* bits, const SelectivityVector& rows) { VELOX_CHECK(isNullsWritable()); VELOX_CHECK(length_ >= rows.end()); diff --git a/velox/vector/BaseVector.h b/velox/vector/BaseVector.h index d8a076c61f01..db47e646505f 100644 --- a/velox/vector/BaseVector.h +++ b/velox/vector/BaseVector.h @@ -371,6 +371,25 @@ class BaseVector { nulls_->asMutable(), idx, bits::kNull ? value : !value); } + struct CopyRange { + vector_size_t sourceIndex; + vector_size_t targetIndex; + vector_size_t count; + }; + + /// Sets null flags for each row in 'ranges' to 'isNull'. + static void setNulls( + uint64_t* rawNulls, + const folly::Range& ranges, + bool isNull); + + /// Copies null flags for each row in 'ranges' from 'sourceRawNulls' to + /// 'targetRawNulls'. + static void copyNulls( + uint64_t* targetRawNulls, + const uint64_t* sourceRawNulls, + const folly::Range& ranges); + static int32_t countNulls(const BufferPtr& nulls, vector_size_t begin, vector_size_t end) { return nulls ? bits::countNulls(nulls->as(), begin, end) : 0; @@ -437,12 +456,6 @@ class BaseVector { copyRanges(source, folly::Range(&range, 1)); } - struct CopyRange { - vector_size_t sourceIndex; - vector_size_t targetIndex; - vector_size_t count; - }; - /// Converts SelectivityVetor into a list of CopyRanges having sourceIndex == /// targetIndex. Aims to produce as few ranges as possible. If all rows are /// selected, returns a single range. @@ -906,6 +919,32 @@ class BaseVector { bool memoDisabled_{false}; }; +/// Loops over rows in 'ranges' and invokes 'func' for each row. +/// @param TFunc A void function taking two arguments: targetIndex and +/// sourceIndex. +template +void applyToEachRow( + const folly::Range& ranges, + const TFunc& func) { + for (const auto& range : ranges) { + for (auto i = 0; i < range.count; ++i) { + func(range.targetIndex + i, range.sourceIndex + i); + } + } +} + +/// Loops over 'ranges' and invokes 'func' for each range. +/// @param TFunc A void function taking 3 arguments: targetIndex, sourceIndex +/// and count. +template +void applyToEachRange( + const folly::Range& ranges, + const TFunc& func) { + for (const auto& range : ranges) { + func(range.targetIndex, range.sourceIndex, range.count); + } +} + template <> uint64_t BaseVector::byteSize(vector_size_t count); diff --git a/velox/vector/ComplexVector.cpp b/velox/vector/ComplexVector.cpp index d3be8131821e..23e54875f307 100644 --- a/velox/vector/ComplexVector.cpp +++ b/velox/vector/ComplexVector.cpp @@ -299,20 +299,17 @@ void RowVector::copyRanges( } if (isAllNullVector(*source)) { - for (const auto& range : ranges) { - for (auto i = 0; i < range.count; ++i) { - setNull(range.targetIndex + i, true); - } - } + BaseVector::setNulls(mutableRawNulls(), ranges, true); return; } auto minTargetIndex = std::numeric_limits::max(); auto maxTargetIndex = std::numeric_limits::min(); - for (auto& r : ranges) { - minTargetIndex = std::min(minTargetIndex, r.targetIndex); - maxTargetIndex = std::max(maxTargetIndex, r.targetIndex + r.count); - } + applyToEachRange(ranges, [&](auto targetIndex, auto sourceIndex, auto count) { + minTargetIndex = std::min(minTargetIndex, targetIndex); + maxTargetIndex = std::max(maxTargetIndex, targetIndex + count); + }); + SelectivityVector rows(maxTargetIndex); rows.setValidRange(0, minTargetIndex, false); rows.updateBounds(); @@ -324,11 +321,7 @@ void RowVector::copyRanges( DecodedVector decoded(*source); if (decoded.isIdentityMapping() && !decoded.mayHaveNulls()) { if (rawNulls_) { - auto* rawNulls = mutableRawNulls(); - for (auto& r : ranges) { - bits::fillBits( - rawNulls, r.targetIndex, r.targetIndex + r.count, bits::kNotNull); - } + setNulls(mutableRawNulls(), ranges, false); } auto* rowSource = source->loadedVector()->as(); for (int i = 0; i < children_.size(); ++i) { @@ -337,27 +330,26 @@ void RowVector::copyRanges( } else { std::vector baseRanges; baseRanges.reserve(ranges.size()); - for (auto& r : ranges) { - for (vector_size_t i = 0; i < r.count; ++i) { - bool isNull = decoded.isNullAt(r.sourceIndex + i); - setNull(r.targetIndex + i, isNull); - if (isNull) { - continue; - } - auto baseIndex = decoded.index(r.sourceIndex + i); - if (!baseRanges.empty() && - baseRanges.back().sourceIndex + 1 == baseIndex && - baseRanges.back().targetIndex + 1 == r.targetIndex + i) { - ++baseRanges.back().count; - } else { - baseRanges.push_back({ - .sourceIndex = baseIndex, - .targetIndex = r.targetIndex + i, - .count = 1, - }); - } + applyToEachRow(ranges, [&](auto targetIndex, auto sourceIndex) { + bool isNull = decoded.isNullAt(sourceIndex); + setNull(targetIndex, isNull); + if (isNull) { + return; } - } + auto baseIndex = decoded.index(sourceIndex); + if (!baseRanges.empty() && + baseRanges.back().sourceIndex + 1 == baseIndex && + baseRanges.back().targetIndex + 1 == targetIndex) { + ++baseRanges.back().count; + } else { + baseRanges.push_back({ + .sourceIndex = baseIndex, + .targetIndex = targetIndex, + .count = 1, + }); + } + }); + auto* rowSource = decoded.base()->as(); for (int i = 0; i < children_.size(); ++i) { children_[i]->copyRanges( @@ -463,11 +455,7 @@ void ArrayVectorBase::copyRangesImpl( VectorPtr* targetValues, VectorPtr* targetKeys) { if (isAllNullVector(*source)) { - for (const auto& range : ranges) { - for (auto i = 0; i < range.count; ++i) { - setNull(range.targetIndex + i, true); - } - } + BaseVector::setNulls(mutableRawNulls(), ranges, true); return; } @@ -537,46 +525,43 @@ void ArrayVectorBase::copyRangesImpl( } else { std::vector outRanges; vector_size_t totalCount = 0; - for (auto& range : ranges) { - if (range.count == 0) { - continue; - } - VELOX_DCHECK(BaseVector::length_ >= range.targetIndex + range.count); - totalCount += range.count; - } - outRanges.reserve(totalCount); - for (auto& range : ranges) { - for (vector_size_t i = 0; i < range.count; ++i) { - if (source->isNullAt(range.sourceIndex + i)) { - setNull(range.targetIndex + i, true); - } else { - if (setNotNulls) { - setNull(range.targetIndex + i, false); + applyToEachRange( + ranges, [&](auto targetIndex, auto sourceIndex, auto count) { + if (count > 0) { + VELOX_DCHECK(BaseVector::length_ >= targetIndex + count); + totalCount += count; } - vector_size_t wrappedIndex = - source->wrappedIndex(range.sourceIndex + i); - vector_size_t copySize = sourceArray->sizeAt(wrappedIndex); - - if (copySize > 0) { - auto copyOffset = sourceArray->offsetAt(wrappedIndex); - - // If we're copying two adjacent ranges, merge them. This only - // works if they're consecutive. - if (!outRanges.empty() && - (outRanges.back().sourceIndex + outRanges.back().count == - copyOffset)) { - outRanges.back().count += copySize; - } else { - outRanges.push_back({copyOffset, childSize, copySize}); - } + }); + outRanges.reserve(totalCount); + applyToEachRow(ranges, [&](auto targetIndex, auto sourceIndex) { + if (source->isNullAt(sourceIndex)) { + setNull(targetIndex, true); + } else { + if (setNotNulls) { + setNull(targetIndex, false); + } + auto wrappedIndex = source->wrappedIndex(sourceIndex); + auto copySize = sourceArray->sizeAt(wrappedIndex); + + if (copySize > 0) { + auto copyOffset = sourceArray->offsetAt(wrappedIndex); + + // If we're copying two adjacent ranges, merge them. This only + // works if they're consecutive. + if (!outRanges.empty() && + (outRanges.back().sourceIndex + outRanges.back().count == + copyOffset)) { + outRanges.back().count += copySize; + } else { + outRanges.push_back({copyOffset, childSize, copySize}); } - - mutableOffsets[range.targetIndex + i] = childSize; - mutableSizes[range.targetIndex + i] = copySize; - childSize += copySize; } + + mutableOffsets[targetIndex] = childSize; + mutableSizes[targetIndex] = copySize; + childSize += copySize; } - } + }); targetValues->get()->resize(childSize); targetValues->get()->copyRanges(sourceValues, outRanges); diff --git a/velox/vector/FlatVector-inl.h b/velox/vector/FlatVector-inl.h index 520d2558c0ab..ce90e2ae140f 100644 --- a/velox/vector/FlatVector-inl.h +++ b/velox/vector/FlatVector-inl.h @@ -265,29 +265,38 @@ void FlatVector::copyValuesAndNulls( } template -void FlatVector::copyValuesAndNulls( +void FlatVector::copyRanges( const BaseVector* source, - vector_size_t targetIndex, - vector_size_t sourceIndex, - vector_size_t count) { + const folly::Range& ranges) { if (source->typeKind() == TypeKind::UNKNOWN) { - auto* rawNulls = BaseVector::mutableRawNulls(); - for (auto i = 0; i < count; ++i) { - bits::setNull(rawNulls, targetIndex + i, true); - } - return; - } - - if (count == 0) { + BaseVector::setNulls(BaseVector::mutableRawNulls(), ranges, true); return; } source = source->loadedVector(); VELOX_CHECK( BaseVector::compatibleKind(BaseVector::typeKind(), source->typeKind())); - VELOX_CHECK_GE(source->size(), sourceIndex + count); - VELOX_CHECK_GE(BaseVector::length_, targetIndex + count); - const uint64_t* sourceNulls = source->rawNulls(); + + if constexpr (std::is_same_v) { + auto leaf = + source->wrappedVector()->asUnchecked>(); + if (BaseVector::pool_ != leaf->pool()) { + applyToEachRow(ranges, [&](auto targetIndex, auto sourceIndex) { + if (source->isNullAt(sourceIndex)) { + this->setNull(targetIndex, true); + } else { + this->set( + targetIndex, leaf->valueAt(source->wrappedIndex(sourceIndex))); + } + }); + return; + } + + // We copy referencing the storage of 'source'. + acquireSharedStringBuffers(source); + } + + const uint64_t* sourceRawNulls = source->rawNulls(); uint64_t* rawNulls = const_cast(BaseVector::rawNulls_); if (source->mayHaveNulls()) { rawNulls = BaseVector::mutableRawNulls(); @@ -303,81 +312,86 @@ void FlatVector::copyValuesAndNulls( auto* flatSource = source->asUnchecked>(); if (flatSource->values() == nullptr) { // All source values are null. - for (auto i = 0; i < count; ++i) { - bits::setNull(rawNulls, targetIndex + i, true); - } + BaseVector::setNulls(BaseVector::mutableRawNulls(), ranges, true); return; } if constexpr (std::is_same_v) { - auto* rawValues = reinterpret_cast(rawValues_); + auto rawValues = reinterpret_cast(rawValues_); auto* sourceValues = flatSource->template rawValues(); - bits::copyBits(sourceValues, sourceIndex, rawValues, targetIndex, count); + applyToEachRange( + ranges, [&](auto targetIndex, auto sourceIndex, auto count) { + bits::copyBits( + sourceValues, sourceIndex, rawValues, targetIndex, count); + }); } else { - const T* srcValues = flatSource->rawValues(); - if (Buffer::is_pod_like_v) { - memcpy( - &rawValues_[targetIndex], - &srcValues[sourceIndex], - count * sizeof(T)); - } else { - std::copy( - srcValues + sourceIndex, - srcValues + sourceIndex + count, - rawValues_ + targetIndex); - } + const T* sourceValues = flatSource->rawValues(); + applyToEachRange( + ranges, [&](auto targetIndex, auto sourceIndex, auto count) { + if (Buffer::is_pod_like_v) { + memcpy( + &rawValues_[targetIndex], + &sourceValues[sourceIndex], + count * sizeof(T)); + } else { + std::copy( + sourceValues + sourceIndex, + sourceValues + sourceIndex + count, + rawValues_ + targetIndex); + } + }); } if (rawNulls) { - if (sourceNulls) { - bits::copyBits(sourceNulls, sourceIndex, rawNulls, targetIndex, count); + if (sourceRawNulls) { + BaseVector::copyNulls(rawNulls, sourceRawNulls, ranges); } else { - bits::fillBits( - rawNulls, targetIndex, targetIndex + count, bits::kNotNull); + BaseVector::setNulls(rawNulls, ranges, false); } } } else if (source->isConstantEncoding()) { if (source->isNullAt(0)) { - bits::fillBits(rawNulls, targetIndex, targetIndex + count, bits::kNull); + BaseVector::setNulls(rawNulls, ranges, true); return; } auto constant = source->asUnchecked>(); T value = constant->valueAt(0); - if constexpr (std::is_same_v) { - auto* rawValues = reinterpret_cast(rawValues_); - bits::fillBits(rawValues, targetIndex, targetIndex + count, value); + auto rawValues = reinterpret_cast(rawValues_); + applyToEachRange( + ranges, [&](auto targetIndex, auto /*sourceIndex*/, auto count) { + bits::fillBits(rawValues, targetIndex, targetIndex + count, value); + }); } else { - for (auto row = targetIndex; row < targetIndex + count; ++row) { - rawValues_[row] = value; - } + applyToEachRow(ranges, [&](auto targetIndex, auto /*sourceIndex*/) { + rawValues_[targetIndex] = value; + }); } if (rawNulls) { - bits::fillBits( - rawNulls, targetIndex, targetIndex + count, bits::kNotNull); + BaseVector::setNulls(rawNulls, ranges, false); } } else { - auto sourceVector = source->asUnchecked>(); - for (int32_t i = 0; i < count; ++i) { - if (!source->isNullAt(sourceIndex + i)) { + auto* sourceVector = source->asUnchecked>(); + uint64_t* rawBoolValues = nullptr; + if constexpr (std::is_same_v) { + rawBoolValues = reinterpret_cast(rawValues_); + } + applyToEachRow(ranges, [&](auto targetIndex, auto sourceIndex) { + if (!source->isNullAt(sourceIndex)) { + auto sourceValue = sourceVector->valueAt(sourceIndex); if constexpr (std::is_same_v) { - auto* rawValues = reinterpret_cast(rawValues_); - bits::setBit( - rawValues, - targetIndex + i, - sourceVector->valueAt(sourceIndex + i)); + bits::setBit(rawBoolValues, targetIndex, sourceValue); } else { - rawValues_[targetIndex + i] = sourceVector->valueAt(sourceIndex + i); + rawValues_[targetIndex] = sourceValue; } - if (rawNulls) { - bits::clearNull(rawNulls, targetIndex + i); + bits::clearNull(rawNulls, targetIndex); } } else { - bits::setNull(rawNulls, targetIndex + i); + bits::setNull(rawNulls, targetIndex); } - } + }); } } diff --git a/velox/vector/FlatVector.cpp b/velox/vector/FlatVector.cpp index e4ccdfcee62e..684e04b3c3ed 100644 --- a/velox/vector/FlatVector.cpp +++ b/velox/vector/FlatVector.cpp @@ -308,52 +308,6 @@ void FlatVector::copy( } } -template <> -void FlatVector::copy( - const BaseVector* source, - vector_size_t targetIndex, - vector_size_t sourceIndex, - vector_size_t count) { - if (count == 0) { - return; - } - BaseVector::copy(source, targetIndex, sourceIndex, count); -} - -template <> -void FlatVector::copyRanges( - const BaseVector* source, - const folly::Range& ranges) { - if (source->typeKind() == TypeKind::UNKNOWN) { - for (const auto& range : ranges) { - for (auto i = 0; i < range.count; ++i) { - setNull(range.targetIndex + i, true); - } - } - return; - } - - auto leaf = source->wrappedVector()->asUnchecked>(); - if (pool_ == leaf->pool()) { - // We copy referencing the storage of 'source'. - for (auto& r : ranges) { - copyValuesAndNulls(source, r.targetIndex, r.sourceIndex, r.count); - } - acquireSharedStringBuffers(source); - } else { - for (auto& r : ranges) { - for (auto i = 0; i < r.count; ++i) { - if (source->isNullAt(r.sourceIndex + i)) { - setNull(r.targetIndex + i, true); - } else { - set(r.targetIndex + i, - leaf->valueAt(source->wrappedIndex(r.sourceIndex + i))); - } - } - } - } -} - // For strings, we also verify if they point to valid memory locations inside // the string buffers. template <> diff --git a/velox/vector/FlatVector.h b/velox/vector/FlatVector.h index 61cf67e9a2f3..d39c63ea530a 100644 --- a/velox/vector/FlatVector.h +++ b/velox/vector/FlatVector.h @@ -230,16 +230,13 @@ class FlatVector final : public SimpleVector { if (count == 0) { return; } - copyValuesAndNulls(source, targetIndex, sourceIndex, count); + BaseVector::CopyRange range{sourceIndex, targetIndex, count}; + copyRanges(source, folly::Range(&range, 1)); } void copyRanges( const BaseVector* source, - const folly::Range& ranges) override { - for (auto& range : ranges) { - copy(source, range.targetIndex, range.sourceIndex, range.count); - } - } + const folly::Range& ranges) override; void resize(vector_size_t newSize, bool setNotNull = true) override; @@ -450,12 +447,6 @@ class FlatVector final : public SimpleVector { const SelectivityVector& rows, const vector_size_t* toSourceRow); - void copyValuesAndNulls( - const BaseVector* source, - vector_size_t targetIndex, - vector_size_t sourceIndex, - vector_size_t count); - // Ensures that the values buffer has space for 'newSize' elements and is // mutable. Sets elements between the old and new sizes to 'initialValue' if // the new size > old size. @@ -509,18 +500,6 @@ void FlatVector::copy( const SelectivityVector& rows, const vector_size_t* toSourceRow); -template <> -void FlatVector::copy( - const BaseVector* source, - vector_size_t targetIndex, - vector_size_t sourceIndex, - vector_size_t count); - -template <> -void FlatVector::copyRanges( - const BaseVector* source, - const folly::Range& ranges); - template <> void FlatVector::validate( const VectorValidateOptions& options) const;