Skip to content

Commit

Permalink
Optimize array_constructor - v1 (#6566)
Browse files Browse the repository at this point in the history
Summary:

array_constructor is very slow: #5958 (comment)

array_constructor uses BaseVector::copyRanges, which is somewhat fast for arrays and maps, but very slow for primitive types:

```
FlatVector.h

  void copyRanges(
      const BaseVector* source,
      const folly::Range<const BaseVector::CopyRange*>& ranges) override {
    for (auto& range : ranges) {
      copy(source, range.targetIndex, range.sourceIndex, range.count);
    }
  }
```

FlatVector<T>::copy(source, rows, toSourceRow) is faster.

Switching from copyRanges to copy speeds up array_constructor for primitive types and structs significantly. Yet, this change makes arrays and maps slower.

The slowness is because ArrayVector and MapVector do not have their own implementation of copy(source, rows, toSourceRow). They rely on BaseVector::copy to translate rows + toSourceRow into ranges, and this extra processing causes the perf regression.

Hence, we use copy for primitive types and for structs whose fields are (recursively) primitive types, and copyRanges for everything else.

We also optimize FlatVector::copyRanges (which is used by Array/MapVector::copyRanges).

```
Before:

array_constructor_ARRAY_nullfree##1                        16.80ms     59.53
array_constructor_ARRAY_nullfree##2                        27.02ms     37.01
array_constructor_ARRAY_nullfree##3                        38.03ms     26.30
array_constructor_ARRAY_nullfree##2_null                   52.86ms     18.92
array_constructor_ARRAY_nullfree##2_const                  54.97ms     18.19
array_constructor_ARRAY_nulls##1                           30.61ms     32.66
array_constructor_ARRAY_nulls##2                           55.01ms     18.18
array_constructor_ARRAY_nulls##3                           80.69ms     12.39
array_constructor_ARRAY_nulls##2_null                      69.10ms     14.47
array_constructor_ARRAY_nulls##2_const                    103.85ms      9.63


After:

array_constructor_ARRAY_nullfree##1                        15.43ms     64.80
array_constructor_ARRAY_nullfree##2                        24.50ms     40.81
array_constructor_ARRAY_nullfree##3                        35.12ms     28.47
array_constructor_ARRAY_nullfree##2_null                   54.52ms     18.34
array_constructor_ARRAY_nullfree##2_const                  43.28ms     23.10
array_constructor_ARRAY_nulls##1                           28.60ms     34.96
array_constructor_ARRAY_nulls##2                           50.82ms     19.68
array_constructor_ARRAY_nulls##3                           70.31ms     14.22
array_constructor_ARRAY_nulls##2_null                      64.43ms     15.52
array_constructor_ARRAY_nulls##2_const                     80.71ms     12.39


Before:

array_constructor_INTEGER_nullfree##1                      19.72ms     50.71
array_constructor_INTEGER_nullfree##2                      34.51ms     28.97
array_constructor_INTEGER_nullfree##3                      47.95ms     20.86
array_constructor_INTEGER_nullfree##2_null                 58.68ms     17.04
array_constructor_INTEGER_nullfree##2_const                45.15ms     22.15
array_constructor_INTEGER_nulls##1                         29.99ms     33.34
array_constructor_INTEGER_nulls##2                         55.32ms     18.08
array_constructor_INTEGER_nulls##3                         78.53ms     12.73
array_constructor_INTEGER_nulls##2_null                    72.24ms     13.84
array_constructor_INTEGER_nulls##2_const                   71.13ms     14.06


After:

array_constructor_INTEGER_nullfree##1                       3.49ms    286.59
array_constructor_INTEGER_nullfree##2                       7.91ms    126.46
array_constructor_INTEGER_nullfree##3                      11.99ms     83.41
array_constructor_INTEGER_nullfree##2_null                 12.57ms     79.55
array_constructor_INTEGER_nullfree##2_const                11.03ms     90.67
array_constructor_INTEGER_nulls##1                          4.37ms    228.97
array_constructor_INTEGER_nulls##2                          9.99ms    100.14
array_constructor_INTEGER_nulls##3                         14.79ms     67.60
array_constructor_INTEGER_nulls##2_null                    12.21ms     81.92
array_constructor_INTEGER_nulls##2_const                   12.64ms     79.12


Before:

array_constructor_MAP_nullfree##1                          17.34ms     57.65
array_constructor_MAP_nullfree##2                          29.84ms     33.51
array_constructor_MAP_nullfree##3                          41.51ms     24.09
array_constructor_MAP_nullfree##2_null                     56.57ms     17.68
array_constructor_MAP_nullfree##2_const                    71.68ms     13.95
array_constructor_MAP_nulls##1                             36.22ms     27.61
array_constructor_MAP_nulls##2                             68.18ms     14.67
array_constructor_MAP_nulls##3                             95.12ms     10.51
array_constructor_MAP_nulls##2_null                        86.42ms     11.57
array_constructor_MAP_nulls##2_const                      120.10ms      8.33


After:

array_constructor_MAP_nullfree##1                          17.38ms     57.53
array_constructor_MAP_nullfree##2                          29.41ms     34.00
array_constructor_MAP_nullfree##3                          38.30ms     26.11
array_constructor_MAP_nullfree##2_null                     58.52ms     17.09
array_constructor_MAP_nullfree##2_const                    48.62ms     20.57
array_constructor_MAP_nulls##1                             30.60ms     32.68
array_constructor_MAP_nulls##2                             53.94ms     18.54
array_constructor_MAP_nulls##3                             86.48ms     11.56
array_constructor_MAP_nulls##2_null                        69.53ms     14.38
array_constructor_MAP_nulls##2_const                       87.56ms     11.42


Before:

array_constructor_ROW_nullfree##1                          33.88ms     29.52
array_constructor_ROW_nullfree##2                          62.00ms     16.13
array_constructor_ROW_nullfree##3                          89.54ms     11.17
array_constructor_ROW_nullfree##2_null                     78.46ms     12.75
array_constructor_ROW_nullfree##2_const                    95.53ms     10.47
array_constructor_ROW_nulls##1                             44.11ms     22.67
array_constructor_ROW_nulls##2                            115.43ms      8.66
array_constructor_ROW_nulls##3                            173.61ms      5.76
array_constructor_ROW_nulls##2_null                       130.40ms      7.67
array_constructor_ROW_nulls##2_const                      169.97ms      5.88

After:

array_constructor_ROW_nullfree##1                           5.64ms    177.44
array_constructor_ROW_nullfree##2                          14.40ms     69.44
array_constructor_ROW_nullfree##3                          21.46ms     46.59
array_constructor_ROW_nullfree##2_null                     19.14ms     52.26
array_constructor_ROW_nullfree##2_const                    18.60ms     53.77
array_constructor_ROW_nulls##1                             10.97ms     91.18
array_constructor_ROW_nulls##2                             18.29ms     54.67
array_constructor_ROW_nulls##3                             28.57ms     35.01
array_constructor_ROW_nulls##2_null                        25.10ms     39.84
array_constructor_ROW_nulls##2_const                       24.55ms     40.74
```

Differential Revision: D49269500
  • Loading branch information
mbasmanova authored and facebook-github-bot committed Sep 14, 2023
1 parent e9354bb commit 5d2e8fc
Show file tree
Hide file tree
Showing 6 changed files with 326 additions and 158 deletions.
77 changes: 63 additions & 14 deletions velox/functions/prestosql/ArrayConstructor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,24 @@ class ArrayConstructor : public exec::VectorFunction {
return false;
}

// Chooses the element-copy strategy for a given element type. Returns true
// when elements of 'type' should be copied via copyRanges and false when they
// should be copied via copy(source, rows, toSourceRow).
//
// Primitive types yield false. A row type yields true only if at least one of
// its children (checked recursively) yields true. Every other type yields
// true.
static bool shouldCopyRanges(const TypePtr& type) {
  if (type->isPrimitiveType()) {
    return false;
  }

  if (!type->isRow()) {
    return true;
  }

  // A row needs copyRanges as soon as any one of its fields does.
  bool anyFieldNeedsRanges = false;
  for (const auto& field : type->asRow().children()) {
    if (shouldCopyRanges(field)) {
      anyFieldNeedsRanges = true;
      break;
    }
  }
  return anyFieldNeedsRanges;
}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -55,24 +73,55 @@ class ArrayConstructor : public exec::VectorFunction {
} else {
elementsResult->resize(baseOffset + numArgs * rows.countSelected());

std::vector<BaseVector::CopyRange> ranges;
ranges.reserve(rows.end());
if (shouldCopyRanges(elementsResult->type())) {
std::vector<BaseVector::CopyRange> ranges;
ranges.reserve(rows.end());

vector_size_t offset = baseOffset;
rows.applyToSelected([&](vector_size_t row) {
rawSizes[row] = numArgs;
rawOffsets[row] = offset;
ranges.push_back({row, offset, 1});
offset += numArgs;
});
vector_size_t offset = baseOffset;
rows.applyToSelected([&](vector_size_t row) {
rawSizes[row] = numArgs;
rawOffsets[row] = offset;
ranges.push_back({row, offset, 1});
offset += numArgs;
});

elementsResult->copyRanges(args[0].get(), ranges);

for (int i = 1; i < numArgs; i++) {
for (auto& range : ranges) {
++range.targetIndex;
}
elementsResult->copyRanges(args[i].get(), ranges);
}
} else {
SelectivityVector targetRows(elementsResult->size(), false);
std::vector<vector_size_t> toSourceRow(elementsResult->size());

vector_size_t offset = baseOffset;
rows.applyToSelected([&](vector_size_t row) {
rawSizes[row] = numArgs;
rawOffsets[row] = offset;

targetRows.setValid(offset, true);
toSourceRow[offset] = row;

offset += numArgs;
});
targetRows.updateBounds();
elementsResult->copy(args[0].get(), targetRows, toSourceRow.data());

elementsResult->copyRanges(args[0].get(), ranges);
for (int i = 1; i < numArgs; i++) {
targetRows.clearAll();
vector_size_t offset = baseOffset;
rows.applyToSelected([&](vector_size_t row) {
targetRows.setValid(offset + i, true);
toSourceRow[offset + i] = row;
offset += numArgs;
});

for (int i = 1; i < numArgs; i++) {
for (auto& range : ranges) {
++range.targetIndex;
targetRows.updateBounds();
elementsResult->copy(args[i].get(), targetRows, toSourceRow.data());
}
elementsResult->copyRanges(args[i].get(), ranges);
}
}
}
Expand Down
94 changes: 94 additions & 0 deletions velox/functions/prestosql/benchmarks/ArrayConstructorBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/Benchmark.h>
#include <folly/init/Init.h>

#include "velox/benchmarks/ExpressionBenchmarkBuilder.h"
#include "velox/functions/lib/LambdaFunctionUtil.h"
#include "velox/functions/lib/benchmarks/FunctionBenchmarkBase.h"
#include "velox/functions/prestosql/ArrayFunctions.h"
#include "velox/functions/prestosql/registration/RegistrationFunctions.h"

using namespace facebook::velox;
using namespace facebook::velox::exec;
using namespace facebook::velox::functions;

// Benchmark driver for array_constructor. For each of INTEGER,
// ROW(INTEGER, DOUBLE), ARRAY(INTEGER) and MAP(INTEGER, REAL) it registers
// two benchmark sets (inputs with nulls and without nulls), each evaluating
// array_constructor over one, two and three arguments as well as over inputs
// that include an all-null constant or a non-null constant column.
int main(int argc, char** argv) {
  folly::init(&argc, &argv);
  functions::prestosql::registerArrayFunctions();

  ExpressionBenchmarkBuilder benchmarkBuilder;
  auto* pool = benchmarkBuilder.pool();
  auto& vm = benchmarkBuilder.vectorMaker();

  // Registers a single benchmark set for 'type'. The input row has three
  // fuzzed flat columns (c0, c1, c2), a null constant column (n) and a
  // constant column (c) wrapping row 0 of 'constantInput'.
  auto createSet = [&](const TypePtr& type,
                       bool withNulls,
                       const VectorPtr& constantInput) {
    VectorFuzzer::Options fuzzerOptions;
    fuzzerOptions.vectorSize = 1'000;
    fuzzerOptions.nullRatio = withNulls ? 0.2 : 0.0;
    VectorFuzzer fuzzer(fuzzerOptions, pool);

    std::vector<VectorPtr> columns;
    // c0, c1, c2.
    for (int i = 0; i < 3; ++i) {
      columns.push_back(fuzzer.fuzzFlat(type));
    }
    // n.
    columns.push_back(
        BaseVector::createNullConstant(type, fuzzerOptions.vectorSize, pool));
    // c.
    columns.push_back(BaseVector::wrapInConstant(
        fuzzerOptions.vectorSize, 0, constantInput));

    auto input = vm.rowVector({"c0", "c1", "c2", "n", "c"}, columns);

    const char* nullsLabel = withNulls ? "nulls" : "nullfree";
    benchmarkBuilder
        .addBenchmarkSet(
            fmt::format(
                "array_constructor_{}_{}",
                mapTypeKindToName(type->kind()),
                nullsLabel),
            input)
        .addExpression("1", "array_constructor(c0)")
        .addExpression("2", "array_constructor(c0, c1)")
        .addExpression("3", "array_constructor(c0, c1, c2)")
        .addExpression("2_null", "array_constructor(c0, c1, n)")
        .addExpression("2_const", "array_constructor(c0, c1, c)");
  };

  // Registers the with-nulls set followed by the null-free set for 'type'.
  auto createSetsFor = [&](const TypePtr& type, const VectorPtr& constant) {
    createSet(type, true, constant);
    createSet(type, false, constant);
  };

  auto constantInteger = BaseVector::createConstant(INTEGER(), 11, 1, pool);
  createSetsFor(INTEGER(), constantInteger);

  auto constantRow = vm.rowVector({
      BaseVector::createConstant(INTEGER(), 11, 1, pool),
      BaseVector::createConstant(DOUBLE(), 1.23, 1, pool),
  });
  createSetsFor(ROW({INTEGER(), DOUBLE()}), constantRow);

  auto constantArray = vm.arrayVector<int32_t>({{1, 2, 3, 4, 5}});
  createSetsFor(ARRAY(INTEGER()), constantArray);

  auto constantMap = vm.mapVector<int32_t, float>({{{1, 1.23}, {2, 2.34}}});
  createSetsFor(MAP(INTEGER(), REAL()), constantMap);

  benchmarkBuilder.registerBenchmarks();
  folly::runBenchmarks();
  return 0;
}
35 changes: 20 additions & 15 deletions velox/vector/ComplexVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,17 +231,19 @@ void RowVector::copy(
[&](auto row) { rawMappedIndices[row] = indices[toSourceRow[row]]; });
}

auto baseSource = decodedSource.base()->as<RowVector>();
for (auto i = 0; i < childrenSize_; ++i) {
if (baseSource->childAt(i)) {
BaseVector::ensureWritable(
rows, type()->asRow().childAt(i), pool(), children_[i]);
children_[i]->copy(
baseSource->childAt(i)->loadedVector(),
nonNullRows,
rawMappedIndices ? rawMappedIndices : indices);
} else {
children_[i].reset();
if (source->typeKind() != TypeKind::UNKNOWN) {
auto baseSource = decodedSource.base()->as<RowVector>();
for (auto i = 0; i < childrenSize_; ++i) {
if (baseSource->childAt(i)) {
BaseVector::ensureWritable(
rows, type()->asRow().childAt(i), pool(), children_[i]);
children_[i]->copy(
baseSource->childAt(i)->loadedVector(),
nonNullRows,
rawMappedIndices ? rawMappedIndices : indices);
} else {
children_[i].reset();
}
}
}
}
Expand Down Expand Up @@ -319,10 +321,13 @@ void RowVector::copyRanges(
}
}
}
auto* rowSource = decoded.base()->as<RowVector>();
for (int i = 0; i < children_.size(); ++i) {
children_[i]->copyRanges(
rowSource->childAt(i)->loadedVector(), baseRanges);

if (source->typeKind() != TypeKind::UNKNOWN) {
auto* rowSource = decoded.base()->as<RowVector>();
for (int i = 0; i < children_.size(); ++i) {
children_[i]->copyRanges(
rowSource->childAt(i)->loadedVector(), baseRanges);
}
}
}
}
Expand Down
124 changes: 87 additions & 37 deletions velox/vector/FlatVector-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,19 +218,35 @@ void FlatVector<T>::copyValuesAndNulls(
}

template <typename T>
void FlatVector<T>::copyValuesAndNulls(
void FlatVector<T>::copyRanges(
const BaseVector* source,
vector_size_t targetIndex,
vector_size_t sourceIndex,
vector_size_t count) {
if (count == 0) {
return;
const folly::Range<const BaseVector::CopyRange*>& ranges) {
if constexpr (std::is_same_v<T, StringView>) {
auto leaf =
source->wrappedVector()->asUnchecked<SimpleVector<StringView>>();
if (BaseVector::pool_ != leaf->pool()) {
for (const auto& r : ranges) {
for (auto i = 0; i < r.count; ++i) {
if (source->isNullAt(r.sourceIndex + i)) {
this->setNull(r.targetIndex + i, true);
} else {
this->set(
r.targetIndex + i,
leaf->valueAt(source->wrappedIndex(r.sourceIndex + i)));
}
}
}
return;
}

// We copy referencing the storage of 'source'.
acquireSharedStringBuffers(source);
}

source = source->loadedVector();
VELOX_CHECK(
BaseVector::compatibleKind(BaseVector::typeKind(), source->typeKind()));
VELOX_CHECK_GE(source->size(), sourceIndex + count);
VELOX_CHECK_GE(BaseVector::length_, targetIndex + count);

const uint64_t* sourceNulls = source->rawNulls();
uint64_t* rawNulls = const_cast<uint64_t*>(BaseVector::rawNulls_);
if (source->mayHaveNulls()) {
Expand All @@ -251,51 +267,85 @@ void FlatVector<T>::copyValuesAndNulls(
source->size());
} else if (source->typeKind() != TypeKind::UNKNOWN) {
auto flat = source->asUnchecked<FlatVector<T>>();
if (Buffer::is_pod_like_v<T>) {
memcpy(
&rawValues_[targetIndex],
&flat->rawValues()[sourceIndex],
count * sizeof(T));
} else {
const T* srcValues = flat->rawValues();
std::copy(
srcValues + sourceIndex,
srcValues + sourceIndex + count,
rawValues_ + targetIndex);
for (const auto& range : ranges) {
if (Buffer::is_pod_like_v<T>) {
memcpy(
&rawValues_[range.targetIndex],
&flat->rawValues()[range.sourceIndex],
range.count * sizeof(T));
} else {
const T* srcValues = flat->rawValues();
std::copy(
srcValues + range.sourceIndex,
srcValues + range.sourceIndex + range.count,
rawValues_ + range.targetIndex);
}
}
}

if (rawNulls) {
if (sourceNulls) {
bits::copyBits(sourceNulls, sourceIndex, rawNulls, targetIndex, count);
for (const auto& range : ranges) {
bits::copyBits(
sourceNulls,
range.sourceIndex,
rawNulls,
range.targetIndex,
range.count);
}
} else {
bits::fillBits(
rawNulls, targetIndex, targetIndex + count, bits::kNotNull);
for (const auto& range : ranges) {
bits::fillBits(
rawNulls,
range.targetIndex,
range.targetIndex + range.count,
bits::kNotNull);
}
}
}
} else if (source->isConstantEncoding()) {
if (source->isNullAt(0)) {
bits::fillBits(rawNulls, targetIndex, targetIndex + count, bits::kNull);
for (const auto& range : ranges) {
bits::fillBits(
rawNulls,
range.targetIndex,
range.targetIndex + range.count,
bits::kNull);
}
return;
}
auto constant = source->asUnchecked<ConstantVector<T>>();
T value = constant->valueAt(0);
for (auto row = targetIndex; row < targetIndex + count; ++row) {
rawValues_[row] = value;
}
if (rawNulls) {
bits::fillBits(
rawNulls, targetIndex, targetIndex + count, bits::kNotNull);
for (const auto& range : ranges) {
for (auto i = 0; i < range.count; ++i) {
rawValues_[range.targetIndex + i] = value;
}
if (rawNulls) {
bits::fillBits(
rawNulls,
range.targetIndex,
range.targetIndex + range.count,
bits::kNotNull);
}
}
} else {
auto sourceVector = source->asUnchecked<SimpleVector<T>>();
for (int32_t i = 0; i < count; ++i) {
if (!source->isNullAt(sourceIndex + i)) {
rawValues_[targetIndex + i] = sourceVector->valueAt(sourceIndex + i);
if (rawNulls) {
bits::clearNull(rawNulls, targetIndex + i);
auto sourceVector = source->typeKind() != TypeKind::UNKNOWN
? source->asUnchecked<SimpleVector<T>>()
: nullptr;
for (const auto& range : ranges) {
for (auto i = 0; i < range.count; ++i) {
auto row = range.targetIndex + i;
auto sourceRow = range.sourceIndex + i;
if (!source->isNullAt(sourceRow)) {
if (sourceVector) {
rawValues_[row] = sourceVector->valueAt(sourceRow);
}
if (rawNulls) {
bits::clearNull(rawNulls, row);
}
} else {
bits::setNull(rawNulls, row);
}
} else {
bits::setNull(rawNulls, targetIndex + i);
}
}
}
Expand Down
Loading

0 comments on commit 5d2e8fc

Please sign in to comment.