Skip to content

Commit

Permalink
Optimize comparison for collation UTF8_BIN and UTF8MB4_BIN (#5299)
Browse files Browse the repository at this point in the history
ref #5294
  • Loading branch information
solotzg authored Jul 7, 2022
1 parent 5295223 commit 97342db
Show file tree
Hide file tree
Showing 5 changed files with 287 additions and 30 deletions.
3 changes: 2 additions & 1 deletion dbms/src/Columns/ColumnConst.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@ class ColumnConst final : public COWPtrHelper<IColumn, ColumnConst>
// Returns the constant stored in this column as a value of type T.
// The value is read from the Field produced by getField() through
// safeGet<NearestFieldType<T>::Type>() and then converted to T by the return.
template <typename T>
T getValue() const
{
    // Keep the Field alive in a local and move the stored value out of it,
    // so that a heavy payload is not copied a second time on return.
    // NOTE(review): this assumes getField() returns a Field by value - confirm.
    auto && tmp = getField();
    return std::move(tmp.safeGet<typename NearestFieldType<T>::Type>());
}
};

Expand Down
210 changes: 210 additions & 0 deletions dbms/src/Functions/CollationOperatorOptimized.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
// Copyright 2022 PingCAP, Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <Columns/ColumnString.h>
#include <Core/AccurateComparison.h>
#include <Functions/StringUtil.h>
#include <common/StringRef.h>
#include <common/defines.h>

#include <cstddef>
#include <string_view>


namespace DB
{

// Sign of `val` as an int: 1 when `0 < val`, -1 when `val < 0`, otherwise 0.
template <typename T>
ALWAYS_INLINE inline int signum(T val)
{
    const int is_positive = (0 < val) ? 1 : 0;
    const int is_negative = (val < 0) ? 1 : 0;
    return is_positive - is_negative;
}

// Equality-only comparison of two raw byte strings.
// Checking for equality is much cheaper than producing a full ordering
// (per the original note, the size is checked first).
// Returns 0 when both views are equal and 1 otherwise, so the result can be
// compared against 0 in the same way as a three-way comparison result.
__attribute__((flatten, always_inline, pure)) inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs)
{
    const bool same = StringRef(lhs) == StringRef(rhs);
    return same ? 0 : 1;
}

// Three-way byte-wise comparison of two string views.
// The result of `std::string_view::compare` is normalized with `signum`,
// so the function returns exactly -1, 0 or 1.
__attribute__((flatten, always_inline, pure)) inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2)
{
    const int order = v1.compare(v2);
    return signum(order);
}

constexpr char SPACE = ' ';

// Strip trailing space characters from `v`.
// The returned view refers to the same buffer as `v`; an input made only of
// spaces yields a default-constructed (empty) view.
__attribute__((flatten, always_inline, pure)) inline std::string_view RightTrim(const std::string_view & v)
{
    // Expected common case: empty input or no trailing space, nothing to strip.
    if (likely(v.empty() || v.back() != SPACE))
        return v;

    const size_t last_kept = v.find_last_not_of(SPACE);
    if (last_kept == std::string_view::npos)
        return std::string_view{};
    return std::string_view(v.data(), last_kept + 1);
}

// Three-way comparison that ignores trailing spaces: both views are
// right-trimmed and then compared with `RawStrCompare` (result is -1, 0 or 1).
__attribute__((flatten, always_inline, pure)) inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb)
{
    const std::string_view lhs = RightTrim(va);
    const std::string_view rhs = RightTrim(vb);
    return RawStrCompare(lhs, rhs);
}

// Trait: when `value` is true, the operator `T` only needs to know whether
// its operands are equal or not, so no ordering has to be computed.
// Primary template: `value` is false for every type that has no dedicated
// specialization.
template <typename T>
struct IsEqualRelated
{
    static constexpr bool value = false;
};

// Specializations for the equality-style operators: every instantiation of
// `DB::EqualsOp` and of `DB::NotEqualsOp` reports `value == true`.
template <typename... Args>
struct IsEqualRelated<DB::EqualsOp<Args...>>
{
    static constexpr bool value = true;
};

template <typename... Args>
struct IsEqualRelated<DB::NotEqualsOp<Args...>>
{
    static constexpr bool value = true;
};

// Walk two string columns in lock-step and call `func(a_value, b_value, row)`
// for every row in [0, size).
// Each value is passed as a (pointer, length) pair built from the chars /
// offsets arrays; the length is the stored size minus one.
// NOTE(review): the dropped byte is presumably a terminating zero - confirm.
template <typename F>
__attribute__((flatten, always_inline)) inline void LoopTwoColumns(
    const ColumnString::Chars_t & a_data,
    const ColumnString::Offsets & a_offsets,
    const ColumnString::Chars_t & b_data,
    const ColumnString::Offsets & b_offsets,
    size_t size,
    F && func)
{
    for (size_t row = 0; row < size; ++row)
    {
        const auto * lhs_begin = reinterpret_cast<const char *>(&a_data[StringUtil::offsetAt(a_offsets, row)]);
        const auto * rhs_begin = reinterpret_cast<const char *>(&b_data[StringUtil::offsetAt(b_offsets, row)]);
        const size_t lhs_len = StringUtil::sizeAt(a_offsets, row) - 1;
        const size_t rhs_len = StringUtil::sizeAt(b_offsets, row) - 1;

        func({lhs_begin, lhs_len}, {rhs_begin, rhs_len}, row);
    }
}

// Walk one string column and call `func(value, row)` for every row in
// [0, size).
// The value is passed as a (pointer, length) pair built from the chars /
// offsets arrays; the length is the stored size minus one.
// NOTE(review): the dropped byte is presumably a terminating zero - confirm.
template <typename F>
__attribute__((flatten, always_inline)) inline void LoopOneColumn(
    const ColumnString::Chars_t & a_data,
    const ColumnString::Offsets & a_offsets,
    size_t size,
    F && func)
{
    for (size_t row = 0; row < size; ++row)
    {
        const auto * begin = reinterpret_cast<const char *>(&a_data[StringUtil::offsetAt(a_offsets, row)]);
        const size_t len = StringUtil::sizeAt(a_offsets, row) - 1;

        func({begin, len}, row);
    }
}

// Fast path for comparing a string column with another string column.
// - Only the collators UTF8MB4_BIN and UTF8_BIN are handled here.
// - Both operands of every row are right-trimmed (trailing spaces removed)
//   before they are compared.
// - If Op is `EqualsOp` or `NotEqualsOp`, the cheaper equality-only comparison
//   is used; otherwise a three-way comparison is performed.
// For each of the `a_offsets.size()` rows, `c[row]` receives
// `Op::apply(comparison_result, 0)`.
// Returns true when the rows were processed here; returns false, leaving `c`
// untouched, for any other collator.
template <typename Op, typename Result>
ALWAYS_INLINE inline bool StringVectorStringVector(
    const ColumnString::Chars_t & a_data,
    const ColumnString::Offsets & a_offsets,
    const ColumnString::Chars_t & b_data,
    const ColumnString::Offsets & b_offsets,
    const TiDB::TiDBCollatorPtr & collator,
    Result & c)
{
    switch (collator->getCollatorId())
    {
    case TiDB::ITiDBCollator::UTF8MB4_BIN:
    case TiDB::ITiDBCollator::UTF8_BIN:
    {
        const size_t rows = a_offsets.size();

        auto compare_row = [&c](const std::string_view & lhs, const std::string_view & rhs, size_t row) {
            if constexpr (IsEqualRelated<Op>::value)
                c[row] = Op::apply(RawStrEqualCompare(RightTrim(lhs), RightTrim(rhs)), 0);
            else
                c[row] = Op::apply(RtrimStrCompare(lhs, rhs), 0);
        };
        LoopTwoColumns(a_data, a_offsets, b_data, b_offsets, rows, compare_row);

        return true;
    }
    default:
        return false;
    }
}

// Fast path for comparing a string column with a constant string.
// - Only the collators UTF8MB4_BIN and UTF8_BIN are handled here.
// - The constant is right-trimmed once, before the loop.
// - Every column value is right-trimmed (trailing spaces removed) before it
//   is compared with the trimmed constant.
// - If Op is `EqualsOp` or `NotEqualsOp`, the cheaper equality-only comparison
//   is used; otherwise a three-way comparison is performed.
// For each of the `a_offsets.size()` rows, `c[row]` receives
// `Op::apply(comparison_result, 0)`.
// Returns true when the rows were processed here; returns false, leaving `c`
// untouched, for any other collator.
template <typename Op, typename Result>
ALWAYS_INLINE inline bool StringVectorConstant(
    const ColumnString::Chars_t & a_data,
    const ColumnString::Offsets & a_offsets,
    const std::string_view & b,
    const TiDB::TiDBCollatorPtr & collator,
    Result & c)
{
    switch (collator->getCollatorId())
    {
    case TiDB::ITiDBCollator::UTF8MB4_BIN:
    case TiDB::ITiDBCollator::UTF8_BIN:
    {
        const size_t rows = a_offsets.size();

        // Trim the constant a single time instead of once per row.
        const std::string_view trimmed_const = RightTrim(b);

        auto compare_row = [&c, &trimmed_const](const std::string_view & value, size_t row) {
            if constexpr (IsEqualRelated<Op>::value)
                c[row] = Op::apply(RawStrEqualCompare(RightTrim(value), trimmed_const), 0);
            else
                c[row] = Op::apply(RawStrCompare(RightTrim(value), trimmed_const), 0);
        };
        LoopOneColumn(a_data, a_offsets, rows, compare_row);

        return true;
    }
    default:
        return false;
    }
}

} // namespace DB
54 changes: 45 additions & 9 deletions dbms/src/Functions/FunctionsComparison.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/CollationOperatorOptimized.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/FunctionsLogical.h>
#include <Functions/IFunction.h>
Expand Down Expand Up @@ -301,6 +302,12 @@ struct StringComparisonWithCollatorImpl
const TiDB::TiDBCollatorPtr & collator,
PaddedPODArray<ResultType> & c)
{
bool optimized_path = StringVectorStringVector<Op>(a_data, a_offsets, b_data, b_offsets, collator, c);
if (optimized_path)
{
return;
}

size_t size = a_offsets.size();

for (size_t i = 0; i < size; ++i)
Expand All @@ -317,10 +324,17 @@ struct StringComparisonWithCollatorImpl
static void NO_INLINE stringVectorConstant(
const ColumnString::Chars_t & a_data,
const ColumnString::Offsets & a_offsets,
const std::string & b,
const std::string_view & b,
const TiDB::TiDBCollatorPtr & collator,
PaddedPODArray<ResultType> & c)
{
bool optimized_path = StringVectorConstant<Op>(a_data, a_offsets, b, collator, c);

if (optimized_path)
{
return;
}

size_t size = a_offsets.size();
ColumnString::Offset b_size = b.size();
const char * b_data = reinterpret_cast<const char *>(b.data());
Expand All @@ -332,7 +346,7 @@ struct StringComparisonWithCollatorImpl
}

static void constantStringVector(
const std::string & a,
const std::string_view & a,
const ColumnString::Chars_t & b_data,
const ColumnString::Offsets & b_offsets,
const TiDB::TiDBCollatorPtr & collator,
Expand All @@ -342,8 +356,8 @@ struct StringComparisonWithCollatorImpl
}

static void constantConstant(
const std::string & a,
const std::string & b,
const std::string_view & a,
const std::string_view & b,
const TiDB::TiDBCollatorPtr & collator,
ResultType & c)
{
Expand Down Expand Up @@ -706,6 +720,25 @@ class FunctionComparison : public IFunction
}
}

// Build a string view over row 0 of the data column wrapped by `c0_const`.
// - Returns an empty view when `c0_const` is null.
// - Throws Exception(ILLEGAL_COLUMN) when the wrapped column is neither a
//   ColumnString nor a ColumnFixedString.
// NOTE(review): the view presumably aliases memory owned by the column, so it
// should not outlive it - confirm against `getDataAt`.
static inline std::string_view genConstStrRef(const ColumnConst * c0_const)
{
    if (!c0_const)
        return std::string_view{};

    const auto & nested = c0_const->getDataColumn();

    if (const auto * as_string = checkAndGetColumn<ColumnString>(&nested); as_string)
        return std::string_view(as_string->getDataAt(0));

    if (const auto * as_fixed_string = checkAndGetColumn<ColumnFixedString>(&nested); as_fixed_string)
        return std::string_view(as_fixed_string->getDataAt(0));

    throw Exception("Logical error: ColumnConst contains not String nor FixedString column", ErrorCodes::ILLEGAL_COLUMN);
}

template <typename ResultColumnType>
bool executeStringWithCollator(
Block & block,
Expand All @@ -720,10 +753,13 @@ class FunctionComparison : public IFunction
using ResultType = typename ResultColumnType::value_type;
using StringImpl = StringComparisonWithCollatorImpl<Op<int, int>, ResultType>;

std::string_view c0_const_str_ref = genConstStrRef(c0_const);
std::string_view c1_const_str_ref = genConstStrRef(c1_const);

if (c0_const && c1_const)
{
ResultType res = 0;
StringImpl::constantConstant(c0_const->getValue<String>(), c1_const->getValue<String>(), collator, res);
StringImpl::constantConstant(c0_const_str_ref, c1_const_str_ref, collator, res);
block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(c0_const->size(), toField(res));
return true;
}
Expand All @@ -745,12 +781,12 @@ class FunctionComparison : public IFunction
StringImpl::stringVectorConstant(
c0_string->getChars(),
c0_string->getOffsets(),
c1_const->getValue<String>(),
c1_const_str_ref,
collator,
c_res->getData());
else if (c0_const && c1_string)
StringImpl::constantStringVector(
c0_const->getValue<String>(),
c0_const_str_ref,
c1_string->getChars(),
c1_string->getOffsets(),
collator,
Expand All @@ -770,8 +806,8 @@ class FunctionComparison : public IFunction
template <typename ReturnColumnType = ColumnUInt8>
bool executeString(Block & block, size_t result, const IColumn * c0, const IColumn * c1) const
{
const ColumnString * c0_string = checkAndGetColumn<ColumnString>(c0);
const ColumnString * c1_string = checkAndGetColumn<ColumnString>(c1);
const auto * c0_string = checkAndGetColumn<ColumnString>(c0);
const auto * c1_string = checkAndGetColumn<ColumnString>(c1);
const ColumnConst * c0_const = checkAndGetColumnConstStringOrFixedString(c0);
const ColumnConst * c1_const = checkAndGetColumnConstStringOrFixedString(c1);

Expand Down
Loading

0 comments on commit 97342db

Please sign in to comment.