Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize string sort for default collation UTF8MB4_BIN #5375

Merged
merged 19 commits into from
Jul 21, 2022
Merged
137 changes: 99 additions & 38 deletions dbms/src/Columns/ColumnString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <Columns/ColumnsCommon.h>
#include <Common/HashTable/Hash.h>
#include <DataStreams/ColumnGathererStream.h>
#include <Storages/Transaction/CollatorUtils.h>
#include <fmt/core.h>

/// Used in the `reserve` method, when the number of rows is known, but sizes of elements are not.
Expand Down Expand Up @@ -320,53 +321,94 @@ int ColumnString::compareAtWithCollationImpl(size_t n, size_t m, const IColumn &
rhs.sizeAt(m));
}


template <bool positive>
struct ColumnString::lessWithCollation
// `Derived` must provide `int compare(const char *, size_t, const char *, size_t)`, callable on a const `Derived` (a static member function also works).
template <bool positive, typename Derived>
struct ColumnString::LessWithCollation
{
const ColumnString & parent;
const ICollator & collator;
const Derived & inner;

lessWithCollation(const ColumnString & parent_, const ICollator & collator_)
LessWithCollation(const ColumnString & parent_, const Derived & inner_)
: parent(parent_)
, collator(collator_)
, inner(inner_)
{}

bool operator()(size_t lhs, size_t rhs) const
FLATTEN_INLINE_PURE inline bool operator()(size_t lhs, size_t rhs) const
{
int res = collator.compare(
int res = inner.compare(
reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(lhs)]),
parent.sizeAt(lhs),
parent.sizeAt(lhs) - 1, // remove tail '\0'
reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(rhs)]),
parent.sizeAt(rhs));
parent.sizeAt(rhs) - 1) // remove tail '\0'
;

return positive ? (res < 0) : (res > 0);
if constexpr (positive)
{
return (res < 0);
}
else
{
return (res > 0);
}
}
};

void ColumnString::getPermutationWithCollationImpl(const ICollator & collator, bool reverse, size_t limit, Permutation & res) const
struct Utf8MB4BinCmp
{
size_t s = offsets.size();
res.resize(s);
for (size_t i = 0; i < s; ++i)
res[i] = i;

if (limit >= s)
limit = 0;
static FLATTEN_INLINE_PURE inline int compare(const char * s1, size_t length1, const char * s2, size_t length2)
{
return DB::BinCollatorCompare<true>(s1, length1, s2, length2);
}
};

if (limit)
// common util functions
template <>
struct ColumnString::LessWithCollation<false, void>
{
// `CollationCmpImpl` must implement function `int compare(const char *, size_t, const char *, size_t)`.
template <typename CollationCmpImpl>
static void getPermutationWithCollationImpl(const ColumnString & src, const CollationCmpImpl & collator_cmp_impl, bool reverse, size_t limit, Permutation & res)
{
if (reverse)
std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<false>(*this, collator));
size_t s = src.offsets.size();
res.resize(s);
for (size_t i = 0; i < s; ++i)
res[i] = i;

if (limit >= s)
limit = 0;

if (limit)
{
if (reverse)
std::partial_sort(res.begin(), res.begin() + limit, res.end(), LessWithCollation<false, CollationCmpImpl>(src, collator_cmp_impl));
else
std::partial_sort(res.begin(), res.begin() + limit, res.end(), LessWithCollation<true, CollationCmpImpl>(src, collator_cmp_impl));
}
else
std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<true>(*this, collator));
{
if (reverse)
std::sort(res.begin(), res.end(), LessWithCollation<false, CollationCmpImpl>(src, collator_cmp_impl));
else
std::sort(res.begin(), res.end(), LessWithCollation<true, CollationCmpImpl>(src, collator_cmp_impl));
}
}
else
};

void ColumnString::getPermutationWithCollationImpl(const ICollator & collator, bool reverse, size_t limit, Permutation & res) const
{
using PermutationWithCollationUtils = ColumnString::LessWithCollation<false, void>;

// Optimized path for the default collator `UTF8MB4_BIN`: sort with `Utf8MB4BinCmp` instead of going through the `ICollator` interface.
if (TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_BIN) == &collator)
{
if (reverse)
std::sort(res.begin(), res.end(), lessWithCollation<false>(*this, collator));
else
std::sort(res.begin(), res.end(), lessWithCollation<true>(*this, collator));
Utf8MB4BinCmp cmp_impl;
PermutationWithCollationUtils::getPermutationWithCollationImpl(*this, cmp_impl, reverse, limit, res);
///
solotzg marked this conversation as resolved.
Show resolved Hide resolved
return;
}

{
PermutationWithCollationUtils::getPermutationWithCollationImpl(*this, collator, reverse, limit, res);
}
}

Expand All @@ -383,24 +425,43 @@ void ColumnString::updateWeakHash32(WeakHash32 & hash, const TiDB::TiDBCollatorP

if (collator != nullptr)
{
for (const auto & offset : offsets)
if (collator->canUseFastPath())
{
auto str_size = offset - prev_offset;
/// Skip last zero byte.
auto sort_key = collator->sortKey(reinterpret_cast<const char *>(pos), str_size - 1, sort_key_container);
*hash_data = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(sort_key.data), sort_key.size, *hash_data);

pos += str_size;
prev_offset = offset;
++hash_data;
for (const auto & offset : offsets)
{
auto str_size = offset - prev_offset;

// Skip last zero byte.
auto sort_key = collator->fastPathSortKey(reinterpret_cast<const char *>(pos), str_size - 1);
*hash_data = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(sort_key.data), sort_key.size, *hash_data);

pos += str_size;
prev_offset = offset;
++hash_data;
}
}
else
{
for (const auto & offset : offsets)
{
auto str_size = offset - prev_offset;

// Skip last zero byte.
auto sort_key = collator->sortKeyIndirect(reinterpret_cast<const char *>(pos), str_size - 1, sort_key_container);
*hash_data = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(sort_key.data), sort_key.size, *hash_data);

pos += str_size;
prev_offset = offset;
++hash_data;
}
}
}
else
{
for (const auto & offset : offsets)
{
auto str_size = offset - prev_offset;
/// Skip last zero byte.
// Skip last zero byte.
*hash_data = ::updateWeakHash32(pos, str_size - 1, *hash_data);

pos += str_size;
Expand Down
82 changes: 48 additions & 34 deletions dbms/src/Columns/ColumnString.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <Common/PODArray.h>
#include <Common/SipHash.h>
#include <Common/memcpySmall.h>
#include <Storages/Transaction/Collator.h>
#include <string.h>


Expand Down Expand Up @@ -52,8 +53,8 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>
template <bool positive>
struct less;

template <bool positive>
struct lessWithCollation;
template <bool positive, typename Derived>
struct LessWithCollation;

ColumnString() = default;

Expand Down Expand Up @@ -118,7 +119,7 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>

void insert(const Field & x) override
{
const String & s = DB::get<const String &>(x);
const auto & s = DB::get<const String &>(x);
const size_t old_size = chars.size();
const size_t size_to_append = s.size() + 1;
const size_t new_size = old_size + size_to_append;
Expand All @@ -134,7 +135,7 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>

void insertFrom(const IColumn & src_, size_t n) override
{
const ColumnString & src = static_cast<const ColumnString &>(src_);
const auto & src = static_cast<const ColumnString &>(src_);

if (n != 0)
{
Expand Down Expand Up @@ -207,21 +208,20 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>
{
size_t string_size = sizeAt(n);
size_t offset = offsetAt(n);
const void * src = &chars[offset];

StringRef res;

if (collator != nullptr)
{
/// Skip last zero byte.
auto sort_key = collator->sortKey(reinterpret_cast<const char *>(src), string_size - 1, sort_key_container);
string_size = sort_key.size;
src = sort_key.data;
}
StringRef sort_key{reinterpret_cast<const char *>(&chars[offset]), string_size};

// Skip last zero byte.
collator->sortKeyNullable(sort_key.data, sort_key.size - 1, sort_key_container, sort_key);

string_size = sort_key.size;

res.size = sizeof(string_size) + string_size;
char * pos = arena.allocContinue(res.size, begin);
memcpy(pos, &string_size, sizeof(string_size));
memcpy(pos + sizeof(string_size), src, string_size);
memcpy(pos + sizeof(string_size), sort_key.data, string_size);
res.data = pos;
return res;
}
Expand All @@ -244,33 +244,47 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>
{
size_t string_size = sizeAt(n);
size_t offset = offsetAt(n);
if (collator != nullptr)
{
auto sort_key = collator->sortKey(reinterpret_cast<const char *>(&chars[offset]), string_size, sort_key_container);
string_size = sort_key.size;
hash.update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
hash.update(sort_key.data, sort_key.size);
}
else
{
hash.update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
hash.update(reinterpret_cast<const char *>(&chars[offset]), string_size);
}

StringRef sort_key{reinterpret_cast<const char *>(&chars[offset]), string_size};

// Skip last zero byte.
collator->sortKeyNullable(sort_key.data, sort_key.size - 1, sort_key_container, sort_key);
string_size = sort_key.size;
hash.update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
hash.update(sort_key.data, sort_key.size);
}

void updateHashWithValues(IColumn::HashValues & hash_values, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const override
{
if (collator != nullptr)
{
for (size_t i = 0; i < offsets.size(); ++i)
if (collator->canUseFastPath())
{
size_t string_size = sizeAt(i);
size_t offset = offsetAt(i);

auto sort_key = collator->sortKey(reinterpret_cast<const char *>(&chars[offset]), string_size, sort_key_container);
string_size = sort_key.size;
hash_values[i].update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
hash_values[i].update(sort_key.data, sort_key.size);
for (size_t i = 0; i < offsets.size(); ++i)
{
size_t string_size = sizeAt(i);
size_t offset = offsetAt(i);

// Skip last zero byte.
auto sort_key = collator->fastPathSortKey(reinterpret_cast<const char *>(&chars[offset]), string_size - 1);
string_size = sort_key.size;
hash_values[i].update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
hash_values[i].update(sort_key.data, sort_key.size);
}
}
else
{
for (size_t i = 0; i < offsets.size(); ++i)
{
size_t string_size = sizeAt(i);
size_t offset = offsetAt(i);

// Skip last zero byte.
auto sort_key = collator->sortKeyIndirect(reinterpret_cast<const char *>(&chars[offset]), string_size - 1, sort_key_container);
string_size = sort_key.size;
hash_values[i].update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
hash_values[i].update(sort_key.data, sort_key.size);
}
}
}
else
Expand Down Expand Up @@ -302,7 +316,7 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>

int compareAt(size_t n, size_t m, const IColumn & rhs_, int /*nan_direction_hint*/) const override
{
const ColumnString & rhs = static_cast<const ColumnString &>(rhs_);
const auto & rhs = static_cast<const ColumnString &>(rhs_);

const size_t size = sizeAt(n);
const size_t rhs_size = rhs.sizeAt(m);
Expand Down
26 changes: 25 additions & 1 deletion dbms/src/Columns/IColumn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include <Columns/IColumn.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>

#include <Storages/Transaction/Collator.h>

namespace DB
{
Expand All @@ -34,4 +34,28 @@ String IColumn::dumpStructure() const
return res.str();
}

/// Collator-free convenience overload of `serializeValueIntoArena`.
/// Forwards `n`, `arena` and `begin` unchanged to the five-argument overload, passing
/// `nullptr` as the collator argument and `TiDB::dummy_sort_key_contaner` as the
/// sort-key container argument, and returns that overload's result.
// NOTE(review): presumably the dummy container is never written to when the collator is null — confirm against the implementations.
StringRef IColumn::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{
return serializeValueIntoArena(n, arena, begin, nullptr, TiDB::dummy_sort_key_contaner);
}
/// Collator-free convenience overload of `updateHashWithValue`.
/// Forwards `n` and `hash` unchanged to the four-argument overload, passing `nullptr`
/// as the collator argument and `TiDB::dummy_sort_key_contaner` as the sort-key
/// container argument.
// NOTE(review): presumably the dummy container is never written to when the collator is null — confirm against the implementations.
void IColumn::updateHashWithValue(size_t n, SipHash & hash) const
{
updateHashWithValue(n, hash, nullptr, TiDB::dummy_sort_key_contaner);
}

/// Collator-free convenience overload of `updateHashWithValues`.
/// Forwards `hash_values` unchanged to the three-argument overload, passing `nullptr`
/// as the collator argument and `TiDB::dummy_sort_key_contaner` as the sort-key
/// container argument.
// NOTE(review): presumably the dummy container is never written to when the collator is null — confirm against the implementations.
void IColumn::updateHashWithValues(HashValues & hash_values) const
{
updateHashWithValues(hash_values, nullptr, TiDB::dummy_sort_key_contaner);
}

/// Collator-free convenience overload of `updateWeakHash32`.
/// Forwards `hash` unchanged to the three-argument overload, passing `nullptr` as the
/// collator argument and `TiDB::dummy_sort_key_contaner` as the sort-key container
/// argument.
// NOTE(review): presumably the dummy container is never written to when the collator is null — confirm against the implementations.
void IColumn::updateWeakHash32(WeakHash32 & hash) const
{
updateWeakHash32(hash, nullptr, TiDB::dummy_sort_key_contaner);
}

/// Convenience overload of `deserializeAndInsertFromArena` taking only `pos`.
/// Forwards `pos` unchanged to the two-argument overload with `nullptr` as the second
/// argument, and returns that overload's result.
// NOTE(review): the second argument is presumably the collator, as in the sibling wrappers — confirm against the declaration in IColumn.h.
const char * IColumn::deserializeAndInsertFromArena(const char * pos)
{
return deserializeAndInsertFromArena(pos, nullptr);
}

} // namespace DB
Loading