From e938077ae3e9d75f8cd17410aad40766063f5d13 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Thu, 14 Jul 2022 13:39:08 +0800 Subject: [PATCH 01/15] optimize string sort with collation --- dbms/src/Columns/ColumnString.cpp | 85 +++++++++++++------ dbms/src/Columns/ColumnString.h | 2 +- .../Functions/CollationOperatorOptimized.h | 19 ++++- dbms/src/Storages/Transaction/Collator.cpp | 11 +-- 4 files changed, 80 insertions(+), 37 deletions(-) diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index 54d4238616f..352d80a8678 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include /// Used in the `reserve` method, when the number of rows is known, but sizes of elements are not. @@ -320,53 +321,87 @@ int ColumnString::compareAtWithCollationImpl(size_t n, size_t m, const IColumn & rhs.sizeAt(m)); } - -template +// Derived must implement function `int compare(const char *, size_t, const char *, size_t)`. +template struct ColumnString::lessWithCollation { const ColumnString & parent; - const ICollator & collator; + const Derived & inner; - lessWithCollation(const ColumnString & parent_, const ICollator & collator_) + lessWithCollation(const ColumnString & parent_, const Derived & inner_) : parent(parent_) - , collator(collator_) + , inner(inner_) {} bool operator()(size_t lhs, size_t rhs) const { - int res = collator.compare( + int res = inner.compare( reinterpret_cast(&parent.chars[parent.offsetAt(lhs)]), - parent.sizeAt(lhs), + parent.sizeAt(lhs) - 1, // remove tail '\0' reinterpret_cast(&parent.chars[parent.offsetAt(rhs)]), - parent.sizeAt(rhs)); + parent.sizeAt(rhs) - 1) // remove tail '\0' + ; return positive ? (res < 0) : (res > 0); } }; -void ColumnString::getPermutationWithCollationImpl(const ICollator & collator, bool reverse, size_t limit, Permutation & res) const +struct Utf8MB4BinCmp { - size_t s = offsets.size(); - res.resize(s); - for (size_t i = 0; i < s; ++i) - res[i] = i; - - if (limit >= s) - limit = 0; + static inline int compare(const char * s1, size_t length1, const char * s2, size_t length2) + { + return DB::BinCollatorCompare(s1, length1, s2, length2); + } +}; - if (limit) +// common util functions +template <> +struct ColumnString::lessWithCollation +{ + // `CollationCmpImpl` must implement function `int compare(const char *, size_t, const char *, size_t)`. + template + static void getPermutationWithCollationImpl(const ColumnString & src, const CollationCmpImpl & collator_cmp_impl, bool reverse, size_t limit, Permutation & res) { - if (reverse) - std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation(*this, collator)); + size_t s = src.offsets.size(); + res.resize(s); + for (size_t i = 0; i < s; ++i) + res[i] = i; + + if (limit >= s) + limit = 0; + + if (limit) + { + if (reverse) + std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation(src, collator_cmp_impl)); + else + std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation(src, collator_cmp_impl)); + } else - std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation(*this, collator)); + { + if (reverse) + std::sort(res.begin(), res.end(), lessWithCollation(src, collator_cmp_impl)); + else + std::sort(res.begin(), res.end(), lessWithCollation(src, collator_cmp_impl)); + } } - else +}; + +void ColumnString::getPermutationWithCollationImpl(const ICollator & collator, bool reverse, size_t limit, Permutation & res) const +{ + using PermutationWithCollationUtils = ColumnString::lessWithCollation; + + // optimize path for default collator `UTF8MB4_BIN` + if (&collator == TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_BIN)) { - if (reverse) - std::sort(res.begin(), res.end(), lessWithCollation(*this, collator)); - else - std::sort(res.begin(), res.end(), lessWithCollation(*this, collator)); + Utf8MB4BinCmp cmp_impl; + PermutationWithCollationUtils::getPermutationWithCollationImpl(*this, cmp_impl, reverse, limit, res); + /// + return; + } + + { + PermutationWithCollationUtils::getPermutationWithCollationImpl(*this, collator, reverse, limit, res); } } diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h index 48b02388a6c..c84847fa8e5 100644 --- a/dbms/src/Columns/ColumnString.h +++ b/dbms/src/Columns/ColumnString.h @@ -52,7 +52,7 @@ class ColumnString final : public COWPtrHelper template struct less; - template + template struct lessWithCollation; ColumnString() = default; diff --git a/dbms/src/Functions/CollationOperatorOptimized.h b/dbms/src/Functions/CollationOperatorOptimized.h index 395ecc5b9eb..6a0b8d35512 100644 --- a/dbms/src/Functions/CollationOperatorOptimized.h +++ b/dbms/src/Functions/CollationOperatorOptimized.h @@ -27,6 +27,8 @@ namespace DB { +#define INLINE_FLATTEN_PURE __attribute__((flatten, always_inline, pure)) + template ALWAYS_INLINE inline int signum(T val) { @@ -36,13 +38,13 @@ ALWAYS_INLINE inline int signum(T val) // Check equality is much faster than other comparison. // - check size first // - return 0 if equal else 1 -__attribute__((flatten, always_inline, pure)) inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) +INLINE_FLATTEN_PURE inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) { return StringRef(lhs) == StringRef(rhs) ? 0 : 1; } // Compare str view by memcmp -__attribute__((flatten, always_inline, pure)) inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) +INLINE_FLATTEN_PURE inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) { return signum(v1.compare(v2)); } @@ -50,7 +52,7 @@ __attribute__((flatten, always_inline, pure)) inline int RawStrCompare(const std constexpr char SPACE = ' '; // Remove tail space -__attribute__((flatten, always_inline, pure)) inline std::string_view RightTrim(const std::string_view & v) +INLINE_FLATTEN_PURE inline std::string_view RightTrim(const std::string_view & v) { if (likely(v.empty() || v.back() != SPACE)) return v; @@ -58,11 +60,20 @@ __attribute__((flatten, always_inline, pure)) inline std::string_view RightTrim( return end == std::string_view::npos ? std::string_view{} : std::string_view(v.data(), end + 1); } -__attribute__((flatten, always_inline, pure)) inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) +INLINE_FLATTEN_PURE inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) { return RawStrCompare(RightTrim(va), RightTrim(vb)); } +template +INLINE_FLATTEN_PURE inline int BinCollatorCompare(const char * s1, size_t length1, const char * s2, size_t length2) +{ + if constexpr (padding) + return DB::RtrimStrCompare({s1, length1}, {s2, length2}); + else + return DB::RawStrCompare({s1, length1}, {s2, length2}); +} + // If true, only need to check equal or not. template struct IsEqualRelated diff --git a/dbms/src/Storages/Transaction/Collator.cpp b/dbms/src/Storages/Transaction/Collator.cpp index 1b0221a6829..21fb6ce807b 100644 --- a/dbms/src/Storages/Transaction/Collator.cpp +++ b/dbms/src/Storages/Transaction/Collator.cpp @@ -186,10 +186,7 @@ class BinCollator final : public ITiDBCollator int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override { - if constexpr (padding) - return DB::RtrimStrCompare({s1, length1}, {s2, length2}); - else - return DB::RawStrCompare({s1, length1}, {s2, length2}); + return DB::BinCollatorCompare(s1, length1, s2, length2); } StringRef sortKey(const char * s, size_t length, std::string &) const override @@ -593,6 +590,9 @@ TiDBCollatorPtr ITiDBCollator::getCollator(int32_t id) { switch (id) { + case ITiDBCollator::UTF8MB4_BIN: + static const auto utf8mb4_collator = UTF8MB4_BIN_TYPE(UTF8MB4_BIN); + return &utf8mb4_collator; case ITiDBCollator::BINARY: static const auto binary_collator = BinCollator(BINARY); return &binary_collator; @@ -602,9 +602,6 @@ TiDBCollatorPtr ITiDBCollator::getCollator(int32_t id) case ITiDBCollator::LATIN1_BIN: static const auto latin1_collator = BinCollator(LATIN1_BIN); return &latin1_collator; - case ITiDBCollator::UTF8MB4_BIN: - static const auto utf8mb4_collator = UTF8MB4_BIN_TYPE(UTF8MB4_BIN); - return &utf8mb4_collator; case ITiDBCollator::UTF8_BIN: static const auto utf8_collator = UTF8MB4_BIN_TYPE(UTF8_BIN); return &utf8_collator; From 213371b0f1ea0f2f029ad6dbf60362dac41e3af7 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Thu, 14 Jul 2022 14:35:00 +0800 Subject: [PATCH 02/15] add test case --- .../tidb-ci/new_collation_fullstack/expr.test | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/tidb-ci/new_collation_fullstack/expr.test b/tests/tidb-ci/new_collation_fullstack/expr.test index 1e2135c4f2d..c88db20ba2f 100644 --- a/tests/tidb-ci/new_collation_fullstack/expr.test +++ b/tests/tidb-ci/new_collation_fullstack/expr.test @@ -88,4 +88,20 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_s | min(value) | max(value) | min(value1) | max(value1) | +------------+------------+-------------+-------------+ | abc | def | abc | def | -+------------+------------+-------------+-------------+ \ No newline at end of file ++------------+------------+-------------+-------------+ + +mysql> insert into test.t values (4, '', 'def\n'), (5, '', 'def '); + +mysql> select /*+ read_from_storage(tiflash[t]) */ hex(max(value1)) from test.t; ++------------------+ +| hex(max(value1)) | ++------------------+ +| 6465660A | ++------------------+ + +mysql> select /*+ read_from_storage(tiflash[t]) */ hex(min(value1)) from test.t; ++------------------+ +| hex(min(value1)) | ++------------------+ +| 61626320 | ++------------------+ From 0b1b84265321c69433d561ddff81890f8bc0dcb4 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Thu, 14 Jul 2022 15:01:47 +0800 Subject: [PATCH 03/15] wip --- dbms/src/Columns/ColumnString.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index 352d80a8678..d14051a149a 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -333,7 +333,7 @@ struct ColumnString::lessWithCollation , inner(inner_) {} - bool operator()(size_t lhs, size_t rhs) const + INLINE_FLATTEN_PURE inline bool operator()(size_t lhs, size_t rhs) const { int res = inner.compare( reinterpret_cast(&parent.chars[parent.offsetAt(lhs)]), @@ -348,7 +348,7 @@ struct ColumnString::lessWithCollation struct Utf8MB4BinCmp { - static inline int compare(const char * s1, size_t length1, const char * s2, size_t length2) + static INLINE_FLATTEN_PURE inline int compare(const char * s1, size_t length1, const char * s2, size_t length2) { return DB::BinCollatorCompare(s1, length1, s2, length2); } From 32e1efdac9b19352383d128be8e2790b23b6c349 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Thu, 14 Jul 2022 15:27:01 +0800 Subject: [PATCH 04/15] wip --- dbms/src/Columns/ColumnString.cpp | 9 ++++++++- dbms/src/Functions/CollationOperatorOptimized.h | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index d14051a149a..0a4d7bac614 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -342,7 +342,14 @@ struct ColumnString::lessWithCollation parent.sizeAt(rhs) - 1) // remove tail '\0' ; - return positive ? (res < 0) : (res > 0); + if constexpr (positive) + { + return (res < 0); + } + else + { + return (res > 0); + } } }; diff --git a/dbms/src/Functions/CollationOperatorOptimized.h b/dbms/src/Functions/CollationOperatorOptimized.h index 6a0b8d35512..4f75cd96be4 100644 --- a/dbms/src/Functions/CollationOperatorOptimized.h +++ b/dbms/src/Functions/CollationOperatorOptimized.h @@ -46,7 +46,7 @@ INLINE_FLATTEN_PURE inline uint8_t RawStrEqualCompare(const std::string_view & l // Compare str view by memcmp INLINE_FLATTEN_PURE inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) { - return signum(v1.compare(v2)); + return v1.compare(v2); } constexpr char SPACE = ' '; From f8947c73b006471e1c508b5bad290c1b0884be7b Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Thu, 14 Jul 2022 23:25:02 +0800 Subject: [PATCH 05/15] optimize sort key with collation --- dbms/src/Columns/ColumnString.cpp | 135 ++++++++++----- dbms/src/Columns/ColumnString.h | 82 ++++++---- dbms/src/Columns/IColumn.cpp | 26 ++- dbms/src/Columns/IColumn.h | 16 +- dbms/src/Common/ColumnsHashing.h | 16 +- .../DAGExpressionAnalyzerHelper.cpp | 2 +- dbms/src/Flash/Coprocessor/DAGUtils.cpp | 2 +- .../Functions/CollationOperatorOptimized.h | 36 ---- dbms/src/Functions/FunctionsArray.cpp | 4 +- dbms/src/Functions/FunctionsComparison.h | 4 +- dbms/src/Functions/FunctionsStringSearch.cpp | 41 ++--- dbms/src/Interpreters/AggregationCommon.h | 40 +---- dbms/src/Interpreters/SetVariants.h | 2 +- dbms/src/Storages/Transaction/Collator.cpp | 21 +-- dbms/src/Storages/Transaction/Collator.h | 154 +++++++++++++++++- dbms/src/Storages/Transaction/CollatorUtils.h | 96 +++++++++++ dbms/src/Storages/Transaction/TiDB.cpp | 4 +- .../tidb-ci/new_collation_fullstack/expr.test | 18 +- 18 files changed, 486 insertions(+), 213 deletions(-) create mode 100644 dbms/src/Storages/Transaction/CollatorUtils.h diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index 54d4238616f..6d6147d4147 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include /// Used in the `reserve` method, when the number of rows is known, but sizes of elements are not. @@ -320,53 +321,94 @@ int ColumnString::compareAtWithCollationImpl(size_t n, size_t m, const IColumn & rhs.sizeAt(m)); } - -template -struct ColumnString::lessWithCollation +// Derived must implement function `int compare(const char *, size_t, const char *, size_t)`. +template +struct ColumnString::LessWithCollation { const ColumnString & parent; - const ICollator & collator; + const Derived & inner; - lessWithCollation(const ColumnString & parent_, const ICollator & collator_) + LessWithCollation(const ColumnString & parent_, const Derived & inner_) : parent(parent_) - , collator(collator_) + , inner(inner_) {} - bool operator()(size_t lhs, size_t rhs) const + FLATTEN_INLINE_PURE inline bool operator()(size_t lhs, size_t rhs) const { - int res = collator.compare( + int res = inner.compare( reinterpret_cast(&parent.chars[parent.offsetAt(lhs)]), - parent.sizeAt(lhs), + parent.sizeAt(lhs) - 1, // remove tail '\0' reinterpret_cast(&parent.chars[parent.offsetAt(rhs)]), - parent.sizeAt(rhs)); + parent.sizeAt(rhs) - 1) // remove tail '\0' + ; - return positive ? (res < 0) : (res > 0); + if constexpr (positive) + { + return (res < 0); + } + else + { + return (res > 0); + } } }; -void ColumnString::getPermutationWithCollationImpl(const ICollator & collator, bool reverse, size_t limit, Permutation & res) const +struct Utf8MB4BinCmp { - size_t s = offsets.size(); - res.resize(s); - for (size_t i = 0; i < s; ++i) - res[i] = i; - - if (limit >= s) - limit = 0; + static FLATTEN_INLINE_PURE inline int compare(const char * s1, size_t length1, const char * s2, size_t length2) + { + return DB::BinCollatorCompare(s1, length1, s2, length2); + } +}; - if (limit) +// common util functions +template <> +struct ColumnString::LessWithCollation +{ + // `CollationCmpImpl` must implement function `int compare(const char *, size_t, const char *, size_t)`. + template + static void getPermutationWithCollationImpl(const ColumnString & src, const CollationCmpImpl & collator_cmp_impl, bool reverse, size_t limit, Permutation & res) { - if (reverse) - std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation(*this, collator)); + size_t s = src.offsets.size(); + res.resize(s); + for (size_t i = 0; i < s; ++i) + res[i] = i; + + if (limit >= s) + limit = 0; + + if (limit) + { + if (reverse) + std::partial_sort(res.begin(), res.begin() + limit, res.end(), LessWithCollation(src, collator_cmp_impl)); + else + std::partial_sort(res.begin(), res.begin() + limit, res.end(), LessWithCollation(src, collator_cmp_impl)); + } else - std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation(*this, collator)); + { + if (reverse) + std::sort(res.begin(), res.end(), LessWithCollation(src, collator_cmp_impl)); + else + std::sort(res.begin(), res.end(), LessWithCollation(src, collator_cmp_impl)); + } } - else +}; + +void ColumnString::getPermutationWithCollationImpl(const ICollator & collator, bool reverse, size_t limit, Permutation & res) const +{ + using PermutationWithCollationUtils = ColumnString::LessWithCollation; + + // optimize path for default collator `UTF8MB4_BIN` + if (TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_BIN) == &collator) { - if (reverse) - std::sort(res.begin(), res.end(), lessWithCollation(*this, collator)); - else - std::sort(res.begin(), res.end(), lessWithCollation(*this, collator)); + Utf8MB4BinCmp cmp_impl; + PermutationWithCollationUtils::getPermutationWithCollationImpl(*this, cmp_impl, reverse, limit, res); + /// + return; + } + + { + PermutationWithCollationUtils::getPermutationWithCollationImpl(*this, collator, reverse, limit, res); } } @@ -383,16 +425,35 @@ void ColumnString::updateWeakHash32(WeakHash32 & hash, const TiDB::TiDBCollatorP if (collator != nullptr) { - for (const auto & offset : offsets) + if (collator->canUseFastPath()) { - auto str_size = offset - prev_offset; - /// Skip last zero byte. - auto sort_key = collator->sortKey(reinterpret_cast(pos), str_size - 1, sort_key_container); - *hash_data = ::updateWeakHash32(reinterpret_cast(sort_key.data), sort_key.size, *hash_data); - - pos += str_size; - prev_offset = offset; - ++hash_data; + for (const auto & offset : offsets) + { + auto str_size = offset - prev_offset; + + // Skip last zero byte. + auto sort_key = collator->fastPathSortKey(reinterpret_cast(pos), str_size - 1); + *hash_data = ::updateWeakHash32(reinterpret_cast(sort_key.data), sort_key.size, *hash_data); + + pos += str_size; + prev_offset = offset; + ++hash_data; + } + } + else + { + for (const auto & offset : offsets) + { + auto str_size = offset - prev_offset; + + // Skip last zero byte. + auto sort_key = collator->sortKeyIndirect(reinterpret_cast(pos), str_size - 1, sort_key_container); + *hash_data = ::updateWeakHash32(reinterpret_cast(sort_key.data), sort_key.size, *hash_data); + + pos += str_size; + prev_offset = offset; + ++hash_data; + } } } else diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h index 48b02388a6c..da45ae59c83 100644 --- a/dbms/src/Columns/ColumnString.h +++ b/dbms/src/Columns/ColumnString.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -52,8 +53,8 @@ class ColumnString final : public COWPtrHelper template struct less; - template - struct lessWithCollation; + template + struct LessWithCollation; ColumnString() = default; @@ -118,7 +119,7 @@ class ColumnString final : public COWPtrHelper void insert(const Field & x) override { - const String & s = DB::get(x); + const auto & s = DB::get(x); const size_t old_size = chars.size(); const size_t size_to_append = s.size() + 1; const size_t new_size = old_size + size_to_append; @@ -134,7 +135,7 @@ class ColumnString final : public COWPtrHelper void insertFrom(const IColumn & src_, size_t n) override { - const ColumnString & src = static_cast(src_); + const auto & src = static_cast(src_); if (n != 0) { @@ -207,21 +208,20 @@ class ColumnString final : public COWPtrHelper { size_t string_size = sizeAt(n); size_t offset = offsetAt(n); - const void * src = &chars[offset]; StringRef res; - if (collator != nullptr) - { - /// Skip last zero byte. - auto sort_key = collator->sortKey(reinterpret_cast(src), string_size - 1, sort_key_container); - string_size = sort_key.size; - src = sort_key.data; - } + StringRef sort_key{reinterpret_cast(&chars[offset]), string_size}; + + // Skip last zero byte. + collator->sortKeyNullable(sort_key.data, sort_key.size - 1, sort_key_container, sort_key); + + string_size = sort_key.size; + res.size = sizeof(string_size) + string_size; char * pos = arena.allocContinue(res.size, begin); memcpy(pos, &string_size, sizeof(string_size)); - memcpy(pos + sizeof(string_size), src, string_size); + memcpy(pos + sizeof(string_size), sort_key.data, string_size); res.data = pos; return res; } @@ -244,33 +244,47 @@ class ColumnString final : public COWPtrHelper { size_t string_size = sizeAt(n); size_t offset = offsetAt(n); - if (collator != nullptr) - { - auto sort_key = collator->sortKey(reinterpret_cast(&chars[offset]), string_size, sort_key_container); - string_size = sort_key.size; - hash.update(reinterpret_cast(&string_size), sizeof(string_size)); - hash.update(sort_key.data, sort_key.size); - } - else - { - hash.update(reinterpret_cast(&string_size), sizeof(string_size)); - hash.update(reinterpret_cast(&chars[offset]), string_size); - } + + StringRef sort_key{reinterpret_cast(&chars[offset]), string_size}; + + // Skip last zero byte. + collator->sortKeyNullable(sort_key.data, sort_key.size - 1, sort_key_container, sort_key); + string_size = sort_key.size; + hash.update(reinterpret_cast(&string_size), sizeof(string_size)); + hash.update(sort_key.data, sort_key.size); } void updateHashWithValues(IColumn::HashValues & hash_values, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const override { if (collator != nullptr) { - for (size_t i = 0; i < offsets.size(); ++i) + if (collator->canUseFastPath()) { - size_t string_size = sizeAt(i); - size_t offset = offsetAt(i); - - auto sort_key = collator->sortKey(reinterpret_cast(&chars[offset]), string_size, sort_key_container); - string_size = sort_key.size; - hash_values[i].update(reinterpret_cast(&string_size), sizeof(string_size)); - hash_values[i].update(sort_key.data, sort_key.size); + for (size_t i = 0; i < offsets.size(); ++i) + { + size_t string_size = sizeAt(i); + size_t offset = offsetAt(i); + + // Skip last zero byte. + auto sort_key = collator->fastPathSortKey(reinterpret_cast(&chars[offset]), string_size - 1); + string_size = sort_key.size; + hash_values[i].update(reinterpret_cast(&string_size), sizeof(string_size)); + hash_values[i].update(sort_key.data, sort_key.size); + } + } + else + { + for (size_t i = 0; i < offsets.size(); ++i) + { + size_t string_size = sizeAt(i); + size_t offset = offsetAt(i); + + // Skip last zero byte. + auto sort_key = collator->sortKeyIndirect(reinterpret_cast(&chars[offset]), string_size - 1, sort_key_container); + string_size = sort_key.size; + hash_values[i].update(reinterpret_cast(&string_size), sizeof(string_size)); + hash_values[i].update(sort_key.data, sort_key.size); + } } } else @@ -302,7 +316,7 @@ class ColumnString final : public COWPtrHelper int compareAt(size_t n, size_t m, const IColumn & rhs_, int /*nan_direction_hint*/) const override { - const ColumnString & rhs = static_cast(rhs_); + const auto & rhs = static_cast(rhs_); const size_t size = sizeAt(n); const size_t rhs_size = rhs.sizeAt(m); diff --git a/dbms/src/Columns/IColumn.cpp b/dbms/src/Columns/IColumn.cpp index 0cc776632ff..3022ebc6db9 100644 --- a/dbms/src/Columns/IColumn.cpp +++ b/dbms/src/Columns/IColumn.cpp @@ -15,7 +15,7 @@ #include #include #include - +#include namespace DB { @@ -34,4 +34,28 @@ String IColumn::dumpStructure() const return res.str(); } +StringRef IColumn::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const +{ + return serializeValueIntoArena(n, arena, begin, nullptr, TiDB::dummy_sort_key_contaner); +} +void IColumn::updateHashWithValue(size_t n, SipHash & hash) const +{ + updateHashWithValue(n, hash, nullptr, TiDB::dummy_sort_key_contaner); +} + +void IColumn::updateHashWithValues(HashValues & hash_values) const +{ + updateHashWithValues(hash_values, nullptr, TiDB::dummy_sort_key_contaner); +} + +void IColumn::updateWeakHash32(WeakHash32 & hash) const +{ + updateWeakHash32(hash, nullptr, TiDB::dummy_sort_key_contaner); +} + +const char * IColumn::deserializeAndInsertFromArena(const char * pos) +{ + return deserializeAndInsertFromArena(pos, nullptr); +} + } // namespace DB diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h index a5a25e6e28c..72f2d324bcb 100644 --- a/dbms/src/Columns/IColumn.h +++ b/dbms/src/Columns/IColumn.h @@ -21,10 +21,14 @@ #include #include #include -#include #include #include +namespace TiDB +{ +struct TiDBCollatorPtr; +} + namespace DB { namespace ErrorCodes @@ -178,7 +182,7 @@ class IColumn : public COWPtr * Parameter begin should be used with Arena::allocContinue. */ virtual StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const = 0; - StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const { return serializeValueIntoArena(n, arena, begin, nullptr, TiDB::dummy_sort_key_contaner); } + StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const; /** Deserializes a value that was serialized using IColumn::serializeValueIntoArena method. * Returns pointer to the position after the read data. @@ -194,23 +198,23 @@ class IColumn : public COWPtr * the complex column will be ok. */ virtual const char * deserializeAndInsertFromArena(const char * pos, const TiDB::TiDBCollatorPtr & collator) = 0; - const char * deserializeAndInsertFromArena(const char * pos) { return deserializeAndInsertFromArena(pos, nullptr); } + const char * deserializeAndInsertFromArena(const char * pos); /// Update state of hash function with value of n-th element. /// On subsequent calls of this method for sequence of column values of arbitary types, /// passed bytes to hash must identify sequence of values unambiguously. virtual void updateHashWithValue(size_t n, SipHash & hash, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const = 0; - void updateHashWithValue(size_t n, SipHash & hash) const { updateHashWithValue(n, hash, nullptr, TiDB::dummy_sort_key_contaner); } + void updateHashWithValue(size_t n, SipHash & hash) const; using HashValues = PaddedPODArray; virtual void updateHashWithValues(HashValues & hash_values, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const = 0; - void updateHashWithValues(HashValues & hash_values) const { updateHashWithValues(hash_values, nullptr, TiDB::dummy_sort_key_contaner); } + void updateHashWithValues(HashValues & hash_values) const; /// Update hash function value. Hash is calculated for each element. /// It's a fast weak hash function. Mainly need to scatter data between threads. /// WeakHash32 must have the same size as column. virtual void updateWeakHash32(WeakHash32 & hash, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const = 0; - void updateWeakHash32(WeakHash32 & hash) const { updateWeakHash32(hash, nullptr, TiDB::dummy_sort_key_contaner); } + void updateWeakHash32(WeakHash32 & hash) const; /** Removes elements that don't match the filter. * Is used in WHERE and HAVING operations. diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 525a7f5ab4d..ba31c8d38c4 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -99,7 +99,7 @@ struct HashMethodString HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators & collators) { const IColumn & column = *key_columns[0]; - const ColumnString & column_string = assert_cast(column); + const auto & column_string = assert_cast(column); offsets = column_string.getOffsets().data(); chars = column_string.getChars().data(); if (!collators.empty()) @@ -113,12 +113,13 @@ struct HashMethodString auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const { auto last_offset = row == 0 ? 0 : offsets[row - 1]; + + // Skip last zero byte. StringRef key(chars + last_offset, offsets[row] - last_offset - 1); if constexpr (place_string_to_arena) { - if (collator) - key = collator->sortKey(key.data, key.size, sort_key_containers[0]); + collator->sortKeyNullable(key.data, key.size, sort_key_containers[0], key); return ArenaKeyHolder{key, *pool}; } else @@ -147,7 +148,7 @@ struct HashMethodFixedString HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators & collators) { const IColumn & column = *key_columns[0]; - const ColumnFixedString & column_string = assert_cast(column); + const auto & column_string = assert_cast(column); n = column_string.getN(); chars = &column_string.getChars(); if (!collators.empty()) @@ -158,10 +159,7 @@ struct HashMethodFixedString { StringRef key(&(*chars)[row * n], n); - if (collator) - { - key = collator->sortKey(key.data, key.size, sort_key_containers[0]); - } + collator->sortKeyNullable(key.data, key.size, sort_key_containers[0], key); if constexpr (place_string_to_arena) { @@ -387,7 +385,7 @@ struct HashMethodHashed ALWAYS_INLINE Key getKeyHolder(size_t row, Arena *, std::vector & sort_key_containers) const { - return hash128(row, key_columns.size(), key_columns, collators, sort_key_containers); + return Hash128(row, key_columns.size(), key_columns, collators, sort_key_containers); } }; diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp index 23bbb4586b3..f93989f037a 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp @@ -168,7 +168,7 @@ String DAGExpressionAnalyzerHelper::buildInFunction( actions->add(ExpressionAction::addColumn(column)); argument_names.push_back(column.name); - const auto * collator = getCollatorFromExpr(expr); + const auto collator = getCollatorFromExpr(expr); String expr_name = analyzer->applyFunction(func_name, argument_names, actions, collator); if (set->remaining_exprs.empty()) diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index 2003103a20a..17866c1f2ca 100644 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -1329,7 +1329,7 @@ SortDescription getSortDescription(const std::vector & order_co if (removeNullable(order_columns[i].type)->isString()) collator = getCollatorFromExpr(by_items[i].expr()); - order_descr.emplace_back(name, direction, nulls_direction, collator); + order_descr.emplace_back(name, direction, nulls_direction, collator.inner.ptr); } return order_descr; } diff --git a/dbms/src/Functions/CollationOperatorOptimized.h b/dbms/src/Functions/CollationOperatorOptimized.h index 395ecc5b9eb..f69e1bc31e7 100644 --- a/dbms/src/Functions/CollationOperatorOptimized.h +++ b/dbms/src/Functions/CollationOperatorOptimized.h @@ -27,42 +27,6 @@ namespace DB { -template -ALWAYS_INLINE inline int signum(T val) -{ - return (0 < val) - (val < 0); -} - -// Check equality is much faster than other comparison. -// - check size first -// - return 0 if equal else 1 -__attribute__((flatten, always_inline, pure)) inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) -{ - return StringRef(lhs) == StringRef(rhs) ? 0 : 1; -} - -// Compare str view by memcmp -__attribute__((flatten, always_inline, pure)) inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) -{ - return signum(v1.compare(v2)); -} - -constexpr char SPACE = ' '; - -// Remove tail space -__attribute__((flatten, always_inline, pure)) inline std::string_view RightTrim(const std::string_view & v) -{ - if (likely(v.empty() || v.back() != SPACE)) - return v; - size_t end = v.find_last_not_of(SPACE); - return end == std::string_view::npos ? std::string_view{} : std::string_view(v.data(), end + 1); -} - -__attribute__((flatten, always_inline, pure)) inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) -{ - return RawStrCompare(RightTrim(va), RightTrim(vb)); -} - // If true, only need to check equal or not. template struct IsEqualRelated diff --git a/dbms/src/Functions/FunctionsArray.cpp b/dbms/src/Functions/FunctionsArray.cpp index 4ccf2a08667..a9091931ee6 100644 --- a/dbms/src/Functions/FunctionsArray.cpp +++ b/dbms/src/Functions/FunctionsArray.cpp @@ -1345,7 +1345,7 @@ void FunctionArrayUniq::executeHashed( set.clear(); size_t off = offsets[i]; for (size_t j = prev_off; j < off; ++j) - set.insert(hash128(j, count, columns, TiDB::dummy_collators, TiDB::dummy_sort_key_contaners)); + set.insert(Hash128(j, count, columns, TiDB::dummy_collators, TiDB::dummy_sort_key_contaners)); res_values[i] = set.size(); prev_off = off; @@ -1631,7 +1631,7 @@ void FunctionArrayEnumerateUniq::executeHashed( for (size_t j = prev_off; j < off; ++j) { // todo support collation - res_values[j] = ++indices[hash128(j, count, columns, TiDB::dummy_collators, TiDB::dummy_sort_key_contaners)]; + res_values[j] = ++indices[Hash128(j, count, columns, TiDB::dummy_collators, TiDB::dummy_sort_key_contaners)]; } prev_off = off; } diff --git a/dbms/src/Functions/FunctionsComparison.h b/dbms/src/Functions/FunctionsComparison.h index 8f7502fba85..511e6da5254 100644 --- a/dbms/src/Functions/FunctionsComparison.h +++ b/dbms/src/Functions/FunctionsComparison.h @@ -317,7 +317,7 @@ struct StringComparisonWithCollatorImpl size_t a_offset = StringUtil::offsetAt(a_offsets, i); size_t b_offset = StringUtil::offsetAt(b_offsets, i); - c[i] = Op::apply(collator->compare(reinterpret_cast(&a_data[a_offset]), a_size, reinterpret_cast(&b_data[b_offset]), b_size), 0); + c[i] = Op::apply(collator->compareIndirect(reinterpret_cast(&a_data[a_offset]), a_size, reinterpret_cast(&b_data[b_offset]), b_size), 0); } } @@ -341,7 +341,7 @@ struct StringComparisonWithCollatorImpl for (size_t i = 0; i < size; ++i) { /// Trailing zero byte of the smaller string is included in the comparison. - c[i] = Op::apply(collator->compare(reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]), StringUtil::sizeAt(a_offsets, i) - 1, b_data, b_size), 0); + c[i] = Op::apply(collator->compareIndirect(reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]), StringUtil::sizeAt(a_offsets, i) - 1, b_data, b_size), 0); } } diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index f0c6cd6f303..ab0ff30ff66 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -1930,18 +1931,18 @@ class FunctionStringReplace : public IFunction const String & match_type, ColumnWithTypeAndName & column_result) const { - const ColumnConst * c1_const = typeid_cast(column_needle.get()); - const ColumnConst * c2_const = typeid_cast(column_replacement.get()); - String needle = c1_const->getValue(); - String replacement = c2_const->getValue(); + const auto * c1_const = typeid_cast(column_needle.get()); + const auto * c2_const = typeid_cast(column_replacement.get()); + auto needle = c1_const->getValue(); + auto replacement = c2_const->getValue(); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixed(col->getChars(), col->getN(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -1964,17 +1965,17 @@ class FunctionStringReplace : public IFunction { if constexpr (Impl::support_non_const_needle) { - const ColumnString * col_needle = typeid_cast(column_needle.get()); - const ColumnConst * col_replacement_const = typeid_cast(column_replacement.get()); - String replacement = col_replacement_const->getValue(); + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement_const = typeid_cast(column_replacement.get()); + auto replacement = col_replacement_const->getValue(); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstNeedle(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixedNonConstNeedle(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -2002,17 +2003,17 @@ class FunctionStringReplace : public IFunction { if constexpr (Impl::support_non_const_replacement) { - const ColumnConst * col_needle_const = typeid_cast(column_needle.get()); - String needle = col_needle_const->getValue(); - const ColumnString * col_replacement = typeid_cast(column_replacement.get()); + const auto * col_needle_const = typeid_cast(column_needle.get()); + auto needle = col_needle_const->getValue(); + const auto * col_replacement = typeid_cast(column_replacement.get()); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstReplacement(col->getChars(), col->getOffsets(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixedNonConstReplacement(col->getChars(), col->getN(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -2040,16 +2041,16 @@ class FunctionStringReplace : public IFunction { if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) { - const ColumnString * col_needle = typeid_cast(column_needle.get()); - const ColumnString * col_replacement = typeid_cast(column_replacement.get()); + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement = typeid_cast(column_replacement.get()); - if (const ColumnString * col = checkAndGetColumn(column_src.get())) + if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstNeedleReplacement(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixedNonConstNeedleReplacement(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); diff --git a/dbms/src/Interpreters/AggregationCommon.h b/dbms/src/Interpreters/AggregationCommon.h index 9f390133088..3a549fb1f8c 100644 --- a/dbms/src/Interpreters/AggregationCommon.h +++ b/dbms/src/Interpreters/AggregationCommon.h @@ -172,47 +172,15 @@ static inline T ALWAYS_INLINE packFixed( } -/// Hash a set of keys into a UInt128 value. -static inline UInt128 ALWAYS_INLINE hash128( - size_t i, - size_t keys_size, - const ColumnRawPtrs & key_columns, - StringRefs & keys, - const TiDB::TiDBCollators & collators, - std::vector & sort_key_containers) -{ - UInt128 key; - SipHash hash; - - for (size_t j = 0; j < keys_size; ++j) - { - /// Hashes the key. - keys[j] = key_columns[j]->getDataAtWithTerminatingZero(i); - if (!collators.empty() && collators[j] != nullptr) - { - // todo check if need to handle the terminating zero - /// Note if collation is enabled, keys only exists before next call to hash128 since it - /// will be overwritten in the next call - keys[j] = collators[j]->sortKey(keys[j].data, keys[j].size - 1, sort_key_containers[j]); - } - hash.update(keys[j].data, keys[j].size); - } - - hash.get128(key); - - return key; -} - - /// Almost the same as above but it doesn't return any reference to key data. -static inline UInt128 ALWAYS_INLINE hash128( +static inline UInt128 ALWAYS_INLINE Hash128( size_t i, size_t keys_size, const ColumnRawPtrs & key_columns, const TiDB::TiDBCollators & collators, std::vector & sort_key_containers) { - UInt128 key; + UInt128 key{}; SipHash hash; if (collators.empty()) @@ -252,7 +220,7 @@ static inline StringRef * ALWAYS_INLINE placeKeysInPool( return reinterpret_cast(res); } - +/* /// Copy keys to the pool. Then put into pool StringRefs to them and return the pointer to the first. static inline StringRef * ALWAYS_INLINE extractKeysAndPlaceInPool( size_t i, @@ -326,7 +294,7 @@ inline StringRef ALWAYS_INLINE extractKeysAndPlaceInPoolContiguous( return {res, sum_keys_size}; } - +*/ /** Serialize keys into a continuous chunk of memory. */ diff --git a/dbms/src/Interpreters/SetVariants.h b/dbms/src/Interpreters/SetVariants.h index 08ebb5d5100..58f102555b6 100644 --- a/dbms/src/Interpreters/SetVariants.h +++ b/dbms/src/Interpreters/SetVariants.h @@ -21,7 +21,7 @@ #include #include #include -#include + namespace DB { diff --git a/dbms/src/Storages/Transaction/Collator.cpp b/dbms/src/Storages/Transaction/Collator.cpp index 1b0221a6829..974135b38b3 100644 --- a/dbms/src/Storages/Transaction/Collator.cpp +++ b/dbms/src/Storages/Transaction/Collator.cpp @@ -18,6 +18,7 @@ #include #include +#include namespace DB::ErrorCodes { @@ -186,22 +187,12 @@ class BinCollator final : public ITiDBCollator int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override { - if constexpr (padding) - return DB::RtrimStrCompare({s1, length1}, {s2, length2}); - else - return DB::RawStrCompare({s1, length1}, {s2, length2}); + return DB::BinCollatorCompare(s1, length1, s2, length2); } StringRef sortKey(const char * s, size_t length, std::string &) const override { - if constexpr (padding) - { - return StringRef(rtrim(s, length)); - } - else - { - return StringRef(s, length); - } + return DB::BinCollatorSortKey(s, length); } std::unique_ptr pattern() const override { return std::make_unique>>(); } @@ -593,6 +584,9 @@ TiDBCollatorPtr ITiDBCollator::getCollator(int32_t id) { switch (id) { + case ITiDBCollator::UTF8MB4_BIN: + static const auto utf8mb4_collator = UTF8MB4_BIN_TYPE(UTF8MB4_BIN); + return &utf8mb4_collator; case ITiDBCollator::BINARY: static const auto binary_collator = BinCollator(BINARY); return &binary_collator; @@ -602,9 +596,6 @@ TiDBCollatorPtr ITiDBCollator::getCollator(int32_t id) case ITiDBCollator::LATIN1_BIN: static const auto latin1_collator = BinCollator(LATIN1_BIN); return &latin1_collator; - case ITiDBCollator::UTF8MB4_BIN: - static const auto utf8mb4_collator = UTF8MB4_BIN_TYPE(UTF8MB4_BIN); - return &utf8mb4_collator; case ITiDBCollator::UTF8_BIN: static const auto utf8_collator = UTF8MB4_BIN_TYPE(UTF8_BIN); return &utf8_collator; diff --git a/dbms/src/Storages/Transaction/Collator.h b/dbms/src/Storages/Transaction/Collator.h index a5650cd9c7e..72300aae952 100644 --- a/dbms/src/Storages/Transaction/Collator.h +++ b/dbms/src/Storages/Transaction/Collator.h @@ -15,20 +15,16 @@ #pragma once #include -#include - -#include -#include +#include namespace TiDB { -using TiDBCollatorPtr = class ITiDBCollator const *; +struct TiDBCollatorPtr; using TiDBCollators = std::vector; -class ITiDBCollator : public ICollator +struct TiDBCollatorID { -public: enum { UTF8_GENERAL_CI = 33, @@ -42,6 +38,30 @@ class ITiDBCollator : public ICollator UTF8_BIN = 83, }; + int32_t getCollatorId() const { return collator_id; } + bool isBinary() const { return collator_id == BINARY; } + bool isCI() const + { + return collator_id == UTF8_UNICODE_CI || collator_id == UTF8_GENERAL_CI + || collator_id == UTF8MB4_UNICODE_CI || collator_id == UTF8MB4_GENERAL_CI; + } + bool isBin() const + { + return collator_id == UTF8_BIN || collator_id == UTF8MB4_BIN + || collator_id == ASCII_BIN || collator_id == LATIN1_BIN; + } + + explicit TiDBCollatorID(int32_t id) + : collator_id(id) + {} + + int32_t collator_id; +}; + +class ITiDBCollator : public TiDBCollatorID + , public ICollator +{ +public: /// Get the collator according to the internal collation ID, which directly comes from tipb and has been properly /// de-rewritten - the "New CI Collation" will flip the sign of the collation ID. static TiDBCollatorPtr getCollator(int32_t id); @@ -66,6 +86,7 @@ class ITiDBCollator : public ICollator int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override = 0; virtual StringRef sortKey(const char * s, size_t length, std::string & container) const = 0; virtual std::unique_ptr pattern() const = 0; + int32_t getCollatorId() const { return collator_id; } bool isBinary() const { return collator_id == BINARY; } bool isCI() const @@ -81,8 +102,8 @@ class ITiDBCollator : public ICollator protected: explicit ITiDBCollator(int32_t collator_id_) - : collator_id(collator_id_){}; - int32_t collator_id; + : TiDBCollatorID(collator_id_) + {} }; /// these dummy_xxx are used as the default value to avoid too many meaningless @@ -91,4 +112,119 @@ extern TiDBCollators dummy_collators; extern std::vector dummy_sort_key_contaners; extern std::string dummy_sort_key_contaner; +struct TiDBCollatorPtrImpl : TiDBCollatorID +{ + class ITiDBCollator const * ptr{}; + + TiDBCollatorPtrImpl(class ITiDBCollator const * ptr_, int32_t collator_id_) + : TiDBCollatorID(collator_id_) + , ptr(ptr_) + { + } + + explicit TiDBCollatorPtrImpl(class ITiDBCollator const * ptr_) + : TiDBCollatorID(ptr_ ? ptr_->getCollatorId() : 0) + , ptr(ptr_) + { + } + + ALWAYS_INLINE inline int compare(const char * s1, size_t length1, const char * s2, size_t length2) const + { + if (likely(canUseFastPath())) + { + return DB::BinCollatorCompare(s1, length1, s2, length2); + } + else + { + return compareIndirect(s1, length1, s2, length2); + } + } + + ALWAYS_INLINE inline int compareIndirect(const char * s1, size_t length1, const char * s2, size_t length2) const + { + return ptr->compare(s1, length1, s2, length2); + } + + ALWAYS_INLINE inline bool canUseFastPath() const + { + return collator_id == ITiDBCollator::UTF8MB4_BIN; + } + + ALWAYS_INLINE inline StringRef sortKey(const char * s, size_t length, std::string & container) const + { + if (likely(canUseFastPath())) + { + return fastPathSortKey(s, length); + } + else + { + return sortKeyIndirect(s, length, container); + } + } + + ALWAYS_INLINE inline bool sortKeyNullable(const char * s, size_t length, std::string & container, StringRef & res) const + { + if (likely(canUseFastPath())) + { + res = fastPathSortKey(s, length); + return true; + } + else if (ptr) + { + res = sortKeyIndirect(s, length, container); + return true; + } + return false; + } + + static ALWAYS_INLINE inline StringRef fastPathSortKey(const char * s, size_t length) + { + return DB::BinCollatorSortKey(s, length); + } + + ALWAYS_INLINE inline StringRef sortKeyIndirect(const char * s, size_t length, std::string & container) const + { + return ptr->sortKey(s, length, container); + } + + ALWAYS_INLINE inline std::unique_ptr pattern() const { return ptr->pattern(); } + + ALWAYS_INLINE int32_t getCollatorId() const { return collator_id; } +}; + +struct TiDBCollatorPtr +{ + TiDBCollatorPtrImpl inner; + + TiDBCollatorPtr(class ITiDBCollator const * ptr_, int32_t collator_id_) + : inner(ptr_, collator_id_) + { + } + TiDBCollatorPtr(class ITiDBCollator const * ptr_ = nullptr) + : inner(ptr_) + { + } + bool operator==(const void * tar) const + { + return inner.ptr == tar; + } + bool operator!=(const void * tar) const + { + return inner.ptr != tar; + } + + explicit operator bool() const + { + return inner.ptr; + } + TiDBCollatorPtrImpl * operator->() + { + return &inner; + } + const TiDBCollatorPtrImpl * operator->() const + { + return &inner; + } +}; + } // namespace TiDB diff --git a/dbms/src/Storages/Transaction/CollatorUtils.h b/dbms/src/Storages/Transaction/CollatorUtils.h new file mode 100644 index 00000000000..e990c62880d --- /dev/null +++ b/dbms/src/Storages/Transaction/CollatorUtils.h @@ -0,0 +1,96 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include + +#define FLATTEN_INLINE_PURE __attribute__((flatten, always_inline, pure)) + +namespace DB +{ + +template +ALWAYS_INLINE inline int signum(T val) +{ + return (0 < val) - (val < 0); +} + +// Check equality is much faster than other comparison. +// - check size first +// - return 0 if equal else 1 +FLATTEN_INLINE_PURE inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) +{ + return StringRef(lhs) == StringRef(rhs) ? 0 : 1; +} + +// Compare str view by memcmp +FLATTEN_INLINE_PURE inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) +{ + return v1.compare(v2); +} + +constexpr char SPACE = ' '; + +FLATTEN_INLINE_PURE inline std::string_view RightTrimRaw(const std::string_view & v) +{ + size_t end = v.find_last_not_of(SPACE); + return end == std::string_view::npos ? std::string_view{} : std::string_view(v.data(), end + 1); +} + +// Remove tail space +FLATTEN_INLINE_PURE inline std::string_view RightTrim(const std::string_view & v) +{ + if (likely(v.empty() || v.back() != SPACE)) + return v; + return RightTrimRaw(v); +} + +FLATTEN_INLINE_PURE inline std::string_view RightTrimNoEmpty(const std::string_view & v) +{ + if (likely(v.back() != SPACE)) + return v; + return RightTrimRaw(v); +} + +FLATTEN_INLINE_PURE inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) +{ + return RawStrCompare(RightTrim(va), RightTrim(vb)); +} + +template +FLATTEN_INLINE_PURE inline int BinCollatorCompare(const char * s1, size_t length1, const char * s2, size_t length2) +{ + if constexpr (padding) + return DB::RtrimStrCompare({s1, length1}, {s2, length2}); + else + return DB::RawStrCompare({s1, length1}, {s2, length2}); +} + +template +FLATTEN_INLINE_PURE inline StringRef BinCollatorSortKey(const char * s, size_t length) +{ + if constexpr (padding) + { + return StringRef(RightTrim({s, length})); + } + else + { + return StringRef(s, length); + } +} +} // namespace DB diff --git a/dbms/src/Storages/Transaction/TiDB.cpp b/dbms/src/Storages/Transaction/TiDB.cpp index 6d07c47f235..59d5d97e65d 100644 --- a/dbms/src/Storages/Transaction/TiDB.cpp +++ b/dbms/src/Storages/Transaction/TiDB.cpp @@ -240,7 +240,7 @@ DB::Field ColumnInfo::getDecimalValue(const String & decimal_text) const // FIXME it still has bug: https://github.com/pingcap/tidb/issues/11435 Int64 ColumnInfo::getEnumIndex(const String & enum_id_or_text) const { - const auto * collator = ITiDBCollator::getCollator(collate.isEmpty() ? "binary" : collate.convert()); + auto collator = ITiDBCollator::getCollator(collate.isEmpty() ? "binary" : collate.convert()); if (!collator) // todo if new collation is enabled, should use "utf8mb4_bin" collator = ITiDBCollator::getCollator("binary"); @@ -257,7 +257,7 @@ Int64 ColumnInfo::getEnumIndex(const String & enum_id_or_text) const UInt64 ColumnInfo::getSetValue(const String & set_str) const { - const auto * collator = ITiDBCollator::getCollator(collate.isEmpty() ? "binary" : collate.convert()); + auto collator = ITiDBCollator::getCollator(collate.isEmpty() ? "binary" : collate.convert()); if (!collator) // todo if new collation is enabled, should use "utf8mb4_bin" collator = ITiDBCollator::getCollator("binary"); diff --git a/tests/tidb-ci/new_collation_fullstack/expr.test b/tests/tidb-ci/new_collation_fullstack/expr.test index 1e2135c4f2d..c88db20ba2f 100644 --- a/tests/tidb-ci/new_collation_fullstack/expr.test +++ b/tests/tidb-ci/new_collation_fullstack/expr.test @@ -88,4 +88,20 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select /*+ read_from_s | min(value) | max(value) | min(value1) | max(value1) | +------------+------------+-------------+-------------+ | abc | def | abc | def | -+------------+------------+-------------+-------------+ \ No newline at end of file ++------------+------------+-------------+-------------+ + +mysql> insert into test.t values (4, '', 'def\n'), (5, '', 'def '); + +mysql> select /*+ read_from_storage(tiflash[t]) */ hex(max(value1)) from test.t; ++------------------+ +| hex(max(value1)) | ++------------------+ +| 6465660A | ++------------------+ + +mysql> select /*+ read_from_storage(tiflash[t]) */ hex(min(value1)) from test.t; ++------------------+ +| hex(min(value1)) | ++------------------+ +| 61626320 | ++------------------+ From 456c986e3eeb30b08bc5a1abdb1a8475784850b2 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Fri, 15 Jul 2022 09:53:27 +0800 Subject: [PATCH 06/15] small fix --- dbms/src/Columns/ColumnString.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index 6d6147d4147..8b549c03ac3 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -461,7 +461,7 @@ void ColumnString::updateWeakHash32(WeakHash32 & hash, const TiDB::TiDBCollatorP for (const auto & offset : offsets) { auto str_size = offset - prev_offset; - /// Skip last zero byte. + // Skip last zero byte. *hash_data = ::updateWeakHash32(pos, str_size - 1, *hash_data); pos += str_size; From 427850c033315197b982c3c2b64a3b6e072925bd Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Fri, 15 Jul 2022 10:03:55 +0800 Subject: [PATCH 07/15] rollback --- dbms/src/Interpreters/AggregationCommon.h | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dbms/src/Interpreters/AggregationCommon.h b/dbms/src/Interpreters/AggregationCommon.h index 3a549fb1f8c..8d2c78f4e5d 100644 --- a/dbms/src/Interpreters/AggregationCommon.h +++ b/dbms/src/Interpreters/AggregationCommon.h @@ -171,6 +171,38 @@ static inline T ALWAYS_INLINE packFixed( return key; } +/* +/// Hash a set of keys into a UInt128 value. +static inline UInt128 ALWAYS_INLINE hash128( + size_t i, + size_t keys_size, + const ColumnRawPtrs & key_columns, + StringRefs & keys, + const TiDB::TiDBCollators & collators, + std::vector & sort_key_containers) +{ + UInt128 key; + SipHash hash; + + for (size_t j = 0; j < keys_size; ++j) + { + /// Hashes the key. + keys[j] = key_columns[j]->getDataAtWithTerminatingZero(i); + if (!collators.empty() && collators[j] != nullptr) + { + // todo check if need to handle the terminating zero + /// Note if collation is enabled, keys only exists before next call to hash128 since it + /// will be overwritten in the next call + keys[j] = collators[j]->sortKey(keys[j].data, keys[j].size - 1, sort_key_containers[j]); + } + hash.update(keys[j].data, keys[j].size); + } + + hash.get128(key); + + return key; +} +*/ /// Almost the same as above but it doesn't return any reference to key data. static inline UInt128 ALWAYS_INLINE Hash128( From 0ac252686b6a58fed16bbc68e616987ad02c9f79 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Fri, 15 Jul 2022 10:58:16 +0800 Subject: [PATCH 08/15] rollback rename --- dbms/src/Common/ColumnsHashing.h | 2 +- dbms/src/Functions/FunctionsArray.cpp | 4 ++-- dbms/src/Interpreters/AggregationCommon.h | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index ba31c8d38c4..13f30de98e8 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -385,7 +385,7 @@ struct HashMethodHashed ALWAYS_INLINE Key getKeyHolder(size_t row, Arena *, std::vector & sort_key_containers) const { - return Hash128(row, key_columns.size(), key_columns, collators, sort_key_containers); + return hash128(row, key_columns.size(), key_columns, collators, sort_key_containers); } }; diff --git a/dbms/src/Functions/FunctionsArray.cpp b/dbms/src/Functions/FunctionsArray.cpp index a9091931ee6..4ccf2a08667 100644 --- a/dbms/src/Functions/FunctionsArray.cpp +++ b/dbms/src/Functions/FunctionsArray.cpp @@ -1345,7 +1345,7 @@ void FunctionArrayUniq::executeHashed( set.clear(); size_t off = offsets[i]; for (size_t j = prev_off; j < off; ++j) - set.insert(Hash128(j, count, columns, TiDB::dummy_collators, TiDB::dummy_sort_key_contaners)); + set.insert(hash128(j, count, columns, TiDB::dummy_collators, TiDB::dummy_sort_key_contaners)); res_values[i] = set.size(); prev_off = off; @@ -1631,7 +1631,7 @@ void FunctionArrayEnumerateUniq::executeHashed( for (size_t j = prev_off; j < off; ++j) { // todo support collation - res_values[j] = ++indices[Hash128(j, count, columns, TiDB::dummy_collators, TiDB::dummy_sort_key_contaners)]; + res_values[j] = ++indices[hash128(j, count, columns, TiDB::dummy_collators, TiDB::dummy_sort_key_contaners)]; } prev_off = off; } diff --git a/dbms/src/Interpreters/AggregationCommon.h b/dbms/src/Interpreters/AggregationCommon.h index 8d2c78f4e5d..f02c3994ada 100644 --- a/dbms/src/Interpreters/AggregationCommon.h +++ b/dbms/src/Interpreters/AggregationCommon.h @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -205,7 +206,7 @@ static inline UInt128 ALWAYS_INLINE hash128( */ /// Almost the same as above but it doesn't return any reference to key data. -static inline UInt128 ALWAYS_INLINE Hash128( +static inline UInt128 ALWAYS_INLINE hash128( size_t i, size_t keys_size, const ColumnRawPtrs & key_columns, From 7b43a8a6c55dcc61f0f7a38cd6403f978e41da36 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Fri, 15 Jul 2022 11:24:04 +0800 Subject: [PATCH 09/15] Fix gtests --- dbms/src/Functions/tests/gtest_regexp.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index d3eb93a0790..140e9014f17 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -66,8 +66,8 @@ class Regexp : public FunctionTest TEST_F(Regexp, testRegexpMatchType) { UInt8 res = false; - const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); - const auto * ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); + const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); DB::MatchImpl::constantConstant("a\nB\n", "(?m)(?i)^b", '\\', "", nullptr, res); ASSERT_TRUE(res == 1); DB::MatchImpl::constantConstant("a\nB\n", "^b", '\\', "mi", nullptr, res); @@ -1744,8 +1744,8 @@ TEST_F(Regexp, testRegexpMySQLCases) TEST_F(Regexp, testRegexpTiDBCase) { UInt8 res; - const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); - const auto * ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); + const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); DB::MatchImpl::constantConstant("a", "^$", '\\', "", nullptr, res); ASSERT_TRUE(res == 0); DB::MatchImpl::constantConstant("a", "a", '\\', "", nullptr, res); @@ -1782,7 +1782,7 @@ TEST_F(Regexp, testRegexpTiDBCase) TEST_F(Regexp, testRegexp) { - const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); auto string_type = std::make_shared(); auto nullable_string_type = makeNullable(string_type); auto uint8_type = std::make_shared(); @@ -1954,8 +1954,8 @@ TEST_F(Regexp, testRegexpCustomerCases) TEST_F(Regexp, testRegexpReplaceMatchType) { String res; - const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); - const auto * ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); + const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); DB::ReplaceRegexpImpl::constant("a\nB\nc", "(?m)(?i)^b", "xxx", 1, 0, "", nullptr, res); ASSERT_TRUE(res == "a\nxxx\nc"); DB::ReplaceRegexpImpl::constant("a\nB\nc", "^b", "xxx", 1, 0, "mi", nullptr, res); @@ -2020,7 +2020,7 @@ TEST_F(Regexp, testRegexpReplaceMySQLCases) TEST_F(Regexp, testRegexpReplace) { - const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); auto string_type = std::make_shared(); auto nullable_string_type = makeNullable(string_type); auto uint8_type = std::make_shared(); From 800ebec8e07a35f7cd0257b1b83878625778efce Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Fri, 15 Jul 2022 13:49:42 +0800 Subject: [PATCH 10/15] Fix gtests --- dbms/src/Storages/Transaction/Collator.h | 6 ++ dbms/src/Storages/Transaction/CollatorUtils.h | 2 +- .../Transaction/tests/gtest_tidb_collator.cpp | 83 ++++++++++--------- 3 files changed, 49 insertions(+), 42 deletions(-) diff --git a/dbms/src/Storages/Transaction/Collator.h b/dbms/src/Storages/Transaction/Collator.h index 72300aae952..7d955c0e3f6 100644 --- a/dbms/src/Storages/Transaction/Collator.h +++ b/dbms/src/Storages/Transaction/Collator.h @@ -83,7 +83,13 @@ class ITiDBCollator : public TiDBCollatorID ~ITiDBCollator() override = default; + /** + 0 compare equal + <0 {s1, length1} < {s2, length2} with collation + >0 {s1, length1} > {s2, length2} with collation + */ int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override = 0; + virtual StringRef sortKey(const char * s, size_t length, std::string & container) const = 0; virtual std::unique_ptr pattern() const = 0; diff --git a/dbms/src/Storages/Transaction/CollatorUtils.h b/dbms/src/Storages/Transaction/CollatorUtils.h index e990c62880d..3f318a5b700 100644 --- a/dbms/src/Storages/Transaction/CollatorUtils.h +++ b/dbms/src/Storages/Transaction/CollatorUtils.h @@ -33,7 +33,7 @@ ALWAYS_INLINE inline int signum(T val) // Check equality is much faster than other comparison. // - check size first // - return 0 if equal else 1 -FLATTEN_INLINE_PURE inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) +FLATTEN_INLINE_PURE inline int RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) { return StringRef(lhs) == StringRef(rhs) ? 0 : 1; } diff --git a/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp b/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp index 9a6dae3db08..fcace3fb30e 100644 --- a/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp +++ b/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp @@ -66,59 +66,45 @@ const typename CollatorCases::SortKeyCase CollatorCases::sk_cases[] = { {"a", {PREVENT_TRUNC("\x61"), PREVENT_TRUNC("\x61"), PREVENT_TRUNC("\x00\x41"), PREVENT_TRUNC("\x61"), PREVENT_TRUNC("\x0e\x33")}}, {"A", {PREVENT_TRUNC("\x41"), PREVENT_TRUNC("\x41"), PREVENT_TRUNC("\x00\x41"), PREVENT_TRUNC("\x41"), PREVENT_TRUNC("\x0e\x33")}}, {"😃", - {PREVENT_TRUNC("\xf0\x9f\x98\x83"), PREVENT_TRUNC("\xf0\x9f\x98\x83"), PREVENT_TRUNC("\xff\xfd"), PREVENT_TRUNC("\xf0\x9f\x98\x83"), - PREVENT_TRUNC("\xff\xfd")}}, + {PREVENT_TRUNC("\xf0\x9f\x98\x83"), PREVENT_TRUNC("\xf0\x9f\x98\x83"), PREVENT_TRUNC("\xff\xfd"), PREVENT_TRUNC("\xf0\x9f\x98\x83"), PREVENT_TRUNC("\xff\xfd")}}, {"Foo © bar 𝌆 baz ☃ qux", - {PREVENT_TRUNC("\x46\x6f\x6f\x20\xc2\xa9\x20\x62\x61\x72\x20\xf0\x9d\x8c\x86\x20\x62\x61\x7a\x20\xe2\x98\x83\x20\x71\x75\x78"), - PREVENT_TRUNC("\x46\x6f\x6f\x20\xc2\xa9\x20\x62\x61\x72\x20\xf0\x9d\x8c\x86\x20\x62\x61\x7a\x20\xe2\x98\x83\x20\x71\x75\x78"), - PREVENT_TRUNC("\x00\x46\x00\x4f\x00\x4f\x00\x20\x00\xa9\x00\x20\x00\x42\x00\x41\x00\x52\x00\x20\xff\xfd\x00\x20\x00\x42\x00\x41" - "\x00\x5a\x00\x20\x26\x03\x00\x20\x00\x51\x00\x55\x00\x58"), - PREVENT_TRUNC("\x46\x6f\x6f\x20\xc2\xa9\x20\x62\x61\x72\x20\xf0\x9d\x8c\x86\x20\x62\x61\x7a\x20\xe2\x98\x83\x20\x71\x75\x78"), - PREVENT_TRUNC("\x0E\xB9\x0F\x82\x0F\x82\x02\x09\x02\xC5\x02\x09\x0E\x4A\x0E\x33\x0F\xC0\x02\x09\xFF\xFD\x02\x09\x0E\x4A\x0E\x33" - "\x10\x6A\x02\x09\x06\xFF\x02\x09\x0F\xB4\x10\x1F\x10\x5A")}}, + {PREVENT_TRUNC("\x46\x6f\x6f\x20\xc2\xa9\x20\x62\x61\x72\x20\xf0\x9d\x8c\x86\x20\x62\x61\x7a\x20\xe2\x98\x83\x20\x71\x75\x78"), + PREVENT_TRUNC("\x46\x6f\x6f\x20\xc2\xa9\x20\x62\x61\x72\x20\xf0\x9d\x8c\x86\x20\x62\x61\x7a\x20\xe2\x98\x83\x20\x71\x75\x78"), + PREVENT_TRUNC("\x00\x46\x00\x4f\x00\x4f\x00\x20\x00\xa9\x00\x20\x00\x42\x00\x41\x00\x52\x00\x20\xff\xfd\x00\x20\x00\x42\x00\x41" + "\x00\x5a\x00\x20\x26\x03\x00\x20\x00\x51\x00\x55\x00\x58"), + PREVENT_TRUNC("\x46\x6f\x6f\x20\xc2\xa9\x20\x62\x61\x72\x20\xf0\x9d\x8c\x86\x20\x62\x61\x7a\x20\xe2\x98\x83\x20\x71\x75\x78"), + PREVENT_TRUNC("\x0E\xB9\x0F\x82\x0F\x82\x02\x09\x02\xC5\x02\x09\x0E\x4A\x0E\x33\x0F\xC0\x02\x09\xFF\xFD\x02\x09\x0E\x4A\x0E\x33" + "\x10\x6A\x02\x09\x06\xFF\x02\x09\x0F\xB4\x10\x1F\x10\x5A")}}, {"a ", {PREVENT_TRUNC("\x61\x20"), PREVENT_TRUNC("\x61"), PREVENT_TRUNC("\x00\x41"), PREVENT_TRUNC("\x61"), PREVENT_TRUNC("\x0e\x33")}}, {"", {PREVENT_TRUNC(""), PREVENT_TRUNC(""), PREVENT_TRUNC(""), PREVENT_TRUNC(""), PREVENT_TRUNC("")}}, {"ß", - {PREVENT_TRUNC("\xc3\x9f"), PREVENT_TRUNC("\xc3\x9f"), PREVENT_TRUNC("\x00\x53"), PREVENT_TRUNC("\xc3\x9f"), - PREVENT_TRUNC("\x0F\xEA\x0F\xEA")}}, + {PREVENT_TRUNC("\xc3\x9f"), PREVENT_TRUNC("\xc3\x9f"), PREVENT_TRUNC("\x00\x53"), PREVENT_TRUNC("\xc3\x9f"), PREVENT_TRUNC("\x0F\xEA\x0F\xEA")}}, }; const typename CollatorCases::PatternCase CollatorCases::pattern_cases[] = { {"A", - {{"a", {false, false, true, false, true}}, {"A", {true, true, true, true, true}}, {"À", {false, false, true, false, true}}, - {"", {false, false, false, false, false}}}}, + {{"a", {false, false, true, false, true}}, {"A", {true, true, true, true, true}}, {"À", {false, false, true, false, true}}, {"", {false, false, false, false, false}}}}, {"_A", - {{"aA", {true, true, true, true, true}}, {"ÀA", {false, false, true, true, true}}, {"ÀÀ", {false, false, true, false, true}}, - {"", {false, false, false, false, false}}}}, + {{"aA", {true, true, true, true, true}}, {"ÀA", {false, false, true, true, true}}, {"ÀÀ", {false, false, true, false, true}}, {"", {false, false, false, false, false}}}}, {"%A", - {{"a", {false, false, true, false, true}}, {"ÀA", {true, true, true, true, true}}, {"À", {false, false, true, false, true}}, - {"", {false, false, false, false, false}}}}, + {{"a", {false, false, true, false, true}}, {"ÀA", {true, true, true, true, true}}, {"À", {false, false, true, false, true}}, {"", {false, false, false, false, false}}}}, {"À", - {{"a", {false, false, true, false, true}}, {"A", {false, false, true, false, true}}, {"À", {true, true, true, true, true}}, - {"", {false, false, false, false, false}}}}, + {{"a", {false, false, true, false, true}}, {"A", {false, false, true, false, true}}, {"À", {true, true, true, true, true}}, {"", {false, false, false, false, false}}}}, {"_À", - {{" À", {true, true, true, true, true}}, {"ÀA", {false, false, true, false, true}}, {"ÀÀ", {false, false, true, true, true}}, - {"", {false, false, false, false, false}}}}, + {{" À", {true, true, true, true, true}}, {"ÀA", {false, false, true, false, true}}, {"ÀÀ", {false, false, true, true, true}}, {"", {false, false, false, false, false}}}}, {"%À", - {{"À", {true, true, true, true, true}}, {"ÀÀÀ", {true, true, true, true, true}}, {"ÀA", {false, false, true, false, true}}, - {"", {false, false, false, false, false}}}}, + {{"À", {true, true, true, true, true}}, {"ÀÀÀ", {true, true, true, true, true}}, {"ÀA", {false, false, true, false, true}}, {"", {false, false, false, false, false}}}}, {"À_", - {{"À ", {true, true, true, true, true}}, {"ÀAA", {false, false, false, false, false}}, {"À", {false, false, false, false, false}}, - {"", {false, false, false, false, false}}}}, + {{"À ", {true, true, true, true, true}}, {"ÀAA", {false, false, false, false, false}}, {"À", {false, false, false, false, false}}, {"", {false, false, false, false, false}}}}, {"À%", - {{"À", {true, true, true, true, true}}, {"ÀÀÀ", {true, true, true, true, true}}, {"AÀ", {false, false, true, false, true}}, - {"", {false, false, false, false, false}}}}, + {{"À", {true, true, true, true, true}}, {"ÀÀÀ", {true, true, true, true, true}}, {"AÀ", {false, false, true, false, true}}, {"", {false, false, false, false, false}}}}, {"", - {{"À", {false, false, false, false, false}}, {"ÀÀÀ", {false, false, false, false, false}}, - {"AÀ", {false, false, false, false, false}}, {"", {true, true, true, true, true}}}}, + {{"À", {false, false, false, false, false}}, {"ÀÀÀ", {false, false, false, false, false}}, {"AÀ", {false, false, false, false, false}}, {"", {true, true, true, true, true}}}}, {"%", - {{"À", {true, true, true, true, true}}, {"ÀÀÀ", {true, true, true, true, true}}, {"AÀ", {true, true, true, true, true}}, - {"", {true, true, true, true, true}}}}, + {{"À", {true, true, true, true, true}}, {"ÀÀÀ", {true, true, true, true, true}}, {"AÀ", {true, true, true, true, true}}, {"", {true, true, true, true, true}}}}, {"a_%À", - {{"ÀÀ", {false, false, false, false, false}}, {"aÀÀ", {true, true, true, true, true}}, {"ÀÀÀÀ", {false, false, true, false, true}}, - {"ÀÀÀa", {false, false, true, false, true}}}}, + {{"ÀÀ", {false, false, false, false, false}}, {"aÀÀ", {true, true, true, true, true}}, {"ÀÀÀÀ", {false, false, true, false, true}}, {"ÀÀÀa", {false, false, true, false, true}}}}, {"À%_a", - {{"ÀÀ", {false, false, false, false, false}}, {"aÀÀ", {false, false, true, false, true}}, {"ÀÀÀa", {true, true, true, true, true}}, - {"aÀÀÀ", {false, false, true, false, true}}}}, + {{"ÀÀ", {false, false, false, false, false}}, {"aÀÀ", {false, false, true, false, true}}, {"ÀÀÀa", {true, true, true, true, true}}, {"aÀÀÀ", {false, false, true, false, true}}}}, {"___a", {{"中a", {true, true, false, false, false}}, {"中文字a", {false, false, true, true, true}}}}, {"𐐭", {{"𐐨", {false, false, true, false, false}}}}, }; @@ -133,7 +119,7 @@ void testCollator() const std::string & s2 = std::get<1>(c); int ans = std::get(std::get<2>(c)); std::cout << "Compare case (" << s1 << ", " << s2 << ", " << ans << ")" << std::endl; - ASSERT_EQ(collator->compare(s1.data(), s1.length(), s2.data(), s2.length()), ans); + ASSERT_EQ(signum((collator->compare(s1.data(), s1.length(), s2.data(), s2.length()))), ans); } for (const auto & c : CollatorCases::sk_cases) { @@ -189,14 +175,29 @@ struct UnicodeCICollator static constexpr auto collation_case = CollatorCases::UnicodeCI; }; -TEST(CollatorSuite, BinCollator) { testCollator(); } +TEST(CollatorSuite, BinCollator) +{ + testCollator(); +} -TEST(CollatorSuite, BinPaddingCollator) { testCollator(); } +TEST(CollatorSuite, BinPaddingCollator) +{ + testCollator(); +} -TEST(CollatorSuite, Utf8BinPaddingCollator) { testCollator(); } +TEST(CollatorSuite, Utf8BinPaddingCollator) +{ + testCollator(); +} -TEST(CollatorSuite, GeneralCICollator) { testCollator(); } +TEST(CollatorSuite, GeneralCICollator) +{ + testCollator(); +} -TEST(CollatorSuite, UnicodeCICollator) { testCollator(); } +TEST(CollatorSuite, UnicodeCICollator) +{ + testCollator(); +} } // namespace DB::tests From a16d596d0b115dfb8b3e59a63adb72120b661000 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Fri, 15 Jul 2022 14:12:36 +0800 Subject: [PATCH 11/15] format --- dbms/src/Storages/Transaction/Collator.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dbms/src/Storages/Transaction/Collator.h b/dbms/src/Storages/Transaction/Collator.h index 7d955c0e3f6..22d4f5a6597 100644 --- a/dbms/src/Storages/Transaction/Collator.h +++ b/dbms/src/Storages/Transaction/Collator.h @@ -84,9 +84,10 @@ class ITiDBCollator : public TiDBCollatorID ~ITiDBCollator() override = default; /** - 0 compare equal - <0 {s1, length1} < {s2, length2} with collation - >0 {s1, length1} > {s2, length2} with collation + compare with collation + 0 equal + <0 {s1, length1} < {s2, length2} with collation + >0 {s1, length1} > {s2, length2} with collation */ int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override = 0; From 7d44c4d51ce02df426af449d9bdbd48c176fb199 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Wed, 20 Jul 2022 14:37:32 +0800 Subject: [PATCH 12/15] rollback modify --- dbms/src/Columns/ColumnString.h | 38 ++++++++++++++---------- dbms/src/Common/ColumnsHashing.h | 8 +++-- dbms/src/Storages/Transaction/Collator.h | 27 ++++------------- 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h index da45ae59c83..ac2d2973239 100644 --- a/dbms/src/Columns/ColumnString.h +++ b/dbms/src/Columns/ColumnString.h @@ -208,20 +208,21 @@ class ColumnString final : public COWPtrHelper { size_t string_size = sizeAt(n); size_t offset = offsetAt(n); + const void * src = &chars[offset]; StringRef res; - StringRef sort_key{reinterpret_cast(&chars[offset]), string_size}; - - // Skip last zero byte. - collator->sortKeyNullable(sort_key.data, sort_key.size - 1, sort_key_container, sort_key); - - string_size = sort_key.size; - + if (collator != nullptr) + { + // Skip last zero byte. + auto sort_key = collator->sortKey(reinterpret_cast(src), string_size - 1, sort_key_container); + string_size = sort_key.size; + src = sort_key.data; + } res.size = sizeof(string_size) + string_size; char * pos = arena.allocContinue(res.size, begin); memcpy(pos, &string_size, sizeof(string_size)); - memcpy(pos + sizeof(string_size), sort_key.data, string_size); + memcpy(pos + sizeof(string_size), src, string_size); res.data = pos; return res; } @@ -244,14 +245,19 @@ class ColumnString final : public COWPtrHelper { size_t string_size = sizeAt(n); size_t offset = offsetAt(n); - - StringRef sort_key{reinterpret_cast(&chars[offset]), string_size}; - - // Skip last zero byte. - collator->sortKeyNullable(sort_key.data, sort_key.size - 1, sort_key_container, sort_key); - string_size = sort_key.size; - hash.update(reinterpret_cast(&string_size), sizeof(string_size)); - hash.update(sort_key.data, sort_key.size); + if (collator) + { + // Skip last zero byte. + auto sort_key = collator->sortKey(reinterpret_cast(&chars[offset]), string_size - 1, sort_key_container); + string_size = sort_key.size; + hash.update(reinterpret_cast(&string_size), sizeof(string_size)); + hash.update(sort_key.data, sort_key.size); + } + else + { + hash.update(reinterpret_cast(&string_size), sizeof(string_size)); + hash.update(reinterpret_cast(&chars[offset]), string_size); + } } void updateHashWithValues(IColumn::HashValues & hash_values, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const override diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 13f30de98e8..d906e2755e9 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -119,7 +119,8 @@ struct HashMethodString if constexpr (place_string_to_arena) { - collator->sortKeyNullable(key.data, key.size, sort_key_containers[0], key); + if (collator) + key = collator->sortKey(key.data, key.size, sort_key_containers[0]); return ArenaKeyHolder{key, *pool}; } else @@ -159,7 +160,10 @@ struct HashMethodFixedString { StringRef key(&(*chars)[row * n], n); - collator->sortKeyNullable(key.data, key.size, sort_key_containers[0], key); + if (collator) + { + key = collator->sortKey(key.data, key.size, sort_key_containers[0]); + } if constexpr (place_string_to_arena) { diff --git a/dbms/src/Storages/Transaction/Collator.h b/dbms/src/Storages/Transaction/Collator.h index 22d4f5a6597..55db14f44f5 100644 --- a/dbms/src/Storages/Transaction/Collator.h +++ b/dbms/src/Storages/Transaction/Collator.h @@ -169,21 +169,6 @@ struct TiDBCollatorPtrImpl : TiDBCollatorID } } - ALWAYS_INLINE inline bool sortKeyNullable(const char * s, size_t length, std::string & container, StringRef & res) const - { - if (likely(canUseFastPath())) - { - res = fastPathSortKey(s, length); - return true; - } - else if (ptr) - { - res = sortKeyIndirect(s, length, container); - return true; - } - return false; - } - static ALWAYS_INLINE inline StringRef fastPathSortKey(const char * s, size_t length) { return DB::BinCollatorSortKey(s, length); @@ -207,28 +192,28 @@ struct TiDBCollatorPtr : inner(ptr_, collator_id_) { } - TiDBCollatorPtr(class ITiDBCollator const * ptr_ = nullptr) + TiDBCollatorPtr(class ITiDBCollator const * ptr_ = nullptr) // NOLINT : inner(ptr_) { } - bool operator==(const void * tar) const + ALWAYS_INLINE bool operator==(const void * tar) const { return inner.ptr == tar; } - bool operator!=(const void * tar) const + ALWAYS_INLINE bool operator!=(const void * tar) const { return inner.ptr != tar; } - explicit operator bool() const + ALWAYS_INLINE explicit operator bool() const { return inner.ptr; } - TiDBCollatorPtrImpl * operator->() + ALWAYS_INLINE TiDBCollatorPtrImpl * operator->() { return &inner; } - const TiDBCollatorPtrImpl * operator->() const + ALWAYS_INLINE const TiDBCollatorPtrImpl * operator->() const { return &inner; } From 9599ff2c55bab682b77fb70d856d1d0c137dd1ab Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Wed, 20 Jul 2022 20:38:16 +0800 Subject: [PATCH 13/15] rollback modify --- dbms/src/Columns/ColumnString.cpp | 80 ++++++---- dbms/src/Columns/ColumnString.h | 49 +----- dbms/src/Columns/IColumn.cpp | 26 +--- dbms/src/Columns/IColumn.h | 16 +- dbms/src/Common/ColumnsHashing.h | 6 +- .../DAGExpressionAnalyzerHelper.cpp | 2 +- dbms/src/Flash/Coprocessor/DAGUtils.cpp | 2 +- .../Functions/CollationOperatorOptimized.h | 35 ++++- dbms/src/Functions/FunctionsComparison.h | 4 +- dbms/src/Functions/FunctionsStringSearch.cpp | 41 +++-- dbms/src/Functions/tests/gtest_regexp.cpp | 16 +- dbms/src/Interpreters/AggregationCommon.h | 1 - dbms/src/Storages/Transaction/Collator.cpp | 3 +- dbms/src/Storages/Transaction/Collator.h | 146 ++---------------- dbms/src/Storages/Transaction/TiDB.cpp | 4 +- 15 files changed, 132 insertions(+), 299 deletions(-) diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index 8b549c03ac3..ce96165b689 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -17,9 +17,10 @@ #include #include #include -#include +#include #include + /// Used in the `reserve` method, when the number of rows is known, but sizes of elements are not. #define APPROX_STRING_SIZE 64 @@ -419,54 +420,73 @@ void ColumnString::updateWeakHash32(WeakHash32 & hash, const TiDB::TiDBCollatorP if (hash.getData().size() != s) throw Exception(fmt::format("Size of WeakHash32 does not match size of column: column size is {}, hash size is {}", s, hash.getData().size()), ErrorCodes::LOGICAL_ERROR); - const UInt8 * pos = chars.data(); UInt32 * hash_data = hash.getData().data(); - Offset prev_offset = 0; if (collator != nullptr) { - if (collator->canUseFastPath()) + if (collator->getCollatorId() == TiDB::ITiDBCollator::UTF8MB4_BIN) { - for (const auto & offset : offsets) - { - auto str_size = offset - prev_offset; - - // Skip last zero byte. - auto sort_key = collator->fastPathSortKey(reinterpret_cast(pos), str_size - 1); + // Skip last zero byte. + LoopOneColumn(chars, offsets, offsets.size(), [&](const std::string_view & view, size_t) { + auto sort_key = BinCollatorSortKey(view.data(), view.size()); *hash_data = ::updateWeakHash32(reinterpret_cast(sort_key.data), sort_key.size, *hash_data); - - pos += str_size; - prev_offset = offset; ++hash_data; - } + }); } else { - for (const auto & offset : offsets) - { - auto str_size = offset - prev_offset; - - // Skip last zero byte. - auto sort_key = collator->sortKeyIndirect(reinterpret_cast(pos), str_size - 1, sort_key_container); + // Skip last zero byte. + LoopOneColumn(chars, offsets, offsets.size(), [&](const std::string_view & view, size_t) { + auto sort_key = collator->sortKey(view.data(), view.size(), sort_key_container); *hash_data = ::updateWeakHash32(reinterpret_cast(sort_key.data), sort_key.size, *hash_data); - - pos += str_size; - prev_offset = offset; ++hash_data; - } + }); } } else { - for (const auto & offset : offsets) + // Skip last zero byte. + LoopOneColumn(chars, offsets, offsets.size(), [&](const std::string_view & view, size_t) { + *hash_data = ::updateWeakHash32(reinterpret_cast(view.data()), view.size(), *hash_data); + ++hash_data; + }); + } +} + +void ColumnString::updateHashWithValues(IColumn::HashValues & hash_values, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const +{ + if (collator != nullptr) + { + if (collator->getCollatorId() == TiDB::ITiDBCollator::UTF8MB4_BIN) + { + // Skip last zero byte. + LoopOneColumn(chars, offsets, offsets.size(), [&hash_values](const std::string_view & view, size_t i) { + auto sort_key = BinCollatorSortKey(view.data(), view.size()); + size_t string_size = sort_key.size; + hash_values[i].update(reinterpret_cast(&string_size), sizeof(string_size)); + hash_values[i].update(sort_key.data, sort_key.size); + }); + } + else { - auto str_size = offset - prev_offset; // Skip last zero byte. - *hash_data = ::updateWeakHash32(pos, str_size - 1, *hash_data); + LoopOneColumn(chars, offsets, offsets.size(), [&](const std::string_view & view, size_t i) { + auto sort_key = collator->sortKey(view.data(), view.size(), sort_key_container); + size_t string_size = sort_key.size; + hash_values[i].update(reinterpret_cast(&string_size), sizeof(string_size)); + hash_values[i].update(sort_key.data, sort_key.size); + }); + } + } + else + { + for (size_t i = 0; i < offsets.size(); ++i) + { + size_t string_size = sizeAt(i); + size_t offset = offsetAt(i); - pos += str_size; - prev_offset = offset; - ++hash_data; + hash_values[i].update(reinterpret_cast(&string_size), sizeof(string_size)); + hash_values[i].update(reinterpret_cast(&chars[offset]), string_size); } } } diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h index ac2d2973239..2204319e090 100644 --- a/dbms/src/Columns/ColumnString.h +++ b/dbms/src/Columns/ColumnString.h @@ -19,7 +19,6 @@ #include #include #include -#include #include @@ -245,7 +244,7 @@ class ColumnString final : public COWPtrHelper { size_t string_size = sizeAt(n); size_t offset = offsetAt(n); - if (collator) + if (collator != nullptr) { // Skip last zero byte. auto sort_key = collator->sortKey(reinterpret_cast(&chars[offset]), string_size - 1, sort_key_container); @@ -260,51 +259,7 @@ class ColumnString final : public COWPtrHelper } } - void updateHashWithValues(IColumn::HashValues & hash_values, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const override - { - if (collator != nullptr) - { - if (collator->canUseFastPath()) - { - for (size_t i = 0; i < offsets.size(); ++i) - { - size_t string_size = sizeAt(i); - size_t offset = offsetAt(i); - - // Skip last zero byte. - auto sort_key = collator->fastPathSortKey(reinterpret_cast(&chars[offset]), string_size - 1); - string_size = sort_key.size; - hash_values[i].update(reinterpret_cast(&string_size), sizeof(string_size)); - hash_values[i].update(sort_key.data, sort_key.size); - } - } - else - { - for (size_t i = 0; i < offsets.size(); ++i) - { - size_t string_size = sizeAt(i); - size_t offset = offsetAt(i); - - // Skip last zero byte. - auto sort_key = collator->sortKeyIndirect(reinterpret_cast(&chars[offset]), string_size - 1, sort_key_container); - string_size = sort_key.size; - hash_values[i].update(reinterpret_cast(&string_size), sizeof(string_size)); - hash_values[i].update(sort_key.data, sort_key.size); - } - } - } - else - { - for (size_t i = 0; i < offsets.size(); ++i) - { - size_t string_size = sizeAt(i); - size_t offset = offsetAt(i); - - hash_values[i].update(reinterpret_cast(&string_size), sizeof(string_size)); - hash_values[i].update(reinterpret_cast(&chars[offset]), string_size); - } - } - } + void updateHashWithValues(IColumn::HashValues & hash_values, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const override; void updateWeakHash32(WeakHash32 & hash, const TiDB::TiDBCollatorPtr &, String &) const override; diff --git a/dbms/src/Columns/IColumn.cpp b/dbms/src/Columns/IColumn.cpp index 3022ebc6db9..0cc776632ff 100644 --- a/dbms/src/Columns/IColumn.cpp +++ b/dbms/src/Columns/IColumn.cpp @@ -15,7 +15,7 @@ #include #include #include -#include + namespace DB { @@ -34,28 +34,4 @@ String IColumn::dumpStructure() const return res.str(); } -StringRef IColumn::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const -{ - return serializeValueIntoArena(n, arena, begin, nullptr, TiDB::dummy_sort_key_contaner); -} -void IColumn::updateHashWithValue(size_t n, SipHash & hash) const -{ - updateHashWithValue(n, hash, nullptr, TiDB::dummy_sort_key_contaner); -} - -void IColumn::updateHashWithValues(HashValues & hash_values) const -{ - updateHashWithValues(hash_values, nullptr, TiDB::dummy_sort_key_contaner); -} - -void IColumn::updateWeakHash32(WeakHash32 & hash) const -{ - updateWeakHash32(hash, nullptr, TiDB::dummy_sort_key_contaner); -} - -const char * IColumn::deserializeAndInsertFromArena(const char * pos) -{ - return deserializeAndInsertFromArena(pos, nullptr); -} - } // namespace DB diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h index 72f2d324bcb..a5a25e6e28c 100644 --- a/dbms/src/Columns/IColumn.h +++ b/dbms/src/Columns/IColumn.h @@ -21,14 +21,10 @@ #include #include #include +#include #include #include -namespace TiDB -{ -struct TiDBCollatorPtr; -} - namespace DB { namespace ErrorCodes @@ -182,7 +178,7 @@ class IColumn : public COWPtr * Parameter begin should be used with Arena::allocContinue. */ virtual StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const = 0; - StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const; + StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const { return serializeValueIntoArena(n, arena, begin, nullptr, TiDB::dummy_sort_key_contaner); } /** Deserializes a value that was serialized using IColumn::serializeValueIntoArena method. * Returns pointer to the position after the read data. @@ -198,23 +194,23 @@ class IColumn : public COWPtr * the complex column will be ok. */ virtual const char * deserializeAndInsertFromArena(const char * pos, const TiDB::TiDBCollatorPtr & collator) = 0; - const char * deserializeAndInsertFromArena(const char * pos); + const char * deserializeAndInsertFromArena(const char * pos) { return deserializeAndInsertFromArena(pos, nullptr); } /// Update state of hash function with value of n-th element. /// On subsequent calls of this method for sequence of column values of arbitary types, /// passed bytes to hash must identify sequence of values unambiguously. virtual void updateHashWithValue(size_t n, SipHash & hash, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const = 0; - void updateHashWithValue(size_t n, SipHash & hash) const; + void updateHashWithValue(size_t n, SipHash & hash) const { updateHashWithValue(n, hash, nullptr, TiDB::dummy_sort_key_contaner); } using HashValues = PaddedPODArray; virtual void updateHashWithValues(HashValues & hash_values, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const = 0; - void updateHashWithValues(HashValues & hash_values) const; + void updateHashWithValues(HashValues & hash_values) const { updateHashWithValues(hash_values, nullptr, TiDB::dummy_sort_key_contaner); } /// Update hash function value. Hash is calculated for each element. /// It's a fast weak hash function. Mainly need to scatter data between threads. /// WeakHash32 must have the same size as column. virtual void updateWeakHash32(WeakHash32 & hash, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const = 0; - void updateWeakHash32(WeakHash32 & hash) const; + void updateWeakHash32(WeakHash32 & hash) const { updateWeakHash32(hash, nullptr, TiDB::dummy_sort_key_contaner); } /** Removes elements that don't match the filter. * Is used in WHERE and HAVING operations. diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index d906e2755e9..525a7f5ab4d 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -99,7 +99,7 @@ struct HashMethodString HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators & collators) { const IColumn & column = *key_columns[0]; - const auto & column_string = assert_cast(column); + const ColumnString & column_string = assert_cast(column); offsets = column_string.getOffsets().data(); chars = column_string.getChars().data(); if (!collators.empty()) @@ -113,8 +113,6 @@ struct HashMethodString auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const { auto last_offset = row == 0 ? 0 : offsets[row - 1]; - - // Skip last zero byte. StringRef key(chars + last_offset, offsets[row] - last_offset - 1); if constexpr (place_string_to_arena) @@ -149,7 +147,7 @@ struct HashMethodFixedString HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators & collators) { const IColumn & column = *key_columns[0]; - const auto & column_string = assert_cast(column); + const ColumnFixedString & column_string = assert_cast(column); n = column_string.getN(); chars = &column_string.getChars(); if (!collators.empty()) diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp index f93989f037a..23bbb4586b3 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp @@ -168,7 +168,7 @@ String DAGExpressionAnalyzerHelper::buildInFunction( actions->add(ExpressionAction::addColumn(column)); argument_names.push_back(column.name); - const auto collator = getCollatorFromExpr(expr); + const auto * collator = getCollatorFromExpr(expr); String expr_name = analyzer->applyFunction(func_name, argument_names, actions, collator); if (set->remaining_exprs.empty()) diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index 17866c1f2ca..2003103a20a 100644 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -1329,7 +1329,7 @@ SortDescription getSortDescription(const std::vector & order_co if (removeNullable(order_columns[i].type)->isString()) collator = getCollatorFromExpr(by_items[i].expr()); - order_descr.emplace_back(name, direction, nulls_direction, collator.inner.ptr); + order_descr.emplace_back(name, direction, nulls_direction, collator); } return order_descr; } diff --git a/dbms/src/Functions/CollationOperatorOptimized.h b/dbms/src/Functions/CollationOperatorOptimized.h index f69e1bc31e7..8276a41fa17 100644 --- a/dbms/src/Functions/CollationOperatorOptimized.h +++ b/dbms/src/Functions/CollationOperatorOptimized.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,7 @@ struct IsEqualRelated> }; // Loop columns and invoke callback for each pair. +// Remove last zero byte. template __attribute__((flatten, always_inline)) inline void LoopTwoColumns( const ColumnString::Chars_t & a_data, @@ -56,18 +58,29 @@ __attribute__((flatten, always_inline)) inline void LoopTwoColumns( size_t size, F && func) { + ColumnString::Offset a_prev_offset = 0; + ColumnString::Offset b_prev_offset = 0; + const auto * a_ptr = reinterpret_cast(a_data.data()); + const auto * b_ptr = reinterpret_cast(b_data.data()); + for (size_t i = 0; i < size; ++i) { - size_t a_size = StringUtil::sizeAt(a_offsets, i) - 1; - size_t b_size = StringUtil::sizeAt(b_offsets, i) - 1; - const auto * a_ptr = reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]); - const auto * b_ptr = reinterpret_cast(&b_data[StringUtil::offsetAt(b_offsets, i)]); + auto a_size = a_offsets[i] - a_prev_offset; + auto b_size = b_offsets[i] - b_prev_offset; + + // Remove last zero byte. + func({a_ptr, a_size - 1}, {b_ptr, b_size - 1}, i); + + a_ptr += a_size; + b_ptr += b_size; - func({a_ptr, a_size}, {b_ptr, b_size}, i); + a_prev_offset = a_offsets[i]; + b_prev_offset = b_offsets[i]; } } // Loop one column and invoke callback for each pair. +// Remove last zero byte. template __attribute__((flatten, always_inline)) inline void LoopOneColumn( const ColumnString::Chars_t & a_data, @@ -75,12 +88,18 @@ __attribute__((flatten, always_inline)) inline void LoopOneColumn( size_t size, F && func) { + ColumnString::Offset a_prev_offset = 0; + const auto * a_ptr = reinterpret_cast(a_data.data()); + for (size_t i = 0; i < size; ++i) { - size_t a_size = StringUtil::sizeAt(a_offsets, i) - 1; - const auto * a_ptr = reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]); + auto a_size = a_offsets[i] - a_prev_offset; + + // Remove last zero byte. + func({a_ptr, a_size - 1}, i); - func({a_ptr, a_size}, i); + a_ptr += a_size; + a_prev_offset = a_offsets[i]; } } diff --git a/dbms/src/Functions/FunctionsComparison.h b/dbms/src/Functions/FunctionsComparison.h index 511e6da5254..8f7502fba85 100644 --- a/dbms/src/Functions/FunctionsComparison.h +++ b/dbms/src/Functions/FunctionsComparison.h @@ -317,7 +317,7 @@ struct StringComparisonWithCollatorImpl size_t a_offset = StringUtil::offsetAt(a_offsets, i); size_t b_offset = StringUtil::offsetAt(b_offsets, i); - c[i] = Op::apply(collator->compareIndirect(reinterpret_cast(&a_data[a_offset]), a_size, reinterpret_cast(&b_data[b_offset]), b_size), 0); + c[i] = Op::apply(collator->compare(reinterpret_cast(&a_data[a_offset]), a_size, reinterpret_cast(&b_data[b_offset]), b_size), 0); } } @@ -341,7 +341,7 @@ struct StringComparisonWithCollatorImpl for (size_t i = 0; i < size; ++i) { /// Trailing zero byte of the smaller string is included in the comparison. - c[i] = Op::apply(collator->compareIndirect(reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]), StringUtil::sizeAt(a_offsets, i) - 1, b_data, b_size), 0); + c[i] = Op::apply(collator->compare(reinterpret_cast(&a_data[StringUtil::offsetAt(a_offsets, i)]), StringUtil::sizeAt(a_offsets, i) - 1, b_data, b_size), 0); } } diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index ab0ff30ff66..f0c6cd6f303 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -1931,18 +1930,18 @@ class FunctionStringReplace : public IFunction const String & match_type, ColumnWithTypeAndName & column_result) const { - const auto * c1_const = typeid_cast(column_needle.get()); - const auto * c2_const = typeid_cast(column_replacement.get()); - auto needle = c1_const->getValue(); - auto replacement = c2_const->getValue(); + const ColumnConst * c1_const = typeid_cast(column_needle.get()); + const ColumnConst * c2_const = typeid_cast(column_replacement.get()); + String needle = c1_const->getValue(); + String replacement = c2_const->getValue(); - if (const auto * col = checkAndGetColumn(column_src.get())) + if (const ColumnString * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const auto * col = checkAndGetColumn(column_src.get())) + else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixed(col->getChars(), col->getN(), needle, replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -1965,17 +1964,17 @@ class FunctionStringReplace : public IFunction { if constexpr (Impl::support_non_const_needle) { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement_const = typeid_cast(column_replacement.get()); - auto replacement = col_replacement_const->getValue(); + const ColumnString * col_needle = typeid_cast(column_needle.get()); + const ColumnConst * col_replacement_const = typeid_cast(column_replacement.get()); + String replacement = col_replacement_const->getValue(); - if (const auto * col = checkAndGetColumn(column_src.get())) + if (const ColumnString * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstNeedle(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const auto * col = checkAndGetColumn(column_src.get())) + else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixedNonConstNeedle(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), replacement, pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -2003,17 +2002,17 @@ class FunctionStringReplace : public IFunction { if constexpr (Impl::support_non_const_replacement) { - const auto * col_needle_const = typeid_cast(column_needle.get()); - auto needle = col_needle_const->getValue(); - const auto * col_replacement = typeid_cast(column_replacement.get()); + const ColumnConst * col_needle_const = typeid_cast(column_needle.get()); + String needle = col_needle_const->getValue(); + const ColumnString * col_replacement = typeid_cast(column_replacement.get()); - if (const auto * col = checkAndGetColumn(column_src.get())) + if (const ColumnString * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstReplacement(col->getChars(), col->getOffsets(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const auto * col = checkAndGetColumn(column_src.get())) + else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixedNonConstReplacement(col->getChars(), col->getN(), needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); @@ -2041,16 +2040,16 @@ class FunctionStringReplace : public IFunction { if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement = typeid_cast(column_replacement.get()); + const ColumnString * col_needle = typeid_cast(column_needle.get()); + const ColumnString * col_replacement = typeid_cast(column_replacement.get()); - if (const auto * col = checkAndGetColumn(column_src.get())) + if (const ColumnString * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstNeedleReplacement(col->getChars(), col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const auto * col = checkAndGetColumn(column_src.get())) + else if (const ColumnFixedString * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorFixedNonConstNeedleReplacement(col->getChars(), col->getN(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, occ, match_type, collator, col_res->getChars(), col_res->getOffsets()); diff --git a/dbms/src/Functions/tests/gtest_regexp.cpp b/dbms/src/Functions/tests/gtest_regexp.cpp index 140e9014f17..d3eb93a0790 100644 --- a/dbms/src/Functions/tests/gtest_regexp.cpp +++ b/dbms/src/Functions/tests/gtest_regexp.cpp @@ -66,8 +66,8 @@ class Regexp : public FunctionTest TEST_F(Regexp, testRegexpMatchType) { UInt8 res = false; - const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); - const auto ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); + const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto * ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); DB::MatchImpl::constantConstant("a\nB\n", "(?m)(?i)^b", '\\', "", nullptr, res); ASSERT_TRUE(res == 1); DB::MatchImpl::constantConstant("a\nB\n", "^b", '\\', "mi", nullptr, res); @@ -1744,8 +1744,8 @@ TEST_F(Regexp, testRegexpMySQLCases) TEST_F(Regexp, testRegexpTiDBCase) { UInt8 res; - const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); - const auto ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); + const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto * ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); DB::MatchImpl::constantConstant("a", "^$", '\\', "", nullptr, res); ASSERT_TRUE(res == 0); DB::MatchImpl::constantConstant("a", "a", '\\', "", nullptr, res); @@ -1782,7 +1782,7 @@ TEST_F(Regexp, testRegexpTiDBCase) TEST_F(Regexp, testRegexp) { - const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); auto string_type = std::make_shared(); auto nullable_string_type = makeNullable(string_type); auto uint8_type = std::make_shared(); @@ -1954,8 +1954,8 @@ TEST_F(Regexp, testRegexpCustomerCases) TEST_F(Regexp, testRegexpReplaceMatchType) { String res; - const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); - const auto ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); + const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto * ci_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI); DB::ReplaceRegexpImpl::constant("a\nB\nc", "(?m)(?i)^b", "xxx", 1, 0, "", nullptr, res); ASSERT_TRUE(res == "a\nxxx\nc"); DB::ReplaceRegexpImpl::constant("a\nB\nc", "^b", "xxx", 1, 0, "mi", nullptr, res); @@ -2020,7 +2020,7 @@ TEST_F(Regexp, testRegexpReplaceMySQLCases) TEST_F(Regexp, testRegexpReplace) { - const auto binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); + const auto * binary_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY); auto string_type = std::make_shared(); auto nullable_string_type = makeNullable(string_type); auto uint8_type = std::make_shared(); diff --git a/dbms/src/Interpreters/AggregationCommon.h b/dbms/src/Interpreters/AggregationCommon.h index f02c3994ada..a043760b9af 100644 --- a/dbms/src/Interpreters/AggregationCommon.h +++ b/dbms/src/Interpreters/AggregationCommon.h @@ -20,7 +20,6 @@ #include #include #include -#include #include diff --git a/dbms/src/Storages/Transaction/Collator.cpp b/dbms/src/Storages/Transaction/Collator.cpp index 974135b38b3..09024e777ce 100644 --- a/dbms/src/Storages/Transaction/Collator.cpp +++ b/dbms/src/Storages/Transaction/Collator.cpp @@ -13,12 +13,11 @@ // limitations under the License. #include -#include #include #include +#include #include -#include namespace DB::ErrorCodes { diff --git a/dbms/src/Storages/Transaction/Collator.h b/dbms/src/Storages/Transaction/Collator.h index 55db14f44f5..a5650cd9c7e 100644 --- a/dbms/src/Storages/Transaction/Collator.h +++ b/dbms/src/Storages/Transaction/Collator.h @@ -15,16 +15,20 @@ #pragma once #include -#include +#include + +#include +#include namespace TiDB { -struct TiDBCollatorPtr; +using TiDBCollatorPtr = class ITiDBCollator const *; using TiDBCollators = std::vector; -struct TiDBCollatorID +class ITiDBCollator : public ICollator { +public: enum { UTF8_GENERAL_CI = 33, @@ -38,30 +42,6 @@ struct TiDBCollatorID UTF8_BIN = 83, }; - int32_t getCollatorId() const { return collator_id; } - bool isBinary() const { return collator_id == BINARY; } - bool isCI() const - { - return collator_id == UTF8_UNICODE_CI || collator_id == UTF8_GENERAL_CI - || collator_id == UTF8MB4_UNICODE_CI || collator_id == UTF8MB4_GENERAL_CI; - } - bool isBin() const - { - return collator_id == UTF8_BIN || collator_id == UTF8MB4_BIN - || collator_id == ASCII_BIN || collator_id == LATIN1_BIN; - } - - explicit TiDBCollatorID(int32_t id) - : collator_id(id) - {} - - int32_t collator_id; -}; - -class ITiDBCollator : public TiDBCollatorID - , public ICollator -{ -public: /// Get the collator according to the internal collation ID, which directly comes from tipb and has been properly /// de-rewritten - the "New CI Collation" will flip the sign of the collation ID. static TiDBCollatorPtr getCollator(int32_t id); @@ -83,17 +63,9 @@ class ITiDBCollator : public TiDBCollatorID ~ITiDBCollator() override = default; - /** - compare with collation - 0 equal - <0 {s1, length1} < {s2, length2} with collation - >0 {s1, length1} > {s2, length2} with collation - */ int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override = 0; - virtual StringRef sortKey(const char * s, size_t length, std::string & container) const = 0; virtual std::unique_ptr pattern() const = 0; - int32_t getCollatorId() const { return collator_id; } bool isBinary() const { return collator_id == BINARY; } bool isCI() const @@ -109,8 +81,8 @@ class ITiDBCollator : public TiDBCollatorID protected: explicit ITiDBCollator(int32_t collator_id_) - : TiDBCollatorID(collator_id_) - {} + : collator_id(collator_id_){}; + int32_t collator_id; }; /// these dummy_xxx are used as the default value to avoid too many meaningless @@ -119,104 +91,4 @@ extern TiDBCollators dummy_collators; extern std::vector dummy_sort_key_contaners; extern std::string dummy_sort_key_contaner; -struct TiDBCollatorPtrImpl : TiDBCollatorID -{ - class ITiDBCollator const * ptr{}; - - TiDBCollatorPtrImpl(class ITiDBCollator const * ptr_, int32_t collator_id_) - : TiDBCollatorID(collator_id_) - , ptr(ptr_) - { - } - - explicit TiDBCollatorPtrImpl(class ITiDBCollator const * ptr_) - : TiDBCollatorID(ptr_ ? ptr_->getCollatorId() : 0) - , ptr(ptr_) - { - } - - ALWAYS_INLINE inline int compare(const char * s1, size_t length1, const char * s2, size_t length2) const - { - if (likely(canUseFastPath())) - { - return DB::BinCollatorCompare(s1, length1, s2, length2); - } - else - { - return compareIndirect(s1, length1, s2, length2); - } - } - - ALWAYS_INLINE inline int compareIndirect(const char * s1, size_t length1, const char * s2, size_t length2) const - { - return ptr->compare(s1, length1, s2, length2); - } - - ALWAYS_INLINE inline bool canUseFastPath() const - { - return collator_id == ITiDBCollator::UTF8MB4_BIN; - } - - ALWAYS_INLINE inline StringRef sortKey(const char * s, size_t length, std::string & container) const - { - if (likely(canUseFastPath())) - { - return fastPathSortKey(s, length); - } - else - { - return sortKeyIndirect(s, length, container); - } - } - - static ALWAYS_INLINE inline StringRef fastPathSortKey(const char * s, size_t length) - { - return DB::BinCollatorSortKey(s, length); - } - - ALWAYS_INLINE inline StringRef sortKeyIndirect(const char * s, size_t length, std::string & container) const - { - return ptr->sortKey(s, length, container); - } - - ALWAYS_INLINE inline std::unique_ptr pattern() const { return ptr->pattern(); } - - ALWAYS_INLINE int32_t getCollatorId() const { return collator_id; } -}; - -struct TiDBCollatorPtr -{ - TiDBCollatorPtrImpl inner; - - TiDBCollatorPtr(class ITiDBCollator const * ptr_, int32_t collator_id_) - : inner(ptr_, collator_id_) - { - } - TiDBCollatorPtr(class ITiDBCollator const * ptr_ = nullptr) // NOLINT - : inner(ptr_) - { - } - ALWAYS_INLINE bool operator==(const void * tar) const - { - return inner.ptr == tar; - } - ALWAYS_INLINE bool operator!=(const void * tar) const - { - return inner.ptr != tar; - } - - ALWAYS_INLINE explicit operator bool() const - { - return inner.ptr; - } - ALWAYS_INLINE TiDBCollatorPtrImpl * operator->() - { - return &inner; - } - ALWAYS_INLINE const TiDBCollatorPtrImpl * operator->() const - { - return &inner; - } -}; - } // namespace TiDB diff --git a/dbms/src/Storages/Transaction/TiDB.cpp b/dbms/src/Storages/Transaction/TiDB.cpp index 59d5d97e65d..6d07c47f235 100644 --- a/dbms/src/Storages/Transaction/TiDB.cpp +++ b/dbms/src/Storages/Transaction/TiDB.cpp @@ -240,7 +240,7 @@ DB::Field ColumnInfo::getDecimalValue(const String & decimal_text) const // FIXME it still has bug: https://github.com/pingcap/tidb/issues/11435 Int64 ColumnInfo::getEnumIndex(const String & enum_id_or_text) const { - auto collator = ITiDBCollator::getCollator(collate.isEmpty() ? "binary" : collate.convert()); + const auto * collator = ITiDBCollator::getCollator(collate.isEmpty() ? "binary" : collate.convert()); if (!collator) // todo if new collation is enabled, should use "utf8mb4_bin" collator = ITiDBCollator::getCollator("binary"); @@ -257,7 +257,7 @@ Int64 ColumnInfo::getEnumIndex(const String & enum_id_or_text) const UInt64 ColumnInfo::getSetValue(const String & set_str) const { - auto collator = ITiDBCollator::getCollator(collate.isEmpty() ? "binary" : collate.convert()); + const auto * collator = ITiDBCollator::getCollator(collate.isEmpty() ? "binary" : collate.convert()); if (!collator) // todo if new collation is enabled, should use "utf8mb4_bin" collator = ITiDBCollator::getCollator("binary"); From 91197108d4c5aeec1635b6bd3dcfb6535f733067 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Wed, 20 Jul 2022 21:21:17 +0800 Subject: [PATCH 14/15] Fix compile --- dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp b/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp index fcace3fb30e..13c51dba2db 100644 --- a/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp +++ b/dbms/src/Storages/Transaction/tests/gtest_tidb_collator.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include namespace DB::tests From bac11cff415d5eaaeac2afa3333017b75a1656d8 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Wed, 20 Jul 2022 21:29:01 +0800 Subject: [PATCH 15/15] Fix --- dbms/src/Columns/ColumnString.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index ce96165b689..c312a186275 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -317,9 +317,10 @@ int ColumnString::compareAtWithCollationImpl(size_t n, size_t m, const IColumn & return collator.compare( reinterpret_cast(&chars[offsetAt(n)]), - sizeAt(n), + sizeAt(n) - 1, // Skip last zero byte. reinterpret_cast(&rhs.chars[rhs.offsetAt(m)]), - rhs.sizeAt(m)); + rhs.sizeAt(m) - 1 // Skip last zero byte. + ); } // Derived must implement function `int compare(const char *, size_t, const char *, size_t)`. @@ -338,9 +339,9 @@ struct ColumnString::LessWithCollation { int res = inner.compare( reinterpret_cast(&parent.chars[parent.offsetAt(lhs)]), - parent.sizeAt(lhs) - 1, // remove tail '\0' + parent.sizeAt(lhs) - 1, // Skip last zero byte. reinterpret_cast(&parent.chars[parent.offsetAt(rhs)]), - parent.sizeAt(rhs) - 1) // remove tail '\0' + parent.sizeAt(rhs) - 1) // Skip last zero byte. ; if constexpr (positive)