From 5f7093b4c04e32480d0942f43f2022344e197f11 Mon Sep 17 00:00:00 2001 From: zhiqiang Date: Thu, 7 Dec 2023 16:36:13 +0800 Subject: [PATCH] [fix](split_by_string) Fix split by string core on column string (#28030) --- be/src/vec/functions/function_string.h | 102 +++++++++++++++--- .../string_functions/test_split_by_string.out | 20 ++++ .../test_split_by_string.groovy | 60 +++++++++++ 3 files changed, 165 insertions(+), 17 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 37a21a3ea5b6bc..e48fbd263ecc88 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1803,6 +1803,7 @@ class FunctionSplitByString : public IFunction { const auto& [right_column, right_const] = unpack_if_const(block.get_by_position(arguments[1]).column); + DataTypePtr right_column_type = block.get_by_position(arguments[1]).type; DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(), ColumnArray::ColumnOffsets::create()); @@ -1818,27 +1819,42 @@ class FunctionSplitByString : public IFunction { dest_nested_column = dest_nullable_col->get_nested_column_ptr(); dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data(); - if (auto col_left = check_and_get_column<ColumnString>(src_column.get())) { - if (auto col_right = check_and_get_column<ColumnString>(right_column.get())) { - if (right_const) { - _execute_constant(*col_left, col_right->get_data_at(0), *dest_nested_column, - dest_offsets, dest_nested_null_map); - } else { - _execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets, - dest_nested_null_map); - } + auto col_left = check_and_get_column<ColumnString>(src_column.get()); + if (!col_left) { + return Status::InternalError("Left operator of function {} can not be {}", get_name(), + src_column_type->get_name()); + } - block.replace_by_position(result, std::move(dest_column_ptr)); - return
Status::OK(); - } + auto col_right = check_and_get_column<ColumnString>(right_column.get()); + if (!col_right) { + return Status::InternalError("Right operator of function {} can not be {}", get_name(), + right_column_type->get_name()); + } + + // split_by_string(ColumnString, "xxx") + if (right_const) { + _execute_constant_delimiter(*col_left, col_right->get_data_at(0), *dest_nested_column, + dest_offsets, dest_nested_null_map); + } else if (left_const) { + // split_by_string("xxx", ColumnString) + _execute_constant_src_string(col_left->get_data_at(0), *col_right, *dest_nested_column, + dest_offsets, dest_nested_null_map); + } else { + // split_by_string(ColumnString, ColumnString) + _execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets, + dest_nested_null_map); } - return Status::RuntimeError("unimplements function {}", get_name()); + + block.replace_by_position(result, std::move(dest_column_ptr)); + + return Status::OK(); } private: - void _execute_constant(const ColumnString& src_column_string, const StringRef& delimiter_ref, - IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, - NullMapType* dest_nested_null_map) { + void _execute_constant_delimiter(const ColumnString& src_column_string, + const StringRef& delimiter_ref, IColumn& dest_nested_column, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map) const { ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column); ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); @@ -1958,7 +1974,59 @@ class FunctionSplitByString : public IFunction { } } - size_t split_str(size_t& pos, const StringRef str_ref, StringRef delimiter_ref) { + void _execute_constant_src_string(const StringRef& str_ref, const ColumnString& delimiter_col, + IColumn& dest_nested_column, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map) const { + ColumnString&
dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column); + ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); + ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); + column_string_chars.reserve(0); + + ColumnArray::Offset64 string_pos = 0; + ColumnArray::Offset64 dest_pos = 0; + const ColumnArray::Offset64 delimiter_offsets_size = delimiter_col.get_offsets().size(); + + for (size_t i = 0; i < delimiter_offsets_size; ++i) { + const StringRef delimiter_ref = delimiter_col.get_data_at(i); + + if (delimiter_ref.size == 0) { + for (size_t str_pos = 0; str_pos < str_ref.size;) { + const size_t str_offset = str_pos; + const size_t old_size = column_string_chars.size(); + str_pos++; + const size_t new_size = old_size + 1; + column_string_chars.resize(new_size); + memcpy(column_string_chars.data() + old_size, str_ref.data + str_offset, 1); + (*dest_nested_null_map).push_back(false); + string_pos++; + dest_pos++; + column_string_offsets.push_back(string_pos); + } + } else { + for (size_t str_pos = 0; str_pos <= str_ref.size;) { + const size_t str_offset = str_pos; + const size_t old_size = column_string_chars.size(); + const size_t split_part_size = split_str(str_pos, str_ref, delimiter_ref); + str_pos += delimiter_ref.size; + const size_t new_size = old_size + split_part_size; + column_string_chars.resize(new_size); + if (split_part_size > 0) { + memcpy_small_allow_read_write_overflow15( + column_string_chars.data() + old_size, str_ref.data + str_offset, + split_part_size); + } + (*dest_nested_null_map).push_back(false); + string_pos += split_part_size; + dest_pos++; + column_string_offsets.push_back(string_pos); + } + } + dest_offsets.push_back(dest_pos); + } + } + + size_t split_str(size_t& pos, const StringRef str_ref, StringRef delimiter_ref) const { size_t old_size = pos; size_t str_size = str_ref.size; while (pos < str_size && diff --git
a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out index 00d9ad99781881..c46fa2bd27e2cc 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out @@ -87,3 +87,23 @@ 9 a,b,c, , ["a", "b", "c", ""] 10 \N , \N +-- !sql_1 -- +1 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] +2 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] + +-- !sql_2 -- +3 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] +4 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] + +-- !sql_3 -- +1 [] [] [] [] +2 [] [] [] [] +3 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] +4 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] + +-- !sql_4 -- +1 [] [] [] [] +2 [] [] [] [] +3 [""] [""] [""] [""] +4 [""] [""] [""] [""] + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy index d3f0588518124c..2ec70e361242ce 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy @@ -102,4 +102,64 @@ suite("test_split_by_string") { qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName2} ORDER BY k1" + + // The case where both operands are string columns is covered by the test above.
+ sql """DROP TABLE IF EXISTS test_split_by_string_2""" + sql """ + CREATE TABLE IF NOT EXISTS test_split_by_string_2 ( + `rid` INT NULL, + `str` TEXT NULL, + `vc` VARCHAR(5) NULL, + `chr` CHAR(5) NULL, + `txt` TEXT NULL + ) ENGINE=OLAP + DUPLICATE KEY(`rid`) + DISTRIBUTED BY HASH(`rid`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ) + """ + sql """ INSERT INTO test_split_by_string_2 + VALUES (1, "", "", "", ""), + (2, "", "", "", ""), + (3, "a,b,c", "a,b,c", "a,b,c", "a,b,c"), + (4, "a,b,c", "a,b,c", "a,b,c", "a,b,c") + """ + // Left operand is a constant, right operand is a string column + qt_sql_1 """ + SELECT rid, + split_by_string("abc", str), + split_by_string("abc", vc), + split_by_string("abc", chr), + split_by_string("abc", txt) + FROM test_split_by_string_2 WHERE rid=1 OR rid=2 ORDER BY rid; + """ + // Left operand is a string column, right operand is a constant + qt_sql_2 """ + SELECT rid, + split_by_string(str, ","), + split_by_string(vc, ","), + split_by_string(chr, ","), + split_by_string(txt, ",") + FROM test_split_by_string_2 WHERE rid=3 OR rid=4 ORDER BY rid; + """ + + // Empty string + qt_sql_3 """ + SELECT rid, + split_by_string(str, ""), + split_by_string(vc, ""), + split_by_string(chr, ""), + split_by_string(txt, "") + FROM test_split_by_string_2 ORDER BY rid; + """ + qt_sql_4 """ + SELECT rid, + split_by_string("", str), + split_by_string("", vc), + split_by_string("", chr), + split_by_string("", txt) + FROM test_split_by_string_2 ORDER BY rid; + """ } \ No newline at end of file