Skip to content

Commit

Permalink
[fix](split_by_string) Fix split by string core on column string (apa…
Browse files Browse the repository at this point in the history
  • Loading branch information
zhiqiang-hhhh committed Dec 14, 2023
1 parent 6a26cdf commit 5f7093b
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 17 deletions.
102 changes: 85 additions & 17 deletions be/src/vec/functions/function_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -1803,6 +1803,7 @@ class FunctionSplitByString : public IFunction {
const auto& [right_column, right_const] =
unpack_if_const(block.get_by_position(arguments[1]).column);

DataTypePtr right_column_type = block.get_by_position(arguments[1]).type;
DataTypePtr src_column_type = block.get_by_position(arguments[0]).type;
auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(),
ColumnArray::ColumnOffsets::create());
Expand All @@ -1818,27 +1819,42 @@ class FunctionSplitByString : public IFunction {
dest_nested_column = dest_nullable_col->get_nested_column_ptr();
dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data();

if (auto col_left = check_and_get_column<ColumnString>(src_column.get())) {
if (auto col_right = check_and_get_column<ColumnString>(right_column.get())) {
if (right_const) {
_execute_constant(*col_left, col_right->get_data_at(0), *dest_nested_column,
dest_offsets, dest_nested_null_map);
} else {
_execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets,
dest_nested_null_map);
}
auto col_left = check_and_get_column<ColumnString>(src_column.get());
if (!col_left) {
return Status::InternalError("Left operator of function {} can not be {}", get_name(),
src_column_type->get_name());
}

block.replace_by_position(result, std::move(dest_column_ptr));
return Status::OK();
}
auto col_right = check_and_get_column<ColumnString>(right_column.get());
if (!col_right) {
return Status::InternalError("Right operator of function {} can not be {}", get_name(),
right_column_type->get_name());
}

// split_by_string(ColumnString, "xxx")
if (right_const) {
_execute_constant_delimiter(*col_left, col_right->get_data_at(0), *dest_nested_column,
dest_offsets, dest_nested_null_map);
} else if (left_const) {
// split_by_string("xxx", ColumnString)
_execute_constant_src_string(col_left->get_data_at(0), *col_right, *dest_nested_column,
dest_offsets, dest_nested_null_map);
} else {
// split_by_string(ColumnString, ColumnString)
_execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets,
dest_nested_null_map);
}
return Status::RuntimeError("unimplements function {}", get_name());

block.replace_by_position(result, std::move(dest_column_ptr));

return Status::OK();
}

private:
void _execute_constant(const ColumnString& src_column_string, const StringRef& delimiter_ref,
IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets,
NullMapType* dest_nested_null_map) {
void _execute_constant_delimiter(const ColumnString& src_column_string,
const StringRef& delimiter_ref, IColumn& dest_nested_column,
ColumnArray::Offsets64& dest_offsets,
NullMapType* dest_nested_null_map) const {
ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column);
ColumnString::Chars& column_string_chars = dest_column_string.get_chars();
ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets();
Expand Down Expand Up @@ -1958,7 +1974,59 @@ class FunctionSplitByString : public IFunction {
}
}

size_t split_str(size_t& pos, const StringRef str_ref, StringRef delimiter_ref) {
void _execute_constant_src_string(const StringRef& str_ref, const ColumnString& delimiter_col,
IColumn& dest_nested_column,
ColumnArray::Offsets64& dest_offsets,
NullMapType* dest_nested_null_map) const {
ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column);
ColumnString::Chars& column_string_chars = dest_column_string.get_chars();
ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets();
column_string_chars.reserve(0);

ColumnArray::Offset64 string_pos = 0;
ColumnArray::Offset64 dest_pos = 0;
const ColumnArray::Offset64 delimiter_offsets_size = delimiter_col.get_offsets().size();

for (size_t i = 0; i < delimiter_offsets_size; ++i) {
const StringRef delimiter_ref = delimiter_col.get_data_at(i);

if (delimiter_ref.size == 0) {
for (size_t str_pos = 0; str_pos < str_ref.size;) {
const size_t str_offset = str_pos;
const size_t old_size = column_string_chars.size();
str_pos++;
const size_t new_size = old_size + 1;
column_string_chars.resize(new_size);
memcpy(column_string_chars.data() + old_size, str_ref.data + str_offset, 1);
(*dest_nested_null_map).push_back(false);
string_pos++;
dest_pos++;
column_string_offsets.push_back(string_pos);
}
} else {
for (size_t str_pos = 0; str_pos <= str_ref.size;) {
const size_t str_offset = str_pos;
const size_t old_size = column_string_chars.size();
const size_t split_part_size = split_str(str_pos, str_ref, delimiter_ref);
str_pos += delimiter_ref.size;
const size_t new_size = old_size + split_part_size;
column_string_chars.resize(new_size);
if (split_part_size > 0) {
memcpy_small_allow_read_write_overflow15(
column_string_chars.data() + old_size, str_ref.data + str_offset,
split_part_size);
}
(*dest_nested_null_map).push_back(false);
string_pos += split_part_size;
dest_pos++;
column_string_offsets.push_back(string_pos);
}
}
dest_offsets.push_back(dest_pos);
}
}

size_t split_str(size_t& pos, const StringRef str_ref, StringRef delimiter_ref) const {
size_t old_size = pos;
size_t str_size = str_ref.size;
while (pos < str_size &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,23 @@
9 a,b,c, , ["a", "b", "c", ""]
10 \N , \N

-- !sql_1 --
1 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
2 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]

-- !sql_2 --
3 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
4 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]

-- !sql_3 --
1 [] [] [] []
2 [] [] [] []
3 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"]
4 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"]

-- !sql_4 --
1 [] [] [] []
2 [] [] [] []
3 [""] [""] [""] [""]
4 [""] [""] [""] [""]

Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,64 @@ suite("test_split_by_string") {


qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName2} ORDER BY k1"

// The case where both operands are string columns is covered by the test above.
sql """DROP TABLE IF EXISTS test_split_by_string_2"""
sql """
CREATE TABLE IF NOT EXISTS test_split_by_string_2 (
`rid` INT NULL,
`str` TEXT NULL,
`vc` VARCHAR(5) NULL,
`chr` CHAR(5) NULL,
`txt` TEXT NULL
) ENGINE=OLAP
DUPLICATE KEY(`rid`)
DISTRIBUTED BY HASH(`rid`) BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"storage_format" = "V2"
)
"""
sql """ INSERT INTO test_split_by_string_2
VALUES (1, "", "", "", ""),
(2, "", "", "", ""),
(3, "a,b,c", "a,b,c", "a,b,c", "a,b,c"),
(4, "a,b,c", "a,b,c", "a,b,c", "a,b,c")
"""
// Left operand is a constant, right operand is a string column
qt_sql_1 """
SELECT rid,
split_by_string("abc", str),
split_by_string("abc", vc),
split_by_string("abc", chr),
split_by_string("abc", txt)
FROM test_split_by_string_2 WHERE rid=1 OR rid=2 ORDER BY rid;
"""
// Left operand is a string column, right operand is a constant
qt_sql_2 """
SELECT rid,
split_by_string(str, ","),
split_by_string(vc, ","),
split_by_string(chr, ","),
split_by_string(txt, ",")
FROM test_split_by_string_2 WHERE rid=3 OR rid=4 ORDER BY rid;
"""

// Empty string
qt_sql_3 """
SELECT rid,
split_by_string(str, ""),
split_by_string(vc, ""),
split_by_string(chr, ""),
split_by_string(txt, "")
FROM test_split_by_string_2 ORDER BY rid;
"""
qt_sql_4 """
SELECT rid,
split_by_string("", str),
split_by_string("", vc),
split_by_string("", chr),
split_by_string("", txt)
FROM test_split_by_string_2 ORDER BY rid;
"""
}

0 comments on commit 5f7093b

Please sign in to comment.