From d6a157ae7a7e9954872b2e08d5bd11ef8ffbbe3a Mon Sep 17 00:00:00 2001 From: rui-mo Date: Tue, 20 Feb 2024 16:23:17 +0800 Subject: [PATCH] Fix split function --- velox/functions/sparksql/SplitFunctions.cpp | 31 ++++++++++----------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/velox/functions/sparksql/SplitFunctions.cpp b/velox/functions/sparksql/SplitFunctions.cpp index 4d092e692837..f9c1bc481672 100644 --- a/velox/functions/sparksql/SplitFunctions.cpp +++ b/velox/functions/sparksql/SplitFunctions.cpp @@ -18,20 +18,16 @@ #include "velox/expression/VectorFunction.h" #include "velox/expression/VectorWriters.h" +#include +#include namespace facebook::velox::functions::sparksql { namespace { -/// This class only implements the basic split version in which the pattern is a -/// single character class SplitCharacter final : public exec::VectorFunction { public: - explicit SplitCharacter(const char pattern) : pattern_{pattern} { - static constexpr std::string_view kRegexChars = ".$|()[{^?*+\\"; - VELOX_CHECK( - kRegexChars.find(pattern) == std::string::npos, - "This version of split supports single-length non-regex patterns"); - } + explicit SplitCharacter(const std::string& pattern) : + pattern_(boost::regex(pattern)) {} void apply( const SelectivityVector& rows, @@ -50,8 +46,15 @@ class SplitCharacter final : public exec::VectorFunction { auto& arrayWriter = resultWriter.current(); const StringView& current = input->valueAt(row); + std::string str = current.str(); const char* pos = current.begin(); const char* end = pos + current.size(); + boost::sregex_token_iterator posT(str.begin(), str.end(), pattern_, -1); + boost::sregex_token_iterator endT; + while (posT != endT) { + std::cout << "val: " << *posT++ << std::endl;; + } + const char* delim; do { delim = std::find(pos, end, pattern_); @@ -72,7 +75,7 @@ class SplitCharacter final : public exec::VectorFunction { } private: - const char pattern_; + boost::regex pattern_; }; /// This class will be updated in the future as we support more variants of @@ -91,12 +94,7 @@ class Split final : public exec::VectorFunction { VELOX_CHECK( delimiterVector, "Split function supports only constant delimiter"); auto patternString = args[1]->as>()->valueAt(0); - VELOX_CHECK_EQ( - patternString.size(), - 1, - "split only supports only single-character pattern"); - char pattern = patternString.data()[0]; - SplitCharacter splitCharacter(pattern); + SplitCharacter splitCharacter(patternString.str()); splitCharacter.apply(rows, args, nullptr, context, result); } }; @@ -119,10 +117,9 @@ std::shared_ptr createSplit( if (pattern.size() != 1) { return std::make_shared(); } - char charPattern = pattern.data()[0]; // TODO: Add support for zero-length pattern, 2-character pattern // TODO: add support for general regex pattern using R2 - return std::make_shared(charPattern); + return std::make_shared(pattern.str()); } std::vector> signatures() {