Skip to content

Commit

Permalink
Fix split function
Browse files Browse the repository at this point in the history
  • Loading branch information
rui-mo committed Feb 20, 2024
1 parent d55c48b commit d6a157a
Showing 1 changed file with 14 additions and 17 deletions.
31 changes: 14 additions & 17 deletions velox/functions/sparksql/SplitFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,16 @@

#include "velox/expression/VectorFunction.h"
#include "velox/expression/VectorWriters.h"
#include <boost/regex.hpp>
#include <iostream>

namespace facebook::velox::functions::sparksql {
namespace {

/// This class only implements the basic split version in which the pattern is a
/// single character
class SplitCharacter final : public exec::VectorFunction {
public:
explicit SplitCharacter(const char pattern) : pattern_{pattern} {
static constexpr std::string_view kRegexChars = ".$|()[{^?*+\\";
VELOX_CHECK(
kRegexChars.find(pattern) == std::string::npos,
"This version of split supports single-length non-regex patterns");
}
explicit SplitCharacter(const std::string& pattern) :
pattern_(boost::regex(pattern)) {}

void apply(
const SelectivityVector& rows,
Expand All @@ -50,8 +46,15 @@ class SplitCharacter final : public exec::VectorFunction {
auto& arrayWriter = resultWriter.current();

const StringView& current = input->valueAt<StringView>(row);
std::string str = current.str();
const char* pos = current.begin();
const char* end = pos + current.size();
boost::sregex_token_iterator posT(str.begin(), str.end(), pattern_, -1);
boost::sregex_token_iterator endT;
while (posT != endT) {
std::cout << "val: " << *posT++ << std::endl;;
}

const char* delim;
do {
delim = std::find(pos, end, pattern_);
Expand All @@ -72,7 +75,7 @@ class SplitCharacter final : public exec::VectorFunction {
}

private:
const char pattern_;
boost::regex pattern_;
};

/// This class will be updated in the future as we support more variants of
Expand All @@ -91,12 +94,7 @@ class Split final : public exec::VectorFunction {
VELOX_CHECK(
delimiterVector, "Split function supports only constant delimiter");
auto patternString = args[1]->as<ConstantVector<StringView>>()->valueAt(0);
VELOX_CHECK_EQ(
patternString.size(),
1,
"split only supports only single-character pattern");
char pattern = patternString.data()[0];
SplitCharacter splitCharacter(pattern);
SplitCharacter splitCharacter(patternString.str());
splitCharacter.apply(rows, args, nullptr, context, result);
}
};
Expand All @@ -119,10 +117,9 @@ std::shared_ptr<exec::VectorFunction> createSplit(
if (pattern.size() != 1) {
return std::make_shared<Split>();
}
char charPattern = pattern.data()[0];
// TODO: Add support for zero-length and 2-character patterns.
// TODO: Add support for general regex patterns using RE2.
return std::make_shared<SplitCharacter>(charPattern);
return std::make_shared<SplitCharacter>(pattern.str());
}

std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
Expand Down

0 comments on commit d6a157a

Please sign in to comment.