Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support regexp in TiFlash #3957

Merged
merged 5 commits into from
Feb 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dbms/src/Common/OptimizedRegularExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ class OptimizedRegularExpressionImpl
{
RE_CASELESS = 0x00000001,
RE_NO_CAPTURE = 0x00000010,
RE_DOT_NL = 0x00000100
RE_DOT_NL = 0x00000100,
RE_NO_OPTIMIZE = 0x00001000
};

using Match = OptimizedRegularExpressionDetails::Match;
Expand Down
17 changes: 14 additions & 3 deletions dbms/src/Common/OptimizedRegularExpression.inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,21 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
template <bool thread_safe>
OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
{
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
if (options & RE_NO_OPTIMIZE)
{
/// Query from TiDB: since analyze() does not currently handle all cases, skip the optimization
/// to avoid incompatibility issues.
is_trivial = false;
required_substring.clear();
required_substring_is_prefix = false;
}
else
{
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
}

/// Just three following options are supported
if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
/// Just four following options are supported
if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL | RE_NO_OPTIMIZE)))
throw Poco::Exception("OptimizedRegularExpression: Unsupported option.");

is_case_insensitive = options & RE_CASELESS;
Expand Down
28 changes: 27 additions & 1 deletion dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,30 @@ String DAGExpressionAnalyzerHelper::buildRoundFunction(
return analyzer->applyFunction("tidbRoundWithFrac", argument_names, actions, getCollatorFromExpr(expr));
}

/// Builds the function call for a regexp-style expression.
///
/// Steps, as implemented below:
///  1. Resolve the function name from `expr` via getFunctionName().
///  2. Run analyzer->getActions() on every child of `expr`, in order, and collect the
///     returned names as the function's argument names.
///  3. Pick the collator: start from getCollatorFromExpr(expr), but force the BINARY
///     collator when the signature is RegexpSig or RegexpReplaceSig.
///  4. Hand everything to analyzer->applyFunction() and return its result.
///
/// @param analyzer  analyzer used to build the child actions and apply the function.
/// @param expr      the tipb expression; its children become the arguments.
/// @param actions   the actions object passed through to getActions()/applyFunction().
/// @return the String returned by applyFunction() (presumably the result column name — TODO confirm).
String DAGExpressionAnalyzerHelper::buildRegexpFunction(
DAGExpressionAnalyzer * analyzer,
const tipb::Expr & expr,
const ExpressionActionsPtr & actions)
{
const String & func_name = getFunctionName(expr);
Names argument_names;
/// Argument order follows the order of expr.children().
for (const auto & child : expr.children())
{
String name = analyzer->getActions(child, actions);
argument_names.push_back(name);
Comment on lines +392 to +393
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would names be replicated?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

replicated name should point to the same column, and it is ok that the arguments of the functions comes from the same column.

}
TiDB::TiDBCollatorPtr collator = getCollatorFromExpr(expr);
/// Only the non-UTF8 signatures are overridden here; any other signature keeps
/// whatever getCollatorFromExpr() returned.
if (expr.sig() == tipb::ScalarFuncSig::RegexpReplaceSig || expr.sig() == tipb::ScalarFuncSig::RegexpSig)
{
/// according to https://github.com/pingcap/tidb/blob/v5.0.0/expression/builtin_like.go#L126,
/// For binary collation, it will use RegexpXXXSig, otherwise it will use RegexpXXXUTF8Sig
/// Need to set the collator explicitly because `getCollatorFromExpr` will return nullptr
/// if new collation is not enabled.
collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY);
}
return analyzer->applyFunction(func_name, argument_names, actions, collator);
}

DAGExpressionAnalyzerHelper::FunctionBuilderMap DAGExpressionAnalyzerHelper::function_builder_map(
{{"in", DAGExpressionAnalyzerHelper::buildInFunction},
{"notIn", DAGExpressionAnalyzerHelper::buildInFunction},
Expand All @@ -401,6 +425,8 @@ DAGExpressionAnalyzerHelper::FunctionBuilderMap DAGExpressionAnalyzerHelper::fun
{"leftUTF8", DAGExpressionAnalyzerHelper::buildLeftUTF8Function},
{"date_add", DAGExpressionAnalyzerHelper::buildDateAddOrSubFunction<DateAdd>},
{"date_sub", DAGExpressionAnalyzerHelper::buildDateAddOrSubFunction<DateSub>},
{"regexp", DAGExpressionAnalyzerHelper::buildRegexpFunction},
{"replaceRegexpAll", DAGExpressionAnalyzerHelper::buildRegexpFunction},
{"tidbRound", DAGExpressionAnalyzerHelper::buildRoundFunction}});

} // namespace DB
} // namespace DB
7 changes: 6 additions & 1 deletion dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ class DAGExpressionAnalyzerHelper
const tipb::Expr & expr,
const ExpressionActionsPtr & actions);

/// Builder for regexp-style functions; takes the same (analyzer, expr, actions)
/// parameters as the neighbouring build*Function helpers and returns a String.
/// NOTE(review): presumably the returned String is the result column name — confirm against the definition.
static String buildRegexpFunction(
DAGExpressionAnalyzer * analyzer,
const tipb::Expr & expr,
const ExpressionActionsPtr & actions);

static String genFuncString(
const String & func_name,
const Names & argument_names,
Expand All @@ -74,4 +79,4 @@ class DAGExpressionAnalyzerHelper

static FunctionBuilderMap function_builder_map;
};
} // namespace DB
} // namespace DB
4 changes: 2 additions & 2 deletions dbms/src/Flash/Coprocessor/DAGUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,8 +412,8 @@ const std::unordered_map<tipb::ScalarFuncSig, String> scalar_func_map({
//{tipb::ScalarFuncSig::UUID, "cast"},

{tipb::ScalarFuncSig::LikeSig, "like3Args"},
//{tipb::ScalarFuncSig::RegexpSig, "cast"},
//{tipb::ScalarFuncSig::RegexpUTF8Sig, "cast"},
{tipb::ScalarFuncSig::RegexpSig, "regexp"},
{tipb::ScalarFuncSig::RegexpUTF8Sig, "regexp"},

//{tipb::ScalarFuncSig::JsonExtractSig, "cast"},
//{tipb::ScalarFuncSig::JsonUnquoteSig, "cast"},
Expand Down
3 changes: 2 additions & 1 deletion dbms/src/Functions/FunctionsStringArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,8 @@ class ExtractAllImpl
+ " of first argument of function " + getName() + ". Must be constant string.",
ErrorCodes::ILLEGAL_COLUMN);

re = Regexps::get<false, false>(col->getValue<String>());
int flags = OptimizedRegularExpression::RE_DOT_NL;
re = Regexps::get<false, false>(col->getValue<String>(), flags);
capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0;

matches.resize(capture + 1);
Expand Down
Loading