Removed character class checks, modified documentation, fixed some bugs

codyschierbeck · Mar 15, 2024 · 340ed39 · 340ed39
1 parent 8588a83
commit 340ed39
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 103 deletions.
diff --git a/velox/docs/functions/spark/regexp.rst b/velox/docs/functions/spark/regexp.rst
@@ -5,20 +5,34 @@ Regular Expression Functions
 Regular expression functions use RE2 as the regex engine. RE2 is fast, but
 supports only a subset of PCRE syntax and in particular does not support
 backtracking and associated features (e.g. back references).
+Java and RE2 regex output can diverage and users should be cautious that
+the patterns they are using perform similarlly between RE2 and Java.
+For example, character class unions, intersections, and differences
+``([a[b]], [a&&[b]], [a&&[^b]])`` are intepreted as single characters
+that contain ``[, &, and ^`` rather than union, intersection, or
+difference of the character classes.
+
+
 See https://github.com/google/re2/wiki/Syntax for more information.
 
 .. spark:function:: regexp_extract(string, pattern) -> varchar
 
     Returns the first substring matched by the regular expression ``pattern``
-    in ``string``. ::
+    in ``string``.
+
+    regexp_extract does not support column references for the ``pattern`` argument.
+    Patterns must be constant values. ::
 
         SELECT regexp_extract('1a 2b 14m', '\d+'); -- 1
 
 .. spark:function:: regexp_extract(string, pattern, group) -> varchar
    :noindex:
 
     Finds the first occurrence of the regular expression ``pattern`` in
-    ``string`` and returns the capturing group number ``group``. ::
+    ``string`` and returns the capturing group number ``group``.
+
+    regexp_extract does not support column references for the ``pattern`` argument.
+    Patterns must be constant values. ::
 
         SELECT regexp_extract('1a 2b 14m', '(\d+)([a-z]+)', 2); -- 'a'
 
@@ -31,7 +45,10 @@ See https://github.com/google/re2/wiki/Syntax for more information.
     pattern only needs to be contained within ``string``, rather than
     needing to match all of ``string``. In other words, this performs a
     *contains* operation rather than a *match* operation. You can match
-    the entire string by anchoring the pattern using ``^`` and ``$``. ::
+    the entire string by anchoring the pattern using ``^`` and ``$``.
+
+    rlike does not support column references for the ``pattern`` argument.
+    Patterns must be constant values. ::
 
         SELECT rlike('1a 2b 14m', '\d+b'); -- true
 
@@ -40,11 +57,6 @@ See https://github.com/google/re2/wiki/Syntax for more information.
     Replaces all substrings in ``string`` that match the regular expression ``pattern`` with the string ``overwrite``. If no match is found, the original string is returned as is.
     There is a limit to the number of unique regexes to be compiled per function call, which is 20. If this limit is exceeded the function will throw an exception.
 
-    regexp_replace will throw an exception if ``string`` contains an invalid UTF-8 character, or if ``pattern`` does not conform to RE2 syntax: https://github.com/google/re2/wiki/Syntax.
-
-    regexp_replace does not support named ASCII character classes or ASCII character class union, intersection, or difference and will throw an exception if they are detected within the provided ``pattern``.
-
-
     Parameters:
 
     - **string**: The string to be searched.
@@ -65,11 +77,6 @@ See https://github.com/google/re2/wiki/Syntax for more information.
     Replaces all substrings in ``string`` that match the regular expression ``pattern`` with the string ``overwrite`` starting from the specified ``position``.  If no match is found, the original string is returned as is. If the ``position`` is less than one, the function throws an exception. If ``position`` is greater than the length of ``string``, the function returns the original ``string`` without any modifications.
     There is a limit to the number of unique regexes to be compiled per function call, which is 20. If this limit is exceeded the function will throw an exception.
 
-    regexp_replace will throw an exception if ``string`` contains an invalid UTF-8 character, if ``position`` is less than 1, or if ``pattern`` does not conform to RE2 syntax: https://github.com/google/re2/wiki/Syntax.
-
-    regexp_replace does not support named ASCII character classes or character class union, intersection, or difference and will throw an exception if they are detected within the provided ``pattern``.
-
-
     This function is 1-indexed, meaning the position of the first character is 1.
     Parameters:
 

diff --git a/velox/expression/tests/SparkExpressionFuzzerTest.cpp b/velox/expression/tests/SparkExpressionFuzzerTest.cpp
@@ -46,9 +46,9 @@ int main(int argc, char** argv) {
   // The following list are the Spark UDFs that hit issues
   // For rlike you need the following combo in the only list:
   // rlike, md5 and upper
-  // regexp_replace: https://github.com/facebookincubator/velox/issues/8438
   std::unordered_set<std::string> skipFunctions = {
       "regexp_extract",
+      // https://github.com/facebookincubator/velox/issues/8438
       "regexp_replace",
       "rlike",
       "chr",

diff --git a/velox/functions/sparksql/RegexFunctions.cpp b/velox/functions/sparksql/RegexFunctions.cpp
@@ -33,68 +33,12 @@ void checkForBadPattern(const RE2& re) {
   }
 }
 
-// Validates the provided regex pattern to ensure its compatibility with the
-// system. The function checks if the pattern uses features like character
-// class union, intersection, or difference which are not supported in C++ RE2
-// library but are supported in Java regex.
-//
-// This function should be called on the individual patterns of a decoded
-// vector. That way when a single pattern in a vector is invalid, we can still
-// operate on the remaining rows.
-//
-// @param pattern The regex pattern string to validate.
-// @param functionName (Optional) Name of the calling function to include in
-// error messages.
-//
-// @throws VELOX_USER_FAIL If the pattern is found to use unsupported features.
-// @note  Default functionName is "REGEXP_REPLACE" because it uses non-constant
-// patterns so it cannot be checked with "ensureRegexIsCompatible". No
-// other functions work with non-constant patterns, but they may in the future.
-//
-// @note Leaving functionName as an optional parameter makes room for
-// other functions to enable non-constant patterns in the future.
-void checkForCompatiblePattern(
-    const std::string& pattern,
-    const char* functionName) {
-  // If in a character class, points to the [ at the beginning of that class.
-  const char* charClassStart = nullptr;
-  // This minimal regex parser looks just for the class begin/end markers.
-  for (const char* c = pattern.data(); c < pattern.data() + pattern.size();
-       ++c) {
-    if (*c == '\\') {
-      ++c;
-    } else if (*c == '[') {
-      if (charClassStart) {
-        VELOX_USER_FAIL(
-            "{} does not support named ASCII character classes or class union, intersection, "
-            "or difference ([a[b]], [a&&[b]], [a&&[^b]])",
-            functionName);
-      }
-      charClassStart = c;
-      // A ] immediately after a [ does not end the character class, and is
-      // instead adds the character ].
-    } else if (*c == ']' && charClassStart + 1 != c) {
-      charClassStart = nullptr;
-    }
-  }
-}
-
-// Blocks patterns that contain character class union, intersection, or
-// difference because these are not understood by RE2 and will be parsed as a
-// different pattern than in java.util.regex.
-void ensureRegexIsConstantAndCompatible(
+void ensureRegexIsConstant(
     const char* functionName,
     const VectorPtr& patternVector) {
   if (!patternVector || !patternVector->isConstantEncoding()) {
     VELOX_USER_FAIL("{} requires a constant pattern.", functionName);
   }
-  if (patternVector->isNullAt(0)) {
-    return;
-  }
-  const StringView pattern =
-      patternVector->as<ConstantVector<StringView>>()->valueAt(0);
-  checkForCompatiblePattern(
-      std::string(pattern.data(), pattern.size()), functionName);
 }
 
 // REGEXP_REPLACE(string, pattern, overwrite) → string
@@ -207,8 +151,7 @@ struct RegexpReplaceFunction {
         kMaxCompiledRegexes,
         "regexp_replace hit the maximum number of unique regexes: {}",
         kMaxCompiledRegexes);
-    checkForCompatiblePattern(pattern, "regexp_replace");
-    auto patternRegex = std::make_unique<re2::RE2>(pattern);
+    auto patternRegex = std::make_unique<re2::RE2>(pattern, re2::RE2::Quiet);
     auto* rawPatternRegex = patternRegex.get();
     checkForBadPattern(*rawPatternRegex);
     cache_.emplace(pattern, std::move(patternRegex));
@@ -230,7 +173,7 @@ std::shared_ptr<exec::VectorFunction> makeRLike(
     const core::QueryConfig& config) {
   // Return any errors from re2Search() first.
   auto result = makeRe2Search(name, inputArgs, config);
-  ensureRegexIsConstantAndCompatible("RLIKE", inputArgs[1].constantValue);
+  ensureRegexIsConstant("RLIKE", inputArgs[1].constantValue);
   return result;
 }
 
@@ -239,8 +182,7 @@ std::shared_ptr<exec::VectorFunction> makeRegexExtract(
     const std::vector<exec::VectorFunctionArg>& inputArgs,
     const core::QueryConfig& config) {
   auto result = makeRe2Extract(name, inputArgs, config, /*emptyNoMatch=*/true);
-  ensureRegexIsConstantAndCompatible(
-      "REGEXP_EXTRACT", inputArgs[1].constantValue);
+  ensureRegexIsConstant("REGEXP_EXTRACT", inputArgs[1].constantValue);
   return result;
 }
 

diff --git a/velox/functions/sparksql/tests/RegexFunctionsTest.cpp b/velox/functions/sparksql/tests/RegexFunctionsTest.cpp
@@ -34,7 +34,6 @@ class RegexFunctionsTest : public test::SparkFunctionBaseTest {
  public:
   void SetUp() override {
     SparkFunctionBaseTest::SetUp();
-    registerRegexpReplace("");
     // For parsing literal integers as INTEGER, not BIGINT,
     // required by regexp_replace because its position argument
     // is INTEGER.
@@ -176,12 +175,6 @@ class RegexFunctionsTest : public test::SparkFunctionBaseTest {
 // error being thrown; some unsupported character class features result in
 // different results.
 TEST_F(RegexFunctionsTest, javaRegexIncompatibilities) {
-  // Character class union is not supported; parsed as [a\[b]\].
-  EXPECT_THROW(rlike("[]", R"([a[b]])"), VeloxUserError);
-  // Character class intersection not supported; parsed as [a&\[b]\].
-  EXPECT_THROW(rlike("&]", R"([a&&[b]])"), VeloxUserError);
-  // Character class difference not supported; parsed as [\w&\[\^b]\].
-  EXPECT_THROW(rlike("^]", R"([\w&&[^b]])"), VeloxUserError);
   // Unsupported character classes.
   EXPECT_THROW(rlike(" ", "\\h"), VeloxUserError);
   EXPECT_THROW(rlike(" ", "\\H"), VeloxUserError);
@@ -227,8 +220,6 @@ TEST_F(RegexFunctionsTest, blockUnsupportedEdgeCases) {
   EXPECT_THROW(
       evaluateOnce<bool>("rlike('a', c0)", std::optional<std::string>("a*")),
       VeloxUserError);
-  // Unsupported set union syntax.
-  EXPECT_THROW(rlike("", "[a[b]]"), VeloxUserError);
 }
 
 TEST_F(RegexFunctionsTest, regexMatchRegistration) {
@@ -237,7 +228,6 @@ TEST_F(RegexFunctionsTest, regexMatchRegistration) {
           "regexp_extract('a', c0)", std::optional<std::string>("a*")),
       VeloxUserError);
   EXPECT_EQ(regexp_extract("abc", "a."), "ab");
-  EXPECT_THROW(regexp_extract("[]", "[a[b]]"), VeloxUserError);
 }
 
 TEST_F(RegexFunctionsTest, regexpReplaceRegistration) {
@@ -263,14 +253,32 @@ TEST_F(RegexFunctionsTest, regexpReplaceSimple) {
 TEST_F(RegexFunctionsTest, badUTF8) {
   std::string badUTF = "\xF0\x82\x82\xAC";
   std::string badHalf = "\xF0\x82";
+  std::string overlongA = "\xC1\x81"; // 'A' should not be encoded like this.
+  std::string invalidCodePoint =
+      "\xED\xA0\x80"; // Start of the surrogate pair range in UTF-8
+  std::string incompleteSequence = "\xC3"; // Missing the continuation byte.
+  std::string illegalContinuation =
+      "\x80"; // This is a continuation byte without a start.
+  std::string unexpectedContinuation =
+      "\xE0\x80\x80\x80"; // Too many continuation bytes.
+
   VELOX_ASSERT_THROW(
       testingRegexpReplaceRows({badUTF}, {badHalf}, {"Bad"}), "invalid UTF-8");
-  // python converts above values to below and completes regexp_replace
-  // converts.
-  badUTF = "\xc3\xb0\xc2\xac";
-  badHalf = "\xc3\xb0";
-  auto result = testRegexpReplace(badUTF, badHalf, "");
-  EXPECT_EQ(result, "\xc2\xac");
+  VELOX_ASSERT_THROW(
+      testingRegexpReplaceRows({overlongA}, {badHalf}, {"Bad"}),
+      "invalid UTF-8");
+  VELOX_ASSERT_THROW(
+      testingRegexpReplaceRows({invalidCodePoint}, {badHalf}, {"Bad"}),
+      "invalid UTF-8");
+  VELOX_ASSERT_THROW(
+      testingRegexpReplaceRows({incompleteSequence}, {badHalf}, {"Bad"}),
+      "invalid UTF-8");
+  VELOX_ASSERT_THROW(
+      testingRegexpReplaceRows({illegalContinuation}, {badHalf}, {"Bad"}),
+      "invalid UTF-8");
+  VELOX_ASSERT_THROW(
+      testingRegexpReplaceRows({unexpectedContinuation}, {badHalf}, {"Bad"}),
+      "invalid UTF-8");
 }
 
 TEST_F(RegexFunctionsTest, regexpReplaceSimplePosition) {
@@ -317,17 +325,6 @@ TEST_F(RegexFunctionsTest, regexpReplaceWithEmptyString) {
   EXPECT_EQ(result, output);
 }
 
-TEST_F(RegexFunctionsTest, regexBadJavaPattern) {
-  VELOX_ASSERT_THROW(
-      testRegexpReplace("[]", "[a[b]]", ""),
-      "regexp_replace does not support named ASCII character classes or class union, intersection, or difference ([a[b]], [a&&[b]], [a&&[^b]])");
-  VELOX_ASSERT_THROW(
-      testRegexpReplace("[]", "[a&&[b]]", ""),
-      "regexp_replace does not support named ASCII character classes or class union, intersection, or difference ([a[b]], [a&&[b]], [a&&[^b]])");
-  VELOX_ASSERT_THROW(
-      testRegexpReplace("[]", "[a&&[^b]]", ""),
-      "regexp_replace does not support named ASCII character classes or class union, intersection, or difference ([a[b]], [a&&[b]], [a&&[^b]])");
-}
 
 
 TEST_F(RegexFunctionsTest, regexpReplacePosition) {