Fix url_extract_* functions for malformed URL's (facebookincubator#7668)

Summary: Fixes facebookincubator#7038 Malformed URL's which contain invalid escape sequences (%xx) used to throw in Velox, but not in Presto. Also, for absolute URI's, url_extract_path used to return NULL when it should return the path, e.g url_extract_path('foo') should return 'foo'. Fix this by making the scheme/authority/path regex to be compliant with the URI RFC (https://www.rfc-editor.org/rfc/rfc3986#appendix-A). Some examples of the new changes: ``` > SELECT url_extract_path('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); Before: throws exception. After: returns NULL. > SELECT url_extract_path('foo'); Before: returns NULL. After: returns 'foo'. ``` Pull Request resolved: facebookincubator#7668 Reviewed By: mbasmanova Differential Revision: D51487947 Pulled By: kgpai fbshipit-source-id: 3c80e196b1512f62cfcb4f3465ac75fc96b8482c
mbasmanova · Nov 27, 2023 · 70320dd · 70320dd
1 parent b69844d
commit 70320dd
Show file tree

Hide file tree

Showing 4 changed files with 164 additions and 35 deletions.
diff --git a/velox/docs/functions/presto/url.rst b/velox/docs/functions/presto/url.rst
@@ -2,10 +2,10 @@
 URL Functions
 =============
 
-Extraction Functions
---------------------
+Introduction
+------------
 
-The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 2396 <https://tools.ietf.org/html/rfc2396.html>`_). The following syntax is supported:
+The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 3986 <https://tools.ietf.org/html/rfc3986.html>`_). The following syntax is supported:
 
 .. code-block:: bash
 
@@ -14,6 +14,40 @@ The URL extraction functions extract components from HTTP URLs (or any valid URI
 
 The extracted components do not contain URI syntax separators such as ``:`` , ``?`` and ``#``.
 
+Consider for example the below URI:
+
+.. code-block::
+
+    http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related
+
+    scheme    = http
+    authority = www.ics.uci.edu
+    path      = /pub/ietf/uri/
+    query     = k1=v1
+    fragment  = Related
+
+
+Invalid URI's
+-------------
+
+Well formed URI's should not contain ascii whitespace. `Percent-encoded URI's <https://www.rfc-editor.org/rfc/rfc3986#section-2.1>`_ should be followed by two hexadecimal
+digits after the percent character "%". All the url extract functions will return null when passed an invalid uri.
+
+.. code-block::
+
+    # Examples of url functions with Invalid URI's.
+
+    # Invalid URI due to whitespace
+    SELECT url_extract_path('foo '); -- NULL (1 row)
+    SELECT url_extract_host('http://www.foo.com '); -- NULL (1 row)
+
+    # Invalid URI due to improper escaping of '%'
+    SELECT url_extract_path('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row)
+    SELECT url_extract_host('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row)
+
+Extraction Functions
+--------------------
+
 .. function:: url_extract_fragment(url) -> varchar
 
     Returns the fragment identifier from ``url``.

diff --git a/velox/functions/prestosql/URLFunctions.cpp b/velox/functions/prestosql/URLFunctions.cpp
@@ -25,7 +25,7 @@ bool matchAuthorityAndPath(
     boost::cmatch& authorityMatch,
     bool& hasAuthority) {
   static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?");
-  auto authorityAndPath = submatch(urlMatch, 2);
+  auto authorityAndPath = submatch(urlMatch, 3);
   if (!boost::regex_match(
           authorityAndPath.begin(),
           authorityAndPath.end(),

diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h
@@ -18,6 +18,7 @@
 #include <boost/regex.hpp>
 #include <cctype>
 #include "velox/functions/Macros.h"
+#include "velox/functions/lib/string/StringImpl.h"
 
 namespace facebook::velox::functions {
 
@@ -29,11 +30,33 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
 
 template <typename TInString>
 bool parse(const TInString& rawUrl, boost::cmatch& match) {
+  /// This regex is taken from RFC - 3986.
+  /// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B
+  /// The basic groups are:
+  ///      scheme    = $2
+  ///      authority = $4
+  ///      path      = $5
+  ///      query     = $7
+  ///      fragment  = $9
+  /// For example a URI like below :
+  ///  http://www.ics.uci.edu/pub/ietf/uri/#Related
+  ///
+  ///   results in the following subexpression matches:
+  ///
+  ///      $1 = http:
+  ///      $2 = http
+  ///      $3 = //www.ics.uci.edu
+  ///      $4 = www.ics.uci.edu
+  ///      $5 = /pub/ietf/uri/
+  ///      $6 = <undefined>
+  ///      $7 = <undefined>
+  ///      $8 = #Related
+  ///      $9 = Related
   static const boost::regex kUriRegex(
-      "([a-zA-Z][a-zA-Z0-9+.-]*):" // scheme:
-      "([^?#]*)" // authority and path
-      "(?:\\?([^#]*))?" // ?query
-      "(?:#(.*))?"); // #fragment
+      "^(([^:\\/?#]+):)?" // scheme:
+      "(\\/\\/([^\\/?#]*))?([^?#]*)" // authority and path
+      "(\\?([^#]*))?" // ?query
+      "(#(.*))?"); // #fragment
 
   return boost::regex_match(
       rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex);
@@ -84,6 +107,37 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
   output.resize(outIndex);
 }
 
+/// Performs initial validation of the URI.
+/// Checks if the URI contains ascii whitespaces or
+/// unescaped '%' chars.
+bool isValidURI(const StringView& input) {
+  const char* p = input.data();
+  const char* end = p + input.size();
+  char buf[3];
+  buf[2] = '\0';
+  char* endptr;
+  for (; p < end; ++p) {
+    if (stringImpl::isAsciiWhiteSpace(*p)) {
+      return false;
+    }
+
+    if (*p == '%') {
+      if (p + 2 < end) {
+        buf[0] = p[1];
+        buf[1] = p[2];
+        strtol(buf, &endptr, 16);
+        p += 2;
+        if (endptr != buf + 2) {
+          return false;
+        }
+      } else {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 template <typename TOutString, typename TInString>
 FOLLY_ALWAYS_INLINE void urlUnescape(
     TOutString& output,
@@ -143,11 +197,15 @@ struct UrlExtractProtocolFunction {
   FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
+    if (!isValidURI(url)) {
+      result.setEmpty();
+      return false;
+    }
     boost::cmatch match;
     if (!parse(url, match)) {
       result.setEmpty();
     } else {
-      result.setNoCopy(submatch(match, 1));
+      result.setNoCopy(submatch(match, 2));
     }
     return true;
   }
@@ -166,11 +224,15 @@ struct UrlExtractFragmentFunction {
   FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
+    if (!isValidURI(url)) {
+      result.setEmpty();
+      return false;
+    }
     boost::cmatch match;
     if (!parse(url, match)) {
       result.setEmpty();
     } else {
-      result.setNoCopy(submatch(match, 4));
+      result.setNoCopy(submatch(match, 9));
     }
     return true;
   }
@@ -189,6 +251,10 @@ struct UrlExtractHostFunction {
   FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
+    if (!isValidURI(url)) {
+      result.setEmpty();
+      return false;
+    }
     boost::cmatch match;
     if (!parse(url, match)) {
       result.setEmpty();
@@ -214,6 +280,9 @@ struct UrlExtractPortFunction {
   VELOX_DEFINE_FUNCTION_TYPES(T);
 
   FOLLY_ALWAYS_INLINE bool call(int64_t& result, const arg_type<Varchar>& url) {
+    if (!isValidURI(url)) {
+      return false;
+    }
     boost::cmatch match;
     if (!parse(url, match)) {
       return false;
@@ -244,29 +313,23 @@ struct UrlExtractPathFunction {
 
   // Input is always ASCII, but result may or may not be ASCII.
 
-  FOLLY_ALWAYS_INLINE void call(
+  FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
+    if (!isValidURI(url)) {
+      result.setEmpty();
+      return false;
+    }
+
     boost::cmatch match;
     if (!parse(url, match)) {
       result.setEmpty();
-      return;
+      return false;
     }
 
-    boost::cmatch authAndPathMatch;
-    boost::cmatch authorityMatch;
-    bool hasAuthority;
+    urlUnescape(result, submatch(match, 5));
 
-    if (matchAuthorityAndPath(
-            match, authAndPathMatch, authorityMatch, hasAuthority)) {
-      StringView escapedPath;
-      if (hasAuthority) {
-        escapedPath = submatch(authAndPathMatch, 2);
-      } else {
-        escapedPath = submatch(match, 2);
-      }
-      urlUnescape(result, escapedPath);
-    }
+    return true;
   }
 };
 
@@ -283,13 +346,17 @@ struct UrlExtractQueryFunction {
   FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
+    if (!isValidURI(url)) {
+      result.setEmpty();
+      return false;
+    }
     boost::cmatch match;
     if (!parse(url, match)) {
       result.setEmpty();
       return true;
     }
 
-    auto query = submatch(match, 3);
+    auto query = submatch(match, 7);
     result.setNoCopy(query);
     return true;
   }
@@ -309,13 +376,17 @@ struct UrlExtractParameterFunction {
       out_type<Varchar>& result,
       const arg_type<Varchar>& url,
       const arg_type<Varchar>& param) {
+    if (!isValidURI(url)) {
+      result.setEmpty();
+      return false;
+    }
     boost::cmatch match;
     if (!parse(url, match)) {
       result.setEmpty();
       return false;
     }
 
-    auto query = submatch(match, 3);
+    auto query = submatch(match, 7);
     if (!query.empty()) {
       // Parse query string.
       static const boost::regex kQueryParamRegex(

diff --git a/velox/functions/prestosql/tests/URLFunctionsTest.cpp b/velox/functions/prestosql/tests/URLFunctionsTest.cpp
@@ -23,11 +23,11 @@ class URLFunctionsTest : public functions::test::FunctionBaseTest {
  protected:
   void validate(
       const std::string& url,
-      const std::string& expectedProtocol,
-      const std::string& expectedHost,
-      const std::string& expectedPath,
-      const std::string& expectedFragment,
-      const std::string& expectedQuery,
+      const std::optional<std::string>& expectedProtocol,
+      const std::optional<std::string>& expectedHost,
+      const std::optional<std::string>& expectedPath,
+      const std::optional<std::string>& expectedFragment,
+      const std::optional<std::string>& expectedQuery,
       const std::optional<int32_t> expectedPort) {
     const auto extractFn = [&](const std::string& fn,
                                const std::optional<std::string>& a) {
@@ -40,7 +40,7 @@ class URLFunctionsTest : public functions::test::FunctionBaseTest {
     };
 
     EXPECT_EQ(extractFn("protocol", url), expectedProtocol);
-    EXPECT_EQ(extractFn("host", url).value(), expectedHost);
+    EXPECT_EQ(extractFn("host", url), expectedHost);
     EXPECT_EQ(extractFn("path", url), expectedPath);
     EXPECT_EQ(extractFn("fragment", url), expectedFragment);
     EXPECT_EQ(extractFn("query", url), expectedQuery);
@@ -97,18 +97,42 @@ TEST_F(URLFunctionsTest, validateURL) {
       "",
       "",
       std::nullopt);
-  validate("foo", "", "", "", "", "", std::nullopt);
+  validate(
+      "https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%",
+      std::nullopt,
+      std::nullopt,
+      std::nullopt,
+      std::nullopt,
+      std::nullopt,
+      std::nullopt);
+  validate("foo", "", "", "foo", "", "", std::nullopt);
+  validate(
+      "foo ",
+      std::nullopt,
+      std::nullopt,
+      std::nullopt,
+      std::nullopt,
+      std::nullopt,
+      std::nullopt);
 }
 
 TEST_F(URLFunctionsTest, extractPath) {
   const auto extractPath = [&](const std::optional<std::string>& url) {
     return evaluateOnce<std::string>("url_extract_path(c0)", url);
   };
 
-  ASSERT_EQ(
+  EXPECT_EQ(
       "/media/set/Books and Magazines.php",
       extractPath(
           "https://www.cnn.com/media/set/Books%20and%20Magazines.php?foo=bar"));
+
+  EXPECT_EQ(
+      "java-net@java.sun.com", extractPath("mailto:java-net@java.sun.com"));
+  EXPECT_EQ(
+      std::nullopt,
+      extractPath("https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%"));
+  EXPECT_EQ("foo", extractPath("foo"));
+  EXPECT_EQ(std::nullopt, extractPath("BAD URL!"));
 }
 
 TEST_F(URLFunctionsTest, extractParameter) {