diff --git a/velox/docs/functions/presto/url.rst b/velox/docs/functions/presto/url.rst index 183b81503d32..231e53c60eac 100644 --- a/velox/docs/functions/presto/url.rst +++ b/velox/docs/functions/presto/url.rst @@ -2,10 +2,10 @@ URL Functions ============= -Extraction Functions --------------------- +Introduction +------------ -The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 2396 `_). The following syntax is supported: +The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 3986 `_). The following syntax is supported: .. code-block:: bash @@ -14,6 +14,40 @@ The URL extraction functions extract components from HTTP URLs (or any valid URI The extracted components do not contain URI syntax separators such as ``:`` , ``?`` and ``#``. +Consider for example the below URI: + +.. code-block:: + + http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related + + scheme = http + authority = www.ics.uci.edu + path = /pub/ietf/uri/ + query = k1=v1 + fragment = Related + + +Invalid URI's +------------- + +Well formed URI's should not contain ascii whitespace. `Percent-encoded URI's `_ should be followed by two hexadecimal +digits after the percent character "%". All the url extract functions will return null when passed an invalid uri. + +.. code-block:: + + # Examples of url functions with Invalid URI's. + + # Invalid URI due to whitespace + SELECT url_extract_path('foo '); -- NULL (1 row) + SELECT url_extract_host('http://www.foo.com '); -- NULL (1 row) + + # Invalid URI due to improper escaping of '%' + SELECT url_extract_path('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row) + SELECT url_extract_host('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row) + +Extraction Functions +-------------------- + .. function:: url_extract_fragment(url) -> varchar Returns the fragment identifier from ``url``. diff --git a/velox/functions/prestosql/URLFunctions.cpp b/velox/functions/prestosql/URLFunctions.cpp index 7320cd073189..5df470dd5db8 100644 --- a/velox/functions/prestosql/URLFunctions.cpp +++ b/velox/functions/prestosql/URLFunctions.cpp @@ -25,7 +25,7 @@ bool matchAuthorityAndPath( boost::cmatch& authorityMatch, bool& hasAuthority) { static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?"); - auto authorityAndPath = submatch(urlMatch, 2); + auto authorityAndPath = submatch(urlMatch, 3); if (!boost::regex_match( authorityAndPath.begin(), authorityAndPath.end(), diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h index d3d50fd894a1..714490cb2985 100644 --- a/velox/functions/prestosql/URLFunctions.h +++ b/velox/functions/prestosql/URLFunctions.h @@ -18,6 +18,7 @@ #include #include #include "velox/functions/Macros.h" +#include "velox/functions/lib/string/StringImpl.h" namespace facebook::velox::functions { @@ -29,11 +30,33 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) { template bool parse(const TInString& rawUrl, boost::cmatch& match) { + /// This regex is taken from RFC - 3986. + /// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B + /// The basic groups are: + /// scheme = $2 + /// authority = $4 + /// path = $5 + /// query = $7 + /// fragment = $9 + /// For example a URI like below : + /// http://www.ics.uci.edu/pub/ietf/uri/#Related + /// + /// results in the following subexpression matches: + /// + /// $1 = http: + /// $2 = http + /// $3 = //www.ics.uci.edu + /// $4 = www.ics.uci.edu + /// $5 = /pub/ietf/uri/ + /// $6 = + /// $7 = + /// $8 = #Related + /// $9 = Related static const boost::regex kUriRegex( - "([a-zA-Z][a-zA-Z0-9+.-]*):" // scheme: - "([^?#]*)" // authority and path - "(?:\\?([^#]*))?" // ?query - "(?:#(.*))?"); // #fragment + "^(([^:\\/?#]+):)?" // scheme: + "(\\/\\/([^\\/?#]*))?([^?#]*)" // authority and path + "(\\?([^#]*))?" // ?query + "(#(.*))?"); // #fragment return boost::regex_match( rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex); @@ -84,6 +107,37 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) { output.resize(outIndex); } +/// Performs initial validation of the URI. +/// Checks if the URI contains ascii whitespaces or +/// unescaped '%' chars. +bool isValidURI(const StringView& input) { + const char* p = input.data(); + const char* end = p + input.size(); + char buf[3]; + buf[2] = '\0'; + char* endptr; + for (; p < end; ++p) { + if (stringImpl::isAsciiWhiteSpace(*p)) { + return false; + } + + if (*p == '%') { + if (p + 2 < end) { + buf[0] = p[1]; + buf[1] = p[2]; + strtol(buf, &endptr, 16); + p += 2; + if (endptr != buf + 2) { + return false; + } + } else { + return false; + } + } + } + return true; +} + template FOLLY_ALWAYS_INLINE void urlUnescape( TOutString& output, @@ -143,11 +197,15 @@ struct UrlExtractProtocolFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { + if (!isValidURI(url)) { + result.setEmpty(); + return false; + } boost::cmatch match; if (!parse(url, match)) { result.setEmpty(); } else { - result.setNoCopy(submatch(match, 1)); + result.setNoCopy(submatch(match, 2)); } return true; } @@ -166,11 +224,15 @@ struct UrlExtractFragmentFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { + if (!isValidURI(url)) { + result.setEmpty(); + return false; + } boost::cmatch match; if (!parse(url, match)) { result.setEmpty(); } else { - result.setNoCopy(submatch(match, 4)); + result.setNoCopy(submatch(match, 9)); } return true; } @@ -189,6 +251,10 @@ struct UrlExtractHostFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { + if (!isValidURI(url)) { + result.setEmpty(); + return false; + } boost::cmatch match; if (!parse(url, match)) { result.setEmpty(); @@ -214,6 +280,9 @@ struct UrlExtractPortFunction { VELOX_DEFINE_FUNCTION_TYPES(T); FOLLY_ALWAYS_INLINE bool call(int64_t& result, const arg_type& url) { + if (!isValidURI(url)) { + return false; + } boost::cmatch match; if (!parse(url, match)) { return false; @@ -244,29 +313,23 @@ struct UrlExtractPathFunction { // Input is always ASCII, but result may or may not be ASCII. - FOLLY_ALWAYS_INLINE void call( + FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { + if (!isValidURI(url)) { + result.setEmpty(); + return false; + } + boost::cmatch match; if (!parse(url, match)) { result.setEmpty(); - return; + return false; } - boost::cmatch authAndPathMatch; - boost::cmatch authorityMatch; - bool hasAuthority; + urlUnescape(result, submatch(match, 5)); - if (matchAuthorityAndPath( - match, authAndPathMatch, authorityMatch, hasAuthority)) { - StringView escapedPath; - if (hasAuthority) { - escapedPath = submatch(authAndPathMatch, 2); - } else { - escapedPath = submatch(match, 2); - } - urlUnescape(result, escapedPath); - } + return true; } }; @@ -283,13 +346,17 @@ struct UrlExtractQueryFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { + if (!isValidURI(url)) { + result.setEmpty(); + return false; + } boost::cmatch match; if (!parse(url, match)) { result.setEmpty(); return true; } - auto query = submatch(match, 3); + auto query = submatch(match, 7); result.setNoCopy(query); return true; } @@ -309,13 +376,17 @@ struct UrlExtractParameterFunction { out_type& result, const arg_type& url, const arg_type& param) { + if (!isValidURI(url)) { + result.setEmpty(); + return false; + } boost::cmatch match; if (!parse(url, match)) { result.setEmpty(); return false; } - auto query = submatch(match, 3); + auto query = submatch(match, 7); if (!query.empty()) { // Parse query string. static const boost::regex kQueryParamRegex( diff --git a/velox/functions/prestosql/tests/URLFunctionsTest.cpp b/velox/functions/prestosql/tests/URLFunctionsTest.cpp index 2269f02af4df..fd07766cc136 100644 --- a/velox/functions/prestosql/tests/URLFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/URLFunctionsTest.cpp @@ -23,11 +23,11 @@ class URLFunctionsTest : public functions::test::FunctionBaseTest { protected: void validate( const std::string& url, - const std::string& expectedProtocol, - const std::string& expectedHost, - const std::string& expectedPath, - const std::string& expectedFragment, - const std::string& expectedQuery, + const std::optional& expectedProtocol, + const std::optional& expectedHost, + const std::optional& expectedPath, + const std::optional& expectedFragment, + const std::optional& expectedQuery, const std::optional expectedPort) { const auto extractFn = [&](const std::string& fn, const std::optional& a) { @@ -40,7 +40,7 @@ class URLFunctionsTest : public functions::test::FunctionBaseTest { }; EXPECT_EQ(extractFn("protocol", url), expectedProtocol); - EXPECT_EQ(extractFn("host", url).value(), expectedHost); + EXPECT_EQ(extractFn("host", url), expectedHost); EXPECT_EQ(extractFn("path", url), expectedPath); EXPECT_EQ(extractFn("fragment", url), expectedFragment); EXPECT_EQ(extractFn("query", url), expectedQuery); @@ -97,7 +97,23 @@ TEST_F(URLFunctionsTest, validateURL) { "", "", std::nullopt); - validate("foo", "", "", "", "", "", std::nullopt); + validate( + "https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%", + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt); + validate("foo", "", "", "foo", "", "", std::nullopt); + validate( + "foo ", + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt); } TEST_F(URLFunctionsTest, extractPath) { @@ -105,10 +121,18 @@ TEST_F(URLFunctionsTest, extractPath) { return evaluateOnce("url_extract_path(c0)", url); }; - ASSERT_EQ( + EXPECT_EQ( "/media/set/Books and Magazines.php", extractPath( "https://www.cnn.com/media/set/Books%20and%20Magazines.php?foo=bar")); + + EXPECT_EQ( + "java-net@java.sun.com", extractPath("mailto:java-net@java.sun.com")); + EXPECT_EQ( + std::nullopt, + extractPath("https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%")); + EXPECT_EQ("foo", extractPath("foo")); + EXPECT_EQ(std::nullopt, extractPath("BAD URL!")); } TEST_F(URLFunctionsTest, extractParameter) {