From 72e28679ebd8986d2dd201e565ce0cc50cfde2a9 Mon Sep 17 00:00:00 2001 From: Krishna Pai Date: Mon, 20 Nov 2023 18:06:03 -0800 Subject: [PATCH] Fix url_extract_path bugs. --- velox/functions/prestosql/URLFunctions.cpp | 2 +- velox/functions/prestosql/URLFunctions.h | 56 ++++--- .../prestosql/tests/URLFunctionsTest.cpp | 137 ++++++++++-------- 3 files changed, 101 insertions(+), 94 deletions(-) diff --git a/velox/functions/prestosql/URLFunctions.cpp b/velox/functions/prestosql/URLFunctions.cpp index 7320cd073189..5df470dd5db8 100644 --- a/velox/functions/prestosql/URLFunctions.cpp +++ b/velox/functions/prestosql/URLFunctions.cpp @@ -25,7 +25,7 @@ bool matchAuthorityAndPath( boost::cmatch& authorityMatch, bool& hasAuthority) { static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?"); - auto authorityAndPath = submatch(urlMatch, 2); + auto authorityAndPath = submatch(urlMatch, 3); if (!boost::regex_match( authorityAndPath.begin(), authorityAndPath.end(), diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h index 4424f4f48231..24e674e83750 100644 --- a/velox/functions/prestosql/URLFunctions.h +++ b/velox/functions/prestosql/URLFunctions.h @@ -18,6 +18,7 @@ #include #include #include "velox/functions/Macros.h" +#include "velox/functions/lib/string/StringImpl.h" namespace facebook::velox::functions { @@ -30,10 +31,10 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) { template bool parse(const TInString& rawUrl, boost::cmatch& match) { static const boost::regex kUriRegex( - "([a-zA-Z][a-zA-Z0-9+.-]*):" // scheme: - "([^?#]*)" // authority and path - "(?:\\?([^#]*))?" // ?query - "(?:#(.*))?"); // #fragment + "^(([^:\\/?#]+):)?" // scheme: + "(\\/\\/([^\\/?#]*))?([^?#]*)" // authority and path + "(\\?([^#]*))?" // ?query + "(#(.*))?"); // #fragment return boost::regex_match( rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex); @@ -84,7 +85,7 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) { output.resize(outIndex); } -template +template FOLLY_ALWAYS_INLINE bool urlUnescapeCheck(const TInString& input) { const char* p = input.data(); const char* end = p + input.size(); @@ -92,6 +93,10 @@ FOLLY_ALWAYS_INLINE bool urlUnescapeCheck(const TInString& input) { buf[2] = '\0'; char* endptr; for (; p < end; ++p) { + if (stringImpl::isAsciiWhiteSpace(*p)) { + return false; + } + if (*p == '%') { if (p + 2 < end) { buf[0] = p[1]; @@ -177,7 +182,7 @@ struct UrlExtractProtocolFunction { if (!parse(url, match)) { result.setEmpty(); } else { - result.setNoCopy(submatch(match, 1)); + result.setNoCopy(submatch(match, 2)); } return true; } @@ -205,7 +210,7 @@ struct UrlExtractFragmentFunction { if (!parse(url, match)) { result.setEmpty(); } else { - result.setNoCopy(submatch(match, 4)); + result.setNoCopy(submatch(match, 9)); } return true; } @@ -241,7 +246,8 @@ struct UrlExtractHostFunction { if (matchAuthorityAndPath( match, authAndPathMatch, authorityMatch, hasAuthority) && hasAuthority) { - result.setNoCopy(submatch(authorityMatch, 3)); + auto host = submatch(authorityMatch, 3); + result.setNoCopy(host); } else { result.setEmpty(); } @@ -291,35 +297,23 @@ struct UrlExtractPathFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { - auto validUrl = urlUnescapeCheck(url); - if (!validUrl) { - result.setEmpty(); - return false; - } boost::cmatch match; if (!parse(url, match)) { - result.setEmpty(); - return false; + result.setEmpty(); + return false; } - boost::cmatch authAndPathMatch; - boost::cmatch authorityMatch; - bool hasAuthority; - - if (matchAuthorityAndPath( - match, authAndPathMatch, authorityMatch, hasAuthority)) { - StringView escapedPath; - if (hasAuthority) { - escapedPath = submatch(authAndPathMatch, 2); - } else { - escapedPath = submatch(match, 2); - } - urlUnescape(result, escapedPath); + if (!urlUnescapeCheck(url)) { + result.setEmpty(); + return false; } + StringView escapedPath; + escapedPath = submatch(match, 5); + urlUnescape(result, escapedPath); + return true; } - }; template @@ -346,7 +340,7 @@ struct UrlExtractQueryFunction { return true; } - auto query = submatch(match, 3); + auto query = submatch(match, 7); result.setNoCopy(query); return true; } @@ -377,7 +371,7 @@ struct UrlExtractParameterFunction { return false; } - auto query = submatch(match, 3); + auto query = submatch(match, 7); if (!query.empty()) { // Parse query string. static const boost::regex kQueryParamRegex( diff --git a/velox/functions/prestosql/tests/URLFunctionsTest.cpp b/velox/functions/prestosql/tests/URLFunctionsTest.cpp index 077c9b3f46dd..fd07766cc136 100644 --- a/velox/functions/prestosql/tests/URLFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/URLFunctionsTest.cpp @@ -38,8 +38,8 @@ class URLFunctionsTest : public functions::test::FunctionBaseTest { const auto extractPort = [&](const std::optional& a) { return evaluateOnce("url_extract_port(c0)", a); }; - auto protocol = extractFn("protocol", url); - EXPECT_EQ(protocol, expectedProtocol); + + EXPECT_EQ(extractFn("protocol", url), expectedProtocol); EXPECT_EQ(extractFn("host", url), expectedHost); EXPECT_EQ(extractFn("path", url), expectedPath); EXPECT_EQ(extractFn("fragment", url), expectedFragment); @@ -49,63 +49,71 @@ class URLFunctionsTest : public functions::test::FunctionBaseTest { }; TEST_F(URLFunctionsTest, validateURL) { -// validate( -// "http://example.com/path1/p.php?k1=v1&k2=v2#Ref1", -// "http", -// "example.com", -// "/path1/p.php", -// "Ref1", -// "k1=v1&k2=v2", -// std::nullopt); -// validate( -// "http://example.com/path1/p.php?", -// "http", -// "example.com", -// "/path1/p.php", -// "", -// "", -// std::nullopt); -// validate( -// "HTTP://example.com/path1/p.php?", -// "HTTP", -// "example.com", -// "/path1/p.php", -// "", -// "", -// std::nullopt); -// validate( -// "http://example.com:8080/path1/p.php?k1=v1&k2=v2#Ref1", -// "http", -// "example.com", -// "/path1/p.php", -// "Ref1", -// "k1=v1&k2=v2", -// 8080); -// validate( -// "https://username@example.com", -// "https", -// "example.com", -// "", -// "", -// "", -// std::nullopt); -// validate( -// "https:/auth/login.html", -// "https", -// "", -// "/auth/login.html", -// "", -// "", -// std::nullopt); - validate("https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%", - std::nullopt, - std::nullopt, - std::nullopt, - std::nullopt, - std::nullopt, - std::nullopt); - //validate("foo", "", "", "foo", "", "", std::nullopt); - validate("foo!", "", "", "", "", "", std::nullopt); + validate( + "http://example.com/path1/p.php?k1=v1&k2=v2#Ref1", + "http", + "example.com", + "/path1/p.php", + "Ref1", + "k1=v1&k2=v2", + std::nullopt); + validate( + "http://example.com/path1/p.php?", + "http", + "example.com", + "/path1/p.php", + "", + "", + std::nullopt); + validate( + "HTTP://example.com/path1/p.php?", + "HTTP", + "example.com", + "/path1/p.php", + "", + "", + std::nullopt); + validate( + "http://example.com:8080/path1/p.php?k1=v1&k2=v2#Ref1", + "http", + "example.com", + "/path1/p.php", + "Ref1", + "k1=v1&k2=v2", + 8080); + validate( + "https://username@example.com", + "https", + "example.com", + "", + "", + "", + std::nullopt); + validate( + "https:/auth/login.html", + "https", + "", + "/auth/login.html", + "", + "", + std::nullopt); + validate( + "https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%", + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt); + validate("foo", "", "", "foo", "", "", std::nullopt); + validate( + "foo ", + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt); } TEST_F(URLFunctionsTest, extractPath) { @@ -113,13 +121,18 @@ TEST_F(URLFunctionsTest, extractPath) { return evaluateOnce("url_extract_path(c0)", url); }; - ASSERT_EQ( + EXPECT_EQ( "/media/set/Books and Magazines.php", extractPath( "https://www.cnn.com/media/set/Books%20and%20Magazines.php?foo=bar")); - ASSERT_EQ(std::nullopt, extractPath("BAD URL!")); - ASSERT_EQ(std::nullopt, extractPath("https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%")); + EXPECT_EQ( + "java-net@java.sun.com", extractPath("mailto:java-net@java.sun.com")); + EXPECT_EQ( + std::nullopt, + extractPath("https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%")); + EXPECT_EQ("foo", extractPath("foo")); + EXPECT_EQ(std::nullopt, extractPath("BAD URL!")); } TEST_F(URLFunctionsTest, extractParameter) {