Skip to content

Commit

Permalink
Fix url_extract_path bugs.
Browse files Browse the repository at this point in the history
  • Loading branch information
kgpai committed Nov 21, 2023
1 parent 736a1e3 commit 72e2867
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 94 deletions.
2 changes: 1 addition & 1 deletion velox/functions/prestosql/URLFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ bool matchAuthorityAndPath(
boost::cmatch& authorityMatch,
bool& hasAuthority) {
static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?");
auto authorityAndPath = submatch(urlMatch, 2);
auto authorityAndPath = submatch(urlMatch, 3);
if (!boost::regex_match(
authorityAndPath.begin(),
authorityAndPath.end(),
Expand Down
56 changes: 25 additions & 31 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <boost/regex.hpp>
#include <cctype>
#include "velox/functions/Macros.h"
#include "velox/functions/lib/string/StringImpl.h"

namespace facebook::velox::functions {

Expand All @@ -30,10 +31,10 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
template <typename TInString>
bool parse(const TInString& rawUrl, boost::cmatch& match) {
static const boost::regex kUriRegex(
"([a-zA-Z][a-zA-Z0-9+.-]*):" // scheme:
"([^?#]*)" // authority and path
"(?:\\?([^#]*))?" // ?query
"(?:#(.*))?"); // #fragment
"^(([^:\\/?#]+):)?" // scheme:
"(\\/\\/([^\\/?#]*))?([^?#]*)" // authority and path
"(\\?([^#]*))?" // ?query
"(#(.*))?"); // #fragment

return boost::regex_match(
rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex);
Expand Down Expand Up @@ -84,14 +85,18 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
output.resize(outIndex);
}

template<typename TInString>
template <typename TInString>
FOLLY_ALWAYS_INLINE bool urlUnescapeCheck(const TInString& input) {
const char* p = input.data();
const char* end = p + input.size();
char buf[3];
buf[2] = '\0';
char* endptr;
for (; p < end; ++p) {
if (stringImpl::isAsciiWhiteSpace(*p)) {
return false;
}

if (*p == '%') {
if (p + 2 < end) {
buf[0] = p[1];
Expand Down Expand Up @@ -177,7 +182,7 @@ struct UrlExtractProtocolFunction {
if (!parse(url, match)) {
result.setEmpty();
} else {
result.setNoCopy(submatch(match, 1));
result.setNoCopy(submatch(match, 2));
}
return true;
}
Expand Down Expand Up @@ -205,7 +210,7 @@ struct UrlExtractFragmentFunction {
if (!parse(url, match)) {
result.setEmpty();
} else {
result.setNoCopy(submatch(match, 4));
result.setNoCopy(submatch(match, 9));
}
return true;
}
Expand Down Expand Up @@ -241,7 +246,8 @@ struct UrlExtractHostFunction {
if (matchAuthorityAndPath(
match, authAndPathMatch, authorityMatch, hasAuthority) &&
hasAuthority) {
result.setNoCopy(submatch(authorityMatch, 3));
auto host = submatch(authorityMatch, 3);
result.setNoCopy(host);
} else {
result.setEmpty();
}
Expand Down Expand Up @@ -291,35 +297,23 @@ struct UrlExtractPathFunction {
FOLLY_ALWAYS_INLINE bool call(
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
auto validUrl = urlUnescapeCheck(url);
if (!validUrl) {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();
return false;
result.setEmpty();
return false;
}

boost::cmatch authAndPathMatch;
boost::cmatch authorityMatch;
bool hasAuthority;

if (matchAuthorityAndPath(
match, authAndPathMatch, authorityMatch, hasAuthority)) {
StringView escapedPath;
if (hasAuthority) {
escapedPath = submatch(authAndPathMatch, 2);
} else {
escapedPath = submatch(match, 2);
}
urlUnescape(result, escapedPath);
if (!urlUnescapeCheck(url)) {
result.setEmpty();
return false;
}

StringView escapedPath;
escapedPath = submatch(match, 5);
urlUnescape(result, escapedPath);

return true;
}

};

template <typename T>
Expand All @@ -346,7 +340,7 @@ struct UrlExtractQueryFunction {
return true;
}

auto query = submatch(match, 3);
auto query = submatch(match, 7);
result.setNoCopy(query);
return true;
}
Expand Down Expand Up @@ -377,7 +371,7 @@ struct UrlExtractParameterFunction {
return false;
}

auto query = submatch(match, 3);
auto query = submatch(match, 7);
if (!query.empty()) {
// Parse query string.
static const boost::regex kQueryParamRegex(
Expand Down
137 changes: 75 additions & 62 deletions velox/functions/prestosql/tests/URLFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ class URLFunctionsTest : public functions::test::FunctionBaseTest {
const auto extractPort = [&](const std::optional<std::string>& a) {
return evaluateOnce<int64_t>("url_extract_port(c0)", a);
};
auto protocol = extractFn("protocol", url);
EXPECT_EQ(protocol, expectedProtocol);

EXPECT_EQ(extractFn("protocol", url), expectedProtocol);
EXPECT_EQ(extractFn("host", url), expectedHost);
EXPECT_EQ(extractFn("path", url), expectedPath);
EXPECT_EQ(extractFn("fragment", url), expectedFragment);
Expand All @@ -49,77 +49,90 @@ class URLFunctionsTest : public functions::test::FunctionBaseTest {
};

TEST_F(URLFunctionsTest, validateURL) {
// validate(
// "http://example.com/path1/p.php?k1=v1&k2=v2#Ref1",
// "http",
// "example.com",
// "/path1/p.php",
// "Ref1",
// "k1=v1&k2=v2",
// std::nullopt);
// validate(
// "http://example.com/path1/p.php?",
// "http",
// "example.com",
// "/path1/p.php",
// "",
// "",
// std::nullopt);
// validate(
// "HTTP://example.com/path1/p.php?",
// "HTTP",
// "example.com",
// "/path1/p.php",
// "",
// "",
// std::nullopt);
// validate(
// "http://example.com:8080/path1/p.php?k1=v1&k2=v2#Ref1",
// "http",
// "example.com",
// "/path1/p.php",
// "Ref1",
// "k1=v1&k2=v2",
// 8080);
// validate(
// "https://username@example.com",
// "https",
// "example.com",
// "",
// "",
// "",
// std::nullopt);
// validate(
// "https:/auth/login.html",
// "https",
// "",
// "/auth/login.html",
// "",
// "",
// std::nullopt);
validate("https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%",
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt);
//validate("foo", "", "", "foo", "", "", std::nullopt);
validate("foo!", "", "", "", "", "", std::nullopt);
validate(
"http://example.com/path1/p.php?k1=v1&k2=v2#Ref1",
"http",
"example.com",
"/path1/p.php",
"Ref1",
"k1=v1&k2=v2",
std::nullopt);
validate(
"http://example.com/path1/p.php?",
"http",
"example.com",
"/path1/p.php",
"",
"",
std::nullopt);
validate(
"HTTP://example.com/path1/p.php?",
"HTTP",
"example.com",
"/path1/p.php",
"",
"",
std::nullopt);
validate(
"http://example.com:8080/path1/p.php?k1=v1&k2=v2#Ref1",
"http",
"example.com",
"/path1/p.php",
"Ref1",
"k1=v1&k2=v2",
8080);
validate(
"https://username@example.com",
"https",
"example.com",
"",
"",
"",
std::nullopt);
validate(
"https:/auth/login.html",
"https",
"",
"/auth/login.html",
"",
"",
std::nullopt);
validate(
"https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%",
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt);
validate("foo", "", "", "foo", "", "", std::nullopt);
validate(
"foo ",
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt);
}

TEST_F(URLFunctionsTest, extractPath) {
const auto extractPath = [&](const std::optional<std::string>& url) {
return evaluateOnce<std::string>("url_extract_path(c0)", url);
};

ASSERT_EQ(
EXPECT_EQ(
"/media/set/Books and Magazines.php",
extractPath(
"https://www.cnn.com/media/set/Books%20and%20Magazines.php?foo=bar"));

ASSERT_EQ(std::nullopt, extractPath("BAD URL!"));
ASSERT_EQ(std::nullopt, extractPath("https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%"));
EXPECT_EQ(
"java-net@java.sun.com", extractPath("mailto:java-net@java.sun.com"));
EXPECT_EQ(
std::nullopt,
extractPath("https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%"));
EXPECT_EQ("foo", extractPath("foo"));
EXPECT_EQ(std::nullopt, extractPath("BAD URL!"));
}

TEST_F(URLFunctionsTest, extractParameter) {
Expand Down

0 comments on commit 72e2867

Please sign in to comment.