Skip to content

Commit

Permalink
Fix url_extract_path bugs.
Browse files Browse the repository at this point in the history
  • Loading branch information
kgpai committed Nov 27, 2023
1 parent 72e2867 commit 716455c
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 25 deletions.
39 changes: 36 additions & 3 deletions velox/docs/functions/presto/url.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
URL Functions
=============

Extraction Functions
--------------------
Introduction
------------

The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 2396 <https://tools.ietf.org/html/rfc2396.html>`_). The following syntax is supported:
The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 3986 <https://tools.ietf.org/html/rfc3986.html>`_). The following syntax is supported:

.. code-block:: bash
Expand All @@ -14,6 +14,39 @@ The URL extraction functions extract components from HTTP URLs (or any valid URI
The extracted components do not contain URI syntax separators such as ``:`` , ``?`` and ``#``.
Consider for example the below URI:
.. code-block::
http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related
scheme = http
authority = www.ics.uci.edu
path = /pub/ietf/uri/
query = k1=v1
fragment = Related
Invalid URI's
-------------
Well formed URI's should not contain ascii whitespace. `Percent-encoded URI's <https://www.rfc-editor.org/rfc/rfc3986#section-2.1>`_ should be followed by two hexadecimal
digits after the percent character "%".
.. code-block::
# Examples of url functions with Invalid URI's.
# Invalid URI due to whitespace
SELECT url_extract_path('foo '); -- NULL (1 row)
# Invalid URI due to improper escaping of '%'
SELECT url_extract_path('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row)
Extraction Functions
--------------------
.. function:: url_extract_fragment(url) -> varchar
Returns the fragment identifier from ``url``.
Expand Down
59 changes: 37 additions & 22 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,28 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {

template <typename TInString>
bool parse(const TInString& rawUrl, boost::cmatch& match) {
/// This regex is taken from RFC - 3986.
/// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B
/// The basic groups are:
/// scheme = $2
/// authority = $4
/// path = $5
/// query = $7
/// fragment = $9
/// For example a URI like below :
/// http://www.ics.uci.edu/pub/ietf/uri/#Related
///
/// results in the following subexpression matches:
///
/// $1 = http:
/// $2 = http
/// $3 = //www.ics.uci.edu
/// $4 = www.ics.uci.edu
/// $5 = /pub/ietf/uri/
/// $6 = <undefined>
/// $7 = <undefined>
/// $8 = #Related
/// $9 = Related
static const boost::regex kUriRegex(
"^(([^:\\/?#]+):)?" // scheme:
"(\\/\\/([^\\/?#]*))?([^?#]*)" // authority and path
Expand Down Expand Up @@ -85,8 +107,10 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
output.resize(outIndex);
}

template <typename TInString>
FOLLY_ALWAYS_INLINE bool urlUnescapeCheck(const TInString& input) {
/// Performs initial validation of the URI.
/// Checks if the URI contains ascii whitespaces or
/// unescaped '%' chars.
bool isValidURI(const StringView& input) {
const char* p = input.data();
const char* end = p + input.size();
char buf[3];
Expand Down Expand Up @@ -173,8 +197,7 @@ struct UrlExtractProtocolFunction {
FOLLY_ALWAYS_INLINE bool call(
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
auto validUrl = urlUnescapeCheck(url);
if (!validUrl) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
Expand All @@ -201,8 +224,7 @@ struct UrlExtractFragmentFunction {
FOLLY_ALWAYS_INLINE bool call(
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
auto validUrl = urlUnescapeCheck(url);
if (!validUrl) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
Expand All @@ -229,8 +251,7 @@ struct UrlExtractHostFunction {
FOLLY_ALWAYS_INLINE bool call(
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
auto validUrl = urlUnescapeCheck(url);
if (!validUrl) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
Expand All @@ -246,8 +267,7 @@ struct UrlExtractHostFunction {
if (matchAuthorityAndPath(
match, authAndPathMatch, authorityMatch, hasAuthority) &&
hasAuthority) {
auto host = submatch(authorityMatch, 3);
result.setNoCopy(host);
result.setNoCopy(submatch(authorityMatch, 3));
} else {
result.setEmpty();
}
Expand All @@ -260,8 +280,7 @@ struct UrlExtractPortFunction {
VELOX_DEFINE_FUNCTION_TYPES(T);

FOLLY_ALWAYS_INLINE bool call(int64_t& result, const arg_type<Varchar>& url) {
auto validUrl = urlUnescapeCheck(url);
if (!validUrl) {
if (!isValidURI(url)) {
return false;
}
boost::cmatch match;
Expand Down Expand Up @@ -297,20 +316,18 @@ struct UrlExtractPathFunction {
FOLLY_ALWAYS_INLINE bool call(
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
boost::cmatch match;
if (!parse(url, match)) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}

if (!urlUnescapeCheck(url)) {
boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();
return false;
}

StringView escapedPath;
escapedPath = submatch(match, 5);
urlUnescape(result, escapedPath);
urlUnescape(result, submatch(match, 5));

return true;
}
Expand All @@ -329,8 +346,7 @@ struct UrlExtractQueryFunction {
FOLLY_ALWAYS_INLINE bool call(
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
auto validUrl = urlUnescapeCheck(url);
if (!validUrl) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
Expand Down Expand Up @@ -360,8 +376,7 @@ struct UrlExtractParameterFunction {
out_type<Varchar>& result,
const arg_type<Varchar>& url,
const arg_type<Varchar>& param) {
auto validUrl = urlUnescapeCheck(url);
if (!validUrl) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
Expand Down

0 comments on commit 716455c

Please sign in to comment.