From 716455c30449edda66961bdeddbe1e6b9a703f44 Mon Sep 17 00:00:00 2001 From: Krishna Pai Date: Tue, 21 Nov 2023 06:02:14 -0800 Subject: [PATCH] Fix url_extract_path bugs. --- velox/docs/functions/presto/url.rst | 39 ++++++++++++++-- velox/functions/prestosql/URLFunctions.h | 59 +++++++++++++++--------- 2 files changed, 73 insertions(+), 25 deletions(-) diff --git a/velox/docs/functions/presto/url.rst b/velox/docs/functions/presto/url.rst index 183b81503d32..872828e9d52f 100644 --- a/velox/docs/functions/presto/url.rst +++ b/velox/docs/functions/presto/url.rst @@ -2,10 +2,10 @@ URL Functions ============= -Extraction Functions --------------------- +Introduction +------------ -The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 2396 `_). The following syntax is supported: +The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 3986 `_). The following syntax is supported: .. code-block:: bash @@ -14,6 +14,39 @@ The URL extraction functions extract components from HTTP URLs (or any valid URI The extracted components do not contain URI syntax separators such as ``:`` , ``?`` and ``#``. +Consider for example the below URI: + +.. code-block:: + + http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related + + scheme = http + authority = www.ics.uci.edu + path = /pub/ietf/uri/ + query = k1=v1 + fragment = Related + + +Invalid URI's +------------- + +Well formed URI's should not contain ascii whitespace. `Percent-encoded URI's `_ should be followed by two hexadecimal +digits after the percent character "%". + +.. code-block:: + + # Examples of url functions with Invalid URI's. + + # Invalid URI due to whitespace + SELECT url_extract_path('foo '); -- NULL (1 row) + + # Invalid URI due to improper escaping of '%' + SELECT url_extract_path('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row) + + +Extraction Functions +-------------------- + .. function:: url_extract_fragment(url) -> varchar Returns the fragment identifier from ``url``. diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h index 24e674e83750..714490cb2985 100644 --- a/velox/functions/prestosql/URLFunctions.h +++ b/velox/functions/prestosql/URLFunctions.h @@ -30,6 +30,28 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) { template bool parse(const TInString& rawUrl, boost::cmatch& match) { + /// This regex is taken from RFC - 3986. + /// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B + /// The basic groups are: + /// scheme = $2 + /// authority = $4 + /// path = $5 + /// query = $7 + /// fragment = $9 + /// For example a URI like below : + /// http://www.ics.uci.edu/pub/ietf/uri/#Related + /// + /// results in the following subexpression matches: + /// + /// $1 = http: + /// $2 = http + /// $3 = //www.ics.uci.edu + /// $4 = www.ics.uci.edu + /// $5 = /pub/ietf/uri/ + /// $6 = + /// $7 = + /// $8 = #Related + /// $9 = Related static const boost::regex kUriRegex( "^(([^:\\/?#]+):)?" // scheme: "(\\/\\/([^\\/?#]*))?([^?#]*)" // authority and path @@ -85,8 +107,10 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) { output.resize(outIndex); } -template -FOLLY_ALWAYS_INLINE bool urlUnescapeCheck(const TInString& input) { +/// Performs initial validation of the URI. +/// Checks if the URI contains ascii whitespaces or +/// unescaped '%' chars. +bool isValidURI(const StringView& input) { const char* p = input.data(); const char* end = p + input.size(); char buf[3]; @@ -173,8 +197,7 @@ struct UrlExtractProtocolFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { - auto validUrl = urlUnescapeCheck(url); - if (!validUrl) { + if (!isValidURI(url)) { result.setEmpty(); return false; } @@ -201,8 +224,7 @@ struct UrlExtractFragmentFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { - auto validUrl = urlUnescapeCheck(url); - if (!validUrl) { + if (!isValidURI(url)) { result.setEmpty(); return false; } @@ -229,8 +251,7 @@ struct UrlExtractHostFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { - auto validUrl = urlUnescapeCheck(url); - if (!validUrl) { + if (!isValidURI(url)) { result.setEmpty(); return false; } @@ -246,8 +267,7 @@ struct UrlExtractHostFunction { if (matchAuthorityAndPath( match, authAndPathMatch, authorityMatch, hasAuthority) && hasAuthority) { - auto host = submatch(authorityMatch, 3); - result.setNoCopy(host); + result.setNoCopy(submatch(authorityMatch, 3)); } else { result.setEmpty(); } @@ -260,8 +280,7 @@ struct UrlExtractPortFunction { VELOX_DEFINE_FUNCTION_TYPES(T); FOLLY_ALWAYS_INLINE bool call(int64_t& result, const arg_type& url) { - auto validUrl = urlUnescapeCheck(url); - if (!validUrl) { + if (!isValidURI(url)) { return false; } boost::cmatch match; @@ -297,20 +316,18 @@ struct UrlExtractPathFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { - boost::cmatch match; - if (!parse(url, match)) { + if (!isValidURI(url)) { result.setEmpty(); return false; } - if (!urlUnescapeCheck(url)) { + boost::cmatch match; + if (!parse(url, match)) { result.setEmpty(); return false; } - StringView escapedPath; - escapedPath = submatch(match, 5); - urlUnescape(result, escapedPath); + urlUnescape(result, submatch(match, 5)); return true; } @@ -329,8 +346,7 @@ struct UrlExtractQueryFunction { FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& url) { - auto validUrl = urlUnescapeCheck(url); - if (!validUrl) { + if (!isValidURI(url)) { result.setEmpty(); return false; } @@ -360,8 +376,7 @@ struct UrlExtractParameterFunction { out_type& result, const arg_type& url, const arg_type& param) { - auto validUrl = urlUnescapeCheck(url); - if (!validUrl) { + if (!isValidURI(url)) { result.setEmpty(); return false; }