From 716455c30449edda66961bdeddbe1e6b9a703f44 Mon Sep 17 00:00:00 2001
From: Krishna Pai <kpai@fb.com>
Date: Tue, 21 Nov 2023 06:02:14 -0800
Subject: [PATCH] Fix url_extract_path bugs.

---
 velox/docs/functions/presto/url.rst      | 39 ++++++++++++++--
 velox/functions/prestosql/URLFunctions.h | 59 +++++++++++++++---------
 2 files changed, 73 insertions(+), 25 deletions(-)
diff --git a/velox/docs/functions/presto/url.rst b/velox/docs/functions/presto/url.rst
index 183b81503d32..872828e9d52f 100644
--- a/velox/docs/functions/presto/url.rst
+++ b/velox/docs/functions/presto/url.rst
@@ -2,10 +2,10 @@
 URL Functions
 =============
 
-Extraction Functions
---------------------
+Introduction
+------------
 
-The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 2396 <https://tools.ietf.org/html/rfc2396.html>`_). The following syntax is supported:
+The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 3986 <https://tools.ietf.org/html/rfc3986.html>`_). The following syntax is supported:
 
 .. code-block:: bash
 
@@ -14,6 +14,39 @@ The URL extraction functions extract components from HTTP URLs (or any valid URI
 
 The extracted components do not contain URI syntax separators such as ``:`` , ``?`` and ``#``.
 
+Consider for example the below URI:
+
+.. code-block::
+
+    http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related
+
+    scheme    = http
+    authority = www.ics.uci.edu
+    path      = /pub/ietf/uri/
+    query     = k1=v1
+    fragment  = Related
+
+
+Invalid URI's
+-------------
+
+Well formed URI's should not contain ascii whitespace. `Percent-encoded URI's <https://www.rfc-editor.org/rfc/rfc3986#section-2.1>`_ should be followed by two hexadecimal
+digits after the percent character "%".
+
+.. code-block::
+
+    # Examples of url functions with Invalid URI's.
+
+    # Invalid URI due to whitespace
+    SELECT url_extract_path('foo '); -- NULL (1 row)
+
+    # Invalid URI due to improper escaping of '%'
+    SELECT url_extract_path('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row)
+
+
+Extraction Functions
+--------------------
+
 .. function:: url_extract_fragment(url) -> varchar
 
     Returns the fragment identifier from ``url``.
diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h
index 24e674e83750..714490cb2985 100644
--- a/velox/functions/prestosql/URLFunctions.h
+++ b/velox/functions/prestosql/URLFunctions.h
@@ -30,6 +30,28 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
 
 template <typename TInString>
 bool parse(const TInString& rawUrl, boost::cmatch& match) {
+  /// This regex is taken from RFC - 3986.
+  /// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B
+  /// The basic groups are:
+  ///      scheme    = $2
+  ///      authority = $4
+  ///      path      = $5
+  ///      query     = $7
+  ///      fragment  = $9
+  /// For example a URI like below :
+  ///  http://www.ics.uci.edu/pub/ietf/uri/#Related
+  ///
+  ///   results in the following subexpression matches:
+  ///
+  ///      $1 = http:
+  ///      $2 = http
+  ///      $3 = //www.ics.uci.edu
+  ///      $4 = www.ics.uci.edu
+  ///      $5 = /pub/ietf/uri/
+  ///      $6 = <undefined>
+  ///      $7 = <undefined>
+  ///      $8 = #Related
+  ///      $9 = Related
   static const boost::regex kUriRegex(
       "^(([^:\\/?#]+):)?" // scheme:
       "(\\/\\/([^\\/?#]*))?([^?#]*)" // authority and path
@@ -85,8 +107,10 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
   output.resize(outIndex);
 }
 
-template <typename TInString>
-FOLLY_ALWAYS_INLINE bool urlUnescapeCheck(const TInString& input) {
+/// Performs initial validation of the URI.
+/// Checks if the URI contains ascii whitespaces or
+/// unescaped '%' chars.
+bool isValidURI(const StringView& input) {
   const char* p = input.data();
   const char* end = p + input.size();
   char buf[3];
@@ -173,8 +197,7 @@ struct UrlExtractProtocolFunction {
   FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
-    auto validUrl = urlUnescapeCheck(url);
-    if (!validUrl) {
+    if (!isValidURI(url)) {
       result.setEmpty();
       return false;
     }
@@ -201,8 +224,7 @@ struct UrlExtractFragmentFunction {
   FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
-    auto validUrl = urlUnescapeCheck(url);
-    if (!validUrl) {
+    if (!isValidURI(url)) {
       result.setEmpty();
       return false;
     }
@@ -229,8 +251,7 @@ struct UrlExtractHostFunction {
   FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
-    auto validUrl = urlUnescapeCheck(url);
-    if (!validUrl) {
+    if (!isValidURI(url)) {
       result.setEmpty();
       return false;
     }
@@ -246,8 +267,7 @@ struct UrlExtractHostFunction {
     if (matchAuthorityAndPath(
             match, authAndPathMatch, authorityMatch, hasAuthority) &&
         hasAuthority) {
-      auto host = submatch(authorityMatch, 3);
-      result.setNoCopy(host);
+      result.setNoCopy(submatch(authorityMatch, 3));
     } else {
       result.setEmpty();
     }
@@ -260,8 +280,7 @@ struct UrlExtractPortFunction {
   VELOX_DEFINE_FUNCTION_TYPES(T);
 
   FOLLY_ALWAYS_INLINE bool call(int64_t& result, const arg_type<Varchar>& url) {
-    auto validUrl = urlUnescapeCheck(url);
-    if (!validUrl) {
+    if (!isValidURI(url)) {
       return false;
     }
     boost::cmatch match;
@@ -297,20 +316,18 @@ struct UrlExtractPathFunction {
   FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
-    boost::cmatch match;
-    if (!parse(url, match)) {
+    if (!isValidURI(url)) {
       result.setEmpty();
       return false;
     }
 
-    if (!urlUnescapeCheck(url)) {
+    boost::cmatch match;
+    if (!parse(url, match)) {
       result.setEmpty();
       return false;
     }
 
-    StringView escapedPath;
-    escapedPath = submatch(match, 5);
-    urlUnescape(result, escapedPath);
+    urlUnescape(result, submatch(match, 5));
 
     return true;
   }
@@ -329,8 +346,7 @@ struct UrlExtractQueryFunction {
   FOLLY_ALWAYS_INLINE bool call(
       out_type<Varchar>& result,
       const arg_type<Varchar>& url) {
-    auto validUrl = urlUnescapeCheck(url);
-    if (!validUrl) {
+    if (!isValidURI(url)) {
       result.setEmpty();
       return false;
     }
@@ -360,8 +376,7 @@ struct UrlExtractParameterFunction {
       out_type<Varchar>& result,
       const arg_type<Varchar>& url,
       const arg_type<Varchar>& param) {
-    auto validUrl = urlUnescapeCheck(url);
-    if (!validUrl) {
+    if (!isValidURI(url)) {
       result.setEmpty();
       return false;
     }