From be4a8d89c700baa81aa875311dedd99aeaf50d66 Mon Sep 17 00:00:00 2001 From: Ben Kelly Date: Wed, 8 Sep 2021 19:47:33 +0000 Subject: [PATCH] URLPattern: Support pattern syntax in IPv6 hostnames. This adds support for patterns that having matching syntax inside of IPv6 address hostnames like: new URLPattern({ hostname: '[:address]' }); This issue is discussed here: https://github.com/WICG/urlpattern/issues/115 This CL also does a drive-by fix of a stale header reference in the blink presubmit warnings. Fixed: 1245998 Change-Id: I772258dc69c2b658ee4d7306b4f1975324624338 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3140140 Reviewed-by: Jeremy Roman Reviewed-by: Stephen McGruer Commit-Queue: Ben Kelly Cr-Commit-Position: refs/heads/main@{#919392} NOKEYCHECK=True GitOrigin-RevId: d78439a4b3e959d8f0fca925eaba89758164ccc9 --- .../modules/url_pattern/url_pattern_canon.cc | 25 ++++++ .../modules/url_pattern/url_pattern_canon.h | 1 + .../url_pattern/url_pattern_component.cc | 33 +++++++- .../presubmit/audit_non_blink_usage.py | 3 + blink/tools/blinkpy/style/checkers/cpp.py | 8 +- .../blinkpy/style/checkers/cpp_unittest.py | 7 +- .../resources/urlpatterntestdata.json | 79 +++++++++++++++++++ 7 files changed, 146 insertions(+), 10 deletions(-) diff --git a/blink/renderer/modules/url_pattern/url_pattern_canon.cc b/blink/renderer/modules/url_pattern/url_pattern_canon.cc index d05ff0666e76..1b363bcd0c09 100644 --- a/blink/renderer/modules/url_pattern/url_pattern_canon.cc +++ b/blink/renderer/modules/url_pattern/url_pattern_canon.cc @@ -7,6 +7,7 @@ #include "third_party/blink/renderer/modules/url_pattern/url_pattern_component.h" #include "third_party/blink/renderer/platform/bindings/exception_state.h" #include "third_party/blink/renderer/platform/weborigin/security_origin.h" +#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h" #include "third_party/blink/renderer/platform/wtf/text/string_utf8_adaptor.h" #include "url/url_canon.h" #include "url/url_util.h" @@ -120,6 +121,30 @@ absl::StatusOr HostnameEncodeCallback(absl::string_view input) { return StdStringFromCanonOutput(canon_output, component); } +absl::StatusOr IPv6HostnameEncodeCallback( + absl::string_view input) { + std::string result; + result.reserve(input.size()); + // This implements a light validation and canonicalization of IPv6 hostname + // content. Ideally we would use the URL parser's hostname canonicalizer + // here, but that is too strict for the encoding callback. The callback may + // see only bits and pieces of the hostname pattern; e.g. for `[:address]` it + // sees the `[` and `]` strings as separate calls. Since the full URL + // hostname parser wants to completely parse IPv6 hostnames, this will always + // trigger an error. Therefore, to allow pattern syntax within IPv6 brackets + // we simply check for valid characters and lowercase any hex digits. + for (size_t i = 0; i < input.size(); ++i) { + char c = input[i]; + if (!IsASCIIHexDigit(c) && c != '[' && c != ']' && c != ':') { + return absl::InvalidArgumentError( + std::string("Invalid IPv6 hostname character '") + c + "' in '" + + std::string(input) + "'."); + } + result += ToASCIILower(c); + } + return result; +} + absl::StatusOr PortEncodeCallback(absl::string_view input) { if (input.empty()) return std::string(); diff --git a/blink/renderer/modules/url_pattern/url_pattern_canon.h b/blink/renderer/modules/url_pattern/url_pattern_canon.h index eb6f3b38582f..7db24574f164 100644 --- a/blink/renderer/modules/url_pattern/url_pattern_canon.h +++ b/blink/renderer/modules/url_pattern/url_pattern_canon.h @@ -33,6 +33,7 @@ absl::StatusOr ProtocolEncodeCallback(absl::string_view input); absl::StatusOr UsernameEncodeCallback(absl::string_view input); absl::StatusOr PasswordEncodeCallback(absl::string_view input); absl::StatusOr HostnameEncodeCallback(absl::string_view input); +absl::StatusOr IPv6HostnameEncodeCallback(absl::string_view input); absl::StatusOr PortEncodeCallback(absl::string_view input); absl::StatusOr StandardURLPathnameEncodeCallback( absl::string_view input); diff --git a/blink/renderer/modules/url_pattern/url_pattern_component.cc b/blink/renderer/modules/url_pattern/url_pattern_component.cc index ed7ddb895b02..66db96ee8b15 100644 --- a/blink/renderer/modules/url_pattern/url_pattern_component.cc +++ b/blink/renderer/modules/url_pattern/url_pattern_component.cc @@ -40,8 +40,31 @@ StringView TypeToString(Component::Type type) { NOTREACHED(); } +// Utility method to determine if a particular hostname pattern should be +// treated as an IPv6 hostname. This implements a simple and fast heuristic +// looking for a leading `[`. It is intended to catch the most common cases +// with minimum overhead. +bool TreatAsIPv6Hostname(base::StringPiece pattern_utf8) { + // The `[` string cannot be a valid IPv6 hostname. We need at least two + // characters to represent `[*`. + if (pattern_utf8.size() < 2) + return false; + + if (pattern_utf8[0] == '[') + return true; + + // We do a bit of extra work to detect brackets behind an escape and + // within a grouping. + if ((pattern_utf8[0] == '\\' || pattern_utf8[0] == '{') && + pattern_utf8[1] == '[') + return true; + + return false; +} + // Utility method to get the correct encoding callback for a given type. -liburlpattern::EncodeCallback GetEncodeCallback(Component::Type type, +liburlpattern::EncodeCallback GetEncodeCallback(base::StringPiece pattern_utf8, + Component::Type type, Component* protocol_component) { switch (type) { case Component::Type::kProtocol: @@ -51,7 +74,10 @@ liburlpattern::EncodeCallback GetEncodeCallback(Component::Type type, case Component::Type::kPassword: return PasswordEncodeCallback; case Component::Type::kHostname: - return HostnameEncodeCallback; + if (TreatAsIPv6Hostname(pattern_utf8)) + return IPv6HostnameEncodeCallback; + else + return HostnameEncodeCallback; case Component::Type::kPort: return PortEncodeCallback; case Component::Type::kPathname: @@ -205,7 +231,8 @@ Component* Component::Compile(StringView pattern, StringUTF8Adaptor utf8(final_pattern); auto parse_result = liburlpattern::Parse( absl::string_view(utf8.data(), utf8.size()), - GetEncodeCallback(type, protocol_component), options); + GetEncodeCallback(utf8.AsStringPiece(), type, protocol_component), + options); if (!parse_result.ok()) { exception_state.ThrowTypeError( "Invalid " + TypeToString(type) + " pattern '" + final_pattern + "'. " + diff --git a/blink/tools/blinkpy/presubmit/audit_non_blink_usage.py b/blink/tools/blinkpy/presubmit/audit_non_blink_usage.py index a2f0083b5f29..45b160599998 100755 --- a/blink/tools/blinkpy/presubmit/audit_non_blink_usage.py +++ b/blink/tools/blinkpy/presubmit/audit_non_blink_usage.py @@ -1307,6 +1307,9 @@ # liburlpattern API. "base::IsStringASCII", + # Needed to use part of the StringUTF8Adaptor API. + "base::StringPiece", + # //third_party/liburlpattern 'liburlpattern::.+', diff --git a/blink/tools/blinkpy/style/checkers/cpp.py b/blink/tools/blinkpy/style/checkers/cpp.py index fd61efa6e1d8..e5711f52ee4c 100644 --- a/blink/tools/blinkpy/style/checkers/cpp.py +++ b/blink/tools/blinkpy/style/checkers/cpp.py @@ -1472,7 +1472,7 @@ def get_previous_non_blank_line(clean_lines, line_number): def check_ctype_functions(clean_lines, line_number, file_state, error): """Looks for use of the standard functions in ctype.h and suggest they be replaced - by use of equivalent ones in ?. + by use of equivalent ones in "wtf/text/ascii_ctype.h"?. Args: clean_lines: A CleansedLines instance containing the file. @@ -1493,9 +1493,9 @@ def check_ctype_functions(clean_lines, line_number, file_state, error): ctype_function = ctype_function_search.group('ctype_function') error( - line_number, 'runtime/ctype_function', 4, - 'Use equivalent function in instead of the %s() function.' - % (ctype_function)) + line_number, 'runtime/ctype_function', 4, 'Use equivalent function in ' + '"third_party/blink/renderer/platform/wtf/text/ascii_ctype.h" instead ' + 'of the %s() function.' % (ctype_function)) def replaceable_check(operator, macro, line): diff --git a/blink/tools/blinkpy/style/checkers/cpp_unittest.py b/blink/tools/blinkpy/style/checkers/cpp_unittest.py index eb535d4d1434..9fb656ca61f0 100644 --- a/blink/tools/blinkpy/style/checkers/cpp_unittest.py +++ b/blink/tools/blinkpy/style/checkers/cpp_unittest.py @@ -2486,9 +2486,10 @@ def test_using_std_swap_ignored(self): def test_ctype_fucntion(self): self.assert_lint( - 'int i = isascii(8);', - 'Use equivalent function in instead of the ' - 'isascii() function. [runtime/ctype_function] [4]', 'foo.cpp') + 'int i = isascii(8);', 'Use equivalent function in ' + '"third_party/blink/renderer/platform/wtf/text/ascii_ctype.h" ' + 'instead of the isascii() function. [runtime/ctype_function] [4]', + 'foo.cpp') def test_redundant_virtual(self): self.assert_lint( diff --git a/blink/web_tests/external/wpt/urlpattern/resources/urlpatterntestdata.json b/blink/web_tests/external/wpt/urlpattern/resources/urlpatterntestdata.json index 5ef045e560ac..6138b30cdf5e 100644 --- a/blink/web_tests/external/wpt/urlpattern/resources/urlpatterntestdata.json +++ b/blink/web_tests/external/wpt/urlpattern/resources/urlpatterntestdata.json @@ -2199,6 +2199,85 @@ "pathname": { "input": "/", "groups": {} } } }, + { + "pattern": [ "http://[:address]/" ], + "inputs": [ "http://[::1]/" ], + "exactly_empty_components": [ "username", "password", "port", "search", + "hash" ], + "expected_obj": { + "protocol": "http", + "hostname": "[:address]", + "pathname": "/" + }, + "expected_match": { + "protocol": { "input": "http", "groups": {} }, + "hostname": { "input": "[::1]", "groups": { "address": "::1" }}, + "pathname": { "input": "/", "groups": {} } + } + }, + { + "pattern": [ "http://[\\:\\:AB\\::num]/" ], + "inputs": [ "http://[::ab:1]/" ], + "exactly_empty_components": [ "username", "password", "port", "search", + "hash" ], + "expected_obj": { + "protocol": "http", + "hostname": "[\\:\\:ab\\::num]", + "pathname": "/" + }, + "expected_match": { + "protocol": { "input": "http", "groups": {} }, + "hostname": { "input": "[::ab:1]", "groups": { "num": "1" }}, + "pathname": { "input": "/", "groups": {} } + } + }, + { + "pattern": [{ "hostname": "[\\:\\:AB\\::num]" }], + "inputs": [{ "hostname": "[::ab:1]" }], + "expected_obj": { + "hostname": "[\\:\\:ab\\::num]" + }, + "expected_match": { + "hostname": { "input": "[::ab:1]", "groups": { "num": "1" }} + } + }, + { + "pattern": [{ "hostname": "[\\:\\:xY\\::num]" }], + "expected_obj": "error" + }, + { + "pattern": [{ "hostname": "{[\\:\\:ab\\::num]}" }], + "inputs": [{ "hostname": "[::ab:1]" }], + "expected_match": { + "hostname": { "input": "[::ab:1]", "groups": { "num": "1" }} + } + }, + { + "pattern": [{ "hostname": "{[\\:\\:fé\\::num]}" }], + "expected_obj": "error" + }, + { + "pattern": [{ "hostname": "{[\\:\\::num\\:1]}" }], + "inputs": [{ "hostname": "[::ab:1]" }], + "expected_match": { + "hostname": { "input": "[::ab:1]", "groups": { "num": "ab" }} + } + }, + { + "pattern": [{ "hostname": "{[\\:\\::num\\:fé]}" }], + "expected_obj": "error" + }, + { + "pattern": [{ "hostname": "[*\\:1]" }], + "inputs": [{ "hostname": "[::ab:1]" }], + "expected_match": { + "hostname": { "input": "[::ab:1]", "groups": { "0": "::ab" }} + } + }, + { + "pattern": [{ "hostname": "*\\:1]" }], + "expected_obj": "error" + }, { "pattern": [ "https://foo{{@}}example.com" ], "inputs": [ "https://foo@example.com" ],