Skip to content

Commit

Permalink
Add URL parsing tests from WHATWG (#3188)
Browse files Browse the repository at this point in the history
Co-authored-by: Kar Petrosyan <92274156+karpetrosyan@users.noreply.github.com>
  • Loading branch information
tomchristie and karpetrosyan authored Jun 13, 2024
1 parent 92e9dfb commit db9072f
Show file tree
Hide file tree
Showing 4 changed files with 9,819 additions and 23 deletions.
30 changes: 17 additions & 13 deletions httpx/_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,22 +253,27 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
)
validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
if has_authority:
if has_scheme or has_authority:
path = normalize_path(path)

# The GEN_DELIMS set is... : / ? # [ ] @
# These do not need to be percent-quoted unless they serve as delimiters for the
# specific component.
WHATWG_SAFE = '`{}%|^\\"'

# For 'path' we need to drop ? and # from the GEN_DELIMS set.
parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
# For 'query' we need to drop '#' from the GEN_DELIMS set.
parsed_query: str | None = (
None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
None
if query is None
else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
)
# For 'fragment' we can include all of the GEN_DELIMS set.
parsed_fragment: str | None = (
None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
None
if fragment is None
else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
)

# The parsed ASCII bytestrings are our canonical form.
Expand Down Expand Up @@ -321,7 +326,8 @@ def encode_host(host: str) -> str:
# From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
#
# reg-name = *( unreserved / pct-encoded / sub-delims )
return quote(host.lower(), safe=SUB_DELIMS)
WHATWG_SAFE = '"`{}%|\\'
return quote(host.lower(), safe=SUB_DELIMS + WHATWG_SAFE)

# IDNA hostnames
try:
Expand Down Expand Up @@ -369,19 +375,17 @@ def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
# must either be empty or begin with a slash ("/") character."
if path and not path.startswith("/"):
raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
else:

if not has_scheme and not has_authority:
# If a URI does not contain an authority component, then the path cannot begin
# with two slash characters ("//").
if path.startswith("//"):
raise InvalidURL(
"URLs with no authority component cannot have a path starting with '//'"
)
raise InvalidURL("Relative URLs cannot have a path starting with '//'")

# In addition, a URI reference (Section 4.1) may be a relative-path reference,
# in which case the first path segment cannot contain a colon (":") character.
if path.startswith(":") and not has_scheme:
raise InvalidURL(
"URLs with no scheme component cannot have a path starting with ':'"
)
if path.startswith(":"):
raise InvalidURL("Relative URLs cannot have a path starting with ':'")


def normalize_path(path: str) -> str:
Expand Down
14 changes: 4 additions & 10 deletions tests/models/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,8 @@ def test_url_normalized_host():


def test_url_percent_escape_host():
url = httpx.URL("https://exam%le.com/")
assert url.host == "exam%25le.com"
url = httpx.URL("https://exam le.com/")
assert url.host == "exam%20le.com"


def test_url_ipv4_like_host():
Expand Down Expand Up @@ -415,17 +415,11 @@ def test_urlparse_with_invalid_path():

with pytest.raises(httpx.InvalidURL) as exc:
httpx.URL(path="//abc")
assert (
str(exc.value)
== "URLs with no authority component cannot have a path starting with '//'"
)
assert str(exc.value) == "Relative URLs cannot have a path starting with '//'"

with pytest.raises(httpx.InvalidURL) as exc:
httpx.URL(path=":abc")
assert (
str(exc.value)
== "URLs with no scheme component cannot have a path starting with ':'"
)
assert str(exc.value) == "Relative URLs cannot have a path starting with ':'"


def test_url_with_relative_path():
Expand Down
52 changes: 52 additions & 0 deletions tests/models/test_whatwg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# The WHATWG have various tests that can be used to validate the URL parsing.
#
# https://url.spec.whatwg.org/

import json

import pytest

from httpx._urlparse import urlparse

# URL test cases from...
# https://github.com/web-platform-tests/wpt/blob/master/url/resources/urltestdata.json
# Load the WHATWG URL test data once, at import time, for parametrization.
#
# The file is read explicitly as UTF-8: JSON interchange text is UTF-8
# (RFC 8259), whereas `open()` otherwise falls back to a platform-dependent
# default encoding.
with open("tests/models/whatwg.json", "r", encoding="utf-8") as test_data:
    test_cases = json.load(test_data)

# Drop entries that are plain strings (presumably section comments in the
# fixture) and entries flagged with "failure", keeping only the cases that
# are expected to parse successfully.
test_cases = [
    item
    for item in test_cases
    if not isinstance(item, str) and not item.get("failure")
]


@pytest.mark.parametrize("test_case", test_cases)
def test_urlparse(test_case):
    """Check `urlparse` against a single WHATWG URL test case.

    The parsed components of `test_case["href"]` are converted into the
    representation used by the WHATWG test data (protocol, hostname, port,
    pathname, search, hash) and compared with the expected values.
    """
    if test_case["href"] in ("a: foo.com", "lolscheme:x x#x%20x"):
        # These two WHATWG cases do not use percent-encoding for the space
        # character, so their expected values differ from our output.
        # Report them as skipped rather than letting them silently pass.
        pytest.skip("WHATWG case does not percent-encode the space character.")

    p = urlparse(test_case["href"])

    # Test cases include the protocol with the trailing ":"
    protocol = p.scheme + ":"
    # Include the square brackets for IPv6 addresses.
    hostname = f"[{p.host}]" if ":" in p.host else p.host
    # The test cases use a string representation of the port.
    port = "" if p.port is None else str(p.port)
    # The path is compared as-is, with no conversion needed.
    path = p.path
    # The 'search' and 'hash' components in the whatwg tests are semantic, not literal.
    # Our parsing differentiates between no query/hash and empty-string query/hash.
    search = "" if p.query in (None, "") else "?" + str(p.query)
    # Named `fragment_hash` to avoid shadowing the builtin `hash`.
    fragment_hash = "" if p.fragment in (None, "") else "#" + str(p.fragment)

    # URL hostnames are case-insensitive.
    # We normalize these, unlike the WHATWG test cases.
    assert protocol == test_case["protocol"]
    assert hostname.lower() == test_case["hostname"].lower()
    assert port == test_case["port"]
    assert path == test_case["pathname"]
    assert search == test_case["search"]
    assert fragment_hash == test_case["hash"]
Loading

0 comments on commit db9072f

Please sign in to comment.