diff --git a/src/hyperlink/_url.py b/src/hyperlink/_url.py index 4fb84133..be69baf6 100644 --- a/src/hyperlink/_url.py +++ b/src/hyperlink/_url.py @@ -183,7 +183,7 @@ def __nonzero__(self): _SCHEMELESS_PATH_DELIMS = _ALL_DELIMS - _SCHEMELESS_PATH_SAFE _FRAGMENT_SAFE = _UNRESERVED_CHARS | _PATH_SAFE | set(u"/?") _FRAGMENT_DELIMS = _ALL_DELIMS - _FRAGMENT_SAFE -_QUERY_VALUE_SAFE = _UNRESERVED_CHARS | _FRAGMENT_SAFE - set(u"&+") +_QUERY_VALUE_SAFE = _UNRESERVED_CHARS | _FRAGMENT_SAFE - set(u"&") _QUERY_VALUE_DELIMS = _ALL_DELIMS - _QUERY_VALUE_SAFE _QUERY_KEY_SAFE = _UNRESERVED_CHARS | _QUERY_VALUE_SAFE - set(u"=") _QUERY_KEY_DELIMS = _ALL_DELIMS - _QUERY_KEY_SAFE @@ -467,9 +467,13 @@ def _encode_userinfo_part(text, maximal=True): ) # As of Mar 11, 2017, there were 44 netloc schemes, and 13 non-netloc +NO_QUERY_PLUS_SCHEMES = set() -def register_scheme(text, uses_netloc=True, default_port=None): - # type: (Text, bool, Optional[int]) -> None + +def register_scheme( + text, uses_netloc=True, default_port=None, query_plus_is_space=True +): + # type: (Text, bool, Optional[int], bool) -> None """Registers new scheme information, resulting in correct port and slash behavior from the URL object. There are dozens of standard schemes preregistered, so this function is mostly meant for @@ -485,6 +489,8 @@ def register_scheme(text, uses_netloc=True, default_port=None): not. Defaults to True. default_port: The default port, if any, for netloc-using schemes. + query_plus_is_space: If true, a "+" in the query string should be + decoded as a space by DecodedURL. .. _file an issue: https://github.com/mahmoud/hyperlink/issues """ @@ -510,6 +516,9 @@ def register_scheme(text, uses_netloc=True, default_port=None): else: raise ValueError("uses_netloc expected bool, not: %r" % uses_netloc) + if not query_plus_is_space: + NO_QUERY_PLUS_SCHEMES.add(text) + return @@ -922,9 +931,9 @@ class URL(object): https://example.com/hello/world The constructor runs basic type checks. 
All strings are expected - to be decoded (:class:`unicode` in Python 2). All arguments are - optional, defaulting to appropriately empty values. A full list of - constructor arguments is below. + to be text (:class:`str` in Python 3, :class:`unicode` in Python 2). All + arguments are optional, defaulting to appropriately empty values. A full + list of constructor arguments is below. Args: scheme: The text name of the scheme. @@ -934,9 +943,9 @@ class URL(object): it is known. See the ``SCHEME_PORT_MAP`` and :func:`register_default_port` for more info. path: A tuple of strings representing the slash-separated parts of the - path. + path, each percent-encoded. query: The query parameters, as a dictionary or as an sequence of - key-value pairs. + percent-encoded key-value pairs. fragment: The fragment part of the URL. rooted: A rooted URL is one which indicates an absolute path. This is True on any URL that includes a host, or any relative URL @@ -1969,6 +1978,16 @@ def remove( _EMPTY_URL = URL() +def _replace_plus(text): + # type: (Text) -> Text + return text.replace("+", "%20") + + +def _no_op(text): + # type: (Text) -> Text + return text + + class DecodedURL(object): """ :class:`DecodedURL` is a type designed to act as a higher-level @@ -1998,6 +2017,9 @@ class DecodedURL(object): lazy: Set to True to avoid pre-decode all parts of the URL to check for validity. Defaults to False. + query_plus_is_space: Whether "+" characters in the query string should + be treated as spaces when decoding. If unspecified (None), the + default is taken from the URL's scheme. .. note:: @@ -2012,9 +2034,12 @@ class DecodedURL(object): .. 
versionadded:: 18.0.0 """ - def __init__(self, url=_EMPTY_URL, lazy=False): - # type: (URL, bool) -> None + def __init__(self, url=_EMPTY_URL, lazy=False, query_plus_is_space=None): + # type: (URL, bool, Optional[bool]) -> None self._url = url + if query_plus_is_space is None: + query_plus_is_space = url.scheme not in NO_QUERY_PLUS_SCHEMES + self._query_plus_is_space = query_plus_is_space if not lazy: # cache the following, while triggering any decoding # issues with decodable fields @@ -2022,8 +2047,8 @@ def __init__(self, url=_EMPTY_URL, lazy=False): return @classmethod - def from_text(cls, text, lazy=False): - # type: (Text, bool) -> DecodedURL + def from_text(cls, text, lazy=False, query_plus_is_space=None): + # type: (Text, bool, Optional[bool]) -> DecodedURL """\ Make a `DecodedURL` instance from any text string containing a URL. @@ -2034,7 +2059,7 @@ def from_text(cls, text, lazy=False): Defaults to True. """ _url = URL.from_text(text) - return cls(_url, lazy=lazy) + return cls(_url, lazy=lazy, query_plus_is_space=query_plus_is_space) @property def encoded_url(self): @@ -2059,6 +2084,14 @@ def to_iri(self): "Passthrough to :meth:`~hyperlink.URL.to_iri()`" return self._url.to_iri() + def _clone(self, url): + # type: (URL) -> DecodedURL + return self.__class__( + url, + # TODO: propagate laziness? 
+ query_plus_is_space=self._query_plus_is_space, + ) + def click(self, href=u""): # type: (Union[Text, URL, DecodedURL]) -> DecodedURL """Return a new DecodedURL wrapping the result of @@ -2066,7 +2099,9 @@ def click(self, href=u""): """ if isinstance(href, DecodedURL): href = href._url - return self.__class__(self._url.click(href=href)) + return self._clone( + self._url.click(href=href), + ) def sibling(self, segment): # type: (Text) -> DecodedURL @@ -2074,7 +2109,9 @@ def sibling(self, segment): return a new `DecodedURL` wrapping the result of :meth:`~hyperlink.URL.sibling()` """ - return self.__class__(self._url.sibling(_encode_reserved(segment))) + return self._clone( + self._url.sibling(_encode_reserved(segment)), + ) def child(self, *segments): # type: (Text) -> DecodedURL @@ -2085,7 +2122,7 @@ def child(self, *segments): if not segments: return self new_segs = [_encode_reserved(s) for s in segments] - return self.__class__(self._url.child(*new_segs)) + return self._clone(self._url.child(*new_segs)) def normalize( self, @@ -2101,7 +2138,7 @@ def normalize( """Return a new `DecodedURL` wrapping the result of :meth:`~hyperlink.URL.normalize()` """ - return self.__class__( + return self._clone( self._url.normalize( scheme, host, path, query, fragment, userinfo, percents ) @@ -2148,11 +2185,18 @@ def path(self): def query(self): # type: () -> QueryPairs if not hasattr(self, "_query"): + if self._query_plus_is_space: + predecode = _replace_plus + else: + predecode = _no_op + self._query = cast( QueryPairs, tuple( tuple( - _percent_decode(x, raise_subencoding_exc=True) + _percent_decode( + predecode(x), raise_subencoding_exc=True + ) if x is not None else None for x in (k, v) @@ -2248,7 +2292,7 @@ def replace( userinfo=userinfo_text, uses_netloc=uses_netloc, ) - return self.__class__(url=new_url) + return self._clone(url=new_url) def get(self, name): # type: (Text) -> List[Optional[Text]] diff --git a/src/hyperlink/test/test_decoded_url.py 
b/src/hyperlink/test/test_decoded_url.py index 7104bea5..235cd915 100644 --- a/src/hyperlink/test/test_decoded_url.py +++ b/src/hyperlink/test/test_decoded_url.py @@ -210,3 +210,19 @@ def test_click_decoded_url(self): assert clicked.host == durl.host assert clicked.path == durl_dest.path assert clicked.path == ("tëst",) + + def test_decode_plus(self): + # type: () -> None + durl = DecodedURL.from_text("/x+y%2B?a=b+c%2B") + assert durl.path == ("x+y+",) + assert durl.get("a") == ["b c+"] + assert durl.query == (("a", "b c+"),) + + def test_decode_nonplussed(self): + # type: () -> None + durl = DecodedURL.from_text( + "/x+y%2B?a=b+c%2B", query_plus_is_space=False + ) + assert durl.path == ("x+y+",) + assert durl.get("a") == ["b+c+"] + assert durl.query == (("a", "b+c+"),) diff --git a/src/hyperlink/test/test_scheme_registration.py b/src/hyperlink/test/test_scheme_registration.py index f98109a3..b43c91e3 100644 --- a/src/hyperlink/test/test_scheme_registration.py +++ b/src/hyperlink/test/test_scheme_registration.py @@ -5,7 +5,7 @@ from .. 
import _url from .common import HyperlinkTestCase -from .._url import register_scheme, URL +from .._url import register_scheme, URL, DecodedURL class TestSchemeRegistration(HyperlinkTestCase): @@ -70,3 +70,13 @@ def test_register_invalid_port(self): # type: () -> None with self.assertRaises(ValueError): register_scheme("nope", default_port=cast(bool, object())) + + def test_register_no_quote_plus_scheme(self): + # type: () -> None + register_scheme("keepplus", query_plus_is_space=False) + plus_is_not_space = DecodedURL.from_text( + "keepplus://example.com/?q=a+b" + ) + plus_is_space = DecodedURL.from_text("https://example.com/?q=a+b") + assert plus_is_not_space.get("q") == ["a+b"] + assert plus_is_space.get("q") == ["a b"] diff --git a/src/hyperlink/test/test_url.py b/src/hyperlink/test/test_url.py index 159d6a58..37c91726 100644 --- a/src/hyperlink/test/test_url.py +++ b/src/hyperlink/test/test_url.py @@ -133,6 +133,8 @@ "https://example.com/?a=%23", # hash in query param value "https://example.com/?a=%26", # ampersand in query param value "https://example.com/?a=%3D", # equals in query param value + "https://example.com/?foo+bar=baz", # plus in query param name + "https://example.com/?foo=bar+baz", # plus in query param value # double-encoded percent sign in all percent-encodable positions: "http://(%2525):(%2525)@example.com/(%2525)/?(%2525)=(%2525)#(%2525)", # colon in first part of schemeless relative url