From 876dc2e99c00a048b4fd212db9021994792ed3fb Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Wed, 9 Jun 2021 13:32:02 +0800 Subject: [PATCH 01/17] setup mypy --- .github/workflows/build.yml | 3 +++ .gitignore | 1 + tox.ini | 7 +++++++ 3 files changed, 11 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2425ab35..0d607cf1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -30,6 +30,9 @@ jobs: - python-version: 3.9 env: TOXENV: black + - python-version: 3.9 + env: + TOXENV: typing steps: - uses: actions/checkout@v2 diff --git a/.gitignore b/.gitignore index a279e1af..9f81e120 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ _trial_temp .coverage coverage.xml .cache +.mypy_cache/ diff --git a/tox.ini b/tox.ini index 29e9145f..a954c1eb 100644 --- a/tox.ini +++ b/tox.ini @@ -22,6 +22,13 @@ deps = commands = bandit -r -c .bandit.yml {posargs:w3lib} +[testenv:typing] +basepython = python3 +deps = + mypy==0.901 +commands = + mypy --show-error-codes {posargs: w3lib tests} + [testenv:flake8] basepython = python3 deps = From 1357f4d3dd0d60bb88f886dcca73c6fa8d6c1c0c Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Wed, 16 Jun 2021 19:37:31 +0800 Subject: [PATCH 02/17] start to add typing --- .gitignore | 2 ++ mypy.ini | 12 ++++++++++++ run-mypy.sh | 5 +++++ w3lib/_types.py | 5 +++++ w3lib/encoding.py | 40 ++++++++++++++++++++++++---------------- w3lib/http.py | 24 +++++++++++++----------- 6 files changed, 61 insertions(+), 27 deletions(-) create mode 100644 mypy.ini create mode 100755 run-mypy.sh create mode 100644 w3lib/_types.py diff --git a/.gitignore b/.gitignore index 9f81e120..bccc4a7b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ _trial_temp coverage.xml .cache .mypy_cache/ +/index.txt +.dmypy.json diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..d4c7c859 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,12 @@ +[mypy] +exclude = .*flycheck_.* +show_error_codes = True +check_untyped_defs = True + +[mypy-w3lib.*] +# All non-tests functions must be typed. +disallow_untyped_defs = True + +[mypy-tests.*] +# Allow test functions to be untyped +disallow_untyped_defs = False diff --git a/run-mypy.sh b/run-mypy.sh new file mode 100755 index 00000000..ea3b1332 --- /dev/null +++ b/run-mypy.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +mypy --txt-report . 
w3lib tests diff --git a/w3lib/_types.py b/w3lib/_types.py new file mode 100644 index 00000000..84499a6c --- /dev/null +++ b/w3lib/_types.py @@ -0,0 +1,5 @@ +from typing import Union + +# the base class UnicodeError doesn't have attributes like start / end +AnyUnicodeError = Union[UnicodeEncodeError, UnicodeDecodeError] +StrOrBytes = Union[str, bytes] diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 1a231155..68357d12 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -3,11 +3,14 @@ """ import re, codecs, encodings from sys import version_info +from typing import Callable, Match, Optional, Tuple, Union, cast +from w3lib._types import AnyUnicodeError, StrOrBytes +from w3lib.util import to_native_str _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I) -def http_content_type_encoding(content_type): +def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]: """Extract the encoding in the content-type header >>> import w3lib.encoding @@ -21,6 +24,7 @@ def http_content_type_encoding(content_type): if match: return resolve_encoding(match.group(1)) + return None # regexp for parsing HTTP meta tags _TEMPLATE = r"""%s\s*=\s*["']?\s*%s\s*["']?""" @@ -51,7 +55,7 @@ def http_content_type_encoding(content_type): ) -def html_body_declared_encoding(html_body_str): +def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]: '''Return the encoding specified in meta tags in the html body, or ``None`` if no suitable encoding was found @@ -75,6 +79,7 @@ def html_body_declared_encoding(html_body_str): # html5 suggests the first 1024 bytes are sufficient, we allow for more chunk = html_body_str[:4096] + match: Union[Optional[Match[bytes]], Optional[Match[str]]] if isinstance(chunk, bytes): match = _BODY_ENCODING_BYTES_RE.search(chunk) else: @@ -87,7 +92,9 @@ def html_body_declared_encoding(html_body_str): or match.group("xmlcharset") ) if encoding: - return resolve_encoding(encoding) + return resolve_encoding(to_native_str(encoding)) + + return None # Default encoding translation @@ -117,8 +124,7 @@ def html_body_declared_encoding(html_body_str): "zh_cn": "gb18030", } - -def _c18n_encoding(encoding): +def _c18n_encoding(encoding: str) -> str: """Canonicalize an encoding name This performs normalization and translates aliases using python's @@ -128,7 +134,7 @@ def _c18n_encoding(encoding): return encodings.aliases.aliases.get(normed, normed) -def resolve_encoding(encoding_alias): +def resolve_encoding(encoding_alias: str) -> Optional[str]: """Return the encoding that `encoding_alias` maps to, or ``None`` if the encoding cannot be interpreted @@ -158,7 +164,7 @@ def resolve_encoding(encoding_alias): _FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE) -def read_bom(data): +def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]: r"""Read the byte order mark in the text, if present, and return the encoding represented by the BOM and the BOM. @@ -189,10 +195,10 @@ def read_bom(data): # Python decoder doesn't follow unicode standard when handling # bad utf-8 encoded strings. 
see http://bugs.python.org/issue8271 -codecs.register_error("w3lib_replace", lambda exc: ("\ufffd", exc.end)) +codecs.register_error('w3lib_replace', lambda exc: ('\ufffd', cast(AnyUnicodeError, exc).end)) -def to_unicode(data_str, encoding): +def to_unicode(data_str: bytes, encoding: str) -> str: """Convert a str object to unicode using the encoding given Characters that cannot be converted will be converted to ``\\ufffd`` (the @@ -203,9 +209,8 @@ def to_unicode(data_str, encoding): ) -def html_to_unicode( - content_type_header, html_body_str, default_encoding="utf8", auto_detect_fun=None -): +def html_to_unicode(content_type_header: Optional[str], html_body_str: bytes, + default_encoding: str = 'utf8', auto_detect_fun: Optional[Callable[[bytes], str]] = None) -> Tuple[str, str]: r'''Convert raw html bytes to unicode This attempts to make a reasonable guess at the content encoding of the @@ -273,18 +278,21 @@ def html_to_unicode( if enc is not None: # remove BOM if it agrees with the encoding if enc == bom_enc: - html_body_str = html_body_str[len(bom) :] - elif enc == "utf-16" or enc == "utf-32": + bom = cast(bytes, bom) + html_body_str = html_body_str[len(bom):] + elif enc == 'utf-16' or enc == 'utf-32': # read endianness from BOM, or default to big endian # tools.ietf.org/html/rfc2781 section 4.3 if bom_enc is not None and bom_enc.startswith(enc): enc = bom_enc - html_body_str = html_body_str[len(bom) :] + bom = cast(bytes, bom) + html_body_str = html_body_str[len(bom):] else: enc += "-be" return enc, to_unicode(html_body_str, enc) if bom_enc is not None: - return bom_enc, to_unicode(html_body_str[len(bom) :], bom_enc) + bom = cast(bytes, bom) + return bom_enc, to_unicode(html_body_str[len(bom):], bom_enc) enc = html_body_declared_encoding(html_body_str) if enc is None and (auto_detect_fun is not None): enc = auto_detect_fun(html_body_str) diff --git a/w3lib/http.py b/w3lib/http.py index f3793922..17ab1b65 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,7 +1,11 @@ from base64 import urlsafe_b64encode +from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping +from w3lib.util import to_bytes +HeadersDictInput = Mapping[bytes, Union[Any, Sequence]] +HeadersDictOutput = MutableMapping[bytes, List[bytes]] -def headers_raw_to_dict(headers_raw): +def headers_raw_to_dict(headers_raw: Optional[bytes]) -> Optional[HeadersDictOutput]: r""" Convert raw headers (single multi-line bytestring) to a dictionary. 
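A note on the `w3lib_replace` handler registered in encoding.py above: typeshed types a codecs error handler's argument as the base UnicodeError, which declares none of the `.object`/`.start`/`.end` attributes, hence the `cast` to the `AnyUnicodeError` union from w3lib/_types.py. A minimal standalone sketch of the same pattern (the handler name `demo_replace` is invented for illustration):

    import codecs
    from typing import Tuple, Union, cast

    AnyUnicodeError = Union[UnicodeEncodeError, UnicodeDecodeError]

    def demo_replace(exc: UnicodeError) -> Tuple[str, int]:
        # the base UnicodeError has no .end in the stubs, hence the cast
        exc = cast(AnyUnicodeError, exc)
        return ("\ufffd", exc.end)

    codecs.register_error("demo_replace", demo_replace)
    print(b"\xffabc".decode("utf-8", errors="demo_replace"))  # '\ufffdabc'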
@@ -30,7 +34,7 @@ def headers_raw_to_dict(headers_raw): headers = headers_raw.splitlines() headers_tuples = [header.split(b":", 1) for header in headers] - result_dict = {} + result_dict: HeadersDictOutput = {} for header_item in headers_tuples: if not len(header_item) == 2: continue @@ -46,7 +50,7 @@ def headers_raw_to_dict(headers_raw): return result_dict -def headers_dict_to_raw(headers_dict): +def headers_dict_to_raw(headers_dict: Optional[HeadersDictInput]) -> Optional[bytes]: r""" Returns a raw HTTP headers representation of headers @@ -78,7 +82,7 @@ def headers_dict_to_raw(headers_dict): return b"\r\n".join(raw_lines) -def basic_auth_header(username, password, encoding="ISO-8859-1"): +def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8859-1') -> bytes: """ Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_ @@ -90,10 +94,8 @@ def basic_auth_header(username, password, encoding="ISO-8859-1"): """ - auth = "%s:%s" % (username, password) - if not isinstance(auth, bytes): - # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 - # seems to be the most widely used encoding here. See also: - # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html - auth = auth.encode(encoding) - return b"Basic " + urlsafe_b64encode(auth) + auth = "%r:%r" % (username, password) + # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 + # seems to be the most widely used encoding here. See also: + # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html + return b'Basic ' + urlsafe_b64encode(to_bytes(auth, encoding=encoding)) From 83e79ac868889679bf7c2f47c38e5b470f7031ee Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Tue, 22 Jun 2021 20:25:33 +0800 Subject: [PATCH 03/17] add more type hints --- w3lib/html.py | 41 +++++++++++-------------- w3lib/url.py | 85 ++++++++++++++++++++++++++------------------------- w3lib/util.py | 20 +++++------- 3 files changed, 68 insertions(+), 78 deletions(-) diff --git a/w3lib/html.py b/w3lib/html.py index 2bea60c9..347cb3aa 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -4,29 +4,22 @@ import re from html.entities import name2codepoint +from typing import Match, Sequence, AnyStr from urllib.parse import urljoin from w3lib.util import to_unicode from w3lib.url import safe_url_string -_ent_re = re.compile( - r"&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)", - re.IGNORECASE, -) -_tag_re = re.compile(r"<[a-zA-Z\/!].*?>", re.DOTALL) -_baseurl_re = re.compile(r"]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']", re.I) -_meta_refresh_re = re.compile( - r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)', - re.DOTALL | re.IGNORECASE, -) -_cdata_re = re.compile( - r"((?P.*?)(?P\]\]>))", re.DOTALL -) +_ent_re = re.compile(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)', re.IGNORECASE) +_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL) +_baseurl_re = re.compile(r']*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I) +_meta_refresh_re = re.compile(r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)', re.DOTALL | re.IGNORECASE) +_cdata_re = re.compile(r'((?P.*?)(?P\]\]>))', re.DOTALL) -HTML5_WHITESPACE = " \t\n\r\x0c" +HTML5_WHITESPACE = ' \t\n\r\x0c' -def replace_entities(text, keep=(), remove_illegal=True, encoding="utf-8"): +def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str ='utf-8'): """Remove entities from the 
given `text` by converting them to their corresponding unicode character. @@ -54,14 +47,15 @@ def replace_entities(text, keep=(), remove_illegal=True, encoding="utf-8"): """ - def convert_entity(m): + def convert_entity(m: Match): groups = m.groupdict() - if groups.get("dec"): - number = int(groups["dec"], 10) - elif groups.get("hex"): - number = int(groups["hex"], 16) - elif groups.get("named"): - entity_name = groups["named"] + number = None + if groups.get('dec'): + number = int(groups['dec'], 10) + elif groups.get('hex'): + number = int(groups['hex'], 16) + elif groups.get('named'): + entity_name = groups['named'] if entity_name.lower() in keep: return m.group(0) else: @@ -85,8 +79,7 @@ def convert_entity(m): return _ent_re.sub(convert_entity, to_unicode(text, encoding)) - -def has_entities(text, encoding=None): +def has_entities(text: AnyStr, encoding=None): return bool(_ent_re.search(to_unicode(text, encoding))) diff --git a/w3lib/url.py b/w3lib/url.py index 9a39c98f..77b2fcec 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -9,8 +9,8 @@ import re import string from collections import namedtuple +from typing import Callable, Optional, Sequence, Tuple, Union, cast, Dict from urllib.parse import ( - _coerce_args, parse_qs, parse_qsl, ParseResult, @@ -23,13 +23,16 @@ urlunparse, urlunsplit, ) +from urllib.parse import _coerce_args # type: ignore from urllib.request import pathname2url, url2pathname -from w3lib.util import to_unicode +from w3lib.util import to_bytes, to_native_str, to_unicode +from w3lib._types import AnyUnicodeError, StrOrBytes # error handling function for bytes-to-Unicode decoding errors with URLs -def _quote_byte(error): - return (quote(error.object[error.start : error.end]), error.end) +def _quote_byte(error: UnicodeError) -> Tuple[str, int]: + error = cast(AnyUnicodeError, error) + return (to_unicode(quote(error.object[error.start:error.end])), error.end) codecs.register_error("percentencode", _quote_byte) @@ -49,7 +52,7 @@ def _quote_byte(error): ) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline -def safe_url_string(url, encoding="utf8", path_encoding="utf8", quote_path=True): +def safe_url_string(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8', quote_path: bool = True) -> str: """Convert the given URL into a legal URL by escaping unsafe characters according to RFC-3986. Also, ASCII tabs and newlines are removed as per https://url.spec.whatwg.org/#url-parsing. @@ -83,7 +86,7 @@ def safe_url_string(url, encoding="utf8", path_encoding="utf8", quote_path=True) try: netloc = parts.netloc.encode("idna").decode() except UnicodeError: - netloc = parts.netloc + netloc = parts.netloc.encode('utf-8') # default encoding for path component SHOULD be UTF-8 if quote_path: @@ -105,8 +108,8 @@ def safe_url_string(url, encoding="utf8", path_encoding="utf8", quote_path=True) _parent_dirs = re.compile(r"/?(\.\./)+") -def safe_download_url(url, encoding="utf8", path_encoding="utf8"): - """Make a url for download. This will call safe_url_string +def safe_download_url(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8') -> str: + """ Make a url for download. This will call safe_url_string and then strip the fragment, if one exists. The path will be normalised. 
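On `StrOrBytes` versus `AnyStr`, both of which appear in these patches: `StrOrBytes = Union[str, bytes]` accepts either type independently of the return type, while `AnyStr` is a constrained TypeVar that couples the types it annotates. That is why functions that always return `str` (such as `safe_url_string`) take `StrOrBytes`, while `basic_auth_header` uses `AnyStr` to force its two text arguments to the same type. A sketch of the difference; `to_str` and `echo` are illustrative toys, not w3lib functions:

    from typing import AnyStr, Union

    StrOrBytes = Union[str, bytes]

    def to_str(text: StrOrBytes, encoding: str = "utf-8") -> str:
        # either input type is accepted; the result is always str
        return text.decode(encoding) if isinstance(text, bytes) else text

    def echo(text: AnyStr) -> AnyStr:
        # AnyStr couples output to input: bytes in, bytes out
        return text

    assert to_str(b"caf\xc3\xa9") == to_str("café") == "café"
    assert echo(b"x") == b"x" and echo("x") == "x"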
@@ -124,11 +127,11 @@ def safe_download_url(url, encoding="utf8", path_encoding="utf8"): return urlunsplit((scheme, netloc, path, query, "")) -def is_url(text): - return text.partition("://")[0] in ("file", "http", "https") +def is_url(text: str) -> bool: + return text.partition("://")[0] in ('file', 'http', 'https') -def url_query_parameter(url, parameter, default=None, keep_blank_values=0): +def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int]=0) -> Optional[str]: """Return the value of a url parameter, given the url and parameter name General case: @@ -157,19 +160,17 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0): """ - queryparams = parse_qs(urlsplit(str(url))[3], keep_blank_values=keep_blank_values) - return queryparams.get(parameter, [default])[0] + queryparams = parse_qs( + urlsplit(str(url))[3], + keep_blank_values=bool(keep_blank_values) + ) + if parameter in queryparams: + return queryparams[parameter][0] + else: + return default -def url_query_cleaner( - url, - parameterlist=(), - sep="&", - kvsep="=", - remove=False, - unique=True, - keep_fragments=False, -): +def url_query_cleaner(url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str: """Clean URL arguments leaving only those passed in the parameterlist keeping order >>> import w3lib.url @@ -204,7 +205,9 @@ def url_query_cleaner( if isinstance(parameterlist, (str, bytes)): parameterlist = [parameterlist] url, fragment = urldefrag(url) - base, _, query = url.partition("?") + url = cast(str, url) + fragment = cast(str, fragment) + base, _, query = url.partition('?') seen = set() querylist = [] for ksv in query.split(sep): @@ -222,11 +225,10 @@ def url_query_cleaner( seen.add(k) url = "?".join([base, sep.join(querylist)]) if querylist else base if keep_fragments: - url += "#" + fragment - return url + url += '#' + fragment + return cast(str, url) - -def _add_or_replace_parameters(url, params): +def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str: parsed = urlsplit(url) current_args = parse_qsl(parsed.query, keep_blank_values=True) @@ -248,7 +250,7 @@ def _add_or_replace_parameters(url, params): return urlunsplit(parsed._replace(query=query)) -def add_or_replace_parameter(url, name, new_value): +def add_or_replace_parameter(url: str, name: str, new_value: str) -> str: """Add or remove a parameter to a given url >>> import w3lib.url @@ -264,7 +266,7 @@ def add_or_replace_parameter(url, name, new_value): return _add_or_replace_parameters(url, {name: new_value}) -def add_or_replace_parameters(url, new_parameters): +def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str: """Add or remove a parameters to a given url >>> import w3lib.url @@ -279,7 +281,7 @@ def add_or_replace_parameters(url, new_parameters): return _add_or_replace_parameters(url, new_parameters) -def path_to_file_uri(path): +def path_to_file_uri(path: str) -> str: """Convert local filesystem path to legal File URIs as described in: http://en.wikipedia.org/wiki/File_URI_scheme """ @@ -289,7 +291,7 @@ def path_to_file_uri(path): return "file:///%s" % x.lstrip("/") -def file_uri_to_path(uri): +def file_uri_to_path(uri: str) -> str: """Convert File URI to local filesystem path according to: http://en.wikipedia.org/wiki/File_URI_scheme """ @@ -297,7 +299,7 @@ def file_uri_to_path(uri): return 
url2pathname(uri_path) -def any_to_uri(uri_or_path): +def any_to_uri(uri_or_path: str) -> str: """If given a path name, return its File URI, otherwise return it unmodified """ @@ -342,12 +344,11 @@ def any_to_uri(uri_or_path): ).encode() ) -_ParseDataURIResult = namedtuple( - "ParseDataURIResult", "media_type media_type_parameters data" -) +_ParseDataURIResult = namedtuple("_ParseDataURIResult", + "media_type media_type_parameters data") -def parse_data_uri(uri): +def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: """ Parse a data: URI, returning a 3-tuple of media type, dictionary of media @@ -389,7 +390,7 @@ def parse_data_uri(uri): if m: attribute, value, value_quoted = m.groups() if value_quoted: - value = re.sub(br"\\(.)", r"\1", value_quoted) + value = re.sub(br'\\(.)', rb'\1', value_quoted) media_type_params[attribute.decode()] = value.decode() uri = uri[m.end() :] else: @@ -477,9 +478,8 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, encoding # if not for proper URL expected by remote website. try: scheme, netloc, path, params, query, fragment = _safe_ParseResult( - parse_url(url), encoding=encoding or "utf8" - ) - except UnicodeEncodeError as e: + parse_url(url), encoding=encoding or 'utf8') + except UnicodeEncodeError: scheme, netloc, path, params, query, fragment = _safe_ParseResult( parse_url(url), encoding="utf8" ) @@ -570,8 +570,9 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False): # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a) # except for the unquote(s, encoding, errors) calls replaced # with unquote_to_bytes(s) - qs, _coerce_result = _coerce_args(qs) - pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")] + coerce_args = cast(Callable[..., Tuple[str, Callable]], _coerce_args) + qs, _coerce_result = coerce_args(qs) + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] r = [] for name_value in pairs: if not name_value: diff --git a/w3lib/util.py b/w3lib/util.py index db8e16e8..4258b76b 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -1,13 +1,9 @@ from warnings import warn +from typing import Optional +from w3lib._types import StrOrBytes -def str_to_unicode(text, encoding=None, errors="strict"): - warn( - "The w3lib.utils.str_to_unicode function is deprecated and " - "will be removed in a future release.", - DeprecationWarning, - stacklevel=2, - ) +def str_to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str ='strict') -> str: if encoding is None: encoding = "utf-8" if isinstance(text, bytes): @@ -15,7 +11,7 @@ def str_to_unicode(text, encoding=None, errors="strict"): return text -def unicode_to_str(text, encoding=None, errors="strict"): +def unicode_to_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes: warn( "The w3lib.utils.unicode_to_str function is deprecated and " "will be removed in a future release.", @@ -29,7 +25,7 @@ def unicode_to_str(text, encoding=None, errors="strict"): return text -def to_unicode(text, encoding=None, errors="strict"): +def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str: """Return the unicode representation of a bytes object `text`. 
If `text` is already an unicode object, return it as-is.""" if isinstance(text, str): @@ -43,7 +39,7 @@ def to_unicode(text, encoding=None, errors="strict"): return text.decode(encoding, errors) -def to_bytes(text, encoding=None, errors="strict"): +def to_bytes(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes: """Return the binary representation of `text`. If `text` is already a bytes object, return it as-is.""" if isinstance(text, bytes): @@ -57,8 +53,8 @@ def to_bytes(text, encoding=None, errors="strict"): return text.encode(encoding, errors) -def to_native_str(text, encoding=None, errors="strict"): - """Return str representation of `text`""" +def to_native_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str: + """ Return str representation of `text` """ warn( "The w3lib.utils.to_native_str function is deprecated and " "will be removed in a future release. Please use " From c68b0662b13bc3bd62b9e281e3c66f6b7ee7ca44 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Fri, 2 Jul 2021 11:44:16 +0800 Subject: [PATCH 04/17] more type hints --- tests/test_encoding.py | 8 +++++--- tests/test_http.py | 18 ++++++++++++------ tests/test_url.py | 35 +++++++++++++++-------------------- 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index b9e78922..3be7d796 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -38,14 +38,16 @@ def test_bom(self): utf32le = b"\xff\xfe\x00\x00\x34\x6c\x00\x00" for string in (utf16be, utf16le, utf32be, utf32le): bom_encoding, bom = read_bom(string) - decoded = string[len(bom) :].decode(bom_encoding) + assert bom_encoding is not None + assert bom is not None + decoded = string[len(bom):].decode(bom_encoding) self.assertEqual(water_unicode, decoded) # Body without BOM - enc, bom = read_bom("foo") + enc, bom = read_bom(b"foo") self.assertEqual(enc, None) self.assertEqual(bom, None) # Empty body - enc, bom = read_bom("") + enc, bom = read_bom(b"") self.assertEqual(enc, None) self.assertEqual(bom, None) diff --git a/tests/test_http.py b/tests/test_http.py index 127f4de9..fc59ae11 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -1,6 +1,7 @@ import unittest from collections import OrderedDict -from w3lib.http import basic_auth_header, headers_dict_to_raw, headers_raw_to_dict +from w3lib.http import (HeadersDictInput, basic_auth_header, + headers_dict_to_raw, headers_raw_to_dict) __doctests__ = ["w3lib.http"] # for trial support @@ -47,7 +48,10 @@ def test_headers_dict_to_raw(self): ) def test_headers_dict_to_raw_listtuple(self): - dct = OrderedDict([(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])]) + dct: HeadersDictInput = OrderedDict([ + (b'Content-type', [b'text/html']), + (b'Accept', [b'gzip']) + ]) self.assertEqual( headers_dict_to_raw(dct), b"Content-type: text/html\r\nAccept: gzip" ) @@ -70,10 +74,12 @@ def test_headers_dict_to_raw_listtuple(self): ) def test_headers_dict_to_raw_wrong_values(self): - dct = OrderedDict( - [ - (b"Content-type", 0), - ] + dct: HeadersDictInput = OrderedDict([ + (b'Content-type', 0), + ]) + self.assertEqual( + headers_dict_to_raw(dct), + b'' ) self.assertEqual(headers_dict_to_raw(dct), b"") diff --git a/tests/test_url.py b/tests/test_url.py index edd816c6..1b02d5be 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -506,10 +506,10 @@ def test_add_or_replace_parameters(self): ) def test_add_or_replace_parameters_does_not_change_input_param(self): - url = 
"http://domain/test?arg=original" - input_param = {"arg": "value"} - new_url = add_or_replace_parameters(url, input_param) # noqa - self.assertEqual(input_param, {"arg": "value"}) + url = 'http://domain/test?arg=original' + input_param = {'arg': 'value'} + add_or_replace_parameters(url, input_param) # noqa + self.assertEqual(input_param, {'arg': 'value'}) def test_url_query_cleaner(self): self.assertEqual("product.html", url_query_cleaner("product.html?")) @@ -814,22 +814,17 @@ def test_normalize_percent_encoding_in_query_arguments(self): ) def test_non_ascii_percent_encoding_in_paths(self): - self.assertEqual( - canonicalize_url("http://www.example.com/a do?a=1"), - "http://www.example.com/a%20do?a=1", - ), - self.assertEqual( - canonicalize_url("http://www.example.com/a %20do?a=1"), - "http://www.example.com/a%20%20do?a=1", - ), - self.assertEqual( - canonicalize_url("http://www.example.com/a do£.html?a=1"), - "http://www.example.com/a%20do%C2%A3.html?a=1", - ) - self.assertEqual( - canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"), - "http://www.example.com/a%20do%C2%A3.html?a=1", - ) + self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"), + "http://www.example.com/a%20do?a=1") + + self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"), + "http://www.example.com/a%20%20do?a=1") + + self.assertEqual(canonicalize_url("http://www.example.com/a do£.html?a=1"), + "http://www.example.com/a%20do%C2%A3.html?a=1") + + self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"), + "http://www.example.com/a%20do%C2%A3.html?a=1") def test_non_ascii_percent_encoding_in_query_arguments(self): self.assertEqual( From 8cd9cf137dbc6d465ff3c890f7172276c355d93c Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Fri, 2 Jul 2021 12:14:23 +0800 Subject: [PATCH 05/17] flake8 --- w3lib/encoding.py | 2 +- w3lib/html.py | 2 +- w3lib/http.py | 1 + w3lib/url.py | 9 +++++---- w3lib/util.py | 21 +++++++++++++++------ 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 68357d12..a9ffef99 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -3,7 +3,7 @@ """ import re, codecs, encodings from sys import version_info -from typing import Callable, Match, Optional, Tuple, Union, cast +from typing import Callable, Match, Optional, Tuple, Union, cast from w3lib._types import AnyUnicodeError, StrOrBytes from w3lib.util import to_native_str diff --git a/w3lib/html.py b/w3lib/html.py index 347cb3aa..395e9d21 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -19,7 +19,7 @@ HTML5_WHITESPACE = ' \t\n\r\x0c' -def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str ='utf-8'): +def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8'): """Remove entities from the given `text` by converting them to their corresponding unicode character. 
diff --git a/w3lib/http.py b/w3lib/http.py index 17ab1b65..4d86ce51 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -5,6 +5,7 @@ HeadersDictInput = Mapping[bytes, Union[Any, Sequence]] HeadersDictOutput = MutableMapping[bytes, List[bytes]] + def headers_raw_to_dict(headers_raw: Optional[bytes]) -> Optional[HeadersDictOutput]: r""" Convert raw headers (single multi-line bytestring) diff --git a/w3lib/url.py b/w3lib/url.py index 77b2fcec..1b97647f 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -52,7 +52,7 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]: ) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline -def safe_url_string(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8', quote_path: bool = True) -> str: +def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str = 'utf8', quote_path: bool = True) -> str: """Convert the given URL into a legal URL by escaping unsafe characters according to RFC-3986. Also, ASCII tabs and newlines are removed as per https://url.spec.whatwg.org/#url-parsing. @@ -108,7 +108,7 @@ def safe_url_string(url: StrOrBytes, encoding: str ='utf8', path_encoding: str = _parent_dirs = re.compile(r"/?(\.\./)+") -def safe_download_url(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8') -> str: +def safe_download_url(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str = 'utf8') -> str: """ Make a url for download. This will call safe_url_string and then strip the fragment, if one exists. The path will be normalised. @@ -131,7 +131,7 @@ def is_url(text: str) -> bool: return text.partition("://")[0] in ('file', 'http', 'https') -def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int]=0) -> Optional[str]: +def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int] = 0) -> Optional[str]: """Return the value of a url parameter, given the url and parameter name General case: @@ -170,7 +170,8 @@ def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] return default -def url_query_cleaner(url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str: +def url_query_cleaner( + url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str: """Clean URL arguments leaving only those passed in the parameterlist keeping order >>> import w3lib.url diff --git a/w3lib/util.py b/w3lib/util.py index 4258b76b..315d968a 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -3,15 +3,18 @@ from w3lib._types import StrOrBytes -def str_to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str ='strict') -> str: +def str_to_unicode( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> str: if encoding is None: encoding = "utf-8" if isinstance(text, bytes): return text.decode(encoding, errors) return text - -def unicode_to_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes: +def unicode_to_str( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> bytes: warn( "The w3lib.utils.unicode_to_str function is deprecated and " "will be removed in a future release.", @@ -25,7 +28,9 @@ def unicode_to_str(text: 
StrOrBytes, encoding: Optional[str] = None, errors: str return text -def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str: +def to_unicode( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> str: """Return the unicode representation of a bytes object `text`. If `text` is already an unicode object, return it as-is.""" if isinstance(text, str): @@ -39,7 +44,9 @@ def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = ' return text.decode(encoding, errors) -def to_bytes(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes: +def to_bytes( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> bytes: """Return the binary representation of `text`. If `text` is already a bytes object, return it as-is.""" if isinstance(text, bytes): @@ -53,7 +60,9 @@ def to_bytes(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'st return text.encode(encoding, errors) -def to_native_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str: +def to_native_str( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> str: """ Return str representation of `text` """ warn( "The w3lib.utils.to_native_str function is deprecated and " From 43764623aa4e2c2fe1bb60b564409a186b3ca703 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Mon, 2 Aug 2021 14:24:38 +0800 Subject: [PATCH 06/17] more type hints --- .gitignore | 1 + w3lib/html.py | 78 ++++++++++++++++++++++++--------------------------- w3lib/http.py | 4 +-- w3lib/url.py | 43 ++++++++++++++-------------- 4 files changed, 61 insertions(+), 65 deletions(-) diff --git a/.gitignore b/.gitignore index bccc4a7b..714a9be8 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ coverage.xml .mypy_cache/ /index.txt .dmypy.json +.hypothesis/ diff --git a/w3lib/html.py b/w3lib/html.py index 395e9d21..bdbf7b75 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -4,7 +4,7 @@ import re from html.entities import name2codepoint -from typing import Match, Sequence, AnyStr +from typing import Iterable, Match, AnyStr, Optional, Pattern, Tuple, Union from urllib.parse import urljoin from w3lib.util import to_unicode @@ -19,7 +19,7 @@ HTML5_WHITESPACE = ' \t\n\r\x0c' -def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8'): +def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8') -> str: """Remove entities from the given `text` by converting them to their corresponding unicode character. @@ -47,7 +47,7 @@ def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: boo """ - def convert_entity(m: Match): + def convert_entity(m: Match) -> str: groups = m.groupdict() number = None if groups.get('dec'): @@ -79,11 +79,11 @@ def convert_entity(m: Match): return _ent_re.sub(convert_entity, to_unicode(text, encoding)) -def has_entities(text: AnyStr, encoding=None): +def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool: return bool(_ent_re.search(to_unicode(text, encoding))) -def replace_tags(text, token="", encoding=None): +def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None) -> str: """Replace all markup tags found in the given `text` by the given token. By default `token` is an empty string so it just removes all tags. 
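On the switch to `keep: Iterable[str]` and `which_ones: Iterable[str]` in the html.py signatures below: taking the loosest container type the body actually needs lets callers pass tuples, lists, sets or generators, while the return type stays the precise `str`. A toy sketch of the convention; `remove_tags_like` is illustrative only and far simpler than the real `remove_tags`:

    from typing import Iterable

    def remove_tags_like(text: str, which_ones: Iterable[str] = ()) -> str:
        for tag in which_ones:
            text = text.replace("<%s>" % tag, "").replace("</%s>" % tag, "")
        return text

    # any iterable is accepted
    print(remove_tags_like("<b>hi</b>", {"b"}))                # hi
    print(remove_tags_like("<i>hi</i>", (t for t in ["i"])))   # hi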
@@ -106,11 +106,11 @@ def replace_tags(text, token="", encoding=None): return _tag_re.sub(token, to_unicode(text, encoding)) -_REMOVECOMMENTS_RE = re.compile("|$)", re.DOTALL) +_REMOVECOMMENTS_RE = re.compile('|$)', re.DOTALL) -def remove_comments(text, encoding=None): - """Remove HTML Comments. +def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str: + """ Remove HTML Comments. >>> import w3lib.html >>> w3lib.html.remove_comments(b"test whatever") @@ -119,12 +119,11 @@ def remove_comments(text, encoding=None): """ - text = to_unicode(text, encoding) - return _REMOVECOMMENTS_RE.sub("", text) + utext = to_unicode(text, encoding) + return _REMOVECOMMENTS_RE.sub('', utext) - -def remove_tags(text, which_ones=(), keep=(), encoding=None): - """Remove HTML Tags only. +def remove_tags(text: AnyStr, which_ones: Iterable[str] = (), keep: Iterable[str] = (), encoding: Optional[str] = None) -> str: + """ Remove HTML Tags only. `which_ones` and `keep` are both tuples, there are four cases: @@ -173,14 +172,14 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None): which_ones = {tag.lower() for tag in which_ones} keep = {tag.lower() for tag in keep} - def will_remove(tag): + def will_remove(tag: str) -> bool: tag = tag.lower() if which_ones: return tag in which_ones else: return tag not in keep - def remove_tag(m): + def remove_tag(m: Match) -> str: tag = m.group(1) return "" if will_remove(tag) else m.group(0) @@ -190,7 +189,7 @@ def remove_tag(m): return retags.sub(remove_tag, to_unicode(text, encoding)) -def remove_tags_with_content(text, which_ones=(), encoding=None): +def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None) -> str: """Remove tags and their content. `which_ones` is a tuple of which tags to remove including their content. @@ -204,19 +203,18 @@ def remove_tags_with_content(text, which_ones=(), encoding=None): """ - text = to_unicode(text, encoding) + utext = to_unicode(text, encoding) if which_ones: tags = "|".join( [r"<%s\b.*?|<%s\s*/>" % (tag, tag, tag) for tag in which_ones] ) retags = re.compile(tags, re.DOTALL | re.IGNORECASE) - text = retags.sub("", text) - return text + utext = retags.sub('', utext) + return utext -def replace_escape_chars( - text, which_ones=("\n", "\t", "\r"), replace_by="", encoding=None -): +def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: str = '', \ + encoding: Optional[str] = None) -> str: """Remove escape characters. `which_ones` is a tuple of which escape characters we want to remove. 
@@ -227,13 +225,13 @@ def replace_escape_chars( """ - text = to_unicode(text, encoding) + utext = to_unicode(text, encoding) for ec in which_ones: - text = text.replace(ec, to_unicode(replace_by, encoding)) - return text + utext = utext.replace(ec, to_unicode(replace_by, encoding)) + return utext -def unquote_markup(text, keep=(), remove_illegal=True, encoding=None): +def unquote_markup(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: Optional[str] = None) -> str: """ This function receives markup as a text (always a unicode string or a UTF-8 encoded string) and does the following: @@ -245,7 +243,7 @@ def unquote_markup(text, keep=(), remove_illegal=True, encoding=None): """ - def _get_fragments(txt, pattern): + def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]: offset = 0 for match in pattern.finditer(txt): match_s, match_e = match.span(1) @@ -254,9 +252,9 @@ def _get_fragments(txt, pattern): offset = match_e yield txt[offset:] - text = to_unicode(text, encoding) - ret_text = "" - for fragment in _get_fragments(text, _cdata_re): + utext = to_unicode(text, encoding) + ret_text = '' + for fragment in _get_fragments(utext, _cdata_re): if isinstance(fragment, str): # it's not a CDATA (so we try to remove its entities) ret_text += replace_entities( @@ -268,7 +266,7 @@ def _get_fragments(txt, pattern): return ret_text -def get_base_url(text, baseurl="", encoding="utf-8"): +def get_base_url(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8') -> str: """Return the base url if declared in the given HTML `text`, relative to the given base url. @@ -276,8 +274,8 @@ def get_base_url(text, baseurl="", encoding="utf-8"): """ - text = to_unicode(text, encoding) - m = _baseurl_re.search(text) + utext = to_unicode(text, encoding) + m = _baseurl_re.search(utext) if m: return urljoin( safe_url_string(baseurl), safe_url_string(m.group(1), encoding=encoding) @@ -286,9 +284,7 @@ def get_base_url(text, baseurl="", encoding="utf-8"): return safe_url_string(baseurl) -def get_meta_refresh( - text, baseurl="", encoding="utf-8", ignore_tags=("script", "noscript") -): +def get_meta_refresh(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8', ignore_tags: Iterable[str] = ('script', 'noscript')) -> Tuple[Optional[float], Optional[str]]: """Return the http-equiv parameter of the HTML meta element from the given HTML text and return a tuple ``(interval, url)`` where interval is an integer containing the delay in seconds (or zero if not present) and url is a @@ -299,13 +295,13 @@ def get_meta_refresh( """ try: - text = to_unicode(text, encoding) + utext = to_unicode(text, encoding) except UnicodeDecodeError: print(text) raise - text = remove_tags_with_content(text, ignore_tags) - text = remove_comments(replace_entities(text)) - m = _meta_refresh_re.search(text) + utext = remove_tags_with_content(utext, ignore_tags) + utext = remove_comments(replace_entities(utext)) + m = _meta_refresh_re.search(utext) if m: interval = float(m.group("int")) url = safe_url_string(m.group("url").strip(" \"'"), encoding) @@ -315,7 +311,7 @@ def get_meta_refresh( return None, None -def strip_html5_whitespace(text): +def strip_html5_whitespace(text: str) -> str: r""" Strip all leading and trailing space characters (as defined in https://www.w3.org/TR/html5/infrastructure.html#space-character). 
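On the `text` to `utext` renames running through this html.py patch: the parameters are typed `AnyStr`, so re-assigning the decoded `str` back to the same name would no longer type-check when the argument is `bytes`; giving the decoded value its own name sidesteps that. A sketch of the shape under the same AnyStr convention; `remove_comments_like` is a toy, not the real function:

    from typing import AnyStr

    def remove_comments_like(text: AnyStr) -> str:
        utext = text.decode("utf-8") if isinstance(text, bytes) else text
        # assigning the str back to `text` would clash with its AnyStr type
        return utext.replace("<!-- -->", "")

    print(remove_comments_like(b"a<!-- -->b"))  # ab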
diff --git a/w3lib/http.py b/w3lib/http.py index 4d86ce51..9b92f2ef 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,6 +1,6 @@ from base64 import urlsafe_b64encode from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping -from w3lib.util import to_bytes +from w3lib.util import to_bytes, to_native_str HeadersDictInput = Mapping[bytes, Union[Any, Sequence]] HeadersDictOutput = MutableMapping[bytes, List[bytes]] @@ -95,7 +95,7 @@ def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8 """ - auth = "%r:%r" % (username, password) + auth = "%s:%s" % (to_native_str(username), to_native_str(password)) # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used encoding here. See also: # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html diff --git a/w3lib/url.py b/w3lib/url.py index 1b97647f..889e5bc6 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -9,7 +9,7 @@ import re import string from collections import namedtuple -from typing import Callable, Optional, Sequence, Tuple, Union, cast, Dict +from typing import Callable, List, Optional, Sequence, Tuple, Union, cast, Dict from urllib.parse import ( parse_qs, parse_qsl, @@ -25,7 +25,7 @@ ) from urllib.parse import _coerce_args # type: ignore from urllib.request import pathname2url, url2pathname -from w3lib.util import to_bytes, to_native_str, to_unicode +from w3lib.util import to_unicode from w3lib._types import AnyUnicodeError, StrOrBytes @@ -84,7 +84,7 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) try: - netloc = parts.netloc.encode("idna").decode() + netloc = parts.netloc.encode('idna') except UnicodeError: netloc = parts.netloc.encode('utf-8') @@ -94,15 +94,13 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str else: path = parts.path - return urlunsplit( - ( - parts.scheme, - netloc.rstrip(":"), - path, - quote(parts.query.encode(encoding), _safe_chars), - quote(parts.fragment.encode(encoding), _safe_chars), - ) - ) + return urlunsplit(( + parts.scheme, + netloc.decode().rstrip(':'), + path, + quote(parts.query.encode(encoding), _safe_chars), + quote(parts.fragment.encode(encoding), _safe_chars), + )) _parent_dirs = re.compile(r"/?(\.\./)+") @@ -425,7 +423,7 @@ def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: ] -def _safe_ParseResult(parts, encoding="utf8", path_encoding="utf8"): +def _safe_ParseResult(parts: ParseResult, encoding: str = 'utf8', path_encoding: str = 'utf8') -> Tuple[str, str, str, str, str, str]: # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. 
http://.example.com) try: @@ -443,7 +441,8 @@ def _safe_ParseResult(parts, encoding="utf8", path_encoding="utf8"): ) -def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, encoding=None): +def canonicalize_url(url: StrOrBytes, keep_blank_values: bool = True, keep_fragments: bool = False, + encoding: Optional[str] = None) -> str: r"""Canonicalize the given url by applying the following procedures: - sort query arguments, first by key, then by value @@ -530,9 +529,9 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, encoding ) -def _unquotepath(path): - for reserved in ("2f", "2F", "3f", "3F"): - path = path.replace("%" + reserved, "%25" + reserved.upper()) +def _unquotepath(path: str) -> bytes: + for reserved in ('2f', '2F', '3f', '3F'): + path = path.replace('%' + reserved, '%25' + reserved.upper()) # standard lib's unquote() does not work for non-UTF-8 # percent-escaped characters, they get lost. @@ -542,7 +541,7 @@ def _unquotepath(path): return unquote_to_bytes(path) -def parse_url(url, encoding=None): +def parse_url(url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None) -> ParseResult: """Return urlparsed url from the given argument (which could be an already parsed url) """ @@ -551,7 +550,7 @@ def parse_url(url, encoding=None): return urlparse(to_unicode(url, encoding)) -def parse_qsl_to_bytes(qs, keep_blank_values=False): +def parse_qsl_to_bytes(qs: str, keep_blank_values: bool = False) -> List[Tuple[bytes, bytes]]: """Parse a query given as a string argument. Data are returned as a list of name, value pairs as bytes. @@ -586,11 +585,11 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False): else: continue if len(nv[1]) or keep_blank_values: - name = nv[0].replace("+", " ") + name: StrOrBytes = nv[0].replace('+', ' ') name = unquote_to_bytes(name) name = _coerce_result(name) - value = nv[1].replace("+", " ") + value: StrOrBytes = nv[1].replace('+', ' ') value = unquote_to_bytes(value) value = _coerce_result(value) - r.append((name, value)) + r.append((cast(bytes, name), cast(bytes, value))) return r From 78c82fdf29f4b079f51c0bad0924f1e56b155bbd Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Mon, 9 Aug 2021 14:23:36 +0800 Subject: [PATCH 07/17] fix mypy errors --- tests/test_util.py | 4 ++-- w3lib/html.py | 6 +++--- w3lib/url.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index 7243d175..088147c0 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -20,7 +20,7 @@ def test_deprecation(self): class ToBytesTestCase(TestCase): def test_type_error(self): with raises(TypeError): - to_bytes(True) + to_bytes(True) # type: ignore class ToNativeStrTestCase(TestCase): @@ -32,7 +32,7 @@ def test_deprecation(self): class ToUnicodeTestCase(TestCase): def test_type_error(self): with raises(TypeError): - to_unicode(True) + to_unicode(True) # type: ignore class UnicodeToStrTestCase(TestCase): diff --git a/w3lib/html.py b/w3lib/html.py index bdbf7b75..62ad2aec 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -9,6 +9,7 @@ from w3lib.util import to_unicode from w3lib.url import safe_url_string +from w3lib._types import StrOrBytes _ent_re = re.compile(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)', re.IGNORECASE) _tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL) @@ -213,7 +214,7 @@ def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encod return utext -def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: 
str = '', \ +def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: StrOrBytes = '', \ encoding: Optional[str] = None) -> str: """Remove escape characters. @@ -265,8 +266,7 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]: ret_text += fragment.group("cdata_d") return ret_text - -def get_base_url(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8') -> str: +def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8') -> str: """Return the base url if declared in the given HTML `text`, relative to the given base url. diff --git a/w3lib/url.py b/w3lib/url.py index 889e5bc6..3f3372ad 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -441,7 +441,7 @@ def _safe_ParseResult(parts: ParseResult, encoding: str = 'utf8', path_encoding: ) -def canonicalize_url(url: StrOrBytes, keep_blank_values: bool = True, keep_fragments: bool = False, +def canonicalize_url(url: Union[StrOrBytes, ParseResult], keep_blank_values: bool = True, keep_fragments: bool = False, encoding: Optional[str] = None) -> str: r"""Canonicalize the given url by applying the following procedures: From 5fae2b9727296de3910a642df73f83adb2fde2e4 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Mon, 9 Aug 2021 14:24:43 +0800 Subject: [PATCH 08/17] black --- tests/test_encoding.py | 2 +- tests/test_http.py | 26 ++++++----- tests/test_url.py | 30 +++++++----- w3lib/encoding.py | 22 ++++++--- w3lib/html.py | 99 ++++++++++++++++++++++++++++------------ w3lib/http.py | 6 ++- w3lib/url.py | 101 +++++++++++++++++++++++++++-------------- w3lib/util.py | 11 +++-- 8 files changed, 195 insertions(+), 102 deletions(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 3be7d796..33d7f110 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -40,7 +40,7 @@ def test_bom(self): bom_encoding, bom = read_bom(string) assert bom_encoding is not None assert bom is not None - decoded = string[len(bom):].decode(bom_encoding) + decoded = string[len(bom) :].decode(bom_encoding) self.assertEqual(water_unicode, decoded) # Body without BOM enc, bom = read_bom(b"foo") diff --git a/tests/test_http.py b/tests/test_http.py index fc59ae11..efabb0ab 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -1,7 +1,11 @@ import unittest from collections import OrderedDict -from w3lib.http import (HeadersDictInput, basic_auth_header, - headers_dict_to_raw, headers_raw_to_dict) +from w3lib.http import ( + HeadersDictInput, + basic_auth_header, + headers_dict_to_raw, + headers_raw_to_dict, +) __doctests__ = ["w3lib.http"] # for trial support @@ -48,10 +52,9 @@ def test_headers_dict_to_raw(self): ) def test_headers_dict_to_raw_listtuple(self): - dct: HeadersDictInput = OrderedDict([ - (b'Content-type', [b'text/html']), - (b'Accept', [b'gzip']) - ]) + dct: HeadersDictInput = OrderedDict( + [(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])] + ) self.assertEqual( headers_dict_to_raw(dct), b"Content-type: text/html\r\nAccept: gzip" ) @@ -74,14 +77,13 @@ def test_headers_dict_to_raw_listtuple(self): ) def test_headers_dict_to_raw_wrong_values(self): - dct: HeadersDictInput = OrderedDict([ - (b'Content-type', 0), - ]) - self.assertEqual( - headers_dict_to_raw(dct), - b'' + dct: HeadersDictInput = OrderedDict( + [ + (b"Content-type", 0), + ] ) self.assertEqual(headers_dict_to_raw(dct), b"") + self.assertEqual(headers_dict_to_raw(dct), b"") dct = OrderedDict([(b"Content-type", 1), (b"Accept", [b"gzip"])]) self.assertEqual(headers_dict_to_raw(dct), 
b"Accept: gzip") diff --git a/tests/test_url.py b/tests/test_url.py index 1b02d5be..fe9ee999 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -506,10 +506,10 @@ def test_add_or_replace_parameters(self): ) def test_add_or_replace_parameters_does_not_change_input_param(self): - url = 'http://domain/test?arg=original' - input_param = {'arg': 'value'} + url = "http://domain/test?arg=original" + input_param = {"arg": "value"} add_or_replace_parameters(url, input_param) # noqa - self.assertEqual(input_param, {'arg': 'value'}) + self.assertEqual(input_param, {"arg": "value"}) def test_url_query_cleaner(self): self.assertEqual("product.html", url_query_cleaner("product.html?")) @@ -814,17 +814,25 @@ def test_normalize_percent_encoding_in_query_arguments(self): ) def test_non_ascii_percent_encoding_in_paths(self): - self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"), - "http://www.example.com/a%20do?a=1") + self.assertEqual( + canonicalize_url("http://www.example.com/a do?a=1"), + "http://www.example.com/a%20do?a=1", + ) - self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"), - "http://www.example.com/a%20%20do?a=1") + self.assertEqual( + canonicalize_url("http://www.example.com/a %20do?a=1"), + "http://www.example.com/a%20%20do?a=1", + ) - self.assertEqual(canonicalize_url("http://www.example.com/a do£.html?a=1"), - "http://www.example.com/a%20do%C2%A3.html?a=1") + self.assertEqual( + canonicalize_url("http://www.example.com/a do£.html?a=1"), + "http://www.example.com/a%20do%C2%A3.html?a=1", + ) - self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"), - "http://www.example.com/a%20do%C2%A3.html?a=1") + self.assertEqual( + canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"), + "http://www.example.com/a%20do%C2%A3.html?a=1", + ) def test_non_ascii_percent_encoding_in_query_arguments(self): self.assertEqual( diff --git a/w3lib/encoding.py b/w3lib/encoding.py index a9ffef99..32252105 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -26,6 +26,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]: return None + # regexp for parsing HTTP meta tags _TEMPLATE = r"""%s\s*=\s*["']?\s*%s\s*["']?""" _SKIP_ATTRS = """(?:\\s+ @@ -124,6 +125,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]: "zh_cn": "gb18030", } + def _c18n_encoding(encoding: str) -> str: """Canonicalize an encoding name @@ -195,7 +197,9 @@ def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]: # Python decoder doesn't follow unicode standard when handling # bad utf-8 encoded strings. 
see http://bugs.python.org/issue8271 -codecs.register_error('w3lib_replace', lambda exc: ('\ufffd', cast(AnyUnicodeError, exc).end)) +codecs.register_error( + "w3lib_replace", lambda exc: ("\ufffd", cast(AnyUnicodeError, exc).end) +) def to_unicode(data_str: bytes, encoding: str) -> str: @@ -209,8 +213,12 @@ def to_unicode(data_str: bytes, encoding: str) -> str: ) -def html_to_unicode(content_type_header: Optional[str], html_body_str: bytes, - default_encoding: str = 'utf8', auto_detect_fun: Optional[Callable[[bytes], str]] = None) -> Tuple[str, str]: +def html_to_unicode( + content_type_header: Optional[str], + html_body_str: bytes, + default_encoding: str = "utf8", + auto_detect_fun: Optional[Callable[[bytes], str]] = None, +) -> Tuple[str, str]: r'''Convert raw html bytes to unicode This attempts to make a reasonable guess at the content encoding of the @@ -279,20 +287,20 @@ def html_to_unicode(content_type_header: Optional[str], html_body_str: bytes, # remove BOM if it agrees with the encoding if enc == bom_enc: bom = cast(bytes, bom) - html_body_str = html_body_str[len(bom):] - elif enc == 'utf-16' or enc == 'utf-32': + html_body_str = html_body_str[len(bom) :] + elif enc == "utf-16" or enc == "utf-32": # read endianness from BOM, or default to big endian # tools.ietf.org/html/rfc2781 section 4.3 if bom_enc is not None and bom_enc.startswith(enc): enc = bom_enc bom = cast(bytes, bom) - html_body_str = html_body_str[len(bom):] + html_body_str = html_body_str[len(bom) :] else: enc += "-be" return enc, to_unicode(html_body_str, enc) if bom_enc is not None: bom = cast(bytes, bom) - return bom_enc, to_unicode(html_body_str[len(bom):], bom_enc) + return bom_enc, to_unicode(html_body_str[len(bom) :], bom_enc) enc = html_body_declared_encoding(html_body_str) if enc is None and (auto_detect_fun is not None): enc = auto_detect_fun(html_body_str) diff --git a/w3lib/html.py b/w3lib/html.py index 62ad2aec..634d90f5 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -11,16 +11,29 @@ from w3lib.url import safe_url_string from w3lib._types import StrOrBytes -_ent_re = re.compile(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)', re.IGNORECASE) -_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL) -_baseurl_re = re.compile(r']*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I) -_meta_refresh_re = re.compile(r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)', re.DOTALL | re.IGNORECASE) -_cdata_re = re.compile(r'((?P.*?)(?P\]\]>))', re.DOTALL) - -HTML5_WHITESPACE = ' \t\n\r\x0c' - - -def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8') -> str: +_ent_re = re.compile( + r"&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)", + re.IGNORECASE, +) +_tag_re = re.compile(r"<[a-zA-Z\/!].*?>", re.DOTALL) +_baseurl_re = re.compile(r"]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']", re.I) +_meta_refresh_re = re.compile( + r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)', + re.DOTALL | re.IGNORECASE, +) +_cdata_re = re.compile( + r"((?P.*?)(?P\]\]>))", re.DOTALL +) + +HTML5_WHITESPACE = " \t\n\r\x0c" + + +def replace_entities( + text: AnyStr, + keep: Iterable[str] = (), + remove_illegal: bool = True, + encoding: str = "utf-8", +) -> str: """Remove entities from the given `text` by converting them to their corresponding unicode character. 
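Patch 08 is mechanical: a re-run of black, so these hunks change layout only (quote normalisation, one parameter per line on long signatures, two blank lines between defs) and no behaviour. To preview what black does to any snippet from this series, assuming black is installed locally (the pin used by w3lib's tox env may differ):

    import black

    src = "def f(text, which_ones=('\\n', '\\t', '\\r'), replace_by='', encoding=None): pass"
    print(black.format_str(src, mode=black.Mode()))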
@@ -51,12 +64,12 @@ def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: boo def convert_entity(m: Match) -> str: groups = m.groupdict() number = None - if groups.get('dec'): - number = int(groups['dec'], 10) - elif groups.get('hex'): - number = int(groups['hex'], 16) - elif groups.get('named'): - entity_name = groups['named'] + if groups.get("dec"): + number = int(groups["dec"], 10) + elif groups.get("hex"): + number = int(groups["hex"], 16) + elif groups.get("named"): + entity_name = groups["named"] if entity_name.lower() in keep: return m.group(0) else: @@ -80,11 +93,12 @@ def convert_entity(m: Match) -> str: return _ent_re.sub(convert_entity, to_unicode(text, encoding)) + def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool: return bool(_ent_re.search(to_unicode(text, encoding))) -def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None) -> str: +def replace_tags(text: AnyStr, token: str = "", encoding: Optional[str] = None) -> str: """Replace all markup tags found in the given `text` by the given token. By default `token` is an empty string so it just removes all tags. @@ -107,11 +121,11 @@ def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None) return _tag_re.sub(token, to_unicode(text, encoding)) -_REMOVECOMMENTS_RE = re.compile('|$)', re.DOTALL) +_REMOVECOMMENTS_RE = re.compile("|$)", re.DOTALL) def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str: - """ Remove HTML Comments. + """Remove HTML Comments. >>> import w3lib.html >>> w3lib.html.remove_comments(b"test whatever") @@ -121,10 +135,16 @@ def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str: """ utext = to_unicode(text, encoding) - return _REMOVECOMMENTS_RE.sub('', utext) + return _REMOVECOMMENTS_RE.sub("", utext) + -def remove_tags(text: AnyStr, which_ones: Iterable[str] = (), keep: Iterable[str] = (), encoding: Optional[str] = None) -> str: - """ Remove HTML Tags only. +def remove_tags( + text: AnyStr, + which_ones: Iterable[str] = (), + keep: Iterable[str] = (), + encoding: Optional[str] = None, +) -> str: + """Remove HTML Tags only. `which_ones` and `keep` are both tuples, there are four cases: @@ -190,7 +210,9 @@ def remove_tag(m: Match) -> str: return retags.sub(remove_tag, to_unicode(text, encoding)) -def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None) -> str: +def remove_tags_with_content( + text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None +) -> str: """Remove tags and their content. `which_ones` is a tuple of which tags to remove including their content. @@ -210,12 +232,16 @@ def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encod [r"<%s\b.*?|<%s\s*/>" % (tag, tag, tag) for tag in which_ones] ) retags = re.compile(tags, re.DOTALL | re.IGNORECASE) - utext = retags.sub('', utext) + utext = retags.sub("", utext) return utext -def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: StrOrBytes = '', \ - encoding: Optional[str] = None) -> str: +def replace_escape_chars( + text: AnyStr, + which_ones: Iterable[str] = ("\n", "\t", "\r"), + replace_by: StrOrBytes = "", + encoding: Optional[str] = None, +) -> str: """Remove escape characters. `which_ones` is a tuple of which escape characters we want to remove. 
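A hedged sketch of the entity semantics touched above — the restored group names (named, dec, hex) are exactly what convert_entity reads, and these assertions restate the documented behaviour rather than output from this branch:

    from w3lib.html import replace_entities

    # named, decimal, and hex references all resolve to U+00A3 (the pound sign)
    assert replace_entities("&pound; &#163; &#xa3;") == "\xa3 \xa3 \xa3"
    # entity names listed in `keep` are left untouched
    assert replace_entities("&lt;tag&gt;", keep=("lt", "gt")) == "&lt;tag&gt;"
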
@@ -232,7 +258,12 @@ def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', return utext -def unquote_markup(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: Optional[str] = None) -> str: +def unquote_markup( + text: AnyStr, + keep: Iterable[str] = (), + remove_illegal: bool = True, + encoding: Optional[str] = None, +) -> str: """ This function receives markup as a text (always a unicode string or a UTF-8 encoded string) and does the following: @@ -254,7 +285,7 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]: yield txt[offset:] utext = to_unicode(text, encoding) - ret_text = '' + ret_text = "" for fragment in _get_fragments(utext, _cdata_re): if isinstance(fragment, str): # it's not a CDATA (so we try to remove its entities) @@ -266,7 +297,10 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]: ret_text += fragment.group("cdata_d") return ret_text -def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8') -> str: + +def get_base_url( + text: AnyStr, baseurl: StrOrBytes = "", encoding: str = "utf-8" +) -> str: """Return the base url if declared in the given HTML `text`, relative to the given base url. @@ -284,7 +318,12 @@ def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8' return safe_url_string(baseurl) -def get_meta_refresh(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8', ignore_tags: Iterable[str] = ('script', 'noscript')) -> Tuple[Optional[float], Optional[str]]: +def get_meta_refresh( + text: AnyStr, + baseurl: str = "", + encoding: str = "utf-8", + ignore_tags: Iterable[str] = ("script", "noscript"), +) -> Tuple[Optional[float], Optional[str]]: """Return the http-equiv parameter of the HTML meta element from the given HTML text and return a tuple ``(interval, url)`` where interval is an integer containing the delay in seconds (or zero if not present) and url is a diff --git a/w3lib/http.py b/w3lib/http.py index 9b92f2ef..4ea31fad 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -83,7 +83,9 @@ def headers_dict_to_raw(headers_dict: Optional[HeadersDictInput]) -> Optional[by return b"\r\n".join(raw_lines) -def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8859-1') -> bytes: +def basic_auth_header( + username: AnyStr, password: AnyStr, encoding: str = "ISO-8859-1" +) -> bytes: """ Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_ @@ -99,4 +101,4 @@ def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8 # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used encoding here. 
See also: # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html - return b'Basic ' + urlsafe_b64encode(to_bytes(auth, encoding=encoding)) + return b"Basic " + urlsafe_b64encode(to_bytes(auth, encoding=encoding)) diff --git a/w3lib/url.py b/w3lib/url.py index 3f3372ad..85498ed2 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -32,7 +32,7 @@ # error handling function for bytes-to-Unicode decoding errors with URLs def _quote_byte(error: UnicodeError) -> Tuple[str, int]: error = cast(AnyUnicodeError, error) - return (to_unicode(quote(error.object[error.start:error.end])), error.end) + return (to_unicode(quote(error.object[error.start : error.end])), error.end) codecs.register_error("percentencode", _quote_byte) @@ -52,7 +52,12 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]: ) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline -def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str = 'utf8', quote_path: bool = True) -> str: +def safe_url_string( + url: StrOrBytes, + encoding: str = "utf8", + path_encoding: str = "utf8", + quote_path: bool = True, +) -> str: """Convert the given URL into a legal URL by escaping unsafe characters according to RFC-3986. Also, ASCII tabs and newlines are removed as per https://url.spec.whatwg.org/#url-parsing. @@ -84,9 +89,9 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) try: - netloc = parts.netloc.encode('idna') + netloc = parts.netloc.encode("idna") except UnicodeError: - netloc = parts.netloc.encode('utf-8') + netloc = parts.netloc.encode("utf-8") # default encoding for path component SHOULD be UTF-8 if quote_path: @@ -94,20 +99,24 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str else: path = parts.path - return urlunsplit(( - parts.scheme, - netloc.decode().rstrip(':'), - path, - quote(parts.query.encode(encoding), _safe_chars), - quote(parts.fragment.encode(encoding), _safe_chars), - )) + return urlunsplit( + ( + parts.scheme, + netloc.decode().rstrip(":"), + path, + quote(parts.query.encode(encoding), _safe_chars), + quote(parts.fragment.encode(encoding), _safe_chars), + ) + ) _parent_dirs = re.compile(r"/?(\.\./)+") -def safe_download_url(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str = 'utf8') -> str: - """ Make a url for download. This will call safe_url_string +def safe_download_url( + url: StrOrBytes, encoding: str = "utf8", path_encoding: str = "utf8" +) -> str: + """Make a url for download. This will call safe_url_string and then strip the fragment, if one exists. The path will be normalised. 
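A short usage sketch of the two typed helpers above; the Base64 value follows from RFC 2617, while the IDNA hostname is an illustrative assumption rather than a fixture from this patch:

    from w3lib.http import basic_auth_header
    from w3lib.url import safe_url_string

    assert basic_auth_header("user", "pass") == b"Basic dXNlcjpwYXNz"
    # unsafe path characters are percent-encoded; the netloc goes through IDNA
    assert (
        safe_url_string("https://münchen.example/a b?q=£")
        == "https://xn--mnchen-3ya.example/a%20b?q=%C2%A3"
    )
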
@@ -126,10 +135,15 @@ def safe_download_url(url: StrOrBytes, encoding: str = 'utf8', path_encoding: st def is_url(text: str) -> bool: - return text.partition("://")[0] in ('file', 'http', 'https') + return text.partition("://")[0] in ("file", "http", "https") -def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int] = 0) -> Optional[str]: +def url_query_parameter( + url: StrOrBytes, + parameter: str, + default: Optional[str] = None, + keep_blank_values: Union[bool, int] = 0, +) -> Optional[str]: """Return the value of a url parameter, given the url and parameter name General case: @@ -159,8 +173,7 @@ def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] """ queryparams = parse_qs( - urlsplit(str(url))[3], - keep_blank_values=bool(keep_blank_values) + urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values) ) if parameter in queryparams: return queryparams[parameter][0] @@ -169,7 +182,14 @@ def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] def url_query_cleaner( - url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str: + url: StrOrBytes, + parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), + sep: str = "&", + kvsep: str = "=", + remove: bool = False, + unique: bool = True, + keep_fragments: bool = False, +) -> str: """Clean URL arguments leaving only those passed in the parameterlist keeping order >>> import w3lib.url @@ -206,7 +226,7 @@ def url_query_cleaner( url, fragment = urldefrag(url) url = cast(str, url) fragment = cast(str, fragment) - base, _, query = url.partition('?') + base, _, query = url.partition("?") seen = set() querylist = [] for ksv in query.split(sep): @@ -224,9 +244,10 @@ def url_query_cleaner( seen.add(k) url = "?".join([base, sep.join(querylist)]) if querylist else base if keep_fragments: - url += '#' + fragment + url += "#" + fragment return cast(str, url) + def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str: parsed = urlsplit(url) current_args = parse_qsl(parsed.query, keep_blank_values=True) @@ -343,8 +364,9 @@ def any_to_uri(uri_or_path: str) -> str: ).encode() ) -_ParseDataURIResult = namedtuple("_ParseDataURIResult", - "media_type media_type_parameters data") +_ParseDataURIResult = namedtuple( + "_ParseDataURIResult", "media_type media_type_parameters data" +) def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: @@ -389,7 +411,7 @@ def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: if m: attribute, value, value_quoted = m.groups() if value_quoted: - value = re.sub(br'\\(.)', rb'\1', value_quoted) + value = re.sub(br"\\(.)", rb"\1", value_quoted) media_type_params[attribute.decode()] = value.decode() uri = uri[m.end() :] else: @@ -423,7 +445,9 @@ def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: ] -def _safe_ParseResult(parts: ParseResult, encoding: str = 'utf8', path_encoding: str = 'utf8') -> Tuple[str, str, str, str, str, str]: +def _safe_ParseResult( + parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8" +) -> Tuple[str, str, str, str, str, str]: # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. 
http://.example.com) try: @@ -441,8 +465,12 @@ def _safe_ParseResult(parts: ParseResult, encoding: str = 'utf8', path_encoding: ) -def canonicalize_url(url: Union[StrOrBytes, ParseResult], keep_blank_values: bool = True, keep_fragments: bool = False, - encoding: Optional[str] = None) -> str: +def canonicalize_url( + url: Union[StrOrBytes, ParseResult], + keep_blank_values: bool = True, + keep_fragments: bool = False, + encoding: Optional[str] = None, +) -> str: r"""Canonicalize the given url by applying the following procedures: - sort query arguments, first by key, then by value @@ -478,7 +506,8 @@ def canonicalize_url(url: Union[StrOrBytes, ParseResult], keep_blank_values: boo # if not for proper URL expected by remote website. try: scheme, netloc, path, params, query, fragment = _safe_ParseResult( - parse_url(url), encoding=encoding or 'utf8') + parse_url(url), encoding=encoding or "utf8" + ) except UnicodeEncodeError: scheme, netloc, path, params, query, fragment = _safe_ParseResult( parse_url(url), encoding="utf8" @@ -530,8 +559,8 @@ def canonicalize_url(url: Union[StrOrBytes, ParseResult], keep_blank_values: boo def _unquotepath(path: str) -> bytes: - for reserved in ('2f', '2F', '3f', '3F'): - path = path.replace('%' + reserved, '%25' + reserved.upper()) + for reserved in ("2f", "2F", "3f", "3F"): + path = path.replace("%" + reserved, "%25" + reserved.upper()) # standard lib's unquote() does not work for non-UTF-8 # percent-escaped characters, they get lost. @@ -541,7 +570,9 @@ def _unquotepath(path: str) -> bytes: return unquote_to_bytes(path) -def parse_url(url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None) -> ParseResult: +def parse_url( + url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None +) -> ParseResult: """Return urlparsed url from the given argument (which could be an already parsed url) """ @@ -550,7 +581,9 @@ def parse_url(url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = Non return urlparse(to_unicode(url, encoding)) -def parse_qsl_to_bytes(qs: str, keep_blank_values: bool = False) -> List[Tuple[bytes, bytes]]: +def parse_qsl_to_bytes( + qs: str, keep_blank_values: bool = False +) -> List[Tuple[bytes, bytes]]: """Parse a query given as a string argument. Data are returned as a list of name, value pairs as bytes. 
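Since canonicalize_url above now carries full type hints, a quick sketch of the documented normalisation (the expected value is an assumption from its docstring, not captured output):

    from w3lib.url import canonicalize_url

    # query arguments are sorted by key and the fragment is dropped by default
    assert (
        canonicalize_url("http://example.com/do?b=2&a=1#frag")
        == "http://example.com/do?a=1&b=2"
    )
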
@@ -572,7 +605,7 @@ def parse_qsl_to_bytes(qs: str, keep_blank_values: bool = False) -> List[Tuple[b # with unquote_to_bytes(s) coerce_args = cast(Callable[..., Tuple[str, Callable]], _coerce_args) qs, _coerce_result = coerce_args(qs) - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")] r = [] for name_value in pairs: if not name_value: @@ -585,10 +618,10 @@ def parse_qsl_to_bytes(qs: str, keep_blank_values: bool = False) -> List[Tuple[b else: continue if len(nv[1]) or keep_blank_values: - name: StrOrBytes = nv[0].replace('+', ' ') + name: StrOrBytes = nv[0].replace("+", " ") name = unquote_to_bytes(name) name = _coerce_result(name) - value: StrOrBytes = nv[1].replace('+', ' ') + value: StrOrBytes = nv[1].replace("+", " ") value = unquote_to_bytes(value) value = _coerce_result(value) r.append((cast(bytes, name), cast(bytes, value))) diff --git a/w3lib/util.py b/w3lib/util.py index 315d968a..58ca867f 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -4,7 +4,7 @@ def str_to_unicode( - text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' + text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict" ) -> str: if encoding is None: encoding = "utf-8" @@ -12,8 +12,9 @@ def str_to_unicode( return text.decode(encoding, errors) return text + def unicode_to_str( - text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' + text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict" ) -> bytes: warn( "The w3lib.utils.unicode_to_str function is deprecated and " @@ -29,7 +30,7 @@ def unicode_to_str( def to_unicode( - text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' + text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict" ) -> str: """Return the unicode representation of a bytes object `text`. If `text` is already an unicode object, return it as-is.""" @@ -45,7 +46,7 @@ def to_unicode( def to_bytes( - text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' + text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict" ) -> bytes: """Return the binary representation of `text`. 
If `text` is already a bytes object, return it as-is."""
@@ -61,7 +62,7 @@


 def to_native_str(
-    text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict'
+    text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
 ) -> str:
     """ Return str representation of `text` """
     warn(

From 9e1ee79923e2ebab38afb5a5c6d9d9ce158e3138 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 9 Aug 2021 14:26:34 +0800
Subject: [PATCH 09/17] use new mypy

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index a954c1eb..e2d212cf 100644
--- a/tox.ini
+++ b/tox.ini
@@ -25,7 +25,7 @@ commands =
 [testenv:typing]
 basepython = python3
 deps =
-    mypy==0.901
+    mypy==0.910
 commands =
     mypy --show-error-codes {posargs: w3lib tests}

From 7cdc107051f47b774983fd4c7ed95b36db17fd18 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 9 Aug 2021 14:40:04 +0800
Subject: [PATCH 10/17] fix ci

---
 tox.ini       | 2 ++
 w3lib/util.py | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index e2d212cf..4e8e4767 100644
--- a/tox.ini
+++ b/tox.ini
@@ -25,6 +25,8 @@ commands =
 [testenv:typing]
 basepython = python3
 deps =
+    # mypy errors out if pytest (or one of its sub-dependencies) is not installed
+    pytest
     mypy==0.910
 commands =
     mypy --show-error-codes {posargs: w3lib tests}

diff --git a/w3lib/util.py b/w3lib/util.py
index 58ca867f..c9eba65f 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -6,6 +6,12 @@
 def str_to_unicode(
     text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
 ) -> str:
+    warn(
+        "The w3lib.utils.str_to_unicode function is deprecated and "
+        "will be removed in a future release.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     if encoding is None:
         encoding = "utf-8"
     if isinstance(text, bytes):
@@ -64,7 +70,7 @@
 def to_native_str(
     text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
 ) -> str:
-    """ Return str representation of `text` """
+    """Return str representation of `text`"""
     warn(
         "The w3lib.utils.to_native_str function is deprecated and "
         "will be removed in a future release. Please use "

From 1a81a47c265b4f333c06f24786d9cf6fac72bf78 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 9 Aug 2021 15:05:36 +0800
Subject: [PATCH 11/17] fix sphinx

---
 w3lib/url.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/w3lib/url.py b/w3lib/url.py
index 85498ed2..e44c0860 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -364,12 +364,13 @@ def any_to_uri(uri_or_path: str) -> str:
         ).encode()
     )

-_ParseDataURIResult = namedtuple(
-    "_ParseDataURIResult", "media_type media_type_parameters data"
-)
+ParseDataURIResult = namedtuple(
+    "ParseDataURIResult", "media_type media_type_parameters data"
+)
+ParseDataURIResult.__doc__ = "The return value type of `w3lib.url.parse_data_uri`."
-def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult:
+def parse_data_uri(uri: StrOrBytes):  # type: ignore
     """

     Parse a data: URI, returning a 3-tuple of media type, dictionary of media
@@ -426,7 +427,7 @@ def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult:
         raise ValueError("invalid data URI")
     data = base64.b64decode(data)

-    return _ParseDataURIResult(media_type, media_type_params, data)
+    return ParseDataURIResult(media_type, media_type_params, data)


 __all__ = [

From f877a6ce840da748d7416001411da26b724cde80 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 9 Aug 2021 15:06:50 +0800
Subject: [PATCH 12/17] doc

---
 w3lib/url.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/w3lib/url.py b/w3lib/url.py
index e44c0860..e6744744 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -370,6 +370,8 @@ def any_to_uri(uri_or_path: str) -> str:
 ParseDataURIResult.__doc__ = "The return value type of `w3lib.url.parse_data_uri`."


+# If we add the return type hint sphinx would error:
+# w3lib/url.py:docstring of w3lib.url.parse_data_uri::py:class reference target not found: w3lib.url.ParseDataURIResult
 def parse_data_uri(uri: StrOrBytes):  # type: ignore
     """

From fc4d33a5722cf0144baa79ae40a676ebaff8d9df Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 23 Aug 2021 11:29:30 +0800
Subject: [PATCH 13/17] Update w3lib/util.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrián Chaves

---
 w3lib/util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/w3lib/util.py b/w3lib/util.py
index c9eba65f..70f4ef52 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -1,5 +1,6 @@
 from warnings import warn
 from typing import Optional
+
 from w3lib._types import StrOrBytes


From 57675073281b30402ae172594959b311637069d6 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 23 Aug 2021 11:32:00 +0800
Subject: [PATCH 14/17] cr

---
 docs/w3lib.rst     |  2 ++
 run-mypy.sh        |  5 -----
 tests/test_html.py |  2 +-
 w3lib/url.py       | 12 ++++++------
 4 files changed, 9 insertions(+), 12 deletions(-)
 delete mode 100755 run-mypy.sh

diff --git a/docs/w3lib.rst b/docs/w3lib.rst
index bfde0304..c5233dd7 100644
--- a/docs/w3lib.rst
+++ b/docs/w3lib.rst
@@ -26,3 +26,5 @@ w3lib Package

 .. automodule:: w3lib.url
     :members:
+
+.. autoclass:: ParseDataURIResult

diff --git a/run-mypy.sh b/run-mypy.sh
deleted file mode 100755
index ea3b1332..00000000
--- a/run-mypy.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-set -e
-
-mypy --txt-report . w3lib tests

diff --git a/tests/test_html.py b/tests/test_html.py
index 5a092f29..f6ca90d2 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -69,7 +69,7 @@ def test_illegal_entities(self):

     def test_browser_hack(self):
         # check browser hack for numeric character references in the 80-9F range
         self.assertEqual(replace_entities("x&#153;y", encoding="cp1252"), "x\u2122y")
-        self.assertEqual(replace_entities("x&#x99;y", encoding="cp1252"), u"x\u2122y")
+        self.assertEqual(replace_entities("x&#x99;y", encoding="cp1252"), "x\u2122y")

     def test_missing_semicolon(self):
         for entity, result in (

diff --git a/w3lib/url.py b/w3lib/url.py
index e6744744..d8066309 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -89,9 +89,11 @@ def safe_url_string(
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g.
http://.example.com) try: - netloc = parts.netloc.encode("idna") + netloc_bytes = parts.netloc.encode("idna") except UnicodeError: - netloc = parts.netloc.encode("utf-8") + netloc = parts.netloc + else: + netloc = netloc_bytes.decode() # default encoding for path component SHOULD be UTF-8 if quote_path: @@ -102,7 +104,7 @@ def safe_url_string( return urlunsplit( ( parts.scheme, - netloc.decode().rstrip(":"), + netloc.rstrip(":"), path, quote(parts.query.encode(encoding), _safe_chars), quote(parts.fragment.encode(encoding), _safe_chars), @@ -370,9 +372,7 @@ def any_to_uri(uri_or_path: str) -> str: ParseDataURIResult.__doc__ = "The return value type of `w3lib.url.parse_data_uri`." -# If we add the return type hint sphinx would error: -# w3lib/url.py:docstring of w3lib.url.parse_data_uri::py:class reference target not found: w3lib.url.ParseDataURIResult -def parse_data_uri(uri: StrOrBytes): # type: ignore +def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult: """ Parse a data: URI, returning a 3-tuple of media type, dictionary of media From 341b3f31dade1393eadf98e1d4c7dffd35f6c2c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 24 Aug 2021 10:51:40 +0200 Subject: [PATCH 15/17] Improve ParseDataURIResult documentation --- docs/w3lib.rst | 3 ++- w3lib/url.py | 34 +++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/docs/w3lib.rst b/docs/w3lib.rst index c5233dd7..502554ff 100644 --- a/docs/w3lib.rst +++ b/docs/w3lib.rst @@ -27,4 +27,5 @@ w3lib Package .. automodule:: w3lib.url :members: -.. autoclass:: ParseDataURIResult + .. autoclass:: ParseDataURIResult + :members: diff --git a/w3lib/url.py b/w3lib/url.py index d8066309..766ca56e 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -8,8 +8,17 @@ import posixpath import re import string -from collections import namedtuple -from typing import Callable, List, Optional, Sequence, Tuple, Union, cast, Dict +from typing import ( + cast, + Callable, + Dict, + List, + NamedTuple, + Optional, + Sequence, + Tuple, + Union, +) from urllib.parse import ( parse_qs, parse_qsl, @@ -366,20 +375,19 @@ def any_to_uri(uri_or_path: str) -> str: ).encode() ) -ParseDataURIResult = namedtuple( - "ParseDataURIResult", "media_type media_type_parameters data" -) -ParseDataURIResult.__doc__ = "The return value type of `w3lib.url.parse_data_uri`." - - -def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult: - """ - Parse a data: URI, returning a 3-tuple of media type, dictionary of media - type parameters, and data. +class ParseDataURIResult(NamedTuple): + """Named tuple returned by :func:`parse_data_uri`.""" + #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``). + media_type: str + #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``). + media_type_parameters: dict[str, str] + #: Data, decoded if it was encoded in base64 format. 
+ data: bytes - """ +def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult: + """Parse a data: URI into :class:`ParseDataURIResult`.""" if not isinstance(uri, bytes): uri = safe_url_string(uri).encode("ascii") From b1af346cf6c1f3bfa0947a6137085e275ad94976 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Tue, 24 Aug 2021 17:35:19 +0800 Subject: [PATCH 16/17] fix py36 compat --- w3lib/url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/w3lib/url.py b/w3lib/url.py index 766ca56e..e4a8121b 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -381,7 +381,7 @@ class ParseDataURIResult(NamedTuple): #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``). media_type: str #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``). - media_type_parameters: dict[str, str] + media_type_parameters: Dict[str, str] #: Data, decoded if it was encoded in base64 format. data: bytes From 9fcbe1356197024d4b0d66679428eb723ae13850 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Tue, 24 Aug 2021 17:36:40 +0800 Subject: [PATCH 17/17] black --- w3lib/url.py | 1 + 1 file changed, 1 insertion(+) diff --git a/w3lib/url.py b/w3lib/url.py index e4a8121b..71398516 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -378,6 +378,7 @@ def any_to_uri(uri_or_path: str) -> str: class ParseDataURIResult(NamedTuple): """Named tuple returned by :func:`parse_data_uri`.""" + #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``). media_type: str #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``).