Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Type annotations #172

Merged
merged 18 commits into from
Aug 24, 2021
3 changes: 3 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ jobs:
- python-version: 3.9
env:
TOXENV: black
- python-version: 3.9
env:
TOXENV: typing

steps:
- uses: actions/checkout@v2
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@ _trial_temp
.coverage
coverage.xml
.cache
.mypy_cache/
/index.txt
.dmypy.json
.hypothesis/
2 changes: 2 additions & 0 deletions docs/w3lib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@ w3lib Package

.. automodule:: w3lib.url
:members:

.. autoclass:: ParseDataURIResult
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
12 changes: 12 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[mypy]
exclude = .*flycheck_.*
show_error_codes = True
check_untyped_defs = True

[mypy-w3lib.*]
# All non-test functions must be typed.
disallow_untyped_defs = True

[mypy-tests.*]
# Allow test functions to be untyped
disallow_untyped_defs = False
6 changes: 4 additions & 2 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,16 @@ def test_bom(self):
utf32le = b"\xff\xfe\x00\x00\x34\x6c\x00\x00"
for string in (utf16be, utf16le, utf32be, utf32le):
bom_encoding, bom = read_bom(string)
assert bom_encoding is not None
assert bom is not None
decoded = string[len(bom) :].decode(bom_encoding)
self.assertEqual(water_unicode, decoded)
# Body without BOM
enc, bom = read_bom("foo")
enc, bom = read_bom(b"foo")
self.assertEqual(enc, None)
self.assertEqual(bom, None)
# Empty body
enc, bom = read_bom("")
enc, bom = read_bom(b"")
self.assertEqual(enc, None)
self.assertEqual(bom, None)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def test_illegal_entities(self):
def test_browser_hack(self):
# check browser hack for numeric character references in the 80-9F range
self.assertEqual(replace_entities("x™y", encoding="cp1252"), "x\u2122y")
self.assertEqual(replace_entities("x™y", encoding="cp1252"), u"x\u2122y")
self.assertEqual(replace_entities("x™y", encoding="cp1252"), "x\u2122y")

def test_missing_semicolon(self):
for entity, result in (
Expand Down
14 changes: 11 additions & 3 deletions tests/test_http.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import unittest
from collections import OrderedDict
from w3lib.http import basic_auth_header, headers_dict_to_raw, headers_raw_to_dict
from w3lib.http import (
HeadersDictInput,
basic_auth_header,
headers_dict_to_raw,
headers_raw_to_dict,
)

__doctests__ = ["w3lib.http"] # for trial support

Expand Down Expand Up @@ -47,7 +52,9 @@ def test_headers_dict_to_raw(self):
)

def test_headers_dict_to_raw_listtuple(self):
dct = OrderedDict([(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])])
dct: HeadersDictInput = OrderedDict(
[(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])]
)
self.assertEqual(
headers_dict_to_raw(dct), b"Content-type: text/html\r\nAccept: gzip"
)
Expand All @@ -70,12 +77,13 @@ def test_headers_dict_to_raw_listtuple(self):
)

def test_headers_dict_to_raw_wrong_values(self):
dct = OrderedDict(
dct: HeadersDictInput = OrderedDict(
[
(b"Content-type", 0),
]
)
self.assertEqual(headers_dict_to_raw(dct), b"")
self.assertEqual(headers_dict_to_raw(dct), b"")

dct = OrderedDict([(b"Content-type", 1), (b"Accept", [b"gzip"])])
self.assertEqual(headers_dict_to_raw(dct), b"Accept: gzip")
9 changes: 6 additions & 3 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ def test_add_or_replace_parameters(self):
def test_add_or_replace_parameters_does_not_change_input_param(self):
url = "http://domain/test?arg=original"
input_param = {"arg": "value"}
new_url = add_or_replace_parameters(url, input_param) # noqa
add_or_replace_parameters(url, input_param) # noqa
self.assertEqual(input_param, {"arg": "value"})

def test_url_query_cleaner(self):
Expand Down Expand Up @@ -817,15 +817,18 @@ def test_non_ascii_percent_encoding_in_paths(self):
self.assertEqual(
canonicalize_url("http://www.example.com/a do?a=1"),
"http://www.example.com/a%20do?a=1",
),
)

self.assertEqual(
canonicalize_url("http://www.example.com/a %20do?a=1"),
"http://www.example.com/a%20%20do?a=1",
),
)

self.assertEqual(
canonicalize_url("http://www.example.com/a do£.html?a=1"),
"http://www.example.com/a%20do%C2%A3.html?a=1",
)

self.assertEqual(
canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
"http://www.example.com/a%20do%C2%A3.html?a=1",
Expand Down
4 changes: 2 additions & 2 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_deprecation(self):
class ToBytesTestCase(TestCase):
def test_type_error(self):
with raises(TypeError):
to_bytes(True)
to_bytes(True) # type: ignore


class ToNativeStrTestCase(TestCase):
Expand All @@ -32,7 +32,7 @@ def test_deprecation(self):
class ToUnicodeTestCase(TestCase):
def test_type_error(self):
with raises(TypeError):
to_unicode(True)
to_unicode(True) # type: ignore


class UnicodeToStrTestCase(TestCase):
Expand Down
9 changes: 9 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@ deps =
commands =
bandit -r -c .bandit.yml {posargs:w3lib}

[testenv:typing]
basepython = python3
deps =
# mypy errors out if pytest (or its sub-dependencies) cannot be found
pytest
mypy==0.910
commands =
mypy --show-error-codes {posargs: w3lib tests}

[testenv:flake8]
basepython = python3
deps =
Expand Down
5 changes: 5 additions & 0 deletions w3lib/_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from typing import Union

# Type aliases shared across w3lib modules.

# Functions accepting either text or raw bytes.
StrOrBytes = Union[bytes, str]

# Either flavour of Unicode codec error; the common base class
# UnicodeError does not expose the start/end attributes, so a
# union of the concrete subclasses is used instead.
AnyUnicodeError = Union[UnicodeDecodeError, UnicodeEncodeError]
36 changes: 26 additions & 10 deletions w3lib/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
"""
import re, codecs, encodings
from sys import version_info
from typing import Callable, Match, Optional, Tuple, Union, cast
from w3lib._types import AnyUnicodeError, StrOrBytes
from w3lib.util import to_native_str

_HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I)


def http_content_type_encoding(content_type):
def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
"""Extract the encoding in the content-type header

>>> import w3lib.encoding
Expand All @@ -21,6 +24,8 @@ def http_content_type_encoding(content_type):
if match:
return resolve_encoding(match.group(1))

return None


# regexp for parsing HTTP meta tags
_TEMPLATE = r"""%s\s*=\s*["']?\s*%s\s*["']?"""
Expand Down Expand Up @@ -51,7 +56,7 @@ def http_content_type_encoding(content_type):
)


def html_body_declared_encoding(html_body_str):
def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
'''Return the encoding specified in meta tags in the html body,
or ``None`` if no suitable encoding was found

Expand All @@ -75,6 +80,7 @@ def html_body_declared_encoding(html_body_str):

# html5 suggests the first 1024 bytes are sufficient, we allow for more
chunk = html_body_str[:4096]
match: Union[Optional[Match[bytes]], Optional[Match[str]]]
if isinstance(chunk, bytes):
match = _BODY_ENCODING_BYTES_RE.search(chunk)
else:
Expand All @@ -87,7 +93,9 @@ def html_body_declared_encoding(html_body_str):
or match.group("xmlcharset")
)
if encoding:
return resolve_encoding(encoding)
return resolve_encoding(to_native_str(encoding))

return None


# Default encoding translation
Expand Down Expand Up @@ -118,7 +126,7 @@ def html_body_declared_encoding(html_body_str):
}


def _c18n_encoding(encoding):
def _c18n_encoding(encoding: str) -> str:
"""Canonicalize an encoding name

This performs normalization and translates aliases using python's
Expand All @@ -128,7 +136,7 @@ def _c18n_encoding(encoding):
return encodings.aliases.aliases.get(normed, normed)


def resolve_encoding(encoding_alias):
def resolve_encoding(encoding_alias: str) -> Optional[str]:
"""Return the encoding that `encoding_alias` maps to, or ``None``
if the encoding cannot be interpreted

Expand Down Expand Up @@ -158,7 +166,7 @@ def resolve_encoding(encoding_alias):
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)


def read_bom(data):
def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]:
r"""Read the byte order mark in the text, if present, and
return the encoding represented by the BOM and the BOM.

Expand Down Expand Up @@ -189,10 +197,12 @@ def read_bom(data):

# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
codecs.register_error("w3lib_replace", lambda exc: ("\ufffd", exc.end))
codecs.register_error(
"w3lib_replace", lambda exc: ("\ufffd", cast(AnyUnicodeError, exc).end)
)


def to_unicode(data_str, encoding):
def to_unicode(data_str: bytes, encoding: str) -> str:
"""Convert a str object to unicode using the encoding given

Characters that cannot be converted will be converted to ``\\ufffd`` (the
Expand All @@ -204,8 +214,11 @@ def to_unicode(data_str, encoding):


def html_to_unicode(
content_type_header, html_body_str, default_encoding="utf8", auto_detect_fun=None
):
content_type_header: Optional[str],
html_body_str: bytes,
default_encoding: str = "utf8",
auto_detect_fun: Optional[Callable[[bytes], str]] = None,
) -> Tuple[str, str]:
r'''Convert raw html bytes to unicode

This attempts to make a reasonable guess at the content encoding of the
Expand Down Expand Up @@ -273,17 +286,20 @@ def html_to_unicode(
if enc is not None:
# remove BOM if it agrees with the encoding
if enc == bom_enc:
bom = cast(bytes, bom)
html_body_str = html_body_str[len(bom) :]
elif enc == "utf-16" or enc == "utf-32":
# read endianness from BOM, or default to big endian
# tools.ietf.org/html/rfc2781 section 4.3
if bom_enc is not None and bom_enc.startswith(enc):
enc = bom_enc
bom = cast(bytes, bom)
html_body_str = html_body_str[len(bom) :]
else:
enc += "-be"
return enc, to_unicode(html_body_str, enc)
if bom_enc is not None:
bom = cast(bytes, bom)
return bom_enc, to_unicode(html_body_str[len(bom) :], bom_enc)
enc = html_body_declared_encoding(html_body_str)
if enc is None and (auto_detect_fun is not None):
Expand Down
Loading