Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Type annotations #172

Merged
merged 18 commits into from
Aug 24, 2021
3 changes: 3 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ jobs:
- python-version: 3.9
env:
TOXENV: black
- python-version: 3.9
env:
TOXENV: typing

steps:
- uses: actions/checkout@v2
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@ _trial_temp
.coverage
coverage.xml
.cache
.mypy_cache/
/index.txt
.dmypy.json
.hypothesis/
2 changes: 2 additions & 0 deletions docs/w3lib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@ w3lib Package

.. automodule:: w3lib.url
:members:

.. autoclass:: ParseDataURIResult
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
12 changes: 12 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[mypy]
exclude = .*flycheck_.*
show_error_codes = True
check_untyped_defs = True

[mypy-w3lib.*]
# All non-test functions must be typed.
disallow_untyped_defs = True

[mypy-tests.*]
# Allow test functions to be untyped
disallow_untyped_defs = False
6 changes: 4 additions & 2 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,16 @@ def test_bom(self):
utf32le = b"\xff\xfe\x00\x00\x34\x6c\x00\x00"
for string in (utf16be, utf16le, utf32be, utf32le):
bom_encoding, bom = read_bom(string)
assert bom_encoding is not None
assert bom is not None
decoded = string[len(bom) :].decode(bom_encoding)
self.assertEqual(water_unicode, decoded)
# Body without BOM
enc, bom = read_bom("foo")
enc, bom = read_bom(b"foo")
self.assertEqual(enc, None)
self.assertEqual(bom, None)
# Empty body
enc, bom = read_bom("")
enc, bom = read_bom(b"")
self.assertEqual(enc, None)
self.assertEqual(bom, None)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def test_illegal_entities(self):
def test_browser_hack(self):
# check browser hack for numeric character references in the 80-9F range
self.assertEqual(replace_entities("x™y", encoding="cp1252"), "x\u2122y")
self.assertEqual(replace_entities("x™y", encoding="cp1252"), u"x\u2122y")
self.assertEqual(replace_entities("x™y", encoding="cp1252"), "x\u2122y")

def test_missing_semicolon(self):
for entity, result in (
Expand Down
14 changes: 11 additions & 3 deletions tests/test_http.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import unittest
from collections import OrderedDict
from w3lib.http import basic_auth_header, headers_dict_to_raw, headers_raw_to_dict
from w3lib.http import (
HeadersDictInput,
basic_auth_header,
headers_dict_to_raw,
headers_raw_to_dict,
)

__doctests__ = ["w3lib.http"] # for trial support

Expand Down Expand Up @@ -47,7 +52,9 @@ def test_headers_dict_to_raw(self):
)

def test_headers_dict_to_raw_listtuple(self):
dct = OrderedDict([(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])])
dct: HeadersDictInput = OrderedDict(
[(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])]
)
self.assertEqual(
headers_dict_to_raw(dct), b"Content-type: text/html\r\nAccept: gzip"
)
Expand All @@ -70,12 +77,13 @@ def test_headers_dict_to_raw_listtuple(self):
)

def test_headers_dict_to_raw_wrong_values(self):
dct = OrderedDict(
dct: HeadersDictInput = OrderedDict(
[
(b"Content-type", 0),
]
)
self.assertEqual(headers_dict_to_raw(dct), b"")
self.assertEqual(headers_dict_to_raw(dct), b"")

dct = OrderedDict([(b"Content-type", 1), (b"Accept", [b"gzip"])])
self.assertEqual(headers_dict_to_raw(dct), b"Accept: gzip")
9 changes: 6 additions & 3 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ def test_add_or_replace_parameters(self):
def test_add_or_replace_parameters_does_not_change_input_param(self):
url = "http://domain/test?arg=original"
input_param = {"arg": "value"}
new_url = add_or_replace_parameters(url, input_param) # noqa
add_or_replace_parameters(url, input_param) # noqa
self.assertEqual(input_param, {"arg": "value"})

def test_url_query_cleaner(self):
Expand Down Expand Up @@ -817,15 +817,18 @@ def test_non_ascii_percent_encoding_in_paths(self):
self.assertEqual(
canonicalize_url("http://www.example.com/a do?a=1"),
"http://www.example.com/a%20do?a=1",
),
)

self.assertEqual(
canonicalize_url("http://www.example.com/a %20do?a=1"),
"http://www.example.com/a%20%20do?a=1",
),
)

self.assertEqual(
canonicalize_url("http://www.example.com/a do£.html?a=1"),
"http://www.example.com/a%20do%C2%A3.html?a=1",
)

self.assertEqual(
canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
"http://www.example.com/a%20do%C2%A3.html?a=1",
Expand Down
4 changes: 2 additions & 2 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_deprecation(self):
class ToBytesTestCase(TestCase):
def test_type_error(self):
with raises(TypeError):
to_bytes(True)
to_bytes(True) # type: ignore


class ToNativeStrTestCase(TestCase):
Expand All @@ -32,7 +32,7 @@ def test_deprecation(self):
class ToUnicodeTestCase(TestCase):
def test_type_error(self):
with raises(TypeError):
to_unicode(True)
to_unicode(True) # type: ignore


class UnicodeToStrTestCase(TestCase):
Expand Down
9 changes: 9 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@ deps =
commands =
bandit -r -c .bandit.yml {posargs:w3lib}

[testenv:typing]
basepython = python3
deps =
# mypy errors out if pytest (or its sub-dependencies) cannot be found
pytest
mypy==0.910
commands =
mypy --show-error-codes {posargs: w3lib tests}

[testenv:flake8]
basepython = python3
deps =
Expand Down
5 changes: 5 additions & 0 deletions w3lib/_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from typing import Union

# Type aliases shared across w3lib modules.

# Functions accepting either text or raw bytes.
StrOrBytes = Union[bytes, str]

# Either flavour of Unicode codec error; the common base class
# UnicodeError does not expose the start/end attributes, so a
# union of the concrete subclasses is used instead.
AnyUnicodeError = Union[UnicodeDecodeError, UnicodeEncodeError]
36 changes: 26 additions & 10 deletions w3lib/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
"""
import re, codecs, encodings
from sys import version_info
from typing import Callable, Match, Optional, Tuple, Union, cast
from w3lib._types import AnyUnicodeError, StrOrBytes
from w3lib.util import to_native_str

_HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I)


def http_content_type_encoding(content_type):
def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
"""Extract the encoding in the content-type header

>>> import w3lib.encoding
Expand All @@ -21,6 +24,8 @@ def http_content_type_encoding(content_type):
if match:
return resolve_encoding(match.group(1))

return None


# regexp for parsing HTTP meta tags
_TEMPLATE = r"""%s\s*=\s*["']?\s*%s\s*["']?"""
Expand Down Expand Up @@ -51,7 +56,7 @@ def http_content_type_encoding(content_type):
)


def html_body_declared_encoding(html_body_str):
def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
'''Return the encoding specified in meta tags in the html body,
or ``None`` if no suitable encoding was found

Expand All @@ -75,6 +80,7 @@ def html_body_declared_encoding(html_body_str):

# html5 suggests the first 1024 bytes are sufficient, we allow for more
chunk = html_body_str[:4096]
match: Union[Optional[Match[bytes]], Optional[Match[str]]]
if isinstance(chunk, bytes):
match = _BODY_ENCODING_BYTES_RE.search(chunk)
else:
Expand All @@ -87,7 +93,9 @@ def html_body_declared_encoding(html_body_str):
or match.group("xmlcharset")
)
if encoding:
return resolve_encoding(encoding)
return resolve_encoding(to_native_str(encoding))

return None


# Default encoding translation
Expand Down Expand Up @@ -118,7 +126,7 @@ def html_body_declared_encoding(html_body_str):
}


def _c18n_encoding(encoding):
def _c18n_encoding(encoding: str) -> str:
"""Canonicalize an encoding name

This performs normalization and translates aliases using python's
Expand All @@ -128,7 +136,7 @@ def _c18n_encoding(encoding):
return encodings.aliases.aliases.get(normed, normed)


def resolve_encoding(encoding_alias):
def resolve_encoding(encoding_alias: str) -> Optional[str]:
"""Return the encoding that `encoding_alias` maps to, or ``None``
if the encoding cannot be interpreted

Expand Down Expand Up @@ -158,7 +166,7 @@ def resolve_encoding(encoding_alias):
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)


def read_bom(data):
def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]:
r"""Read the byte order mark in the text, if present, and
return the encoding represented by the BOM and the BOM.

Expand Down Expand Up @@ -189,10 +197,12 @@ def read_bom(data):

# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
codecs.register_error("w3lib_replace", lambda exc: ("\ufffd", exc.end))
codecs.register_error(
"w3lib_replace", lambda exc: ("\ufffd", cast(AnyUnicodeError, exc).end)
)


def to_unicode(data_str, encoding):
def to_unicode(data_str: bytes, encoding: str) -> str:
"""Convert a str object to unicode using the encoding given

Characters that cannot be converted will be converted to ``\\ufffd`` (the
Expand All @@ -204,8 +214,11 @@ def to_unicode(data_str, encoding):


def html_to_unicode(
content_type_header, html_body_str, default_encoding="utf8", auto_detect_fun=None
):
content_type_header: Optional[str],
html_body_str: bytes,
default_encoding: str = "utf8",
auto_detect_fun: Optional[Callable[[bytes], str]] = None,
) -> Tuple[str, str]:
r'''Convert raw html bytes to unicode

This attempts to make a reasonable guess at the content encoding of the
Expand Down Expand Up @@ -273,17 +286,20 @@ def html_to_unicode(
if enc is not None:
# remove BOM if it agrees with the encoding
if enc == bom_enc:
bom = cast(bytes, bom)
html_body_str = html_body_str[len(bom) :]
elif enc == "utf-16" or enc == "utf-32":
# read endianness from BOM, or default to big endian
# tools.ietf.org/html/rfc2781 section 4.3
if bom_enc is not None and bom_enc.startswith(enc):
enc = bom_enc
bom = cast(bytes, bom)
html_body_str = html_body_str[len(bom) :]
else:
enc += "-be"
return enc, to_unicode(html_body_str, enc)
if bom_enc is not None:
bom = cast(bytes, bom)
return bom_enc, to_unicode(html_body_str[len(bom) :], bom_enc)
enc = html_body_declared_encoding(html_body_str)
if enc is None and (auto_detect_fun is not None):
Expand Down
Loading