From 876dc2e99c00a048b4fd212db9021994792ed3fb Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Wed, 9 Jun 2021 13:32:02 +0800 Subject: [PATCH 01/17] setup mypy --- .github/workflows/build.yml | 3 +++ .gitignore | 1 + tox.ini | 7 +++++++ 3 files changed, 11 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2425ab35..0d607cf1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -30,6 +30,9 @@ jobs: - python-version: 3.9 env: TOXENV: black + - python-version: 3.9 + env: + TOXENV: typing steps: - uses: actions/checkout@v2 diff --git a/.gitignore b/.gitignore index a279e1af..9f81e120 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ _trial_temp .coverage coverage.xml .cache +.mypy_cache/ diff --git a/tox.ini b/tox.ini index 29e9145f..a954c1eb 100644 --- a/tox.ini +++ b/tox.ini @@ -22,6 +22,13 @@ deps = commands = bandit -r -c .bandit.yml {posargs:w3lib} +[testenv:typing] +basepython = python3 +deps = + mypy==0.901 +commands = + mypy --show-error-codes {posargs: w3lib tests} + [testenv:flake8] basepython = python3 deps = From 1357f4d3dd0d60bb88f886dcca73c6fa8d6c1c0c Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Wed, 16 Jun 2021 19:37:31 +0800 Subject: [PATCH 02/17] start to add typing --- .gitignore | 2 ++ mypy.ini | 12 ++++++++++++ run-mypy.sh | 5 +++++ w3lib/_types.py | 5 +++++ w3lib/encoding.py | 40 ++++++++++++++++++++++++---------------- w3lib/http.py | 24 +++++++++++++----------- 6 files changed, 61 insertions(+), 27 deletions(-) create mode 100644 mypy.ini create mode 100755 run-mypy.sh create mode 100644 w3lib/_types.py diff --git a/.gitignore b/.gitignore index 9f81e120..bccc4a7b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ _trial_temp coverage.xml .cache .mypy_cache/ +/index.txt +.dmypy.json diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..d4c7c859 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,12 @@ +[mypy] +exclude = .*flycheck_.* +show_error_codes = True +check_untyped_defs = True + +[mypy-w3lib.*] +# All non-tests functions must be typed. +disallow_untyped_defs = True + +[mypy-tests.*] +# Allow test functions to be untyped +disallow_untyped_defs = False diff --git a/run-mypy.sh b/run-mypy.sh new file mode 100755 index 00000000..ea3b1332 --- /dev/null +++ b/run-mypy.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +mypy --txt-report . 
w3lib tests diff --git a/w3lib/_types.py b/w3lib/_types.py new file mode 100644 index 00000000..84499a6c --- /dev/null +++ b/w3lib/_types.py @@ -0,0 +1,5 @@ +from typing import Union + +# the base class UnicodeError doesn't have attributes like start / end +AnyUnicodeError = Union[UnicodeEncodeError, UnicodeDecodeError] +StrOrBytes = Union[str, bytes] diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 1a231155..68357d12 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -3,11 +3,14 @@ """ import re, codecs, encodings from sys import version_info +from typing import Callable, Match, Optional, Tuple, Union, cast +from w3lib._types import AnyUnicodeError, StrOrBytes +from w3lib.util import to_native_str _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I) -def http_content_type_encoding(content_type): +def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]: """Extract the encoding in the content-type header >>> import w3lib.encoding @@ -21,6 +24,7 @@ def http_content_type_encoding(content_type): if match: return resolve_encoding(match.group(1)) + return None # regexp for parsing HTTP meta tags _TEMPLATE = r"""%s\s*=\s*["']?\s*%s\s*["']?""" @@ -51,7 +55,7 @@ def http_content_type_encoding(content_type): ) -def html_body_declared_encoding(html_body_str): +def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]: '''Return the encoding specified in meta tags in the html body, or ``None`` if no suitable encoding was found @@ -75,6 +79,7 @@ def html_body_declared_encoding(html_body_str): # html5 suggests the first 1024 bytes are sufficient, we allow for more chunk = html_body_str[:4096] + match: Union[Optional[Match[bytes]], Optional[Match[str]]] if isinstance(chunk, bytes): match = _BODY_ENCODING_BYTES_RE.search(chunk) else: @@ -87,7 +92,9 @@ def html_body_declared_encoding(html_body_str): or match.group("xmlcharset") ) if encoding: - return resolve_encoding(encoding) + return resolve_encoding(to_native_str(encoding)) + + return None # Default encoding translation @@ -117,8 +124,7 @@ def html_body_declared_encoding(html_body_str): "zh_cn": "gb18030", } - -def _c18n_encoding(encoding): +def _c18n_encoding(encoding: str) -> str: """Canonicalize an encoding name This performs normalization and translates aliases using python's @@ -128,7 +134,7 @@ def _c18n_encoding(encoding): return encodings.aliases.aliases.get(normed, normed) -def resolve_encoding(encoding_alias): +def resolve_encoding(encoding_alias: str) -> Optional[str]: """Return the encoding that `encoding_alias` maps to, or ``None`` if the encoding cannot be interpreted @@ -158,7 +164,7 @@ def resolve_encoding(encoding_alias): _FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE) -def read_bom(data): +def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]: r"""Read the byte order mark in the text, if present, and return the encoding represented by the BOM and the BOM. @@ -189,10 +195,10 @@ def read_bom(data): # Python decoder doesn't follow unicode standard when handling # bad utf-8 encoded strings. 
see http://bugs.python.org/issue8271 -codecs.register_error("w3lib_replace", lambda exc: ("\ufffd", exc.end)) +codecs.register_error('w3lib_replace', lambda exc: ('\ufffd', cast(AnyUnicodeError, exc).end)) -def to_unicode(data_str, encoding): +def to_unicode(data_str: bytes, encoding: str) -> str: """Convert a str object to unicode using the encoding given Characters that cannot be converted will be converted to ``\\ufffd`` (the @@ -203,9 +209,8 @@ def to_unicode(data_str, encoding): ) -def html_to_unicode( - content_type_header, html_body_str, default_encoding="utf8", auto_detect_fun=None -): +def html_to_unicode(content_type_header: Optional[str], html_body_str: bytes, + default_encoding: str = 'utf8', auto_detect_fun: Optional[Callable[[bytes], str]] = None) -> Tuple[str, str]: r'''Convert raw html bytes to unicode This attempts to make a reasonable guess at the content encoding of the @@ -273,18 +278,21 @@ def html_to_unicode( if enc is not None: # remove BOM if it agrees with the encoding if enc == bom_enc: - html_body_str = html_body_str[len(bom) :] - elif enc == "utf-16" or enc == "utf-32": + bom = cast(bytes, bom) + html_body_str = html_body_str[len(bom):] + elif enc == 'utf-16' or enc == 'utf-32': # read endianness from BOM, or default to big endian # tools.ietf.org/html/rfc2781 section 4.3 if bom_enc is not None and bom_enc.startswith(enc): enc = bom_enc - html_body_str = html_body_str[len(bom) :] + bom = cast(bytes, bom) + html_body_str = html_body_str[len(bom):] else: enc += "-be" return enc, to_unicode(html_body_str, enc) if bom_enc is not None: - return bom_enc, to_unicode(html_body_str[len(bom) :], bom_enc) + bom = cast(bytes, bom) + return bom_enc, to_unicode(html_body_str[len(bom):], bom_enc) enc = html_body_declared_encoding(html_body_str) if enc is None and (auto_detect_fun is not None): enc = auto_detect_fun(html_body_str) diff --git a/w3lib/http.py b/w3lib/http.py index f3793922..17ab1b65 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,7 +1,11 @@ from base64 import urlsafe_b64encode +from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping +from w3lib.util import to_bytes +HeadersDictInput = Mapping[bytes, Union[Any, Sequence]] +HeadersDictOutput = MutableMapping[bytes, List[bytes]] -def headers_raw_to_dict(headers_raw): +def headers_raw_to_dict(headers_raw: Optional[bytes]) -> Optional[HeadersDictOutput]: r""" Convert raw headers (single multi-line bytestring) to a dictionary. 
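A note on the `w3lib_replace` handler registered in encoding.py above: typeshed types a codecs error handler's argument as the base UnicodeError, which declares none of the `.object`/`.start`/`.end` attributes, hence the `cast` to the `AnyUnicodeError` union from w3lib/_types.py. A minimal standalone sketch of the same pattern (the handler name `demo_replace` is invented for illustration):

    import codecs
    from typing import Tuple, Union, cast

    AnyUnicodeError = Union[UnicodeEncodeError, UnicodeDecodeError]

    def demo_replace(exc: UnicodeError) -> Tuple[str, int]:
        # the base UnicodeError has no .end in the stubs, hence the cast
        exc = cast(AnyUnicodeError, exc)
        return ("\ufffd", exc.end)

    codecs.register_error("demo_replace", demo_replace)
    print(b"\xffabc".decode("utf-8", errors="demo_replace"))  # '\ufffdabc'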
@@ -30,7 +34,7 @@ def headers_raw_to_dict(headers_raw): headers = headers_raw.splitlines() headers_tuples = [header.split(b":", 1) for header in headers] - result_dict = {} + result_dict: HeadersDictOutput = {} for header_item in headers_tuples: if not len(header_item) == 2: continue @@ -46,7 +50,7 @@ def headers_raw_to_dict(headers_raw): return result_dict -def headers_dict_to_raw(headers_dict): +def headers_dict_to_raw(headers_dict: Optional[HeadersDictInput]) -> Optional[bytes]: r""" Returns a raw HTTP headers representation of headers @@ -78,7 +82,7 @@ def headers_dict_to_raw(headers_dict): return b"\r\n".join(raw_lines) -def basic_auth_header(username, password, encoding="ISO-8859-1"): +def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8859-1') -> bytes: """ Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_ @@ -90,10 +94,8 @@ def basic_auth_header(username, password, encoding="ISO-8859-1"): """ - auth = "%s:%s" % (username, password) - if not isinstance(auth, bytes): - # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 - # seems to be the most widely used encoding here. See also: - # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html - auth = auth.encode(encoding) - return b"Basic " + urlsafe_b64encode(auth) + auth = "%r:%r" % (username, password) + # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 + # seems to be the most widely used encoding here. See also: + # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html + return b'Basic ' + urlsafe_b64encode(to_bytes(auth, encoding=encoding)) From 83e79ac868889679bf7c2f47c38e5b470f7031ee Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Tue, 22 Jun 2021 20:25:33 +0800 Subject: [PATCH 03/17] add more type hints --- w3lib/html.py | 41 +++++++++++-------------- w3lib/url.py | 85 ++++++++++++++++++++++++++------------------------- w3lib/util.py | 20 +++++------- 3 files changed, 68 insertions(+), 78 deletions(-) diff --git a/w3lib/html.py b/w3lib/html.py index 2bea60c9..347cb3aa 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -4,29 +4,22 @@ import re from html.entities import name2codepoint +from typing import Match, Sequence, AnyStr from urllib.parse import urljoin from w3lib.util import to_unicode from w3lib.url import safe_url_string -_ent_re = re.compile( - r"&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)", - re.IGNORECASE, -) -_tag_re = re.compile(r"<[a-zA-Z\/!].*?>", re.DOTALL) -_baseurl_re = re.compile(r"]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']", re.I) -_meta_refresh_re = re.compile( - r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)', - re.DOTALL | re.IGNORECASE, -) -_cdata_re = re.compile( - r"((?P.*?)(?P\]\]>))", re.DOTALL -) +_ent_re = re.compile(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)', re.IGNORECASE) +_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL) +_baseurl_re = re.compile(r']*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I) +_meta_refresh_re = re.compile(r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)', re.DOTALL | re.IGNORECASE) +_cdata_re = re.compile(r'((?P.*?)(?P\]\]>))', re.DOTALL) -HTML5_WHITESPACE = " \t\n\r\x0c" +HTML5_WHITESPACE = ' \t\n\r\x0c' -def replace_entities(text, keep=(), remove_illegal=True, encoding="utf-8"): +def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str ='utf-8'): """Remove entities from the 
given `text` by converting them to their corresponding unicode character. @@ -54,14 +47,15 @@ def replace_entities(text, keep=(), remove_illegal=True, encoding="utf-8"): """ - def convert_entity(m): + def convert_entity(m: Match): groups = m.groupdict() - if groups.get("dec"): - number = int(groups["dec"], 10) - elif groups.get("hex"): - number = int(groups["hex"], 16) - elif groups.get("named"): - entity_name = groups["named"] + number = None + if groups.get('dec'): + number = int(groups['dec'], 10) + elif groups.get('hex'): + number = int(groups['hex'], 16) + elif groups.get('named'): + entity_name = groups['named'] if entity_name.lower() in keep: return m.group(0) else: @@ -85,8 +79,7 @@ def convert_entity(m): return _ent_re.sub(convert_entity, to_unicode(text, encoding)) - -def has_entities(text, encoding=None): +def has_entities(text: AnyStr, encoding=None): return bool(_ent_re.search(to_unicode(text, encoding))) diff --git a/w3lib/url.py b/w3lib/url.py index 9a39c98f..77b2fcec 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -9,8 +9,8 @@ import re import string from collections import namedtuple +from typing import Callable, Optional, Sequence, Tuple, Union, cast, Dict from urllib.parse import ( - _coerce_args, parse_qs, parse_qsl, ParseResult, @@ -23,13 +23,16 @@ urlunparse, urlunsplit, ) +from urllib.parse import _coerce_args # type: ignore from urllib.request import pathname2url, url2pathname -from w3lib.util import to_unicode +from w3lib.util import to_bytes, to_native_str, to_unicode +from w3lib._types import AnyUnicodeError, StrOrBytes # error handling function for bytes-to-Unicode decoding errors with URLs -def _quote_byte(error): - return (quote(error.object[error.start : error.end]), error.end) +def _quote_byte(error: UnicodeError) -> Tuple[str, int]: + error = cast(AnyUnicodeError, error) + return (to_unicode(quote(error.object[error.start:error.end])), error.end) codecs.register_error("percentencode", _quote_byte) @@ -49,7 +52,7 @@ def _quote_byte(error): ) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline -def safe_url_string(url, encoding="utf8", path_encoding="utf8", quote_path=True): +def safe_url_string(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8', quote_path: bool = True) -> str: """Convert the given URL into a legal URL by escaping unsafe characters according to RFC-3986. Also, ASCII tabs and newlines are removed as per https://url.spec.whatwg.org/#url-parsing. @@ -83,7 +86,7 @@ def safe_url_string(url, encoding="utf8", path_encoding="utf8", quote_path=True) try: netloc = parts.netloc.encode("idna").decode() except UnicodeError: - netloc = parts.netloc + netloc = parts.netloc.encode('utf-8') # default encoding for path component SHOULD be UTF-8 if quote_path: @@ -105,8 +108,8 @@ def safe_url_string(url, encoding="utf8", path_encoding="utf8", quote_path=True) _parent_dirs = re.compile(r"/?(\.\./)+") -def safe_download_url(url, encoding="utf8", path_encoding="utf8"): - """Make a url for download. This will call safe_url_string +def safe_download_url(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8') -> str: + """ Make a url for download. This will call safe_url_string and then strip the fragment, if one exists. The path will be normalised. 
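On `StrOrBytes` versus `AnyStr`, both of which appear in these patches: `StrOrBytes = Union[str, bytes]` accepts either type independently of the return type, while `AnyStr` is a constrained TypeVar that couples the types it annotates. That is why functions that always return `str` (such as `safe_url_string`) take `StrOrBytes`, while `basic_auth_header` uses `AnyStr` to force its two text arguments to the same type. A sketch of the difference; `to_str` and `echo` are illustrative toys, not w3lib functions:

    from typing import AnyStr, Union

    StrOrBytes = Union[str, bytes]

    def to_str(text: StrOrBytes, encoding: str = "utf-8") -> str:
        # either input type is accepted; the result is always str
        return text.decode(encoding) if isinstance(text, bytes) else text

    def echo(text: AnyStr) -> AnyStr:
        # AnyStr couples output to input: bytes in, bytes out
        return text

    assert to_str(b"caf\xc3\xa9") == to_str("café") == "café"
    assert echo(b"x") == b"x" and echo("x") == "x"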
@@ -124,11 +127,11 @@ def safe_download_url(url, encoding="utf8", path_encoding="utf8"): return urlunsplit((scheme, netloc, path, query, "")) -def is_url(text): - return text.partition("://")[0] in ("file", "http", "https") +def is_url(text: str) -> bool: + return text.partition("://")[0] in ('file', 'http', 'https') -def url_query_parameter(url, parameter, default=None, keep_blank_values=0): +def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int]=0) -> Optional[str]: """Return the value of a url parameter, given the url and parameter name General case: @@ -157,19 +160,17 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0): """ - queryparams = parse_qs(urlsplit(str(url))[3], keep_blank_values=keep_blank_values) - return queryparams.get(parameter, [default])[0] + queryparams = parse_qs( + urlsplit(str(url))[3], + keep_blank_values=bool(keep_blank_values) + ) + if parameter in queryparams: + return queryparams[parameter][0] + else: + return default -def url_query_cleaner( - url, - parameterlist=(), - sep="&", - kvsep="=", - remove=False, - unique=True, - keep_fragments=False, -): +def url_query_cleaner(url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str: """Clean URL arguments leaving only those passed in the parameterlist keeping order >>> import w3lib.url @@ -204,7 +205,9 @@ def url_query_cleaner( if isinstance(parameterlist, (str, bytes)): parameterlist = [parameterlist] url, fragment = urldefrag(url) - base, _, query = url.partition("?") + url = cast(str, url) + fragment = cast(str, fragment) + base, _, query = url.partition('?') seen = set() querylist = [] for ksv in query.split(sep): @@ -222,11 +225,10 @@ def url_query_cleaner( seen.add(k) url = "?".join([base, sep.join(querylist)]) if querylist else base if keep_fragments: - url += "#" + fragment - return url + url += '#' + fragment + return cast(str, url) - -def _add_or_replace_parameters(url, params): +def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str: parsed = urlsplit(url) current_args = parse_qsl(parsed.query, keep_blank_values=True) @@ -248,7 +250,7 @@ def _add_or_replace_parameters(url, params): return urlunsplit(parsed._replace(query=query)) -def add_or_replace_parameter(url, name, new_value): +def add_or_replace_parameter(url: str, name: str, new_value: str) -> str: """Add or remove a parameter to a given url >>> import w3lib.url @@ -264,7 +266,7 @@ def add_or_replace_parameter(url, name, new_value): return _add_or_replace_parameters(url, {name: new_value}) -def add_or_replace_parameters(url, new_parameters): +def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str: """Add or remove a parameters to a given url >>> import w3lib.url @@ -279,7 +281,7 @@ def add_or_replace_parameters(url, new_parameters): return _add_or_replace_parameters(url, new_parameters) -def path_to_file_uri(path): +def path_to_file_uri(path: str) -> str: """Convert local filesystem path to legal File URIs as described in: http://en.wikipedia.org/wiki/File_URI_scheme """ @@ -289,7 +291,7 @@ def path_to_file_uri(path): return "file:///%s" % x.lstrip("/") -def file_uri_to_path(uri): +def file_uri_to_path(uri: str) -> str: """Convert File URI to local filesystem path according to: http://en.wikipedia.org/wiki/File_URI_scheme """ @@ -297,7 +299,7 @@ def file_uri_to_path(uri): return 
url2pathname(uri_path) -def any_to_uri(uri_or_path): +def any_to_uri(uri_or_path: str) -> str: """If given a path name, return its File URI, otherwise return it unmodified """ @@ -342,12 +344,11 @@ def any_to_uri(uri_or_path): ).encode() ) -_ParseDataURIResult = namedtuple( - "ParseDataURIResult", "media_type media_type_parameters data" -) +_ParseDataURIResult = namedtuple("_ParseDataURIResult", + "media_type media_type_parameters data") -def parse_data_uri(uri): +def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: """ Parse a data: URI, returning a 3-tuple of media type, dictionary of media @@ -389,7 +390,7 @@ def parse_data_uri(uri): if m: attribute, value, value_quoted = m.groups() if value_quoted: - value = re.sub(br"\\(.)", r"\1", value_quoted) + value = re.sub(br'\\(.)', rb'\1', value_quoted) media_type_params[attribute.decode()] = value.decode() uri = uri[m.end() :] else: @@ -477,9 +478,8 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, encoding # if not for proper URL expected by remote website. try: scheme, netloc, path, params, query, fragment = _safe_ParseResult( - parse_url(url), encoding=encoding or "utf8" - ) - except UnicodeEncodeError as e: + parse_url(url), encoding=encoding or 'utf8') + except UnicodeEncodeError: scheme, netloc, path, params, query, fragment = _safe_ParseResult( parse_url(url), encoding="utf8" ) @@ -570,8 +570,9 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False): # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a) # except for the unquote(s, encoding, errors) calls replaced # with unquote_to_bytes(s) - qs, _coerce_result = _coerce_args(qs) - pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")] + coerce_args = cast(Callable[..., Tuple[str, Callable]], _coerce_args) + qs, _coerce_result = coerce_args(qs) + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] r = [] for name_value in pairs: if not name_value: diff --git a/w3lib/util.py b/w3lib/util.py index db8e16e8..4258b76b 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -1,13 +1,9 @@ from warnings import warn +from typing import Optional +from w3lib._types import StrOrBytes -def str_to_unicode(text, encoding=None, errors="strict"): - warn( - "The w3lib.utils.str_to_unicode function is deprecated and " - "will be removed in a future release.", - DeprecationWarning, - stacklevel=2, - ) +def str_to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str ='strict') -> str: if encoding is None: encoding = "utf-8" if isinstance(text, bytes): @@ -15,7 +11,7 @@ def str_to_unicode(text, encoding=None, errors="strict"): return text -def unicode_to_str(text, encoding=None, errors="strict"): +def unicode_to_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes: warn( "The w3lib.utils.unicode_to_str function is deprecated and " "will be removed in a future release.", @@ -29,7 +25,7 @@ def unicode_to_str(text, encoding=None, errors="strict"): return text -def to_unicode(text, encoding=None, errors="strict"): +def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str: """Return the unicode representation of a bytes object `text`. 
If `text` is already an unicode object, return it as-is.""" if isinstance(text, str): @@ -43,7 +39,7 @@ def to_unicode(text, encoding=None, errors="strict"): return text.decode(encoding, errors) -def to_bytes(text, encoding=None, errors="strict"): +def to_bytes(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes: """Return the binary representation of `text`. If `text` is already a bytes object, return it as-is.""" if isinstance(text, bytes): @@ -57,8 +53,8 @@ def to_bytes(text, encoding=None, errors="strict"): return text.encode(encoding, errors) -def to_native_str(text, encoding=None, errors="strict"): - """Return str representation of `text`""" +def to_native_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str: + """ Return str representation of `text` """ warn( "The w3lib.utils.to_native_str function is deprecated and " "will be removed in a future release. Please use " From c68b0662b13bc3bd62b9e281e3c66f6b7ee7ca44 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Fri, 2 Jul 2021 11:44:16 +0800 Subject: [PATCH 04/17] more type hints --- tests/test_encoding.py | 8 +++++--- tests/test_http.py | 18 ++++++++++++------ tests/test_url.py | 35 +++++++++++++++-------------------- 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index b9e78922..3be7d796 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -38,14 +38,16 @@ def test_bom(self): utf32le = b"\xff\xfe\x00\x00\x34\x6c\x00\x00" for string in (utf16be, utf16le, utf32be, utf32le): bom_encoding, bom = read_bom(string) - decoded = string[len(bom) :].decode(bom_encoding) + assert bom_encoding is not None + assert bom is not None + decoded = string[len(bom):].decode(bom_encoding) self.assertEqual(water_unicode, decoded) # Body without BOM - enc, bom = read_bom("foo") + enc, bom = read_bom(b"foo") self.assertEqual(enc, None) self.assertEqual(bom, None) # Empty body - enc, bom = read_bom("") + enc, bom = read_bom(b"") self.assertEqual(enc, None) self.assertEqual(bom, None) diff --git a/tests/test_http.py b/tests/test_http.py index 127f4de9..fc59ae11 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -1,6 +1,7 @@ import unittest from collections import OrderedDict -from w3lib.http import basic_auth_header, headers_dict_to_raw, headers_raw_to_dict +from w3lib.http import (HeadersDictInput, basic_auth_header, + headers_dict_to_raw, headers_raw_to_dict) __doctests__ = ["w3lib.http"] # for trial support @@ -47,7 +48,10 @@ def test_headers_dict_to_raw(self): ) def test_headers_dict_to_raw_listtuple(self): - dct = OrderedDict([(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])]) + dct: HeadersDictInput = OrderedDict([ + (b'Content-type', [b'text/html']), + (b'Accept', [b'gzip']) + ]) self.assertEqual( headers_dict_to_raw(dct), b"Content-type: text/html\r\nAccept: gzip" ) @@ -70,10 +74,12 @@ def test_headers_dict_to_raw_listtuple(self): ) def test_headers_dict_to_raw_wrong_values(self): - dct = OrderedDict( - [ - (b"Content-type", 0), - ] + dct: HeadersDictInput = OrderedDict([ + (b'Content-type', 0), + ]) + self.assertEqual( + headers_dict_to_raw(dct), + b'' ) self.assertEqual(headers_dict_to_raw(dct), b"") diff --git a/tests/test_url.py b/tests/test_url.py index edd816c6..1b02d5be 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -506,10 +506,10 @@ def test_add_or_replace_parameters(self): ) def test_add_or_replace_parameters_does_not_change_input_param(self): - url = 
"http://domain/test?arg=original" - input_param = {"arg": "value"} - new_url = add_or_replace_parameters(url, input_param) # noqa - self.assertEqual(input_param, {"arg": "value"}) + url = 'http://domain/test?arg=original' + input_param = {'arg': 'value'} + add_or_replace_parameters(url, input_param) # noqa + self.assertEqual(input_param, {'arg': 'value'}) def test_url_query_cleaner(self): self.assertEqual("product.html", url_query_cleaner("product.html?")) @@ -814,22 +814,17 @@ def test_normalize_percent_encoding_in_query_arguments(self): ) def test_non_ascii_percent_encoding_in_paths(self): - self.assertEqual( - canonicalize_url("http://www.example.com/a do?a=1"), - "http://www.example.com/a%20do?a=1", - ), - self.assertEqual( - canonicalize_url("http://www.example.com/a %20do?a=1"), - "http://www.example.com/a%20%20do?a=1", - ), - self.assertEqual( - canonicalize_url("http://www.example.com/a do£.html?a=1"), - "http://www.example.com/a%20do%C2%A3.html?a=1", - ) - self.assertEqual( - canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"), - "http://www.example.com/a%20do%C2%A3.html?a=1", - ) + self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"), + "http://www.example.com/a%20do?a=1") + + self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"), + "http://www.example.com/a%20%20do?a=1") + + self.assertEqual(canonicalize_url("http://www.example.com/a do£.html?a=1"), + "http://www.example.com/a%20do%C2%A3.html?a=1") + + self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"), + "http://www.example.com/a%20do%C2%A3.html?a=1") def test_non_ascii_percent_encoding_in_query_arguments(self): self.assertEqual( From 8cd9cf137dbc6d465ff3c890f7172276c355d93c Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Fri, 2 Jul 2021 12:14:23 +0800 Subject: [PATCH 05/17] flake8 --- w3lib/encoding.py | 2 +- w3lib/html.py | 2 +- w3lib/http.py | 1 + w3lib/url.py | 9 +++++---- w3lib/util.py | 21 +++++++++++++++------ 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 68357d12..a9ffef99 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -3,7 +3,7 @@ """ import re, codecs, encodings from sys import version_info -from typing import Callable, Match, Optional, Tuple, Union, cast +from typing import Callable, Match, Optional, Tuple, Union, cast from w3lib._types import AnyUnicodeError, StrOrBytes from w3lib.util import to_native_str diff --git a/w3lib/html.py b/w3lib/html.py index 347cb3aa..395e9d21 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -19,7 +19,7 @@ HTML5_WHITESPACE = ' \t\n\r\x0c' -def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str ='utf-8'): +def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8'): """Remove entities from the given `text` by converting them to their corresponding unicode character. 
diff --git a/w3lib/http.py b/w3lib/http.py index 17ab1b65..4d86ce51 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -5,6 +5,7 @@ HeadersDictInput = Mapping[bytes, Union[Any, Sequence]] HeadersDictOutput = MutableMapping[bytes, List[bytes]] + def headers_raw_to_dict(headers_raw: Optional[bytes]) -> Optional[HeadersDictOutput]: r""" Convert raw headers (single multi-line bytestring) diff --git a/w3lib/url.py b/w3lib/url.py index 77b2fcec..1b97647f 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -52,7 +52,7 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]: ) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline -def safe_url_string(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8', quote_path: bool = True) -> str: +def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str = 'utf8', quote_path: bool = True) -> str: """Convert the given URL into a legal URL by escaping unsafe characters according to RFC-3986. Also, ASCII tabs and newlines are removed as per https://url.spec.whatwg.org/#url-parsing. @@ -108,7 +108,7 @@ def safe_url_string(url: StrOrBytes, encoding: str ='utf8', path_encoding: str = _parent_dirs = re.compile(r"/?(\.\./)+") -def safe_download_url(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8') -> str: +def safe_download_url(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str = 'utf8') -> str: """ Make a url for download. This will call safe_url_string and then strip the fragment, if one exists. The path will be normalised. @@ -131,7 +131,7 @@ def is_url(text: str) -> bool: return text.partition("://")[0] in ('file', 'http', 'https') -def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int]=0) -> Optional[str]: +def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int] = 0) -> Optional[str]: """Return the value of a url parameter, given the url and parameter name General case: @@ -170,7 +170,8 @@ def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] return default -def url_query_cleaner(url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str: +def url_query_cleaner( + url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str: """Clean URL arguments leaving only those passed in the parameterlist keeping order >>> import w3lib.url diff --git a/w3lib/util.py b/w3lib/util.py index 4258b76b..315d968a 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -3,15 +3,18 @@ from w3lib._types import StrOrBytes -def str_to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str ='strict') -> str: +def str_to_unicode( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> str: if encoding is None: encoding = "utf-8" if isinstance(text, bytes): return text.decode(encoding, errors) return text - -def unicode_to_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes: +def unicode_to_str( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> bytes: warn( "The w3lib.utils.unicode_to_str function is deprecated and " "will be removed in a future release.", @@ -25,7 +28,9 @@ def unicode_to_str(text: 
StrOrBytes, encoding: Optional[str] = None, errors: str return text -def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str: +def to_unicode( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> str: """Return the unicode representation of a bytes object `text`. If `text` is already an unicode object, return it as-is.""" if isinstance(text, str): @@ -39,7 +44,9 @@ def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = ' return text.decode(encoding, errors) -def to_bytes(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes: +def to_bytes( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> bytes: """Return the binary representation of `text`. If `text` is already a bytes object, return it as-is.""" if isinstance(text, bytes): @@ -53,7 +60,9 @@ def to_bytes(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'st return text.encode(encoding, errors) -def to_native_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str: +def to_native_str( + text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' +) -> str: """ Return str representation of `text` """ warn( "The w3lib.utils.to_native_str function is deprecated and " From 43764623aa4e2c2fe1bb60b564409a186b3ca703 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Mon, 2 Aug 2021 14:24:38 +0800 Subject: [PATCH 06/17] more type hints --- .gitignore | 1 + w3lib/html.py | 78 ++++++++++++++++++++++++--------------------------- w3lib/http.py | 4 +-- w3lib/url.py | 43 ++++++++++++++-------------- 4 files changed, 61 insertions(+), 65 deletions(-) diff --git a/.gitignore b/.gitignore index bccc4a7b..714a9be8 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ coverage.xml .mypy_cache/ /index.txt .dmypy.json +.hypothesis/ diff --git a/w3lib/html.py b/w3lib/html.py index 395e9d21..bdbf7b75 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -4,7 +4,7 @@ import re from html.entities import name2codepoint -from typing import Match, Sequence, AnyStr +from typing import Iterable, Match, AnyStr, Optional, Pattern, Tuple, Union from urllib.parse import urljoin from w3lib.util import to_unicode @@ -19,7 +19,7 @@ HTML5_WHITESPACE = ' \t\n\r\x0c' -def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8'): +def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8') -> str: """Remove entities from the given `text` by converting them to their corresponding unicode character. @@ -47,7 +47,7 @@ def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: boo """ - def convert_entity(m: Match): + def convert_entity(m: Match) -> str: groups = m.groupdict() number = None if groups.get('dec'): @@ -79,11 +79,11 @@ def convert_entity(m: Match): return _ent_re.sub(convert_entity, to_unicode(text, encoding)) -def has_entities(text: AnyStr, encoding=None): +def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool: return bool(_ent_re.search(to_unicode(text, encoding))) -def replace_tags(text, token="", encoding=None): +def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None) -> str: """Replace all markup tags found in the given `text` by the given token. By default `token` is an empty string so it just removes all tags. 
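On the switch to `keep: Iterable[str]` and `which_ones: Iterable[str]` in the html.py signatures below: taking the loosest container type the body actually needs lets callers pass tuples, lists, sets or generators, while the return type stays the precise `str`. A toy sketch of the convention; `remove_tags_like` is illustrative only and far simpler than the real `remove_tags`:

    from typing import Iterable

    def remove_tags_like(text: str, which_ones: Iterable[str] = ()) -> str:
        for tag in which_ones:
            text = text.replace("<%s>" % tag, "").replace("</%s>" % tag, "")
        return text

    # any iterable is accepted
    print(remove_tags_like("<b>hi</b>", {"b"}))                # hi
    print(remove_tags_like("<i>hi</i>", (t for t in ["i"])))   # hi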
@@ -106,11 +106,11 @@ def replace_tags(text, token="", encoding=None): return _tag_re.sub(token, to_unicode(text, encoding)) -_REMOVECOMMENTS_RE = re.compile("|$)", re.DOTALL) +_REMOVECOMMENTS_RE = re.compile('|$)', re.DOTALL) -def remove_comments(text, encoding=None): - """Remove HTML Comments. +def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str: + """ Remove HTML Comments. >>> import w3lib.html >>> w3lib.html.remove_comments(b"test whatever") @@ -119,12 +119,11 @@ def remove_comments(text, encoding=None): """ - text = to_unicode(text, encoding) - return _REMOVECOMMENTS_RE.sub("", text) + utext = to_unicode(text, encoding) + return _REMOVECOMMENTS_RE.sub('', utext) - -def remove_tags(text, which_ones=(), keep=(), encoding=None): - """Remove HTML Tags only. +def remove_tags(text: AnyStr, which_ones: Iterable[str] = (), keep: Iterable[str] = (), encoding: Optional[str] = None) -> str: + """ Remove HTML Tags only. `which_ones` and `keep` are both tuples, there are four cases: @@ -173,14 +172,14 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None): which_ones = {tag.lower() for tag in which_ones} keep = {tag.lower() for tag in keep} - def will_remove(tag): + def will_remove(tag: str) -> bool: tag = tag.lower() if which_ones: return tag in which_ones else: return tag not in keep - def remove_tag(m): + def remove_tag(m: Match) -> str: tag = m.group(1) return "" if will_remove(tag) else m.group(0) @@ -190,7 +189,7 @@ def remove_tag(m): return retags.sub(remove_tag, to_unicode(text, encoding)) -def remove_tags_with_content(text, which_ones=(), encoding=None): +def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None) -> str: """Remove tags and their content. `which_ones` is a tuple of which tags to remove including their content. @@ -204,19 +203,18 @@ def remove_tags_with_content(text, which_ones=(), encoding=None): """ - text = to_unicode(text, encoding) + utext = to_unicode(text, encoding) if which_ones: tags = "|".join( [r"<%s\b.*?|<%s\s*/>" % (tag, tag, tag) for tag in which_ones] ) retags = re.compile(tags, re.DOTALL | re.IGNORECASE) - text = retags.sub("", text) - return text + utext = retags.sub('', utext) + return utext -def replace_escape_chars( - text, which_ones=("\n", "\t", "\r"), replace_by="", encoding=None -): +def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: str = '', \ + encoding: Optional[str] = None) -> str: """Remove escape characters. `which_ones` is a tuple of which escape characters we want to remove. 
@@ -227,13 +225,13 @@ def replace_escape_chars( """ - text = to_unicode(text, encoding) + utext = to_unicode(text, encoding) for ec in which_ones: - text = text.replace(ec, to_unicode(replace_by, encoding)) - return text + utext = utext.replace(ec, to_unicode(replace_by, encoding)) + return utext -def unquote_markup(text, keep=(), remove_illegal=True, encoding=None): +def unquote_markup(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: Optional[str] = None) -> str: """ This function receives markup as a text (always a unicode string or a UTF-8 encoded string) and does the following: @@ -245,7 +243,7 @@ def unquote_markup(text, keep=(), remove_illegal=True, encoding=None): """ - def _get_fragments(txt, pattern): + def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]: offset = 0 for match in pattern.finditer(txt): match_s, match_e = match.span(1) @@ -254,9 +252,9 @@ def _get_fragments(txt, pattern): offset = match_e yield txt[offset:] - text = to_unicode(text, encoding) - ret_text = "" - for fragment in _get_fragments(text, _cdata_re): + utext = to_unicode(text, encoding) + ret_text = '' + for fragment in _get_fragments(utext, _cdata_re): if isinstance(fragment, str): # it's not a CDATA (so we try to remove its entities) ret_text += replace_entities( @@ -268,7 +266,7 @@ def _get_fragments(txt, pattern): return ret_text -def get_base_url(text, baseurl="", encoding="utf-8"): +def get_base_url(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8') -> str: """Return the base url if declared in the given HTML `text`, relative to the given base url. @@ -276,8 +274,8 @@ def get_base_url(text, baseurl="", encoding="utf-8"): """ - text = to_unicode(text, encoding) - m = _baseurl_re.search(text) + utext = to_unicode(text, encoding) + m = _baseurl_re.search(utext) if m: return urljoin( safe_url_string(baseurl), safe_url_string(m.group(1), encoding=encoding) @@ -286,9 +284,7 @@ def get_base_url(text, baseurl="", encoding="utf-8"): return safe_url_string(baseurl) -def get_meta_refresh( - text, baseurl="", encoding="utf-8", ignore_tags=("script", "noscript") -): +def get_meta_refresh(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8', ignore_tags: Iterable[str] = ('script', 'noscript')) -> Tuple[Optional[float], Optional[str]]: """Return the http-equiv parameter of the HTML meta element from the given HTML text and return a tuple ``(interval, url)`` where interval is an integer containing the delay in seconds (or zero if not present) and url is a @@ -299,13 +295,13 @@ def get_meta_refresh( """ try: - text = to_unicode(text, encoding) + utext = to_unicode(text, encoding) except UnicodeDecodeError: print(text) raise - text = remove_tags_with_content(text, ignore_tags) - text = remove_comments(replace_entities(text)) - m = _meta_refresh_re.search(text) + utext = remove_tags_with_content(utext, ignore_tags) + utext = remove_comments(replace_entities(utext)) + m = _meta_refresh_re.search(utext) if m: interval = float(m.group("int")) url = safe_url_string(m.group("url").strip(" \"'"), encoding) @@ -315,7 +311,7 @@ def get_meta_refresh( return None, None -def strip_html5_whitespace(text): +def strip_html5_whitespace(text: str) -> str: r""" Strip all leading and trailing space characters (as defined in https://www.w3.org/TR/html5/infrastructure.html#space-character). 
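On the `text` to `utext` renames running through this html.py patch: the parameters are typed `AnyStr`, so re-assigning the decoded `str` back to the same name would no longer type-check when the argument is `bytes`; giving the decoded value its own name sidesteps that. A sketch of the shape under the same AnyStr convention; `remove_comments_like` is a toy, not the real function:

    from typing import AnyStr

    def remove_comments_like(text: AnyStr) -> str:
        utext = text.decode("utf-8") if isinstance(text, bytes) else text
        # assigning the str back to `text` would clash with its AnyStr type
        return utext.replace("<!-- -->", "")

    print(remove_comments_like(b"a<!-- -->b"))  # ab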
diff --git a/w3lib/http.py b/w3lib/http.py index 4d86ce51..9b92f2ef 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,6 +1,6 @@ from base64 import urlsafe_b64encode from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping -from w3lib.util import to_bytes +from w3lib.util import to_bytes, to_native_str HeadersDictInput = Mapping[bytes, Union[Any, Sequence]] HeadersDictOutput = MutableMapping[bytes, List[bytes]] @@ -95,7 +95,7 @@ def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8 """ - auth = "%r:%r" % (username, password) + auth = "%s:%s" % (to_native_str(username), to_native_str(password)) # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used encoding here. See also: # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html diff --git a/w3lib/url.py b/w3lib/url.py index 1b97647f..889e5bc6 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -9,7 +9,7 @@ import re import string from collections import namedtuple -from typing import Callable, Optional, Sequence, Tuple, Union, cast, Dict +from typing import Callable, List, Optional, Sequence, Tuple, Union, cast, Dict from urllib.parse import ( parse_qs, parse_qsl, @@ -25,7 +25,7 @@ ) from urllib.parse import _coerce_args # type: ignore from urllib.request import pathname2url, url2pathname -from w3lib.util import to_bytes, to_native_str, to_unicode +from w3lib.util import to_unicode from w3lib._types import AnyUnicodeError, StrOrBytes @@ -84,7 +84,7 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) try: - netloc = parts.netloc.encode("idna").decode() + netloc = parts.netloc.encode('idna') except UnicodeError: netloc = parts.netloc.encode('utf-8') @@ -94,15 +94,13 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str else: path = parts.path - return urlunsplit( - ( - parts.scheme, - netloc.rstrip(":"), - path, - quote(parts.query.encode(encoding), _safe_chars), - quote(parts.fragment.encode(encoding), _safe_chars), - ) - ) + return urlunsplit(( + parts.scheme, + netloc.decode().rstrip(':'), + path, + quote(parts.query.encode(encoding), _safe_chars), + quote(parts.fragment.encode(encoding), _safe_chars), + )) _parent_dirs = re.compile(r"/?(\.\./)+") @@ -425,7 +423,7 @@ def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: ] -def _safe_ParseResult(parts, encoding="utf8", path_encoding="utf8"): +def _safe_ParseResult(parts: ParseResult, encoding: str = 'utf8', path_encoding: str = 'utf8') -> Tuple[str, str, str, str, str, str]: # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. 
http://.example.com) try: @@ -443,7 +441,8 @@ def _safe_ParseResult(parts, encoding="utf8", path_encoding="utf8"): ) -def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, encoding=None): +def canonicalize_url(url: StrOrBytes, keep_blank_values: bool = True, keep_fragments: bool = False, + encoding: Optional[str] = None) -> str: r"""Canonicalize the given url by applying the following procedures: - sort query arguments, first by key, then by value @@ -530,9 +529,9 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, encoding ) -def _unquotepath(path): - for reserved in ("2f", "2F", "3f", "3F"): - path = path.replace("%" + reserved, "%25" + reserved.upper()) +def _unquotepath(path: str) -> bytes: + for reserved in ('2f', '2F', '3f', '3F'): + path = path.replace('%' + reserved, '%25' + reserved.upper()) # standard lib's unquote() does not work for non-UTF-8 # percent-escaped characters, they get lost. @@ -542,7 +541,7 @@ def _unquotepath(path): return unquote_to_bytes(path) -def parse_url(url, encoding=None): +def parse_url(url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None) -> ParseResult: """Return urlparsed url from the given argument (which could be an already parsed url) """ @@ -551,7 +550,7 @@ def parse_url(url, encoding=None): return urlparse(to_unicode(url, encoding)) -def parse_qsl_to_bytes(qs, keep_blank_values=False): +def parse_qsl_to_bytes(qs: str, keep_blank_values: bool = False) -> List[Tuple[bytes, bytes]]: """Parse a query given as a string argument. Data are returned as a list of name, value pairs as bytes. @@ -586,11 +585,11 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False): else: continue if len(nv[1]) or keep_blank_values: - name = nv[0].replace("+", " ") + name: StrOrBytes = nv[0].replace('+', ' ') name = unquote_to_bytes(name) name = _coerce_result(name) - value = nv[1].replace("+", " ") + value: StrOrBytes = nv[1].replace('+', ' ') value = unquote_to_bytes(value) value = _coerce_result(value) - r.append((name, value)) + r.append((cast(bytes, name), cast(bytes, value))) return r From 78c82fdf29f4b079f51c0bad0924f1e56b155bbd Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Mon, 9 Aug 2021 14:23:36 +0800 Subject: [PATCH 07/17] fix mypy errors --- tests/test_util.py | 4 ++-- w3lib/html.py | 6 +++--- w3lib/url.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index 7243d175..088147c0 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -20,7 +20,7 @@ def test_deprecation(self): class ToBytesTestCase(TestCase): def test_type_error(self): with raises(TypeError): - to_bytes(True) + to_bytes(True) # type: ignore class ToNativeStrTestCase(TestCase): @@ -32,7 +32,7 @@ def test_deprecation(self): class ToUnicodeTestCase(TestCase): def test_type_error(self): with raises(TypeError): - to_unicode(True) + to_unicode(True) # type: ignore class UnicodeToStrTestCase(TestCase): diff --git a/w3lib/html.py b/w3lib/html.py index bdbf7b75..62ad2aec 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -9,6 +9,7 @@ from w3lib.util import to_unicode from w3lib.url import safe_url_string +from w3lib._types import StrOrBytes _ent_re = re.compile(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)', re.IGNORECASE) _tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL) @@ -213,7 +214,7 @@ def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encod return utext -def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: 
str = '', \ +def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: StrOrBytes = '', \ encoding: Optional[str] = None) -> str: """Remove escape characters. @@ -265,8 +266,7 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]: ret_text += fragment.group("cdata_d") return ret_text - -def get_base_url(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8') -> str: +def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8') -> str: """Return the base url if declared in the given HTML `text`, relative to the given base url. diff --git a/w3lib/url.py b/w3lib/url.py index 889e5bc6..3f3372ad 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -441,7 +441,7 @@ def _safe_ParseResult(parts: ParseResult, encoding: str = 'utf8', path_encoding: ) -def canonicalize_url(url: StrOrBytes, keep_blank_values: bool = True, keep_fragments: bool = False, +def canonicalize_url(url: Union[StrOrBytes, ParseResult], keep_blank_values: bool = True, keep_fragments: bool = False, encoding: Optional[str] = None) -> str: r"""Canonicalize the given url by applying the following procedures: From 5fae2b9727296de3910a642df73f83adb2fde2e4 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Mon, 9 Aug 2021 14:24:43 +0800 Subject: [PATCH 08/17] black --- tests/test_encoding.py | 2 +- tests/test_http.py | 26 ++++++----- tests/test_url.py | 30 +++++++----- w3lib/encoding.py | 22 ++++++--- w3lib/html.py | 99 ++++++++++++++++++++++++++++------------ w3lib/http.py | 6 ++- w3lib/url.py | 101 +++++++++++++++++++++++++++-------------- w3lib/util.py | 11 +++-- 8 files changed, 195 insertions(+), 102 deletions(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 3be7d796..33d7f110 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -40,7 +40,7 @@ def test_bom(self): bom_encoding, bom = read_bom(string) assert bom_encoding is not None assert bom is not None - decoded = string[len(bom):].decode(bom_encoding) + decoded = string[len(bom) :].decode(bom_encoding) self.assertEqual(water_unicode, decoded) # Body without BOM enc, bom = read_bom(b"foo") diff --git a/tests/test_http.py b/tests/test_http.py index fc59ae11..efabb0ab 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -1,7 +1,11 @@ import unittest from collections import OrderedDict -from w3lib.http import (HeadersDictInput, basic_auth_header, - headers_dict_to_raw, headers_raw_to_dict) +from w3lib.http import ( + HeadersDictInput, + basic_auth_header, + headers_dict_to_raw, + headers_raw_to_dict, +) __doctests__ = ["w3lib.http"] # for trial support @@ -48,10 +52,9 @@ def test_headers_dict_to_raw(self): ) def test_headers_dict_to_raw_listtuple(self): - dct: HeadersDictInput = OrderedDict([ - (b'Content-type', [b'text/html']), - (b'Accept', [b'gzip']) - ]) + dct: HeadersDictInput = OrderedDict( + [(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])] + ) self.assertEqual( headers_dict_to_raw(dct), b"Content-type: text/html\r\nAccept: gzip" ) @@ -74,14 +77,13 @@ def test_headers_dict_to_raw_listtuple(self): ) def test_headers_dict_to_raw_wrong_values(self): - dct: HeadersDictInput = OrderedDict([ - (b'Content-type', 0), - ]) - self.assertEqual( - headers_dict_to_raw(dct), - b'' + dct: HeadersDictInput = OrderedDict( + [ + (b"Content-type", 0), + ] ) self.assertEqual(headers_dict_to_raw(dct), b"") + self.assertEqual(headers_dict_to_raw(dct), b"") dct = OrderedDict([(b"Content-type", 1), (b"Accept", [b"gzip"])]) self.assertEqual(headers_dict_to_raw(dct), 
b"Accept: gzip") diff --git a/tests/test_url.py b/tests/test_url.py index 1b02d5be..fe9ee999 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -506,10 +506,10 @@ def test_add_or_replace_parameters(self): ) def test_add_or_replace_parameters_does_not_change_input_param(self): - url = 'http://domain/test?arg=original' - input_param = {'arg': 'value'} + url = "http://domain/test?arg=original" + input_param = {"arg": "value"} add_or_replace_parameters(url, input_param) # noqa - self.assertEqual(input_param, {'arg': 'value'}) + self.assertEqual(input_param, {"arg": "value"}) def test_url_query_cleaner(self): self.assertEqual("product.html", url_query_cleaner("product.html?")) @@ -814,17 +814,25 @@ def test_normalize_percent_encoding_in_query_arguments(self): ) def test_non_ascii_percent_encoding_in_paths(self): - self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"), - "http://www.example.com/a%20do?a=1") + self.assertEqual( + canonicalize_url("http://www.example.com/a do?a=1"), + "http://www.example.com/a%20do?a=1", + ) - self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"), - "http://www.example.com/a%20%20do?a=1") + self.assertEqual( + canonicalize_url("http://www.example.com/a %20do?a=1"), + "http://www.example.com/a%20%20do?a=1", + ) - self.assertEqual(canonicalize_url("http://www.example.com/a do£.html?a=1"), - "http://www.example.com/a%20do%C2%A3.html?a=1") + self.assertEqual( + canonicalize_url("http://www.example.com/a do£.html?a=1"), + "http://www.example.com/a%20do%C2%A3.html?a=1", + ) - self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"), - "http://www.example.com/a%20do%C2%A3.html?a=1") + self.assertEqual( + canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"), + "http://www.example.com/a%20do%C2%A3.html?a=1", + ) def test_non_ascii_percent_encoding_in_query_arguments(self): self.assertEqual( diff --git a/w3lib/encoding.py b/w3lib/encoding.py index a9ffef99..32252105 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -26,6 +26,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]: return None + # regexp for parsing HTTP meta tags _TEMPLATE = r"""%s\s*=\s*["']?\s*%s\s*["']?""" _SKIP_ATTRS = """(?:\\s+ @@ -124,6 +125,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]: "zh_cn": "gb18030", } + def _c18n_encoding(encoding: str) -> str: """Canonicalize an encoding name @@ -195,7 +197,9 @@ def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]: # Python decoder doesn't follow unicode standard when handling # bad utf-8 encoded strings. 
see http://bugs.python.org/issue8271 -codecs.register_error('w3lib_replace', lambda exc: ('\ufffd', cast(AnyUnicodeError, exc).end)) +codecs.register_error( + "w3lib_replace", lambda exc: ("\ufffd", cast(AnyUnicodeError, exc).end) +) def to_unicode(data_str: bytes, encoding: str) -> str: @@ -209,8 +213,12 @@ def to_unicode(data_str: bytes, encoding: str) -> str: ) -def html_to_unicode(content_type_header: Optional[str], html_body_str: bytes, - default_encoding: str = 'utf8', auto_detect_fun: Optional[Callable[[bytes], str]] = None) -> Tuple[str, str]: +def html_to_unicode( + content_type_header: Optional[str], + html_body_str: bytes, + default_encoding: str = "utf8", + auto_detect_fun: Optional[Callable[[bytes], str]] = None, +) -> Tuple[str, str]: r'''Convert raw html bytes to unicode This attempts to make a reasonable guess at the content encoding of the @@ -279,20 +287,20 @@ def html_to_unicode(content_type_header: Optional[str], html_body_str: bytes, # remove BOM if it agrees with the encoding if enc == bom_enc: bom = cast(bytes, bom) - html_body_str = html_body_str[len(bom):] - elif enc == 'utf-16' or enc == 'utf-32': + html_body_str = html_body_str[len(bom) :] + elif enc == "utf-16" or enc == "utf-32": # read endianness from BOM, or default to big endian # tools.ietf.org/html/rfc2781 section 4.3 if bom_enc is not None and bom_enc.startswith(enc): enc = bom_enc bom = cast(bytes, bom) - html_body_str = html_body_str[len(bom):] + html_body_str = html_body_str[len(bom) :] else: enc += "-be" return enc, to_unicode(html_body_str, enc) if bom_enc is not None: bom = cast(bytes, bom) - return bom_enc, to_unicode(html_body_str[len(bom):], bom_enc) + return bom_enc, to_unicode(html_body_str[len(bom) :], bom_enc) enc = html_body_declared_encoding(html_body_str) if enc is None and (auto_detect_fun is not None): enc = auto_detect_fun(html_body_str) diff --git a/w3lib/html.py b/w3lib/html.py index 62ad2aec..634d90f5 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -11,16 +11,29 @@ from w3lib.url import safe_url_string from w3lib._types import StrOrBytes -_ent_re = re.compile(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)', re.IGNORECASE) -_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL) -_baseurl_re = re.compile(r']*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I) -_meta_refresh_re = re.compile(r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)', re.DOTALL | re.IGNORECASE) -_cdata_re = re.compile(r'((?P.*?)(?P\]\]>))', re.DOTALL) - -HTML5_WHITESPACE = ' \t\n\r\x0c' - - -def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8') -> str: +_ent_re = re.compile( + r"&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)", + re.IGNORECASE, +) +_tag_re = re.compile(r"<[a-zA-Z\/!].*?>", re.DOTALL) +_baseurl_re = re.compile(r"]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']", re.I) +_meta_refresh_re = re.compile( + r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)', + re.DOTALL | re.IGNORECASE, +) +_cdata_re = re.compile( + r"((?P.*?)(?P\]\]>))", re.DOTALL +) + +HTML5_WHITESPACE = " \t\n\r\x0c" + + +def replace_entities( + text: AnyStr, + keep: Iterable[str] = (), + remove_illegal: bool = True, + encoding: str = "utf-8", +) -> str: """Remove entities from the given `text` by converting them to their corresponding unicode character. 
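Patch 08 is mechanical: a re-run of black, so these hunks change layout only (quote normalisation, one parameter per line on long signatures, two blank lines between defs) and no behaviour. To preview what black does to any snippet from this series, assuming black is installed locally (the pin used by w3lib's tox env may differ):

    import black

    src = "def f(text, which_ones=('\\n', '\\t', '\\r'), replace_by='', encoding=None): pass"
    print(black.format_str(src, mode=black.Mode()))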
@@ -51,12 +64,12 @@ def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: boo def convert_entity(m: Match) -> str: groups = m.groupdict() number = None - if groups.get('dec'): - number = int(groups['dec'], 10) - elif groups.get('hex'): - number = int(groups['hex'], 16) - elif groups.get('named'): - entity_name = groups['named'] + if groups.get("dec"): + number = int(groups["dec"], 10) + elif groups.get("hex"): + number = int(groups["hex"], 16) + elif groups.get("named"): + entity_name = groups["named"] if entity_name.lower() in keep: return m.group(0) else: @@ -80,11 +93,12 @@ def convert_entity(m: Match) -> str: return _ent_re.sub(convert_entity, to_unicode(text, encoding)) + def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool: return bool(_ent_re.search(to_unicode(text, encoding))) -def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None) -> str: +def replace_tags(text: AnyStr, token: str = "", encoding: Optional[str] = None) -> str: """Replace all markup tags found in the given `text` by the given token. By default `token` is an empty string so it just removes all tags. @@ -107,11 +121,11 @@ def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None) return _tag_re.sub(token, to_unicode(text, encoding)) -_REMOVECOMMENTS_RE = re.compile('|$)', re.DOTALL) +_REMOVECOMMENTS_RE = re.compile("|$)", re.DOTALL) def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str: - """ Remove HTML Comments. + """Remove HTML Comments. >>> import w3lib.html >>> w3lib.html.remove_comments(b"test whatever") @@ -121,10 +135,16 @@ def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str: """ utext = to_unicode(text, encoding) - return _REMOVECOMMENTS_RE.sub('', utext) + return _REMOVECOMMENTS_RE.sub("", utext) + -def remove_tags(text: AnyStr, which_ones: Iterable[str] = (), keep: Iterable[str] = (), encoding: Optional[str] = None) -> str: - """ Remove HTML Tags only. +def remove_tags( + text: AnyStr, + which_ones: Iterable[str] = (), + keep: Iterable[str] = (), + encoding: Optional[str] = None, +) -> str: + """Remove HTML Tags only. `which_ones` and `keep` are both tuples, there are four cases: @@ -190,7 +210,9 @@ def remove_tag(m: Match) -> str: return retags.sub(remove_tag, to_unicode(text, encoding)) -def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None) -> str: +def remove_tags_with_content( + text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None +) -> str: """Remove tags and their content. `which_ones` is a tuple of which tags to remove including their content. @@ -210,12 +232,16 @@ def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encod [r"<%s\b.*?|<%s\s*/>" % (tag, tag, tag) for tag in which_ones] ) retags = re.compile(tags, re.DOTALL | re.IGNORECASE) - utext = retags.sub('', utext) + utext = retags.sub("", utext) return utext -def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: StrOrBytes = '', \ - encoding: Optional[str] = None) -> str: +def replace_escape_chars( + text: AnyStr, + which_ones: Iterable[str] = ("\n", "\t", "\r"), + replace_by: StrOrBytes = "", + encoding: Optional[str] = None, +) -> str: """Remove escape characters. `which_ones` is a tuple of which escape characters we want to remove. 
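A hedged sketch of the entity semantics touched above — the restored group names (named, dec, hex) are exactly what convert_entity reads, and these assertions restate the documented behaviour rather than output from this branch:

    from w3lib.html import replace_entities

    # named, decimal, and hex references all resolve to U+00A3 (the pound sign)
    assert replace_entities("&pound; &#163; &#xa3;") == "\xa3 \xa3 \xa3"
    # entity names listed in `keep` are left untouched
    assert replace_entities("&lt;tag&gt;", keep=("lt", "gt")) == "&lt;tag&gt;"
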
@@ -232,7 +258,12 @@ def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', return utext -def unquote_markup(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: Optional[str] = None) -> str: +def unquote_markup( + text: AnyStr, + keep: Iterable[str] = (), + remove_illegal: bool = True, + encoding: Optional[str] = None, +) -> str: """ This function receives markup as a text (always a unicode string or a UTF-8 encoded string) and does the following: @@ -254,7 +285,7 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]: yield txt[offset:] utext = to_unicode(text, encoding) - ret_text = '' + ret_text = "" for fragment in _get_fragments(utext, _cdata_re): if isinstance(fragment, str): # it's not a CDATA (so we try to remove its entities) @@ -266,7 +297,10 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]: ret_text += fragment.group("cdata_d") return ret_text -def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8') -> str: + +def get_base_url( + text: AnyStr, baseurl: StrOrBytes = "", encoding: str = "utf-8" +) -> str: """Return the base url if declared in the given HTML `text`, relative to the given base url. @@ -284,7 +318,12 @@ def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8' return safe_url_string(baseurl) -def get_meta_refresh(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8', ignore_tags: Iterable[str] = ('script', 'noscript')) -> Tuple[Optional[float], Optional[str]]: +def get_meta_refresh( + text: AnyStr, + baseurl: str = "", + encoding: str = "utf-8", + ignore_tags: Iterable[str] = ("script", "noscript"), +) -> Tuple[Optional[float], Optional[str]]: """Return the http-equiv parameter of the HTML meta element from the given HTML text and return a tuple ``(interval, url)`` where interval is an integer containing the delay in seconds (or zero if not present) and url is a diff --git a/w3lib/http.py b/w3lib/http.py index 9b92f2ef..4ea31fad 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -83,7 +83,9 @@ def headers_dict_to_raw(headers_dict: Optional[HeadersDictInput]) -> Optional[by return b"\r\n".join(raw_lines) -def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8859-1') -> bytes: +def basic_auth_header( + username: AnyStr, password: AnyStr, encoding: str = "ISO-8859-1" +) -> bytes: """ Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_ @@ -99,4 +101,4 @@ def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8 # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used encoding here. 
See also: # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html - return b'Basic ' + urlsafe_b64encode(to_bytes(auth, encoding=encoding)) + return b"Basic " + urlsafe_b64encode(to_bytes(auth, encoding=encoding)) diff --git a/w3lib/url.py b/w3lib/url.py index 3f3372ad..85498ed2 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -32,7 +32,7 @@ # error handling function for bytes-to-Unicode decoding errors with URLs def _quote_byte(error: UnicodeError) -> Tuple[str, int]: error = cast(AnyUnicodeError, error) - return (to_unicode(quote(error.object[error.start:error.end])), error.end) + return (to_unicode(quote(error.object[error.start : error.end])), error.end) codecs.register_error("percentencode", _quote_byte) @@ -52,7 +52,12 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]: ) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline -def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str = 'utf8', quote_path: bool = True) -> str: +def safe_url_string( + url: StrOrBytes, + encoding: str = "utf8", + path_encoding: str = "utf8", + quote_path: bool = True, +) -> str: """Convert the given URL into a legal URL by escaping unsafe characters according to RFC-3986. Also, ASCII tabs and newlines are removed as per https://url.spec.whatwg.org/#url-parsing. @@ -84,9 +89,9 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) try: - netloc = parts.netloc.encode('idna') + netloc = parts.netloc.encode("idna") except UnicodeError: - netloc = parts.netloc.encode('utf-8') + netloc = parts.netloc.encode("utf-8") # default encoding for path component SHOULD be UTF-8 if quote_path: @@ -94,20 +99,24 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str else: path = parts.path - return urlunsplit(( - parts.scheme, - netloc.decode().rstrip(':'), - path, - quote(parts.query.encode(encoding), _safe_chars), - quote(parts.fragment.encode(encoding), _safe_chars), - )) + return urlunsplit( + ( + parts.scheme, + netloc.decode().rstrip(":"), + path, + quote(parts.query.encode(encoding), _safe_chars), + quote(parts.fragment.encode(encoding), _safe_chars), + ) + ) _parent_dirs = re.compile(r"/?(\.\./)+") -def safe_download_url(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str = 'utf8') -> str: - """ Make a url for download. This will call safe_url_string +def safe_download_url( + url: StrOrBytes, encoding: str = "utf8", path_encoding: str = "utf8" +) -> str: + """Make a url for download. This will call safe_url_string and then strip the fragment, if one exists. The path will be normalised. 
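A short usage sketch of the two typed helpers above; the Base64 value follows from RFC 2617, while the IDNA hostname is an illustrative assumption rather than a fixture from this patch:

    from w3lib.http import basic_auth_header
    from w3lib.url import safe_url_string

    assert basic_auth_header("user", "pass") == b"Basic dXNlcjpwYXNz"
    # unsafe path characters are percent-encoded; the netloc goes through IDNA
    assert (
        safe_url_string("https://münchen.example/a b?q=£")
        == "https://xn--mnchen-3ya.example/a%20b?q=%C2%A3"
    )
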
@@ -126,10 +135,15 @@ def safe_download_url(url: StrOrBytes, encoding: str = 'utf8', path_encoding: st def is_url(text: str) -> bool: - return text.partition("://")[0] in ('file', 'http', 'https') + return text.partition("://")[0] in ("file", "http", "https") -def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int] = 0) -> Optional[str]: +def url_query_parameter( + url: StrOrBytes, + parameter: str, + default: Optional[str] = None, + keep_blank_values: Union[bool, int] = 0, +) -> Optional[str]: """Return the value of a url parameter, given the url and parameter name General case: @@ -159,8 +173,7 @@ def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] """ queryparams = parse_qs( - urlsplit(str(url))[3], - keep_blank_values=bool(keep_blank_values) + urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values) ) if parameter in queryparams: return queryparams[parameter][0] @@ -169,7 +182,14 @@ def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] def url_query_cleaner( - url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str: + url: StrOrBytes, + parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), + sep: str = "&", + kvsep: str = "=", + remove: bool = False, + unique: bool = True, + keep_fragments: bool = False, +) -> str: """Clean URL arguments leaving only those passed in the parameterlist keeping order >>> import w3lib.url @@ -206,7 +226,7 @@ def url_query_cleaner( url, fragment = urldefrag(url) url = cast(str, url) fragment = cast(str, fragment) - base, _, query = url.partition('?') + base, _, query = url.partition("?") seen = set() querylist = [] for ksv in query.split(sep): @@ -224,9 +244,10 @@ def url_query_cleaner( seen.add(k) url = "?".join([base, sep.join(querylist)]) if querylist else base if keep_fragments: - url += '#' + fragment + url += "#" + fragment return cast(str, url) + def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str: parsed = urlsplit(url) current_args = parse_qsl(parsed.query, keep_blank_values=True) @@ -343,8 +364,9 @@ def any_to_uri(uri_or_path: str) -> str: ).encode() ) -_ParseDataURIResult = namedtuple("_ParseDataURIResult", - "media_type media_type_parameters data") +_ParseDataURIResult = namedtuple( + "_ParseDataURIResult", "media_type media_type_parameters data" +) def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: @@ -389,7 +411,7 @@ def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: if m: attribute, value, value_quoted = m.groups() if value_quoted: - value = re.sub(br'\\(.)', rb'\1', value_quoted) + value = re.sub(br"\\(.)", rb"\1", value_quoted) media_type_params[attribute.decode()] = value.decode() uri = uri[m.end() :] else: @@ -423,7 +445,9 @@ def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult: ] -def _safe_ParseResult(parts: ParseResult, encoding: str = 'utf8', path_encoding: str = 'utf8') -> Tuple[str, str, str, str, str, str]: +def _safe_ParseResult( + parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8" +) -> Tuple[str, str, str, str, str, str]: # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. 
http://.example.com) try: @@ -441,8 +465,12 @@ def _safe_ParseResult(parts: ParseResult, encoding: str = 'utf8', path_encoding: ) -def canonicalize_url(url: Union[StrOrBytes, ParseResult], keep_blank_values: bool = True, keep_fragments: bool = False, - encoding: Optional[str] = None) -> str: +def canonicalize_url( + url: Union[StrOrBytes, ParseResult], + keep_blank_values: bool = True, + keep_fragments: bool = False, + encoding: Optional[str] = None, +) -> str: r"""Canonicalize the given url by applying the following procedures: - sort query arguments, first by key, then by value @@ -478,7 +506,8 @@ def canonicalize_url(url: Union[StrOrBytes, ParseResult], keep_blank_values: boo # if not for proper URL expected by remote website. try: scheme, netloc, path, params, query, fragment = _safe_ParseResult( - parse_url(url), encoding=encoding or 'utf8') + parse_url(url), encoding=encoding or "utf8" + ) except UnicodeEncodeError: scheme, netloc, path, params, query, fragment = _safe_ParseResult( parse_url(url), encoding="utf8" @@ -530,8 +559,8 @@ def canonicalize_url(url: Union[StrOrBytes, ParseResult], keep_blank_values: boo def _unquotepath(path: str) -> bytes: - for reserved in ('2f', '2F', '3f', '3F'): - path = path.replace('%' + reserved, '%25' + reserved.upper()) + for reserved in ("2f", "2F", "3f", "3F"): + path = path.replace("%" + reserved, "%25" + reserved.upper()) # standard lib's unquote() does not work for non-UTF-8 # percent-escaped characters, they get lost. @@ -541,7 +570,9 @@ def _unquotepath(path: str) -> bytes: return unquote_to_bytes(path) -def parse_url(url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None) -> ParseResult: +def parse_url( + url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None +) -> ParseResult: """Return urlparsed url from the given argument (which could be an already parsed url) """ @@ -550,7 +581,9 @@ def parse_url(url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = Non return urlparse(to_unicode(url, encoding)) -def parse_qsl_to_bytes(qs: str, keep_blank_values: bool = False) -> List[Tuple[bytes, bytes]]: +def parse_qsl_to_bytes( + qs: str, keep_blank_values: bool = False +) -> List[Tuple[bytes, bytes]]: """Parse a query given as a string argument. Data are returned as a list of name, value pairs as bytes. 
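Since canonicalize_url above now carries full type hints, a quick sketch of the documented normalisation (the expected value is an assumption from its docstring, not captured output):

    from w3lib.url import canonicalize_url

    # query arguments are sorted by key and the fragment is dropped by default
    assert (
        canonicalize_url("http://example.com/do?b=2&a=1#frag")
        == "http://example.com/do?a=1&b=2"
    )
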
@@ -572,7 +605,7 @@ def parse_qsl_to_bytes(qs: str, keep_blank_values: bool = False) -> List[Tuple[b # with unquote_to_bytes(s) coerce_args = cast(Callable[..., Tuple[str, Callable]], _coerce_args) qs, _coerce_result = coerce_args(qs) - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")] r = [] for name_value in pairs: if not name_value: @@ -585,10 +618,10 @@ def parse_qsl_to_bytes(qs: str, keep_blank_values: bool = False) -> List[Tuple[b else: continue if len(nv[1]) or keep_blank_values: - name: StrOrBytes = nv[0].replace('+', ' ') + name: StrOrBytes = nv[0].replace("+", " ") name = unquote_to_bytes(name) name = _coerce_result(name) - value: StrOrBytes = nv[1].replace('+', ' ') + value: StrOrBytes = nv[1].replace("+", " ") value = unquote_to_bytes(value) value = _coerce_result(value) r.append((cast(bytes, name), cast(bytes, value))) diff --git a/w3lib/util.py b/w3lib/util.py index 315d968a..58ca867f 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -4,7 +4,7 @@ def str_to_unicode( - text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' + text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict" ) -> str: if encoding is None: encoding = "utf-8" @@ -12,8 +12,9 @@ def str_to_unicode( return text.decode(encoding, errors) return text + def unicode_to_str( - text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' + text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict" ) -> bytes: warn( "The w3lib.utils.unicode_to_str function is deprecated and " @@ -29,7 +30,7 @@ def unicode_to_str( def to_unicode( - text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' + text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict" ) -> str: """Return the unicode representation of a bytes object `text`. If `text` is already an unicode object, return it as-is.""" @@ -45,7 +46,7 @@ def to_unicode( def to_bytes( - text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict' + text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict" ) -> bytes: """Return the binary representation of `text`. 
If `text` is already a bytes object, return it as-is."""
@@ -61,7 +62,7 @@


 def to_native_str(
-    text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict'
+    text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
 ) -> str:
     """ Return str representation of `text` """
     warn(

From 9e1ee79923e2ebab38afb5a5c6d9d9ce158e3138 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 9 Aug 2021 14:26:34 +0800
Subject: [PATCH 09/17] use new mypy

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index a954c1eb..e2d212cf 100644
--- a/tox.ini
+++ b/tox.ini
@@ -25,7 +25,7 @@ commands =
 [testenv:typing]
 basepython = python3
 deps =
-    mypy==0.901
+    mypy==0.910
 commands =
     mypy --show-error-codes {posargs: w3lib tests}

From 7cdc107051f47b774983fd4c7ed95b36db17fd18 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 9 Aug 2021 14:40:04 +0800
Subject: [PATCH 10/17] fix ci

---
 tox.ini       | 2 ++
 w3lib/util.py | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index e2d212cf..4e8e4767 100644
--- a/tox.ini
+++ b/tox.ini
@@ -25,6 +25,8 @@ commands =
 [testenv:typing]
 basepython = python3
 deps =
+    # mypy errors out if pytest (or one of its sub-dependencies) is not installed
+    pytest
     mypy==0.910
 commands =
     mypy --show-error-codes {posargs: w3lib tests}

diff --git a/w3lib/util.py b/w3lib/util.py
index 58ca867f..c9eba65f 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -6,6 +6,12 @@
 def str_to_unicode(
     text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
 ) -> str:
+    warn(
+        "The w3lib.utils.str_to_unicode function is deprecated and "
+        "will be removed in a future release.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     if encoding is None:
         encoding = "utf-8"
     if isinstance(text, bytes):
@@ -64,7 +70,7 @@
 def to_native_str(
     text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
 ) -> str:
-    """ Return str representation of `text` """
+    """Return str representation of `text`"""
     warn(
         "The w3lib.utils.to_native_str function is deprecated and "
         "will be removed in a future release. Please use "

From 1a81a47c265b4f333c06f24786d9cf6fac72bf78 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 9 Aug 2021 15:05:36 +0800
Subject: [PATCH 11/17] fix sphinx

---
 w3lib/url.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/w3lib/url.py b/w3lib/url.py
index 85498ed2..e44c0860 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -364,12 +364,13 @@ def any_to_uri(uri_or_path: str) -> str:
         ).encode()
     )

-_ParseDataURIResult = namedtuple(
-    "_ParseDataURIResult", "media_type media_type_parameters data"
-)
+ParseDataURIResult = namedtuple(
+    "ParseDataURIResult", "media_type media_type_parameters data"
+)
+ParseDataURIResult.__doc__ = "The return value type of `w3lib.url.parse_data_uri`."
-def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult:
+def parse_data_uri(uri: StrOrBytes):  # type: ignore
     """

     Parse a data: URI, returning a 3-tuple of media type, dictionary of media
@@ -426,7 +427,7 @@ def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult:
         raise ValueError("invalid data URI")
     data = base64.b64decode(data)

-    return _ParseDataURIResult(media_type, media_type_params, data)
+    return ParseDataURIResult(media_type, media_type_params, data)


 __all__ = [

From f877a6ce840da748d7416001411da26b724cde80 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 9 Aug 2021 15:06:50 +0800
Subject: [PATCH 12/17] doc

---
 w3lib/url.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/w3lib/url.py b/w3lib/url.py
index e44c0860..e6744744 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -370,6 +370,8 @@ def any_to_uri(uri_or_path: str) -> str:
 ParseDataURIResult.__doc__ = "The return value type of `w3lib.url.parse_data_uri`."


+# If we add the return type hint sphinx would error:
+# w3lib/url.py:docstring of w3lib.url.parse_data_uri::py:class reference target not found: w3lib.url.ParseDataURIResult
 def parse_data_uri(uri: StrOrBytes):  # type: ignore
     """

From fc4d33a5722cf0144baa79ae40a676ebaff8d9df Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 23 Aug 2021 11:29:30 +0800
Subject: [PATCH 13/17] Update w3lib/util.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrián Chaves

---
 w3lib/util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/w3lib/util.py b/w3lib/util.py
index c9eba65f..70f4ef52 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -1,5 +1,6 @@
 from warnings import warn
 from typing import Optional
+
 from w3lib._types import StrOrBytes


From 57675073281b30402ae172594959b311637069d6 Mon Sep 17 00:00:00 2001
From: Lucy Wang
Date: Mon, 23 Aug 2021 11:32:00 +0800
Subject: [PATCH 14/17] cr

---
 docs/w3lib.rst     |  2 ++
 run-mypy.sh        |  5 -----
 tests/test_html.py |  2 +-
 w3lib/url.py       | 12 ++++++------
 4 files changed, 9 insertions(+), 12 deletions(-)
 delete mode 100755 run-mypy.sh

diff --git a/docs/w3lib.rst b/docs/w3lib.rst
index bfde0304..c5233dd7 100644
--- a/docs/w3lib.rst
+++ b/docs/w3lib.rst
@@ -26,3 +26,5 @@ w3lib Package

 .. automodule:: w3lib.url
     :members:
+
+.. autoclass:: ParseDataURIResult

diff --git a/run-mypy.sh b/run-mypy.sh
deleted file mode 100755
index ea3b1332..00000000
--- a/run-mypy.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-set -e
-
-mypy --txt-report . w3lib tests

diff --git a/tests/test_html.py b/tests/test_html.py
index 5a092f29..f6ca90d2 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -69,7 +69,7 @@ def test_illegal_entities(self):

     def test_browser_hack(self):
         # check browser hack for numeric character references in the 80-9F range
         self.assertEqual(replace_entities("x&#153;y", encoding="cp1252"), "x\u2122y")
-        self.assertEqual(replace_entities("x&#x99;y", encoding="cp1252"), u"x\u2122y")
+        self.assertEqual(replace_entities("x&#x99;y", encoding="cp1252"), "x\u2122y")

     def test_missing_semicolon(self):
         for entity, result in (

diff --git a/w3lib/url.py b/w3lib/url.py
index e6744744..d8066309 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -89,9 +89,11 @@ def safe_url_string(
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g.
http://.example.com) try: - netloc = parts.netloc.encode("idna") + netloc_bytes = parts.netloc.encode("idna") except UnicodeError: - netloc = parts.netloc.encode("utf-8") + netloc = parts.netloc + else: + netloc = netloc_bytes.decode() # default encoding for path component SHOULD be UTF-8 if quote_path: @@ -102,7 +104,7 @@ def safe_url_string( return urlunsplit( ( parts.scheme, - netloc.decode().rstrip(":"), + netloc.rstrip(":"), path, quote(parts.query.encode(encoding), _safe_chars), quote(parts.fragment.encode(encoding), _safe_chars), @@ -370,9 +372,7 @@ def any_to_uri(uri_or_path: str) -> str: ParseDataURIResult.__doc__ = "The return value type of `w3lib.url.parse_data_uri`." -# If we add the return type hint sphinx would error: -# w3lib/url.py:docstring of w3lib.url.parse_data_uri::py:class reference target not found: w3lib.url.ParseDataURIResult -def parse_data_uri(uri: StrOrBytes): # type: ignore +def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult: """ Parse a data: URI, returning a 3-tuple of media type, dictionary of media From 341b3f31dade1393eadf98e1d4c7dffd35f6c2c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 24 Aug 2021 10:51:40 +0200 Subject: [PATCH 15/17] Improve ParseDataURIResult documentation --- docs/w3lib.rst | 3 ++- w3lib/url.py | 34 +++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/docs/w3lib.rst b/docs/w3lib.rst index c5233dd7..502554ff 100644 --- a/docs/w3lib.rst +++ b/docs/w3lib.rst @@ -27,4 +27,5 @@ w3lib Package .. automodule:: w3lib.url :members: -.. autoclass:: ParseDataURIResult + .. autoclass:: ParseDataURIResult + :members: diff --git a/w3lib/url.py b/w3lib/url.py index d8066309..766ca56e 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -8,8 +8,17 @@ import posixpath import re import string -from collections import namedtuple -from typing import Callable, List, Optional, Sequence, Tuple, Union, cast, Dict +from typing import ( + cast, + Callable, + Dict, + List, + NamedTuple, + Optional, + Sequence, + Tuple, + Union, +) from urllib.parse import ( parse_qs, parse_qsl, @@ -366,20 +375,19 @@ def any_to_uri(uri_or_path: str) -> str: ).encode() ) -ParseDataURIResult = namedtuple( - "ParseDataURIResult", "media_type media_type_parameters data" -) -ParseDataURIResult.__doc__ = "The return value type of `w3lib.url.parse_data_uri`." - - -def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult: - """ - Parse a data: URI, returning a 3-tuple of media type, dictionary of media - type parameters, and data. +class ParseDataURIResult(NamedTuple): + """Named tuple returned by :func:`parse_data_uri`.""" + #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``). + media_type: str + #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``). + media_type_parameters: dict[str, str] + #: Data, decoded if it was encoded in base64 format. 
+ data: bytes - """ +def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult: + """Parse a data: URI into :class:`ParseDataURIResult`.""" if not isinstance(uri, bytes): uri = safe_url_string(uri).encode("ascii") From b1af346cf6c1f3bfa0947a6137085e275ad94976 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Tue, 24 Aug 2021 17:35:19 +0800 Subject: [PATCH 16/17] fix py36 compat --- w3lib/url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/w3lib/url.py b/w3lib/url.py index 766ca56e..e4a8121b 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -381,7 +381,7 @@ class ParseDataURIResult(NamedTuple): #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``). media_type: str #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``). - media_type_parameters: dict[str, str] + media_type_parameters: Dict[str, str] #: Data, decoded if it was encoded in base64 format. data: bytes From 9fcbe1356197024d4b0d66679428eb723ae13850 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Tue, 24 Aug 2021 17:36:40 +0800 Subject: [PATCH 17/17] black --- w3lib/url.py | 1 + 1 file changed, 1 insertion(+) diff --git a/w3lib/url.py b/w3lib/url.py index e4a8121b..71398516 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -378,6 +378,7 @@ def any_to_uri(uri_or_path: str) -> str: class ParseDataURIResult(NamedTuple): """Named tuple returned by :func:`parse_data_uri`.""" + #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``). media_type: str #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``).