diff --git a/CHANGES.rst b/CHANGES.rst index cdf21bc4..fe07f1ec 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -46,6 +46,10 @@ Released on XXX * **Drop support of charade, now that chardet is supported once more.** +* **Replace the encoding keyword argument on parse and related methods + with a set of keyword arguments: override_encoding, transport_encoding, + same_origin_parent_encoding, likely_encoding, and default_encoding.** + 0.9999999/1.0b8 ~~~~~~~~~~~~~~~ diff --git a/README.rst b/README.rst index 6859ed30..2ad46090 100644 --- a/README.rst +++ b/README.rst @@ -51,7 +51,7 @@ pass into html5lib as follows: import html5lib with closing(urlopen("http://example.com/")) as f: - document = html5lib.parse(f, encoding=f.info().getparam("charset")) + document = html5lib.parse(f, transport_encoding=f.info().getparam("charset")) When using with ``urllib.request`` (Python 3), the charset from HTTP should be pass into html5lib as follows: @@ -62,7 +62,7 @@ should be pass into html5lib as follows: import html5lib with urlopen("http://example.com/") as f: - document = html5lib.parse(f, encoding=f.info().get_content_charset()) + document = html5lib.parse(f, transport_encoding=f.info().get_content_charset()) To have more control over the parser, create a parser object explicitly. 
For instance, to make the parser raise exceptions on parse errors, use: diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 224341b7..6a5c8bcb 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -28,19 +28,17 @@ ) -def parse(doc, treebuilder="etree", encoding=None, - namespaceHTMLElements=True, scripting=False): +def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs): """Parse a string or file-like object into a tree""" tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) - return p.parse(doc, encoding=encoding, scripting=scripting) + return p.parse(doc, **kwargs) -def parseFragment(doc, container="div", treebuilder="etree", encoding=None, - namespaceHTMLElements=True, scripting=False): +def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs): tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) - return p.parseFragment(doc, container=container, encoding=encoding, scripting=scripting) + return p.parseFragment(doc, container=container, **kwargs) def method_decorator_metaclass(function): @@ -59,18 +57,13 @@ class HTMLParser(object): """HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML""" - def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer, - strict=False, namespaceHTMLElements=True, debug=False): + def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): """ strict - raise an exception when a parse error is encountered tree - a treebuilder class controlling the type of tree that will be returned. Built in treebuilders can be accessed through html5lib.treebuilders.getTreeBuilder(treeType) - - tokenizer - a class that provides a stream of tokens to the treebuilder. - This may be replaced for e.g. 
a sanitizer which converts some tags to - text """ # Raise an exception on the first error encountered @@ -79,22 +72,17 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer, if tree is None: tree = treebuilders.getTreeBuilder("etree") self.tree = tree(namespaceHTMLElements) - self.tokenizer_class = tokenizer self.errors = [] self.phases = dict([(name, cls(self, self.tree)) for name, cls in getPhases(debug).items()]) - def _parse(self, stream, innerHTML=False, container="div", encoding=None, - parseMeta=True, useChardet=True, scripting=False, **kwargs): + def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): self.innerHTMLMode = innerHTML self.container = container self.scripting = scripting - self.tokenizer = self.tokenizer_class(stream, encoding=encoding, - parseMeta=parseMeta, - useChardet=useChardet, - parser=self, **kwargs) + self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) self.reset() try: @@ -232,8 +220,7 @@ def normalizedTokens(self): for token in self.tokenizer: yield self.normalizeToken(token) - def parse(self, stream, encoding=None, parseMeta=True, - useChardet=True, scripting=False): + def parse(self, stream, *args, **kwargs): """Parse a HTML document into a well-formed tree stream - a filelike object or string containing the HTML to be parsed @@ -245,13 +232,10 @@ def parse(self, stream, encoding=None, parseMeta=True, scripting - treat noscript elements as if javascript was turned on """ - self._parse(stream, innerHTML=False, encoding=encoding, - parseMeta=parseMeta, useChardet=useChardet, scripting=scripting) + self._parse(stream, False, None, *args, **kwargs) return self.tree.getDocument() - def parseFragment(self, stream, container="div", encoding=None, - parseMeta=False, useChardet=True, scripting=False): - # pylint:disable=unused-argument + def parseFragment(self, stream, *args, **kwargs): """Parse a HTML fragment into a well-formed tree fragment container - name of the element 
we're setting the innerHTML property @@ -266,8 +250,7 @@ def parseFragment(self, stream, container="div", encoding=None, scripting - treat noscript elements as if javascript was turned on """ - self._parse(stream, True, container=container, - encoding=encoding, scripting=scripting) + self._parse(stream, True, *args, **kwargs) return self.tree.getFragment() def parseError(self, errorcode="XXX-undefined-error", datavars=None): diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index cfabdd86..dafe33ca 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes): return b"".join(rv) -def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): +def HTMLInputStream(source, **kwargs): # Work around Python bug #20007: read(0) closes the connection. # http://bugs.python.org/issue20007 if (isinstance(source, http_client.HTTPResponse) or @@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): isUnicode = isinstance(source, text_type) if isUnicode: - if encoding is not None: - raise TypeError("Cannot explicitly set an encoding with a unicode string") + encodings = [x for x in kwargs if x.endswith("_encoding")] + if encodings: + raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings) - return HTMLUnicodeInputStream(source) + return HTMLUnicodeInputStream(source, **kwargs) else: - return HTMLBinaryInputStream(source, encoding, parseMeta, chardet) + return HTMLBinaryInputStream(source, **kwargs) class HTMLUnicodeInputStream(object): @@ -173,8 +174,6 @@ def __init__(self, source): regardless of any BOM or later declaration (such as in a meta element) - parseMeta - Look for a element containing encoding information - """ if not utils.supports_lone_surrogates: @@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream): """ - def __init__(self, source, encoding=None, parseMeta=True, chardet=True): + def __init__(self, 
source, override_encoding=None, transport_encoding=None, + same_origin_parent_encoding=None, likely_encoding=None, + default_encoding="windows-1252", useChardet=True): """Initialises the HTMLInputStream. HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -403,8 +404,6 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): regardless of any BOM or later declaration (such as in a meta element) - parseMeta - Look for a element containing encoding information - """ # Raw Stream - for unicode objects this will encode to utf-8 and set # self.charEncoding as appropriate @@ -412,21 +411,22 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): HTMLUnicodeInputStream.__init__(self, self.rawStream) - self.charEncoding = (lookupEncoding(encoding), "certain") - # Encoding Information # Number of bytes to use when looking for a meta element with # encoding information self.numBytesMeta = 1024 # Number of bytes to use when using detecting encoding using chardet self.numBytesChardet = 100 - # Encoding to use if no other information can be found - self.defaultEncoding = "windows-1252" + # Things from args + self.override_encoding = override_encoding + self.transport_encoding = transport_encoding + self.same_origin_parent_encoding = same_origin_parent_encoding + self.likely_encoding = likely_encoding + self.default_encoding = default_encoding - # Detect encoding iff no explicit "transport level" encoding is supplied - if (self.charEncoding[0] is None): - self.charEncoding = self.detectEncoding(parseMeta, chardet) - assert self.charEncoding[0] is not None + # Determine encoding + self.charEncoding = self.determineEncoding(useChardet) + assert self.charEncoding[0] is not None # Call superclass self.reset() @@ -454,21 +454,45 @@ def openStream(self, source): return stream - def detectEncoding(self, parseMeta=True, chardet=True): - # First look for a BOM + def determineEncoding(self, chardet=True): + # BOMs take precedence 
over everything # This will also read past the BOM if present - encoding = self.detectBOM() - confidence = "certain" - # If there is no BOM need to look for meta elements with encoding - # information - if encoding is None and parseMeta: - encoding = self.detectEncodingMeta() - confidence = "tentative" + charEncoding = self.detectBOM(), "certain" + if charEncoding[0] is not None: + return charEncoding + + # If we've been overridden, we've been overridden + charEncoding = lookupEncoding(self.override_encoding), "certain" + if charEncoding[0] is not None: + return charEncoding + + # Now check the transport layer + charEncoding = lookupEncoding(self.transport_encoding), "certain" + if charEncoding[0] is not None: + return charEncoding + + # Look for meta elements with encoding information + charEncoding = self.detectEncodingMeta(), "tentative" + if charEncoding[0] is not None: + return charEncoding + + # Parent document encoding + charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative" + if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"): + return charEncoding + + # "likely" encoding + charEncoding = lookupEncoding(self.likely_encoding), "tentative" + if charEncoding[0] is not None: + return charEncoding + # Guess with chardet, if available - if encoding is None and chardet: - confidence = "tentative" + if chardet: try: from chardet.universaldetector import UniversalDetector + except ImportError: + pass + else: buffers = [] detector = UniversalDetector() while not detector.done: @@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True): detector.close() encoding = lookupEncoding(detector.result['encoding']) self.rawStream.seek(0) - except ImportError: - pass - # If all else fails use the default encoding - if encoding is None: - confidence = "tentative" - encoding = lookupEncoding(self.defaultEncoding) + if encoding is not None: + return encoding, "tentative" + + # Try the default encoding + 
charEncoding = lookupEncoding(self.default_encoding), "tentative" + if charEncoding[0] is not None: + return charEncoding - return encoding, confidence + # Fallback to html5lib's default if even that hasn't worked + return lookupEncoding("windows-1252"), "tentative" def changeEncoding(self, newEncoding): assert self.charEncoding[1] != "certain" diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index a66a2178..b6d20f24 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -2,6 +2,8 @@ import os +import pytest + from .support import get_data_files, test_dir, errorMessage, TestData as _TestData from html5lib import HTMLParser, inputstream @@ -11,7 +13,7 @@ def test_basic_prescan_length(): pad = 1024 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 1024 # Sanity - stream = inputstream.HTMLBinaryInputStream(data, chardet=False) + stream = inputstream.HTMLBinaryInputStream(data, useChardet=False) assert 'utf-8' == stream.charEncoding[0].name @@ -20,7 +22,7 @@ def test_parser_reparse(): pad = 10240 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 10240 # Sanity - stream = inputstream.HTMLBinaryInputStream(data, chardet=False) + stream = inputstream.HTMLBinaryInputStream(data, useChardet=False) assert 'windows-1252' == stream.charEncoding[0].name p = HTMLParser(namespaceHTMLElements=False) doc = p.parse(data, useChardet=False) @@ -28,6 +30,51 @@ def test_parser_reparse(): assert doc.find(".//title").text == "Caf\u00E9" +@pytest.mark.parametrize("expected,data,kwargs", [ + ("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}), + ("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}), + ("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}), + ("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}), + ("iso-8859-2", b"", {"transport_encoding": "iso-8859-2"}), + 
("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-3"}), + ("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}), + ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}), + ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}), + ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}), + ("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}), + ("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}), + ("windows-1252", b"", {"default_encoding": "totally-bogus-string"}), + ("windows-1252", b"", {}), +]) +def test_parser_args(expected, data, kwargs): + stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs) + assert expected == stream.charEncoding[0].name + p = HTMLParser() + p.parse(data, useChardet=False, **kwargs) + assert expected == p.documentEncoding + + +@pytest.mark.parametrize("kwargs", [ + {"override_encoding": "iso-8859-2"}, + {"override_encoding": None}, + {"transport_encoding": "iso-8859-2"}, + {"transport_encoding": None}, + {"same_origin_parent_encoding": "iso-8859-2"}, + {"same_origin_parent_encoding": None}, + {"likely_encoding": "iso-8859-2"}, + {"likely_encoding": None}, + {"default_encoding": "iso-8859-2"}, + {"default_encoding": None}, + {"foo_encoding": "iso-8859-2"}, + {"foo_encoding": None}, +]) +def test_parser_args_raises(kwargs): + with pytest.raises(TypeError) as exc_info: + p = HTMLParser() + p.parse("", useChardet=False, **kwargs) + assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input") + + def runParserEncodingTest(data, encoding): p = HTMLParser() assert p.documentEncoding is None @@ -38,7 +85,7 @@ def runParserEncodingTest(data, encoding): def runPreScanEncodingTest(data, encoding): - stream = inputstream.HTMLBinaryInputStream(data, chardet=False) + stream = 
inputstream.HTMLBinaryInputStream(data, useChardet=False) encoding = encoding.lower().decode("ascii") # Very crude way to ignore irrelevant tests @@ -55,6 +102,7 @@ def test_encoding(): yield (runParserEncodingTest, test[b'data'], test[b'encoding']) yield (runPreScanEncodingTest, test[b'data'], test[b'encoding']) + # pylint:disable=wrong-import-position try: import chardet # noqa diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index 77e411d5..e8d9fd86 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -99,13 +99,13 @@ class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream): def test_char_ascii(): - stream = HTMLInputStream(b"'", encoding='ascii') + stream = HTMLInputStream(b"'", override_encoding='ascii') assert stream.charEncoding[0].name == 'windows-1252' assert stream.char() == "'" def test_char_utf8(): - stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8') + stream = HTMLInputStream('\u2018'.encode('utf-8'), override_encoding='utf-8') assert stream.charEncoding[0].name == 'utf-8' assert stream.char() == '\u2018' diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index dd6ea75f..3f10c01f 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -31,16 +31,11 @@ class HTMLTokenizer(object): Points to HTMLInputStream object. """ - def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, - lowercaseElementName=True, lowercaseAttrName=True, parser=None): + def __init__(self, stream, parser=None, **kwargs): - self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) + self.stream = HTMLInputStream(stream, **kwargs) self.parser = parser - # Perform case conversions? 
- self.lowercaseElementName = lowercaseElementName - self.lowercaseAttrName = lowercaseAttrName - # Setup the initial tokenizer state self.escapeFlag = False self.lastFourChars = [] @@ -232,8 +227,7 @@ def emitCurrentToken(self): token = self.currentToken # Add token to the queue to be yielded if (token["type"] in tagTokenTypes): - if self.lowercaseElementName: - token["name"] = token["name"].translate(asciiUpper2Lower) + token["name"] = token["name"].translate(asciiUpper2Lower) if token["type"] == tokenTypes["EndTag"]: if token["data"]: self.tokenQueue.append({"type": tokenTypes["ParseError"], @@ -918,9 +912,8 @@ def attributeNameState(self): # Attributes are not dropped at this stage. That happens when the # start tag token is emitted so values can still be safely appended # to attributes, but we do want to report the parse error in time. - if self.lowercaseAttrName: - self.currentToken["data"][-1][0] = ( - self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) + self.currentToken["data"][-1][0] = ( + self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) for name, _ in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":