diff --git a/CHANGES.rst b/CHANGES.rst
index cdf21bc4..fe07f1ec 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -46,6 +46,10 @@ Released on XXX
* **Drop support of charade, now that chardet is supported once more.**
+* **Replace the charset keyword argument on parse and related methods
+ with a set of keyword arguments: override_encoding, transport_encoding,
+ same_origin_parent_encoding, likely_encoding, and default_encoding.**
+
0.9999999/1.0b8
~~~~~~~~~~~~~~~
diff --git a/README.rst b/README.rst
index 6859ed30..2ad46090 100644
--- a/README.rst
+++ b/README.rst
@@ -51,7 +51,7 @@ pass into html5lib as follows:
import html5lib
with closing(urlopen("http://example.com/")) as f:
- document = html5lib.parse(f, encoding=f.info().getparam("charset"))
+ document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
When using with ``urllib.request`` (Python 3), the charset from HTTP
should be pass into html5lib as follows:
@@ -62,7 +62,7 @@ should be pass into html5lib as follows:
import html5lib
with urlopen("http://example.com/") as f:
- document = html5lib.parse(f, encoding=f.info().get_content_charset())
+ document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
To have more control over the parser, create a parser object explicitly.
For instance, to make the parser raise exceptions on parse errors, use:
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index 224341b7..6a5c8bcb 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -28,19 +28,17 @@
)
-def parse(doc, treebuilder="etree", encoding=None,
- namespaceHTMLElements=True, scripting=False):
+def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse a string or file-like object into a tree"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
- return p.parse(doc, encoding=encoding, scripting=scripting)
+ return p.parse(doc, **kwargs)
-def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
- namespaceHTMLElements=True, scripting=False):
+def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
- return p.parseFragment(doc, container=container, encoding=encoding, scripting=scripting)
+ return p.parseFragment(doc, container=container, **kwargs)
def method_decorator_metaclass(function):
@@ -59,18 +57,13 @@ class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
- def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
- strict=False, namespaceHTMLElements=True, debug=False):
+ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
"""
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
-
- tokenizer - a class that provides a stream of tokens to the treebuilder.
- This may be replaced for e.g. a sanitizer which converts some tags to
- text
"""
# Raise an exception on the first error encountered
@@ -79,22 +72,17 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
if tree is None:
tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements)
- self.tokenizer_class = tokenizer
self.errors = []
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()])
- def _parse(self, stream, innerHTML=False, container="div", encoding=None,
- parseMeta=True, useChardet=True, scripting=False, **kwargs):
+ def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
self.innerHTMLMode = innerHTML
self.container = container
self.scripting = scripting
- self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
- parseMeta=parseMeta,
- useChardet=useChardet,
- parser=self, **kwargs)
+ self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
self.reset()
try:
@@ -232,8 +220,7 @@ def normalizedTokens(self):
for token in self.tokenizer:
yield self.normalizeToken(token)
- def parse(self, stream, encoding=None, parseMeta=True,
- useChardet=True, scripting=False):
+ def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
@@ -245,13 +232,10 @@ def parse(self, stream, encoding=None, parseMeta=True,
scripting - treat noscript elements as if javascript was turned on
"""
- self._parse(stream, innerHTML=False, encoding=encoding,
- parseMeta=parseMeta, useChardet=useChardet, scripting=scripting)
+ self._parse(stream, False, None, *args, **kwargs)
return self.tree.getDocument()
- def parseFragment(self, stream, container="div", encoding=None,
- parseMeta=False, useChardet=True, scripting=False):
- # pylint:disable=unused-argument
+ def parseFragment(self, stream, *args, **kwargs):
"""Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property
@@ -266,8 +250,7 @@ def parseFragment(self, stream, container="div", encoding=None,
scripting - treat noscript elements as if javascript was turned on
"""
- self._parse(stream, True, container=container,
- encoding=encoding, scripting=scripting)
+ self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()
def parseError(self, errorcode="XXX-undefined-error", datavars=None):
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index cfabdd86..dafe33ca 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
return b"".join(rv)
-def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
+def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
isUnicode = isinstance(source, text_type)
if isUnicode:
- if encoding is not None:
- raise TypeError("Cannot explicitly set an encoding with a unicode string")
+ encodings = [x for x in kwargs if x.endswith("_encoding")]
+ if encodings:
+ raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
- return HTMLUnicodeInputStream(source)
+ return HTMLUnicodeInputStream(source, **kwargs)
else:
- return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
+ return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object):
@@ -173,8 +174,6 @@ def __init__(self, source):
regardless of any BOM or later declaration (such as in a meta
element)
- parseMeta - Look for a element containing encoding information
-
"""
if not utils.supports_lone_surrogates:
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""
- def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
+ def __init__(self, source, override_encoding=None, transport_encoding=None,
+ same_origin_parent_encoding=None, likely_encoding=None,
+ default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,8 +404,6 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
regardless of any BOM or later declaration (such as in a meta
element)
- parseMeta - Look for a element containing encoding information
-
"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
@@ -412,21 +411,22 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
HTMLUnicodeInputStream.__init__(self, self.rawStream)
- self.charEncoding = (lookupEncoding(encoding), "certain")
-
# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 1024
# Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100
- # Encoding to use if no other information can be found
- self.defaultEncoding = "windows-1252"
+ # Things from args
+ self.override_encoding = override_encoding
+ self.transport_encoding = transport_encoding
+ self.same_origin_parent_encoding = same_origin_parent_encoding
+ self.likely_encoding = likely_encoding
+ self.default_encoding = default_encoding
- # Detect encoding iff no explicit "transport level" encoding is supplied
- if (self.charEncoding[0] is None):
- self.charEncoding = self.detectEncoding(parseMeta, chardet)
- assert self.charEncoding[0] is not None
+ # Determine encoding
+ self.charEncoding = self.determineEncoding(useChardet)
+ assert self.charEncoding[0] is not None
# Call superclass
self.reset()
@@ -454,21 +454,45 @@ def openStream(self, source):
return stream
- def detectEncoding(self, parseMeta=True, chardet=True):
- # First look for a BOM
+ def determineEncoding(self, chardet=True):
+ # BOMs take precedence over everything
# This will also read past the BOM if present
- encoding = self.detectBOM()
- confidence = "certain"
- # If there is no BOM need to look for meta elements with encoding
- # information
- if encoding is None and parseMeta:
- encoding = self.detectEncodingMeta()
- confidence = "tentative"
+ charEncoding = self.detectBOM(), "certain"
+ if charEncoding[0] is not None:
+ return charEncoding
+
+    # If we've been overridden, we've been overridden
+ charEncoding = lookupEncoding(self.override_encoding), "certain"
+ if charEncoding[0] is not None:
+ return charEncoding
+
+ # Now check the transport layer
+ charEncoding = lookupEncoding(self.transport_encoding), "certain"
+ if charEncoding[0] is not None:
+ return charEncoding
+
+ # Look for meta elements with encoding information
+ charEncoding = self.detectEncodingMeta(), "tentative"
+ if charEncoding[0] is not None:
+ return charEncoding
+
+ # Parent document encoding
+ charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
+ if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
+ return charEncoding
+
+ # "likely" encoding
+ charEncoding = lookupEncoding(self.likely_encoding), "tentative"
+ if charEncoding[0] is not None:
+ return charEncoding
+
# Guess with chardet, if available
- if encoding is None and chardet:
- confidence = "tentative"
+ if chardet:
try:
from chardet.universaldetector import UniversalDetector
+ except ImportError:
+ pass
+ else:
buffers = []
detector = UniversalDetector()
while not detector.done:
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
detector.close()
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
- except ImportError:
- pass
- # If all else fails use the default encoding
- if encoding is None:
- confidence = "tentative"
- encoding = lookupEncoding(self.defaultEncoding)
+ if encoding is not None:
+ return encoding, "tentative"
+
+ # Try the default encoding
+ charEncoding = lookupEncoding(self.default_encoding), "tentative"
+ if charEncoding[0] is not None:
+ return charEncoding
- return encoding, confidence
+ # Fallback to html5lib's default if even that hasn't worked
+ return lookupEncoding("windows-1252"), "tentative"
def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index a66a2178..b6d20f24 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -2,6 +2,8 @@
import os
+import pytest
+
from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
from html5lib import HTMLParser, inputstream
@@ -11,7 +13,7 @@ def test_basic_prescan_length():
pad = 1024 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 1024 # Sanity
- stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
+ stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'utf-8' == stream.charEncoding[0].name
@@ -20,7 +22,7 @@ def test_parser_reparse():
pad = 10240 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 10240 # Sanity
- stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
+ stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'windows-1252' == stream.charEncoding[0].name
p = HTMLParser(namespaceHTMLElements=False)
doc = p.parse(data, useChardet=False)
@@ -28,6 +30,51 @@ def test_parser_reparse():
assert doc.find(".//title").text == "Caf\u00E9"
+@pytest.mark.parametrize("expected,data,kwargs", [
+ ("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}),
+ ("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}),
+ ("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}),
+ ("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}),
+ ("iso-8859-2", b"", {"transport_encoding": "iso-8859-2"}),
+ ("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-3"}),
+ ("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}),
+ ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}),
+ ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}),
+ ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}),
+ ("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}),
+ ("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}),
+ ("windows-1252", b"", {"default_encoding": "totally-bogus-string"}),
+ ("windows-1252", b"", {}),
+])
+def test_parser_args(expected, data, kwargs):
+ stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
+ assert expected == stream.charEncoding[0].name
+ p = HTMLParser()
+ p.parse(data, useChardet=False, **kwargs)
+ assert expected == p.documentEncoding
+
+
+@pytest.mark.parametrize("kwargs", [
+ {"override_encoding": "iso-8859-2"},
+ {"override_encoding": None},
+ {"transport_encoding": "iso-8859-2"},
+ {"transport_encoding": None},
+ {"same_origin_parent_encoding": "iso-8859-2"},
+ {"same_origin_parent_encoding": None},
+ {"likely_encoding": "iso-8859-2"},
+ {"likely_encoding": None},
+ {"default_encoding": "iso-8859-2"},
+ {"default_encoding": None},
+ {"foo_encoding": "iso-8859-2"},
+ {"foo_encoding": None},
+])
+def test_parser_args_raises(kwargs):
+ with pytest.raises(TypeError) as exc_info:
+ p = HTMLParser()
+ p.parse("", useChardet=False, **kwargs)
+ assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
+
+
def runParserEncodingTest(data, encoding):
p = HTMLParser()
assert p.documentEncoding is None
@@ -38,7 +85,7 @@ def runParserEncodingTest(data, encoding):
def runPreScanEncodingTest(data, encoding):
- stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
+ stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
encoding = encoding.lower().decode("ascii")
# Very crude way to ignore irrelevant tests
@@ -55,6 +102,7 @@ def test_encoding():
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
+
# pylint:disable=wrong-import-position
try:
import chardet # noqa
diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
index 77e411d5..e8d9fd86 100644
--- a/html5lib/tests/test_stream.py
+++ b/html5lib/tests/test_stream.py
@@ -99,13 +99,13 @@ class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):
def test_char_ascii():
- stream = HTMLInputStream(b"'", encoding='ascii')
+ stream = HTMLInputStream(b"'", override_encoding='ascii')
assert stream.charEncoding[0].name == 'windows-1252'
assert stream.char() == "'"
def test_char_utf8():
- stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8')
+ stream = HTMLInputStream('\u2018'.encode('utf-8'), override_encoding='utf-8')
assert stream.charEncoding[0].name == 'utf-8'
assert stream.char() == '\u2018'
diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py
index dd6ea75f..3f10c01f 100644
--- a/html5lib/tokenizer.py
+++ b/html5lib/tokenizer.py
@@ -31,16 +31,11 @@ class HTMLTokenizer(object):
Points to HTMLInputStream object.
"""
- def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
- lowercaseElementName=True, lowercaseAttrName=True, parser=None):
+ def __init__(self, stream, parser=None, **kwargs):
- self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
+ self.stream = HTMLInputStream(stream, **kwargs)
self.parser = parser
- # Perform case conversions?
- self.lowercaseElementName = lowercaseElementName
- self.lowercaseAttrName = lowercaseAttrName
-
# Setup the initial tokenizer state
self.escapeFlag = False
self.lastFourChars = []
@@ -232,8 +227,7 @@ def emitCurrentToken(self):
token = self.currentToken
# Add token to the queue to be yielded
if (token["type"] in tagTokenTypes):
- if self.lowercaseElementName:
- token["name"] = token["name"].translate(asciiUpper2Lower)
+ token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["EndTag"]:
if token["data"]:
self.tokenQueue.append({"type": tokenTypes["ParseError"],
@@ -918,9 +912,8 @@ def attributeNameState(self):
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
- if self.lowercaseAttrName:
- self.currentToken["data"][-1][0] = (
- self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
+ self.currentToken["data"][-1][0] = (
+ self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, _ in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":