Skip to content

Commit

Permalink
Fix html5lib#120: introduce keyword arguments for encodings by source
Browse files Browse the repository at this point in the history
  • Loading branch information
gsnedders committed Jul 6, 2016
1 parent 6464fc4 commit a011a6a
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 58 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ Released on XXX

* **Drop support of charade, now that chardet is supported once more.**

* **Replace the charset keyword argument on parse and related methods
with a set of keyword arguments: override_encoding, transport_encoding,
same_origin_parent_encoding, likely_encoding, and default_encoding.**


0.9999999/1.0b8
~~~~~~~~~~~~~~~
Expand Down
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pass into html5lib as follows:
import html5lib
with closing(urlopen("http://example.com/")) as f:
document = html5lib.parse(f, encoding=f.info().getparam("charset"))
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
When using with ``urllib.request`` (Python 3), the charset from HTTP
should be passed into html5lib as follows:
Expand All @@ -62,7 +62,7 @@ should be pass into html5lib as follows:
import html5lib
with urlopen("http://example.com/") as f:
document = html5lib.parse(f, encoding=f.info().get_content_charset())
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
To have more control over the parser, create a parser object explicitly.
For instance, to make the parser raise exceptions on parse errors, use:
Expand Down
20 changes: 6 additions & 14 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,12 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()])

def _parse(self, stream, innerHTML=False, container="div", encoding=None,
useChardet=True, scripting=False, **kwargs):
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):

self.innerHTMLMode = innerHTML
self.container = container
self.scripting = scripting
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding=encoding,
useChardet=useChardet,
parser=self, **kwargs)
self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
self.reset()

try:
Expand Down Expand Up @@ -225,8 +222,7 @@ def normalizedTokens(self):
for token in self.tokenizer:
yield self.normalizeToken(token)

def parse(self, stream, encoding=None,
useChardet=True, scripting=False):
def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
Expand All @@ -238,13 +234,10 @@ def parse(self, stream, encoding=None,
scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, innerHTML=False, encoding=encoding,
useChardet=useChardet, scripting=scripting)
self._parse(stream, False, None, *args, **kwargs)
return self.tree.getDocument()

def parseFragment(self, stream, container="div", encoding=None,
useChardet=True, scripting=False):
# pylint:disable=unused-argument
def parseFragment(self, stream, *args, **kwargs):
"""Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property
Expand All @@ -259,8 +252,7 @@ def parseFragment(self, stream, container="div", encoding=None,
scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, True, container=container,
encoding=encoding, scripting=scripting)
self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()

def parseError(self, errorcode="XXX-undefined-error", datavars=None):
Expand Down
95 changes: 60 additions & 35 deletions html5lib/inputstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
return b"".join(rv)


def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
def HTMLInputStream(source, override_encoding=None, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
Expand All @@ -142,12 +142,12 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
isUnicode = isinstance(source, text_type)

if isUnicode:
if encoding is not None:
raise TypeError("Cannot explicitly set an encoding with a unicode string")
if override_encoding is not None:
raise TypeError("Cannot set an override encoding with a unicode input")

return HTMLUnicodeInputStream(source)
else:
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
return HTMLBinaryInputStream(source, override_encoding=override_encoding, **kwargs)


class HTMLUnicodeInputStream(object):
Expand All @@ -173,8 +173,6 @@ def __init__(self, source):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""

if not utils.supports_lone_surrogates:
Expand Down Expand Up @@ -390,7 +388,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""

def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
Expand All @@ -403,30 +403,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
self.rawStream = self.openStream(source)

HTMLUnicodeInputStream.__init__(self, self.rawStream)

self.charEncoding = (lookupEncoding(encoding), "certain")

# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 1024
# Number of bytes to use when detecting the encoding with chardet
self.numBytesChardet = 100
# Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
# Things from args
self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding

# Detect encoding iff no explicit "transport level" encoding is supplied
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
assert self.charEncoding[0] is not None
# Determine encoding
self.charEncoding = self.determineEncoding(useChardet)
assert self.charEncoding[0] is not None

# Call superclass
self.reset()
Expand Down Expand Up @@ -454,21 +453,45 @@ def openStream(self, source):

return stream

def detectEncoding(self, parseMeta=True, chardet=True):
# First look for a BOM
def determineEncoding(self, chardet=True):
# BOMs take precedence over everything
# This will also read past the BOM if present
encoding = self.detectBOM()
confidence = "certain"
# If there is no BOM need to look for meta elements with encoding
# information
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
confidence = "tentative"
charEncoding = self.detectBOM(), "certain"
if charEncoding[0] is not None:
return charEncoding

# If we've been overridden, we've been overridden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding

# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding

# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding

# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding

# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding

# Guess with chardet, if available
if encoding is None and chardet:
confidence = "tentative"
if chardet:
try:
from chardet.universaldetector import UniversalDetector
except ImportError:
pass
else:
buffers = []
detector = UniversalDetector()
while not detector.done:
Expand All @@ -481,14 +504,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
detector.close()
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
confidence = "tentative"
encoding = lookupEncoding(self.defaultEncoding)
if encoding is not None:
return encoding, "tentative"

# Try the default encoding
charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding

return encoding, confidence
# Fallback to html5lib's default if even that hasn't worked
return lookupEncoding("windows-1252"), "tentative"

def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
Expand Down
6 changes: 3 additions & 3 deletions html5lib/tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_basic_prescan_length():
pad = 1024 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 1024 # Sanity
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'utf-8' == stream.charEncoding[0].name


Expand All @@ -20,7 +20,7 @@ def test_parser_reparse():
pad = 10240 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 10240 # Sanity
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'windows-1252' == stream.charEncoding[0].name
p = HTMLParser(namespaceHTMLElements=False)
doc = p.parse(data, useChardet=False)
Expand All @@ -38,7 +38,7 @@ def runParserEncodingTest(data, encoding):


def runPreScanEncodingTest(data, encoding):
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
encoding = encoding.lower().decode("ascii")

# Very crude way to ignore irrelevant tests
Expand Down
4 changes: 2 additions & 2 deletions html5lib/tests/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@ class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):


def test_char_ascii():
stream = HTMLInputStream(b"'", encoding='ascii')
stream = HTMLInputStream(b"'", override_encoding='ascii')
assert stream.charEncoding[0].name == 'windows-1252'
assert stream.char() == "'"


def test_char_utf8():
stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8')
stream = HTMLInputStream('\u2018'.encode('utf-8'), override_encoding='utf-8')
assert stream.charEncoding[0].name == 'utf-8'
assert stream.char() == '\u2018'

Expand Down
4 changes: 2 additions & 2 deletions html5lib/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ class HTMLTokenizer(object):
Points to HTMLInputStream object.
"""

def __init__(self, stream, encoding=None, useChardet=True, parser=None):
def __init__(self, stream, parser=None, **kwargs):

self.stream = HTMLInputStream(stream, encoding, True, useChardet)
self.stream = HTMLInputStream(stream, **kwargs)
self.parser = parser

# Setup the initial tokenizer state
Expand Down

0 comments on commit a011a6a

Please sign in to comment.