Skip to content

Commit

Permalink
Add stream-oriented version of convert_to_utf8().
Browse files Browse the repository at this point in the history
  • Loading branch information
lemon24 committed Jan 27, 2022
1 parent 3a806ae commit d10ffee
Show file tree
Hide file tree
Showing 2 changed files with 320 additions and 0 deletions.
222 changes: 222 additions & 0 deletions feedparser/encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import cgi
import codecs
import io
import re

try:
Expand Down Expand Up @@ -291,3 +292,224 @@ def convert_to_utf8(http_headers, data, result):
result['bozo'] = True
result['bozo_exception'] = error
return data


# How much to read from a binary file in order to detect encoding.
# In initial tests, 4k was enough for ~160 mostly-English feeds;
# 64k seems like a safe margin.
CONVERT_FILE_PREFIX_LEN = 2 ** 16

# How much to read from a text file, and use as an utf-8 bytes prefix.
# Note that no encoding detection is needed in this case.
CONVERT_FILE_STR_PREFIX_LEN = 2 ** 13


def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection=True):
    """Like convert_to_utf8(), but for a stream.

    Unlike convert_to_utf8(), do not read the entire file in memory;
    instead, return a text stream that decodes it on the fly.
    This should consume significantly less memory,
    because it avoids (repeatedly) converting the entire file contents
    from bytes to str and back.

    To detect the encoding, only a prefix of the file contents is used.
    In rare cases, the wrong encoding may be detected for this prefix;
    use optimistic_encoding_detection=False to use the entire file contents
    (equivalent to a plain convert_to_utf8() call).

    Args:
        http_headers (dict): The response headers.
        file (IO[bytes] or IO[str]): A read()-able (binary) stream.
        result (dict): The result dictionary.
        optimistic_encoding_detection (bool):
            If true, use only a prefix of the file content to detect encoding.

    Returns:
        StreamFactory: a stream factory, with the detected encoding set, if any

    """
    # Currently, this wraps convert_to_utf8(), because the logic is simply
    # too complicated to ensure it's re-implemented correctly for a stream.
    # That said, it should be possible to change the implementation
    # transparently (not sure it's worth it, though).

    is_text_stream = isinstance(file.read(0), str)

    if is_text_stream:
        # A text stream needs no encoding detection, but we still run a
        # bytes prefix through convert_to_utf8() for its side effects:
        # sniffing / setting result['content-type'], and letting
        # replace_doctype() extract safe_entities downstream.
        raw_prefix = file.read(CONVERT_FILE_STR_PREFIX_LEN).encode('utf-8')
        converted_prefix = convert_to_utf8(http_headers, raw_prefix, result)
        result['encoding'] = 'utf-8'
        return StreamFactory(converted_prefix, file, 'utf-8')

    if not optimistic_encoding_detection:
        # Reading everything shouldn't increase memory usage if file is
        # BytesIO, since BytesIO does copy-on-write;
        # https://bugs.python.org/issue22003
        data = convert_to_utf8(http_headers, file.read(), result)

        # data *is* the prefix here, so there's nothing left in the stream.
        return StreamFactory(data, io.BytesIO(b''), result.get('encoding'))

    converted_prefix = convert_file_prefix_to_utf8(http_headers, file, result)
    return StreamFactory(converted_prefix, file, result.get('encoding'))


def convert_file_prefix_to_utf8(http_headers, file, result, prefix_len=CONVERT_FILE_PREFIX_LEN):
    """Like convert_to_utf8(), but only use the prefix of a binary file.

    Set result like convert_to_utf8() would.

    Args:
        http_headers (dict): The response headers.
        file (IO[bytes]): A read()-able binary stream.
        result (dict): The result dictionary.
        prefix_len (int): How many bytes of the file to use.

    Returns:
        bytes: The updated (converted) prefix.
    """
    prefix = file.read(prefix_len)

    # A UTF-8 code point is at most 4 bytes long, so we call
    # convert_to_utf8() up to 4 times, extending the prefix one byte
    # at a time, to make sure we eventually land on a boundary.
    for attempt in range(4):
        fake_result = {}
        converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)

        # conversion succeeded, or the prefix is actually the whole file
        if not fake_result.get('bozo') or len(prefix) < prefix_len:
            break

        # Don't consume a byte that will never be converted: previously,
        # the final failed attempt still read one byte from the stream and
        # appended it to the prefix after the last conversion, so that byte
        # appeared in neither the returned prefix nor the remaining stream.
        if attempt == 3:
            break

        byte = file.read(1)
        if not byte:
            break

        prefix += byte
        prefix_len += 1

    result.update(fake_result)
    return converted_prefix


class MissingEncoding(io.UnsupportedOperation):
    """Raised by StreamFactory.get_text_file() when no encoding is known
    and no fallback encoding was given.

    Subclasses io.UnsupportedOperation, so callers catching that
    exception keep working.
    """
    pass


class StreamFactory:

    """Decode on the fly a binary stream that *may* have a known encoding.

    If the underlying stream is seekable, it is possible to call
    the get_{text,binary}_file() methods more than once.
    """

    def __init__(self, prefix: bytes, file, encoding=None):
        # already-converted (UTF-8) bytes prefix of the stream
        self.prefix = prefix
        # the rest of the stream, wrapped so it can be rewound if seekable
        self.file = ResetFileWrapper(file)
        # detected encoding, or None if unknown
        self.encoding = encoding
        # False until the first get_*_file() call; see reset()
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors='strict'):
        # Return a text (str) file object stitching prefix and stream.
        # Raises MissingEncoding if neither a detected encoding
        # nor a fallback_encoding is available.
        encoding = self.encoding or fallback_encoding
        if encoding is None:
            raise MissingEncoding("cannot create text stream without encoding")

        if isinstance(self.file.read(0), str):
            # underlying stream is already text; decode the prefix with the
            # same encoding it was encoded with (in practice 'utf-8',
            # as set by convert_file_to_utf8() — see its text-stream branch)
            file = PrefixFileWrapper(self.prefix.decode(encoding), self.file)
        else:
            # the prefix is UTF-8 regardless of the stream's encoding
            # (convert_to_utf8() produced it), while the rest of the stream
            # still needs decoding with the detected/fallback encoding
            file = PrefixFileWrapper(
                self.prefix.decode('utf-8', errors),
                codecs.getreader(encoding)(self.file, errors)
            )

        self.reset()
        return file

    def get_binary_file(self):
        # Return a binary (bytes) file object stitching prefix and stream.
        # Raises io.UnsupportedOperation if the underlying stream is text.
        if isinstance(self.file.read(0), str):
            raise io.UnsupportedOperation("underlying stream is text, not binary") from None

        file = PrefixFileWrapper(self.prefix, self.file)

        self.reset()
        return file

    def get_file(self):
        # Prefer a text file; fall back to binary when no encoding is known.
        try:
            return self.get_text_file()
        except MissingEncoding:
            return self.get_binary_file()

    def reset(self):
        # Rewind only from the second call on: the first file handed out
        # must continue from the stream's current position.
        if self.should_reset:
            self.file.reset()
        self.should_reset = True


class ResetFileWrapper:
    """Given a seekable file, allow reading its content again
    (from the current position) by calling reset().
    """

    def __init__(self, file):
        self.file = file
        try:
            # remember where the stream was, so reset() can come back here
            self.file_initial_offset = file.tell()
        except OSError:
            # not seekable; io.UnsupportedOperation is an OSError subclass
            self.file_initial_offset = None

    def read(self, size=-1):
        """Read from the underlying file."""
        return self.file.read(size)

    def reset(self):
        """Rewind to the position the stream had when the wrapper was made.

        Raises:
            io.UnsupportedOperation: If the underlying stream is not seekable.
        """
        if self.file_initial_offset is None:
            # tell() failed in __init__; previously this fell through to
            # seek(None), which could raise TypeError for some streams
            # instead of the documented io.UnsupportedOperation
            raise io.UnsupportedOperation("underlying stream is not seekable")
        self.file.seek(self.file_initial_offset)


class PrefixFileWrapper:
    """Stitch a (possibly modified) prefix and a file into a new file object.

    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'

    """

    def __init__(self, prefix, file):
        self.prefix = prefix
        self.file = file
        # how much of the prefix has been consumed by read() so far
        self.offset = 0

    def read(self, size=-1):
        # read(0) yields an empty value of the right type (b'' or ''),
        # so the concatenations below work for both bytes and str
        buffer = self.file.read(0)

        if self.offset < len(self.prefix):
            if size < 0:
                # only the *remaining* prefix; using the whole prefix here
                # (as before) re-yielded bytes already returned by earlier
                # bounded reads, e.g. read(1) then read() duplicated data
                chunk = self.prefix[self.offset:]
            else:
                chunk = self.prefix[self.offset : self.offset+size]
            size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)

            # a negative size means "read everything", which a single
            # read() call above already attempted
            if size <= 0:
                break

            size -= len(chunk)

        return buffer

    def close(self):
        # do not touch the underlying stream
        pass

98 changes: 98 additions & 0 deletions tests/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import feedparser
import feedparser.api
import feedparser.datetimes
import feedparser.encodings
import feedparser.http
import feedparser.mixin
import feedparser.sanitizer
Expand Down Expand Up @@ -295,6 +296,102 @@ def test_gb2312_converted_to_gb18030_in_xml_encoding(self):
self.assertEqual(result.encoding, 'gb18030')


class TestEncodingsHelpers(BaseTestCase):
    """Unit tests for the stream helpers in feedparser.encodings."""

    def test_reset_file_wrapper(self):
        wrapper = feedparser.encodings.ResetFileWrapper(io.BytesIO(b'abcdef'))
        self.assertEqual(wrapper.read(2), b'ab')
        wrapper.reset()
        self.assertEqual(wrapper.read(), b'abcdef')

        # reset() goes back to the position at wrapping time, not to zero
        file = io.BytesIO(b'abcdef')
        file.read(2)
        wrapper = feedparser.encodings.ResetFileWrapper(file)
        self.assertEqual(wrapper.read(2), b'cd')
        wrapper.reset()
        self.assertEqual(wrapper.read(), b'cdef')

        # non-seekable streams still read fine, but reset() raises
        wrapper = feedparser.encodings.ResetFileWrapper(_make_file_not_seekable(b'abcdef'))
        self.assertEqual(wrapper.read(), b'abcdef')
        self.assertEqual(wrapper.read(), b'')
        with self.assertRaises(io.UnsupportedOperation):
            wrapper.reset()
        self.assertEqual(wrapper.read(), b'')

        wrapper = feedparser.encodings.ResetFileWrapper(_make_file_not_seekable(b'abcdef'))
        self.assertEqual(wrapper.read(3), b'abc')
        with self.assertRaises(io.UnsupportedOperation):
            wrapper.reset()
        self.assertEqual(wrapper.read(), b'def')

    def test_prefix_file_wrapper_no_prefix(self):
        # with an empty prefix, the wrapper behaves like the file itself
        wrapper = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
        self.assertEqual(wrapper.read(), b'abc')

        wrapper = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
        self.assertEqual(wrapper.read(1), b'a')


def make_prefix_file_wrapper_test(make_file):
    """Return a test method exercising PrefixFileWrapper over a file
    produced by make_file(data).
    """

    def test(self):
        # unbounded read returns prefix plus file, then EOF
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
        self.assertEqual(wrapper.read(), b'abcdef')
        self.assertEqual(wrapper.read(), b'')

        # bounded reads spanning the prefix/file boundary
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
        self.assertEqual(wrapper.read(2), b'ab')
        self.assertEqual(wrapper.read(2), b'cd')
        self.assertEqual(wrapper.read(2), b'ef')
        self.assertEqual(wrapper.read(2), b'')
        self.assertEqual(wrapper.read(), b'')

        # bounded reads landing exactly on the boundary
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
        self.assertEqual(wrapper.read(3), b'abc')
        self.assertEqual(wrapper.read(3), b'def')
        self.assertEqual(wrapper.read(3), b'')
        self.assertEqual(wrapper.read(), b'')

        # a zero-size read consumes nothing
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
        self.assertEqual(wrapper.read(0), b'')
        self.assertEqual(wrapper.read(), b'abcdef')

    return test


def _make_file_in_the_middle(data):
prefix = b'zzzzz'
rv = io.BytesIO(prefix + data)
rv.seek(len(prefix))
return rv

class _make_file_one_by_one(io.BytesIO):
def read(self, size=-1):
if size <= 0:
return super().read(size)
return super().read(1)

class _make_file_not_seekable(io.BytesIO):
    """BytesIO whose tell()/seek() raise, simulating a non-seekable stream."""
    def tell(self):
        raise io.UnsupportedOperation
    def seek(self, *args):
        raise io.UnsupportedOperation

prefix_file_wrapper_file_factories = [
    io.BytesIO,
    _make_file_in_the_middle,
    _make_file_one_by_one,
]

# Register one PrefixFileWrapper test per file factory, named after the
# factory. Using func.__name__ (always 'test', the inner function's name)
# made every setattr() collide on 'test_prefix_file_wrapper_test',
# so only the last factory was actually tested.
for factory in prefix_file_wrapper_file_factories:
    func = make_prefix_file_wrapper_test(factory)
    setattr(
        TestEncodingsHelpers,
        'test_prefix_file_wrapper_%s' % factory.__name__.lstrip('_'),
        func,
    )


class TestFeedParserDict(unittest.TestCase):
"""Ensure that FeedParserDict returns values as expected and won't crash"""

Expand Down Expand Up @@ -989,6 +1086,7 @@ def runtests():
testsuite.addTest(testloader.loadTestsFromTestCase(TestStrictParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestLooseParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodings))
testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodingsHelpers))
testsuite.addTest(testloader.loadTestsFromTestCase(TestDateParsers))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTMLGuessing))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTTPStatus))
Expand Down

0 comments on commit d10ffee

Please sign in to comment.