Skip to content

Commit

Permalink
In convert_file_to_utf8(), check the entire file can be decoded.
Browse files Browse the repository at this point in the history
  • Loading branch information
lemon24 committed Apr 25, 2022
1 parent a29c54b commit a1fe3e1
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 7 deletions.
39 changes: 32 additions & 7 deletions feedparser/encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ def convert_to_utf8(http_headers, data, result):
# Note that no encoding detection is needed in this case.
CONVERT_FILE_STR_PREFIX_LEN = 2 ** 13

# Size passed to each text_file.read() call when convert_file_to_utf8()
# test-decodes the whole file; reading in chunks limits memory usage.
CONVERT_FILE_TEST_CHUNK_LEN = 2 ** 16


def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection=True):
"""Like convert_to_utf8(), but for a stream.
Expand Down Expand Up @@ -347,15 +349,38 @@ def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detecti

if optimistic_encoding_detection:
prefix = convert_file_prefix_to_utf8(http_headers, file, result)
return StreamFactory(prefix, file, result.get('encoding'))
factory = StreamFactory(prefix, file, result.get('encoding'))

else:
# this shouldn't increase memory usage if file is BytesIO,
# since BytesIO does copy-on-write; https://bugs.python.org/issue22003
data = convert_to_utf8(http_headers, file.read(), result)
# Before returning factory, ensure the entire file can be decoded;
# if it cannot, fall back to convert_to_utf8().
#
# Not doing this means feedparser.parse() may raise UnicodeDecodeError
# instead of setting bozo_exception to CharacterEncodingOverride,
# breaking the 6.x API.

try:
text_file = factory.get_text_file()
except MissingEncoding:
return factory
try:
# read in chunks to limit memory usage
while True:
chunk = text_file.read(CONVERT_FILE_TEST_CHUNK_LEN)
if not chunk:
break
del chunk
except UnicodeDecodeError:
# fall back to convert_to_utf8()
file = factory.get_binary_file()
else:
return factory

# this shouldn't increase memory usage if file is BytesIO,
# since BytesIO does copy-on-write; https://bugs.python.org/issue22003
data = convert_to_utf8(http_headers, file.read(), result)

# note that data *is* the prefix
return StreamFactory(data, io.BytesIO(b''), result.get('encoding'))
# note that data *is* the prefix
return StreamFactory(data, io.BytesIO(b''), result.get('encoding'))


def convert_file_prefix_to_utf8(
Expand Down
21 changes: 21 additions & 0 deletions tests/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,27 @@ def test_prefix_file_wrapper_no_prefix(self):
f = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
self.assertEqual(f.read(1) , b'a')

def test_convert_file_to_utf8_decode_error_fallback(self):
    """Check the stream converter matches convert_to_utf8() on a bad tail.

    The same bytes are run through convert_to_utf8() and through
    convert_file_to_utf8(); the converted output, the reported encoding
    and the type of the recorded bozo_exception must all agree.
    """
    from feedparser.encodings import convert_file_to_utf8, convert_to_utf8

    sample = "abcd😀"
    # UTF-8 text repeated CONVERT_FILE_PREFIX_LEN times, presumably so the
    # tail lies beyond whatever prefix is inspected up front (TODO confirm),
    # followed by a UTF-32 tail that is not valid UTF-8.
    utf8_body = sample.encode('utf-8') * feedparser.encodings.CONVERT_FILE_PREFIX_LEN
    payload = utf8_body + sample.encode('utf-32')
    headers = {}

    # Reference run: the in-memory converter.
    reference = {}
    reference_bytes = convert_to_utf8(headers, payload, reference)

    # Run under test: the stream-based converter on identical bytes.
    streamed = {}
    stream_factory = convert_file_to_utf8(headers, io.BytesIO(payload), streamed)
    streamed_bytes = stream_factory.get_binary_file().read()

    self.assertEqual(streamed_bytes, reference_bytes)
    self.assertEqual(streamed['encoding'], reference['encoding'])
    self.assertEqual(
        type(streamed['bozo_exception']),
        type(reference['bozo_exception']),
    )


def make_prefix_file_wrapper_test(make_file):

Expand Down

0 comments on commit a1fe3e1

Please sign in to comment.