In convert_file_to_utf8(), check that the entire file can be decoded.

For kurtmckee#296.
lemon24 · Apr 25, 2022 · a1fe3e1
1 parent a29c54b
commit a1fe3e1
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 7 deletions.
diff --git a/feedparser/encodings.py b/feedparser/encodings.py
@@ -303,6 +303,8 @@ def convert_to_utf8(http_headers, data, result):
 # Note that no encoding detection is needed in this case.
 CONVERT_FILE_STR_PREFIX_LEN = 2 ** 13
 
+CONVERT_FILE_TEST_CHUNK_LEN = 2 ** 16
+
 
 def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection=True):
     """Like convert_to_utf8(), but for a stream.
@@ -347,15 +349,38 @@ def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detecti
 
     if optimistic_encoding_detection:
         prefix = convert_file_prefix_to_utf8(http_headers, file, result)
-        return StreamFactory(prefix, file, result.get('encoding'))
+        factory = StreamFactory(prefix, file, result.get('encoding'))
 
-    else:
-        # this shouldn't increase memory usage if file is BytesIO,
-        # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
-        data = convert_to_utf8(http_headers, file.read(), result)
+        # Before returning factory, ensure the entire file can be decoded;
+        # if it cannot, fall back to convert_to_utf8().
+        #
+        # Not doing this means feedparser.parse() may raise UnicodeDecodeError
+        # instead of setting bozo_exception to CharacterEncodingOverride,
+        # breaking the 6.x API.
+
+        try:
+            text_file = factory.get_text_file()
+        except MissingEncoding:
+            return factory
+        try:
+            # read in chunks to limit memory usage
+            while True:
+                chunk = text_file.read(CONVERT_FILE_TEST_CHUNK_LEN)
+                if not chunk:
+                    break
+                del chunk
+        except UnicodeDecodeError:
+            # fall back to convert_to_utf8()
+            file = factory.get_binary_file()
+        else:
+            return factory
+
+    # this shouldn't increase memory usage if file is BytesIO,
+    # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
+    data = convert_to_utf8(http_headers, file.read(), result)
 
-        # note that data *is* the prefix
-        return StreamFactory(data, io.BytesIO(b''), result.get('encoding'))
+    # note that data *is* the prefix
+    return StreamFactory(data, io.BytesIO(b''), result.get('encoding'))
 
 
 def convert_file_prefix_to_utf8(

diff --git a/tests/runtests.py b/tests/runtests.py
@@ -331,6 +331,27 @@ def test_prefix_file_wrapper_no_prefix(self):
         f = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
         self.assertEqual(f.read(1) , b'a')
 
+    def test_convert_file_to_utf8_decode_error_fallback(self):
+        from feedparser.encodings import convert_to_utf8, convert_file_to_utf8
+
+        input = (
+            "abcd😀".encode('utf-8') * feedparser.encodings.CONVERT_FILE_PREFIX_LEN
+            + "abcd😀".encode('utf-32')
+        )
+        headers = {}
+
+        expected_result = {}
+        expected_output = convert_to_utf8(headers, input, expected_result)
+        actual_result = {}
+        factory = convert_file_to_utf8(headers, io.BytesIO(input), actual_result)
+
+        self.assertEqual(factory.get_binary_file().read(), expected_output)
+        self.assertEqual(actual_result['encoding'], expected_result['encoding'])
+        self.assertEqual(
+            type(actual_result['bozo_exception']),
+            type(expected_result['bozo_exception'])
+        )
+
 
 def make_prefix_file_wrapper_test(make_file):