Skip to content

Commit

Permalink
Add stream-oriented version of convert_to_utf8().
Browse files Browse the repository at this point in the history
  • Loading branch information
lemon24 committed Jan 27, 2022
1 parent 3a806ae commit d10ffee
Show file tree
Hide file tree
Showing 2 changed files with 320 additions and 0 deletions.
222 changes: 222 additions & 0 deletions feedparser/encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import cgi
import codecs
import io
import re

try:
Expand Down Expand Up @@ -291,3 +292,224 @@ def convert_to_utf8(http_headers, data, result):
result['bozo'] = True
result['bozo_exception'] = error
return data


# How much to read from a binary file in order to detect encoding.
# In initial tests, 4k was enough for ~160 mostly-English feeds;
# 64k seems like a safe margin.
CONVERT_FILE_PREFIX_LEN = 2 ** 16

# How much to read from a text file, and use as an utf-8 bytes prefix.
# Note that no encoding detection is needed in this case.
CONVERT_FILE_STR_PREFIX_LEN = 2 ** 13


def convert_file_to_utf8(http_headers, file, result, optimistic_encoding_detection=True):
    """Like convert_to_utf8(), but for a stream.

    Unlike convert_to_utf8(), do not read the entire file in memory;
    instead, return a text stream that decodes it on the fly.
    This should consume significantly less memory,
    because it avoids (repeatedly) converting the entire file contents
    from bytes to str and back.

    To detect the encoding, only a prefix of the file contents is used.
    In rare cases, the wrong encoding may be detected for this prefix;
    use optimistic_encoding_detection=False to use the entire file contents
    (equivalent to a plain convert_to_utf8() call).

    Args:
        http_headers (dict): The response headers.
        file (IO[bytes] or IO[str]): A read()-able (binary) stream.
        result (dict): The result dictionary.
        optimistic_encoding_detection (bool):
            If true, use only a prefix of the file content to detect encoding.

    Returns:
        StreamFactory: a stream factory, with the detected encoding set, if any

    """
    # Currently, this wraps convert_to_utf8(), because the logic is simply
    # too complicated to ensure it's re-implemented correctly for a stream.
    # That said, it should be possible to change the implementation
    # transparently (not sure it's worth it, though).

    is_text_stream = isinstance(file.read(0), str)

    if is_text_stream:
        # A text stream needs no encoding detection, but we still run a
        # bytes prefix through convert_to_utf8() for its side effects:
        # sniffing / setting result['content-type'], and letting
        # replace_doctype() extract safe_entities downstream.
        raw_prefix = file.read(CONVERT_FILE_STR_PREFIX_LEN).encode('utf-8')
        converted_prefix = convert_to_utf8(http_headers, raw_prefix, result)
        result['encoding'] = 'utf-8'
        return StreamFactory(converted_prefix, file, 'utf-8')

    if not optimistic_encoding_detection:
        # Reading everything shouldn't increase memory usage if file is
        # BytesIO, since BytesIO does copy-on-write;
        # https://bugs.python.org/issue22003
        data = convert_to_utf8(http_headers, file.read(), result)

        # data *is* the prefix here, so there's nothing left in the stream.
        return StreamFactory(data, io.BytesIO(b''), result.get('encoding'))

    converted_prefix = convert_file_prefix_to_utf8(http_headers, file, result)
    return StreamFactory(converted_prefix, file, result.get('encoding'))


def convert_file_prefix_to_utf8(http_headers, file, result, prefix_len=CONVERT_FILE_PREFIX_LEN):
    """Like convert_to_utf8(), but only use the prefix of a binary file.

    Set result like convert_to_utf8() would.

    Args:
        http_headers (dict): The response headers.
        file (IO[bytes]): A read()-able binary stream.
        result (dict): The result dictionary.
        prefix_len (int): How many bytes of the file to use.

    Returns:
        bytes: The updated (converted) prefix.
    """
    prefix = file.read(prefix_len)

    # A UTF-8 code point is at most 4 bytes long, so we call
    # convert_to_utf8() up to 4 times, extending the prefix one byte
    # at a time, to make sure we eventually land on a boundary.
    for attempt in range(4):
        fake_result = {}
        converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)

        # conversion succeeded, or the prefix is actually the whole file
        if not fake_result.get('bozo') or len(prefix) < prefix_len:
            break

        # Don't consume a byte that will never be converted: previously,
        # the final failed attempt still read one byte from the stream and
        # appended it to the prefix after the last conversion, so that byte
        # appeared in neither the returned prefix nor the remaining stream.
        if attempt == 3:
            break

        byte = file.read(1)
        if not byte:
            break

        prefix += byte
        prefix_len += 1

    result.update(fake_result)
    return converted_prefix


class MissingEncoding(io.UnsupportedOperation):
    """Raised by StreamFactory.get_text_file() when no encoding is known
    and no fallback encoding was given.

    Subclasses io.UnsupportedOperation, so callers catching that
    exception keep working.
    """
    pass


class StreamFactory:

    """Decode on the fly a binary stream that *may* have a known encoding.

    If the underlying stream is seekable, it is possible to call
    the get_{text,binary}_file() methods more than once.
    """

    def __init__(self, prefix: bytes, file, encoding=None):
        # already-converted (UTF-8) bytes prefix of the stream
        self.prefix = prefix
        # the rest of the stream, wrapped so it can be rewound if seekable
        self.file = ResetFileWrapper(file)
        # detected encoding, or None if unknown
        self.encoding = encoding
        # False until the first get_*_file() call; see reset()
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors='strict'):
        # Return a text (str) file object stitching prefix and stream.
        # Raises MissingEncoding if neither a detected encoding
        # nor a fallback_encoding is available.
        encoding = self.encoding or fallback_encoding
        if encoding is None:
            raise MissingEncoding("cannot create text stream without encoding")

        if isinstance(self.file.read(0), str):
            # underlying stream is already text; decode the prefix with the
            # same encoding it was encoded with (in practice 'utf-8',
            # as set by convert_file_to_utf8() — see its text-stream branch)
            file = PrefixFileWrapper(self.prefix.decode(encoding), self.file)
        else:
            # the prefix is UTF-8 regardless of the stream's encoding
            # (convert_to_utf8() produced it), while the rest of the stream
            # still needs decoding with the detected/fallback encoding
            file = PrefixFileWrapper(
                self.prefix.decode('utf-8', errors),
                codecs.getreader(encoding)(self.file, errors)
            )

        self.reset()
        return file

    def get_binary_file(self):
        # Return a binary (bytes) file object stitching prefix and stream.
        # Raises io.UnsupportedOperation if the underlying stream is text.
        if isinstance(self.file.read(0), str):
            raise io.UnsupportedOperation("underlying stream is text, not binary") from None

        file = PrefixFileWrapper(self.prefix, self.file)

        self.reset()
        return file

    def get_file(self):
        # Prefer a text file; fall back to binary when no encoding is known.
        try:
            return self.get_text_file()
        except MissingEncoding:
            return self.get_binary_file()

    def reset(self):
        # Rewind only from the second call on: the first file handed out
        # must continue from the stream's current position.
        if self.should_reset:
            self.file.reset()
        self.should_reset = True


class ResetFileWrapper:
    """Given a seekable file, allow reading its content again
    (from the current position) by calling reset().
    """

    def __init__(self, file):
        self.file = file
        try:
            # remember where the stream was, so reset() can come back here
            self.file_initial_offset = file.tell()
        except OSError:
            # not seekable; io.UnsupportedOperation is an OSError subclass
            self.file_initial_offset = None

    def read(self, size=-1):
        """Read from the underlying file."""
        return self.file.read(size)

    def reset(self):
        """Rewind to the position the stream had when the wrapper was made.

        Raises:
            io.UnsupportedOperation: If the underlying stream is not seekable.
        """
        if self.file_initial_offset is None:
            # tell() failed in __init__; previously this fell through to
            # seek(None), which could raise TypeError for some streams
            # instead of the documented io.UnsupportedOperation
            raise io.UnsupportedOperation("underlying stream is not seekable")
        self.file.seek(self.file_initial_offset)


class PrefixFileWrapper:
    """Stitch a (possibly modified) prefix and a file into a new file object.

    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'

    """

    def __init__(self, prefix, file):
        self.prefix = prefix
        self.file = file
        # how much of the prefix has been consumed by read() so far
        self.offset = 0

    def read(self, size=-1):
        # read(0) yields an empty value of the right type (b'' or ''),
        # so the concatenations below work for both bytes and str
        buffer = self.file.read(0)

        if self.offset < len(self.prefix):
            if size < 0:
                # only the *remaining* prefix; using the whole prefix here
                # (as before) re-yielded bytes already returned by earlier
                # bounded reads, e.g. read(1) then read() duplicated data
                chunk = self.prefix[self.offset:]
            else:
                chunk = self.prefix[self.offset : self.offset+size]
            size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)

            # a negative size means "read everything", which a single
            # read() call above already attempted
            if size <= 0:
                break

            size -= len(chunk)

        return buffer

    def close(self):
        # do not touch the underlying stream
        pass

98 changes: 98 additions & 0 deletions tests/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import feedparser
import feedparser.api
import feedparser.datetimes
import feedparser.encodings
import feedparser.http
import feedparser.mixin
import feedparser.sanitizer
Expand Down Expand Up @@ -295,6 +296,102 @@ def test_gb2312_converted_to_gb18030_in_xml_encoding(self):
self.assertEqual(result.encoding, 'gb18030')


class TestEncodingsHelpers(BaseTestCase):
    """Unit tests for the stream helpers in feedparser.encodings."""

    def test_reset_file_wrapper(self):
        wrapper = feedparser.encodings.ResetFileWrapper(io.BytesIO(b'abcdef'))
        self.assertEqual(wrapper.read(2), b'ab')
        wrapper.reset()
        self.assertEqual(wrapper.read(), b'abcdef')

        # reset() goes back to the position at wrapping time, not to zero
        file = io.BytesIO(b'abcdef')
        file.read(2)
        wrapper = feedparser.encodings.ResetFileWrapper(file)
        self.assertEqual(wrapper.read(2), b'cd')
        wrapper.reset()
        self.assertEqual(wrapper.read(), b'cdef')

        # non-seekable streams still read fine, but reset() raises
        wrapper = feedparser.encodings.ResetFileWrapper(_make_file_not_seekable(b'abcdef'))
        self.assertEqual(wrapper.read(), b'abcdef')
        self.assertEqual(wrapper.read(), b'')
        with self.assertRaises(io.UnsupportedOperation):
            wrapper.reset()
        self.assertEqual(wrapper.read(), b'')

        wrapper = feedparser.encodings.ResetFileWrapper(_make_file_not_seekable(b'abcdef'))
        self.assertEqual(wrapper.read(3), b'abc')
        with self.assertRaises(io.UnsupportedOperation):
            wrapper.reset()
        self.assertEqual(wrapper.read(), b'def')

    def test_prefix_file_wrapper_no_prefix(self):
        # with an empty prefix, the wrapper behaves like the file itself
        wrapper = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
        self.assertEqual(wrapper.read(), b'abc')

        wrapper = feedparser.encodings.PrefixFileWrapper(b'', io.BytesIO(b'abc'))
        self.assertEqual(wrapper.read(1), b'a')


def make_prefix_file_wrapper_test(make_file):
    """Return a test method exercising PrefixFileWrapper over a file
    produced by make_file(data).
    """

    def test(self):
        # unbounded read returns prefix plus file, then EOF
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
        self.assertEqual(wrapper.read(), b'abcdef')
        self.assertEqual(wrapper.read(), b'')

        # bounded reads spanning the prefix/file boundary
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
        self.assertEqual(wrapper.read(2), b'ab')
        self.assertEqual(wrapper.read(2), b'cd')
        self.assertEqual(wrapper.read(2), b'ef')
        self.assertEqual(wrapper.read(2), b'')
        self.assertEqual(wrapper.read(), b'')

        # bounded reads landing exactly on the boundary
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
        self.assertEqual(wrapper.read(3), b'abc')
        self.assertEqual(wrapper.read(3), b'def')
        self.assertEqual(wrapper.read(3), b'')
        self.assertEqual(wrapper.read(), b'')

        # a zero-size read consumes nothing
        wrapper = feedparser.encodings.PrefixFileWrapper(b'abc', make_file(b'def'))
        self.assertEqual(wrapper.read(0), b'')
        self.assertEqual(wrapper.read(), b'abcdef')

    return test


def _make_file_in_the_middle(data):
prefix = b'zzzzz'
rv = io.BytesIO(prefix + data)
rv.seek(len(prefix))
return rv

class _make_file_one_by_one(io.BytesIO):
def read(self, size=-1):
if size <= 0:
return super().read(size)
return super().read(1)

class _make_file_not_seekable(io.BytesIO):
    """BytesIO whose tell()/seek() raise, simulating a non-seekable stream."""
    def tell(self):
        raise io.UnsupportedOperation
    def seek(self, *args):
        raise io.UnsupportedOperation

prefix_file_wrapper_file_factories = [
    io.BytesIO,
    _make_file_in_the_middle,
    _make_file_one_by_one,
]

# Register one PrefixFileWrapper test per file factory, named after the
# factory. Using func.__name__ (always 'test', the inner function's name)
# made every setattr() collide on 'test_prefix_file_wrapper_test',
# so only the last factory was actually tested.
for factory in prefix_file_wrapper_file_factories:
    func = make_prefix_file_wrapper_test(factory)
    setattr(
        TestEncodingsHelpers,
        'test_prefix_file_wrapper_%s' % factory.__name__.lstrip('_'),
        func,
    )


class TestFeedParserDict(unittest.TestCase):
"""Ensure that FeedParserDict returns values as expected and won't crash"""

Expand Down Expand Up @@ -989,6 +1086,7 @@ def runtests():
testsuite.addTest(testloader.loadTestsFromTestCase(TestStrictParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestLooseParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodings))
testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodingsHelpers))
testsuite.addTest(testloader.loadTestsFromTestCase(TestDateParsers))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTMLGuessing))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTTPStatus))
Expand Down

0 comments on commit d10ffee

Please sign in to comment.