From 00b29e4b32d3e18f3c965a65b8740f3ca470404d Mon Sep 17 00:00:00 2001 From: lemon24 Date: Sat, 29 Jan 2022 00:01:41 +0200 Subject: [PATCH] Only read the feed in memory if the underlying resource is not seekable. For #296. --- feedparser/api.py | 69 ++++++++++++++++++++++++++++++++--------------- tests/runtests.py | 10 +++---- 2 files changed, 52 insertions(+), 27 deletions(-) diff --git a/feedparser/api.py b/feedparser/api.py index eafe04c0..19ac4439 100644 --- a/feedparser/api.py +++ b/feedparser/api.py @@ -106,20 +106,42 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h if request_headers is supplied it is a dictionary of HTTP request headers that will override the values generated by FeedParser. - :return: A bytes object. + :return: A seekable, readable file object. """ - if hasattr(url_file_stream_or_string, 'read'): - return url_file_stream_or_string.read() + # Some notes on the history of the implementation of _open_resource(). + # + # parse() might need to go over the feed content twice: + # if the strict parser fails, it tries again with the loose parser. + # + # In 5.2.0, this returned an open file, to be read() by parse(). + # By 6.0.8, this returned bytes directly. + # + # Since #296 (>6.0.8), this once again returns an open file + # (to reduce memory usage, see convert_file_to_utf8() for details). + # However, to accommodate parse() needing the content twice, + # the returned file is guaranteed to be seekable. + # (If the underlying resource is not seekable, + # the content is read and wrapped in an io.BytesIO/StringIO.) 
- if isinstance(url_file_stream_or_string, str) \ - and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): - return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + if hasattr(url_file_stream_or_string, 'read'): + if hasattr(url_file_stream_or_string, 'seekable'): + if url_file_stream_or_string.seekable(): + return url_file_stream_or_string + return _to_in_memory_file(url_file_stream_or_string.read()) + + looks_like_url = ( + isinstance(url_file_stream_or_string, str) + and urllib.parse.urlparse(url_file_stream_or_string)[0] + in ('http', 'https', 'ftp', 'file', 'feed') + ) + if looks_like_url: + data = http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + return io.BytesIO(data) # try to open with native open function (if url_file_stream_or_string is a filename) try: - with open(url_file_stream_or_string, 'rb') as f: - data = f.read() + return open(url_file_stream_or_string, 'rb') except (IOError, UnicodeEncodeError, TypeError, ValueError): # if url_file_stream_or_string is a str object that # cannot be converted to the encoding returned by @@ -129,13 +151,16 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h # (such as an XML document encoded in UTF-32), TypeError will # be thrown. 
pass - else: - return data - # treat url_file_stream_or_string as string - if not isinstance(url_file_stream_or_string, bytes): - return url_file_stream_or_string.encode('utf-8') - return url_file_stream_or_string + # treat url_file_stream_or_string as bytes/string + return _to_in_memory_file(url_file_stream_or_string) + + +def _to_in_memory_file(data): + if isinstance(data, str): + return io.StringIO(data) + else: + return io.BytesIO(data) class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor): @@ -221,7 +246,7 @@ def parse( ) try: - data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + file = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) except urllib.error.URLError as error: result.update({ 'bozo': True, @@ -229,17 +254,17 @@ def parse( }) return result - if not data: + # at this point, the file is guaranteed to be seekable; + # we read 1 byte/character to see if it's empty and return early + # (this preserves the behavior in 6.0.8) + initial_file_offset = file.tell() + if not file.read(1): return result + file.seek(initial_file_offset) # overwrite existing headers using response_headers result['headers'].update(response_headers or {}) - # TODO (lemon24): remove this once _open_resource() returns an open file - file = io.BytesIO(data) - - # TODO (lemon24): handle io.UnsupportedOperation raised by seek() attempts - try: _parse_file_inplace( file, @@ -257,7 +282,7 @@ def parse( def _parse_file_inplace( - file: IO[bytes], + file: Union[IO[bytes], IO[str]], result: dict, *, resolve_relative_uris: bool = None, diff --git a/tests/runtests.py b/tests/runtests.py index 60a7a1a8..97584e74 100644 --- a/tests/runtests.py +++ b/tests/runtests.py @@ -476,7 +476,7 @@ class TestOpenResource(unittest.TestCase): """Ensure that `_open_resource()` interprets its arguments as URIs, file-like objects, or in-memory feeds as expected""" def 
test_fileobj(self): - r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {}).read() self.assertEqual(r, b'') def test_feed(self): @@ -489,22 +489,22 @@ def test_feed_http(self): def test_bytes(self): s = b'text' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_string(self): s = b'text' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_unicode_1(self): s = b'text' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_unicode_2(self): s = br't\u00e9xt' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_http_client_ascii_unicode_encode_error(self):