From 00b29e4b32d3e18f3c965a65b8740f3ca470404d Mon Sep 17 00:00:00 2001 From: lemon24 Date: Sat, 29 Jan 2022 00:01:41 +0200 Subject: [PATCH] Only read the feed in memory if the underlying resource is not seekable. For #296. --- feedparser/api.py | 69 ++++++++++++++++++++++++++++++++--------------- tests/runtests.py | 10 +++---- 2 files changed, 52 insertions(+), 27 deletions(-) diff --git a/feedparser/api.py b/feedparser/api.py index eafe04c0..19ac4439 100644 --- a/feedparser/api.py +++ b/feedparser/api.py @@ -106,20 +106,42 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h if request_headers is supplied it is a dictionary of HTTP request headers that will override the values generated by FeedParser. - :return: A bytes object. + :return: A seekable, readable file object. """ - if hasattr(url_file_stream_or_string, 'read'): - return url_file_stream_or_string.read() + # Some notes on the history of the implementation of _open_resource(). + # + # parse() might need to go over the feed content twice: + # if the strict parser fails, it tries again with the loose parser. + # + # In 5.2.0, this returned an open file, to be read() by parse(). + # By 6.0.8, this returned bytes directly. + # + # Since #296 (>6.0.8), this once again returns an open file + # (to reduce memory usage, see convert_file_to_utf8() for details). + # However, to accommodate parse() needing the content twice, + # the returned file is guaranteed to be seekable. + # (If the underlying resource is not seekable, + # the content is read and wrapped in an io.BytesIO/StringIO.) 
- if isinstance(url_file_stream_or_string, str) \ - and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): - return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + if hasattr(url_file_stream_or_string, 'read'): + if hasattr(url_file_stream_or_string, 'seekable'): + if url_file_stream_or_string.seekable(): + return url_file_stream_or_string + return _to_in_memory_file(url_file_stream_or_string.read()) + + looks_like_url = ( + isinstance(url_file_stream_or_string, str) + and urllib.parse.urlparse(url_file_stream_or_string)[0] + in ('http', 'https', 'ftp', 'file', 'feed') + ) + if looks_like_url: + data = http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + return io.BytesIO(data) # try to open with native open function (if url_file_stream_or_string is a filename) try: - with open(url_file_stream_or_string, 'rb') as f: - data = f.read() + return open(url_file_stream_or_string, 'rb') except (IOError, UnicodeEncodeError, TypeError, ValueError): # if url_file_stream_or_string is a str object that # cannot be converted to the encoding returned by @@ -129,13 +151,16 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h # (such as an XML document encoded in UTF-32), TypeError will # be thrown. 
pass - else: - return data - # treat url_file_stream_or_string as string - if not isinstance(url_file_stream_or_string, bytes): - return url_file_stream_or_string.encode('utf-8') - return url_file_stream_or_string + # treat url_file_stream_or_string as bytes/string + return _to_in_memory_file(url_file_stream_or_string) + + +def _to_in_memory_file(data): + if isinstance(data, str): + return io.StringIO(data) + else: + return io.BytesIO(data) class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor): @@ -221,7 +246,7 @@ def parse( ) try: - data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + file = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) except urllib.error.URLError as error: result.update({ 'bozo': True, @@ -229,17 +254,17 @@ def parse( }) return result - if not data: + # at this point, the file is guaranteed to be seekable; + # we read 1 byte/character to see if it's empty and return early + # (this preserves the behavior in 6.0.8) + initial_file_offset = file.tell() + if not file.read(1): return result + file.seek(initial_file_offset) # overwrite existing headers using response_headers result['headers'].update(response_headers or {}) - # TODO (lemon24): remove this once _open_resource() returns an open file - file = io.BytesIO(data) - - # TODO (lemon24): handle io.UnsupportedOperation raised by seek() attempts - try: _parse_file_inplace( file, @@ -257,7 +282,7 @@ def parse( def _parse_file_inplace( - file: IO[bytes], + file: Union[IO[bytes], IO[str]], result: dict, *, resolve_relative_uris: bool = None, diff --git a/tests/runtests.py b/tests/runtests.py index 60a7a1a8..97584e74 100644 --- a/tests/runtests.py +++ b/tests/runtests.py @@ -476,7 +476,7 @@ class TestOpenResource(unittest.TestCase): """Ensure that `_open_resource()` interprets its arguments as URIs, file-like objects, or in-memory feeds as expected""" def 
test_fileobj(self): - r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(io.BytesIO(b''), '', '', '', '', [], {}, {}).read() self.assertEqual(r, b'') def test_feed(self): @@ -489,22 +489,22 @@ def test_feed_http(self): def test_bytes(self): s = b'text' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_string(self): s = b'text' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_unicode_1(self): s = b'text' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_unicode_2(self): s = br't\u00e9xt' - r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}) + r = feedparser.api._open_resource(s, '', '', '', '', [], {}, {}).read() self.assertEqual(s, r) def test_http_client_ascii_unicode_encode_error(self):