From 8fdd9bc8b7c0ef0dc5587215b12adf9a368a8183 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Tue, 7 Jun 2022 22:14:03 -0700 Subject: [PATCH 1/8] Added support to anyopen for taking filelike objects Use in this way requires specifying the compression, if any, that the stream features in order to decompress it. --- src/alchemlyb/parsing/util.py | 38 ++++++++++++++++++---- src/alchemlyb/tests/parsing/test_util.py | 41 ++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/src/alchemlyb/parsing/util.py b/src/alchemlyb/parsing/util.py index 6bd2e096..91dce8b1 100644 --- a/src/alchemlyb/parsing/util.py +++ b/src/alchemlyb/parsing/util.py @@ -2,6 +2,8 @@ """ import os +from os import PathLike +from typing import IO, Optional import bz2 import gzip import zipfile @@ -14,19 +16,26 @@ def gzip_open(filename, mode): mode += 't' if mode in ['r','w','a','x'] else '' return gzip.open(filename, mode) -def anyopen(filename, mode='r'): - """Return a file stream for filename, even if compressed. +def anyopen(datafile: PathLike | IO, mode='r', compression=None): + """Return a file stream for file or stream, even if compressed. Supports files compressed with bzip2 (.bz2), gzip (.gz), and zip (.zip) compression schemes. The appropriate extension must be present for the function to properly handle the file. + If giving a stream for `datafile`, then you must specify `compression` if + the stream is compressed. + Parameters ---------- - filename : str - Path to file to use. + datafile : PathLike | IO + Path to file to use, or an open IO stream. If an IO stream, use + `compression` to specify the type of compression used, if any. mode : str Mode for stream; usually 'r' or 'w'. + compression : str + Use to specify compression. + Must be one of 'bz2', 'gz', 'zip'. 
Returns ------- @@ -39,12 +48,27 @@ def anyopen(filename, mode='r'): '.gz': gzip_open, '.zip': zipfile.ZipFile} - ext = os.path.splitext(filename)[1] + # compression selections available + compressions = {'bz2': bz2_open, + 'gz': gzip_open, + 'zip': zipfile.ZipFile} + + # if `datafile` is a stream + if hasattr(datafile, 'read'): + # if no compression specified, just pass the stream through + if compression is None: + return datafile + elif compression in compressions: + decompressor = compressions[compression] + return decompressor(datafile, mode=mode) + + # otherwise, treat as a file + ext = os.path.splitext(datafile)[1] if ext in extensions: - opener= extensions[ext] + opener = extensions[ext] else: opener = open - return opener(filename, mode) + return opener(datafile, mode) diff --git a/src/alchemlyb/tests/parsing/test_util.py b/src/alchemlyb/tests/parsing/test_util.py index 97c56fd6..d0f0a2aa 100644 --- a/src/alchemlyb/tests/parsing/test_util.py +++ b/src/alchemlyb/tests/parsing/test_util.py @@ -1,3 +1,5 @@ +import pytest + from alchemtest.gmx import load_expanded_ensemble_case_1 from alchemlyb.parsing.util import anyopen @@ -12,3 +14,42 @@ def test_gzip(): for filename in dataset['data'][leg]: with anyopen(filename, 'r') as f: assert type(f.readline()) is str + +def test_gzip_stream(): + """Test that `anyopen` reads streams with specified compression. + + """ + dataset = load_expanded_ensemble_case_1() + + for leg in dataset['data']: + for filename in dataset['data'][leg]: + with open(filename, 'rb') as f: + with anyopen(f, mode='r', compression='gz') as f_uc: + assert type(f_uc.readline()) is str + +def test_gzip_stream_wrong(): + """Test that `anyopen` gives failure for attempting to decompress gzip + stream with bz2. 
+ + """ + dataset = load_expanded_ensemble_case_1() + + for leg in dataset['data']: + for filename in dataset['data'][leg]: + with open(filename, 'rb') as f: + with anyopen(f, mode='r', compression='bz2') as f_uc: + with pytest.raises(OSError, match='Invalid data stream'): + assert type(f_uc.readline()) is str + +def test_gzip_stream_wrong_no_compression(): + """Test that `anyopen` gives passthrough when no compression specified on a + stream. + + """ + dataset = load_expanded_ensemble_case_1() + + for leg in dataset['data']: + for filename in dataset['data'][leg]: + with open(filename, 'rb') as f: + with anyopen(f, mode='r') as f_uc: + assert type(f_uc.readline()) is bytes From 682ebc0d7de88be52e53cb9740bcb0aa48a167c0 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Tue, 7 Jun 2022 22:51:33 -0700 Subject: [PATCH 2/8] Support python < 3.10 --- src/alchemlyb/parsing/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alchemlyb/parsing/util.py b/src/alchemlyb/parsing/util.py index 91dce8b1..f4dd79ea 100644 --- a/src/alchemlyb/parsing/util.py +++ b/src/alchemlyb/parsing/util.py @@ -3,7 +3,7 @@ """ import os from os import PathLike -from typing import IO, Optional +from typing import IO, Optional, Union import bz2 import gzip import zipfile @@ -16,7 +16,7 @@ def gzip_open(filename, mode): mode += 't' if mode in ['r','w','a','x'] else '' return gzip.open(filename, mode) -def anyopen(datafile: PathLike | IO, mode='r', compression=None): +def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): """Return a file stream for file or stream, even if compressed. Supports files compressed with bzip2 (.bz2), gzip (.gz), and zip (.zip) From 5f6e46dc94cf0ac98198b52ad771f8b6386185cf Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 9 Jun 2022 22:16:32 -0700 Subject: [PATCH 3/8] Added checks for stream behavior in anyopen; dropped broken zip support We never had working zip archive support from the looks of it. 
This is because zipfiles are not just a compression scheme, they are an archive format that features compression. --- src/alchemlyb/parsing/util.py | 23 +++++++------ src/alchemlyb/tests/parsing/test_util.py | 41 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/src/alchemlyb/parsing/util.py b/src/alchemlyb/parsing/util.py index f4dd79ea..d5483797 100644 --- a/src/alchemlyb/parsing/util.py +++ b/src/alchemlyb/parsing/util.py @@ -19,11 +19,11 @@ def gzip_open(filename, mode): def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): """Return a file stream for file or stream, even if compressed. - Supports files compressed with bzip2 (.bz2), gzip (.gz), and zip (.zip) - compression schemes. The appropriate extension must be present for - the function to properly handle the file. + Supports files compressed with bzip2 (.bz2) and gzip (.gz) compression + schemes. The appropriate extension must be present for the function to + properly handle the file. - If giving a stream for `datafile`, then you must specify `compression` if + If giving a stream for `datafile`, then you must specify `compression` if the stream is compressed. Parameters @@ -35,7 +35,7 @@ def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): Mode for stream; usually 'r' or 'w'. compression : str Use to specify compression. - Must be one of 'bz2', 'gz', 'zip'. + Must be one of 'bz2', 'gz'. 
Returns ------- @@ -45,22 +45,21 @@ def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): """ # opener for each type of file extensions = {'.bz2': bz2_open, - '.gz': gzip_open, - '.zip': zipfile.ZipFile} + '.gz': gzip_open} # compression selections available compressions = {'bz2': bz2_open, - 'gz': gzip_open, - 'zip': zipfile.ZipFile} + 'gz': gzip_open} # if `datafile` is a stream - if hasattr(datafile, 'read'): + if ((hasattr(datafile, 'read') and any((i in mode for i in ('r',)))) or + (hasattr(datafile, 'write') and any((i in mode for i in ('w', 'a', 'x'))))): # if no compression specified, just pass the stream through if compression is None: return datafile elif compression in compressions: - decompressor = compressions[compression] - return decompressor(datafile, mode=mode) + compressor = compressions[compression] + return compressor(datafile, mode=mode) # otherwise, treat as a file ext = os.path.splitext(datafile)[1] diff --git a/src/alchemlyb/tests/parsing/test_util.py b/src/alchemlyb/tests/parsing/test_util.py index d0f0a2aa..b854015b 100644 --- a/src/alchemlyb/tests/parsing/test_util.py +++ b/src/alchemlyb/tests/parsing/test_util.py @@ -1,3 +1,4 @@ +import io import pytest from alchemtest.gmx import load_expanded_ensemble_case_1 @@ -53,3 +54,43 @@ def test_gzip_stream_wrong_no_compression(): with open(filename, 'rb') as f: with anyopen(f, mode='r') as f_uc: assert type(f_uc.readline()) is bytes + +@pytest.mark.parametrize('compression', ['bz2', 'gz']) +def test_file_roundtrip(compression, tmp_path): + """Test that roundtripping write/read to a file works with `anyopen` + + """ + + data = "my momma told me to pick the very best one and you are not it" + + filepath = tmp_path / f'testfile.txt.{compression}' + with anyopen(filepath, mode='w') as f: + f.write(data) + + with anyopen(filepath, 'r') as f: + data_out = f.read() + + assert data_out == data + +@pytest.mark.parametrize('compression', ['bz2', 'gz']) +def 
test_file_roundtrip(compression): + """Test that roundtripping write/read to a file works with `anyopen` + + """ + + data = "my momma told me to pick the very best one and you are not it" + + with io.BytesIO() as stream: + + # write to stream + with anyopen(stream, mode='w', compression=compression) as f: + f.write(data) + + # start at the beginning + stream.seek(0) + + # read from stream + with anyopen(stream, 'r', compression=compression) as f: + data_out = f.read() + + assert data_out == data From 5980a945f3cd30e29a237c9441188e96d1c03115 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 9 Jun 2022 22:30:11 -0700 Subject: [PATCH 4/8] Reworked anyopen to allow `compression` keyword to override extension detection --- src/alchemlyb/parsing/util.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/alchemlyb/parsing/util.py b/src/alchemlyb/parsing/util.py index d5483797..c3d44693 100644 --- a/src/alchemlyb/parsing/util.py +++ b/src/alchemlyb/parsing/util.py @@ -12,10 +12,12 @@ def bz2_open(filename, mode): mode += 't' if mode in ['r','w','a','x'] else '' return bz2.open(filename, mode) + def gzip_open(filename, mode): mode += 't' if mode in ['r','w','a','x'] else '' return gzip.open(filename, mode) + def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): """Return a file stream for file or stream, even if compressed. @@ -34,13 +36,14 @@ def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): mode : str Mode for stream; usually 'r' or 'w'. compression : str - Use to specify compression. - Must be one of 'bz2', 'gz'. + Use to specify compression. Must be one of 'bz2', 'gz'. + Overrides use of extension for determining compression if `datafile` is + a file. Returns ------- stream : stream - Open stream for reading. + Open stream for reading or writing, depending on mode. 
""" # opener for each type of file @@ -60,14 +63,20 @@ def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): elif compression in compressions: compressor = compressions[compression] return compressor(datafile, mode=mode) + else: + raise ValueError(f"`datafile` is a stream, but specified `compression` '{compression}' is not supported") # otherwise, treat as a file - ext = os.path.splitext(datafile)[1] - - if ext in extensions: - opener = extensions[ext] + # allow compression to override any extension on the file + if compression in compressions: + opener = compressions[compression] - else: - opener = open + # use extension to determine the compression used, if present + elif compression is None: + ext = os.path.splitext(datafile)[1] + if ext in extensions: + opener = extensions[ext] + else: + opener = open return opener(datafile, mode) From 23f86eeb29fccd132b438d6f097a7d1b55a0b30d Mon Sep 17 00:00:00 2001 From: David Dotson Date: Thu, 9 Jun 2022 22:42:24 -0700 Subject: [PATCH 5/8] Added additional tests, explicit compression options for anyopen --- src/alchemlyb/parsing/util.py | 6 ++-- src/alchemlyb/tests/parsing/test_util.py | 43 +++++++++++++++++++----- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/alchemlyb/parsing/util.py b/src/alchemlyb/parsing/util.py index c3d44693..a87328e6 100644 --- a/src/alchemlyb/parsing/util.py +++ b/src/alchemlyb/parsing/util.py @@ -36,7 +36,7 @@ def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): mode : str Mode for stream; usually 'r' or 'w'. compression : str - Use to specify compression. Must be one of 'bz2', 'gz'. + Use to specify compression. Must be one of 'bzip2', 'gzip'. Overrides use of extension for determining compression if `datafile` is a file. 
@@ -51,8 +51,8 @@ def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): '.gz': gzip_open} # compression selections available - compressions = {'bz2': bz2_open, - 'gz': gzip_open} + compressions = {'bzip2': bz2_open, + 'gzip': gzip_open} # if `datafile` is a stream if ((hasattr(datafile, 'read') and any((i in mode for i in ('r',)))) or diff --git a/src/alchemlyb/tests/parsing/test_util.py b/src/alchemlyb/tests/parsing/test_util.py index b854015b..18d10e04 100644 --- a/src/alchemlyb/tests/parsing/test_util.py +++ b/src/alchemlyb/tests/parsing/test_util.py @@ -16,6 +16,7 @@ def test_gzip(): with anyopen(filename, 'r') as f: assert type(f.readline()) is str + def test_gzip_stream(): """Test that `anyopen` reads streams with specified compression. @@ -25,9 +26,10 @@ def test_gzip_stream(): for leg in dataset['data']: for filename in dataset['data'][leg]: with open(filename, 'rb') as f: - with anyopen(f, mode='r', compression='gz') as f_uc: + with anyopen(f, mode='r', compression='gzip') as f_uc: assert type(f_uc.readline()) is str + def test_gzip_stream_wrong(): """Test that `anyopen` gives failure for attempting to decompress gzip stream with bz2. @@ -38,10 +40,11 @@ def test_gzip_stream_wrong(): for leg in dataset['data']: for filename in dataset['data'][leg]: with open(filename, 'rb') as f: - with anyopen(f, mode='r', compression='bz2') as f_uc: + with anyopen(f, mode='r', compression='bzip2') as f_uc: with pytest.raises(OSError, match='Invalid data stream'): assert type(f_uc.readline()) is str + def test_gzip_stream_wrong_no_compression(): """Test that `anyopen` gives passthrough when no compression specified on a stream. 
@@ -55,15 +58,16 @@ def test_gzip_stream_wrong_no_compression(): with anyopen(f, mode='r') as f_uc: assert type(f_uc.readline()) is bytes -@pytest.mark.parametrize('compression', ['bz2', 'gz']) -def test_file_roundtrip(compression, tmp_path): - """Test that roundtripping write/read to a file works with `anyopen` + +@pytest.mark.parametrize('extension', ['bz2', 'gz']) +def test_file_roundtrip(extension, tmp_path): + """Test that roundtripping write/read to a file works with `anyopen`. """ data = "my momma told me to pick the very best one and you are not it" - filepath = tmp_path / f'testfile.txt.{compression}' + filepath = tmp_path / f'testfile.txt.{extension}' with anyopen(filepath, mode='w') as f: f.write(data) @@ -72,9 +76,30 @@ def test_file_roundtrip(compression, tmp_path): assert data_out == data -@pytest.mark.parametrize('compression', ['bz2', 'gz']) -def test_file_roundtrip(compression): - """Test that roundtripping write/read to a file works with `anyopen` + +@pytest.mark.parametrize('extension,compression', + [('bz2', 'gzip'), ('gz', 'bzip2')]) +def test_file_roundtrip_force_compression(extension, compression, tmp_path): + """Test that roundtripping write/read to a file works with `anyopen`, + in which we force compression despite different extension. 
+ + """ + + data = "my momma told me to pick the very best one and you are not it" + + filepath = tmp_path / f'testfile.txt.{extension}' + with anyopen(filepath, mode='w', compression=compression) as f: + f.write(data) + + with anyopen(filepath, 'r', compression=compression) as f: + data_out = f.read() + + assert data_out == data + + +@pytest.mark.parametrize('compression', ['bzip2', 'gzip']) +def test_stream_roundtrip(compression): + """Test that roundtripping write/read to a stream works with `anyopen` """ From db5a9f760927817fbbdd845528d76e397a1558ec Mon Sep 17 00:00:00 2001 From: David Dotson Date: Tue, 28 Jun 2022 21:15:01 -0700 Subject: [PATCH 6/8] Updates from @orbeckst review --- src/alchemlyb/parsing/util.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/alchemlyb/parsing/util.py b/src/alchemlyb/parsing/util.py index a87328e6..f8259aa6 100644 --- a/src/alchemlyb/parsing/util.py +++ b/src/alchemlyb/parsing/util.py @@ -6,7 +6,6 @@ from typing import IO, Optional, Union import bz2 import gzip -import zipfile def bz2_open(filename, mode): mode += 't' if mode in ['r','w','a','x'] else '' @@ -23,10 +22,18 @@ def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): Supports files compressed with bzip2 (.bz2) and gzip (.gz) compression schemes. The appropriate extension must be present for the function to - properly handle the file. + properly handle the file without specifying `compression`. If giving a stream for `datafile`, then you must specify `compression` if - the stream is compressed. + the stream is compressed. Otherwise the stream will be passed through + as-is. + + If `datafile` is a filepath, then `compression` will take precedence over + any extension on the filename. Leaving `compression` as `None` will rely on + the extension for determining compression, if any. + + .. versionchanged:: 0.7.0 + Removed stated support for zip, given broken implementation. 
Parameters ---------- @@ -40,11 +47,16 @@ def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None): Overrides use of extension for determining compression if `datafile` is a file. + .. versionadded:: 0.7.0 + Returns ------- stream : stream Open stream for reading or writing, depending on mode. + .. versionchanged:: 0.7.0 + Explicit support for writing added. + """ # opener for each type of file extensions = {'.bz2': bz2_open, From dab5309fa90682c8db9f9bc4853c6f335ecfa465 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Wed, 29 Jun 2022 17:59:12 -0700 Subject: [PATCH 7/8] Added test for unsupported compression used for anyopen --- src/alchemlyb/tests/parsing/test_util.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/alchemlyb/tests/parsing/test_util.py b/src/alchemlyb/tests/parsing/test_util.py index 18d10e04..334ee0c2 100644 --- a/src/alchemlyb/tests/parsing/test_util.py +++ b/src/alchemlyb/tests/parsing/test_util.py @@ -119,3 +119,25 @@ def test_stream_roundtrip(compression): data_out = f.read() assert data_out == data + +def test_stream_unsupported_compression(): + """Test that we throw a ValueError when an unsupported compression is used. 
+ + """ + + compression="fakez" + + data = b"my momma told me to pick the very best one and you are not it" + + with io.BytesIO() as stream: + + # write to stream + stream.write(data) + + # start at the beginning + stream.seek(0) + + # read from stream + with pytest.raises(ValueError): + with anyopen(stream, 'r', compression=compression) as f: + data_out = f.read() From 8240e6461130a7ddba4ed45f380bfcdf9bd8778e Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Wed, 29 Jun 2022 19:02:53 -0700 Subject: [PATCH 8/8] update CHANGES --- CHANGES | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 0001067d..cd04979f 100644 --- a/CHANGES +++ b/CHANGES @@ -14,7 +14,7 @@ The rules for this file: ------------------------------------------------------------------------------ -*/*/2022 xiki-tempula, IAlibay +*/*/2022 xiki-tempula, IAlibay, dotsdl, orbeckst * 0.7.0 @@ -23,11 +23,13 @@ Changes (Issue #193) - gmx parser now defaults to dropping NaN and corrupted lines (filter=True) (#171, PR #183) + - remove broken .zip support from util.anyopen() (PR #197) Enhancements - Add a base class for workflows (PR #188). - Add filter function to gmx.extract to make it more robust (PR #183): can filter incomplete/corrupted lines (#126, #171) with filter=True. + - Add support to util.anyopen() for taking filelike objects (PR #197) Fixes - Fixes setup.py and setup.cfg to prevent installations with Python versions