Added support to anyopen for taking filelike objects; remove broken `.zip` support (#197)

* Added support to util.anyopen() for taking filelike objects

  Use in this way requires specifying the compression, if any, that the stream features in order to decompress it. Reworked anyopen to allow new `compression` keyword to override extension detection

* Support python < 3.10
* Added checks for stream behavior in anyopen;
* dropped broken zip support in anyopen

  We never had working zip archive support from the looks of it. This is because zipfiles are not just a compression scheme, they are an archive format that features compression.

* Added additional tests
* update CHANGES

Co-authored-by: Oliver Beckstein <orbeckst@gmail.com>
alchemistry · Jun 30, 2022 · 72622c9 · 72622c9
1 parent e2f8c39
commit 72622c9
Show file tree

Hide file tree

Showing 3 changed files with 193 additions and 18 deletions.
diff --git a/CHANGES b/CHANGES
@@ -14,7 +14,7 @@ The rules for this file:
 
 ------------------------------------------------------------------------------
 
-*/*/2022 xiki-tempula, IAlibay
+*/*/2022 xiki-tempula, IAlibay, dotsdl, orbeckst
 
   * 0.7.0
 
@@ -23,11 +23,13 @@ Changes
     (Issue #193)
   - gmx parser now defaults to dropping NaN and corrupted lines (filter=True) 
     (#171, PR #183)
+  - remove broken .zip support from util.anyopen() (PR #197)
 
 Enhancements
   - Add a base class for workflows (PR #188).
   - Add filter function to gmx.extract to make it more robust (PR #183): can filter 
     incomplete/corrupted lines (#126, #171) with filter=True.
+  - Add support to util.anyopen() for taking filelike objects (PR #197)
 
 Fixes
   - Fixes setup.py and setup.cfg to prevent installations with Python versions

diff --git a/src/alchemlyb/parsing/util.py b/src/alchemlyb/parsing/util.py
@@ -2,49 +2,93 @@
 
 """
 import os
+from os import PathLike
+from typing import IO, Optional, Union
 import bz2
 import gzip
-import zipfile
 
 def bz2_open(filename, mode):
     mode += 't' if mode in ['r','w','a','x'] else ''
     return bz2.open(filename, mode)
 
+
 def gzip_open(filename, mode):
     mode += 't' if mode in ['r','w','a','x'] else ''
     return gzip.open(filename, mode)
 
-def anyopen(filename, mode='r'):
-    """Return a file stream for filename, even if compressed.
 
-    Supports files compressed with bzip2 (.bz2), gzip (.gz), and zip (.zip)
-    compression schemes. The appropriate extension must be present for
-    the function to properly handle the file.
+def anyopen(datafile: Union[PathLike, IO], mode='r', compression=None):
+    """Return a file stream for file or stream, even if compressed.
+
+    Supports files compressed with bzip2 (.bz2) and gzip (.gz) compression
+    schemes. The appropriate extension must be present for the function to
+    properly handle the file without specifying `compression`.
+
+    If giving a stream for `datafile`, then you must specify `compression` if
+    the stream is compressed. Otherwise the stream will be passed through
+    as-is.
+
+    If `datafile` is a filepath, then `compression` will take precedence over
+    any extension on the filename. Leaving `compression` as `None` will rely on
+    the extension for determining compression, if any.
+
+    .. versionchanged:: 0.7.0
+       Removed stated support for zip, given broken implementation.
 
     Parameters
     ----------
-    filename : str
-        Path to file to use.
+    datafile : PathLike | IO
+        Path to file to use, or an open IO stream. If an IO stream, use
+        `compression` to specify the type of compression used, if any.
     mode : str
         Mode for stream; usually 'r' or 'w'.
+    compression : str
+        Use to specify compression. Must be one of 'bzip2', 'gzip'; any other
+        non-None value raises ValueError. Overrides use of extension for
+        determining compression if `datafile` is a file.
+
+        .. versionadded:: 0.7.0
 
     Returns
     -------
     stream : stream
-        Open stream for reading.
+        Open stream for reading or writing, depending on mode.
+
+        .. versionchanged:: 0.7.0
+           Explicit support for writing added.
 
     """
     # opener for each type of file
     extensions = {'.bz2': bz2_open,
-                  '.gz': gzip_open,
-                  '.zip': zipfile.ZipFile}
+                  '.gz': gzip_open}
+
+    # compression selections available
+    compressions = {'bzip2': bz2_open,
+                    'gzip': gzip_open}
 
-    ext = os.path.splitext(filename)[1]
+    # if `datafile` is a stream
+    if ((hasattr(datafile, 'read') and any((i in mode for i in ('r',)))) or
+       (hasattr(datafile, 'write') and any((i in mode for i in ('w', 'a', 'x'))))):
+        # if no compression specified, just pass the stream through
+        if compression is None:
+            return datafile
+        elif compression in compressions:
+            compressor = compressions[compression]
+            return compressor(datafile, mode=mode)
+        else:
+            raise ValueError(f"`datafile` is a stream, but specified `compression` '{compression}' is not supported")
 
-    if ext in extensions:
-       opener= extensions[ext]
+    # otherwise, treat as a file
+    # allow compression to override any extension on the file
+    if compression in compressions:
+        opener = compressions[compression]
 
-    else:
-        opener = open
+    # use extension to determine the compression used, if present
+    elif compression is None:
+        ext = os.path.splitext(datafile)[1]
+        opener = extensions.get(ext, open)
+    # an unsupported compression would otherwise leave `opener` unbound
+    else:
+        raise ValueError(f"`datafile` is a file, but specified `compression` '{compression}' is not supported")
 
-    return opener(filename, mode)
+    return opener(datafile, mode)
diff --git a/src/alchemlyb/tests/parsing/test_util.py b/src/alchemlyb/tests/parsing/test_util.py
@@ -1,3 +1,6 @@
+import io
+import pytest
+
 from alchemtest.gmx import load_expanded_ensemble_case_1
 from alchemlyb.parsing.util import anyopen
 
@@ -12,3 +15,129 @@ def test_gzip():
         for filename in dataset['data'][leg]:
             with anyopen(filename, 'r') as f:
                 assert type(f.readline()) is str
+
+
+def test_gzip_stream():
+    """Check that `anyopen` decompresses an open binary stream when the
+    compression is named explicitly, yielding text lines.
+    """
+    case = load_expanded_ensemble_case_1()
+
+    for leg in case['data']:
+        for path in case['data'][leg]:
+            with open(path, 'rb') as raw:
+                with anyopen(raw, mode='r', compression='gzip') as text:
+                    assert type(text.readline()) is str
+
+
+def test_gzip_stream_wrong():
+    """Test that `anyopen` fails when a gzip stream is decompressed as bzip2.
+
+    The error is expected on the first read, not when the stream is wrapped.
+    """
+    dataset = load_expanded_ensemble_case_1()
+
+    for leg in dataset['data']:
+        for filename in dataset['data'][leg]:
+            with open(filename, 'rb') as f:
+                with anyopen(f, mode='r', compression='bzip2') as f_uc:
+                    with pytest.raises(OSError, match='Invalid data stream'):
+                        f_uc.readline()
+
+
+def test_gzip_stream_wrong_no_compression():
+    """Check that a stream handed to `anyopen` without `compression` comes
+    back untouched, so reads still yield the raw bytes.
+
+    """
+    case = load_expanded_ensemble_case_1()
+
+    for leg in case['data']:
+        for path in case['data'][leg]:
+            with open(path, 'rb') as raw:
+                with anyopen(raw, mode='r') as passthrough:
+                    assert type(passthrough.readline()) is bytes
+
+
+@pytest.mark.parametrize('extension', ['bz2', 'gz'])
+def test_file_roundtrip(extension, tmp_path):
+    """Write text through `anyopen` and read it back, relying on the file
+    extension alone to select the compression.
+    """
+
+    payload = "my momma told me to pick the very best one and you are not it"
+    target = tmp_path / f'testfile.txt.{extension}'
+
+    with anyopen(target, mode='w') as sink:
+        sink.write(payload)
+
+    with anyopen(target, 'r') as source:
+        recovered = source.read()
+
+    assert recovered == payload
+
+
+@pytest.mark.parametrize('extension,compression',
+        [('bz2', 'gzip'), ('gz', 'bzip2')])
+def test_file_roundtrip_force_compression(extension, compression, tmp_path):
+    """Write text through `anyopen` and read it back with an explicit
+    `compression` that disagrees with the file extension.
+
+    """
+
+    payload = "my momma told me to pick the very best one and you are not it"
+    target = tmp_path / f'testfile.txt.{extension}'
+
+    with anyopen(target, mode='w', compression=compression) as sink:
+        sink.write(payload)
+
+    with anyopen(target, 'r', compression=compression) as source:
+        recovered = source.read()
+
+    assert recovered == payload
+
+
+@pytest.mark.parametrize('compression', ['bzip2', 'gzip'])
+def test_stream_roundtrip(compression):
+    """Write text through `anyopen` into an in-memory stream and read it
+    back, naming the compression explicitly both times.
+    """
+
+    payload = "my momma told me to pick the very best one and you are not it"
+
+    with io.BytesIO() as buffer:
+
+        # compress the payload into the buffer
+        with anyopen(buffer, mode='w', compression=compression) as sink:
+            sink.write(payload)
+
+        # rewind so the read starts at the first byte
+        buffer.seek(0)
+
+        # decompress from the same buffer
+        with anyopen(buffer, 'r', compression=compression) as source:
+            recovered = source.read()
+
+        assert recovered == payload
+
+def test_stream_unsupported_compression():
+    """Test that we throw a ValueError when an unsupported compression is used.
+
+    """
+
+    compression = "fakez"
+
+    data = b"my momma told me to pick the very best one and you are not it"
+
+    with io.BytesIO() as stream:
+
+        # write to stream
+        stream.write(data)
+
+        # start at the beginning
+        stream.seek(0)
+
+        # the call itself must raise; nothing is opened, so there is no
+        # stream to enter as a context manager or read from
+        with pytest.raises(ValueError):
+            anyopen(stream, 'r', compression=compression)