Skip to content

Commit

Permalink
Added support to anyopen for taking filelike objects; remove broken `…
Browse files Browse the repository at this point in the history
….zip` support (#197)

* Added support to util.anyopen() for taking filelike objects

  Using it this way requires specifying the compression scheme, if any,
  that the stream uses, so that it can be decompressed.

  Reworked anyopen to allow new `compression` keyword to override 
  extension detection

* Support python < 3.10
* Added checks for stream behavior in anyopen
* dropped broken zip support in anyopen

  We never had working zip archive support from the looks of it. This is
  because zipfiles are not just a compression scheme, they are an archive
  format that features compression.

* Added additional tests
* update CHANGES

Co-authored-by: Oliver Beckstein <orbeckst@gmail.com>
  • Loading branch information
dotsdl and orbeckst authored Jun 30, 2022
1 parent e2f8c39 commit 72622c9
Show file tree
Hide file tree
Showing 3 changed files with 193 additions and 18 deletions.
4 changes: 3 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ The rules for this file:

------------------------------------------------------------------------------

*/*/2022 xiki-tempula, IAlibay
*/*/2022 xiki-tempula, IAlibay, dotsdl, orbeckst

* 0.7.0

Expand All @@ -23,11 +23,13 @@ Changes
(Issue #193)
- gmx parser now defaults to dropping NaN and corrupted lines (filter=True)
(#171, PR #183)
- remove broken .zip support from util.anyopen() (PR #197)

Enhancements
- Add a base class for workflows (PR #188).
- Add filter function to gmx.extract to make it more robust (PR #183): can filter
incomplete/corrupted lines (#126, #171) with filter=True.
- Add support to util.anyopen() for taking filelike objects (PR #197)

Fixes
- Fixes setup.py and setup.cfg to prevent installations with Python versions
Expand Down
78 changes: 61 additions & 17 deletions src/alchemlyb/parsing/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,49 +2,93 @@
"""
import os
from os import PathLike
from typing import IO, Optional, Union
import bz2
import gzip
import zipfile

def bz2_open(filename, mode):
    """Open `filename` with :func:`bz2.open`, defaulting to text mode.

    Parameters
    ----------
    filename : str or PathLike or file object
        Passed unchanged to :func:`bz2.open`.
    mode : str
        Mode for the stream. A bare 'r', 'w', 'a' or 'x' gets 't' appended;
        any other mode string is passed through untouched.

    Returns
    -------
    stream
        Whatever :func:`bz2.open` returns for the resulting mode.
    """
    # only a bare single-letter mode is promoted to text mode
    if mode in ('r', 'w', 'a', 'x'):
        mode = mode + 't'
    return bz2.open(filename, mode)


def gzip_open(filename, mode):
    """Open `filename` with :func:`gzip.open`, defaulting to text mode.

    Parameters
    ----------
    filename : str or PathLike or file object
        Passed unchanged to :func:`gzip.open`.
    mode : str
        Mode for the stream. A bare 'r', 'w', 'a' or 'x' gets 't' appended;
        any other mode string is passed through untouched.

    Returns
    -------
    stream
        Whatever :func:`gzip.open` returns for the resulting mode.
    """
    # only a bare single-letter mode is promoted to text mode
    if mode in ('r', 'w', 'a', 'x'):
        mode = mode + 't'
    return gzip.open(filename, mode)

def anyopen(datafile: Union[str, PathLike, IO], mode: str = 'r',
            compression: Optional[str] = None):
    """Return a file stream for file or stream, even if compressed.

    Supports files compressed with bzip2 (.bz2) and gzip (.gz) compression
    schemes. The appropriate extension must be present for the function to
    properly handle the file without specifying `compression`.

    If giving a stream for `datafile`, then you must specify `compression` if
    the stream is compressed. Otherwise the stream will be passed through
    as-is.

    If `datafile` is a filepath, then `compression` will take precedence over
    any extension on the filename. Leaving `compression` as `None` will rely on
    the extension for determining compression, if any.

    .. versionchanged:: 0.7.0
       Removed stated support for zip, given broken implementation.

    Parameters
    ----------
    datafile : str | PathLike | IO
        Path to file to use, or an open IO stream. If an IO stream, use
        `compression` to specify the type of compression used, if any.
    mode : str
        Mode for stream; usually 'r' or 'w'.
    compression : str
        Use to specify compression. Must be one of 'bzip2', 'gzip'.
        Overrides use of extension for determining compression if `datafile` is
        a file.

        .. versionadded:: 0.7.0

    Returns
    -------
    stream : stream
        Open stream for reading or writing, depending on mode.

        .. versionchanged:: 0.7.0
           Explicit support for writing added.

    Raises
    ------
    ValueError
        If `compression` is given but is not one of the supported names,
        whether `datafile` is a stream or a path.

    """
    # opener for each recognised file extension
    extensions = {'.bz2': bz2_open,
                  '.gz': gzip_open}

    # compression selections available
    compressions = {'bzip2': bz2_open,
                    'gzip': gzip_open}

    # `datafile` counts as a stream when it exposes the method matching the
    # requested mode: `read` for 'r', `write` for 'w'/'a'/'x'
    readable = hasattr(datafile, 'read') and 'r' in mode
    writable = (hasattr(datafile, 'write')
                and any(flag in mode for flag in ('w', 'a', 'x')))

    if readable or writable:
        # if no compression specified, just pass the stream through
        if compression is None:
            return datafile
        if compression not in compressions:
            raise ValueError(
                f"`datafile` is a stream, but specified `compression` "
                f"'{compression}' is not supported")
        return compressions[compression](datafile, mode=mode)

    # otherwise, treat as a file
    if compression is None:
        # use extension to determine the compression used, if present
        ext = os.path.splitext(datafile)[1]
        opener = extensions.get(ext, open)
    elif compression in compressions:
        # explicit compression overrides any extension on the file
        opener = compressions[compression]
    else:
        # without this branch `opener` would be unbound and the call below
        # would fail with an opaque UnboundLocalError
        raise ValueError(
            f"specified `compression` '{compression}' is not supported; "
            f"must be one of {sorted(compressions)}")

    return opener(datafile, mode)
129 changes: 129 additions & 0 deletions src/alchemlyb/tests/parsing/test_util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import io
import pytest

from alchemtest.gmx import load_expanded_ensemble_case_1
from alchemlyb.parsing.util import anyopen

Expand All @@ -12,3 +15,129 @@ def test_gzip():
for filename in dataset['data'][leg]:
with anyopen(filename, 'r') as f:
assert type(f.readline()) is str


def test_gzip_stream():
    """`anyopen` yields text lines from a binary stream when
    ``compression='gzip'`` is given.
    """
    files_by_leg = load_expanded_ensemble_case_1()['data']

    for leg in files_by_leg:
        for path in files_by_leg[leg]:
            # hand `anyopen` an already-open binary stream, not a path
            with open(path, 'rb') as raw, \
                    anyopen(raw, mode='r', compression='gzip') as text:
                assert type(text.readline()) is str


def test_gzip_stream_wrong():
    """Reading a stream through `anyopen` with ``compression='bzip2'`` must
    fail with an ``OSError`` for this dataset's files.
    """
    files_by_leg = load_expanded_ensemble_case_1()['data']

    for leg in files_by_leg:
        for path in files_by_leg[leg]:
            with open(path, 'rb') as raw, \
                    anyopen(raw, mode='r', compression='bzip2') as text:
                # the failure surfaces on the first read, not on open
                with pytest.raises(OSError, match='Invalid data stream'):
                    assert type(text.readline()) is str


def test_gzip_stream_wrong_no_compression():
    """With no `compression` given, `anyopen` hands a stream back untouched,
    so a stream opened in binary mode still yields bytes.
    """
    files_by_leg = load_expanded_ensemble_case_1()['data']

    for leg in files_by_leg:
        for path in files_by_leg[leg]:
            with open(path, 'rb') as raw, anyopen(raw, mode='r') as passthrough:
                assert type(passthrough.readline()) is bytes


@pytest.mark.parametrize('extension', ['bz2', 'gz'])
def test_file_roundtrip(extension, tmp_path):
    """Text written through `anyopen` to a file with a compression extension
    reads back unchanged through `anyopen`.
    """
    payload = "my momma told me to pick the very best one and you are not it"
    target = tmp_path / f'testfile.txt.{extension}'

    # no `compression` given: the opener is picked from the extension
    with anyopen(target, mode='w') as sink:
        sink.write(payload)

    with anyopen(target, 'r') as source:
        recovered = source.read()

    assert recovered == payload


@pytest.mark.parametrize('extension,compression',
                         [('bz2', 'gzip'), ('gz', 'bzip2')])
def test_file_roundtrip_force_compression(extension, compression, tmp_path):
    """A file roundtrip through `anyopen` works when `compression` is given
    explicitly and disagrees with the file extension.
    """
    payload = "my momma told me to pick the very best one and you are not it"
    target = tmp_path / f'testfile.txt.{extension}'

    # the explicit `compression` is used for both the write and the read
    with anyopen(target, mode='w', compression=compression) as sink:
        sink.write(payload)

    with anyopen(target, 'r', compression=compression) as source:
        recovered = source.read()

    assert recovered == payload


@pytest.mark.parametrize('compression', ['bzip2', 'gzip'])
def test_stream_roundtrip(compression):
    """Text written through `anyopen` to an in-memory stream reads back
    unchanged through `anyopen` with the same `compression`.
    """
    payload = "my momma told me to pick the very best one and you are not it"

    with io.BytesIO() as buffer:
        # compress into the buffer
        with anyopen(buffer, mode='w', compression=compression) as sink:
            sink.write(payload)

        # rewind so the read starts at the first byte
        buffer.seek(0)

        # decompress back out of the same buffer
        with anyopen(buffer, 'r', compression=compression) as source:
            recovered = source.read()

    assert recovered == payload

def test_stream_unsupported_compression():
    """`anyopen` raises ``ValueError`` when a stream is paired with an
    unsupported `compression` name.
    """
    bogus = "fakez"
    payload = b"my momma told me to pick the very best one and you are not it"

    # a BytesIO built from initial bytes starts positioned at offset 0
    with io.BytesIO(payload) as buffer:
        with pytest.raises(ValueError):
            with anyopen(buffer, 'r', compression=bogus) as source:
                source.read()

0 comments on commit 72622c9

Please sign in to comment.