Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for (fast5.tar).xz binary compressed files #17106

Merged
merged 5 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/galaxy/config/sample/datatypes_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@
<datatype extension="fast5.tar" type="galaxy.datatypes.binary:Fast5Archive" display_in_upload="true"/>
<datatype extension="fast5.tar.gz" type="galaxy.datatypes.binary:Fast5ArchiveGz" display_in_upload="true"/>
<datatype extension="fast5.tar.bz2" type="galaxy.datatypes.binary:Fast5ArchiveBz2" display_in_upload="true"/>
<datatype extension="fast5.tar.xz" type="galaxy.datatypes.binary:Fast5ArchiveXz" display_in_upload="true"/>
<datatype extension="peptideshaker_archive" type="galaxy.datatypes.binary:CompressedArchive" subclass="true" display_in_upload="true"/>
<datatype extension="percin" type="galaxy.datatypes.tabular:Tabular" subclass="true"/>
<datatype extension="percout" type="galaxy.datatypes.xml:GenericXml" subclass="true"/>
Expand Down Expand Up @@ -1049,6 +1050,7 @@
<sniffer type="galaxy.datatypes.binary:YepTar"/>
<sniffer type="galaxy.datatypes.binary:WiffTar"/>
<sniffer type="galaxy.datatypes.binary:Fast5ArchiveGz"/>
<sniffer type="galaxy.datatypes.binary:Fast5ArchiveXz"/>
<sniffer type="galaxy.datatypes.binary:Fast5ArchiveBz2"/>
<sniffer type="galaxy.datatypes.binary:Fast5Archive"/>
<sniffer type="galaxy.datatypes.binary:Meryldb" />
Expand Down
34 changes: 34 additions & 0 deletions lib/galaxy/datatypes/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
from galaxy.util.checkers import (
is_bz2,
is_gzip,
is_xz,
)
from . import (
data,
Expand Down Expand Up @@ -3687,6 +3688,9 @@ class Fast5ArchiveGz(Fast5Archive):
>>> fname = get_test_fname('test.fast5.tar.gz')
>>> Fast5ArchiveGz().sniff(fname)
True
>>> fname = get_test_fname('test.fast5.tar.xz')
>>> Fast5ArchiveGz().sniff(fname)
False
>>> fname = get_test_fname('test.fast5.tar.bz2')
>>> Fast5ArchiveGz().sniff(fname)
False
Expand All @@ -3703,6 +3707,33 @@ def sniff(self, filename: str) -> bool:
return Fast5Archive.sniff(self, filename)


class Fast5ArchiveXz(Fast5Archive):
    """
    FAST5 archive compressed with xz.

    A file is recognised only when it is xz-compressed (per ``is_xz``) and
    the parent ``Fast5Archive`` sniffer also accepts it.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('test.fast5.tar.gz')
    >>> Fast5ArchiveXz().sniff(fname)
    False
    >>> fname = get_test_fname('test.fast5.tar.xz')
    >>> Fast5ArchiveXz().sniff(fname)
    True
    >>> fname = get_test_fname('test.fast5.tar.bz2')
    >>> Fast5ArchiveXz().sniff(fname)
    False
    >>> fname = get_test_fname('test.fast5.tar')
    >>> Fast5ArchiveXz().sniff(fname)
    False
    """

    file_ext = "fast5.tar.xz"

    def sniff(self, filename: str) -> bool:
        """Return whether ``filename`` is an xz-compressed FAST5 archive."""
        # Cheap compression check first; the parent sniffer is only
        # consulted for files that are actually xz-compressed.
        return is_xz(filename) and Fast5Archive.sniff(self, filename)


class Fast5ArchiveBz2(Fast5Archive):
"""
Class describing a bzip2-compressed FAST5 archive
Expand All @@ -3711,6 +3742,9 @@ class Fast5ArchiveBz2(Fast5Archive):
>>> fname = get_test_fname('test.fast5.tar.bz2')
>>> Fast5ArchiveBz2().sniff(fname)
True
>>> fname = get_test_fname('test.fast5.tar.xz')
>>> Fast5ArchiveBz2().sniff(fname)
False
>>> fname = get_test_fname('test.fast5.tar.gz')
>>> Fast5ArchiveBz2().sniff(fname)
False
Expand Down
Binary file added lib/galaxy/datatypes/test/test.fast5.tar.xz
Binary file not shown.
1 change: 1 addition & 0 deletions lib/galaxy/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def shlex_join(split_command):

# Leading byte signatures ("magic numbers") used to recognise compressed files.
gzip_magic = b"\x1f\x8b"
bz2_magic = b"BZh"
# Six bytes long; callers comparing against it must read exactly six bytes.
xz_magic = b"\xfd7zXZ\x00"
# Text encoding default; overridable through the GALAXY_DEFAULT_ENCODING
# environment variable, falling back to UTF-8.
DEFAULT_ENCODING = os.environ.get("GALAXY_DEFAULT_ENCODING", "utf-8")
# NOTE(review): presumably consumed by binary-content detection elsewhere in
# this module — confirm against the users of BINARY_CHARS.
NULL_CHAR = b"\x00"
BINARY_CHARS = [NULL_CHAR]
Expand Down
28 changes: 28 additions & 0 deletions lib/galaxy/util/checkers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import bz2
import gzip
import lzma
import os
import re
import tarfile
Expand Down Expand Up @@ -118,6 +119,26 @@ def check_gzip(file_path: str, check_content: bool = True) -> Tuple[bool, bool]:
return (True, True)


def check_xz(file_path: str, check_content: bool = True) -> Tuple[bool, bool]:
    """
    Check whether ``file_path`` is an xz-compressed file.

    Returns a tuple of booleans ``(is_xz, is_valid)``:

    * ``(False, False)`` when the file cannot be read or its first six bytes
      do not match ``util.xz_magic``;
    * ``(True, True)`` when the signature matches and ``check_content`` is
      false, or when the decompressed leading chunk is not HTML;
    * ``(True, False)`` when the decompressed leading chunk is HTML
      according to ``check_html``.

    ``CHUNK_SIZE`` and ``check_html`` are resolved from the enclosing module.
    """
    not_xz = (False, False)
    try:
        with open(file_path, "rb") as handle:
            signature = handle.read(6)
        if signature != util.xz_magic:
            return not_xz
    except Exception:
        # Unreadable files are simply reported as "not xz".
        return not_xz

    if check_content:
        with lzma.LZMAFile(file_path, mode="rb") as stream:
            head = stream.read(CHUNK_SIZE)
        # A compressed HTML file is xz, but not valid content.
        return (True, not check_html(head, file_path=False))
    return (True, True)


def check_bz2(file_path: str, check_content: bool = True) -> Tuple[bool, bool]:
try:
with open(file_path, "rb") as temp:
Expand Down Expand Up @@ -166,6 +187,11 @@ def is_gzip(file_path: str) -> bool:
return is_gzipped


def is_xz(file_path: str) -> bool:
    """Return whether ``file_path`` is xz-compressed, without inspecting its content."""
    # Only the first element (is_xz) of the (is_xz, is_valid) pair matters here.
    return check_xz(file_path, check_content=False)[0]


def is_zip(file_path: str) -> bool:
    """Return whether ``file_path`` is a zip file, without inspecting its content."""
    # Only the first element (is_zip) of the (is_zip, is_valid) pair matters here.
    return check_zip(file_path, check_content=False)[0]
Expand Down Expand Up @@ -198,6 +224,7 @@ def check_image(file_path: str):
# Registry mapping a compression format name to the function that checks a
# file for that format.
COMPRESSION_CHECK_FUNCTIONS: Dict[str, CompressionChecker] = {
    "gzip": check_gzip,
    "bz2": check_bz2,
    "xz": check_xz,
    "zip": check_zip,
}

Expand All @@ -212,5 +239,6 @@ def check_image(file_path: str):
"COMPRESSION_CHECK_FUNCTIONS",
"is_gzip",
"is_bz2",
"is_xz",
"is_zip",
)
14 changes: 10 additions & 4 deletions lib/galaxy/util/compression_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import gzip
import io
import logging
import lzma
import os
import tarfile
import tempfile
Expand All @@ -28,12 +29,13 @@
from .checkers import (
is_bz2,
is_gzip,
is_xz,
)

log = logging.getLogger(__name__)

FileObjTypeStr = Union[IO[str], io.TextIOWrapper]
FileObjTypeBytes = Union[gzip.GzipFile, bz2.BZ2File, IO[bytes]]
FileObjTypeBytes = Union[gzip.GzipFile, bz2.BZ2File, lzma.LZMAFile, IO[bytes]]
FileObjType = Union[FileObjTypeStr, FileObjTypeBytes]


Expand Down Expand Up @@ -65,7 +67,7 @@ def get_fileobj(filename: str, mode: str = "r", compressed_formats: Optional[Lis
:param filename: path to file that should be opened
:param mode: mode to pass to opener
:param compressed_formats: list of allowed compressed file formats among
'bz2', 'gzip' and 'zip'. If left to None, all 3 formats are allowed
'bz2', 'gzip', 'xz' and 'zip'. If left to None, all 4 formats are allowed
"""
return get_fileobj_raw(filename, mode, compressed_formats)[1]

Expand Down Expand Up @@ -100,20 +102,24 @@ def get_fileobj_raw(
filename: str, mode: str = "r", compressed_formats: Optional[List[str]] = None
) -> Tuple[Optional[str], FileObjType]:
if compressed_formats is None:
compressed_formats = ["bz2", "gzip", "zip"]
compressed_formats = ["bz2", "gzip", "xz", "zip"]
# Remove 't' from mode, which may cause an error for compressed files
mode = mode.replace("t", "")
# 'U' mode is deprecated, we open in 'r'.
if mode == "U":
mode = "r"
compressed_format = None
if "gzip" in compressed_formats and is_gzip(filename):
fh: Union[gzip.GzipFile, bz2.BZ2File, IO[bytes]] = gzip.GzipFile(filename, mode)
fh: Union[gzip.GzipFile, bz2.BZ2File, lzma.LZMAFile, IO[bytes]] = gzip.GzipFile(filename, mode)
compressed_format = "gzip"
elif "bz2" in compressed_formats and is_bz2(filename):
mode = cast(Literal["a", "ab", "r", "rb", "w", "wb", "x", "xb"], mode)
fh = bz2.BZ2File(filename, mode)
compressed_format = "bz2"
elif "xz" in compressed_formats and is_xz(filename):
mode = cast(Literal["a", "ab", "r", "rb", "w", "wb", "x", "xb"], mode)
fh = lzma.LZMAFile(filename, mode)
compressed_format = "xz"
elif "zip" in compressed_formats and zipfile.is_zipfile(filename):
# Return fileobj for the first file in a zip file.
# 'b' is not allowed in the ZipFile mode argument
Expand Down
3 changes: 2 additions & 1 deletion test/integration/test_datatype_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from galaxy.util.checkers import (
is_bz2,
is_gzip,
is_xz,
is_zip,
)
from galaxy.util.hash_util import md5_hash_file
Expand Down Expand Up @@ -77,7 +78,7 @@ def upload_datatype_helper(
delete_cache_dir: bool = False,
) -> None:
is_compressed = False
for is_method in (is_bz2, is_gzip, is_zip):
for is_method in (is_bz2, is_gzip, is_xz, is_zip):
is_compressed = is_method(test_data.path)
if is_compressed:
break
Expand Down
Loading