From eaf676dbbb30f349f1db653e88580f81c0748048 Mon Sep 17 00:00:00 2001 From: Aris Tritas Date: Tue, 22 Aug 2023 20:14:57 +0200 Subject: [PATCH] compressor: replace snappy with cramjam The python-snappy library is no longer maintained. Cramjam is almost a drop-in replacement that allows for a considerable speedup both in the compression and decompression tasks. The relevant benchmark for snappy's framed format [1]. Add more tests for compression and decompression & modernize some pattern matching. [1]: https://github.com/milesgranger/pyrus-cramjam/blob/33c0516374fb9726ddcb82c5dfbe86be96d2cd35/cramjam-python/benchmarks/README.md --- .github/workflows/build.yml | 1 - .pylintrc | 2 +- mypy.ini | 4 +- rohmu.spec | 3 +- rohmu/compressor.py | 139 ++++++++++++++++++++++-------------- rohmu/filewrap.py | 1 - rohmu/rohmufile.py | 6 +- rohmu/snappyfile.py | 39 +++++----- rohmu/typing.py | 4 +- setup.cfg | 2 +- test/test_compressor.py | 58 ++++++++++++--- test/test_dates.py | 4 ++ 12 files changed, 167 insertions(+), 96 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ab0aae79..1f6f61a0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -70,7 +70,6 @@ jobs: - id: dependencies run: | - sudo apt-get install -y libsnappy-dev pip install -r requirements.txt pip install -r requirements.dev.txt diff --git a/.pylintrc b/.pylintrc index f95b17ec..ecae4979 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,5 +1,5 @@ [MAIN] -extension-pkg-allow-list=pydantic +extension-pkg-allow-list=cramjam,pydantic [MESSAGES CONTROL] disable= diff --git a/mypy.ini b/mypy.ini index db530bf0..42b5aab0 100644 --- a/mypy.ini +++ b/mypy.ini @@ -14,10 +14,10 @@ ignore_missing_imports = True [mypy-azure.common.*] ignore_missing_imports = True -[mypy-oauth2client.*] +[mypy-cramjam.*] ignore_missing_imports = True -[mypy-snappy.*] +[mypy-oauth2client.*] ignore_missing_imports = True [mypy-swiftclient.*] diff --git a/rohmu.spec b/rohmu.spec index a3e43496..a3e5fc53 100644 --- a/rohmu.spec +++ b/rohmu.spec @@ -13,7 +13,8 @@ Requires: python3-cryptography >= 1.6 Requires: python3-dateutil Requires: python3-pydantic Requires: python3-requests -Requires: python3-snappy +# Requires: python3-snappy +# TODO: Create python3-cramjam Requires: python3-zstandard BuildRequires: python3-devel BuildRequires: python3-flake8 diff --git a/rohmu/compressor.py b/rohmu/compressor.py index f8c88c53..a5df4884 100644 --- a/rohmu/compressor.py +++ b/rohmu/compressor.py @@ -7,16 +7,42 @@ from .errors import InvalidConfigurationError from .filewrap import Sink, Stream from .snappyfile import SnappyFile -from .typing import BinaryData, Compressor, Decompressor, FileLike, HasRead, HasWrite +from .typing import BinaryData, CompressionAlgorithm, Compressor, Decompressor, FileLike, HasRead, HasWrite from .zstdfile import open as zstd_open from typing import cast, IO import lzma try: - import snappy + import cramjam + + # Cramjam streaming classes are lazy and diverge from Compressor and Decompressor interfaces. + # Adapt the parent classes to flush and return the inner buffer after compress and decompress calls. + class CramjamStreamingCompressor(Compressor): + def __init__(self) -> None: + self._compressor = cramjam.snappy.Compressor() + + def compress(self, data: bytes) -> bytes: + self._compressor.compress(data) + return self.flush() + + def flush(self) -> bytes: + buf = self._compressor.flush() + return buf.read() + + class CramjamStreamingDecompressor(Decompressor): + def __init__(self) -> None: + self._decompressor = cramjam.snappy.Decompressor() + + def decompress(self, data: bytes) -> bytes: + self._decompressor.decompress(data) + buf = self._decompressor.flush() + return buf.read() + except ImportError: - snappy = None # type: ignore + cramjam = None # type: ignore + CramjamStreamingCompressor: Compressor | None = None # type: ignore[no-redef] + CramjamStreamingDecompressor: Decompressor | None = None # type: ignore[no-redef] try: import zstandard as zstd @@ -24,37 +50,43 @@ zstd = None # type: ignore -def CompressionFile(dst_fp: FileLike, algorithm: str, level: int = 0, threads: int = 0) -> FileLike: +def CompressionFile(dst_fp: FileLike, algorithm: CompressionAlgorithm, level: int = 0, threads: int = 0) -> FileLike: """This looks like a class to users, but is actually a function that instantiates a class based on algorithm.""" - if algorithm == "lzma": - return lzma.open(cast(IO[bytes], dst_fp), "w", preset=level) - - if algorithm == "snappy": - return SnappyFile(dst_fp, "wb") - - if algorithm == "zstd": - return zstd_open(dst_fp, "wb", level=level, threads=threads) - - if algorithm: - raise InvalidConfigurationError(f"invalid compression algorithm: {repr(algorithm)}") - - return dst_fp + compression_fileobj: FileLike + match algorithm: + case "lzma": + compression_fileobj = lzma.open(cast(IO[bytes], dst_fp), "w", preset=level) + case "snappy": + compression_fileobj = SnappyFile(dst_fp, "wb") + case "zstd": + compression_fileobj = zstd_open(dst_fp, "wb", level=level, threads=threads) + case _: + raise InvalidConfigurationError(f"invalid compression algorithm: {repr(algorithm)}") + return compression_fileobj + + +def create_streaming_compressor(algorithm: CompressionAlgorithm, level: int = 0) -> Compressor: + compressor: Compressor + match algorithm: + case "lzma": + compressor = lzma.LZMACompressor(lzma.FORMAT_XZ, -1, level, None) + case "snappy": + if CramjamStreamingCompressor is None: + raise ImportError("Unable to import cramjam") + compressor = CramjamStreamingCompressor() + case "zstd": + compressor = zstd.ZstdCompressor(level=level).compressobj() + case _: + raise InvalidConfigurationError(f"invalid compression algorithm: {repr(algorithm)}") + return compressor class CompressionStream(Stream): """Non-seekable stream of data that adds compression on top of given source stream""" - def __init__(self, src_fp: HasRead, algorithm: str, level: int = 0) -> None: + def __init__(self, src_fp: HasRead, algorithm: CompressionAlgorithm, level: int = 0) -> None: super().__init__(src_fp, minimum_read_size=32 * 1024) - self._compressor: Compressor - if algorithm == "lzma": - self._compressor = lzma.LZMACompressor(lzma.FORMAT_XZ, -1, level, None) - elif algorithm == "snappy": - self._compressor = snappy.StreamCompressor() - elif algorithm == "zstd": - self._compressor = zstd.ZstdCompressor(level=level).compressobj() - else: - raise InvalidConfigurationError(f"invalid compression algorithm: {repr(algorithm)}") + self._compressor = create_streaming_compressor(algorithm, level) def _process_chunk(self, data: bytes) -> bytes: return self._compressor.compress(data) @@ -63,42 +95,45 @@ def _finalize(self) -> bytes: return self._compressor.flush() -def DecompressionFile(src_fp: FileLike, algorithm: str) -> FileLike: +def DecompressionFile(src_fp: FileLike, algorithm: CompressionAlgorithm) -> FileLike: """This looks like a class to users, but is actually a function that instantiates a class based on algorithm.""" - if algorithm == "lzma": - return lzma.open(cast(IO[bytes], src_fp), "r") - - if algorithm == "snappy": - return SnappyFile(src_fp, "rb") - - if algorithm == "zstd": - return zstd_open(src_fp, "rb") + match algorithm: + case "lzma": + return lzma.open(cast(IO[bytes], src_fp), "r") + case "snappy": + return SnappyFile(src_fp, "rb") + case "zstd": + return zstd_open(src_fp, "rb") + case _: + raise InvalidConfigurationError(f"invalid compression algorithm: {repr(algorithm)}") - if algorithm: - raise InvalidConfigurationError(f"invalid compression algorithm: {repr(algorithm)}") - return src_fp +def create_streaming_decompressor(algorithm: CompressionAlgorithm) -> Decompressor: + decompressor: Decompressor + match algorithm: + case "lzma": + decompressor = lzma.LZMADecompressor() + case "snappy": + if CramjamStreamingDecompressor is None: + raise ImportError("Unable to import cramjam") + decompressor = CramjamStreamingDecompressor() + case "zstd": + decompressor = zstd.ZstdDecompressor().decompressobj() + case _: + raise InvalidConfigurationError(f"invalid compression algorithm: {repr(algorithm)}") + return decompressor class DecompressSink(Sink): - def __init__(self, next_sink: HasWrite, compression_algorithm: str): + def __init__(self, next_sink: HasWrite, compression_algorithm: CompressionAlgorithm): super().__init__(next_sink) - self.decompressor = self._create_decompressor(compression_algorithm) - - def _create_decompressor(self, alg: str) -> Decompressor: - if alg == "snappy": - return snappy.StreamDecompressor() - elif alg == "lzma": - return lzma.LZMADecompressor() - elif alg == "zstd": - return zstd.ZstdDecompressor().decompressobj() - raise InvalidConfigurationError(f"invalid compression algorithm: {repr(alg)}") + self.decompressor = create_streaming_decompressor(compression_algorithm) def write(self, data: BinaryData) -> int: data = bytes(data) if not isinstance(data, bytes) else data written = len(data) if not data: return written - data = self.decompressor.decompress(data) - self._write_to_next_sink(data) + decompressed_data = self.decompressor.decompress(data) + self._write_to_next_sink(decompressed_data) return written diff --git a/rohmu/filewrap.py b/rohmu/filewrap.py index 0e141b7c..4006b32a 100644 --- a/rohmu/filewrap.py +++ b/rohmu/filewrap.py @@ -178,7 +178,6 @@ def read(self, size: int = -1) -> bytes: bytes_available += len(dst_data) if not src_data: self._eof = True - if size < 0 or bytes_available < size: data = b"".join(chunks) self._remainder = b"" diff --git a/rohmu/rohmufile.py b/rohmu/rohmufile.py index 21c9aa70..a16bd0cf 100644 --- a/rohmu/rohmufile.py +++ b/rohmu/rohmufile.py @@ -12,7 +12,7 @@ from .encryptor import DecryptorFile, DecryptSink, EncryptorFile from .errors import InvalidConfigurationError from .filewrap import ThrottleSink -from .typing import FileLike, HasWrite, Metadata +from .typing import CompressionAlgorithm, FileLike, HasWrite, Metadata from contextlib import suppress from inspect import signature from rohmu.object_storage.base import IncrementalProgressCallbackType @@ -143,7 +143,7 @@ def read_file( def file_writer( *, fileobj: FileLike, - compression_algorithm: Optional[str] = None, + compression_algorithm: Optional[CompressionAlgorithm] = None, compression_level: int = 0, compression_threads: int = 0, rsa_public_key: Union[None, str, bytes] = None, @@ -162,7 +162,7 @@ def write_file( input_obj: FileLike, output_obj: FileLike, progress_callback: IncrementalProgressCallbackType = None, - compression_algorithm: Optional[str] = None, + compression_algorithm: Optional[CompressionAlgorithm] = None, compression_level: int = 0, compression_threads: int = 0, rsa_public_key: Union[None, str, bytes] = None, diff --git a/rohmu/snappyfile.py b/rohmu/snappyfile.py index deee951d..ea58722a 100644 --- a/rohmu/snappyfile.py +++ b/rohmu/snappyfile.py @@ -12,22 +12,22 @@ import io try: - import snappy + import cramjam except ImportError: - snappy = None # type: ignore + cramjam = None # type: ignore class SnappyFile(FileWrap): def __init__(self, next_fp: FileLike, mode: str) -> None: - if snappy is None: + if cramjam is None: raise io.UnsupportedOperation("Snappy is not available") if mode == "rb": - self.decr = snappy.StreamDecompressor() + self.decr = cramjam.snappy.Decompressor() self.encr = None elif mode == "wb": self.decr = None - self.encr = snappy.StreamCompressor() + self.encr = cramjam.snappy.Compressor() else: raise io.UnsupportedOperation("unsupported mode for SnappyFile") @@ -49,10 +49,11 @@ def write(self, data: BinaryData) -> int: # type: ignore [override] if self.encr is None: raise io.UnsupportedOperation("file not open for writing") data_as_bytes = bytes(data) - compressed_data = self.encr.compress(data_as_bytes) - self.next_fp.write(compressed_data) - self.offset += len(data_as_bytes) - return len(data_as_bytes) + block_size = self.encr.compress(data_as_bytes) + compressed_buffer = self.encr.flush() + self.next_fp.write(compressed_buffer) + self.offset += block_size + return block_size def writable(self) -> bool: return self.encr is not None @@ -62,19 +63,13 @@ def read(self, size: Optional[int] = -1) -> bytes: # pylint: disable=unused-arg self._check_not_closed() if self.decr is None: raise io.UnsupportedOperation("file not open for reading") - while not self.decr_done: - compressed = self.next_fp.read(IO_BLOCK_SIZE) - if not compressed: - self.decr_done = True - output = self.decr.flush() - else: - output = self.decr.decompress(compressed) - - if output: - self.offset += len(output) - return output - - return b"" + num_decompressed_bytes = 0 + while compressed := self.next_fp.read(IO_BLOCK_SIZE): + chunk_size = self.decr.decompress(compressed) + num_decompressed_bytes += chunk_size + self.offset += num_decompressed_bytes + output = self.decr.flush().read() + return output def readable(self) -> bool: return self.decr is not None diff --git a/rohmu/typing.py b/rohmu/typing.py index f7f32002..34ab3eff 100644 --- a/rohmu/typing.py +++ b/rohmu/typing.py @@ -1,7 +1,7 @@ from __future__ import annotations from types import TracebackType -from typing import Any, Dict, Optional, Protocol, Type, TYPE_CHECKING, Union +from typing import Any, Dict, Literal, Optional, Protocol, Type, TYPE_CHECKING, Union try: # Remove when dropping support for Python 3.7 @@ -32,6 +32,8 @@ StrOrPathLike = Union[str, "PathLike[str]"] +CompressionAlgorithm = Literal["lzma", "snappy", "zstd"] + class HasFileno(Protocol): def fileno(self) -> int: diff --git a/setup.cfg b/setup.cfg index 0efb458e..b5668504 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,6 +25,7 @@ packages = find: install_requires = azure-storage-blob >= 2.1.0 botocore + cramjam >= 2.7.0 cryptography google-api-python-client httplib2 @@ -32,7 +33,6 @@ install_requires = paramiko pydantic < 2 python-dateutil - python-snappy requests zstandard typing_extensions >= 3.10, < 5 diff --git a/test/test_compressor.py b/test/test_compressor.py index b13cdf04..65683275 100644 --- a/test/test_compressor.py +++ b/test/test_compressor.py @@ -1,19 +1,19 @@ -from rohmu.compressor import CompressionFile, DecompressionFile +from rohmu.compressor import CompressionFile, CompressionStream, DecompressionFile, DecompressSink +from rohmu.typing import CompressionAlgorithm +from typing import Final import io +import math import pytest +import random +import string +SAMPLE_BYTES: Final[bytes] = b"Some contents" -@pytest.mark.parametrize( - "algorithm,contents", - [ - ("lzma", b""), - ("snappy", b""), - ("lzma", b"Some contents"), - ("snappy", b"Some contents"), - ], -) -def test_compress_decompress_simple_file(algorithm: str, contents: bytes) -> None: + +@pytest.mark.parametrize("algorithm", ["lzma", "snappy", "zstd"]) +@pytest.mark.parametrize("contents", [b"", 100 * SAMPLE_BYTES], ids=["empty", "sample-bytes"]) +def test_compress_decompress_simple_file(algorithm: CompressionAlgorithm, contents: bytes) -> None: bio = io.BytesIO() ef = CompressionFile(bio, algorithm=algorithm) ef.write(contents) @@ -23,3 +23,39 @@ def test_compress_decompress_simple_file(algorithm: str, contents: bytes) -> Non df = DecompressionFile(bio, algorithm=algorithm) data = df.read() assert data == contents + + +@pytest.mark.skip(reason="neither snappy nor zstd seem to handle multiple chunks") +@pytest.mark.parametrize("algorithm", ["lzma", "snappy", "zstd"]) +def test_compress_decompress_multiple_chunks(algorithm: CompressionAlgorithm) -> None: + contents = "".join(random.choices(string.ascii_letters + string.digits, k=1_000_000)).encode() + num_bytes = len(contents) + print(f"Data size exponent (block size = 20): {math.log2(num_bytes)}") + bytes_buf = io.BytesIO() + ef = CompressionFile(bytes_buf, algorithm=algorithm) + ef.write(contents) + ef.close() + + bytes_buf.seek(0) + df = DecompressionFile(bytes_buf, algorithm=algorithm) + data = df.read() + assert data == contents + + +@pytest.mark.parametrize("algorithm", ["lzma", "snappy", "zstd"]) +@pytest.mark.parametrize("contents", [b"", 100 * SAMPLE_BYTES], ids=["empty", "sample-bytes"]) +def test_compress_decompress_streaming(algorithm: CompressionAlgorithm, contents: bytes) -> None: + input_buffer = io.BytesIO() + input_buffer.write(contents) + input_buffer.seek(0) + compression_stream = CompressionStream(input_buffer, algorithm) + compressed_bytes = compression_stream.read() + assert compression_stream.tell() == len(compressed_bytes) + + output_buffer = io.BytesIO() + decompression_sink = DecompressSink(output_buffer, algorithm) + num_bytes_written = decompression_sink.write(compressed_bytes) + assert num_bytes_written == len(compressed_bytes) + output_buffer.seek(0) + output_data = output_buffer.read() + assert output_data == contents diff --git a/test/test_dates.py b/test/test_dates.py index ff8ae69d..d251d52b 100644 --- a/test/test_dates.py +++ b/test/test_dates.py @@ -4,11 +4,13 @@ Copyright (c) 2017 Ohmu Ltd See LICENSE for details """ +from dateutil.parser import UnknownTimezoneWarning # type: ignore[attr-defined] from rohmu.dates import parse_timestamp import datetime import dateutil.tz import re +import warnings def test_parse_timestamp() -> None: @@ -24,6 +26,8 @@ def test_parse_timestamp() -> None: assert local_naive == local_aware.replace(tzinfo=None) str_unknown_aware = "2017-02-02 12:00:00 XYZ" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UnknownTimezoneWarning) unknown_aware_utc = parse_timestamp(str_unknown_aware) assert unknown_aware_utc.tzinfo == datetime.timezone.utc assert unknown_aware_utc.isoformat() == "2017-02-02T12:00:00+00:00"