From ad99bcfb1636f59b2ee78e1ccd7f88faa478cf9d Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Mon, 29 Jan 2024 11:45:36 +0100 Subject: [PATCH] ZstdDecompressor now also contains a buffer for decompression --- include/silo/zstdfasta/zstd_decompressor.h | 10 ++-- include/silo/zstdfasta/zstdfasta_reader.h | 1 - .../silo/zstdfasta/zstdfasta_table_reader.h | 2 - src/silo/zstdfasta/zstd_decompressor.cpp | 46 ++++++++++++------- src/silo/zstdfasta/zstdfasta_reader.cpp | 4 +- src/silo/zstdfasta/zstdfasta_table_reader.cpp | 4 +- 6 files changed, 34 insertions(+), 33 deletions(-) diff --git a/include/silo/zstdfasta/zstd_decompressor.h b/include/silo/zstdfasta/zstd_decompressor.h index 1a799f515..fafef74df 100644 --- a/include/silo/zstdfasta/zstd_decompressor.h +++ b/include/silo/zstdfasta/zstd_decompressor.h @@ -10,6 +10,7 @@ namespace silo { class ZstdDecompressor { ZSTD_DDict* zstd_dictionary; ZSTD_DCtx* zstd_context; + std::string buffer; public: ZstdDecompressor(ZstdDecompressor&& other) noexcept; @@ -21,14 +22,9 @@ class ZstdDecompressor { explicit ZstdDecompressor(std::string_view dictionary_string); - size_t decompress(const std::string& input, std::string& output); + std::string_view decompress(const std::string& input); - size_t decompress( - const char* input_data, - size_t input_length, - char* output_data, - size_t output_length - ); + std::string_view decompress(const char* input_data, size_t input_length); }; } // namespace silo diff --git a/include/silo/zstdfasta/zstdfasta_reader.h b/include/silo/zstdfasta/zstdfasta_reader.h index ad8f84cfd..4c29d4fdf 100644 --- a/include/silo/zstdfasta/zstdfasta_reader.h +++ b/include/silo/zstdfasta/zstdfasta_reader.h @@ -15,7 +15,6 @@ class ZstdFastaReader { private: std::ifstream in_file; std::unique_ptr decompressor; - std::string genome_buffer; std::optional nextKey(); diff --git a/include/silo/zstdfasta/zstdfasta_table_reader.h b/include/silo/zstdfasta/zstdfasta_table_reader.h index 621c602b8..73a61221b 100644 --- a/include/silo/zstdfasta/zstdfasta_table_reader.h +++ b/include/silo/zstdfasta/zstdfasta_table_reader.h @@ -30,8 +30,6 @@ class ZstdFastaTableReader { std::unique_ptr decompressor; size_t current_row; - std::string genome_buffer; - std::optional nextKey(); std::string getTableQuery(); diff --git a/src/silo/zstdfasta/zstd_decompressor.cpp b/src/silo/zstdfasta/zstd_decompressor.cpp index 10ca74c2b..4d3e12705 100644 --- a/src/silo/zstdfasta/zstd_decompressor.cpp +++ b/src/silo/zstdfasta/zstd_decompressor.cpp @@ -16,43 +16,55 @@ ZstdDecompressor::~ZstdDecompressor() { ZstdDecompressor::ZstdDecompressor(std::string_view dictionary_string) { zstd_dictionary = ZSTD_createDDict(dictionary_string.data(), dictionary_string.length()); zstd_context = ZSTD_createDCtx(); + buffer = std::string(dictionary_string.size(), '\0'); } ZstdDecompressor::ZstdDecompressor(ZstdDecompressor&& other) noexcept { this->zstd_context = std::exchange(other.zstd_context, nullptr); this->zstd_dictionary = std::exchange(other.zstd_dictionary, nullptr); + this->buffer = std::move(other.buffer); } ZstdDecompressor& ZstdDecompressor::operator=(ZstdDecompressor&& other) noexcept { std::swap(this->zstd_context, other.zstd_context); std::swap(this->zstd_dictionary, other.zstd_dictionary); + std::swap(this->buffer, other.buffer); return *this; } -size_t ZstdDecompressor::decompress(const std::string& input, std::string& output) { - return decompress(input.data(), input.size(), output.data(), output.size()); +std::string_view ZstdDecompressor::decompress(const std::string& input) { + return decompress(input.data(), input.size()); } -size_t ZstdDecompressor::decompress( - const char* input_data, - size_t input_length, - char* output_data, - size_t output_length -) { +std::string_view ZstdDecompressor::decompress(const char* input_data, size_t input_length) { + size_t uncompressed_size = ZSTD_getFrameContentSize(input_data, input_length); + if (uncompressed_size == ZSTD_CONTENTSIZE_UNKNOWN) { + throw std::runtime_error(fmt::format( + "ZSTD_Error: Cannot decompress data with unknown size (getFrameContentSize == " + "UNKNOWN) for compressed data of length {}", + input_length + )); + } else if (uncompressed_size == ZSTD_CONTENTSIZE_ERROR) { + throw std::runtime_error(fmt::format( + "ZSTD_Error: Error in dependency, when getting decompressed size for compressed data of " + "length {}" + "(getFrameContentSize)", + input_length + )); + } + if (uncompressed_size > buffer.size()) { + buffer.resize(uncompressed_size); + } auto size_or_error_code = ZSTD_decompress_usingDDict( - zstd_context, output_data, output_length, input_data, input_length, zstd_dictionary + zstd_context, buffer.data(), buffer.size(), input_data, input_length, zstd_dictionary ); if (ZSTD_isError(size_or_error_code)) { const std::string error_name = ZSTD_getErrorName(size_or_error_code); - throw std::runtime_error(fmt::format( - "Error '{}' in dependency when decompressing using zstd (dst buffer size: {}, src size: " - "{}).", - error_name, - output_length, - input_length - )); + throw std::runtime_error( + fmt::format("Error '{}' in dependency when decompressing using zstd", error_name) + ); } - return size_or_error_code; + return std::string_view(buffer.data(), size_or_error_code); } } // namespace silo \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_reader.cpp b/src/silo/zstdfasta/zstdfasta_reader.cpp index 12fd7ac81..c0b00668a 100644 --- a/src/silo/zstdfasta/zstdfasta_reader.cpp +++ b/src/silo/zstdfasta/zstdfasta_reader.cpp @@ -15,7 +15,6 @@ silo::ZstdFastaReader::ZstdFastaReader( if (!in_file) { throw std::runtime_error("Could not open file reader for file: " + in_file_name.string()); } - genome_buffer = std::string(compression_dict.length(), '\0'); } std::optional silo::ZstdFastaReader::nextKey() { @@ -73,8 +72,7 @@ std::optional silo::ZstdFastaReader::next(std::string& genome) { if (!key) { return std::nullopt; } - decompressor->decompress(compressed_buffer, genome_buffer); - genome = genome_buffer; + genome = decompressor->decompress(compressed_buffer); return key; } diff --git a/src/silo/zstdfasta/zstdfasta_table_reader.cpp b/src/silo/zstdfasta/zstdfasta_table_reader.cpp index 3ea717f88..eef702870 100644 --- a/src/silo/zstdfasta/zstdfasta_table_reader.cpp +++ b/src/silo/zstdfasta/zstdfasta_table_reader.cpp @@ -26,7 +26,6 @@ silo::ZstdFastaTableReader::ZstdFastaTableReader( order_by_clause(order_by_clause), decompressor(std::make_unique(compression_dict)) { SPDLOG_TRACE("Initializing ZstdFastaTableReader for table {}", table_name); - genome_buffer.resize(compression_dict.size()); reset(); SPDLOG_TRACE("Successfully initialized ZstdFastaTableReader for table {}", table_name); } @@ -78,8 +77,7 @@ std::optional silo::ZstdFastaTableReader::next(std::optionaldecompress(compressed_buffer.value(), genome_buffer); - genome = std::string(genome_buffer.data(), size); + genome = decompressor->decompress(compressed_buffer.value()); } else { genome = std::nullopt; }