Skip to content

Commit

Permalink
feat: automatically detect file endings for fasta files
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Jul 17, 2023
1 parent 04a3fe4 commit 75bd14e
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 420 deletions.
4 changes: 3 additions & 1 deletion include/silo/common/input_stream_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
#include <iostream>
#include <memory>

#include <boost/iostreams/filtering_stream.hpp>

namespace silo {
struct InputStreamWrapper {
private:
std::ifstream file;
std::unique_ptr<std::istream> input_stream;
std::unique_ptr<boost::iostreams::filtering_istream> input_stream;

public:
explicit InputStreamWrapper(const std::filesystem::path& filename);
Expand Down
2 changes: 1 addition & 1 deletion include/silo/common/zstdfasta_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
namespace silo {
class ZstdFastaReader {
private:
silo::InputStreamWrapper in_file;
std::ifstream in_file;
std::unique_ptr<silo::ZstdDecompressor> decompressor;
std::string genome_buffer;

Expand Down
41 changes: 33 additions & 8 deletions src/silo/common/input_stream_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,51 @@

#include <utility>

#include <spdlog/spdlog.h>
#include <boost/iostreams/detail/error.hpp>
#include <boost/iostreams/filter/lzma.hpp>
#include <boost/iostreams/filter/zstd.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/read.hpp>

#include "silo/preprocessing/preprocessing_exception.h"

namespace {

/// Returns a copy of `filename` with the literal suffix ".xz" appended to its
/// textual representation (no directory separator is inserted, and any
/// existing extension is kept, e.g. "a.fasta" -> "a.fasta.xz").
///
/// @param filename Path to which the suffix is appended; not modified.
/// @return The suffixed path.
std::filesystem::path appendXZ(const std::filesystem::path& filename) {
   // Append via path::operator+= so the path stays in its native encoding.
   // Going through filename.string() forces a narrow-string conversion, which
   // can be lossy (or throw) on platforms whose native path encoding is wide.
   std::filesystem::path with_suffix = filename;
   with_suffix += ".xz";
   return with_suffix;
}

/// Returns a copy of `filename` with the literal suffix ".zst" appended to its
/// textual representation (no directory separator is inserted, and any
/// existing extension is kept, e.g. "a.fasta" -> "a.fasta.zst").
///
/// @param filename Path to which the suffix is appended; not modified.
/// @return The suffixed path.
std::filesystem::path appendZST(const std::filesystem::path& filename) {
   // Append via path::operator+= so the path stays in its native encoding.
   // Going through filename.string() forces a narrow-string conversion, which
   // can be lossy (or throw) on platforms whose native path encoding is wide.
   std::filesystem::path with_suffix = filename;
   with_suffix += ".zst";
   return with_suffix;
}

} // namespace

namespace silo {
InputStreamWrapper::InputStreamWrapper(const std::filesystem::path& filename) {
if (filename.extension() == ".xz") {
InputStreamWrapper::InputStreamWrapper(const std::filesystem::path& filename)
: input_stream(std::make_unique<boost::iostreams::filtering_istream>()) {
if (std::filesystem::exists(filename)) {
SPDLOG_INFO("Detected file without specialized ending, processing raw: " + filename.string());
file = std::ifstream(filename, std::ios::binary);
std::unique_ptr<boost::iostreams::filtering_istream> archive =
std::make_unique<boost::iostreams::filtering_istream>();
archive->push(boost::iostreams::lzma_decompressor());
archive->push(file);
input_stream = std::move(archive);
} else if (std::filesystem::exists(appendXZ(filename))) {
SPDLOG_INFO("Detected file-ending .xz for input file " + filename.string());
file = std::ifstream(appendXZ(filename), std::ios::binary);
input_stream->push(boost::iostreams::lzma_decompressor());
} else if (std::filesystem::exists(appendZST(filename))) {
SPDLOG_INFO("Detected file-ending .zst for input file " + filename.string());
file = std::ifstream(appendZST(filename), std::ios::binary);
input_stream->push(boost::iostreams::zstd_decompressor());
} else {
input_stream = make_unique<std::ifstream>(filename.string(), std::ios::binary);
throw silo::PreprocessingException(
"Cannot find file with name or associated endings (.xz, .zst): " + filename.string()
);
}
input_stream->push(file);
}

/// Gives access to the stream that the `input_stream` member points to.
///
/// The member function is const, but the returned reference is non-const, so
/// callers may read from (and thereby advance) the stream.
///
/// @return Reference to the stream held by this wrapper.
std::istream& silo::InputStreamWrapper::getInputStream() const {
   std::istream& stream = *input_stream;
   return stream;
}

} // namespace silo
18 changes: 8 additions & 10 deletions src/silo/common/zstdfasta_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ silo::ZstdFastaReader::ZstdFastaReader(

std::optional<std::string> silo::ZstdFastaReader::nextKey() {
std::string key_with_prefix;
if (!getline(in_file.getInputStream(), key_with_prefix)) {
if (!getline(in_file, key_with_prefix)) {
return std::nullopt;
}

Expand All @@ -37,12 +37,12 @@ std::optional<std::string> silo::ZstdFastaReader::nextSkipGenome() {
}

std::string bytestream_length_str;
if (!getline(in_file.getInputStream(), bytestream_length_str)) {
if (!getline(in_file, bytestream_length_str)) {
throw FastaFormatException("Missing bytestream length in line following key: " + *key);
}
const size_t bytestream_length = std::stoul(bytestream_length_str);

in_file.getInputStream().ignore(static_cast<std::streamsize>(bytestream_length));
in_file.ignore(static_cast<std::streamsize>(bytestream_length));
return key;
}

Expand All @@ -53,16 +53,14 @@ std::optional<std::string> silo::ZstdFastaReader::nextCompressed(std::string& co
}

std::string bytestream_length_str;
if (!getline(in_file.getInputStream(), bytestream_length_str)) {
if (!getline(in_file, bytestream_length_str)) {
throw FastaFormatException("Missing bytestream length in line following key: " + *key);
}
const size_t bytestream_length = std::stoul(bytestream_length_str);

compressed_genome.resize(bytestream_length);
in_file.getInputStream().read(
compressed_genome.data(), static_cast<std::streamsize>(compressed_genome.size())
);
in_file.getInputStream().ignore(1);
in_file.read(compressed_genome.data(), static_cast<std::streamsize>(compressed_genome.size()));
in_file.ignore(1);
return key;
}

Expand All @@ -79,6 +77,6 @@ std::optional<std::string> silo::ZstdFastaReader::next(std::string& genome) {
}

void silo::ZstdFastaReader::reset() {
in_file.getInputStream().clear(); // clear fail and eof bits
in_file.getInputStream().seekg(0, std::ios::beg); // g pointer back to the start
in_file.clear(); // clear fail and eof bits
in_file.seekg(0, std::ios::beg); // g pointer back to the start
}
Loading

0 comments on commit 75bd14e

Please sign in to comment.