Skip to content

Commit

Permalink
feat: Generalizing the config for multiple nucleotide sequences and m…
Browse files Browse the repository at this point in the history
…ultiple genes
  • Loading branch information
Taepper committed Jul 4, 2023
1 parent 69fdc2b commit 9a80204
Show file tree
Hide file tree
Showing 67 changed files with 3,510 additions and 954 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ find_package(roaring REQUIRED)
find_package(spdlog REQUIRED)
find_package(vincentlaucsb-csv-parser REQUIRED)
find_package(yaml-cpp REQUIRED)
find_package(zstd REQUIRED)

# ---------------------------------------------------------------------------
# Includes
Expand Down Expand Up @@ -86,6 +87,7 @@ target_link_libraries(
${vincentlaucsb-csv-parser_LIBRARIES}
${yaml-cpp_LIBRARIES}
nlohmann_json::nlohmann_json
zstd::libzstd_static
)

add_executable(siloApi src/silo_api/api.cpp ${SRC_SILO_API})
Expand Down
1 change: 1 addition & 0 deletions conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class SiloRecipe(ConanFile):
"spdlog/1.11.0",
"vincentlaucsb-csv-parser/2.1.3",
"yaml-cpp/0.7.0",
"zstd/1.5.5",
]

default_options = {
Expand Down
9 changes: 2 additions & 7 deletions include/silo/common/nucleotide_symbols.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,8 @@
#include <array>
#include <iostream>

#include <spdlog/spdlog.h>

namespace silo {

static constexpr unsigned GENOME_LENGTH = 29903;

// https://www.bioinformatics.org/sms/iupac.html
enum class NUCLEOTIDE_SYMBOL {
GAP, // -, GAP
Expand Down Expand Up @@ -121,7 +117,7 @@ inline std::string genomeSymbolRepresentation(NUCLEOTIDE_SYMBOL symbol) {
return std::string(1, SYMBOL_REPRESENTATION.at(static_cast<unsigned>(symbol)));
}

inline NUCLEOTIDE_SYMBOL toNucleotideSymbol(char character) {
inline std::optional<NUCLEOTIDE_SYMBOL> toNucleotideSymbol(char character) {
switch (character) {
case '.':
case '-':
Expand Down Expand Up @@ -158,8 +154,7 @@ inline NUCLEOTIDE_SYMBOL toNucleotideSymbol(char character) {
case 'N':
return NUCLEOTIDE_SYMBOL::N;
default:
SPDLOG_ERROR("unrecognized symbol {}", character);
return NUCLEOTIDE_SYMBOL::GAP;
return std::nullopt;
}
}
} // namespace silo
Expand Down
37 changes: 37 additions & 0 deletions include/silo/common/zstdfasta_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef SILO_ZSTDFASTA_READER_H
#define SILO_ZSTDFASTA_READER_H

#include <filesystem>
#include <iostream>

#include <zstd.h>

#include "silo/common/input_stream_wrapper.h"

namespace silo {
// Reader declared over an input file and a zstd decompression dictionary.
// Judging by the name and the zstd handles below, it iterates over
// (key, genome) records whose genomes are stored zstd-compressed — the record
// format itself is defined by the implementation, which is not part of this header.
//
// NOTE(review): `zstd_dictionary` and `zstd_context` are raw pointers and this
// class declares no destructor or copy/move operations. Confirm where (and
// whether) the handles are released, and whether copying must be prevented.
class ZstdFastaReader {
private:
// Wrapped input stream that records are read from.
silo::InputStreamWrapper in_file;
// zstd decompression dictionary handle (ZSTD_DDict is zstd's "digested dictionary" type).
ZSTD_DDict* zstd_dictionary;
// zstd decompression context handle.
ZSTD_DCtx* zstd_context;
// Scratch string kept across calls; presumably holds the raw (compressed)
// genome before decompression — confirm against the implementation.
std::string genome_buffer;

// Internal helper that writes the next record's key into `key`.
// Presumably shared by the public next*() methods; the bool most likely
// signals whether a record was available — confirm.
bool populateKey(std::string& key);

public:
// @param in_file_name      path of the file to read from
// @param compression_dict  dictionary contents used to set up decompression
//                          (presumably the raw dictionary bytes, not a path — confirm)
explicit ZstdFastaReader(
const std::filesystem::path& in_file_name,
const std::string& compression_dict
);

// Advances to the next record and stores its key in `key`; the genome is not
// returned. NOTE(review): presumably returns false once the input is
// exhausted — confirm.
bool nextKey(std::string& key);

// Advances to the next record and stores its key and genome in the output
// parameters. By contrast with nextCompressed(), `genome` is presumably the
// decompressed sequence — confirm.
bool next(std::string& key, std::string& genome);

// Like next(), but hands back the genome in `compressed_genome`, i.e.
// (by its name) without decompressing it.
bool nextCompressed(std::string& key, std::string& compressed_genome);

// Resets the reader; presumably rewinds `in_file` so iteration can start
// over from the first record — confirm.
void reset();
};
} // namespace silo

#endif // SILO_ZSTDFASTA_READER_H
31 changes: 31 additions & 0 deletions include/silo/common/zstdfasta_writer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef SILO_ZSTDFASTA_WRITER_H
#define SILO_ZSTDFASTA_WRITER_H

#include <filesystem>
#include <iostream>

#include <zstd.h>

#include "silo/common/input_stream_wrapper.h"

namespace silo {
// Writer declared over an output file and a zstd compression dictionary.
// Judging by the name and the zstd handles below, it emits (key, genome)
// records with zstd-compressed genomes — the record format itself is defined
// by the implementation, which is not part of this header.
//
// NOTE(review): `zstd_dictionary` and `zstd_context` are raw pointers and this
// class declares no destructor or copy/move operations. Confirm where (and
// whether) the handles are released.
// NOTE(review): this header uses std::ofstream and std::string but directly
// includes only <filesystem>, <iostream>, <zstd.h> and input_stream_wrapper.h;
// <fstream>/<string> presumably arrive transitively — consider including them
// explicitly. Nothing in this class uses InputStreamWrapper.
class ZstdFastaWriter {
private:
// Output file stream that records are written to.
// NOTE(review): camelCase here, while the other members use snake_case.
std::ofstream outStream;
// zstd compression dictionary handle (ZSTD_CDict is zstd's "digested dictionary" type).
ZSTD_CDict* zstd_dictionary;
// zstd compression context handle.
ZSTD_CCtx* zstd_context;
// Scratch string kept across calls; presumably the destination buffer for
// compressed output — confirm against the implementation.
std::string buffer;

public:
// @param out_file_name     path of the file to write to
// @param compression_dict  dictionary contents used to set up compression
//                          (presumably the raw dictionary bytes, not a path — confirm)
explicit ZstdFastaWriter(
const std::filesystem::path& out_file_name,
const std::string& compression_dict
);

// Writes one record for `key` with the given genome. By contrast with
// writeRaw(), `genome` is presumably uncompressed and gets compressed
// here — confirm.
void write(const std::string& key, const std::string& genome);

// Writes one record for `key` whose genome is, by the parameter name,
// already compressed and therefore presumably written through unchanged.
void writeRaw(const std::string& key, const std::string& compressed_genome);
};
} // namespace silo

#endif // SILO_ZSTDFASTA_WRITER_H
1 change: 1 addition & 0 deletions include/silo/config/database_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ struct DatabaseSchema {
};

struct DatabaseConfig {
std::string default_nucleotide_sequence;
DatabaseSchema schema;

[[nodiscard]] DatabaseMetadata getMetadata(const std::string& name) const;
Expand Down
25 changes: 13 additions & 12 deletions include/silo/database.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef SILO_DATABASE_H
#define SILO_DATABASE_H

#include <filesystem>
#include <iostream>
#include <string>
#include <unordered_map>
Expand All @@ -16,7 +17,8 @@
#include "silo/storage/column/pango_lineage_column.h"
#include "silo/storage/column/string_column.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/storage/reference_genome.h"
#include "silo/storage/reference_genomes.h"
#include "silo/storage/sequence_store.h"

namespace silo {
struct DatabasePartition;
Expand All @@ -39,7 +41,7 @@ struct BitmapContainerSize;
class Database {
public:
silo::config::DatabaseConfig database_config;
std::unique_ptr<ReferenceGenome> reference_genome;
ReferenceGenomes reference_genomes;
std::vector<DatabasePartition> partitions;

std::unordered_map<std::string, storage::column::StringColumn> string_columns;
Expand All @@ -48,6 +50,7 @@ class Database {
std::unordered_map<std::string, storage::column::FloatColumn> float_columns;
std::unordered_map<std::string, storage::column::DateColumn> date_columns;
std::unordered_map<std::string, storage::column::PangoLineageColumn> pango_lineage_columns;
std::unordered_map<std::string, SequenceStore> nuc_sequences;

Database();

Expand All @@ -57,10 +60,8 @@ class Database {
);

void build(
const std::string& partition_name_prefix,
const std::string& metadata_file_suffix,
const std::string& sequence_file_suffix,
const silo::preprocessing::Partitions& partition_descriptor
const std::filesystem::path& input_folder,
const preprocessing::Partitions& partition_descriptor
);

virtual silo::DatabaseInfo getDatabaseInfo() const;
Expand All @@ -69,10 +70,6 @@ class Database {

[[maybe_unused]] void flipBitmaps();

[[maybe_unused]] void indexAllNucleotideSymbolsN();

[[maybe_unused]] void naiveIndexAllNucleotideSymbolsN();

[[maybe_unused]] void saveDatabaseState(
const std::string& save_directory,
const silo::preprocessing::Partitions& partition_descriptor
Expand All @@ -98,13 +95,17 @@ class Database {
PangoLineageAliasLookup alias_key;

void initializeColumns();
void initializeSequences();

BitmapSizePerSymbol calculateBitmapSizePerSymbol() const;

BitmapContainerSize calculateBitmapContainerSizePerGenomeSection(uint32_t section_length) const;
BitmapContainerSize calculateBitmapContainerSizePerGenomeSection(
size_t genome_length,
size_t section_length
) const;
};

std::string buildChunkName(unsigned partition, unsigned chunk);
std::string buildChunkString(unsigned partition, unsigned chunk);

} // namespace silo

Expand Down
6 changes: 3 additions & 3 deletions include/silo/database_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,15 @@ struct BitmapContainerSizeStatistic {
};

struct BitmapContainerSize {
uint32_t section_length;
std::map<std::string, std::vector<uint32_t>> size_per_genome_symbol_and_section;
size_t section_length;
std::map<std::string, std::vector<size_t>> size_per_genome_symbol_and_section;

BitmapContainerSizeStatistic bitmap_container_size_statistic;

uint64_t total_bitmap_size_frozen;
uint64_t total_bitmap_size_computed;

explicit BitmapContainerSize(uint32_t section_length);
explicit BitmapContainerSize(size_t genome_length, size_t section_length);

BitmapContainerSize& operator+=(const BitmapContainerSize& other);
};
Expand Down
29 changes: 15 additions & 14 deletions include/silo/prepare_dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <string>
#include <unordered_map>

#include "silo/storage/reference_genomes.h"

namespace silo {

namespace config {
Expand All @@ -16,34 +18,34 @@ namespace preprocessing {
struct Partitions;
struct PangoLineageCounts;
class MetadataWriter;
class MetadataReader;
} // namespace preprocessing

class FastaReader;
class PangoLineageAliasLookup;

[[maybe_unused]] void pruneSequences(
const std::filesystem::path& metadata_in,
silo::preprocessing::MetadataReader& metadata_reader,
silo::FastaReader& sequences_in,
std::ostream& sequences_out,
const silo::config::DatabaseConfig& database_config
);

[[maybe_unused]] void pruneMetadata(
const std::filesystem::path& metadata_in,
silo::preprocessing::MetadataReader& metadata_reader,
silo::FastaReader& sequences_in,
silo::preprocessing::MetadataWriter& metadata_writer,
const silo::config::DatabaseConfig& database_config
);

void partitionSequences(
void partitionData(
const preprocessing::Partitions& partitions,
const std::filesystem::path& meta_in,
silo::FastaReader& sequence_in,
const std::string& output_prefix,
const std::filesystem::path& input_folder,
silo::preprocessing::MetadataReader& metadata_reader,
const std::filesystem::path& output_folder,
const PangoLineageAliasLookup& alias_key,
const std::string& metadata_file_extension,
const std::string& sequence_file_extension,
const silo::config::DatabaseConfig& database_config
const silo::config::DatabaseConfig& database_config,
const ReferenceGenomes& reference_genomes
);

struct SortChunkConfig {
Expand All @@ -53,11 +55,10 @@ struct SortChunkConfig {

void sortChunks(
const preprocessing::Partitions& partitions,
const std::string& input_prefix,
const std::string& output_prefix,
const std::string& metadata_file_extension,
const std::string& sequence_file_extension,
const SortChunkConfig& sort_chunk_config
const std::filesystem::path& input_folder,
const std::filesystem::path& output_folder,
const SortChunkConfig& sort_chunk_config,
const ReferenceGenomes& reference_genomes
);

} // namespace silo
Expand Down
13 changes: 6 additions & 7 deletions include/silo/preprocessing/metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,19 @@ namespace silo::preprocessing {

class MetadataReader {
public:
static std::vector<std::string> getColumn(
const std::filesystem::path& metadata_path,
const std::string& column_name
);
csv::CSVReader reader;

static csv::CSVReader getReader(const std::filesystem::path& metadata_path);
explicit MetadataReader(const std::filesystem::path& metadata_path);

std::vector<std::string> getColumn(const std::string& column_name);
};

class MetadataWriter {
private:
std::unique_ptr<std::ostream> out_stream;
std::ofstream out_stream;

public:
MetadataWriter(std::unique_ptr<std::ostream> out_stream);
explicit MetadataWriter(const std::filesystem::path& metadata_path);

void writeHeader(const csv::CSVReader& csv_reader);

Expand Down
6 changes: 0 additions & 6 deletions include/silo/preprocessing/preprocessing_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@ struct MetadataFilename {
std::string filename;
};

struct SequenceFilename {
std::string filename;
};

struct PangoLineageDefinitionFilename {
std::string filename;
};
Expand All @@ -47,7 +43,6 @@ struct PreprocessingConfig {
const InputDirectory& input_directory_,
const OutputDirectory& output_directory_,
const MetadataFilename& metadata_filename_,
const SequenceFilename& sequence_filename_,
const PangoLineageDefinitionFilename& pango_lineage_definition_filename_,
const PartitionsFolder& partition_folder_,
const SortedPartitionsFolder& sorted_partition_folder_,
Expand All @@ -60,7 +55,6 @@ struct PreprocessingConfig {
std::filesystem::path input_directory;
std::filesystem::path pango_lineage_definition_file;
std::filesystem::path metadata_file;
std::filesystem::path sequence_file;
std::filesystem::path partition_folder;
std::filesystem::path sorted_partition_folder;
std::filesystem::path serialization_folder;
Expand Down
6 changes: 4 additions & 2 deletions include/silo/query_engine/actions/nuc_mutations.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@

#include "silo/common/nucleotide_symbols.h"
#include "silo/query_engine/actions/action.h"
#include "silo/storage/sequence_store.h"

namespace silo::query_engine::actions {

class NucMutations : public Action {
std::optional<std::string> nuc_sequence_name;
double min_proportion;

static constexpr std::array<NUCLEOTIDE_SYMBOL, 5> VALID_MUTATION_SYMBOLS{
Expand All @@ -25,12 +27,12 @@ class NucMutations : public Action {

private:
static std::array<std::vector<uint32_t>, MUTATION_SYMBOL_COUNT> calculateMutationsPerPosition(
const Database& database,
const SequenceStore& seq_store,
std::vector<OperatorResult>& bitmap_filter
);

public:
explicit NucMutations(double min_proportion);
explicit NucMutations(std::optional<std::string> nuc_sequence_name, double min_proportion);

[[nodiscard]] QueryResult execute(
const Database& database,
Expand Down
1 change: 1 addition & 0 deletions include/silo/storage/column_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "silo/storage/column/int_column.h"
#include "silo/storage/column/pango_lineage_column.h"
#include "silo/storage/column/string_column.h"
#include "silo/storage/sequence_store.h"

namespace silo::config {
struct DatabaseMetadata;
Expand Down
Loading

0 comments on commit 9a80204

Please sign in to comment.