Skip to content

Commit

Permalink
feat: allow reading segments and genes that are null from ndjson file #…
Browse files Browse the repository at this point in the history
  • Loading branch information
fengelniederhammer committed Jan 22, 2024
1 parent c1e96cc commit a8a9c89
Show file tree
Hide file tree
Showing 12 changed files with 319 additions and 134 deletions.
41 changes: 22 additions & 19 deletions include/silo/storage/sequence_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
#include "silo/common/nucleotide_symbols.h"
#include "silo/common/symbol_map.h"

// NOTE(review): these using-declarations are at global scope in a header
// (include/silo/storage/sequence_store.h), so every translation unit that includes it
// also receives the unqualified names `deque`, `optional`, `pair`, `string` and `vector`.
// Consider keeping the std:: qualification, or moving the declarations into a namespace.
using std::deque;
using std::optional;
using std::pair;
using std::string;
using std::vector;

// Forward declaration of boost::serialization::access.
// Presumably needed so the classes below can befriend it for serialization — confirm
// against the parts of this header that are not shown here.
namespace boost::serialization {
class access;
}  // namespace boost::serialization
Expand All @@ -38,12 +44,12 @@ class Position {

public:
explicit Position(typename SymbolType::Symbol symbol);
explicit Position(std::optional<typename SymbolType::Symbol> symbol);
explicit Position(optional<typename SymbolType::Symbol> symbol);

SymbolMap<SymbolType, roaring::Roaring> bitmaps;
std::optional<typename SymbolType::Symbol> symbol_whose_bitmap_is_flipped;
optional<typename SymbolType::Symbol> symbol_whose_bitmap_is_flipped;

std::optional<typename SymbolType::Symbol> flipMostNumerousBitmap(uint32_t sequence_count);
optional<typename SymbolType::Symbol> flipMostNumerousBitmap(uint32_t sequence_count);
};

struct SequenceStoreInfo {
Expand All @@ -68,20 +74,17 @@ class SequenceStorePartition {
// clang-format on
}

void fillIndexes(const std::vector<std::string>& genomes);
void fillIndexes(const vector<optional<string>>& genomes);

void fillNBitmaps(const std::vector<std::string>& genomes);
void fillNBitmaps(const vector<optional<string>>& genomes);

public:
explicit SequenceStorePartition(
const std::vector<typename SymbolType::Symbol>& reference_sequence
);

const std::vector<typename SymbolType::Symbol>& reference_sequence;
std::vector<std::pair<size_t, typename SymbolType::Symbol>>
indexing_differences_to_reference_sequence;
std::vector<Position<SymbolType>> positions;
std::vector<roaring::Roaring> missing_symbol_bitmaps;
explicit SequenceStorePartition(const vector<typename SymbolType::Symbol>& reference_sequence);

const vector<typename SymbolType::Symbol>& reference_sequence;
vector<pair<size_t, typename SymbolType::Symbol>> indexing_differences_to_reference_sequence;
vector<Position<SymbolType>> positions;
vector<roaring::Roaring> missing_symbol_bitmaps;
uint32_t sequence_count = 0;

[[nodiscard]] size_t computeSize() const;
Expand All @@ -95,24 +98,24 @@ class SequenceStorePartition {

size_t fill(silo::ZstdFastaTableReader& input);

void interpret(const std::vector<std::string>& genomes);
void interpret(const vector<optional<string>>& genomes);
};

// Pairs one reference sequence with a deque of SequenceStorePartition<SymbolType> objects.
//
// NOTE(review): every declaration that mentions a standard container appears twice below,
// once std::-qualified and once unqualified (relying on the using-declarations near the
// top of this header). This looks like both sides of a diff — confirm that only one form
// is present in the real file.
template <typename SymbolType>
class SequenceStore {
public:
// The reference sequence, stored by value in the store.
std::vector<typename SymbolType::Symbol> reference_sequence;
// The partitions belonging to this store.
std::deque<SequenceStorePartition<SymbolType>> partitions;
vector<typename SymbolType::Symbol> reference_sequence;
deque<SequenceStorePartition<SymbolType>> partitions;

// Takes the reference sequence by value; explicit, so no implicit conversion from a vector.
explicit SequenceStore(std::vector<typename SymbolType::Symbol> reference_sequence);
explicit SequenceStore(vector<typename SymbolType::Symbol> reference_sequence);

// Returns a reference to a partition; presumably a new element appended to `partitions`
// — confirm in the implementation.
SequenceStorePartition<SymbolType>& createPartition();
};

} // namespace silo

template <>
struct [[maybe_unused]] fmt::formatter<silo::SequenceStoreInfo> : fmt::formatter<std::string> {
struct [[maybe_unused]] fmt::formatter<silo::SequenceStoreInfo> : fmt::formatter<string> {
[[maybe_unused]] static auto format(
silo::SequenceStoreInfo sequence_store_info,
format_context& ctx
Expand Down
41 changes: 26 additions & 15 deletions include/silo/zstdfasta/zstdfasta_table_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,44 +7,55 @@
#include <string>
#include <string_view>

// NOTE(review): global-scope using-declarations in a header
// (include/silo/zstdfasta/zstdfasta_table_reader.h). Every includer gets the unqualified
// names `optional`, `string`, `string_view` and `unique_ptr`; consider keeping the std::
// qualification instead.
using std::optional;
using std::string;
using std::string_view;
using std::unique_ptr;

// Forward declarations of the duckdb types that the reader below only refers to through a
// reference or a unique_ptr.
// NOTE(review): these are declared with the `struct` key. If duckdb defines them with
// `class`, clang reports -Wmismatched-tags — verify against the duckdb headers.
namespace duckdb {
struct Connection;
struct MaterializedQueryResult;
struct DataChunk;
}  // namespace duckdb

// NOTE(review): these declarations are at global scope in a header, so the generic names
// `Connection`, `DataChunk` and `MaterializedQueryResult` become visible unqualified in
// every includer and may collide with unrelated types of the same name.
using duckdb::Connection;
using duckdb::DataChunk;
using duckdb::MaterializedQueryResult;

namespace silo {
struct ZstdDecompressor;

// Reader bound to a duckdb connection and a table name, with a where clause and an
// order-by clause; it also holds a query result, the current data chunk, a
// ZstdDecompressor and a row index.
// Presumably iterates over rows of zstd-compressed sequences — confirm in the implementation.
//
// NOTE(review): most declarations below appear twice, once fully qualified and once
// unqualified (relying on the using-declarations above this class). This looks like both
// sides of a diff — confirm that only one form is present in the real file.
class ZstdFastaTableReader {
private:
// Stored as a reference: the Connection must outlive this reader.
duckdb::Connection& connection;
std::string table_name;
std::string where_clause;
std::string order_by_clause;
std::unique_ptr<duckdb::MaterializedQueryResult> query_result;
std::unique_ptr<duckdb::DataChunk> current_chunk;
std::unique_ptr<silo::ZstdDecompressor> decompressor;
Connection& connection;
string table_name;
string where_clause;
string order_by_clause;
unique_ptr<MaterializedQueryResult> query_result;
unique_ptr<DataChunk> current_chunk;
unique_ptr<ZstdDecompressor> decompressor;
// NOTE(review): no in-class initializer here; presumably set by the constructor or
// reset() — confirm.
size_t current_row;

std::string genome_buffer;

// Returns an optional string; presumably the key of the current row, or nullopt when
// there are no more rows — confirm in the implementation.
std::optional<std::string> nextKey();

void advanceRow();

public:
// The string arguments are taken as views. `compression_dict` has no same-named member
// in this class; presumably it is used to set up `decompressor` — confirm.
explicit ZstdFastaTableReader(
duckdb::Connection& connection,
std::string_view table_name,
std::string_view compression_dict,
std::string_view where_clause,
std::string_view order_by_clause
Connection& connection,
string_view table_name,
string_view compression_dict,
string_view where_clause,
string_view order_by_clause
);

// The three next* variants all return an optional string (presumably the row key, with
// nullopt signalling the end — confirm). This one takes no output parameter.
std::optional<std::string> nextSkipGenome();
optional<string> nextSkipGenome();

// `genome` is an output parameter. In the optional<string> form, a row without a genome
// can presumably be reported as nullopt — confirm in the implementation.
std::optional<std::string> next(std::string& genome);
optional<string> next(optional<string>& genome);

// Same shape as next(), but the output parameter is named `compressed_genome`;
// presumably the genome is handed back without decompression — confirm.
std::optional<std::string> nextCompressed(std::string& compressed_genome);
optional<string> nextCompressed(optional<string>& compressed_genome);

void reset();
};
Expand Down
2 changes: 1 addition & 1 deletion src/main.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "silo/common/log.h"

int main(int argc, char* argv[]) {
spdlog::set_level(spdlog::level::off);
spdlog::set_level(spdlog::level::info);
spdlog::null_logger_mt(silo::PERFORMANCE_LOGGER_NAME);
::testing::InitGoogleMock(&argc, argv);
return RUN_ALL_TESTS();
Expand Down
67 changes: 67 additions & 0 deletions src/silo/preprocessing/preprocessor.test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include "silo/preprocessing/preprocessor.h"

#include <gtest/gtest.h>
#include <nlohmann/json.hpp>

#include "silo/config/config_repository.h"
#include "silo/database.h"
#include "silo/database_info.h"
#include "silo/preprocessing/preprocessing_config_reader.h"
#include "silo/query_engine/query_engine.h"

using nlohmann::json;
using silo::config::ConfigRepository;
using silo::preprocessing::OptionalPreprocessingConfig;
using silo::preprocessing::PreprocessingConfigReader;
using silo::preprocessing::Preprocessor;
using silo::query_engine::QueryEngine;

// End-to-end check: runs the preprocessor on the ndjson fixture that contains null
// sequences, then verifies the database info and the aligned FASTA returned for the
// sequence names "someShortGene" and "secondSegment".
TEST(PreprocessorTest, shouldProcessDataSetWithSequencesThatAreNull) {
   const silo::preprocessing::InputDirectory fixture_directory{
      "./testBaseData/ndjsonWithNullSequences/"
   };

   // Both configuration files live next to the data inside the fixture directory.
   auto preprocessing_config =
      PreprocessingConfigReader()
         .readConfig(fixture_directory.directory + "preprocessing_config.yaml")
         .mergeValuesFromOrDefault(OptionalPreprocessingConfig());
   const auto database_config = ConfigRepository().getValidatedConfig(
      fixture_directory.directory + "database_config.yaml"
   );

   Preprocessor preprocessor(preprocessing_config, database_config);
   auto database = preprocessor.preprocess();

   // Sanity checks on the built database before querying it.
   const auto info = database.getDatabaseInfo();
   EXPECT_GT(info.total_size, 0);
   EXPECT_EQ(info.sequence_count, 5);

   // The raw string is kept flush-left so the query text stays unchanged.
   const auto* const fasta_query = R"(
{
"action": {
"type": "FastaAligned",
"sequenceName": ["someShortGene", "secondSegment"],
"orderByFields": ["accessionVersion"]
},
"filterExpression": {
"type": "True"
}
}
)";

   const QueryEngine query_engine(database);
   const auto response = query_engine.executeQuery(fasta_query);
   const json actual_rows = json(response.query_result);

   // One object per accession version. "secondSegment" is all N in every row, and
   // "someShortGene" is all X for 1.3 — presumably the null entries of the fixture.
   const json expected_rows = {
      {{"accessionVersion", "1.1"},
       {"someShortGene", "MADS"},
       {"secondSegment", "NNNNNNNNNNNNNNNN"}},
      {{"accessionVersion", "1.2"},
       {"someShortGene", "MADS"},
       {"secondSegment", "NNNNNNNNNNNNNNNN"}},
      {{"accessionVersion", "1.3"},
       {"someShortGene", "XXXX"},
       {"secondSegment", "NNNNNNNNNNNNNNNN"}},
      {{"accessionVersion", "1.4"},
       {"someShortGene", "MADS"},
       {"secondSegment", "NNNNNNNNNNNNNNNN"}},
      {{"accessionVersion", "1.5"},
       {"someShortGene", "MADS"},
       {"secondSegment", "NNNNNNNNNNNNNNNN"}}
   };
   ASSERT_EQ(actual_rows, expected_rows);
}
Loading

0 comments on commit a8a9c89

Please sign in to comment.