From f61c803f74d135e75865aa09a9b373aa3603f19d Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Sat, 29 Jul 2023 12:08:37 +0200 Subject: [PATCH 01/10] feat: flipped bitmap can now be set before insertion --- endToEndTests/test/query.test.js | 15 ++-- include/silo/storage/aa_store.h | 18 ++-- include/silo/storage/sequence_store.h | 12 ++- src/silo/storage/aa_store.cpp | 92 +++++++++++--------- src/silo/storage/database_partition.cpp | 22 ++++- src/silo/storage/position.test.cpp | 8 +- src/silo/storage/sequence_store.cpp | 107 ++++++++++++++---------- 7 files changed, 167 insertions(+), 107 deletions(-) diff --git a/endToEndTests/test/query.test.js b/endToEndTests/test/query.test.js index 683db6f77..9db69e5d8 100644 --- a/endToEndTests/test/query.test.js +++ b/endToEndTests/test/query.test.js @@ -12,12 +12,15 @@ describe('The /query endpoint', () => { .map(file => JSON.parse(fs.readFileSync(`${queriesPath}/${file}`))) .forEach(testCase => it('should return data for the test case ' + testCase.testCaseName, async () => { - const response = await server - .post('/query') - .send(testCase.query) - .expect(200) - .expect('Content-Type', 'application/json'); - return expect(response.body.queryResult).to.deep.equal(testCase.expectedQueryResult); + let response = await server.post('/query').send(testCase.query); + try { + expect(response.status).to.equal(200); + expect(response.header['content-type']).to.equal('application/json'); + } catch (error) { + console.error('Error in response header! Error body:', response.body); + throw error; + } + expect(response.body.queryResult).to.deep.equal(testCase.expectedQueryResult); }) ); diff --git a/include/silo/storage/aa_store.h b/include/silo/storage/aa_store.h index 5c29050e2..2ad1ab208 100644 --- a/include/silo/storage/aa_store.h +++ b/include/silo/storage/aa_store.h @@ -27,17 +27,23 @@ namespace silo { class ZstdFastaReader; enum class AA_SYMBOL : char; -struct AAPosition { +class AAPosition { friend class boost::serialization::access; template void serialize(Archive& archive, [[maybe_unused]] const uint32_t version) { // clang-format off - archive& symbol_whose_bitmap_is_flipped; - archive& bitmaps; + archive & symbol_whose_bitmap_is_flipped; + archive & bitmaps; // clang-format on } + AAPosition() = default; + + public: + explicit AAPosition(AA_SYMBOL symbol); + explicit AAPosition(std::optional symbol); + AASymbolMap bitmaps; std::optional symbol_whose_bitmap_is_flipped = std::nullopt; }; @@ -49,9 +55,9 @@ class AAStorePartition { template void serialize(Archive& archive, [[maybe_unused]] const uint32_t version) { // clang-format off - archive& sequence_count; - archive& positions; - archive& aa_symbol_x_bitmaps; + archive & sequence_count; + archive & positions; + archive & aa_symbol_x_bitmaps; // clang-format on } diff --git a/include/silo/storage/sequence_store.h b/include/silo/storage/sequence_store.h index 471a72fd3..8ffbf4bfe 100644 --- a/include/silo/storage/sequence_store.h +++ b/include/silo/storage/sequence_store.h @@ -28,17 +28,23 @@ class access; namespace silo { class ZstdFastaReader; -struct NucPosition { +class NucPosition { friend class boost::serialization::access; template void serialize(Archive& archive, [[maybe_unused]] const uint32_t version) { // clang-format off - archive& symbol_whose_bitmap_is_flipped; - archive& bitmaps; + archive & symbol_whose_bitmap_is_flipped; + archive & bitmaps; // clang-format on } + NucPosition() = default; + + public: + explicit NucPosition(NUCLEOTIDE_SYMBOL symbol); + explicit NucPosition(std::optional symbol); + NucleotideSymbolMap bitmaps; std::optional symbol_whose_bitmap_is_flipped = std::nullopt; }; diff --git a/src/silo/storage/aa_store.cpp b/src/silo/storage/aa_store.cpp index 414334ec3..c8a0afe3e 100644 --- a/src/silo/storage/aa_store.cpp +++ b/src/silo/storage/aa_store.cpp @@ -13,6 +13,21 @@ #include "silo/common/zstdfasta_reader.h" #include "silo/preprocessing/preprocessing_exception.h" +silo::AAPosition::AAPosition(AA_SYMBOL symbol) { + symbol_whose_bitmap_is_flipped = symbol; +} + +silo::AAPosition::AAPosition(std::optional symbol) { + symbol_whose_bitmap_is_flipped = symbol; +} + +silo::AAStorePartition::AAStorePartition(const std::vector& reference_sequence) + : reference_sequence(reference_sequence) { + for (AA_SYMBOL symbol : reference_sequence) { + positions.emplace_back(symbol); + } +} + size_t silo::AAStorePartition::fill(silo::ZstdFastaReader& input_file) { static constexpr size_t BUFFER_SIZE = 1024; @@ -47,37 +62,44 @@ const roaring::Roaring* silo::AAStorePartition::getBitmap(size_t position, AA_SY void silo::AAStorePartition::fillIndexes(const std::vector& sequences) { const size_t genome_length = positions.size(); static constexpr int COUNT_SYMBOLS_PER_PROCESSOR = 64; - const tbb::blocked_range positions_range( - 0, genome_length, genome_length / COUNT_SYMBOLS_PER_PROCESSOR - ); - tbb::parallel_for(positions_range, [&](const decltype(positions_range)& local) { - AASymbolMap> ids_per_symbol_for_current_position; - for (size_t position = local.begin(); position != local.end(); ++position) { - const size_t number_of_sequences = sequences.size(); - for (size_t sequence_id = 0; sequence_id < number_of_sequences; ++sequence_id) { - const char character = sequences[sequence_id][position]; - const auto symbol = charToAASymbol(character); - if (!symbol.has_value()) { - throw PreprocessingException( - "Found invalid symbol in Amino Acid sequence: " + std::to_string(character) + - "\nFull sequence: " + sequences[sequence_id] - ); + tbb::parallel_for( + tbb::blocked_range(0, genome_length, genome_length / COUNT_SYMBOLS_PER_PROCESSOR), + [&](const auto& local) { + AASymbolMap> ids_per_symbol_for_current_position; + for (size_t position = local.begin(); position != local.end(); ++position) { + const size_t number_of_sequences = sequences.size(); + for (size_t sequence_id = 0; sequence_id < number_of_sequences; ++sequence_id) { + const char character = sequences[sequence_id][position]; + const auto symbol = charToAASymbol(character); + if (!symbol.has_value()) { + throw PreprocessingException( + "Found invalid symbol in Amino Acid sequence: " + std::to_string(character) + + "\nFull sequence: " + sequences[sequence_id] + ); + } + if (symbol != AA_SYMBOL::X) { + ids_per_symbol_for_current_position[*symbol].push_back( + sequence_count + sequence_id + ); + } } - if (symbol != AA_SYMBOL::X) { - ids_per_symbol_for_current_position[*symbol].push_back(sequence_count + sequence_id); - } - } - for (const auto& symbol : AA_SYMBOLS) { - if (!ids_per_symbol_for_current_position.at(symbol).empty()) { - positions[position].bitmaps[symbol].addMany( - ids_per_symbol_for_current_position.at(symbol).size(), - ids_per_symbol_for_current_position.at(symbol).data() - ); - ids_per_symbol_for_current_position[symbol].clear(); + for (const AA_SYMBOL symbol : AA_SYMBOLS) { + if (!ids_per_symbol_for_current_position.at(symbol).empty()) { + positions[position].bitmaps[symbol].addMany( + ids_per_symbol_for_current_position.at(symbol).size(), + ids_per_symbol_for_current_position.at(symbol).data() + ); + ids_per_symbol_for_current_position[symbol].clear(); + } + if (symbol == positions[position].symbol_whose_bitmap_is_flipped) { + positions[position].bitmaps[symbol].flip( + sequence_count, sequence_count + number_of_sequences + ); + } } } } - }); + ); } void silo::AAStorePartition::fillXBitmaps(const std::vector& sequences) { @@ -85,15 +107,13 @@ void silo::AAStorePartition::fillXBitmaps(const std::vector& sequen aa_symbol_x_bitmaps.resize(sequence_count + sequences.size()); - const tbb::blocked_range range(0, sequences.size()); - tbb::parallel_for(range, [&](const decltype(range)& local) { - // For every symbol, calculate all sequence IDs that have that symbol at that position + tbb::parallel_for(tbb::blocked_range(0, sequences.size()), [&](const auto& local) { std::vector positions_with_aa_symbol_x; for (size_t sequence_id = local.begin(); sequence_id != local.end(); ++sequence_id) { for (size_t position = 0; position < genome_length; ++position) { const char character = sequences[sequence_id][position]; // No need to check the cast because we call fillIndexes first - const auto symbol = static_cast(character); + const auto symbol = charToAASymbol(character); if (symbol == AA_SYMBOL::X) { positions_with_aa_symbol_x.push_back(position); } @@ -115,10 +135,6 @@ void silo::AAStorePartition::interpret(const std::vector& sequences sequence_count += sequences.size(); } -silo::AAStorePartition::AAStorePartition(const std::vector& reference_sequence) - : reference_sequence(reference_sequence), - positions(reference_sequence.size()) {} - size_t silo::AAStorePartition::computeSize() const { size_t result = 0; for (const auto& position : positions) { @@ -131,8 +147,7 @@ size_t silo::AAStorePartition::computeSize() const { size_t silo::AAStorePartition::runOptimize() { std::atomic count_true = 0; - const tbb::blocked_range range(0U, positions.size()); - tbb::parallel_for(range, [&](const decltype(range) local) { + tbb::parallel_for(tbb::blocked_range(0U, positions.size()), [&](const auto& local) { for (auto position = local.begin(); position != local.end(); ++position) { for (const AA_SYMBOL symbol : AA_SYMBOLS) { if (positions[position].bitmaps[symbol].runOptimize()) { @@ -146,8 +161,7 @@ size_t silo::AAStorePartition::runOptimize() { size_t silo::AAStorePartition::shrinkToFit() { std::atomic saved = 0; - const tbb::blocked_range range(0U, positions.size()); - tbb::parallel_for(range, [&](const decltype(range) local) { + tbb::parallel_for(tbb::blocked_range(0U, positions.size()), [&](const auto& local) { size_t local_saved = 0; for (auto position = local.begin(); position != local.end(); ++position) { for (const AA_SYMBOL symbol : AA_SYMBOLS) { diff --git a/src/silo/storage/database_partition.cpp b/src/silo/storage/database_partition.cpp index 05f9e75ab..efd6d0eaa 100644 --- a/src/silo/storage/database_partition.cpp +++ b/src/silo/storage/database_partition.cpp @@ -25,19 +25,26 @@ void DatabasePartition::flipBitmaps() { auto& positions = seq_store.positions; tbb::parallel_for(tbb::blocked_range(0, positions.size()), [&](const auto& local) { for (auto position = local.begin(); position != local.end(); ++position) { + std::optional flipped_bitmap_before = + positions[position].symbol_whose_bitmap_is_flipped; std::optional max_symbol = std::nullopt; uint32_t max_count = 0; for (const auto& symbol : NUC_SYMBOLS) { roaring::Roaring bitmap = positions[position].bitmaps.at(symbol); bitmap.runOptimize(); - const uint32_t count = bitmap.cardinality(); + const uint32_t count = flipped_bitmap_before == symbol + ? sequenceCount - bitmap.cardinality() + : bitmap.cardinality(); if (count > max_count) { max_symbol = symbol; max_count = count; } } - if (max_symbol.has_value()) { + if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { + if (flipped_bitmap_before.has_value()) { + positions[position].bitmaps[*flipped_bitmap_before].flip(0, sequenceCount); + } positions[position].symbol_whose_bitmap_is_flipped = max_symbol; positions[position].bitmaps[*max_symbol].flip(0, sequenceCount); } @@ -48,19 +55,26 @@ void DatabasePartition::flipBitmaps() { auto& positions = seq_store.positions; tbb::parallel_for(tbb::blocked_range(0, positions.size()), [&](const auto& local) { for (auto position = local.begin(); position != local.end(); ++position) { + std::optional flipped_bitmap_before = + positions[position].symbol_whose_bitmap_is_flipped; std::optional max_symbol = std::nullopt; uint32_t max_count = 0; for (const auto& symbol : AA_SYMBOLS) { roaring::Roaring bitmap = positions[position].bitmaps.at(symbol); bitmap.runOptimize(); - const uint32_t count = bitmap.cardinality(); + const uint32_t count = flipped_bitmap_before == symbol + ? sequenceCount - bitmap.cardinality() + : bitmap.cardinality(); if (count > max_count) { max_symbol = symbol; max_count = count; } } - if (max_symbol.has_value()) { + if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { + if (flipped_bitmap_before.has_value()) { + positions[position].bitmaps[*flipped_bitmap_before].flip(0, sequenceCount); + } positions[position].symbol_whose_bitmap_is_flipped = max_symbol; positions[position].bitmaps[*max_symbol].flip(0, sequenceCount); } diff --git a/src/silo/storage/position.test.cpp b/src/silo/storage/position.test.cpp index 8e779eaf9..50bee39ed 100644 --- a/src/silo/storage/position.test.cpp +++ b/src/silo/storage/position.test.cpp @@ -24,10 +24,10 @@ void deserializeFromFile(const std::string& filename, silo::NucPosition& positio TEST(Position, shouldSerializeAndDeserializePositionsWithEmptyOptional) { const std::string test_file = "test.bin"; - const silo::NucPosition position_with_unset_optional; + const silo::NucPosition position_with_unset_optional(std::nullopt); serializeToFile(test_file, position_with_unset_optional); - silo::NucPosition deserialized_position; + silo::NucPosition deserialized_position(std::nullopt); deserializeFromFile(test_file, deserialized_position); EXPECT_FALSE(position_with_unset_optional.symbol_whose_bitmap_is_flipped.has_value()); @@ -39,11 +39,11 @@ TEST(Position, shouldSerializeAndDeserializePositionsWithEmptyOptional) { TEST(Position, shouldSerializeAndDeserializePositionWithSetOptional) { const std::string test_file = "test.bin"; - silo::NucPosition position_with_set_optional; + silo::NucPosition position_with_set_optional(std::nullopt); position_with_set_optional.symbol_whose_bitmap_is_flipped = silo::NUCLEOTIDE_SYMBOL::A; serializeToFile(test_file, position_with_set_optional); - silo::NucPosition deserialized_position; + silo::NucPosition deserialized_position(std::nullopt); deserializeFromFile(test_file, deserialized_position); EXPECT_TRUE(deserialized_position.symbol_whose_bitmap_is_flipped.has_value()); diff --git a/src/silo/storage/sequence_store.cpp b/src/silo/storage/sequence_store.cpp index be7f69af4..a59e3ba51 100644 --- a/src/silo/storage/sequence_store.cpp +++ b/src/silo/storage/sequence_store.cpp @@ -15,17 +15,21 @@ #include "silo/common/zstdfasta_reader.h" #include "silo/preprocessing/preprocessing_exception.h" -[[maybe_unused]] auto fmt::formatter::format( - silo::SequenceStoreInfo sequence_store_info, - fmt::format_context& ctx -) -> decltype(ctx.out()) { - return format_to( - ctx.out(), - "SequenceStoreInfo[sequence count: {}, size: {}, N bitmaps size: {}]", - sequence_store_info.sequence_count, - sequence_store_info.size, - silo::formatNumber(sequence_store_info.n_bitmaps_size) - ); +silo::NucPosition::NucPosition(NUCLEOTIDE_SYMBOL symbol) { + symbol_whose_bitmap_is_flipped = symbol; +} + +silo::NucPosition::NucPosition(std::optional symbol) { + symbol_whose_bitmap_is_flipped = symbol; +} + +silo::SequenceStorePartition::SequenceStorePartition( + const std::vector& reference_genome +) + : reference_genome(reference_genome) { + for (NUCLEOTIDE_SYMBOL symbol : reference_genome) { + positions.emplace_back(symbol); + } } size_t silo::SequenceStorePartition::fill(silo::ZstdFastaReader& input_file) { @@ -56,6 +60,19 @@ size_t silo::SequenceStorePartition::fill(silo::ZstdFastaReader& input_file) { return read_sequences_count; } +[[maybe_unused]] auto fmt::formatter::format( + silo::SequenceStoreInfo sequence_store_info, + fmt::format_context& ctx +) -> decltype(ctx.out()) { + return format_to( + ctx.out(), + "SequenceStoreInfo[sequence count: {}, size: {}, N bitmaps size: {}]", + sequence_store_info.sequence_count, + sequence_store_info.size, + silo::formatNumber(sequence_store_info.n_bitmaps_size) + ); +} + silo::SequenceStoreInfo silo::SequenceStorePartition::getInfo() const { size_t n_bitmaps_size = 0; for (const auto& bitmap : nucleotide_symbol_n_bitmaps) { @@ -74,36 +91,43 @@ const roaring::Roaring* silo::SequenceStorePartition::getBitmap( void silo::SequenceStorePartition::fillIndexes(const std::vector& genomes) { const size_t genome_length = positions.size(); static constexpr int COUNT_SYMBOLS_PER_PROCESSOR = 64; - const tbb::blocked_range range( - 0, genome_length, genome_length / COUNT_SYMBOLS_PER_PROCESSOR - ); - tbb::parallel_for(range, [&](const decltype(range)& local) { - NucleotideSymbolMap> ids_per_symbol_for_current_position; - for (size_t position = local.begin(); position != local.end(); ++position) { - const size_t number_of_genomes = genomes.size(); - for (size_t sequence_id = 0; sequence_id < number_of_genomes; ++sequence_id) { - char const character = genomes[sequence_id][position]; - const auto symbol = charToNucleotideSymbol(character); - if (!symbol.has_value()) { - throw PreprocessingException( - "Illegal character " + std::to_string(character) + " contained in sequence." - ); - } - if (symbol != NUCLEOTIDE_SYMBOL::N) { - ids_per_symbol_for_current_position[*symbol].push_back(sequence_count + sequence_id); + tbb::parallel_for( + tbb::blocked_range(0, genome_length, genome_length / COUNT_SYMBOLS_PER_PROCESSOR), + [&](const auto& local) { + NucleotideSymbolMap> ids_per_symbol_for_current_position; + for (size_t position = local.begin(); position != local.end(); ++position) { + const size_t number_of_sequences = genomes.size(); + for (size_t sequence_id = 0; sequence_id < number_of_sequences; ++sequence_id) { + char const character = genomes[sequence_id][position]; + const auto symbol = charToNucleotideSymbol(character); + if (!symbol.has_value()) { + throw PreprocessingException( + "Illegal character " + std::to_string(character) + " contained in sequence." + ); + } + if (symbol != NUCLEOTIDE_SYMBOL::N) { + ids_per_symbol_for_current_position[*symbol].push_back( + sequence_count + sequence_id + ); + } } - } - for (const auto& symbol : NUC_SYMBOLS) { - if (!ids_per_symbol_for_current_position.at(symbol).empty()) { - this->positions[position].bitmaps[symbol].addMany( - ids_per_symbol_for_current_position.at(symbol).size(), - ids_per_symbol_for_current_position.at(symbol).data() - ); - ids_per_symbol_for_current_position[symbol].clear(); + for (const auto& symbol : NUC_SYMBOLS) { + if (!ids_per_symbol_for_current_position.at(symbol).empty()) { + positions[position].bitmaps[symbol].addMany( + ids_per_symbol_for_current_position.at(symbol).size(), + ids_per_symbol_for_current_position.at(symbol).data() + ); + ids_per_symbol_for_current_position[symbol].clear(); + } + if (symbol == positions[position].symbol_whose_bitmap_is_flipped) { + positions[position].bitmaps[symbol].flip( + sequence_count, sequence_count + number_of_sequences + ); + } } } } - }); + ); } void silo::SequenceStorePartition::fillNBitmaps(const std::vector& genomes) { @@ -118,8 +142,7 @@ void silo::SequenceStorePartition::fillNBitmaps(const std::vector& for (size_t genome = local.begin(); genome != local.end(); ++genome) { for (size_t position = 0; position < genome_length; ++position) { char const character = genomes[genome][position]; - const NUCLEOTIDE_SYMBOL symbol = - charToNucleotideSymbol(character).value_or(NUCLEOTIDE_SYMBOL::N); + const auto symbol = charToNucleotideSymbol(character); if (symbol == NUCLEOTIDE_SYMBOL::N) { positions_with_nucleotide_symbol_n.push_back(position); } @@ -141,12 +164,6 @@ void silo::SequenceStorePartition::interpret(const std::vector& gen sequence_count += genomes.size(); } -silo::SequenceStorePartition::SequenceStorePartition( - const std::vector& reference_genome -) - : reference_genome(reference_genome), - positions(reference_genome.size()) {} - size_t silo::SequenceStorePartition::computeSize() const { size_t result = 0; for (const auto& position : positions) { From b0c95a0f6158eb3475a7e0231834c6c3be33cc59 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Sat, 29 Jul 2023 12:08:55 +0200 Subject: [PATCH 02/10] refactor: correct case-style for DatabasePartition::sequence_count --- include/silo/storage/database_partition.h | 8 ++++---- src/silo/database.cpp | 4 ++-- .../filter_expressions/aa_symbol_equals.cpp | 6 +++--- src/silo/query_engine/filter_expressions/and.cpp | 16 ++++++++-------- .../filter_expressions/date_between.cpp | 4 ++-- .../query_engine/filter_expressions/false.cpp | 2 +- .../filter_expressions/float_between.cpp | 2 +- .../filter_expressions/float_equals.cpp | 4 ++-- .../filter_expressions/int_between.cpp | 2 +- .../filter_expressions/int_equals.cpp | 4 ++-- src/silo/query_engine/filter_expressions/nof.cpp | 8 ++++---- .../nucleotide_symbol_equals.cpp | 6 +++--- src/silo/query_engine/filter_expressions/or.cpp | 8 ++++---- .../filter_expressions/pango_lineage_filter.cpp | 6 +++--- .../filter_expressions/string_equals.cpp | 10 +++++----- .../query_engine/filter_expressions/true.cpp | 2 +- src/silo/storage/database_partition.cpp | 12 ++++++------ 17 files changed, 52 insertions(+), 52 deletions(-) diff --git a/include/silo/storage/database_partition.h b/include/silo/storage/database_partition.h index 4620d96ce..dfbb756b4 100644 --- a/include/silo/storage/database_partition.h +++ b/include/silo/storage/database_partition.h @@ -58,7 +58,7 @@ class DatabasePartition { for(auto& [name, store] : aa_sequences){ archive & store; } - archive & sequenceCount; + archive & sequence_count; // clang-format on } @@ -68,12 +68,12 @@ class DatabasePartition { DatabasePartition() = default; public: - explicit DatabasePartition(std::vector chunks); - storage::ColumnPartitionGroup columns; std::map nuc_sequences; std::map aa_sequences; - uint32_t sequenceCount; + uint32_t sequence_count; + + explicit DatabasePartition(std::vector chunks); void flipBitmaps(); diff --git a/src/silo/database.cpp b/src/silo/database.cpp index e65916e1e..0b6ead2dd 100644 --- a/src/silo/database.cpp +++ b/src/silo/database.cpp @@ -100,7 +100,7 @@ void Database::build( return; } SPDLOG_DEBUG("Using metadata file: {}", metadata_file.string()); - partitions[partition_index].sequenceCount = + partitions[partition_index].sequence_count = partitions[partition_index].columns.fill(metadata_file, database_config); } } @@ -163,7 +163,7 @@ DatabaseInfo Database::getDatabaseInfo() const { local_nucleotide_symbol_n_bitmaps_size += bitmap.getSizeInBytes(false); } } - sequence_count += database_partition.sequenceCount; + sequence_count += database_partition.sequence_count; total_size += local_total_size; nucleotide_symbol_n_bitmaps_size += local_nucleotide_symbol_n_bitmaps_size; } diff --git a/src/silo/query_engine/filter_expressions/aa_symbol_equals.cpp b/src/silo/query_engine/filter_expressions/aa_symbol_equals.cpp index 35263f8ee..a0bd6472a 100644 --- a/src/silo/query_engine/filter_expressions/aa_symbol_equals.cpp +++ b/src/silo/query_engine/filter_expressions/aa_symbol_equals.cpp @@ -63,13 +63,13 @@ std::unique_ptr AASymbolEquals::compile if (aa_store_partition.positions[position].symbol_whose_bitmap_is_flipped == aa_symbol) { return std::make_unique( std::make_unique( - aa_store_partition.getBitmap(position, aa_symbol), database_partition.sequenceCount + aa_store_partition.getBitmap(position, aa_symbol), database_partition.sequence_count ), - database_partition.sequenceCount + database_partition.sequence_count ); } return std::make_unique( - aa_store_partition.getBitmap(position, aa_symbol), database_partition.sequenceCount + aa_store_partition.getBitmap(position, aa_symbol), database_partition.sequence_count ); } diff --git a/src/silo/query_engine/filter_expressions/and.cpp b/src/silo/query_engine/filter_expressions/and.cpp index cded77b78..3575e8c10 100644 --- a/src/silo/query_engine/filter_expressions/and.cpp +++ b/src/silo/query_engine/filter_expressions/and.cpp @@ -125,7 +125,7 @@ std::tupletype() == operators::EMPTY) { SPDLOG_TRACE("Shortcutting because found empty child"); OperatorVector empty; - empty.emplace_back(std::make_unique(database_partition.sequenceCount)); + empty.emplace_back(std::make_unique(database_partition.sequence_count)); return { std::move(empty), OperatorVector(), @@ -177,10 +177,10 @@ std::unique_ptr And::compile( SPDLOG_TRACE( "Compiled And filter expression to Full, since no predicates and no child operators" ); - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } auto result = std::make_unique( - std::move(predicates), database_partition.sequenceCount + std::move(predicates), database_partition.sequence_count ); SPDLOG_TRACE( "Compiled And filter expression to {} - found only predicates", result->toString() @@ -194,20 +194,20 @@ std::unique_ptr And::compile( index_arithmetic_operator = std::move(non_negated_child_operators[0]); } else if (negated_child_operators.size() == 1 && non_negated_child_operators.empty()) { index_arithmetic_operator = std::make_unique( - std::move(negated_child_operators[0]), database_partition.sequenceCount + std::move(negated_child_operators[0]), database_partition.sequence_count ); } else if (non_negated_child_operators.empty()) { std::unique_ptr union_ret = std::make_unique( - std::move(negated_child_operators), database_partition.sequenceCount + std::move(negated_child_operators), database_partition.sequence_count ); index_arithmetic_operator = std::make_unique( - std::move(union_ret), database_partition.sequenceCount + std::move(union_ret), database_partition.sequence_count ); } else { index_arithmetic_operator = std::make_unique( std::move(non_negated_child_operators), std::move(negated_child_operators), - database_partition.sequenceCount + database_partition.sequence_count ); } if (predicates.empty()) { @@ -219,7 +219,7 @@ std::unique_ptr And::compile( return index_arithmetic_operator; } auto result = std::make_unique( - std::move(index_arithmetic_operator), std::move(predicates), database_partition.sequenceCount + std::move(index_arithmetic_operator), std::move(predicates), database_partition.sequence_count ); SPDLOG_TRACE("Compiled And filter expression to {}", result->toString()); diff --git a/src/silo/query_engine/filter_expressions/date_between.cpp b/src/silo/query_engine/filter_expressions/date_between.cpp index 80af583f4..d002eb284 100644 --- a/src/silo/query_engine/filter_expressions/date_between.cpp +++ b/src/silo/query_engine/filter_expressions/date_between.cpp @@ -72,13 +72,13 @@ std::unique_ptr DateBetween::compile( ) ); return std::make_unique( - std::move(predicates), database_partition.sequenceCount + std::move(predicates), database_partition.sequence_count ); } return std::make_unique( computeRangesOfSortedColumn(date_column, database_partition.getChunks()), - database_partition.sequenceCount + database_partition.sequence_count ); } diff --git a/src/silo/query_engine/filter_expressions/false.cpp b/src/silo/query_engine/filter_expressions/false.cpp index 4e94bff06..d008676c5 100644 --- a/src/silo/query_engine/filter_expressions/false.cpp +++ b/src/silo/query_engine/filter_expressions/false.cpp @@ -25,7 +25,7 @@ std::unique_ptr False::compile( const silo::DatabasePartition& database_partition, AmbiguityMode /*mode*/ ) const { - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/filter_expressions/float_between.cpp b/src/silo/query_engine/filter_expressions/float_between.cpp index 84ef33612..4d7dfb9b3 100644 --- a/src/silo/query_engine/filter_expressions/float_between.cpp +++ b/src/silo/query_engine/filter_expressions/float_between.cpp @@ -65,7 +65,7 @@ std::unique_ptr FloatBetween::compile( } return std::make_unique( - std::move(predicates), database_partition.sequenceCount + std::move(predicates), database_partition.sequence_count ); } diff --git a/src/silo/query_engine/filter_expressions/float_equals.cpp b/src/silo/query_engine/filter_expressions/float_equals.cpp index 5cf8c76a7..2ce0eb5ea 100644 --- a/src/silo/query_engine/filter_expressions/float_equals.cpp +++ b/src/silo/query_engine/filter_expressions/float_equals.cpp @@ -39,7 +39,7 @@ std::unique_ptr FloatEquals::compile( silo::query_engine::filter_expressions::Expression::AmbiguityMode /*mode*/ ) const { if (!database_partition.columns.float_columns.contains(column)) { - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } const auto& float_column = database_partition.columns.float_columns.at(column); @@ -48,7 +48,7 @@ std::unique_ptr FloatEquals::compile( std::make_unique>( float_column.getValues(), operators::Comparator::EQUALS, value ), - database_partition.sequenceCount + database_partition.sequence_count ); } diff --git a/src/silo/query_engine/filter_expressions/int_between.cpp b/src/silo/query_engine/filter_expressions/int_between.cpp index 5aa106805..19fb6e725 100644 --- a/src/silo/query_engine/filter_expressions/int_between.cpp +++ b/src/silo/query_engine/filter_expressions/int_between.cpp @@ -55,7 +55,7 @@ std::unique_ptr IntBetween::compile( } auto result = std::make_unique( - std::move(predicates), database_partition.sequenceCount + std::move(predicates), database_partition.sequence_count ); SPDLOG_TRACE("Compiled IntBetween filter expression to {}", result->toString()); diff --git a/src/silo/query_engine/filter_expressions/int_equals.cpp b/src/silo/query_engine/filter_expressions/int_equals.cpp index 3b8432cab..5761b163c 100644 --- a/src/silo/query_engine/filter_expressions/int_equals.cpp +++ b/src/silo/query_engine/filter_expressions/int_equals.cpp @@ -37,7 +37,7 @@ std::unique_ptr IntEquals::compile( Expression::AmbiguityMode /*mode*/ ) const { if (!database_partition.columns.int_columns.contains(column)) { - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } const auto& int_column = database_partition.columns.int_columns.at(column); @@ -46,7 +46,7 @@ std::unique_ptr IntEquals::compile( std::make_unique>( int_column.getValues(), operators::Comparator::EQUALS, value ), - database_partition.sequenceCount + database_partition.sequence_count ); } diff --git a/src/silo/query_engine/filter_expressions/nof.cpp b/src/silo/query_engine/filter_expressions/nof.cpp index 7f8bbde90..6e040eb99 100644 --- a/src/silo/query_engine/filter_expressions/nof.cpp +++ b/src/silo/query_engine/filter_expressions/nof.cpp @@ -232,7 +232,7 @@ std::unique_ptr NOf::rewriteNonExact( std::move(non_negated_child_operators), std::move(negated_child_operators), false, - database_partition.sequenceCount + database_partition.sequence_count )); } @@ -245,7 +245,7 @@ std::unique_ptr NOf::rewriteNonExact( std::move(non_negated_child_operators), std::move(negated_child_operators), false, - database_partition.sequenceCount + database_partition.sequence_count )); } @@ -254,7 +254,7 @@ std::unique_ptr NOf::rewriteNonExact( std::move(at_least_k), std::move(at_least_k_plus_one), false, - database_partition.sequenceCount + database_partition.sequence_count ); } @@ -276,7 +276,7 @@ std::unique_ptr NOf::compile( std::move(non_negated_child_operators), std::move(negated_child_operators), match_exactly, - database_partition.sequenceCount + database_partition.sequence_count ); } diff --git a/src/silo/query_engine/filter_expressions/nucleotide_symbol_equals.cpp b/src/silo/query_engine/filter_expressions/nucleotide_symbol_equals.cpp index 778d6e0de..e4edd9d84 100644 --- a/src/silo/query_engine/filter_expressions/nucleotide_symbol_equals.cpp +++ b/src/silo/query_engine/filter_expressions/nucleotide_symbol_equals.cpp @@ -144,13 +144,13 @@ std::unique_ptr NucleotideSymbolEquals: return std::make_unique( std::make_unique( seq_store_partition.getBitmap(position, nucleotide_symbol), - database_partition.sequenceCount + database_partition.sequence_count ), - database_partition.sequenceCount + database_partition.sequence_count ); } return std::make_unique( - seq_store_partition.getBitmap(position, nucleotide_symbol), database_partition.sequenceCount + seq_store_partition.getBitmap(position, nucleotide_symbol), database_partition.sequence_count ); } diff --git a/src/silo/query_engine/filter_expressions/or.cpp b/src/silo/query_engine/filter_expressions/or.cpp index c16d6a916..fd7c319a7 100644 --- a/src/silo/query_engine/filter_expressions/or.cpp +++ b/src/silo/query_engine/filter_expressions/or.cpp @@ -60,7 +60,7 @@ std::unique_ptr Or::compile( continue; } if (child->type() == operators::FULL) { - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } if (child->type() == operators::UNION) { auto* or_child = dynamic_cast(child.get()); @@ -74,7 +74,7 @@ std::unique_ptr Or::compile( filtered_child_operators.push_back(std::move(child)); } if (filtered_child_operators.empty()) { - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } if (filtered_child_operators.size() == 1) { return std::move(filtered_child_operators[0]); @@ -86,11 +86,11 @@ std::unique_ptr Or::compile( [](const auto& child) { return child->type() == operators::COMPLEMENT; } )) { return operators::Complement::fromDeMorgan( - std::move(filtered_child_operators), database_partition.sequenceCount + std::move(filtered_child_operators), database_partition.sequence_count ); } return std::make_unique( - std::move(filtered_child_operators), database_partition.sequenceCount + std::move(filtered_child_operators), database_partition.sequence_count ); } diff --git a/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp b/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp index ee0b9b677..8fb3c07b3 100644 --- a/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp +++ b/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp @@ -47,7 +47,7 @@ std::unique_ptr PangoLineageFilter::com AmbiguityMode /*mode*/ ) const { if (!database_partition.columns.pango_lineage_columns.contains(column)) { - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } std::string lineage_all_upper = lineage; @@ -60,9 +60,9 @@ std::unique_ptr PangoLineageFilter::com ? pango_lineage_column.filterIncludingSublineages({lineage_all_upper}) : pango_lineage_column.filter({lineage_all_upper}); if (bitmap == std::nullopt) { - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } - return std::make_unique(bitmap.value(), database_partition.sequenceCount); + return std::make_unique(bitmap.value(), database_partition.sequence_count); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/filter_expressions/string_equals.cpp b/src/silo/query_engine/filter_expressions/string_equals.cpp index d3996efb8..3c3de046d 100644 --- a/src/silo/query_engine/filter_expressions/string_equals.cpp +++ b/src/silo/query_engine/filter_expressions/string_equals.cpp @@ -45,10 +45,10 @@ std::unique_ptr StringEquals::compile( const roaring::Roaring& bitmap = string_column.filter(value); if (bitmap.isEmpty()) { - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } return std::make_unique( - new roaring::Roaring(bitmap), database_partition.sequenceCount + new roaring::Roaring(bitmap), database_partition.sequence_count ); } @@ -60,13 +60,13 @@ std::unique_ptr StringEquals::compile( std::make_unique>( string_column.getValues(), operators::Comparator::EQUALS, embedded_string.value() ), - database_partition.sequenceCount + database_partition.sequence_count ); } - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/filter_expressions/true.cpp b/src/silo/query_engine/filter_expressions/true.cpp index 6ec6f10c1..4f9433ab9 100644 --- a/src/silo/query_engine/filter_expressions/true.cpp +++ b/src/silo/query_engine/filter_expressions/true.cpp @@ -26,7 +26,7 @@ std::unique_ptr True::compile( const silo::DatabasePartition& database_partition, Expression::AmbiguityMode /*mode*/ ) const { - return std::make_unique(database_partition.sequenceCount); + return std::make_unique(database_partition.sequence_count); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/storage/database_partition.cpp b/src/silo/storage/database_partition.cpp index efd6d0eaa..82874c5eb 100644 --- a/src/silo/storage/database_partition.cpp +++ b/src/silo/storage/database_partition.cpp @@ -34,7 +34,7 @@ void DatabasePartition::flipBitmaps() { roaring::Roaring bitmap = positions[position].bitmaps.at(symbol); bitmap.runOptimize(); const uint32_t count = flipped_bitmap_before == symbol - ? sequenceCount - bitmap.cardinality() + ? sequence_count - bitmap.cardinality() : bitmap.cardinality(); if (count > max_count) { max_symbol = symbol; @@ -43,10 +43,10 @@ void DatabasePartition::flipBitmaps() { } if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { if (flipped_bitmap_before.has_value()) { - positions[position].bitmaps[*flipped_bitmap_before].flip(0, sequenceCount); + positions[position].bitmaps[*flipped_bitmap_before].flip(0, sequence_count); } positions[position].symbol_whose_bitmap_is_flipped = max_symbol; - positions[position].bitmaps[*max_symbol].flip(0, sequenceCount); + positions[position].bitmaps[*max_symbol].flip(0, sequence_count); } } }); @@ -64,7 +64,7 @@ void DatabasePartition::flipBitmaps() { roaring::Roaring bitmap = positions[position].bitmaps.at(symbol); bitmap.runOptimize(); const uint32_t count = flipped_bitmap_before == symbol - ? sequenceCount - bitmap.cardinality() + ? sequence_count - bitmap.cardinality() : bitmap.cardinality(); if (count > max_count) { max_symbol = symbol; @@ -73,10 +73,10 @@ void DatabasePartition::flipBitmaps() { } if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { if (flipped_bitmap_before.has_value()) { - positions[position].bitmaps[*flipped_bitmap_before].flip(0, sequenceCount); + positions[position].bitmaps[*flipped_bitmap_before].flip(0, sequence_count); } positions[position].symbol_whose_bitmap_is_flipped = max_symbol; - positions[position].bitmaps[*max_symbol].flip(0, sequenceCount); + positions[position].bitmaps[*max_symbol].flip(0, sequence_count); } } }); From 776d0928fe068561ee6b316c36278612eb3438ba Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Sat, 29 Jul 2023 12:09:09 +0200 Subject: [PATCH 03/10] refactor: minor test fixes --- endToEndTests/test/info.test.js | 26 ++++++++++++------------- src/silo/database.test.cpp | 14 ++++++------- src/silo/prepare_dataset.cpp | 1 + src/silo/storage/database_partition.cpp | 2 ++ 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/endToEndTests/test/info.test.js b/endToEndTests/test/info.test.js index 32a62a04b..302ba175f 100644 --- a/endToEndTests/test/info.test.js +++ b/endToEndTests/test/info.test.js @@ -7,7 +7,7 @@ describe('The /info endpoint', () => { .get('/info') .expect(200) .expect('Content-Type', 'application/json') - .expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 60057070 }) + .expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 60074145 }) .end(done); }); @@ -26,15 +26,15 @@ describe('The /info endpoint', () => { 'bitmapContainerSizeStatistic' ); expect(returnedInfo.bitmapContainerSizePerGenomeSection.bitmapContainerSizeStatistic).to.deep.equal({ - numberOfArrayContainers: 43623, - numberOfBitsetContainers: 0, - numberOfRunContainers: 0, - numberOfValuesStoredInArrayContainers: 61931, + numberOfArrayContainers: 47970, + numberOfBitsetContainers: 209, + numberOfRunContainers: 209, + numberOfValuesStoredInArrayContainers: 64283, numberOfValuesStoredInBitsetContainers: 0, - numberOfValuesStoredInRunContainers: 0, - totalBitmapSizeArrayContainers: 123862, + numberOfValuesStoredInRunContainers: 2410, + totalBitmapSizeArrayContainers: 128566, totalBitmapSizeBitsetContainers: 0, - totalBitmapSizeRunContainers: 0, + totalBitmapSizeRunContainers: 3538, }); expect(returnedInfo.bitmapContainerSizePerGenomeSection).to.have.property( @@ -61,19 +61,19 @@ describe('The /info endpoint', () => { expect(returnedInfo).to.have.property('bitmapSizePerSymbol'); expect(returnedInfo.bitmapSizePerSymbol).to.deep.equal({ - '-': 6005380, - 'A': 6112762, + '-': 6003470, + 'A': 6127203, 'B': 5980600, - 'C': 6064610, + 'C': 6073069, 'D': 5980600, - 'G': 6067732, + 'G': 6075909, 'H': 5980600, 'K': 5980630, 'M': 5980620, 'N': 5980600, 'R': 5980620, 'S': 5980600, - 'T': 6125272, + 'T': 6139332, 'V': 5980600, 'W': 5980600, 'Y': 5980620, diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp index 3832f4ce6..e5c669f23 100644 --- a/src/silo/database.test.cpp +++ b/src/silo/database.test.cpp @@ -40,10 +40,10 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { const auto simple_info = database.getDatabaseInfo(); EXPECT_EQ( - detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::A), 6112762 + detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::A), 6127203 ); EXPECT_EQ( - detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::GAP), 6005380 + detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::GAP), 6003470 ); EXPECT_EQ( @@ -54,7 +54,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .number_of_values_stored_in_run_containers, - 0 + 2410 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic @@ -63,18 +63,18 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 96162446 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 96205673 ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 48186777 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 48217381 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .total_bitmap_size_array_containers, - 123862 + 128566 ); - EXPECT_EQ(simple_info.total_size, 60057070); + EXPECT_EQ(simple_info.total_size, 60074145); EXPECT_EQ(simple_info.sequence_count, 100); EXPECT_EQ(simple_info.n_bitmaps_size, 3898); } diff --git a/src/silo/prepare_dataset.cpp b/src/silo/prepare_dataset.cpp index 43f69c91b..9a4bc43da 100644 --- a/src/silo/prepare_dataset.cpp +++ b/src/silo/prepare_dataset.cpp @@ -247,6 +247,7 @@ std::unordered_map sortMetadataFile( rows.reserve(chunk.size); for (auto& row : metadata_reader.reader) { + std::this_thread::sleep_for(std::chrono::nanoseconds(2)); std::string primary_key(row[sort_chunk_config.primary_key_name].get_sv()); const std::string date_str(row[sort_chunk_config.date_column_to_sort_by].get_sv()); diff --git a/src/silo/storage/database_partition.cpp b/src/silo/storage/database_partition.cpp index 82874c5eb..7d350358d 100644 --- a/src/silo/storage/database_partition.cpp +++ b/src/silo/storage/database_partition.cpp @@ -47,6 +47,7 @@ void DatabasePartition::flipBitmaps() { } positions[position].symbol_whose_bitmap_is_flipped = max_symbol; positions[position].bitmaps[*max_symbol].flip(0, sequence_count); + positions[position].bitmaps[*max_symbol].runOptimize(); } } }); @@ -77,6 +78,7 @@ void DatabasePartition::flipBitmaps() { } positions[position].symbol_whose_bitmap_is_flipped = max_symbol; positions[position].bitmaps[*max_symbol].flip(0, sequence_count); + positions[position].bitmaps[*max_symbol].runOptimize(); } } }); From 8d72eb419b89edd216dc255ce4d61580dff7ff78 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Thu, 3 Aug 2023 11:50:36 +0200 Subject: [PATCH 04/10] refactor: merge main into branch --- .../filter_expressions/insertion_contains.cpp | 2 +- src/silo/storage/aa_store.cpp | 29 +++++-------------- src/silo/storage/database_partition.cpp | 4 +-- src/silo/storage/sequence_store.cpp | 10 +++++-- 4 files changed, 19 insertions(+), 26 deletions(-) diff --git a/src/silo/query_engine/filter_expressions/insertion_contains.cpp b/src/silo/query_engine/filter_expressions/insertion_contains.cpp index 2c3a515d9..4aa519b82 100644 --- a/src/silo/query_engine/filter_expressions/insertion_contains.cpp +++ b/src/silo/query_engine/filter_expressions/insertion_contains.cpp @@ -47,7 +47,7 @@ std::unique_ptr InsertionContains::comp auto search_result = insertion_column.search(position, value); return OperatorResult(std::move(*search_result.release())); }, - database_partition.sequenceCount + database_partition.sequence_count ); } diff --git a/src/silo/storage/aa_store.cpp b/src/silo/storage/aa_store.cpp index 6a1809009..0bad0ff52 100644 --- a/src/silo/storage/aa_store.cpp +++ b/src/silo/storage/aa_store.cpp @@ -22,21 +22,27 @@ silo::AAPosition::AAPosition(std::optional symbol) { } void silo::AAPosition::flipMostNumerousBitmap(uint32_t sequence_count) { + std::optional flipped_bitmap_before = symbol_whose_bitmap_is_flipped; std::optional max_symbol = std::nullopt; uint32_t max_count = 0; for (const auto& symbol : AA_SYMBOLS) { roaring::Roaring bitmap = bitmaps.at(symbol); bitmap.runOptimize(); - const uint32_t count = bitmap.cardinality(); + const uint32_t count = flipped_bitmap_before == symbol ? sequence_count - bitmap.cardinality() + : bitmap.cardinality(); if (count > max_count) { max_symbol = symbol; max_count = count; } } - if (max_symbol.has_value()) { + if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { + if (flipped_bitmap_before.has_value()) { + bitmaps[*flipped_bitmap_before].flip(0, sequence_count); + } symbol_whose_bitmap_is_flipped = max_symbol; bitmaps[*max_symbol].flip(0, sequence_count); + bitmaps[*max_symbol].runOptimize(); } } @@ -47,25 +53,6 @@ silo::AAStorePartition::AAStorePartition(const std::vector& reference } } -void silo::AAPosition::flipMostNumerousBitmap(uint32_t sequence_count) { - std::optional max_symbol = std::nullopt; - uint32_t max_count = 0; - - for (const auto& symbol : AA_SYMBOLS) { - roaring::Roaring bitmap = bitmaps.at(symbol); - bitmap.runOptimize(); - const uint32_t count = bitmap.cardinality(); - if (count > max_count) { - max_symbol = symbol; - max_count = count; - } - } - if (max_symbol.has_value()) { - symbol_whose_bitmap_is_flipped = max_symbol; - bitmaps[*max_symbol].flip(0, sequence_count); - } -} - size_t silo::AAStorePartition::fill(silo::ZstdFastaReader& input_file) { static constexpr size_t BUFFER_SIZE = 1024; diff --git a/src/silo/storage/database_partition.cpp b/src/silo/storage/database_partition.cpp index 10d804bf0..8baa65f1e 100644 --- a/src/silo/storage/database_partition.cpp +++ b/src/silo/storage/database_partition.cpp @@ -25,7 +25,7 @@ void DatabasePartition::flipBitmaps() { auto& positions = seq_store.positions; tbb::parallel_for(tbb::blocked_range(0, positions.size()), [&](const auto& local) { for (auto position = local.begin(); position != local.end(); ++position) { - positions[position].flipMostNumerousBitmap(sequenceCount); + positions[position].flipMostNumerousBitmap(sequence_count); } }); } @@ -33,7 +33,7 @@ void DatabasePartition::flipBitmaps() { auto& positions = seq_store.positions; tbb::parallel_for(tbb::blocked_range(0, positions.size()), [&](const auto& local) { for (auto position = local.begin(); position != local.end(); ++position) { - positions[position].flipMostNumerousBitmap(sequenceCount); + positions[position].flipMostNumerousBitmap(sequence_count); } }); } diff --git a/src/silo/storage/sequence_store.cpp b/src/silo/storage/sequence_store.cpp index 8007b8664..d568ad242 100644 --- a/src/silo/storage/sequence_store.cpp +++ b/src/silo/storage/sequence_store.cpp @@ -24,21 +24,27 @@ silo::NucPosition::NucPosition(std::optional symbol) { } void silo::NucPosition::flipMostNumerousBitmap(uint32_t sequence_count) { + std::optional flipped_bitmap_before = symbol_whose_bitmap_is_flipped; std::optional max_symbol = std::nullopt; uint32_t max_count = 0; for (const auto& symbol : NUC_SYMBOLS) { roaring::Roaring bitmap = bitmaps.at(symbol); bitmap.runOptimize(); - const uint32_t count = bitmap.cardinality(); + const uint32_t count = flipped_bitmap_before == symbol ? sequence_count - bitmap.cardinality() + : bitmap.cardinality(); if (count > max_count) { max_symbol = symbol; max_count = count; } } - if (max_symbol.has_value()) { + if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { + if (flipped_bitmap_before.has_value()) { + bitmaps[*flipped_bitmap_before].flip(0, sequence_count); + } symbol_whose_bitmap_is_flipped = max_symbol; bitmaps[*max_symbol].flip(0, sequence_count); + bitmaps[*max_symbol].runOptimize(); } } From 34830ab0f0818c07f19ad28f8aa12b4229018fca Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Thu, 3 Aug 2023 11:59:55 +0200 Subject: [PATCH 05/10] fix: linter --- endToEndTests/test/query.test.js | 18 +++++++++--------- src/silo/storage/aa_store.cpp | 2 +- src/silo/storage/sequence_store.cpp | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/endToEndTests/test/query.test.js b/endToEndTests/test/query.test.js index 85cc446bd..2d276384b 100644 --- a/endToEndTests/test/query.test.js +++ b/endToEndTests/test/query.test.js @@ -12,15 +12,15 @@ describe('The /query endpoint', () => { testCases.forEach(testCase => it('should return data for the test case ' + testCase.testCaseName, async () => { - let response = await server.post('/query').send(testCase.query); - try { - expect(response.status).to.equal(200); - expect(response.header['content-type']).to.equal('application/json'); - } catch (error) { - console.error('Error in response header! Error body:', response.body); - throw error; - } - expect(response.body.queryResult).to.deep.equal(testCase.expectedQueryResult); + let response = await server.post('/query').send(testCase.query); + try { + expect(response.status).to.equal(200); + expect(response.header['content-type']).to.equal('application/json'); + } catch (error) { + console.error('Error in response header! Error body:', response.body); + throw error; + } + expect(response.body.queryResult).to.deep.equal(testCase.expectedQueryResult); }) ); diff --git a/src/silo/storage/aa_store.cpp b/src/silo/storage/aa_store.cpp index 0bad0ff52..ba27183ae 100644 --- a/src/silo/storage/aa_store.cpp +++ b/src/silo/storage/aa_store.cpp @@ -48,7 +48,7 @@ void silo::AAPosition::flipMostNumerousBitmap(uint32_t sequence_count) { silo::AAStorePartition::AAStorePartition(const std::vector& reference_sequence) : reference_sequence(reference_sequence) { - for (AA_SYMBOL symbol : reference_sequence) { + for (const AA_SYMBOL symbol : reference_sequence) { positions.emplace_back(symbol); } } diff --git a/src/silo/storage/sequence_store.cpp b/src/silo/storage/sequence_store.cpp index d568ad242..c7e20e849 100644 --- a/src/silo/storage/sequence_store.cpp +++ b/src/silo/storage/sequence_store.cpp @@ -52,7 +52,7 @@ silo::SequenceStorePartition::SequenceStorePartition( const std::vector& reference_genome ) : reference_genome(reference_genome) { - for (NUCLEOTIDE_SYMBOL symbol : reference_genome) { + for (const NUCLEOTIDE_SYMBOL symbol : reference_genome) { positions.emplace_back(symbol); } } From b65375320612625fd1a7325878174a1e2d621eda Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Thu, 3 Aug 2023 12:27:01 +0200 Subject: [PATCH 06/10] fix: no longer have regression when no bitmap flipped is most efficient --- endToEndTests/test/info.test.js | 24 +++++++------- src/silo/database.test.cpp | 12 +++---- src/silo/storage/aa_store.cpp | 51 ++++++----------------------- src/silo/storage/sequence_store.cpp | 44 ++++++------------------- src/silo_api/info_handler.cpp | 2 +- 5 files changed, 39 insertions(+), 94 deletions(-) diff --git a/endToEndTests/test/info.test.js b/endToEndTests/test/info.test.js index 302ba175f..aec21d8c5 100644 --- a/endToEndTests/test/info.test.js +++ b/endToEndTests/test/info.test.js @@ -7,7 +7,7 @@ describe('The /info endpoint', () => { .get('/info') .expect(200) .expect('Content-Type', 'application/json') - .expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 60074145 }) + .expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 60055044 }) .end(done); }); @@ -26,15 +26,15 @@ describe('The /info endpoint', () => { 'bitmapContainerSizeStatistic' ); expect(returnedInfo.bitmapContainerSizePerGenomeSection.bitmapContainerSizeStatistic).to.deep.equal({ - numberOfArrayContainers: 47970, - numberOfBitsetContainers: 209, - numberOfRunContainers: 209, - numberOfValuesStoredInArrayContainers: 64283, + numberOfArrayContainers: 43545, + numberOfBitsetContainers: 0, + numberOfRunContainers: 78, + numberOfValuesStoredInArrayContainers: 59694, numberOfValuesStoredInBitsetContainers: 0, - numberOfValuesStoredInRunContainers: 2410, - totalBitmapSizeArrayContainers: 128566, + numberOfValuesStoredInRunContainers: 2237, + totalBitmapSizeArrayContainers: 119388, totalBitmapSizeBitsetContainers: 0, - totalBitmapSizeRunContainers: 3538, + totalBitmapSizeRunContainers: 2964, }); expect(returnedInfo.bitmapContainerSizePerGenomeSection).to.have.property( @@ -62,18 +62,18 @@ describe('The /info endpoint', () => { expect(returnedInfo).to.have.property('bitmapSizePerSymbol'); expect(returnedInfo.bitmapSizePerSymbol).to.deep.equal({ '-': 6003470, - 'A': 6127203, + 'A': 6112681, 'B': 5980600, - 'C': 6073069, + 'C': 6064603, 'D': 5980600, - 'G': 6075909, + 'G': 6067693, 'H': 5980600, 'K': 5980630, 'M': 5980620, 'N': 5980600, 'R': 5980620, 'S': 5980600, - 'T': 6139332, + 'T': 6125253, 'V': 5980600, 'W': 5980600, 'Y': 5980620, diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp index e5c669f23..8e692e733 100644 --- a/src/silo/database.test.cpp +++ b/src/silo/database.test.cpp @@ -40,7 +40,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { const auto simple_info = database.getDatabaseInfo(); EXPECT_EQ( - detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::A), 6127203 + detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::A), 6112681 ); EXPECT_EQ( detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::GAP), 6003470 @@ -54,7 +54,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .number_of_values_stored_in_run_containers, - 2410 + 2237 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic @@ -63,18 +63,18 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 96205673 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 96160390 ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 48217381 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 48185111 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .total_bitmap_size_array_containers, - 128566 + 119388 ); - EXPECT_EQ(simple_info.total_size, 60074145); + EXPECT_EQ(simple_info.total_size, 60055044); EXPECT_EQ(simple_info.sequence_count, 100); EXPECT_EQ(simple_info.n_bitmaps_size, 3898); } diff --git a/src/silo/storage/aa_store.cpp b/src/silo/storage/aa_store.cpp index ba27183ae..5cf23bd2e 100644 --- a/src/silo/storage/aa_store.cpp +++ b/src/silo/storage/aa_store.cpp @@ -29,6 +29,7 @@ void silo::AAPosition::flipMostNumerousBitmap(uint32_t sequence_count) { for (const auto& symbol : AA_SYMBOLS) { roaring::Roaring bitmap = bitmaps.at(symbol); bitmap.runOptimize(); + bitmap.shrinkToFit(); const uint32_t count = flipped_bitmap_before == symbol ? sequence_count - bitmap.cardinality() : bitmap.cardinality(); if (count > max_count) { @@ -36,13 +37,18 @@ void silo::AAPosition::flipMostNumerousBitmap(uint32_t sequence_count) { max_count = count; } } - if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { + if (max_symbol != flipped_bitmap_before) { if (flipped_bitmap_before.has_value()) { bitmaps[*flipped_bitmap_before].flip(0, sequence_count); + bitmaps[*flipped_bitmap_before].runOptimize(); + bitmaps[*flipped_bitmap_before].shrinkToFit(); + } + if (max_symbol.has_value()) { + bitmaps[*max_symbol].flip(0, sequence_count); + bitmaps[*max_symbol].runOptimize(); + bitmaps[*max_symbol].shrinkToFit(); } symbol_whose_bitmap_is_flipped = max_symbol; - bitmaps[*max_symbol].flip(0, sequence_count); - bitmaps[*max_symbol].runOptimize(); } } @@ -148,6 +154,7 @@ void silo::AAStorePartition::fillXBitmaps(const std::vector& sequen positions_with_aa_symbol_x.size(), positions_with_aa_symbol_x.data() ); aa_symbol_x_bitmaps[sequence_count + sequence_id].runOptimize(); + aa_symbol_x_bitmaps[sequence_count + sequence_id].shrinkToFit(); positions_with_aa_symbol_x.clear(); } } @@ -160,44 +167,6 @@ void silo::AAStorePartition::interpret(const std::vector& sequences sequence_count += sequences.size(); } -size_t silo::AAStorePartition::computeSize() const { - size_t result = 0; - for (const auto& position : positions) { - for (const AA_SYMBOL symbol : AA_SYMBOLS) { - result += position.bitmaps.at(symbol).getSizeInBytes(false); - } - } - return result; -} - -size_t silo::AAStorePartition::runOptimize() { - std::atomic count_true = 0; - tbb::parallel_for(tbb::blocked_range(0U, positions.size()), [&](const auto& local) { - for (auto position = local.begin(); position != local.end(); ++position) { - for (const AA_SYMBOL symbol : AA_SYMBOLS) { - if (positions[position].bitmaps[symbol].runOptimize()) { - ++count_true; - } - } - } - }); - return count_true; -} - -size_t silo::AAStorePartition::shrinkToFit() { - std::atomic saved = 0; - tbb::parallel_for(tbb::blocked_range(0U, positions.size()), [&](const auto& local) { - size_t local_saved = 0; - for (auto position = local.begin(); position != local.end(); ++position) { - for (const AA_SYMBOL symbol : AA_SYMBOLS) { - local_saved += positions[position].bitmaps[symbol].shrinkToFit(); - } - } - saved += local_saved; - }); - return saved; -} - silo::AAStore::AAStore(std::vector reference_sequence) : reference_sequence(std::move(reference_sequence)) {} diff --git a/src/silo/storage/sequence_store.cpp b/src/silo/storage/sequence_store.cpp index c7e20e849..9c5befd9b 100644 --- a/src/silo/storage/sequence_store.cpp +++ b/src/silo/storage/sequence_store.cpp @@ -29,8 +29,9 @@ void silo::NucPosition::flipMostNumerousBitmap(uint32_t sequence_count) { uint32_t max_count = 0; for (const auto& symbol : NUC_SYMBOLS) { - roaring::Roaring bitmap = bitmaps.at(symbol); + roaring::Roaring& bitmap = bitmaps[symbol]; bitmap.runOptimize(); + bitmap.shrinkToFit(); const uint32_t count = flipped_bitmap_before == symbol ? sequence_count - bitmap.cardinality() : bitmap.cardinality(); if (count > max_count) { @@ -38,13 +39,18 @@ void silo::NucPosition::flipMostNumerousBitmap(uint32_t sequence_count) { max_count = count; } } - if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { + if (max_symbol != flipped_bitmap_before) { if (flipped_bitmap_before.has_value()) { bitmaps[*flipped_bitmap_before].flip(0, sequence_count); + bitmaps[*flipped_bitmap_before].runOptimize(); + bitmaps[*flipped_bitmap_before].shrinkToFit(); + } + if (max_symbol.has_value()) { + bitmaps[*max_symbol].flip(0, sequence_count); + bitmaps[*max_symbol].runOptimize(); + bitmaps[*max_symbol].shrinkToFit(); } symbol_whose_bitmap_is_flipped = max_symbol; - bitmaps[*max_symbol].flip(0, sequence_count); - bitmaps[*max_symbol].runOptimize(); } } @@ -199,36 +205,6 @@ size_t silo::SequenceStorePartition::computeSize() const { return result; } -size_t silo::SequenceStorePartition::runOptimize() { - std::atomic count_true = 0; - const tbb::blocked_range range(0U, positions.size()); - tbb::parallel_for(range, [&](const decltype(range) local) { - for (auto position = local.begin(); position != local.end(); ++position) { - for (const NUCLEOTIDE_SYMBOL symbol : NUC_SYMBOLS) { - if (positions[position].bitmaps[symbol].runOptimize()) { - ++count_true; - } - } - } - }); - return count_true; -} - -size_t silo::SequenceStorePartition::shrinkToFit() { - std::atomic saved = 0; - const tbb::blocked_range range(0U, positions.size()); - tbb::parallel_for(range, [&](const decltype(range) local) { - size_t local_saved = 0; - for (auto position = local.begin(); position != local.end(); ++position) { - for (const NUCLEOTIDE_SYMBOL symbol : NUC_SYMBOLS) { - local_saved += positions[position].bitmaps[symbol].shrinkToFit(); - } - } - saved += local_saved; - }); - return saved; -} - silo::SequenceStore::SequenceStore(std::vector reference_genome) : reference_genome(std::move(reference_genome)) {} diff --git a/src/silo_api/info_handler.cpp b/src/silo_api/info_handler.cpp index 2e3fb6158..592125d51 100644 --- a/src/silo_api/info_handler.cpp +++ b/src/silo_api/info_handler.cpp @@ -25,7 +25,7 @@ void to_json(nlohmann::json& json, const BitmapContainerSizeStatistic& statistic json = nlohmann::json{ {"numberOfArrayContainers", statistics.number_of_array_containers}, {"numberOfRunContainers", statistics.number_of_run_containers}, - {"numberOfBitsetContainers", statistics.number_of_run_containers}, + {"numberOfBitsetContainers", statistics.number_of_bitset_containers}, {"numberOfValuesStoredInArrayContainers", statistics.number_of_values_stored_in_array_containers}, {"numberOfValuesStoredInRunContainers", statistics.number_of_values_stored_in_run_containers}, From ffcefd0fb82345dddce170c0254fb20a167e18ba Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Thu, 3 Aug 2023 13:57:54 +0200 Subject: [PATCH 07/10] fix: divergence between mac and linux info test results, fix memory leaks in Threshold.cpp --- endToEndTests/test/info.test.js | 20 +++++++++---------- src/silo/database.test.cpp | 12 +++++------ src/silo/query_engine/operators/threshold.cpp | 6 ++---- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/endToEndTests/test/info.test.js b/endToEndTests/test/info.test.js index aec21d8c5..a266c9cb7 100644 --- a/endToEndTests/test/info.test.js +++ b/endToEndTests/test/info.test.js @@ -7,7 +7,7 @@ describe('The /info endpoint', () => { .get('/info') .expect(200) .expect('Content-Type', 'application/json') - .expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 60055044 }) + .expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 60054981 }) .end(done); }); @@ -26,15 +26,15 @@ describe('The /info endpoint', () => { 'bitmapContainerSizeStatistic' ); expect(returnedInfo.bitmapContainerSizePerGenomeSection.bitmapContainerSizeStatistic).to.deep.equal({ - numberOfArrayContainers: 43545, + numberOfArrayContainers: 43540, numberOfBitsetContainers: 0, - numberOfRunContainers: 78, - numberOfValuesStoredInArrayContainers: 59694, + numberOfRunContainers: 83, + numberOfValuesStoredInArrayContainers: 59577, numberOfValuesStoredInBitsetContainers: 0, - numberOfValuesStoredInRunContainers: 2237, - totalBitmapSizeArrayContainers: 119388, + numberOfValuesStoredInRunContainers: 2354, + totalBitmapSizeArrayContainers: 119154, totalBitmapSizeBitsetContainers: 0, - totalBitmapSizeRunContainers: 2964, + totalBitmapSizeRunContainers: 3170, }); expect(returnedInfo.bitmapContainerSizePerGenomeSection).to.have.property( @@ -62,11 +62,11 @@ describe('The /info endpoint', () => { expect(returnedInfo).to.have.property('bitmapSizePerSymbol'); expect(returnedInfo.bitmapSizePerSymbol).to.deep.equal({ '-': 6003470, - 'A': 6112681, + 'A': 6112653, 'B': 5980600, - 'C': 6064603, + 'C': 6064589, 'D': 5980600, - 'G': 6067693, + 'G': 6067672, 'H': 5980600, 'K': 5980630, 'M': 5980620, diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp index 8e692e733..e67e5b5e2 100644 --- a/src/silo/database.test.cpp +++ b/src/silo/database.test.cpp @@ -40,7 +40,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { const auto simple_info = database.getDatabaseInfo(); EXPECT_EQ( - detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::A), 6112681 + detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::A), 6112653 ); EXPECT_EQ( detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::GAP), 6003470 @@ -54,7 +54,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .number_of_values_stored_in_run_containers, - 2237 + 2354 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic @@ -63,18 +63,18 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 96160390 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 96160327 ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 48185111 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 48185073 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .total_bitmap_size_array_containers, - 119388 + 119154 ); - EXPECT_EQ(simple_info.total_size, 60055044); + EXPECT_EQ(simple_info.total_size, 60054981); EXPECT_EQ(simple_info.sequence_count, 100); EXPECT_EQ(simple_info.n_bitmaps_size, 3898); } diff --git a/src/silo/query_engine/operators/threshold.cpp b/src/silo/query_engine/operators/threshold.cpp index 2af5c39e0..5b31dd710 100644 --- a/src/silo/query_engine/operators/threshold.cpp +++ b/src/silo/query_engine/operators/threshold.cpp @@ -123,10 +123,8 @@ OperatorResult Threshold::evaluate() const { partition_bitmaps[j] |= partition_bitmaps[j - 1] - *bitmap; } if (k - i > n - 1) { - roaring::api::roaring_bitmap_or_inplace( - &partition_bitmaps[0].roaring, - roaring::api::roaring_bitmap_flip(&bitmap->roaring, 0, row_count) - ); + bitmap->flip(0, row_count); + partition_bitmaps[0] |= *bitmap; } } // NOLINTEND(readability-identifier-length) From c2eeac88ec688f25e65409490dcf68b954e53789 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Thu, 3 Aug 2023 14:05:41 +0200 Subject: [PATCH 08/10] fix: fix memory leaks in indexed_string_column.cpp and insertion_contains.cpp --- include/silo/storage/column/indexed_string_column.h | 2 +- .../query_engine/filter_expressions/insertion_contains.cpp | 2 +- src/silo/query_engine/filter_expressions/string_equals.cpp | 6 +++--- src/silo/storage/column/indexed_string_column.cpp | 7 ++++--- src/silo/storage/column/indexed_string_column.test.cpp | 6 +++--- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/silo/storage/column/indexed_string_column.h b/include/silo/storage/column/indexed_string_column.h index fbfcec727..e33171147 100644 --- a/include/silo/storage/column/indexed_string_column.h +++ b/include/silo/storage/column/indexed_string_column.h @@ -40,7 +40,7 @@ class IndexedStringColumnPartition { public: explicit IndexedStringColumnPartition(common::BidirectionalMap& lookup); - [[nodiscard]] roaring::Roaring filter(const std::string& value) const; + [[nodiscard]] std::optional filter(const std::string& value) const; void insert(const std::string& value); diff --git a/src/silo/query_engine/filter_expressions/insertion_contains.cpp b/src/silo/query_engine/filter_expressions/insertion_contains.cpp index 4aa519b82..a143cc006 100644 --- a/src/silo/query_engine/filter_expressions/insertion_contains.cpp +++ b/src/silo/query_engine/filter_expressions/insertion_contains.cpp @@ -45,7 +45,7 @@ std::unique_ptr InsertionContains::comp return std::make_unique( [&]() { auto search_result = insertion_column.search(position, value); - return OperatorResult(std::move(*search_result.release())); + return OperatorResult(std::move(*search_result)); }, database_partition.sequence_count ); diff --git a/src/silo/query_engine/filter_expressions/string_equals.cpp b/src/silo/query_engine/filter_expressions/string_equals.cpp index 3c3de046d..25cfae0de 100644 --- a/src/silo/query_engine/filter_expressions/string_equals.cpp +++ b/src/silo/query_engine/filter_expressions/string_equals.cpp @@ -42,13 +42,13 @@ std::unique_ptr StringEquals::compile( ) const { if (database_partition.columns.indexed_string_columns.contains(column)) { const auto& string_column = database_partition.columns.indexed_string_columns.at(column); - const roaring::Roaring& bitmap = string_column.filter(value); + const auto bitmap = string_column.filter(value); - if (bitmap.isEmpty()) { + if (bitmap == std::nullopt || bitmap.value()->isEmpty()) { return std::make_unique(database_partition.sequence_count); } return std::make_unique( - new roaring::Roaring(bitmap), database_partition.sequence_count + bitmap.value(), database_partition.sequence_count ); } diff --git a/src/silo/storage/column/indexed_string_column.cpp b/src/silo/storage/column/indexed_string_column.cpp index 7bf2b7434..41bed2d27 100644 --- a/src/silo/storage/column/indexed_string_column.cpp +++ b/src/silo/storage/column/indexed_string_column.cpp @@ -11,12 +11,13 @@ IndexedStringColumnPartition::IndexedStringColumnPartition( ) : lookup(lookup) {} -roaring::Roaring IndexedStringColumnPartition::filter(const std::string& value) const { +std::optional IndexedStringColumnPartition::filter(const std::string& value +) const { const auto value_id = lookup.getId(value); if (value_id.has_value()) { - return indexed_values.at(value_id.value()); + return &indexed_values.at(value_id.value()); } - return {}; + return std::nullopt; } void IndexedStringColumnPartition::insert(const std::string& value) { diff --git a/src/silo/storage/column/indexed_string_column.test.cpp b/src/silo/storage/column/indexed_string_column.test.cpp index ca341b03a..f1d1d3958 100644 --- a/src/silo/storage/column/indexed_string_column.test.cpp +++ b/src/silo/storage/column/indexed_string_column.test.cpp @@ -15,13 +15,13 @@ TEST(IndexedStringColumn, shouldReturnTheCorrectFilteredValues) { under_test.insert("value 1"); const auto result1 = under_test.filter("value 1"); - ASSERT_EQ(result1, roaring::Roaring({0, 4})); + ASSERT_EQ(*result1.value(), roaring::Roaring({0, 4})); const auto result2 = under_test.filter("value 2"); - ASSERT_EQ(result2, roaring::Roaring({1, 2})); + ASSERT_EQ(*result2.value(), roaring::Roaring({1, 2})); const auto result3 = under_test.filter("value that does not exist"); - ASSERT_EQ(result3, roaring::Roaring()); + ASSERT_EQ(result3, std::nullopt); } TEST(IndexedStringColumnPartition, insertValuesToPartition) { From 21daddb0ab727a584ac4ec5a611751123d686a56 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Fri, 4 Aug 2023 09:47:43 +0200 Subject: [PATCH 09/10] refactor: renaming variable --- src/silo/storage/aa_store.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/silo/storage/aa_store.cpp b/src/silo/storage/aa_store.cpp index 5cf23bd2e..53fab4360 100644 --- a/src/silo/storage/aa_store.cpp +++ b/src/silo/storage/aa_store.cpp @@ -22,33 +22,34 @@ silo::AAPosition::AAPosition(std::optional symbol) { } void silo::AAPosition::flipMostNumerousBitmap(uint32_t sequence_count) { - std::optional flipped_bitmap_before = symbol_whose_bitmap_is_flipped; - std::optional max_symbol = std::nullopt; + std::optional previous_flipped_bitmap_symbol = symbol_whose_bitmap_is_flipped; + std::optional new_flipped_bitmap_symbol = std::nullopt; uint32_t max_count = 0; for (const auto& symbol : AA_SYMBOLS) { roaring::Roaring bitmap = bitmaps.at(symbol); bitmap.runOptimize(); bitmap.shrinkToFit(); - const uint32_t count = flipped_bitmap_before == symbol ? sequence_count - bitmap.cardinality() - : bitmap.cardinality(); + const uint32_t count = previous_flipped_bitmap_symbol == symbol + ? sequence_count - bitmap.cardinality() + : bitmap.cardinality(); if (count > max_count) { - max_symbol = symbol; + new_flipped_bitmap_symbol = symbol; max_count = count; } } - if (max_symbol != flipped_bitmap_before) { - if (flipped_bitmap_before.has_value()) { - bitmaps[*flipped_bitmap_before].flip(0, sequence_count); - bitmaps[*flipped_bitmap_before].runOptimize(); - bitmaps[*flipped_bitmap_before].shrinkToFit(); + if (new_flipped_bitmap_symbol != previous_flipped_bitmap_symbol) { + if (previous_flipped_bitmap_symbol.has_value()) { + bitmaps[*previous_flipped_bitmap_symbol].flip(0, sequence_count); + bitmaps[*previous_flipped_bitmap_symbol].runOptimize(); + bitmaps[*previous_flipped_bitmap_symbol].shrinkToFit(); } - if (max_symbol.has_value()) { - bitmaps[*max_symbol].flip(0, sequence_count); - bitmaps[*max_symbol].runOptimize(); - bitmaps[*max_symbol].shrinkToFit(); + if (new_flipped_bitmap_symbol.has_value()) { + bitmaps[*new_flipped_bitmap_symbol].flip(0, sequence_count); + bitmaps[*new_flipped_bitmap_symbol].runOptimize(); + bitmaps[*new_flipped_bitmap_symbol].shrinkToFit(); } - symbol_whose_bitmap_is_flipped = max_symbol; + symbol_whose_bitmap_is_flipped = new_flipped_bitmap_symbol; } } From b8d5bee7ffe9447315b5982437f933b0e5bc2442 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Fri, 4 Aug 2023 10:00:22 +0200 Subject: [PATCH 10/10] refactor: formatting --- endToEndTests/test/query.test.js | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/endToEndTests/test/query.test.js b/endToEndTests/test/query.test.js index ba2c19600..52f25380a 100644 --- a/endToEndTests/test/query.test.js +++ b/endToEndTests/test/query.test.js @@ -12,13 +12,11 @@ describe('The /query endpoint', () => { testCases.forEach(testCase => it('should return data for the test case ' + testCase.testCaseName, async () => { - const response = await server - .post('/query') - .send(testCase.query); - expect(response.body.queryResult).to.deep.equal(testCase.expectedQueryResult); - expect(200); - expect('Content-Type', 'application/json'); - expect(headerToHaveDataVersion); + const response = await server.post('/query').send(testCase.query); + expect(response.body.queryResult).to.deep.equal(testCase.expectedQueryResult); + expect(200); + expect('Content-Type', 'application/json'); + expect(headerToHaveDataVersion); }) );