From 80bdc8abaea64ad4c9005053cffecc17ff2f2064 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Wed, 9 Aug 2023 12:36:39 +0200 Subject: [PATCH] feat: AAMutations with multiple sequences --- .../test/queries/aaMutDistribution.json | 48 ++--- .../test/queries/aaMutDistribution_all.json | 64 +++++++ .../queries/aaMutDistribution_multiple.json | 170 ++++++++++++++++++ .../silo/query_engine/actions/aa_mutations.h | 27 +-- .../query_engine/actions/aa_mutations.cpp | 114 ++++++++---- 5 files changed, 353 insertions(+), 70 deletions(-) create mode 100644 endToEndTests/test/queries/aaMutDistribution_all.json create mode 100644 endToEndTests/test/queries/aaMutDistribution_multiple.json diff --git a/endToEndTests/test/queries/aaMutDistribution.json b/endToEndTests/test/queries/aaMutDistribution.json index a06f60caa..5c087dc95 100644 --- a/endToEndTests/test/queries/aaMutDistribution.json +++ b/endToEndTests/test/queries/aaMutDistribution.json @@ -13,122 +13,122 @@ "expectedQueryResult": [ { "count": 37, - "position": "T19R", + "position": "S:T19R", "proportion": 0.3854166666666667 }, { "count": 37, - "position": "G142D", + "position": "S:G142D", "proportion": 0.49333333333333335 }, { "count": 34, - "position": "R158G", + "position": "S:R158G", "proportion": 0.3655913978494624 }, { "count": 34, - "position": "G339D", + "position": "S:G339D", "proportion": 0.3469387755102041 }, { "count": 33, - "position": "S373P", + "position": "S:S373P", "proportion": 0.3402061855670103 }, { "count": 33, - "position": "S375F", + "position": "S:S375F", "proportion": 0.336734693877551 }, { "count": 38, - "position": "L452R", + "position": "S:L452R", "proportion": 0.4318181818181818 }, { "count": 32, - "position": "S477N", + "position": "S:S477N", "proportion": 0.34408602150537637 }, { "count": 69, - "position": "T478K", + "position": "S:T478K", "proportion": 0.7340425531914894 }, { "count": 31, - "position": "E484A", + "position": "S:E484A", "proportion": 0.3333333333333333 }, { "count": 31, - "position": "Q493R", + "position": "S:Q493R", "proportion": 0.3333333333333333 }, { "count": 30, - "position": "Q498R", + "position": "S:Q498R", "proportion": 0.3225806451612903 }, { "count": 41, - "position": "N501Y", + "position": "S:N501Y", "proportion": 0.44086021505376344 }, { "count": 30, - "position": "Y505H", + "position": "S:Y505H", "proportion": 0.3225806451612903 }, { "count": 98, - "position": "D614G", + "position": "S:D614G", "proportion": 0.98989898989899 }, { "count": 37, - "position": "H655Y", + "position": "S:H655Y", "proportion": 0.37373737373737376 }, { "count": 34, - "position": "N679K", + "position": "S:N679K", "proportion": 0.34 }, { "count": 42, - "position": "P681H", + "position": "S:P681H", "proportion": 0.42 }, { "count": 38, - "position": "P681R", + "position": "S:P681R", "proportion": 0.38 }, { "count": 31, - "position": "N764K", + "position": "S:N764K", "proportion": 0.32978723404255317 }, { "count": 34, - "position": "D796Y", + "position": "S:D796Y", "proportion": 0.3469387755102041 }, { "count": 34, - "position": "D950N", + "position": "S:D950N", "proportion": 0.35789473684210527 }, { "count": 33, - "position": "Q954H", + "position": "S:Q954H", "proportion": 0.34375 }, { "count": 34, - "position": "N969K", + "position": "S:N969K", "proportion": 0.35051546391752575 } ] diff --git a/endToEndTests/test/queries/aaMutDistribution_all.json b/endToEndTests/test/queries/aaMutDistribution_all.json new file mode 100644 index 000000000..7847d147c --- /dev/null +++ b/endToEndTests/test/queries/aaMutDistribution_all.json @@ -0,0 +1,64 @@ +{ + "testCaseName": "The distribution of Amino Acid Mutations action for all sequences", + "query": { + "action": { + "type": "AminoAcidMutations", + "minProportion": 0.4 + }, + "filterExpression": { + "type": "True" + } + }, + "expectedQueryResult": [ + { + "count": 46, + "position": "N:R203K", + "proportion": 0.46 + }, + { + "count": 46, + "position": "N:G204R", + "proportion": 0.46 + }, + { + "count": 64, + "position": "ORF1a:T3255I", + "proportion": 0.6464646464646465 + }, + { + "count": 98, + "position": "ORF1b:P314L", + "proportion": 0.98989898989899 + }, + { + "count": 37, + "position": "S:G142D", + "proportion": 0.49333333333333335 + }, + { + "count": 38, + "position": "S:L452R", + "proportion": 0.4318181818181818 + }, + { + "count": 69, + "position": "S:T478K", + "proportion": 0.7340425531914894 + }, + { + "count": 41, + "position": "S:N501Y", + "proportion": 0.44086021505376344 + }, + { + "count": 98, + "position": "S:D614G", + "proportion": 0.98989898989899 + }, + { + "count": 42, + "position": "S:P681H", + "proportion": 0.42 + } + ] +} diff --git a/endToEndTests/test/queries/aaMutDistribution_multiple.json b/endToEndTests/test/queries/aaMutDistribution_multiple.json new file mode 100644 index 000000000..021fd5296 --- /dev/null +++ b/endToEndTests/test/queries/aaMutDistribution_multiple.json @@ -0,0 +1,170 @@ +{ + "testCaseName": "The distribution of Amino Acid Mutations action for multiple sequences", + "query": { + "action": { + "type": "AminoAcidMutations", + "sequenceName": ["S", "N"], + "minProportion": 0.3 + }, + "filterExpression": { + "type": "True" + } + }, + "expectedQueryResult": [ + { + "count": 37, + "position": "S:T19R", + "proportion": 0.3854166666666667 + }, + { + "count": 37, + "position": "S:G142D", + "proportion": 0.49333333333333335 + }, + { + "count": 34, + "position": "S:R158G", + "proportion": 0.3655913978494624 + }, + { + "count": 34, + "position": "S:G339D", + "proportion": 0.3469387755102041 + }, + { + "count": 33, + "position": "S:S373P", + "proportion": 0.3402061855670103 + }, + { + "count": 33, + "position": "S:S375F", + "proportion": 0.336734693877551 + }, + { + "count": 38, + "position": "S:L452R", + "proportion": 0.4318181818181818 + }, + { + "count": 32, + "position": "S:S477N", + "proportion": 0.34408602150537637 + }, + { + "count": 69, + "position": "S:T478K", + "proportion": 0.7340425531914894 + }, + { + "count": 31, + "position": "S:E484A", + "proportion": 0.3333333333333333 + }, + { + "count": 31, + "position": "S:Q493R", + "proportion": 0.3333333333333333 + }, + { + "count": 30, + "position": "S:Q498R", + "proportion": 0.3225806451612903 + }, + { + "count": 41, + "position": "S:N501Y", + "proportion": 0.44086021505376344 + }, + { + "count": 30, + "position": "S:Y505H", + "proportion": 0.3225806451612903 + }, + { + "count": 98, + "position": "S:D614G", + "proportion": 0.98989898989899 + }, + { + "count": 37, + "position": "S:H655Y", + "proportion": 0.37373737373737376 + }, + { + "count": 34, + "position": "S:N679K", + "proportion": 0.34 + }, + { + "count": 42, + "position": "S:P681H", + "proportion": 0.42 + }, + { + "count": 38, + "position": "S:P681R", + "proportion": 0.38 + }, + { + "count": 31, + "position": "S:N764K", + "proportion": 0.32978723404255317 + }, + { + "count": 34, + "position": "S:D796Y", + "proportion": 0.3469387755102041 + }, + { + "count": 34, + "position": "S:D950N", + "proportion": 0.35789473684210527 + }, + { + "count": 33, + "position": "S:Q954H", + "proportion": 0.34375 + }, + { + "count": 34, + "position": "S:N969K", + "proportion": 0.35051546391752575 + }, + { + "count": 34, + "position": "N:P13L", + "proportion": 0.3469387755102041 + }, + { + "count": 36, + "position": "N:D63G", + "proportion": 0.3673469387755102 + }, + { + "count": 46, + "position": "N:R203K", + "proportion": 0.46 + }, + { + "count": 38, + "position": "N:R203M", + "proportion": 0.38 + }, + { + "count": 46, + "position": "N:G204R", + "proportion": 0.46 + }, + { + "count": 30, + "position": "N:G215C", + "proportion": 0.3 + }, + { + "count": 37, + "position": "N:D377Y", + "proportion": 0.38144329896907214 + } + ] +} diff --git a/include/silo/query_engine/actions/aa_mutations.h b/include/silo/query_engine/actions/aa_mutations.h index e6bbe54d3..06c1bd8ab 100644 --- a/include/silo/query_engine/actions/aa_mutations.h +++ b/include/silo/query_engine/actions/aa_mutations.h @@ -17,10 +17,10 @@ namespace silo { class AAStore; -} +class AAStorePartition; +} // namespace silo namespace silo { class Database; -class AAStorePartition; } // namespace silo namespace silo::query_engine { struct OperatorResult; @@ -29,7 +29,7 @@ struct OperatorResult; namespace silo::query_engine::actions { class AAMutations : public Action { - std::string aa_sequence_name; + std::vector aa_sequence_names; double min_proportion; static constexpr std::array VALID_MUTATION_SYMBOLS{ @@ -60,30 +60,37 @@ class AAMutations : public Action { const std::string COUNT_FIELD_NAME = "count"; struct PrefilteredBitmaps { - std::vector> bitmaps; - std::vector> full_bitmaps; + std::vector> bitmaps; + std::vector> full_bitmaps; }; public: static constexpr double DEFAULT_MIN_PROPORTION = 0.05; private: - static PrefilteredBitmaps preFilterBitmaps( - const silo::AAStore& aa_store, + static std::unordered_map preFilterBitmaps( + const silo::Database& database, std::vector& bitmap_filter ); static void addMutationsCountsForPosition( uint32_t position, - PrefilteredBitmaps& bitmaps_to_evaluate, + const PrefilteredBitmaps& bitmaps_to_evaluate, AASymbolMap>& count_of_mutations_per_position ); static AASymbolMap> calculateMutationsPerPosition( const AAStore& aa_store, - std::vector& bitmap_filter + const PrefilteredBitmaps& bitmap_filter ); + void addMutationsToOutput( + const std::string& sequence_name, + const AAStore& aa_store, + const PrefilteredBitmaps& bitmap_filter, + std::vector& output + ) const; + [[nodiscard]] void validateOrderByFields(const Database& database) const override; [[nodiscard]] QueryResult execute( @@ -92,7 +99,7 @@ class AAMutations : public Action { ) const override; public: - explicit AAMutations(std::string aa_sequence_name, double min_proportion); + explicit AAMutations(std::vector&& aa_sequence_names, double min_proportion); }; // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/actions/aa_mutations.cpp b/src/silo/query_engine/actions/aa_mutations.cpp index 37681de8e..1774f6c9a 100644 --- a/src/silo/query_engine/actions/aa_mutations.cpp +++ b/src/silo/query_engine/actions/aa_mutations.cpp @@ -26,29 +26,33 @@ using silo::query_engine::OperatorResult; namespace silo::query_engine::actions { -AAMutations::AAMutations(std::string aa_sequence_name, double min_proportion) - : aa_sequence_name(std::move(aa_sequence_name)), +AAMutations::AAMutations(std::vector&& aa_sequence_names, double min_proportion) + : aa_sequence_names(std::move(aa_sequence_names)), min_proportion(min_proportion) {} -AAMutations::PrefilteredBitmaps AAMutations::preFilterBitmaps( - const silo::AAStore& aa_store, +std::unordered_map AAMutations::preFilterBitmaps( + const silo::Database& database, std::vector& bitmap_filter ) { - PrefilteredBitmaps bitmaps_to_evaluate; - for (size_t i = 0; i < aa_store.partitions.size(); ++i) { - const silo::AAStorePartition& aa_store_partition = aa_store.partitions.at(i); + std::unordered_map bitmaps_to_evaluate; + for (size_t i = 0; i < database.partitions.size(); ++i) { + const DatabasePartition& database_partition = database.partitions.at(i); OperatorResult& filter = bitmap_filter[i]; const size_t cardinality = filter->cardinality(); if (cardinality == 0) { continue; } - if (cardinality == aa_store_partition.sequence_count) { - bitmaps_to_evaluate.full_bitmaps.emplace_back(std::move(filter), aa_store_partition); + if (cardinality == database_partition.sequence_count) { + for (const auto& [aa_name, aa_store] : database_partition.aa_sequences) { + bitmaps_to_evaluate[aa_name].full_bitmaps.emplace_back(filter, aa_store); + } } else { if (filter.isMutable()) { filter->runOptimize(); } - bitmaps_to_evaluate.bitmaps.emplace_back(std::move(filter), aa_store_partition); + for (const auto& [aa_name, aa_store] : database_partition.aa_sequences) { + bitmaps_to_evaluate[aa_name].bitmaps.emplace_back(filter, aa_store); + } } } return bitmaps_to_evaluate; @@ -56,7 +60,7 @@ AAMutations::PrefilteredBitmaps AAMutations::preFilterBitmaps( void AAMutations::addMutationsCountsForPosition( uint32_t position, - PrefilteredBitmaps& bitmaps_to_evaluate, + const PrefilteredBitmaps& bitmaps_to_evaluate, AASymbolMap>& count_of_mutations_per_position ) { for (auto& [filter, aa_store_partition] : bitmaps_to_evaluate.bitmaps) { @@ -89,12 +93,10 @@ void AAMutations::addMutationsCountsForPosition( AASymbolMap> AAMutations::calculateMutationsPerPosition( const AAStore& aa_store, - std::vector& bitmap_filter + const PrefilteredBitmaps& bitmap_filter ) { const size_t sequence_length = aa_store.reference_sequence.size(); - PrefilteredBitmaps bitmaps_to_evaluate = preFilterBitmaps(aa_store, bitmap_filter); - AASymbolMap> count_of_mutations_per_position; for (const auto symbol : VALID_MUTATION_SYMBOLS) { count_of_mutations_per_position[symbol].resize(sequence_length); @@ -104,9 +106,7 @@ AASymbolMap> AAMutations::calculateMutationsPerPosition( tbb::blocked_range(0, sequence_length, /*grain_size=*/POSITIONS_PER_PROCESS), [&](const auto& local) { for (uint32_t pos = local.begin(); pos != local.end(); ++pos) { - addMutationsCountsForPosition( - pos, bitmaps_to_evaluate, count_of_mutations_per_position - ); + addMutationsCountsForPosition(pos, bitmap_filter, count_of_mutations_per_position); } } ); @@ -129,24 +129,17 @@ void AAMutations::validateOrderByFields(const Database& /*database*/) const { } } -QueryResult AAMutations::execute( - const Database& database, - std::vector bitmap_filter +void AAMutations::addMutationsToOutput( + const std::string& sequence_name, + const AAStore& aa_store, + const PrefilteredBitmaps& bitmap_filter, + std::vector& output ) const { - using roaring::Roaring; - CHECK_SILO_QUERY( - database.aa_sequences.contains(aa_sequence_name), - "Database does not contain the amino acid sequence with name: '" + aa_sequence_name + "'" - ) - - const AAStore& aa_store = database.aa_sequences.at(aa_sequence_name); - const size_t sequence_length = aa_store.reference_sequence.size(); const AASymbolMap> count_of_mutations_per_position = calculateMutationsPerPosition(aa_store, bitmap_filter); - std::vector mutation_proportions; for (size_t pos = 0; pos < sequence_length; ++pos) { uint32_t total = 0; for (const AA_SYMBOL symbol : VALID_MUTATION_SYMBOLS) { @@ -169,26 +162,75 @@ QueryResult AAMutations::execute( map>> fields{ {POSITION_FIELD_NAME, - aaSymbolToChar(symbol_in_reference_genome) + std::to_string(pos + 1) + - aaSymbolToChar(symbol)}, + sequence_name + ":" + aaSymbolToChar(symbol_in_reference_genome) + + std::to_string(pos + 1) + aaSymbolToChar(symbol)}, {PROPORTION_FIELD_NAME, proportion}, {COUNT_FIELD_NAME, static_cast(count)}}; - mutation_proportions.push_back({fields}); + output.push_back({fields}); } } } } +} + +QueryResult AAMutations::execute( + const Database& database, + std::vector bitmap_filter +) const { + using roaring::Roaring; + + std::vector aa_sequence_names_to_evaluate; + for (const auto& aa_sequence_name : aa_sequence_names) { + CHECK_SILO_QUERY( + database.aa_sequences.contains(aa_sequence_name), + "Database does not contain the amino acid sequence with name: '" + aa_sequence_name + "'" + ) + aa_sequence_names_to_evaluate.emplace_back(aa_sequence_name); + } + if (aa_sequence_names.empty()) { + for (const auto& [aa_sequence_name, _] : database.aa_sequences) { + aa_sequence_names_to_evaluate.emplace_back(aa_sequence_name); + } + } + + std::unordered_map bitmaps_to_evaluate = + preFilterBitmaps(database, bitmap_filter); + + std::vector mutation_proportions; + for (const auto& aa_sequence_name : aa_sequence_names_to_evaluate) { + const AAStore& aa_store = database.aa_sequences.at(aa_sequence_name); + addMutationsToOutput( + aa_sequence_name, aa_store, bitmaps_to_evaluate.at(aa_sequence_name), mutation_proportions + ); + } return {mutation_proportions}; } // NOLINTNEXTLINE(readability-identifier-naming) void from_json(const nlohmann::json& json, std::unique_ptr& action) { CHECK_SILO_QUERY( - json.contains("sequenceName") && json["sequenceName"].is_string(), - "AminoAcidMutations action must have the string field sequenceName" + !json.contains("sequenceName") || + (json["sequenceName"].is_string() || json["sequenceName"].is_array()), + "AminoAcidMutations action can have the field sequenceName of type string or an array of " + "strings, but no other type" ) - const std::string aa_sequence_name = json["sequenceName"].get(); + std::vector sequence_names; + if (json.contains("sequenceName") && json["sequenceName"].is_array()) { + for (const auto& child : json["sequenceName"]) { + CHECK_SILO_QUERY( + child.is_string(), + "AminoAcidMutations action can have the field sequenceName of type string or an array " + "of " + "strings, but no other type; while parsing array encountered the element " + + child.dump() + " which is not of type string" + ) + sequence_names.emplace_back(child.get()); + } + } else if (json.contains("sequenceName") && json["sequenceName"].is_string()) { + sequence_names.emplace_back(json["sequenceName"].get()); + } + double min_proportion = AAMutations::DEFAULT_MIN_PROPORTION; if (json.contains("minProportion")) { min_proportion = json["minProportion"].get(); @@ -198,7 +240,7 @@ void from_json(const nlohmann::json& json, std::unique_ptr& action) ); } } - action = std::make_unique(aa_sequence_name, min_proportion); + action = std::make_unique(std::move(sequence_names), min_proportion); } } // namespace silo::query_engine::actions