From a646dab06f0acc5679549e7b902a67c20e1c88fe Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Thu, 13 Jul 2023 09:39:21 +0200 Subject: [PATCH] feat: support recombinant lineages --- .../test/queries/GroupByLineage.json | 22 +++++-- .../GroupByLineageOrderByCountLimit.json | 6 +- .../test/queries/OffsetLimitOverlap.json | 6 +- .../test/queries/recombinantLineage.json | 19 ++++++ .../queries/recombinantLineageWithAlias.json | 19 ++++++ include/silo/common/pango_lineage.h | 18 +++-- include/silo/database.h | 2 + .../silo/preprocessing/pango_lineage_count.h | 4 +- include/silo/preprocessing/partition.h | 6 +- .../storage/column/pango_lineage_column.h | 24 ++++--- include/silo/storage/column_group.h | 1 - include/silo/storage/pango_lineage_alias.h | 19 +++++- src/silo/common/bidirectional_map.cpp | 2 +- src/silo/common/bidirectional_map.test.cpp | 20 +++--- src/silo/common/pango_lineage.cpp | 16 ++--- src/silo/common/pango_lineage.test.cpp | 36 +++++----- src/silo/database.cpp | 4 +- src/silo/prepare_dataset.cpp | 13 ++-- .../preprocessing/pango_lineage_count.cpp | 16 +++-- .../pango_lineage_count.test.cpp | 8 +-- src/silo/preprocessing/partition.cpp | 37 ++++++----- .../pango_lineage_filter.cpp | 13 ++-- .../storage/column/pango_lineage_column.cpp | 36 +++++----- .../column/pango_lineage_column.test.cpp | 18 +++-- src/silo/storage/column_group.cpp | 5 +- src/silo/storage/pango_lineage_alias.cpp | 65 +++++++++++++++---- src/silo/storage/pango_lineage_alias.test.cpp | 20 +++--- testBaseData/small_metadata_set.tsv | 4 +- 28 files changed, 296 insertions(+), 163 deletions(-) create mode 100644 endToEndTests/test/queries/recombinantLineage.json create mode 100644 endToEndTests/test/queries/recombinantLineageWithAlias.json diff --git a/endToEndTests/test/queries/GroupByLineage.json b/endToEndTests/test/queries/GroupByLineage.json index cfc281041..640957ef0 100644 --- a/endToEndTests/test/queries/GroupByLineage.json +++ b/endToEndTests/test/queries/GroupByLineage.json @@ -3,8 +3,12 @@ "query": { "action": { "type": "Aggregated", - "groupByFields": ["pango_lineage"], - "orderByFields": ["pango_lineage"] + "groupByFields": [ + "pango_lineage" + ], + "orderByFields": [ + "pango_lineage" + ] }, "filterExpression": { "type": "True" @@ -52,7 +56,7 @@ "pango_lineage": "B.1.1.70" }, { - "count": 7, + "count": 6, "pango_lineage": "B.1.160" }, { @@ -71,10 +75,6 @@ "count": 3, "pango_lineage": "B.1.221" }, - { - "count": 1, - "pango_lineage": "B.1.236" - }, { "count": 1, "pango_lineage": "B.1.258" @@ -106,6 +106,14 @@ { "count": 1, "pango_lineage": "B.1.617.2.9.2" + }, + { + "count": 1, + "pango_lineage": "XA.1" + }, + { + "count": 1, + "pango_lineage": "XBB.1.9.3.1" } ] } diff --git a/endToEndTests/test/queries/GroupByLineageOrderByCountLimit.json b/endToEndTests/test/queries/GroupByLineageOrderByCountLimit.json index d991644c0..e1035d8d5 100644 --- a/endToEndTests/test/queries/GroupByLineageOrderByCountLimit.json +++ b/endToEndTests/test/queries/GroupByLineageOrderByCountLimit.json @@ -3,7 +3,9 @@ "query": { "action": { "type": "Aggregated", - "groupByFields": ["pango_lineage"], + "groupByFields": [ + "pango_lineage" + ], "orderByFields": [ { "field": "count", @@ -26,7 +28,7 @@ "pango_lineage": "B.1.177" }, { - "count": 7, + "count": 6, "pango_lineage": "B.1.160" }, { diff --git a/endToEndTests/test/queries/OffsetLimitOverlap.json b/endToEndTests/test/queries/OffsetLimitOverlap.json index a4cef6984..95899046e 100644 --- a/endToEndTests/test/queries/OffsetLimitOverlap.json +++ b/endToEndTests/test/queries/OffsetLimitOverlap.json @@ -3,7 +3,9 @@ "query": { "action": { "type": "Details", - "orderByFields": ["gisaid_epi_isl"], + "orderByFields": [ + "gisaid_epi_isl" + ], "offset": 90, "limit": 90 }, @@ -62,7 +64,7 @@ "date": "2020-12-24", "division": "Sankt Gallen", "gisaid_epi_isl": "EPI_ISL_768148", - "pango_lineage": "B.1.160", + "pango_lineage": "XBB.1.9.3.1", "qc_value": 0.98, "region": "Europe", "unsorted_date": "2020-03-16" diff --git a/endToEndTests/test/queries/recombinantLineage.json b/endToEndTests/test/queries/recombinantLineage.json new file mode 100644 index 000000000..4d1689833 --- /dev/null +++ b/endToEndTests/test/queries/recombinantLineage.json @@ -0,0 +1,19 @@ +{ + "testCaseName": "Recombinant lineage XBB including sublineages", + "query": { + "action": { + "type": "Aggregated" + }, + "filterExpression": { + "type": "PangoLineage", + "column": "pango_lineage", + "value": "XBB", + "includeSublineages": true + } + }, + "expectedQueryResult": [ + { + "count": 1 + } + ] +} diff --git a/endToEndTests/test/queries/recombinantLineageWithAlias.json b/endToEndTests/test/queries/recombinantLineageWithAlias.json new file mode 100644 index 000000000..ba5d59995 --- /dev/null +++ b/endToEndTests/test/queries/recombinantLineageWithAlias.json @@ -0,0 +1,19 @@ +{ + "testCaseName": "Recombinant lineage GD with unaliasing", + "query": { + "action": { + "type": "Aggregated" + }, + "filterExpression": { + "type": "PangoLineage", + "column": "pango_lineage", + "value": "GD", + "includeSublineages": true + } + }, + "expectedQueryResult": [ + { + "count": 1 + } + ] +} diff --git a/include/silo/common/pango_lineage.h b/include/silo/common/pango_lineage.h index 4f3754035..586aa6b20 100644 --- a/include/silo/common/pango_lineage.h +++ b/include/silo/common/pango_lineage.h @@ -9,7 +9,11 @@ namespace silo::common { -struct PangoLineage { +struct RawPangoLineage { + std::string value; +}; + +struct UnaliasedPangoLineage { std::string value; template @@ -19,19 +23,19 @@ struct PangoLineage { // clang-format on } - bool isSublineageOf(const PangoLineage& other) const; + bool isSublineageOf(const UnaliasedPangoLineage& other) const; - std::vector getParentLineages() const; + std::vector getParentLineages() const; - bool operator<(const PangoLineage& other) const; - bool operator==(const PangoLineage& other) const; + bool operator<(const UnaliasedPangoLineage& other) const; + bool operator==(const UnaliasedPangoLineage& other) const; }; } // namespace silo::common template <> -struct std::hash { - std::size_t operator()(const silo::common::PangoLineage& pango_lineage) const; +struct std::hash { + std::size_t operator()(const silo::common::UnaliasedPangoLineage& pango_lineage) const; }; #endif // SILO_PANGO_LINEAGE_H diff --git a/include/silo/database.h b/include/silo/database.h index 6679141ef..7f0fd6163 100644 --- a/include/silo/database.h +++ b/include/silo/database.h @@ -87,7 +87,9 @@ class Database { PangoLineageAliasLookup alias_key; void initializeColumns(); + void initializeColumn(config::ColumnType column_type, const std::string& name); + void initializeSequences(); static BitmapSizePerSymbol calculateBitmapSizePerSymbol(const SequenceStore& seq_store); diff --git a/include/silo/preprocessing/pango_lineage_count.h b/include/silo/preprocessing/pango_lineage_count.h index 9a1815393..94c4fa59e 100644 --- a/include/silo/preprocessing/pango_lineage_count.h +++ b/include/silo/preprocessing/pango_lineage_count.h @@ -7,6 +7,8 @@ #include #include +#include "silo/common/pango_lineage.h" + namespace silo { struct PangoLineageAliasLookup; @@ -17,7 +19,7 @@ struct DatabaseConfig; namespace preprocessing { struct PangoLineageCount { - std::string pango_lineage; + common::UnaliasedPangoLineage pango_lineage; uint32_t count_of_sequences; }; diff --git a/include/silo/preprocessing/partition.h b/include/silo/preprocessing/partition.h index 2eda7a5b7..d1308a5de 100644 --- a/include/silo/preprocessing/partition.h +++ b/include/silo/preprocessing/partition.h @@ -6,6 +6,10 @@ #include #include +namespace silo::common { +class UnaliasedPangoLineage; +} // namespace silo::common + namespace silo::preprocessing { class PangoLineageCounts; @@ -23,7 +27,7 @@ struct Chunk { std::string prefix; uint32_t count_of_sequences; uint32_t offset; - std::vector pango_lineages; + std::vector pango_lineages; }; struct Partition { diff --git a/include/silo/storage/column/pango_lineage_column.h b/include/silo/storage/column/pango_lineage_column.h index bb40d2901..40bf80e33 100644 --- a/include/silo/storage/column/pango_lineage_column.h +++ b/include/silo/storage/column/pango_lineage_column.h @@ -13,6 +13,7 @@ #include "silo/common/bidirectional_map.h" #include "silo/common/pango_lineage.h" #include "silo/common/types.h" +#include "silo/storage/pango_lineage_alias.h" namespace boost::serialization { struct access; @@ -37,22 +38,26 @@ class PangoLineageColumnPartition { std::unordered_map indexed_values; std::unordered_map indexed_sublineage_values; - silo::common::BidirectionalMap& lookup; + silo::PangoLineageAliasLookup& alias_key; + silo::common::BidirectionalMap& lookup; - void insertSublineageValues(const common::PangoLineage& value, size_t row_number); + void insertSublineageValues(const common::UnaliasedPangoLineage& value, size_t row_number); public: - explicit PangoLineageColumnPartition(common::BidirectionalMap& lookup); + explicit PangoLineageColumnPartition( + silo::PangoLineageAliasLookup& alias_key, + common::BidirectionalMap& lookup + ); - void insert(const common::PangoLineage& value); + void insert(const common::RawPangoLineage& value); - roaring::Roaring filter(const common::PangoLineage& value) const; + roaring::Roaring filter(const common::RawPangoLineage& value) const; - roaring::Roaring filterIncludingSublineages(const common::PangoLineage& value) const; + roaring::Roaring filterIncludingSublineages(const common::RawPangoLineage& value) const; const std::vector& getValues() const; - inline common::PangoLineage lookupValue(Idx id) const { return lookup.getValue(id); } + inline common::UnaliasedPangoLineage lookupValue(Idx id) const { return lookup.getValue(id); } }; class PangoLineageColumn { @@ -67,11 +72,12 @@ class PangoLineageColumn { // TODO sync lookups in children } - std::unique_ptr> lookup; + std::unique_ptr> lookup; + std::unique_ptr alias_key; std::deque partitions; public: - explicit PangoLineageColumn(); + explicit PangoLineageColumn(silo::PangoLineageAliasLookup alias_key); PangoLineageColumnPartition& createPartition(); }; diff --git a/include/silo/storage/column_group.h b/include/silo/storage/column_group.h index 54cf89ac5..cdbdc62f0 100644 --- a/include/silo/storage/column_group.h +++ b/include/silo/storage/column_group.h @@ -61,7 +61,6 @@ struct ColumnGroup { uint32_t fill( const std::filesystem::path& input_file, - const PangoLineageAliasLookup& alias_key, const silo::config::DatabaseConfig& database_config ); diff --git a/include/silo/storage/pango_lineage_alias.h b/include/silo/storage/pango_lineage_alias.h index 7450757d0..35cf558cc 100644 --- a/include/silo/storage/pango_lineage_alias.h +++ b/include/silo/storage/pango_lineage_alias.h @@ -4,18 +4,31 @@ #include #include #include +#include +#include namespace silo { +namespace common { +struct UnaliasedPangoLineage; +struct RawPangoLineage; + +} // namespace common + class PangoLineageAliasLookup { private: - std::unordered_map alias_key; + std::unordered_map> alias_key; public: PangoLineageAliasLookup() = default; - explicit PangoLineageAliasLookup(std::unordered_map alias_key); - std::string resolvePangoLineageAlias(const std::string& pango_lineage) const; + explicit PangoLineageAliasLookup( + std::unordered_map> alias_key + ); + + [[nodiscard]] common::UnaliasedPangoLineage unaliasPangoLineage( + const common::RawPangoLineage& pango_lineage + ) const; static silo::PangoLineageAliasLookup readFromFile( const std::filesystem::path& pango_lineage_alias_file diff --git a/src/silo/common/bidirectional_map.cpp b/src/silo/common/bidirectional_map.cpp index 53a6a96dc..121dbfd89 100644 --- a/src/silo/common/bidirectional_map.cpp +++ b/src/silo/common/bidirectional_map.cpp @@ -37,7 +37,7 @@ Idx BidirectionalMap::getOrCreateId(V value) { return identifier; } -template class BidirectionalMap; +template class BidirectionalMap; template class BidirectionalMap; } // namespace silo::common diff --git a/src/silo/common/bidirectional_map.test.cpp b/src/silo/common/bidirectional_map.test.cpp index 2e34b2836..805fc18bf 100644 --- a/src/silo/common/bidirectional_map.test.cpp +++ b/src/silo/common/bidirectional_map.test.cpp @@ -23,19 +23,19 @@ TEST(BidirectionalMap, correctStdStringDict) { } TEST(BidirectionalMap, correctPangoLineageDict) { - BidirectionalMap under_test; - EXPECT_EQ(under_test.getId(PangoLineage{"Not in dict"}), std::nullopt); + BidirectionalMap under_test; + EXPECT_EQ(under_test.getId(UnaliasedPangoLineage{"Not in dict"}), std::nullopt); - EXPECT_EQ(under_test.getOrCreateId(PangoLineage{"Now in dict"}), 0); - EXPECT_EQ(under_test.getOrCreateId(PangoLineage{"Now in dict"}), 0); - EXPECT_EQ(under_test.getOrCreateId(PangoLineage{"Second in dict"}), 1); + EXPECT_EQ(under_test.getOrCreateId(UnaliasedPangoLineage{"Now in dict"}), 0); + EXPECT_EQ(under_test.getOrCreateId(UnaliasedPangoLineage{"Now in dict"}), 0); + EXPECT_EQ(under_test.getOrCreateId(UnaliasedPangoLineage{"Second in dict"}), 1); - EXPECT_EQ(under_test.getId(PangoLineage{"Now in dict"}), 0); - EXPECT_EQ(under_test.getId(PangoLineage{"Still not in dict"}), std::nullopt); - EXPECT_EQ(under_test.getId(PangoLineage{"Second in dict"}), 1); + EXPECT_EQ(under_test.getId(UnaliasedPangoLineage{"Now in dict"}), 0); + EXPECT_EQ(under_test.getId(UnaliasedPangoLineage{"Still not in dict"}), std::nullopt); + EXPECT_EQ(under_test.getId(UnaliasedPangoLineage{"Second in dict"}), 1); - EXPECT_EQ(under_test.getValue(0), PangoLineage{"Now in dict"}); - EXPECT_EQ(under_test.getValue(1), PangoLineage{"Second in dict"}); + EXPECT_EQ(under_test.getValue(0), UnaliasedPangoLineage{"Now in dict"}); + EXPECT_EQ(under_test.getValue(1), UnaliasedPangoLineage{"Second in dict"}); } } // namespace silo::common \ No newline at end of file diff --git a/src/silo/common/pango_lineage.cpp b/src/silo/common/pango_lineage.cpp index 5197d2103..70dc70316 100644 --- a/src/silo/common/pango_lineage.cpp +++ b/src/silo/common/pango_lineage.cpp @@ -2,15 +2,15 @@ #include -std::size_t std::hash::operator()( - const silo::common::PangoLineage& pango_lineage +std::size_t std::hash::operator()( + const silo::common::UnaliasedPangoLineage& pango_lineage ) const { return hash()(pango_lineage.value); } namespace silo::common { -bool PangoLineage::isSublineageOf(const silo::common::PangoLineage& other) const { +bool UnaliasedPangoLineage::isSublineageOf(const silo::common::UnaliasedPangoLineage& other) const { if (other.value.length() > value.length()) { return false; } @@ -18,23 +18,23 @@ bool PangoLineage::isSublineageOf(const silo::common::PangoLineage& other) const return value.starts_with(other.value); } -std::vector PangoLineage::getParentLineages() const { - std::vector parent_lineages; +std::vector UnaliasedPangoLineage::getParentLineages() const { + std::vector parent_lineages; std::string::size_type pos = 0; while (pos != std::string::npos) { pos = value.find('.', pos + 1); - parent_lineages.push_back(PangoLineage{value.substr(0, pos)}); + parent_lineages.push_back(UnaliasedPangoLineage{value.substr(0, pos)}); } return parent_lineages; } -bool PangoLineage::operator<(const PangoLineage& other) const { +bool UnaliasedPangoLineage::operator<(const UnaliasedPangoLineage& other) const { return value < other.value; } -bool PangoLineage::operator==(const PangoLineage& other) const { +bool UnaliasedPangoLineage::operator==(const UnaliasedPangoLineage& other) const { return value == other.value; } diff --git a/src/silo/common/pango_lineage.test.cpp b/src/silo/common/pango_lineage.test.cpp index 3c4434010..a6385f7f0 100644 --- a/src/silo/common/pango_lineage.test.cpp +++ b/src/silo/common/pango_lineage.test.cpp @@ -2,30 +2,30 @@ #include -TEST(PangoLineage, isSublineageOf) { - silo::common::PangoLineage const under_test{"A.1.2"}; +TEST(UnaliasedPangoLineage, isSublineageOf) { + silo::common::UnaliasedPangoLineage const under_test{"A.1.2"}; - EXPECT_TRUE(under_test.isSublineageOf(silo::common::PangoLineage{"A.1"})); - EXPECT_TRUE(under_test.isSublineageOf(silo::common::PangoLineage{"A"})); - EXPECT_TRUE(under_test.isSublineageOf(silo::common::PangoLineage{"A.1.2"})); + EXPECT_TRUE(under_test.isSublineageOf(silo::common::UnaliasedPangoLineage{"A.1"})); + EXPECT_TRUE(under_test.isSublineageOf(silo::common::UnaliasedPangoLineage{"A"})); + EXPECT_TRUE(under_test.isSublineageOf(silo::common::UnaliasedPangoLineage{"A.1.2"})); - EXPECT_FALSE(under_test.isSublineageOf(silo::common::PangoLineage{"A.1.2.3"})); - EXPECT_FALSE(under_test.isSublineageOf(silo::common::PangoLineage{"A.1.3"})); - EXPECT_FALSE(under_test.isSublineageOf(silo::common::PangoLineage{"B.1.2"})); - EXPECT_FALSE(under_test.isSublineageOf(silo::common::PangoLineage{"B.1.20"})); + EXPECT_FALSE(under_test.isSublineageOf(silo::common::UnaliasedPangoLineage{"A.1.2.3"})); + EXPECT_FALSE(under_test.isSublineageOf(silo::common::UnaliasedPangoLineage{"A.1.3"})); + EXPECT_FALSE(under_test.isSublineageOf(silo::common::UnaliasedPangoLineage{"B.1.2"})); + EXPECT_FALSE(under_test.isSublineageOf(silo::common::UnaliasedPangoLineage{"B.1.20"})); - EXPECT_FALSE(silo::common::PangoLineage{"B.1.20"}.isSublineageOf(under_test)); + EXPECT_FALSE(silo::common::UnaliasedPangoLineage{"B.1.20"}.isSublineageOf(under_test)); } TEST(PangoLineage, getParentLineages) { - silo::common::PangoLineage const under_test{"A.1.23.4.513"}; - - const std::vector expected = { - silo::common::PangoLineage{"A"}, - silo::common::PangoLineage{"A.1"}, - silo::common::PangoLineage{"A.1.23"}, - silo::common::PangoLineage{"A.1.23.4"}, - silo::common::PangoLineage{"A.1.23.4.513"}, + silo::common::UnaliasedPangoLineage const under_test{"A.1.23.4.513"}; + + const std::vector expected = { + silo::common::UnaliasedPangoLineage{"A"}, + silo::common::UnaliasedPangoLineage{"A.1"}, + silo::common::UnaliasedPangoLineage{"A.1.23"}, + silo::common::UnaliasedPangoLineage{"A.1.23.4"}, + silo::common::UnaliasedPangoLineage{"A.1.23.4.513"}, }; EXPECT_EQ(under_test.getParentLineages(), expected); diff --git a/src/silo/database.cpp b/src/silo/database.cpp index e5edb2d7d..6d3a4a29f 100644 --- a/src/silo/database.cpp +++ b/src/silo/database.cpp @@ -114,7 +114,7 @@ void Database::build( } SPDLOG_DEBUG("Using metadata file: {}", metadata_file.string()); partitions[partition_index].sequenceCount = - partitions[partition_index].columns.fill(metadata_file, alias_key, database_config); + partitions[partition_index].columns.fill(metadata_file, database_config); } } } @@ -519,7 +519,7 @@ void Database::initializeColumn(config::ColumnType column_type, const std::strin } } break; case config::ColumnType::INDEXED_PANGOLINEAGE: - pango_lineage_columns.emplace(name, storage::column::PangoLineageColumn()); + pango_lineage_columns.emplace(name, storage::column::PangoLineageColumn(this->alias_key)); for (auto& partition : partitions) { partition.insertColumn(name, pango_lineage_columns.at(name).createPartition()); } diff --git a/src/silo/prepare_dataset.cpp b/src/silo/prepare_dataset.cpp index d3c8ba6c9..d86bd4a0b 100644 --- a/src/silo/prepare_dataset.cpp +++ b/src/silo/prepare_dataset.cpp @@ -15,6 +15,7 @@ #include "silo/common/date.h" #include "silo/common/fasta_reader.h" +#include "silo/common/pango_lineage.h" #include "silo/common/zstdfasta_reader.h" #include "silo/common/zstdfasta_writer.h" #include "silo/config/database_config.h" @@ -124,12 +125,12 @@ std::unordered_map writeMetadataChunks( ) { std::unordered_map primary_key_to_sequence_partition_chunk; for (auto& row : metadata_reader) { - std::string const primary_key = row[database_config.schema.primary_key].get(); - std::string const pango_lineage = - alias_key.resolvePangoLineageAlias(row[database_config.schema.partition_by].get()); - row[database_config.schema.partition_by] = csv::CSVField{pango_lineage}; + const std::string primary_key = row[database_config.schema.primary_key].get(); + const silo::common::UnaliasedPangoLineage pango_lineage = + alias_key.unaliasPangoLineage({row[database_config.schema.partition_by].get()}); + row[database_config.schema.partition_by] = csv::CSVField{pango_lineage.value}; - std::string const chunk = pango_to_chunk[pango_lineage]; + const std::string chunk = pango_to_chunk[pango_lineage.value]; chunk_to_metadata_writers[chunk]->writeRow(row); primary_key_to_sequence_partition_chunk[primary_key] = chunk; @@ -229,7 +230,7 @@ void silo::partitionData( const auto& chunk = part.chunks[j]; chunk_names.push_back(silo::buildChunkString(i, j)); for (const auto& pango : chunk.pango_lineages) { - pango_to_chunk[pango] = chunk_names.back(); + pango_to_chunk[pango.value] = chunk_names.back(); } } } diff --git a/src/silo/preprocessing/pango_lineage_count.cpp b/src/silo/preprocessing/pango_lineage_count.cpp index c0c705a9a..d812f4b2e 100644 --- a/src/silo/preprocessing/pango_lineage_count.cpp +++ b/src/silo/preprocessing/pango_lineage_count.cpp @@ -1,5 +1,6 @@ #include "silo/preprocessing/pango_lineage_count.h" +#include #include #include #include @@ -13,7 +14,7 @@ namespace silo::preprocessing { void PangoLineageCounts::save(std::ostream& output_file) { for (const auto& pango_lineage_count : pango_lineage_counts) { - output_file << pango_lineage_count.pango_lineage << '\t' + output_file << pango_lineage_count.pango_lineage.value << '\t' << pango_lineage_count.count_of_sequences << '\n'; } output_file.flush(); @@ -32,7 +33,7 @@ PangoLineageCounts PangoLineageCounts::load(std::istream& input_stream) { break; } count = atoi(count_str.c_str()); - descriptor.pango_lineage_counts.emplace_back(PangoLineageCount{lineage, count}); + descriptor.pango_lineage_counts.emplace_back(PangoLineageCount{{lineage}, count}); } return descriptor; } @@ -45,14 +46,15 @@ PangoLineageCounts buildPangoLineageCounts( PangoLineageCounts pango_lineage_counts; uint32_t pango_lineage_ids_count = 0; - std::unordered_map pango_lineage_to_id; + std::unordered_map pango_lineage_to_id; - auto unresolved_pango_lineages = silo::preprocessing::MetadataReader(metadata_path) - .getColumn(database_config.schema.partition_by); + const std::vector unresolved_pango_lineages = + silo::preprocessing::MetadataReader(metadata_path) + .getColumn(database_config.schema.partition_by); for (const auto& unresolved_pango_lineage : unresolved_pango_lineages) { - std::string const pango_lineage = - alias_key.resolvePangoLineageAlias(unresolved_pango_lineage); + const common::UnaliasedPangoLineage pango_lineage = + alias_key.unaliasPangoLineage({unresolved_pango_lineage}); if (pango_lineage_to_id.contains(pango_lineage)) { auto pid = pango_lineage_to_id[pango_lineage]; diff --git a/src/silo/preprocessing/pango_lineage_count.test.cpp b/src/silo/preprocessing/pango_lineage_count.test.cpp index 291ff5574..7408d4cf1 100644 --- a/src/silo/preprocessing/pango_lineage_count.test.cpp +++ b/src/silo/preprocessing/pango_lineage_count.test.cpp @@ -18,12 +18,12 @@ TEST(PangoLineageCounts, buildPangoLineageCounts) { ); ASSERT_EQ(result.pango_lineage_counts.size(), 24); - ASSERT_EQ(result.pango_lineage_counts[0].pango_lineage, ""); + ASSERT_EQ(result.pango_lineage_counts[0].pango_lineage.value, ""); ASSERT_EQ(result.pango_lineage_counts[0].count_of_sequences, 1); - ASSERT_EQ(result.pango_lineage_counts[1].pango_lineage, "B.1"); + ASSERT_EQ(result.pango_lineage_counts[1].pango_lineage.value, "B.1"); ASSERT_EQ(result.pango_lineage_counts[1].count_of_sequences, 3); - ASSERT_EQ(result.pango_lineage_counts[7].pango_lineage, "B.1.1.7"); + ASSERT_EQ(result.pango_lineage_counts[7].pango_lineage.value, "B.1.1.7"); ASSERT_EQ(result.pango_lineage_counts[7].count_of_sequences, 48); - ASSERT_EQ(result.pango_lineage_counts[23].pango_lineage, "B.1.617.2.9.2"); + ASSERT_EQ(result.pango_lineage_counts[23].pango_lineage.value, "B.1.617.2.9.2"); ASSERT_EQ(result.pango_lineage_counts[23].count_of_sequences, 1); } diff --git a/src/silo/preprocessing/partition.cpp b/src/silo/preprocessing/partition.cpp index e09715284..e26c30c7e 100644 --- a/src/silo/preprocessing/partition.cpp +++ b/src/silo/preprocessing/partition.cpp @@ -2,9 +2,9 @@ #include #include +#include #include #include -#include #include #include @@ -43,27 +43,29 @@ std::vector mergePangosToChunks( uint32_t min_size ) { // Initialize chunks such that every chunk is just a pango_lineage - std::list chunks; + std::deque chunks; for (const auto& count : pango_lineage_counts) { - std::vector pango_lineages; - pango_lineages.push_back(count.pango_lineage); - const Chunk tmp = {count.pango_lineage, count.count_of_sequences, 0, pango_lineages}; - chunks.emplace_back(tmp); + chunks.push_back(Chunk{ + count.pango_lineage.value, + count.count_of_sequences, + 0, + std::vector{{count.pango_lineage}}}); } // We want to prioritise merges more closely related chunks. // Therefore, we first merge the chunks, with longer matching prefixes. // Precalculate the longest a prefix can be (which is the max length of lineages) - const uint32_t max_len = std::max_element( - pango_lineage_counts.begin(), - pango_lineage_counts.end(), - [](const PangoLineageCount& lhs, const PangoLineageCount& rhs) { - return lhs.pango_lineage.size() < rhs.pango_lineage.size(); - } - )->pango_lineage.size(); + const uint32_t max_len = + std::max_element( + pango_lineage_counts.begin(), + pango_lineage_counts.end(), + [](const PangoLineageCount& lhs, const PangoLineageCount& rhs) { + return lhs.pango_lineage.value.size() < rhs.pango_lineage.value.size(); + } + )->pango_lineage.value.size(); for (uint32_t len = max_len; len > 0; len--) { for (auto it = chunks.begin(); it != chunks.end() && std::next(it) != chunks.end();) { auto&& [pango1, pango2] = std::tie(*it, *std::next(it)); - std::string const common_prefix = commonPangoPrefix(pango1.prefix, pango2.prefix); + const std::string common_prefix = commonPangoPrefix(pango1.prefix, pango2.prefix); // We only look at possible merges with a common_prefix length of #len const bool one_chunk_is_very_small = pango1.count_of_sequences < min_size || pango2.count_of_sequences < min_size; @@ -211,13 +213,14 @@ Partitions Partitions::load(std::istream& input_file) { // size = atoi(size_str.c_str()); unused, only meta information count = atoi(count_str.c_str()); offset = atoi(offset_str.c_str()); - const silo::preprocessing::Chunk chunk{name, count, offset, std::vector()}; + const silo::preprocessing::Chunk chunk{ + name, count, offset, std::vector()}; descriptor.partitions.back().chunks.push_back(chunk); } else if (type.at(0) == 'L') { if (!getline(input_file, name, '\n')) { break; } - descriptor.partitions.back().chunks.back().pango_lineages.push_back(name); + descriptor.partitions.back().chunks.back().pango_lineages.push_back({name}); } else { throw silo::persistence::LoadDatabaseException("loadPartitions format exception"); } @@ -233,7 +236,7 @@ void Partitions::save(std::ostream& output_file) const { output_file << "C\t" << chunk.prefix << '\t' << chunk.pango_lineages.size() << '\t' << chunk.count_of_sequences << '\t' << chunk.offset << '\n'; for (const auto& pango_lineage : chunk.pango_lineages) { - output_file << "L\t" << pango_lineage << '\n'; + output_file << "L\t" << pango_lineage.value << '\n'; } } } diff --git a/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp b/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp index a5f47b3f2..60a15c1f1 100644 --- a/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp +++ b/src/silo/query_engine/filter_expressions/pango_lineage_filter.cpp @@ -42,7 +42,7 @@ std::string PangoLineageFilter::toString(const silo::Database& /*database*/) con } std::unique_ptr PangoLineageFilter::compile( - const silo::Database& database, + const silo::Database& /*database*/, const silo::DatabasePartition& database_partition, AmbiguityMode /*mode*/ ) const { @@ -50,14 +50,15 @@ std::unique_ptr PangoLineageFilter::com return std::make_unique(database_partition.sequenceCount); } - std::string lineage_copy = lineage; - std::transform(lineage_copy.begin(), lineage_copy.end(), lineage_copy.begin(), ::toupper); - const auto resolved_lineage = database.getAliasKey().resolvePangoLineageAlias(lineage_copy); + std::string lineage_all_upper = lineage; + std::transform( + lineage_all_upper.begin(), lineage_all_upper.end(), lineage_all_upper.begin(), ::toupper + ); const auto& pango_lineage_column = database_partition.columns.pango_lineage_columns.at(column); const auto& bitmap = include_sublineages - ? pango_lineage_column.filterIncludingSublineages({resolved_lineage}) - : pango_lineage_column.filter({resolved_lineage}); + ? pango_lineage_column.filterIncludingSublineages({lineage_all_upper}) + : pango_lineage_column.filter({lineage_all_upper}); return std::make_unique( new roaring::Roaring(bitmap), database_partition.sequenceCount diff --git a/src/silo/storage/column/pango_lineage_column.cpp b/src/silo/storage/column/pango_lineage_column.cpp index 25bedaa37..88deda654 100644 --- a/src/silo/storage/column/pango_lineage_column.cpp +++ b/src/silo/storage/column/pango_lineage_column.cpp @@ -9,23 +9,26 @@ namespace silo::storage::column { PangoLineageColumnPartition::PangoLineageColumnPartition( - common::BidirectionalMap& lookup + silo::PangoLineageAliasLookup& alias_key, + common::BidirectionalMap& lookup ) - : lookup(lookup){}; + : alias_key(alias_key), + lookup(lookup){}; -void PangoLineageColumnPartition::insert(const common::PangoLineage& value) { - for (const auto& parent_lineage : value.getParentLineages()) { +void PangoLineageColumnPartition::insert(const common::RawPangoLineage& value) { + const common::UnaliasedPangoLineage resolved_lineage = alias_key.unaliasPangoLineage(value); + for (const auto& parent_lineage : resolved_lineage.getParentLineages()) { (void)lookup.getOrCreateId(parent_lineage); } - const Idx value_id = lookup.getOrCreateId(value); + const Idx value_id = lookup.getOrCreateId(resolved_lineage); const size_t row_number = value_ids.size(); - indexed_values[value_id].add(row_number); - insertSublineageValues(value, row_number); value_ids.push_back(value_id); + indexed_values[value_id].add(row_number); + insertSublineageValues(resolved_lineage, row_number); } void PangoLineageColumnPartition::insertSublineageValues( - const common::PangoLineage& value, + const common::UnaliasedPangoLineage& value, size_t row_number ) { for (const auto& pango_lineage : value.getParentLineages()) { @@ -34,8 +37,9 @@ void PangoLineageColumnPartition::insertSublineageValues( } } -roaring::Roaring PangoLineageColumnPartition::filter(const common::PangoLineage& value) const { - auto value_id = lookup.getId(value); +roaring::Roaring PangoLineageColumnPartition::filter(const common::RawPangoLineage& value) const { + const common::UnaliasedPangoLineage resolved_lineage = alias_key.unaliasPangoLineage(value); + auto value_id = lookup.getId(resolved_lineage); if (value_id.has_value() && indexed_values.contains(value_id.value())) { return indexed_values.at(value_id.value()); } @@ -43,9 +47,10 @@ roaring::Roaring PangoLineageColumnPartition::filter(const common::PangoLineage& } roaring::Roaring PangoLineageColumnPartition::filterIncludingSublineages( - const common::PangoLineage& value + const common::RawPangoLineage& value ) const { - auto value_id = lookup.getId(value); + const common::UnaliasedPangoLineage resolved_lineage = alias_key.unaliasPangoLineage(value); + auto value_id = lookup.getId(resolved_lineage); if (value_id.has_value() && indexed_sublineage_values.contains(value_id.value())) { return indexed_sublineage_values.at(value_id.value()); } @@ -56,12 +61,13 @@ const std::vector& PangoLineageColumnPartition::getValues() const { return this->value_ids; } -PangoLineageColumn::PangoLineageColumn() { - lookup = std::make_unique>(); +PangoLineageColumn::PangoLineageColumn(silo::PangoLineageAliasLookup alias_key) { + lookup = std::make_unique>(); + this->alias_key = std::make_unique(std::move(alias_key)); }; PangoLineageColumnPartition& PangoLineageColumn::createPartition() { - return partitions.emplace_back(*lookup); + return partitions.emplace_back(*alias_key, *lookup); } } // namespace silo::storage::column diff --git a/src/silo/storage/column/pango_lineage_column.test.cpp b/src/silo/storage/column/pango_lineage_column.test.cpp index a8f2a9579..1927d904a 100644 --- a/src/silo/storage/column/pango_lineage_column.test.cpp +++ b/src/silo/storage/column/pango_lineage_column.test.cpp @@ -3,8 +3,10 @@ #include TEST(PangoLineageColumn, addingLineageAndThenSublineageFiltersCorrectly) { - silo::common::BidirectionalMap lookup; - silo::storage::column::PangoLineageColumnPartition under_test(lookup); + silo::common::BidirectionalMap lookup; + silo::PangoLineageAliasLookup alias_key = + silo::PangoLineageAliasLookup::readFromFile("testBaseData/pangolineage_alias.json"); + silo::storage::column::PangoLineageColumnPartition under_test(alias_key, lookup); under_test.insert({"A.1.2"}); under_test.insert({"A.1.2"}); @@ -20,8 +22,10 @@ TEST(PangoLineageColumn, addingLineageAndThenSublineageFiltersCorrectly) { } TEST(PangoLineageColumn, addingSublineageAndThenLineageFiltersCorrectly) { - silo::common::BidirectionalMap lookup; - silo::storage::column::PangoLineageColumnPartition under_test(lookup); + silo::common::BidirectionalMap lookup; + silo::PangoLineageAliasLookup alias_key = + silo::PangoLineageAliasLookup::readFromFile("testBaseData/pangolineage_alias.json"); + silo::storage::column::PangoLineageColumnPartition under_test(alias_key, lookup); under_test.insert({"A.1.2.3"}); under_test.insert({"A.1.2.3"}); @@ -37,8 +41,10 @@ TEST(PangoLineageColumn, addingSublineageAndThenLineageFiltersCorrectly) { } TEST(PangoLineageColumn, queryParentLineageThatWasNeverInserted) { - silo::common::BidirectionalMap lookup; - silo::storage::column::PangoLineageColumnPartition under_test(lookup); + silo::common::BidirectionalMap lookup; + silo::PangoLineageAliasLookup alias_key = + silo::PangoLineageAliasLookup::readFromFile("testBaseData/pangolineage_alias.json"); + silo::storage::column::PangoLineageColumnPartition under_test(alias_key, lookup); under_test.insert({"A.1.2.3"}); under_test.insert({"A.1.2.3"}); diff --git a/src/silo/storage/column_group.cpp b/src/silo/storage/column_group.cpp index 89c8050dd..ab4b902db 100644 --- a/src/silo/storage/column_group.cpp +++ b/src/silo/storage/column_group.cpp @@ -19,7 +19,6 @@ namespace silo::storage { uint32_t ColumnGroup::fill( const std::filesystem::path& input_file, - const PangoLineageAliasLookup& alias_key, const silo::config::DatabaseConfig& database_config ) { auto metadata_reader = silo::preprocessing::MetadataReader(input_file); @@ -30,15 +29,13 @@ uint32_t ColumnGroup::fill( for (auto& row : metadata_reader.reader) { for (const auto& item : database_config.schema.metadata) { const std::string value = row[item.name].get(); - const auto column_type = item.getColumnType(); if (column_type == silo::config::ColumnType::INDEXED_STRING) { indexed_string_columns.at(item.name).insert(value); } else if (column_type == silo::config::ColumnType::STRING) { string_columns.at(item.name).insert(value); } else if (column_type == silo::config::ColumnType::INDEXED_PANGOLINEAGE) { - const std::string pango_lineage = alias_key.resolvePangoLineageAlias(value); - pango_lineage_columns.at(item.name).insert({pango_lineage}); + pango_lineage_columns.at(item.name).insert({value}); } else if (column_type == silo::config::ColumnType::DATE) { date_columns.at(item.name).insert(common::stringToDate(value)); } else if (column_type == silo::config::ColumnType::INT) { diff --git a/src/silo/storage/pango_lineage_alias.cpp b/src/silo/storage/pango_lineage_alias.cpp index bca0d051d..dc9f68759 100644 --- a/src/silo/storage/pango_lineage_alias.cpp +++ b/src/silo/storage/pango_lineage_alias.cpp @@ -14,50 +14,87 @@ #include #include +#include "silo/common/pango_lineage.h" + namespace silo { PangoLineageAliasLookup::PangoLineageAliasLookup( - std::unordered_map alias_key + std::unordered_map> alias_key ) : alias_key(std::move(alias_key)) {} -std::string PangoLineageAliasLookup::resolvePangoLineageAlias(const std::string& pango_lineage +common::UnaliasedPangoLineage PangoLineageAliasLookup::unaliasPangoLineage( + const common::RawPangoLineage& pango_lineage ) const { std::string pango_lineage_prefix; - std::stringstream pango_lineage_stream(pango_lineage); + std::stringstream pango_lineage_stream(pango_lineage.value); getline(pango_lineage_stream, pango_lineage_prefix, '.'); if (alias_key.contains(pango_lineage_prefix)) { if (alias_key.at(pango_lineage_prefix).empty()) { - return pango_lineage; + return {pango_lineage.value}; + } + if (alias_key.at(pango_lineage_prefix).size() > 1) { + return {pango_lineage.value}; } if (pango_lineage_stream.eof()) { - return alias_key.at(pango_lineage_prefix); + return {alias_key.at(pango_lineage_prefix).at(0)}; } const std::string suffix( (std::istream_iterator(pango_lineage_stream)), std::istream_iterator() ); - return alias_key.at(pango_lineage_prefix) + '.' + suffix; + return {alias_key.at(pango_lineage_prefix).at(0) + '.' + suffix}; } - return pango_lineage; + return {pango_lineage.value}; } +/*std::vector PangoLineageAliasLookup::allUnaliasedPangoLineages( + const common::UnaliasedPangoLineage& pango_lineage +) const { + std::vector result; + std::vector queue; + queue.emplace_back(pango_lineage.value); + while (!queue.empty()) { + common::UnaliasedPangoLineage current_lineage = std::move(queue.back()); + queue.pop_back(); + + std::string pango_lineage_prefix; + std::stringstream pango_lineage_stream(current_lineage.value); + getline(pango_lineage_stream, pango_lineage_prefix, '.'); + if (alias_key.contains(pango_lineage_prefix)) { + if (alias_key.at(pango_lineage_prefix).empty()) { + result.emplace_back(std::move(current_lineage)); + } + std::string suffix; + if (!pango_lineage_stream.eof()) { + suffix = std::string( + (std::istream_iterator(pango_lineage_stream)), std::istream_iterator() + ); + } + for (const std::string& prefix : alias_key.at(pango_lineage_prefix)) { + queue.push_back({prefix + "." + suffix}); + } + } else { + result.emplace_back(current_lineage); + } + } + return result; +}*/ + namespace { -std::unordered_map readFromJson( +std::unordered_map> readFromJson( const std::filesystem::path& pango_lineage_alias_file ) { - std::unordered_map alias_keys; + std::unordered_map> alias_keys; nlohmann::json alias_key_json; std::ifstream(pango_lineage_alias_file) >> alias_key_json; for (const auto& [key, value] : alias_key_json.items()) { if (value.is_array()) { - SPDLOG_INFO( - "Alias value {} is a recombinant. Recombinants are not implemented yet.", value.dump() - ); - continue; + alias_keys[key] = value.get>(); + } else if (value.is_string() && !value.get().empty()) { + alias_keys[key] = {value.get()}; } - alias_keys[key] = value; } return alias_keys; } diff --git a/src/silo/storage/pango_lineage_alias.test.cpp b/src/silo/storage/pango_lineage_alias.test.cpp index 44b4ef1ae..0fb71e6fc 100644 --- a/src/silo/storage/pango_lineage_alias.test.cpp +++ b/src/silo/storage/pango_lineage_alias.test.cpp @@ -2,6 +2,7 @@ #include #include +#include "silo/common/pango_lineage.h" #include "silo/storage/pango_lineage_alias.h" namespace { @@ -13,17 +14,16 @@ struct TestParameter { class ResolveAliasTestFixture : public ::testing::TestWithParam { protected: - const silo::PangoLineageAliasLookup alias_map = silo::PangoLineageAliasLookup( - std::unordered_map{{"X", "A"}, {"XY", "A.1"}} - ); + const silo::PangoLineageAliasLookup alias_map = + silo::PangoLineageAliasLookup{{{"X", {"A"}}, {"XY", {"A.1"}}}}; }; TEST_P(ResolveAliasTestFixture, shouldReturnExpectedResolvedAlias) { const auto test_parameter = GetParam(); - const auto result = alias_map.resolvePangoLineageAlias(test_parameter.input); + const auto result = alias_map.unaliasPangoLineage({test_parameter.input}); - ASSERT_EQ(result, test_parameter.expected_result); + ASSERT_EQ(result.value, test_parameter.expected_result); } // NOLINTNEXTLINE(readability-identifier-length) @@ -45,11 +45,11 @@ TEST(PangoLineageAliasLookup, readFromFile) { auto under_test = silo::PangoLineageAliasLookup::readFromFile("testBaseData/pangolineage_alias.json"); - ASSERT_EQ(under_test.resolvePangoLineageAlias("B"), "B"); - ASSERT_EQ(under_test.resolvePangoLineageAlias("B.1"), "B.1"); - ASSERT_EQ(under_test.resolvePangoLineageAlias("B.1.2"), "B.1.2"); - ASSERT_EQ(under_test.resolvePangoLineageAlias("C"), "B.1.1.1"); - ASSERT_EQ(under_test.resolvePangoLineageAlias("EP"), "B.1.1.529.2.75.3.1.1.4"); + ASSERT_EQ(under_test.unaliasPangoLineage({"B"}).value, "B"); + ASSERT_EQ(under_test.unaliasPangoLineage({"B.1"}).value, "B.1"); + ASSERT_EQ(under_test.unaliasPangoLineage({"B.1.2"}).value, "B.1.2"); + ASSERT_EQ(under_test.unaliasPangoLineage({"C"}).value, "B.1.1.1"); + ASSERT_EQ(under_test.unaliasPangoLineage({"EP"}).value, "B.1.1.529.2.75.3.1.1.4"); } TEST(PangoLineageAliasLookup, readFromFileShouldThrowIfFileDoesNotExist) { diff --git a/testBaseData/small_metadata_set.tsv b/testBaseData/small_metadata_set.tsv index 3b5becdee..217fe0555 100644 --- a/testBaseData/small_metadata_set.tsv +++ b/testBaseData/small_metadata_set.tsv @@ -12,7 +12,7 @@ EPI_ISL_1749960 B.1.1.7 2021-04-15 Europe Switzerland Basel-Land 2021-02-03 59 0 EPI_ISL_1361468 B.1.1.7 2021-03-06 Europe Switzerland Zürich 2021-01-20 50 0.98 EPI_ISL_1408062 B.1.1.7 2021-03-03 Europe Switzerland Valais 2020-11-24 50 0.97 EPI_ISL_1597890 B.1.1.7 2021-03-21 "" Switzerland Vaud 2021-01-25 51 0.96 -EPI_ISL_1682849 B.1.236 2020-12-17 Europe Switzerland Thurgau 2021-01-21 52 0.95 +EPI_ISL_1682849 XA.1 2020-12-17 Europe Switzerland Thurgau 2021-01-21 52 0.95 EPI_ISL_1408805 B.1.221 2020-11-24 Europe Switzerland Schwyz 2020-12-09 53 0.94 EPI_ISL_1750868 B.1.1.189 2020-12-15 Europe Switzerland Solothurn 2021-01-20 54 0.93 EPI_ISL_2019350 B.1.1.7 2021-04-27 Europe Switzerland Valais 2020-12-21 55 0.92 @@ -49,7 +49,7 @@ EPI_ISL_1119584 B.1.1 2020-11-04 Europe Switzerland Solothurn 2021-07-05 52 0.92 EPI_ISL_1002052 B.1 2021-01-15 Europe Switzerland Solothurn 2021-07-15 53 0.91 EPI_ISL_466942 B.1 2020-03-08 Europe Switzerland Basel-Stadt 2021-05-12 54 0.90 EPI_ISL_1003849 B.1.160 2021-01-29 Europe Switzerland Neuchâtel 2021-08-05 55 0.89 -EPI_ISL_768148 B.1.160 2020-12-24 Europe Switzerland Sankt Gallen 2020-03-16 56 0.98 +EPI_ISL_768148 GD.1 2020-12-24 Europe Switzerland Sankt Gallen 2020-03-16 56 0.98 EPI_ISL_1080536 B.1.1.7 2021-02-10 Europe Switzerland Basel-Land 2021-08-04 57 0.97 EPI_ISL_1002156 B.1.221 2021-01-16 Europe Switzerland Basel-Land 2021-02-03 58 0.96 EPI_ISL_1119315 B.1.1.7 2021-02-14 Europe Switzerland Graubünden 2021-03-18 59 0.95