Skip to content

Commit

Permalink
feat: support recombinant lineages
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Jul 13, 2023
1 parent 27b992a commit a646dab
Show file tree
Hide file tree
Showing 28 changed files with 296 additions and 163 deletions.
22 changes: 15 additions & 7 deletions endToEndTests/test/queries/GroupByLineage.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@
"query": {
"action": {
"type": "Aggregated",
"groupByFields": ["pango_lineage"],
"orderByFields": ["pango_lineage"]
"groupByFields": [
"pango_lineage"
],
"orderByFields": [
"pango_lineage"
]
},
"filterExpression": {
"type": "True"
Expand Down Expand Up @@ -52,7 +56,7 @@
"pango_lineage": "B.1.1.70"
},
{
"count": 7,
"count": 6,
"pango_lineage": "B.1.160"
},
{
Expand All @@ -71,10 +75,6 @@
"count": 3,
"pango_lineage": "B.1.221"
},
{
"count": 1,
"pango_lineage": "B.1.236"
},
{
"count": 1,
"pango_lineage": "B.1.258"
Expand Down Expand Up @@ -106,6 +106,14 @@
{
"count": 1,
"pango_lineage": "B.1.617.2.9.2"
},
{
"count": 1,
"pango_lineage": "XA.1"
},
{
"count": 1,
"pango_lineage": "XBB.1.9.3.1"
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
"query": {
"action": {
"type": "Aggregated",
"groupByFields": ["pango_lineage"],
"groupByFields": [
"pango_lineage"
],
"orderByFields": [
{
"field": "count",
Expand All @@ -26,7 +28,7 @@
"pango_lineage": "B.1.177"
},
{
"count": 7,
"count": 6,
"pango_lineage": "B.1.160"
},
{
Expand Down
6 changes: 4 additions & 2 deletions endToEndTests/test/queries/OffsetLimitOverlap.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
"query": {
"action": {
"type": "Details",
"orderByFields": ["gisaid_epi_isl"],
"orderByFields": [
"gisaid_epi_isl"
],
"offset": 90,
"limit": 90
},
Expand Down Expand Up @@ -62,7 +64,7 @@
"date": "2020-12-24",
"division": "Sankt Gallen",
"gisaid_epi_isl": "EPI_ISL_768148",
"pango_lineage": "B.1.160",
"pango_lineage": "XBB.1.9.3.1",
"qc_value": 0.98,
"region": "Europe",
"unsorted_date": "2020-03-16"
Expand Down
19 changes: 19 additions & 0 deletions endToEndTests/test/queries/recombinantLineage.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"testCaseName": "Recombinant lineage XBB including sublineages",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "PangoLineage",
"column": "pango_lineage",
"value": "XBB",
"includeSublineages": true
}
},
"expectedQueryResult": [
{
"count": 1
}
]
}
19 changes: 19 additions & 0 deletions endToEndTests/test/queries/recombinantLineageWithAlias.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"testCaseName": "Recombinant lineage GD with unaliasing",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "PangoLineage",
"column": "pango_lineage",
"value": "GD",
"includeSublineages": true
}
},
"expectedQueryResult": [
{
"count": 1
}
]
}
18 changes: 11 additions & 7 deletions include/silo/common/pango_lineage.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@

namespace silo::common {

struct PangoLineage {
struct RawPangoLineage {
std::string value;
};

struct UnaliasedPangoLineage {
std::string value;

template <class Archive>
Expand All @@ -19,19 +23,19 @@ struct PangoLineage {
// clang-format on
}

bool isSublineageOf(const PangoLineage& other) const;
bool isSublineageOf(const UnaliasedPangoLineage& other) const;

std::vector<PangoLineage> getParentLineages() const;
std::vector<UnaliasedPangoLineage> getParentLineages() const;

bool operator<(const PangoLineage& other) const;
bool operator==(const PangoLineage& other) const;
bool operator<(const UnaliasedPangoLineage& other) const;
bool operator==(const UnaliasedPangoLineage& other) const;
};

} // namespace silo::common

template <>
struct std::hash<silo::common::PangoLineage> {
std::size_t operator()(const silo::common::PangoLineage& pango_lineage) const;
struct std::hash<silo::common::UnaliasedPangoLineage> {
std::size_t operator()(const silo::common::UnaliasedPangoLineage& pango_lineage) const;
};

#endif // SILO_PANGO_LINEAGE_H
2 changes: 2 additions & 0 deletions include/silo/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ class Database {
PangoLineageAliasLookup alias_key;

void initializeColumns();

void initializeColumn(config::ColumnType column_type, const std::string& name);

void initializeSequences();

static BitmapSizePerSymbol calculateBitmapSizePerSymbol(const SequenceStore& seq_store);
Expand Down
4 changes: 3 additions & 1 deletion include/silo/preprocessing/pango_lineage_count.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <string>
#include <vector>

#include "silo/common/pango_lineage.h"

namespace silo {
struct PangoLineageAliasLookup;

Expand All @@ -17,7 +19,7 @@ struct DatabaseConfig;
namespace preprocessing {

struct PangoLineageCount {
std::string pango_lineage;
common::UnaliasedPangoLineage pango_lineage;
uint32_t count_of_sequences;
};

Expand Down
6 changes: 5 additions & 1 deletion include/silo/preprocessing/partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
#include <string>
#include <vector>

namespace silo::common {
// Forward declaration only; the type is defined in silo/common/pango_lineage.h.
// It is declared with `struct` to match the class-key of that definition:
// a `class`/`struct` mismatch triggers -Wmismatched-tags (MSVC C4099) and can
// lead to linker errors under the Microsoft C++ ABI.
struct UnaliasedPangoLineage;
} // namespace silo::common

namespace silo::preprocessing {

class PangoLineageCounts;
Expand All @@ -23,7 +27,7 @@ struct Chunk {
std::string prefix;
uint32_t count_of_sequences;
uint32_t offset;
std::vector<std::string> pango_lineages;
std::vector<common::UnaliasedPangoLineage> pango_lineages;
};

struct Partition {
Expand Down
24 changes: 15 additions & 9 deletions include/silo/storage/column/pango_lineage_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "silo/common/bidirectional_map.h"
#include "silo/common/pango_lineage.h"
#include "silo/common/types.h"
#include "silo/storage/pango_lineage_alias.h"

namespace boost::serialization {
struct access;
Expand All @@ -37,22 +38,26 @@ class PangoLineageColumnPartition {
std::unordered_map<Idx, roaring::Roaring> indexed_values;
std::unordered_map<Idx, roaring::Roaring> indexed_sublineage_values;

silo::common::BidirectionalMap<common::PangoLineage>& lookup;
silo::PangoLineageAliasLookup& alias_key;
silo::common::BidirectionalMap<common::UnaliasedPangoLineage>& lookup;

void insertSublineageValues(const common::PangoLineage& value, size_t row_number);
void insertSublineageValues(const common::UnaliasedPangoLineage& value, size_t row_number);

public:
explicit PangoLineageColumnPartition(common::BidirectionalMap<common::PangoLineage>& lookup);
explicit PangoLineageColumnPartition(
silo::PangoLineageAliasLookup& alias_key,
common::BidirectionalMap<common::UnaliasedPangoLineage>& lookup
);

void insert(const common::PangoLineage& value);
void insert(const common::RawPangoLineage& value);

roaring::Roaring filter(const common::PangoLineage& value) const;
roaring::Roaring filter(const common::RawPangoLineage& value) const;

roaring::Roaring filterIncludingSublineages(const common::PangoLineage& value) const;
roaring::Roaring filterIncludingSublineages(const common::RawPangoLineage& value) const;

const std::vector<silo::Idx>& getValues() const;

inline common::PangoLineage lookupValue(Idx id) const { return lookup.getValue(id); }
inline common::UnaliasedPangoLineage lookupValue(Idx id) const { return lookup.getValue(id); }
};

class PangoLineageColumn {
Expand All @@ -67,11 +72,12 @@ class PangoLineageColumn {
// TODO sync lookups in children
}

std::unique_ptr<silo::common::BidirectionalMap<common::PangoLineage>> lookup;
std::unique_ptr<silo::common::BidirectionalMap<common::UnaliasedPangoLineage>> lookup;
std::unique_ptr<silo::PangoLineageAliasLookup> alias_key;
std::deque<PangoLineageColumnPartition> partitions;

public:
explicit PangoLineageColumn();
explicit PangoLineageColumn(silo::PangoLineageAliasLookup alias_key);

PangoLineageColumnPartition& createPartition();
};
Expand Down
1 change: 0 additions & 1 deletion include/silo/storage/column_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ struct ColumnGroup {

uint32_t fill(
const std::filesystem::path& input_file,
const PangoLineageAliasLookup& alias_key,
const silo::config::DatabaseConfig& database_config
);

Expand Down
19 changes: 16 additions & 3 deletions include/silo/storage/pango_lineage_alias.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,31 @@
#include <filesystem>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

namespace silo {

// Forward declarations of the lineage value types that appear in the signature
// of PangoLineageAliasLookup::unaliasPangoLineage below. Both are defined in
// silo/common/pango_lineage.h.
// NOTE(review): presumably forward-declared to keep this header from including
// pango_lineage.h — confirm against the include list.
namespace common {
struct UnaliasedPangoLineage;
struct RawPangoLineage;

} // namespace common

class PangoLineageAliasLookup {
private:
std::unordered_map<std::string, std::string> alias_key;
std::unordered_map<std::string, std::vector<std::string>> alias_key;

public:
PangoLineageAliasLookup() = default;
explicit PangoLineageAliasLookup(std::unordered_map<std::string, std::string> alias_key);

std::string resolvePangoLineageAlias(const std::string& pango_lineage) const;
explicit PangoLineageAliasLookup(
std::unordered_map<std::string, std::vector<std::string>> alias_key
);

[[nodiscard]] common::UnaliasedPangoLineage unaliasPangoLineage(
const common::RawPangoLineage& pango_lineage
) const;

static silo::PangoLineageAliasLookup readFromFile(
const std::filesystem::path& pango_lineage_alias_file
Expand Down
2 changes: 1 addition & 1 deletion src/silo/common/bidirectional_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Idx BidirectionalMap<V>::getOrCreateId(V value) {
return identifier;
}

template class BidirectionalMap<PangoLineage>;
template class BidirectionalMap<UnaliasedPangoLineage>;
template class BidirectionalMap<std::string>;

} // namespace silo::common
20 changes: 10 additions & 10 deletions src/silo/common/bidirectional_map.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,19 @@ TEST(BidirectionalMap, correctStdStringDict) {
}

TEST(BidirectionalMap, correctPangoLineageDict) {
BidirectionalMap<PangoLineage> under_test;
EXPECT_EQ(under_test.getId(PangoLineage{"Not in dict"}), std::nullopt);
BidirectionalMap<UnaliasedPangoLineage> under_test;
EXPECT_EQ(under_test.getId(UnaliasedPangoLineage{"Not in dict"}), std::nullopt);

EXPECT_EQ(under_test.getOrCreateId(PangoLineage{"Now in dict"}), 0);
EXPECT_EQ(under_test.getOrCreateId(PangoLineage{"Now in dict"}), 0);
EXPECT_EQ(under_test.getOrCreateId(PangoLineage{"Second in dict"}), 1);
EXPECT_EQ(under_test.getOrCreateId(UnaliasedPangoLineage{"Now in dict"}), 0);
EXPECT_EQ(under_test.getOrCreateId(UnaliasedPangoLineage{"Now in dict"}), 0);
EXPECT_EQ(under_test.getOrCreateId(UnaliasedPangoLineage{"Second in dict"}), 1);

EXPECT_EQ(under_test.getId(PangoLineage{"Now in dict"}), 0);
EXPECT_EQ(under_test.getId(PangoLineage{"Still not in dict"}), std::nullopt);
EXPECT_EQ(under_test.getId(PangoLineage{"Second in dict"}), 1);
EXPECT_EQ(under_test.getId(UnaliasedPangoLineage{"Now in dict"}), 0);
EXPECT_EQ(under_test.getId(UnaliasedPangoLineage{"Still not in dict"}), std::nullopt);
EXPECT_EQ(under_test.getId(UnaliasedPangoLineage{"Second in dict"}), 1);

EXPECT_EQ(under_test.getValue(0), PangoLineage{"Now in dict"});
EXPECT_EQ(under_test.getValue(1), PangoLineage{"Second in dict"});
EXPECT_EQ(under_test.getValue(0), UnaliasedPangoLineage{"Now in dict"});
EXPECT_EQ(under_test.getValue(1), UnaliasedPangoLineage{"Second in dict"});
}

} // namespace silo::common
Loading

0 comments on commit a646dab

Please sign in to comment.