Skip to content

Commit

Permalink
fixup! fix: resolve aliases when inserting to or querying lineage ind…
Browse files Browse the repository at this point in the history
…exes again

review comments
  • Loading branch information
Taepper committed Oct 9, 2024
1 parent cdefbe0 commit b9a24c7
Show file tree
Hide file tree
Showing 20 changed files with 273 additions and 162 deletions.
2 changes: 0 additions & 2 deletions include/silo/preprocessing/lineage_definition_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
#include <filesystem>
#include <vector>

#include <yaml-cpp/yaml.h>

#include "silo/common/lineage_name.h"

namespace silo::preprocessing {
Expand Down
2 changes: 1 addition & 1 deletion include/silo/storage/column/indexed_string_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ class IndexedStringColumnPartition {
std::optional<LineageIndex> lineage_index;
common::BidirectionalMap<std::string>* lookup;

public:
explicit IndexedStringColumnPartition(common::BidirectionalMap<std::string>* lookup);

explicit IndexedStringColumnPartition(
common::BidirectionalMap<std::string>* lookup,
const common::LineageTree* lineage_tree
);

public:
[[nodiscard]] std::optional<const roaring::Roaring*> filter(silo::Idx value_id) const;

std::optional<const roaring::Roaring*> filter(const std::optional<std::string>& value) const;
Expand Down
12 changes: 11 additions & 1 deletion include/silo/storage/lineage_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,17 @@

namespace silo::storage {

// Forward declaration for the friend class access of IndexedStringColumn
// which is the only allowed user of this class (/this class's constructor).
// This ensures safety of the lineage_tree pointer, which
// (i) is guaranteed to be initialized (and const) by the constructor and stays valid as
// (ii) the lifetime of this index is bound to the containing column_partition containing it
namespace column {
class IndexedStringColumnPartition;
}

class LineageIndex {
friend class column::IndexedStringColumnPartition;
friend class boost::serialization::access;

const common::LineageTree* lineage_tree;
Expand All @@ -28,9 +38,9 @@ class LineageIndex {
// clang-format on
}

public:
explicit LineageIndex(const common::LineageTree* lineage_tree);

public:
void insert(size_t row_id, Idx value_id);

std::optional<const roaring::Roaring*> filterIncludingSublineages(Idx value_id) const;
Expand Down
49 changes: 40 additions & 9 deletions src/silo/common/lineage_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,12 @@ LineageTreeAndIdMap::LineageTreeAndIdMap(
: lineage_tree(std::move(lineage_tree)),
lineage_id_lookup_map(std::move(lineage_id_lookup_map)) {}

LineageTreeAndIdMap LineageTreeAndIdMap::fromLineageDefinitionFile(
const preprocessing::LineageDefinitionFile& file
namespace {

void assignLineageIds(
const preprocessing::LineageDefinitionFile& file,
BidirectionalMap<std::string>& lookup
) {
BidirectionalMap<std::string> lookup;
std::unordered_map<uint32_t, uint32_t> alias_mapping;
for (const auto& lineage : file.lineages) {
if (lookup.getId(lineage.lineage_name.string).has_value()) {
throw silo::preprocessing::PreprocessingException(fmt::format(
Expand All @@ -191,8 +192,16 @@ LineageTreeAndIdMap LineageTreeAndIdMap::fromLineageDefinitionFile(
}
lookup.getOrCreateId(lineage.lineage_name.string);
}
}

std::unordered_map<Idx, Idx> assignAliasIdsAndGetAliasMapping(
const preprocessing::LineageDefinitionFile& file,
BidirectionalMap<std::string>& lookup
) {
std::unordered_map<Idx, Idx> alias_mapping;
for (const auto& lineage : file.lineages) {
auto lineage_id = lookup.getId(lineage.lineage_name.string).value();
const auto lineage_id = lookup.getId(lineage.lineage_name.string);
ASSERT(lineage_id.has_value());
for (const auto& alias : lineage.aliases) {
if (lookup.getId(alias.string).has_value()) {
throw silo::preprocessing::PreprocessingException(fmt::format(
Expand All @@ -202,13 +211,21 @@ LineageTreeAndIdMap LineageTreeAndIdMap::fromLineageDefinitionFile(
));
}
auto alias_id = lookup.getOrCreateId(alias.string);
alias_mapping[alias_id] = lineage_id;
alias_mapping[alias_id] = lineage_id.value();
}
}
return alias_mapping;
}

std::vector<std::pair<Idx, Idx>> getParentChildEdges(
const preprocessing::LineageDefinitionFile& file,
const BidirectionalMap<std::string>& lookup,
const std::unordered_map<Idx, Idx>& alias_mapping
) {
std::vector<std::pair<Idx, Idx>> edge_list;
for (const auto& lineage : file.lineages) {
const Idx my_id = lookup.getId(lineage.lineage_name.string).value();
ASSERT(!alias_mapping.contains(my_id));
const auto child_id = lookup.getId(lineage.lineage_name.string);
ASSERT(child_id.has_value());

for (const auto& parent_lineage : lineage.parents) {
auto parent_id = lookup.getId(parent_lineage.string);
Expand All @@ -223,9 +240,23 @@ LineageTreeAndIdMap LineageTreeAndIdMap::fromLineageDefinitionFile(
if (alias_mapping.contains(parent_id.value())) {
parent_id = alias_mapping.at(parent_id.value());
}
edge_list.emplace_back(parent_id.value(), my_id);
edge_list.emplace_back(parent_id.value(), child_id.value());
}
}
return edge_list;
}

} // namespace

LineageTreeAndIdMap LineageTreeAndIdMap::fromLineageDefinitionFile(
const preprocessing::LineageDefinitionFile& file
) {
BidirectionalMap<std::string> lookup;
assignLineageIds(file, lookup);
std::unordered_map<Idx, Idx> alias_mapping = assignAliasIdsAndGetAliasMapping(file, lookup);

const std::vector<std::pair<Idx, Idx>> edge_list =
getParentChildEdges(file, lookup, alias_mapping);
auto lineage_tree =
LineageTree::fromEdgeList(file.lineages.size(), edge_list, lookup, std::move(alias_mapping));
return {std::move(lineage_tree), std::move(lookup)};
Expand Down
134 changes: 54 additions & 80 deletions src/silo/common/lineage_tree.test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "silo/common/lineage_tree.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "silo/preprocessing/lineage_definition_file.h"
Expand Down Expand Up @@ -61,13 +62,14 @@ TEST(LineageTreeAndIdMap, errorOnMissingParent) {
TEST(LineageTreeAndIdMap, correctTreeRelations) {
auto lineage_definition_file = LineageDefinitionFile::fromYAML(R"(
BASE:
aliases: [ base_alias ]
parents: []
CHILD1:
parents:
- BASE
CHILD2:
parents:
- BASE
- base_alias
GRANDCHILD1:
parents:
- CHILD1
Expand Down Expand Up @@ -106,19 +108,12 @@ TEST(LineageTreeAndIdMap, correctCycleErrorInFile) {
)"));
};

EXPECT_THROW(
{
try {
throwing_lambda();
} catch (const silo::preprocessing::PreprocessingException& e) {
ASSERT_EQ(
std::string(e.what()),
"The given LineageTree contains the cycle: BASE -> CHILD -> BASE"
);
throw;
}
},
silo::preprocessing::PreprocessingException
EXPECT_THAT(
throwing_lambda,
ThrowsMessage<silo::preprocessing::PreprocessingException>(::testing::HasSubstr(
"The given LineageTree contains the cycle: BASE -> CHILD -> BASE"

))
);
}

Expand All @@ -134,18 +129,12 @@ TEST(LineageTreeAndIdMap, correctSelfCycleErrorInFile) {
)"));
};

EXPECT_THROW(
{
try {
throwing_lambda();
} catch (const silo::preprocessing::PreprocessingException& e) {
ASSERT_EQ(
std::string(e.what()), "The given LineageTree contains the cycle: BASE -> BASE"
);
throw;
}
},
silo::preprocessing::PreprocessingException
EXPECT_THAT(
throwing_lambda,
ThrowsMessage<silo::preprocessing::PreprocessingException>(::testing::HasSubstr(
"The given LineageTree contains the cycle: BASE -> BASE"

))
);
}

Expand All @@ -166,19 +155,11 @@ BASE: {}
)"));
};

EXPECT_THROW(
{
try {
throwing_lambda();
} catch (const silo::preprocessing::PreprocessingException& e) {
ASSERT_EQ(
std::string(e.what()),
"The given LineageTree contains the cycle: CHILD1 -> CHILD2 -> CHILD3 -> CHILD1"
);
throw;
}
},
silo::preprocessing::PreprocessingException
EXPECT_THAT(
throwing_lambda,
ThrowsMessage<silo::preprocessing::PreprocessingException>(::testing::HasSubstr(
"The given LineageTree contains the cycle: CHILD1 -> CHILD2 -> CHILD3 -> CHILD1"
))
);
}

Expand All @@ -194,19 +175,12 @@ TEST(LineageDefinitionFile, errorOnDuplicateKey) {
- some_other_key)"));
};

EXPECT_THROW(
{
try {
throwing_lambda();
} catch (const silo::preprocessing::PreprocessingException& e) {
ASSERT_EQ(
std::string(e.what()),
"The lineage definitions contain the duplicate lineage 'some_duplicate_lineage'"
);
throw;
}
},
silo::preprocessing::PreprocessingException
EXPECT_THAT(
throwing_lambda,
ThrowsMessage<silo::preprocessing::PreprocessingException>(::testing::HasSubstr(
"The lineage definitions contain the duplicate lineage 'some_duplicate_lineage'"

))
);
}

Expand All @@ -226,20 +200,12 @@ TEST(LineageDefinitionFile, errorOnDuplicateAlias) {
- some_other_key)"));
};

EXPECT_THROW(
{
try {
throwing_lambda();
} catch (const silo::preprocessing::PreprocessingException& e) {
ASSERT_EQ(
std::string(e.what()),
"The alias 'duplicate_alias' for lineage 'lineage3' is already defined as a lineage "
"or another alias."
);
throw;
}
},
silo::preprocessing::PreprocessingException
EXPECT_THAT(
throwing_lambda,
ThrowsMessage<silo::preprocessing::PreprocessingException>(::testing::HasSubstr(
"The alias 'duplicate_alias' for lineage 'lineage3' is already defined as a lineage "
"or another alias."
))
);
}

Expand All @@ -259,23 +225,31 @@ TEST(LineageDefinitionFile, errorOnLineageAsAlias) {
- some_other_key)"));
};

EXPECT_THROW(
{
try {
throwing_lambda();
} catch (const silo::preprocessing::PreprocessingException& e) {
ASSERT_EQ(
std::string(e.what()),
"The alias 'lineage2_also_used_as_alias' for lineage 'lineage3' is already defined "
"as a lineage or another alias."
);
throw;
}
},
silo::preprocessing::PreprocessingException
EXPECT_THAT(
throwing_lambda,
ThrowsMessage<silo::preprocessing::PreprocessingException>(::testing::HasSubstr(
"The alias 'lineage2_also_used_as_alias' for lineage 'lineage3' is already defined "
"as a lineage or another alias."
))
);
}

TEST(containsCycle, doesNotFindCycleInPangoLineageTree) {
ASSERT_NO_THROW(LineageTreeAndIdMap::fromLineageDefinitionFilePath(
"testBaseData/exampleDataset/lineage_definitions.yaml"
));
}

TEST(containsCycle, doesNotFindCycleInMediumSizedChainGraph) {
std::vector<std::pair<uint32_t, uint32_t>> chain_edges;
uint32_t N = UINT16_MAX;
chain_edges.reserve(N);
for (size_t i = 0; i + 1 < N; i++) {
chain_edges.emplace_back(i, i + 1);
}
ASSERT_FALSE(silo::common::containsCycle(N, chain_edges));
}

TEST(containsCycle, findsCycles) {
// 1. Simple cycle
ASSERT_TRUE(silo::common::containsCycle(3, {{0, 1}, {1, 0}}));
Expand Down
20 changes: 14 additions & 6 deletions src/silo/config/util/config_repository.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,19 +73,27 @@ void validateDateToSortBy(
}
}

void validatePartitionBy(
const DatabaseConfig& config,
std::map<std::string, ValueType>& metadata_map
) {
void validatePartitionBy(const DatabaseConfig& config) {
if (config.schema.partition_by == std::nullopt) {
return;
}

const std::string partition_by = config.schema.partition_by.value();

if (metadata_map.find(partition_by) == metadata_map.end()) {
const auto& partition_by_metadata = std::find_if(
config.schema.metadata.begin(),
config.schema.metadata.end(),
[&](const DatabaseMetadata& metadata) { return metadata.name == partition_by; }
);
if (partition_by_metadata == config.schema.metadata.end()) {
throw ConfigException("partition_by '" + partition_by + "' is not in metadata");
}

if (partition_by_metadata->type != ValueType::STRING || !partition_by_metadata->lineage_index) {
throw ConfigException(
"partition_by '" + partition_by + "' must be of type STRING and needs 'lineageIndex' set"
);
}
}
} // namespace

Expand All @@ -102,7 +110,7 @@ void ConfigRepository::validateConfig(const DatabaseConfig& config) const {

validateDateToSortBy(config, metadata_map);

validatePartitionBy(config, metadata_map);
validatePartitionBy(config);
}

} // namespace silo::config
Loading

0 comments on commit b9a24c7

Please sign in to comment.