Skip to content

Commit

Permalink
Merge pull request #193 from GenSpectrum/optionalPangoLineages
Browse files Browse the repository at this point in the history
feat: make partition_by field in config optional
  • Loading branch information
Taepper authored Aug 8, 2023
2 parents dca8cf0 + 253f494 commit acf7197
Show file tree
Hide file tree
Showing 19 changed files with 261 additions and 62 deletions.
2 changes: 1 addition & 1 deletion include/silo/common/data_version.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ namespace silo {

class DataVersion {
public:
explicit DataVersion(const std::string& data_version = "");
explicit DataVersion(std::string data_version = "");
[[nodiscard]] std::string toString() const;

static std::string mineDataVersion();
Expand Down
2 changes: 1 addition & 1 deletion include/silo/config/database_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ struct DatabaseSchema {
std::vector<DatabaseMetadata> metadata;
std::string primary_key;
std::optional<std::string> date_to_sort_by;
std::string partition_by;
std::optional<std::string> partition_by;
};

struct DatabaseConfig {
Expand Down
8 changes: 7 additions & 1 deletion include/silo/prepare_dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,13 @@ void partitionData(
const silo::preprocessing::PreprocessingConfig& preprocessing_config,
const preprocessing::Partitions& partitions,
const PangoLineageAliasLookup& alias_key,
const silo::config::DatabaseConfig& database_config,
const std::string& primary_key_field,
const std::string& partition_by_field,
const ReferenceGenomes& reference_genomes
);

// Alternative to partitionData for database configs without a partition_by
// field: Database::preprocessing calls this instead of partitionData when
// schema.partition_by is not set, so that all sequences end up in the same
// partition.
// NOTE(review): the name suggests the input data is copied unchanged into the
// partition directory described by preprocessing_config; the implementation is
// not visible here - confirm before relying on this.
void copyDataToPartitionDirectory(
const preprocessing::PreprocessingConfig& preprocessing_config,
const ReferenceGenomes& reference_genomes
);

Expand Down
2 changes: 1 addition & 1 deletion include/silo/preprocessing/pango_lineage_count.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ struct PangoLineageCounts {
PangoLineageCounts buildPangoLineageCounts(
const PangoLineageAliasLookup& alias_key,
const std::filesystem::path& metadata_path,
const silo::config::DatabaseConfig& database_config
const std::string& partition_by
);

} // namespace preprocessing
Expand Down
11 changes: 11 additions & 0 deletions include/silo/preprocessing/partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define SILO_PARTITION_H

#include <cstdint>
#include <filesystem>
#include <functional>
#include <iosfwd>
#include <string>
Expand All @@ -17,6 +18,9 @@ class access;
namespace silo::common {
class UnaliasedPangoLineage;
}
namespace silo::config {
class DatabaseConfig;
}

namespace silo::preprocessing {

Expand Down Expand Up @@ -88,6 +92,8 @@ class Partitions {
std::unordered_map<std::string, silo::preprocessing::PartitionChunk> pango_to_chunk;

public:
Partitions();

explicit Partitions(std::vector<Partition> partitions_);

void save(std::ostream& output_file) const;
Expand All @@ -104,6 +110,11 @@ class Partitions {

Partitions buildPartitions(const PangoLineageCounts& pango_lineage_counts, Architecture arch);

// Builds the Partitions descriptor used when the database config has no
// partition_by field. Database::preprocessing calls this in place of
// buildPartitions and passes the metadata input file and the database config.
// NOTE(review): presumably the result describes exactly one partition holding
// every sequence found in metadata_path; the implementation is not visible
// here - confirm.
Partitions createSingletonPartitions(
const std::filesystem::path& metadata_path,
const silo::config::DatabaseConfig& database_config
);

} // namespace silo::preprocessing

template <>
Expand Down
14 changes: 6 additions & 8 deletions include/silo/preprocessing/preprocessing_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,15 @@
#define SILO_PREPROCESSING_CONFIG_H

#include <filesystem>
#include <optional>
#include <string>
#include <unordered_map>

namespace silo::preprocessing {
struct PartitionChunk;
struct Partitions;
} // namespace silo::preprocessing

#include <fmt/core.h>

namespace silo::preprocessing {
struct PartitionChunk;
struct Partitions;

struct InputDirectory {
std::string directory;
Expand All @@ -27,7 +25,7 @@ struct MetadataFilename {
};

struct PangoLineageDefinitionFilename {
std::string filename;
std::optional<std::string> filename;
};

struct NucleotideSequencePrefix {
Expand Down Expand Up @@ -58,7 +56,7 @@ class PreprocessingConfig {
friend class fmt::formatter<silo::preprocessing::PreprocessingConfig>;

std::filesystem::path input_directory;
std::filesystem::path pango_lineage_definition_file;
std::optional<std::filesystem::path> pango_lineage_definition_file;
std::filesystem::path metadata_file;
std::filesystem::path partition_folder;
std::filesystem::path sorted_partition_folder;
Expand All @@ -83,7 +81,7 @@ class PreprocessingConfig {
const GenePrefix& gene_prefix_
);

[[nodiscard]] std::filesystem::path getPangoLineageDefinitionFilename() const;
[[nodiscard]] std::optional<std::filesystem::path> getPangoLineageDefinitionFilename() const;

[[nodiscard]] std::filesystem::path getReferenceGenomeFilename() const;

Expand Down
4 changes: 2 additions & 2 deletions src/silo/common/data_version.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ std::string DataVersion::mineDataVersion() {
return std::to_string(now_as_time_t);
}

DataVersion::DataVersion(const std::string& data_version)
: data_version(data_version) {}
DataVersion::DataVersion(std::string data_version)
: data_version(std::move(data_version)) {}

std::string DataVersion::toString() const {
return data_version;
Expand Down
16 changes: 10 additions & 6 deletions src/silo/config/config_repository.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,19 @@ void validatePartitionBy(
const DatabaseConfig& config,
std::map<std::string, ValueType>& metadata_map
) {
if (metadata_map.find(config.schema.partition_by) == metadata_map.end()) {
throw ConfigException("partition_by '" + config.schema.partition_by + "' is not in metadata");
if (config.schema.partition_by == std::nullopt) {
return;
}

const std::string partition_by = config.schema.partition_by.value();

if (metadata_map.find(partition_by) == metadata_map.end()) {
throw ConfigException("partition_by '" + partition_by + "' is not in metadata");
}

const auto& partition_by_type = metadata_map[config.schema.partition_by];
const auto& partition_by_type = metadata_map[partition_by];
if (partition_by_type != ValueType::PANGOLINEAGE) {
throw ConfigException(
"partition_by '" + config.schema.partition_by + "' must be of type PANGOLINEAGE"
);
throw ConfigException("partition_by '" + partition_by + "' must be of type PANGOLINEAGE");
}
}

Expand Down
17 changes: 13 additions & 4 deletions src/silo/config/database_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,11 @@ struct convert<silo::config::DatabaseSchema> {
} else {
schema.date_to_sort_by = std::nullopt;
}
schema.partition_by = node["partitionBy"].as<std::string>();
if (node["partitionBy"].IsDefined()) {
schema.partition_by = node["partitionBy"].as<std::string>();
} else {
schema.partition_by = std::nullopt;
}

if (!node["metadata"].IsSequence()) {
return false;
Expand All @@ -112,7 +116,9 @@ struct convert<silo::config::DatabaseSchema> {
Node node;
node["instanceName"] = schema.instance_name;
node["primaryKey"] = schema.primary_key;
node["partitionBy"] = schema.partition_by;
if (schema.partition_by.has_value()) {
node["partitionBy"] = *schema.partition_by;
}
if (schema.date_to_sort_by.has_value()) {
node["dateToSortBy"] = *schema.date_to_sort_by;
}
Expand Down Expand Up @@ -219,10 +225,13 @@ DatabaseConfig DatabaseConfigReader::readConfig(const std::filesystem::path& con
) -> decltype(ctx.out()) {
return format_to(
ctx.out(),
"{{ instance_name: '{}', primary_key: '{}', partition_by: '{}', metadata: [{}] }}",
"{{ instance_name: '{}', primary_key: '{}', partition_by: {}, date_to_sort_by: {}, metadata: "
"[{}] }}",
database_schema.instance_name,
database_schema.primary_key,
database_schema.partition_by,
database_schema.partition_by.has_value() ? "'" + *database_schema.partition_by + "'" : "none",
database_schema.date_to_sort_by.has_value() ? "'" + *database_schema.date_to_sort_by + "'"
: "none",
fmt::join(database_schema.metadata, ",")
);
}
Expand Down
8 changes: 8 additions & 0 deletions src/silo/config/database_config.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,4 +160,12 @@ TEST(DatabaseConfigReader, shouldReadConfigWithoutDateToSortBy) {
ASSERT_EQ(config.schema.date_to_sort_by, std::nullopt);
}

// A database config that omits `partitionBy` must still be readable and must
// leave `schema.partition_by` as an empty optional.
TEST(DatabaseConfigReader, shouldReadConfigWithoutPartitionBy) {
   const auto* const config_without_partition_by =
      "testBaseData/test_database_config_without_partition_by.yaml";

   const auto parsed_config = DatabaseConfigReader().readConfig(config_without_partition_by);

   ASSERT_EQ(parsed_config.schema.partition_by, std::nullopt);
}

} // namespace
61 changes: 42 additions & 19 deletions src/silo/database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -520,32 +520,55 @@ Database Database::preprocessing(
);

SPDLOG_INFO("preprocessing - building alias key");
database.alias_key =
PangoLineageAliasLookup::readFromFile(preprocessing_config.getPangoLineageDefinitionFilename()
);
const auto pango_lineage_definition_filename =
preprocessing_config.getPangoLineageDefinitionFilename();
if (pango_lineage_definition_filename.has_value()) {
database.alias_key =
PangoLineageAliasLookup::readFromFile(pango_lineage_definition_filename.value());
}

SPDLOG_INFO("preprocessing - reading reference genome");
const ReferenceGenomes& reference_genomes =
ReferenceGenomes::readFromFile(preprocessing_config.getReferenceGenomeFilename());

SPDLOG_INFO("preprocessing - counting pango lineages");
const preprocessing::PangoLineageCounts pango_descriptor(preprocessing::buildPangoLineageCounts(
database.alias_key, preprocessing_config.getMetadataInputFilename(), database_config_
));
preprocessing::Partitions partition_descriptor;
if (database_config_.schema.partition_by.has_value()) {
SPDLOG_INFO("preprocessing - counting pango lineages");
const preprocessing::PangoLineageCounts pango_descriptor(
preprocessing::buildPangoLineageCounts(
database.alias_key,
preprocessing_config.getMetadataInputFilename(),
database_config_.schema.partition_by.value()
)
);

SPDLOG_INFO("preprocessing - calculating partitions");
const preprocessing::Partitions partition_descriptor(
preprocessing::buildPartitions(pango_descriptor, preprocessing::Architecture::MAX_PARTITIONS)
);
SPDLOG_INFO("preprocessing - calculating partitions");
partition_descriptor = preprocessing::buildPartitions(
pango_descriptor, preprocessing::Architecture::MAX_PARTITIONS
);

SPDLOG_INFO("preprocessing - partitioning data");
partitionData(
preprocessing_config,
partition_descriptor,
database.alias_key,
database_config_,
reference_genomes
);
SPDLOG_INFO("preprocessing - partitioning data");
partitionData(
preprocessing_config,
partition_descriptor,
database.alias_key,
database_config_.schema.primary_key,
database_config_.schema.partition_by.value(),
reference_genomes
);

} else {
SPDLOG_INFO(
"preprocessing - skip partition merging because no partition_by key was provided, instead "
"putting all sequences into the same partition"
);

partition_descriptor = preprocessing::createSingletonPartitions(
preprocessing_config.getMetadataInputFilename(), database_config_
);

copyDataToPartitionDirectory(preprocessing_config, reference_genomes);
}

if (database_config_.schema.date_to_sort_by.has_value()) {
SPDLOG_INFO("preprocessing - sorting chunks");
Expand Down
19 changes: 19 additions & 0 deletions src/silo/database.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,25 @@ TEST(DatabaseTest, shouldBuildDatabaseWithoutErrors) {
EXPECT_EQ(simple_database_info.sequence_count, 100);
}

// Runs preprocessing with a database config that has no `partitionBy` entry
// and expects a non-empty database containing all 100 test sequences.
TEST(DatabaseTest, shouldSuccessfullyBuildDatabaseWithoutPartitionBy) {
   const silo::preprocessing::InputDirectory test_data{"./testBaseData/"};

   // Both config files live next to the test data.
   const auto preprocessing_config_path = test_data.directory + "test_preprocessing_config.yaml";
   const auto database_config_path =
      test_data.directory + "test_database_config_without_partition_by.yaml";

   auto preprocessing_config =
      silo::preprocessing::PreprocessingConfigReader().readConfig(preprocessing_config_path);
   const auto database_config =
      silo::config::ConfigRepository().getValidatedConfig(database_config_path);

   auto built_database = silo::Database::preprocessing(preprocessing_config, database_config);
   const auto database_info = built_database.getDatabaseInfo();

   EXPECT_GT(database_info.total_size, 0);
   EXPECT_EQ(database_info.sequence_count, 100);
}

// NOLINTNEXTLINE(readability-function-cognitive-complexity)
TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) {
auto database{buildTestDatabase()};
Expand Down
Loading

0 comments on commit acf7197

Please sign in to comment.