Skip to content

Commit

Permalink
fix: start with empty files without throwing an error
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed May 31, 2024
1 parent 51c42fe commit d407b92
Show file tree
Hide file tree
Showing 37 changed files with 548 additions and 1,615 deletions.
1 change: 1 addition & 0 deletions include/silo/common/aa_symbols.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class AminoAcid {
static constexpr std::string_view SYMBOL_NAME_LOWER_CASE = "amino acid";
static constexpr std::string_view SYMBOL_NAME_UPPER_CASE = "AMINO ACID";
static constexpr std::string_view SYMBOL_NAME_SHORT = "AA";
static constexpr std::string_view PREFIX = "aa_";

static constexpr std::array<Symbol, COUNT> SYMBOLS{
Symbol::GAP, Symbol::A, Symbol::C, Symbol::D, Symbol::E, Symbol::F, Symbol::G,
Expand Down
1 change: 1 addition & 0 deletions include/silo/common/nucleotide_symbols.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class Nucleotide {
static constexpr std::string_view SYMBOL_NAME_LOWER_CASE = "nucleotide";
static constexpr std::string_view SYMBOL_NAME_UPPER_CASE = "NUCLEOTIDE";
static constexpr std::string_view SYMBOL_NAME_SHORT = "NUC";
static constexpr std::string_view PREFIX = "nuc_";

static constexpr std::array<Symbol, COUNT> SYMBOLS{
Symbol::GAP,
Expand Down
10 changes: 10 additions & 0 deletions include/silo/common/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,14 @@ std::string removeSymbol(const std::string& value, char symbol);

std::vector<std::string> slice(const std::vector<std::string>& elements, size_t start, size_t end);

/// Returns a new vector in which every entry of `elements` is preceded by `prefix`.
/// The order and the number of entries are preserved; the input is not modified.
std::vector<std::string> prepend(std::string_view prefix, const std::vector<std::string>& elements);

/// Combines two lists element-wise into one list of strings.
/// The i-th result is `prefix + elements1[i] + delimiter + elements2[i] + suffix`.
/// The implementation asserts that `elements1` and `elements2` have the same size.
std::vector<std::string> tie(
std::string_view prefix,
const std::vector<std::string>& elements1,
std::string_view delimiter,
const std::vector<std::string>& elements2,
std::string_view suffix
);

} // namespace silo
3 changes: 1 addition & 2 deletions include/silo/config/database_config.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include <filesystem>
#include <functional>
#include <optional>
#include <string>
#include <string_view>
Expand Down Expand Up @@ -31,8 +32,6 @@ class DatabaseSchema {
std::string primary_key;
std::optional<std::string> date_to_sort_by;
std::optional<std::string> partition_by;

[[nodiscard]] std::string getStrictOrderByClause() const;
};

class DatabaseConfig {
Expand Down
23 changes: 12 additions & 11 deletions include/silo/preprocessing/metadata_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,35 @@

#include <filesystem>
#include <string>
#include <unordered_map>
#include <vector>

#include "silo/config/database_config.h"

namespace silo::preprocessing {

class PreprocessingDatabase;

class MetadataInfo {
std::unordered_map<std::string, std::string> metadata_selects;

MetadataInfo(std::unordered_map<std::string, std::string> metadata_selects);

public:
static MetadataInfo validateFromMetadataFile(
static void validateMetadataFile(
const std::filesystem::path& metadata_file,
const silo::config::DatabaseConfig& database_config
);

static MetadataInfo validateFromNdjsonFile(
static void validateNdjsonFile(
const std::filesystem::path& ndjson_file,
const silo::config::DatabaseConfig& database_config
);

std::vector<std::string> getMetadataFields() const;
static std::vector<std::string> getMetadataFields(
const silo::config::DatabaseConfig& database_config
);

static std::vector<std::string> getMetadataSQLTypes(
const silo::config::DatabaseConfig& database_config
);

std::vector<std::string> getMetadataSelects() const;
static std::vector<std::string> getMetadataSelects(
const silo::config::DatabaseConfig& database_config
);
};

} // namespace silo::preprocessing
44 changes: 23 additions & 21 deletions include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#pragma once

#include <optional>

#include "silo/config/database_config.h"
#include "silo/config/preprocessing_config.h"
#include "silo/preprocessing/preprocessing_database.h"
Expand All @@ -20,6 +22,15 @@ class Preprocessor {
ReferenceGenomes reference_genomes_;
PangoLineageAliasLookup alias_lookup_;

std::vector<std::string> nuc_sequences;
std::vector<std::string> aa_sequences;
std::vector<std::string> order_by_fields;
std::vector<std::string> prefixed_order_by_fields;
std::vector<std::string> prefixed_nuc_sequences;
std::vector<std::string> prefixed_aa_sequences;
std::vector<std::string> prefixed_nuc_insertions_fields;
std::vector<std::string> prefixed_aa_insertions_fields;

public:
Preprocessor(
config::PreprocessingConfig preprocessing_config,
Expand All @@ -31,6 +42,9 @@ class Preprocessor {
Database preprocess();

private:
static std::string makeNonNullKey(const std::string& field);
std::string getPartitionKeySelect() const;

void buildTablesFromNdjsonInput(const std::filesystem::path& file_name);
void buildMetadataTableFromFile(const std::filesystem::path& metadata_filename);

Expand All @@ -39,40 +53,31 @@ class Preprocessor {
void buildEmptyPartitioning();

void createInsertionsTableFromFile(
const std::map<std::string, std::string>& expected_sequences,
const std::vector<std::string>& expected_sequences,
const std::filesystem::path& insertion_file,
const std::string& table_name
);

void createPartitionedSequenceTablesFromNdjson(const std::filesystem::path& file_name);

void createAlignedPartitionedSequenceViews(
const std::filesystem::path& file_name,
const SequenceInfo& sequence_info,
const std::string& partition_by_select,
const std::string& partition_by_where
);
void createUnalignedPartitionedSequenceFiles(
const std::filesystem::path& file_name,
const std::string& partition_by_select,
const std::string& partition_by_where
);
void createAlignedPartitionedSequenceViews(const std::filesystem::path& file_name);
void createUnalignedPartitionedSequenceFiles(const std::filesystem::path& file_name);
void createUnalignedPartitionedSequenceFile(
const std::string& seq_name,
const std::string& table_sql
);

void createPartitionedSequenceTablesFromSequenceFiles();

template <typename SymbolType>
void createPartitionedTableForSequence(
const std::string& sequence_name,
const std::string& reference_sequence,
const std::filesystem::path& filename,
const std::string& table_prefix
const std::filesystem::path& filename
);

Database buildDatabase(
const preprocessing::Partitions& partition_descriptor,
const std::string& order_by_clause,
const std::filesystem::path& intermediate_results_directory
);

Expand All @@ -81,12 +86,9 @@ class Preprocessor {
const preprocessing::Partitions& partition_descriptor,
const std::string& order_by_clause
);
void buildNucleotideSequenceStore(
Database& database,
const preprocessing::Partitions& partition_descriptor,
const std::string& order_by_clause
);
void buildAminoAcidSequenceStore(

template <typename SymbolType>
void buildSequenceStore(
Database& database,
const preprocessing::Partitions& partition_descriptor,
const std::string& order_by_clause
Expand Down
22 changes: 11 additions & 11 deletions include/silo/preprocessing/sequence_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,32 +19,32 @@ namespace preprocessing {
class PreprocessingDatabase;

class SequenceInfo {
std::vector<std::string> nuc_sequence_names;
std::vector<std::string> aa_sequence_names;

public:
explicit SequenceInfo(const silo::ReferenceGenomes& reference_genomes);

[[nodiscard]] std::vector<std::string> getAlignedSequenceSelects(
[[nodiscard]] static std::vector<std::string> getAlignedSequenceSelects(
const silo::ReferenceGenomes& reference_genomes,
const PreprocessingDatabase& preprocessing_db
) const;
);

static std::string getNucleotideSequenceSelect(
[[nodiscard]] static std::string getNucleotideSequenceSelect(
std::string_view seq_name,
const PreprocessingDatabase& preprocessing_db
);

static std::string getUnalignedSequenceSelect(
[[nodiscard]] static std::string getUnalignedSequenceSelect(
std::string_view seq_name,
const PreprocessingDatabase& preprocessing_db
);

static std::string getAminoAcidSequenceSelect(
[[nodiscard]] static std::string getAminoAcidSequenceSelect(
std::string_view seq_name,
const PreprocessingDatabase& preprocessing_db
);

void validate(duckdb::Connection& connection, const std::filesystem::path& input_filename) const;
static void validateNdjsonFile(
const silo::ReferenceGenomes& reference_genomes,
duckdb::Connection& connection,
const std::filesystem::path& input_filename
);
};
} // namespace preprocessing
} // namespace silo
6 changes: 6 additions & 0 deletions include/silo/storage/reference_genomes.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ struct ReferenceGenomes {

static ReferenceGenomes readFromFile(const std::filesystem::path& reference_genomes_path);

template <typename SymbolType>
std::vector<std::string> getSequenceNames() const;

template <typename SymbolType>
std::map<std::string, std::string> getRawSequenceMap() const;

template <typename SymbolType>
static std::vector<typename SymbolType::Symbol> stringToVector(const std::string& string);

Expand Down
34 changes: 34 additions & 0 deletions src/silo/common/string_utils.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#include "silo/common/string_utils.h"

#include <algorithm>
#include <cassert>
#include <stdexcept>

#include <fmt/format.h>

namespace silo {

std::vector<std::string> splitBy(const std::string& value, const std::string_view delimiter) {
Expand Down Expand Up @@ -40,4 +43,35 @@ std::vector<std::string> slice(const std::vector<std::string>& elements, size_t
}
return sliced_elements;
}

/// Builds a copy of `elements` where each entry is prefixed with `prefix`.
///
/// @param prefix   text placed in front of every entry
/// @param elements entries to prefix; left untouched
/// @return a vector of the same length and order as `elements`
std::vector<std::string> prepend(
   std::string_view prefix,
   const std::vector<std::string>& elements
) {
   std::vector<std::string> prefixed;
   prefixed.reserve(elements.size());
   for (const auto& element : elements) {
      // Construct the entry in place, then fill it with prefix followed by the element.
      std::string& combined = prefixed.emplace_back();
      combined.reserve(prefix.size() + element.size());
      combined.append(prefix);
      combined.append(element);
   }
   return prefixed;
}

/// Zips two equally long lists into one list of strings.
///
/// The i-th result is `prefix + elements1[i] + delimiter + elements2[i] + suffix`.
/// Both lists are expected to have the same size; this is checked with `assert`.
///
/// Note: the arguments are std types, so an unqualified call makes `std::tie` a
/// candidate through argument-dependent lookup when <tuple> is visible. Callers
/// should qualify the call (`silo::tie(...)`).
std::vector<std::string> tie(
   std::string_view prefix,
   const std::vector<std::string>& elements1,
   std::string_view delimiter,
   const std::vector<std::string>& elements2,
   std::string_view suffix
) {
   assert(elements1.size() == elements2.size());
   const size_t count = elements1.size();
   std::vector<std::string> joined;
   joined.reserve(count);
   for (size_t index = 0; index != count; ++index) {
      // Bounds-checked access happens before the result entry is created.
      const std::string& left = elements1.at(index);
      const std::string& right = elements2.at(index);
      std::string& entry = joined.emplace_back();
      entry.reserve(
         prefix.size() + left.size() + delimiter.size() + right.size() + suffix.size()
      );
      entry.append(prefix);
      entry.append(left);
      entry.append(delimiter);
      entry.append(right);
      entry.append(suffix);
   }
   return joined;
}

} // namespace silo
10 changes: 0 additions & 10 deletions src/silo/config/database_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,16 +179,6 @@ ColumnType DatabaseMetadata::getColumnType() const {
throw std::runtime_error("Did not find metadata with name: " + std::string(name));
}

/// Produces the SQL `ORDER BY` clause for this schema.
/// Sorts by `date_to_sort_by` first when it is set, and always by `primary_key`.
std::string DatabaseSchema::getStrictOrderByClause() const {
   if (!date_to_sort_by.has_value()) {
      SPDLOG_INFO("preprocessing - produce order by clause without a date to sort by");
      return fmt::format("ORDER BY {}", primary_key);
   }

   SPDLOG_INFO("preprocessing - produce order by clause with a date to sort by");
   return fmt::format("ORDER BY {}, {}", *date_to_sort_by, primary_key);
}

std::optional<DatabaseMetadata> DatabaseConfig::getMetadata(const std::string& name) const {
auto element = std::find_if(
std::begin(schema.metadata),
Expand Down
Loading

0 comments on commit d407b92

Please sign in to comment.