Skip to content

Commit

Permalink
fix: validate the column types earlier to prevent duckdb auto-casting…
Browse files Browse the repository at this point in the history
… to cast types with differing round-trip conversions
  • Loading branch information
Taepper authored and fengelniederhammer committed Oct 24, 2024
1 parent 1f2c94c commit 543c6e2
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 3 deletions.
2 changes: 2 additions & 0 deletions include/silo/preprocessing/identifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ class Identifier {

static std::string escapeIdentifier(const std::string& identifier);

// Returns `identifier` wrapped in single quotes, with every single quote inside it doubled.
static std::string escapeIdentifierSingleQuote(const std::string& identifier);

const std::string& getRawIdentifier() const;

std::string escape() const;
Expand Down
4 changes: 4 additions & 0 deletions include/silo/preprocessing/metadata_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ class MetadataInfo {
const silo::config::DatabaseConfig& database_config
);

// Returns the text of a `{key: type, ...}` column specification for the NDJSON input:
// `metadata` is a single-quoted STRUCT(...) assembled from getMetadataSQLTypes(database_config),
// and each of the five sequence/insertion keys is typed as 'json'.
static std::string getNdjsonMetadataSQLColumnStruct(
const silo::config::DatabaseConfig& database_config
);

static std::vector<std::string> getMetadataSelects(
const silo::config::DatabaseConfig& database_config
);
Expand Down
13 changes: 13 additions & 0 deletions src/silo/preprocessing/identifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,19 @@ std::string Identifier::escapeIdentifier(const std::string& identifier) {
return "\"" + output + "\"";
}

// See https://duckdb.org/docs/sql/dialect/keywords_and_identifiers.html#identifiers
//
// Produces a single-quoted form of `identifier`: the result starts and ends with a single
// quote, and every single quote occurring inside `identifier` is emitted twice. All other
// characters are copied through unchanged.
std::string Identifier::escapeIdentifierSingleQuote(const std::string& identifier) {
   std::string quoted;
   // Two surrounding quotes plus the payload; embedded quotes may still grow the buffer.
   quoted.reserve(identifier.size() + 2);
   quoted += '\'';
   for (const char current : identifier) {
      quoted += current;
      if (current == '\'') {
         // Second copy of the quote character escapes the first one.
         quoted += current;
      }
   }
   quoted += '\'';
   return quoted;
}

// Returns a const reference to the `raw_identifier` member (no copy is made; no escaping is
// applied here).
const std::string& Identifier::getRawIdentifier() const {
return raw_identifier;
}
Expand Down
19 changes: 19 additions & 0 deletions src/silo/preprocessing/metadata_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,25 @@ std::vector<std::string> MetadataInfo::getMetadataSQLTypes(
return ret;
}

// Builds the `{key: type, ...}` column specification text for the NDJSON input.
//
// The `metadata` entry is the string "STRUCT(<types>)", where <types> are the entries of
// getMetadataSQLTypes(database_config) joined with ",", wrapped in single quotes via
// Identifier::escapeIdentifierSingleQuote. The five sequence/insertion keys are all typed
// as 'json'.
std::string MetadataInfo::getNdjsonMetadataSQLColumnStruct(
   const silo::config::DatabaseConfig& database_config
) {
   const auto column_types = getMetadataSQLTypes(database_config);
   const std::string quoted_metadata_type = Identifier::escapeIdentifierSingleQuote(
      fmt::format("STRUCT({})", boost::join(column_types, ","))
   );

   // Doubled braces are fmt escapes for literal '{' and '}'; the single `{}` receives the
   // quoted STRUCT type.
   return fmt::format(
      R"(
{{metadata: {},
alignedAminoAcidSequences: 'json',
alignedNucleotideSequences: 'json',
aminoAcidInsertions: 'json',
nucleotideInsertions: 'json',
unalignedNucleotideSequences: 'json'}})",
      quoted_metadata_type
   );
}

std::vector<std::string> MetadataInfo::getMetadataSelects(
const silo::config::DatabaseConfig& database_config
) {
Expand Down
7 changes: 4 additions & 3 deletions src/silo/preprocessing/preprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,17 +193,18 @@ void Preprocessor::buildTablesFromNdjsonInput(const ValidatedNdjsonFile& input_f
}

(void)preprocessing_db.query(fmt::format(
"INSERT INTO metadata_table BY NAME (SELECT {} FROM read_json_auto('{}'));",
"INSERT INTO metadata_table BY NAME (SELECT {} FROM read_json('{}', columns = {}));",
boost::join(MetadataInfo::getMetadataSelects(database_config), ","),
input_file.getFileName().string()
input_file.getFileName().string(),
MetadataInfo::getNdjsonMetadataSQLColumnStruct(database_config)
));

auto null_primary_key_result = preprocessing_db.query(fmt::format(
R"-(
SELECT {0} FROM metadata_table
WHERE {0} IS NULL;
)-",
database_config.schema.primary_key
Identifier{database_config.schema.primary_key}.escape()
));
if (null_primary_key_result->RowCount() > 0) {
const std::string error_message = fmt::format(
Expand Down

0 comments on commit 543c6e2

Please sign in to comment.