Skip to content

Commit

Permalink
fix: correctly escape quotes in field names
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Sep 25, 2024
1 parent fb58525 commit db662df
Show file tree
Hide file tree
Showing 12 changed files with 67 additions and 15 deletions.
7 changes: 7 additions & 0 deletions include/silo/preprocessing/identifiers.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,15 @@ class Identifiers {
std::vector<Identifier> identifiers;

public:
explicit Identifiers() = default;

Identifiers(const std::vector<std::string>& raw_identifiers);

template <typename T>
void addIdentifier(T&& identifier) {
identifiers.emplace_back(std::forward<T>(identifier));
}

Identifiers prefix(const std::string& prefix) const;

size_t size() const;
Expand Down
3 changes: 2 additions & 1 deletion include/silo/preprocessing/metadata_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <vector>

#include "silo/config/database_config.h"
#include "silo/preprocessing/identifiers.h"

namespace silo::preprocessing {

Expand All @@ -21,7 +22,7 @@ class MetadataInfo {
const silo::config::DatabaseConfig& database_config
);

static std::vector<std::string> getMetadataFields(
static silo::preprocessing::Identifiers getMetadataFields(
const silo::config::DatabaseConfig& database_config
);

Expand Down
17 changes: 7 additions & 10 deletions src/silo/preprocessing/metadata_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@ void MetadataInfo::validateNdjsonFile(

auto result = connection.Query(fmt::format(
"SELECT metadata.* "
"FROM read_json_auto(\"{}\") LIMIT 0; ",
ndjson_file.string()
"FROM read_json_auto({}) LIMIT 0; ",
Identifier{ndjson_file.string()}.escape()
));

if (result->HasError()) {
Expand Down Expand Up @@ -157,13 +157,10 @@ void MetadataInfo::validateNdjsonFile(
}
}

std::vector<std::string> MetadataInfo::getMetadataFields(
const silo::config::DatabaseConfig& database_config
) {
std::vector<std::string> ret;
ret.reserve(database_config.schema.metadata.size());
Identifiers MetadataInfo::getMetadataFields(const silo::config::DatabaseConfig& database_config) {
Identifiers ret;
for (const auto& field : database_config.schema.metadata) {
ret.push_back("\"" + field.name + "\"");
ret.addIdentifier(field.name);
}
return ret;
}
Expand All @@ -174,7 +171,7 @@ std::vector<std::string> MetadataInfo::getMetadataSQLTypes(
std::vector<std::string> ret;
ret.reserve(database_config.schema.metadata.size());
for (const auto& field : database_config.schema.metadata) {
ret.push_back(fmt::format("\"{}\" {}", field.name, toSQLType(field.type)));
ret.push_back(fmt::format("{} {}", Identifier{field.name}.escape(), toSQLType(field.type)));
}
return ret;
}
Expand All @@ -185,7 +182,7 @@ std::vector<std::string> MetadataInfo::getMetadataSelects(
std::vector<std::string> ret;
ret.reserve(database_config.schema.metadata.size());
for (const auto& field : database_config.schema.metadata) {
ret.push_back(fmt::format(R"( "metadata"."{0}" AS "{0}")", field.name));
ret.push_back(fmt::format(R"( "metadata".{0} AS {0})", Identifier{field.name}.escape()));
}
return ret;
}
Expand Down
7 changes: 4 additions & 3 deletions src/silo/preprocessing/metadata_info.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ TEST(MetadataInfo, isValidMedataFileShouldReturnTrueWithValidMetadataFile) {
}
};

const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config);
const auto fields =
silo::preprocessing::MetadataInfo::getMetadataFields(valid_config).getRawIdentifierStrings();
ASSERT_TRUE(std::ranges::find(fields, R"("gisaid_epi_isl")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("pango_lineage")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("date")") != fields.end());
Expand All @@ -77,8 +78,8 @@ TEST(MetadataInfo, shouldValidateCorrectNdjsonInputFile) {
}
};

const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config);

const auto fields =
silo::preprocessing::MetadataInfo::getMetadataFields(valid_config).getRawIdentifierStrings();
ASSERT_TRUE(std::ranges::find(fields, R"("gisaid_epi_isl")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("pango_lineage")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("date")") != fields.end());
Expand Down
4 changes: 3 additions & 1 deletion src/silo/preprocessing/preprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,9 @@ void Preprocessor::buildMetadataTableFromFile(const std::filesystem::path& metad
(void)preprocessing_db.query(fmt::format(
"INSERT INTO metadata_table BY NAME (SELECT {} FROM read_csv_auto('{}', delim = '\t', "
"header = true));",
boost::join(MetadataInfo::getMetadataFields(database_config), ","),
boost::join(
MetadataInfo::getMetadataFields(database_config).getEscapedIdentifierStrings(), ","
),
metadata_filename.string()
));
}
Expand Down
25 changes: 25 additions & 0 deletions src/silo/preprocessing/preprocessor.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,30 @@ const Scenario TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD = {
.expected_query_result = NDJSON_WITH_SQL_KEYWORD_AS_FIELD.expected_query_result
};

const Scenario TSV_FILE_WITH_QUOTE_IN_FIELD_NAME = {
.input_directory = "testBaseData/tsvWithQuoteInFieldName/",
.expected_sequence_count = 2,
.query = R"(
{
"action": {
"type": "Aggregated",
"groupByFields": ["x\"y"],
"orderByFields": ["x\"y"]
},
"filterExpression": {
"type": "StringEquals",
"column": "x\"y",
"value": "a"
}
}
)",
.expected_query_result = nlohmann::json::parse(
R"([
{"count": 1, "x\"y": "a"}
])"
)
};

const Scenario EMPTY_INPUT_TSV = {
.input_directory = "testBaseData/emptyInputTsv/",
.expected_sequence_count = 0,
Expand Down Expand Up @@ -288,6 +312,7 @@ INSTANTIATE_TEST_SUITE_P(
SAM_FILES,
NDJSON_WITH_SQL_KEYWORD_AS_FIELD,
TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD,
TSV_FILE_WITH_QUOTE_IN_FIELD_NAME,
NDJSON_WITH_NUMERIC_NAMES,
EMPTY_INPUT_TSV,
EMPTY_INPUT_NDJSON,
Expand Down
1 change: 1 addition & 0 deletions testBaseData/tsvWithQuoteInFieldName/aa_insertions.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
id
8 changes: 8 additions & 0 deletions testBaseData/tsvWithQuoteInFieldName/database_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
schema:
instanceName: Test
metadata:
- name: id
type: string
- name: 'x"y'
type: string
primaryKey: id
3 changes: 3 additions & 0 deletions testBaseData/tsvWithQuoteInFieldName/metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id "x""y"
1 a
2 b
1 change: 1 addition & 0 deletions testBaseData/tsvWithQuoteInFieldName/nuc_insertions.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
id
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
metadataFilename: "metadata.tsv"
referenceGenomeFilename: "reference_genomes.json"
4 changes: 4 additions & 0 deletions testBaseData/tsvWithQuoteInFieldName/reference_genomes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"nucleotideSequences": [],
"genes": []
}

0 comments on commit db662df

Please sign in to comment.