From 9d37222821e94156b7a9c30c58ac38debb296cb6 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Wed, 25 Sep 2024 12:20:19 +0200 Subject: [PATCH] fix: correctly escape quotes in field names --- include/silo/preprocessing/identifiers.h | 7 ++++ include/silo/preprocessing/metadata_info.h | 3 +- src/silo/preprocessing/metadata_info.cpp | 17 ++++----- src/silo/preprocessing/metadata_info.test.cpp | 35 +++++++++++++------ src/silo/preprocessing/preprocessor.cpp | 4 ++- src/silo/preprocessing/preprocessor.test.cpp | 25 +++++++++++++ .../tsvWithQuoteInFieldName/aa_insertions.tsv | 1 + .../database_config.yaml | 8 +++++ .../tsvWithQuoteInFieldName/metadata.tsv | 3 ++ .../nuc_insertions.tsv | 1 + .../preprocessing_config.yaml | 2 ++ .../reference_genomes.json | 4 +++ 12 files changed, 88 insertions(+), 22 deletions(-) create mode 100644 testBaseData/tsvWithQuoteInFieldName/aa_insertions.tsv create mode 100644 testBaseData/tsvWithQuoteInFieldName/database_config.yaml create mode 100644 testBaseData/tsvWithQuoteInFieldName/metadata.tsv create mode 100644 testBaseData/tsvWithQuoteInFieldName/nuc_insertions.tsv create mode 100644 testBaseData/tsvWithQuoteInFieldName/preprocessing_config.yaml create mode 100644 testBaseData/tsvWithQuoteInFieldName/reference_genomes.json diff --git a/include/silo/preprocessing/identifiers.h b/include/silo/preprocessing/identifiers.h index 0a63fc42b..35ece1eb1 100644 --- a/include/silo/preprocessing/identifiers.h +++ b/include/silo/preprocessing/identifiers.h @@ -11,8 +11,15 @@ class Identifiers { std::vector identifiers; public: + explicit Identifiers() = default; + Identifiers(const std::vector& raw_identifiers); + template + void addIdentifier(T&& identifier) { + identifiers.emplace_back(std::forward(identifier)); + } + Identifiers prefix(const std::string& prefix) const; size_t size() const; diff --git a/include/silo/preprocessing/metadata_info.h b/include/silo/preprocessing/metadata_info.h index d85c51090..e5297d22c 100644 --- a/include/silo/preprocessing/metadata_info.h +++ b/include/silo/preprocessing/metadata_info.h @@ -5,6 +5,7 @@ #include #include "silo/config/database_config.h" +#include "silo/preprocessing/identifiers.h" namespace silo::preprocessing { @@ -21,7 +22,7 @@ class MetadataInfo { const silo::config::DatabaseConfig& database_config ); - static std::vector getMetadataFields( + static silo::preprocessing::Identifiers getMetadataFields( const silo::config::DatabaseConfig& database_config ); diff --git a/src/silo/preprocessing/metadata_info.cpp b/src/silo/preprocessing/metadata_info.cpp index e26e893db..9bde2d270 100644 --- a/src/silo/preprocessing/metadata_info.cpp +++ b/src/silo/preprocessing/metadata_info.cpp @@ -112,8 +112,8 @@ void MetadataInfo::validateNdjsonFile( auto result = connection.Query(fmt::format( "SELECT metadata.* " - "FROM read_json_auto(\"{}\") LIMIT 0; ", - ndjson_file.string() + "FROM read_json_auto({}) LIMIT 0; ", + Identifier{ndjson_file.string()}.escape() )); if (result->HasError()) { @@ -157,13 +157,10 @@ void MetadataInfo::validateNdjsonFile( } } -std::vector MetadataInfo::getMetadataFields( - const silo::config::DatabaseConfig& database_config -) { - std::vector ret; - ret.reserve(database_config.schema.metadata.size()); +Identifiers MetadataInfo::getMetadataFields(const silo::config::DatabaseConfig& database_config) { + Identifiers ret; for (const auto& field : database_config.schema.metadata) { - ret.push_back("\"" + field.name + "\""); + ret.addIdentifier(field.name); } return ret; } @@ -174,7 +171,7 @@ std::vector MetadataInfo::getMetadataSQLTypes( std::vector ret; ret.reserve(database_config.schema.metadata.size()); for (const auto& field : database_config.schema.metadata) { - ret.push_back(fmt::format("\"{}\" {}", field.name, toSQLType(field.type))); + ret.push_back(fmt::format("{} {}", Identifier{field.name}.escape(), toSQLType(field.type))); } return ret; } @@ -185,7 +182,7 @@ std::vector MetadataInfo::getMetadataSelects( std::vector ret; ret.reserve(database_config.schema.metadata.size()); for (const auto& field : database_config.schema.metadata) { - ret.push_back(fmt::format(R"( "metadata"."{0}" AS "{0}")", field.name)); + ret.push_back(fmt::format(R"( "metadata".{0} AS {0})", Identifier{field.name}.escape())); } return ret; } diff --git a/src/silo/preprocessing/metadata_info.test.cpp b/src/silo/preprocessing/metadata_info.test.cpp index 74b56d298..62a889217 100644 --- a/src/silo/preprocessing/metadata_info.test.cpp +++ b/src/silo/preprocessing/metadata_info.test.cpp @@ -53,11 +53,19 @@ TEST(MetadataInfo, isValidMedataFileShouldReturnTrueWithValidMetadataFile) { } }; - const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config); - ASSERT_TRUE(std::ranges::find(fields, R"("gisaid_epi_isl")") != fields.end()); - ASSERT_TRUE(std::ranges::find(fields, R"("pango_lineage")") != fields.end()); - ASSERT_TRUE(std::ranges::find(fields, R"("date")") != fields.end()); - ASSERT_TRUE(std::ranges::find(fields, R"("country")") != fields.end()); + const auto raw_fields = + silo::preprocessing::MetadataInfo::getMetadataFields(valid_config).getRawIdentifierStrings(); + ASSERT_TRUE(std::ranges::find(raw_fields, "gisaid_epi_isl") != raw_fields.end()); + ASSERT_TRUE(std::ranges::find(raw_fields, "pango_lineage") != raw_fields.end()); + ASSERT_TRUE(std::ranges::find(raw_fields, "date") != raw_fields.end()); + ASSERT_TRUE(std::ranges::find(raw_fields, "country") != raw_fields.end()); + + const auto escaped_fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config) + .getEscapedIdentifierStrings(); + ASSERT_TRUE(std::ranges::find(escaped_fields, R"("gisaid_epi_isl")") != escaped_fields.end()); + ASSERT_TRUE(std::ranges::find(escaped_fields, R"("pango_lineage")") != escaped_fields.end()); + ASSERT_TRUE(std::ranges::find(escaped_fields, R"("date")") != escaped_fields.end()); + ASSERT_TRUE(std::ranges::find(escaped_fields, R"("country")") != escaped_fields.end()); } TEST(MetadataInfo, shouldValidateCorrectNdjsonInputFile) { @@ -77,10 +85,17 @@ TEST(MetadataInfo, shouldValidateCorrectNdjsonInputFile) { } }; - const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config); + const auto raw_fields = + silo::preprocessing::MetadataInfo::getMetadataFields(valid_config).getRawIdentifierStrings(); + ASSERT_TRUE(std::ranges::find(raw_fields, "gisaid_epi_isl") != raw_fields.end()); + ASSERT_TRUE(std::ranges::find(raw_fields, "pango_lineage") != raw_fields.end()); + ASSERT_TRUE(std::ranges::find(raw_fields, "date") != raw_fields.end()); + ASSERT_TRUE(std::ranges::find(raw_fields, "country") != raw_fields.end()); - ASSERT_TRUE(std::ranges::find(fields, R"("gisaid_epi_isl")") != fields.end()); - ASSERT_TRUE(std::ranges::find(fields, R"("pango_lineage")") != fields.end()); - ASSERT_TRUE(std::ranges::find(fields, R"("date")") != fields.end()); - ASSERT_TRUE(std::ranges::find(fields, R"("country")") != fields.end()); + const auto escaped_fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config) + .getEscapedIdentifierStrings(); + ASSERT_TRUE(std::ranges::find(escaped_fields, R"("gisaid_epi_isl")") != escaped_fields.end()); + ASSERT_TRUE(std::ranges::find(escaped_fields, R"("pango_lineage")") != escaped_fields.end()); + ASSERT_TRUE(std::ranges::find(escaped_fields, R"("date")") != escaped_fields.end()); + ASSERT_TRUE(std::ranges::find(escaped_fields, R"("country")") != escaped_fields.end()); } diff --git a/src/silo/preprocessing/preprocessor.cpp b/src/silo/preprocessing/preprocessor.cpp index 9ddf5523d..b796258a2 100644 --- a/src/silo/preprocessing/preprocessor.cpp +++ b/src/silo/preprocessing/preprocessor.cpp @@ -230,7 +230,9 @@ void Preprocessor::buildMetadataTableFromFile(const std::filesystem::path& metad (void)preprocessing_db.query(fmt::format( "INSERT INTO metadata_table BY NAME (SELECT {} FROM read_csv_auto('{}', delim = '\t', " "header = true));", - boost::join(MetadataInfo::getMetadataFields(database_config), ","), + boost::join( + MetadataInfo::getMetadataFields(database_config).getEscapedIdentifierStrings(), "," + ), metadata_filename.string() )); } diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp index 9ce80e315..38e2cc014 100644 --- a/src/silo/preprocessing/preprocessor.test.cpp +++ b/src/silo/preprocessing/preprocessor.test.cpp @@ -142,6 +142,30 @@ const Scenario TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD = { .expected_query_result = NDJSON_WITH_SQL_KEYWORD_AS_FIELD.expected_query_result }; +const Scenario TSV_FILE_WITH_QUOTE_IN_FIELD_NAME = { + .input_directory = "testBaseData/tsvWithQuoteInFieldName/", + .expected_sequence_count = 2, + .query = R"( +{ + "action": { + "type": "Aggregated", + "groupByFields": ["x\"y"], + "orderByFields": ["x\"y"] + }, + "filterExpression": { + "type": "StringEquals", + "column": "x\"y", + "value": "a" + } +} + )", + .expected_query_result = nlohmann::json::parse( + R"([ + {"count": 1, "x\"y": "a"} + ])" + ) +}; + const Scenario EMPTY_INPUT_TSV = { .input_directory = "testBaseData/emptyInputTsv/", .expected_sequence_count = 0, @@ -288,6 +312,7 @@ INSTANTIATE_TEST_SUITE_P( SAM_FILES, NDJSON_WITH_SQL_KEYWORD_AS_FIELD, TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD, + TSV_FILE_WITH_QUOTE_IN_FIELD_NAME, NDJSON_WITH_NUMERIC_NAMES, EMPTY_INPUT_TSV, EMPTY_INPUT_NDJSON, diff --git a/testBaseData/tsvWithQuoteInFieldName/aa_insertions.tsv b/testBaseData/tsvWithQuoteInFieldName/aa_insertions.tsv new file mode 100644 index 000000000..074d1eeb4 --- /dev/null +++ b/testBaseData/tsvWithQuoteInFieldName/aa_insertions.tsv @@ -0,0 +1 @@ +id diff --git a/testBaseData/tsvWithQuoteInFieldName/database_config.yaml b/testBaseData/tsvWithQuoteInFieldName/database_config.yaml new file mode 100644 index 000000000..fbff099dd --- /dev/null +++ b/testBaseData/tsvWithQuoteInFieldName/database_config.yaml @@ -0,0 +1,8 @@ +schema: + instanceName: Test + metadata: + - name: id + type: string + - name: 'x"y' + type: string + primaryKey: id diff --git a/testBaseData/tsvWithQuoteInFieldName/metadata.tsv b/testBaseData/tsvWithQuoteInFieldName/metadata.tsv new file mode 100644 index 000000000..50b6d258d --- /dev/null +++ b/testBaseData/tsvWithQuoteInFieldName/metadata.tsv @@ -0,0 +1,3 @@ +id "x""y" +1 a +2 b diff --git a/testBaseData/tsvWithQuoteInFieldName/nuc_insertions.tsv b/testBaseData/tsvWithQuoteInFieldName/nuc_insertions.tsv new file mode 100644 index 000000000..074d1eeb4 --- /dev/null +++ b/testBaseData/tsvWithQuoteInFieldName/nuc_insertions.tsv @@ -0,0 +1 @@ +id diff --git a/testBaseData/tsvWithQuoteInFieldName/preprocessing_config.yaml b/testBaseData/tsvWithQuoteInFieldName/preprocessing_config.yaml new file mode 100644 index 000000000..02a9fc6e9 --- /dev/null +++ b/testBaseData/tsvWithQuoteInFieldName/preprocessing_config.yaml @@ -0,0 +1,2 @@ +metadataFilename: "metadata.tsv" +referenceGenomeFilename: "reference_genomes.json" diff --git a/testBaseData/tsvWithQuoteInFieldName/reference_genomes.json b/testBaseData/tsvWithQuoteInFieldName/reference_genomes.json new file mode 100644 index 000000000..55830fa40 --- /dev/null +++ b/testBaseData/tsvWithQuoteInFieldName/reference_genomes.json @@ -0,0 +1,4 @@ +{ + "nucleotideSequences": [], + "genes": [] +}