From 5d908b271affa71be09114400aa36209af3797a2 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Mon, 30 Sep 2024 18:46:41 +0200 Subject: [PATCH] fixup! fix: correctly escape quotes in field names --- include/silo/preprocessing/identifier.h | 2 + include/silo/preprocessing/preprocessor.h | 3 +- src/silo/preprocessing/identifier.cpp | 4 + src/silo/preprocessing/metadata_info.test.cpp | 64 --- src/silo/preprocessing/preprocessor.cpp | 10 +- src/silo/preprocessing/preprocessor.test.cpp | 26 + .../aa_insertions.tsv | 101 ++++ .../database_config.yaml | 29 + .../nuc_insertions.tsv | 101 ++++ .../pangolineage_alias.json | 501 ++++++++++++++++++ .../preprocessing_config.yaml | 4 + .../reference_genomes.json | 6 + .../small_metadata_set.tsv | 101 ++++ 13 files changed, 882 insertions(+), 70 deletions(-) create mode 100644 testBaseData/tsvWithQuoteInPartitionBy/aa_insertions.tsv create mode 100644 testBaseData/tsvWithQuoteInPartitionBy/database_config.yaml create mode 100644 testBaseData/tsvWithQuoteInPartitionBy/nuc_insertions.tsv create mode 100644 testBaseData/tsvWithQuoteInPartitionBy/pangolineage_alias.json create mode 100644 testBaseData/tsvWithQuoteInPartitionBy/preprocessing_config.yaml create mode 100644 testBaseData/tsvWithQuoteInPartitionBy/reference_genomes.json create mode 100644 testBaseData/tsvWithQuoteInPartitionBy/small_metadata_set.tsv diff --git a/include/silo/preprocessing/identifier.h b/include/silo/preprocessing/identifier.h index d086759c9..e0101e55a 100644 --- a/include/silo/preprocessing/identifier.h +++ b/include/silo/preprocessing/identifier.h @@ -16,6 +16,8 @@ class Identifier { const std::string& getRawIdentifier() const; std::string escape() const; + + bool operator==(const Identifier& other) const; }; } // namespace silo::preprocessing diff --git a/include/silo/preprocessing/preprocessor.h b/include/silo/preprocessing/preprocessor.h index fe1cb25fe..d9d7d2c0e 100644 --- a/include/silo/preprocessing/preprocessor.h +++ b/include/silo/preprocessing/preprocessor.h @@ -6,6 +6,7 @@ #include "silo/config/database_config.h" #include "silo/config/preprocessing_config.h" #include "silo/database.h" +#include "silo/preprocessing/identifier.h" #include "silo/preprocessing/identifiers.h" #include "silo/preprocessing/preprocessing_database.h" #include "silo/preprocessing/validated_ndjson_file.h" @@ -58,7 +59,7 @@ class Preprocessor { void buildMetadataTableFromFile(const std::filesystem::path& metadata_filename); void buildPartitioningTable(); - void buildPartitioningTableByColumn(const std::string& partition_by_field); + void buildPartitioningTableByColumn(const Identifier& partition_by_field); void buildEmptyPartitioning(); template diff --git a/src/silo/preprocessing/identifier.cpp b/src/silo/preprocessing/identifier.cpp index 5828b8447..d6bf0420d 100644 --- a/src/silo/preprocessing/identifier.cpp +++ b/src/silo/preprocessing/identifier.cpp @@ -26,4 +26,8 @@ std::string Identifier::escape() const { return escapeIdentifier(raw_identifier); } +bool Identifier::operator==(const Identifier& other) const { + return raw_identifier == other.raw_identifier; +} + } // namespace silo::preprocessing diff --git a/src/silo/preprocessing/metadata_info.test.cpp b/src/silo/preprocessing/metadata_info.test.cpp index 62a889217..58a39b8ff 100644 --- a/src/silo/preprocessing/metadata_info.test.cpp +++ b/src/silo/preprocessing/metadata_info.test.cpp @@ -35,67 +35,3 @@ TEST( silo::preprocessing::PreprocessingException ); } - -TEST(MetadataInfo, isValidMedataFileShouldReturnTrueWithValidMetadataFile) { - const silo::config::DatabaseConfig valid_config{ - .default_nucleotide_sequence = "main", - .schema = - { - .instance_name = "testInstanceName", - .metadata = - { - {.name = "gisaid_epi_isl", .type = silo::config::ValueType::STRING}, - {.name = "pango_lineage", .type = silo::config::ValueType::PANGOLINEAGE}, - {.name = "date", .type = silo::config::ValueType::DATE}, - {.name = "country", .type = silo::config::ValueType::STRING}, - }, - .primary_key = "gisaid_epi_isl", - } - }; - - const auto raw_fields = - silo::preprocessing::MetadataInfo::getMetadataFields(valid_config).getRawIdentifierStrings(); - ASSERT_TRUE(std::ranges::find(raw_fields, "gisaid_epi_isl") != raw_fields.end()); - ASSERT_TRUE(std::ranges::find(raw_fields, "pango_lineage") != raw_fields.end()); - ASSERT_TRUE(std::ranges::find(raw_fields, "date") != raw_fields.end()); - ASSERT_TRUE(std::ranges::find(raw_fields, "country") != raw_fields.end()); - - const auto escaped_fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config) - .getEscapedIdentifierStrings(); - ASSERT_TRUE(std::ranges::find(escaped_fields, R"("gisaid_epi_isl")") != escaped_fields.end()); - ASSERT_TRUE(std::ranges::find(escaped_fields, R"("pango_lineage")") != escaped_fields.end()); - ASSERT_TRUE(std::ranges::find(escaped_fields, R"("date")") != escaped_fields.end()); - ASSERT_TRUE(std::ranges::find(escaped_fields, R"("country")") != escaped_fields.end()); -} - -TEST(MetadataInfo, shouldValidateCorrectNdjsonInputFile) { - const silo::config::DatabaseConfig valid_config{ - .default_nucleotide_sequence = "main", - .schema = - { - .instance_name = "testInstanceName", - .metadata = - { - {.name = "gisaid_epi_isl", .type = silo::config::ValueType::STRING}, - {.name = "pango_lineage", .type = silo::config::ValueType::PANGOLINEAGE}, - {.name = "date", .type = silo::config::ValueType::DATE}, - {.name = "country", .type = silo::config::ValueType::STRING}, - }, - .primary_key = "gisaid_epi_isl", - } - }; - - const auto raw_fields = - silo::preprocessing::MetadataInfo::getMetadataFields(valid_config).getRawIdentifierStrings(); - ASSERT_TRUE(std::ranges::find(raw_fields, "gisaid_epi_isl") != raw_fields.end()); - ASSERT_TRUE(std::ranges::find(raw_fields, "pango_lineage") != raw_fields.end()); - ASSERT_TRUE(std::ranges::find(raw_fields, "date") != raw_fields.end()); - ASSERT_TRUE(std::ranges::find(raw_fields, "country") != raw_fields.end()); - - const auto escaped_fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config) - .getEscapedIdentifierStrings(); - ASSERT_TRUE(std::ranges::find(escaped_fields, R"("gisaid_epi_isl")") != escaped_fields.end()); - ASSERT_TRUE(std::ranges::find(escaped_fields, R"("pango_lineage")") != escaped_fields.end()); - ASSERT_TRUE(std::ranges::find(escaped_fields, R"("date")") != escaped_fields.end()); - ASSERT_TRUE(std::ranges::find(escaped_fields, R"("country")") != escaped_fields.end()); -} diff --git a/src/silo/preprocessing/preprocessor.cpp b/src/silo/preprocessing/preprocessor.cpp index b796258a2..f71f05f53 100644 --- a/src/silo/preprocessing/preprocessor.cpp +++ b/src/silo/preprocessing/preprocessor.cpp @@ -259,14 +259,14 @@ void Preprocessor::buildPartitioningTable() { "preprocessing - partitioning input by metadata key '{}'", database_config.schema.partition_by.value() ); - buildPartitioningTableByColumn(database_config.schema.partition_by.value()); + buildPartitioningTableByColumn(Identifier{database_config.schema.partition_by.value()}); } else { SPDLOG_DEBUG("preprocessing - no metadata key for partitioning provided"); buildEmptyPartitioning(); } } -void Preprocessor::buildPartitioningTableByColumn(const std::string& partition_by_field) { +void Preprocessor::buildPartitioningTableByColumn(const Identifier& partition_by_field) { SPDLOG_DEBUG("preprocessing - calculating partitions"); (void)preprocessing_db.query(fmt::format( @@ -278,7 +278,7 @@ FROM (SELECT {} AS partition_key, COUNT(*) AS count GROUP BY partition_key ORDER BY partition_key); )-", - makeNonNullKey(partition_by_field) + makeNonNullKey(partition_by_field.escape()) )); // create Recursive Hierarchical Partitioning By Partition Field @@ -332,8 +332,8 @@ AND partition_keys.partition_key = 'NULL')) AND partition_keys.id >= partitioning.from_id AND partition_keys.id <= partitioning.to_id; )-", - makeNonNullKey(partition_by_field), - partition_by_field + makeNonNullKey(partition_by_field.escape()), + partition_by_field.escape() )); } diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp index 38e2cc014..d5c0e80bf 100644 --- a/src/silo/preprocessing/preprocessor.test.cpp +++ b/src/silo/preprocessing/preprocessor.test.cpp @@ -166,6 +166,31 @@ const Scenario TSV_FILE_WITH_QUOTE_IN_FIELD_NAME = { ) }; +const Scenario TSV_FILE_WITH_QUOTE_IN_PARTITION_BY = { + .input_directory = "testBaseData/tsvWithQuoteInPartitionBy/", + .expected_sequence_count = 100, + .query = R"( +{ + "action": { + "type": "Aggregated", + "groupByFields": ["pango_\"lineage"], + "orderByFields": ["pango_\"lineage"], + "limit": 3 + }, + "filterExpression": { + "type": "True" + } +} + )", + .expected_query_result = nlohmann::json::parse( + R"([ + {"count":1,"pango_\"lineage":null}, + {"count":1,"pango_\"lineage":"AY.122"}, + {"count":4,"pango_\"lineage":"AY.43"} + ])" + ) +}; + const Scenario EMPTY_INPUT_TSV = { .input_directory = "testBaseData/emptyInputTsv/", .expected_sequence_count = 0, @@ -313,6 +338,7 @@ INSTANTIATE_TEST_SUITE_P( NDJSON_WITH_SQL_KEYWORD_AS_FIELD, TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD, TSV_FILE_WITH_QUOTE_IN_FIELD_NAME, + TSV_FILE_WITH_QUOTE_IN_PARTITION_BY, NDJSON_WITH_NUMERIC_NAMES, EMPTY_INPUT_TSV, EMPTY_INPUT_NDJSON, diff --git a/testBaseData/tsvWithQuoteInPartitionBy/aa_insertions.tsv b/testBaseData/tsvWithQuoteInPartitionBy/aa_insertions.tsv new file mode 100644 index 000000000..3554f9a04 --- /dev/null +++ b/testBaseData/tsvWithQuoteInPartitionBy/aa_insertions.tsv @@ -0,0 +1,101 @@ +gisaid_epi_isl +EPI_ISL_1408408 +EPI_ISL_1749899 +EPI_ISL_2016901 +EPI_ISL_1749892 +EPI_ISL_1597932 +EPI_ISL_1407962 +EPI_ISL_1750503 +EPI_ISL_1360935 +EPI_ISL_2019235 +EPI_ISL_1749960 +EPI_ISL_1361468 +EPI_ISL_1408062 +EPI_ISL_1597890 +EPI_ISL_1682849 +EPI_ISL_1408805 +EPI_ISL_1750868 +EPI_ISL_2019350 +EPI_ISL_2017036 +EPI_ISL_1599113 +EPI_ISL_2214128 +EPI_ISL_2408472 +EPI_ISL_830864 +EPI_ISL_581968 +EPI_ISL_2213804 +EPI_ISL_2405276 +EPI_ISL_2213934 +EPI_ISL_2213984 +EPI_ISL_2574088 +EPI_ISL_2544226 +EPI_ISL_2360326 +EPI_ISL_2379651 +EPI_ISL_1036103 +EPI_ISL_931279 +EPI_ISL_931031 +EPI_ISL_1273458 +EPI_ISL_1273715 +EPI_ISL_737604 +EPI_ISL_1129663 +EPI_ISL_1003629 +EPI_ISL_737715 +EPI_ISL_1003036 +EPI_ISL_899762 +EPI_ISL_899725 +EPI_ISL_1195052 +EPI_ISL_1003519 +EPI_ISL_1003010 +EPI_ISL_1119584 +EPI_ISL_1002052 +EPI_ISL_466942 +EPI_ISL_1003849 +EPI_ISL_768148 +EPI_ISL_1080536 +EPI_ISL_1002156 +EPI_ISL_1119315 +EPI_ISL_1004495 +EPI_ISL_1001920 +EPI_ISL_1131102 +EPI_ISL_1003373 +EPI_ISL_721941 +EPI_ISL_1130868 +EPI_ISL_1003425 +EPI_ISL_737860 +EPI_ISL_1001493 +EPI_ISL_1260480 +EPI_ISL_1747885 +EPI_ISL_1747752 +EPI_ISL_1005148 +EPI_ISL_1748243 +EPI_ISL_1748215 +EPI_ISL_1748395 +EPI_ISL_1760534 +EPI_ISL_2086867 +EPI_ISL_1840634 +EPI_ISL_2180995 +EPI_ISL_2181005 +EPI_ISL_2180023 +EPI_ISL_2270139 +EPI_ISL_2544452 +EPI_ISL_2544332 +EPI_ISL_2307766 +EPI_ISL_2375490 +EPI_ISL_2374969 +EPI_ISL_2307888 +EPI_ISL_2375247 +EPI_ISL_2308054 +EPI_ISL_2375165 +EPI_ISL_2375097 +EPI_ISL_3128737 +EPI_ISL_3128811 +EPI_ISL_3086369 +EPI_ISL_3259931 +EPI_ISL_3267832 +EPI_ISL_3128796 +EPI_ISL_3016465 +EPI_ISL_3247294 +EPI_ISL_3578231 +EPI_ISL_3465732 +EPI_ISL_2367431 +EPI_ISL_3465556 +EPI_ISL_2359636 diff --git a/testBaseData/tsvWithQuoteInPartitionBy/database_config.yaml b/testBaseData/tsvWithQuoteInPartitionBy/database_config.yaml new file mode 100644 index 000000000..10759f7c6 --- /dev/null +++ b/testBaseData/tsvWithQuoteInPartitionBy/database_config.yaml @@ -0,0 +1,29 @@ +schema: + instanceName: sars_cov-2_minimal_test_config + metadata: + - name: gisaid_epi_isl + type: string + - name: date + type: date + - name: unsorted_date + type: date + - name: region + type: string + generateIndex: true + - name: country + type: string + generateIndex: true + - name: 'pango_"lineage' + type: pango_lineage + - name: division + type: string + generateIndex: true + - name: age + type: int + - name: qc_value + type: float + - name: test_boolean_column + type: boolean + primaryKey: gisaid_epi_isl + dateToSortBy: date + partitionBy: 'pango_"lineage' diff --git a/testBaseData/tsvWithQuoteInPartitionBy/nuc_insertions.tsv b/testBaseData/tsvWithQuoteInPartitionBy/nuc_insertions.tsv new file mode 100644 index 000000000..3554f9a04 --- /dev/null +++ b/testBaseData/tsvWithQuoteInPartitionBy/nuc_insertions.tsv @@ -0,0 +1,101 @@ +gisaid_epi_isl +EPI_ISL_1408408 +EPI_ISL_1749899 +EPI_ISL_2016901 +EPI_ISL_1749892 +EPI_ISL_1597932 +EPI_ISL_1407962 +EPI_ISL_1750503 +EPI_ISL_1360935 +EPI_ISL_2019235 +EPI_ISL_1749960 +EPI_ISL_1361468 +EPI_ISL_1408062 +EPI_ISL_1597890 +EPI_ISL_1682849 +EPI_ISL_1408805 +EPI_ISL_1750868 +EPI_ISL_2019350 +EPI_ISL_2017036 +EPI_ISL_1599113 +EPI_ISL_2214128 +EPI_ISL_2408472 +EPI_ISL_830864 +EPI_ISL_581968 +EPI_ISL_2213804 +EPI_ISL_2405276 +EPI_ISL_2213934 +EPI_ISL_2213984 +EPI_ISL_2574088 +EPI_ISL_2544226 +EPI_ISL_2360326 +EPI_ISL_2379651 +EPI_ISL_1036103 +EPI_ISL_931279 +EPI_ISL_931031 +EPI_ISL_1273458 +EPI_ISL_1273715 +EPI_ISL_737604 +EPI_ISL_1129663 +EPI_ISL_1003629 +EPI_ISL_737715 +EPI_ISL_1003036 +EPI_ISL_899762 +EPI_ISL_899725 +EPI_ISL_1195052 +EPI_ISL_1003519 +EPI_ISL_1003010 +EPI_ISL_1119584 +EPI_ISL_1002052 +EPI_ISL_466942 +EPI_ISL_1003849 +EPI_ISL_768148 +EPI_ISL_1080536 +EPI_ISL_1002156 +EPI_ISL_1119315 +EPI_ISL_1004495 +EPI_ISL_1001920 +EPI_ISL_1131102 +EPI_ISL_1003373 +EPI_ISL_721941 +EPI_ISL_1130868 +EPI_ISL_1003425 +EPI_ISL_737860 +EPI_ISL_1001493 +EPI_ISL_1260480 +EPI_ISL_1747885 +EPI_ISL_1747752 +EPI_ISL_1005148 +EPI_ISL_1748243 +EPI_ISL_1748215 +EPI_ISL_1748395 +EPI_ISL_1760534 +EPI_ISL_2086867 +EPI_ISL_1840634 +EPI_ISL_2180995 +EPI_ISL_2181005 +EPI_ISL_2180023 +EPI_ISL_2270139 +EPI_ISL_2544452 +EPI_ISL_2544332 +EPI_ISL_2307766 +EPI_ISL_2375490 +EPI_ISL_2374969 +EPI_ISL_2307888 +EPI_ISL_2375247 +EPI_ISL_2308054 +EPI_ISL_2375165 +EPI_ISL_2375097 +EPI_ISL_3128737 +EPI_ISL_3128811 +EPI_ISL_3086369 +EPI_ISL_3259931 +EPI_ISL_3267832 +EPI_ISL_3128796 +EPI_ISL_3016465 +EPI_ISL_3247294 +EPI_ISL_3578231 +EPI_ISL_3465732 +EPI_ISL_2367431 +EPI_ISL_3465556 +EPI_ISL_2359636 diff --git a/testBaseData/tsvWithQuoteInPartitionBy/pangolineage_alias.json b/testBaseData/tsvWithQuoteInPartitionBy/pangolineage_alias.json new file mode 100644 index 000000000..28210c7b9 --- /dev/null +++ b/testBaseData/tsvWithQuoteInPartitionBy/pangolineage_alias.json @@ -0,0 +1,501 @@ +{ + "A": "", + "B": "", + "C": "B.1.1.1", + "D": "B.1.1.25", + "G": "B.1.258.2", + "K": "B.1.1.277", + "L": "B.1.1.10", + "M": "B.1.1.294", + "N": "B.1.1.33", + "P": "B.1.1.28", + "Q": "B.1.1.7", + "R": "B.1.1.316", + "S": "B.1.1.217", + "U": "B.1.177.60", + "V": "B.1.177.54", + "W": "B.1.177.53", + "Y": "B.1.177.52", + "Z": "B.1.177.50", + "AA": "B.1.177.15", + "AB": "B.1.160.16", + "AC": "B.1.1.405", + "AD": "B.1.1.315", + "AE": "B.1.1.306", + "AF": "B.1.1.305", + "AG": "B.1.1.297", + "AH": "B.1.1.241", + "AJ": "B.1.1.240", + "AK": "B.1.1.232", + "AL": "B.1.1.231", + "AM": "B.1.1.216", + "AN": "B.1.1.200", + "AP": "B.1.1.70", + "AQ": "B.1.1.39", + "AS": "B.1.1.317", + "AT": "B.1.1.370", + "AU": "B.1.466.2", + "AV": "B.1.1.482", + "AW": "B.1.1.464", + "AY": "B.1.617.2", + "AZ": "B.1.1.318", + "BA": "B.1.1.529", + "BB": "B.1.621.1", + "BC": "B.1.1.529.1.1.1", + "BD": "B.1.1.529.1.17.2", + "BE": "B.1.1.529.5.3.1", + "BF": "B.1.1.529.5.2.1", + "BG": "B.1.1.529.2.12.1", + "BH": "B.1.1.529.2.38.3", + "BJ": "B.1.1.529.2.10.1", + "BK": "B.1.1.529.5.1.10", + "BL": "B.1.1.529.2.75.1", + "BM": "B.1.1.529.2.75.3", + "BN": "B.1.1.529.2.75.5", + "BP": "B.1.1.529.2.3.16", + "BQ": "B.1.1.529.5.3.1.1.1.1", + "BR": "B.1.1.529.2.75.4", + "BS": "B.1.1.529.2.3.2", + "BT": "B.1.1.529.5.1.21", + "BU": "B.1.1.529.5.2.16", + "BV": "B.1.1.529.5.2.20", + "BW": "B.1.1.529.5.6.2", + "BY": "B.1.1.529.2.75.6", + "BZ": "B.1.1.529.5.2.3", + "CA": "B.1.1.529.2.75.2", + "CB": "B.1.1.529.2.75.9", + "CC": "B.1.1.529.5.3.1.1.1.2", + "CD": "B.1.1.529.5.2.31", + "CE": "B.1.1.529.5.2.33", + "CF": "B.1.1.529.5.2.27", + "CG": "B.1.1.529.5.2.26", + "CH": "B.1.1.529.2.75.3.4.1.1", + "CJ": "B.1.1.529.2.75.3.1.1.1", + "CK": "B.1.1.529.5.2.24", + "CL": "B.1.1.529.5.1.29", + "CM": "B.1.1.529.2.3.20", + "CN": "B.1.1.529.5.2.21", + "CP": "B.1.1.529.5.2.6", + "CQ": "B.1.1.529.5.3.1.4.1.1", + "CR": "B.1.1.529.5.2.18", + "CS": "B.1.1.529.4.1.10", + "CT": "B.1.1.529.5.2.36", + "CU": "B.1.1.529.5.1.26", + "CV": "B.1.1.529.2.75.3.1.1.3", + "CW": "B.1.1.529.5.3.1.1.1.1.1.1.14", + "CY": "B.1.1.529.5.2.7", + "CZ": "B.1.1.529.5.3.1.1.1.1.1.1.1", + "DA": "B.1.1.529.5.2.38", + "DB": "B.1.1.529.5.2.25", + "DC": "B.1.1.529.4.6.5", + "DD": "B.1.1.529.2.3.21", + "DE": "B.1.1.529.5.1.23", + "DF": "B.1.1.529.5.10.1", + "DG": "B.1.1.529.5.2.24.2.1.1", + "DH": "B.1.1.529.5.1.22", + "DJ": "B.1.1.529.5.1.25", + "DK": "B.1.1.529.5.3.1.1.1.1.1.1.7", + "DL": "B.1.1.529.5.1.15", + "DM": "B.1.1.529.5.3.1.1.1.1.1.1.15", + "DN": "B.1.1.529.5.3.1.1.1.1.1.1.5", + "DP": "B.1.1.529.5.3.1.1.1.1.1.1.8", + "DQ": "B.1.1.529.5.2.47", + "DR": "B.1.1.529.5.3.1.1.1.1.1.1.3", + "DS": "B.1.1.529.2.75.5.1.3.1", + "DT": "B.1.1.529.5.3.1.1.1.1.1.1.32", + "DU": "B.1.1.529.5.3.1.1.1.1.1.1.2", + "DV": "B.1.1.529.2.75.3.4.1.1.1.1.1", + "DW": "B.1.1.529.5.3.1.1.2.1", + "DY": "B.1.1.529.5.2.48", + "DZ": "B.1.1.529.5.2.49", + "EA": "B.1.1.529.5.3.1.1.1.1.1.1.52", + "EB": "B.1.1.529.5.1.35", + "EC": "B.1.1.529.5.3.1.1.1.1.1.10.1", + "ED": "B.1.1.529.5.3.1.1.1.1.1.1.18", + "EE": "B.1.1.529.5.3.1.1.1.1.1.1.4", + "EF": "B.1.1.529.5.3.1.1.1.1.1.1.13", + "EG": "XBB.1.9.2", + "EH": "B.1.1.529.5.3.1.1.1.1.1.1.28", + "EJ": "B.1.1.529.2.75.5.1.3.8", + "EK": "XBB.1.5.13", + "EL": "XBB.1.5.14", + "EM": "XBB.1.5.7", + "EN": "B.1.1.529.5.3.1.1.1.1.1.1.46", + "EP": "B.1.1.529.2.75.3.1.1.4", + "EQ": "B.1.1.529.5.1.33", + "ER": "B.1.1.529.5.3.1.1.1.1.1.1.22", + "ES": "B.1.1.529.5.3.1.1.1.1.1.1.65", + "ET": "B.1.1.529.5.3.1.1.1.1.1.1.35", + "EU": "XBB.1.5.26", + "EV": "B.1.1.529.5.3.1.1.1.1.1.1.71", + "EW": "B.1.1.529.5.3.1.1.1.1.1.1.38", + "EY": "B.1.1.529.5.3.1.1.1.1.1.1.13.1.1.1", + "EZ": "B.1.1.529.5.3.1.1.1.1.1.1.43", + "FA": "B.1.1.529.5.3.1.1.1.1.1.1.10", + "FB": "B.1.1.529.5.3.1.1.1.1.1.2.1", + "FC": "B.1.1.529.5.3.1.1.1.1.1.1.72", + "FD": "XBB.1.5.15", + "FE": "XBB.1.18.1", + "FF": "B.1.1.529.5.3.1.1.1.1.1.8.2", + "FG": "XBB.1.5.16", + "FH": "XBB.1.5.17", + "FJ": "B.1.1.529.2.75.3.4.1.1.1.1.19", + "FK": "B.1.1.529.2.75.3.4.1.1.1.1.17", + "FL": "XBB.1.9.1", + "FM": "B.1.1.529.5.3.1.1.1.1.1.1.53", + "FN": "B.1.1.529.5.3.1.1.1.1.1.1.74", + "FP": "XBB.1.11.1", + "FQ": "B.1.1.529.5.3.1.1.1.1.1.1.39", + "FR": "B.1.1.529.2.75.5.1.2.3", + "FS": "B.1.1.529.2.75.3.4.1.1.1.1.12", + "FT": "XBB.1.5.39", + "FU": "XBB.1.16.1", + "FV": "B.1.1.529.2.3.20.8.1.1", + "FW": "XBB.1.28.1", + "FY": "XBB.1.22.1", + "FZ": "XBB.1.5.47", + "GA": "XBB.1.17.1", + "GB": "XBB.1.5.46", + "GC": "XBB.1.5.21", + "GD": "XBB.1.9.3", + "GE": "XBB.2.3.10", + "GF": "XBB.1.5.24", + "GG": "XBB.1.5.38", + "GH": "XBB.2.6.1", + "GJ": "XBB.2.3.3", + "GK": "XBB.1.5.70", + "GL": "XAY.1.1.1", + "GM": "XBB.2.3.6", + "GN": "XBB.1.5.73", + "GP": "B.1.1.529.2.75.3.4.1.1.1.1.11", + "GQ": "B.1.1.529.2.75.3.4.1.1.1.1.3", + "GR": "XBB.1.5.42", + "GS": "XBB.2.3.11", + "GT": "XBC.1.6.1", + "GU": "XBB.1.5.41", + "GV": "XBB.1.5.48", + "GW": "XBB.1.19.1", + "GY": "XBB.1.16.2", + "GZ": "XBB.2.3.4", + "HA": "XBB.1.5.86", + "HB": "XBB.1.34.2", + "HC": "XBB.1.5.44", + "XA": [ + "B.1.1.7", + "B.1.177" + ], + "XB": [ + "B.1.634", + "B.1.631" + ], + "XC": [ + "AY.29", + "B.1.1.7" + ], + "XD": [ + "B.1.617.2*", + "BA.1*" + ], + "XE": [ + "BA.1*", + "BA.2*" + ], + "XF": [ + "B.1.617.2*", + "BA.1*" + ], + "XG": [ + "BA.1*", + "BA.2*" + ], + "XH": [ + "BA.1*", + "BA.2*" + ], + "XJ": [ + "BA.1*", + "BA.2*" + ], + "XK": [ + "BA.1*", + "BA.2*" + ], + "XL": [ + "BA.1*", + "BA.2*" + ], + "XM": [ + "BA.1.1*", + "BA.2*" + ], + "XN": [ + "BA.1*", + "BA.2*" + ], + "XP": [ + "BA.1.1*", + "BA.2*" + ], + "XQ": [ + "BA.1.1*", + "BA.2*" + ], + "XR": [ + "BA.1.1*", + "BA.2*" + ], + "XS": [ + "B.1.617.2*", + "BA.1.1*" + ], + "XT": [ + "BA.2*", + "BA.1*" + ], + "XU": [ + "BA.1*", + "BA.2*" + ], + "XV": [ + "BA.1*", + "BA.2*" + ], + "XW": [ + "BA.1*", + "BA.2*" + ], + "XY": [ + "BA.1*", + "BA.2*" + ], + "XZ": [ + "BA.2*", + "BA.1*" + ], + "XAA": [ + "BA.1*", + "BA.2*" + ], + "XAB": [ + "BA.1*", + "BA.2*" + ], + "XAC": [ + "BA.2*", + "BA.1*", + "BA.2*" + ], + "XAD": [ + "BA.2*", + "BA.1*" + ], + "XAE": [ + "BA.2*", + "BA.1*" + ], + "XAF": [ + "BA.1*", + "BA.2*" + ], + "XAG": [ + "BA.1*", + "BA.2*" + ], + "XAH": [ + "BA.2*", + "BA.1*" + ], + "XAJ": [ + "BA.2.12.1*", + "BA.4*" + ], + "XAK": [ + "BA.2*", + "BA.1*", + "BA.2*" + ], + "XAL": [ + "BA.1*", + "BA.2*" + ], + "XAM": [ + "BA.1.1", + "BA.2.9" + ], + "XAN": [ + "BA.2*", + "BA.5.1" + ], + "XAP": [ + "BA.2*", + "BA.1*" + ], + "XAQ": [ + "BA.1*", + "BA.2*" + ], + "XAR": [ + "BA.1*", + "BA.2*" + ], + "XAS": [ + "BA.5*", + "BA.2*" + ], + "XAT": [ + "BA.2.3.13", + "BA.1*" + ], + "XAU": [ + "BA.1.1*", + "BA.2.9*" + ], + "XAV": [ + "BA.2*", + "BA.5*" + ], + "XAW": [ + "BA.2*", + "AY.122" + ], + "XAY": [ + "BA.2*", + "AY.45", + "BA.2*", + "AY.45", + "BA.2*" + ], + "XAZ": [ + "BA.2.5", + "BA.5", + "BA.2.5" + ], + "XBA": [ + "BA.2*", + "AY.45", + "BA.2*", + "AY.45", + "BA.2*" + ], + "XBB": [ + "BJ.1", + "BM.1.1.1" + ], + "XBC": [ + "BA.2*", + "B.1.617.2*", + "BA.2*", + "B.1.617.2*" + ], + "XBD": [ + "BA.2.75.2", + "BF.5" + ], + "XBE": [ + "BA.5.2", + "BE.4.1" + ], + "XBF": [ + "BA.5.2.3", + "CJ.1" + ], + "XBG": [ + "BA.2.76", + "BA.5.2" + ], + "XBH": [ + "BA.2.3.17", + "BA.2.75.2" + ], + "XBJ": [ + "BA.2.3.20", + "BA.5.2" + ], + "XBK": [ + "BA.5.2", + "CJ.1" + ], + "XBL": [ + "XBB.1.5.57", + "BA.2.75*", + "XBB.1.5.57" + ], + "XBM": [ + "BA.2.76", + "BF.3" + ], + "XBN": [ + "BA.2.75", + "XBB.3" + ], + "XBP": [ + "BA.2.75*", + "BQ.1*" + ], + "XBQ": [ + "BA.5.2", + "CJ.1" + ], + "XBR": [ + "BA.2.75", + "BQ.1" + ], + "XBS": [ + "BA.2.75", + "BQ.1" + ], + "XBT": [ + "BA.5.2.34", + "BA.2.75", + "BA.5.2.34" + ], + "XBU": [ + "BA.2.75.3", + "BQ.1", + "BA.2.75.3" + ], + "XBV": [ + "CR.1", + "XBB.1" + ], + "XBW": [ + "XBB.1.5", + "BQ.1.14" + ], + "XBY": [ + "BR.2.1", + "XBF" + ], + "XBZ": [ + "BA.5.2*", + "EF.1.3" + ], + "XCA": [ + "BA.2.75*", + "BQ.1*" + ], + "XCB": [ + "BF.31.1", + "BQ.1.10*" + ], + "XCC": [ + "CH.1.1.1", + "XBB.1.9.1" + ], + "XCD": [ + "XBB.1*", + "BQ.1.1.25*" + ], + "XCE": [ + "BQ.1*", + "FY.1" + ], + "XCF": [ + "XBB*", + "FE.1" + ], + "XCG": [ + "BA.5.2*", + "XBB.1" + ] +} diff --git a/testBaseData/tsvWithQuoteInPartitionBy/preprocessing_config.yaml b/testBaseData/tsvWithQuoteInPartitionBy/preprocessing_config.yaml new file mode 100644 index 000000000..ccac20090 --- /dev/null +++ b/testBaseData/tsvWithQuoteInPartitionBy/preprocessing_config.yaml @@ -0,0 +1,4 @@ +metadataFilename: "small_metadata_set.tsv" +pangoLineageDefinitionFilename: "pangolineage_alias.json" +referenceGenomeFilename: "reference_genomes.json" +preprocessingDatabaseLocation: "test.duckdb" diff --git a/testBaseData/tsvWithQuoteInPartitionBy/reference_genomes.json b/testBaseData/tsvWithQuoteInPartitionBy/reference_genomes.json new file mode 100644 index 000000000..c9868f376 --- /dev/null +++ b/testBaseData/tsvWithQuoteInPartitionBy/reference_genomes.json @@ -0,0 +1,6 @@ +{ + "nucleotideSequences": [ + ], + "genes": [ + ] +} diff --git a/testBaseData/tsvWithQuoteInPartitionBy/small_metadata_set.tsv b/testBaseData/tsvWithQuoteInPartitionBy/small_metadata_set.tsv new file mode 100644 index 000000000..ebf5e6031 --- /dev/null +++ b/testBaseData/tsvWithQuoteInPartitionBy/small_metadata_set.tsv @@ -0,0 +1,101 @@ +gisaid_epi_isl pango_"lineage date region country division unsorted_date age qc_value test_boolean_column +EPI_ISL_1408408 B.1.1.7 2021-03-18 Europe Switzerland Basel-Land 4 0.98 true +EPI_ISL_1749899 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-03-08 5 0.97 false +EPI_ISL_2016901 B.1.1.7 2021-04-25 Europe Switzerland Aargau 2021-01-29 6 0.96 +EPI_ISL_1749892 B.1.1.7 2021-04-13 Europe Switzerland Bern 2020-12-24 4 0.95 true +EPI_ISL_1597932 B.1.1.7 2021-03-19 Europe Switzerland Solothurn 2021-02-10 54 0.94 true +EPI_ISL_1407962 B.1.1.7 Europe Switzerland Solothurn 2021-01-16 55 0.93 false +EPI_ISL_1750503 B.1.258.17 2020-12-24 Europe Switzerland Zürich 2021-02-14 56 0.92 true +EPI_ISL_1360935 B.1.1.7 2021-03-08 Europe Switzerland Jura 2021-01-03 57 0.91 true +EPI_ISL_2019235 B.1.1.7 2021-04-28 Europe Switzerland Basel-Stadt 2021-01-22 58 0.9 false +EPI_ISL_1749960 B.1.1.7 2021-04-15 Europe Switzerland Basel-Land 2021-02-03 59 0.89 true +EPI_ISL_1361468 B.1.1.7 2021-03-06 Europe Switzerland Zürich 2021-01-20 50 0.98 +EPI_ISL_1408062 B.1.1.7 2021-03-03 Europe Switzerland Valais 2020-11-24 50 0.97 false +EPI_ISL_1597890 B.1.1.7 2021-03-21 Switzerland Vaud 2021-01-25 51 0.96 true +EPI_ISL_1682849 XA.1 2020-12-17 Europe Switzerland Thurgau 2021-01-21 52 0.95 false +EPI_ISL_1408805 B.1.221 2020-11-24 Europe Switzerland Schwyz 2020-12-09 53 0.94 true +EPI_ISL_1750868 B.1.1.189 2020-12-15 Europe Switzerland Solothurn 2021-01-20 54 0.93 true +EPI_ISL_2019350 B.1.1.7 2021-04-27 Europe Switzerland Valais 2020-12-21 55 0.92 +EPI_ISL_2017036 B.1.1.7 2021-04-23 Europe Switzerland Solothurn 2021-03-09 56 0.91 true +EPI_ISL_1599113 B.1.1.39 2020-12-08 Europe Switzerland Zürich 2021-03-05 57 0.9 false +EPI_ISL_2214128 B.1.1.7 2021-05-10 Europe Switzerland Geneva 2020-11-13 58 0.89 true +EPI_ISL_2408472 B.1.1.7 2021-05-25 Europe Switzerland Obwalden 2021-03-02 59 0.98 +EPI_ISL_830864 B.1.177 2020-10-08 Europe Switzerland Basel-Stadt 2021-03-03 50 0.97 false +EPI_ISL_581968 B.1.160 2020-08-17 Europe Switzerland Basel-Stadt 2021-03-25 50 0.96 true +EPI_ISL_2213804 Q.7 2021-05-08 Europe Switzerland Geneva 2021-04-12 51 false +EPI_ISL_2405276 B.1.1.7 2021-05-24 Europe Switzerland Vaud 2021-04-28 52 0.94 true +EPI_ISL_2213934 B.1.1.7 2021-05-13 Europe Switzerland Geneva 2021-04-23 53 0.93 +EPI_ISL_2213984 B.1.1.7 2021-05-08 Europe Switzerland Geneva 2021-05-09 54 0.92 true +EPI_ISL_2574088 B.1.1.7 2021-06-10 Europe Switzerland Sankt Gallen 2021-05-05 55 0.91 false +EPI_ISL_2544226 B.1.1.7 2021-06-05 Europe Switzerland Ticino 2021-05-12 56 0.9 false +EPI_ISL_2360326 Q.7 2021-05-23 Europe Switzerland Ticino 2021-03-10 57 0.89 false +EPI_ISL_2379651 B.1.1.7 2021-05-11 Europe Switzerland Valais 2021-06-01 58 0.98 true +EPI_ISL_1036103 B.1.258 2020-12-09 Europe Switzerland Aargau 2021-06-03 59 0.97 true +EPI_ISL_931279 B.1.1 2020-10-28 Europe Switzerland Basel-Stadt 2021-05-11 50 0.96 true +EPI_ISL_931031 B.1.177 2020-10-22 Europe Switzerland Basel-Stadt 2021-05-10 50 0.95 false +EPI_ISL_1273458 B.1.1.7 2021-01-26 Europe Switzerland Basel-Land 2021-05-18 51 0.94 true +EPI_ISL_1273715 B.1.160 2021-01-20 Europe Switzerland Basel-Stadt 2021-05-08 52 0.93 false +EPI_ISL_737604 B.1.1 2020-12-14 Europe Switzerland Bern 2021-05-14 53 0.92 true +EPI_ISL_1129663 B.1.1.7 2020-12-29 Europe Switzerland Bern 2021-05-07 54 0.91 true +EPI_ISL_1003629 B.1.1.39 2021-01-25 Europe Switzerland Aargau 2021-05-18 55 0.9 +EPI_ISL_737715 B.1.177 2020-12-13 Europe Switzerland Bern 2021-05-16 56 0.89 +EPI_ISL_1003036 B.1.177 2021-01-16 Europe Switzerland Aargau 2021-07-14 57 0.98 +EPI_ISL_899762 B.1.177 2020-12-25 Europe Switzerland Schwyz 2021-07-19 58 0.97 false +EPI_ISL_899725 B.1.177 2021-01-12 Europe Switzerland Solothurn 2021-07-14 59 0.96 true +EPI_ISL_1195052 B.1.1.7 2021-02-23 Europe Switzerland Solothurn 2021-07-04 50 0.95 +EPI_ISL_1003519 B.1.160.16 2021-01-22 Europe Switzerland 2021-07-29 50 0.94 false +EPI_ISL_1003010 B.1.36.35 2021-01-15 Europe Switzerland Solothurn 2021-07-19 51 0.93 true +EPI_ISL_1119584 B.1.1 2020-11-04 Europe Switzerland Solothurn 2021-07-05 52 0.92 false +EPI_ISL_1002052 B.1 2021-01-15 Europe Switzerland Solothurn 2021-07-15 53 0.91 true +EPI_ISL_466942 B.1 2020-03-08 Europe Switzerland Basel-Stadt 2021-05-12 54 0.9 true +EPI_ISL_1003849 B.1.160 2021-01-29 Europe Switzerland Neuchâtel 2021-08-05 55 0.89 false +EPI_ISL_768148 GD.1 2020-12-24 Europe Switzerland Sankt Gallen 2020-03-16 56 0.98 true +EPI_ISL_1080536 B.1.1.7 2021-02-10 Europe Switzerland Basel-Land 2021-08-04 57 0.97 false +EPI_ISL_1002156 B.1.221 2021-01-16 Europe Switzerland Basel-Land 2021-02-03 58 0.96 true +EPI_ISL_1119315 B.1.1.7 2021-02-14 Europe Switzerland Graubünden 2021-03-18 59 0.95 true +EPI_ISL_1004495 B.1.177.44 2021-01-03 Europe Switzerland 2021-04-13 50 0.94 false +EPI_ISL_1001920 B.1.177 2021-01-22 Europe Switzerland Bern 2021-04-25 50 0.93 false +EPI_ISL_1131102 B.1.160 2021-02-03 Europe Switzerland Zürich 2021-04-13 51 0.92 true +EPI_ISL_1003373 B.1.177 2021-01-20 Europe Switzerland Zürich 2021-03-19 52 0.91 false +EPI_ISL_721941 B.1.1.70 2020-11-24 Europe Switzerland Zürich 2021-03-15 53 0.9 false +EPI_ISL_1130868 B.1.525 2021-01-25 Europe Switzerland Zürich 2020-12-24 54 0.89 +EPI_ISL_1003425 B.1.177 2021-01-21 Europe Switzerland Uri 2021-03-08 55 0.98 true +EPI_ISL_737860 B.1.160 2020-12-09 Europe Switzerland Valais 2021-04-28 56 0.97 false +EPI_ISL_1001493 B.1.177.44 2021-01-20 Europe Switzerland Vaud 2021-04-15 57 0.96 true +EPI_ISL_1260480 B.1.160 2020-12-21 Europe Switzerland Zürich 2021-03-06 58 0.95 true +EPI_ISL_1747885 B.1.1.7 2021-03-09 Europe Switzerland Solothurn 2021-03-03 59 0.94 +EPI_ISL_1747752 B.1.1.7 2021-03-05 Europe Switzerland Basel-Land 2021-03-21 50 0.93 true +EPI_ISL_1005148 B.1.221 2020-11-13 Europe Switzerland Solothurn 2020-12-17 50 0.92 +EPI_ISL_1748243 B.1.1.7 2021-03-02 Europe Switzerland Solothurn 2020-11-24 0.91 false +EPI_ISL_1748215 B.1.1.7 2021-03-03 Europe Switzerland Solothurn 2020-12-15 52 0.9 false +EPI_ISL_1748395 B.1.1.7 2021-03-25 Europe Switzerland Basel-Stadt 2021-04-27 53 0.89 false +EPI_ISL_1760534 B.1.1.7 2021-04-12 Europe Switzerland Ticino 2021-04-23 54 0.98 +EPI_ISL_2086867 C.36.3 2021-04-28 Europe Switzerland Zürich 2020-12-08 55 0.97 true +EPI_ISL_1840634 Q.7 2021-04-23 Europe Switzerland Ticino 2021-05-10 56 0.96 false +EPI_ISL_2180995 B.1.1.7 2021-05-09 Europe Switzerland Basel-Stadt 2021-05-25 57 0.95 +EPI_ISL_2181005 B.1.1.7 2021-05-05 Europe Switzerland Basel-Stadt 2020-10-08 58 0.94 +EPI_ISL_2180023 B.1.1.7 2021-05-12 Europe Switzerland Ticino 2020-08-17 59 0.93 true +EPI_ISL_2270139 B.1.1.7 2021-03-10 Europe Switzerland Basel-Stadt 2021-05-08 50 0.92 true +EPI_ISL_2544452 B.1.1.7 2021-06-01 Europe Switzerland Schwyz 2021-05-24 50 0.91 false +EPI_ISL_2544332 B.1.1.7 2021-06-03 Europe Switzerland Bern 2021-05-13 51 0.9 +EPI_ISL_2307766 B.1.1.7 2021-05-11 Europe Switzerland Bern 2021-05-08 52 0.89 +EPI_ISL_2375490 B.1.1.7 2021-05-10 Europe Switzerland Valais 2021-06-10 53 0.98 true +EPI_ISL_2374969 B.1.1.7 2021-05-18 Europe Switzerland Aargau 2021-06-05 54 0.97 false +EPI_ISL_2307888 B.1.1.7 2021-05-08 Europe Switzerland Solothurn 2021-05-23 55 0.96 false +EPI_ISL_2375247 B.1.1.7 2021-05-14 Europe Switzerland Sankt Gallen 2021-05-11 56 true +EPI_ISL_2308054 B.1.1.7 2021-05-07 Europe Switzerland Zürich 2020-12-09 57 0.94 +EPI_ISL_2375165 B.1.1.7 2021-05-18 Europe Switzerland Basel-Land 2020-10-28 58 0.93 true +EPI_ISL_2375097 B.1.1.7 2021-05-16 Europe Switzerland Basel-Land 2020-10-22 59 0.92 false +EPI_ISL_3128737 AY.9.2 2021-07-14 Europe Switzerland Zürich 2021-01-26 50 0.91 true +EPI_ISL_3128811 B.1.617.2 2021-07-19 Europe Switzerland Aargau 2021-01-20 50 0.9 false +EPI_ISL_3086369 AY.122 2021-07-14 Europe Switzerland Ticino 2020-12-14 51 0.89 +EPI_ISL_3259931 AY.43 2021-07-04 Europe Switzerland Vaud 2020-12-29 52 0.98 +EPI_ISL_3267832 AY.43 2021-07-29 Europe Switzerland Bern 2021-01-25 53 0.97 +EPI_ISL_3128796 B.1.617.2 2021-07-19 Europe Switzerland Zürich 2020-12-13 54 0.96 false +EPI_ISL_3016465 B.1.1.7 2021-07-05 Europe Switzerland Valais 2021-01-16 0.95 false +EPI_ISL_3247294 2021-07-15 Europe Switzerland Basel-Stadt 2020-12-25 56 0.94 true +EPI_ISL_3578231 P.1 2021-05-12 Europe Switzerland Zürich 2021-01-12 57 0.93 false +EPI_ISL_3465732 AY.43 2021-08-05 Europe Switzerland Vaud 2021-02-23 58 0.92 false +EPI_ISL_2367431 B.1 2020-03-16 Europe Switzerland Vaud 2021-01-22 59 0.91 true +EPI_ISL_3465556 AY.43 2021-08-04 Europe Switzerland Solothurn 2021-01-15 50 0.9 false +EPI_ISL_2359636 B.1.1.189 2021-02-03 Europe Switzerland Vaud 2020-11-04 57 0.89 false