Skip to content

Commit

Permalink
fixup! fix: correctly escape quotes in field names
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Sep 30, 2024
1 parent 2643a60 commit 5d908b2
Show file tree
Hide file tree
Showing 13 changed files with 882 additions and 70 deletions.
2 changes: 2 additions & 0 deletions include/silo/preprocessing/identifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class Identifier {
const std::string& getRawIdentifier() const;

std::string escape() const;

bool operator==(const Identifier& other) const;
};

} // namespace silo::preprocessing
3 changes: 2 additions & 1 deletion include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "silo/config/database_config.h"
#include "silo/config/preprocessing_config.h"
#include "silo/database.h"
#include "silo/preprocessing/identifier.h"
#include "silo/preprocessing/identifiers.h"
#include "silo/preprocessing/preprocessing_database.h"
#include "silo/preprocessing/validated_ndjson_file.h"
Expand Down Expand Up @@ -58,7 +59,7 @@ class Preprocessor {
void buildMetadataTableFromFile(const std::filesystem::path& metadata_filename);

void buildPartitioningTable();
void buildPartitioningTableByColumn(const std::string& partition_by_field);
void buildPartitioningTableByColumn(const Identifier& partition_by_field);
void buildEmptyPartitioning();

template <typename SymbolType>
Expand Down
4 changes: 4 additions & 0 deletions src/silo/preprocessing/identifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,8 @@ std::string Identifier::escape() const {
return escapeIdentifier(raw_identifier);
}

bool Identifier::operator==(const Identifier& other) const {
return raw_identifier == other.raw_identifier;
}

} // namespace silo::preprocessing
64 changes: 0 additions & 64 deletions src/silo/preprocessing/metadata_info.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,67 +35,3 @@ TEST(
silo::preprocessing::PreprocessingException
);
}

TEST(MetadataInfo, isValidMedataFileShouldReturnTrueWithValidMetadataFile) {
const silo::config::DatabaseConfig valid_config{
.default_nucleotide_sequence = "main",
.schema =
{
.instance_name = "testInstanceName",
.metadata =
{
{.name = "gisaid_epi_isl", .type = silo::config::ValueType::STRING},
{.name = "pango_lineage", .type = silo::config::ValueType::PANGOLINEAGE},
{.name = "date", .type = silo::config::ValueType::DATE},
{.name = "country", .type = silo::config::ValueType::STRING},
},
.primary_key = "gisaid_epi_isl",
}
};

const auto raw_fields =
silo::preprocessing::MetadataInfo::getMetadataFields(valid_config).getRawIdentifierStrings();
ASSERT_TRUE(std::ranges::find(raw_fields, "gisaid_epi_isl") != raw_fields.end());
ASSERT_TRUE(std::ranges::find(raw_fields, "pango_lineage") != raw_fields.end());
ASSERT_TRUE(std::ranges::find(raw_fields, "date") != raw_fields.end());
ASSERT_TRUE(std::ranges::find(raw_fields, "country") != raw_fields.end());

const auto escaped_fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config)
.getEscapedIdentifierStrings();
ASSERT_TRUE(std::ranges::find(escaped_fields, R"("gisaid_epi_isl")") != escaped_fields.end());
ASSERT_TRUE(std::ranges::find(escaped_fields, R"("pango_lineage")") != escaped_fields.end());
ASSERT_TRUE(std::ranges::find(escaped_fields, R"("date")") != escaped_fields.end());
ASSERT_TRUE(std::ranges::find(escaped_fields, R"("country")") != escaped_fields.end());
}

TEST(MetadataInfo, shouldValidateCorrectNdjsonInputFile) {
const silo::config::DatabaseConfig valid_config{
.default_nucleotide_sequence = "main",
.schema =
{
.instance_name = "testInstanceName",
.metadata =
{
{.name = "gisaid_epi_isl", .type = silo::config::ValueType::STRING},
{.name = "pango_lineage", .type = silo::config::ValueType::PANGOLINEAGE},
{.name = "date", .type = silo::config::ValueType::DATE},
{.name = "country", .type = silo::config::ValueType::STRING},
},
.primary_key = "gisaid_epi_isl",
}
};

const auto raw_fields =
silo::preprocessing::MetadataInfo::getMetadataFields(valid_config).getRawIdentifierStrings();
ASSERT_TRUE(std::ranges::find(raw_fields, "gisaid_epi_isl") != raw_fields.end());
ASSERT_TRUE(std::ranges::find(raw_fields, "pango_lineage") != raw_fields.end());
ASSERT_TRUE(std::ranges::find(raw_fields, "date") != raw_fields.end());
ASSERT_TRUE(std::ranges::find(raw_fields, "country") != raw_fields.end());

const auto escaped_fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config)
.getEscapedIdentifierStrings();
ASSERT_TRUE(std::ranges::find(escaped_fields, R"("gisaid_epi_isl")") != escaped_fields.end());
ASSERT_TRUE(std::ranges::find(escaped_fields, R"("pango_lineage")") != escaped_fields.end());
ASSERT_TRUE(std::ranges::find(escaped_fields, R"("date")") != escaped_fields.end());
ASSERT_TRUE(std::ranges::find(escaped_fields, R"("country")") != escaped_fields.end());
}
10 changes: 5 additions & 5 deletions src/silo/preprocessing/preprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,14 +259,14 @@ void Preprocessor::buildPartitioningTable() {
"preprocessing - partitioning input by metadata key '{}'",
database_config.schema.partition_by.value()
);
buildPartitioningTableByColumn(database_config.schema.partition_by.value());
buildPartitioningTableByColumn(Identifier{database_config.schema.partition_by.value()});
} else {
SPDLOG_DEBUG("preprocessing - no metadata key for partitioning provided");
buildEmptyPartitioning();
}
}

void Preprocessor::buildPartitioningTableByColumn(const std::string& partition_by_field) {
void Preprocessor::buildPartitioningTableByColumn(const Identifier& partition_by_field) {
SPDLOG_DEBUG("preprocessing - calculating partitions");

(void)preprocessing_db.query(fmt::format(
Expand All @@ -278,7 +278,7 @@ FROM (SELECT {} AS partition_key, COUNT(*) AS count
GROUP BY partition_key
ORDER BY partition_key);
)-",
makeNonNullKey(partition_by_field)
makeNonNullKey(partition_by_field.escape())
));

// create Recursive Hierarchical Partitioning By Partition Field
Expand Down Expand Up @@ -332,8 +332,8 @@ AND partition_keys.partition_key = 'NULL'))
AND partition_keys.id >= partitioning.from_id
AND partition_keys.id <= partitioning.to_id;
)-",
makeNonNullKey(partition_by_field),
partition_by_field
makeNonNullKey(partition_by_field.escape()),
partition_by_field.escape()
));
}

Expand Down
26 changes: 26 additions & 0 deletions src/silo/preprocessing/preprocessor.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,31 @@ const Scenario TSV_FILE_WITH_QUOTE_IN_FIELD_NAME = {
)
};

const Scenario TSV_FILE_WITH_QUOTE_IN_PARTITION_BY = {
.input_directory = "testBaseData/tsvWithQuoteInPartitionBy/",
.expected_sequence_count = 100,
.query = R"(
{
"action": {
"type": "Aggregated",
"groupByFields": ["pango_\"lineage"],
"orderByFields": ["pango_\"lineage"],
"limit": 3
},
"filterExpression": {
"type": "True"
}
}
)",
.expected_query_result = nlohmann::json::parse(
R"([
{"count":1,"pango_\"lineage":null},
{"count":1,"pango_\"lineage":"AY.122"},
{"count":4,"pango_\"lineage":"AY.43"}
])"
)
};

const Scenario EMPTY_INPUT_TSV = {
.input_directory = "testBaseData/emptyInputTsv/",
.expected_sequence_count = 0,
Expand Down Expand Up @@ -313,6 +338,7 @@ INSTANTIATE_TEST_SUITE_P(
NDJSON_WITH_SQL_KEYWORD_AS_FIELD,
TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD,
TSV_FILE_WITH_QUOTE_IN_FIELD_NAME,
TSV_FILE_WITH_QUOTE_IN_PARTITION_BY,
NDJSON_WITH_NUMERIC_NAMES,
EMPTY_INPUT_TSV,
EMPTY_INPUT_NDJSON,
Expand Down
101 changes: 101 additions & 0 deletions testBaseData/tsvWithQuoteInPartitionBy/aa_insertions.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
gisaid_epi_isl
EPI_ISL_1408408
EPI_ISL_1749899
EPI_ISL_2016901
EPI_ISL_1749892
EPI_ISL_1597932
EPI_ISL_1407962
EPI_ISL_1750503
EPI_ISL_1360935
EPI_ISL_2019235
EPI_ISL_1749960
EPI_ISL_1361468
EPI_ISL_1408062
EPI_ISL_1597890
EPI_ISL_1682849
EPI_ISL_1408805
EPI_ISL_1750868
EPI_ISL_2019350
EPI_ISL_2017036
EPI_ISL_1599113
EPI_ISL_2214128
EPI_ISL_2408472
EPI_ISL_830864
EPI_ISL_581968
EPI_ISL_2213804
EPI_ISL_2405276
EPI_ISL_2213934
EPI_ISL_2213984
EPI_ISL_2574088
EPI_ISL_2544226
EPI_ISL_2360326
EPI_ISL_2379651
EPI_ISL_1036103
EPI_ISL_931279
EPI_ISL_931031
EPI_ISL_1273458
EPI_ISL_1273715
EPI_ISL_737604
EPI_ISL_1129663
EPI_ISL_1003629
EPI_ISL_737715
EPI_ISL_1003036
EPI_ISL_899762
EPI_ISL_899725
EPI_ISL_1195052
EPI_ISL_1003519
EPI_ISL_1003010
EPI_ISL_1119584
EPI_ISL_1002052
EPI_ISL_466942
EPI_ISL_1003849
EPI_ISL_768148
EPI_ISL_1080536
EPI_ISL_1002156
EPI_ISL_1119315
EPI_ISL_1004495
EPI_ISL_1001920
EPI_ISL_1131102
EPI_ISL_1003373
EPI_ISL_721941
EPI_ISL_1130868
EPI_ISL_1003425
EPI_ISL_737860
EPI_ISL_1001493
EPI_ISL_1260480
EPI_ISL_1747885
EPI_ISL_1747752
EPI_ISL_1005148
EPI_ISL_1748243
EPI_ISL_1748215
EPI_ISL_1748395
EPI_ISL_1760534
EPI_ISL_2086867
EPI_ISL_1840634
EPI_ISL_2180995
EPI_ISL_2181005
EPI_ISL_2180023
EPI_ISL_2270139
EPI_ISL_2544452
EPI_ISL_2544332
EPI_ISL_2307766
EPI_ISL_2375490
EPI_ISL_2374969
EPI_ISL_2307888
EPI_ISL_2375247
EPI_ISL_2308054
EPI_ISL_2375165
EPI_ISL_2375097
EPI_ISL_3128737
EPI_ISL_3128811
EPI_ISL_3086369
EPI_ISL_3259931
EPI_ISL_3267832
EPI_ISL_3128796
EPI_ISL_3016465
EPI_ISL_3247294
EPI_ISL_3578231
EPI_ISL_3465732
EPI_ISL_2367431
EPI_ISL_3465556
EPI_ISL_2359636
29 changes: 29 additions & 0 deletions testBaseData/tsvWithQuoteInPartitionBy/database_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
schema:
instanceName: sars_cov-2_minimal_test_config
metadata:
- name: gisaid_epi_isl
type: string
- name: date
type: date
- name: unsorted_date
type: date
- name: region
type: string
generateIndex: true
- name: country
type: string
generateIndex: true
- name: 'pango_"lineage'
type: pango_lineage
- name: division
type: string
generateIndex: true
- name: age
type: int
- name: qc_value
type: float
- name: test_boolean_column
type: boolean
primaryKey: gisaid_epi_isl
dateToSortBy: date
partitionBy: 'pango_"lineage'
Loading

0 comments on commit 5d908b2

Please sign in to comment.