Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: correctly escape quotes in field names #596

Merged
merged 2 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/silo/preprocessing/identifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class Identifier {
const std::string& getRawIdentifier() const;

std::string escape() const;

bool operator==(const Identifier& other) const;
};

} // namespace silo::preprocessing
7 changes: 7 additions & 0 deletions include/silo/preprocessing/identifiers.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,15 @@ class Identifiers {
std::vector<Identifier> identifiers;

public:
explicit Identifiers() = default;

Identifiers(const std::vector<std::string>& raw_identifiers);

template <typename T>
void addIdentifier(T&& identifier) {
identifiers.emplace_back(std::forward<T>(identifier));
}

Identifiers prefix(const std::string& prefix) const;

size_t size() const;
Expand Down
3 changes: 2 additions & 1 deletion include/silo/preprocessing/metadata_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <vector>

#include "silo/config/database_config.h"
#include "silo/preprocessing/identifiers.h"

namespace silo::preprocessing {

Expand All @@ -21,7 +22,7 @@ class MetadataInfo {
const silo::config::DatabaseConfig& database_config
);

static std::vector<std::string> getMetadataFields(
static silo::preprocessing::Identifiers getMetadataFields(
const silo::config::DatabaseConfig& database_config
);

Expand Down
3 changes: 2 additions & 1 deletion include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "silo/config/database_config.h"
#include "silo/config/preprocessing_config.h"
#include "silo/database.h"
#include "silo/preprocessing/identifier.h"
#include "silo/preprocessing/identifiers.h"
#include "silo/preprocessing/preprocessing_database.h"
#include "silo/preprocessing/validated_ndjson_file.h"
Expand Down Expand Up @@ -58,7 +59,7 @@ class Preprocessor {
void buildMetadataTableFromFile(const std::filesystem::path& metadata_filename);

void buildPartitioningTable();
void buildPartitioningTableByColumn(const std::string& partition_by_field);
void buildPartitioningTableByColumn(const Identifier& partition_by_field);
void buildEmptyPartitioning();

template <typename SymbolType>
Expand Down
4 changes: 4 additions & 0 deletions src/silo/preprocessing/identifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,8 @@ std::string Identifier::escape() const {
return escapeIdentifier(raw_identifier);
}

bool Identifier::operator==(const Identifier& other) const {
return raw_identifier == other.raw_identifier;
}

} // namespace silo::preprocessing
17 changes: 7 additions & 10 deletions src/silo/preprocessing/metadata_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@ void MetadataInfo::validateNdjsonFile(

auto result = connection.Query(fmt::format(
"SELECT metadata.* "
"FROM read_json_auto(\"{}\") LIMIT 0; ",
ndjson_file.string()
"FROM read_json_auto({}) LIMIT 0; ",
Identifier{ndjson_file.string()}.escape()
));

if (result->HasError()) {
Expand Down Expand Up @@ -157,13 +157,10 @@ void MetadataInfo::validateNdjsonFile(
}
}

std::vector<std::string> MetadataInfo::getMetadataFields(
const silo::config::DatabaseConfig& database_config
) {
std::vector<std::string> ret;
ret.reserve(database_config.schema.metadata.size());
Identifiers MetadataInfo::getMetadataFields(const silo::config::DatabaseConfig& database_config) {
Identifiers ret;
for (const auto& field : database_config.schema.metadata) {
ret.push_back("\"" + field.name + "\"");
ret.addIdentifier(field.name);
}
return ret;
}
Expand All @@ -174,7 +171,7 @@ std::vector<std::string> MetadataInfo::getMetadataSQLTypes(
std::vector<std::string> ret;
ret.reserve(database_config.schema.metadata.size());
for (const auto& field : database_config.schema.metadata) {
ret.push_back(fmt::format("\"{}\" {}", field.name, toSQLType(field.type)));
ret.push_back(fmt::format("{} {}", Identifier{field.name}.escape(), toSQLType(field.type)));
}
return ret;
}
Expand All @@ -185,7 +182,7 @@ std::vector<std::string> MetadataInfo::getMetadataSelects(
std::vector<std::string> ret;
ret.reserve(database_config.schema.metadata.size());
for (const auto& field : database_config.schema.metadata) {
ret.push_back(fmt::format(R"( "metadata"."{0}" AS "{0}")", field.name));
ret.push_back(fmt::format(R"( "metadata".{0} AS {0})", Identifier{field.name}.escape()));
}
return ret;
}
Expand Down
49 changes: 0 additions & 49 deletions src/silo/preprocessing/metadata_info.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,52 +35,3 @@ TEST(
silo::preprocessing::PreprocessingException
);
}

TEST(MetadataInfo, isValidMedataFileShouldReturnTrueWithValidMetadataFile) {
const silo::config::DatabaseConfig valid_config{
.default_nucleotide_sequence = "main",
.schema =
{
.instance_name = "testInstanceName",
.metadata =
{
{.name = "gisaid_epi_isl", .type = silo::config::ValueType::STRING},
{.name = "pango_lineage", .type = silo::config::ValueType::PANGOLINEAGE},
{.name = "date", .type = silo::config::ValueType::DATE},
{.name = "country", .type = silo::config::ValueType::STRING},
},
.primary_key = "gisaid_epi_isl",
}
};

const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config);
ASSERT_TRUE(std::ranges::find(fields, R"("gisaid_epi_isl")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("pango_lineage")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("date")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("country")") != fields.end());
}

TEST(MetadataInfo, shouldValidateCorrectNdjsonInputFile) {
const silo::config::DatabaseConfig valid_config{
.default_nucleotide_sequence = "main",
.schema =
{
.instance_name = "testInstanceName",
.metadata =
{
{.name = "gisaid_epi_isl", .type = silo::config::ValueType::STRING},
{.name = "pango_lineage", .type = silo::config::ValueType::PANGOLINEAGE},
{.name = "date", .type = silo::config::ValueType::DATE},
{.name = "country", .type = silo::config::ValueType::STRING},
},
.primary_key = "gisaid_epi_isl",
}
};

const auto fields = silo::preprocessing::MetadataInfo::getMetadataFields(valid_config);

ASSERT_TRUE(std::ranges::find(fields, R"("gisaid_epi_isl")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("pango_lineage")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("date")") != fields.end());
ASSERT_TRUE(std::ranges::find(fields, R"("country")") != fields.end());
}
14 changes: 8 additions & 6 deletions src/silo/preprocessing/preprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,9 @@ void Preprocessor::buildMetadataTableFromFile(const std::filesystem::path& metad
(void)preprocessing_db.query(fmt::format(
"INSERT INTO metadata_table BY NAME (SELECT {} FROM read_csv_auto('{}', delim = '\t', "
"header = true));",
boost::join(MetadataInfo::getMetadataFields(database_config), ","),
boost::join(
MetadataInfo::getMetadataFields(database_config).getEscapedIdentifierStrings(), ","
),
metadata_filename.string()
));
}
Expand All @@ -257,14 +259,14 @@ void Preprocessor::buildPartitioningTable() {
"preprocessing - partitioning input by metadata key '{}'",
database_config.schema.partition_by.value()
);
buildPartitioningTableByColumn(database_config.schema.partition_by.value());
buildPartitioningTableByColumn(Identifier{database_config.schema.partition_by.value()});
} else {
SPDLOG_DEBUG("preprocessing - no metadata key for partitioning provided");
buildEmptyPartitioning();
}
}

void Preprocessor::buildPartitioningTableByColumn(const std::string& partition_by_field) {
void Preprocessor::buildPartitioningTableByColumn(const Identifier& partition_by_field) {
SPDLOG_DEBUG("preprocessing - calculating partitions");

(void)preprocessing_db.query(fmt::format(
Expand All @@ -276,7 +278,7 @@ FROM (SELECT {} AS partition_key, COUNT(*) AS count
GROUP BY partition_key
ORDER BY partition_key);
)-",
makeNonNullKey(partition_by_field)
makeNonNullKey(partition_by_field.escape())
));

// create Recursive Hierarchical Partitioning By Partition Field
Expand Down Expand Up @@ -330,8 +332,8 @@ AND partition_keys.partition_key = 'NULL'))
AND partition_keys.id >= partitioning.from_id
AND partition_keys.id <= partitioning.to_id;
)-",
makeNonNullKey(partition_by_field),
partition_by_field
makeNonNullKey(partition_by_field.escape()),
partition_by_field.escape()
));
}

Expand Down
51 changes: 51 additions & 0 deletions src/silo/preprocessing/preprocessor.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,55 @@ const Scenario TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD = {
.expected_query_result = NDJSON_WITH_SQL_KEYWORD_AS_FIELD.expected_query_result
};

const Scenario TSV_FILE_WITH_QUOTE_IN_FIELD_NAME = {
.input_directory = "testBaseData/tsvWithQuoteInFieldName/",
.expected_sequence_count = 2,
.query = R"(
{
"action": {
"type": "Aggregated",
"groupByFields": ["x\"y"],
"orderByFields": ["x\"y"]
},
"filterExpression": {
"type": "StringEquals",
"column": "x\"y",
"value": "a"
}
}
)",
.expected_query_result = nlohmann::json::parse(
R"([
{"count": 1, "x\"y": "a"}
])"
)
};

const Scenario TSV_FILE_WITH_QUOTE_IN_PARTITION_BY = {
.input_directory = "testBaseData/tsvWithQuoteInPartitionBy/",
.expected_sequence_count = 100,
.query = R"(
{
"action": {
"type": "Aggregated",
"groupByFields": ["pango_\"lineage"],
"orderByFields": ["pango_\"lineage"],
"limit": 3
},
"filterExpression": {
"type": "True"
}
}
)",
.expected_query_result = nlohmann::json::parse(
R"([
{"count":1,"pango_\"lineage":null},
{"count":1,"pango_\"lineage":"AY.122"},
{"count":4,"pango_\"lineage":"AY.43"}
])"
)
};

const Scenario EMPTY_INPUT_TSV = {
.input_directory = "testBaseData/emptyInputTsv/",
.expected_sequence_count = 0,
Expand Down Expand Up @@ -288,6 +337,8 @@ INSTANTIATE_TEST_SUITE_P(
SAM_FILES,
NDJSON_WITH_SQL_KEYWORD_AS_FIELD,
TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD,
TSV_FILE_WITH_QUOTE_IN_FIELD_NAME,
TSV_FILE_WITH_QUOTE_IN_PARTITION_BY,
NDJSON_WITH_NUMERIC_NAMES,
EMPTY_INPUT_TSV,
EMPTY_INPUT_NDJSON,
Expand Down
1 change: 1 addition & 0 deletions testBaseData/tsvWithQuoteInFieldName/aa_insertions.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
id
8 changes: 8 additions & 0 deletions testBaseData/tsvWithQuoteInFieldName/database_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
schema:
instanceName: Test
metadata:
- name: id
type: string
- name: 'x"y'
type: string
primaryKey: id
3 changes: 3 additions & 0 deletions testBaseData/tsvWithQuoteInFieldName/metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id "x""y"
1 a
2 b
1 change: 1 addition & 0 deletions testBaseData/tsvWithQuoteInFieldName/nuc_insertions.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
id
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
metadataFilename: "metadata.tsv"
referenceGenomeFilename: "reference_genomes.json"
4 changes: 4 additions & 0 deletions testBaseData/tsvWithQuoteInFieldName/reference_genomes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"nucleotideSequences": [],
"genes": []
}
Loading