From 23e909d7169a60ac91120e56fe03c28edaa1bc0c Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Mon, 17 Jun 2024 16:26:17 +0100 Subject: [PATCH] update for CNV:TR and optional leading phasing info --- inc/util/string_utils.hpp | 1 + inc/vcf/file_structure.hpp | 15 +- inc/vcf/optional_policy.hpp | 2 + inc/vcf/string_constants.hpp | 8 + src/vcf/record.cpp | 258 ++++++++++++++++-- src/vcf/validate_optional_policy.cpp | 87 +++++- .../v4.4/passed/passed_body_format.vcf | 2 +- test/vcf/metaentry_test.cpp | 131 ++++++++- test/vcf/optional_policy_test.cpp | 171 +++++++++++- test/vcf/predefined_format_tags_test.cpp | 14 + test/vcf/predefined_info_tags_test.cpp | 206 ++++++++++++++ 11 files changed, 863 insertions(+), 32 deletions(-) diff --git a/inc/util/string_utils.hpp b/inc/util/string_utils.hpp index 3e87285b..5726c5a4 100644 --- a/inc/util/string_utils.hpp +++ b/inc/util/string_utils.hpp @@ -32,6 +32,7 @@ namespace ebi /** * Splits `s` using `delims` as separator and fills the container `ret` with the parts. * An empty string results in an empty container `ret`. + * Expects a string without leading separators and when one is present, it would be part of 1st string. * @param s input string to split * @param delims any character here acts as a separator * @param ret return by reference the container filled with the string split. 
diff --git a/inc/vcf/file_structure.hpp b/inc/vcf/file_structure.hpp index 537109ff..b93a0534 100644 --- a/inc/vcf/file_structure.hpp +++ b/inc/vcf/file_structure.hpp @@ -205,7 +205,15 @@ namespace ebi { SVTYPE, { STRING, "1" } }, { VALIDATED, { FLAG, "0" } }, { THOUSAND_G, { FLAG, "0" } }, - { SVCLAIM, { STRING, "A" } } + { SVCLAIM, { STRING, "A" } }, + { RN, { INTEGER, "A" } }, + { RUS, { STRING, UNKNOWN_CARDINALITY } }, + { RUL, { INTEGER, UNKNOWN_CARDINALITY } }, + { RUC, { FLOAT, UNKNOWN_CARDINALITY } }, + { RB, { INTEGER, UNKNOWN_CARDINALITY } }, + { CIRUC, { FLOAT, UNKNOWN_CARDINALITY } }, + { CIRB, { INTEGER, UNKNOWN_CARDINALITY } }, + { RUB, { INTEGER, UNKNOWN_CARDINALITY } } }; const std::map> format_v41_v42 = { @@ -724,6 +732,11 @@ namespace ebi * @throw InfoBodyError */ void check_info_have_mandatory() const; + + /** + * gets total RN count + */ + int getRNvalue() const; }; std::ostream &operator<<(std::ostream &os, const Record &record); diff --git a/inc/vcf/optional_policy.hpp b/inc/vcf/optional_policy.hpp index 794420d2..d7af20e8 100644 --- a/inc/vcf/optional_policy.hpp +++ b/inc/vcf/optional_policy.hpp @@ -61,6 +61,8 @@ namespace ebi void check_body_entry_info_svlen(ParsingState & state, Record const & record) const; void check_body_entry_info_svclaim(ParsingState & state, Record const & record) const; void check_body_entry_info_confidence_interval(ParsingState & state, Record const & record) const; + void check_body_entry_info_rb_ruc(ParsingState & state, Record const & record) const; + void check_body_entry_info_rul_rus(ParsingState & state, Record const & record) const; void check_contig_meta(ParsingState & state, Record const & record) const; void check_alternate_allele_meta(ParsingState & state, Record const & record) const; void check_filter_meta(ParsingState & state, Record const & record) const; diff --git a/inc/vcf/string_constants.hpp b/inc/vcf/string_constants.hpp index 36103de6..2f5b542a 100644 --- a/inc/vcf/string_constants.hpp +++ 
b/inc/vcf/string_constants.hpp @@ -164,6 +164,14 @@ namespace ebi const std::string VALIDATED = "VALIDATED"; const std::string THOUSAND_G = "1000G"; const std::string SVCLAIM = "SVCLAIM"; + const std::string RN = "RN"; + const std::string RUS = "RUS"; + const std::string RUL = "RUL"; + const std::string RUC = "RUC"; + const std::string RB = "RB"; + const std::string CIRUC = "CIRUC"; + const std::string CIRB = "CIRB"; + const std::string RUB = "RUB"; // FORMAT predefined tags const std::string AHAP = "AHAP"; diff --git a/src/vcf/record.cpp b/src/vcf/record.cpp index 93cd8346..18e0c205 100644 --- a/src/vcf/record.cpp +++ b/src/vcf/record.cpp @@ -311,25 +311,42 @@ namespace ebi void Record::check_info_have_mandatory() const { static boost::regex deldup_regex("(<(DUP|DEL)(:[^>]+)*>)+"); + static boost::regex cnvtr_regex("()"); + bool svlencheck = false; + bool svclaimcheck = false; + bool cnvtrcheck = false; - if (source->version > Version::v43) { //not applicable for 4.1/2/3 - for (size_t i = 0; i < alternate_alleles.size(); ++i ) { - //SVLEN must be present for symbolic SV - auto & alternate = alternate_alleles[i]; - if (types[i] == RecordType::STRUCTURAL) { - if (info.find(SVLEN) == info.end()) { + if (source->version < Version::v44) { //not applicable for 4.1/2/3 + return; + } + for (size_t i = 0; i < alternate_alleles.size(); ++i ) { + //SVLEN must be present for symbolic SV + auto & alternate = alternate_alleles[i]; + if (types[i] == RecordType::STRUCTURAL) { + if (!svlencheck && info.find(SVLEN) == info.end()) { + std::stringstream message; + message << "INFO " << SVLEN << " must be present for symbolic structural variants"; + throw new InfoBodyError{line, message.str()}; + } + svlencheck = true; + + if (!svclaimcheck && boost::regex_match(alternate, deldup_regex)) { + //del/dup tags must have svclaim + if (info.find(SVCLAIM) == info.end()) { std::stringstream message; - message << "INFO " << SVLEN << " must be present for symbolic structural variants"; + 
message << "INFO " << SVCLAIM << " must be present for DEL/DUP"; throw new InfoBodyError{line, message.str()}; } - if (types[i] == RecordType::STRUCTURAL && boost::regex_match(alternate, deldup_regex)) { - //del/dup tags must have svclaim - if (info.find(SVCLAIM) == info.end()) { - std::stringstream message; - message << "INFO " << SVCLAIM << " must be present for DEL/DUP"; - throw new InfoBodyError{line, message.str()}; - } + svclaimcheck = true; + } + if (!cnvtrcheck && boost::regex_match(alternate, cnvtr_regex)) { + //cnv:tr must have either RUS/RUL + if (info.find(RUS) == info.end() && info.find(RUL) == info.end()) { + std::stringstream message; + message << "INFO " << RUS << " or " << RUL << " must be present for CNV:TR"; + throw new InfoBodyError{line, message.str()}; } + cnvtrcheck = true; } } } @@ -436,9 +453,12 @@ namespace ebi ErrorFix::RECOVERABLE_VALUE, field_key, expected}; } } - } else if (field_key == SVLEN && values.size() == alternate_alleles.size()) { + } else if (field_key == SVLEN) { if (source->version >= Version::v44) { - return; //no strict validation as val can be +/- ve and abs val is in use + return; //nothing to do as val can be +/- ve and abs val is in use + } + if (values.size() != alternate_alleles.size()) { //has unknown cardinality, not matching to allele count - nothing to do + return; } for (size_t i = 0; i < alternate_alleles.size(); i++) { if (check_alt_not_symbolic(i)) { @@ -476,11 +496,14 @@ namespace ebi ebi::util::print_container(message, PREDEFINED_INFO_SVTYPES, "", ", ", ""); throw new InfoBodyError{line, message.str(), "Found " + SVTYPE + " was '" + field_value + "'"}; } - } else if (source->version >= Version::v44 && field_key == SVCLAIM && values.size() == alternate_alleles.size()) { - //not applicable for anything < v4.4 + } else if (field_key == SVCLAIM) { + if (source->version < Version::v44) { //not applicable for anything < v4.4 + return; + } static boost::regex allele_regex("<(DUP|DEL|INS|INV|CNV)(:[^>]+)*>"); 
boost::cmatch pieces_match; - + + //number is A and will have matching number of entries as alleles for (size_t i = 0; i < alternate_alleles.size(); ++i) { std::string key = _OTHER; auto & allele = alternate_alleles[i]; @@ -508,6 +531,162 @@ namespace ebi BOOST_LOG_TRIVIAL(error) << "Invalid symbolic allele" << key << std::endl; } } + } else if (field_key == RUS) { //repeat unit sequence + if (source->version < Version::v44) { //not applicable for anything < v4.4 + return; + } + int rnCount = getRNvalue(); //get repeat no, it must match to RUS count + if (rnCount != values.size()) { + std::stringstream message; + message << "INFO " << RUS << " for record at " << line << " must have " << rnCount << " value(s)"; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"}; + } + //RUL - RUS matching check made below with RUL + } + else if (field_key == RUL) { //repeat unit length + if (source->version < Version::v44) { //not applicable for anything < v4.4 + return; + } + int rnCount = getRNvalue(); //get repeat no, it must match to RUL count + if (rnCount != values.size()) { + std::stringstream message; + message << "INFO " << RUL << " for record at " << line << " must have " << rnCount << " value(s)"; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"}; + } + auto itRUS = info.find(RUS); + if (itRUS != info.end() && itRUS->second != MISSING_VALUE) { //both RUL and RUS present, must match in count + std::vector RUSval; + util::string_split(itRUS->second, ",", RUSval); + if (values.size() != RUSval.size()) { + std::stringstream message; + message << "INFO " << RUL << " and " << RUS << " for record at " << line << " count must match"; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " and " + + std::to_string(RUSval.size()) + " value(s)"}; + } + for (int i = 0; i < values.size(); ++i) { + if (std::stoi(values[i]) != RUSval[i].length()) { 
//must match to RUS length + std::stringstream message; + message << "INFO " << RUL << " not matching to " << RUS << " for record at " << line; + throw new InfoBodyError{line, message.str(), "Found length " + values[i] + " for " + RUSval[i]}; + } + } + } + } + else if (field_key == RUC) { //repeat unit count + if (source->version < Version::v44) { //not applicable for anything < v4.4 + return; + } + int rnCount = getRNvalue(); //get repeat no, it must match to RUC count + if (rnCount != values.size()) { + std::stringstream message; + message << "INFO " << RUC << " for record at " << line << " must have " << rnCount << " value(s)"; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"}; + } + } + else if (field_key == RB) { //repeat bases + if (source->version < Version::v44) { //not applicable for anything < v4.4 + return; + } + int rnCount = getRNvalue(); //get repeat no, it must match to RB count + if (rnCount != values.size()) { + std::stringstream message; + message << "INFO " << RB << " for record at " << line << " must have " << rnCount << " value(s)"; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"}; + } + } + else if (field_key == CIRUC) { //conf.interval repeat unit count + if (source->version < Version::v44) { //not applicable for anything < v4.4 + return; + } + auto it = info.find(RUC); + if (it != info.end()) { + std::vector RUCval; + util::string_split(it->second, ",", RUCval); + if (values.size() != 2 * RUCval.size()) { //ciruc count must be 2 * RUC count + std::stringstream message; + message << "INFO " << CIRUC << " for record at " << line << " must have " << 2 * RUCval.size() << " value(s)"; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"}; + } + for (int i = 0; i < values.size(); ++i) { + if (RUCval[i / 2] == MISSING_VALUE) { + if (values[i] != MISSING_VALUE) { //ciruc must be missing with ruc 
missing + std::stringstream message; + message << "INFO " << CIRUC << " for record at " << line << " pos " << i+1 << " must be \'" << MISSING_VALUE << "\'"; + throw new InfoBodyError{line, message.str(), "Found " + values[i]}; + } + } + } + } else if (values.size()) { + //CIRUC values without RUC! + std::stringstream message; + message << "INFO " << CIRUC << " at " << line << " can not have values without " << RUC; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + "value(s)"}; + } + } + else if (field_key == CIRB) { + if (source->version < Version::v44) { //not applicable for anything < v4.4 + return; + } + auto it = info.find(RB); + if (it != info.end()) { + std::vector RBval; + util::string_split(it->second, ",", RBval); + if (values.size() != 2 * RBval.size()) { //cirb count must be 2 * RB count + std::stringstream message; + message << "INFO " << CIRB << " for record at " << line << " must have " << 2 * RBval.size() << " value(s)"; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"}; + } + for (int i = 0; i < values.size(); ++i) { + if (RBval[i / 2] == MISSING_VALUE) { + if (values[i] != MISSING_VALUE) { //cirb must be missing with RB missing + std::stringstream message; + message << "INFO " << CIRB << " for record at " << line << " pos " << i+1 << " must be \'" << MISSING_VALUE << "\'"; + throw new InfoBodyError{line, message.str(), "Found " + values[i]}; + } + } + } + } else if (values.size()) { + //CIRB values without RB! 
+ std::stringstream message; + message << "INFO " << CIRB << " at " << line << " can not have values without " << RB; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + "value(s)"}; + } + } + else if (field_key == RUB) { + if (source->version < Version::v44) { //not applicable for anything < v4.4 + return; + } + auto it = info.find(RUC); + if (it != info.end()) { + std::string message; + std::vector RUCval; + util::string_split(it->second, ",", RUCval); + int cnt = 0; + for (int i = 0; i < RUCval.size(); ++i) { //RUC must be integer with RUB + if (RUCval[i] == MISSING_VALUE) { + continue; + } + try { + check_value_type(INTEGER, RUCval[i], message); + } catch (const std::exception &typeError) { + std::stringstream message; + message << "INFO " << RUC << " for record at " << line << " must be integer with " + RUB; + throw new InfoBodyError{line, message.str()}; + } + cnt += std::stoi(RUCval[i]); + } + if (cnt != values.size()) { //RUB size must be sum(RUC[i]) + std::stringstream message; + message << "INFO " << RUB << " for record at " << line << " must have " << cnt << " value(s)"; + throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"}; + } + } + else { + //must be present + std::stringstream message; + message << "INFO " << RUB << " for record at " << line << " must have " + RUC; + throw new InfoBodyError{line, message.str()}; + } } } @@ -575,6 +754,14 @@ namespace ebi if (pos != std::string::npos) { GT_subfield = sample.substr(0, pos); } + //with v44, there can be optional leading phasing info, remove it and use + bool checkprefix = source->version < Version::v44? 
false : true; + if (checkprefix && !GT_subfield.empty()) { + if (GT_subfield.at(0) == '/' || GT_subfield.at(0) == '|') { + GT_subfield.erase(0,1); + } + } + return 1 + count_if(GT_subfield.begin(), GT_subfield.end(), [](char c) { return c == '/' || c == '|'; }); } else { BOOST_LOG_TRIVIAL(error) << "Cannot fetch ploidy from GT as GT is not present in the FORMAT"; @@ -591,6 +778,15 @@ namespace ebi // If the first format field is not a GT, then no alleles need to be checked if (format[0] == GT) { + //with v44, there can be optional leading phasing info, remove it and use + bool checkprefix = source->version < Version::v44? false : true; + if (checkprefix && !subfields.empty()) { + if (!subfields[0].empty()) { + if (subfields[0].at(0) == '/' || subfields[0].at(0) == '|') { + subfields[0].erase(0,1); + } + } + } check_sample_alleles(subfields); } @@ -853,7 +1049,8 @@ namespace ebi } void Record::check_field_integer_range(std::string const & field, std::vector const & values) const { - if (field == SVLEN || field == CIPOS || field == CIEND || field == CILEN || field == CICN || field == CICNADJ) { + if (field == SVLEN || field == CIPOS || field == CIEND || field == CILEN || field == CICN || field == CICNADJ || + field == CIRB) { // to ignore predefined tag fields which permit negative integral values return; } @@ -867,6 +1064,27 @@ namespace ebi } } + int Record::getRNvalue() const { + static boost::regex cnvtr_regex(""); + auto it = info.find(RN); + int rnCnt = 0; + std::vector values; + + if (it != info.end()) { //spilt RN field + util::string_split(it->second, ",", values); + } + + for (int i = 0; i < alternate_alleles.size(); ++i) { + if (values.size()) { //RN present + rnCnt += (values[i] == MISSING_VALUE) ? 
0 : std::stoi(values[i]); + } else if (types[i] == RecordType::STRUCTURAL && boost::regex_match(alternate_alleles[i], cnvtr_regex)) { + //CNV:TR with no RN, consider as 1 + rnCnt++; + } + } + return rnCnt; + } + bool is_record_subfield_in_header(std::string const & field_value, std::multimap::iterator begin, std::multimap::iterator end) diff --git a/src/vcf/validate_optional_policy.cpp b/src/vcf/validate_optional_policy.cpp index dcb055c3..c092db58 100644 --- a/src/vcf/validate_optional_policy.cpp +++ b/src/vcf/validate_optional_policy.cpp @@ -55,6 +55,12 @@ namespace ebi // SVCLAIM check check_body_entry_info_svclaim(state, record); + // RB RUC check + check_body_entry_info_rb_ruc(state, record); + + // RUL RUS check + check_body_entry_info_rul_rus(state, record); + // Confidence interval tags should have first value <=0 and second value >= 0 check_body_entry_info_confidence_interval(state, record); @@ -225,7 +231,7 @@ namespace ebi std::vector values; if (record.source->version < Version::v44) { - return; //svclaim not present for version < v43 + return; //svclaim not present for version < v44 } auto it = record.info.find(SVCLAIM); if (it == record.info.end()) { @@ -251,24 +257,89 @@ namespace ebi void ValidateOptionalPolicy::check_body_entry_info_confidence_interval(ParsingState & state, Record const & record) const { - std::vector confidence_interval_tags = { CICN, CICNADJ, CIEND, CILEN, CIPOS }; + std::vector confidence_interval_tags = { CICN, CICNADJ, CIEND, CILEN, CIPOS, CIRB, CIRUC }; for (auto & confidence_interval_tag : confidence_interval_tags) { auto it = record.info.find(confidence_interval_tag); if (it != record.info.end()) { std::vector values; util::string_split(it->second, ",", values); - size_t scanned_first_value_length, scanned_second_value_length; - int first_numeric_value = std::stoi(values[0], &scanned_first_value_length); - int second_numeric_value = std::stoi(values[1], &scanned_second_value_length); - if (first_numeric_value > 0 || 
second_numeric_value < 0 - || values[0].size() != scanned_first_value_length || values[1].size() != scanned_second_value_length) { + if (values.size() % 2 != 0) { //CI should have even count throw new InfoBodyError{state.n_lines, "INFO " + confidence_interval_tag + - " is a confidence interval tag, which should have first value <= 0 and second value >= 0"}; + " is a confidence interval tag, which should have even number entries"}; + } + for (int i = 0; i < values.size(); i += 2) { + size_t scanned_first_value_length = 1, scanned_second_value_length = 1; + //considers missing value as 0 - valid value + int first_numeric_value = std::stoi(values[i] != MISSING_VALUE ? values[i] : "0", &scanned_first_value_length); + int second_numeric_value = std::stoi(values[i + 1] != MISSING_VALUE ? values[i + 1] : "0", &scanned_second_value_length); + if (first_numeric_value > 0 || second_numeric_value < 0 + || values[i].size() != scanned_first_value_length || values[i + 1].size() != scanned_second_value_length) { + throw new InfoBodyError{state.n_lines, + "INFO " + confidence_interval_tag + + " is a confidence interval tag, which should have first value <= 0 and second value >= 0"}; + } } } } } + + void ValidateOptionalPolicy::check_body_entry_info_rb_ruc(ParsingState & state, Record const & record) const + { + std::vector valRB, valRUC, valLen; + int rb = 0, ruc = 0 , rul = 0; + const float limit = 0.05; //5% variation + + if (record.source->version < Version::v44) { + return; //not valid for version < v44 + } + auto itRB = record.info.find(RB); + auto itRUC = record.info.find(RUC); + auto itRUL = record.info.find(RUL); + auto itRUS = record.info.find(RUS); + if (itRB == record.info.end() || itRUC == record.info.end()) { + return; //nothing to check + } + util::string_split(itRB->second, ",", valRB); + util::string_split(itRUC->second, ",", valRUC); + if (itRUL != record.info.end()) { + util::string_split(itRUL->second, ",", valLen); + } else { + util::string_split(itRUS->second, 
",", valLen); + } + if (valRB.size() != valRUC.size() || valRB.size() != valLen.size()) { + return; //already checked in records + } + + for (size_t i = 0; i < valRB.size(); ++i) { + if (valRB[i] == MISSING_VALUE) { + continue; + } + rb = std::stoi(valRB[i]); + ruc = std::stoi(valRUC[i]); + rul = itRUL != record.info.end()? std::stoi(valLen[i]) : valLen[i].size(); + //RB ~= RUL * RUC + if ( (abs(rb - rul * ruc) / (float)rb) > limit) { + std::stringstream message; + message << "INFO " << "RB should be approximately RUC * unit_length"; + throw new InfoBodyError{record.line, message.str(), "Failed for position " + std::to_string(i)}; + } + } + } + + void ValidateOptionalPolicy::check_body_entry_info_rul_rus(ParsingState & state, Record const & record) const + { + if (record.source->version < Version::v44) { + return; //not valid for version < v44 + } + auto itRUL = record.info.find(RUL); + auto itRUS = record.info.find(RUS); + if (itRUS != record.info.end() && itRUL != record.info.end()) { //RUS, RUL together - redundant info + std::stringstream message; + message << "INFO " << "RUS and RUL present together, RUL can be avoided"; + throw new InfoBodyError{record.line, message.str()}; + } + } void ValidateOptionalPolicy::check_contig_meta(ParsingState & state, Record const & record) const { diff --git a/test/input_files/v4.4/passed/passed_body_format.vcf b/test/input_files/v4.4/passed/passed_body_format.vcf index 8b82ec3d..afe955b7 100644 --- a/test/input_files/v4.4/passed/passed_body_format.vcf +++ b/test/input_files/v4.4/passed/passed_body_format.vcf @@ -6,4 +6,4 @@ 1 400 rs182711216 C T 100 PASS AC=4 GT:G_S:GL 0|0:0.000:-0.18,-0.48,-2.49 0|0:0.000:-0.20,-0.44,-2.06 1 500 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL 0|0:0.000:-0.18,-0.48,-2.49 0|0:0.000:-0.20,-0.44,-2.06 1 600 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL |0|0:0.000:-0.18,-0.48,-2.49 |0|0:0.000:-0.20,-0.44,-2.06 -1 700 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL \0\0:0.000:-0.18,-0.48,-2.49 
|0|0:0.000:-0.20,-0.44,-2.06 +1 700 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL /0/0:0.000:-0.18,-0.48,-2.49 |0|0:0.000:-0.20,-0.44,-2.06 diff --git a/test/vcf/metaentry_test.cpp b/test/vcf/metaentry_test.cpp index fe700e58..3eff97d3 100644 --- a/test/vcf/metaentry_test.cpp +++ b/test/vcf/metaentry_test.cpp @@ -3234,7 +3234,136 @@ namespace ebi source }), vcf::MetaSectionError* ); - } + + CHECK_NOTHROW( (vcf::MetaEntry { //valid definition + 1, + vcf::INFO, + { {vcf::ID, vcf::RN}, {vcf::NUMBER, vcf::A}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Total num. of repeat seq. in this allele."} }, + source + } ) ); + + CHECK_THROWS_AS( (vcf::MetaEntry { //invalid number + 1, + vcf::INFO, + { {vcf::ID, vcf::RN}, {vcf::NUMBER, vcf::R}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Total num. of repeat seq. in this allele."} }, + source + }), + vcf::MetaSectionError* ); + + CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type + 1, + vcf::INFO, + { {vcf::ID, vcf::RN}, {vcf::NUMBER, vcf::A}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Total num. of repeat seq. 
in this allele."} }, + source + }), + vcf::MetaSectionError* ); + + CHECK_NOTHROW( (vcf::MetaEntry { //valid definition + 1, + vcf::INFO, + { {vcf::ID, vcf::RUS}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::STRING}, {vcf::DESCRIPTION, "A Repeat unit sequence"} }, + source + } ) ); + + CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type + 1, + vcf::INFO, + { {vcf::ID, vcf::RUS}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "A Repeat unit sequence"} }, + source + }), + vcf::MetaSectionError* ); + + CHECK_NOTHROW( (vcf::MetaEntry { //valid definition + 1, + vcf::INFO, + { {vcf::ID, vcf::RUL}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Length of repeating unit"} }, + source + } ) ); + + CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type + 1, + vcf::INFO, + { {vcf::ID, vcf::RUL}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Length of repeating unit"} }, + source + }), + vcf::MetaSectionError* ); + + CHECK_NOTHROW( (vcf::MetaEntry { //valid definition + 1, + vcf::INFO, + { {vcf::ID, vcf::RUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Count of repeating unit"} }, + source + } ) ); + + CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type + 1, + vcf::INFO, + { {vcf::ID, vcf::RUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Count of repeating unit"} }, + source + }), + vcf::MetaSectionError* ); + + CHECK_NOTHROW( (vcf::MetaEntry { //valid definition + 1, + vcf::INFO, + { {vcf::ID, vcf::RB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Total num. of bases in repeat seq."} }, + source + } ) ); + + CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type + 1, + vcf::INFO, + { {vcf::ID, vcf::RB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Total num. 
of bases in repeat seq."} }, + source + }), + vcf::MetaSectionError* ); + + CHECK_NOTHROW( (vcf::MetaEntry { //valid definition + 1, + vcf::INFO, + { {vcf::ID, vcf::CIRUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Confidence interval for RUC"} }, + source + } ) ); + + CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type + 1, + vcf::INFO, + { {vcf::ID, vcf::CIRUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Confidence interval for RUC"} }, + source + }), + vcf::MetaSectionError* ); + + CHECK_NOTHROW( (vcf::MetaEntry { //valid definition + 1, + vcf::INFO, + { {vcf::ID, vcf::CIRB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Confidence interval for RB"} }, + source + } ) ); + + CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type + 1, + vcf::INFO, + { {vcf::ID, vcf::CIRB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Confidence interval for RB"} }, + source + }), + vcf::MetaSectionError* ); + + CHECK_NOTHROW( (vcf::MetaEntry { //valid definition + 1, + vcf::INFO, + { {vcf::ID, vcf::RUB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Number of bases in repeat unit"} }, + source + } ) ); + + CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type + 1, + vcf::INFO, + { {vcf::ID, vcf::RUB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Number of bases in repeat unit"} }, + source + }), + vcf::MetaSectionError* ); + + } } TEST_CASE("SAMPLE MetaEntry checks", "[checks][keyvalue]") diff --git a/test/vcf/optional_policy_test.cpp b/test/vcf/optional_policy_test.cpp index dfd619de..383cd8cf 100644 --- a/test/vcf/optional_policy_test.cpp +++ b/test/vcf/optional_policy_test.cpp @@ -241,6 +241,16 @@ namespace ebi }, source }); + source->meta_entries.emplace(vcf::ALT, + vcf::MetaEntry{ + 1, + vcf::ALT, + { + { vcf::ID, "CNV:TR" }, + { 
vcf::DESCRIPTION, "Cnv-TR" } + }, + source + }); vcf::ParsingState parsing_state{source, vcf::AdditionalChecks()}; @@ -331,7 +341,6 @@ namespace ebi }, source }); - source->meta_entries.emplace(vcf::INFO, vcf::MetaEntry{ 1, @@ -431,6 +440,166 @@ namespace ebi source})), vcf::InfoBodyError*); } + + SECTION("CNV:TR test") + { + source->meta_entries.emplace(vcf::INFO, + vcf::MetaEntry{ + 1, + vcf::INFO, + { + { vcf::ID, vcf::SVLEN }, + { vcf::NUMBER, "A" }, + { vcf::TYPE, vcf::INTEGER }, + { vcf::DESCRIPTION, "Difference in length between REF and ALT alleles" } + }, + source + }); + source->meta_entries.emplace(vcf::INFO, + vcf::MetaEntry{ + 1, + vcf::INFO, + { + { vcf::ID, vcf::RN }, { vcf::NUMBER, "A" }, { vcf::TYPE, vcf::INTEGER }, { vcf::DESCRIPTION, "RN" } + }, + source + }); + source->meta_entries.emplace(vcf::INFO, + vcf::MetaEntry{ + 1, + vcf::INFO, + { + { vcf::ID, vcf::RUS }, { vcf::NUMBER, vcf::UNKNOWN_CARDINALITY }, { vcf::TYPE, vcf::STRING }, { vcf::DESCRIPTION, "RUS" } + }, + source + }); + source->meta_entries.emplace(vcf::INFO, + vcf::MetaEntry{ + 1, + vcf::INFO, + { + { vcf::ID, vcf::RUC }, { vcf::NUMBER, vcf::UNKNOWN_CARDINALITY }, { vcf::TYPE, vcf::FLOAT }, { vcf::DESCRIPTION, "RUC" } + }, + source + }); + source->meta_entries.emplace(vcf::INFO, + vcf::MetaEntry{ + 1, + vcf::INFO, + { + { vcf::ID, vcf::RUL }, { vcf::NUMBER, vcf::UNKNOWN_CARDINALITY }, { vcf::TYPE, vcf::INTEGER }, { vcf::DESCRIPTION, "RUL" } + }, + source + }); + source->meta_entries.emplace(vcf::INFO, + vcf::MetaEntry{ + 1, + vcf::INFO, + { + { vcf::ID, vcf::CIRUC }, { vcf::NUMBER, vcf::UNKNOWN_CARDINALITY }, { vcf::TYPE, vcf::FLOAT }, { vcf::DESCRIPTION, "CIRUC" } + }, + source + }); + source->meta_entries.emplace(vcf::INFO, + vcf::MetaEntry{ + 1, + vcf::INFO, + { + { vcf::ID, vcf::CIRB }, { vcf::NUMBER, vcf::UNKNOWN_CARDINALITY }, { vcf::TYPE, vcf::INTEGER }, { vcf::DESCRIPTION, "CIRB" } + }, + source + }); + + CHECK_THROWS_AS( 
(optional_policy.optional_check_body_entry(parsing_state, vcf::Record{ //warning as RUS RUL together is redundant + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RUS, "AT"}, {vcf::RUC, "2.0"}, {vcf::RB, "4"}, {vcf::RUL, "2"}}, + { vcf::GT }, + { "0|1" }, + source})), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (optional_policy.optional_check_body_entry(parsing_state, vcf::Record{ //invalid CIRB + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RUC, "2.0"}, {vcf::RB, "4"}, {vcf::RUL, "2"}, {vcf::CIRB, "1,0"}}, + { vcf::GT }, + { "0|1" }, + source})), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (optional_policy.optional_check_body_entry(parsing_state, vcf::Record{ //invalid CIRUC + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RUC, "2.0"}, {vcf::RUL, "2"}, {vcf::CIRUC, "-1,-1"}}, + { vcf::GT }, + { "0|1" }, + source})), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (optional_policy.optional_check_body_entry(parsing_state, vcf::Record{ //invalid CIRUC 2 + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RN, "2"}, {vcf::RUC, "2.0,2"}, {vcf::RUL, "2,2"}, {vcf::CIRUC, "-1,1,1,1"}}, + { vcf::GT }, + { "0|1" }, + source})), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (optional_policy.optional_check_body_entry(parsing_state, vcf::Record{ //invalid CIRUC + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RN, "2"}, {vcf::RUL, "2,2"}, {vcf::CIRUC, "-1,1,1,1"}}, + { vcf::GT }, + { "0|1" }, + source})), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (optional_policy.optional_check_body_entry(parsing_state, vcf::Record{ //invalid CIRB + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RUC, "2.0"}, {vcf::RUL, "2"}, {vcf::CIRB, 
"1,0"}}, + { vcf::GT }, + { "0|1" }, + source})), + vcf::InfoBodyError*); + } } TEST_CASE("Alternate allele warnings", "[body alt warnings]") diff --git a/test/vcf/predefined_format_tags_test.cpp b/test/vcf/predefined_format_tags_test.cpp index 1152b1f4..2de64ce6 100644 --- a/test/vcf/predefined_format_tags_test.cpp +++ b/test/vcf/predefined_format_tags_test.cpp @@ -1020,6 +1020,20 @@ namespace ebi { "1:1.3,2.4" }, source})); + CHECK_NOTHROW( (vcf::Record{ //valid with leading phasing info + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "AT" }, + 1.0, + { vcf::PASS }, + { {vcf::AA, "243"} }, + { vcf::GT}, + { "/1" }, + source})); + CHECK_NOTHROW( (vcf::Record{ 1, "chr1", diff --git a/test/vcf/predefined_info_tags_test.cpp b/test/vcf/predefined_info_tags_test.cpp index e51989cb..aad832c2 100644 --- a/test/vcf/predefined_info_tags_test.cpp +++ b/test/vcf/predefined_info_tags_test.cpp @@ -2358,6 +2358,212 @@ namespace ebi { "0|1" }, source}), vcf::InfoBodyError*); + + CHECK_NOTHROW( (vcf::Record{ //valid + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RN, "1"}, {vcf::RUS, "AT"}, {vcf::RUC, "2.0"}, {vcf::RB, "4"}, {vcf::CIRUC, "-1,1"}, {vcf::CIRB, "-1,1"}}, + { vcf::GT }, + { "0|1" }, + source}) ); + + CHECK_NOTHROW( (vcf::Record{ //valid with non-cnv:tr as well + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "T" }, + 1.0, + { vcf::PASS }, + { {vcf::RN, "1"}, {vcf::RUS, "AT"}, {vcf::RUC, "2.0"}, {vcf::RB, "4"}}, + { vcf::GT }, + { "0|1" }, + source}) ); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as RUS not matching to assumed RN = 1 + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RUS, "AT,CG"}, {vcf::RUC, "2.0"}, {vcf::RB, "4"}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as RUS not matching to RUL + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS 
}, + { {vcf::SVLEN, "1"}, {vcf::RUS, "AT"}, {vcf::RUC, "2.0"}, {vcf::RB, "4"}, {vcf::RUL, "3"}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as RUS and RUL missing + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RUC, "2.0"}, {vcf::RB, "4"}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as CIRB not matching to RB + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RUC, "2.0"}, {vcf::RB, "4"}, {vcf::RUL, "2"}, {vcf::CIRB, "-1,0,-1,1"}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as CIRUC not matching to RUC + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RUC, "2.0"}, {vcf::RUL, "2"}, {vcf::CIRUC, "-1,0,-1,1"}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as RUC has missing val and not so in CIRUC + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RN, "3"}, {vcf::RUC, "2.0,.,1.5"}, {vcf::RUL, "2,10,2"}, {vcf::CIRUC, "-1,1,-1,0,-1,."}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as RB has missing val and not so in CIRB + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RN, "3"}, {vcf::RUC, "2.0,.,2.05"}, {vcf::RUL, "2,10,2"}, {vcf::RB, "4,.,4"}, {vcf::CIRB, "-1,1,-1,.,-1,."}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_NOTHROW( (vcf::Record{ //valid with RUB, RUC + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RN, "3"}, {vcf::RUS, "AT,TTG,CA"}, {vcf::RUC, 
"2,3,2"}, {vcf::RUB, "2,2,3,3,3,2,2"}}, + { vcf::GT }, + { "0|1" }, + source}) ); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as RUC is missing with RUB + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RN, "3"}, {vcf::RUL, "2,3,2"}, {vcf::RUB, "2,2,3,3,3,2,2"}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as RUC unmatched to RUB + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RN, "3"}, {vcf::RUL, "2,3,2"}, {vcf::RUB, "2,3,3,3,2,2"}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_THROWS_AS( (vcf::Record{ //invalid as RUC is not integer + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1"}, {vcf::RN, "3"}, {vcf::RUS, "AT,TTG,CA"}, {vcf::RUC, "2.2,3,2"}, {vcf::RUB, "2,2,3,3,3,2,2"}}, + { vcf::GT }, + { "0|1" }, + source}), + vcf::InfoBodyError*); + + CHECK_NOTHROW( (vcf::Record{ //RN with missing data + 1, + "chr1", + 123456, + { "id123" }, + "A", + { "","T","" }, + 1.0, + { vcf::PASS }, + { {vcf::SVLEN, "1,.,1"}, {vcf::SVCLAIM, "D,.,D"}, {vcf::RN, ".,3,1"}, {vcf::RUS, "AT,TTG,CA,TAC"}, {vcf::RUC, "2,3,2,2"}, {vcf::RUB, "2,2,3,3,3,2,2,3,3"}}, + { vcf::GT }, + { "0|1" }, + source}) ); } }