Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GA4GHTT-270: changes for CNV:TR and optional leading phasing info #249

Merged
merged 1 commit into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions inc/util/string_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ namespace ebi
/**
* Splits `s` using `delims` as separator and fills the container `ret` with the parts.
* An empty string results in an empty container `ret`.
* Expects a string without a leading separator; if one is present, it is kept as part of the first string.
* @param s input string to split
* @param delims any character here acts as a separator
* @param ret return by reference the container filled with the string split.
Expand Down
15 changes: 14 additions & 1 deletion inc/vcf/file_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,15 @@ namespace ebi
{ SVTYPE, { STRING, "1" } },
{ VALIDATED, { FLAG, "0" } },
{ THOUSAND_G, { FLAG, "0" } },
{ SVCLAIM, { STRING, "A" } }
{ SVCLAIM, { STRING, "A" } },
{ RN, { INTEGER, "A" } },
{ RUS, { STRING, UNKNOWN_CARDINALITY } },
{ RUL, { INTEGER, UNKNOWN_CARDINALITY } },
{ RUC, { FLOAT, UNKNOWN_CARDINALITY } },
{ RB, { INTEGER, UNKNOWN_CARDINALITY } },
{ CIRUC, { FLOAT, UNKNOWN_CARDINALITY } },
{ CIRB, { INTEGER, UNKNOWN_CARDINALITY } },
{ RUB, { INTEGER, UNKNOWN_CARDINALITY } }
};

const std::map<std::string, std::pair<std::string, std::string>> format_v41_v42 = {
Expand Down Expand Up @@ -724,6 +732,11 @@ namespace ebi
* @throw InfoBodyError
*/
void check_info_have_mandatory() const;

/**
* gets total RN count
*/
int getRNvalue() const;
};

std::ostream &operator<<(std::ostream &os, const Record &record);
Expand Down
2 changes: 2 additions & 0 deletions inc/vcf/optional_policy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ namespace ebi
void check_body_entry_info_svlen(ParsingState & state, Record const & record) const;
void check_body_entry_info_svclaim(ParsingState & state, Record const & record) const;
void check_body_entry_info_confidence_interval(ParsingState & state, Record const & record) const;
void check_body_entry_info_rb_ruc(ParsingState & state, Record const & record) const;
void check_body_entry_info_rul_rus(ParsingState & state, Record const & record) const;
void check_contig_meta(ParsingState & state, Record const & record) const;
void check_alternate_allele_meta(ParsingState & state, Record const & record) const;
void check_filter_meta(ParsingState & state, Record const & record) const;
Expand Down
8 changes: 8 additions & 0 deletions inc/vcf/string_constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,14 @@ namespace ebi
const std::string VALIDATED = "VALIDATED";
const std::string THOUSAND_G = "1000G";
const std::string SVCLAIM = "SVCLAIM";
const std::string RN = "RN";
const std::string RUS = "RUS";
const std::string RUL = "RUL";
const std::string RUC = "RUC";
const std::string RB = "RB";
const std::string CIRUC = "CIRUC";
const std::string CIRB = "CIRB";
const std::string RUB = "RUB";

// FORMAT predefined tags
const std::string AHAP = "AHAP";
Expand Down
258 changes: 238 additions & 20 deletions src/vcf/record.cpp

Large diffs are not rendered by default.

87 changes: 79 additions & 8 deletions src/vcf/validate_optional_policy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ namespace ebi
// SVCLAIM check
check_body_entry_info_svclaim(state, record);

// RB RUC check
check_body_entry_info_rb_ruc(state, record);

// RUL RUS check
check_body_entry_info_rul_rus(state, record);

// Confidence interval tags should have first value <=0 and second value >= 0
check_body_entry_info_confidence_interval(state, record);

Expand Down Expand Up @@ -225,7 +231,7 @@ namespace ebi
std::vector<std::string> values;

if (record.source->version < Version::v44) {
return; //svclaim not present for version < v43
return; //svclaim not present for version < v44
}
auto it = record.info.find(SVCLAIM);
if (it == record.info.end()) {
Expand All @@ -251,24 +257,89 @@ namespace ebi

/**
 * Checks the confidence interval INFO tags of a record. For every listed tag found in `record.info`,
 * the comma-separated value is split and its entries are validated; a failed check throws a
 * heap-allocated InfoBodyError (`throw new`) carrying `state.n_lines`.
 *
 * NOTE(review): this span is a rendered diff hunk in which removed and added lines appear interleaved
 * (two declarations of `confidence_interval_tags`, two `if` headers, two message tails). It is not
 * compilable as shown; the comments below describe only what the individual lines do.
 *
 * @param state parsing state, used here only for the current line number in error reports
 * @param record record whose INFO confidence interval tags are inspected
 * @throw InfoBodyError
 */
void ValidateOptionalPolicy::check_body_entry_info_confidence_interval(ParsingState & state, Record const & record) const
{
// tag list without CIRB/CIRUC, immediately followed by the list that includes them
std::vector<std::string> confidence_interval_tags = { CICN, CICNADJ, CIEND, CILEN, CIPOS };
std::vector<std::string> confidence_interval_tags = { CICN, CICNADJ, CIEND, CILEN, CIPOS, CIRB, CIRUC };
for (auto & confidence_interval_tag : confidence_interval_tags) {
auto it = record.info.find(confidence_interval_tag);
if (it != record.info.end()) {
std::vector<std::string> values;
util::string_split(it->second, ",", values);
// single-pair form of the check: only values[0] and values[1] are parsed
size_t scanned_first_value_length, scanned_second_value_length;
int first_numeric_value = std::stoi(values[0], &scanned_first_value_length);
int second_numeric_value = std::stoi(values[1], &scanned_second_value_length);
if (first_numeric_value > 0 || second_numeric_value < 0
|| values[0].size() != scanned_first_value_length || values[1].size() != scanned_second_value_length) {
// multi-pair form of the check: entries are consumed two at a time, so the count must be even
if (values.size() % 2 != 0) { //CI should have even count
throw new InfoBodyError{state.n_lines,
"INFO " + confidence_interval_tag +
" is a confidence interval tag, which should have first value <= 0 and second value >= 0"};
" is a confidence interval tag, which should have even number entries"};
}
// NOTE(review): `i` is a signed int compared against the unsigned values.size()
for (int i = 0; i < values.size(); i += 2) {
size_t scanned_first_value_length = 1, scanned_second_value_length = 1;
//considers missing value as 0 - valid value
// NOTE(review): values are parsed with std::stoi and the scanned length must equal the entry length,
// so an entry with a fractional part (e.g. "-0.5") fails this check. CIRUC is registered with type
// FLOAT in the predefined INFO table shown in this change - confirm whether that is intended.
int first_numeric_value = std::stoi(values[i] != MISSING_VALUE ? values[i] : "0", &scanned_first_value_length);
int second_numeric_value = std::stoi(values[i + 1] != MISSING_VALUE ? values[i + 1] : "0", &scanned_second_value_length);
// each pair must have a lower bound <= 0 and an upper bound >= 0, with both entries fully consumed by the parse
if (first_numeric_value > 0 || second_numeric_value < 0
|| values[i].size() != scanned_first_value_length || values[i + 1].size() != scanned_second_value_length) {
throw new InfoBodyError{state.n_lines,
"INFO " + confidence_interval_tag +
" is a confidence interval tag, which should have first value <= 0 and second value >= 0"};
}
}
}
}
}

/**
 * Checks that every RB entry is approximately RUC * unit_length (within a 5% relative tolerance).
 *
 * The unit length is taken from RUL when present, otherwise from the length of the matching RUS
 * sequence. The check is skipped when:
 *  - the file version is older than v4.4,
 *  - RB or RUC is absent, or neither RUL nor RUS is available to provide a unit length,
 *  - the RB, RUC and unit-length lists differ in size (reported elsewhere),
 *  - any of the three values at a position is the missing value.
 *
 * RUC is parsed as a floating point number because it is declared FLOAT, so fractional repeat
 * counts are not truncated before the comparison.
 *
 * @param state parsing state (not used by this check)
 * @param record record whose INFO RB/RUC/RUL/RUS values are compared
 * @throw InfoBodyError
 */
void ValidateOptionalPolicy::check_body_entry_info_rb_ruc(ParsingState & state, Record const & record) const
{
    constexpr double tolerance = 0.05; //5% variation

    if (record.source->version < Version::v44) {
        return; //not valid for version < v44
    }
    auto itRB = record.info.find(RB);
    auto itRUC = record.info.find(RUC);
    if (itRB == record.info.end() || itRUC == record.info.end()) {
        return; //nothing to check
    }
    auto itRUL = record.info.find(RUL);
    auto itRUS = record.info.find(RUS);
    const bool has_rul = itRUL != record.info.end();
    if (!has_rul && itRUS == record.info.end()) {
        return; //no unit length available, nothing to compare against
    }

    std::vector<std::string> valRB, valRUC, valLen;
    util::string_split(itRB->second, ",", valRB);
    util::string_split(itRUC->second, ",", valRUC);
    util::string_split(has_rul ? itRUL->second : itRUS->second, ",", valLen);
    if (valRB.size() != valRUC.size() || valRB.size() != valLen.size()) {
        return; //already checked in records
    }

    for (size_t i = 0; i < valRB.size(); ++i) {
        //a missing value on any side leaves nothing to compare at this position
        if (valRB[i] == MISSING_VALUE || valRUC[i] == MISSING_VALUE || valLen[i] == MISSING_VALUE) {
            continue;
        }
        const double rb = std::stoi(valRB[i]);
        const double ruc = std::stod(valRUC[i]);
        const double unit_length = has_rul ? static_cast<double>(std::stoi(valLen[i]))
                                           : static_cast<double>(valLen[i].size());
        //RB ~= RUL * RUC
        const double expected = unit_length * ruc;
        const double deviation = rb > expected ? rb - expected : expected - rb;
        const double magnitude = rb < 0 ? -rb : rb;
        //multiplying instead of dividing keeps the comparison well defined when RB is 0
        if (deviation > tolerance * magnitude) {
            const std::string message = "INFO RB should be approximately RUC * unit_length";
            throw new InfoBodyError{record.line, message, "Failed for position " + std::to_string(i)};
        }
    }
}

/**
 * Reports records that carry both RUS and RUL in INFO: the two together are redundant.
 * Files older than v4.4 are not checked.
 *
 * @param state parsing state (not used by this check)
 * @param record record whose INFO keys are inspected
 * @throw InfoBodyError
 */
void ValidateOptionalPolicy::check_body_entry_info_rul_rus(ParsingState & state, Record const & record) const
{
    //not valid for version < v44
    const bool version_supported = !(record.source->version < Version::v44);
    if (!version_supported) {
        return;
    }

    const auto not_found = record.info.end();
    const bool has_rul = record.info.find(RUL) != not_found;
    const bool has_rus = record.info.find(RUS) != not_found;
    if (!has_rus || !has_rul) {
        return; //at most one of them is present, nothing redundant
    }

    //RUS, RUL together - redundant info
    std::stringstream message;
    message << "INFO " << "RUS and RUL present together, RUL can be avoided";
    throw new InfoBodyError{record.line, message.str()};
}

void ValidateOptionalPolicy::check_contig_meta(ParsingState & state, Record const & record) const
{
Expand Down
2 changes: 1 addition & 1 deletion test/input_files/v4.4/passed/passed_body_format.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
1 400 rs182711216 C T 100 PASS AC=4 GT:G_S:GL 0|0:0.000:-0.18,-0.48,-2.49 0|0:0.000:-0.20,-0.44,-2.06
1 500 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL 0|0:0.000:-0.18,-0.48,-2.49 0|0:0.000:-0.20,-0.44,-2.06
1 600 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL |0|0:0.000:-0.18,-0.48,-2.49 |0|0:0.000:-0.20,-0.44,-2.06
1 700 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL \0\0:0.000:-0.18,-0.48,-2.49 |0|0:0.000:-0.20,-0.44,-2.06
1 700 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL /0/0:0.000:-0.18,-0.48,-2.49 |0|0:0.000:-0.20,-0.44,-2.06
131 changes: 130 additions & 1 deletion test/vcf/metaentry_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3234,7 +3234,136 @@ namespace ebi
source
}),
vcf::MetaSectionError* );
}

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RN}, {vcf::NUMBER, vcf::A}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Total num. of repeat seq. in this allele."} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid number
1,
vcf::INFO,
{ {vcf::ID, vcf::RN}, {vcf::NUMBER, vcf::R}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Total num. of repeat seq. in this allele."} },
source
}),
vcf::MetaSectionError* );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RN}, {vcf::NUMBER, vcf::A}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Total num. of repeat seq. in this allele."} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RUS}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::STRING}, {vcf::DESCRIPTION, "A Repeat unit sequence"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RUS}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "A Repeat unit sequence"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RUL}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Length of repeating unit"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RUL}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Length of repeating unit"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Count of repeating unit"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Count of repeating unit"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Total num. of bases in repeat seq."} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Total num. of bases in repeat seq."} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::CIRUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Confidence interval for RUC"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::CIRUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Confidence interval for RUC"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::CIRB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Confidence interval for RB"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::CIRB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Confidence interval for RB"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RUB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Number of bases in repeat unit"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RUB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Number of bases in repeat unit"} },
source
}),
vcf::MetaSectionError* );

}
}

TEST_CASE("SAMPLE MetaEntry checks", "[checks][keyvalue]")
Expand Down
Loading