From a705f947df8d8473725acd04c9c0356b582cb638 Mon Sep 17 00:00:00 2001
From: Zilong-Li
Date: Thu, 8 Aug 2024 09:19:00 +0200
Subject: [PATCH] can modify single sample

vcfreader now remembers the sample selection passed to its constructor
(samples_in) so that it can be re-applied to the writer header.

- output() only opens the output file, calls bw.initalHeader(br.header)
  and marks the reader as writable.
- The former header-copy step of output() moves into a new modify():
  it copies the header from the input file, applies samples_in to it
  when a sample subset was requested, resets the record header and
  marks the reader as modifiable.
- addINFO()/addFORMAT() call modify() on first use, so the header is
  only copied when it actually needs to be changed.
- write() and close() print a hint instead of silently doing nothing
  when output() has not been called.

Tests cover modifying a FORMAT field and setting genotypes when the
reader was opened with a single sample.

---
 src/vcf-reader.cpp               | 42 +++++++++++++++++-------
 tests/testthat/test-modify-vcf.R | 56 ++++++++++++++++++++++++++++++--
 tests/testthat/test-vcf-reader.R | 16 +++++++++
 3 files changed, 101 insertions(+), 13 deletions(-)

diff --git a/src/vcf-reader.cpp b/src/vcf-reader.cpp
index 7d2cedd..37c21ff 100644
--- a/src/vcf-reader.cpp
+++ b/src/vcf-reader.cpp
@@ -138,6 +138,7 @@ class vcfreader {
         if (!samples.empty()) br.setSamples(samples);
         if (!region.empty()) br.setRegion(region);
         var.initHeader(br.header);
+        samples_in = samples;
     }
 
     ~vcfreader() {}
@@ -275,15 +276,28 @@ class vcfreader {
     // WRITE
     inline void output(const std::string& vcffile) {
         bw.open(vcffile);
+        bw.initalHeader(br.header);
+        writable = true;
+    }
+    inline void modify() {
         bw.copyHeader(fin);
+        if (!samples_in.empty()) bw.header.setSamples(samples_in);
         var.resetHeader(bw.header);
-        writable = true;
+        modifiable = true;
     }
     inline void write() {
-        if (writable) bw.writeRecord(var);
+        if (writable) {
+            bw.writeRecord(var);
+        } else {
+            Rcpp::Rcout << "please call the `output()` function first to create an output VCF\n";
+        }
     }
     inline void close() {
-        if (writable) bw.close();
+        if (writable) {
+            bw.close();
+        } else {
+            Rcpp::Rcout << "please call the `output()` function first to create an output VCF\n";
+        }
     }
 
     inline void setCHR(std::string s) { var.setCHR(s.c_str()); }
@@ -324,21 +338,27 @@ class vcfreader {
     inline void rmFormatTag(std::string s) { var.removeFORMAT(s); }
     inline void addINFO(const std::string& id, const std::string& number, const std::string& type,
                         const std::string& desc) {
-        if (writable)
-            bw.header.addINFO(id, number, type, desc);
-        else
-            Rcpp::Rcout << "please call the `output(filename)` function first\n";
+        if (!writable) {
+            Rcpp::Rcout << "please call the `output()` function first to create an output VCF\n";
+            return;
+        }
+        if (!modifiable) { modify(); }
+        bw.header.addINFO(id, number, type, desc);
     }
     inline void addFORMAT(const std::string& id, const std::string& number, const std::string& type,
                           const std::string& desc) {
-        if (writable)
-            bw.header.addFORMAT(id, number, type, desc);
-        else
-            Rcpp::Rcout << "please call the `output(filename)` function first\n";
+        if (!writable) {
+            Rcpp::Rcout << "please call the `output()` function first to create an output VCF\n";
+            return;
+        }
+        if (!modifiable) { modify(); }
+        bw.header.addFORMAT(id, number, type, desc);
     }
 
   private:
+    bool modifiable = false;
     bool writable = false;
+    std::string samples_in = "";
     const std::string fin;
     vcfpp::BcfReader br;
     vcfpp::BcfRecord var;
diff --git a/tests/testthat/test-modify-vcf.R b/tests/testthat/test-modify-vcf.R
index 90eabd2..5d38f81 100644
--- a/tests/testthat/test-modify-vcf.R
+++ b/tests/testthat/test-modify-vcf.R
@@ -1,6 +1,7 @@
 library(testthat)
 
 test_that("modify the genotypes", {
+
   ## skip_on_os(c("windows"), arch = NULL)
   outvcf <- paste0(tempfile(), ".vcf.gz")
   bw <- vcfwriter$new(outvcf, "VCF4.3")
@@ -29,9 +30,11 @@ test_that("modify the genotypes", {
   br$variant() ## get a variant record
   g3 <- br$genotypes(F)
   expect_identical(g0, g3)
+
 })
 
-test_that("modify item in FORMAT", {
+test_that("modify item in FORMAT for all samples", {
+
   ## skip_on_os(c("windows"), arch = NULL)
   ## creat a VCF with GP in FORMAT
   outvcf <- paste0(tempfile(), ".vcf.gz")
@@ -44,9 +47,10 @@ test_that("modify item in FORMAT", {
   s1 <- "chr20\t2006060\trs146931526\tG\tC\t100\tPASS\tAF=0.000998403\tGP\t0.966,0.034,0\t0.003,0.872,0.125"
   bw$writeline(s1)
   bw$close()
+
   ## tests
   br <- vcfreader$new(outvcf)
-  expect_true(br$variant()) ## get a variant record
+  br$variant() ## get a variant record
   br$string()
   gp <- br$formatFloat("GP")
   gp <- array(gp, c(3, br$nsamples()))
@@ -71,4 +75,52 @@ test_that("modify item in FORMAT", {
   expect_false(br$setFormatStr("STR","HHH,JJJ")) ## length(s) %% nsamples != 0
   expect_true(br$setFormatStr("STR","HHHJJJ")) ## length(s) %% nsamples == 0
   ## print(br$string())
+
+})
+
+
+test_that("modify item in FORMAT for specific sample", {
+
+  ## skip_on_os(c("windows"), arch = NULL)
+  ## create a VCF with GP in FORMAT
+  outvcf <- paste0(tempfile(), ".vcf.gz")
+  bw <- vcfwriter$new(outvcf, "VCF4.3")
+  bw$addContig("chr20")
+  bw$addINFO("AF", "A", "Float", "Estimated allele frequency in the range (0,1)");
+  bw$addFORMAT("GP", "3", "Float", "Posterior genotype probability of 0/0, 0/1, and 1/1");
+  bw$addSample("NA12878")
+  bw$addSample("NA12879")
+  s1 <- "chr20\t2006060\trs146931526\tG\tC\t100\tPASS\tAF=0.000998403\tGP\t0.966,0.034,0\t0.003,0.872,0.125"
+  bw$writeline(s1)
+  bw$close()
+
+  ## tests
+  br <- vcfreader$new(outvcf, region = "", samples = "NA12878")
+  br$variant() ## get a variant record
+  br$string()
+  br$samples()
+  gp <- br$formatFloat("GP")
+  gp <- array(gp, c(3, br$nsamples()))
+  ds <- gp[2,] + gp[3,] * 2
+  ## now open another file for output
+  newvcf <- paste0(tempfile(), ".vcf.gz")
+  br$output(newvcf)
+  ## add INFO, DS in header first
+  br$addINFO("INFO", "1", "Float", "INFO score of imputation")
+  br$addFORMAT("DS", "1", "Float", "Diploid dosage")
+  br$addFORMAT("AC", "1", "Integer", "Allele counts")
+  br$addFORMAT("STR", "1", "String", "Test String type")
+  ## print(br$header())
+  ## set DS in FORMAT now
+  br$setFormatFloat("DS", ds[1])
+
+  ## test if DS is present
+  expect_identical(br$formatFloat("DS"), ds[1])
+  br$string()
+
+  br$write()
+  br$close()
+  vcf <- vcftable(newvcf, format = "DS")
+  expect_true(vcf$DS==ds[1])
+
 })
diff --git a/tests/testthat/test-vcf-reader.R b/tests/testthat/test-vcf-reader.R
index 62abaa3..bdea9f4 100644
--- a/tests/testthat/test-vcf-reader.R
+++ b/tests/testthat/test-vcf-reader.R
@@ -211,3 +211,19 @@ test_that("vcfreader: remove tag from FORMAT", {
   expect_error(br$formatInt("AD"))
 
 })
+
+test_that("can set genotypes for single sample", {
+
+  br <- vcfreader$new(svfile, "", "HG00096")
+  br$variant()
+  br$genotypes(F)
+  br$setGenotypes(c(1L,1L))
+  outfile <- paste0(tempfile(), ".vcf.gz")
+  br$output(outfile)
+  br$write()
+  br$close()
+
+  vcf <- vcftable(outfile)
+  expect_true(vcf$gt==2)
+
+})