diff --git a/.cirrus.yml b/.cirrus.yml
index 90cd56939..0961666db 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -76,7 +76,7 @@ gcc_task:
ubuntu_task:
name: ubuntu-clang
container:
- image: ubuntu:devel
+ image: ubuntu:latest
cpu: 2
memory: 1G
@@ -103,7 +103,7 @@ ubuntu_task:
apt-get install -y --no-install-suggests --no-install-recommends \
ca-certificates clang git autoconf automake \
make zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev \
- libssl-dev libdeflate-dev libncurses5-dev
+ libssl-dev libdeflate-dev
<< : *COMPILE
<< : *TEST
@@ -113,7 +113,7 @@ ubuntu_task:
rockylinux_task:
name: rockylinux-gcc
container:
- image: rockylinux:latest
+ image: rockylinux:9
cpu: 2
memory: 1G
@@ -126,8 +126,8 @@ rockylinux_task:
# NB: we could consider building a docker image with these
# preinstalled and specifying that instead, to speed up testing.
install_script: |
- yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \
- bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \
+ yum install -y autoconf automake make gcc perl-Data-Dumper perl-FindBin \
+ zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel \
git diffutils
<< : *COMPILE
diff --git a/Makefile b/Makefile
index ad87f022b..f318386f1 100644
--- a/Makefile
+++ b/Makefile
@@ -104,7 +104,7 @@ endif
include config.mk
-PACKAGE_VERSION = 1.15.1
+PACKAGE_VERSION = 1.16
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
diff --git a/NEWS b/NEWS
index aa4420f18..ee52dcc4b 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,50 @@
+## Release 1.16 (18th August 2022)
+
+
+
+* New plugin `bcftools +variant-distance` to annotate records with distance to the
+ nearest variant (#1690)
+
+
+Changes affecting the whole of bcftools, or multiple commands:
+
+* The -i/-e filtering expressions
+
+ - Added support for querying of multiple filters, for example `-i 'FILTER="A;B"'`
+ can be used to select sites with two filters "A" and "B" set. See the documentation
+ for more examples.
+
+ - Added modulo arithmetic operator
+
+Changes affecting specific commands:
+
+* bcftools annotate
+
+ - A bug introduced in 1.14 caused records with an INFO/END annotation to
+ incorrectly trigger the `-c ~INFO/END` mode of comparison even when not explicitly
+ requested, which resulted in the annotation not being transferred from a
+ tab-delimited file (#1733)
+
+* bcftools merge
+
+ - New `-m snp-ins-del` switch to merge SNVs, insertions and deletions separately (#1704)
+
+* bcftools mpileup
+
+ - New NMBZ annotation for Mann-Whitney U-z test on number of mismatches within
+ supporting reads
+
+ - Suppress the output of MQSBZ and FS annotations in absence of alternate allele
+
+* bcftools +scatter
+
+ - Fix erroneous addition of duplicate PG lines
+
+* bcftools +setGT
+
+ - Custom genotypes (e.g. `-n c:1/1`) now correctly override ploidy
+
+
## Release 1.15.1 (7th April 2022)
@@ -44,7 +91,6 @@
## Release 1.15 (21st February 2022)
-
* New `bcftools head` subcommand for conveniently displaying the headers
of a VCF or BCF file. Without any options, this is equivalent to
`bcftools view --header-only --no-version` but more succinct and memorable.
diff --git a/bam2bcf.c b/bam2bcf.c
index 76a0d439b..d373e99cb 100644
--- a/bam2bcf.c
+++ b/bam2bcf.c
@@ -1,7 +1,7 @@
/* bam2bcf.c -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Heng Li
@@ -89,6 +89,39 @@ void bcf_call_destroy(bcf_callaux_t *bca)
free(bca->bases); free(bca->inscns); free(bca);
}
+static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref)
+{
+ uint8_t *nm_tag = bam_aux_get(rec, "NM");
+ if ( !nm_tag ) return -1;
+ int64_t nm = bam_aux2i(nm_tag);
+
+ // Count indels as single events, not as the number of inserted/deleted
+ // bases (which is what NM does). Add soft clips as mismatches.
+ int i;
+ for (i=0; i < rec->core.n_cigar; i++)
+ {
+ int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK;
+ if ( val==BAM_CSOFT_CLIP )
+ {
+ nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
+ }
+ else if ( val==BAM_CINS || val==BAM_CDEL )
+ {
+ val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
+ if ( val > 1 ) nm -= val - 1;
+ }
+ }
+
+ // Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV
+ // http://www.genome.org/cgi/doi/10.1101/gr.239756.118
+ nm -= is_ref ? 1 : 2;
+
+ if ( nm < 0 ) nm = 0;
+ if ( nm >= B2B_N_NM ) nm = B2B_N_NM - 1;
+
+ return nm;
+}
+
// position in the sequence with respect to the aligned part of the read
static int get_position(const bam_pileup1_t *p, int *len,
int *sc_len, int *sc_dist) {
@@ -158,6 +191,17 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+ if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+ if ( bca->fmt_flag&B2B_FMT_NMBZ )
+ {
+ memset(call->ref_nm,0,sizeof(*call->ref_nm)*(call->n+1)*B2B_N_NM);
+ memset(call->alt_nm,0,sizeof(*call->alt_nm)*(call->n+1)*B2B_N_NM);
+ }
+ else
+ {
+ memset(call->ref_nm,0,sizeof(*call->ref_nm)*B2B_N_NM);
+ memset(call->alt_nm,0,sizeof(*call->alt_nm)*B2B_N_NM);
+ }
memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
memset(bca->ref_scl, 0, 100*sizeof(int));
memset(bca->alt_scl, 0, 100*sizeof(int));
@@ -309,21 +353,26 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
if (sc_len > 99) sc_len = 99;
}
}
-
int imq = mapQ * nqual_over_60;
int ibq = baseQ * nqual_over_60;
+ int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1);
if ( bam_is_rev(p->b) )
bca->rev_mqs[imq]++;
else
bca->fwd_mqs[imq]++;
- if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
+ if ( !is_diff )
{
bca->ref_pos[epos]++;
bca->ref_bq[ibq]++;
bca->ref_mq[imq]++;
bca->ref_scl[sc_len]++;
+ if ( inm>=0 )
+ {
+ bca->ref_nm[inm]++;
+ if ( r->ref_nm ) r->ref_nm[inm]++;
+ }
}
else
{
@@ -331,6 +380,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
bca->alt_bq[ibq]++;
bca->alt_mq[imq]++;
bca->alt_scl[sc_len]++;
+ if ( inm>=0 )
+ {
+ bca->alt_nm[inm]++;
+ if ( r->alt_nm ) r->alt_nm[inm]++;
+ }
}
}
@@ -798,6 +852,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
call->n_alleles = j;
if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
}
+ int has_alt = (call->n_alleles==2 && call->unseen!=-1) ? 0 : 1;
/*
* Set the phread likelihood array (call->PL) This array is 15 entries long
* for each sample because that is size of an upper or lower triangle of a
@@ -914,6 +969,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
}
+ // No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly
+ if ( !has_alt ) return 0;
+
calc_SegBias(calls, call);
// calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
@@ -922,7 +980,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
if (bca->fmt_flag & B2B_INFO_ZSCORE) {
// U z-normalised as +/- number of standard deviations from mean.
- if (call->ori_ref < 0) {
+ if (call->ori_ref < 0) { // indel
if (bca->fmt_flag & B2B_INFO_RPB)
call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
bca->npos, 0, 1);
@@ -945,6 +1003,15 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
100, 0,1);
}
+ call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1);
+ if ( bca->fmt_flag & B2B_FMT_NMBZ )
+ {
+ for (i=0; i<n; i++)
+ {
+ float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1);
+ call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0;
+ }
+ }
} else {
// Old method; U as probability between 0 and 1
if ( bca->fmt_flag & B2B_INFO_RPB )
@@ -976,7 +1043,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
{
extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
- int i, j, nals = 1;
+ int i, j, nals = 1, has_alt = 0;
bcf_hdr_t *hdr = bc->bcf_hdr;
rec->rid = bc->tid;
@@ -1006,6 +1073,7 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
}
nals++;
+ has_alt = 1;
}
}
else // SNP
@@ -1016,7 +1084,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
if (bc->a[i] < 0) break;
kputc(',', &bc->tmp);
if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
- else kputc("ACGT"[bc->a[i]], &bc->tmp);
+ else
+ {
+ kputc("ACGT"[bc->a[i]], &bc->tmp);
+ has_alt = 1;
+ }
nals++;
}
}
@@ -1052,40 +1124,46 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
- if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
- if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
-
- if (bca->fmt_flag & B2B_INFO_ZSCORE) {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
- if ( bc->mwu_sc != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
- } else {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
- }
+ if ( has_alt )
+ {
+ if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
+ if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
+
+ if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
+ if ( bc->mwu_nm[0] != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1);
+ if ( bc->mwu_sc != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
+ } else {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+ }
- if ( bc->strand_bias != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
+ if ( bc->strand_bias != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
#if CDF_MWU_TESTS
- if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
- if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
- if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
- if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
+ if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
+ if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
+ if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
+ if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
#endif
+ }
+
tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
@@ -1144,5 +1222,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
if ( fmt_flag&B2B_FMT_QS )
bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
+ if ( has_alt )
+ {
+ if ( fmt_flag&B2B_FMT_NMBZ )
+ bcf_update_format_float(hdr, rec, "NMBZ", bc->mwu_nm+1, rec->n_sample);
+ }
+
return 0;
}
diff --git a/bam2bcf.h b/bam2bcf.h
index e778b8952..c256b2696 100644
--- a/bam2bcf.h
+++ b/bam2bcf.h
@@ -1,7 +1,7 @@
/* bam2bcf.h -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Heng Li
@@ -61,9 +61,12 @@ DEALINGS IN THE SOFTWARE. */
#define B2B_INFO_RPB (1<<15)
#define B2B_FMT_QS (1<<16)
#define B2B_INFO_SCB (1<<17)
+#define B2B_FMT_NMBZ (1<<18) // per-sample NMBZ
#define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised
#define B2B_MAX_ALLELES 5
+#define B2B_N_NM 32 // number of NMBZ bins, i.e. max number of mismatches
+
#define B2B_DROP 0
#define B2B_INC_AD 1
@@ -100,6 +103,7 @@ typedef struct __bcf_callaux_t {
errmod_t *e;
void *rghash;
float indel_bias; // adjusts indel score threshold; lower => call more.
+ int32_t *ref_nm, *alt_nm; // pointers to bcf_call_t.{ref_nm,alt_nm}
} bcf_callaux_t;
// per-sample values
@@ -107,6 +111,7 @@ typedef struct {
uint32_t ori_depth; // ori_depth = anno[0..3] but before --min-BQ is applied
unsigned int mq0;
int32_t *ADF, *ADR, SCR, *QS; // FMT/QS
+ int32_t *ref_nm, *alt_nm;
// The fields are:
// depth fwd .. ref (0) and non-ref (2)
// depth rev .. ref (1) and non-ref (3)
@@ -133,10 +138,10 @@ typedef struct {
int n_supp; // number of supporting non-reference reads
double anno[16];
unsigned int depth, ori_depth, mq0;
- int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS;
+ int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS, *ref_nm, *alt_nm;
uint8_t *fmt_arr;
float vdb; // variant distance bias
- float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc;
+ float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc, *mwu_nm;
#if CDF_MWU_TESTS
float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf;
#endif
diff --git a/consensus.c b/consensus.c
index 9bd33cd4c..84ae905b7 100644
--- a/consensus.c
+++ b/consensus.c
@@ -1,19 +1,19 @@
/* The MIT License
- Copyright (c) 2014-2021 Genome Research Ltd.
+ Copyright (c) 2014-2022 Genome Research Ltd.
Author: Petr Danecek
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -123,9 +123,17 @@ typedef struct
}
args_t;
+static void destroy_chain(chain_t *chain)
+{
+ if ( !chain ) return;
+ free(chain->ref_gaps);
+ free(chain->alt_gaps);
+ free(chain->block_lengths);
+ free(chain);
+}
static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
{
-// fprintf(stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
+ if ( chain ) destroy_chain(chain);
chain = (chain_t*) calloc(1,sizeof(chain_t));
chain->num = 0;
chain->block_lengths = NULL;
@@ -137,18 +145,6 @@ static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
return chain;
}
-static void destroy_chain(args_t *args)
-{
- chain_t *chain = args->chain;
- free(chain->ref_gaps);
- free(chain->alt_gaps);
- free(chain->block_lengths);
- free(chain);
- chain = NULL;
- free(args->chr);
- args->chr = NULL;
-}
-
static void print_chain(args_t *args)
{
/*
@@ -172,7 +168,7 @@ static void print_chain(args_t *args)
- alt_start (same as ref_start, as no edits are recorded/applied before that position)
- alt_end (adjusted to match the length of the alt sequence)
- chain_num (just an auto-increment id)
-
+
the other (sorted) lines are:
- length of the ungapped alignment block
- gap on the ref sequence between this and the next block (all but the last line)
@@ -197,7 +193,7 @@ static void print_chain(args_t *args)
static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len)
{
-// fprintf(stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
+ // fprintf(stderr, "push_chain_gap(chain=%p, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", chain, ref_start, ref_len, alt_start, alt_len);
int num = chain->num;
if (num && ref_start <= chain->ref_last_block_ori) {
@@ -305,6 +301,7 @@ static void destroy_data(args_t *args)
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
+ destroy_chain(args->chain);
}
static void init_region(args_t *args, char *line)
@@ -346,12 +343,8 @@ static void init_region(args_t *args, char *line)
bcf_sr_seek(args->files,line,args->fa_ori_pos);
if ( tmp_ptr ) *tmp_ptr = tmp;
fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
- if (args->chain_fname )
- {
+ if ( args->chain_fname )
args->chain = init_chain(args->chain, args->fa_ori_pos);
- } else {
- args->chain = NULL;
- }
}
static bcf1_t **next_vcf_line(args_t *args)
@@ -526,7 +519,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( !args->missing_allele ) return;
ialt = -1;
}
- else
+ else
{
if ( !warned_haplotype )
{
@@ -544,11 +537,11 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( !args->missing_allele ) return;
ialt = -1;
}
- else
+ else
ialt = bcf_gt_allele(ialt);
}
}
- else if ( action==use_iupac )
+ else if ( action==use_iupac )
{
ialt = -1;
int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
@@ -717,7 +710,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
fprintf(stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
return;
}
-
+
}
char *alt_allele = rec->d.allele[ialt];
@@ -743,7 +736,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
}
}
- if ( idx>=args->fa_buf.l )
+ if ( idx>=args->fa_buf.l )
error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
@@ -803,8 +796,8 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( fail )
{
char tmp = 0;
- if ( args->fa_buf.l - idx > rec->rlen )
- {
+ if ( args->fa_buf.l - idx > rec->rlen )
+ {
tmp = args->fa_buf.s[idx+rec->rlen];
args->fa_buf.s[idx+rec->rlen] = 0;
}
@@ -820,7 +813,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
- if ( args->mark_del && len_diff<0 )
+ if ( args->mark_del && len_diff<0 )
{
alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
alen = rec->rlen;
@@ -833,7 +826,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
- if ( args->mark_del && len_diff<0 )
+ if ( args->mark_del && len_diff<0 )
{
alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
alen = rec->rlen;
@@ -949,10 +942,8 @@ static void consensus(args_t *args)
if ( str.s[0]=='>' )
{
// new sequence encountered
- if (args->chain) {
- print_chain(args);
- destroy_chain(args);
- }
+ if ( args->chain ) print_chain(args);
+
// apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*)
bcf1_t **rec_ptr = NULL;
while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
@@ -1026,11 +1017,7 @@ static void consensus(args_t *args)
if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break;
apply_variant(args, rec);
}
- if (args->chain)
- {
- print_chain(args);
- destroy_chain(args);
- }
+ if (args->chain) print_chain(args);
if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX);
flush_fa_buffer(args, 0);
bgzf_close(fasta);
@@ -1078,6 +1065,8 @@ static void usage(args_t *args)
fprintf(stderr, " # in the form \">chr:from-to\".\n");
fprintf(stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
fprintf(stderr, "\n");
+ fprintf(stderr, " # See also http://samtools.github.io/bcftools/howtos/consensus-sequence.html\n");
+ fprintf(stderr, "\n");
exit(1);
}
@@ -1086,7 +1075,7 @@ int main_consensus(int argc, char *argv[])
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
- static struct option loptions[] =
+ static struct option loptions[] =
{
{"mark-del",required_argument,NULL,1},
{"mark-ins",required_argument,NULL,2},
@@ -1109,7 +1098,7 @@ int main_consensus(int argc, char *argv[])
int c;
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
{
- switch (c)
+ switch (c)
{
case 1 : args->mark_del = optarg[0]; break;
case 2 :
@@ -1126,10 +1115,10 @@ int main_consensus(int argc, char *argv[])
case 's': args->sample = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'I': args->output_iupac = 1; break;
- case 'e':
+ case 'e':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i':
+ case 'i':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
@@ -1139,12 +1128,12 @@ int main_consensus(int argc, char *argv[])
args->absent_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg);
break;
- case 'M':
- args->missing_allele = optarg[0];
+ case 'M':
+ args->missing_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
break;
case 'c': args->chain_fname = optarg; break;
- case 'H':
+ case 'H':
if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
diff --git a/convert.c b/convert.c
index 7fca60baa..5317cb8fd 100644
--- a/convert.c
+++ b/convert.c
@@ -1,6 +1,6 @@
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Petr Danecek
@@ -955,12 +955,12 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
}
else if ( bcf_gt_is_missing(ptr[0]) )
{
- if ( ptr[1]==bcf_int8_vector_end )
+ if ( ptr[1]==bcf_int8_vector_end )
{
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
- else
- {
+ else
+ {
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
}
@@ -1192,11 +1192,10 @@ static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
}
if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str);
- else
+ else
{
double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5);
pval *= 2;
- assert( pval-1 < 1e-10 );
if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
else
pval = -4.34294481903*log(pval);
@@ -1356,12 +1355,12 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT);
- else if ( !strcmp(str.s, "TBCSQ") )
+ else if ( !strcmp(str.s, "TBCSQ") )
{
fmt_t *fmt = register_tag(convert, "BCSQ", is_gtf, T_TBCSQ);
fmt->subscript = parse_subscript(&q);
if ( fmt->subscript==-1 )
- {
+ {
if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; }
}
else fmt->subscript++;
@@ -1408,7 +1407,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
else
{
_SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
- else if ( !strcmp(str.s, "ALT") )
+ else if ( !strcmp(str.s, "ALT") )
{
fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);
fmt->subscript = parse_subscript(&q);
@@ -1619,7 +1618,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
str->l = 0;
for (i=0; i<convert->nfmt; i++)
{
- // Genotype fields.
+ // Genotype fields.
if ( convert->fmt[i].is_gt_field )
{
int j = i, js, k;
@@ -1640,7 +1639,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
// anything to the string, we trim all genotype fields enclosed in square
// brackets here. This may be changed in future, time will show...
size_t l_start = str->l;
-
+
for (k=i; k<j; k++)
{
if ( convert->fmt[k].type == T_MASK )
@@ -1678,7 +1677,7 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
va_list args;
va_start(args, opt);
- switch (opt)
+ switch (opt)
{
case allow_undef_tags:
convert->allow_undef_tags = va_arg(args, int);
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index 006111fff..97df30037 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHOR(S)" section]
.\" Generator: Asciidoctor 2.0.16.dev
-.\" Date: 2022-04-07
+.\" Date: 2022-08-18
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
-.TH "BCFTOOLS" "1" "2022-04-07" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2022-08-18" "\ \&" "\ \&"
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.ss \n[.ss] 0
@@ -51,7 +51,7 @@ standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.
.SS "VERSION"
.sp
-This manual page was last updated \fB2022\-04\-07\fP and refers to bcftools git version \fB1.15.1\fP.
+This manual page was last updated \fB2022\-08\-18\fP and refers to bcftools git version \fB1.16\fP.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
@@ -1550,6 +1550,8 @@ apply variants of the given sample
# Create consensus for one region. The fasta header lines are then expected
# in the form ">chr:from\-to".
samtools faidx ref.fa 8:11870\-11890 | bcftools consensus in.vcf.gz \-o out.fa
+
+ # For more examples see http://samtools.github.io/bcftools/howtos/consensus\-sequence.html
.fam
.fi
.if n .RE
@@ -2552,6 +2554,13 @@ see \fBCommon Options\fP
.RE
.SS "Stats options:"
.sp
+\fB\-a, \-\-all\fP
+.RS 4
+Used in conjunction with \fB\-s, \-\-stats\fP, print per contig stats
+for all contigs, even those with zero records and those for which
+no stats are recorded in the index file (shown as \fI.\fP).
+.RE
+.sp
\fB\-n, \-\-nrecords\fP
.RS 4
print the number of records based on the CSI or TBI index files
@@ -2562,7 +2571,7 @@ print the number of records based on the CSI or TBI index files
Print per contig stats based on the CSI or TBI index files.
Output format is three tab\-delimited columns listing the contig
name, contig length (\fI.\fP if unknown) and number of records for
-the contig. Contigs with zero records are not printed.
+the contig. Contigs with zero records are not printed by default.
.RE
.SS "bcftools isec [\fIOPTIONS\fP] \fIA.vcf.gz\fP \fIB.vcf.gz\fP [...]"
.sp
@@ -2799,7 +2808,7 @@ maximum number of alternate alleles that can be included in the PL tag. The defa
is 0 which disables the feature and outputs values for all alternate alleles.
.RE
.sp
-\fB\-m, \-\-merge\fP \fIsnps\fP|\fIindels\fP|\fIboth\fP|\fIall\fP|\fInone\fP|\fIid\fP
+\fB\-m, \-\-merge\fP \fIsnps\fP|\fIindels\fP|\fIboth\fP|\fIsnp\-ins\-del\fP|\fIall\fP|\fInone\fP|\fIid\fP
.RS 4
The option controls what types of multiallelic records can be created:
.RE
@@ -2807,12 +2816,13 @@ The option controls what types of multiallelic records can be created:
.if n .RS 4
.nf
.fam C
-\-m none .. no new multiallelics, output multiple records instead
-\-m snps .. allow multiallelic SNP records
-\-m indels .. allow multiallelic indel records
-\-m both .. both SNP and indel records can be multiallelic
-\-m all .. SNP records can be merged with indel records
-\-m id .. merge by ID
+\-m none .. no new multiallelics, output multiple records instead
+\-m snps .. allow multiallelic SNP records
+\-m indels .. allow multiallelic indel records
+\-m both .. both SNP and indel records can be multiallelic
+\-m all .. SNP records can be merged with indel records
+\-m snp\-ins\-del .. allow multiallelic SNVs, insertions, deletions, but don\*(Aqt mix them
+\-m id .. merge by ID
.fam
.fi
.if n .RE
@@ -3892,7 +3902,7 @@ TAG=func(TAG) Number:1 Type:Integer .. Experimental support for user\-defin
.RE
.RE
.sp
-\fBfix\-ploidy\fP
+\fBfixploidy\fP
.RS 4
sets correct ploidy
.RE
@@ -4020,6 +4030,11 @@ calculate transmission rate in trio children. The usage and format is similar to
calculate phase switch rate in trio samples, children samples must have phased GTs
.RE
.sp
+\fBvariant\-distance\fP
+.RS 4
+annotate sites with the distance to the nearest variant
+.RE
+.sp
\fBvariantkey\-hex\fP
.RS 4
generate unsorted VariantKey\-RSid index files in hexadecimal format
@@ -5105,12 +5120,12 @@ supported only to filter by the ID column)
. sp -1
. IP \(bu 2.3
.\}
-arithmetic operators
+arithmetic operators (addition, multiplication, subtraction, division, modulo)
.sp
.if n .RS 4
.nf
.fam C
-+,*,\-,/
++, *, \-, /, %
.fam
.fi
.if n .RE
@@ -5224,16 +5239,20 @@ FILTER, QUAL, ID, CHROM, POS, REF, ALT[0]
. sp -1
. IP \(bu 2.3
.\}
-starting with 1.11, the FILTER column can be queried as follows:
+the FILTER column can be queried as follows:
.sp
.if n .RS 4
.nf
.fam C
FILTER="PASS"
+FILTER="."
FILTER="A" .. exact match, for example "A;B" does not pass
+FILTER="A;B" .. exact match, "A;B" and "B;A" pass, everything else fails
FILTER!="A" .. exact match, for example "A;B" does pass
-FILTER~"A" .. both "A" and "A;B" pass
-FILTER!~"A" .. neither "A" nor "A;B" pass
+FILTER~"A" .. subset match, for example both "A" and "A;B" pass
+FILTER~"A;B" .. subset match, pass only if both "A" and "B" are present
+FILTER!~"A" .. complement match, for example both "A" and "A;B" fail
+FILTER!~"A;B" .. complement match, fail if both "A" and "B" are present
.fam
.fi
.if n .RE
diff --git a/doc/bcftools.html b/doc/bcftools.html
index d9afe6803..375d25a3d 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -50,7 +50,7 @@ DESCRIPTION
VERSION
-
This manual page was last updated 2022-04-07 and refers to bcftools git version 1.15.1.
+
This manual page was last updated 2022-08-18 and refers to bcftools git version 1.16.
@@ -1295,7 +1295,9 @@
bcftools consensus [OPTIONS] FILE
# Create consensus for one region. The fasta header lines are then expected
# in the form ">chr:from-to".
- samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz -o out.fa
+ samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz -o out.fa
+
+ # For more examples see http://samtools.github.io/bcftools/howtos/consensus-sequence.html
@@ -2281,6 +2283,12 @@ Indexing options:
Stats options:
+- -a, --all
+-
+
Used in conjunction with -s, --stats, print per contig stats
+for all contigs, even those with zero records and those for which
+no stats are recorded in the index file (shown as .).
+
- -n, --nrecords
-
print the number of records based on the CSI or TBI index files
@@ -2290,7 +2298,7 @@ Stats options:
Print per contig stats based on the CSI or TBI index files.
Output format is three tab-delimited columns listing the contig
name, contig length (. if unknown) and number of records for
-the contig. Contigs with zero records are not printed.
+the contig. Contigs with zero records are not printed by default.
@@ -2509,7 +2517,7 @@ bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<
maximum number of alternate alleles that can be included in the PL tag. The default value
is 0 which disables the feature and outputs values for all alternate alleles.
--m, --merge snps|indels|both|all|none|id
+-m, --merge snps|indels|both|snp-ins-del|all|none|id
The option controls what types of multiallelic records can be created:
@@ -2517,12 +2525,13 @@ bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<
-
-m none .. no new multiallelics, output multiple records instead
--m snps .. allow multiallelic SNP records
--m indels .. allow multiallelic indel records
--m both .. both SNP and indel records can be multiallelic
--m all .. SNP records can be merged with indel records
--m id .. merge by ID
+
-m none .. no new multiallelics, output multiple records instead
+-m snps .. allow multiallelic SNP records
+-m indels .. allow multiallelic indel records
+-m both .. both SNP and indel records can be multiallelic
+-m all .. SNP records can be merged with indel records
+-m snp-ins-del .. allow multiallelic SNVs, insertions, deletions, but don't mix them
+-m id .. merge by ID
@@ -3385,7 +3394,7 @@
List of plugins coming wi
-fix-ploidy
+fixploidy
sets correct ploidy
@@ -3488,6 +3497,10 @@ List of plugins coming wi
calculate phase switch rate in trio samples, children samples must have phased GTs
+variant-distance
+
+annotate sites with the distance to the nearest variant
+
variantkey-hex
generate unsorted VariantKey-RSid index files in hexadecimal format
@@ -4494,10 +4507,10 @@ EXPRESSIONS
-arithmetic operators
+arithmetic operators (addition, multiplication, subtraction, division, modulo)
@@ -4546,14 +4559,18 @@ EXPRESSIONS
-starting with 1.11, the FILTER column can be queried as follows:
+the FILTER column can be queried as follows:
FILTER="PASS"
+FILTER="."
FILTER="A" .. exact match, for example "A;B" does not pass
+FILTER="A;B" .. exact match, "A;B" and "B;A" pass, everything else fails
FILTER!="A" .. exact match, for example "A;B" does pass
-FILTER~"A" .. both "A" and "A;B" pass
-FILTER!~"A" .. neither "A" nor "A;B" pass
+FILTER~"A" .. subset match, for example both "A" and "A;B" pass
+FILTER~"A;B" .. subset match, pass only if both "A" and "B" are present
+FILTER!~"A" .. complement match, for example both "A" and "A;B" fail
+FILTER!~"A;B" .. complement match, fail if both "A" and "B" are present
@@ -5019,7 +5036,7 @@ COPYING