From d800f1323a545bfcf2bf8fe3f63b6e430bb5e75d Mon Sep 17 00:00:00 2001 From: pd3 Date: Tue, 14 Nov 2023 21:01:17 +0100 Subject: [PATCH 1/2] Temporary workaround when excessive memory is required by FORMAT fields The BCF limit of ~2GB per VCF row causes the parser to return an error. In this temporary solution we drop the excessive fields; typically these will be PL and other Number=G tags. In future work the library would automatically convert such tags into their localized alternatives (LPL). --- vcf.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/vcf.c b/vcf.c index 301d818cf..1a93bc0e6 100644 --- a/vcf.c +++ b/vcf.c @@ -2202,7 +2202,7 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) if ( hfp->format.format == vcf || hfp->format.format == text_format ) return vcf_write(hfp,h,v); - if ( v->errcode ) + if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4() { // vcf_parse1() encountered a new contig or tag, undeclared in the // header. At this point, the header must have been printed, @@ -3004,9 +3004,12 @@ static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, // malformed VCF data is less likely to take excessive memory and/or // time. 
if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) { - hts_log_error("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); + static int warned = 0; + if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); + warned = 1; v->errcode |= BCF_ERR_LIMITS; - return -1; + f->size = f->offset = 0; + continue; } f->offset = mem->l; @@ -3065,7 +3068,12 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, return -1; } - if (htype == BCF_HT_STR) { + if ( !z->size ) + { + // this field is to be ignored, it's too big + while ( *t != ':' && *t ) t++; + } + else if (htype == BCF_HT_STR) { int l; if (z->is_gt) { // Genotypes. @@ -3237,10 +3245,14 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, const char *p, const char *q, fmt_aux_t *fmt) { kstring_t *str = &v->indiv; - int i; + int i, need_downsize = 0; if (v->n_sample > 0) { for (i = 0; i < v->n_fmt; ++i) { fmt_aux_t *z = &fmt[i]; + if ( !z->size ) { + need_downsize = 1; + continue; + } bcf_enc_int1(str, z->key); if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) { bcf_enc_size(str, z->size, BCF_BT_CHAR); @@ -3257,6 +3269,19 @@ static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } } } + + } + if ( need_downsize ) { + i = 1; + while ( i < v->n_fmt ) { + if ( !fmt[i].size ) + { + memmove(&fmt[i-1],&fmt[i],sizeof(*fmt)); + v->n_fmt--; + } + else + i++; + } } return 0; From f1b5470c13dd50300cd72bd70ff70f930067d22b Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 21 Nov 2023 14:45:58 +0000 Subject: [PATCH 2/2] VCF parsing fix The removal of large tags by d800f1323a545bfcf2bf8fe3f63b6e430bb5e75d inadvertently changed output for invalid rows, such as the one introduced by 8f782d120433eb0321a1c0db6ee49e6bc838563e. 
This was caused by relying on fmt->size==0 never happening. That assumption is correct, I think, but only for valid tags, not for tags that failed parsing. --- vcf.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vcf.c b/vcf.c index 1a93bc0e6..7499ad8b9 100644 --- a/vcf.c +++ b/vcf.c @@ -3008,7 +3008,8 @@ static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); warned = 1; v->errcode |= BCF_ERR_LIMITS; - f->size = f->offset = 0; + f->size = -1; + f->offset = 0; continue; } @@ -3068,7 +3069,7 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, return -1; } - if ( !z->size ) + if ( z->size==-1 ) { // this field is to be ignored, it's too big while ( *t != ':' && *t ) t++; @@ -3249,7 +3250,7 @@ static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if (v->n_sample > 0) { for (i = 0; i < v->n_fmt; ++i) { fmt_aux_t *z = &fmt[i]; - if ( !z->size ) { + if ( z->size==-1 ) { need_downsize = 1; continue; } @@ -3274,7 +3275,7 @@ static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if ( need_downsize ) { i = 1; while ( i < v->n_fmt ) { - if ( !fmt[i].size ) + if ( fmt[i].size==-1 ) { memmove(&fmt[i-1],&fmt[i],sizeof(*fmt)); v->n_fmt--;