From 5ae8528d71879008e9c24abc5fe5332b204811f9 Mon Sep 17 00:00:00 2001
From: Rob Davies
Date: Fri, 15 Sep 2023 16:33:00 +0100
Subject: [PATCH 1/2] Fix virtual offset adjustment when indexing on-the-fly

When indexing on-the-fly, the virtual offset of the end of each read
gets passed to hts_idx_push(). As HTSlib starts a new BGZF block if the
next read doesn't fit, the stored offset needs to be adjusted to point
to the start of the next block instead of the end of the old one.
(While the two locations effectively map to the same uncompressed
offset, pointing to the end of a block causes problems for HTSlib
before 1.11, and for HTSJDK.)

When multi-threading, this adjustment was done by calling
bgzf_idx_amend_last() from the main thread. However, if the data was
written out too promptly, the function could be called too late,
preventing the adjustment from being applied. To fix this, the
adjustment is now moved to bgzf_idx_flush(), which is called by the
writer thread, ensuring that it is always applied in time. The internal
function bgzf_idx_amend_last() is removed as it's no longer needed.

Fixes samtools/samtools#1861 (samtools merge --write-index results in
Invalid file pointer [from HTSJDK] in rare cases whereas samtools index
fixes the problem)
---
 bgzf.c | 70 ++++++++++++++++++++++++++++------------------------
 sam.c  |  2 --
 2 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/bgzf.c b/bgzf.c
index a775b1b83..bb7d91128 100644
--- a/bgzf.c
+++ b/bgzf.c
@@ -230,41 +230,8 @@ int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t e
     return 0;
 }
 
-/*
- * bgzf analogue to hts_idx_amend_last.
- *
- * This is needed when multi-threading and writing indices on the fly.
- * At the point of writing a record we know the virtual offset for start
- * and end, but that end virtual offset may be the end of the current
- * block. In standard indexing our end virtual offset becomes the start
- * of the next block. Thus to ensure bit for bit compatibility we
- * detect this boundary case and fix it up here.
- *
- * In theory this has no behavioural change, but it also works around
- * a bug elsewhere which causes bgzf_read to return 0 when our offset
- * is the end of a block rather than the start of the next.
- */
-void bgzf_idx_amend_last(BGZF *fp, hts_idx_t *hidx, uint64_t offset) {
-    mtaux_t *mt = fp->mt;
-    if (!mt) {
-        hts_idx_amend_last(hidx, offset);
-        return;
-    }
-
-    pthread_mutex_lock(&mt->idx_m);
-    hts_idx_cache_t *ic = &mt->idx_cache;
-    if (ic->nentries > 0) {
-        hts_idx_cache_entry *e = &ic->e[ic->nentries-1];
-        if ((offset & 0xffff) == 0 && e->offset != 0) {
-            // bumped to next block number
-            e->offset = 0;
-            e->block_number++;
-        }
-    }
-    pthread_mutex_unlock(&mt->idx_m);
-}
-
-static int bgzf_idx_flush(BGZF *fp) {
+static int bgzf_idx_flush(BGZF *fp,
+                          size_t block_uncomp_len, size_t block_comp_len) {
     mtaux_t *mt = fp->mt;
 
     if (!mt->idx_cache.e) {
@@ -280,6 +247,37 @@ static int bgzf_idx_flush(BGZF *fp) {
     assert(mt->idx_cache.nentries == 0 || mt->block_written <= e[0].block_number);
 
     for (i = 0; i < mt->idx_cache.nentries && e[i].block_number == mt->block_written; i++) {
+        if (block_uncomp_len > 0 && e[i].offset == block_uncomp_len) {
+            /*
+             * If the virtual offset is at the end of the current block,
+             * adjust it to point to the start of the next one. This
+             * is needed when on-the-fly indexing has recorded a virtual
+             * offset just before a new block has been started, and makes
+             * on-the-fly and standard indexing give exactly the same results.
+             *
+             * In theory the two virtual offsets are equivalent, but pointing
+             * to the end of a block is inefficient, and caused problems with
+             * versions of HTSlib before 1.11 where bgzf_read() would
+             * incorrectly return EOF.
+             */
+
+            // Assert that this is the last entry for the current block_number
+            assert(i == mt->idx_cache.nentries - 1
+                   || e[i].block_number < e[i + 1].block_number);
+
+            // Work out where the next block starts. For this entry, the
+            // offset will be zero.
+            uint64_t next_block_addr = mt->block_address + block_comp_len;
+            if (hts_idx_push(mt->hts_idx, e[i].tid, e[i].beg, e[i].end,
+                             next_block_addr << 16, e[i].is_mapped) < 0) {
+                pthread_mutex_unlock(&mt->idx_m);
+                return -1;
+            }
+
+            // Count this entry and drop out of the loop
+            i++;
+            break;
+        }
         if (hts_idx_push(mt->hts_idx, e[i].tid, e[i].beg, e[i].end,
                          (mt->block_address << 16) + e[i].offset,
                          e[i].is_mapped) < 0) {
@@ -1423,7 +1421,7 @@ static void *bgzf_mt_writer(void *vp) {
         }
 
         // Flush any cached hts_idx_push calls
-        if (bgzf_idx_flush(fp) < 0)
+        if (bgzf_idx_flush(fp, j->uncomp_len, j->comp_len) < 0)
             goto err;
 
         if (hwrite(fp->fp, j->comp_data, j->comp_len) != j->comp_len)
diff --git a/sam.c b/sam.c
index de603cf16..80c391939 100644
--- a/sam.c
+++ b/sam.c
@@ -904,8 +904,6 @@ static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
         return -1;
     if (!bfp->mt)
         hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
-    else
-        bgzf_idx_amend_last(bfp, fp->idx, bgzf_tell(bfp));
 
     int ret = bam_write1(bfp, b);
     if (ret < 0)

From b22fdd836e2d391b99805e66e63c9ddbcfe99ba4 Mon Sep 17 00:00:00 2001
From: Rob Davies
Date: Mon, 18 Sep 2023 16:27:21 +0100
Subject: [PATCH 2/2] Fix BCF/VCF on-the-fly indexing issues

* Switch from hts_idx_push() to bgzf_idx_push() for on-the-fly indexing
  of BCF and VCF.bgz files. The latter function is needed to record the
  correct offsets when using multi-threaded BGZF compression.

  Fixes samtools/bcftools#1985

* Only allow indexing of BGZF-compressed files. It's necessary to
  enforce this, as on-the-fly indexing assumes that the file pointer is
  in htsFile::fp.bgzf, but uncompressed VCF uses htsFile::fp.hfile.
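
Note: the on-the-fly indexing path exercised by these fixes is driven
from user code roughly as in the sketch below. This is illustrative
only, not part of the patch: the file names, thread count and min_shift
value are invented, and error handling is abbreviated. It writes a
BGZF-compressed BCF with a multi-threaded writer while building the
index on the fly, i.e. the bcf_write()/bgzf_idx_push() route changed
here.

    #include <htslib/hts.h>
    #include <htslib/vcf.h>

    // Copy in.bcf to out.bcf, building out.bcf.csi on the fly.
    int copy_with_otf_index(void)
    {
        htsFile *in  = hts_open("in.bcf", "r");
        htsFile *out = hts_open("out.bcf", "wb");  // "wb": BGZF-compressed BCF
        if (!in || !out) return -1;

        hts_set_threads(out, 4);                   // multi-threaded BGZF writer

        bcf_hdr_t *hdr = bcf_hdr_read(in);
        if (!hdr || bcf_hdr_write(out, hdr) < 0) return -1;

        // Start on-the-fly indexing; min_shift 14 selects a CSI index
        if (bcf_idx_init(out, hdr, 14, "out.bcf.csi") < 0) return -1;

        bcf1_t *rec = bcf_init();
        while (bcf_read(in, hdr, rec) >= 0) {
            if (bcf_write(out, hdr, rec) < 0)      // records index entries
                return -1;                         // via bgzf_idx_push()
        }

        if (bcf_idx_save(out) < 0) return -1;      // drain queue, write index
        bcf_destroy(rec);
        bcf_hdr_destroy(hdr);
        return (hts_close(out) < 0 || hts_close(in) < 0) ? -1 : 0;
    }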
---
 vcf.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/vcf.c b/vcf.c
index c126f7354..2c8471d17 100644
--- a/vcf.c
+++ b/vcf.c
@@ -2225,7 +2225,8 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
     if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
 
     if (hfp->idx) {
-        if (hts_idx_push(hfp->idx, v->rid, v->pos, v->pos + v->rlen, bgzf_tell(fp), 1) < 0)
+        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
+                          bgzf_tell(fp), 1) < 0)
             return -1;
     }
 
@@ -3959,19 +3960,19 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
 
     if ( fp->format.compression!=no_compression ) {
         if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
             return -1;
-        if (fp->idx)
-            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
         ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
     } else {
         ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
     }
 
-    if (fp->idx) {
+    if (fp->idx && fp->format.compression == bgzf) {
         int tid;
         if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
             return -1;
-        if (hts_idx_push(fp->idx, tid, v->pos, v->pos + v->rlen, bgzf_tell(fp->fp.bgzf), 1) < 0)
+        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
+                          tid, v->pos, v->pos + v->rlen,
+                          bgzf_tell(fp->fp.bgzf), 1) < 0)
             return -1;
     }
 
@@ -4160,6 +4161,11 @@ static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fn
 
 int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx)
 {
     int n_lvls, nids = 0;
+    if (fp->format.compression != bgzf) {
+        hts_log_error("Indexing is only supported on BGZF-compressed files");
+        return -3; // Matches no-compression return for bcf_index_build3()
+    }
+
     if (fp->format.format == vcf)
         return vcf_idx_init(fp, h, min_shift, fnidx);
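
Note on the offset arithmetic above: a BGZF virtual offset packs the
compressed file address of a block into the top 48 bits and the
uncompressed offset within that block into the low 16 bits, so an
offset equal to a block's uncompressed length names the same position
as offset zero in the block that follows. The sketch below (helper
names invented for illustration; not part of the patch) mirrors the
adjustment bgzf_idx_flush() now performs.

    #include <assert.h>
    #include <stdint.h>
    #include <stddef.h>

    // Pack a BGZF virtual offset: block address in the top 48 bits,
    // within-block offset in the low 16 bits.
    static inline uint64_t make_voffset(uint64_t block_addr, uint16_t uoffset)
    {
        return (block_addr << 16) | uoffset;
    }

    // Hypothetical helper mirroring the adjustment in bgzf_idx_flush():
    // an offset at the very end of a finished block is re-expressed as
    // offset zero in the block starting block_comp_len bytes later.
    static uint64_t amend_end_of_block(uint64_t block_addr,
                                       size_t block_uncomp_len,
                                       size_t block_comp_len,
                                       uint64_t uoffset)
    {
        assert(uoffset <= block_uncomp_len);
        if (block_uncomp_len > 0 && uoffset == block_uncomp_len)
            return make_voffset(block_addr + block_comp_len, 0);
        return make_voffset(block_addr, (uint16_t) uoffset);
    }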