diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 86e2ef96e..2b2ad6029 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -3004,8 +3004,8 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, * Returns the used size of the bam record on success * -1 on failure. */ -static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, - cram_record *cr, int rec, bam_seq_t **bam) { +int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, + cram_record *cr, int rec, bam_seq_t **bam) { int ret, rg_len; char name_a[1024], *name; int name_len; @@ -3172,7 +3172,7 @@ static cram_container *cram_first_slice(cram_fd *fd) { return c; } -static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { +cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { cram_container *c_curr; // container being consumed via cram_get_seq() cram_slice *s_curr = NULL; diff --git a/cram/cram_decode.h b/cram/cram_decode.h index 400eb6beb..16d87a073 100644 --- a/cram/cram_decode.h +++ b/cram/cram_decode.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2013, 2018 Genome Research Ltd. +Copyright (c) 2012-2013, 2018, 2024 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -94,6 +94,15 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b); +/*! INTERNAL: + * Loads and decodes the next slice worth of data. + * + * @return + * Returns cram slice pointer on success; + * NULL on failure + */ +cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp); + /*! INTERNAL: * Decode an entire slice from container blocks. Fills out s->crecs[] array. 
* @@ -105,6 +114,22 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, sam_hdr_t *hdr); +/*! INTERNAL: + * Converts a cram in-memory record into a bam in-memory record. We + * pass a pointer to a bam_seq_t pointer along with a pointer to + * the allocated size. These can initially be pointers to NULL and zero. + * + * This function will reallocate the bam buffer as required and update + * (*bam)->alloc accordingly, allowing it to be used within a loop + * efficiently without needing to allocate new bam objects over and + * over again. + * + * Returns the used size of the bam record on success + * -1 on failure. + */ +int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, + cram_record *cr, int rec, bam_seq_t **bam); + /* * Drains and frees the decode read-queue for a multi-threaded reader. */ diff --git a/cram/cram_external.c b/cram/cram_external.c index 7455185ad..1102e8daa 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd. +Copyright (c) 2015, 2018-2020, 2022-2024 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif
 
 #include "../htslib/hfile.h"
+#include "../hfile_internal.h"
 #include "cram.h"
 
 /*
@@ -121,6 +122,20 @@ int cram_container_is_empty(cram_fd *fd) {
     return fd->empty_container;
 }
 
+/*
+ * Copies the container's reference id, start and span into the supplied
+ * pointers.  Any of refid, start and span may be NULL to skip that field.
+ */
+void cram_container_get_coords(cram_container *c,
+                               int *refid, hts_pos_t *start, hts_pos_t *span) {
+    if (refid)
+        *refid = c->ref_seq_id;
+    if (start)
+        *start = c->ref_seq_start;
+    if (span)
+        *span = c->ref_seq_span;
+}
+
 
 /*
  *-----------------------------------------------------------------------------
@@ -683,6 +698,7 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
             cram_free_block(blk);
             return -1;
         }
+
         if (cram_write_block(out, blk) != 0) {
             cram_free_block(blk);
             return -1;
@@ -704,6 +720,220 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
     return 0;
 }
 
+/*
+ * Discards the next containers worth of data.
+ * Only the cram structure has been read so far.
+ *
+ * Returns 0 on success,
+ *        -1 on failure
+ */
+static int cram_skip_container(cram_fd *in, cram_container *c) {
+    // Compression header
+    cram_block *blk;
+    if (!(blk = cram_read_block(in)))
+        return -1;
+    cram_free_block(blk);
+
+    int i;
+    for (i = 0; i < c->num_landmarks; i++) {
+        cram_block_slice_hdr *hdr;
+
+        if (!(blk = cram_read_block(in)))
+            return -1;
+        if (!(hdr = cram_decode_slice_header(in, blk))) {
+            cram_free_block(blk);
+            return -1;
+        }
+        cram_free_block(blk);
+
+        int num_blocks = cram_slice_hdr_get_num_blocks(hdr), j;
+        for (j = 0; j < num_blocks; j++) {
+            blk = cram_read_block(in);
+            if (!blk) {
+                cram_free_slice_header(hdr);
+                return -1;
+            }
+            cram_free_block(blk);
+        }
+        cram_free_slice_header(hdr);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Copies a container, but filtering it down to a specific region,
+ * which has already been set on the 'in' fd.
+ *
+ * This is used in e.g. samtools cat where we specified a region and discover
+ * that a region doesn't entirely span the container, so we have to select
+ * which reads we need to copy out of it.
+ *
+ * If ref_id is non-NULL we also return the last ref_id we filtered.
+ * This can be -2 if it's multi-ref and we observe more than one reference,
+ * and actual ref_id >= -1 if it's multi-ref and we observe just one ref or
+ * it's fixed reference.
+ * It is left at -3 if the filtering loop examined no records.
+ *
+ * Returns 0 on success
+ *        -1 on error
+ */
+int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c,
+                          int *ref_id) {
+    int err = 0, fixed_ref = -3;
+
+    if (ref_id)
+        *ref_id = c->ref_seq_id;
+
+    int rid = in->range.refid == -2 ? -1 : in->range.refid;
+    if (rid != c->ref_seq_id ||
+        in->range.start > c->ref_seq_start + c->ref_seq_span-1)
+        // Except for multi-ref cases
+        if (c->ref_seq_id != -2)
+            return cram_skip_container(in, c);
+
+    // Container compression header
+    cram_block *blk = cram_read_block(in);
+    if (!blk)
+        return -1;
+    c->comp_hdr = cram_decode_compression_header(in, blk);
+    if (!c->comp_hdr) {
+        // comp_hdr is dereferenced below, so a failed decode must stop here.
+        cram_free_block(blk);
+        return -1;
+    }
+    in->ctr = c;
+
+    // If it's multi-ref but a constant ref-id, then we can still do
+    // basic level chromosome filtering.  Similarly multi-ref where we're
+    // _already_ in ref "*" (unmapped) means we can just copy the container
+    // as there are no positions to filter on and "*" sorts to the end.
+    // TODO: how to tell "already in" though?
+    if (c->ref_seq_id == -2) {
+        cram_codec *cd = c->comp_hdr->codecs[DS_RI];
+        if (cd && cd->codec == E_HUFFMAN && cd->u.huffman.ncodes == 1 &&
+            // this check should be always true anyway
+            rid == cd->u.huffman.codes[0].symbol)
+            // We're in multi-ref mode, but actually the entire container
+            // matches.  So if we're in whole-chromosome mode we can just
+            // copy.
+            if (in->range.start <= 1 &&
+                in->range.end >= (INT64_MAX&(0xffffffffULL<<32))) {
+                if (ref_id)
+                    *ref_id = rid;
+                err |= cram_write_container(out, c) < 0;
+                err |= cram_write_block(out, blk) != 0;
+                err |= cram_copy_slice(in, out, c->num_landmarks) != 0;
+                // Release the header block here too, as the normal exit does.
+                cram_free_block(blk);
+                // NOTE(review): in->ctr still points at c on this path, whereas
+                // the normal exit clears it - confirm callers expect that.
+                return -err;
+            }
+    }
+
+    // A simple read-write loop with region filtering automatically due to
+    // an earlier CRAM_OPT_RANGE request.
+    //
+    // We can hit EOF when reaching the end of the range, but we still need
+    // to manually check we don't attempt to read beyond this single container.
+
+    cram_range rng_copy = in->range;
+    in->range.start = INT64_MIN;
+    in->range.end = INT64_MAX;
+
+    bam1_t *b = bam_init1();
+    if (!b)
+        err = 1;
+
+    // c->slice may be NULL (the loop body tests for that), so guard the
+    // dereference here as well.
+    while (!err &&
+           (c->curr_slice < c->max_slice ||
+            (c->slice && c->slice->curr_rec < c->slice->max_rec))) {
+        cram_slice *s;
+        if (c->slice && c->slice->curr_rec < c->slice->max_rec)
+            s = c->slice;
+        else if (c->curr_slice < c->max_slice)
+            s = cram_next_slice(in, &c);
+        else
+            break; // end of container
+        if (!s) {
+            // cram_next_slice failed; nothing to decode, so stop.
+            err = 1;
+            break;
+        }
+        c->slice = s;
+
+        // This is more efficient if we check as a cram record instead of a
+        // bam record as we don't have to parse CIGAR end.
+        cram_record *cr = &c->slice->crecs[c->slice->curr_rec];
+        if (fixed_ref == -3)
+            fixed_ref = cr->ref_id;
+        else if (fixed_ref != cr->ref_id)
+            fixed_ref = -2;
+
+        if (rng_copy.refid != cr->ref_id) {
+            if (rng_copy.refid == -2) {
+                if (cr->ref_id > -1) {
+                    // Want unmapped, but have mapped
+                    c->slice->curr_rec++;
+                    continue;
+                }
+            } else {
+                if (rng_copy.refid > cr->ref_id || rng_copy.refid == -1) {
+                    // multi-ref and not at the correct ref yet
+                    c->slice->curr_rec++;
+                    continue;
+                } else {
+                    // multi-ref and beyond the desired ref
+                    break;
+                }
+            }
+        }
+
+        // Correct ref, but check the desired region
+        if (cr->aend < rng_copy.start) {
+            c->slice->curr_rec++;
+            continue;
+        }
+        if (cr->apos > rng_copy.end)
+            break;
+
+        // Broadly equivalent to cram_get_bam_seq, but starting from 'cr'
+        if (cram_to_bam(in->header, in, s, cr, s->curr_rec++, &b) < 0) {
+            // Don't write out a record we failed to build.
+            err = 1;
+            break;
+        }
+
+        if (cram_put_bam_seq(out, b) < 0) {
+            err |= 1;
+            break;
+        }
+    }
+    if (b)
+        bam_destroy1(b);
+
+    if (ref_id)
+        *ref_id = fixed_ref;
+
+    in->range = rng_copy;
+
+    // Avoids double frees as we stole the container from our other
+    // file descriptor.
+    in->ctr = NULL;
+    in->ctr_mt = NULL;
+
+    // Normalise to 0/1 so that -err below is always 0 or -1.
+    err |= cram_flush(out) != 0;
+    cram_free_block(blk);
+
+    return -err;
+}
+
+
 /*
  * Renumbers RG numbers in a cram compression header.
*
diff --git a/cram/cram_index.c b/cram/cram_index.c
index 639bc4c41..77c953d6c 100644
--- a/cram/cram_index.c
+++ b/cram/cram_index.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2013-2020, 2023 Genome Research Ltd.
+Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd.
 Author: James Bonfield
 
 Redistribution and use in source and binary forms, with or without
@@ -848,3 +848,220 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
 
     return (bgzf_close(fp) >= 0)? 0 : -4;
 }
+
+/*
+ * Internal recursive step for cram_num_containers_between.
+ *
+ * Counts the distinct non-zero offsets in e and its children, using
+ * *last_pos to skip an entry whose offset repeats the previous one.
+ * nct is the number of containers counted before e; it is recorded in
+ * *first (if still unset) and *last for entries whose offset lies in
+ * [cstart, cend], where cend == 0 means no upper limit.
+ *
+ * Returns the number of containers counted within this subtree.
+ */
+static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos,
+                                            int64_t nct,
+                                            off_t cstart, off_t cend,
+                                            int64_t *first, int64_t *last) {
+    int64_t nc = 0, i;
+
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (e->offset >= cstart && (!cend || e->offset <= cend)) {
+                if (first && *first < 0)
+                    *first = nct;
+                if (last)
+                    *last = nct;
+            }
+            nc++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    for (i = 0; i < e->nslice; i++)
+        nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct,
+                                           cstart, cend, first, last);
+
+    return nc;
+}
+
+/*! Returns the number of containers in the CRAM file within given offsets.
+ *
+ * The cstart and cend offsets are the locations of the start of containers
+ * as returned by index_container_offset.
+ *
+ * If non-NULL, first and last will hold the inclusive range of container
+ * numbers, counting from zero.  Both are set to -1 if nothing matched.
+ *
+ * @return
+ * Returns the number of containers, equivalent to *last-*first+1,
+ * or 0 if no indexed container lies within the given offsets.
+ */
+int64_t cram_num_containers_between(cram_fd *fd,
+                                    off_t cstart, off_t cend,
+                                    int64_t *first, int64_t *last) {
+    int64_t nc = 0, i;
+    int64_t last_pos = -99;
+    int64_t l_first = -1, l_last = -1;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc,
+                                           cstart, cend, &l_first, &l_last);
+    }
+
+    if (first)
+        *first = l_first;
+    if (last)
+        *last = l_last;
+
+    // l_first stays at -1 when nothing matched; report that as zero
+    // rather than the (-1) - (-1) + 1 == 1 the plain formula would give.
+    return l_first < 0 ? 0 : l_last - l_first + 1;
+}
+
+/*
+ * Queries the total number of distinct containers in the index.
+ * Note there may be more containers in the file than in the index, as we
+ * are not required to have an index entry for every one.
+ */
+int64_t cram_num_containers(cram_fd *fd) {
+    return cram_num_containers_between(fd, 0, 0, NULL, NULL);
+}
+
+
+/*
+ * Internal recursive step for cram_container_num2offset.
+ *
+ * Walks e and its children, counting distinct container offsets in *nc.
+ * Returns the entry reached when the count equals num, or NULL if this
+ * subtree does not contain it.
+ */
+static cram_index *cram_container_num2offset_(cram_index *e, int64_t num,
+                                              int64_t *last_pos, int64_t *nc) {
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (*nc == num)
+                return e;
+            (*nc)++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    int i;
+    for (i = 0; i < e->nslice; i++) {
+        cram_index *tmp = cram_container_num2offset_(&e->e[i], num,
+                                                     last_pos, nc);
+        if (tmp)
+            return tmp;
+    }
+
+    return NULL;
+}
+
+/*! Returns the byte offset for the start of the n^th container.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+off_t cram_container_num2offset(cram_fd *fd, int64_t num) {
+    int64_t nc = 0; // 64-bit to match num; an int would narrow the API type
+    int i;
+    int64_t last_pos = -9;
+    cram_index *e = NULL;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        if (!fd->index[j].nslice)
+            continue;
+        if ((e = cram_container_num2offset_(&fd->index[j], num,
+                                            &last_pos, &nc)))
+            break;
+    }
+
+    return e ? e->offset : -1;
+}
+
+
+/*
+ * Internal recursive step for cram_container_offset2num.
+ *
+ * Walks e and its children, counting distinct container offsets in *nc.
+ * Returns the first entry whose offset is >= pos, leaving its container
+ * number in *nc, or NULL if this subtree has no such entry.
+ */
+static cram_index *cram_container_offset2num_(cram_index *e, off_t pos,
+                                              int64_t *last_pos, int64_t *nc) {
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (e->offset >= pos)
+                return e;
+            (*nc)++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    int i;
+    for (i = 0; i < e->nslice; i++) {
+        cram_index *tmp = cram_container_offset2num_(&e->e[i], pos,
+                                                     last_pos, nc);
+        if (tmp)
+            return tmp;
+    }
+
+    return NULL;
+}
+
+/*! Returns the container number for the first container at offset >= pos.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+int64_t cram_container_offset2num(cram_fd *fd, off_t pos) {
+    int64_t nc = 0; // 64-bit to match the return type
+    int i;
+    int64_t last_pos = -9;
+    cram_index *e = NULL;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        if (!fd->index[j].nslice)
+            continue;
+        if ((e = cram_container_offset2num_(&fd->index[j], pos,
+                                            &last_pos, &nc)))
+            break;
+    }
+
+    return e ? nc : -1;
+}
+
+/*!
+ * Returns the file offsets of CRAM containers covering a specific region
+ * query.  Note both offsets are the START of the container.
+ *
+ * first will point to the start of the first overlapping container
+ * last will point to the start of the last overlapping container
+ *
+ * Returns 0 on success
+ *        <0 on failure
+ */
+int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
+                       off_t *first, off_t *last) {
+    cram_index *ci;
+
+    if (first) {
+        if (!(ci = cram_index_query(fd, refid, start, NULL)))
+            return -1;
+        *first = ci->offset;
+    }
+
+    if (last) {
+        if (!(ci = cram_index_query_last(fd, refid, end)))
+            return -1;
+        *last = ci->offset;
+    }
+
+    return 0;
+}
diff --git a/htslib/cram.h b/htslib/cram.h
index e0b51839c..841e4a9b6 100644
--- a/htslib/cram.h
+++ b/htslib/cram.h
@@ -209,6 +209,11 @@
 HTSLIB_EXPORT
 int cram_container_is_empty(cram_fd *fd);
 
+/* Returns chromosome and start/span from container struct */
+HTSLIB_EXPORT
+void cram_container_get_coords(cram_container *c,
+                               int *refid, hts_pos_t *start, hts_pos_t *span);
+
 /*
*----------------------------------------------------------------------------- * cram_block @@ -329,6 +334,18 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, HTSLIB_EXPORT int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice); +/* + * Copies a container, but filtering it down to a specific region (as + * already specified in 'in') + * + * Returns 0 on success + * -1 on error, as documented at the definition in cram_external.c + * NOTE(review): no distinct EOF return is visible there - confirm + */ +HTSLIB_EXPORT +int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c, + int *ref_id); + /* * Decodes a CRAM block compression header. * Returns header ptr on success @@ -744,6 +761,62 @@ static inline void sam_hdr_free(SAM_hdr *hdr) { sam_hdr_destroy(hdr); } HTSLIB_EXPORT refs_t *cram_get_refs(htsFile *fd); +/*! + * Returns the file offsets of CRAM slices covering a specific region + * query. Note both offsets are the START of the slice. + * + * first will point to the start of the first overlapping slice + * last will point to the start of the last overlapping slice + * + * @return + * Returns 0 on success + * <0 on failure + */ +HTSLIB_EXPORT +int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end, + off_t *first, off_t *last); + +/*! Returns the total number of containers in the CRAM index. + * + * Note the index is not required to have an entry for every container, but it + * will always have an index entry for the start of each chromosome. + * (Although in practice our indices do contain one entry per container.) + * + * This is equivalent to cram_num_containers_between(fd, 0, 0, NULL, NULL) + */ +HTSLIB_EXPORT +int64_t cram_num_containers(cram_fd *fd); + +/*! Returns the number of containers in the CRAM index within given offsets. + * + * The cstart and cend offsets are the locations of the start of containers + * as returned by index_container_offset. + * + * If non-NULL, first and last will hold the inclusive range of container + * numbers, counting from zero. 
+ * + * @return + * Returns the number of containers, equivalent to *last-*first+1. + */ +HTSLIB_EXPORT +int64_t cram_num_containers_between(cram_fd *fd, + off_t cstart, off_t cend, + int64_t *first, int64_t *last); + +/*! Returns the byte offset for the start of the n^th container. + * + * The index must have previously been loaded, otherwise <0 is returned. + */ +HTSLIB_EXPORT +off_t cram_container_num2offset(cram_fd *fd, int64_t n); + +/*! Returns the container number for the first container at offset >= pos. + * + * The index must have previously been loaded, otherwise <0 is returned. + */ +HTSLIB_EXPORT +int64_t cram_container_offset2num(cram_fd *fd, off_t pos); + /**@}*/ #ifdef __cplusplus