Skip to content

Commit

Permalink
Provide extra CRAM container manipulations and index queries.
Browse files Browse the repository at this point in the history
Added to support extra functionality to `samtools cat`.

- Some internal cram functions are no longer static as they're called
  from cram_external.c, but they don't have HTSLIB_EXPORT and aren't
  an official part of the API.
  These are cram_to_bam, cram_next_slice

- New public CRAM APIs:
  These facilitate manipulation at the container level: both seeking
  to specific byte offsets and specifying containers by their position
  (the n^th container) in the index.

  cram_container_get_coords returns refid, start and span fields from
  the opaque cram_container struct.

  cram_filter_container copies a container but applies region based
  filtering, as already specified in the cram_fd with a range request.
  (Note we currently also provide cram_copy_slice, but may want to add
  a cram_copy_container for consistency.)

  cram_index_extents queries an index to return byte offsets of the
  first and last container overlapping a specified region.

  cram_num_containers_between queries an index to report the number of
  indexed containers and their container numbers (starting at 0 for
  the first) covering a range.

  cram_num_containers is a simplified cram_num_containers_between
  doing only the counting operation and on the entire file.

  cram_container_num2offset returns the byte offset for the n^th
  container.  cram_container_offset2num does the reverse.

- A new cram_skip_container function, which is currently internal only
  but may potentially have use externally in the future.  It's used by
  cram_filter_container when it detects it'll filter out everything.

- cram_index_query now copes with HTS_IDX_NOCOOR (-2) and maps it
  over to refid -1.
  • Loading branch information
jkbonfield authored and whitwham committed May 3, 2024
1 parent ab7c09f commit 7576aca
Show file tree
Hide file tree
Showing 5 changed files with 493 additions and 7 deletions.
8 changes: 4 additions & 4 deletions cram/cram_decode.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd.
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
Author: James Bonfield <jkb@sanger.ac.uk>
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -3004,8 +3004,8 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
* Returns the used size of the bam record on success
* -1 on failure.
*/
static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
cram_record *cr, int rec, bam_seq_t **bam) {
int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
cram_record *cr, int rec, bam_seq_t **bam) {
int ret, rg_len;
char name_a[1024], *name;
int name_len;
Expand Down Expand Up @@ -3172,7 +3172,7 @@ static cram_container *cram_first_slice(cram_fd *fd) {
return c;
}

static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
cram_container *c_curr; // container being consumed via cram_get_seq()
cram_slice *s_curr = NULL;

Expand Down
27 changes: 26 additions & 1 deletion cram/cram_decode.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2012-2013, 2018 Genome Research Ltd.
Copyright (c) 2012-2013, 2018, 2024 Genome Research Ltd.
Author: James Bonfield <jkb@sanger.ac.uk>
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -94,6 +94,15 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b);


/*! INTERNAL:
* Loads and decodes the next slice worth of data.
*
* @return
* Returns cram slice pointer on success;
* NULL on failure
*/
cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp);

/*! INTERNAL:
* Decode an entire slice from container blocks. Fills out s->crecs[] array.
*
Expand All @@ -105,6 +114,22 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
sam_hdr_t *hdr);


/*! INTERNAL:
* Converts a cram in-memory record into a bam in-memory record. We
* pass a pointer to a bam_seq_t pointer along with a pointer to
* the allocated size. These can initially be pointers to NULL and zero.
*
* This function will reallocate the bam buffer as required and update
* (*bam)->alloc accordingly, allowing it to be used within a loop
* efficiently without needing to allocate new bam objects over and
* over again.
*
* Returns the used size of the bam record on success
* -1 on failure.
*/
int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
cram_record *cr, int rec, bam_seq_t **bam);

/*
* Drains and frees the decode read-queue for a multi-threaded reader.
*/
Expand Down
200 changes: 199 additions & 1 deletion cram/cram_external.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd.
Copyright (c) 2015, 2018-2020, 2022-2024 Genome Research Ltd.
Author: James Bonfield <jkb@sanger.ac.uk>
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

#include "../htslib/hfile.h"
#include "../hfile_internal.h"
#include "cram.h"

/*
Expand Down Expand Up @@ -121,6 +122,16 @@ int cram_container_is_empty(cram_fd *fd) {
return fd->empty_container;
}

/*
 * Reports the reference coordinates recorded in a container.
 *
 * refid, start and span are each optional outputs: a NULL pointer means
 * the caller is not interested in that value and nothing is written
 * through it.  The values are copied from the container's ref_seq_id,
 * ref_seq_start and ref_seq_span fields respectively.
 */
void cram_container_get_coords(cram_container *c,
                               int *refid, hts_pos_t *start, hts_pos_t *span) {
    if (refid != NULL) {
        *refid = c->ref_seq_id;
    }

    if (start != NULL) {
        *start = c->ref_seq_start;
    }

    if (span != NULL) {
        *span = c->ref_seq_span;
    }
}


/*
*-----------------------------------------------------------------------------
Expand Down Expand Up @@ -683,6 +694,7 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
cram_free_block(blk);
return -1;
}

if (cram_write_block(out, blk) != 0) {
cram_free_block(blk);
return -1;
Expand All @@ -704,6 +716,192 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
return 0;
}

/*
 * Reads and throws away the remainder of the container described by 'c'.
 *
 * On entry nothing beyond the container structure itself has been consumed
 * from 'in'.  This consumes the compression header block and then, for
 * every landmark, a slice header block followed by the number of blocks
 * that slice header declares.
 *
 * Returns 0 on success,
 *        -1 on failure
 */
static int cram_skip_container(cram_fd *in, cram_container *c) {
    cram_block *b;
    int slice;

    // The compression header block comes first; read it and drop it.
    b = cram_read_block(in);
    if (b == NULL)
        return -1;
    cram_free_block(b);

    for (slice = 0; slice < c->num_landmarks; slice++) {
        // Slice header: only needed to learn how many blocks follow.
        b = cram_read_block(in);
        if (b == NULL)
            return -1;

        cram_block_slice_hdr *shdr = cram_decode_slice_header(in, b);
        // The raw block is finished with whether or not decoding worked.
        cram_free_block(b);
        if (shdr == NULL)
            return -1;

        // Consume every block belonging to this slice.
        int nblk = cram_slice_hdr_get_num_blocks(shdr);
        int ok = 1, k;
        for (k = 0; ok && k < nblk; k++) {
            b = cram_read_block(in);
            if (b != NULL)
                cram_free_block(b);
            else
                ok = 0;
        }

        cram_free_slice_header(shdr);
        if (!ok)
            return -1;
    }

    return 0;
}


/*
 * Copies a container, but filtering it down to a specific region,
 * which has already been set on the 'in' fd.
 *
 * This is used in e.g. samtools cat where we specified a region and discover
 * that a region doesn't entirely span the container, so we have to select
 * which reads we need to copy out of it.
 *
 * 'c' is the container structure for the data about to be read from 'in';
 * the compression header block and slices are consumed by this function.
 * 'c' remains owned by the caller: in->ctr is only borrowed for the
 * duration of the call and is cleared again before returning.
 *
 * If ref_id is non-NULL we also return the last ref_id we filtered.
 * This can be -2 if it's multi-ref and we observe more than one reference,
 * and actual ref_id >= -1 if it's multi-ref and we observe just one ref or
 * it's fixed reference.  It is -3 if the record loop finished without
 * examining any record.
 *
 * Returns 0 on success
 *        -1 on error
 */
int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c,
                          int *ref_id) {
    int err = 0, fixed_ref = -3;

    if (ref_id)
        *ref_id = c->ref_seq_id;

    // A range refid of -2 requests unmapped data, which containers
    // store as ref_seq_id -1.
    int rid = in->range.refid == -2 ? -1 : in->range.refid;
    if (rid != c->ref_seq_id ||
        in->range.start > c->ref_seq_start + c->ref_seq_span-1) {
        // Except for multi-ref cases
        if (c->ref_seq_id != -2)
            return cram_skip_container(in, c);
    }

    // Container compression header
    cram_block *blk = cram_read_block(in);
    if (!blk)
        return -1;
    c->comp_hdr = cram_decode_compression_header(in, blk);
    if (!c->comp_hdr) {
        // Everything below dereferences comp_hdr, so give up now.
        cram_free_block(blk);
        return -1;
    }
    in->ctr = c;

    // If it's multi-ref but a constant ref-id, then we can still do
    // basic level chromosome filtering.  Similarly multi-ref where we're
    // _already_ in ref "*" (unmapped) means we can just copy the container
    // as there are no positions to filter on and "*" sorts to the end.
    // TODO: how to tell "already in" though?
    if (c->ref_seq_id == -2) {
        cram_codec *cd = c->comp_hdr->codecs[DS_RI];
        if (cd && cd->codec == E_HUFFMAN && cd->u.huffman.ncodes == 1 &&
            // this check should be always true anyway
            rid == cd->u.huffman.codes[0].symbol) {
            // We're in multi-ref mode, but actually the entire container
            // matches.  So if we're in whole-chromosome mode we can just
            // copy.
            if (in->range.start <= 1 &&
                in->range.end >= (INT64_MAX&(0xffffffffULL<<32))) {
                if (ref_id)
                    *ref_id = rid;

                // Fold every status into a 0/1 flag so that the final
                // result is always exactly 0 or -1.
                err |= cram_write_container(out, c) < 0;
                err |= cram_write_block(out, blk) != 0;
                err |= cram_copy_slice(in, out, c->num_landmarks) != 0;

                cram_free_block(blk);
                // The container belongs to the caller; don't leave the
                // fd pointing at it or it may be freed twice.
                in->ctr = NULL;
                return err ? -1 : 0;
            }
        }
    }

    // A simple read-write loop with region filtering automatically due to
    // an earlier CRAM_OPT_RANGE request.
    //
    // We can hit EOF when reaching the end of the range, but we still need
    // to manually check we don't attempt to read beyond this single container.

    bam1_t *b = bam_init1();
    if (!b) {
        in->ctr = NULL;
        cram_free_block(blk);
        return -1;
    }

    // Widen the range on the fd while decoding; the filtering below is
    // done by hand against the saved copy.
    cram_range rng_copy = in->range;
    in->range.start = INT64_MIN;
    in->range.end   = INT64_MAX;

    while (c->curr_slice < c->max_slice ||
           (c->slice && c->slice->curr_rec < c->slice->max_rec)) {
        cram_slice *s;
        if (c->slice && c->slice->curr_rec < c->slice->max_rec) {
            s = c->slice;
        } else if (c->curr_slice < c->max_slice) {
            s = cram_next_slice(in, &c);
            if (!s) {
                // Failed to load the next slice
                err |= 1;
                break;
            }
        } else {
            break; // end of container
        }
        c->slice = s;

        // This is more efficient if we check as a cram record instead of a
        // bam record as we don't have to parse CIGAR end.
        cram_record *cr = &c->slice->crecs[c->slice->curr_rec];
        if (fixed_ref == -3)
            fixed_ref = cr->ref_id;
        else if (fixed_ref != cr->ref_id)
            fixed_ref = -2;

        if (rng_copy.refid != cr->ref_id) {
            if (rng_copy.refid == -2) {
                if (cr->ref_id > -1) {
                    // Want unmapped, but have mapped
                    c->slice->curr_rec++;
                    continue;
                }
            } else {
                if (rng_copy.refid > cr->ref_id || rng_copy.refid == -1) {
                    // multi-ref and not at the correct ref yet
                    c->slice->curr_rec++;
                    continue;
                } else {
                    // multi-ref and beyond the desired ref
                    break;
                }
            }
        }

        // Correct ref, but check the desired region
        if (cr->aend < rng_copy.start) {
            c->slice->curr_rec++;
            continue;
        }
        if (cr->apos > rng_copy.end)
            break;

        // Broadly equivalent to cram_get_bam_seq, but starting from 'cr'
        if (cram_to_bam(in->header, in, s, cr, s->curr_rec++, &b) < 0) {
            // Don't write out a record we failed to build.
            err |= 1;
            break;
        }

        if (cram_put_bam_seq(out, b) < 0) {
            err |= 1;
            break;
        }
    }
    bam_destroy1(b);

    if (ref_id)
        *ref_id = fixed_ref;

    in->range = rng_copy;

    // Avoids double frees as we stole the container from our other
    // file descriptor.
    in->ctr = NULL;
    in->ctr_mt = NULL;

    err |= cram_flush(out) != 0;
    cram_free_block(blk);

    return err ? -1 : 0;
}


/*
* Renumbers RG numbers in a cram compression header.
*
Expand Down
Loading

0 comments on commit 7576aca

Please sign in to comment.