Skip to content

Commit

Permalink
Add a cF CRAM specific tag.
Browse files Browse the repository at this point in the history
This is used to indicate when MD and NM were not present and should
not be regenerated during decode.

Bit 1 is set when MD shouldn't be produced and bit 2 when NM shouldn't
be.  In both cases this tag is only created when embed_ref=2 and MD
and/or NM is absent from the input data.  In this scenario we cannot
reproduce the reference from SEQ+MD and cannot therefore be certain
that the value reproduced is correct.  E.g. if the reference is
produced by consensus alone, then MD is a diff of this read vs
consensus and not this read vs the original reference used by the
aligner.

The cF tag is automatically stripped out during decode, but only with
this version of htslib and above.  Older versions will just emit a
private-space aux tag (which hopefully is harmless except for the
unlikely event of it clashing with another private name-space tool).
  • Loading branch information
jkbonfield committed May 31, 2022
1 parent bdb088f commit 7039fc9
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 2 deletions.
18 changes: 18 additions & 0 deletions cram/cram_decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2037,12 +2037,30 @@ static int cram_decode_aux(cram_fd *fd,
m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
if (!m)
return -1;

BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);

if (!m->codec) return -1;
r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
if (r) break;
cr->aux_size += out_sz + 3;

// cF CRAM flags.
if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) {
// Remove cF tag
uint8_t cF = BLOCK_END(s->aux_blk)[-1];
BLOCK_SIZE(s->aux_blk) -= out_sz+3;
cr->aux_size -= out_sz+3;

// bit 1 => don't auto-decode MD.
// Pretend MD is present verbatim, so we don't auto-generate
if ((cF & 1) && has_MD && *has_MD == 0)
*has_MD = 1;

// bit 2 => don't auto-decode NM
if ((cF & 2) && has_NM && *has_NM == 0)
*has_NM = 1;
}
}
}

Expand Down
31 changes: 29 additions & 2 deletions cram/cram_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2229,7 +2229,7 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
cram_slice *s, cram_record *cr,
int verbatim_NM, int verbatim_MD,
int NM, kstring_t *MD,
int NM, kstring_t *MD, int cf_tag,
int *err) {
char *aux, *orig, *rg = NULL;
int aux_size = bam_get_l_aux(b);
Expand All @@ -2242,6 +2242,24 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,

orig = aux = (char *)bam_aux(b);


// cF:i => Extra CRAM bit flags.
// 1: Don't auto-decode MD (may be invalid)
// 2: Don't auto-decode NM (may be invalid)
if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) {
// Temporary copy of aux so we can amend it.
aux = malloc(aux_size+4);
if (!aux)
return NULL;

memcpy(aux, orig, aux_size);
aux[aux_size++] = 'c';
aux[aux_size++] = 'F';
aux[aux_size++] = 'C';
aux[aux_size++] = cf_tag;
orig = aux;
}

// Copy aux keys to td_b and aux values to slice aux blocks
while (aux - orig < aux_size && aux[0] != 0) {
int r;
Expand Down Expand Up @@ -2604,11 +2622,16 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
goto block_err;

if (orig != (char *)bam_aux(b))
free(orig);

if (err) *err = 0;
return rg;

err:
block_err:
if (orig != (char *)bam_aux(b))
free(orig);
return NULL;
}

Expand Down Expand Up @@ -2975,10 +2998,13 @@ static int process_one_read(cram_fd *fd, cram_container *c,
else
MD->l = 0;

int cf_tag = 0;
if (/*md &&*/ fd->embed_ref == 2) {
// Auto-generate and embed ref
cram_extend_ref(c, b);
cram_build_ref(b, md, c->ref, c->ref_set, c->ref_start, c->ref_end);
cf_tag = MD ? 0 : 1; // No MD
cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM
}

//fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
Expand Down Expand Up @@ -3297,7 +3323,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,

cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
int err = 0;
rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err);
rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
cf_tag, &err);
if (err)
goto block_err;

Expand Down

0 comments on commit 7039fc9

Please sign in to comment.