Skip to content

Commit

Permalink
Add bam_aux_first()/bam_aux_next() tagged aux field iterator API
Browse files Browse the repository at this point in the history
Add new API functions for iterating through a BAM record's aux fields,
inline accessor methods for field tag and type (or code can continue
to use s-2 and *s), and a variant of bam_aux_del() that returns the
(updated) iterator to the following field (for use in iterator-based
loops that delete fields).

Add test cases for the new API functions.
  • Loading branch information
jmarshall committed Aug 20, 2022
1 parent 203f5bb commit ac3beaa
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 33 deletions.
53 changes: 52 additions & 1 deletion htslib/sam.h
Original file line number Diff line number Diff line change
Expand Up @@ -1438,7 +1438,6 @@ int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b,

/// Converts a BAM aux tag to SAM format
/*
* @param b Pointer to the bam record
* @param key Two letter tag key
* @param type Single letter type code: ACcSsIifHZB.
* @param tag Tag data pointer, in BAM format
Expand Down Expand Up @@ -1628,6 +1627,29 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
return NULL;
}

/// Return a pointer to a BAM record's first aux field
/** @param b Pointer to the BAM record
@return Aux field pointer, or NULL if the record has none
When NULL is returned, errno will also be set to ENOENT. ("Aux field pointers"
point to the TYPE byte within the auxiliary data for that field; but in general
it is unnecessary for user code to be aware of this.)
*/
HTSLIB_EXPORT
uint8_t *bam_aux_first(const bam1_t *b);

/// Return a pointer to a BAM record's next aux field
/** @param b Pointer to the BAM record
@param s Aux field pointer, as returned by bam_aux_first()/_next()/_get()
@return Pointer to the next aux field, or NULL if no next field or error
Whenever NULL is returned, errno will also be set: ENOENT if @p s was the
record's last aux field; otherwise EINVAL, indicating that the BAM record's
aux data is corrupt.
*/
HTSLIB_EXPORT
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s);

/// Return a pointer to an aux record
/** @param b Pointer to the bam record
@param tag Desired aux tag
Expand All @@ -1640,6 +1662,19 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
HTSLIB_EXPORT
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);

/// Get the two-character tag of an aux field
/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get()
    @return  Pointer to the two tag characters; the tag is NOT NUL-terminated

Aux field pointers address the field's type byte; the tag is stored in the
two bytes immediately before it.
*/
static inline const char *bam_aux_tag(const uint8_t *s)
{
    const uint8_t *tag_start = s - 2;
    return (const char *) tag_start;
}

/// Get the type character of an aux field
/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get()
    @return  The type character: one of cCsSiI/fd/A/Z/H/B

The type character is the byte that the aux field pointer itself addresses.
*/
static inline char bam_aux_type(const uint8_t *s)
{
    return (char) s[0];
}

/// Return a SAM formatting string containing a BAM tag
/** @param b Pointer to the bam record
@param tag Desired aux tag
Expand Down Expand Up @@ -1751,6 +1786,22 @@ int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8
HTSLIB_EXPORT
int bam_aux_del(bam1_t *b, uint8_t *s);

/// Delete an aux field from a BAM record
/* @param b The BAM record to update
@param s Pointer to the aux field to delete, as returned by
bam_aux_first()/_next()/_get()
@return Pointer to the following aux field, or NULL if none or on error
Identical to @c bam_aux_del() apart from the return value, which is an
aux iterator suitable for use with @c bam_aux_next()/etc.
Whenever NULL is returned, errno will also be set: ENOENT if the aux field
deleted was the record's last one; otherwise EINVAL, indicating that the
BAM record's aux data is corrupt.
*/
HTSLIB_EXPORT
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s);

/// Update or add a string-type tag
/* @param b The bam record to update
@param tag Tag identifier
Expand Down
78 changes: 46 additions & 32 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -4614,31 +4614,42 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
}
}

/* Return an aux field pointer (pointing at the field's type byte) for the
 * first aux field in record b, or NULL with errno set to ENOENT if the
 * record has no aux field.
 *
 * A field needs at least its two tag bytes plus a type byte.  If fewer than
 * three bytes remain in the aux area, s+2 would point at or beyond the end
 * of the record's data and callers reading the type byte would run off the
 * buffer, so such a tail is reported as "no field" rather than returned.
 */
uint8_t *bam_aux_first(const bam1_t *b)
{
    uint8_t *s = bam_get_aux(b);
    uint8_t *end = b->data + b->l_data;
    if (end - s <= 2) { errno = ENOENT; return NULL; }
    return s+2;
}

/* Return an aux field pointer for the field following s in record b.
 *
 * s is an aux field pointer (pointing at a field's type byte); a NULL s is
 * treated as already being at the end of the aux data.
 *
 * Returns NULL with errno set to ENOENT when there is no following field,
 * or NULL with errno set to EINVAL (after logging an error) when skip_aux()
 * returns NULL for the current field, which is taken to mean corrupt data.
 *
 * A following field needs at least two tag bytes plus a type byte.  If fewer
 * than three bytes remain, next+2 would point at or beyond the end of the
 * record's data, so such a tail is reported as "no further field".
 */
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
{
    uint8_t *end = b->data + b->l_data;
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
    if (next == NULL) goto bad_aux;
    if (end - next <= 2) { errno = ENOENT; return NULL; }
    return next+2;

 bad_aux:
    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
    errno = EINVAL;
    return NULL;
}

uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
{
uint8_t *s, *end, *t = (uint8_t *) tag;
uint16_t y = (uint16_t) t[0]<<8 | t[1];
s = bam_get_aux(b);
end = b->data + b->l_data;
while (s != NULL && end - s >= 3) {
uint16_t x = (uint16_t) s[0]<<8 | s[1];
s += 2;
if (x == y) {
uint8_t *s;
for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
if (s[-2] == tag[0] && s[-1] == tag[1]) {
// Check the tag value is valid and complete
uint8_t *e = skip_aux(s, end);
if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
goto bad_aux; // Unterminated string
}
if (e != NULL) {
return s;
} else {
goto bad_aux;
}
uint8_t *e = skip_aux(s, b->data + b->l_data);
if (e == NULL) goto bad_aux;
if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;

return s;
}
s = skip_aux(s, end);
}
if (s == NULL) goto bad_aux;
errno = ENOENT;

// errno now as set by bam_aux_first()/bam_aux_next()
return NULL;

bad_aux:
Expand All @@ -4647,23 +4658,26 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
return NULL;
}

/* Delete the aux field s from record b.
 *
 * s MUST BE an aux field pointer, as returned by
 * bam_aux_first()/_next()/_get().
 *
 * Delegates to bam_aux_remove() and maps its result onto an int status:
 * returns 0 when the field was removed (including the case where
 * bam_aux_remove() returns NULL with errno == ENOENT because the deleted
 * field was the last one), and -1 for any other NULL return.
 */
int bam_aux_del(bam1_t *b, uint8_t *s)
{
    s = bam_aux_remove(b, s);
    return (s || errno == ENOENT)? 0 : -1;
}

/* Delete the aux field s from record b and return an aux field pointer to
 * the field that followed it.
 *
 * s is an aux field pointer (pointing at the field's type byte), so the
 * field being deleted starts at s-2.
 *
 * Returns s when a following field exists: that field is moved down to
 * start at s-2, so its type byte ends up at s.  Returns NULL with errno set
 * to ENOENT when the deleted field was the last one.  Returns NULL with
 * errno set to EINVAL (after logging an error) when skip_aux() returns NULL,
 * which is taken to mean corrupt aux data; the record is left unmodified in
 * that case.
 */
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
{
    uint8_t *end = b->data + b->l_data;
    uint8_t *next = skip_aux(s, end);
    if (next == NULL) goto bad_aux;

    // Drop the deleted field's tag, type and value bytes from the record
    b->l_data -= next - (s-2);

    // Nothing follows the deleted field, so there is nothing to move down
    if (next >= end) { errno = ENOENT; return NULL; }

    memmove(s-2, next, end - next);
    return s;

 bad_aux:
    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
    errno = EINVAL;
    return NULL;
}

int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
Expand Down
39 changes: 39 additions & 0 deletions test/sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,15 @@ uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type)
return NULL;
}

static void check_aux_count(const bam1_t *aln, int expected, const char *what)
{
const uint8_t *itr;
int n = 0;
for (itr = bam_aux_first(aln); itr; itr = bam_aux_next(aln, itr)) n++;
if (n != expected)
fail("%s has %d aux fields, expected %d", what, n, expected);
}

static void check_int_B_array(bam1_t *aln, char *tag,
uint32_t nvals, int64_t *vals) {
uint8_t *p;
Expand Down Expand Up @@ -285,10 +294,30 @@ static int aux_fields1(void)
if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k')
fail("XA field is '%c', expected 'k'", bam_aux2A(p));

check_aux_count(aln, 24, "Original record");

bam_aux_del(aln,p);
if (bam_aux_get(aln,"XA"))
fail("XA field was not deleted");

check_aux_count(aln, 23, "Record post-XA-deletion");

p = bam_aux_get(aln, "Y2");
if (p == NULL || strncmp(bam_aux_tag(p), "Y2", 2) != 0 || bam_aux_type(p) != 'i')
fail("bam_aux_get() missed Y2 field");

p = bam_aux_next(aln, p);
if (p == NULL || strncmp(bam_aux_tag(p), "Y3", 2) != 0 || bam_aux_type(p) != 'c')
fail("bam_aux_next() missed Y3 field");

p = bam_aux_get(aln, "Y8");
if (p == NULL || strncmp(bam_aux_tag(p), "Y8", 2) != 0 || bam_aux_type(p) != 'I')
fail("bam_aux_get() missed Y8 field");

p = bam_aux_next(aln, p);
if (p != NULL || errno != ENOENT)
fail("bam_aux_next missed the end of fields");

if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37)
fail("Xi field is %"PRId64", expected 37", bam_aux2i(p));

Expand Down Expand Up @@ -492,6 +521,16 @@ static int aux_fields1(void)

if (strcmp(ks.s, r1) != 0)
fail("record formatted incorrectly: \"%s\"", ks.s);

// Test field removal APIs -- done after the strcmp(..., r1) check so that
// the check also covers the formatting of the to-be-removed fields.

p = bam_aux_remove(aln, check_bam_aux_get(aln, "XH", 'H'));
if (bam_aux_get(aln, "XH"))
fail("XH field was not removed");
check_aux_count(aln, 31, "Record post-XH-removal");
if (strncmp(bam_aux_tag(p), "XB", 2) != 0 || bam_aux_type(p) != 'B')
fail("bam_aux_remove() missed XB field");
}
else fail("can't read record");

Expand Down

0 comments on commit ac3beaa

Please sign in to comment.