Skip to content

Commit

Permalink
Improve support for MM .? modifiers.
Browse files Browse the repository at this point in the history
The previous commit permitted these to exist, but didn't make the data
available to the caller.

This extends the API with additional queries to distinguish the
specifics about the modification types present.
  • Loading branch information
jkbonfield committed Apr 29, 2022
1 parent e51f72f commit 612cf6f
Show file tree
Hide file tree
Showing 10 changed files with 309 additions and 20 deletions.
38 changes: 38 additions & 0 deletions htslib/sam.h
Original file line number Diff line number Diff line change
Expand Up @@ -2271,6 +2271,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
hts_base_mod *mods, int n_mods);


/// Returns data about a specific modification type for the alignment record.
/**
 * Looks the code up in the modification state previously filled in by
 * bam_parse_basemod().  If the same code was recorded more than once
 * (for example on both strands), only the first matching entry is reported.
 *
 * @param state      The base modification state pointer.
 * @param code       Modification code.  If positive this is a character code,
 *                   if negative it is a negated ChEBI number.
 * @param strand     Returned.  Boolean for top (0) or bottom (1) strand.
 * @param implicit   Returned.  Boolean: 1 if unlisted positions should be
 *                   implicitly assumed to be unmodified ('.' or no marker),
 *                   0 if they require an explicit score and should
 *                   otherwise be considered as unknown ('?' marker).
 * @param canonical  Returned.  Canonical base type associated with this
 *                   modification, as a single character.
 *
 * @return 0 on success or -1 if not found.  The strand, implicit and canonical
 *         fields are filled out if passed in as non-NULL pointers; they are
 *         left untouched when the code is not found.
 */
HTSLIB_EXPORT
int bam_mods_query_type(hts_base_mod_state *state, int code,
int *strand, int *implicit, char *canonical);

/// Returns the list of base modification codes provided for this
/// alignment record as an array of character codes (+ve) or ChEBI numbers
/// (negative).
/**
 * @param state      The base modification state pointer.
 * @param ntype      Filled out with the number of array elements returned.
 *                   Must not be NULL; it is written unconditionally.
 *
 * @return the type array, with *ntype filled out with the size.
 *         The array belongs to the state and should not be freed.
 *         It is a valid pointer until the state is freed using
 *         hts_base_mod_free().
 */
HTSLIB_EXPORT
int *bam_mods_recorded(hts_base_mod_state *state, int *ntype);

#ifdef __cplusplus
}
#endif
Expand Down
53 changes: 48 additions & 5 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -6092,6 +6092,7 @@ struct hts_base_mod_state {
char *MMend[MAX_BASE_MOD]; // end of pos-delta string
uint8_t *ML[MAX_BASE_MOD]; // next qual
int MLstride[MAX_BASE_MOD]; // bytes between quals for this type
int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified?
int seq_pos; // current position along sequence
int nmods; // used array size (0 to MAX_BASE_MOD-1).
};
Expand Down Expand Up @@ -6160,6 +6161,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {

char *cp = (char *)mm+1;
int mod_num = 0;
int implicit = 1;
while (*cp) {
for (; *cp; cp++) {
// cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*;
Expand Down Expand Up @@ -6192,16 +6194,15 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
if (*cp == '\0')
return -1;
}

me = cp;

// Optional explicit vs implicit marker.
// Right now we ignore this field. A proper API for
// querying it will follow later.
// Optional explicit vs implicit marker
if (*cp == '.') {
// implicit = 1;
// default is implicit = 1;
cp++;
} else if (*cp == '?') {
// implicit = 0;
implicit = 0;
cp++;
} else if (*cp != ',' && *cp != ';') {
// parse error
Expand Down Expand Up @@ -6257,6 +6258,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
state->strand [mod_num] = (strand == '-');
state->canonical[mod_num] = btype;
state->MLstride [mod_num] = stride;
state->implicit [mod_num] = implicit;

state->MMcount [mod_num] = delta;
if (b->core.flag & BAM_FREVERSE) {
Expand Down Expand Up @@ -6473,3 +6475,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,

return r;
}

/*
 * Reports which base modification codes were recorded in the state for
 * this alignment record.  Each array element is either a character code
 * (positive) or a negated ChEBI number (negative).
 *
 * The element count is written to *ntype, which must not be NULL.
 *
 * Returns the state's own type array.  The caller must not free it; it
 * remains a valid pointer until the state is freed using
 * hts_base_mod_free().
 */
int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) {
    int *codes = state->type;

    *ntype = state->nmods;
    return codes;
}

/*
 * Looks up a single modification type in the state for the alignment
 * record and reports its properties.
 *
 * code is positive for a character code (eg 'm') or negative for a
 * ChEBI number.  When the same code is present more than once, the first
 * entry in the state is the one reported.
 *
 * strand, implicit and canonical are optional outputs: each is written
 * only when passed in as a non-NULL pointer.
 *
 * Returns 0 on success or -1 if the code is not found, in which case no
 * output is written.
 */
int bam_mods_query_type(hts_base_mod_state *state, int code,
                        int *strand, int *implicit, char *canonical) {
    // Advance to the first slot holding the requested code.
    int idx = 0;
    while (idx < state->nmods && state->type[idx] != code)
        idx++;

    // Ran off the end of the used slots: no such modification.
    if (idx == state->nmods)
        return -1;

    if (strand)
        *strand = state->strand[idx];

    if (implicit)
        *implicit = state->implicit[idx];

    if (canonical) {
        // Maps the stored canonical value back to a base letter; only
        // indices 1, 2, 4, 8 and 15 hold letters, the rest yield '?'.
        // NOTE(review): presumably the 4-bit nucleotide encoding - confirm
        // against where state->canonical is assigned.
        const char *base_chars = "?AC?G???T??????N";
        *canonical = base_chars[state->canonical[idx]];
    }

    return 0;
}
1 change: 1 addition & 0 deletions test/base_mods/MM-chebi.out
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
34 C C+m204 C+(76792)33
35 A
---
Present: m #-76792 n
6 C C+m102
15 N N+n212
17 C C+m128
Expand Down
1 change: 1 addition & 0 deletions test/base_mods/MM-double.out
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
34 A
35 T
---
Present: m m o
1 G G-m115
7 C C+m128
12 G G-m141
Expand Down
103 changes: 103 additions & 0 deletions test/base_mods/MM-explicit-x.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
0 A
1 T
2 C
3 A
4 T
5 C
6 A
7 T
8 T
9 C C+m.200 C+h.10
10 C C+m.50 C+h.170
11 T
12 A
13 C
14 C C+m.160 C+h.20
15 G
16 C
17 T
18 A
19 T
20 A
21 G
22 C
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+m50 C+h170
14 C C+m160 C+h20

===

0 A
1 T
2 C
3 A
4 T
5 C
6 A
7 T
8 T
9 C C+m?200 C+h?10
10 C C+m?50 C+h?170
11 T
12 A
13 C C+m?10 C+h?5
14 C C+m?160 C+h?20
15 G
16 C C+m?10 C+h?5
17 T
18 A
19 T
20 A
21 G
22 C
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+m50 C+h170
13 C C+m10 C+h5
14 C C+m160 C+h20
16 C C+m10 C+h5

===

0 A
1 T
2 C
3 A
4 T
5 C
6 A
7 T
8 T
9 C C+m.200 C+h?10
10 C C+h?170
11 T
12 A
13 C C+h?5
14 C C+m.160 C+h?20
15 G
16 C C+h?5
17 T
18 A
19 T
20 A
21 G
22 C
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+h170
13 C C+h5
14 C C+m160 C+h20
16 C C+h5

===

3 changes: 3 additions & 0 deletions test/base_mods/MM-explicit.out
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+m50 C+h170
14 C C+m160 C+h20
Expand Down Expand Up @@ -56,6 +57,7 @@
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+m50 C+h170
13 C C+m10 C+h5
Expand Down Expand Up @@ -90,6 +92,7 @@
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+h170
13 C C+h5
Expand Down
22 changes: 18 additions & 4 deletions test/base_mods/MM-explicit.sam
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,24 @@
@CO unchecked and require an explicit statement to indicate it was
@CO looked at and no base modification was observed.
@CO
@CO 0 1 23 45 6 78
@CO ATCATCATTCCTACCGCTATAGCCT
@CO . . m. .m . .. m
@CO ? ? .h .. . ?? h
@CO ATCATCATTCCTACCGCTATAGCCT r1; implicit
@CO - - .. -. - --
@CO Mm M
@CO - - .. -. - --
@CO hH h
@CO
@CO ATCATCATTCCTACCGCTATAGCCT r2; explicit to a small region
@CO - - ?? ?? ? --
@CO Mm mM m
@CO - - ?? ?? ? --
@CO hH hh h
@CO
@CO ATCATCATTCCTACCGCTATAGCCT r3; mixture
@CO - - . -. - --
@CO M M
@CO - - ?? ?? ? --
@CO hH hh h --
@CO
r1 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh,2,0,1; Ml:B:C,200,10,50,170,160,20
r2 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh?,2,0,0,0,0; Ml:B:C,200,10,50,170,10,5,160,20,10,5
r3 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0; Ml:B:C,200,160,10,170,5,20,5
2 changes: 2 additions & 0 deletions test/base_mods/MM-multi.out
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
34 C C+m230 C+h6
35 A
---
Present: m h n
6 C C+m128
15 N N+n215
17 C C+m153
Expand Down Expand Up @@ -83,6 +84,7 @@
34 C C+m204 C+h31
35 A
---
Present: m h n
6 C C+m77 C+h159
15 N N+n240
17 C C+m103 C+h133
Expand Down
9 changes: 5 additions & 4 deletions test/base_mods/base-mods.tst
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,11 @@
# samtools binary. This can be useful for testing older versions.

# Test files from SAM spec
P MM-chebi.out $test_mod MM-chebi.sam
P MM-double.out $test_mod MM-double.sam
P MM-multi.out $test_mod MM-multi.sam
P MM-explicit.out $test_mod MM-explicit.sam
P MM-chebi.out $test_mod MM-chebi.sam
P MM-double.out $test_mod MM-double.sam
P MM-multi.out $test_mod MM-multi.sam
P MM-explicit.out $test_mod MM-explicit.sam
P MM-explicit-x.out $test_mod -x MM-explicit.sam

# Pileup testing
P MM-pileup.out $pileup_mod < MM-pileup.sam
Expand Down
Loading

0 comments on commit 612cf6f

Please sign in to comment.