Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve support for MM .? modifiers. #1426

Merged
merged 1 commit into from
Apr 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions htslib/sam.h
Original file line number Diff line number Diff line change
Expand Up @@ -2271,6 +2271,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
hts_base_mod *mods, int n_mods);


/// Returns data about a specific modification type for the alignment record.
/**
 * @param state          The base modification state pointer, as filled in
 *                       by bam_parse_basemod().  The alignment record itself
 *                       is not an argument; only the state is consulted.
 * @param code           Modification code.  If positive this is a character
 *                       code, if negative it is a -ChEBI code.
 * @param[out] strand    Boolean for top (0) or bottom (1) strand.
 * @param[out] implicit  Boolean for whether unlisted positions should be
 *                       implicitly assumed to be unmodified (1), or require
 *                       an explicit score and should be considered as
 *                       unknown (0).
 * @param[out] canonical Canonical base type associated with this
 *                       modification.
 *
 * @return 0 on success or -1 if not found.  The strand, implicit and canonical
 *         fields are filled out if passed in as non-NULL pointers; pass NULL
 *         for any that are not wanted.
 */
HTSLIB_EXPORT
int bam_mods_query_type(hts_base_mod_state *state, int code,
int *strand, int *implicit, char *canonical);

/// Returns the list of base modification codes provided for this
/// alignment record as an array of character codes (+ve) or ChEBI numbers
/// (negative).
/**
 * @param state      The base modification state pointer.  The alignment
 *                   record itself is not an argument; only the state is
 *                   consulted.
 * @param[out] ntype Filled out with the number of array elements returned.
 *                   Must not be NULL.
 *
 * @return the type array, with *ntype filled out with the size.
 *         The array returned should not be freed.
 *         It is a valid pointer until the state is freed using
 *         hts_base_mod_state_free().
 */
HTSLIB_EXPORT
int *bam_mods_recorded(hts_base_mod_state *state, int *ntype);

#ifdef __cplusplus
}
#endif
Expand Down
53 changes: 48 additions & 5 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -6092,6 +6092,7 @@ struct hts_base_mod_state {
char *MMend[MAX_BASE_MOD]; // end of pos-delta string
uint8_t *ML[MAX_BASE_MOD]; // next qual
int MLstride[MAX_BASE_MOD]; // bytes between quals for this type
int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified?
int seq_pos; // current position along sequence
int nmods; // used array size (0 to MAX_BASE_MOD-1).
};
Expand Down Expand Up @@ -6160,6 +6161,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {

char *cp = (char *)mm+1;
int mod_num = 0;
int implicit = 1;
while (*cp) {
for (; *cp; cp++) {
// cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*;
Expand Down Expand Up @@ -6192,16 +6194,15 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
if (*cp == '\0')
return -1;
}

me = cp;

// Optional explicit vs implicit marker.
// Right now we ignore this field. A proper API for
// querying it will follow later.
// Optional explicit vs implicit marker
if (*cp == '.') {
// implicit = 1;
// default is implicit = 1;
cp++;
} else if (*cp == '?') {
// implicit = 0;
implicit = 0;
cp++;
} else if (*cp != ',' && *cp != ';') {
// parse error
Expand Down Expand Up @@ -6257,6 +6258,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
state->strand [mod_num] = (strand == '-');
state->canonical[mod_num] = btype;
state->MLstride [mod_num] = stride;
state->implicit [mod_num] = implicit;

state->MMcount [mod_num] = delta;
if (b->core.flag & BAM_FREVERSE) {
Expand Down Expand Up @@ -6473,3 +6475,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,

return r;
}

/*
 * Reports which base modification codes are held in the state.
 *
 * Each element of the returned array is either a positive character code
 * or a negative number (the negated ChEBI identifier).  *ntype receives
 * the number of valid elements; it is written unconditionally, so it must
 * not be NULL.
 *
 * The array is owned by the state: callers must not free it, and it stays
 * valid only until the state itself is freed with
 * hts_base_mod_state_free().
 */
int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) {
    int *codes = state->type;

    *ntype = state->nmods;
    return codes;
}

/*
 * Looks up one modification type in the state and reports its properties.
 *
 * "code" is a positive character code (eg 'm') or a negative number for a
 * ChEBI identifier.  The recorded types are scanned in order and the first
 * entry whose code matches is the one reported.
 *
 * On a match, each of strand, implicit and canonical that is non-NULL is
 * filled out and 0 is returned.  If no entry matches, -1 is returned and
 * none of the output pointers are written to.
 */
int bam_mods_query_type(hts_base_mod_state *state, int code,
                        int *strand, int *implicit, char *canonical) {
    // Maps the stored canonical-base index back to a letter:
    // 1=A, 2=C, 4=G, 8=T, 15=N; every other index gives '?'.
    static const char base_letter[] = "?AC?G???T??????N";
    int idx;

    for (idx = 0; idx < state->nmods; idx++) {
        if (state->type[idx] != code)
            continue;

        // Found it: copy out only the fields the caller asked for.
        if (strand)
            *strand = state->strand[idx];
        if (implicit)
            *implicit = state->implicit[idx];
        if (canonical)
            *canonical = base_letter[state->canonical[idx]];

        return 0;
    }

    // Ran off the end of the recorded types without a match.
    return -1;
}
1 change: 1 addition & 0 deletions test/base_mods/MM-chebi.out
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
34 C C+m204 C+(76792)33
35 A
---
Present: m #-76792 n
6 C C+m102
15 N N+n212
17 C C+m128
Expand Down
1 change: 1 addition & 0 deletions test/base_mods/MM-double.out
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
34 A
35 T
---
Present: m m o
1 G G-m115
7 C C+m128
12 G G-m141
Expand Down
103 changes: 103 additions & 0 deletions test/base_mods/MM-explicit-x.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
0 A
1 T
2 C
3 A
4 T
5 C
6 A
7 T
8 T
9 C C+m.200 C+h.10
10 C C+m.50 C+h.170
11 T
12 A
13 C
14 C C+m.160 C+h.20
15 G
16 C
17 T
18 A
19 T
20 A
21 G
22 C
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+m50 C+h170
14 C C+m160 C+h20

===

0 A
1 T
2 C
3 A
4 T
5 C
6 A
7 T
8 T
9 C C+m?200 C+h?10
10 C C+m?50 C+h?170
11 T
12 A
13 C C+m?10 C+h?5
14 C C+m?160 C+h?20
15 G
16 C C+m?10 C+h?5
17 T
18 A
19 T
20 A
21 G
22 C
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+m50 C+h170
13 C C+m10 C+h5
14 C C+m160 C+h20
16 C C+m10 C+h5

===

0 A
1 T
2 C
3 A
4 T
5 C
6 A
7 T
8 T
9 C C+m.200 C+h?10
10 C C+h?170
11 T
12 A
13 C C+h?5
14 C C+m.160 C+h?20
15 G
16 C C+h?5
17 T
18 A
19 T
20 A
21 G
22 C
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+h170
13 C C+h5
14 C C+m160 C+h20
16 C C+h5

===

3 changes: 3 additions & 0 deletions test/base_mods/MM-explicit.out
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+m50 C+h170
14 C C+m160 C+h20
Expand Down Expand Up @@ -56,6 +57,7 @@
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+m50 C+h170
13 C C+m10 C+h5
Expand Down Expand Up @@ -90,6 +92,7 @@
23 C
24 T
---
Present: m h
9 C C+m200 C+h10
10 C C+h170
13 C C+h5
Expand Down
22 changes: 18 additions & 4 deletions test/base_mods/MM-explicit.sam
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,24 @@
@CO unchecked and require an explicit statement to indicate it was
@CO looked at and no base modification was observed.
@CO
@CO 0 1 23 45 6 78
@CO ATCATCATTCCTACCGCTATAGCCT
@CO . . m. .m . .. m
@CO ? ? .h .. . ?? h
@CO ATCATCATTCCTACCGCTATAGCCT r1; implicit
@CO - - .. -. - --
@CO Mm M
@CO - - .. -. - --
@CO hH h
@CO
@CO ATCATCATTCCTACCGCTATAGCCT r2; explicit to a small region
@CO - - ?? ?? ? --
@CO Mm mM m
@CO - - ?? ?? ? --
@CO hH hh h
@CO
@CO ATCATCATTCCTACCGCTATAGCCT r3; mixture
@CO - - . -. - --
@CO M M
@CO - - ?? ?? ? --
@CO hH hh h --
@CO
r1 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh,2,0,1; Ml:B:C,200,10,50,170,160,20
r2 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh?,2,0,0,0,0; Ml:B:C,200,10,50,170,10,5,160,20,10,5
r3 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0; Ml:B:C,200,160,10,170,5,20,5
2 changes: 2 additions & 0 deletions test/base_mods/MM-multi.out
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
34 C C+m230 C+h6
35 A
---
Present: m h n
6 C C+m128
15 N N+n215
17 C C+m153
Expand Down Expand Up @@ -83,6 +84,7 @@
34 C C+m204 C+h31
35 A
---
Present: m h n
6 C C+m77 C+h159
15 N N+n240
17 C C+m103 C+h133
Expand Down
9 changes: 5 additions & 4 deletions test/base_mods/base-mods.tst
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,11 @@
# samtools binary. This can be useful for testing older versions.

# Test files from SAM spec
P MM-chebi.out $test_mod MM-chebi.sam
P MM-double.out $test_mod MM-double.sam
P MM-multi.out $test_mod MM-multi.sam
P MM-explicit.out $test_mod MM-explicit.sam
P MM-chebi.out $test_mod MM-chebi.sam
P MM-double.out $test_mod MM-double.sam
P MM-multi.out $test_mod MM-multi.sam
P MM-explicit.out $test_mod MM-explicit.sam
P MM-explicit-x.out $test_mod -x MM-explicit.sam

# Pileup testing
P MM-pileup.out $pileup_mod < MM-pileup.sam
Expand Down
Loading