From 612cf6f98a4e8e76aed816bc2faab73ce1c199b1 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 Nov 2021 14:45:14 +0000 Subject: [PATCH] Improve support for MM .? modifiers. The previous commit permitted these to exist, but didn't make the data available to the caller. This extends the API with additional queries to distinguish the specifics about the modification types present. --- htslib/sam.h | 38 ++++++++++++ sam.c | 53 ++++++++++++++-- test/base_mods/MM-chebi.out | 1 + test/base_mods/MM-double.out | 1 + test/base_mods/MM-explicit-x.out | 103 +++++++++++++++++++++++++++++++ test/base_mods/MM-explicit.out | 3 + test/base_mods/MM-explicit.sam | 22 +++++-- test/base_mods/MM-multi.out | 2 + test/base_mods/base-mods.tst | 9 +-- test/test_mod.c | 97 ++++++++++++++++++++++++++--- 10 files changed, 309 insertions(+), 20 deletions(-) create mode 100644 test/base_mods/MM-explicit-x.out diff --git a/htslib/sam.h b/htslib/sam.h index 45dd51f0a..a6e64fbb2 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2271,6 +2271,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, hts_base_mod *mods, int n_mods); +/// Returns data about a specific modification type for the alignment record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param code Modification code. If positive this is a character code, + * if negative it is a -ChEBI code. + * + * @param strand Boolean for top (0) or bottom (1) strand + * @param implicit Boolean for whether unlisted positions should be + * implicitly assumed to be unmodified, or require an + * explicit score and should be considered as unknown. + * Returned. + * @param canonical Canonical base type associated with this modification + * Returned. + * + * @return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +HTSLIB_EXPORT +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical); + +/// Returns the list of base modification codes provided for this +/// alignment record as an array of character codes (+ve) or ChEBI numbers +/// (negative). +/* + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param ntype Filled out with the number of array elements returned + * + * @return the type array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_free(). + */ +HTSLIB_EXPORT +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype); + #ifdef __cplusplus } #endif diff --git a/sam.c b/sam.c index 04f3435f7..dd1b7d9fd 100644 --- a/sam.c +++ b/sam.c @@ -6092,6 +6092,7 @@ struct hts_base_mod_state { char *MMend[MAX_BASE_MOD]; // end of pos-delta string uint8_t *ML[MAX_BASE_MOD]; // next qual int MLstride[MAX_BASE_MOD]; // bytes between quals for this type + int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? int seq_pos; // current position along sequence int nmods; // used array size (0 to MAX_BASE_MOD-1). }; @@ -6160,6 +6161,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { char *cp = (char *)mm+1; int mod_num = 0; + int implicit = 1; while (*cp) { for (; *cp; cp++) { // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; @@ -6192,16 +6194,15 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { if (*cp == '\0') return -1; } + me = cp; - // Optional explicit vs implicit marker. - // Right now we ignore this field. A proper API for - // querying it will follow later. + // Optional explicit vs implicit marker if (*cp == '.') { - // implicit = 1; + // default is implicit = 1; cp++; } else if (*cp == '?') { - // implicit = 0; + implicit = 0; cp++; } else if (*cp != ',' && *cp != ';') { // parse error @@ -6257,6 +6258,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { state->strand [mod_num] = (strand == '-'); state->canonical[mod_num] = btype; state->MLstride [mod_num] = stride; + state->implicit [mod_num] = implicit; state->MMcount [mod_num] = delta; if (b->core.flag & BAM_FREVERSE) { @@ -6473,3 +6475,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, return r; } + +/* + * Returns the list of base modification codes provided for this + * alignment record as an array of character codes (+ve) or ChEBI numbers + * (negative). + * + * Returns the array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_free(). + */ +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { + *ntype = state->nmods; + return state->type; +} + +/* + * Returns data about a specific modification type for the alignment record. + * Code is either positive (eg 'm') or negative for ChEBI numbers. + * + * Return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical) { + // Find code entry + int i; + for (i = 0; i < state->nmods; i++) { + if (state->type[i] == code) + break; + } + if (i == state->nmods) + return -1; + + // Return data + if (strand) *strand = state->strand[i]; + if (implicit) *implicit = state->implicit[i]; + if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; + + return 0; +} diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out index cefdc545c..a6e7654cf 100644 --- a/test/base_mods/MM-chebi.out +++ b/test/base_mods/MM-chebi.out @@ -35,6 +35,7 @@ 34 C C+m204 C+(76792)33 35 A --- +Present: m #-76792 n 6 C C+m102 15 N N+n212 17 C C+m128 diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out index 82d086a2f..e21ae314e 100644 --- a/test/base_mods/MM-double.out +++ b/test/base_mods/MM-double.out @@ -35,6 +35,7 @@ 34 A 35 T --- +Present: m m o 1 G G-m115 7 C C+m128 12 G G-m141 diff --git a/test/base_mods/MM-explicit-x.out b/test/base_mods/MM-explicit-x.out new file mode 100644 index 000000000..4abedc719 --- /dev/null +++ b/test/base_mods/MM-explicit-x.out @@ -0,0 +1,103 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h.10 +10 C C+m.50 C+h.170 +11 T +12 A +13 C +14 C C+m.160 C+h.20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m?200 C+h?10 +10 C C+m?50 C+h?170 +11 T +12 A +13 C C+m?10 C+h?5 +14 C C+m?160 C+h?20 +15 G +16 C C+m?10 C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h?10 +10 C C+h?170 +11 T +12 A +13 C C+h?5 +14 C C+m.160 C+h?20 +15 G +16 C C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 + +=== + diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out index 05e2828c4..f28b25f83 100644 --- a/test/base_mods/MM-explicit.out +++ b/test/base_mods/MM-explicit.out @@ -24,6 +24,7 @@ 23 C 24 T --- +Present: m h 9 C C+m200 C+h10 10 C C+m50 C+h170 14 C C+m160 C+h20 @@ -56,6 +57,7 @@ 23 C 24 T --- +Present: m h 9 C C+m200 C+h10 10 C C+m50 C+h170 13 C C+m10 C+h5 @@ -90,6 +92,7 @@ 23 C 24 T --- +Present: m h 9 C C+m200 C+h10 10 C C+h170 13 C C+h5 diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam index e4e37103d..e85afa293 100644 --- a/test/base_mods/MM-explicit.sam +++ b/test/base_mods/MM-explicit.sam @@ -4,10 +4,24 @@ @CO unchecked and require an explicit statement to indicate it was @CO looked at and no base modification was observed. @CO -@CO 0 1 23 45 6 78 -@CO ATCATCATTCCTACCGCTATAGCCT -@CO . . m. .m . .. m -@CO ? ? .h .. . ?? h +@CO ATCATCATTCCTACCGCTATAGCCT r1; implicit +@CO - - .. -. - -- +@CO Mm M +@CO - - .. -. - -- +@CO hH h +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r2; explicit to a small region +@CO - - ?? ?? ? -- +@CO Mm mM m +@CO - - ?? ?? ? -- +@CO hH hh h +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r3; mixture +@CO - - . -. - -- +@CO M M +@CO - - ?? ?? ? -- +@CO hH hh h -- +@CO r1 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh,2,0,1; Ml:B:C,200,10,50,170,160,20 r2 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh?,2,0,0,0,0; Ml:B:C,200,10,50,170,10,5,160,20,10,5 r3 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0; Ml:B:C,200,160,10,170,5,20,5 diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out index 23c98d97b..e411a81ee 100644 --- a/test/base_mods/MM-multi.out +++ b/test/base_mods/MM-multi.out @@ -35,6 +35,7 @@ 34 C C+m230 C+h6 35 A --- +Present: m h n 6 C C+m128 15 N N+n215 17 C C+m153 @@ -83,6 +84,7 @@ 34 C C+m204 C+h31 35 A --- +Present: m h n 6 C C+m77 C+h159 15 N N+n240 17 C C+m103 C+h133 diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index c64128ee2..3809c0e6e 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -33,10 +33,11 @@ # samtools binary. This can be useful for testing older versions. # Test files from SAM spec -P MM-chebi.out $test_mod MM-chebi.sam -P MM-double.out $test_mod MM-double.sam -P MM-multi.out $test_mod MM-multi.sam -P MM-explicit.out $test_mod MM-explicit.sam +P MM-chebi.out $test_mod MM-chebi.sam +P MM-double.out $test_mod MM-double.sam +P MM-multi.out $test_mod MM-multi.sam +P MM-explicit.out $test_mod MM-explicit.sam +P MM-explicit-x.out $test_mod -x MM-explicit.sam # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam diff --git a/test/test_mod.c b/test/test_mod.c index aade3733c..f6f5b0718 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -22,6 +22,52 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* +This tests multiple APIs. The simplest is to parse the MM/ML tags with +bam_parse_basemod and then call bam_mods_at_next_pos once for each base in +the bam sequence to check for modifications. + +Ie: + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod(b, m); // b=bam1_t pointer + hts_base_mod mods[5]; + for (i = 0; i < b->core.l_qseq; i++) { + n = bam_mods_at_next_pos(b, m, mods, 5); + for (j = 0; j < n && j < 5; j++) { + // Report 'n'th mod at seq pos 'i'. + // mods[j].modified_base holds the base mod itself, with + // mods[j].canonical_base, mods[j].strand and mods[j].qual + // also present in hts_base_mod struct. + // ... + } + } + hts_base_mod_state_free(m); + +The extended mode has the same loop above, but calls bam_mods_query_type +to return additional meta-data including the strand, canonical base and +whether the base modification is recorded implicitly or explicitly: + + int ret = bam_mods_query_type(m, mods[j].modified_base, + &m_strand, &m_implicit, + &m_canonical); + +Looping over every base in the sequence is not particularly efficient +however unless this fits your natural processing order. The alternative +is to call bam_next_base_mod to iterate only over modified locations: + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod(b, m); // b=bam1_t pointer + hts_base_mod mods[5]; + while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { + for (j = 0; j < n && j < 5; j++) { + // Report 'n'th mod at sequence position 'pos' + } + } + hts_base_mod_state_free(m); + +*/ + #include #include @@ -41,6 +87,14 @@ static char *code(int id) { int main(int argc, char **argv) { char out[1024] = {0}; + int extended = 0; + + if (argc > 1 && strcmp(argv[1], "-x") == 0) { + extended = 1; + argv++; + argc--; + } + if (argc < 2) return 1; @@ -69,12 +123,31 @@ int main(int argc, char **argv) { n = bam_mods_at_next_pos(b, m, mods, 5); lp += sprintf(lp, "%d\t%c\t", i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); - for (j = 0; j < n && j < 5; j++) - lp += sprintf(lp, "%c%c%s%d ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - mods[j].qual); + for (j = 0; j < n && j < 5; j++) { + if (extended) { + int m_strand, m_implicit; + char m_canonical; + int ret = bam_mods_query_type(m, mods[j].modified_base, + &m_strand, &m_implicit, + &m_canonical); + if (ret < 0 || + m_canonical != mods[j].canonical_base || + m_strand != mods[j].strand) + goto err; + lp += sprintf(lp, "%c%c%s%c%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + "?."[m_implicit], + mods[j].qual); + } else { + lp += sprintf(lp, "%c%c%s%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + mods[j].qual); + } + } *lp++ = '\n'; *lp++ = 0; @@ -88,17 +161,27 @@ int main(int argc, char **argv) { bam_parse_basemod(b, m); + // List possible mod choices. + int *all_mods; + int all_mods_n = 0; + all_mods = bam_mods_recorded(m, &all_mods_n); + printf("Present:"); + for (i = 0; i < all_mods_n; i++) + printf(all_mods[i] > 0 ? " %c" : " #%d", all_mods[i]); + putchar('\n'); + int pos; while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { char line[8192]={0}, *lp = line; lp += sprintf(lp, "%d\t%c\t", pos, seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); - for (j = 0; j < n && j < 5; j++) + for (j = 0; j < n && j < 5; j++) { lp += sprintf(lp, "%c%c%s%d ", mods[j].canonical_base, "+-"[mods[j].strand], code(mods[j].modified_base), mods[j].qual); + } *lp++ = '\n'; *lp++ = 0;