-
Notifications
You must be signed in to change notification settings - Fork 443
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SAM flag filter specification strings and parsing & filtering API functions #1361
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1090,6 +1090,73 @@ hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar); | |
HTSLIB_EXPORT | ||
hts_pos_t bam_endpos(const bam1_t *b); | ||
|
||
/*! @typedef | ||
* @abstract Flag filtering settings structure | ||
*/ | ||
typedef struct sam_flag_filter_s { | ||
uint32_t has_all:1, has_none:1, has_any:1, has_omit_any:1, has_exact:1, :27; | ||
// To pass this filter, a read must have... | ||
unsigned int all; // all of these flags | ||
unsigned int none; // none of these | ||
unsigned int any; // at least one of these | ||
unsigned int omit_any; // OMIT at least one of these | ||
unsigned int exact; // exactly these flags (and no others) | ||
} sam_flag_filter_t; | ||
|
||
/// Check whether the flags pass the collected flag filters | ||
/** @param flag Bitwise SAM flags | ||
@param filt Pointer to the flag filtering settings | ||
@return 1 if @a flag passes all the active filters, 0 if not | ||
*/ | ||
static inline | ||
int sam_flag_passes_flag_filter(unsigned int flag, const sam_flag_filter_t *filt) { | ||
if (filt->has_all && (flag & filt->all) != filt->all) return 0; | ||
if (filt->has_none && (flag & filt->none) != 0) return 0; | ||
if (filt->has_any && (flag & filt->any) == 0) return 0; | ||
if (filt->has_omit_any && (flag & filt->omit_any) == filt->omit_any) return 0; | ||
if (filt->has_exact && flag != filt->exact) return 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we set There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I've been thinking about reworking it so |
||
return 1; | ||
} | ||
|
||
/// Check whether the alignment record's flags pass the collected flag filters | ||
/** @param b Pointer to the BAM record to be checked | ||
@param filt Pointer to the flag filtering settings | ||
@return 1 if the record passes all the active filters, 0 if not | ||
*/ | ||
static inline | ||
int sam_passes_flag_filter(const bam1_t *b, const sam_flag_filter_t *filt) { | ||
return sam_flag_passes_flag_filter(b->core.flag, filt); | ||
} | ||
|
||
/// Update flag filtering settings from a flag specification string | ||
/** @param s Flag specification string | ||
@param mode Initial punctuation mode character (one of [-&!^+|~=]) | ||
@param filt Pointer to the flag filtering settings to update | ||
@return >= 0 on success, -1 if the string syntax was invalid | ||
|
||
Flag specification strings are of the form `#flag,...,flag,#flag,...,flag,...`, | ||
i.e., a comma-separated list of flag items (each of which is either a numeric | ||
flag bitset, a symbolic (/[pPuUrR12sxdS_]+/) flag bitset, or a single flag | ||
name (eg "PAIRED")), each of which may be prefixed by a punctuation character | ||
that controls which filter the subsequent items contribute to (until the next | ||
punctuation character): | ||
|
||
Chr To pass filtering and be kept, the read must... | ||
- & ...have ALL flags prefixed by either - or & | ||
! ^ ...have NONE of the flags prefixed by either ! or ^ | ||
+ | ...have ANY (at least one) of the flags prefixed by either + or | | ||
~ ...NOT have at least one of the flags prefixed by ~ | ||
= ...have EXACTLY the flags prefixed by = (and no others) | ||
|
||
Flag items before the first punctuation character are controlled by @a mode. | ||
The settings are updated via `|=` so that sam_parse_flag_filter() can be used | ||
repeatedly to record filters from several specification strings. Hence | ||
@a filt should be zero-initialised before the first call. | ||
*/ | ||
HTSLIB_EXPORT | ||
int sam_parse_flag_filter(const char *s, char mode, sam_flag_filter_t *filt); | ||
|
||
|
||
HTSLIB_EXPORT | ||
int bam_str2flag(const char *str); /** returns negative value on error */ | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,7 +56,7 @@ lcbl. | |
0x0040 1 the read is the first read in a pair | ||
0x0080 2 the read is the second read in a pair | ||
0x0100 s the alignment is not primary | ||
0x0200 f the read fails platform/vendor quality checks | ||
0x0200 x filtered out, e.g., by platform/vendor quality checks | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The text change here reflects samtools/hts-specs#85. |
||
0x0400 d the read is either a PCR or an optical duplicate | ||
0x0800 S the alignment is supplementary | ||
.TE | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4955,6 +4955,164 @@ char *sam_open_mode_opts(const char *fn, | |
return mode_opts; | ||
} | ||
|
||
// Parses one item, which may be a set of flags specified numerically | ||
// or via symbolic letter codes, or a single flag via its long name. | ||
static unsigned long sam_parse_flag1(const char *s, char **endptr) | ||
{ | ||
if (isdigit_c(*s)) return strtoul(s, endptr, 0); | ||
|
||
unsigned long flag = 0; | ||
char name[24]; | ||
int i = 0, short_ok = 1; | ||
|
||
for (; isalpha_c(*s) || *s == '_'; s++) { | ||
if (i < sizeof name) name[i++] = toupper_c(*s); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the reason for copying / uppercasing from s to name character by character before checking to avoid the POSIX It would seem simpler to just check See also Maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I have an irrational prejudice against Also they require you to add But admittedly computers are fast nowadays and I have a followup commit that rewrites There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Re: Regarding There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you do M OTOH if you write your I said it was irrational! I may rewrite this using the combination of the existing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, for M strings being compared of length N, I'd see both Personally for printable strings I'd do something as trivial as Agreed it's perhaps slower than doing it once upfront to avoid the canonicalisation bit being repeated, but the complexity itself is the same order. Anyway, it's arguing over something that frankly irrelevant. You've got to be pretty special to have argument parsing being the bottleneck of your work. :-) I don't really mind either way. |
||
switch (*s) { | ||
case 'p': flag |= BAM_FPAIRED; break; | ||
case 'P': flag |= BAM_FPROPER_PAIR; break; | ||
case 'u': flag |= BAM_FUNMAP; break; | ||
case 'U': flag |= BAM_FMUNMAP; break; | ||
case 'r': flag |= BAM_FREVERSE; break; | ||
case 'R': flag |= BAM_FMREVERSE; break; | ||
case '1': flag |= BAM_FREAD1; break; | ||
case '2': flag |= BAM_FREAD2; break; | ||
case 's': flag |= BAM_FSECONDARY; break; | ||
case 'x': flag |= BAM_FQCFAIL; break; | ||
case 'd': flag |= BAM_FDUP; break; | ||
case 'S': flag |= BAM_FSUPPLEMENTARY; break; | ||
case '_': break; | ||
// TODO In future, add an option to allow [mMfF] as no-ops | ||
|
||
default: | ||
short_ok = 0; | ||
break; | ||
} | ||
} | ||
|
||
*endptr = (char *) s; | ||
|
||
if (i < sizeof name) { | ||
name[i] = '\0'; | ||
switch (name[0]) { | ||
case 'D': | ||
if (strcmp(name, "DUP") == 0) return BAM_FDUP; | ||
break; | ||
|
||
case 'M': | ||
if (strcmp(name, "MUNMAP") == 0) return BAM_FMUNMAP; | ||
else if (strcmp(name, "MREVERSE") == 0) return BAM_FMREVERSE; | ||
break; | ||
|
||
case 'P': | ||
if (strcmp(name, "PAIRED") == 0) return BAM_FPAIRED; | ||
else if (strcmp(name, "PROPER_PAIR") == 0) return BAM_FPROPER_PAIR; | ||
break; | ||
|
||
case 'Q': | ||
if (strcmp(name, "QCFAIL") == 0) return BAM_FQCFAIL; | ||
break; | ||
|
||
case 'R': | ||
if (strcmp(name, "REVERSE") == 0) return BAM_FREVERSE; | ||
else if (strcmp(name, "READ1") == 0) return BAM_FREAD1; | ||
else if (strcmp(name, "READ2") == 0) return BAM_FREAD2; | ||
break; | ||
|
||
case 'S': | ||
if (strcmp(name, "SECONDARY") == 0) return BAM_FSECONDARY; | ||
else if (strcmp(name, "SUPPLEMENTARY") == 0) return BAM_FSUPPLEMENTARY; | ||
break; | ||
|
||
case 'U': | ||
if (strcmp(name, "UNMAP") == 0) return BAM_FUNMAP; | ||
break; | ||
} | ||
} | ||
|
||
return short_ok? flag : 0; | ||
} | ||
|
||
// Parses one colon-terminated mode prefix, if present. | ||
static char parse_optional_mode_prefix(const char *s, char **endptr) | ||
{ | ||
char prefix[8]; | ||
int i = 0; | ||
|
||
for (; isalpha_c(*s) && i < sizeof prefix; s++) | ||
prefix[i++] = tolower_c(*s); | ||
|
||
if (*s != ':') return '\0'; | ||
|
||
*endptr = (char *) &s[1]; | ||
|
||
if (i < sizeof prefix) { | ||
prefix[i] = '\0'; | ||
switch (prefix[0]) { | ||
case 'a': | ||
if (strcmp(prefix, "all") == 0) return '&'; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See above regarding Although I note one benefit(?) of this is "fubar:" will return '\0' while also marching on endptr to skip past the unknown token. Whether skipping it is helpful or not I don't know, given it's failed. |
||
else if (strcmp(prefix, "any") == 0) return '|'; | ||
else if (strcmp(prefix, "anynot") == 0) return '~'; | ||
break; | ||
|
||
case 'e': | ||
if (strcmp(prefix, "exact") == 0) return '='; | ||
break; | ||
|
||
case 'n': | ||
if (strcmp(prefix, "none") == 0) return '^'; | ||
else if (strcmp(prefix, "notall") == 0) return '~'; | ||
break; | ||
} | ||
} | ||
|
||
return '\0'; | ||
} | ||
|
||
int sam_parse_flag_filter(const char *s, char mode, sam_flag_filter_t *filt) | ||
{ | ||
while (*s) { | ||
if (isalnum_c(*s) || *s == '_') { | ||
char *end; | ||
char newmode = parse_optional_mode_prefix(s, &end); | ||
if (newmode) mode = newmode, s = end; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this mean that "anynot:10,exatc:20" is silently parsed as "anynot:10,anynot:20"? It looks it. I'd think it would be best for an error to be returned when the prefix is an unrecognised term. It also permits potential future updates while ensuring the newer terms being passed to older libraries will be noticed rather than giving incorrect results. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. |
||
unsigned long flag = sam_parse_flag1(s, &end); | ||
if (!flag) return -1; | ||
s = end; | ||
|
||
switch (mode) { | ||
case '-': case '&': | ||
filt->has_all = 1; | ||
filt->all |= flag; | ||
break; | ||
case '!': case '^': | ||
filt->has_none = 1; | ||
filt->none |= flag; | ||
break; | ||
case '+': case '|': | ||
filt->has_any = 1; | ||
filt->any |= flag; | ||
break; | ||
case '~': | ||
filt->has_omit_any = 1; | ||
filt->omit_any |= flag; | ||
break; | ||
case '=': | ||
filt->has_exact = 1; | ||
filt->exact |= flag; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See above comment - this is where I think it can simply set the |
||
break; | ||
default: | ||
return -1; | ||
} | ||
} | ||
else if (*s == ',') | ||
s++; | ||
else | ||
mode = *s++; | ||
} | ||
|
||
return 0; | ||
} | ||
|
||
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n)) | ||
int bam_str2flag(const char *str) | ||
{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It took a while to figure out what this meant on the first read through. I think
not_all
is a clearer term thanomit_any
, or maybeany_unset
.That gives groupings (reordered for clarity) of all/not-all and any/not-any, or all-set/all-unset and any-set/any-unset.
I don't mind if these are perhaps just added in comments for the sake of clarity, but mixing up different terms ("none", "omit") doesn't aid understanding of what is effectively a trivial bit check.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've been strenuously avoiding
not_all
because it is ambiguous between “it must be that not all of these are 1“ and “it must be that all of these are not 1” — which are very different.any_unset
is probably better though.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note {any,all}-{set,unset} is what we ended up picking in bcftools too, albeit that was "skip" rather than "keep", as it was being used as a filter.