samtools · jmarshall · Nov 24, 2021 · jkbonfield · Jan 19, 2022 · jmarshall
diff --git a/htslib/sam.h b/htslib/sam.h
@@ -1090,6 +1090,73 @@ hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar);
 HTSLIB_EXPORT
 hts_pos_t bam_endpos(const bam1_t *b);
 
+/*! @typedef
+ *  @abstract Flag filtering settings structure
+ */
+typedef struct sam_flag_filter_s {
+    uint32_t has_all:1, has_none:1, has_any:1, has_omit_any:1, has_exact:1, :27;
+    // To pass this filter, a read must have...
+    unsigned int all;       // all of these flags
+    unsigned int none;      // none of these
+    unsigned int any;       // at least one of these
+    unsigned int omit_any;  // OMIT at least one of these
+    unsigned int exact;     // exactly these flags (and no others)
+} sam_flag_filter_t;
+
+/// Check whether the flags pass the collected flag filters
+/** @param flag  Bitwise SAM flags
+    @param filt  Pointer to the flag filtering settings
+    @return      1 if @a flag passes all the active filters, 0 if not
+*/
+static inline
+int sam_flag_passes_flag_filter(unsigned int flag, const sam_flag_filter_t *filt) {
+    if (filt->has_all && (flag & filt->all) != filt->all) return 0;
+    if (filt->has_none && (flag & filt->none) != 0) return 0;
+    if (filt->has_any && (flag & filt->any) == 0) return 0;
+    if (filt->has_omit_any && (flag & filt->omit_any) == filt->omit_any) return 0;
+    if (filt->has_exact && flag != filt->exact) return 0;
+    return 1;
+}
+
+/// Check whether the alignment record's flags pass the collected flag filters
+/** @param b     Pointer to the BAM record to be checked
+    @param filt  Pointer to the flag filtering settings
+    @return      1 if the record passes all the active filters, 0 if not
+*/
+static inline
+int sam_passes_flag_filter(const bam1_t *b, const sam_flag_filter_t *filt) {
+    return sam_flag_passes_flag_filter(b->core.flag, filt);
+}
+
+/// Update flag filtering settings from a flag specification string
+/** @param s     Flag specification string
+    @param mode  Initial punctuation mode character (one of [-&!^+|~=])
+    @param filt  Pointer to the flag filtering settings to update
+    @return      >= 0 on success, -1 if the string syntax was invalid
+
+Flag specification strings are of the form `#flag,...,flag,#flag,...,flag,...`,
+i.e., a comma-separated list of flag items (each of which is either a numeric
+flag bitset, a symbolic (/[pPuUrR12sxdS_]+/) flag bitset, or a single flag
+name (eg "PAIRED")), each of which may be prefixed by a punctuation character
+that controls which filter the subsequent items contribute to (until the next
+punctuation character):
+
+    Chr  To pass filtering and be kept, the read must...
+    - &    ...have ALL flags prefixed by either - or &
+    ! ^    ...have NONE of the flags prefixed by either ! or ^
+    + |    ...have ANY (at least one) of the flags prefixed by either + or |
+    ~      ...NOT have at least one of the flags prefixed by ~
+    =      ...have EXACTLY the flags prefixed by = (and no others)
+
+Flag items before the first punctuation character are controlled by @a mode.
+The settings are updated via `|=` so that sam_parse_flag_filter() can be used
+repeatedly to record filters from several specification strings.  Hence
+@a filt should be zero-initialised before the first call.
+*/
+HTSLIB_EXPORT
+int sam_parse_flag_filter(const char *s, char mode, sam_flag_filter_t *filt);
+
+
 HTSLIB_EXPORT
 int   bam_str2flag(const char *str);    /** returns negative value on error */
 

diff --git a/sam.5 b/sam.5
@@ -56,7 +56,7 @@ lcbl.
 0x0040	1	the read is the first read in a pair
 0x0080	2	the read is the second read in a pair
 0x0100	s	the alignment is not primary
-0x0200	f	the read fails platform/vendor quality checks
+0x0200	x	filtered out, e.g., by platform/vendor quality checks
 0x0400	d	the read is either a PCR or an optical duplicate
 0x0800	S	the alignment is supplementary
 .TE

diff --git a/sam.c b/sam.c
@@ -4955,6 +4955,164 @@ char *sam_open_mode_opts(const char *fn,
     return mode_opts;
 }
 
+// Parses one item, which may be a set of flags specified numerically
+// or via symbolic letter codes, or a single flag via its long name.
+static unsigned long sam_parse_flag1(const char *s, char **endptr)
+{
+    if (isdigit_c(*s)) return strtoul(s, endptr, 0);
+
+    unsigned long flag = 0;
+    char name[24];
+    int i = 0, short_ok = 1;
+
+    for (; isalpha_c(*s) || *s == '_'; s++) {
+        if (i < sizeof name) name[i++] = toupper_c(*s);
+        switch (*s) {
+        case 'p': flag |= BAM_FPAIRED; break;
+        case 'P': flag |= BAM_FPROPER_PAIR; break;
+        case 'u': flag |= BAM_FUNMAP; break;
+        case 'U': flag |= BAM_FMUNMAP; break;
+        case 'r': flag |= BAM_FREVERSE; break;
+        case 'R': flag |= BAM_FMREVERSE; break;
+        case '1': flag |= BAM_FREAD1; break;
+        case '2': flag |= BAM_FREAD2; break;
+        case 's': flag |= BAM_FSECONDARY; break;
+        case 'x': flag |= BAM_FQCFAIL; break;
+        case 'd': flag |= BAM_FDUP; break;
+        case 'S': flag |= BAM_FSUPPLEMENTARY; break;
+        case '_': break;
+        // TODO In future, add an option to allow [mMfF] as no-ops
+
+        default:
+            short_ok = 0;
+            break;
+        }
+    }
+
+    *endptr = (char *) s;
+
+    if (i < sizeof name) {
+        name[i] = '\0';
+        switch (name[0]) {
+        case 'D':
+            if (strcmp(name, "DUP") == 0) return BAM_FDUP;
+            break;
+
+        case 'M':
+            if (strcmp(name, "MUNMAP") == 0) return BAM_FMUNMAP;
+            else if (strcmp(name, "MREVERSE") == 0) return BAM_FMREVERSE;
+            break;
+
+        case 'P':
+            if (strcmp(name, "PAIRED") == 0) return BAM_FPAIRED;
+            else if (strcmp(name, "PROPER_PAIR") == 0) return BAM_FPROPER_PAIR;
+            break;
+
+        case 'Q':
+            if (strcmp(name, "QCFAIL") == 0) return BAM_FQCFAIL;
+            break;
+
+        case 'R':
+            if (strcmp(name, "REVERSE") == 0) return BAM_FREVERSE;
+            else if (strcmp(name, "READ1") == 0) return BAM_FREAD1;
+            else if (strcmp(name, "READ2") == 0) return BAM_FREAD2;
+            break;
+
+        case 'S':
+            if (strcmp(name, "SECONDARY") == 0) return BAM_FSECONDARY;
+            else if (strcmp(name, "SUPPLEMENTARY") == 0) return BAM_FSUPPLEMENTARY;
+            break;
+
+        case 'U':
+            if (strcmp(name, "UNMAP") == 0) return BAM_FUNMAP;
+            break;
+        }
+    }
+
+    return short_ok? flag : 0;
+}
+
+// Parses one colon-terminated mode prefix, if present.
+static char parse_optional_mode_prefix(const char *s, char **endptr)
+{
+    char prefix[8];
+    int i = 0;
+
+    for (; isalpha_c(*s) && i < sizeof prefix; s++)
+        prefix[i++] = tolower_c(*s);
+
+    if (*s != ':') return '\0';
+
+    *endptr = (char *) &s[1];
+
+    if (i < sizeof prefix) {
+        prefix[i] = '\0';
+        switch (prefix[0]) {
+        case 'a':
+            if (strcmp(prefix, "all") == 0) return '&';
+            else if (strcmp(prefix, "any") == 0) return '|';
+            else if (strcmp(prefix, "anynot") == 0) return '~';
+            break;
+
+        case 'e':
+            if (strcmp(prefix, "exact") == 0) return '=';
+            break;
+
+        case 'n':
+            if (strcmp(prefix, "none") == 0) return '^';
+            else if (strcmp(prefix, "notall") == 0) return '~';
+            break;
+        }
+    }
+
+    return '\0';
+}
+
+int sam_parse_flag_filter(const char *s, char mode, sam_flag_filter_t *filt)
+{
+    while (*s) {
+        if (isalnum_c(*s) || *s == '_') {
+            char *end;
+            char newmode = parse_optional_mode_prefix(s, &end);
+            if (newmode) mode = newmode, s = end;
+            unsigned long flag = sam_parse_flag1(s, &end);
+            if (!flag) return -1;
+            s = end;
+
+            switch (mode) {
+            case '-': case '&':
+                filt->has_all = 1;
+                filt->all |= flag;
+                break;
+            case '!': case '^':
+                filt->has_none = 1;
+                filt->none |= flag;
+                break;
+            case '+': case '|':
+                filt->has_any = 1;
+                filt->any |= flag;
+                break;
+            case '~':
+                filt->has_omit_any = 1;
+                filt->omit_any |= flag;
+                break;
+            case '=':
+                filt->has_exact = 1;
+                filt->exact |= flag;
+                break;
+            default:
+                return -1;
+            }
+        }
+        else if (*s == ',')
+            s++;
+        else
+            mode = *s++;
+    }
+
+    return 0;
+}
+
 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
 int bam_str2flag(const char *str)
 {