From 2f88671fac2fd02861b0bebe91af14d4c5bbc6f1 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 30 Sep 2020 19:09:44 -0400 Subject: [PATCH 1/5] add samtools.filter_to_mapped_reads add samtools.filter_to_mapped_reads function to remove duplicates, and reads that are not properly paired, with options to allow unmapped reads and singletons --- tools/samtools.py | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tools/samtools.py b/tools/samtools.py index a118cf23..c57d8962 100644 --- a/tools/samtools.py +++ b/tools/samtools.py @@ -182,6 +182,54 @@ def removeDoublyMappedReads(self, inBam, outBam): opts = ['-b', '-F' '1028', '-f', '2', '-@', '3'] self.view(opts, inBam, outBam) + def filter_to_mapped_reads(self, inBam, outBam, allow_unmapped=True, remove_singletons=True): + ''' + This function writes a bam file filtered to include properly aligned reads. + If allow_unmapped=True, fully-unmapped pairs or unmapped single-end reads are also + written to the output (omitting pairs where only one mate maps). 
+ + ''' + + with pysam.AlignmentFile(inBam, 'rb', check_sq=False) as inb: + with pysam.AlignmentFile(outBam, 'wb', header=inb.header) as outf: + # process the lines individually and write them or not, depending on the flags + # For explanation of flags, see: + # https://broadinstitute.github.io/picard/explain-flags.html + # https://pysam.readthedocs.io/en/latest/api.html + # https://samtools.github.io/hts-specs/SAMv1.pdf + # https://github.com/pysam-developers/pysam/blob/31183d7fac52b529b304bdf61ff933818ae4a71f/samtools/stats.c#L72-L81 + + for read in inb: + # check if a read is paired + is_single_end=not read.is_paired + + # if a PCR/optical duplicate, do not write + if read.is_duplicate: + continue + + if read.is_paired: + if allow_unmapped: + # if mates are not both either mapped or unmapped, do not write + if remove_singletons and read.is_unmapped != read.mate_is_unmapped: + continue + else: + # if not a proper pair (reads are oriented correctly and facing each other) + # do not write + if not read.is_proper_pair: + continue + + # do not write singleton reads. 
+ #if read.is_unmapped or read.mate_is_unmapped: + # continue + + if is_single_end: + if not allow_unmapped: + if read.is_unmapped: + continue + + # otherwise write out the line + outf.write(read) + def filterByCigarString(self, inBam, outBam, regexToMatchForRemoval='^((?:[0-9]+[ID]){1}(?:[0-9]+[MNIDSHPX=])+)|((?:[0-9]+[MNIDSHPX=])+(?:[0-9]+[ID]){1})$', invertResult=False): From f2b7a7051f411758687a0ce81d4407366044915d Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 30 Sep 2020 19:17:58 -0400 Subject: [PATCH 2/5] add min_mapping_qual param to samtools.filter_to_mapped_reads() add min_mapping_qual param to samtools.filter_to_mapped_reads() --- tools/samtools.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/samtools.py b/tools/samtools.py index c57d8962..b55d9286 100644 --- a/tools/samtools.py +++ b/tools/samtools.py @@ -182,7 +182,7 @@ def removeDoublyMappedReads(self, inBam, outBam): opts = ['-b', '-F' '1028', '-f', '2', '-@', '3'] self.view(opts, inBam, outBam) - def filter_to_mapped_reads(self, inBam, outBam, allow_unmapped=True, remove_singletons=True): + def filter_to_mapped_reads(self, inBam, outBam, allow_unmapped=True, min_mapping_qual=None, remove_singletons=True): ''' This function writes a bam file filtered to include properly aligned reads. 
If allow_unmapped=True, fully-unmapped pairs or unmapped single-end reads are also @@ -203,6 +203,11 @@ def filter_to_mapped_reads(self, inBam, outBam, allow_unmapped=True, remove_sing # check if a read is paired is_single_end=not read.is_paired + # only include reads with mapping quality >= INT + # equivalent to samtools view -q + if min_mapping_qual is not None and read.mapping_quality < min_mapping_qual: + continue + # if a PCR/optical duplicate, do not write if read.is_duplicate: continue From c93ec3188b38fdab9d9f1a095a45f93b01bccd9b Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 30 Sep 2020 19:21:31 -0400 Subject: [PATCH 3/5] in read_utils.align_and_fix(), filter input and output via samtools.filter_to_mapped_reads() in read_utils.align_and_fix(), filter input and output via samtools.filter_to_mapped_reads(), such that marked pcr duplicates and mapped singletons are removed on the input side, and on the output side marked duplicates are removed as well as any reads not properly paired, except for single-end reads, which are allowed through if mapped (or in any case if allow_unmapped=True). This is intended to address a specific issue where NovaSeq contamination presented as singleton reads. 
--- read_utils.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/read_utils.py b/read_utils.py index c64c8095..eb0b5579 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1208,6 +1208,12 @@ def align_and_fix( else: aligner_options = '' # use defaults + inBamFiltered = mkstempfname('.input-filtered.bam') + samtools.filter_to_mapped_reads(inBam, inBamFiltered, allow_unmapped=True) + filtered_count = samtools.count(inBamFiltered) + if filtered_count==0: + log.warning("zero reads after input filtering") + bam_aligned = mkstempfname('.aligned.bam') if aligner=="novoalign": if novoalign_amplicons_bed is not None: @@ -1216,7 +1222,7 @@ def align_and_fix( tools.novoalign.NovoalignTool(license_path=novoalign_license_path).index_fasta(refFastaCopy) tools.novoalign.NovoalignTool(license_path=novoalign_license_path).execute( - inBam, refFastaCopy, bam_aligned, + inBamFiltered, refFastaCopy, bam_aligned, options=aligner_options.split(), JVMmemory=JVMmemory ) @@ -1227,11 +1233,13 @@ def align_and_fix( opts = aligner_options.split() - bwa.align_mem_bam(inBam, refFastaCopy, bam_aligned, min_score_to_filter=bwa_min_score, threads=threads, options=opts) + bwa.align_mem_bam(inBamFiltered, refFastaCopy, bam_aligned, min_score_to_filter=bwa_min_score, threads=threads, options=opts) elif aligner=='minimap2': mm2 = tools.minimap2.Minimap2() - mm2.align_bam(inBam, refFastaCopy, bam_aligned, threads=threads, options=aligner_options.split()) + mm2.align_bam(inBamFiltered, refFastaCopy, bam_aligned, threads=threads, options=aligner_options.split()) + + os.unlink(inBamFiltered) if skip_mark_dupes: bam_marked = bam_aligned @@ -1256,7 +1264,7 @@ def align_and_fix( shutil.copyfile(bam_realigned, outBamAll) tools.picard.BuildBamIndexTool().execute(outBamAll) if outBamFiltered: - samtools.view(['-b', '-q', '1', '-F', '1028'], bam_realigned, outBamFiltered) + samtools.filter_to_mapped_reads(bam_realigned, outBamFiltered, allow_unmapped=False, min_mapping_qual=1) 
tools.picard.BuildBamIndexTool().execute(outBamFiltered) os.unlink(bam_realigned) From 3a2fb22ae00df0ace535a3911f2b99136a13a904 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 2 Oct 2020 22:31:42 -0400 Subject: [PATCH 4/5] remove map quality filter from filter so we can rely on aligner-set bits only remove map quality filter from filter so we can rely on aligner-set bits only; adjust filter to simplify. --- read_utils.py | 23 +++++++++--------- tools/samtools.py | 59 +++++++++++++++++++++++------------------------ 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/read_utils.py b/read_utils.py index eb0b5579..ee01f140 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1208,11 +1208,9 @@ def align_and_fix( else: aligner_options = '' # use defaults - inBamFiltered = mkstempfname('.input-filtered.bam') - samtools.filter_to_mapped_reads(inBam, inBamFiltered, allow_unmapped=True) - filtered_count = samtools.count(inBamFiltered) - if filtered_count==0: - log.warning("zero reads after input filtering") + input_readcount = samtools.count(inBam) + if input_readcount==0: + log.warning("zero reads present in input") bam_aligned = mkstempfname('.aligned.bam') if aligner=="novoalign": @@ -1222,7 +1220,7 @@ def align_and_fix( tools.novoalign.NovoalignTool(license_path=novoalign_license_path).index_fasta(refFastaCopy) tools.novoalign.NovoalignTool(license_path=novoalign_license_path).execute( - inBamFiltered, refFastaCopy, bam_aligned, + inBam, refFastaCopy, bam_aligned, options=aligner_options.split(), JVMmemory=JVMmemory ) @@ -1233,13 +1231,11 @@ def align_and_fix( opts = aligner_options.split() - bwa.align_mem_bam(inBamFiltered, refFastaCopy, bam_aligned, min_score_to_filter=bwa_min_score, threads=threads, options=opts) + bwa.align_mem_bam(inBam, refFastaCopy, bam_aligned, min_score_to_filter=bwa_min_score, threads=threads, options=opts) elif aligner=='minimap2': mm2 = tools.minimap2.Minimap2() - mm2.align_bam(inBamFiltered, refFastaCopy, 
bam_aligned, threads=threads, options=aligner_options.split()) - - os.unlink(inBamFiltered) + mm2.align_bam(inBam, refFastaCopy, bam_aligned, threads=threads, options=aligner_options.split()) if skip_mark_dupes: bam_marked = bam_aligned @@ -1264,7 +1260,12 @@ def align_and_fix( shutil.copyfile(bam_realigned, outBamAll) tools.picard.BuildBamIndexTool().execute(outBamAll) if outBamFiltered: - samtools.filter_to_mapped_reads(bam_realigned, outBamFiltered, allow_unmapped=False, min_mapping_qual=1) + filtered_any_mapq = mkstempfname('.filtered_any_mapq.bam') + # filter based on read flags + samtools.filter_to_proper_primary_mapped_reads(bam_realigned, filtered_any_mapq) + # remove reads with MAPQ <1 + samtools.view(['-b', '-q', '1'], filtered_any_mapq, outBamFiltered) + os.unlink(filtered_any_mapq) tools.picard.BuildBamIndexTool().execute(outBamFiltered) os.unlink(bam_realigned) diff --git a/tools/samtools.py b/tools/samtools.py index b55d9286..246f7c33 100644 --- a/tools/samtools.py +++ b/tools/samtools.py @@ -182,11 +182,17 @@ def removeDoublyMappedReads(self, inBam, outBam): opts = ['-b', '-F' '1028', '-f', '2', '-@', '3'] self.view(opts, inBam, outBam) - def filter_to_mapped_reads(self, inBam, outBam, allow_unmapped=True, min_mapping_qual=None, remove_singletons=True): + def filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_be_proper=True, reject_singletons=True): ''' - This function writes a bam file filtered to include properly aligned reads. - If allow_unmapped=True, fully-unmapped pairs or unmapped single-end reads are also - written to the output (omitting pairs where only one mate maps). 
+ This function writes a bam file filtered to include only reads that are: + - not flagged as duplicates + - not secondary or supplementary (split/chimeric reads) + - For paired-end reads: + - marked as proper pair (if require_pairs_to_be_proper=True) OR + both not unmapped (if require_pairs_to_be_proper=False) OR + not a member of a pair with a singleton (if reject_singletons=True) + - For single-end reads: + mapped ''' @@ -203,36 +209,29 @@ def filter_to_mapped_reads(self, inBam, outBam, allow_unmapped=True, min_mapping # check if a read is paired is_single_end=not read.is_paired - # only include reads with mapping quality >= INT - # equivalent to samtools view -q - if min_mapping_qual is not None and read.mapping_quality < min_mapping_qual: - continue - # if a PCR/optical duplicate, do not write if read.is_duplicate: continue - if read.is_paired: - if allow_unmapped: - # if mates are not both either mapped or unmapped, do not write - if remove_singletons and read.is_unmapped != read.mate_is_unmapped: - continue - else: - # if not a proper pair (reads are oriented correctly and facing each other) - # do not write - if not read.is_proper_pair: - continue - - # do not write singleton reads. 
- #if read.is_unmapped or read.mate_is_unmapped: - # continue - - if is_single_end: - if not allow_unmapped: - if read.is_unmapped: - continue - - # otherwise write out the line + # if a read is a secondary or supplementary mapping (split/chimeric), do not write + if read.is_secondary or read.is_supplementary: + continue + + # do not write if + # paired-end + if (read.is_paired and + # reject anything not marked as proper pair (this bit is not guaranteed) + (require_pairs_to_be_proper and not read.is_proper_pair) or + # reject pairs where both mates are unmapped + (read.mate_is_unmapped and read.is_unmapped) or + # reject reads where only one mate is mapped (singletons) + (reject_singletons and read.mate_is_unmapped!=read.is_unmapped )): + continue + + if is_single_end and read.is_unmapped: # or if this is single-end and unmapped, reject + continue + + # otherwise write the read to the output outf.write(read) def filterByCigarString(self, inBam, outBam, From bf25b821a5c81e46029ab946fa2c28720c23842e Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 13 Oct 2020 15:15:42 -0400 Subject: [PATCH 5/5] incorporate changes following review by @dpark01 re-write if statement for clarity; replace counting-based check for empty bam with one that tries a heuristic before counting (samtools.isEmpty(inBam)) --- read_utils.py | 3 +-- tools/samtools.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/read_utils.py b/read_utils.py index ee01f140..1603d22e 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1208,8 +1208,7 @@ def align_and_fix( else: aligner_options = '' # use defaults - input_readcount = samtools.count(inBam) - if input_readcount==0: + if samtools.isEmpty(inBam): log.warning("zero reads present in input") bam_aligned = mkstempfname('.aligned.bam') diff --git a/tools/samtools.py b/tools/samtools.py index 246f7c33..5f2e6943 100644 --- a/tools/samtools.py +++ b/tools/samtools.py @@ -225,7 +225,7 @@ def
filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_ # reject pairs where both mates are unmapped (read.mate_is_unmapped and read.is_unmapped) or # reject reads where only one mate is mapped (singletons) - (reject_singletons and read.mate_is_unmapped!=read.is_unmapped )): + (reject_singletons and (read.mate_is_unmapped or read.is_unmapped))): continue if is_single_end and read.is_unmapped: # or if this is single-end and unmapped, reject