Skip to content

Commit

Permalink
Merge pull request #38 from broadinstitute/ct-filter-assembled_reads
Browse files Browse the repository at this point in the history
filter assembled reads in align_and_fix
  • Loading branch information
tomkinsc committed Oct 13, 2020
2 parents 0391ce7 + bf25b82 commit 767e7d4
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 1 deletion.
10 changes: 9 additions & 1 deletion read_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1208,6 +1208,9 @@ def align_and_fix(
else:
aligner_options = '' # use defaults

if samtools.isEmpty(inBam):
log.warning("zero reads present in input")

bam_aligned = mkstempfname('.aligned.bam')
if aligner=="novoalign":
if novoalign_amplicons_bed is not None:
Expand Down Expand Up @@ -1256,7 +1259,12 @@ def align_and_fix(
shutil.copyfile(bam_realigned, outBamAll)
tools.picard.BuildBamIndexTool().execute(outBamAll)
if outBamFiltered:
samtools.view(['-b', '-q', '1', '-F', '1028'], bam_realigned, outBamFiltered)
filtered_any_mapq = mkstempfname('.filtered_any_mapq.bam')
# filter based on read flags
samtools.filter_to_proper_primary_mapped_reads(bam_realigned, filtered_any_mapq)
# remove reads with MAPQ <1
samtools.view(['-b', '-q', '1'], filtered_any_mapq, outBamFiltered)
os.unlink(filtered_any_mapq)
tools.picard.BuildBamIndexTool().execute(outBamFiltered)
os.unlink(bam_realigned)

Expand Down
52 changes: 52 additions & 0 deletions tools/samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,58 @@ def removeDoublyMappedReads(self, inBam, outBam):
opts = ['-b', '-F' '1028', '-f', '2', '-@', '3']
self.view(opts, inBam, outBam)

def filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_be_proper=True, reject_singletons=True):
    '''
        Write to outBam only those reads from inBam that pass all of the
        following checks (reads are evaluated one at a time, based solely
        on their own SAM flags):
          - not flagged as a PCR/optical duplicate
          - not a secondary or supplementary (split/chimeric) alignment
          - For paired-end reads, the read is rejected if ANY of these hold:
              - require_pairs_to_be_proper=True and the read is not
                flagged as part of a proper pair
              - both the read and its mate are unmapped
              - reject_singletons=True and either the read or its mate
                is unmapped
          - For single-end reads, the read is rejected if it is unmapped

        Note: with require_pairs_to_be_proper=False and
        reject_singletons=False, both members of a pair in which only one
        mate is mapped are written (including the unmapped member).

        Args:
            inBam: path of the bam file to read.
            outBam: path of the bam file to write; it receives the same
                header as inBam.
            require_pairs_to_be_proper: if True, drop paired reads lacking
                the "proper pair" flag.
            reject_singletons: if True, drop paired reads where exactly
                one mate of the pair is mapped.
    '''

    # check_sq=False: per the pysam docs, do not require @SQ lines in the header
    with pysam.AlignmentFile(inBam, 'rb', check_sq=False) as inb:
        with pysam.AlignmentFile(outBam, 'wb', header=inb.header) as outf:
            # process the reads individually and write them or not, depending on the flags
            # For explanation of flags, see:
            #   https://broadinstitute.github.io/picard/explain-flags.html
            #   https://pysam.readthedocs.io/en/latest/api.html
            #   https://samtools.github.io/hts-specs/SAMv1.pdf
            #   https://github.com/pysam-developers/pysam/blob/31183d7fac52b529b304bdf61ff933818ae4a71f/samtools/stats.c#L72-L81

            for read in inb:
                # if a PCR/optical duplicate, do not write
                if read.is_duplicate:
                    continue

                # if a read is a secondary or supplementary mapping (split/chimeric), do not write
                if read.is_secondary or read.is_supplementary:
                    continue

                if read.is_paired:
                    # The mate-related checks are nested under is_paired on
                    # purpose: written as one boolean expression,
                    # "paired and A or B or C" parses as
                    # "(paired and A) or B or C", which would apply the mate
                    # checks to single-end reads as well.

                    # reject anything not marked as proper pair (this bit is not guaranteed)
                    if require_pairs_to_be_proper and not read.is_proper_pair:
                        continue

                    # reject pairs where both mates are unmapped
                    if read.is_unmapped and read.mate_is_unmapped:
                        continue

                    # reject reads where only one mate is mapped (singletons)
                    if reject_singletons and (read.is_unmapped or read.mate_is_unmapped):
                        continue
                elif read.is_unmapped:
                    # single-end and unmapped: reject
                    continue

                # otherwise write the read to the output
                outf.write(read)

def filterByCigarString(self, inBam, outBam,
regexToMatchForRemoval='^((?:[0-9]+[ID]){1}(?:[0-9]+[MNIDSHPX=])+)|((?:[0-9]+[MNIDSHPX=])+(?:[0-9]+[ID]){1})$',
invertResult=False):
Expand Down

0 comments on commit 767e7d4

Please sign in to comment.