-
Notifications
You must be signed in to change notification settings - Fork 46
/
renamer.py
executable file
·73 lines (61 loc) · 2.68 KB
/
renamer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python
import pysam
import argparse
import gzip
def parse_args(argv=None):
    """Parse and validate the command line.

    Args:
        argv: Argument list to parse. ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace``. ``no_umi`` is converted from its
        string form into a real ``bool``.

    Exits (via ``parser.error``) when only part of a region is given.
    """
    parser = argparse.ArgumentParser(description='make fastq from possorted_genome_bam.bam from cellranger')
    parser.add_argument('-f', '--bam', required=True, help="cellranger bam")
    parser.add_argument('-b', '--barcodes', required=True, help="cellranger barcodes.tsv")
    parser.add_argument('-o', '--out', required=True, help="output fastq name")
    parser.add_argument('-c', '--chrom', required=False, help="chrom")
    parser.add_argument('-s', '--start', required=False, help="start")
    parser.add_argument('-e', '--end', required=False, help="end")
    parser.add_argument("--no_umi", required=False, default="False", help="set to True if your bam has no umi tag")
    parser.add_argument("--umi_tag", required=False, default="UB", help="set if umi tag is not UB")
    parser.add_argument("--cell_tag", required=False, default="CB", help="set if cell barcode tag is not CB")
    args = parser.parse_args(argv)

    # The flag stays a string option so existing invocations keep working;
    # the comparison is case-insensitive so "--no_umi true" is not silently
    # treated as False.
    args.no_umi = args.no_umi.strip().lower() == "true"

    # A region is all-or-nothing. parser.error is used instead of assert
    # because asserts are stripped when Python runs with -O.
    region = (args.chrom, args.start, args.end)
    if any(region) and not all(region):
        parser.error("if specifying region, must specify chrom, start, and end")
    return args


def open_function(f):
    """Open path *f* for text reading, using gzip when it ends in ``.gz``."""
    if f.endswith(".gz"):
        return gzip.open(f, "rt")
    return open(f)


def load_barcodes(path):
    """Return the set of first whitespace-separated tokens of each line.

    Args:
        path: Barcodes file, optionally gzip-compressed (``.gz`` suffix).

    Returns:
        A ``set`` of barcode strings. Blank lines are ignored.
    """
    barcodes = set()
    with open_function(path) as handle:
        for line in handle:
            tokens = line.split()
            if tokens:  # a blank line has no first token to take
                barcodes.add(tokens[0])
    return barcodes


def write_fastq(reads, fastq, cell_barcodes, cell_tag="CB", umi_tag="UB", no_umi=False):
    """Write one FASTQ record per accepted read.

    A read is accepted when it carries ``cell_tag``, is neither secondary nor
    supplementary, carries ``umi_tag`` (unless ``no_umi``), has a sequence,
    and its cell barcode is in ``cell_barcodes``. The record name is
    ``@<qname>;<cell barcode>`` followed by ``;<UMI>`` when UMIs are in use.

    Args:
        reads: Iterable of read objects exposing ``has_tag``, ``get_tag``,
            ``is_secondary``, ``is_supplementary``, ``seq``, ``qname`` and
            ``qual``.
        fastq: Writable text file object.
        cell_barcodes: Container of accepted cell barcodes.
        cell_tag: Tag holding the cell barcode.
        umi_tag: Tag holding the UMI.
        no_umi: When true, reads are not required to carry ``umi_tag`` and
            the UMI is left out of the record name.

    Returns:
        The number of records written.
    """
    # NOTE: no UMI de-duplication happens here. The previous version built a
    # barcode+UMI+position key and looked it up in a dict that was never
    # populated, so the lookup never matched; that dead check was removed.
    written = 0
    for read in reads:
        if not read.has_tag(cell_tag):
            continue
        if read.is_secondary or read.is_supplementary:
            continue
        if not no_umi and not read.has_tag(umi_tag):
            continue
        if read.seq is None:
            continue
        cell_barcode = read.get_tag(cell_tag)
        if cell_barcode not in cell_barcodes:
            continue

        header = "@" + read.qname + ";" + cell_barcode
        if not no_umi:
            header += ";" + read.get_tag(umi_tag)
        # Assemble the whole record first so a failure cannot leave a
        # half-written record in the output.
        fastq.write(header + "\n" + read.seq + "\n+\n" + read.qual + "\n")
        written += 1
    return written


def main(argv=None):
    """Convert the BAM named on the command line into a FASTQ file."""
    args = parse_args(argv)
    # The context manager closes the BAM handle even if writing fails.
    with pysam.AlignmentFile(args.bam, "rb") as bam:
        cell_barcodes = load_barcodes(args.barcodes)
        if args.chrom:
            reads = bam.fetch(args.chrom, int(args.start), int(args.end))
        else:
            reads = bam
        with open(args.out, 'w') as fastq:
            write_fastq(reads, fastq, cell_barcodes, args.cell_tag, args.umi_tag, args.no_umi)


if __name__ == "__main__":
    main()