-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathfasta_removal.py
56 lines (43 loc) · 1.5 KB
/
fasta_removal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Removes sequences that are in a list provided
# Requires fasta file and list of sequences to be removed (one name per line)
# Requires screed module
#usage
#python fasta_removal.py <originalfile.nucleotide.fasta> <namelist>
#       sys.argv[0]      sys.argv[1]                      sys.argv[2]
import sys, screed
# Inputs
#   sys.argv[1]: FASTA file to filter
#   sys.argv[2]: text file listing the sequence names to remove (one per line)
# Outputs
#   <fasta>.cleaned.sequences: records whose name is NOT in the list
#   <fasta>.removed.sequences: records whose name IS in the list


def read_requested_names(handle):
    """Return the sequence names found in ``handle``, one per line.

    Only line-ending characters ('\\n' and '\\r') are trimmed; any other
    whitespace is kept so names are matched exactly as written.  Blank lines
    and duplicates are preserved, so the length of the result equals the
    number of lines read.
    """
    return [line.strip('\n').strip('\r') for line in handle]


def format_record(name, description, sequence):
    """Return one FASTA entry: a ``>name description`` header and the sequence.

    Leading and trailing '*' characters are removed from the sequence.
    """
    return ">%s %s\n%s\n" % (name, description, sequence.strip('*'))


def split_records(records, requested_sequences, cleaned_out, removed_out):
    """Write each record to ``removed_out`` or ``cleaned_out``.

    records             -- iterable of objects exposing ``name``,
                           ``description`` and ``sequence`` attributes
    requested_sequences -- names of the records to remove
    cleaned_out         -- writable handle for records that are kept
    removed_out         -- writable handle for records that are removed

    A progress line is printed for every removed record.  Returns the number
    of records written to ``removed_out``.
    """
    # A set gives constant-time membership tests; the original list is still
    # used for the total so the reported count matches the name file.
    wanted = set(requested_sequences)
    number_records = len(requested_sequences)
    counter = 0
    for record in records:
        entry = format_record(record.name, record.description,
                              record.sequence)
        if record.name in wanted:
            counter += 1
            print("%s of %s records found" % (counter, number_records))
            removed_out.write(entry)
        else:
            cleaned_out.write(entry)
    return counter


def main(argv):
    """Split the FASTA file ``argv[1]`` using the name list in ``argv[2]``.

    Exits with a usage message when fewer than two arguments are supplied.
    """
    if len(argv) < 3:
        sys.exit("usage: python fasta_removal.py "
                 "<originalfile.nucleotide.fasta> <namelist>")
    fasta_path = argv[1]

    with open(argv[2], 'r') as filelist:
        requested_sequences = read_requested_names(filelist)
    print("%s records requested" % len(requested_sequences))

    # Open the input before creating the output files so that a bad FASTA
    # path is reported as early as screed allows.
    records = screed.open(fasta_path)
    # The context managers close both outputs even if a record fails to
    # parse part-way through the file.
    with open(fasta_path + '.cleaned.sequences', 'w') as cleaned_out, \
            open(fasta_path + '.removed.sequences', 'w') as removed_out:
        split_records(records, requested_sequences, cleaned_out, removed_out)


if __name__ == "__main__":
    main(sys.argv)