-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathML_methods.py
324 lines (255 loc) · 10.8 KB
/
ML_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
def get_short_site_data(seq, i, bwd, fwd):
    '''
    Encodes the nucleotides flanking a site as a flat binary vector.

    seq - str, the full DNA/RNA sequence to be sampled
        T and U will be treated as the same nucleotide
    i - int, the 0-indexed position of the editing site (assumed to be C)
    bwd - int, # of nts prior to editing site to be included in vector
    fwd - int, # of nts after editing site to be included in vector
    Returns a list of ints, len (bwd + fwd) * 2, representing a binary vector
        Each nt represented by 2 ints in the list: is_purine, pairs_GC
            is_purine - 1 if nt is purine (AG), 0 if nt is pyrimidine (CT)
            pairs_GC - 1 if nt participates in GC pairing (GC), 0 if it doesn't (AT)
        The editing site itself is not part of the vector
        Window slots that fall outside seq are encoded as 0, 0, so the
        flanking nts always keep their position relative to the site,
        even when the window is cut off by either end of the sequence
    '''
    # Tuples (not strings) so that the '' used for padding never matches
    purines = ('a', 'g')
    gc_pairing = ('g', 'c')
    seq_len = len(seq)
    vector = []
    # Walk the window by absolute position so each nt lands in a fixed slot
    for pos in range(i - bwd, i + fwd + 1):
        if pos == i:
            # The editing site itself is excluded from the vector
            continue
        # Out-of-range slots are padded; anything not A/G/C encodes as 0, 0
        nt = seq[pos].lower() if 0 <= pos < seq_len else ''
        vector.append(1 if nt in purines else 0)
        vector.append(1 if nt in gc_pairing else 0)
    return vector
def gen_site_data(gene_path, fol_path):
    '''
    Reads the real editing sites and picks fake (negative) sites to go
    with them.

    gene_path - str, path to the CSV list of real sites
        Must be a series of lines of format "gene,loc" with a header
        loc must be 1-indexed position in coding sequence
        Lines without a comma (eg trailing blank lines) are skipped
    fol_path - str, path to the folder of site files
        Filenames must be in format "gene-cds.fasta"
        Must contain files corresponding to all genes in gene_path file
    Returns a tuple (real_sites, fake_sites) of dicts in format
    {gene:[loc1, loc2]}
        Fake sites aim for 3x number of real sites, with 2 types:
            2x sites that are chosen at complete random among the Cs
            1x sites that are chosen from position 25 in RNASee output
        Fake sites are spread over the genes at random, so some genes may
        lack a fake site
        A gene gets fewer fake sites than it was assigned if it runs out
        of unused C positions or unused RNASee rows
    '''
    import random
    import rna_see

    # Extract real sites from gene_path file (first line is the header)
    real_sites = {}
    with open(gene_path, 'r') as f:
        for line in f.readlines()[1:]:
            if ',' not in line:
                continue
            gene, loc = line.strip().split(',')
            real_sites.setdefault(gene, []).append(int(loc))

    # Randomly distributes fake sites over the genes based on # of real sites
    num_genes = len(real_sites)
    num_real = sum(len(locs) for locs in real_sites.values())
    sites_per_gene_random = [0]*num_genes  # Randomly chosen sites
    sites_per_gene_see = [0]*num_genes     # Position 25 RNASee sites
    for _ in range(num_real*2):
        sites_per_gene_random[random.randrange(num_genes)] += 1
    for _ in range(num_real):
        sites_per_gene_see[random.randrange(num_genes)] += 1

    # Chooses random sites from list of Cs in each gene
    fake_sites = {}
    for gene, num in zip(real_sites, sites_per_gene_random):
        with open(fol_path + '/' + '%s-cds.fasta' % gene) as f:
            seq = ''.join(f.read().split('\n')[1:])
        # 1-indexed positions of every C that is not a real editing site
        real_here = set(real_sites[gene])
        candidates = [pos for pos, nt in enumerate(seq, start=1)
                      if nt.lower() == 'c' and pos not in real_here]
        # Sampling without replacement cannot hang the way retrying random
        # picks does when a gene has too few unused Cs
        if num > len(candidates):
            print('%s: only %d unused C sites for %d requested fake sites'
                  % (gene, len(candidates), num))
            num = len(candidates)
        fake_sites[gene] = random.sample(candidates, num)
    print('x2 Random Sites Selected\n')

    # Chooses sites similar to editing sites from position 25 RNASee
    # In case of same site being chosen already, takes a higher ranking site
    for gene, num in zip(real_sites, sites_per_gene_see):
        # NOTE(review): assumes rna_see.see returns a pandas-DataFrame-like
        # table with a 'pos_c' column of 1-indexed positions, ordered
        # best-scoring first - confirm against rna_see
        out = rna_see.see(fol_path + '/' + '%s-cds.fasta' % gene)
        taken = set(real_sites[gene]) | set(fake_sites[gene])
        # Start at row 25, or at the last row when the table is shorter
        rank = min(25, len(out) - 1)
        for _ in range(num):
            # Move toward row 0 until an unused site turns up
            while rank >= 0 and int(out.iloc[rank]['pos_c']) in taken:
                rank -= 1
            if rank < 0:
                # Every row from the start row upward is already used;
                # stop rather than wrap around to the bottom of the table
                break
            loc = int(out.iloc[rank]['pos_c'])
            taken.add(loc)
            fake_sites[gene].append(loc)
        # One progress mark per gene
        print('x', end='')
    print('\nx1 25+ See Sites Selected')
    return real_sites, fake_sites
def generate_vector_file(neg_file, pos_file, folder_path, out_file):
    '''
    Writes one vector row per listed site, negatives first, then positives.

    neg_file - str, path to file of negative (non-editing) sites
        csv with header, lines in format gene,loc
    pos_file - str, path to file of positive (editing) sites
        csv with header, lines in format gene,loc
    folder_path - str, path to folder containing fasta files
        filenames should be in format gene-cds.fasta
        a trailing path separator is optional
    out_file - str, path of the output file
    Creates CSV file of vectors at out_file in format gene,class,vector...
        class is 0 for sites from neg_file and 1 for sites from pos_file
        vector comes from get_short_site_data with 15 nts before and
        10 nts after the site
        Lines of the site files without a comma are skipped
    '''
    import os

    def site_rows(site_file, label):
        # Builds the output rows (gene name, class identifier, vector)
        # for every site listed in one site file
        with open(site_file, 'r') as f:
            entries = f.read().split('\n')[1:]  # drop the header line
        rows = []
        for entry in entries:
            if ',' not in entry:
                continue
            name, loc = entry.strip().split(',')
            fasta_path = os.path.join(folder_path, '%s-cds.fasta' % name)
            with open(fasta_path, 'r') as f:
                # First line of the fasta file is its header
                seq = ''.join(f.read().split('\n')[1:])
            # loc is 1-indexed in the site file; the vector code is 0-indexed
            site_info = get_short_site_data(seq, int(loc) - 1, 15, 10)
            values = ','.join(str(x) for x in site_info)
            rows.append(name + ',' + label + ',' + values + '\n')
        return rows

    # Negatives are written before positives; random_forest_creation
    # relies on this order to match rows to sites
    data = site_rows(neg_file, '0') + site_rows(pos_file, '1')
    # Writes output file to the requested path
    with open(out_file, 'w') as f:
        f.writelines(data)
import numpy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
def random_forest_creation(vectors, negative_loc, positive_loc):
    '''
    Trains a random forest on a vector file and prints test-set scores.

    vectors - str, path to a csv file with vectors for dataset
        A header row is dropped if present (detected by col 2 of the first
        row not being 0 or 1)
        Col 1 is gene name
        Col 2 is class identifier; 0 - fake site, 1 - real site
        Col 3 on is the vector, eg one generated by get_short_site_data
    negative_loc - str, path to csv file with fake sites
        all lines should be of format gene,loc with a header row
    positive_loc - str, path to csv file with real sites
        all lines should be of format gene,loc with a header row
    The site files must list sites in the same order as the vector file:
    all negative sites, then all positive sites
    Returns tuple RandomForestClassifier, list of training sites used
        Each training site is a tuple (gene, loc, class) with loc as str
    Prints recall, precision, accuracy and F1 score on the test set
    '''
    # metrics is not imported at file level, so bring it into scope here
    from sklearn import metrics

    # Gets list of vectors, ignoring lines without a comma (eg blank lines)
    with open(vectors, 'r') as f:
        vector_lines = [ln for ln in f.read().split('\n') if ',' in ln]
    # Drops the header row if there is one
    if vector_lines and vector_lines[0].split(',')[1] not in ['0', '1']:
        vector_lines = vector_lines[1:]

    # Gets a list of negative and positive sites in same order as vector file
    # (if vector file was generated using method in this file)
    gene_loc = []
    for site_file, label in ((negative_loc, 0), (positive_loc, 1)):
        with open(site_file, 'r') as f:
            for line in f.readlines()[1:]:
                if ',' in line:
                    gene_loc.append(tuple(line.strip().split(',') + [label]))

    # Separates vector lines into the vectors and class identifiers (bools)
    vector_set = []
    bool_set = []
    for line in vector_lines:
        fields = line.split(',')
        bool_set.append(int(fields[1]))
        vector_set.append([int(x) for x in fields[2:]])
    vector_array = numpy.array(vector_set)
    bool_array = numpy.array(bool_set)

    # Creates train and test set using a 7:3 split
    # Also saves genes and locations for training set to exclude from testing
    x_train, x_test, y_train, y_test, loc_train, loc_test = train_test_split(
        vector_array, bool_array, gene_loc, test_size=0.3)

    # Creates & fits random forest classifier; prints some basic stats
    clf = RandomForestClassifier()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('Recall:', metrics.recall_score(y_test, y_pred))
    print('Precision:', metrics.precision_score(y_test, y_pred))
    print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
    print('F1 Score:', metrics.f1_score(y_test, y_pred))
    return clf, loc_train
#alpha = svm_creation('vector_file.csv')
import pickle
def scan_gene(ML_pickle, file_path, subset=None):
    '''
    Predicts the editing sites of one gene with a pickled ML model.

    ML_pickle - str, filepath to a pickle file containing a ML model
        ML model must take as input the output of get_short_site_data function
        Only load pickle files from a trusted source
    file_path - str, filepath to the CDS file for the gene in question
        First line is treated as the fasta header and ignored
    subset - iterable of ints, 1-indexed pos of sites to check
        If None or empty, every position in the gene is checked
        Positions outside the sequence are skipped
    Only positions holding a C are passed to the model
    Returns list of 1-indexed pos of all predicted editing sites in given gene
    '''
    # Prepares ML model and sequence string for use
    # NOTE: pickle.load can run arbitrary code from the file it reads
    with open(ML_pickle, 'rb') as pickle_file:
        model = pickle.load(pickle_file)
    with open(file_path, 'r') as gene_file:
        seq = ''.join(gene_file.read().split('\n')[1:])

    # Picks the 1-indexed positions to test
    if subset is not None and len(subset) > 0:
        # Out-of-range positions would otherwise wrap around (0) or raise
        candidates = [loc for loc in subset if 1 <= loc <= len(seq)]
    else:
        candidates = range(1, len(seq) + 1)

    # Uses ML model to predict class of all candidate C sites
    chosen_sites = []
    for loc in candidates:
        if seq[loc - 1].lower() != 'c':
            continue
        test_vector = [get_short_site_data(seq, loc - 1, 15, 10)]
        label = model.predict(test_vector)[0]
        # A site is kept for any truthy label other than -1
        if label and label != -1:
            chosen_sites.append(loc)
    return chosen_sites
def proba_scan_gene(ML_pickle, file_path, subset=None):
    '''
    Scores the C sites of one gene with a pickled ML model.

    ML_pickle - str, filepath to a pickle file containing an ML model
        ML model must provide predict_proba and take as input the output
        of get_short_site_data function
        Only load pickle files from a trusted source
    file_path - str, filepath to the CDS file for the gene in question
        First line is treated as the fasta header and ignored
    subset - iterable of ints, 1-indexed pos of sites to score
        If None or empty, every C in the gene is scored
        Positions outside the sequence or not holding a C are skipped
    Returns a dict in format {1-indexed pos: score}
        score is the second column of predict_proba for that site
        (presumably the probability of class 1 - confirm against the
        model's class order)
        With a subset, every valid site is returned with its score
        Without a subset, only sites with score > 0.5 are returned
    '''
    # NOTE: pickle.load can run arbitrary code from the file it reads
    with open(ML_pickle, 'rb') as pickle_file:
        clf = pickle.load(pickle_file)
    with open(file_path, 'r') as gene_file:
        seq = ''.join(gene_file.read().split('\n')[1:])

    sites = {}
    if subset is not None and len(subset) > 0:
        for site in subset:
            # site == len(seq) is the last nt and is a valid position
            if site < 1 or site > len(seq) or seq[site - 1].lower() != 'c':
                continue
            test_vector = [get_short_site_data(seq, site - 1, 15, 10)]
            sites[site] = clf.predict_proba(test_vector)[0][1]
    else:
        for idx, nt in enumerate(seq):
            if nt.lower() != 'c':
                continue
            # Change vector parameters here
            test_vector = [get_short_site_data(seq, idx, 15, 10)]
            score = clf.predict_proba(test_vector)[0][1]
            if score > 0.5:
                sites[idx + 1] = score
    return sites