-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrules_based_methods.py
191 lines (166 loc) · 7.54 KB
/
rules_based_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env python
from Bio.Seq import *
import pandas as pd
from convert import *
from scoring import *
import argparse
import os, math
def aa_change(pos_c,prot,rna,j):
    '''
    Reports the amino acid consequence of a C>U edit at a single site.
    pos_c - int, 1-indexed location of editing site in RNA
    prot - str, protein sequence corresponding to RNA
    rna - str, full RNA coding sequence being considered
    j - int, 0-indexed location of editing site in str rna
    Returns tuple of (amino acid change, index of change in protein sequence)
        amino acid change - str, <original> -> <changed> or "synonymous"
        index of change - int, 1-indexed codon number containing the site
    '''
    # Codon number (1-indexed) that holds the edited nucleotide
    pos = int(math.ceil(pos_c/3.0))

    # Apply the C>U edit to a mutable copy of the sequence
    edited_nts = list(rna)
    edited_nts[j] = 'u'

    # Translate the whole edited sequence and compare the affected residue
    edited_prot = translate(''.join(edited_nts))
    original_aa = prot[pos-1]
    edited_aa = edited_prot[pos-1]

    if original_aa == edited_aa:
        return "synonymous", pos
    return "{} -> {}".format(original_aa, edited_aa), pos
def _read_fasta_sequence(path):
    '''
    path - str, location of a FASTA file whose first line is a comment/header
    Returns the remaining content as one lowercase str with all whitespace
    (including line breaks) removed, so string indices map directly onto
    nucleotide positions
    '''
    with open(path, 'r') as handle:
        handle.readline() # Skip comment on first line
        return ''.join(handle.read().split()).lower()


def _stem_loops_at(rna, prot, j, columns):
    '''
    rna - str, lowercase RNA coding sequence
    prot - str, protein sequence corresponding to rna
    j - int, 0-indexed location of the candidate C in rna (must be >= 3)
    columns - tuple of str, column names handed to palindrome
    Returns tuple (4-nt loop DataFrame, 3-nt loop DataFrame) as produced by
    palindrome, each annotated with 'aa_change' and 'aa_pos' columns, or
    (None, None) when the nucleotide before the C is neither U nor C
    '''
    if rna[j-1] not in ('u', 'c'):
        return None, None
    i = j - 3
    # 4-nt loop starts at i; the 3-nt loop starts one nucleotide later
    hits4 = palindrome(rna, 4, i, j, columns)
    hits3 = palindrome(rna, 3, i+1, j, columns)
    if hits4.empty and hits3.empty:
        # Nothing to annotate; skip the (whole-sequence) translation
        return hits4, hits3
    # The amino acid change depends only on the site, so compute it once
    change, aa_pos = aa_change(j+1, prot, rna, j)
    for hits in (hits4, hits3):
        hits['aa_change'] = change
        hits['aa_pos'] = int(aa_pos)
    return hits4, hits3


def scan_gene(seq, is_rna=False, cutoff=None, threshold=9, make_output=True, subset=None):
    '''
    seq - str, location of fasta file to be considered
        must be a coding sequence, not full DNA/RNA, for aa_change
        must have a comment/non-code sequence on first line
    is_rna - bool, True if seq refers to a FASTA containing "U" instead of "T"
    cutoff - int, number of sites to be returned; all returned for cutoff=None
        takes precedence over threshold when both are given
    threshold - int, score threshold OVER which site will be considered +
        only applied when cutoff is None; None disables filtering
    make_output - bool, if True will generate a file with info on included sites
    subset - iterable of int or None, 1-indexed positions to examine; when
        empty/None every C in the sequence is examined
    Generates a tsv file containing sorted potential APOBEC3A/G editing sites
        named <fasta basename>-top<cutoff>.tsv, -thresh<threshold>.tsv or
        -full.tsv, written relative to the current working directory
    Returns sorted DataFrame of potential APOBEC3A/G editing sites
        (same rows as written to the tsv file)
        rna - str, sequence of RNA in stem-loop surrounding editing site
        dna - str, sequence of DNA corresponding to stem-loop
        loop_len - int, length of loop (3 or 4) w/C at end
        stem_len - int, length of stem surrounding loop
        bulge - bool, whether there is a mismatch at -2 position in stem
        pos_c - int, 1-indexed location of C in the full nt sequence
        begin - int, first nt in the stem-loop
        end - int, last nt in the stem-loop
        score - int, score of the site; primary sorting factor
        aa_change - str, amino acid change caused by C>U edit at the site
        aa_pos - int, position of altered amino acid in the protein seq
        (aa_change/aa_pos are only present when at least one site is found)
    '''
    out = seq.split("/")[-1].replace(".fasta","")

    # Read in input fasta; line breaks are stripped so that indices into the
    # string correspond to nucleotide positions and codons stay intact
    sequence = _read_fasta_sequence(seq)
    if is_rna:
        # Input was RNA sequence
        rna = sequence
    else:
        # Input was DNA; gets corresponding RNA sequence
        rna = dna2rna(sequence)
    prot = translate(rna) # corresponding protein sequence

    columns=('rna','dna','loop_len','stem_len','bulge','pos_c','begin','end','score')
    size = len(rna)

    # 0-indexed candidate positions; the first three nucleotides can never
    # close a loop, and the last nucleotide is allowed in both modes
    if subset:
        candidates = [pos_c-1 for pos_c in subset if 4 <= pos_c <= size]
    else:
        candidates = range(3, size)

    # Collect per-site frames and concatenate once at the end
    frames4 = []
    frames3 = []
    for j in candidates:
        if rna[j] != 'c':
            continue
        hits4, hits3 = _stem_loops_at(rna, prot, j, columns)
        if hits4 is not None and not hits4.empty:
            frames4.append(hits4)
        if hits3 is not None and not hits3.empty:
            frames3.append(hits3)

    # Combine DataFrames (4-nt loops first, then 3-nt loops) and sort by score
    frames = frames4 + frames3
    if frames:
        df_all = pd.concat(frames, ignore_index=True, sort=False)
    else:
        df_all = pd.DataFrame(columns=columns)
    df_all = df_all.sort_values(by=['score', 'stem_len'],
                                ascending=[False, False])
    # Keep only the best-scoring stem-loop for each editing site
    df_all = df_all.drop_duplicates(subset='pos_c', keep='first')

    # Generate file and return DataFrame
    if cutoff is not None:
        df_all = df_all.iloc[:cutoff]
        out_name = '{}-top{}.tsv'.format(out, cutoff)
    elif threshold is not None:
        df_all = df_all[df_all['score']>threshold]
        out_name = '{}-thresh{}.tsv'.format(out, threshold)
    else:
        out_name = '{}-full.tsv'.format(out)
    if make_output:
        df_all.to_csv(out_name,sep='\t',float_format='%.2f')
    return df_all
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and run a scan on the
    # given FASTA file. --cutoff and --threshold are None when not supplied
    # and are forwarded to scan_gene as-is.
    cli = argparse.ArgumentParser(
        prog="RNAsee",
        description="Search a sequence for putative RNA editing sites by APOBEC3A/G.",
    )
    cli.add_argument(
        'sequence',
        metavar='sequence',
        type=str,
        help="FASTA file for DNA seqeunce input.",
    )
    cli.add_argument(
        '--cutoff', '-c',
        type=int,
        help="Return only the top X high-scoring values.",
    )
    cli.add_argument(
        '--threshold', '-t',
        type=int,
        help="Return only the sites with a score of > threshold.",
    )
    cli.add_argument('-v', '--version', action='version', version='%(prog)s v2.0')
    cli.add_argument(
        '--rna',
        action="store_true",
        help="The FASTA file is an RNA sequence (only A, U, G, C).",
    )
    options = cli.parse_args()

    # Run the scan with the parsed options
    df = scan_gene(
        options.sequence,
        options.rna,
        cutoff=options.cutoff,
        threshold=options.threshold,
    )