-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhighlighter_bot.py
120 lines (96 loc) · 4.45 KB
/
highlighter_bot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import re, os, sys, wget, glob, time, argparse
import werkzeug
werkzeug.cached_property = werkzeug.utils.cached_property
import shutil
from robobrowser import RoboBrowser
# Batch driver: for every *.fasta alignment in the current working directory,
# upload it together with its matching Newick tree file to the Highlighter web
# form on www.hiv.lanl.gov, then download the resulting PNG, TXT and rearranged
# FASTA next to the input file.
#
# Command line:
#   -a / --aminoacid   submit the alignments as amino acid sequences
#
# The script aborts (sys.exit) as soon as an input is missing or the result
# page lacks the expected download links.
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--aminoacid", help="flag for amino acid sequence", action="store_true")
args = parser.parse_args()
aaflag = args.aminoacid

browser = RoboBrowser(history=True, parser='lxml')
url = "https://www.hiv.lanl.gov/content/sequence/HIGHLIGHT/highlighter_top.html?choice=mismatches"
download_url = "https://www.hiv.lanl.gov"
#download_referer = "https://www.hiv.lanl.gov/cgi-bin/HIGHLIGHT/highlighter.cgi"
browser.open(url)
form = browser.get_form(action=re.compile(r'highlighter.cgi'))
if form is None:
    # Without the form every later form[...] access would fail with an
    # unhelpful TypeError, so stop with an explicit message instead.
    sys.exit("Highlighter submission form not found at: " + url)

path = os.getcwd()
# Escape the directory part so glob metacharacters in the cwd are matched literally.
# NOTE(review): the script's own "*_highlighter.fasta" outputs also match this
# pattern on a later run in the same directory -- confirm that is acceptable.
files = glob.glob(os.path.join(glob.escape(path), '*.fasta'))
num_files = len(files)
# Fixed short names used for the upload when the real file name is too long.
tmpAlignmentFilePath = os.path.join(path, '0_tmp.fasta')
tmpTreeFilePath = os.path.join(path, '0_tmp_newick.tre')
full_time = time.time()
for filenum, file in enumerate(files, start=1):
    job_time = time.time()
    alignmentFilePath = file
    cur_seq_path = os.path.splitext(file)[0]
    cur_seq_name = os.path.basename(cur_seq_path)
    treeFilePath = cur_seq_path + '.phy_phyml_tree.txt_newick.tre'
    # cur_seq_path is already absolute (the glob is rooted at the cwd), so the
    # outputs are written beside the input file.
    save_png = cur_seq_path + '_highlighter_untrimmed.png'
    save_data = cur_seq_path + '_highlighter.txt'
    save_rearr_fasta = cur_seq_path + '_highlighter.fasta'
    print("cur_seq_path: " + cur_seq_path)
    print("cur_seq_name: " + cur_seq_name)
    print("alignmentFilePath: " + alignmentFilePath)
    print("treeFilePath: " + treeFilePath)
    if not os.path.isfile(alignmentFilePath):
        sys.exit("File not found: " + alignmentFilePath)
    if not os.path.isfile(treeFilePath):
        sys.exit("File not found: " + treeFilePath)
    # The PNG doubles as the "already processed" marker for this input.
    if os.path.isfile(save_png):
        print('Skipping {}, PNG already exists'.format(cur_seq_name))
        continue
    if os.path.isfile(save_data):
        print('Odd, data file .txt exists but not PNG for {}. Proceeding anyway.'.format(cur_seq_name))

    try:
        # if filenames are too long, use temporary shorter filenames
        if len(cur_seq_name) > 50:
            shutil.copyfile(alignmentFilePath, tmpAlignmentFilePath)
            shutil.copyfile(treeFilePath, tmpTreeFilePath)
            alignmentFilePath = tmpAlignmentFilePath
            treeFilePath = tmpTreeFilePath

        # Keep both uploads open only for the duration of the submission so
        # the handles are closed again for every processed file.
        with open(alignmentFilePath, 'r') as alignment_fh, open(treeFilePath, 'r') as tree_fh:
            form["alignmentFile"].value = alignment_fh
            form["uploadTree"].value = tree_fh
            form["choice"].value = "mismatches"
            form["sort"].value = "tree"
            form["treeType"].value = "upload"
            form["tw_multiplier"].value = "7"
            form["submit"].value = ""  ### There are 2 input type="submit", we need the second one
            if aaflag:
                form["base"].value = "aa"
                form["glyco"].value = "no"  # "no": glycosylation information is not requested
            else:
                form["apobec"].value = "yes"
            browser.session.headers['Referer'] = url
            print('Submitting file {}/{}, {}'.format(filenum, num_files, cur_seq_name))
            browser.submit_form(form)

        # save the PNG and TXT results
        image = None
        data = None
        rearr_fasta = None
        for anchor in browser.find_all('a', {'href': True}):
            href = anchor['href']
            if "highlighter.png" in href and "[View large]" in anchor.contents[0]:
                image = download_url + href
                # The TXT link is the PNG link with its extension swapped.
                data = download_url + href[0:href.index("png")] + "txt"
            elif "inseqs_rearr.fasta" in href:
                rearr_fasta = download_url + href
        if image is None:
            sys.exit("No image for: " + cur_seq_name)
        if rearr_fasta is None:
            # Checked before any download so a missing link gives a clear
            # message rather than a failure inside wget.download.
            sys.exit("No rearranged FASTA for: " + cur_seq_name)
        wget.download(image, save_png)
        wget.download(data, save_data)
        wget.download(rearr_fasta, save_rearr_fasta)
    finally:
        # clean up temporary files, also when the job aborts: a leftover
        # 0_tmp.fasta would be picked up as an input by the next run
        if os.path.isfile(tmpAlignmentFilePath):
            os.remove(tmpAlignmentFilePath)
        if os.path.isfile(tmpTreeFilePath):
            os.remove(tmpTreeFilePath)

    jt = int(time.time() - job_time)
    print('Took {} seconds to downloaded PNG, TXT, and FASTA from Highlighter for file: {}'.format(jt, cur_seq_name))
    # please be courteous to the server's resources and do not reduce the sleep time between jobs below 60 seconds
    if filenum != num_files:
        time.sleep(60)

ft = int(time.time() - full_time)
print('All done! Completed in {} seconds. Exiting.'.format(ft))
# to trim PNGs for Phylobook, install ImageMagick and manually run:
#for f in *_untrimmed.png; do convert -crop +0+179 $f ${f%_untrimmed.png}.png; rm $f; done