# Current DrugBank vocabulary download address (a zip containing "drugbank vocabulary.csv"): https://www.drugbank.ca/releases/5-0-5/downloads/all-drugbank-vocabulary
from __future__ import print_function
import os
import sys
import re
import csv
import json
import zipfile
import pycurl
import pubchempy
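# Configuration: the source URL plus the staging (temp) and output (data) folders.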
source_zip_url = 'https://www.drugbank.ca/releases/5-0-5/downloads/all-drugbank-vocabulary'
temp_folder = './temp'
data_folder = './data'
scope_name = 'drugbank'
temp_file = os.path.join(temp_folder, scope_name, 'drugbankToPubChem.zip')
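# When True, skip the PubChem queries and rebuild the outputs from the cache alone.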
use_cache_only = False
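# Make sure the staging folders exist before downloading anything into them.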
if not os.path.isdir(temp_folder):
    os.makedirs(temp_folder)
if not os.path.isdir(os.path.join(temp_folder, scope_name)):
    os.makedirs(os.path.join(temp_folder, scope_name))
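# Download the DrugBank vocabulary zip only if a copy is not already staged.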
if not os.path.isfile(temp_file):
    print("Downloading file ...")
    try:
        with open(temp_file, 'wb') as current_file:
            c = pycurl.Curl()
            c.setopt(c.FOLLOWLOCATION, True)
            c.setopt(c.URL, source_zip_url)
            c.setopt(c.WRITEDATA, current_file)
            c.perform()
            c.close()
    except (IOError, pycurl.error) as e:
        print("Can't retrieve %r to %r: %s" % (source_zip_url, temp_folder, e))
        sys.exit(1)
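# Extract every archive member into the scope's temp folder, creating subfolders as needed.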
print("Unzipping file ...")
try:
with zipfile.ZipFile(temp_file) as fdazip:
for n in fdazip.namelist():
destination = os.path.join(temp_folder, scope_name, n)
destination_dir = os.path.dirname(destination)
if not os.path.isdir(destination_dir):
os.makedirs(destination_dir)
with fdazip.open(n) as file:
with open(destination, 'w') as f:
f.write(file.read())
f.close()
file.close()
fdazip.close()
except zipfile.error, e:
print("Bad zipfile (from %r): %s" % (source_zip_url, e))
quit()
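# Parse the vocabulary CSV: split the pipe-delimited synonym list and keep only
# rows that carry a Standard InChI Key, since that key drives the PubChem lookup.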
print("Reading data ...")
data = []
reg_split_no_esc = re.compile(r'\s?\|\s?')
with open(os.path.join(temp_folder, scope_name, "drugbank vocabulary.csv")) as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Fields: DrugBank ID,Accession Numbers,Common name,CAS,UNII,Synonyms,Standard InChI Key
        drugbankId = row['DrugBank ID']
        commonName = row['Common name']
        synonyms = reg_split_no_esc.split(row['Synonyms'])
        inChi = row['Standard InChI Key']
        current = {
            "drugbankId": drugbankId,
            "commonName": commonName,
            "synonyms": synonyms,
            "inChi": inChi
        }
        # Keep only rows that have a non-empty Standard InChI Key.
        if current['inChi']:
            data.append(current)
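# One JSON file per drug in a cache folder lets an interrupted run resume
# without re-querying PubChem for drugs it has already resolved.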
if not os.path.isdir(os.path.join(temp_folder, scope_name, "cache")):
    os.makedirs(os.path.join(temp_folder, scope_name, "cache"))
# Filter out data that has already been downloaded
cached_files = [f.replace(".json", "") for f in os.listdir(os.path.join(temp_folder, scope_name, "cache")) if os.path.isfile(os.path.join(temp_folder, scope_name, "cache", f))]
data = [item for item in data if item["drugbankId"] not in cached_files]
if not use_cache_only:
    print("Getting pubChemIds for", len(data), "items")
    # map DrugBank compounds to pubchem using InChI
    for row in data:
        try:
            compounds = pubchempy.get_compounds(row['inChi'], namespace='inchikey')
            compounds = [compound.cid for compound in compounds]
            if len(compounds) > 0:
                row['pubChemIds'] = compounds
                print(row['drugbankId'], "-->", row['pubChemIds'])
                with open(os.path.join(temp_folder, scope_name, "cache", row['drugbankId'] + ".json"), 'w') as out:
                    out.write(json.dumps(row))
            else:
                print(row['drugbankId'], "-->", "NO HIT")
        except pubchempy.BadRequestError:
            print(row['drugbankId'], "-->", "Bad Request!")
            continue
        except pubchempy.ServerError:
            print(row['drugbankId'], "-->", "Server Error!")
            continue
        except Exception as e:
            print("Unknown exception:", e)
            continue
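# Rebuild the full mapping from the cache so hits from earlier runs are included.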
print("Writing data...")
if not os.path.isdir(os.path.join(data_folder, scope_name)):
    os.makedirs(os.path.join(data_folder, scope_name))
inChiToPubChemIds = []
cached_files = [f for f in os.listdir(os.path.join(temp_folder, scope_name, "cache")) if os.path.isfile(os.path.join(temp_folder, scope_name, "cache", f))]
for f in cached_files:
    with open(os.path.join(temp_folder, scope_name, "cache", f)) as current:
        inChiToPubChemIds.append(json.loads(current.read()))
with open(os.path.join(data_folder, scope_name, "inChiToPubChemIds.json"), 'w') as out:
    out.write(json.dumps(inChiToPubChemIds))
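# Slimmer mapping: DrugBank ID --> list of PubChem CIDs.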
drugbankToPubChem = [{"drugbankId": e['drugbankId'], "pubChemIds": e['pubChemIds']} for e in inChiToPubChemIds]
with open(os.path.join(data_folder, scope_name, "drugbankToPubChem.json"), 'w') as out:
    out.write(json.dumps(drugbankToPubChem))
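# Flatten to one record per (drugbankId, pubChemId) pair for reverse lookups.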
with open(os.path.join(data_folder, scope_name, "pubChemToDrugbankDictionary.json"), 'wb') as out:
result = []
for item in drugbankToPubChem:
for pubChemId in item["pubChemIds"]:
result.append({"drugbankId": item['drugbankId'], "pubChemId": pubChemId})
out.write(json.dumps(result))
out.close()
quit()