-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstitch.py
58 lines (45 loc) · 1.76 KB
/
stitch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Data taken from http://stitch.embl.de/download/chemicals.inchikeys.v5.0.tsv.gz
import json
import csv
import os
import gzip
import pycurl
# Script body: fetch (and cache) the STITCH chemical -> InChIKey table, load it
# into memory, and load the previously generated SIDER stitch-to-UMLS mapping.
# Written to run unchanged on both Python 2.7 and Python 3.

# Cache location for the download and folder holding the SIDER output.
temp_folder = './temp'
data_folder = './data'
scope_name = 'stitch'
source_zip_url = 'http://stitch.embl.de/download/chemicals.inchikeys.v5.0.tsv.gz'
temp_file = os.path.join(temp_folder, scope_name, "stitch_chemichal_inchi.tsv.gz")

# os.makedirs creates missing parent directories as well, so a single call
# covers both ./temp and ./temp/stitch.
scope_temp_folder = os.path.join(temp_folder, scope_name)
if not os.path.isdir(scope_temp_folder):
    os.makedirs(scope_temp_folder)

# Download the archive only when no cached copy is present.
if not os.path.isfile(temp_file):
    print("Downloading file ...")
    try:
        with open(temp_file, 'wb') as current_file:
            c = pycurl.Curl()
            try:
                c.setopt(c.FOLLOWLOCATION, 1)
                # Fail on HTTP status >= 400 instead of saving the server's
                # error page as if it were the gzip archive.
                c.setopt(c.FAILONERROR, 1)
                c.setopt(c.URL, source_zip_url)
                c.setopt(c.WRITEDATA, current_file)
                c.perform()
            finally:
                # Release the curl handle even when the transfer fails.
                c.close()
    except (IOError, pycurl.error) as e:
        # Drop the partial file; otherwise the next run would find it, skip
        # the download and fail while decompressing a truncated archive.
        try:
            os.remove(temp_file)
        except OSError:
            pass
        print("Can't retrieve %r to %r: %s" % (source_zip_url, temp_file, e))
        # Exit with a non-zero status so callers can detect the failure.
        raise SystemExit(1)

print("Unzipping file ...")
with gzip.open(temp_file, 'rb') as f:
    stitch_chemichal_inchi_content = f.read()
# On Python 3 gzip returns bytes while csv needs text; on Python 2 the payload
# is already a str and is left untouched.
# NOTE(review): assumes the table is ASCII/UTF-8 encoded - confirm.
if not isinstance(stitch_chemichal_inchi_content, str):
    stitch_chemichal_inchi_content = stitch_chemichal_inchi_content.decode('utf-8')

print("Reading data ...")
# Placeholders for the mapping step described in the TODO; not filled in yet.
stitch_to_umls = {}
umls_dictionary = set()
# The reader is lazy: rows are parsed only when it is iterated.
# NOTE(review): because fieldnames are given explicitly, a header row in the
# file (if there is one) is returned as the first data row - confirm.
reader = csv.DictReader(
    stitch_chemichal_inchi_content.split("\n"),
    delimiter='\t',
    fieldnames=[
        'flat_chemical_id',
        'stereo_chemical_id',
        'placebo',
        'inchikey',
    ])

# Mapping produced earlier for SIDER; loaded for the pending mapping step.
with open(os.path.join(data_folder, "sider", "stitchToUmls.json")) as data_file:
    data = json.load(data_file)
# TODO: map the STITCH IDs from SIDER to the ones from STITCH and save the InChI to the document