-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfda.py
111 lines (91 loc) · 3.15 KB
/
fda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Data from FDA: drug names
# https://www.fda.gov/downloads/Drugs/InformationOnDrugs/UCM527389.zip
import urllib2
import zipfile
import os
import csv
import re
import json
# --- Configuration -----------------------------------------------------------
# Base name shared by the cached archive, the extraction directory and the
# output directory.
file_name = 'fda-drugs'

# Working directory for the download and output directory for the JSON files.
temp_folder = './temp'
data_folder = './data'

# Remote zip archive with the FDA drug data.
source_zip_url = (
    'https://www.fda.gov/downloads/Drugs/InformationOnDrugs/UCM527389.zip'
)

# Local path the downloaded archive is stored at.
temp_file = os.path.join(temp_folder, file_name + '.zip')
# --- Step 1: make sure a local copy of the archive exists --------------------
if not os.path.isdir(temp_folder):
    os.makedirs(temp_folder)

# The archive is only fetched when no copy is on disk yet.
if not os.path.isfile(temp_file):
    print("Downloading file ...")
    try:
        response = urllib2.urlopen(source_zip_url)
        try:
            zipcontent = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    except IOError as e:
        print("Can't retrieve %r to %r: %s" % (source_zip_url, temp_folder, e))
        # Exit with a non-zero status so callers can tell the run failed.
        raise SystemExit(1)

    try:
        # A zip archive is binary data; text mode would corrupt it on
        # platforms that translate line endings.
        with open(temp_file, 'wb') as f:
            f.write(zipcontent)
    except (IOError, OSError):
        print("Could not write zipfile to temp_dir")
        raise SystemExit(1)
# --- Step 2: unpack the archive into <temp_folder>/<file_name>/ --------------
print("Unzipping file ...")
try:
    with zipfile.ZipFile(temp_file) as fdazip:
        # extractall() writes every member below the target directory,
        # creating directories as needed and writing file contents as bytes.
        # Per the zipfile documentation it also strips absolute paths and
        # ".." components from member names, which a manual
        # os.path.join() + open() loop does not.
        fdazip.extractall(os.path.join(temp_folder, file_name))
except zipfile.BadZipfile as e:
    print("Bad zipfile (from %r): %s" % (source_zip_url, e))
    # Exit with a non-zero status so callers can tell the run failed.
    raise SystemExit(1)
# --- Step 3: parse Products.txt into lookup sets and product records ---------
print("Reading data ...")
drugs = set()        # distinct lower-cased drug names
ingredients = set()  # distinct lower-cased active ingredients
forms = set()        # distinct lower-cased form entries
products = []        # one dict per row of Products.txt

# Multi-valued cells are split on ';' optionally followed by a single
# whitespace character. Raw string so that "\s" reaches the regex engine
# without relying on Python passing unknown escapes through.
reg_split_no_esc = re.compile(r';[\s]?')

with open(os.path.join(temp_folder, file_name, 'Products.txt')) as csv_file:
    # The file is read as tab-separated with a header row.
    reader = csv.DictReader(csv_file, delimiter='\t')
    for row in reader:
        drug_name = row['DrugName'].lower()
        ing_list = reg_split_no_esc.split(row['ActiveIngredient'].lower())
        forms_list = reg_split_no_esc.split(row['Form'].lower())
        strength = row['Strength'].lower()

        drugs.add(drug_name)
        ingredients.update(ing_list)
        forms.update(forms_list)

        # NOTE: the "activeIngradients" spelling is kept on purpose; the key
        # ends up in the generated JSON, so renaming it would change the
        # output format.
        current = {
            "drugName": drug_name,
            "activeIngradients": ing_list,
            "form": forms_list,
            "strength": strength
        }
        products.append(current)
# --- Step 4: write each collection as JSON into <data_folder>/<file_name>/ ---
print("Writing data ...")
output_folder = os.path.join(data_folder, file_name)
# os.makedirs() also creates missing parent directories, so a single call
# covers both data_folder and the sub-directory.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

# Sets are converted to lists because json cannot serialise a set.
outputs = [
    ("drugs.json", list(drugs)),
    ("ingredients.json", list(ingredients)),
    ("forms.json", list(forms)),
    ("products.json", products),
]
for json_name, payload in outputs:
    # json.dumps() returns text, so the file is opened in text mode. The
    # default compact output contains no newline characters, so the bytes on
    # disk are the same as with a binary handle.
    with open(os.path.join(output_folder, json_name), 'w') as out:
        out.write(json.dumps(payload))

# No explicit exit call: reaching the end of the script ends the process with
# status 0.
print("Done.")