-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathbuild_source.py
executable file
·228 lines (185 loc) · 5.91 KB
/
build_source.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/usr/bin/python3
import database
from tqdm import tqdm
from rdkit import Chem
import pandas as pd
import os, json, pickle
"""Command line tools for building molecule sources"""
class MolGraph:
    """Topology fingerprint of a molecule, used for consistency checks

    Records the atomic number of every atom (in atom order) and the set of
    bonded atom-index pairs. Bond orders and coordinates are not recorded.

    Parameters
    ----------
    mol : Chem.Mol
        The molecule of interest
    """

    def __init__(self, mol):
        self.elems = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        #
        # Each bond is stored in both directions so that the comparison
        # does not depend on which end is reported as the "begin" atom
        #
        self.bonds = set()
        for bond in mol.GetBonds():
            begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            self.bonds.update(((begin, end), (end, begin)))

    def matches(self, other):
        """Check if given molecule matches

        Parameters
        ----------
        other : MolGraph or Chem.Mol
            Molecule to compare to. Anything that is not already a
            MolGraph is converted to one first.

        Returns
        -------
        bool
            True if element order and bond connectivity are identical
        """
        if not isinstance(other, MolGraph):
            other = MolGraph(other)
        return self.elems == other.elems and self.bonds == other.bonds
def valid(mols):
    """Test validity of given list of molecules

    Parameters
    ----------
    mols : list[Chem.Mol]
        The list of molecules to analyze

    Returns
    -------
    bool
        True if the molecules are consistent. An empty list is
        considered (vacuously) consistent.
    """
    #
    # Nothing to compare, so nothing can disagree
    #
    if not mols:
        return True
    #
    # All molecules must have identical topology
    #
    ref = MolGraph(mols[0])
    return all(ref.matches(MolGraph(m)) for m in mols[1:])
#
# QMugs
#
# Isert, Clemens, Kenneth Atz, José Jiménez-Luna, and Gisbert Schneider.
# “QMugs, Quantum Mechanical Properties of Drug-like Molecules.”
# Scientific Data 9, no. 1 (June 7, 2022): 273. https://doi.org/10.1038/s41597-022-01390-7.
#
# Available here: https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM
#
# Disk space requirements for source (structures only): about 32G
#
# The structures in this dataset appear to be of high quality and do not
# need filtering.
#
def process_qmugs(path='data/qmugs.db', rootdir="/data/QMugs", nrows=None):
    """Build a molecule store from the QMugs structure files

    Parameters
    ----------
    path : str
        Location passed to database.Storage
    rootdir : str
        Directory holding summary.csv and the structures subdirectory
    nrows : int, optional
        Number of summary rows to read (forwarded to pandas.read_csv)

    Returns
    -------
    database.Storage
        The populated store, committed
    """
    store = database.Storage(path)
    #
    # The summary table drives everything (it is large!)
    #
    summary = pd.read_csv(os.path.join(rootdir, 'summary.csv'), nrows=nrows)
    structures_dir = os.path.join(rootdir, 'structures')
    for row_idx, record in tqdm(summary.iterrows(), total=len(summary)):
        chembl_id = record['chembl_id']
        conf_label = record['conf_id']
        #
        # The conformer number is taken to be whatever follows the
        # last underscore of the conformer name
        #
        conformer_id = int(conf_label.rsplit("_", 1)[-1])
        #
        # One sdf per conformer: <rootdir>/structures/<chembl_id>/<conf_id>.sdf
        #
        sdf_path = os.path.join(structures_dir, chembl_id, f"{conf_label}.sdf")
        supplier = Chem.SDMolSupplier(sdf_path, removeHs=False)
        store.add_mol(next(supplier), chembl_id, conformer_id)
        #
        # Commit once every 1000 rows
        #
        if row_idx % 1000 == 999:
            store.commit()
    store.commit()
    return store
def production_qmugs():
    """Build the QMugs store and define its named sets

    Registers the fixed sets 'tiny' and 'debug' and a train/validate/test
    split with fractions 0.8/0.1/0.1.
    NOTE(review): the arguments 1000 and 10000 are presumably set sizes;
    confirm against database.Storage.fixed_set.
    """
    store = process_qmugs("data/qmugs.db")
    for label, size in (('tiny', 1000), ('debug', 10000)):
        store.fixed_set(label, size)
    split_names = ['train', 'validate', 'test']
    split_fractions = [0.8, 0.1, 0.1]
    store.split_set(split_names, split_fractions)
#
# GEOM-drugs
#
# Axelrod, Simon, and Rafael Gómez-Bombarelli. “GEOM, Energy-Annotated Molecular Conformations
# for Property Prediction and Molecular Generation.”
# Scientific Data 9, no. 1 (April 21, 2022): 185. https://doi.org/10.1038/s41597-022-01288-4.
#
# wget https://dataverse.harvard.edu/api/access/datafile/4327252
# tar -xf 4327252
#
# Disk space requirements for source: about 103G
#
# This dataset suffers from a small number of anomalies, presumably caused by
# the chemical structure being altered during minimization. These anomalies
# need to be removed for our use case. The strategy used here is to remove
# any molecule in its entirety if any of the corresponding conformers mismatch.
# Even atom reordering is not permitted.
#
def process_geom(path='data/geom-drugs.db', logfile="data/geom-drugs-issues.log", rootdir="/data/GEOM/rdkit_folder/drugs", nrows=None):
    """Build a molecule store from the GEOM-drugs rdkit pickles

    Parameters
    ----------
    path : str
        Location passed to database.Storage
    logfile : str
        Text file receiving the SMILES of every rejected molecule, one
        per line (overwritten on each run)
    rootdir : str
        Directory containing summary_dic.json. Pickle paths found in the
        summary are resolved relative to the parent of this directory.
    nrows : int, optional
        Upper limit on the number of molecules taken from the summary.
        None (the default) means no limit; 0 reads nothing.

    Returns
    -------
    database.Storage
        The populated store, committed
    """
    #
    # Storage
    #
    store = database.Storage(path)
    #
    # Master json
    #
    contents = []
    missing = []

    def limit_reached():
        return nrows is not None and len(contents) >= nrows

    with open(os.path.join(rootdir, "summary_dic.json"), "r") as jsonl:
        for line in jsonl:
            #
            # The limit is checked at both loop levels so that it also
            # holds when the summary spans more than one line
            #
            if limit_reached():
                break
            for k, v in json.loads(line).items():
                if limit_reached():
                    break
                try:
                    pickle_path = v['pickle_path']
                except KeyError:
                    missing.append(k)
                else:
                    contents.append(pickle_path)
    print(f"Total of {len(contents)} molecules, with an additional {len(missing)} with no pickle")

    with open(logfile, "w") as log:
        for im, pickle_path in enumerate(tqdm(contents)):
            #
            # NOTE(security): unpickling executes arbitrary code. Only
            # run this on a GEOM download from a trusted origin.
            #
            with open(os.path.join(rootdir, "..", pickle_path), "rb") as handle:
                data = pickle.load(handle)
            mols = [conf['rd_mol'] for conf in data['conformers']]
            #
            # GEOM processing has a tendency to produce artifacts.
            # If these are not removed, bad things happen.
            # A molecule is dropped in its entirety (and its SMILES
            # logged) if its conformers are not mutually consistent.
            #
            if valid(mols):
                for ic, mol in enumerate(mols):
                    store.add_mol(mol, str(im), ic)
            else:
                log.write(data['smiles'] + '\n')
            #
            # Commit once every 1000 molecules
            #
            if im % 1000 == 999:
                store.commit()
    store.commit()
    return store
def production_geom():
    """Build the GEOM-drugs store and define its named sets

    Registers the fixed sets 'tiny' and 'debug' and a train/validate/test
    split with fractions 0.8/0.1/0.1.
    NOTE(review): the arguments 1000 and 10000 are presumably set sizes;
    confirm against database.Storage.fixed_set.
    """
    store = process_geom("data/geom-drugs.db")
    for label, size in (('tiny', 1000), ('debug', 10000)):
        store.fixed_set(label, size)
    split_names = ['train', 'validate', 'test']
    split_fractions = [0.80, 0.10, 0.10]
    store.split_set(split_names, split_fractions)
#
# The following constructs a data source from a directory
# of sdf files
#
def process_sdf(path, srcdir):
    """Build a molecule store from a directory of sdf files

    Only the first molecule of each file is stored. It is registered
    under the name held in its "_Name" property, as conformer 0.

    Parameters
    ----------
    path : str
        Location passed to database.Storage
    srcdir : str
        Directory searched (non-recursively) for files ending in ".sdf"

    Returns
    -------
    database.Storage
        The populated store, committed
    """
    store = database.Storage(path)
    #
    # Collect candidate files up front so that the progress bar has a total
    #
    sdf_names = list(filter(lambda entry: entry.endswith(".sdf"), os.listdir(srcdir)))
    for sdf_name in tqdm(sdf_names):
        supplier = Chem.SDMolSupplier(os.path.join(srcdir, sdf_name), removeHs=False)
        first_mol = next(supplier)
        title = first_mol.GetProp("_Name")
        store.add_mol(first_mol, title, 0)
    #
    # Single commit once everything is added
    #
    store.commit()
    return store
if __name__ == '__main__':
    #
    # The production routines write their databases below ./data
    #
    os.makedirs(name="data", exist_ok=True)
    #
    # QMugs is the active build; the alternatives are kept for reference
    #
    production_qmugs()
    #production_geom()
    #process_sdf('data/csd.db', 'CSD/sdf').fixed_set('all',99999)