-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsupplement.tex
395 lines (367 loc) · 20.2 KB
/
supplement.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
\documentclass[journal=jcisd8,manuscript=article]{achemso}
\usepackage{graphicx}
\usepackage{xr-hyper}
\usepackage{hyperref}
\usepackage{xcolor}
\usepackage{verbatim}
\usepackage[subrefformat=parens]{subcaption}
\usepackage[finalizecache,cachedir=.]{minted}
\usepackage[version=3]{mhchem} % Formula subscripts using \ce{}
\renewcommand{\thetable}{S\arabic{table}}
\renewcommand{\thefigure}{S\arabic{figure}}
\author{Paul G. Francoeur}
\author{David R. Koes}
\email{dkoes@pitt.edu}
\affiliation[Pitt]{Department of Computational and Systems Biology, University of Pittsburgh, Pittsburgh, PA 15260}
\title{Supporting Information:\\SolTranNet -- A machine learning tool for fast aqueous solubility prediction.}
\begin{document}
\begin{figure}[tb]
\subfloat[Optimizer Hyperband Sweep]{
\centering
\begin{tabular}{|c|c|}
\hline
Parameter & Range \\
\hline
Training Epochs & [250, 2000] \\
Loss Function & MSE, MAE, Huber \\
Learning Rate & [0.0001, 0.1] \\
Momentum & [0.5, 0.9] \\
Optimizer & SGD, Adam \\
Weight Decay & [0, 0.5] \\
Dimension of Conformer Generation & 2D, 3D \\
Scaffold CCV fold & 0 \\
Target & minimize test RMSE \\
\hline
\end{tabular}
\label{tab:initsweep}
}
\subfloat[Architecture Grid Search]{
\centering
\begin{tabular}{|c|c|}
\hline
Parameter & Range \\
\hline
Dimension of Conformer Generation & 2D, 3D \\
Latent Dimension of Encoder & 2, 4, \textbf{8}, 16, 32, 64, 128, 256, 512, 1024 \\
Dropout & 0, \textbf{0.1} \\
Number of Attention Heads (\#Heads) & \textbf{2}, 4, 8, 16, 32 \\
$\lambda_1$ Attention & 0.25, 0.33, \textbf{0.5} \\
$\lambda_2$ Distance & \textbf{0}, 0.33 \\
Number of Encoder Stacks (\#Stacks) & 2, 4, 6, \textbf{8}, 16 \\
Scaffold CCV folds & 0, 1, 2 \\
Loss Function & Huber \\
Learning Rate & 0.04 \\
Momentum & 0.06 \\
Optimizer & SGD \\
Weight Decay & 0 \\
\hline
\end{tabular}
\label{tab:archsweep}
}
\caption{Two-stage hyperparameter sweep for SolTranNet. The Optimizer sweep was performed using the Weights \& Biases hyperband sweeping tool with Bayesian optimization (during which all values in the listed ranges are considered), with the target of minimizing the test set RMSE. The Architecture sweep was a grid search over the listed parameters after the optimizer hyperparameters were set from the first stage. The deployed version of SolTranNet uses the hyperparameters in bold.}
\label{tab:wandsweep}
\end{figure}
\begin{table}
%\centering
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Model & Parameters & CCV RMSE & Fold0 RMSE & Fold1 RMSE & Fold2 RMSE & Ind RMSE \\
\hline
MAT & 42,049,537 & 2.007 & 1.231 & 3.075 & 1.058 & 2.113 \\
\hline
0 & 21,061,633 & 1.449 & 1.161 & 1.959 & 1.054 & 1.914 \\
1 & 502,529 & 1.467 & 1.231 & 1.972 & 1.028 & 1.946 \\
2 & 336,385 & 1.377 & 1.260 & 1.682 & 1.126 & 1.870 \\
3 & 336,385 & 1.454 & 1.239 & 1.857 & 1.165 & 1.949 \\
4 & 22,657 & 1.297 & 1.187 & \textbf{1.582} & 1.066 & 1.903 \\
5 & 11,905 & \textbf{1.271} & \textbf{1.130} & 1.605 & \textbf{0.998} & 1.923 \\
6 & 11,905 & 1.350 & 1.272 & 1.680 & 1.012 & 1.903\\ \hline \hline
7 & 3,393 & 1.459 & 1.172 & 1.916 & 1.159 & \textbf{1.779}\\ \hline \hline
8 & 2,609 & 1.376 & 1.169 & 1.788 & 1.055 & 1.973 \\
9 & 2,609 & 1.454 & 1.252 & 1.919 & 1.043 & 1.973 \\
\hline \hline
Elastic & 1,463 & 1.835 & 1.843 & 1.965 & 1.687 & 2.090\\
Lasso & 1,077 & 1.876 & 1.914 & 1.984 & 1.719 & 2.136\\
PLS & 2,048 & 2.413 & 3.085 & 2.082 & 1.902 & 2.081\\
Ridge & 2,048 & 2.239 & 2.503 & 2.032 & 2.155 & 2.312 \\
\hline
\end{tabular}
\caption{SolTranNet model hyperparameter search for predicting solubility RMSE on the scaffold-based clustered cross-validation splits of AqSolDB (CCV) and our independent test set (Ind). Additionally, all models outperform linear approaches trained on RDKit fingerprints using scikit-learn\cite{scikit-learn}. The isolated row (model 7) is the final SolTranNet architecture.}
\label{tab:solsearchrmse}
\end{table}
\begin{table}
%\centering
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Model & Parameters & CCV $R^2$ & Fold0 $R^2$ & Fold1 $R^2$ & Fold2 $R^2$ & Ind $R^2$ \\
\hline
MAT & 42,049,537 & 0.532 & 0.751 & 0.332 & 0.715 & 0.375 \\
\hline
0 & 21,061,633 & 0.670 & 0.778 & 0.520 & 0.727 & 0.4718 \\
1 & 502,529 & 0.662 & 0.769 & 0.465 & 0.733 & 0.4576 \\
2 & 336,385 & 0.693 & 0.764 & 0.563 & 0.706 & 0.4890 \\
3 & 336,385 & 0.672 & 0.764 & 0.539 & 0.685 & 0.4461 \\
4 & 22,657 & \textbf{0.724} & 0.780 & \textbf{0.626} & 0.716 & 0.4831 \\
5 & 11,905 & \textbf{0.723} & \textbf{0.793} & 0.595 & 0.744 & 0.4628 \\
6 & 11,905 & 0.713 & 0.786 & 0.582 & \textbf{0.756} & 0.4517 \\ \hline \hline
7 & 3,393 & 0.6764 & 0.7914 & 0.5243 & 0.7424 & \textbf{0.5528}\\ \hline \hline
8 & 2,609 & 0.695 & 0.779 & 0.545 & 0.737 & 0.4589\\
9 & 2,609 & 0.677 & 0.767 & 0.552 & 0.720 & 0.4678\\
\hline
Elastic & 1,463 & 0.414 & 0.441 & 0.394 & 0.408 & 0.377\\
Lasso & 1,077 & 0.395 & 0.409 & 0.378 & 0.392 & 0.351\\
PLS & 2,048 & 0.228 & 0.212 & 0.333 & 0.0747 & 0.676\\
Ridge & 2,048 & 0.319 & 0.289 & 0.354 & 0.291 & 0.261 \\
\hline
\end{tabular}
\caption{SolTranNet hyperparameter search correlations. There is a general trend of models with fewer parameters performing better. Additionally, all models significantly outperform linear ML approaches. The isolated row (model 7) is the final SolTranNet.}
\label{tab:solsearchr2}
\end{table}
\begin{table}
%\centering
\begin{tabular}{|c|c|}
\hline
Index & Description \\
\hline
0-11 & Atom Identity as one-hot (B,N,C,O,F,P,S,Cl,Br,I,Dummy,Other) \\
12-17 & Number of heavy atom neighbors as one-hot of 0-5 \\
18-22 & Number H atoms as one-hot of 0-4 \\
23 & Formal Charge \\
24 & In a Ring \\
25 & Aromatic \\
\hline
\end{tabular}
\caption{Featurization used to embed atoms in SolTranNet. This is the same schema as in the original MAT implementation. }
\label{tab:atomembed}
\end{table}
\begin{figure}[tb]
\centering
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/final_model_ind_dyn_RMSE.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/final_model_ind_dyn_R2.pdf}
\end{subfigure}
\caption{Boxplots of the performance of SolTranNet models on our independent test set using various dynamic stopping criteria. The Dynamic Stopping is the number of epochs that the model can fail to improve its fit to the training set before training is stopped. The boxplot is of 5 different seeds, with the ensemble performance shown with the blue dot. Interestingly, the ensemble outperforms the mean performance of the 5 models, but fails to be better than the best performing seed in all cases.}
\label{fig:dynsweep}
\end{figure}
\begin{table}
\begin{tabular}{|c|c|c|c|c|}
\hline
Model & Train RMSE & Test RMSE & Train $R^2$ & Test $R^2$ \\
\hline
Average of 5 Seeds & 1.171 (0.0504) & 1.878 (0.0925) & 0.784 (0.00643) & 0.509 (0.0445)\\
Ensemble & \textbf{1.068} & 1.814 & \textbf{0.798} & 0.517\\
Deployed & 1.144 & \textbf{1.711} & 0.777 & \textbf{0.577}\\
\hline
\end{tabular}
\caption{Performance of 5 different seeds of SolTranNet. Each model was trained on all of AqSolDB and tested on our independent test set. The first row reports the mean and standard deviation (in parentheses) for 5 models trained with different random seeds. The second row is the ensemble performance of the models from the first row, with the final row being the results of the single best performing model. We note that the ensemble outperforms each model on average, but fails to beat the best performing seed. Deployed refers to the single best performing model.}
\label{tab:deployed}
\end{table}
\begin{figure}[tb]
\centering
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/aqsol_scaf0_soldist.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/aqsol_scaf1_soldist.pdf}
\end{subfigure}
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/aqsol_scaf2_soldist.pdf}
\end{subfigure}
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/AqSolDB_solhist.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/Independent_solhist.pdf}
\end{subfigure}
\caption{Distributions of our various training set data. Each bin has a width of 0.25 units. The first 3 histograms are for the 3-fold scaffold split of AqSolDB, and the last 2 are the full AqSolDB and our independent test set. Note that fold 1 of our scaffold split is particularly challenging as the test set distribution is very different from the training set.}
\label{fig:solhists}
\end{figure}
\begin{figure}[tb]
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/2dv3d_rmse.pdf}
\caption{There is no statistically significant difference between these two collections ($p=0.144$); however, the mean performance is better for the 3D models (1.522 vs 1.608).}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/2dv3d_r2.pdf}
\caption{There is no statistically significant difference between these two collections ($p=0.116$), but the mean performance of the 3D models is better than the 2D models (0.646 vs 0.614).}
\end{subfigure}
\caption{RMSE (a) and $R^2$ (b) performance for the first 10 models trained during the architecture sweep. There is no statistically significant difference between using 2D conformers or 3D conformers for generating the distance matrix used by SolTranNet.}
\label{fig:2dv3d}
\end{figure}
\begin{figure}[tb]
\centering
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/full_saltfragfirst_RMSEs_boxplots.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/othersMW_saltfragfirst_RMSEs_boxplots.pdf}
\end{subfigure}
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/others2plus_saltfragfirst_RMSEs_boxplots.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/noothers_saltfragfirst_RMSEs_boxplots.pdf}
\end{subfigure}
\caption{RMSE of 5 different SolTranNet models trained on the scaffold split of AqSolDB. The horizontal axis indicates whether the salts were fragmented in the test set, while the colors of the boxplots indicate whether the salts were fragmented in the training set. Each plot has successively fewer data points in the training and testing sets by more stringently removing atoms which would be embedded as ``Other type'' during training. There is no statistically significant difference in performance on the test set between training on Normal or Fragmented Salts for all sets.}
\label{fig:saltfragrmse}
\end{figure}
\begin{figure}[tb]
\centering
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/full_saltfragfirst_R2s_boxplots.pdf}
\caption{No statistically significant difference between performance on either test set when comparing training with or without fragmented salts ($p=0.0596$ and $p=0.1852$).}
\label{fig:fullsfr2}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/othersMW_saltfragfirst_R2s_boxplots.pdf}
\caption{No statistically significant difference between performance on either test set when comparing training with or without fragmented salts ($p=0.4751$ and $p=0.1100$).}
\label{fig:mwsfr2}
\end{subfigure}
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/others2plus_saltfragfirst_R2s_boxplots.pdf}
\caption{Training with fragmented salts performed worse when testing on Normal ($p=0.0240$) and better when testing on Fragment ($p=0.03547$).}
\label{fig:onesfr2}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.48\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/noothers_saltfragfirst_R2s_boxplots.pdf}
\caption{Training with fragmented salts had no effect on testing on Normal ($p=0.1555$), but performed better when testing on Fragment ($p=0.0116$).}
\label{fig:noothersdf2}
\end{subfigure}
\caption{$R^2$ of 5 different SolTranNet models trained on the scaffold split of AqSolDB. The horizontal axis indicates whether the salts were fragmented in the test set, while the colors of the boxplots indicate whether the salts were fragmented in the training set. Each plot has successively fewer data points in the training and testing sets by more stringently removing atoms which would be embedded as ``Other type'' during training.}
\label{fig:saltfragr2}
\end{figure}
\begin{table}
\begin{tabular}{|c|c|c|c|c|c|}
\hline
Dataset & Reported $R^2$ & Training $R^2$ & Best $R^2$ & Deployed $R^2$ & Overlap \\
\hline
ESOL MAT & -- & 0.911 (0.0151) & -- & -- & 1128/1128 \\
ESOL STN & -- & 0.915 (0.0106) & -- & 0.890 (0.021) & 1119/1128 \\
\hline
Cui2020 & 0.412 & 0.589 (0.0244) & 0.611 & 0.656 & 0/62 \\
Boobier2017 & 0.706 & 0.543 (0.142) & 0.724 & 0.773 & 23/25 \\
Louvric2020 & -- & 0.709 (0.0537) & 0.783 & 0.744 (0.0345) & 151.4/166 \\
Llinas2020 set1 & 0.62 & 0.496 (0.0268) & 0.527 & 0.514 & 79/100 \\
Llinas2020 set2 & 0.75 & 0.769 (0.0346) & 0.824 & 0.726 & 18/32 \\
\hline
\end{tabular}
\caption{Performance of SolTranNet (STN) on other published datasets. For SolTranNet training, we trained 5 different seeds of our final architecture on the provided training and testing splits and report the mean and standard deviation (in parentheses). The SolTranNet Deployed column is using our final deployed model to predict the provided test set. The final column is the overlap of the provided test set with our deployed model's training set, as no attempt was made to remove molecules present in the test set from the deployed model's training set. Notably, the Louvric set had 5 different randomly selected splits for training and testing, which is why there is a mean in the Deployed and Overlap columns.}
\label{tab:othersetsr2}
\end{table}
\begin{figure}[tb]
\centering
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_deployed_pred_esol0.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_deployed_pred_esol1.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_deployed_pred_esol2.pdf}
\end{subfigure}
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_deployed_pred_esol3.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_deployed_pred_esol4.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_deployed_pred_esol5.pdf}
\end{subfigure}
\caption{Plots of the Deployed SolTranNet model predicting the test sets for each of the random ESOL splits. The predictions have high correlation ($R^2=0.890$) and low RMSE (0.361). Notably, the ESOL data is present in the training set of SolTranNet.}
\label{fig:stndepesol}
\end{figure}
\begin{figure}[tb]
\centering
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_esol0_pred_aqsol.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_esol1_pred_aqsol.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_esol2_pred_aqsol.pdf}
\end{subfigure}
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_esol3_pred_aqsol.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_esol4_pred_aqsol.pdf}
\end{subfigure}%
\hfill
\begin{subfigure}[t]{0.33\textwidth}
\centering
\includegraphics[width=\linewidth]{figures/stn_esol5_pred_aqsol.pdf}
\end{subfigure}
\caption{Plots of SolTranNet trained on the random splits of ESOL predicting AqSol. The predictions have low correlation ($R^2=0.552$), and high RMSE (1.67).}
\label{fig:stnesolaqsol}
\end{figure}
\begin{figure}[tb]
\centering
\includegraphics{figures/esol_stn_hist.pdf}
\caption{Training distributions of AqSol (blue) and ESOL (orange). Bin width is half a log unit. Notably, the distribution of solubilities in AqSol is markedly different than the distribution of solubilities in ESOL.}
\label{fig:esol_hist}
\end{figure}
\begin{table}
\centering
\begin{tabular}{|c|c|c|c|}
\hline
Training Set & Testing Set & RMSE & $R^2$ \\
\hline
ESOL* & AqSol & 1.67 (0.021) & 0.552 (0.0062) \\
AqSol & AqSol & 1.16 & 0.77 \\
\hline
\hline
AqSol & ESOL & 0.361 (0.017) & 0.89 (0.021) \\
ESOL & ESOL & 0.289 (0.011) & 0.915 (0.011) \\
\hline
\end{tabular}
\caption{SolTranNet comparison of performance utilizing ESOL or AqSol on predicting logS. Models tested on AqSol are tested on the entire AqSol dataset. Thus, the AqSol-AqSol model is simply reporting its performance on its own training set (and is only a singular model). The models utilizing ESOL are either training or testing on the 6 provided random splits of ESOL from the original MAT publication. Note that these values were mean centered and normalized in the original MAT dataset, but have been transformed back to logS units. We report the mean performance, with the standard deviation in parentheses. Note, no attempt was made to remove molecules present in the test set from the training set. You can see the predictions for the swapped sets in Figs.~\ref{fig:stndepesol}--\ref{fig:stnesolaqsol} and the histogram of the solubilities in AqSol and ESOL in Fig.~\ref{fig:esol_hist}. Notably, models trained with either set and tested on the other exhibit worse RMSE. However, models trained on ESOL fail to generalize their predictions to the broader range of molecules present in AqSol, which contributes to the low $R^2$ of their predictions and worse RMSE. This makes a model trained on ESOL less useful as a screen, since it is likely to misclassify soluble molecules. The reverse situation does not occur: a model trained on AqSol maintains a high Pearson's $R^2$ on ESOL, which makes it more suitable in a screening context.}
\label{tab:esolaqsol}
\end{table}
\end{document}