trainable.py
import warnings
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Tuple

import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

from bofire.data_models.enum import OutputFilteringEnum
from bofire.data_models.features.api import (
    CategoricalInput,
    CategoricalOutput,
    ContinuousOutput,
    DiscreteInput,
)
from bofire.surrogates.diagnostics import CvResult, CvResults
from bofire.surrogates.surrogate import Surrogate


class TrainableSurrogate(ABC):
_output_filtering: OutputFilteringEnum = OutputFilteringEnum.ALL
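    # `_output_filtering` controls which experiments survive preprocessing before a
    # fit (see `_preprocess_experiments` below): ALL keeps only rows in which every
    # output feature is valid, ANY keeps rows in which at least one output is valid,
    # and None disables filtering entirely.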

    def fit(self, experiments: pd.DataFrame, options: Optional[Dict] = None):
# validate
experiments = self.inputs.validate_experiments( # type: ignore
experiments, strict=False
)
experiments = self.outputs.validate_experiments(experiments) # type: ignore
# preprocess
experiments = self._preprocess_experiments(experiments)
X = experiments[self.inputs.get_keys()] # type: ignore
# TODO: output feature validation
Y = experiments[self.outputs.get_keys()] # type: ignore
# fit
options = options or {}
self._fit(X=X, Y=Y, **options) # type: ignore
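
    # Example (hypothetical names): given a concrete subclass instance `surrogate`
    # and an experiments frame, training is a single call, e.g.
    #     surrogate.fit(experiments)
    # Any `options` dict is forwarded as keyword arguments to the subclass's `_fit`.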

    def _preprocess_experiments(self, experiments: pd.DataFrame) -> pd.DataFrame:
if self._output_filtering is None:
return experiments
elif self._output_filtering == OutputFilteringEnum.ALL:
return self.outputs.preprocess_experiments_all_valid_outputs( # type: ignore
experiments=experiments,
output_feature_keys=self.outputs.get_keys(), # type: ignore
)
elif self._output_filtering == OutputFilteringEnum.ANY:
return self.outputs.preprocess_experiments_any_valid_outputs( # type: ignore
experiments=experiments,
output_feature_keys=self.outputs.get_keys(), # type: ignore
)
else:
raise ValueError("Unknown output filtering option requested.")

    @abstractmethod
def _fit(self, X: pd.DataFrame, Y: pd.DataFrame, **kwargs):
pass

    def cross_validate(
self,
experiments: pd.DataFrame,
folds: int = -1,
include_X: bool = False,
include_labcodes: bool = False,
random_state: Optional[int] = None,
stratified_feature: Optional[str] = None,
hooks: Optional[
Dict[
str,
Callable[
[
Surrogate,
pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
],
Any,
],
]
] = None,
hook_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Tuple[CvResults, CvResults, Dict[str, List[Any]]]:
"""Perform a cross validation for the provided training data.
Args:
experiments (pd.DataFrame): Data on which the cross validation should be performed.
folds (int, optional): Number of folds. -1 is equal to LOO CV. Defaults to -1.
include_X (bool, optional): If true the X values of the fold are written to respective CvResult objects for
later analysis. Defaults to False.
random_state (int, optional): Controls the randomness of the indices in the train and test sets of each fold.
Defaults to None.
stratified_feature (str, optional): The feature name to preserve the percentage of samples for each class in
the stratified folds. Defaults to None.
hooks (Dict[str, Callable[[Model, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], Any]], optional):
Dictionary of callable hooks that are called within the CV loop. The callable retrieves the current trained
modeld and the current CV folds in the following order: X_train, y_train, X_test, y_test. Defaults to {}.
hook_kwargs (Dict[str, Dict[str, Any]], optional): Dictionary holding hook specefic keyword arguments.
Defaults to {}.
Returns:
Tuple[CvResults, CvResults, Dict[str, List[Any]]]: First CvResults object reflects the training data,
second CvResults object the test data, dictionary object holds the return values of the applied hooks.
"""
if include_labcodes and "labcode" not in experiments.columns:
raise ValueError("No labcodes available for the provided experiments.")
if len(self.outputs) > 1: # type: ignore
raise NotImplementedError(
"Cross validation not implemented for multi-output models"
)
if stratified_feature is not None:
if stratified_feature not in (
self.inputs.get_keys() + self.outputs.get_keys() # type: ignore
):
raise ValueError(
"The feature to be stratified is not in the model inputs or outputs"
)
try:
feat = self.inputs.get_by_key(stratified_feature) # type: ignore
except KeyError:
feat = self.outputs.get_by_key(stratified_feature) # type: ignore
if not isinstance(
feat,
(DiscreteInput, CategoricalInput, CategoricalOutput, ContinuousOutput),
):
raise ValueError(
"The feature to be stratified needs to be a DiscreteInput, CategoricalInput, CategoricalOutput, or ContinuousOutput"
)
# first filter the experiments based on the model setting
experiments = self._preprocess_experiments(experiments)
n = len(experiments)
        if n == 0:
            raise ValueError("Experiments is empty.")
        if folds > n:
            warnings.warn(
                f"Training data only has {n} experiments, which is less than the requested "
                "number of folds; falling back to LOOCV."
            )
            folds = n
        elif folds < 2 and folds != -1:
            raise ValueError("Folds must be -1 for LOO CV, or > 1.")
        elif folds == -1:
            folds = n
# preprocess hooks
if hooks is None:
hooks = {}
if hook_kwargs is None:
hook_kwargs = {}
hook_results = {key: [] for key in hooks.keys()}
# instantiate kfold object
if stratified_feature is None:
cv = KFold(n_splits=folds, shuffle=True, random_state=random_state)
cv_func = cv.split(experiments)
else:
cv = StratifiedKFold(
n_splits=folds, shuffle=True, random_state=random_state
)
cv_func = cv.split(
experiments.drop([stratified_feature], axis=1),
experiments[stratified_feature],
)
key = self.outputs.get_keys()[0] # type: ignore
train_results = []
test_results = []
# now get the indices for the split
for train_index, test_index in cv_func:
X_train = experiments.iloc[train_index][self.inputs.get_keys()] # type: ignore
X_test = experiments.iloc[test_index][self.inputs.get_keys()] # type: ignore
y_train = experiments.iloc[train_index][self.outputs.get_keys()] # type: ignore
y_test = experiments.iloc[test_index][self.outputs.get_keys()] # type: ignore
train_labcodes = (
experiments.iloc[train_index]["labcode"] if include_labcodes else None
)
test_labcodes = (
experiments.iloc[test_index]["labcode"] if include_labcodes else None
)
# now fit the model
self._fit(X_train, y_train)
# now do the scoring
y_test_pred = self.predict(X_test) # type: ignore
y_train_pred = self.predict(X_train) # type: ignore
# now store the results
train_results.append(
CvResult( # type: ignore
key=key,
observed=y_train[key],
predicted=y_train_pred[key + "_pred"],
standard_deviation=y_train_pred[key + "_sd"],
X=X_train if include_X else None,
labcodes=train_labcodes,
)
)
test_results.append(
CvResult( # type: ignore
key=key,
observed=y_test[key],
predicted=y_test_pred[key + "_pred"],
standard_deviation=y_test_pred[key + "_sd"],
X=X_test if include_X else None,
labcodes=test_labcodes,
)
)
# now call the hooks if available
for hookname, hook in hooks.items():
hook_results[hookname].append(
hook(
surrogate=self, # type: ignore
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
**hook_kwargs.get(hookname, {}),
)
)
return (
CvResults(results=train_results),
CvResults(results=test_results),
hook_results,
)
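

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the library; `surrogate` and
# `experiments` are hypothetical placeholders). A concrete subclass implements
# `_fit` and inherits `predict`, `inputs`, and `outputs` from its `Surrogate`
# base. A hook receives the trained surrogate plus the current folds and may
# return anything, e.g. a per-fold test RMSE:
#
#     def rmse_hook(surrogate, X_train, y_train, X_test, y_test):
#         key = surrogate.outputs.get_keys()[0]
#         err = surrogate.predict(X_test)[key + "_pred"].to_numpy() - y_test[key].to_numpy()
#         return float(((err ** 2).mean()) ** 0.5)
#
#     train_cv, test_cv, hook_results = surrogate.cross_validate(
#         experiments,
#         folds=5,
#         include_X=True,
#         random_state=42,
#         hooks={"rmse": rmse_hook},
#     )
#
# The snippet below is a self-contained demonstration of the fold-splitting
# mechanics used by `cross_validate`; it relies only on sklearn, pandas, and
# numpy, and it uses made-up demo data.
if __name__ == "__main__":
    import numpy as np

    demo = pd.DataFrame(
        {
            "x1": np.linspace(0.0, 1.0, 8),
            "cat": ["a", "b"] * 4,  # stratification target with two classes
            "y": np.arange(8.0),
        }
    )
    # Plain KFold, as used when no stratified_feature is given.
    for train_idx, test_idx in KFold(
        n_splits=4, shuffle=True, random_state=42
    ).split(demo):
        print("train:", train_idx, "test:", test_idx)
    # StratifiedKFold preserves the class balance of `cat` in every fold,
    # mirroring the stratified_feature branch of cross_validate.
    for train_idx, test_idx in StratifiedKFold(
        n_splits=2, shuffle=True, random_state=42
    ).split(demo.drop(["cat"], axis=1), demo["cat"]):
        print("stratified train:", train_idx, "test:", test_idx)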