Refactor gridsearch (#45)
* Change hyper-opt logic to avoid fitting the same model multiple times

* Bump version

* Update docs

* Change non-existent variable name
lgmoneda authored Jan 3, 2022
1 parent 5688604 commit 43f49b4
Showing 3 changed files with 58 additions and 26 deletions.
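The first commit-message bullet is the substantive change: instead of building leave-one-environment-out test sets (which multiplied the number of model fits), the refactor fits each parameter candidate once per fold and scores that single fitted model separately on every environment through sklearn's multi-metric `scoring` dict. A minimal sketch of that trick, independent of this repo (all data and names below are illustrative):

```python
from functools import partial

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Toy data with an explicit environment column (illustrative only).
rng = np.random.default_rng(0)
X = pd.DataFrame({"x": rng.normal(size=400),
                  "env": rng.choice([2019, 2020], size=400)})
y = pd.Series(rng.integers(0, 2, size=400))

scorer = make_scorer(roc_auc_score, needs_proba=True)

def score_one_env(estimator, X_test, y_test, env):
    # Score the already-fitted estimator on a single environment's rows.
    mask = X_test["env"] == env
    return scorer(estimator, X_test[mask], y_test[mask])

# One named scorer per environment: GridSearchCV fits each candidate once
# per fold, then applies every scorer to the same fitted model.
scoring = {f"env_{env}": partial(score_one_env, env=env)
           for env in X["env"].unique()}

grid = GridSearchCV(RandomForestClassifier(n_estimators=10),
                    param_grid={"max_depth": [2, 4]},
                    scoring=scoring, refit=False, cv=3)
grid.fit(X, y)
print(sorted(k for k in grid.cv_results_ if k.startswith("mean_test_")))
```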
README.md: 19 changes (10 additions & 9 deletions)
@@ -51,24 +51,25 @@ aggregated. Options: {"avg": average, "max": maximum, the worst case}.

 To use the environment-wise optimization:

-```python
+```python
 from time_robust_forest.hyper_opt import env_wise_hyper_opt

-params_grid = {"n_estimators": [30, 60, 120],
+params_grid = {"n_estimators": [30, 60, 120],
                "max_depth": [5, 10],
                "min_impurity_decrease": [1e-1, 1e-3, 0],
                "min_sample_periods": [5, 10, 30],
                "period_criterion": ["max", "avg"]}

 model = TimeForestClassifier(time_column=time_column)
-opt_param = env_wise_hyper_opt(training_data[features + [time_column]],
-                               training_data[TARGET],
-                               model,
+
+opt_param = env_wise_hyper_opt(training_data[features + [time_column]],
+                               training_data[TARGET],
+                               model,
                                time_column,
                                params_grid,
                                cv=5,
-                               score=roc_auc_score)
+                               scorer=make_scorer(roc_auc_score,
+                                                  needs_proba=True))

 ```
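For the README example to run as shown, a few imports are needed that this part of the diff does not display; a minimal preamble (the `models` module path is an assumption, not confirmed by this diff):

```python
# Assumed preamble for the README snippet above; the TimeForestClassifier
# import path is an assumption based on the package layout.
from sklearn.metrics import make_scorer, roc_auc_score
from time_robust_forest.hyper_opt import env_wise_hyper_opt
from time_robust_forest.models import TimeForestClassifier
```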

@@ -82,7 +83,7 @@ Don't simply use a timestamp column from the dataset, make it discrete before an

 This project is licensed under the terms of the `BSD-3` license. See [LICENSE](https://github.com/lgmoneda/time-robust-forest/blob/main/LICENSE) for more details.

-## Useful links
+## Useful links

 - [Introducing the Time Robust Tree blog post](http://lgmoneda.github.io/2021/12/03/introducing-time-robust-tree.html)
pyproject.toml: 2 changes (1 addition & 1 deletion)
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "time-robust-forest"
-version = "0.1.11"
+version = "0.1.12"
 description = "Explores time information to train a robust random forest"
 readme = "README.md"
 authors = [
time_robust_forest/hyper_opt.py: 63 changes (47 additions & 16 deletions)
@@ -1,16 +1,21 @@
+from functools import partial
+
 import pandas as pd
 from sklearn.metrics import make_scorer, roc_auc_score
 from sklearn.model_selection import GridSearchCV, StratifiedKFold


 def extract_results_from_grid_cv(cv_results, kfolds, envs):
     """
-    Extract the resuls from a fitted grid search object from sklearn so
+    Extract the results from a fitted grid search object from sklearn
     to enable picking the best using custom logic.
     """

     split_keys = [i for i in cv_results.keys() if "split" in i]

     split_env = {
-        split_key: envs[i % len(envs)] for i, split_key in enumerate(split_keys)
+        split_key: split_key.split("env_")[-1]
+        for i, split_key in enumerate(split_keys)
     }
     params_idx = [i for i in range(len(cv_results["params"]))]
     all_folds_df = []
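The old mapping assigned environments to split keys by position (`envs[i % len(envs)]`), which breaks as soon as key order and environment order disagree; the new one reads the environment straight out of the key name. Under multi-metric scoring each per-split key carries the scorer name, so the suffix parse works like this (the key strings below are illustrative):

```python
# Illustrative cv_results_ keys under multi-metric scoring; the scorer
# repr varies, but every per-split key ends in the "env_<env>" suffix
# that the new dict comprehension extracts.
split_keys = [
    "split0_test_make_scorer(roc_auc_score, needs_proba=True)_env_2019",
    "split0_test_make_scorer(roc_auc_score, needs_proba=True)_env_2020",
]
split_env = {key: key.split("env_")[-1] for key in split_keys}
assert split_env[split_keys[0]] == "2019"
assert split_env[split_keys[1]] == "2020"
```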
@@ -43,26 +48,34 @@ def select_best_model_from_results_df(results_df):
     )
     results_df = results_df.groupby("params_idx").agg(second_agg_dict)

-    return results_df.iloc[results_df["perf"].argmax()]["params"]
+    return results_df.iloc[results_df["perf"].argmax()]["params"], results_df


-def leave_one_env_out_cv(data, env_column="period", cv=5):
+def env_stratified_folds(data, env_column="period", cv=5):
     """
-    Create folds that keep only one environment in the test fold.
+    Create folds that are stratified on the environment.
     """
     envs = data[env_column].unique()
     cv_sets = []
     kfolds = StratifiedKFold(n_splits=cv)
     for train_idx, test_idx in kfolds.split(data, data[env_column]):
-        for env in envs:
-            all_env_elements = data[data[env_column] == env].index
-            test_env_idx = [i for i in test_idx if i in all_env_elements]
-            cv_sets.append((train_idx, test_env_idx))
+        cv_sets.append((train_idx, test_idx))

     return cv_sets


-def grid_search(X, y, model, param_grid, env_cvs, score):
+def env_wise_score(estimator, X, y, scorer, env, env_column):
+    """
+    Filter data to evaluate only a specific environment using a
+    certain scorer.
+    """
+    env_mask = X[env_column] == env
+    evaluation = scorer(estimator, X[env_mask], y[env_mask])
+
+    return evaluation
+
+
+def grid_search(X, y, model, param_grid, env_cvs, scorer):
     """
     Fit the grid search and return it.
     """
@@ -71,30 +84,48 @@ def grid_search(X, y, model, param_grid, env_cvs, score):
         model,
         param_grid=param_grid,
         cv=env_cvs,
-        scoring=make_scorer(score),
+        scoring=scorer,
         n_jobs=-1,
         verbose=0,
         refit=False,
     )

     grid_cv.fit(X, y)
     return grid_cv


 def env_wise_hyper_opt(
-    X, y, model, env_column, param_grid, cv=5, score=roc_auc_score
+    X,
+    y,
+    model,
+    env_column,
+    param_grid,
+    cv=5,
+    scorer=make_scorer(roc_auc_score, needs_proba=True),
+    ret_results=False,
 ):
     """
     Optimize the hyperparameters of a model considering environment-wise
     cross-validation and selecting the worst case regarding the test
     performance in the different environments.
     """
-    env_cvs = leave_one_env_out_cv(X, env_column, cv)
+    env_cvs = env_stratified_folds(X, env_column, cv)
+    envs = X[env_column].unique()
+
+    scoring_fs = {
+        f"{scorer.__repr__()}_env_{env}": partial(
+            env_wise_score, scorer=scorer, env=env, env_column=env_column
+        )
+        for env in envs
+    }

-    grid_cv = grid_search(X, y, model, param_grid, env_cvs, score)
+    grid_cv = grid_search(X, y, model, param_grid, env_cvs, scoring_fs)

-    envs = X[env_column].unique()
     results_df = extract_results_from_grid_cv(grid_cv.cv_results_, cv, envs)

-    opt_params = select_best_model_from_results_df(results_df)
+    opt_params, agg_results_df = select_best_model_from_results_df(results_df)
+
+    if ret_results:
+        return opt_params, results_df, agg_results_df

     return opt_params
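Putting the new signature together, a usage sketch (the variable names and parameter grid below are illustrative stand-ins, not from the repo):

```python
# Usage sketch of the refactored entry point; train, features, target,
# and model are hypothetical stand-ins.
from sklearn.metrics import make_scorer, roc_auc_score

opt_params, results_df, agg_results_df = env_wise_hyper_opt(
    train[features + ["period"]],
    train[target],
    model,
    "period",
    {"n_estimators": [30, 60], "max_depth": [5, 10]},
    cv=5,
    scorer=make_scorer(roc_auc_score, needs_proba=True),
    ret_results=True,  # new flag: also return per-fold and aggregated results
)
```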
