
Commit

Add CastRegression unit tests
ThomasMeissnerDS committed Dec 2, 2023
1 parent 4c08064 · commit 02b7b2a
Showing 7 changed files with 322 additions and 24 deletions.
5 changes: 2 additions & 3 deletions bluecast/blueprints/cast_cv_regression.py
@@ -4,10 +4,9 @@
 from sklearn.model_selection import StratifiedKFold

 from bluecast.blueprints.cast_regression import BlueCastRegression
+from bluecast.config.training_config import TrainingConfig, XgboostFinalParamConfig
 from bluecast.config.training_config import (
-    TrainingConfig,
-    XgboostFinalParamConfig,
-    XgboostTuneParamsConfig,
+    XgboostTuneParamsRegressionConfig as XgboostTuneParamsConfig,
 )
 from bluecast.experimentation.tracking import ExperimentTracker
 from bluecast.general_utils.general_utils import logger
5 changes: 2 additions & 3 deletions bluecast/blueprints/cast_regression.py
@@ -13,10 +13,9 @@
 import numpy as np
 import pandas as pd

+from bluecast.config.training_config import TrainingConfig, XgboostFinalParamConfig
 from bluecast.config.training_config import (
-    TrainingConfig,
-    XgboostFinalParamConfig,
-    XgboostTuneParamsConfig,
+    XgboostTuneParamsRegressionConfig as XgboostTuneParamsConfig,
 )
 from bluecast.evaluation.eval_metrics import eval_regressor
 from bluecast.evaluation.shap_values import shap_explanations
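Both blueprint modules now pull the regression search space in under the old name, so downstream code that references XgboostTuneParamsConfig needs no changes. A minimal illustration of the aliasing pattern (hypothetical snippet, not part of this commit; defaults per the class added to training_config.py below):

from bluecast.config.training_config import (
    XgboostTuneParamsRegressionConfig as XgboostTuneParamsConfig,
)

# The alias resolves to the regression config, so regression defaults apply.
conf = XgboostTuneParamsConfig()
print(conf.model_objective)    # "reg:squarederror"
print(conf.model_eval_metric)  # "mse"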
32 changes: 32 additions & 0 deletions bluecast/config/training_config.py
@@ -110,6 +110,38 @@ class XgboostTuneParamsConfig(BaseModel):
booster: str = "gbtree"


class XgboostTuneParamsRegressionConfig(BaseModel):
"""Define hyperparameter tuning search space."""

max_depth_min: int = 2
max_depth_max: int = 6
alpha_min: float = 0.0
alpha_max: float = 10.0
lambda_min: float = 0.0
lambda_max: float = 10.0
gamma_min: float = 0.0
gamma_max: float = 10.0
subsample_min: float = 0.0
subsample_max: float = 10.0
max_leaves_min: int = 0
max_leaves_max: int = 0
sub_sample_min: float = 0.3
sub_sample_max: float = 1.0
col_sample_by_tree_min: float = 0.3
col_sample_by_tree_max: float = 1.0
col_sample_by_level_min: float = 0.3
col_sample_by_level_max: float = 1.0
eta_min: float = 0.001
eta_max: float = 0.3
steps_min: int = 2
steps_max: int = 1000
model_verbosity: int = 0
model_verbosity_during_final_training: int = 0
model_objective: str = "reg:squarederror"
model_eval_metric: str = "mse"
booster: str = "gbtree"


@dataclass
class XgboostFinalParamConfig:
"""Define final hyper parameters."""
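The new Pydantic model mirrors the classification search space with regression defaults; note that, as committed, it carries both subsample_* (0.0-10.0) and sub_sample_* (0.3-1.0) bounds. The tests below shrink the search space by mutating fields directly; a short sketch of that pattern (illustrative, not part of this commit):

from bluecast.config.training_config import XgboostTuneParamsRegressionConfig

conf = XgboostTuneParamsRegressionConfig()
conf.steps_max = 100    # fewer boosting rounds while tuning
conf.max_depth_max = 3  # cap the tree-depth search range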
17 changes: 0 additions & 17 deletions bluecast/ml_modelling/xgboost_regression.py
@@ -41,23 +41,6 @@ def __init__(
         self.model: Optional[xgb.XGBClassifier] = None
         self.class_problem = class_problem
         self.conf_training = conf_training

-        if not conf_xgboost:
-            logger(
-                "No Xgboost config provided. Change default config to fit regression task."
-            )
-            conf_xgboost = XgboostTuneParamsConfig()
-            conf_xgboost.model_objective = "reg:squarederror"
-            conf_xgboost.model_eval_metric = "mse"
-        self.conf_xgboost = conf_xgboost
-
-        if not conf_params_xgboost:
-            logger(
-                "No Xgboost final config provided. Change default config to fit regression task."
-            )
-            conf_params_xgboost = XgboostFinalParamConfig()
-            conf_params_xgboost.params["objective"] = "reg:squarederror"
-            conf_params_xgboost.params["eval_metric"] = "mse"
-        self.conf_params_xgboost = conf_params_xgboost

         self.experiment_tracker = experiment_tracker
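With regression defaults baked into XgboostTuneParamsRegressionConfig, the fallback logic deleted above becomes redundant. A speculative sketch of the slimmed-down constructor (the commit shows only the deletion, not the replacement, so this is an assumption):

# Hypothetical replacement -- not shown in this diff.
if not conf_xgboost:
    conf_xgboost = XgboostTuneParamsRegressionConfig()  # reg:squarederror / mse by default
self.conf_xgboost = conf_xgboost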
41 changes: 40 additions & 1 deletion bluecast/tests/make_data/create_data.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-from sklearn.datasets import make_classification
+from sklearn.datasets import make_classification, make_regression


 def create_synthetic_dataframe(
@@ -82,3 +82,42 @@ def create_synthetic_multiclass_dataframe(
)

return df


def create_synthetic_dataframe_regression(
num_samples=1000, random_state: int = 20
) -> pd.DataFrame:
    # Generate synthetic regression data using make_regression
    x, y = make_regression(
        n_samples=num_samples,
        n_features=20,
        n_informative=20,
        random_state=random_state,
    )

# Create a datetime feature
start_date = pd.to_datetime("2022-01-01")
end_date = pd.to_datetime("2022-12-31")
datetime_feature = pd.date_range(
start=start_date, end=end_date, periods=num_samples
)

# Create categorical features
categorical_feature_1 = np.random.choice(["A", "B", "C"], size=num_samples)
categorical_feature_2 = np.random.choice(["X", "Y", "Z"], size=num_samples)

# Create a DataFrame
df = pd.DataFrame(
{
"categorical_feature_1": categorical_feature_1,
"categorical_feature_2": categorical_feature_2,
"numerical_feature_1": x[:, 0],
"numerical_feature_2": x[:, 1],
"numerical_feature_3": x[:, 2],
"datetime_feature": datetime_feature,
"target": y,
}
)

return df
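A quick sanity check of the new helper (illustrative usage, not part of the commit):

df = create_synthetic_dataframe_regression(num_samples=100, random_state=0)
assert df.shape == (100, 7)              # 2 categorical + 3 numerical + datetime + target
assert df["target"].dtype.kind == "f"    # continuous target from make_regression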
Empty file.
246 changes: 246 additions & 0 deletions bluecast/tests/test_cast_regression.py
@@ -0,0 +1,246 @@
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import pytest
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import KFold

from bluecast.blueprints.cast_regression import BlueCastRegression
from bluecast.config.training_config import TrainingConfig
from bluecast.config.training_config import (
XgboostTuneParamsRegressionConfig as XgboostTuneParamsConfig,
)
from bluecast.ml_modelling.base_classes import BaseClassMlRegressionModel
from bluecast.preprocessing.custom import CustomPreprocessing
from bluecast.tests.make_data.create_data import create_synthetic_dataframe_regression


@pytest.fixture
def synthetic_train_test_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
df_train = create_synthetic_dataframe_regression(2000, random_state=20)
df_val = create_synthetic_dataframe_regression(2000, random_state=200)
return df_train, df_val


def test_blueprint_xgboost(synthetic_train_test_data):
"""Test that tests the BlueCast class"""
df_train = synthetic_train_test_data[0]
df_val = synthetic_train_test_data[1]
xgboost_param_config = XgboostTuneParamsConfig()
xgboost_param_config.steps_max = 100
xgboost_param_config.max_depth_max = 3

# add custom last mile computation
class MyCustomLastMilePreprocessing(CustomPreprocessing):
def custom_function(self, df: pd.DataFrame) -> pd.DataFrame:
df["custom_col"] = 5
return df

def fit_transform(
self, df: pd.DataFrame, target: pd.Series
) -> Tuple[pd.DataFrame, pd.Series]:
df = self.custom_function(df)
df = df.head(1000)
target = target.head(1000)
return df, target

def transform(
self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df = self.custom_function(df)
if not predicton_mode and isinstance(target, pd.Series):
df = df.head(100)
target = target.head(100)
return df, target

custom_last_mile_computation = MyCustomLastMilePreprocessing()

automl = BlueCastRegression(
class_problem="regression",
target_column="target",
conf_xgboost=xgboost_param_config,
custom_last_mile_computation=custom_last_mile_computation,
)
automl.fit_eval(
df_train,
df_train.drop("target", axis=1),
df_train["target"],
target_col="target",
)
print("Autotuning successful.")
y_preds = automl.predict(df_val.drop("target", axis=1))
print("Predicting successful.")
assert len(y_preds) == len(df_val.index)


class CustomModel(BaseClassMlRegressionModel):
def __init__(self):
self.model = None

def fit(
self,
x_train: pd.DataFrame,
x_test: pd.DataFrame,
y_train: pd.Series,
y_test: pd.Series,
) -> None:
self.model = RandomForestClassifier()
self.model.fit(x_train, y_train)

def predict(self, df: pd.DataFrame) -> np.ndarray:
preds = self.model.predict(df)
return preds
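

# --- Illustrative variant (editor's sketch, not part of this commit) ---
# CustomModel uses RandomForestClassifier inside BaseClassMlRegressionModel; that
# works in the test below only because the hand-made target is binary (0/1).
# A regressor-backed equivalent:
from sklearn.ensemble import RandomForestRegressor  # hypothetical extra import


class CustomRegressorModel(BaseClassMlRegressionModel):
    def __init__(self):
        self.model = None

    def fit(
        self,
        x_train: pd.DataFrame,
        x_test: pd.DataFrame,
        y_train: pd.Series,
        y_test: pd.Series,
    ) -> None:
        self.model = RandomForestRegressor()
        self.model.fit(x_train, y_train)

    def predict(self, df: pd.DataFrame) -> np.ndarray:
        return self.model.predict(df)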


def test_bluecast_with_custom_model():
# Create an instance of the custom model
custom_model = CustomModel()
train_config = TrainingConfig()
train_config.hyperparameter_tuning_rounds = 10
train_config.enable_feature_selection = True
train_config.hypertuning_cv_folds = 2
train_config.enable_grid_search_fine_tuning = True
train_config.gridsearch_nb_parameters_per_grid = 2

xgboost_param_config = XgboostTuneParamsConfig()
xgboost_param_config.steps_max = 100
xgboost_param_config.max_depth_max = 3

# add custom feature selection
class RFECVSelector(CustomPreprocessing):
def __init__(self, random_state: int = 0):
super().__init__()
self.selected_features = None
self.random_state = random_state
self.selection_strategy: RFECV = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=KFold(2, random_state=random_state, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=2,
)

def fit_transform(
self, df: pd.DataFrame, target: pd.Series
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
self.selection_strategy.fit(df, target)
self.selected_features = self.selection_strategy.support_
df = df.loc[:, self.selected_features]
return df, target

def transform(
self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df = df.loc[:, self.selected_features]
return df, target

class MyCustomPreprocessor(CustomPreprocessing):
def __init__(self, random_state: int = 0):
super().__init__()
self.selected_features = None
self.random_state = random_state
self.selection_strategy: RFECV = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=KFold(2, random_state=random_state, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=2,
)

def fit_transform(
self, df: pd.DataFrame, target: pd.Series
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
return df, target

def transform(
self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
return df, target

class MyCustomInFoldPreprocessor(CustomPreprocessing):
def __init__(self):
super().__init__()

def fit_transform(
self, df: pd.DataFrame, target: pd.Series
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df["leakage"] = target
return df, target

def transform(
self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df["leakage"] = 0
return df, target

custom_feature_selector = RFECVSelector()
    custom_preproc = MyCustomPreprocessor()
custom_infold_preproc = MyCustomInFoldPreprocessor()

# Create an instance of the BlueCast class with the custom model
bluecast = BlueCastRegression(
class_problem="regression",
target_column="target",
ml_model=custom_model,
conf_xgboost=xgboost_param_config,
conf_training=train_config,
custom_feature_selector=custom_feature_selector,
        custom_preprocessor=custom_preproc,
custom_in_fold_preprocessor=custom_infold_preproc,
)

# Create some sample data for testing
    x_train = pd.DataFrame(
        {
            "feature1": list(range(10)),
            "feature2": list(range(10)),
            "feature3": list(range(10)),
            "feature4": list(range(10)),
            "feature5": list(range(10)),
            "feature6": list(range(10)),
        }
    )
    y_train = pd.Series([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
    x_test = pd.DataFrame(
        {
            "feature1": list(range(10)),
            "feature2": list(range(10)),
            "feature3": list(range(10)),
            "feature4": list(range(10)),
            "feature5": list(range(10)),
            "feature6": list(range(10)),
        }
    )

x_train["target"] = y_train

# Fit the BlueCast model using the custom model
bluecast.fit(x_train, "target")

# Predict on the test data using the custom model
preds = bluecast.predict(x_test)

# Assert the expected results
assert isinstance(preds, np.ndarray)
print(bluecast.experiment_tracker.experiment_id)
assert (
len(bluecast.experiment_tracker.experiment_id) == 0
) # due to custom model and fit method
