
Commit

Add CastRegression unit tests
ThomasMeissnerDS committed Dec 2, 2023
1 parent 4c08064 · commit 02b7b2a
Showing 7 changed files with 322 additions and 24 deletions.
5 changes: 2 additions & 3 deletions bluecast/blueprints/cast_cv_regression.py
@@ -4,10 +4,9 @@
 from sklearn.model_selection import StratifiedKFold

 from bluecast.blueprints.cast_regression import BlueCastRegression
+from bluecast.config.training_config import TrainingConfig, XgboostFinalParamConfig
 from bluecast.config.training_config import (
-    TrainingConfig,
-    XgboostFinalParamConfig,
-    XgboostTuneParamsConfig,
+    XgboostTuneParamsRegressionConfig as XgboostTuneParamsConfig,
 )
 from bluecast.experimentation.tracking import ExperimentTracker
 from bluecast.general_utils.general_utils import logger
5 changes: 2 additions & 3 deletions bluecast/blueprints/cast_regression.py
@@ -13,10 +13,9 @@
 import numpy as np
 import pandas as pd

+from bluecast.config.training_config import TrainingConfig, XgboostFinalParamConfig
 from bluecast.config.training_config import (
-    TrainingConfig,
-    XgboostFinalParamConfig,
-    XgboostTuneParamsConfig,
+    XgboostTuneParamsRegressionConfig as XgboostTuneParamsConfig,
 )
 from bluecast.evaluation.eval_metrics import eval_regressor
 from bluecast.evaluation.shap_values import shap_explanations
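Both blueprint modules now pull the regression search space in under the old name, so downstream code that references XgboostTuneParamsConfig needs no changes. A minimal illustration of the aliasing pattern (hypothetical snippet, not part of this commit; defaults per the class added to training_config.py below):

from bluecast.config.training_config import (
    XgboostTuneParamsRegressionConfig as XgboostTuneParamsConfig,
)

# The alias resolves to the regression config, so regression defaults apply.
conf = XgboostTuneParamsConfig()
print(conf.model_objective)    # "reg:squarederror"
print(conf.model_eval_metric)  # "mse"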
32 changes: 32 additions & 0 deletions bluecast/config/training_config.py
@@ -110,6 +110,38 @@ class XgboostTuneParamsConfig(BaseModel):
booster: str = "gbtree"


class XgboostTuneParamsRegressionConfig(BaseModel):
"""Define hyperparameter tuning search space."""

max_depth_min: int = 2
max_depth_max: int = 6
alpha_min: float = 0.0
alpha_max: float = 10.0
lambda_min: float = 0.0
lambda_max: float = 10.0
gamma_min: float = 0.0
gamma_max: float = 10.0
subsample_min: float = 0.0
subsample_max: float = 10.0
max_leaves_min: int = 0
max_leaves_max: int = 0
sub_sample_min: float = 0.3
sub_sample_max: float = 1.0
col_sample_by_tree_min: float = 0.3
col_sample_by_tree_max: float = 1.0
col_sample_by_level_min: float = 0.3
col_sample_by_level_max: float = 1.0
eta_min: float = 0.001
eta_max: float = 0.3
steps_min: int = 2
steps_max: int = 1000
model_verbosity: int = 0
model_verbosity_during_final_training: int = 0
model_objective: str = "reg:squarederror"
model_eval_metric: str = "mse"
booster: str = "gbtree"


@dataclass
class XgboostFinalParamConfig:
"""Define final hyper parameters."""
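The new Pydantic model mirrors the classification search space with regression defaults; note that, as committed, it carries both subsample_* (0.0-10.0) and sub_sample_* (0.3-1.0) bounds. The tests below shrink the search space by mutating fields directly; a short sketch of that pattern (illustrative, not part of this commit):

from bluecast.config.training_config import XgboostTuneParamsRegressionConfig

conf = XgboostTuneParamsRegressionConfig()
conf.steps_max = 100    # fewer boosting rounds while tuning
conf.max_depth_max = 3  # cap the tree-depth search range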
17 changes: 0 additions & 17 deletions bluecast/ml_modelling/xgboost_regression.py
@@ -41,23 +41,6 @@ def __init__(
         self.model: Optional[xgb.XGBClassifier] = None
         self.class_problem = class_problem
         self.conf_training = conf_training

-        if not conf_xgboost:
-            logger(
-                "No Xgboost config provided. Change default config to fit regression task."
-            )
-            conf_xgboost = XgboostTuneParamsConfig()
-            conf_xgboost.model_objective = "reg:squarederror"
-            conf_xgboost.model_eval_metric = "mse"
-        self.conf_xgboost = conf_xgboost
-
-        if not conf_params_xgboost:
-            logger(
-                "No Xgboost final config provided. Change default config to fit regression task."
-            )
-            conf_params_xgboost = XgboostFinalParamConfig()
-            conf_params_xgboost.params["objective"] = "reg:squarederror"
-            conf_params_xgboost.params["eval_metric"] = "mse"
-        self.conf_params_xgboost = conf_params_xgboost

         self.experiment_tracker = experiment_tracker
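With regression defaults baked into XgboostTuneParamsRegressionConfig, the fallback logic deleted above becomes redundant. A speculative sketch of the slimmed-down constructor (the commit shows only the deletion, not the replacement, so this is an assumption):

# Hypothetical replacement -- not shown in this diff.
if not conf_xgboost:
    conf_xgboost = XgboostTuneParamsRegressionConfig()  # reg:squarederror / mse by default
self.conf_xgboost = conf_xgboost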
41 changes: 40 additions & 1 deletion bluecast/tests/make_data/create_data.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-from sklearn.datasets import make_classification
+from sklearn.datasets import make_classification, make_regression


 def create_synthetic_dataframe(
@@ -82,3 +82,42 @@ def create_synthetic_multiclass_dataframe(
)

return df


def create_synthetic_dataframe_regression(
num_samples=1000, random_state: int = 20
) -> pd.DataFrame:
    # Generate synthetic regression data using make_regression
    x, y = make_regression(
        n_samples=num_samples,
        n_features=20,
        n_informative=20,
        random_state=random_state,
    )

# Create a datetime feature
start_date = pd.to_datetime("2022-01-01")
end_date = pd.to_datetime("2022-12-31")
datetime_feature = pd.date_range(
start=start_date, end=end_date, periods=num_samples
)

# Create categorical features
categorical_feature_1 = np.random.choice(["A", "B", "C"], size=num_samples)
categorical_feature_2 = np.random.choice(["X", "Y", "Z"], size=num_samples)

# Create a DataFrame
df = pd.DataFrame(
{
"categorical_feature_1": categorical_feature_1,
"categorical_feature_2": categorical_feature_2,
"numerical_feature_1": x[:, 0],
"numerical_feature_2": x[:, 1],
"numerical_feature_3": x[:, 2],
"datetime_feature": datetime_feature,
"target": y,
}
)

return df
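A quick sanity check of the new helper (illustrative usage, not part of the commit):

df = create_synthetic_dataframe_regression(num_samples=100, random_state=0)
assert df.shape == (100, 7)              # 2 categorical + 3 numerical + datetime + target
assert df["target"].dtype.kind == "f"    # continuous target from make_regression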
Empty file.
246 changes: 246 additions & 0 deletions bluecast/tests/test_cast_regression.py
@@ -0,0 +1,246 @@
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import pytest
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import KFold

from bluecast.blueprints.cast_regression import BlueCastRegression
from bluecast.config.training_config import TrainingConfig
from bluecast.config.training_config import (
XgboostTuneParamsRegressionConfig as XgboostTuneParamsConfig,
)
from bluecast.ml_modelling.base_classes import BaseClassMlRegressionModel
from bluecast.preprocessing.custom import CustomPreprocessing
from bluecast.tests.make_data.create_data import create_synthetic_dataframe_regression


@pytest.fixture
def synthetic_train_test_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
df_train = create_synthetic_dataframe_regression(2000, random_state=20)
df_val = create_synthetic_dataframe_regression(2000, random_state=200)
return df_train, df_val


def test_blueprint_xgboost(synthetic_train_test_data):
"""Test that tests the BlueCast class"""
df_train = synthetic_train_test_data[0]
df_val = synthetic_train_test_data[1]
xgboost_param_config = XgboostTuneParamsConfig()
xgboost_param_config.steps_max = 100
xgboost_param_config.max_depth_max = 3

# add custom last mile computation
class MyCustomLastMilePreprocessing(CustomPreprocessing):
def custom_function(self, df: pd.DataFrame) -> pd.DataFrame:
df["custom_col"] = 5
return df

def fit_transform(
self, df: pd.DataFrame, target: pd.Series
) -> Tuple[pd.DataFrame, pd.Series]:
df = self.custom_function(df)
df = df.head(1000)
target = target.head(1000)
return df, target

def transform(
self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df = self.custom_function(df)
if not predicton_mode and isinstance(target, pd.Series):
df = df.head(100)
target = target.head(100)
return df, target

custom_last_mile_computation = MyCustomLastMilePreprocessing()

automl = BlueCastRegression(
class_problem="regression",
target_column="target",
conf_xgboost=xgboost_param_config,
custom_last_mile_computation=custom_last_mile_computation,
)
automl.fit_eval(
df_train,
df_train.drop("target", axis=1),
df_train["target"],
target_col="target",
)
print("Autotuning successful.")
y_preds = automl.predict(df_val.drop("target", axis=1))
print("Predicting successful.")
assert len(y_preds) == len(df_val.index)


class CustomModel(BaseClassMlRegressionModel):
def __init__(self):
self.model = None

def fit(
self,
x_train: pd.DataFrame,
x_test: pd.DataFrame,
y_train: pd.Series,
y_test: pd.Series,
) -> None:
self.model = RandomForestClassifier()
self.model.fit(x_train, y_train)

def predict(self, df: pd.DataFrame) -> np.ndarray:
preds = self.model.predict(df)
return preds
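

# --- Illustrative variant (editor's sketch, not part of this commit) ---
# CustomModel uses RandomForestClassifier inside BaseClassMlRegressionModel; that
# works in the test below only because the hand-made target is binary (0/1).
# A regressor-backed equivalent:
from sklearn.ensemble import RandomForestRegressor  # hypothetical extra import


class CustomRegressorModel(BaseClassMlRegressionModel):
    def __init__(self):
        self.model = None

    def fit(
        self,
        x_train: pd.DataFrame,
        x_test: pd.DataFrame,
        y_train: pd.Series,
        y_test: pd.Series,
    ) -> None:
        self.model = RandomForestRegressor()
        self.model.fit(x_train, y_train)

    def predict(self, df: pd.DataFrame) -> np.ndarray:
        return self.model.predict(df)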


def test_bluecast_with_custom_model():
# Create an instance of the custom model
custom_model = CustomModel()
train_config = TrainingConfig()
train_config.hyperparameter_tuning_rounds = 10
train_config.enable_feature_selection = True
train_config.hypertuning_cv_folds = 2
train_config.enable_grid_search_fine_tuning = True
train_config.gridsearch_nb_parameters_per_grid = 2

xgboost_param_config = XgboostTuneParamsConfig()
xgboost_param_config.steps_max = 100
xgboost_param_config.max_depth_max = 3

# add custom feature selection
class RFECVSelector(CustomPreprocessing):
def __init__(self, random_state: int = 0):
super().__init__()
self.selected_features = None
self.random_state = random_state
self.selection_strategy: RFECV = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=KFold(2, random_state=random_state, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=2,
)

def fit_transform(
self, df: pd.DataFrame, target: pd.Series
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
self.selection_strategy.fit(df, target)
self.selected_features = self.selection_strategy.support_
df = df.loc[:, self.selected_features]
return df, target

def transform(
self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df = df.loc[:, self.selected_features]
return df, target

class MyCustomPreprocessor(CustomPreprocessing):
def __init__(self, random_state: int = 0):
super().__init__()
self.selected_features = None
self.random_state = random_state
self.selection_strategy: RFECV = RFECV(
estimator=xgb.XGBClassifier(),
step=1,
cv=KFold(2, random_state=random_state, shuffle=True),
min_features_to_select=1,
scoring=make_scorer(matthews_corrcoef),
n_jobs=2,
)

def fit_transform(
self, df: pd.DataFrame, target: pd.Series
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
return df, target

def transform(
self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
return df, target

class MyCustomInFoldPreprocessor(CustomPreprocessing):
def __init__(self):
super().__init__()

def fit_transform(
self, df: pd.DataFrame, target: pd.Series
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df["leakage"] = target
return df, target

def transform(
self,
df: pd.DataFrame,
target: Optional[pd.Series] = None,
predicton_mode: bool = False,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
df["leakage"] = 0
return df, target

custom_feature_selector = RFECVSelector()
    custom_preproc = MyCustomPreprocessor()
custom_infold_preproc = MyCustomInFoldPreprocessor()

# Create an instance of the BlueCast class with the custom model
bluecast = BlueCastRegression(
class_problem="regression",
target_column="target",
ml_model=custom_model,
conf_xgboost=xgboost_param_config,
conf_training=train_config,
custom_feature_selector=custom_feature_selector,
        custom_preprocessor=custom_preproc,
custom_in_fold_preprocessor=custom_infold_preproc,
)

# Create some sample data for testing
    x_train = pd.DataFrame(
        {
            "feature1": list(range(10)),
            "feature2": list(range(10)),
            "feature3": list(range(10)),
            "feature4": list(range(10)),
            "feature5": list(range(10)),
            "feature6": list(range(10)),
        }
    )
    y_train = pd.Series([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
    x_test = pd.DataFrame(
        {
            "feature1": list(range(10)),
            "feature2": list(range(10)),
            "feature3": list(range(10)),
            "feature4": list(range(10)),
            "feature5": list(range(10)),
            "feature6": list(range(10)),
        }
    )

x_train["target"] = y_train

# Fit the BlueCast model using the custom model
bluecast.fit(x_train, "target")

# Predict on the test data using the custom model
preds = bluecast.predict(x_test)

# Assert the expected results
assert isinstance(preds, np.ndarray)
print(bluecast.experiment_tracker.experiment_id)
assert (
len(bluecast.experiment_tracker.experiment_id) == 0
) # due to custom model and fit method
