Commit

Fix Catboost tuning
ThomasMeissnerDS committed Jan 20, 2025
1 parent 5b96c69 commit 0f6e1c1
Showing 4 changed files with 86 additions and 4 deletions.
6 changes: 4 additions & 2 deletions bluecast/config/training_config.py
@@ -515,7 +515,9 @@ def __init__(
         catboost_eval_metric_tune_direction: str = "minimize",
     ):
         if bootstrap_type is None:
-            bootstrap_type = ["Bayesian", "Poisson", "MVS", "No"]
+            bootstrap_type = [
+                "Bayesian"
+            ]  # Poisson not possible on CPU, "MVS" requires min samples
         if grow_policy is None:
             grow_policy = ["SymmetricTree"]

@@ -613,7 +615,7 @@ def __init__(
         catboost_eval_metric_tune_direction: str = "minimize",
     ):
         if bootstrap_type is None:
-            bootstrap_type = ["Bayesian", "Poisson", "MVS", "No"]
+            bootstrap_type = ["Bayesian", "Poisson", "MVS"]
         if grow_policy is None:
             grow_policy = ["SymmetricTree"]
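Context for the narrowed defaults: as the in-diff comment notes, CatBoost's Poisson bootstrap only runs on GPU and MVS needs a minimum number of samples, so leaving them in the default CPU search space can abort a tuning study on small datasets. A minimal sketch, assuming an Optuna objective like BlueCast's tuner uses (the helper name and bounds are illustrative, not part of the library), of keeping the search space consistent with the sampled bootstrap_type:

import optuna


def suggest_bootstrap_params(trial: optuna.Trial, on_gpu: bool) -> dict:
    # Illustrative helper, not BlueCast's API. Poisson is GPU-only in
    # CatBoost, so it is only offered as a candidate when training on GPU.
    candidates = ["Bayesian", "MVS", "Poisson"] if on_gpu else ["Bayesian", "MVS"]
    bootstrap_type = trial.suggest_categorical("bootstrap_type", candidates)
    params = {"bootstrap_type": bootstrap_type}
    if bootstrap_type == "Bayesian":
        # bagging_temperature is only meaningful for the Bayesian bootstrap.
        params["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 0.0, 10.0
        )
    else:
        # subsample applies to the non-Bayesian bootstrap types.
        params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)
    return params

Sampling the dependent parameter only inside the matching branch avoids ever building an invalid combination, which is the same constraint the guards in the next file enforce after the fact.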
9 changes: 8 additions & 1 deletion bluecast/ml_modelling/catboost.py
@@ -171,7 +171,6 @@ def objective(trial):
                 "objective": self.conf_catboost.catboost_objective,
                 "eval_metric": self.conf_catboost.catboost_eval_metric,
                 "random_seed": self.conf_training.global_random_state,
-                # Some typical CatBoost hyperparameters:
                 "learning_rate": trial.suggest_float(
                     "learning_rate",
                     self.conf_catboost.learning_rate_min,
@@ -228,6 +227,10 @@ def objective(trial):
                     log=True,
                 ),
             }
+            if params["bootstrap_type"] == "Bayesian":
+                params["bagging_temperature"] = None
+                params["subsample"] = None
+
             params = {**params, **train_on}

             sample_weight_choice = trial.suggest_categorical(
@@ -380,6 +383,10 @@ def objective(trial):
             }
             final_best_params = {**final_best_params, **train_on}

+            if final_best_params["bootstrap_type"] == "Bayesian":
+                final_best_params.pop("subsample", None)
+                final_best_params.pop("bagging_temperature", None)
+
             final_best_params = update_params_with_best_params(
                 final_best_params, best_param
             )
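The two guards above are the core of the fix: a trial can draw bootstrap_type="Bayesian" while the parameter dict still carries subsample, a combination CatBoost rejects, so both tuned sampling keys are neutralized (set to None inside the objective, popped before the final refit) and CatBoost falls back to its defaults. A minimal standalone sketch of that sanitizing step; the helper name is illustrative, not BlueCast's API:

def drop_bayesian_incompatible(params: dict) -> dict:
    # When the Bayesian bootstrap is sampled, remove the tuned sampling
    # parameters: subsample is only valid for the Bernoulli/MVS/Poisson
    # bootstraps, and the commit drops bagging_temperature alongside it so
    # the final dict carries no leftover values from other branches.
    cleaned = dict(params)
    if cleaned.get("bootstrap_type") == "Bayesian":
        cleaned.pop("subsample", None)
        cleaned.pop("bagging_temperature", None)
    return cleaned


# Example: a Bayesian trial that still carries a subsample value.
tuned = {"bootstrap_type": "Bayesian", "subsample": 0.8, "depth": 6}
assert drop_bayesian_incompatible(tuned) == {"bootstrap_type": "Bayesian", "depth": 6}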
9 changes: 9 additions & 0 deletions bluecast/ml_modelling/catboost_regression.py
@@ -222,6 +222,11 @@ def objective(trial):
                     log=True,
                 ),
             }
+
+            if params["bootstrap_type"] == "Bayesian":
+                params["bagging_temperature"] = None
+                params["subsample"] = None
+
             params = {**params, **train_on}

             train_pool = Pool(x_train, label=y_train, cat_features=self.cat_columns)
@@ -354,6 +359,10 @@ def objective(trial):
             # Merge device or other settings
             final_best_params = {**final_best_params, **train_on}

+            if final_best_params["bootstrap_type"] == "Bayesian":
+                final_best_params.pop("subsample", None)
+                final_best_params.pop("bagging_temperature", None)
+
             # Optionally apply a custom function to finalize best params
             final_best_params = update_params_with_best_params(
                 final_best_params, catboost_best_param
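The regression path receives the identical guard in both places. For context, a hypothetical repro of the failure mode being guarded against (not taken from the repo; assumes a recent CatBoost release, where subsample is only accepted with the Bernoulli, MVS, or Poisson bootstraps):

from catboost import CatBoostError, CatBoostRegressor

try:
    model = CatBoostRegressor(
        bootstrap_type="Bayesian",
        subsample=0.8,  # valid only for Bernoulli/MVS/Poisson bootstraps
        iterations=10,
        verbose=0,
    )
    model.fit([[0.0], [1.0], [2.0], [3.0]], [0.0, 1.0, 2.0, 3.0])
except CatBoostError as err:
    # An unguarded Optuna trial could previously produce exactly this
    # parameter combination mid-study.
    print(f"CatBoost rejected the parameter combination: {err}")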
66 changes: 65 additions & 1 deletion bluecast/tests/test_catboost.py
@@ -17,7 +17,71 @@ def test_bluecast_without_hyperparam_tuning():
     # Create an instance of the BlueCast class with the custom model
     bluecast = BlueCast(
         class_problem="binary",
-        ml_model=CatboostModel(class_problem='binary', conf_training=train_config, conf_catboost=catboost_pram_config),
+        ml_model=CatboostModel(
+            class_problem="binary",
+            conf_training=train_config,
+            conf_catboost=catboost_pram_config,
+        ),
         conf_xgboost=catboost_pram_config,
         conf_training=train_config,
     )
+
+    # Create some sample data for testing
+    x_train = pd.DataFrame(
+        {
+            "feature1": [i for i in range(20)],
+            "feature2": [i for i in range(20)],
+            "feature3": [i for i in range(20)],
+            "feature4": [i for i in range(20)],
+            "feature5": [i for i in range(20)],
+            "feature6": [i for i in range(20)],
+        }
+    )
+    y_train = pd.Series([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
+    x_test = pd.DataFrame(
+        {
+            "feature1": [i for i in range(10)],
+            "feature2": [i for i in range(10)],
+            "feature3": [i for i in range(10)],
+            "feature4": [i for i in range(10)],
+            "feature5": [i for i in range(10)],
+            "feature6": [i for i in range(10)],
+        }
+    )
+
+    x_train["target"] = y_train
+
+    # Fit the BlueCast model using the custom model
+    bluecast.fit(x_train, "target")
+
+    # Predict on the test data using the custom model
+    predicted_probas, predicted_classes = bluecast.predict(x_test)
+
+    # Assert the expected results
+    assert isinstance(predicted_probas, np.ndarray)
+    assert isinstance(predicted_classes, np.ndarray)
+    print(bluecast.experiment_tracker.experiment_id)
+    assert (
+        len(bluecast.experiment_tracker.experiment_id) == 0
+    )  # due to custom model and fit method
+
+
+def test_bluecast_with_hyperparam_tuning():
+    train_config = TrainingConfig()
+    train_config.hyperparameter_tuning_rounds = 10
+    train_config.hypertuning_cv_folds = 2
+    train_config.autotune_model = True
+
+    catboost_pram_config = CatboostTuneParamsConfig()
+
+    # Create an instance of the BlueCast class with the custom model
+    bluecast = BlueCast(
+        class_problem="binary",
+        ml_model=CatboostModel(
+            class_problem="binary",
+            conf_training=train_config,
+            conf_catboost=catboost_pram_config,
+        ),
+        conf_xgboost=catboost_pram_config,
+        conf_training=train_config,
+    )
