[tests][python-package] change boston dataset to synthetic dataset in…

… tests that don't check score (#4895) * change boston dataset to synthetic dataset in tests that don't evaluate score * format imports
microsoft · Dec 20, 2021 · 8a34b1a · 8a34b1a
1 parent 8e729af
commit 8a34b1a
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 16 deletions.
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
@@ -18,7 +18,7 @@
 
 import lightgbm as lgb
 
-from .utils import load_boston, load_breast_cancer, load_digits, load_iris
+from .utils import load_boston, load_breast_cancer, load_digits, load_iris, make_synthetic_regression
 
 decreasing_generator = itertools.count(0, -1)
 
@@ -731,7 +731,7 @@ def test_continue_train():
 
 
 def test_continue_train_reused_dataset():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     params = {
         'objective': 'regression',
         'verbose': -1
@@ -791,7 +791,7 @@ def test_continue_train_multiclass():
 
 
 def test_cv():
-    X_train, y_train = load_boston(return_X_y=True)
+    X_train, y_train = make_synthetic_regression()
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False, override metric in params
@@ -887,7 +887,7 @@ def test_cvbooster():
 
 
 def test_feature_name():
-    X_train, y_train = load_boston(return_X_y=True)
+    X_train, y_train = make_synthetic_regression()
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     feature_names = [f'f_{i}' for i in range(X_train.shape[-1])]
@@ -917,7 +917,7 @@ def test_feature_name_with_non_ascii():
 
 def test_save_load_copy_pickle():
     def train_and_predict(init_model=None, return_model=False):
-        X, y = load_boston(return_X_y=True)
+        X, y = make_synthetic_regression()
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
         params = {
             'objective': 'regression',
@@ -2102,7 +2102,7 @@ def test_default_objective_and_metric():
 
 @pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason='not enough RAM')
 def test_model_size():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     data = lgb.Dataset(X, y)
     bst = lgb.train({'verbose': -1}, data, num_boost_round=2)
     y_pred = bst.predict(X)
@@ -2515,7 +2515,7 @@ def test_dataset_params_with_reference():
 
 def test_extra_trees():
     # check extra trees increases regularization
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     lgb_x = lgb.Dataset(X, label=y)
     params = {'objective': 'regression',
               'num_leaves': 32,
@@ -2534,7 +2534,7 @@ def test_extra_trees():
 
 def test_path_smoothing():
     # check path smoothing increases regularization
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     lgb_x = lgb.Dataset(X, label=y)
     params = {'objective': 'regression',
               'num_leaves': 32,
@@ -2804,7 +2804,7 @@ def inner_test(X, y, params, early_stopping_rounds):
         np.testing.assert_allclose(pred4, pred6)
 
     # test for regression
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     params = {
         'objective': 'regression',
         'verbose': -1,

diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
@@ -18,7 +18,8 @@
 
 import lightgbm as lgb
 
-from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking
+from .utils import (load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking,
+                    make_synthetic_regression)
 
 sk_version = parse_version(sk_version)
 if sk_version < parse_version("0.23"):
@@ -184,7 +185,7 @@ def test_eval_at_aliases():
 
 @pytest.mark.parametrize("custom_objective", [True, False])
 def test_objective_aliases(custom_objective):
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     if custom_objective:
         obj = custom_dummy_obj
@@ -440,7 +441,7 @@ def test_regressor_chain():
 
 
 def test_clone_and_property():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
     gbm.fit(X, y)
 
@@ -458,7 +459,7 @@ def test_clone_and_property():
 
 
 def test_joblib():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj,
                             verbose=-1, importance_type='split')
@@ -499,7 +500,7 @@ def test_non_serializable_objects_in_callbacks(tmp_path):
     with pytest.raises(Exception, match="This class in not picklable"):
         joblib.dump(unpicklable_callback, tmp_path / 'tmp.joblib')
 
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     gbm = lgb.LGBMRegressor(n_estimators=5)
     gbm.fit(X, y, callbacks=[unpicklable_callback])
     assert gbm.booster_.attr('attr_set_inside_callback') == '40'
@@ -757,7 +758,7 @@ def test_predict_with_params_from_init():
 
 
 def test_evaluate_train_set():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
     gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
@@ -1332,7 +1333,7 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
         X, y = load_iris(return_X_y=True)
         model_factory = lgb.LGBMClassifier
     elif task == 'regression':
-        X, y = load_boston(return_X_y=True)
+        X, y = make_synthetic_regression()
         model_factory = lgb.LGBMRegressor
     X = pd.DataFrame(X)
     y_col_array = y.reshape(-1, 1)

diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py
@@ -109,3 +109,8 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
         X[:, j] = bias + coef * y_vec
 
     return X, y_vec, group_id_vec
+
+
+@lru_cache(maxsize=None)
+def make_synthetic_regression(n_samples=100):
+    return sklearn.datasets.make_regression(n_samples, n_features=4, n_informative=2, random_state=42)