From 2b4c300b3f3c6d42d17da3a2c68d6f6e8e350fd1 Mon Sep 17 00:00:00 2001 From: Avi Shinnar Date: Tue, 6 Feb 2024 22:23:07 -0500 Subject: [PATCH] Add support for scikit-learn 1.4 Signed-off-by: Avi Shinnar --- docs/requirements.txt | 2 +- lale/datasets/openml/openml_datasets.py | 6 +- lale/helpers.py | 24 +++++++ lale/lib/aif360/bagging_orbis_classifier.py | 11 ++-- lale/lib/autogen/kernel_pca.py | 22 ++++++- lale/lib/autogen/kernel_ridge.py | 19 ++++++ lale/lib/autogen/lars.py | 24 ++++++- lale/lib/autogen/lars_cv.py | 23 ++++++- lale/lib/autogen/lasso_lars.py | 23 ++++++- lale/lib/autogen/lasso_lars_cv.py | 24 ++++++- lale/lib/autogen/lasso_lars_ic.py | 23 ++++++- .../autogen/orthogonal_matching_pursuit.py | 25 +++++++- .../autogen/orthogonal_matching_pursuit_cv.py | 26 +++++++- lale/lib/lightgbm/lgbm_classifier.py | 7 +++ lale/lib/sklearn/_common_schemas.py | 33 ++++++++++ lale/lib/sklearn/ada_boost_classifier.py | 26 ++++++++ lale/lib/sklearn/decision_tree_classifier.py | 5 ++ lale/lib/sklearn/decision_tree_regressor.py | 6 ++ lale/lib/sklearn/extra_trees_classifier.py | 6 ++ lale/lib/sklearn/extra_trees_regressor.py | 6 ++ lale/lib/sklearn/feature_agglomeration.py | 47 ++++++++++++++ lale/lib/sklearn/k_neighbors_regressor.py | 37 +++++++++++ lale/lib/sklearn/nmf.py | 29 +++++++++ lale/lib/sklearn/random_forest_classifier.py | 8 +++ lale/lib/sklearn/random_forest_regressor.py | 7 +++ lale/lib/sklearn/simple_imputer.py | 29 +++++---- setup.py | 2 +- test/test_aif360_ensembles.py | 53 +++++++++++----- test/test_core_classifiers.py | 29 ++++++--- test/test_core_misc.py | 16 +++-- test/test_grammar.py | 3 +- test/test_optimizers.py | 62 ++++++++++++++----- test/test_relational_sklearn.py | 17 ++++- test/test_replace.py | 19 ++++-- test/test_type_checking.py | 23 ++++--- 35 files changed, 625 insertions(+), 97 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 2e92a97c7..c49b59d35 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt 
@@ -5,7 +5,7 @@ graphviz hyperopt jsonschema jsonsubschema -scikit-learn>=1.0.0,<1.4 +scikit-learn>=1.0.0,<1.5.0 scipy pandas decorator diff --git a/lale/datasets/openml/openml_datasets.py b/lale/datasets/openml/openml_datasets.py index 8642968eb..7c1dfda7e 100644 --- a/lale/datasets/openml/openml_datasets.py +++ b/lale/datasets/openml/openml_datasets.py @@ -733,8 +733,12 @@ def fetch( ] txm1 = ColumnTransformer(transformers1, sparse_threshold=0.0) + if sklearn_version >= version.Version("1.2"): + ohe2 = OneHotEncoder(sparse_output=False) + else: + ohe2 = OneHotEncoder(sparse=False) transformers2 = [ - ("ohe", OneHotEncoder(sparse=False), list(range(len(categorical_cols)))), + ("ohe", ohe2, list(range(len(categorical_cols)))), ( "no_op", "passthrough", diff --git a/lale/helpers.py b/lale/helpers.py index 9306953a0..e0b0a6221 100644 --- a/lale/helpers.py +++ b/lale/helpers.py @@ -1308,6 +1308,30 @@ def get_sklearn_estimator_name() -> str: return "estimator" +def with_fixed_estimator_name(**kwargs): + """Some higher order sklearn operators changed the name of the nested estimator in later versions. + This fixes up the arguments, renaming estimator and base_estimator appropriately. 
+ """ + + if "base_estimator" in kwargs or "estimator" in kwargs: + from packaging import version + + import lale.operators + + if lale.operators.sklearn_version < version.Version("1.2"): + return { + "base_estimator" if k == "estimator" else k: v + for k, v in kwargs.items() + } + else: + return { + "estimator" if k == "base_estimator" else k: v + for k, v in kwargs.items() + } + + return kwargs + + def get_estimator_param_name_from_hyperparams(hyperparams): be = hyperparams.get("base_estimator", "deprecated") if be == "deprecated" or (be is None and "estimator" in hyperparams): diff --git a/lale/lib/aif360/bagging_orbis_classifier.py b/lale/lib/aif360/bagging_orbis_classifier.py index 1449501ed..e642b667d 100644 --- a/lale/lib/aif360/bagging_orbis_classifier.py +++ b/lale/lib/aif360/bagging_orbis_classifier.py @@ -22,6 +22,7 @@ import lale.operators from lale.lib.imblearn._common_schemas import _hparam_n_jobs, _hparam_random_state +from ...helpers import with_fixed_estimator_name from .orbis import Orbis from .orbis import _hyperparams_schema as orbis_hyperparams_schema from .util import ( @@ -115,10 +116,12 @@ def _repair_dtypes(inner_X): # for some reason BaggingClassifier spoils dtypes repair_dtypes = lale.lib.sklearn.FunctionTransformer(func=_repair_dtypes) trainable_ensemble = lale.lib.sklearn.BaggingClassifier( - base_estimator=repair_dtypes >> orbis, - n_estimators=self.n_estimators, - n_jobs=self.sampler_hparams["n_jobs"], - random_state=self.sampler_hparams["random_state"], + **with_fixed_estimator_name( + estimator=repair_dtypes >> orbis, + n_estimators=self.n_estimators, + n_jobs=self.sampler_hparams["n_jobs"], + random_state=self.sampler_hparams["random_state"], + ) ) encoded_y = pd.Series(self.lab_enc.transform(y), index=y.index) self.trained_ensemble = trainable_ensemble.fit(X, encoded_y) diff --git a/lale/lib/autogen/kernel_pca.py b/lale/lib/autogen/kernel_pca.py index 6121f7057..191dca161 100644 --- a/lale/lib/autogen/kernel_pca.py +++ 
b/lale/lib/autogen/kernel_pca.py @@ -1,8 +1,9 @@ from numpy import inf, nan +from packaging import version from sklearn.decomposition import KernelPCA as Op from lale.docstrings import set_docstrings -from lale.operators import make_operator +from lale.operators import make_operator, sklearn_version class _KernelPCAImpl: @@ -239,4 +240,23 @@ def transform(self, X): } KernelPCA = make_operator(_KernelPCAImpl, _combined_schemas) +if sklearn_version >= version.Version("1.4"): + + KernelPCA = KernelPCA.customize_schema( + degree={ + "anyOf": [ + { + "type": "integer", + "minimumForOptimizer": 2, + "maximumForOptimizer": 3, + "distribution": "uniform", + }, + {"type": "number", "forOptimizer": False}, + ], + "default": 3, + "description": "Degree for poly kernels", + }, + set_as_available=True, + ) + set_docstrings(KernelPCA) diff --git a/lale/lib/autogen/kernel_ridge.py b/lale/lib/autogen/kernel_ridge.py index 9476ae925..ff85b6195 100644 --- a/lale/lib/autogen/kernel_ridge.py +++ b/lale/lib/autogen/kernel_ridge.py @@ -170,4 +170,23 @@ def predict(self, X): set_as_available=True, ) +if sklearn_version >= version.Version("1.4"): + + KernelRidge = KernelRidge.customize_schema( + degree={ + "anyOf": [ + { + "type": "integer", + "minimumForOptimizer": 0, + "maximumForOptimizer": 100, + "distribution": "uniform", + }, + {"type": "number", "forOptimizer": False}, + ], + "default": 3, + "description": "Degree of the polynomial kernel", + }, + set_as_available=True, + ) + set_docstrings(KernelRidge) diff --git a/lale/lib/autogen/lars.py b/lale/lib/autogen/lars.py index 869148197..b66da7bf1 100644 --- a/lale/lib/autogen/lars.py +++ b/lale/lib/autogen/lars.py @@ -1,8 +1,8 @@ -from numpy import inf, nan +from packaging import version from sklearn.linear_model import Lars as Op from lale.docstrings import set_docstrings -from lale.operators import make_operator +from lale.operators import make_operator, sklearn_version class _LarsImpl: @@ -197,4 +197,24 @@ def predict(self, X): } 
Lars = make_operator(_LarsImpl, _combined_schemas) +if sklearn_version >= version.Version("1.2"): + Lars = Lars.customize_schema( + normalize={ + "anyOf": [ + { + "type": "boolean", + "description": "This parameter is ignored when ``fit_intercept`` is set to False", + }, + {"enum": ["deprecated"]}, + ], + "default": "deprecated", + "description": "Deprecated", + }, + set_as_available=True, + ) + +if sklearn_version >= version.Version("1.4"): + Lars = Lars.customize_schema(normalize=None, set_as_available=True) + + set_docstrings(Lars) diff --git a/lale/lib/autogen/lars_cv.py b/lale/lib/autogen/lars_cv.py index 01225fc91..102f3d767 100644 --- a/lale/lib/autogen/lars_cv.py +++ b/lale/lib/autogen/lars_cv.py @@ -1,8 +1,8 @@ -from numpy import inf, nan +from packaging import version from sklearn.linear_model import LarsCV as Op from lale.docstrings import set_docstrings -from lale.operators import make_operator +from lale.operators import make_operator, sklearn_version class _LarsCVImpl: @@ -203,4 +203,23 @@ def predict(self, X): } LarsCV = make_operator(_LarsCVImpl, _combined_schemas) +if sklearn_version >= version.Version("1.2"): + LarsCV = LarsCV.customize_schema( + normalize={ + "anyOf": [ + { + "type": "boolean", + "description": "This parameter is ignored when ``fit_intercept`` is set to False", + }, + {"enum": ["deprecated"]}, + ], + "default": "deprecated", + "description": "Deprecated", + }, + set_as_available=True, + ) + +if sklearn_version >= version.Version("1.4"): + LarsCV = LarsCV.customize_schema(normalize=None, set_as_available=True) + set_docstrings(LarsCV) diff --git a/lale/lib/autogen/lasso_lars.py b/lale/lib/autogen/lasso_lars.py index d34e43454..87ffc24ab 100644 --- a/lale/lib/autogen/lasso_lars.py +++ b/lale/lib/autogen/lasso_lars.py @@ -1,8 +1,8 @@ -from numpy import inf, nan +from packaging import version from sklearn.linear_model import LassoLars as Op from lale.docstrings import set_docstrings -from lale.operators import make_operator +from 
lale.operators import make_operator, sklearn_version class _LassoLarsImpl: @@ -197,4 +197,23 @@ def predict(self, X): } LassoLars = make_operator(_LassoLarsImpl, _combined_schemas) +if sklearn_version >= version.Version("1.2"): + LassoLars = LassoLars.customize_schema( + normalize={ + "anyOf": [ + { + "type": "boolean", + "description": "This parameter is ignored when ``fit_intercept`` is set to False", + }, + {"enum": ["deprecated"]}, + ], + "default": "deprecated", + "description": "Deprecated", + }, + set_as_available=True, + ) + +if sklearn_version >= version.Version("1.4"): + LassoLars = LassoLars.customize_schema(normalize=None, set_as_available=True) + set_docstrings(LassoLars) diff --git a/lale/lib/autogen/lasso_lars_cv.py b/lale/lib/autogen/lasso_lars_cv.py index bc6c76118..fb9217497 100644 --- a/lale/lib/autogen/lasso_lars_cv.py +++ b/lale/lib/autogen/lasso_lars_cv.py @@ -1,8 +1,8 @@ -from numpy import inf, nan +from packaging import version from sklearn.linear_model import LassoLarsCV as Op from lale.docstrings import set_docstrings -from lale.operators import make_operator +from lale.operators import make_operator, sklearn_version class _LassoLarsCVImpl: @@ -203,4 +203,24 @@ def predict(self, X): } LassoLarsCV = make_operator(_LassoLarsCVImpl, _combined_schemas) + +if sklearn_version >= version.Version("1.2"): + LassoLarsCV = LassoLarsCV.customize_schema( + normalize={ + "anyOf": [ + { + "type": "boolean", + "description": "This parameter is ignored when ``fit_intercept`` is set to False", + }, + {"enum": ["deprecated"]}, + ], + "default": "deprecated", + "description": "Deprecated", + }, + set_as_available=True, + ) + +if sklearn_version >= version.Version("1.4"): + LassoLarsCV = LassoLarsCV.customize_schema(normalize=None, set_as_available=True) + set_docstrings(LassoLarsCV) diff --git a/lale/lib/autogen/lasso_lars_ic.py b/lale/lib/autogen/lasso_lars_ic.py index 2f374f1c1..66ae2bc72 100644 --- a/lale/lib/autogen/lasso_lars_ic.py +++ 
b/lale/lib/autogen/lasso_lars_ic.py @@ -1,8 +1,8 @@ -from numpy import inf, nan +from packaging import version from sklearn.linear_model import LassoLarsIC as Op from lale.docstrings import set_docstrings -from lale.operators import make_operator +from lale.operators import make_operator, sklearn_version class _LassoLarsICImpl: @@ -201,4 +201,23 @@ def predict(self, X): } LassoLarsIC = make_operator(_LassoLarsICImpl, _combined_schemas) +if sklearn_version >= version.Version("1.2"): + LassoLarsIC = LassoLarsIC.customize_schema( + normalize={ + "anyOf": [ + { + "type": "boolean", + "description": "This parameter is ignored when ``fit_intercept`` is set to False", + }, + {"enum": ["deprecated"]}, + ], + "default": "deprecated", + "description": "Deprecated", + }, + set_as_available=True, + ) + +if sklearn_version >= version.Version("1.4"): + LassoLarsIC = LassoLarsIC.customize_schema(normalize=None, set_as_available=True) + set_docstrings(LassoLarsIC) diff --git a/lale/lib/autogen/orthogonal_matching_pursuit.py b/lale/lib/autogen/orthogonal_matching_pursuit.py index 0d8889866..d58ad49eb 100644 --- a/lale/lib/autogen/orthogonal_matching_pursuit.py +++ b/lale/lib/autogen/orthogonal_matching_pursuit.py @@ -1,8 +1,8 @@ -from numpy import inf, nan +from packaging import version from sklearn.linear_model import OrthogonalMatchingPursuit as Op from lale.docstrings import set_docstrings -from lale.operators import make_operator +from lale.operators import make_operator, sklearn_version class _OrthogonalMatchingPursuitImpl: @@ -156,4 +156,25 @@ def predict(self, X): _OrthogonalMatchingPursuitImpl, _combined_schemas ) +if sklearn_version >= version.Version("1.2"): + OrthogonalMatchingPursuit = OrthogonalMatchingPursuit.customize_schema( + normalize={ + "anyOf": [ + { + "type": "boolean", + "description": "This parameter is ignored when ``fit_intercept`` is set to False", + }, + {"enum": ["deprecated"]}, + ], + "default": "deprecated", + "description": "Deprecated", + }, + 
set_as_available=True, + ) + +if sklearn_version >= version.Version("1.4"): + OrthogonalMatchingPursuit = OrthogonalMatchingPursuit.customize_schema( + normalize=None, set_as_available=True + ) + set_docstrings(OrthogonalMatchingPursuit) diff --git a/lale/lib/autogen/orthogonal_matching_pursuit_cv.py b/lale/lib/autogen/orthogonal_matching_pursuit_cv.py index 2970e7f80..eb0eaade5 100644 --- a/lale/lib/autogen/orthogonal_matching_pursuit_cv.py +++ b/lale/lib/autogen/orthogonal_matching_pursuit_cv.py @@ -1,8 +1,8 @@ -from numpy import inf, nan +from packaging import version from sklearn.linear_model import OrthogonalMatchingPursuitCV as Op from lale.docstrings import set_docstrings -from lale.operators import make_operator +from lale.operators import make_operator, sklearn_version class _OrthogonalMatchingPursuitCVImpl: @@ -175,4 +175,26 @@ def predict(self, X): _OrthogonalMatchingPursuitCVImpl, _combined_schemas ) +if sklearn_version >= version.Version("1.2"): + OrthogonalMatchingPursuitCV = OrthogonalMatchingPursuitCV.customize_schema( + normalize={ + "anyOf": [ + { + "type": "boolean", + "description": "This parameter is ignored when ``fit_intercept`` is set to False", + }, + {"enum": ["deprecated"]}, + ], + "default": "deprecated", + "description": "Deprecated", + }, + set_as_available=True, + ) + +if sklearn_version >= version.Version("1.4"): + OrthogonalMatchingPursuitCV = OrthogonalMatchingPursuitCV.customize_schema( + normalize=None, set_as_available=True + ) + + set_docstrings(OrthogonalMatchingPursuitCV) diff --git a/lale/lib/lightgbm/lgbm_classifier.py b/lale/lib/lightgbm/lgbm_classifier.py index 6adc34dc0..3bd7e85da 100644 --- a/lale/lib/lightgbm/lgbm_classifier.py +++ b/lale/lib/lightgbm/lgbm_classifier.py @@ -529,6 +529,13 @@ def score(self, X, y): set_as_available=True, ) + if lightgbm_version >= version.Version("4.0.0"): + # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier + LGBMClassifier = 
LGBMClassifier.customize_schema( + silent=None, + set_as_available=True, + ) + if lightgbm_version >= version.Version("4.0.0"): # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier LGBMClassifier = LGBMClassifier.customize_schema( diff --git a/lale/lib/sklearn/_common_schemas.py b/lale/lib/sklearn/_common_schemas.py index 73f008bf4..569346f19 100644 --- a/lale/lib/sklearn/_common_schemas.py +++ b/lale/lib/sklearn/_common_schemas.py @@ -61,3 +61,36 @@ "default": None, "description": "Weights applied to individual samples.", } + + +def schema_monotonic_cst(desc_add): + return { + "anyOf": [ + { + "type": "array", + "description": "array-like of int of shape (n_features)", + "items": {"enum": [-1, 0, 1]}, + }, + {"enum": [None], "description": "No constraints are applied."}, + ], + "default": None, + "description": "Indicates the monotonicity constraint to enforce on each feature." + + desc_add, + } + + +schema_monotonic_cst_regressor = schema_monotonic_cst( + """ +Monotonicity constraints are not supported for: +multioutput regressions (i.e. when n_outputs_ > 1), + +regressions trained on data with missing values.""" +) + +schema_monotonic_cst_classifier = schema_monotonic_cst( + """ +Monotonicity constraints are not supported for: +multiclass classifications (i.e. when n_classes > 2), multioutput classifications (i.e. 
when n_outputs_ > 1), + +classifications trained on data with missing values.""" +) diff --git a/lale/lib/sklearn/ada_boost_classifier.py b/lale/lib/sklearn/ada_boost_classifier.py index 385db3dbd..3473edb67 100644 --- a/lale/lib/sklearn/ada_boost_classifier.py +++ b/lale/lib/sklearn/ada_boost_classifier.py @@ -326,5 +326,31 @@ def score(self, X, y, sample_weight=None): set_as_available=True, ) +if lale.operators.sklearn_version >= version.Version("1.4"): + AdaBoostClassifier = AdaBoostClassifier.customize_schema( + algorithm={ + "anyOf": [ + { + "enum": ["SAMME"], + "description": "Use the SAMME discrete boosting algorithm.", + }, + {"enum": ["SAMME.R"], "description": "deprecated"}, + ], + "default": "SAMME.R", + "description": "The boosting algorithm to use", + }, + set_as_available=True, + ) + +if lale.operators.sklearn_version >= version.Version("1.6"): + AdaBoostClassifier = AdaBoostClassifier.customize_schema( + algorithm={ + "enum": ["SAMME"], + "default": "SAMME", + "description": "Use the SAMME discrete boosting algorithm.", + }, + set_as_available=True, + ) + lale.docstrings.set_docstrings(AdaBoostClassifier) diff --git a/lale/lib/sklearn/decision_tree_classifier.py b/lale/lib/sklearn/decision_tree_classifier.py index 035625ea6..427912e9f 100644 --- a/lale/lib/sklearn/decision_tree_classifier.py +++ b/lale/lib/sklearn/decision_tree_classifier.py @@ -18,6 +18,7 @@ import lale.docstrings import lale.operators +from lale.lib.sklearn._common_schemas import schema_monotonic_cst_classifier _hyperparams_schema = { "description": "A decision tree classifier.", @@ -390,5 +391,9 @@ set_as_available=True, ) +if lale.operators.sklearn_version >= version.Version("1.4"): + DecisionTreeClassifier = DecisionTreeClassifier.customize_schema( + monotonic_cst=schema_monotonic_cst_classifier, set_as_available=True + ) lale.docstrings.set_docstrings(DecisionTreeClassifier) diff --git a/lale/lib/sklearn/decision_tree_regressor.py b/lale/lib/sklearn/decision_tree_regressor.py index 
a4a6e745a..99f024c64 100644 --- a/lale/lib/sklearn/decision_tree_regressor.py +++ b/lale/lib/sklearn/decision_tree_regressor.py @@ -18,6 +18,7 @@ import lale.docstrings import lale.operators +from lale.lib.sklearn._common_schemas import schema_monotonic_cst_regressor _hyperparams_schema = { "description": "A decision tree regressor.", @@ -380,4 +381,9 @@ set_as_available=True, ) +if lale.operators.sklearn_version >= version.Version("1.4"): + DecisionTreeRegressor = DecisionTreeRegressor.customize_schema( + monotonic_cst=schema_monotonic_cst_regressor, set_as_available=True + ) + lale.docstrings.set_docstrings(DecisionTreeRegressor) diff --git a/lale/lib/sklearn/extra_trees_classifier.py b/lale/lib/sklearn/extra_trees_classifier.py index b97853b6e..f8cb09332 100644 --- a/lale/lib/sklearn/extra_trees_classifier.py +++ b/lale/lib/sklearn/extra_trees_classifier.py @@ -18,6 +18,7 @@ import lale.docstrings import lale.operators +from lale.lib.sklearn._common_schemas import schema_monotonic_cst_classifier _hyperparams_schema = { "description": "An extra-trees classifier.", @@ -456,4 +457,9 @@ set_as_available=True, ) +if lale.operators.sklearn_version >= version.Version("1.4"): + ExtraTreesClassifier = ExtraTreesClassifier.customize_schema( + monotonic_cst=schema_monotonic_cst_classifier, set_as_available=True + ) + lale.docstrings.set_docstrings(ExtraTreesClassifier) diff --git a/lale/lib/sklearn/extra_trees_regressor.py b/lale/lib/sklearn/extra_trees_regressor.py index 9baccf026..1f1289926 100644 --- a/lale/lib/sklearn/extra_trees_regressor.py +++ b/lale/lib/sklearn/extra_trees_regressor.py @@ -18,6 +18,7 @@ import lale.docstrings import lale.operators +from lale.lib.sklearn._common_schemas import schema_monotonic_cst_regressor _hyperparams_schema = { "description": "An extra-trees regressor.", @@ -423,4 +424,9 @@ set_as_available=True, ) +if lale.operators.sklearn_version >= version.Version("1.4"): + ExtraTreesRegressor = ExtraTreesRegressor.customize_schema( + 
monotonic_cst=schema_monotonic_cst_regressor, set_as_available=True + ) + lale.docstrings.set_docstrings(ExtraTreesRegressor) diff --git a/lale/lib/sklearn/feature_agglomeration.py b/lale/lib/sklearn/feature_agglomeration.py index fc589e6c1..ab5f96c42 100644 --- a/lale/lib/sklearn/feature_agglomeration.py +++ b/lale/lib/sklearn/feature_agglomeration.py @@ -299,4 +299,51 @@ set_as_available=True, ) +if lale.operators.sklearn_version >= version.Version("1.4"): + # new: https://scikit-learn.org/1.4/modules/generated/sklearn.cluster.FeatureAgglomeration.html + FeatureAgglomeration = FeatureAgglomeration.customize_schema( + metric={ + "anyOf": [ + { + "enum": [ + "euclidean", + "l1", + "l2", + "manhattan", + "cosine", + "precomputed", + ] + }, + {"forOptimizer": False, "enum": [None], "description": "deprecated"}, + {"forOptimizer": False, "laleType": "callable"}, + ], + "default": "euclidean", + "description": "Metric used to compute the linkage. The default is `euclidean`", + }, + set_as_available=True, + ) + +if lale.operators.sklearn_version >= version.Version("1.6"): + # new: https://scikit-learn.org/1.6/modules/generated/sklearn.cluster.FeatureAgglomeration.html + FeatureAgglomeration = FeatureAgglomeration.customize_schema( + metric={ + "anyOf": [ + { + "enum": [ + "euclidean", + "l1", + "l2", + "manhattan", + "cosine", + "precomputed", + ] + }, + {"forOptimizer": False, "laleType": "callable"}, + ], + "default": "euclidean", + "description": "Metric used to compute the linkage. The default is `euclidean`", + }, + set_as_available=True, + ) + lale.docstrings.set_docstrings(FeatureAgglomeration) diff --git a/lale/lib/sklearn/k_neighbors_regressor.py b/lale/lib/sklearn/k_neighbors_regressor.py index e3465fea5..81c3d918b 100644 --- a/lale/lib/sklearn/k_neighbors_regressor.py +++ b/lale/lib/sklearn/k_neighbors_regressor.py @@ -13,6 +13,7 @@ # limitations under the License. 
import sklearn.neighbors +from packaging import version import lale.docstrings import lale.operators @@ -190,4 +191,40 @@ sklearn.neighbors.KNeighborsRegressor, _combined_schemas ) +if lale.operators.sklearn_version >= version.Version("1.4"): + + KNeighborsRegressor = KNeighborsRegressor.customize_schema( + metric={ + "anyOf": [ + { + "enum": [ + "cityblock", + "cosine", + "euclidean", + "haversine", + "l1", + "l2", + "manhattan", + "nan_euclidean", + "precomputed", + ], + }, + { + "laleType": "callable", + "description": "It takes two arrays representing 1D vectors as inputs and must return one value indicating the distance between those vectors. This works for Scipy’s metrics, but is less efficient than passing the metric name as a string.", + "forOptimizer": False, + }, + { + "laleType": "Any", + "description": "It will be passed directly to the underlying computation routines.", + "forOptimizer": False, + }, + ], + "description": "The distance metric to use for the tree.", + "default": "minkowski", + }, + set_as_available=True, + ) + + lale.docstrings.set_docstrings(KNeighborsRegressor) diff --git a/lale/lib/sklearn/nmf.py b/lale/lib/sklearn/nmf.py index a9768cb22..a5b2bcfba 100644 --- a/lale/lib/sklearn/nmf.py +++ b/lale/lib/sklearn/nmf.py @@ -285,4 +285,33 @@ NMF = NMF.customize_schema(alpha=None, regularization=None, set_as_available=True) +if lale.operators.sklearn_version >= version.Version("1.4"): + # new: https://scikit-learn.org/1.4/modules/generated/sklearn.decomposition.NMF.html + + NMF = NMF.customize_schema( + n_components={ + "anyOf": [ + { + "type": "integer", + "minimum": 1, + "laleMaximum": "X/items/maxItems", # number of columns + "minimumForOptimizer": 2, + "maximumForOptimizer": 256, + "distribution": "uniform", + }, + { + "description": "The number of components is automatically inferred from W or H shapes.", + "enum": ["auto"], + }, + { + "description": "If not set, keep all components.", + "enum": [None], + }, + ], + "default": None, + 
"description": "Number of components.", + }, + set_as_available=True, + ) + lale.docstrings.set_docstrings(NMF) diff --git a/lale/lib/sklearn/random_forest_classifier.py b/lale/lib/sklearn/random_forest_classifier.py index aef73fc44..dafe7206d 100644 --- a/lale/lib/sklearn/random_forest_classifier.py +++ b/lale/lib/sklearn/random_forest_classifier.py @@ -19,6 +19,8 @@ import lale.docstrings import lale.operators +from ._common_schemas import schema_monotonic_cst_classifier + _hyperparams_schema = { "description": "A random forest classifier.", "allOf": [ @@ -484,4 +486,10 @@ set_as_available=True, ) +if lale.operators.sklearn_version >= version.Version("1.4"): + RandomForestClassifier = RandomForestClassifier.customize_schema( + monotonic_cst=schema_monotonic_cst_classifier, set_as_available=True + ) + + lale.docstrings.set_docstrings(RandomForestClassifier) diff --git a/lale/lib/sklearn/random_forest_regressor.py b/lale/lib/sklearn/random_forest_regressor.py index 83eb6cd8e..7de85c388 100644 --- a/lale/lib/sklearn/random_forest_regressor.py +++ b/lale/lib/sklearn/random_forest_regressor.py @@ -18,6 +18,7 @@ import lale.docstrings import lale.operators +from lale.lib.sklearn._common_schemas import schema_monotonic_cst_regressor _hyperparams_schema = { "description": "A random forest regressor.", @@ -472,4 +473,10 @@ set_as_available=True, ) +if lale.operators.sklearn_version >= version.Version("1.4"): + RandomForestRegressor = RandomForestRegressor.customize_schema( + monotonic_cst=schema_monotonic_cst_regressor, set_as_available=True + ) + + lale.docstrings.set_docstrings(RandomForestRegressor) diff --git a/lale/lib/sklearn/simple_imputer.py b/lale/lib/sklearn/simple_imputer.py index fccedbc74..52e2440e4 100644 --- a/lale/lib/sklearn/simple_imputer.py +++ b/lale/lib/sklearn/simple_imputer.py @@ -116,17 +116,7 @@ def transform_schema(self, s_X): "description": "If True, a MissingIndicator transform will stack onto output of the imputer’s transform.", }, }, - }, - { 
- "description": "Imputation not possible when missing_values == 0 and input is sparse. Provide a dense array instead.", - "anyOf": [ - {"type": "object", "laleNot": "X/isSparse"}, - { - "type": "object", - "properties": {"missing_values": {"not": {"enum": [0]}}}, - }, - ], - }, + } ], } @@ -189,9 +179,24 @@ def transform_schema(self, s_X): }, } - SimpleImputer = lale.operators.make_operator(_SimpleImputerImpl, _combined_schemas) +if lale.operators.sklearn_version < version.Version("1.4"): + # this constraint is removed in scikit-learn version 1.4 + SimpleImputer = SimpleImputer.customize_schema( + constraint={ + "description": "Imputation not possible when missing_values == 0 and input is sparse. Provide a dense array instead.", + "anyOf": [ + {"type": "object", "laleNot": "X/isSparse"}, + { + "type": "object", + "properties": {"missing_values": {"not": {"enum": [0]}}}, + }, + ], + }, + set_as_available=True, + ) + if lale.operators.sklearn_version >= version.Version("1.1"): # old: https://scikit-learn.org/1.0/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer # new: https://scikit-learn.org/1.1/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer diff --git a/setup.py b/setup.py index 646bc89d3..491a07b5b 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ "hyperopt>=0.2,<=0.2.5", "jsonschema<=4.20.0", "jsonsubschema>=0.0.6", - "scikit-learn>=1.0.0,<1.4.0", + "scikit-learn>=1.0.0,<1.5.0", "scipy", "pandas", "packaging", diff --git a/test/test_aif360_ensembles.py b/test/test_aif360_ensembles.py index 8f29be4aa..f1867f61d 100644 --- a/test/test_aif360_ensembles.py +++ b/test/test_aif360_ensembles.py @@ -22,6 +22,7 @@ except ImportError: tensorflow_installed = False +from lale.helpers import with_fixed_estimator_name from lale.lib.aif360 import ( CalibratedEqOddsPostprocessing, DisparateImpactRemover, @@ -62,75 +63,95 @@ def _attempt_fit_predict(cls, model): def 
test_bagging_pre_estimator_mitigation_ensemble(self): model = DisparateImpactRemover(**self.fairness_info) >> BaggingClassifier( - base_estimator=DecisionTreeClassifier() + **with_fixed_estimator_name(estimator=DecisionTreeClassifier()) ) self._attempt_fit_predict(model) def test_bagging_post_estimator_mitigation_ensemble(self): model = CalibratedEqOddsPostprocessing( **self.fairness_info, - estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier()) + estimator=BaggingClassifier( + **with_fixed_estimator_name(estimator=DecisionTreeClassifier()) + ) ) self._attempt_fit_predict(model) def test_bagging_pre_estimator_mitigation_base(self): model = BaggingClassifier( - base_estimator=DisparateImpactRemover(**self.fairness_info) - >> DecisionTreeClassifier() + **with_fixed_estimator_name( + estimator=DisparateImpactRemover(**self.fairness_info) + >> DecisionTreeClassifier() + ) ) self._attempt_fit_predict(model) def test_bagging_in_estimator_mitigation_base(self): - model = BaggingClassifier(base_estimator=PrejudiceRemover(**self.fairness_info)) + model = BaggingClassifier( + **with_fixed_estimator_name( + estimator=PrejudiceRemover(**self.fairness_info) + ) + ) self._attempt_fit_predict(model) def test_bagging_in_estimator_mitigation_base_1(self): if tensorflow_installed: tf.compat.v1.disable_eager_execution() model = BaggingClassifier( - base_estimator=AdversarialDebiasing(**self.fairness_info), - n_estimators=2, + **with_fixed_estimator_name( + estimator=AdversarialDebiasing(**self.fairness_info), + n_estimators=2, + ) ) self._attempt_fit_predict(model) def test_bagging_post_estimator_mitigation_base(self): model = BaggingClassifier( - base_estimator=CalibratedEqOddsPostprocessing( - **self.fairness_info, estimator=DecisionTreeClassifier() + **with_fixed_estimator_name( + estimator=CalibratedEqOddsPostprocessing( + **self.fairness_info, estimator=DecisionTreeClassifier() + ) ) ) self._attempt_fit_predict(model) def 
test_adaboost_pre_estimator_mitigation_ensemble(self): model = DisparateImpactRemover(**self.fairness_info) >> AdaBoostClassifier( - base_estimator=DecisionTreeClassifier() + **with_fixed_estimator_name(estimator=DecisionTreeClassifier()) ) self._attempt_fit_predict(model) def test_adaboost_post_estimator_mitigation_ensemble(self): model = CalibratedEqOddsPostprocessing( **self.fairness_info, - estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier()) + estimator=AdaBoostClassifier( + **with_fixed_estimator_name(estimator=DecisionTreeClassifier()) + ) ) self._attempt_fit_predict(model) def test_adaboost_pre_estimator_mitigation_base(self): model = AdaBoostClassifier( - base_estimator=DisparateImpactRemover(**self.fairness_info) - >> DecisionTreeClassifier() + **with_fixed_estimator_name( + estimator=DisparateImpactRemover(**self.fairness_info) + >> DecisionTreeClassifier() + ) ) self._attempt_fit_predict(model) def test_adaboost_in_estimator_mitigation_base(self): model = AdaBoostClassifier( - base_estimator=PrejudiceRemover(**self.fairness_info) + **with_fixed_estimator_name( + estimator=PrejudiceRemover(**self.fairness_info) + ) ) self._attempt_fit_predict(model) def test_adaboost_post_estimator_mitigation_base(self): model = AdaBoostClassifier( - base_estimator=CalibratedEqOddsPostprocessing( - **self.fairness_info, estimator=DecisionTreeClassifier() + **with_fixed_estimator_name( + estimator=CalibratedEqOddsPostprocessing( + **self.fairness_info, estimator=DecisionTreeClassifier() + ) ) ) self._attempt_fit_predict(model) diff --git a/test/test_core_classifiers.py b/test/test_core_classifiers.py index e819f1b4f..ce9f0e643 100644 --- a/test/test_core_classifiers.py +++ b/test/test_core_classifiers.py @@ -21,6 +21,7 @@ import lale.lib.lale import lale.type_checking +from lale.helpers import with_fixed_estimator_name from lale.lib.lale import NoOp from lale.lib.sklearn import ( PCA, @@ -281,14 +282,18 @@ def setUp(self): def 
test_with_lale_classifiers(self): from lale.lib.sklearn import BaggingClassifier - clf = BaggingClassifier(base_estimator=LogisticRegression()) + clf = BaggingClassifier( + **with_fixed_estimator_name(estimator=LogisticRegression()) + ) trained = clf.fit(self.X_train, self.y_train) trained.predict(self.X_test) def test_with_lale_pipeline(self): from lale.lib.sklearn import BaggingClassifier - clf = BaggingClassifier(base_estimator=PCA() >> LogisticRegression()) + clf = BaggingClassifier( + **with_fixed_estimator_name(estimator=PCA() >> LogisticRegression()) + ) trained = clf.fit(self.X_train, self.y_train) trained.predict(self.X_test) @@ -296,7 +301,9 @@ def test_with_hyperopt(self): from lale.lib.lale import Hyperopt from lale.lib.sklearn import BaggingClassifier - clf = BaggingClassifier(base_estimator=LogisticRegression()) + clf = BaggingClassifier( + **with_fixed_estimator_name(estimator=LogisticRegression()) + ) trained = clf.auto_configure(self.X_train, self.y_train, Hyperopt, max_evals=1) print(trained.to_json()) @@ -304,7 +311,9 @@ def test_pipeline_with_hyperopt(self): from lale.lib.lale import Hyperopt from lale.lib.sklearn import BaggingClassifier - clf = BaggingClassifier(base_estimator=PCA() >> LogisticRegression()) + clf = BaggingClassifier( + **with_fixed_estimator_name(estimator=PCA() >> LogisticRegression()) + ) _ = clf.auto_configure(self.X_train, self.y_train, Hyperopt, max_evals=1) def test_pipeline_choice_with_hyperopt(self): @@ -312,14 +321,18 @@ def test_pipeline_choice_with_hyperopt(self): from lale.lib.sklearn import BaggingClassifier clf = BaggingClassifier( - base_estimator=PCA() >> (LogisticRegression() | KNeighborsClassifier()) + **with_fixed_estimator_name( + estimator=PCA() >> (LogisticRegression() | KNeighborsClassifier()) + ) ) _ = clf.auto_configure(self.X_train, self.y_train, Hyperopt, max_evals=1) def test_predict_log_proba(self): from lale.lib.sklearn import BaggingClassifier - clf = BaggingClassifier(base_estimator=PCA() >> 
LogisticRegression()) + clf = BaggingClassifier( + **with_fixed_estimator_name(estimator=PCA() >> LogisticRegression()) + ) trained = clf.fit(self.X_train, self.y_train) trained.predict_log_proba(self.X_test) @@ -334,7 +347,9 @@ def test_predict_log_proba_trained_trainable(self): def test_predict_log_proba_trainable(self): from lale.lib.sklearn import BaggingClassifier - clf = BaggingClassifier(base_estimator=PCA() >> LogisticRegression()) + clf = BaggingClassifier( + **with_fixed_estimator_name(estimator=PCA() >> LogisticRegression()) + ) with self.assertRaises(ValueError): clf.predict_log_proba(self.X_test) diff --git a/test/test_core_misc.py b/test/test_core_misc.py index 39da8153c..3928cd5be 100644 --- a/test/test_core_misc.py +++ b/test/test_core_misc.py @@ -32,7 +32,11 @@ import lale.type_checking # from lale.helpers import get_sklearn_estimator_name -from lale.helpers import nest_HPparams +from lale.helpers import ( + get_sklearn_estimator_name, + nest_HPparams, + with_fixed_estimator_name, +) from lale.lib.lale import ConcatFeatures, Hyperopt, NoOp from lale.lib.rasl import categorical from lale.lib.sklearn import ( @@ -626,7 +630,9 @@ def test_shallow0_trainable_pipeline_configured(self): def test_shallow_planned_nested_indiv_operator(self): from lale.lib.sklearn import BaggingClassifier, DecisionTreeClassifier - clf = BaggingClassifier(base_estimator=DecisionTreeClassifier()) + clf = BaggingClassifier( + **with_fixed_estimator_name(estimator=DecisionTreeClassifier()) + ) params = clf.get_params(deep=False) filtered_params = self.remove_lale_params(params) assert filtered_params["bootstrap"] @@ -670,10 +676,10 @@ def test_deep_planned_choice(self): def test_deep_planned_nested_indiv_operator(self): from lale.lib.sklearn import BaggingClassifier, DecisionTreeClassifier - est_name = "base_estimator" + est_name = get_sklearn_estimator_name() dtc = DecisionTreeClassifier() - clf = BaggingClassifier(base_estimator=dtc) + clf = 
BaggingClassifier(**with_fixed_estimator_name(estimator=dtc)) params = clf.get_params(deep=True) filtered_params = self.remove_lale_params(params) @@ -700,7 +706,7 @@ def test_deep_grammar(self): from lale.lib.sklearn import StandardScaler as Scaler dtc = DecisionTreeClassifier() - clf = BaggingClassifier(base_estimator=dtc) + clf = BaggingClassifier(**with_fixed_estimator_name(estimator=dtc)) params = clf.get_params(deep=True) filtered_params = self.remove_lale_params(params) diff --git a/test/test_grammar.py b/test/test_grammar.py index c84da6a61..3d73aa5f4 100644 --- a/test/test_grammar.py +++ b/test/test_grammar.py @@ -2,6 +2,7 @@ import lale.datasets from lale.grammar import Grammar +from lale.helpers import with_fixed_estimator_name from lale.lib.lale import ConcatFeatures as Concat from lale.lib.lale import Hyperopt, NoOp from lale.lib.sklearn import PCA @@ -61,7 +62,7 @@ def test_grammar_all_combinator(self): g.start = g.estimator g.estimator = g.term_est | g.transformer >> g.term_est g.term_est = g.prim_est | g.ensemble - g.ensemble = Boost(base_estimator=LR) + g.ensemble = Boost(**with_fixed_estimator_name(estimator=LR)) g.transformer = g.union_tfm | g.union_tfm >> g.transformer g.union_tfm = g.prim_tfm | g.union_body >> Concat g.union_body = g.transformer | g.transformer & g.union_body diff --git a/test/test_optimizers.py b/test/test_optimizers.py index a5bfc04da..03588da94 100644 --- a/test/test_optimizers.py +++ b/test/test_optimizers.py @@ -21,11 +21,13 @@ import jsonschema import numpy as np import pandas as pd +from packaging import version from sklearn.datasets import load_iris from sklearn.metrics import accuracy_score, make_scorer from sklearn.model_selection import train_test_split from lale import schemas +from lale.helpers import get_sklearn_estimator_name, with_fixed_estimator_name from lale.lib.lale import ( SMAC, ConcatFeatures, @@ -53,7 +55,7 @@ StandardScaler, TfidfVectorizer, ) -from lale.operators import TrainedIndividualOp, 
TrainedPipeline +from lale.operators import TrainedIndividualOp, TrainedPipeline, sklearn_version from lale.search.lale_smac import get_smac_space, lale_op_smac_tae from lale.search.op2hp import hyperopt_search_space @@ -140,10 +142,13 @@ def dont_test_smac_choice(self): # Import SMAC-utilities tfm = PCA() | Nystroem() | NoOp() + if sklearn_version >= version.Version("1.2"): + ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False) + else: + ohe = OneHotEncoder(handle_unknown="ignore", sparse=False) + planned_pipeline1 = ( - (OneHotEncoder(handle_unknown="ignore", sparse=False) | NoOp()) - >> tfm - >> (LogisticRegression() | KNeighborsClassifier()) + (ohe | NoOp()) >> tfm >> (LogisticRegression() | KNeighborsClassifier()) ) cs: ConfigurationSpace = get_smac_space(planned_pipeline1, lale_num_grids=1) @@ -515,9 +520,12 @@ def test_preprocessing_union(self): ) prep_num = Project(columns={"type": "number"}) >> Normalizer - prep_cat = Project(columns={"not": {"type": "number"}}) >> OneHotEncoder( - sparse=False - ) + if sklearn_version >= version.Version("1.2"): + ohe = OneHotEncoder(sparse_output=False) + else: + ohe = OneHotEncoder(sparse=False) + + prep_cat = Project(columns={"not": {"type": "number"}}) >> ohe planned = (prep_num & prep_cat) >> ConcatFeatures >> RandomForestClassifier hyperopt_classifier = Hyperopt(estimator=planned, max_evals=1) @@ -900,25 +908,35 @@ def setUp(self): def test_ada_boost(self): from lale.lib.sklearn import AdaBoostClassifier, DecisionTreeClassifier - clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier()) + clf = AdaBoostClassifier( + **with_fixed_estimator_name(estimator=DecisionTreeClassifier()) + ) trained = clf.auto_configure( self.X_train, self.y_train, optimizer=Hyperopt, max_evals=1 ) + + est_name = get_sklearn_estimator_name() + # Checking that the inner decision tree does not get the default value for min_samples_leaf, not sure if this will always pass self.assertNotEqual( - 
trained.hyperparams()["base_estimator"].hyperparams()["min_samples_leaf"], 1 + trained.hyperparams()[est_name].hyperparams()["min_samples_leaf"], 1 ) def test_ada_boost_pipe(self): from lale.lib.sklearn import AdaBoostClassifier, DecisionTreeClassifier - clf = AdaBoostClassifier(base_estimator=NoOp >> DecisionTreeClassifier()) + clf = AdaBoostClassifier( + **with_fixed_estimator_name(estimator=NoOp >> DecisionTreeClassifier()) + ) trained = clf.auto_configure( self.X_train, self.y_train, optimizer=Hyperopt, max_evals=1 ) + + est_name = get_sklearn_estimator_name() + # Checking that the inner decision tree does not get the default value for min_samples_leaf, not sure if this will always pass self.assertNotEqual( - trained.hyperparams()["base_estimator"] + trained.hyperparams()[est_name] .steps_list()[1] .hyperparams()["min_samples_leaf"], 1, @@ -929,7 +947,9 @@ def test_ada_boost1(self): from lale.lib.sklearn import AdaBoostClassifier - clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier()) + clf = AdaBoostClassifier( + **with_fixed_estimator_name(estimator=DecisionTreeClassifier()) + ) clf.fit(self.X_train, self.y_train) def test_ada_boost_regressor(self): @@ -939,13 +959,18 @@ def test_ada_boost_regressor(self): X_train, _X_test, y_train, _y_test = train_test_split(X, y) from lale.lib.sklearn import AdaBoostRegressor, DecisionTreeRegressor - reg = AdaBoostRegressor(base_estimator=DecisionTreeRegressor()) + reg = AdaBoostRegressor( + **with_fixed_estimator_name(estimator=DecisionTreeRegressor()) + ) trained = reg.auto_configure( X_train, y_train, optimizer=Hyperopt, max_evals=1, scoring="r2" ) + + est_name = get_sklearn_estimator_name() + # Checking that the inner decision tree does not get the default value for min_samples_leaf, not sure if this will always pass self.assertNotEqual( - trained.hyperparams()["base_estimator"].hyperparams()["min_samples_leaf"], 1 + trained.hyperparams()[est_name].hyperparams()["min_samples_leaf"], 1 ) def 
test_ada_boost_regressor_pipe(self): @@ -955,13 +980,18 @@ def test_ada_boost_regressor_pipe(self): X_train, _X_test, y_train, _y_test = train_test_split(X, y) from lale.lib.sklearn import AdaBoostRegressor, DecisionTreeRegressor - reg = AdaBoostRegressor(base_estimator=NoOp >> DecisionTreeRegressor()) + reg = AdaBoostRegressor( + **with_fixed_estimator_name(estimator=NoOp >> DecisionTreeRegressor()) + ) trained = reg.auto_configure( X_train, y_train, optimizer=Hyperopt, max_evals=1, scoring="r2" ) + + est_name = get_sklearn_estimator_name() + # Checking that the inner decision tree does not get the default value for min_samples_leaf, not sure if this will always pass self.assertNotEqual( - trained.hyperparams()["base_estimator"] + trained.hyperparams()[est_name] .steps_list()[1] .hyperparams()["min_samples_leaf"], 1, diff --git a/test/test_relational_sklearn.py b/test/test_relational_sklearn.py index 52e5269e6..4280d0492 100644 --- a/test/test_relational_sklearn.py +++ b/test/test_relational_sklearn.py @@ -28,6 +28,7 @@ import sklearn import sklearn.datasets from category_encoders import HashingEncoder as SkHashingEncoder +from packaging import version from sklearn.feature_selection import SelectKBest as SkSelectKBest from sklearn.impute import SimpleImputer as SkSimpleImputer from sklearn.metrics import accuracy_score as sk_accuracy_score @@ -93,7 +94,7 @@ SGDClassifier, ) from lale.lib.xgboost import XGBClassifier, XGBRegressor -from lale.operators import TrainedPipeline +from lale.operators import TrainedPipeline, sklearn_version assert sklearn.__version__ >= "1.0", sklearn.__version__ @@ -610,7 +611,12 @@ def test_transform(self): cat_columns = categorical()(train_X_pd) prefix = Map(columns={c: it[c] for c in cat_columns}) rasl_trainable = prefix >> RaslOneHotEncoder(sparse=False) - sk_trainable = prefix >> SkOneHotEncoder(sparse=False) + if sklearn_version >= version.Version("1.2"): + sk_ohe = SkOneHotEncoder(sparse_output=False) + else: + sk_ohe = 
SkOneHotEncoder(sparse=False) + + sk_trainable = prefix >> sk_ohe sk_trained = sk_trainable.fit(train_X_pd) sk_transformed = sk_trained.transform(test_X_pd) for tgt, dataset in self.tgt2creditg.items(): @@ -636,7 +642,12 @@ def test_predict(self): prefix = Map(columns={c: it[c] for c in cat_columns}) to_pd = Convert(astype="pandas") lr = LogisticRegression() - sk_trainable = prefix >> SkOneHotEncoder(sparse=False) >> lr + if sklearn_version >= version.Version("1.2"): + sk_ohe = SkOneHotEncoder(sparse_output=False) + else: + sk_ohe = SkOneHotEncoder(sparse=False) + + sk_trainable = prefix >> sk_ohe >> lr sk_trained = sk_trainable.fit(train_X_pd, train_y_pd) sk_predicted = sk_trained.predict(test_X_pd) rasl_trainable = prefix >> RaslOneHotEncoder(sparse=False) >> to_pd >> lr diff --git a/test/test_replace.py b/test/test_replace.py index ef42650db..4abab1d34 100644 --- a/test/test_replace.py +++ b/test/test_replace.py @@ -14,6 +14,7 @@ import unittest +from lale.helpers import with_fixed_estimator_name from lale.lib.lale import NoOp from lale.lib.sklearn import ( PCA, @@ -127,26 +128,34 @@ def test_nested_pipeline(self): def test_hyperparam_estimator(self): lr = LogisticRegression() linear_reg = LinearRegression() - ada = AdaBoostRegressor(base_estimator=lr) + ada = AdaBoostRegressor(**with_fixed_estimator_name(estimator=lr)) replaced_ada = ada.replace(lr, linear_reg) - expected_ada = AdaBoostRegressor(base_estimator=linear_reg) + expected_ada = AdaBoostRegressor( + **with_fixed_estimator_name(estimator=linear_reg) + ) self.assertEqual(replaced_ada.to_json(), expected_ada.to_json()) replaced_ada = ada.replace(LogisticRegression, linear_reg) - expected_ada = AdaBoostRegressor(base_estimator=linear_reg) + expected_ada = AdaBoostRegressor( + **with_fixed_estimator_name(estimator=linear_reg) + ) self.assertEqual(replaced_ada.to_json(), expected_ada.to_json()) ada_pipeline = PCA >> SimpleImputer >> ada replaced_pipeline = ada_pipeline.replace(lr, linear_reg) 
expected_pipeline = ( - PCA >> SimpleImputer >> AdaBoostRegressor(base_estimator=linear_reg) + PCA + >> SimpleImputer + >> AdaBoostRegressor(**with_fixed_estimator_name(estimator=linear_reg)) ) self.assertEqual(replaced_pipeline.to_json(), expected_pipeline.to_json()) ada_choice = PCA | ada replaced_choice = ada_choice.replace(lr, linear_reg) - expected_choice = PCA | AdaBoostRegressor(base_estimator=linear_reg) + expected_choice = PCA | AdaBoostRegressor( + **with_fixed_estimator_name(estimator=linear_reg) + ) self.assertEqual(replaced_choice.to_json(), expected_choice.to_json()) rfe = RFE(estimator=lr) diff --git a/test/test_type_checking.py b/test/test_type_checking.py index c01ff0f7f..9c780adac 100644 --- a/test/test_type_checking.py +++ b/test/test_type_checking.py @@ -16,8 +16,10 @@ from test import EnableSchemaValidation import jsonschema +from packaging import version import lale.lib.lale +import lale.operators from lale.lib.lale import ConcatFeatures, IdentityWrapper, NoOp from lale.lib.sklearn import NMF, PCA, LogisticRegression, TfidfVectorizer from lale.settings import ( @@ -932,9 +934,7 @@ def test_function_transformer(self): y = self.y trainable = sklearn.preprocessing.FunctionTransformer(**bad_hyperparams) - with self.assertRaisesRegex( - TypeError, "A sparse matrix was passed, but dense data is required." 
- ): + with self.assertRaisesRegex(TypeError, r"[sS]parse.* was passed.* dense data"): trainable.fit(bad_X, self.y) trainable = FunctionTransformer(**bad_hyperparams) @@ -1025,9 +1025,7 @@ def test_logistic_regression_1(self): bad_hyperparams = {"solver": "liblinear", "penalty": "none"} trainable = sklearn.linear_model.LogisticRegression(**bad_hyperparams) - with self.assertRaisesRegex( - ValueError, "penalty='none' is not supported for the liblinear solver" - ): + with self.assertRaisesRegex(ValueError, r"penalty"): trainable.fit(self.X, self.y) with EnableSchemaValidation(): @@ -1095,7 +1093,7 @@ def test_one_hot_encoder(self): bad_hyperparams = {"drop": "first", "handle_unknown": "ignore"} trainable = sklearn.preprocessing.OneHotEncoder(**bad_hyperparams) - if sklearn.__version__ < "1.0": + if lale.operators.sklearn_version < version.Version("1.0"): with self.assertRaisesRegex( ValueError, "`handle_unknown` must be 'error' when the drop parameter is specified", @@ -1103,7 +1101,7 @@ def test_one_hot_encoder(self): trainable.fit(self.X, self.y) with EnableSchemaValidation(): - if sklearn.__version__ < "1.0": + if lale.operators.sklearn_version < version.Version("1.0"): with self.assertRaises(jsonschema.ValidationError): OneHotEncoder(**bad_hyperparams) else: @@ -1114,7 +1112,7 @@ def test_ordinal_encoder_1(self): from lale.lib.sklearn import OrdinalEncoder - if sklearn.__version__ >= "0.24.1": + if lale.operators.sklearn_version >= version.Version("0.24.1"): bad_hyperparams = { "handle_unknown": "use_encoded_value", "unknown_value": None, @@ -1135,7 +1133,7 @@ def test_ordinal_encoder_2(self): from lale.lib.sklearn import OrdinalEncoder - if sklearn.__version__ >= "0.24.1": + if lale.operators.sklearn_version >= version.Version("0.24.1"): bad_hyperparams = {"handle_unknown": "error", "unknown_value": 1} trainable = sklearn.preprocessing.OrdinalEncoder(**bad_hyperparams) with self.assertRaisesRegex( @@ -1242,6 +1240,10 @@ def test_robust_scaler(self): with 
self.assertRaises(jsonschema.ValidationError): trainable.fit(bad_X, y) + @unittest.skipIf( + lale.operators.sklearn_version >= version.Version("1.4"), + "restrictions have been removed", + ) def test_simple_imputer(self): import sklearn @@ -1252,6 +1254,7 @@ def test_simple_imputer(self): bad_hyperparams = {"missing_values": 0} trainable = sklearn.impute.SimpleImputer(**bad_hyperparams) + with self.assertRaisesRegex( ValueError, "Imputation not possible when missing_values == 0 and input is sparse.",