diff --git a/docs/requirements.txt b/docs/requirements.txt
index 10792e441..ff01d7e45 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -5,7 +5,7 @@ graphviz
 hyperopt
 jsonschema
 jsonsubschema
-scikit-learn>=1.0.0,<=1.2.0
+scikit-learn>=1.0.0,<1.4
 scipy
 pandas
 decorator
diff --git a/lale/lib/autogen/fast_ica.py b/lale/lib/autogen/fast_ica.py
index 1f7336dcc..ff89801af 100644
--- a/lale/lib/autogen/fast_ica.py
+++ b/lale/lib/autogen/fast_ica.py
@@ -1,8 +1,8 @@
-from numpy import inf, nan
+from packaging import version
 from sklearn.decomposition import FastICA as Op
 
 from lale.docstrings import set_docstrings
-from lale.operators import make_operator
+from lale.operators import make_operator, sklearn_version
 
 
 class _FastICAImpl:
@@ -173,4 +173,76 @@ def transform(self, X):
 }
 FastICA = make_operator(_FastICAImpl, _combined_schemas)
 
+# scikit-learn 1.1 added the string whitening strategies and deprecated whiten=True.
+if sklearn_version >= version.Version("1.1"):
+    FastICA = FastICA.customize_schema(
+        whiten={
+            "anyOf": [
+                {
+                    "enum": [False],
+                    "description": "The data is already considered to be whitened, and no whitening is performed.",
+                },
+                {
+                    "enum": ["arbitrary-variance"],
+                    "description": "(default) A whitening with variance arbitrary is used",
+                },
+                {
+                    "enum": ["unit-variance"],
+                    "description": "The whitening matrix is rescaled to ensure that each recovered source has unit variance.",
+                },
+                {
+                    "enum": [True, "warn"],
+                    "description": "deprecated. Use 'arbitrary-variance' instead",
+                },
+            ],
+            "description": "Specify the whitening strategy to use.",
+            "default": "warn",
+        },
+        set_as_available=True,
+    )
+
+# The whiten_solver hyperparameter only exists from scikit-learn 1.2 onwards.
+if sklearn_version >= version.Version("1.2"):
+    FastICA = FastICA.customize_schema(
+        whiten_solver={
+            "anyOf": [
+                {
+                    "enum": ["eigh"],
+                    "description": "Generally more memory efficient when n_samples >= n_features, and can be faster when n_samples >= 50 * n_features.",
+                },
+                {
+                    "enum": ["svd"],
+                    "description": "More stable numerically if the problem is degenerate, and often faster when n_samples <= n_features.",
+                },
+            ],
+            "description": "The solver to use for whitening.",
+            "default": "svd",
+        },
+        set_as_available=True,
+    )
+
+# scikit-learn 1.3 dropped the deprecated values and made "unit-variance" the default.
+if sklearn_version >= version.Version("1.3"):
+    FastICA = FastICA.customize_schema(
+        whiten={
+            "anyOf": [
+                {
+                    "enum": [False],
+                    "description": "The data is already considered to be whitened, and no whitening is performed.",
+                },
+                {
+                    "enum": ["arbitrary-variance"],
+                    "description": "A whitening with variance arbitrary is used",
+                },
+                {
+                    "enum": ["unit-variance"],
+                    "description": "The whitening matrix is rescaled to ensure that each recovered source has unit variance.",
+                },
+            ],
+            "description": "Specify the whitening strategy to use.",
+            "default": "unit-variance",
+        },
+        set_as_available=True,
+    )
+
 set_docstrings(FastICA)
diff --git a/lale/lib/autogen/mini_batch_sparse_pca.py b/lale/lib/autogen/mini_batch_sparse_pca.py
index bb680ee05..ddcad85ab 100644
--- a/lale/lib/autogen/mini_batch_sparse_pca.py
+++ b/lale/lib/autogen/mini_batch_sparse_pca.py
@@ -1,6 +1,7 @@
 from numpy import inf, nan
 from packaging import version
 from sklearn.decomposition import MiniBatchSparsePCA as Op
+from sklearn.utils.metaestimators import available_if
 
 from lale.docstrings import set_docstrings
 from lale.operators import make_operator, sklearn_version
@@ -21,6 +22,11 @@ def fit(self, X, y=None):
     def transform(self, X):
         return self._wrapped_model.transform(X)
 
+    # Exposed only when the wrapped estimator itself has inverse_transform.
+    @available_if(lambda self: (hasattr(self._wrapped_model, "inverse_transform")))
+    def inverse_transform(self, X):
+        return self._wrapped_model.inverse_transform(X)
+
 
 _hyperparams_schema = {
     "$schema": "http://json-schema.org/draft-04/schema#",
diff --git a/lale/lib/autogen/sparse_pca.py b/lale/lib/autogen/sparse_pca.py
index 2986a03b7..ef88e911f 100644
--- a/lale/lib/autogen/sparse_pca.py
+++ b/lale/lib/autogen/sparse_pca.py
@@ -1,5 +1,6 @@
 from numpy import inf, nan
 from sklearn.decomposition import SparsePCA as Op
+from sklearn.utils.metaestimators import available_if
 
 from lale.docstrings import set_docstrings
 from lale.operators import make_operator
@@ -20,6 +21,11 @@ def fit(self, X, y=None):
     def transform(self, X):
         return self._wrapped_model.transform(X)
 
+    # Exposed only when the wrapped estimator itself has inverse_transform.
+    @available_if(lambda self: (hasattr(self._wrapped_model, "inverse_transform")))
+    def inverse_transform(self, X):
+        return self._wrapped_model.inverse_transform(X)
+
 
 _hyperparams_schema = {
     "$schema": "http://json-schema.org/draft-04/schema#",
diff --git a/lale/lib/sklearn/__init__.py b/lale/lib/sklearn/__init__.py
index 17607aae5..43c4b00a6 100644
--- a/lale/lib/sklearn/__init__.py
+++ b/lale/lib/sklearn/__init__.py
@@ -85,6 +85,7 @@
 * lale.lib.sklearn. `SelectKBest`_
 * lale.lib.sklearn. `SimpleImputer`_
 * lale.lib.sklearn. `StandardScaler`_
+* lale.lib.sklearn. `TargetEncoder`_
 * lale.lib.sklearn. `TfidfVectorizer`_
 * lale.lib.sklearn. `VarianceThreshold`_
 
@@ -130,6 +131,7 @@
 .. _`Nystroem`: lale.lib.sklearn.nystroem.html
 .. _`OneHotEncoder`: lale.lib.sklearn.one_hot_encoder.html
 .. _`OrdinalEncoder`: lale.lib.sklearn.ordinal_encoder.html
+.. _`TargetEncoder`: lale.lib.sklearn.target_encoder.html
 .. _`PassiveAggressiveClassifier`: lale.lib.sklearn.passive_aggressive_classifier.html
 .. _`PCA`: lale.lib.sklearn.pca.html
 .. _`Perceptron`: lale.lib.sklearn.perceptron.html
@@ -231,6 +233,7 @@
 from .standard_scaler import StandardScaler as StandardScaler
 from .svc import SVC as SVC
 from .svr import SVR as SVR
+from .target_encoder import TargetEncoder as TargetEncoder
 from .tfidf_vectorizer import TfidfVectorizer as TfidfVectorizer
 from .variance_threshold import VarianceThreshold as VarianceThreshold
 from .voting_classifier import VotingClassifier as VotingClassifier
diff --git a/lale/lib/sklearn/target_encoder.py b/lale/lib/sklearn/target_encoder.py
new file mode 100644
index 000000000..49c1f6c2b
--- /dev/null
+++ b/lale/lib/sklearn/target_encoder.py
@@ -0,0 +1,226 @@
+# Copyright 2019 IBM Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sklearn.preprocessing
+from packaging import version
+
+import lale.docstrings
+import lale.operators
+
+
+class _TargetEncoderNotFoundImpl:
+    """Placeholder implementation used when scikit-learn is older than 1.3."""
+    def __init__(self, **hyperparams):
+        raise NotImplementedError(
+            "TargetEncoder is only available with scikit-learn versions >= 1.3"
+        )
+
+    def transform(self, X):
+        raise NotImplementedError(
+            "TargetEncoder is only available with scikit-learn versions >= 1.3"
+        )
+
+
+_hyperparams_schema = {
+    "description": "Hyperparameter schema for the TargetEncoder model from scikit-learn.",
+    "allOf": [
+        {
+            "type": "object",
+            "additionalProperties": False,
+            "required": ["categories", "target_type"],
+            "relevantToOptimizer": [],
+            "properties": {
+                "categories": {
+                    "anyOf": [
+                        {
+                            "description": "Determine categories automatically from training data.",
+                            "enum": ["auto"],
+                        },
+                        {
+                            "description": "The ith list element holds the categories expected in the ith column.",
+                            "type": "array",
+                            "items": {
+                                "anyOf": [
+                                    {
+                                        "type": "array",
+                                        "items": {"type": "string"},
+                                    },
+                                    {
+                                        "type": "array",
+                                        "items": {"type": "number"},
+                                        "description": "Should be sorted.",
+                                    },
+                                ]
+                            },
+                        },
+                    ],
+                    "default": "auto",
+                    "description": "Categories (unique values) per feature.",
+                },
+                "target_type": {
+                    "anyOf": [
+                        {
+                            "enum": ["auto"],
+                            "description": "Type of target is inferred with type_of_target.",
+                        },
+                        {"enum": ["continuous"], "description": "Continuous target"},
+                        {"enum": ["binary"], "description": "Binary target"},
+                    ],
+                    "description": "Type of target.",
+                    "default": "auto",
+                },
+                "smooth": {
+                    "anyOf": [
+                        {
+                            "enum": ["auto"],
+                            "description": "Set to an empirical Bayes estimate.",
+                        },
+                        {
+                            "type": "number",
+                            "minimum": 0.0,
+                            "description": "A larger smooth value will put more weight on the global target mean",
+                        },
+                    ],
+                    "description": "The amount of mixing of the target mean conditioned on the value of the category with the global target mean.",
+                    "default": "auto",
+                },
+                "cv": {
+                    "type": "integer",
+                    "minimum": 2,
+                    "description": "Determines the number of folds in the cross fitting strategy used in fit_transform. For classification targets, StratifiedKFold is used and for continuous targets, KFold is used.",
+                    "default": 5,
+                },
+                "shuffle": {
+                    "type": "boolean",
+                    "description": "Whether to shuffle the data in fit_transform before splitting into folds. Note that the samples within each split will not be shuffled.",
+                    "default": True,
+                },
+                "random_state": {
+                    "description": "When shuffle is True, random_state affects the ordering of the indices, which controls the randomness of each fold. Otherwise, this parameter has no effect. Pass an int for reproducible output across multiple function calls.",
+                    "anyOf": [
+                        {
+                            "enum": [None],
+                        },
+                        {
+                            "description": "Use the provided random state, only affecting other users of that same random state instance.",
+                            "laleType": "numpy.random.RandomState",
+                        },
+                        {"description": "Explicit seed.", "type": "integer"},
+                    ],
+                    "default": None,
+                },
+            },
+        }
+    ],
+}
+
+_input_fit_schema = {
+    "type": "object",
+    "required": ["X"],
+    "additionalProperties": False,
+    "properties": {
+        "X": {
+            "description": "Features; the outer array is over samples.",
+            "type": "array",
+            "items": {
+                "anyOf": [
+                    {"type": "array", "items": {"type": "number"}},
+                    {"type": "array", "items": {"type": "string"}},
+                ]
+            },
+        },
+        "y": {
+            "description": "The target data used to encode the categories.",
+            "type": "array",
+        },
+    },
+}
+
+_input_transform_schema = {
+    "type": "object",
+    "required": ["X"],
+    "additionalProperties": False,
+    "properties": {
+        "X": {
+            "description": "Features; the outer array is over samples.",
+            "type": "array",
+            "items": {
+                "anyOf": [
+                    {"type": "array", "items": {"type": "number"}},
+                    {"type": "array", "items": {"type": "string"}},
+                ]
+            },
+        }
+    },
+}
+
+_output_transform_schema = {
+    "description": "Transformed input; the outer array is over samples.",
+    "type": "array",
+    "items": {
+        "anyOf": [
+            {"type": "array", "items": {"type": "number"}},
+            {"type": "array", "items": {"type": "string"}},
+        ]
+    },
+}
+
+_combined_schemas = {
+    "$schema": "http://json-schema.org/draft-04/schema#",
+    "description": """`Target encoder`_ for regression and classification targets..
+
+.. _`Target encoder`: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.TargetEncoder.html
+""",
+    "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.target_encoder.html",
+    "import_from": "sklearn.preprocessing",
+    "type": "object",
+    "tags": {"pre": ["categoricals"], "op": ["transformer"], "post": []},
+    "properties": {
+        "hyperparams": _hyperparams_schema,
+        "input_fit": _input_fit_schema,
+        "input_transform": _input_transform_schema,
+        "output_transform": _output_transform_schema,
+    },
+}
+
+if lale.operators.sklearn_version >= version.Version("1.3"):
+    TargetEncoder = lale.operators.make_operator(
+        sklearn.preprocessing.TargetEncoder, _combined_schemas
+    )
+else:
+    TargetEncoder = lale.operators.make_operator(
+        _TargetEncoderNotFoundImpl, _combined_schemas
+    )
+
+
+# From scikit-learn 1.4 on, the schema additionally accepts "multiclass" targets.
+if lale.operators.sklearn_version >= version.Version("1.4"):
+    TargetEncoder = TargetEncoder.customize_schema(
+        target_type={
+            "anyOf": [
+                {
+                    "enum": ["auto"],
+                    "description": "Type of target is inferred with type_of_target.",
+                },
+                {"enum": ["continuous"], "description": "Continuous target"},
+                {"enum": ["binary"], "description": "Binary target"},
+                {"enum": ["multiclass"], "description": "Multiclass target"},
+            ],
+            "description": "Type of target.",
+            "default": "auto",
+        },
+        set_as_available=True,
+    )
+
+lale.docstrings.set_docstrings(TargetEncoder)
diff --git a/test/test_core_transformers.py b/test/test_core_transformers.py
index 4ea42b0a5..3621d9ef0 100644
--- a/test/test_core_transformers.py
+++ b/test/test_core_transformers.py
@@ -18,6 +18,7 @@
 
 import jsonschema
 import pandas as pd
+from packaging import version
 
 import lale.lib.lale
 import lale.lib.sklearn
@@ -34,8 +35,10 @@
     LogisticRegression,
     MissingIndicator,
     Nystroem,
-    TfidfVectorizer,
 )
+from lale.lib.sklearn import TargetEncoder as SkTargetEncoder
+from lale.lib.sklearn import TfidfVectorizer
+from lale.operators import sklearn_version
 
 
 class TestFeaturePreprocessing(unittest.TestCase):
@@ -302,6 +305,24 @@ def test_encode_unknown_with(self):
         _ = trained_oe._impl.inverse_transform(transformed_X)
 
 
+class TestTargetEncoder(unittest.TestCase):
+    def test_sklearn_target_encoder(self):
+        """Expect NotImplementedError before scikit-learn 1.3, a successful fit_transform otherwise."""
+        import numpy as np
+
+        X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
+        y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30
+
+        if sklearn_version < version.Version("1.3"):
+            with self.assertRaises(NotImplementedError):
+                enc_auto = SkTargetEncoder(smooth="auto")
+                _ = enc_auto.fit_transform(X, y)
+        else:
+            # example from the TargetEncoder documentation
+            enc_auto = SkTargetEncoder(smooth="auto")
+            _ = enc_auto.fit_transform(X, y)
+
+
 class TestConcatFeatures(unittest.TestCase):
     def test_hyperparam_defaults(self):
         _ = ConcatFeatures()