diff --git a/doc/spec/estimation/dml.rst b/doc/spec/estimation/dml.rst
index f0d37bc59..6612d7c74 100644
--- a/doc/spec/estimation/dml.rst
+++ b/doc/spec/estimation/dml.rst
@@ -663,7 +663,7 @@ To add fixed effect heterogeneity, we can create one-hot encodings of the id, wh
     from econml.dml import LinearDML
     from sklearn.preprocessing import OneHotEncoder
     # removing one id to avoid colinearity, as is standard for fixed effects
-    X_oh = OneHotEncoder(sparse=False).fit_transform(X)[:, 1:]
+    X_oh = OneHotEncoder(sparse_output=False).fit_transform(X)[:, 1:]
     est = LinearDML(model_y=RandomForestRegressor(),
                     model_t=RandomForestRegressor())

diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py
index 4db1e2921..771a4e6b6 100644
--- a/econml/_ortho_learner.py
+++ b/econml/_ortho_learner.py
@@ -44,7 +44,7 @@ class in this module implements the general logic in a very versatile way
                               TreatmentExpansionMixin)
 from .inference import BootstrapInference
 from .utilities import (_deprecate_positional, check_input_arrays,
-                        cross_product, filter_none_kwargs, strata_from_discrete_arrays,
+                        cross_product, filter_none_kwargs, one_hot_encoder, strata_from_discrete_arrays,
                         inverse_onehot, jacify_featurizer, ndim, reshape, shape, transpose)
 from .sklearn_extensions.model_selection import ModelSelector

@@ -780,7 +780,7 @@ def fit(self, Y, T, *, X=None, W=None, Z=None, sample_weight=None, freq_weight=N
             categories = self.categories
             if categories != 'auto':
                 categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-            self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+            self.transformer = one_hot_encoder(categories=categories, drop='first')
             self.transformer.fit(reshape(T, (-1, 1)))
             self._d_t = (len(self.transformer.categories_[0]) - 1,)
         elif self.treatment_featurizer:
@@ -792,7 +792,7 @@ def fit(self, Y, T, *, X=None, W=None, Z=None, sample_weight=None, freq_weight=N
             self.transformer = None

         if self.discrete_instrument:
-            self.z_transformer = OneHotEncoder(categories='auto', sparse=False, drop='first')
+            self.z_transformer = one_hot_encoder(categories='auto', drop='first')
             self.z_transformer.fit(reshape(Z, (-1, 1)))
         else:
             self.z_transformer = None
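Aside, not part of the patch: a minimal sketch of the encode/decode round-trip that the transformer set up above relies on. Discrete treatments are one-hot encoded with drop='first', and inverse_onehot recovers integer labels, treating the all-zeros row as the dropped baseline category. The sparse_output spelling assumes scikit-learn >= 1.2; older versions use sparse=False.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder
    from econml.utilities import inverse_onehot

    T = np.array([0, 2, 1, 0])
    # drop='first' makes category 0 the all-zeros baseline row
    T_oh = OneHotEncoder(categories='auto', sparse_output=False, drop='first').fit_transform(T.reshape(-1, 1))
    print(T_oh.shape)            # (4, 2): one column per non-baseline category
    print(inverse_onehot(T_oh))  # [0 2 1 0]
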
diff --git a/econml/metalearners/_metalearners.py b/econml/metalearners/_metalearners.py
index bb6b5a334..96e78366f 100644
--- a/econml/metalearners/_metalearners.py
+++ b/econml/metalearners/_metalearners.py
@@ -16,7 +16,7 @@
 from sklearn.utils import check_array, check_X_y
 from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
 from ..utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects,
-                         inverse_onehot, transpose, _deprecate_positional)
+                         one_hot_encoder, inverse_onehot, transpose, _deprecate_positional)
 from .._shap import _shap_explain_model_cate

@@ -109,7 +109,7 @@ def fit(self, Y, T, *, X, inference=None):
         categories = self.categories
         if categories != 'auto':
             categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-        self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+        self.transformer = one_hot_encoder(categories=categories, drop='first')
         T = self.transformer.fit_transform(T.reshape(-1, 1))
         self._d_t = T.shape[1:]
         T = inverse_onehot(T)
@@ -232,7 +232,7 @@ def fit(self, Y, T, *, X=None, inference=None):
         categories = self.categories
         if categories != 'auto':
             categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-        self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+        self.transformer = one_hot_encoder(categories=categories, drop='first')
         T = self.transformer.fit_transform(T.reshape(-1, 1))
         self._d_t = (T.shape[1], )
         # Note: unlike other Metalearners, we need the controls' encoded column for training
@@ -375,7 +375,7 @@ def fit(self, Y, T, *, X, inference=None):
         categories = self.categories
         if categories != 'auto':
             categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-        self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+        self.transformer = one_hot_encoder(categories=categories, drop='first')
         T = self.transformer.fit_transform(T.reshape(-1, 1))
         self._d_t = T.shape[1:]
         T = inverse_onehot(T)
@@ -537,7 +537,7 @@ def fit(self, Y, T, *, X, inference=None):
         categories = self.categories
         if categories != 'auto':
             categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-        self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+        self.transformer = one_hot_encoder(categories=categories, drop='first')
         T = self.transformer.fit_transform(T.reshape(-1, 1))
         self._d_t = T.shape[1:]
         T = inverse_onehot(T)
diff --git a/econml/orf/_ortho_forest.py b/econml/orf/_ortho_forest.py
index 9c4c254cd..12afcdb2e 100644
--- a/econml/orf/_ortho_forest.py
+++ b/econml/orf/_ortho_forest.py
@@ -39,7 +39,7 @@
 from ._causal_tree import CausalTree
 from ..inference import NormalInferenceResults
 from ..inference._inference import Inference
-from ..utilities import (reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional,
+from ..utilities import (one_hot_encoder, reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional,
                          cross_product, inverse_onehot, check_input_arrays, jacify_featurizer,
                          _RegressionWrapper, deprecated, ndim)
 from sklearn.model_selection import check_cv
@@ -676,7 +676,7 @@ def fit(self, Y, T, *, X, W=None, inference='auto'):
             categories = self.categories
             if categories != 'auto':
                 categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-            self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+            self.transformer = one_hot_encoder(categories=categories, drop='first')
             d_t_in = T.shape[1:]
             T = self.transformer.fit_transform(T.reshape(-1, 1))
             self._d_t = T.shape[1:]
@@ -1030,7 +1030,7 @@ def fit(self, Y, T, *, X, W=None, inference='auto'):
             categories = self.categories
             if categories != 'auto':
                 categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-            self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+            self.transformer = one_hot_encoder(categories=categories, drop='first')
             d_t_in = T.shape[1:]
             T = self.transformer.fit_transform(T.reshape(-1, 1))
             self._d_t = T.shape[1:]
diff --git a/econml/sklearn_extensions/linear_model.py b/econml/sklearn_extensions/linear_model.py
index 49edfa61c..c95f392de 100644
--- a/econml/sklearn_extensions/linear_model.py
+++ b/econml/sklearn_extensions/linear_model.py
@@ -108,7 +108,7 @@ def _fit_weighted_linear_model(self, X, y, sample_weight, check_input=None):
         # Normalize inputs
         X, y, X_offset, y_offset, X_scale = _preprocess_data(
-            X, y, fit_intercept=self.fit_intercept, normalize=False,
+            X, y, fit_intercept=self.fit_intercept,
             copy=self.copy_X, check_input=check_input if check_input is not None else True,
             sample_weight=sample_weight)
         # Weight inputs
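Both _preprocess_data hunks in this file (the one above and the one that follows) simply drop normalize=False instead of gating on the version. A sketch, not from the patch, of why that is safe: normalize already defaults to False in the scikit-learn versions econml supports, and scikit-learn 1.4 removed the parameter from this private helper altogether, so the argument-free call runs on both sides of the change.

    import numpy as np
    from sklearn.linear_model._base import _preprocess_data  # private sklearn helper

    X = np.random.normal(size=(10, 3))
    y = np.random.normal(size=10)
    # Omitting `normalize` works on sklearn 1.0-1.3, where it defaults to False,
    # and on sklearn >= 1.4, where the parameter no longer exists.
    X, y, X_offset, y_offset, X_scale = _preprocess_data(X, y, fit_intercept=True, copy=True)
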
@@ -737,7 +737,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         super().fit(X, y, sample_weight, check_input)
         # Center X, y
         X, y, X_offset, y_offset, X_scale = _preprocess_data(
-            X, y, fit_intercept=self.fit_intercept, normalize=False,
+            X, y, fit_intercept=self.fit_intercept,
             copy=self.copy_X, check_input=check_input, sample_weight=sample_weight)

         # Calculate quantities that will be used later on. Account for centered data
diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py
index 5cab620c6..9628600a5 100644
--- a/econml/sklearn_extensions/model_selection.py
+++ b/econml/sklearn_extensions/model_selection.py
@@ -842,13 +842,17 @@ def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=None,
     # independent, and that it is pickle-able.
     parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
-    predictions = parallel(delayed(_fit_and_predict)(
-        clone(estimator, safe=safe), X, y, train, test, verbose, fit_params, method)
-        for train, test in splits)
+    from pkg_resources import parse_version

-    if parse_version(sklearn.__version__) < parse_version("0.24.0"):
-        # Prior to 0.24.0, this private scikit-learn method returned a tuple of two values
-        predictions = [p[0] for p in predictions]
+    # verbose was removed from sklearn's non-public _fit_and_predict method in 1.4
+    if parse_version(sklearn.__version__) < parse_version("1.4"):
+        predictions = parallel(delayed(_fit_and_predict)(
+            clone(estimator, safe=safe), X, y, train, test, verbose, fit_params, method)
+            for train, test in splits)
+    else:
+        predictions = parallel(delayed(_fit_and_predict)(
+            clone(estimator, safe=safe), X, y, train, test, fit_params, method)
+            for train, test in splits)

     inv_test_indices = np.empty(len(test_indices), dtype=int)
     inv_test_indices[test_indices] = np.arange(len(test_indices))
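The version gate above keys off the scikit-learn version string. An equivalent check, not what the patch does, is to inspect the private helper's signature directly, which also copes with backported builds:

    import inspect
    from sklearn.model_selection._validation import _fit_and_predict  # private sklearn helper

    # True on scikit-learn < 1.4; False on >= 1.4, where the parameter was dropped
    takes_verbose = "verbose" in inspect.signature(_fit_and_predict).parameters
    print(takes_verbose)
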
diff --git a/econml/solutions/causal_analysis/_causal_analysis.py b/econml/solutions/causal_analysis/_causal_analysis.py
index b929ae03b..af61ed775 100644
--- a/econml/solutions/causal_analysis/_causal_analysis.py
+++ b/econml/solutions/causal_analysis/_causal_analysis.py
@@ -25,7 +25,7 @@
 from ...inference import NormalInferenceResults
 from ...sklearn_extensions.linear_model import WeightedLasso
 from ...sklearn_extensions.model_selection import GridSearchCVList
-from ...utilities import _RegressionWrapper, get_feature_names_or_default, inverse_onehot
+from ...utilities import _RegressionWrapper, get_feature_names_or_default, inverse_onehot, one_hot_encoder

 # TODO: this utility is documented but internal; reimplement?
 from sklearn.utils import _safe_indexing
@@ -203,8 +203,7 @@ def fit(self, X):
         if cat_cols.shape[1] > 0:
             self.has_cats = True
             # NOTE: set handle_unknown to 'ignore' so that we don't throw at runtime if given a novel value
-            self.one_hot_encoder = OneHotEncoder(sparse=False,
-                                                 handle_unknown='ignore').fit(cat_cols)
+            self.one_hot_encoder = one_hot_encoder(handle_unknown='ignore').fit(cat_cols)
         else:
             self.has_cats = False
         self.d_x = X.shape[1]
@@ -335,12 +334,12 @@ def _process_feature(name, feat_ind, verbose, categorical_inds, categories, hete
     # we achieve this by pipelining the X scaling with the Y and T models (with fixed scaling, not refitting)
     hinds = heterogeneity_inds[feat_ind]
-    WX_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
+    WX_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
                                          [ind for ind in categorical_inds if ind != feat_ind]),
                                         ('drop', 'drop', feat_ind)],
                                        remainder=StandardScaler())
-    W_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
+    W_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
                                         [ind for ind in categorical_inds
                                          if ind != feat_ind and ind not in hinds]),
                                        ('drop', 'drop', hinds),
@@ -732,8 +731,7 @@ def fit(self, X, y, warm_start=False):
         if train_y_model:
             # perform model selection for the Y model using all X, not on a per-column basis
             allX = ColumnTransformer([('encode',
-                                       OneHotEncoder(
-                                           drop='first', sparse=False),
+                                       one_hot_encoder(drop='first'),
                                        self.categorical)],
                                      remainder=StandardScaler()).fit_transform(X)
@@ -757,7 +755,7 @@ def fit(self, X, y, warm_start=False):
                 # note that this needs to happen after wrapping to generalize to the multi-class case,
                 # since otherwise we'll have too many columns to be able to train a classifier
-                y = OneHotEncoder(drop='first', sparse=False).fit_transform(y)
+                y = one_hot_encoder(drop='first').fit_transform(y)

         assert y.ndim == 1 or y.shape[1] == 1, ("Multiclass classification isn't supported"
                                                 if self.classification
                                                 else "Only a single outcome is supported")
diff --git a/econml/tests/test_deepiv.py b/econml/tests/test_deepiv.py
index f86389d26..208e3e4be 100644
--- a/econml/tests/test_deepiv.py
+++ b/econml/tests/test_deepiv.py
@@ -235,7 +235,7 @@ def monte_carlo_error(g_hat, data_fn, ntest=5000, has_latent=False, debug=False)
         def one_hot(col, **kwargs):
             z = col.reshape(-1, 1)
-            enc = OneHotEncoder(sparse=False, **kwargs)
+            enc = OneHotEncoder(sparse_output=False, **kwargs)
             return enc.fit_transform(z)

         def sensf(x):
@@ -375,7 +375,7 @@ def monte_carlo_error(g_hat, data_fn, ntest=5000, has_latent=False, debug=False)
         def one_hot(col, **kwargs):
             z = col.reshape(-1, 1)
-            enc = OneHotEncoder(sparse=False, **kwargs)
+            enc = OneHotEncoder(sparse_output=False, **kwargs)
             return enc.fit_transform(z)

         def sensf(x):
diff --git a/econml/tests/test_dml.py b/econml/tests/test_dml.py
index 57f6e9051..d7e9828e1 100644
--- a/econml/tests/test_dml.py
+++ b/econml/tests/test_dml.py
@@ -910,7 +910,7 @@ def test_can_use_featurizer(self):
                           [1, 4, 5, 7, 9, 10, 12, 14, 17])
         dml = LinearDML(model_y=LinearRegression(), model_t=LinearRegression(),
-                        fit_cate_intercept=False, featurizer=OneHotEncoder(sparse=False),
+                        fit_cate_intercept=False, featurizer=OneHotEncoder(sparse_output=False),
                         cv=[splits, splits[::-1]])

         T = np.tile([1, 2, 3], 6)
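The test files above switch to the new keyword directly, since the tests run against whatever scikit-learn is installed. For reference, the rename itself in isolation (a standalone sketch): scikit-learn 1.2 renamed OneHotEncoder's sparse argument to sparse_output and removed the old name in 1.4, which is exactly the window the one_hot_encoder shim introduced below papers over.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    T = np.array([[1], [2], [3]])
    enc = OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2 spelling
    # enc = OneHotEncoder(sparse=False)       # scikit-learn < 1.2 spelling
    print(enc.fit_transform(T))
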
diff --git a/econml/tests/test_utilities.py b/econml/tests/test_utilities.py
index a3b5f8de7..676f498b7 100644
--- a/econml/tests/test_utilities.py
+++ b/econml/tests/test_utilities.py
@@ -99,7 +99,7 @@ def test_transpose_compatible(self):

     def test_inverse_onehot(self):
         T = np.random.randint(4, size=100)
-        T_oh = OneHotEncoder(categories='auto', sparse=False).fit_transform(T.reshape(-1, 1))[:, 1:]
+        T_oh = OneHotEncoder(categories='auto', sparse_output=False).fit_transform(T.reshape(-1, 1))[:, 1:]
         T_inv = inverse_onehot(T_oh)
         np.testing.assert_array_equal(T, T_inv)
diff --git a/econml/utilities.py b/econml/utilities.py
index 8e9bc8fa5..faaf27f9a 100644
--- a/econml/utilities.py
+++ b/econml/utilities.py
@@ -6,6 +6,7 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse
+import sklearn
 import sparse as sp
 import itertools
 import inspect
@@ -18,7 +19,7 @@
 from functools import reduce, wraps
 from sklearn.utils import check_array, check_X_y
 from sklearn.utils.validation import assert_all_finite
-from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
+from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, LabelEncoder
 import warnings
 from warnings import warn
 from collections.abc import Iterable
@@ -1508,3 +1509,15 @@ def strata_from_discrete_arrays(arrs):
         curr_array = temp + curr_array * len(enc.classes_)

     return curr_array
+
+
+def one_hot_encoder(sparse=False, **kwargs):
+    """
+    Wrapper for sklearn's OneHotEncoder that handles the name change from `sparse` to `sparse_output`
+    between sklearn versions 1.1 and 1.2.
+    """
+    from pkg_resources import parse_version
+    if parse_version(sklearn.__version__) < parse_version("1.2"):
+        return OneHotEncoder(sparse=sparse, **kwargs)
+    else:
+        return OneHotEncoder(sparse_output=sparse, **kwargs)
diff --git a/pyproject.toml b/pyproject.toml
index 3e9a326b5..f7ca1cbc4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
 dependencies = [
     "numpy",
     "scipy > 1.4.0",
-    "scikit-learn >= 1.0, < 1.4",
+    "scikit-learn >= 1.0, < 1.5",
     "sparse",
     "joblib >= 0.13.0",
     "statsmodels >= 0.10",
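A usage sketch of the new compatibility helper (the signature comes from the one_hot_encoder definition added above; the data is made up): extra keyword arguments pass straight through to OneHotEncoder, and the default sparse=False yields dense output on every scikit-learn version in the supported range.

    import numpy as np
    from econml.utilities import one_hot_encoder

    T = np.array(['a', 'b', 'a', 'c']).reshape(-1, 1)
    enc = one_hot_encoder(drop='first').fit(T)  # kwargs pass through to OneHotEncoder
    print(enc.transform(T))                     # dense (4, 2) array; baseline 'a' dropped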