Skip to content

Commit

Permalink
Enable support for sklearn 1.4
Browse files Browse the repository at this point in the history
Signed-off-by: Keith Battocchi <kebatt@microsoft.com>
  • Loading branch information
kbattocchi committed Feb 14, 2024
1 parent b8a5e2f commit ab572de
Show file tree
Hide file tree
Showing 12 changed files with 49 additions and 34 deletions.
2 changes: 1 addition & 1 deletion doc/spec/estimation/dml.rst
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ To add fixed effect heterogeneity, we can create one-hot encodings of the id, wh
from econml.dml import LinearDML
from sklearn.preprocessing import OneHotEncoder
# removing one id to avoid collinearity, as is standard for fixed effects
X_oh = OneHotEncoder(sparse=False).fit_transform(X)[:, 1:]
X_oh = OneHotEncoder(sparse_output=False).fit_transform(X)[:, 1:]

est = LinearDML(model_y=RandomForestRegressor(),
model_t=RandomForestRegressor())
Expand Down
6 changes: 3 additions & 3 deletions econml/_ortho_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class in this module implements the general logic in a very versatile way
TreatmentExpansionMixin)
from .inference import BootstrapInference
from .utilities import (_deprecate_positional, check_input_arrays,
cross_product, filter_none_kwargs, strata_from_discrete_arrays,
cross_product, filter_none_kwargs, one_hot_encoder, strata_from_discrete_arrays,
inverse_onehot, jacify_featurizer, ndim, reshape, shape, transpose)
from .sklearn_extensions.model_selection import ModelSelector

Expand Down Expand Up @@ -780,7 +780,7 @@ def fit(self, Y, T, *, X=None, W=None, Z=None, sample_weight=None, freq_weight=N
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
self.transformer.fit(reshape(T, (-1, 1)))
self._d_t = (len(self.transformer.categories_[0]) - 1,)
elif self.treatment_featurizer:
Expand All @@ -792,7 +792,7 @@ def fit(self, Y, T, *, X=None, W=None, Z=None, sample_weight=None, freq_weight=N
self.transformer = None

if self.discrete_instrument:
self.z_transformer = OneHotEncoder(categories='auto', sparse=False, drop='first')
self.z_transformer = one_hot_encoder(categories='auto', drop='first')
self.z_transformer.fit(reshape(Z, (-1, 1)))
else:
self.z_transformer = None
Expand Down
10 changes: 5 additions & 5 deletions econml/metalearners/_metalearners.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from sklearn.utils import check_array, check_X_y
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from ..utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects,
inverse_onehot, transpose, _deprecate_positional)
one_hot_encoder, inverse_onehot, transpose, _deprecate_positional)
from .._shap import _shap_explain_model_cate


Expand Down Expand Up @@ -109,7 +109,7 @@ def fit(self, Y, T, *, X, inference=None):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
T = inverse_onehot(T)
Expand Down Expand Up @@ -232,7 +232,7 @@ def fit(self, Y, T, *, X=None, inference=None):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = (T.shape[1], )
# Note: unlike other Metalearners, we need the controls' encoded column for training
Expand Down Expand Up @@ -375,7 +375,7 @@ def fit(self, Y, T, *, X, inference=None):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
T = inverse_onehot(T)
Expand Down Expand Up @@ -537,7 +537,7 @@ def fit(self, Y, T, *, X, inference=None):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
T = inverse_onehot(T)
Expand Down
6 changes: 3 additions & 3 deletions econml/orf/_ortho_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from ._causal_tree import CausalTree
from ..inference import NormalInferenceResults
from ..inference._inference import Inference
from ..utilities import (reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional,
from ..utilities import (one_hot_encoder, reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional,
cross_product, inverse_onehot, check_input_arrays, jacify_featurizer,
_RegressionWrapper, deprecated, ndim)
from sklearn.model_selection import check_cv
Expand Down Expand Up @@ -676,7 +676,7 @@ def fit(self, Y, T, *, X, W=None, inference='auto'):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
d_t_in = T.shape[1:]
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
Expand Down Expand Up @@ -1030,7 +1030,7 @@ def fit(self, Y, T, *, X, W=None, inference='auto'):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
d_t_in = T.shape[1:]
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
Expand Down
4 changes: 2 additions & 2 deletions econml/sklearn_extensions/linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def _fit_weighted_linear_model(self, X, y, sample_weight, check_input=None):

# Normalize inputs
X, y, X_offset, y_offset, X_scale = _preprocess_data(
X, y, fit_intercept=self.fit_intercept, normalize=False,
X, y, fit_intercept=self.fit_intercept,
copy=self.copy_X, check_input=check_input if check_input is not None else True,
sample_weight=sample_weight)
# Weight inputs
Expand Down Expand Up @@ -737,7 +737,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
super().fit(X, y, sample_weight, check_input)
# Center X, y
X, y, X_offset, y_offset, X_scale = _preprocess_data(
X, y, fit_intercept=self.fit_intercept, normalize=False,
X, y, fit_intercept=self.fit_intercept,
copy=self.copy_X, check_input=check_input, sample_weight=sample_weight)

# Calculate quantities that will be used later on. Account for centered data
Expand Down
16 changes: 10 additions & 6 deletions econml/sklearn_extensions/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,13 +842,17 @@ def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=None,
# independent, and that it is pickle-able.
parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch)
predictions = parallel(delayed(_fit_and_predict)(
clone(estimator, safe=safe), X, y, train, test, verbose, fit_params, method)
for train, test in splits)

from pkg_resources import parse_version
if parse_version(sklearn.__version__) < parse_version("0.24.0"):
# Prior to 0.24.0, this private scikit-learn method returned a tuple of two values
predictions = [p[0] for p in predictions]
# verbose was removed from sklearn's non-public _fit_and_predict method in 1.4
if parse_version(sklearn.__version__) < parse_version("1.4"):
predictions = parallel(delayed(_fit_and_predict)(
clone(estimator, safe=safe), X, y, train, test, verbose, fit_params, method)
for train, test in splits)
else:
predictions = parallel(delayed(_fit_and_predict)(
clone(estimator, safe=safe), X, y, train, test, fit_params, method)
for train, test in splits)

inv_test_indices = np.empty(len(test_indices), dtype=int)
inv_test_indices[test_indices] = np.arange(len(test_indices))
Expand Down
14 changes: 6 additions & 8 deletions econml/solutions/causal_analysis/_causal_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from ...inference import NormalInferenceResults
from ...sklearn_extensions.linear_model import WeightedLasso
from ...sklearn_extensions.model_selection import GridSearchCVList
from ...utilities import _RegressionWrapper, get_feature_names_or_default, inverse_onehot
from ...utilities import _RegressionWrapper, get_feature_names_or_default, inverse_onehot, one_hot_encoder

# TODO: this utility is documented but internal; reimplement?
from sklearn.utils import _safe_indexing
Expand Down Expand Up @@ -203,8 +203,7 @@ def fit(self, X):
if cat_cols.shape[1] > 0:
self.has_cats = True
# NOTE: set handle_unknown to 'ignore' so that we don't throw at runtime if given a novel value
self.one_hot_encoder = OneHotEncoder(sparse=False,
handle_unknown='ignore').fit(cat_cols)
self.one_hot_encoder = one_hot_encoder(handle_unknown='ignore').fit(cat_cols)
else:
self.has_cats = False
self.d_x = X.shape[1]
Expand Down Expand Up @@ -335,12 +334,12 @@ def _process_feature(name, feat_ind, verbose, categorical_inds, categories, hete
# we achieve this by pipelining the X scaling with the Y and T models (with fixed scaling, not refitting)

hinds = heterogeneity_inds[feat_ind]
WX_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
WX_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
[ind for ind in categorical_inds
if ind != feat_ind]),
('drop', 'drop', feat_ind)],
remainder=StandardScaler())
W_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
W_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
[ind for ind in categorical_inds
if ind != feat_ind and ind not in hinds]),
('drop', 'drop', hinds),
Expand Down Expand Up @@ -732,8 +731,7 @@ def fit(self, X, y, warm_start=False):
if train_y_model:
# perform model selection for the Y model using all X, not on a per-column basis
allX = ColumnTransformer([('encode',
OneHotEncoder(
drop='first', sparse=False),
one_hot_encoder(drop='first'),
self.categorical)],
remainder=StandardScaler()).fit_transform(X)

Expand All @@ -757,7 +755,7 @@ def fit(self, X, y, warm_start=False):

# note that this needs to happen after wrapping to generalize to the multi-class case,
# since otherwise we'll have too many columns to be able to train a classifier
y = OneHotEncoder(drop='first', sparse=False).fit_transform(y)
y = one_hot_encoder(drop='first').fit_transform(y)

assert y.ndim == 1 or y.shape[1] == 1, ("Multiclass classification isn't supported" if self.classification
else "Only a single outcome is supported")
Expand Down
4 changes: 2 additions & 2 deletions econml/tests/test_deepiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def monte_carlo_error(g_hat, data_fn, ntest=5000, has_latent=False, debug=False)

def one_hot(col, **kwargs):
z = col.reshape(-1, 1)
enc = OneHotEncoder(sparse=False, **kwargs)
enc = OneHotEncoder(sparse_output=False, **kwargs)
return enc.fit_transform(z)

def sensf(x):
Expand Down Expand Up @@ -375,7 +375,7 @@ def monte_carlo_error(g_hat, data_fn, ntest=5000, has_latent=False, debug=False)

def one_hot(col, **kwargs):
z = col.reshape(-1, 1)
enc = OneHotEncoder(sparse=False, **kwargs)
enc = OneHotEncoder(sparse_output=False, **kwargs)
return enc.fit_transform(z)

def sensf(x):
Expand Down
2 changes: 1 addition & 1 deletion econml/tests/test_dml.py
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ def test_can_use_featurizer(self):
[1, 4, 5, 7, 9, 10, 12, 14, 17])

dml = LinearDML(model_y=LinearRegression(), model_t=LinearRegression(),
fit_cate_intercept=False, featurizer=OneHotEncoder(sparse=False),
fit_cate_intercept=False, featurizer=OneHotEncoder(sparse_output=False),
cv=[splits, splits[::-1]])

T = np.tile([1, 2, 3], 6)
Expand Down
2 changes: 1 addition & 1 deletion econml/tests/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def test_transpose_compatible(self):

def test_inverse_onehot(self):
T = np.random.randint(4, size=100)
T_oh = OneHotEncoder(categories='auto', sparse=False).fit_transform(T.reshape(-1, 1))[:, 1:]
T_oh = OneHotEncoder(categories='auto', sparse_output=False).fit_transform(T.reshape(-1, 1))[:, 1:]
T_inv = inverse_onehot(T_oh)
np.testing.assert_array_equal(T, T_inv)

Expand Down
15 changes: 14 additions & 1 deletion econml/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np
import pandas as pd
import scipy.sparse
import sklearn
import sparse as sp
import itertools
import inspect
Expand All @@ -18,7 +19,7 @@
from functools import reduce, wraps
from sklearn.utils import check_array, check_X_y
from sklearn.utils.validation import assert_all_finite
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, LabelEncoder
import warnings
from warnings import warn
from collections.abc import Iterable
Expand Down Expand Up @@ -1508,3 +1509,15 @@ def strata_from_discrete_arrays(arrs):
curr_array = temp + curr_array * len(enc.classes_)

return curr_array


def one_hot_encoder(sparse=False, **kwargs):
    """
    Wrapper for sklearn's OneHotEncoder that handles the name change from `sparse` to `sparse_output`
    between sklearn versions 1.1 and 1.2.

    Parameters
    ----------
    sparse : bool, default False
        Value forwarded to the encoder's sparsity option. It is passed as `sparse_output` when the
        installed OneHotEncoder accepts that argument, and as `sparse` otherwise.
    **kwargs
        Any other keyword arguments, passed through to OneHotEncoder unchanged.

    Returns
    -------
    OneHotEncoder
        A new, unfitted encoder constructed with the given arguments.
    """
    # Detect the supported keyword from the constructor signature instead of parsing
    # sklearn.__version__ with pkg_resources: pkg_resources is deprecated and is not guaranteed
    # to be installed (it ships with setuptools, which Python 3.12+ virtual environments no
    # longer include by default), so importing it here could fail at call time.
    if "sparse_output" in inspect.signature(OneHotEncoder).parameters:
        return OneHotEncoder(sparse_output=sparse, **kwargs)
    return OneHotEncoder(sparse=sparse, **kwargs)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ classifiers = [
dependencies = [
"numpy",
"scipy > 1.4.0",
"scikit-learn >= 1.0, < 1.4",
"scikit-learn >= 1.0, < 1.5",
"sparse",
"joblib >= 0.13.0",
"statsmodels >= 0.10",
Expand Down

0 comments on commit ab572de

Please sign in to comment.