
[ENH] Allow HonestForest to sample higher than max_samples # of bootstraps (#206)

* Add function for estimating posteriors on oob samples
* Allow HonestForest to bootstrap-sample more than 1.0 * n_samples

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 authored Jan 31, 2024
1 parent 1c53c99 commit 15542ce
Showing 7 changed files with 530 additions and 69 deletions.
2 changes: 2 additions & 0 deletions doc/whats_new/v0.6.rst
@@ -32,6 +32,8 @@ Changelog
- |Enhancement| :class:`sktree.HonestForestClassifier` now has a fitted
  property ``oob_samples_``, which reproduces the sample indices per tree that are
  out of bag, by `Adam Li`_ (:pr:`#200`).
- |Enhancement| :class:`sktree.HonestForestClassifier` now allows bootstrap sampling
  more samples than the number of training samples, controlled by the ``max_samples``
  keyword argument, by `Adam Li`_ (:pr:`#206`).
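A quick sketch of the new behavior (illustrative only, not part of the commit; the data and parameter values below are arbitrary). A float ``max_samples`` above 1.0 is now accepted when ``bootstrap=True``:

import numpy as np
from sktree import HonestForestClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 4))
y = (X[:, 0] > 0).astype(int)

# Draws round(1.6 * 100) = 160 bootstrap samples per tree; previously a float
# max_samples was restricted to the interval (0.0, 1.0].
clf = HonestForestClassifier(
    n_estimators=10, bootstrap=True, max_samples=1.6, random_state=0
)
clf.fit(X, y)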


Code and Documentation Contributors
311 changes: 288 additions & 23 deletions sktree/ensemble/_honest_forest.py
@@ -3,18 +3,65 @@

import threading
from numbers import Integral, Real
from warnings import warn

import numpy as np
from joblib import Parallel, delayed
from scipy.sparse import issparse
from sklearn.base import _fit_context
from sklearn.ensemble._base import _partition_estimators
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.exceptions import DataConversionWarning
from sklearn.utils import check_random_state
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import _check_sample_weight, check_is_fitted

from .._lib.sklearn.ensemble._forest import ForestClassifier, _parallel_build_trees
from .._lib.sklearn.tree._tree import DOUBLE, DTYPE
from ..tree import HonestTreeClassifier


def _get_n_samples_bootstrap(n_samples, max_samples):
    """Get the number of samples in a bootstrap sample.

    XXX: Note this is copied from scikit-learn. We remove the upper bound on
    ``max_samples`` so that more bootstrap samples than ``n_samples`` may be
    drawn, which enables sampling closer to 80% unique training data points
    for the in-bag computation.

    Parameters
    ----------
    n_samples : int
        Number of samples in the dataset.
    max_samples : int or float
        The maximum number of samples to draw from the total available:

        - if float, this indicates a fraction of the total;
        - if int, this indicates the exact number of samples;
        - if None, this indicates the total number of samples.

    Returns
    -------
    n_samples_bootstrap : int
        The total number of samples to draw for the bootstrap sample.
    """
    if max_samples is None:
        return n_samples

    if isinstance(max_samples, Integral):
        return max_samples

    if isinstance(max_samples, Real):
        return round(n_samples * max_samples)

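The "closer to 80% unique" remark can be made concrete with a small sketch (illustrative only; it imports the private helper defined above). Each bootstrap draw misses a fixed training point with probability 1 - 1/n, so m draws leave about 1 - (1 - 1/n)^m ≈ 1 - e^(-m/n) of the points in-bag:

import numpy as np

# Private helper defined in this file (sktree/ensemble/_honest_forest.py).
from sktree.ensemble._honest_forest import _get_n_samples_bootstrap

n = 1000
for max_samples in (None, 1.0, 1.6):
    m = _get_n_samples_bootstrap(n_samples=n, max_samples=max_samples)
    frac_unique = 1 - (1 - 1 / n) ** m  # expected fraction of unique in-bag points
    print(max_samples, m, round(frac_unique, 3))
# None and 1.0 give ~0.632 unique in-bag; max_samples=1.6 gives ~0.798.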

class HonestForestClassifier(ForestClassifier):
@@ -176,8 +223,7 @@ class HonestForestClassifier(ForestClassifier):
        - If None (default), then draw `X.shape[0]` samples.
        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples.

    honest_prior : {"ignore", "uniform", "empirical"}, default="empirical"
        Method for dealing with empty leaves during evaluation of a test
@@ -269,7 +315,7 @@ class labels (multi-output problem).
    oob_samples_ : list of lists, shape=(n_estimators, n_samples_bootstrap)
        The indices of training samples that are "out-of-bag". Only used
        if ``bootstrap=True``.

    Notes
    -----
@@ -422,16 +468,228 @@ def fit(self, X, y, sample_weight=None, classes=None):
        Returns
        -------
        self : object
            Fitted estimator.
        """
        # XXX: This entire function is a copy of what is in scikit-learn,
        # with the exception of:
        # - _get_n_samples_bootstrap is a re-defined function to allow higher
        #   max_samples
        MAX_INT = np.iinfo(np.int32).max

        # Validate or convert input data
        if issparse(y):
            raise ValueError("sparse multilabel-indicator for y is not supported.")

        X, y = self._validate_data(
            X,
            y,
            multi_output=True,
            accept_sparse="csc",
            dtype=DTYPE,
            force_all_finite=False,
        )

        # _compute_missing_values_in_feature_mask checks if X has missing values and
        # will raise an error if the underlying tree base estimator can't handle
        # missing values. Only the criterion is required to determine if the tree
        # supports missing values.
        estimator = type(self.estimator)(criterion=self.criterion)
        missing_values_in_feature_mask = estimator._compute_missing_values_in_feature_mask(
            X, estimator_name=self.__class__.__name__
        )

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        if issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        y = np.atleast_1d(y)
        if y.ndim == 2 and y.shape[1] == 1:
            warn(
                (
                    "A column-vector y was passed when a 1d array was"
                    " expected. Please change the shape of y to "
                    "(n_samples,), for example using ravel()."
                ),
                DataConversionWarning,
                stacklevel=2,
            )

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity;
            # using y[:, np.newaxis] would not.
            y = np.reshape(y, (-1, 1))

        if self.criterion == "poisson":
            if np.any(y < 0):
                raise ValueError(
                    "Some value(s) of y are negative which is "
                    "not allowed for Poisson regression."
                )
            if np.sum(y) <= 0:
                raise ValueError(
                    "Sum of y is not strictly positive which "
                    "is necessary for Poisson regression."
                )

        self._n_samples, self.n_outputs_ = y.shape

        y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        if not self.bootstrap and self.max_samples is not None:
            raise ValueError(
                "`max_samples` cannot be set if `bootstrap=False`. "
                "Either switch to `bootstrap=True` or set "
                "`max_samples=None`."
            )
        elif self.bootstrap:
            n_samples_bootstrap = _get_n_samples_bootstrap(
                n_samples=X.shape[0], max_samples=self.max_samples
            )
        else:
            n_samples_bootstrap = None

        self._n_samples_bootstrap = n_samples_bootstrap

        self._validate_estimator()

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available if bootstrap=True")

        random_state = check_random_state(self.random_state)

        if not self.warm_start or not hasattr(self, "estimators_"):
            # Free allocated memory, if any
            self.estimators_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if self.max_bins is not None:
            # `_openmp_effective_n_threads` is used to take cgroups CPU quotas
            # into account when determining the maximum number of threads to use.
            n_threads = _openmp_effective_n_threads()

            # Bin the data.
            # For ease of use of the API, the user-facing GBDT classes accept the
            # parameter max_bins, which doesn't take into account the bin for
            # missing values (which is always allocated). However, since max_bins
            # isn't the true maximal number of bins, all other private classes
            # (binmapper, histbuilder...) accept n_bins instead, which is the
            # actual total number of bins. Everywhere in the code, the
            # convention is that n_bins == max_bins + 1.
            n_bins = self.max_bins + 1  # + 1 for missing values
            self._bin_mapper = _BinMapper(
                n_bins=n_bins,
                # is_categorical=self.is_categorical_,
                known_categories=None,
                random_state=random_state,
                n_threads=n_threads,
            )
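            # For instance (a sketch of the convention above): max_bins=255
            # yields n_bins=256, with the extra bin reserved for missing values.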

            # XXX: in order for this to work with the underlying tree submodule's
            # Cython code, we need to convert this into the original data's DTYPE,
            # because the Cython code assumes that `DTYPE` is used.
            # The proper implementation will be a lot more complicated and should be
            # tackled once scikit-learn has finalized their inclusion of missing-data
            # and categorical support for decision trees.
            X = self._bin_data(X, is_training_data=True)  # .astype(DTYPE)
        else:
            self._bin_mapper = None

        if n_more_estimators < 0:
            raise ValueError(
                "n_estimators=%d must be larger or equal to "
                "len(estimators_)=%d when warm_start==True"
                % (self.n_estimators, len(self.estimators_))
            )
        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not fit new trees.")
        else:
            if self.warm_start and len(self.estimators_) > 0:
                # We draw from the random state to get the random state we
                # would have got if we hadn't used a warm_start.
                random_state.randint(MAX_INT, size=len(self.estimators_))

            trees = [
                self._make_estimator(append=False, random_state=random_state)
                for i in range(n_more_estimators)
            ]

            # Parallel loop: we prefer the threading backend as the Cython code
            # for fitting the trees is internally releasing the Python GIL,
            # making threading more efficient than multiprocessing in
            # that case. However, for joblib 0.12+ we respect any
            # parallel_backend contexts set at a higher level,
            # since correctness does not rely on using threads.
            trees = Parallel(
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                prefer="threads",
            )(
                delayed(_parallel_build_trees)(
                    t,
                    self.bootstrap,
                    X,
                    y,
                    sample_weight,
                    i,
                    len(trees),
                    verbose=self.verbose,
                    class_weight=self.class_weight,
                    n_samples_bootstrap=n_samples_bootstrap,
                    missing_values_in_feature_mask=missing_values_in_feature_mask,
                    classes=classes,
                )
                for i, t in enumerate(trees)
            )

            # Collect newly grown trees
            self.estimators_.extend(trees)

        if self.oob_score and (n_more_estimators > 0 or not hasattr(self, "oob_score_")):
            y_type = type_of_target(y)
            if y_type == "unknown" or (
                self._estimator_type == "classifier" and y_type == "multiclass-multioutput"
            ):
                # FIXME: we could consider supporting multiclass-multioutput if
                # we introduce or reuse a constructor parameter (e.g. oob_score)
                # allowing the user to pass a callable defining the scoring
                # strategy on OOB samples.
                raise ValueError(
                    "The type of target cannot be used to compute OOB "
                    f"estimates. Got {y_type} while only the following are "
                    "supported: continuous, continuous-multioutput, binary, "
                    "multiclass, multilabel-indicator."
                )

            if callable(self.oob_score):
                self._set_oob_score_and_attributes(X, y, scoring_function=self.oob_score)
            else:
                self._set_oob_score_and_attributes(X, y)

        # Decapsulate classes_ attributes
        if hasattr(self, "classes_") and self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        # Compute the honest decision function
        self.honest_decision_function_ = self._predict_proba(
            X, indices=self.honest_indices_, impute_missing=np.nan
        )

        return self
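A usage sketch of the attributes this method sets (illustrative only; ``_n_samples_bootstrap`` is the private attribute assigned above, and the data and parameter values are arbitrary):

import numpy as np
from sktree import HonestForestClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(120, 5))
y = (X[:, 0] > 0).astype(int)

est = HonestForestClassifier(
    n_estimators=25, bootstrap=True, max_samples=1.6, random_state=0
).fit(X, y)

print(est._n_samples_bootstrap)             # round(1.6 * 120) == 192 draws per tree
print(est.honest_decision_function_.shape)  # honest posteriors, (n_samples, n_classes)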

    def predict_proba(self, X):
@@ -517,19 +775,26 @@ def oob_samples_(self):
        if self.bootstrap is False:
            raise RuntimeError("Cannot extract out-of-bag samples when bootstrap is False.")
        check_is_fitted(self)

        oob_samples = []
        possible_indices = np.arange(self._n_samples)
        for structure_idx, honest_idx in zip(self.structure_indices_, self.honest_indices_):
            # OOB indices are the complement of the in-bag (structure + honest) indices.
            _oob_samples = np.setdiff1d(
                possible_indices, np.concatenate((structure_idx, honest_idx))
            )
            oob_samples.append(_oob_samples)
        # n_samples_bootstrap = _get_n_samples_bootstrap(
        #     self._n_samples,
        #     self.max_samples,
        # )
        # for estimator in self.estimators_:
        #     unsampled_indices = _generate_unsampled_indices(
        #         estimator.random_state,
        #         self._n_samples,
        #         n_samples_bootstrap,
        #     )
        #     oob_samples.append(unsampled_indices)
        return oob_samples
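Continuing the sketch above, the rewritten property encodes a simple invariant: per tree, the out-of-bag indices are exactly the complement of the union of the structure and honest indices.

oob = est.oob_samples_
for tree_idx in range(len(est.estimators_)):
    in_bag = np.concatenate(
        (est.structure_indices_[tree_idx], est.honest_indices_[tree_idx])
    )
    # Every training index is either in-bag (structure/honest) or out-of-bag.
    assert np.array_equal(
        np.union1d(in_bag, oob[tree_idx]), np.arange(est._n_samples)
    )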

def _more_tags(self):
2 changes: 2 additions & 0 deletions sktree/stats/__init__.py
@@ -3,6 +3,7 @@
    FeatureImportanceForestRegressor,
    build_hyppo_cv_forest,
    build_hyppo_oob_forest,
    build_coleman_forest,
)
from .monte_carlo import PermutationTest
from .permutationforest import PermutationForestClassifier, PermutationForestRegressor
@@ -15,4 +16,5 @@
"PermutationTest",
"build_hyppo_cv_forest",
"build_hyppo_oob_forest",
"build_coleman_forest",
]
