
[ENH] Allow HonestForest to sample higher than max_samples # of bootstraps (#206)

* Add function for estimating posteriors on oob samples
* Allow HonestForest to bootstrap-sample more than 1.0 * n_samples

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 authored Jan 31, 2024
1 parent 1c53c99 commit 15542ce
Showing 7 changed files with 530 additions and 69 deletions.
2 changes: 2 additions & 0 deletions doc/whats_new/v0.6.rst
@@ -32,6 +32,8 @@ Changelog
- |Enhancement| :class:`sktree.HonestForestClassifier` now has a fitted
  property ``oob_samples_``, which reproduces the sample indices per tree that are
  out of bag, by `Adam Li`_ (:pr:`#200`).
- |Enhancement| :class:`sktree.HonestForestClassifier` now allows bootstrap sampling
  more samples than the number of training samples, controlled by the ``max_samples``
  keyword argument, by `Adam Li`_ (:pr:`#206`).
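A quick sketch of the new behavior (illustrative only, not part of the commit; the data and parameter values below are arbitrary). A float ``max_samples`` above 1.0 is now accepted when ``bootstrap=True``:

import numpy as np
from sktree import HonestForestClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 4))
y = (X[:, 0] > 0).astype(int)

# Draws round(1.6 * 100) = 160 bootstrap samples per tree; previously a float
# max_samples was restricted to the interval (0.0, 1.0].
clf = HonestForestClassifier(
    n_estimators=10, bootstrap=True, max_samples=1.6, random_state=0
)
clf.fit(X, y)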


Code and Documentation Contributors
311 changes: 288 additions & 23 deletions sktree/ensemble/_honest_forest.py
@@ -3,18 +3,65 @@

import threading
from numbers import Integral, Real
from warnings import warn

import numpy as np
from joblib import Parallel, delayed
from scipy.sparse import issparse
from sklearn.base import _fit_context
from sklearn.ensemble._base import _partition_estimators
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.exceptions import DataConversionWarning
from sklearn.utils import check_random_state
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import _check_sample_weight, check_is_fitted

from .._lib.sklearn.ensemble._forest import ForestClassifier, _parallel_build_trees
from .._lib.sklearn.tree._tree import DOUBLE, DTYPE
from ..tree import HonestTreeClassifier


def _get_n_samples_bootstrap(n_samples, max_samples):
    """Get the number of samples in a bootstrap sample.

    XXX: Note this is copied from scikit-learn. We remove the upper bound on
    ``max_samples`` so that more bootstrap samples than ``n_samples`` may be
    drawn, which enables sampling closer to 80% unique training data points
    for the in-bag computation.

    Parameters
    ----------
    n_samples : int
        Number of samples in the dataset.
    max_samples : int or float
        The maximum number of samples to draw from the total available:

        - if float, this indicates a fraction of the total;
        - if int, this indicates the exact number of samples;
        - if None, this indicates the total number of samples.

    Returns
    -------
    n_samples_bootstrap : int
        The total number of samples to draw for the bootstrap sample.
    """
    if max_samples is None:
        return n_samples

    if isinstance(max_samples, Integral):
        return max_samples

    if isinstance(max_samples, Real):
        return round(n_samples * max_samples)

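The "closer to 80% unique" remark can be made concrete with a small sketch (illustrative only; it imports the private helper defined above). Each bootstrap draw misses a fixed training point with probability 1 - 1/n, so m draws leave about 1 - (1 - 1/n)^m ≈ 1 - e^(-m/n) of the points in-bag:

import numpy as np

# Private helper defined in this file (sktree/ensemble/_honest_forest.py).
from sktree.ensemble._honest_forest import _get_n_samples_bootstrap

n = 1000
for max_samples in (None, 1.0, 1.6):
    m = _get_n_samples_bootstrap(n_samples=n, max_samples=max_samples)
    frac_unique = 1 - (1 - 1 / n) ** m  # expected fraction of unique in-bag points
    print(max_samples, m, round(frac_unique, 3))
# None and 1.0 give ~0.632 unique in-bag; max_samples=1.6 gives ~0.798.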

class HonestForestClassifier(ForestClassifier):
@@ -176,8 +223,7 @@ class HonestForestClassifier(ForestClassifier):
        - If None (default), then draw `X.shape[0]` samples.
        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples.

    honest_prior : {"ignore", "uniform", "empirical"}, default="empirical"
        Method for dealing with empty leaves during evaluation of a test
@@ -269,7 +315,7 @@ class labels (multi-output problem).
    oob_samples_ : list of lists, shape=(n_estimators, n_samples_bootstrap)
        The indices of training samples that are "out-of-bag". Only used
        if ``bootstrap=True``.

    Notes
    -----
@@ -422,16 +468,228 @@ def fit(self, X, y, sample_weight=None, classes=None):
        Returns
        -------
        self : object
            Fitted estimator.
        """
        # XXX: This entire function is a copy of what is in scikit-learn,
        # with the exception of:
        # - _get_n_samples_bootstrap is a re-defined function to allow higher
        #   max_samples
        MAX_INT = np.iinfo(np.int32).max

        # Validate or convert input data
        if issparse(y):
            raise ValueError("sparse multilabel-indicator for y is not supported.")

        X, y = self._validate_data(
            X,
            y,
            multi_output=True,
            accept_sparse="csc",
            dtype=DTYPE,
            force_all_finite=False,
        )

        # _compute_missing_values_in_feature_mask checks if X has missing values and
        # will raise an error if the underlying tree base estimator can't handle
        # missing values. Only the criterion is required to determine if the tree
        # supports missing values.
        estimator = type(self.estimator)(criterion=self.criterion)
        missing_values_in_feature_mask = estimator._compute_missing_values_in_feature_mask(
            X, estimator_name=self.__class__.__name__
        )

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        if issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        y = np.atleast_1d(y)
        if y.ndim == 2 and y.shape[1] == 1:
            warn(
                (
                    "A column-vector y was passed when a 1d array was"
                    " expected. Please change the shape of y to "
                    "(n_samples,), for example using ravel()."
                ),
                DataConversionWarning,
                stacklevel=2,
            )

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity;
            # using y[:, np.newaxis] would not.
            y = np.reshape(y, (-1, 1))

        if self.criterion == "poisson":
            if np.any(y < 0):
                raise ValueError(
                    "Some value(s) of y are negative which is "
                    "not allowed for Poisson regression."
                )
            if np.sum(y) <= 0:
                raise ValueError(
                    "Sum of y is not strictly positive which "
                    "is necessary for Poisson regression."
                )

        self._n_samples, self.n_outputs_ = y.shape

        y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        if not self.bootstrap and self.max_samples is not None:
            raise ValueError(
                "`max_samples` cannot be set if `bootstrap=False`. "
                "Either switch to `bootstrap=True` or set "
                "`max_samples=None`."
            )
        elif self.bootstrap:
            n_samples_bootstrap = _get_n_samples_bootstrap(
                n_samples=X.shape[0], max_samples=self.max_samples
            )
        else:
            n_samples_bootstrap = None

        self._n_samples_bootstrap = n_samples_bootstrap

        self._validate_estimator()

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available if bootstrap=True")

        random_state = check_random_state(self.random_state)

        if not self.warm_start or not hasattr(self, "estimators_"):
            # Free allocated memory, if any
            self.estimators_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if self.max_bins is not None:
            # `_openmp_effective_n_threads` is used to take cgroups CPU quotas
            # into account when determining the maximum number of threads to use.
            n_threads = _openmp_effective_n_threads()

            # Bin the data.
            # For ease of use of the API, the user-facing GBDT classes accept the
            # parameter max_bins, which doesn't take into account the bin for
            # missing values (which is always allocated). However, since max_bins
            # isn't the true maximal number of bins, all other private classes
            # (binmapper, histbuilder...) accept n_bins instead, which is the
            # actual total number of bins. Everywhere in the code, the
            # convention is that n_bins == max_bins + 1.
            n_bins = self.max_bins + 1  # + 1 for missing values
            self._bin_mapper = _BinMapper(
                n_bins=n_bins,
                # is_categorical=self.is_categorical_,
                known_categories=None,
                random_state=random_state,
                n_threads=n_threads,
            )
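            # For instance (a sketch of the convention above): max_bins=255
            # yields n_bins=256, with the extra bin reserved for missing values.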

            # XXX: in order for this to work with the underlying tree submodule's
            # Cython code, we need to convert this into the original data's DTYPE,
            # because the Cython code assumes that `DTYPE` is used.
            # The proper implementation will be a lot more complicated and should be
            # tackled once scikit-learn has finalized their inclusion of missing-data
            # and categorical support for decision trees.
            X = self._bin_data(X, is_training_data=True)  # .astype(DTYPE)
        else:
            self._bin_mapper = None

        if n_more_estimators < 0:
            raise ValueError(
                "n_estimators=%d must be larger or equal to "
                "len(estimators_)=%d when warm_start==True"
                % (self.n_estimators, len(self.estimators_))
            )
        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not fit new trees.")
        else:
            if self.warm_start and len(self.estimators_) > 0:
                # We draw from the random state to get the random state we
                # would have got if we hadn't used a warm_start.
                random_state.randint(MAX_INT, size=len(self.estimators_))

            trees = [
                self._make_estimator(append=False, random_state=random_state)
                for i in range(n_more_estimators)
            ]

            # Parallel loop: we prefer the threading backend as the Cython code
            # for fitting the trees is internally releasing the Python GIL,
            # making threading more efficient than multiprocessing in
            # that case. However, for joblib 0.12+ we respect any
            # parallel_backend contexts set at a higher level,
            # since correctness does not rely on using threads.
            trees = Parallel(
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                prefer="threads",
            )(
                delayed(_parallel_build_trees)(
                    t,
                    self.bootstrap,
                    X,
                    y,
                    sample_weight,
                    i,
                    len(trees),
                    verbose=self.verbose,
                    class_weight=self.class_weight,
                    n_samples_bootstrap=n_samples_bootstrap,
                    missing_values_in_feature_mask=missing_values_in_feature_mask,
                    classes=classes,
                )
                for i, t in enumerate(trees)
            )

            # Collect newly grown trees
            self.estimators_.extend(trees)

        if self.oob_score and (n_more_estimators > 0 or not hasattr(self, "oob_score_")):
            y_type = type_of_target(y)
            if y_type == "unknown" or (
                self._estimator_type == "classifier" and y_type == "multiclass-multioutput"
            ):
                # FIXME: we could consider supporting multiclass-multioutput if
                # we introduce or reuse a constructor parameter (e.g. oob_score)
                # allowing the user to pass a callable defining the scoring
                # strategy on OOB samples.
                raise ValueError(
                    "The type of target cannot be used to compute OOB "
                    f"estimates. Got {y_type} while only the following are "
                    "supported: continuous, continuous-multioutput, binary, "
                    "multiclass, multilabel-indicator."
                )

            if callable(self.oob_score):
                self._set_oob_score_and_attributes(X, y, scoring_function=self.oob_score)
            else:
                self._set_oob_score_and_attributes(X, y)

        # Decapsulate classes_ attributes
        if hasattr(self, "classes_") and self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        # Compute the honest decision function
        self.honest_decision_function_ = self._predict_proba(
            X, indices=self.honest_indices_, impute_missing=np.nan
        )

        return self
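A usage sketch of the attributes this method sets (illustrative only; ``_n_samples_bootstrap`` is the private attribute assigned above, and the data and parameter values are arbitrary):

import numpy as np
from sktree import HonestForestClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(120, 5))
y = (X[:, 0] > 0).astype(int)

est = HonestForestClassifier(
    n_estimators=25, bootstrap=True, max_samples=1.6, random_state=0
).fit(X, y)

print(est._n_samples_bootstrap)             # round(1.6 * 120) == 192 draws per tree
print(est.honest_decision_function_.shape)  # honest posteriors, (n_samples, n_classes)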

    def predict_proba(self, X):
@@ -517,19 +775,26 @@ def oob_samples_(self):
        if self.bootstrap is False:
            raise RuntimeError("Cannot extract out-of-bag samples when bootstrap is False.")
        check_is_fitted(self)

        oob_samples = []
        possible_indices = np.arange(self._n_samples)
        for structure_idx, honest_idx in zip(self.structure_indices_, self.honest_indices_):
            # OOB indices are the complement of the in-bag (structure + honest) indices.
            _oob_samples = np.setdiff1d(
                possible_indices, np.concatenate((structure_idx, honest_idx))
            )
            oob_samples.append(_oob_samples)
        # n_samples_bootstrap = _get_n_samples_bootstrap(
        #     self._n_samples,
        #     self.max_samples,
        # )
        # for estimator in self.estimators_:
        #     unsampled_indices = _generate_unsampled_indices(
        #         estimator.random_state,
        #         self._n_samples,
        #         n_samples_bootstrap,
        #     )
        #     oob_samples.append(unsampled_indices)
        return oob_samples
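Continuing the sketch above, the rewritten property encodes a simple invariant: per tree, the out-of-bag indices are exactly the complement of the union of the structure and honest indices.

oob = est.oob_samples_
for tree_idx in range(len(est.estimators_)):
    in_bag = np.concatenate(
        (est.structure_indices_[tree_idx], est.honest_indices_[tree_idx])
    )
    # Every training index is either in-bag (structure/honest) or out-of-bag.
    assert np.array_equal(
        np.union1d(in_bag, oob[tree_idx]), np.arange(est._n_samples)
    )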

def _more_tags(self):
2 changes: 2 additions & 0 deletions sktree/stats/__init__.py
@@ -3,6 +3,7 @@
    FeatureImportanceForestRegressor,
    build_hyppo_cv_forest,
    build_hyppo_oob_forest,
    build_coleman_forest,
)
from .monte_carlo import PermutationTest
from .permutationforest import PermutationForestClassifier, PermutationForestRegressor
@@ -15,4 +16,5 @@
"PermutationTest",
"build_hyppo_cv_forest",
"build_hyppo_oob_forest",
"build_coleman_forest",
]
