diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 27b1251f..bcae1a52 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,8 +66,9 @@ If you added a new inference method, add test code to the `tests/` folder. **CausalML** uses `pytest` for tests. Install `pytest` and `pytest-cov`, and the package dependencies: ```bash -$ pip install pytest pytest-cov -r requirements.txt +$ pip install .[test] ``` +See details for test dependencies in `pyproject.toml` ### Building Cython @@ -75,6 +76,7 @@ In order to run tests, you need to build the Cython modules ```bash $ python setup.py build_ext --inplace ``` +This is important because during testing causalml modules are imported from the source code. ### Testing diff --git a/Makefile b/Makefile index 70258628..4c1c383d 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,7 @@ .PHONY: build_ext -build_ext: install_req clean +build_ext: clean python setup.py build_ext --force --inplace -.PHONY: install_req -install_req: - pip install -r requirements.txt - .PHONY: build build: build_ext python setup.py bdist_wheel diff --git a/causalml/inference/tree/_tree/__init__.py b/causalml/inference/tree/_tree/__init__.py new file mode 100755 index 00000000..20af086a --- /dev/null +++ b/causalml/inference/tree/_tree/__init__.py @@ -0,0 +1,4 @@ +""" +This part of tree structures definition was initially borrowed from +https://github.com/scikit-learn/scikit-learn/tree/1.0.2/sklearn/tree +""" diff --git a/causalml/inference/tree/_tree/_classes.py b/causalml/inference/tree/_tree/_classes.py new file mode 100644 index 00000000..70031b6d --- /dev/null +++ b/causalml/inference/tree/_tree/_classes.py @@ -0,0 +1,322 @@ +""" +This module gathers tree-based methods, including decision, regression and +randomized trees. Single and multi-output problems are both handled. +""" + +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Noel Dawe +# Satrajit Gosh +# Joly Arnaud +# Fares Hedayati +# Nelson Liu +# +# License: BSD 3 clause + + +from abc import ABCMeta +from abc import abstractmethod +from sklearn.base import MultiOutputMixin +from sklearn.base import BaseEstimator +from sklearn.base import is_classifier, clone +from sklearn.utils import Bunch +from sklearn.utils.validation import check_is_fitted + +import numpy as np +from scipy.sparse import issparse + +from ._tree import Tree +from ._tree import _build_pruned_tree_ccp +from ._tree import ccp_pruning_path +from . import _tree, _splitter + +DTYPE = _tree.DTYPE +DOUBLE = _tree.DOUBLE + +DENSE_SPLITTERS = { + "best": _splitter.BestSplitter, + "random": _splitter.RandomSplitter, +} + +SPARSE_SPLITTERS = { + "best": _splitter.BestSparseSplitter, + "random": _splitter.RandomSparseSplitter, +} + + +# ============================================================================= +# Base decision tree +# ============================================================================= + + +class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for decision trees. + + Warning: This class should not be used directly. + Use derived classes instead. 
+ """ + + @abstractmethod + def __init__( + self, + *, + criterion, + splitter, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_features, + max_leaf_nodes, + random_state, + min_impurity_decrease, + class_weight=None, + ccp_alpha=0.0, + ): + self.criterion = criterion + self.splitter = splitter + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.class_weight = class_weight + self.ccp_alpha = ccp_alpha + + @abstractmethod + def fit( + self, X, y, sample_weight=None, check_input=True, X_idx_sorted="deprecated" + ): + pass + + def get_depth(self): + """Return the depth of the decision tree. + + The depth of a tree is the maximum distance between the root + and any leaf. + + Returns + ------- + self.tree_.max_depth : int + The maximum depth of the tree. + """ + check_is_fitted(self) + return self.tree_.max_depth + + def get_n_leaves(self): + """Return the number of leaves of the decision tree. + + Returns + ------- + self.tree_.n_leaves : int + Number of leaves. + """ + check_is_fitted(self) + return self.tree_.n_leaves + + def _validate_X_predict(self, X, check_input): + """Validate the training data on predict (probabilities).""" + if check_input: + X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) + if issparse(X) and ( + X.indices.dtype != np.intc or X.indptr.dtype != np.intc + ): + raise ValueError("No support for np.int64 index based sparse matrices") + else: + # The number of features is checked regardless of `check_input` + self._check_n_features(X, reset=False) + return X + + def predict(self, X, check_input=True): + """Predict class or regression value for X. + + For a classification model, the predicted class for each sample in X is + returned. For a regression model, the predicted value based on X is + returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + Returns + ------- + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The predicted classes, or the predict values. + """ + check_is_fitted(self) + X = self._validate_X_predict(X, check_input) + proba = self.tree_.predict(X) + n_samples = X.shape[0] + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + return self.classes_.take(np.argmax(proba, axis=1), axis=0) + + else: + class_type = self.classes_[0].dtype + predictions = np.zeros((n_samples, self.n_outputs_), dtype=class_type) + for k in range(self.n_outputs_): + predictions[:, k] = self.classes_[k].take( + np.argmax(proba[:, k], axis=1), axis=0 + ) + + return predictions + + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, 0] + + else: + return proba[:, :, 0] + + def apply(self, X, check_input=True): + """Return the index of the leaf that each sample is predicted as. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. 
Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + Returns + ------- + X_leaves : array-like of shape (n_samples,) + For each datapoint x in X, return the index of the leaf x + ends up in. Leaves are numbered within + ``[0; self.tree_.node_count)``, possibly with gaps in the + numbering. + """ + check_is_fitted(self) + X = self._validate_X_predict(X, check_input) + return self.tree_.apply(X) + + def decision_path(self, X, check_input=True): + """Return the decision path in the tree. + + .. versionadded:: 0.18 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + Returns + ------- + indicator : sparse matrix of shape (n_samples, n_nodes) + Return a node indicator CSR matrix where non zero elements + indicates that the samples goes through the nodes. + """ + X = self._validate_X_predict(X, check_input) + return self.tree_.decision_path(X) + + def _prune_tree(self): + """Prune tree using Minimal Cost-Complexity Pruning.""" + check_is_fitted(self) + + if self.ccp_alpha < 0.0: + raise ValueError("ccp_alpha must be greater than or equal to 0") + + if self.ccp_alpha == 0.0: + return + + # build pruned tree + if is_classifier(self): + n_classes = np.atleast_1d(self.n_classes_) + pruned_tree = Tree(self.n_features_in_, n_classes, self.n_outputs_) + else: + pruned_tree = Tree( + self.n_features_in_, + # TODO: the tree shouldn't need this param + np.array([1] * self.n_outputs_, dtype=np.intp), + self.n_outputs_, + ) + _build_pruned_tree_ccp(pruned_tree, self.tree_, self.ccp_alpha) + + self.tree_ = pruned_tree + + def cost_complexity_pruning_path(self, X, y, sample_weight=None): + """Compute the pruning path during Minimal Cost-Complexity Pruning. + + See :ref:`minimal_cost_complexity_pruning` for details on the pruning + process. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + Returns + ------- + ccp_path : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + ccp_alphas : ndarray + Effective alphas of subtree during pruning. + + impurities : ndarray + Sum of the impurities of the subtree leaves for the + corresponding alpha value in ``ccp_alphas``. 
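Because `BaseDecisionTree` is abstract, the pruning-path API is easiest to exercise through scikit-learn's own `DecisionTreeClassifier`, which exposes the same method and return structure. A rough usage sketch; the data and parameter values below are illustrative only:

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.rand(200, 4)
y = (X[:, 0] + 0.1 * rng.randn(200) > 0.5).astype(int)

# ccp_alphas / impurities describe the pruning sequence from the full tree to the root.
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(X, y)
for alpha in path.ccp_alphas:
    pruned = DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X, y)
    print(f"alpha={alpha:.4f}  leaves={pruned.get_n_leaves()}")
```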
+ """ + est = clone(self).set_params(ccp_alpha=0.0) + est.fit(X, y, sample_weight=sample_weight) + return Bunch(**ccp_pruning_path(est.tree_)) + + @property + def feature_importances_(self): + """Return the feature importances. + + The importance of a feature is computed as the (normalized) total + reduction of the criterion brought by that feature. + It is also known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + Normalized total reduction of criteria by feature + (Gini importance). + """ + check_is_fitted(self) + + return self.tree_.compute_feature_importances() diff --git a/causalml/inference/tree/_tree/_criterion.pxd b/causalml/inference/tree/_tree/_criterion.pxd new file mode 100644 index 00000000..762e3509 --- /dev/null +++ b/causalml/inference/tree/_tree/_criterion.pxd @@ -0,0 +1,78 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Joel Nothman +# Arnaud Joly +# Jacob Schreiber +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +import numpy as np +cimport numpy as cnp + +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport UINT32_t # Unsigned 32 bit integer + +cdef class Criterion: + # The criterion computes the impurity of a node and the reduction of + # impurity of a split on that node. It also computes the output statistics + # such as the mean in regression and class probabilities in classification. + + # Internal structures + cdef const DOUBLE_t[:, ::1] y # Values of y + cdef DOUBLE_t* sample_weight # Sample weights + + cdef SIZE_t* samples # Sample indices in X, y + cdef SIZE_t start # samples[start:pos] are the samples in the left node + cdef SIZE_t pos # samples[pos:end] are the samples in the right node + cdef SIZE_t end + + cdef SIZE_t n_outputs # Number of outputs + cdef SIZE_t n_samples # Number of samples + cdef SIZE_t n_node_samples # Number of samples in the node (end-start) + cdef double weighted_n_samples # Weighted number of samples (in total) + cdef double weighted_n_node_samples # Weighted number of samples in the node + cdef double weighted_n_left # Weighted number of samples in the left node + cdef double weighted_n_right # Weighted number of samples in the right node + + cdef double* sum_total # For classification criteria, the sum of the + # weighted count of each label. For regression, + # the sum of w*y. sum_total[k] is equal to + # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], + # where k is output index. + cdef double* sum_left # Same as above, but for the left side of the split + cdef double* sum_right # same as above, but for the right side of the split + + # The criterion object is maintained such that left and right collected + # statistics correspond to samples[start:pos] and samples[pos:end]. 
+ + # Methods + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, + double weighted_n_samples, SIZE_t* samples, SIZE_t start, + SIZE_t end) nogil except -1 + cdef int reset(self) nogil except -1 + cdef int reverse_reset(self) nogil except -1 + cdef int update(self, SIZE_t new_pos) nogil except -1 + cdef double node_impurity(self) nogil + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil + cdef void node_value(self, double* dest) nogil + cdef double impurity_improvement(self, double impurity_parent, + double impurity_left, + double impurity_right) nogil + cdef double proxy_impurity_improvement(self) nogil + + +cdef class RegressionCriterion(Criterion): + """Abstract regression criterion.""" + + cdef double sq_sum_total diff --git a/causalml/inference/tree/_tree/_criterion.pyx b/causalml/inference/tree/_tree/_criterion.pyx new file mode 100755 index 00000000..ae281edd --- /dev/null +++ b/causalml/inference/tree/_tree/_criterion.pyx @@ -0,0 +1,398 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Noel Dawe +# Satrajit Gosh +# Lars Buitinck +# Arnaud Joly +# Joel Nothman +# Fares Hedayati +# Jacob Schreiber +# Nelson Liu +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +from libc.stdlib cimport calloc +from libc.stdlib cimport free +from libc.string cimport memcpy +from libc.string cimport memset +from libc.math cimport fabs + +import numpy as np +cimport numpy as cnp +cnp.import_array() + + +cdef class Criterion: + """Interface for impurity criteria. + + This object stores methods on how to calculate how good a split is using + different metrics. + """ + + def __dealloc__(self): + """Destructor.""" + free(self.sum_total) + free(self.sum_left) + free(self.sum_right) + + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, + double weighted_n_samples, SIZE_t* samples, SIZE_t start, + SIZE_t end) nogil except -1: + """Placeholder for a method which will initialize the criterion. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + y : array-like, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + sample_weight : array-like, dtype=DOUBLE_t + The weight of each sample + weighted_n_samples : double + The total weight of the samples being considered + samples : array-like, dtype=SIZE_t + Indices of the samples in X and y, where samples[start:end] + correspond to the samples in this node + start : SIZE_t + The first sample to be used on this node + end : SIZE_t + The last sample used on this node + + """ + pass + + cdef int reset(self) nogil except -1: + """Reset the criterion at pos=start. + + This method must be implemented by the subclass. + """ + pass + + cdef int reverse_reset(self) nogil except -1: + """Reset the criterion at pos=end. + + This method must be implemented by the subclass. + """ + pass + + cdef int update(self, SIZE_t new_pos) nogil except -1: + """Updated statistics by moving samples[pos:new_pos] to the left child. + + This updates the collected statistics by moving samples[pos:new_pos] + from the right child to the left child. It must be implemented by + the subclass. 
+ + Parameters + ---------- + new_pos : SIZE_t + New starting index position of the samples in the right child + """ + pass + + cdef double node_impurity(self) nogil: + """Placeholder for calculating the impurity of the node. + + Placeholder for a method which will evaluate the impurity of + the current node, i.e. the impurity of samples[start:end]. This is the + primary function of the criterion class. The smaller the impurity the + better. + """ + pass + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + """Placeholder for calculating the impurity of children. + + Placeholder for a method which evaluates the impurity in + children nodes, i.e. the impurity of samples[start:pos] + the impurity + of samples[pos:end]. + + Parameters + ---------- + impurity_left : double pointer + The memory address where the impurity of the left child should be + stored. + impurity_right : double pointer + The memory address where the impurity of the right child should be + stored + """ + pass + + cdef void node_value(self, double* dest) nogil: + """Placeholder for storing the node value. + + Placeholder for a method which will compute the node value + of samples[start:end] and save the value into dest. + + Parameters + ---------- + dest : double pointer + The memory address where the node value should be stored. + """ + pass + + cdef double proxy_impurity_improvement(self) nogil: + """Compute a proxy of the impurity reduction. + + This method is used to speed up the search for the best split. + It is a proxy quantity such that the split that maximizes this value + also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. + + The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + """ + cdef double impurity_left + cdef double impurity_right + self.children_impurity(&impurity_left, &impurity_right) + + return (- self.weighted_n_right * impurity_right + - self.weighted_n_left * impurity_left) + + cdef double impurity_improvement(self, double impurity_parent, + double impurity_left, + double impurity_right) nogil: + """Compute the improvement in impurity. + + This method computes the improvement in impurity when a split occurs. + The weighted impurity improvement equation is the following: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where N is the total number of samples, N_t is the number of samples + at the current node, N_t_L is the number of samples in the left child, + and N_t_R is the number of samples in the right child, + + Parameters + ---------- + impurity_parent : double + The initial impurity of the parent node before the split + + impurity_left : double + The impurity of the left child + + impurity_right : double + The impurity of the right child + + Return + ------ + double : improvement in impurity after the split occurs + """ + return ((self.weighted_n_node_samples / self.weighted_n_samples) * + (impurity_parent - (self.weighted_n_right / + self.weighted_n_node_samples * impurity_right) + - (self.weighted_n_left / + self.weighted_n_node_samples * impurity_left))) + + +cdef class RegressionCriterion(Criterion): + r"""Abstract regression criterion. + + This handles cases where the target is a continuous value, and is + evaluated by computing the variance of the target values left and right + of the split point. 
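To ground the weighted improvement formula quoted above, here is a small Python sketch that evaluates it directly from the (weighted) sample counts and impurities; all numbers are invented for illustration:

```python
def impurity_improvement(weighted_n_samples, weighted_n_node_samples,
                         weighted_n_left, weighted_n_right,
                         impurity_parent, impurity_left, impurity_right):
    """N_t / N * (impurity - N_t_R / N_t * right - N_t_L / N_t * left)."""
    return (weighted_n_node_samples / weighted_n_samples) * (
        impurity_parent
        - (weighted_n_right / weighted_n_node_samples) * impurity_right
        - (weighted_n_left / weighted_n_node_samples) * impurity_left
    )

# A node holding 100 of 1000 weighted samples, sending 60 left and 40 right:
# 0.1 * (0.5 - 0.4 * 0.3 - 0.6 * 0.2) = 0.026
print(impurity_improvement(1000.0, 100.0, 60.0, 40.0, 0.50, 0.20, 0.30))
```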
The computation takes linear time with `n_samples` + by using :: + + var = \sum_i^n (y_i - y_bar) ** 2 + = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 + """ + + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): + """Initialize parameters for this criterion. + + Parameters + ---------- + n_outputs : SIZE_t + The number of targets to be predicted + + n_samples : SIZE_t + The total number of samples to fit on + """ + # Default values + self.sample_weight = NULL + + self.samples = NULL + self.start = 0 + self.pos = 0 + self.end = 0 + + self.n_outputs = n_outputs + self.n_samples = n_samples + self.n_node_samples = 0 + self.weighted_n_node_samples = 0.0 + self.weighted_n_left = 0.0 + self.weighted_n_right = 0.0 + + self.sq_sum_total = 0.0 + + # Allocate accumulators. Make sure they are NULL, not uninitialized, + # before an exception can be raised (which triggers __dealloc__). + self.sum_total = NULL + self.sum_left = NULL + self.sum_right = NULL + + # Allocate memory for the accumulators + self.sum_total = calloc(n_outputs, sizeof(double)) + self.sum_left = calloc(n_outputs, sizeof(double)) + self.sum_right = calloc(n_outputs, sizeof(double)) + + if (self.sum_total == NULL or + self.sum_left == NULL or + self.sum_right == NULL): + raise MemoryError() + + def __reduce__(self): + return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) + + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, + double weighted_n_samples, SIZE_t* samples, SIZE_t start, + SIZE_t end) nogil except -1: + """Initialize the criterion. + + This initializes the criterion at node samples[start:end] and children + samples[start:start] and samples[start:end]. + """ + # Initialize fields + self.y = y + self.sample_weight = sample_weight + self.samples = samples + self.start = start + self.end = end + self.n_node_samples = end - start + self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0. 
+ + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t y_ik + cdef DOUBLE_t w_y_ik + cdef DOUBLE_t w = 1.0 + + self.sq_sum_total = 0.0 + memset(self.sum_total, 0, self.n_outputs * sizeof(double)) + + for p in range(start, end): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + y_ik = self.y[i, k] + w_y_ik = w * y_ik + self.sum_total[k] += w_y_ik + self.sq_sum_total += w_y_ik * y_ik + + self.weighted_n_node_samples += w + + # Reset to pos=start + self.reset() + return 0 + + cdef int reset(self) nogil except -1: + """Reset the criterion at pos=start.""" + cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) + memset(self.sum_left, 0, n_bytes) + memcpy(self.sum_right, self.sum_total, n_bytes) + + self.weighted_n_left = 0.0 + self.weighted_n_right = self.weighted_n_node_samples + self.pos = self.start + return 0 + + cdef int reverse_reset(self) nogil except -1: + """Reset the criterion at pos=end.""" + cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) + memset(self.sum_right, 0, n_bytes) + memcpy(self.sum_left, self.sum_total, n_bytes) + + self.weighted_n_right = 0.0 + self.weighted_n_left = self.weighted_n_node_samples + self.pos = self.end + return 0 + + cdef int update(self, SIZE_t new_pos) nogil except -1: + """Updated statistics by moving samples[pos:new_pos] to the left.""" + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + cdef double* sum_total = self.sum_total + + cdef double* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + + cdef SIZE_t pos = self.pos + cdef SIZE_t end = self.end + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t w = 1.0 + + # Update statistics up to new_pos + # + # Given that + # sum_left[x] + sum_right[x] = sum_total[x] + # and that sum_total is known, we are going to update + # sum_left from the direction that require the least amount + # of computations, i.e. from pos to new_pos or from end to new_pos. 
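The comment above is the core trick in `update`: `sum_total` is fixed for the node, so only `sum_left` needs to be accumulated, from whichever end requires fewer additions, and `sum_right` then follows by subtraction. A rough single-output NumPy sketch of the same idea, with toy arrays standing in for the real buffers:

```python
import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])   # single-output targets
w = np.ones_like(y)                             # sample weights
sum_total = (w * y).sum()

def move_split(pos, new_pos, sum_left):
    """Advance the split from pos to new_pos, updating sum_left incrementally."""
    if (new_pos - pos) <= (len(y) - new_pos):
        # Cheaper to add the samples that enter the left child ...
        sum_left += (w[pos:new_pos] * y[pos:new_pos]).sum()
    else:
        # ... otherwise rebuild sum_left from the right end (reverse_reset analogue).
        sum_left = sum_total - (w[new_pos:] * y[new_pos:]).sum()
    return float(sum_left), float(sum_total - sum_left)   # sum_right by subtraction

print(move_split(0, 2, 0.0))   # (3.0, 18.0)
print(move_split(2, 5, 3.0))   # (15.0, 6.0)
```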
+ if (new_pos - pos) <= (end - new_pos): + for p in range(pos, new_pos): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + sum_left[k] += w * self.y[i, k] + + self.weighted_n_left += w + else: + self.reverse_reset() + + for p in range(end - 1, new_pos - 1, -1): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + sum_left[k] -= w * self.y[i, k] + + self.weighted_n_left -= w + + self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_left) + for k in range(self.n_outputs): + sum_right[k] = sum_total[k] - sum_left[k] + + self.pos = new_pos + return 0 + + cdef double node_impurity(self) nogil: + pass + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + pass + + cdef void node_value(self, double* dest) nogil: + """Compute the node value of samples[start:end] into dest.""" + cdef SIZE_t k + + for k in range(self.n_outputs): + dest[k] = self.sum_total[k] / self.weighted_n_node_samples diff --git a/causalml/inference/tree/_tree/_splitter.pxd b/causalml/inference/tree/_tree/_splitter.pxd new file mode 100644 index 00000000..5b63fc1d --- /dev/null +++ b/causalml/inference/tree/_tree/_splitter.pxd @@ -0,0 +1,98 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Joel Nothman +# Arnaud Joly +# Jacob Schreiber +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +import numpy as np +cimport numpy as cnp + +from ._criterion cimport Criterion + +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport UINT32_t # Unsigned 32 bit integer + + +cdef struct SplitRecord: + # Data to track sample split + SIZE_t feature # Which feature to split on. + SIZE_t pos # Split samples array at the given position, + # i.e. count of samples below threshold for feature. + # pos is >= end if the node is a leaf. + double threshold # Threshold to split at. + double improvement # Impurity improvement given parent node. + double impurity_left # Impurity of the left split. + double impurity_right # Impurity of the right split. + +cdef class Splitter: + # The splitter searches in the input space for a feature and a threshold + # to split the samples samples[start:end]. + # + # The impurity computations are delegated to a criterion object. + + # Internal structures + cdef public Criterion criterion # Impurity criterion + cdef public SIZE_t max_features # Number of features to test + cdef public SIZE_t min_samples_leaf # Min samples in a leaf + cdef public double min_weight_leaf # Minimum weight in a leaf + + cdef object random_state # Random state + cdef UINT32_t rand_r_state # sklearn_rand_r random number state + + cdef SIZE_t* samples # Sample indices in X, y + cdef SIZE_t n_samples # X.shape[0] + cdef double weighted_n_samples # Weighted number of samples + cdef SIZE_t* features # Feature indices in X + cdef SIZE_t* constant_features # Constant features indices + cdef SIZE_t n_features # X.shape[1] + cdef DTYPE_t* feature_values # temp. 
array holding feature values + + cdef SIZE_t start # Start position for the current node + cdef SIZE_t end # End position for the current node + + cdef const DOUBLE_t[:, ::1] y + cdef DOUBLE_t* sample_weight + + # The samples vector `samples` is maintained by the Splitter object such + # that the samples contained in a node are contiguous. With this setting, + # `node_split` reorganizes the node samples `samples[start:end]` in two + # subsets `samples[start:pos]` and `samples[pos:end]`. + + # The 1-d `features` array of size n_features contains the features + # indices and allows fast sampling without replacement of features. + + # The 1-d `constant_features` array of size n_features holds in + # `constant_features[:n_constant_features]` the feature ids with + # constant values for all the samples that reached a specific node. + # The value `n_constant_features` is given by the parent node to its + # child nodes. The content of the range `[n_constant_features:]` is left + # undefined, but preallocated for performance reasons + # This allows optimization with depth-based tree building. + + # Methods + cdef int init(self, object X, const DOUBLE_t[:, ::1] y, + DOUBLE_t* sample_weight) except -1 + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) nogil except -1 + + cdef int node_split(self, + double impurity, # Impurity of the node + SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1 + + cdef void node_value(self, double* dest) nogil + + cdef double node_impurity(self) nogil diff --git a/causalml/inference/tree/_tree/_splitter.pyx b/causalml/inference/tree/_tree/_splitter.pyx new file mode 100644 index 00000000..f72bcce9 --- /dev/null +++ b/causalml/inference/tree/_tree/_splitter.pyx @@ -0,0 +1,1547 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Noel Dawe +# Satrajit Gosh +# Lars Buitinck +# Arnaud Joly +# Joel Nothman +# Fares Hedayati +# Jacob Schreiber +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +from ._criterion cimport Criterion + +from libc.stdlib cimport free +from libc.stdlib cimport qsort +from libc.string cimport memcpy +from libc.string cimport memset + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +from scipy.sparse import csc_matrix + +from ._utils cimport log +from ._utils cimport rand_int +from ._utils cimport rand_uniform +from ._utils cimport RAND_R_MAX +from ._utils cimport safe_realloc + +cdef double INFINITY = np.inf + +# Mitigate precision differences between 32 bit and 64 bit +cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 + +# Constant to switch between algorithm non zero value extract algorithm +# in SparseSplitter +cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1 + +cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil: + self.impurity_left = INFINITY + self.impurity_right = INFINITY + self.pos = start_pos + self.feature = 0 + self.threshold = 0. + self.improvement = -INFINITY + +cdef class Splitter: + """Abstract splitter class. + + Splitters are called by tree builders to find the best splits on both + sparse and dense data, one split at a time. + """ + + def __cinit__(self, Criterion criterion, SIZE_t max_features, + SIZE_t min_samples_leaf, double min_weight_leaf, + object random_state): + """ + Parameters + ---------- + criterion : Criterion + The criterion to measure the quality of a split. 
+ + max_features : SIZE_t + The maximal number of randomly selected features which can be + considered for a split. + + min_samples_leaf : SIZE_t + The minimal number of samples each leaf can have, where splits + which would result in having less samples in a leaf are not + considered. + + min_weight_leaf : double + The minimal weight each leaf can have, where the weight is the sum + of the weights of each sample in it. + + random_state : object + The user inputted random state to be used for pseudo-randomness + """ + + self.criterion = criterion + + self.samples = NULL + self.n_samples = 0 + self.features = NULL + self.n_features = 0 + self.feature_values = NULL + + self.sample_weight = NULL + + self.max_features = max_features + self.min_samples_leaf = min_samples_leaf + self.min_weight_leaf = min_weight_leaf + self.random_state = random_state + + def __dealloc__(self): + """Destructor.""" + + free(self.samples) + free(self.features) + free(self.constant_features) + free(self.feature_values) + + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int init(self, + object X, + const DOUBLE_t[:, ::1] y, + DOUBLE_t* sample_weight) except -1: + """Initialize the splitter. + + Take in the input data X, the target Y, and optional sample weights. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + X : object + This contains the inputs. Usually it is a 2d numpy array. + + y : ndarray, dtype=DOUBLE_t + This is the vector of targets, or true labels, for the samples + + sample_weight : DOUBLE_t* + The weights of the samples, where higher weighted samples are fit + closer than lower weight samples. If not provided, all samples + are assumed to have uniform weight. + """ + + self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef SIZE_t n_samples = X.shape[0] + + # Create a new array which will be used to store nonzero + # samples from the feature of interest + cdef SIZE_t* samples = safe_realloc(&self.samples, n_samples) + + cdef SIZE_t i, j + cdef double weighted_n_samples = 0.0 + j = 0 + + for i in range(n_samples): + # Only work with positively weighted samples + if sample_weight == NULL or sample_weight[i] != 0.0: + samples[j] = i + j += 1 + + if sample_weight != NULL: + weighted_n_samples += sample_weight[i] + else: + weighted_n_samples += 1.0 + + # Number of samples is number of positively weighted samples + self.n_samples = j + self.weighted_n_samples = weighted_n_samples + + cdef SIZE_t n_features = X.shape[1] + cdef SIZE_t* features = safe_realloc(&self.features, n_features) + + for i in range(n_features): + features[i] = i + + self.n_features = n_features + + safe_realloc(&self.feature_values, n_samples) + safe_realloc(&self.constant_features, n_features) + + self.y = y + + self.sample_weight = sample_weight + return 0 + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) nogil except -1: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
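For orientation, the sample bookkeeping in `Splitter.init` above amounts to keeping only non-zero-weight rows and totalling the weights; roughly, in NumPy terms (toy weights rather than the C pointers used here):

```python
import numpy as np

sample_weight = np.array([1.0, 0.0, 2.0, 0.5])    # zero-weight rows are dropped
samples = np.flatnonzero(sample_weight != 0.0)    # indices the splitter keeps
n_samples = samples.size                          # count of retained samples
weighted_n_samples = sample_weight.sum()          # total weight
print(samples, n_samples, weighted_n_samples)     # [0 2 3] 3 3.5
```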
+ + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + + self.start = start + self.end = end + + self.criterion.init(self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples, + start, + end) + + weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples + return 0 + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + """ + + pass + + cdef void node_value(self, double* dest) nogil: + """Copy the value of node samples[start:end] into dest.""" + + self.criterion.node_value(dest) + + cdef double node_impurity(self) nogil: + """Return the impurity of the current node.""" + + return self.criterion.node_impurity() + + +cdef class BaseDenseSplitter(Splitter): + cdef const DTYPE_t[:, :] X + + cdef SIZE_t n_total_samples + + cdef int init(self, + object X, + const DOUBLE_t[:, ::1] y, + DOUBLE_t* sample_weight) except -1: + """Initialize the splitter + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + + # Call parent init + Splitter.init(self, X, y, sample_weight) + + self.X = X + return 0 + + +cdef class BestSplitter(BaseDenseSplitter): + """Splitter for finding the best split.""" + def __reduce__(self): + return (BestSplitter, (self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state), self.__getstate__()) + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find the best split on node samples[start:end] + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
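Stripped of the constant-feature bookkeeping and the Cython buffers, the search implemented below boils down to: sort the node's samples by one feature, walk the boundaries between distinct values, and keep the split with the best criterion value. A rough single-feature sketch using variance reduction as a stand-in criterion; names and data are illustrative:

```python
import numpy as np

def best_split_1d(x, y, min_samples_leaf=1):
    """Return (threshold, score) of the best variance-reducing split on one feature."""
    order = np.argsort(x)
    xs, ys = x[order], y[order]
    best = (None, -np.inf)
    for pos in range(min_samples_leaf, len(xs) - min_samples_leaf + 1):
        if xs[pos] <= xs[pos - 1] + 1e-7:          # not a real value boundary
            continue
        left, right = ys[:pos], ys[pos:]
        # Proxy for the improvement: negative weighted child variances.
        score = -(left.size * left.var() + right.size * right.var())
        if score > best[1]:
            best = (xs[pos - 1] / 2.0 + xs[pos] / 2.0, score)   # midpoint threshold
    return best

x = np.array([0.1, 0.35, 0.4, 0.8, 0.9])
y = np.array([1.0, 1.1, 1.2, 3.0, 3.2])
print(best_split_1d(x, y))   # best boundary lies between 0.4 and 0.8 -> threshold ~0.6
```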
+ """ + # Find the best split + cdef SIZE_t* samples = self.samples + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + + cdef SIZE_t* features = self.features + cdef SIZE_t* constant_features = self.constant_features + cdef SIZE_t n_features = self.n_features + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t max_features = self.max_features + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef UINT32_t* random_state = &self.rand_r_state + + cdef SplitRecord best, current + cdef double current_proxy_improvement = -INFINITY + cdef double best_proxy_improvement = -INFINITY + + cdef SIZE_t f_i = n_features + cdef SIZE_t f_j + cdef SIZE_t p + cdef SIZE_t feature_idx_offset + cdef SIZE_t feature_offset + cdef SIZE_t i + cdef SIZE_t j + + cdef SIZE_t n_visited_features = 0 + # Number of features discovered to be constant during the split search + cdef SIZE_t n_found_constants = 0 + # Number of features known to be constant and drawn without replacement + cdef SIZE_t n_drawn_constants = 0 + cdef SIZE_t n_known_constants = n_constant_features[0] + # n_total_constants = n_known_constants + n_found_constants + cdef SIZE_t n_total_constants = n_known_constants + cdef DTYPE_t current_feature_value + cdef SIZE_t partition_end + + _init_split(&best, end) + + # Sample up to max_features without replacement using a + # Fisher-Yates-based algorithm (using the local variables `f_i` and + # `f_j` to compute a permutation of the `features` array). + # + # Skip the CPU intensive evaluation of the impurity criterion for + # features that were already detected as constant (hence not suitable + # for good splitting) by ancestor nodes and save the information on + # newly discovered constant features to spare computation on descendant + # nodes. + while (f_i > n_total_constants and # Stop early if remaining features + # are constant + (n_visited_features < max_features or + # At least one drawn features must be non constant + n_visited_features <= n_found_constants + n_drawn_constants)): + + n_visited_features += 1 + + # Loop invariant: elements of features in + # - [:n_drawn_constant[ holds drawn and known constant features; + # - [n_drawn_constant:n_known_constant[ holds known constant + # features that haven't been drawn yet; + # - [n_known_constant:n_total_constant[ holds newly found constant + # features; + # - [n_total_constant:f_i[ holds features that haven't been drawn + # yet and aren't constant apriori. + # - [f_i:n_features[ holds features that have been drawn + # and aren't constant. + + # Draw a feature at random + f_j = rand_int(n_drawn_constants, f_i - n_found_constants, + random_state) + + if f_j < n_known_constants: + # f_j in the interval [n_drawn_constants, n_known_constants[ + features[n_drawn_constants], features[f_j] = features[f_j], features[n_drawn_constants] + + n_drawn_constants += 1 + + else: + # f_j in the interval [n_known_constants, f_i - n_found_constants[ + f_j += n_found_constants + # f_j in the interval [n_total_constants, f_i[ + current.feature = features[f_j] + + # Sort samples along that feature; by + # copying the values into an array and + # sorting the array in a manner which utilizes the cache more + # effectively. 
+ for i in range(start, end): + Xf[i] = self.X[samples[i], current.feature] + + sort(Xf + start, samples + start, end - start) + + if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD: + features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j] + + n_found_constants += 1 + n_total_constants += 1 + + else: + f_i -= 1 + features[f_i], features[f_j] = features[f_j], features[f_i] + + # Evaluate all splits + self.criterion.reset() + p = start + + while p < end: + while (p + 1 < end and + Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD): + p += 1 + + # (p + 1 >= end) or (X[samples[p + 1], current.feature] > + # X[samples[p], current.feature]) + p += 1 + # (p >= end) or (X[samples[p], current.feature] > + # X[samples[p - 1], current.feature]) + + if p < end: + current.pos = p + + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue + + self.criterion.update(current.pos) + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + # sum of halves is used to avoid infinite value + current.threshold = Xf[p - 1] / 2.0 + Xf[p] / 2.0 + + if ((current.threshold == Xf[p]) or + (current.threshold == INFINITY) or + (current.threshold == -INFINITY)): + current.threshold = Xf[p - 1] + + best = current # copy + + # Reorganize into samples[start:best.pos] + samples[best.pos:end] + if best.pos < end: + partition_end = end + p = start + + while p < partition_end: + if self.X[samples[p], best.feature] <= best.threshold: + p += 1 + + else: + partition_end -= 1 + + samples[p], samples[partition_end] = samples[partition_end], samples[p] + + self.criterion.reset() + self.criterion.update(best.pos) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) + + # Respect invariant for constant features: the original order of + # element in features[:n_known_constants] must be preserved for sibling + # and child nodes + memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants) + + # Copy newly found constant features + memcpy(constant_features + n_known_constants, + features + n_known_constants, + sizeof(SIZE_t) * n_found_constants) + + # Return values + split[0] = best + n_constant_features[0] = n_total_constants + return 0 + + +# Sort n-element arrays pointed to by Xf and samples, simultaneously, +# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997). +cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil: + if n == 0: + return + cdef int maxd = 2 * log(n) + introsort(Xf, samples, n, maxd) + + +cdef inline void swap(DTYPE_t* Xf, SIZE_t* samples, + SIZE_t i, SIZE_t j) nogil: + # Helper for sort + Xf[i], Xf[j] = Xf[j], Xf[i] + samples[i], samples[j] = samples[j], samples[i] + + +cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) nogil: + # Median of three pivot selection, after Bentley and McIlroy (1993). + # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. 
+ cdef DTYPE_t a = Xf[0], b = Xf[n / 2], c = Xf[n - 1] + if a < b: + if b < c: + return b + elif a < c: + return c + else: + return a + elif b < c: + if a < c: + return a + else: + return c + else: + return b + + +# Introsort with median of 3 pivot selection and 3-way partition function +# (robust to repeated elements, e.g. lots of zero features). +cdef void introsort(DTYPE_t* Xf, SIZE_t *samples, + SIZE_t n, int maxd) nogil: + cdef DTYPE_t pivot + cdef SIZE_t i, l, r + + while n > 1: + if maxd <= 0: # max depth limit exceeded ("gone quadratic") + heapsort(Xf, samples, n) + return + maxd -= 1 + + pivot = median3(Xf, n) + + # Three-way partition. + i = l = 0 + r = n + while i < r: + if Xf[i] < pivot: + swap(Xf, samples, i, l) + i += 1 + l += 1 + elif Xf[i] > pivot: + r -= 1 + swap(Xf, samples, i, r) + else: + i += 1 + + introsort(Xf, samples, l, maxd) + Xf += r + samples += r + n -= r + + +cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples, + SIZE_t start, SIZE_t end) nogil: + # Restore heap order in Xf[start:end] by moving the max element to start. + cdef SIZE_t child, maxind, root + + root = start + while True: + child = root * 2 + 1 + + # find max of root, left child, right child + maxind = root + if child < end and Xf[maxind] < Xf[child]: + maxind = child + if child + 1 < end and Xf[maxind] < Xf[child + 1]: + maxind = child + 1 + + if maxind == root: + break + else: + swap(Xf, samples, root, maxind) + root = maxind + + +cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil: + cdef SIZE_t start, end + + # heapify + start = (n - 2) / 2 + end = n + while True: + sift_down(Xf, samples, start, end) + if start == 0: + break + start -= 1 + + # sort by shrinking the heap, putting the max element immediately after it + end = n - 1 + while end > 0: + swap(Xf, samples, 0, end) + sift_down(Xf, samples, 0, end) + end = end - 1 + + +cdef class RandomSplitter(BaseDenseSplitter): + """Splitter for finding the best random split.""" + def __reduce__(self): + return (RandomSplitter, (self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state), self.__getstate__()) + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find the best random split on node samples[start:end] + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
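For reference, the `sort` / `introsort` / `heapsort` helpers above co-sort the feature-value buffer and the sample-index buffer by value; the observable effect is simply an argsort applied to both arrays, as in this toy NumPy sketch:

```python
import numpy as np

Xf = np.array([0.7, 0.1, 0.4])        # feature values for the node's samples
samples = np.array([10, 11, 12])      # the matching row indices

order = np.argsort(Xf)
Xf, samples = Xf[order], samples[order]
print(Xf, samples)                    # [0.1 0.4 0.7] [11 12 10]
```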
+ """ + # Draw random splits and pick the best + cdef SIZE_t* samples = self.samples + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + + cdef SIZE_t* features = self.features + cdef SIZE_t* constant_features = self.constant_features + cdef SIZE_t n_features = self.n_features + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t max_features = self.max_features + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef UINT32_t* random_state = &self.rand_r_state + + cdef SplitRecord best, current + cdef double current_proxy_improvement = - INFINITY + cdef double best_proxy_improvement = - INFINITY + + cdef SIZE_t f_i = n_features + cdef SIZE_t f_j + cdef SIZE_t p + cdef SIZE_t partition_end + cdef SIZE_t feature_stride + # Number of features discovered to be constant during the split search + cdef SIZE_t n_found_constants = 0 + # Number of features known to be constant and drawn without replacement + cdef SIZE_t n_drawn_constants = 0 + cdef SIZE_t n_known_constants = n_constant_features[0] + # n_total_constants = n_known_constants + n_found_constants + cdef SIZE_t n_total_constants = n_known_constants + cdef SIZE_t n_visited_features = 0 + cdef DTYPE_t min_feature_value + cdef DTYPE_t max_feature_value + cdef DTYPE_t current_feature_value + + _init_split(&best, end) + + # Sample up to max_features without replacement using a + # Fisher-Yates-based algorithm (using the local variables `f_i` and + # `f_j` to compute a permutation of the `features` array). + # + # Skip the CPU intensive evaluation of the impurity criterion for + # features that were already detected as constant (hence not suitable + # for good splitting) by ancestor nodes and save the information on + # newly discovered constant features to spare computation on descendant + # nodes. + while (f_i > n_total_constants and # Stop early if remaining features + # are constant + (n_visited_features < max_features or + # At least one drawn features must be non constant + n_visited_features <= n_found_constants + n_drawn_constants)): + n_visited_features += 1 + + # Loop invariant: elements of features in + # - [:n_drawn_constant[ holds drawn and known constant features; + # - [n_drawn_constant:n_known_constant[ holds known constant + # features that haven't been drawn yet; + # - [n_known_constant:n_total_constant[ holds newly found constant + # features; + # - [n_total_constant:f_i[ holds features that haven't been drawn + # yet and aren't constant apriori. + # - [f_i:n_features[ holds features that have been drawn + # and aren't constant. 
+ + # Draw a feature at random + f_j = rand_int(n_drawn_constants, f_i - n_found_constants, + random_state) + + if f_j < n_known_constants: + # f_j in the interval [n_drawn_constants, n_known_constants[ + features[n_drawn_constants], features[f_j] = features[f_j], features[n_drawn_constants] + n_drawn_constants += 1 + + else: + # f_j in the interval [n_known_constants, f_i - n_found_constants[ + f_j += n_found_constants + # f_j in the interval [n_total_constants, f_i[ + + current.feature = features[f_j] + + # Find min, max + min_feature_value = self.X[samples[start], current.feature] + max_feature_value = min_feature_value + Xf[start] = min_feature_value + + for p in range(start + 1, end): + current_feature_value = self.X[samples[p], current.feature] + Xf[p] = current_feature_value + + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + + if max_feature_value <= min_feature_value + FEATURE_THRESHOLD: + features[f_j], features[n_total_constants] = features[n_total_constants], current.feature + + n_found_constants += 1 + n_total_constants += 1 + + else: + f_i -= 1 + features[f_i], features[f_j] = features[f_j], features[f_i] + + # Draw a random threshold + current.threshold = rand_uniform(min_feature_value, + max_feature_value, + random_state) + + if current.threshold == max_feature_value: + current.threshold = min_feature_value + + # Partition + p, partition_end = start, end + while p < partition_end: + if Xf[p] <= current.threshold: + p += 1 + else: + partition_end -= 1 + + Xf[p], Xf[partition_end] = Xf[partition_end], Xf[p] + samples[p], samples[partition_end] = samples[partition_end], samples[p] + + current.pos = partition_end + + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue + + # Evaluate split + self.criterion.reset() + self.criterion.update(current.pos) + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + best = current # copy + + # Reorganize into samples[start:best.pos] + samples[best.pos:end] + if best.pos < end: + if current.feature != best.feature: + p, partition_end = start, end + + while p < partition_end: + if self.X[samples[p], best.feature] <= best.threshold: + p += 1 + else: + partition_end -= 1 + + samples[p], samples[partition_end] = samples[partition_end], samples[p] + + self.criterion.reset() + self.criterion.update(best.pos) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) + + # Respect invariant for constant features: the original order of + # element in features[:n_known_constants] must be preserved for sibling + # and child nodes + memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants) + + # Copy newly found constant features + memcpy(constant_features + n_known_constants, + features + n_known_constants, + sizeof(SIZE_t) * n_found_constants) + + # Return values + split[0] = best + n_constant_features[0] = n_total_constants + return 0 + + +cdef class 
BaseSparseSplitter(Splitter): + # The sparse splitter works only with csc sparse matrix format + cdef DTYPE_t* X_data + cdef INT32_t* X_indices + cdef INT32_t* X_indptr + + cdef SIZE_t n_total_samples + + cdef SIZE_t* index_to_samples + cdef SIZE_t* sorted_samples + + def __cinit__(self, Criterion criterion, SIZE_t max_features, + SIZE_t min_samples_leaf, double min_weight_leaf, + object random_state): + # Parent __cinit__ is automatically called + + self.X_data = NULL + self.X_indices = NULL + self.X_indptr = NULL + + self.n_total_samples = 0 + + self.index_to_samples = NULL + self.sorted_samples = NULL + + def __dealloc__(self): + """Deallocate memory.""" + free(self.index_to_samples) + free(self.sorted_samples) + + cdef int init(self, + object X, + const DOUBLE_t[:, ::1] y, + DOUBLE_t* sample_weight) except -1: + """Initialize the splitter + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + # Call parent init + Splitter.init(self, X, y, sample_weight) + + if not isinstance(X, csc_matrix): + raise ValueError("X should be in csc format") + + cdef SIZE_t* samples = self.samples + cdef SIZE_t n_samples = self.n_samples + + # Initialize X + cdef cnp.ndarray[dtype=DTYPE_t, ndim=1] data = X.data + cdef cnp.ndarray[dtype=INT32_t, ndim=1] indices = X.indices + cdef cnp.ndarray[dtype=INT32_t, ndim=1] indptr = X.indptr + cdef SIZE_t n_total_samples = X.shape[0] + + self.X_data = data.data + self.X_indices = indices.data + self.X_indptr = indptr.data + self.n_total_samples = n_total_samples + + # Initialize auxiliary array used to perform split + safe_realloc(&self.index_to_samples, n_total_samples) + safe_realloc(&self.sorted_samples, n_samples) + + cdef SIZE_t* index_to_samples = self.index_to_samples + cdef SIZE_t p + for p in range(n_total_samples): + index_to_samples[p] = -1 + + for p in range(n_samples): + index_to_samples[samples[p]] = p + return 0 + + cdef inline SIZE_t _partition(self, double threshold, + SIZE_t end_negative, SIZE_t start_positive, + SIZE_t zero_pos) nogil: + """Partition samples[start:end] based on threshold.""" + + cdef SIZE_t p + cdef SIZE_t partition_end + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t* samples = self.samples + cdef SIZE_t* index_to_samples = self.index_to_samples + + if threshold < 0.: + p = self.start + partition_end = end_negative + elif threshold > 0.: + p = start_positive + partition_end = self.end + else: + # Data are already split + return zero_pos + + while p < partition_end: + if Xf[p] <= threshold: + p += 1 + + else: + partition_end -= 1 + + Xf[p], Xf[partition_end] = Xf[partition_end], Xf[p] + sparse_swap(index_to_samples, samples, p, partition_end) + + return partition_end + + cdef inline void extract_nnz(self, SIZE_t feature, + SIZE_t* end_negative, SIZE_t* start_positive, + bint* is_samples_sorted) nogil: + """Extract and partition values for a given feature. + + The extracted values are partitioned between negative values + Xf[start:end_negative[0]] and positive values Xf[start_positive[0]:end]. + The samples and index_to_samples are modified according to this + partition. + + The extraction corresponds to the intersection between the arrays + X_indices[indptr_start:indptr_end] and samples[start:end]. + This is done efficiently using either an index_to_samples based approach + or binary search based approach. + + Parameters + ---------- + feature : SIZE_t, + Index of the feature we want to extract non zero value. 
+ + + end_negative, start_positive : SIZE_t*, SIZE_t*, + Return extracted non zero values in self.samples[start:end] where + negative values are in self.feature_values[start:end_negative[0]] + and positive values are in + self.feature_values[start_positive[0]:end]. + + is_samples_sorted : bint*, + If is_samples_sorted, then self.sorted_samples[start:end] will be + the sorted version of self.samples[start:end]. + + """ + cdef SIZE_t indptr_start = self.X_indptr[feature], + cdef SIZE_t indptr_end = self.X_indptr[feature + 1] + cdef SIZE_t n_indices = (indptr_end - indptr_start) + cdef SIZE_t n_samples = self.end - self.start + + # Use binary search if n_samples * log(n_indices) < + # n_indices and index_to_samples approach otherwise. + # O(n_samples * log(n_indices)) is the running time of binary + # search and O(n_indices) is the running time of index_to_samples + # approach. + if ((1 - is_samples_sorted[0]) * n_samples * log(n_samples) + + n_samples * log(n_indices) < EXTRACT_NNZ_SWITCH * n_indices): + extract_nnz_binary_search(self.X_indices, self.X_data, + indptr_start, indptr_end, + self.samples, self.start, self.end, + self.index_to_samples, + self.feature_values, + end_negative, start_positive, + self.sorted_samples, is_samples_sorted) + + # Using an index to samples technique to extract non zero values + # index_to_samples is a mapping from X_indices to samples + else: + extract_nnz_index_to_samples(self.X_indices, self.X_data, + indptr_start, indptr_end, + self.samples, self.start, self.end, + self.index_to_samples, + self.feature_values, + end_negative, start_positive) + + +cdef int compare_SIZE_t(const void* a, const void* b) nogil: + """Comparison function for sort.""" + return ((a)[0] - (b)[0]) + + +cdef inline void binary_search(INT32_t* sorted_array, + INT32_t start, INT32_t end, + SIZE_t value, SIZE_t* index, + INT32_t* new_start) nogil: + """Return the index of value in the sorted array. + + If not found, return -1. new_start is the last pivot + 1 + """ + cdef INT32_t pivot + index[0] = -1 + while start < end: + pivot = start + (end - start) / 2 + + if sorted_array[pivot] == value: + index[0] = pivot + start = pivot + 1 + break + + if sorted_array[pivot] < value: + start = pivot + 1 + else: + end = pivot + new_start[0] = start + + +cdef inline void extract_nnz_index_to_samples(INT32_t* X_indices, + DTYPE_t* X_data, + INT32_t indptr_start, + INT32_t indptr_end, + SIZE_t* samples, + SIZE_t start, + SIZE_t end, + SIZE_t* index_to_samples, + DTYPE_t* Xf, + SIZE_t* end_negative, + SIZE_t* start_positive) nogil: + """Extract and partition values for a feature using index_to_samples. + + Complexity is O(indptr_end - indptr_start). 
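The index_to_samples strategy documented above can be sketched against scipy's CSC arrays directly: walk the stored entries of one column and keep only the rows that currently sit in the node, using a position map so membership checks are O(1). A rough Python version with a toy matrix; the helper name simply mirrors the Cython routine:

```python
import numpy as np
from scipy.sparse import csc_matrix

X = csc_matrix(np.array([[0.0, -1.0],
                         [2.0,  0.0],
                         [0.0,  3.0],
                         [4.0,  0.0]]))
samples = np.array([0, 2, 3])                              # rows currently in the node
index_to_samples = {row: pos for pos, row in enumerate(samples)}

def extract_nnz(feature):
    """Return (row, value) pairs of the node's non-zero entries for one column."""
    start, end = X.indptr[feature], X.indptr[feature + 1]  # column slice into data/indices
    return [(int(row), float(val))
            for row, val in zip(X.indices[start:end], X.data[start:end])
            if row in index_to_samples]

print(extract_nnz(0))   # row 1 is not in the node -> [(3, 4.0)]
print(extract_nnz(1))   # [(0, -1.0), (2, 3.0)]
```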
+ """ + cdef INT32_t k + cdef SIZE_t index + cdef SIZE_t end_negative_ = start + cdef SIZE_t start_positive_ = end + + for k in range(indptr_start, indptr_end): + if start <= index_to_samples[X_indices[k]] < end: + if X_data[k] > 0: + start_positive_ -= 1 + Xf[start_positive_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, start_positive_) + + + elif X_data[k] < 0: + Xf[end_negative_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, end_negative_) + end_negative_ += 1 + + # Returned values + end_negative[0] = end_negative_ + start_positive[0] = start_positive_ + + +cdef inline void extract_nnz_binary_search(INT32_t* X_indices, + DTYPE_t* X_data, + INT32_t indptr_start, + INT32_t indptr_end, + SIZE_t* samples, + SIZE_t start, + SIZE_t end, + SIZE_t* index_to_samples, + DTYPE_t* Xf, + SIZE_t* end_negative, + SIZE_t* start_positive, + SIZE_t* sorted_samples, + bint* is_samples_sorted) nogil: + """Extract and partition values for a given feature using binary search. + + If n_samples = end - start and n_indices = indptr_end - indptr_start, + the complexity is + + O((1 - is_samples_sorted[0]) * n_samples * log(n_samples) + + n_samples * log(n_indices)). + """ + cdef SIZE_t n_samples + + if not is_samples_sorted[0]: + n_samples = end - start + memcpy(sorted_samples + start, samples + start, + n_samples * sizeof(SIZE_t)) + qsort(sorted_samples + start, n_samples, sizeof(SIZE_t), + compare_SIZE_t) + is_samples_sorted[0] = 1 + + while (indptr_start < indptr_end and + sorted_samples[start] > X_indices[indptr_start]): + indptr_start += 1 + + while (indptr_start < indptr_end and + sorted_samples[end - 1] < X_indices[indptr_end - 1]): + indptr_end -= 1 + + cdef SIZE_t p = start + cdef SIZE_t index + cdef SIZE_t k + cdef SIZE_t end_negative_ = start + cdef SIZE_t start_positive_ = end + + while (p < end and indptr_start < indptr_end): + # Find index of sorted_samples[p] in X_indices + binary_search(X_indices, indptr_start, indptr_end, + sorted_samples[p], &k, &indptr_start) + + if k != -1: + # If k != -1, we have found a non zero value + + if X_data[k] > 0: + start_positive_ -= 1 + Xf[start_positive_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, start_positive_) + + + elif X_data[k] < 0: + Xf[end_negative_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, end_negative_) + end_negative_ += 1 + p += 1 + + # Returned values + end_negative[0] = end_negative_ + start_positive[0] = start_positive_ + + +cdef inline void sparse_swap(SIZE_t* index_to_samples, SIZE_t* samples, + SIZE_t pos_1, SIZE_t pos_2) nogil: + """Swap sample pos_1 and pos_2 preserving sparse invariant.""" + samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1] + index_to_samples[samples[pos_1]] = pos_1 + index_to_samples[samples[pos_2]] = pos_2 + + +cdef class BestSparseSplitter(BaseSparseSplitter): + """Splitter for finding the best split, using the sparse data.""" + + def __reduce__(self): + return (BestSparseSplitter, (self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state), self.__getstate__()) + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find the best split on node samples[start:end], using sparse features + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 
otherwise. + """ + # Find the best split + cdef SIZE_t* samples = self.samples + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + + cdef INT32_t* X_indices = self.X_indices + cdef INT32_t* X_indptr = self.X_indptr + cdef DTYPE_t* X_data = self.X_data + + cdef SIZE_t* features = self.features + cdef SIZE_t* constant_features = self.constant_features + cdef SIZE_t n_features = self.n_features + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t* sorted_samples = self.sorted_samples + cdef SIZE_t* index_to_samples = self.index_to_samples + cdef SIZE_t max_features = self.max_features + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef UINT32_t* random_state = &self.rand_r_state + + cdef SplitRecord best, current + _init_split(&best, end) + cdef double current_proxy_improvement = - INFINITY + cdef double best_proxy_improvement = - INFINITY + + cdef SIZE_t f_i = n_features + cdef SIZE_t f_j, p + cdef SIZE_t n_visited_features = 0 + # Number of features discovered to be constant during the split search + cdef SIZE_t n_found_constants = 0 + # Number of features known to be constant and drawn without replacement + cdef SIZE_t n_drawn_constants = 0 + cdef SIZE_t n_known_constants = n_constant_features[0] + # n_total_constants = n_known_constants + n_found_constants + cdef SIZE_t n_total_constants = n_known_constants + cdef DTYPE_t current_feature_value + + cdef SIZE_t p_next + cdef SIZE_t p_prev + cdef bint is_samples_sorted = 0 # indicate is sorted_samples is + # inititialized + + # We assume implicitly that end_positive = end and + # start_negative = start + cdef SIZE_t start_positive + cdef SIZE_t end_negative + + # Sample up to max_features without replacement using a + # Fisher-Yates-based algorithm (using the local variables `f_i` and + # `f_j` to compute a permutation of the `features` array). + # + # Skip the CPU intensive evaluation of the impurity criterion for + # features that were already detected as constant (hence not suitable + # for good splitting) by ancestor nodes and save the information on + # newly discovered constant features to spare computation on descendant + # nodes. + while (f_i > n_total_constants and # Stop early if remaining features + # are constant + (n_visited_features < max_features or + # At least one drawn features must be non constant + n_visited_features <= n_found_constants + n_drawn_constants)): + + n_visited_features += 1 + + # Loop invariant: elements of features in + # - [:n_drawn_constant[ holds drawn and known constant features; + # - [n_drawn_constant:n_known_constant[ holds known constant + # features that haven't been drawn yet; + # - [n_known_constant:n_total_constant[ holds newly found constant + # features; + # - [n_total_constant:f_i[ holds features that haven't been drawn + # yet and aren't constant apriori. + # - [f_i:n_features[ holds features that have been drawn + # and aren't constant. 
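The loop invariant above describes how the `features` array is partitioned during feature sampling. As a rough orientation only, the same constant-feature bookkeeping can be sketched in pure Python; the helper below is hypothetical and is not part of the sklearn-derived Cython sources.

```python
# Illustrative sketch of the Fisher-Yates draw with constant-feature
# bookkeeping described in the loop invariant above (hypothetical helper,
# not part of the borrowed Cython code).
import random

def draw_candidate(features, n_drawn_constants, n_known_constants,
                   n_found_constants, f_i, rng=random):
    """Return (candidate feature or None, updated n_drawn_constants)."""
    f_j = rng.randrange(n_drawn_constants, f_i - n_found_constants)
    if f_j < n_known_constants:
        # Known constant: swap it into the drawn-constants block and skip it.
        features[f_j], features[n_drawn_constants] = (
            features[n_drawn_constants], features[f_j])
        return None, n_drawn_constants + 1
    # Otherwise skip over the newly found constants and evaluate this feature.
    f_j += n_found_constants
    return features[f_j], n_drawn_constants
```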
+ + # Draw a feature at random + f_j = rand_int(n_drawn_constants, f_i - n_found_constants, + random_state) + + if f_j < n_known_constants: + # f_j in the interval [n_drawn_constants, n_known_constants[ + features[f_j], features[n_drawn_constants] = features[n_drawn_constants], features[f_j] + + n_drawn_constants += 1 + + else: + # f_j in the interval [n_known_constants, f_i - n_found_constants[ + f_j += n_found_constants + # f_j in the interval [n_total_constants, f_i[ + + current.feature = features[f_j] + self.extract_nnz(current.feature, + &end_negative, &start_positive, + &is_samples_sorted) + + # Sort the positive and negative parts of `Xf` + sort(Xf + start, samples + start, end_negative - start) + sort(Xf + start_positive, samples + start_positive, + end - start_positive) + + # Update index_to_samples to take into account the sort + for p in range(start, end_negative): + index_to_samples[samples[p]] = p + for p in range(start_positive, end): + index_to_samples[samples[p]] = p + + # Add one or two zeros in Xf, if there is any + if end_negative < start_positive: + start_positive -= 1 + Xf[start_positive] = 0. + + if end_negative != start_positive: + Xf[end_negative] = 0. + end_negative += 1 + + if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD: + features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j] + + n_found_constants += 1 + n_total_constants += 1 + + else: + f_i -= 1 + features[f_i], features[f_j] = features[f_j], features[f_i] + + # Evaluate all splits + self.criterion.reset() + p = start + + while p < end: + if p + 1 != end_negative: + p_next = p + 1 + else: + p_next = start_positive + + while (p_next < end and + Xf[p_next] <= Xf[p] + FEATURE_THRESHOLD): + p = p_next + if p + 1 != end_negative: + p_next = p + 1 + else: + p_next = start_positive + + + # (p_next >= end) or (X[samples[p_next], current.feature] > + # X[samples[p], current.feature]) + p_prev = p + p = p_next + # (p >= end) or (X[samples[p], current.feature] > + # X[samples[p_prev], current.feature]) + + + if p < end: + current.pos = p + + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue + + self.criterion.update(current.pos) + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + # sum of halves used to avoid infinite values + current.threshold = Xf[p_prev] / 2.0 + Xf[p] / 2.0 + + if ((current.threshold == Xf[p]) or + (current.threshold == INFINITY) or + (current.threshold == -INFINITY)): + current.threshold = Xf[p_prev] + + best = current + + # Reorganize into samples[start:best.pos] + samples[best.pos:end] + if best.pos < end: + self.extract_nnz(best.feature, &end_negative, &start_positive, + &is_samples_sorted) + + self._partition(best.threshold, end_negative, start_positive, + best.pos) + + self.criterion.reset() + self.criterion.update(best.pos) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) + + # Respect invariant for constant features: the original order of + # element in features[:n_known_constants] must be preserved for sibling 
+ # and child nodes + memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants) + + # Copy newly found constant features + memcpy(constant_features + n_known_constants, + features + n_known_constants, + sizeof(SIZE_t) * n_found_constants) + + # Return values + split[0] = best + n_constant_features[0] = n_total_constants + return 0 + + +cdef class RandomSparseSplitter(BaseSparseSplitter): + """Splitter for finding a random split, using the sparse data.""" + + def __reduce__(self): + return (RandomSparseSplitter, (self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state), self.__getstate__()) + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find a random split on node samples[start:end], using sparse features + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + # Find the best split + cdef SIZE_t* samples = self.samples + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + + cdef INT32_t* X_indices = self.X_indices + cdef INT32_t* X_indptr = self.X_indptr + cdef DTYPE_t* X_data = self.X_data + + cdef SIZE_t* features = self.features + cdef SIZE_t* constant_features = self.constant_features + cdef SIZE_t n_features = self.n_features + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t* sorted_samples = self.sorted_samples + cdef SIZE_t* index_to_samples = self.index_to_samples + cdef SIZE_t max_features = self.max_features + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef UINT32_t* random_state = &self.rand_r_state + + cdef SplitRecord best, current + _init_split(&best, end) + cdef double current_proxy_improvement = - INFINITY + cdef double best_proxy_improvement = - INFINITY + + cdef DTYPE_t current_feature_value + + cdef SIZE_t f_i = n_features + cdef SIZE_t f_j, p + cdef SIZE_t n_visited_features = 0 + # Number of features discovered to be constant during the split search + cdef SIZE_t n_found_constants = 0 + # Number of features known to be constant and drawn without replacement + cdef SIZE_t n_drawn_constants = 0 + cdef SIZE_t n_known_constants = n_constant_features[0] + # n_total_constants = n_known_constants + n_found_constants + cdef SIZE_t n_total_constants = n_known_constants + cdef SIZE_t partition_end + + cdef DTYPE_t min_feature_value + cdef DTYPE_t max_feature_value + + cdef bint is_samples_sorted = 0 # indicate that sorted_samples is + # inititialized + + # We assume implicitly that end_positive = end and + # start_negative = start + cdef SIZE_t start_positive + cdef SIZE_t end_negative + + # Sample up to max_features without replacement using a + # Fisher-Yates-based algorithm (using the local variables `f_i` and + # `f_j` to compute a permutation of the `features` array). + # + # Skip the CPU intensive evaluation of the impurity criterion for + # features that were already detected as constant (hence not suitable + # for good splitting) by ancestor nodes and save the information on + # newly discovered constant features to spare computation on descendant + # nodes. 
+ while (f_i > n_total_constants and # Stop early if remaining features + # are constant + (n_visited_features < max_features or + # At least one drawn features must be non constant + n_visited_features <= n_found_constants + n_drawn_constants)): + + n_visited_features += 1 + + # Loop invariant: elements of features in + # - [:n_drawn_constant[ holds drawn and known constant features; + # - [n_drawn_constant:n_known_constant[ holds known constant + # features that haven't been drawn yet; + # - [n_known_constant:n_total_constant[ holds newly found constant + # features; + # - [n_total_constant:f_i[ holds features that haven't been drawn + # yet and aren't constant apriori. + # - [f_i:n_features[ holds features that have been drawn + # and aren't constant. + + # Draw a feature at random + f_j = rand_int(n_drawn_constants, f_i - n_found_constants, + random_state) + + if f_j < n_known_constants: + # f_j in the interval [n_drawn_constants, n_known_constants[ + features[f_j], features[n_drawn_constants] = features[n_drawn_constants], features[f_j] + + n_drawn_constants += 1 + + else: + # f_j in the interval [n_known_constants, f_i - n_found_constants[ + f_j += n_found_constants + # f_j in the interval [n_total_constants, f_i[ + + current.feature = features[f_j] + + self.extract_nnz(current.feature, + &end_negative, &start_positive, + &is_samples_sorted) + + # Add one or two zeros in Xf, if there is any + if end_negative < start_positive: + start_positive -= 1 + Xf[start_positive] = 0. + + if end_negative != start_positive: + Xf[end_negative] = 0. + end_negative += 1 + + # Find min, max in Xf[start:end_negative] + min_feature_value = Xf[start] + max_feature_value = min_feature_value + + for p in range(start, end_negative): + current_feature_value = Xf[p] + + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + + # Update min, max given Xf[start_positive:end] + for p in range(start_positive, end): + current_feature_value = Xf[p] + + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + + if max_feature_value <= min_feature_value + FEATURE_THRESHOLD: + features[f_j] = features[n_total_constants] + features[n_total_constants] = current.feature + + n_found_constants += 1 + n_total_constants += 1 + + else: + f_i -= 1 + features[f_i], features[f_j] = features[f_j], features[f_i] + + # Draw a random threshold + current.threshold = rand_uniform(min_feature_value, + max_feature_value, + random_state) + + if current.threshold == max_feature_value: + current.threshold = min_feature_value + + # Partition + current.pos = self._partition(current.threshold, + end_negative, + start_positive, + start_positive + + (Xf[start_positive] == 0.)) + + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue + + # Evaluate split + self.criterion.reset() + self.criterion.update(current.pos) + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + 
                        self.criterion.children_impurity(&current.impurity_left,
+                                                         &current.impurity_right)
+                        current.improvement = self.criterion.impurity_improvement(
+                            impurity, current.impurity_left, current.impurity_right)
+                        best = current
+
+        # Reorganize into samples[start:best.pos] + samples[best.pos:end]
+        if best.pos < end:
+            if current.feature != best.feature:
+                self.extract_nnz(best.feature, &end_negative, &start_positive,
+                                 &is_samples_sorted)
+
+                self._partition(best.threshold, end_negative, start_positive,
+                                best.pos)
+
+            self.criterion.reset()
+            self.criterion.update(best.pos)
+            self.criterion.children_impurity(&best.impurity_left,
+                                             &best.impurity_right)
+            best.improvement = self.criterion.impurity_improvement(
+                impurity, best.impurity_left, best.impurity_right)
+
+        # Respect invariant for constant features: the original order of
+        # element in features[:n_known_constants] must be preserved for sibling
+        # and child nodes
+        memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)
+
+        # Copy newly found constant features
+        memcpy(constant_features + n_known_constants,
+               features + n_known_constants,
+               sizeof(SIZE_t) * n_found_constants)
+
+        # Return values
+        split[0] = best
+        n_constant_features[0] = n_total_constants
+        return 0
diff --git a/causalml/inference/tree/_tree/_tree.pxd b/causalml/inference/tree/_tree/_tree.pxd
new file mode 100644
index 00000000..d32b5eae
--- /dev/null
+++ b/causalml/inference/tree/_tree/_tree.pxd
@@ -0,0 +1,114 @@
+# Authors: Gilles Louppe
+# Peter Prettenhofer
+# Brian Holt
+# Joel Nothman
+# Arnaud Joly
+# Jacob Schreiber
+# Nelson Liu
+#
+# License: BSD 3 clause
+
+# cython: cdivision=True
+# cython: boundscheck=False
+# cython: wraparound=False
+# cython: language_level=3
+# cython: linetrace=True
+
+import numpy as np
+cimport numpy as cnp
+
+ctypedef cnp.npy_float32 DTYPE_t # Type of X
+ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight
+ctypedef cnp.npy_intp SIZE_t # Type for indices and counters
+ctypedef cnp.npy_int32 INT32_t # Signed 32 bit integer
+ctypedef cnp.npy_uint32 UINT32_t # Unsigned 32 bit integer
+
+from ._splitter cimport Splitter
+from ._splitter cimport SplitRecord
+
+
+cdef struct Node:
+    # Base storage structure for the nodes in a Tree object
+
+    SIZE_t left_child # id of the left child of the node
+    SIZE_t right_child # id of the right child of the node
+    SIZE_t feature # Feature used for splitting the node
+    DOUBLE_t threshold # Threshold value at the node
+    DOUBLE_t impurity # Impurity of the node (i.e., the value of the criterion)
+    SIZE_t n_node_samples # Number of samples at the node
+    DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node
+
+
+cdef class Tree:
+    # The Tree object is a binary tree structure constructed by the
+    # TreeBuilder. The tree structure is used for predictions and feature importances.
+
+    # Input/Output layout
+    cdef public SIZE_t n_features # Number of features in X
+    cdef SIZE_t* n_classes # Number of classes in y[:, k]
+    cdef public SIZE_t n_outputs # Number of outputs in y
+    cdef public SIZE_t max_n_classes # max(n_classes)
+
+    # Inner structures: values are stored separately from node structure,
+    # since size is determined at runtime.
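For orientation, the flat `value` buffer declared below is addressed with `value_stride = n_outputs * max_n_classes`. The following NumPy sketch (toy sizes, not taken from the sources) shows how a node's prediction block is recovered from the flat buffer, which is the layout `predict()` and `node_value()` rely on.

```python
# Sketch of the flat value-buffer layout used by Tree (toy sizes; assumed
# illustration, not part of the borrowed sources).  A node's prediction block
# occupies value[node_id * value_stride : (node_id + 1) * value_stride].
import numpy as np

n_outputs, max_n_classes, node_count = 1, 2, 5
value_stride = n_outputs * max_n_classes
value = np.arange(node_count * value_stride, dtype=np.float64)

node_id = 3
block = value[node_id * value_stride:(node_id + 1) * value_stride]
# The same block viewed as (n_outputs, max_n_classes), as exposed to callers
print(block.reshape(n_outputs, max_n_classes))
```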
+ cdef public SIZE_t max_depth # Max depth of the tree + cdef public SIZE_t node_count # Counter for node IDs + cdef public SIZE_t capacity # Capacity of tree, in terms of nodes + cdef Node* nodes # Array of nodes + cdef double* value # (capacity, n_outputs, max_n_classes) array of values + cdef SIZE_t value_stride # = n_outputs * max_n_classes + + # Methods + cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, + SIZE_t feature, double threshold, double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples) nogil except -1 + cdef int _resize(self, SIZE_t capacity) nogil except -1 + cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1 + + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + + cpdef cnp.ndarray predict(self, object X) + + cpdef cnp.ndarray apply(self, object X) + cdef cnp.ndarray _apply_dense(self, object X) + cdef cnp.ndarray _apply_sparse_csr(self, object X) + + cpdef object decision_path(self, object X) + cdef object _decision_path_dense(self, object X) + cdef object _decision_path_sparse_csr(self, object X) + + cpdef compute_feature_importances(self, normalize=*) + + +cdef class TreeBuilder: + # The TreeBuilder recursively builds a Tree object from training samples, + # using a Splitter object for splitting internal nodes and assigning + # values to leaves. + # + # This class controls the various stopping criteria and the node splitting + # evaluation order, e.g. depth-first or best-first. + + cdef Splitter splitter # Splitting algorithm + + cdef SIZE_t min_samples_split # Minimum number of samples in an internal node + cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf + cdef double min_weight_leaf # Minimum weight in a leaf + cdef SIZE_t max_depth # Maximal tree depth + cdef double min_impurity_decrease # Impurity threshold for early stopping + + cpdef build( + self, + Tree tree, + object X, + cnp.ndarray y, + cnp.ndarray sample_weight=*, + ) + + cdef _check_input( + self, + object X, + cnp.ndarray y, + cnp.ndarray sample_weight, + ) diff --git a/causalml/inference/tree/_tree/_tree.pyx b/causalml/inference/tree/_tree/_tree.pyx new file mode 100755 index 00000000..b7af41a5 --- /dev/null +++ b/causalml/inference/tree/_tree/_tree.pyx @@ -0,0 +1,1771 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Noel Dawe +# Satrajit Gosh +# Lars Buitinck +# Arnaud Joly +# Joel Nothman +# Fares Hedayati +# Jacob Schreiber +# Nelson Liu +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +from cpython cimport Py_INCREF, PyObject, PyTypeObject + +from libc.stdlib cimport free +from libc.math cimport fabs +from libc.string cimport memcpy +from libc.string cimport memset +from libc.stdint cimport SIZE_MAX + +import struct + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +from scipy.sparse import issparse +from scipy.sparse import csr_matrix + +from ._utils cimport Stack +from ._utils cimport StackRecord +from ._utils cimport PriorityHeap +from ._utils cimport PriorityHeapRecord +from ._utils cimport safe_realloc +from ._utils cimport sizet_ptr_to_ndarray + + +cdef extern from "numpy/arrayobject.h": + object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, + int nd, cnp.npy_intp* dims, + cnp.npy_intp* strides, + void* data, int flags, object obj) + int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + +# 
============================================================================= +# Types and constants +# ============================================================================= + +from numpy import float32 as DTYPE +from numpy import float64 as DOUBLE + +cdef double INFINITY = np.inf +cdef double EPSILON = np.finfo('double').eps + +# Some handy constants (BestFirstTreeBuilder) +cdef int IS_FIRST = 1 +cdef int IS_NOT_FIRST = 0 +cdef int IS_LEFT = 1 +cdef int IS_NOT_LEFT = 0 + +TREE_LEAF = -1 +TREE_UNDEFINED = -2 +cdef SIZE_t _TREE_LEAF = TREE_LEAF +cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED +cdef SIZE_t INITIAL_STACK_SIZE = 10 + +# Build the corresponding numpy dtype for Node. +# This works by casting `dummy` to an array of Node of length 1, which numpy +# can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946 +# for a more detailed explanation. +cdef Node dummy; +NODE_DTYPE = np.asarray((&dummy)).dtype + +# ============================================================================= +# TreeBuilder +# ============================================================================= + +cdef class TreeBuilder: + """Interface for different tree building strategies.""" + + cpdef build(self, Tree tree, object X, cnp.ndarray y, + cnp.ndarray sample_weight=None): + """Build a decision tree from the training set (X, y).""" + pass + + cdef inline _check_input(self, object X, cnp.ndarray y, + cnp.ndarray sample_weight): + """Check input dtype, layout and format""" + if issparse(X): + X = X.tocsc() + X.sort_indices() + + if X.data.dtype != DTYPE: + X.data = np.ascontiguousarray(X.data, dtype=DTYPE) + + if X.indices.dtype != np.int32 or X.indptr.dtype != np.int32: + raise ValueError("No support for np.int64 index based " + "sparse matrices") + + elif X.dtype != DTYPE: + # since we have to copy we will make it fortran for efficiency + X = np.asfortranarray(X, dtype=DTYPE) + + if y.dtype != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if (sample_weight is not None and + (sample_weight.dtype != DOUBLE or + not sample_weight.flags.contiguous)): + sample_weight = np.asarray(sample_weight, dtype=DOUBLE, + order="C") + + return X, y, sample_weight + +# Depth first builder --------------------------------------------------------- + +cdef class DepthFirstTreeBuilder(TreeBuilder): + """Build a decision tree in depth-first fashion.""" + + def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, + SIZE_t min_samples_leaf, double min_weight_leaf, + SIZE_t max_depth, double min_impurity_decrease): + self.splitter = splitter + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_leaf = min_weight_leaf + self.max_depth = max_depth + self.min_impurity_decrease = min_impurity_decrease + + cpdef build(self, Tree tree, object X, cnp.ndarray y, + cnp.ndarray sample_weight=None): + """Build a decision tree from the training set (X, y).""" + + # check input + X, y, sample_weight = self._check_input(X, y, sample_weight) + + cdef DOUBLE_t* sample_weight_ptr = NULL + if sample_weight is not None: + sample_weight_ptr = sample_weight.data + + # Initial capacity + cdef int init_capacity + + if tree.max_depth <= 10: + init_capacity = (2 ** (tree.max_depth + 1)) - 1 + else: + init_capacity = 2047 + + tree._resize(init_capacity) + + # Parameters + cdef Splitter splitter = self.splitter + cdef SIZE_t max_depth = self.max_depth + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = 
self.min_weight_leaf + cdef SIZE_t min_samples_split = self.min_samples_split + cdef double min_impurity_decrease = self.min_impurity_decrease + + # Recursive partition (without actual recursion) + splitter.init(X, y, sample_weight_ptr) + + cdef SIZE_t start + cdef SIZE_t end + cdef SIZE_t depth + cdef SIZE_t parent + cdef bint is_left + cdef SIZE_t n_node_samples = splitter.n_samples + cdef double weighted_n_samples = splitter.weighted_n_samples + cdef double weighted_n_node_samples + cdef SplitRecord split + cdef SIZE_t node_id + + cdef double impurity = INFINITY + cdef SIZE_t n_constant_features + cdef bint is_leaf + cdef bint first = 1 + cdef SIZE_t max_depth_seen = -1 + cdef int rc = 0 + + cdef Stack stack = Stack(INITIAL_STACK_SIZE) + cdef StackRecord stack_record + + with nogil: + # push root node onto stack + rc = stack.push(0, n_node_samples, 0, _TREE_UNDEFINED, 0, INFINITY, 0) + if rc == -1: + # got return code -1 - out-of-memory + with gil: + raise MemoryError() + + while not stack.is_empty(): + stack.pop(&stack_record) + + start = stack_record.start + end = stack_record.end + depth = stack_record.depth + parent = stack_record.parent + is_left = stack_record.is_left + impurity = stack_record.impurity + n_constant_features = stack_record.n_constant_features + + n_node_samples = end - start + splitter.node_reset(start, end, &weighted_n_node_samples) + + is_leaf = (depth >= max_depth or + n_node_samples < min_samples_split or + n_node_samples < 2 * min_samples_leaf or + weighted_n_node_samples < 2 * min_weight_leaf) + + if first: + impurity = splitter.node_impurity() + first = 0 + + # impurity == 0 with tolerance due to rounding errors + is_leaf = is_leaf or impurity <= EPSILON + + if not is_leaf: + splitter.node_split(impurity, &split, &n_constant_features) + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + is_leaf = (is_leaf or split.pos >= end or + (split.improvement + EPSILON < + min_impurity_decrease)) + + node_id = tree._add_node(parent, is_left, is_leaf, split.feature, + split.threshold, impurity, n_node_samples, + weighted_n_node_samples) + + if node_id == SIZE_MAX: + rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + node_id * tree.value_stride) + + if not is_leaf: + # Push right child on stack + rc = stack.push(split.pos, end, depth + 1, node_id, 0, + split.impurity_right, n_constant_features) + if rc == -1: + break + + # Push left child on stack + rc = stack.push(start, split.pos, depth + 1, node_id, 1, + split.impurity_left, n_constant_features) + if rc == -1: + break + + if depth > max_depth_seen: + max_depth_seen = depth + + if rc >= 0: + rc = tree._resize_c(tree.node_count) + + if rc >= 0: + tree.max_depth = max_depth_seen + if rc == -1: + raise MemoryError() + + +# Best first builder ---------------------------------------------------------- + +cdef inline int _add_to_frontier(PriorityHeapRecord* rec, + PriorityHeap frontier) nogil except -1: + """Adds record ``rec`` to the priority queue ``frontier`` + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + return frontier.push(rec.node_id, rec.start, rec.end, rec.pos, rec.depth, + rec.is_leaf, rec.improvement, rec.impurity, + rec.impurity_left, rec.impurity_right) + + +cdef class BestFirstTreeBuilder(TreeBuilder): + """Build a decision tree in best-first fashion. 
+ + The best node to expand is given by the node at the frontier that has the + highest impurity improvement. + """ + cdef SIZE_t max_leaf_nodes + + def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, + SIZE_t min_samples_leaf, min_weight_leaf, + SIZE_t max_depth, SIZE_t max_leaf_nodes, + double min_impurity_decrease): + self.splitter = splitter + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_leaf = min_weight_leaf + self.max_depth = max_depth + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + + cpdef build(self, Tree tree, object X, cnp.ndarray y, + cnp.ndarray sample_weight=None): + """Build a decision tree from the training set (X, y).""" + + # check input + X, y, sample_weight = self._check_input(X, y, sample_weight) + + cdef DOUBLE_t* sample_weight_ptr = NULL + if sample_weight is not None: + sample_weight_ptr = sample_weight.data + + # Parameters + cdef Splitter splitter = self.splitter + cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef SIZE_t min_samples_split = self.min_samples_split + + # Recursive partition (without actual recursion) + splitter.init(X, y, sample_weight_ptr) + + cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE) + cdef PriorityHeapRecord record + cdef PriorityHeapRecord split_node_left + cdef PriorityHeapRecord split_node_right + + cdef SIZE_t n_node_samples = splitter.n_samples + cdef SIZE_t max_split_nodes = max_leaf_nodes - 1 + cdef bint is_leaf + cdef SIZE_t max_depth_seen = -1 + cdef int rc = 0 + cdef Node* node + + # Initial capacity + cdef SIZE_t init_capacity = max_split_nodes + max_leaf_nodes + tree._resize(init_capacity) + + with nogil: + # add root to frontier + rc = self._add_split_node(splitter, tree, 0, n_node_samples, + INFINITY, IS_FIRST, IS_LEFT, NULL, 0, + &split_node_left) + if rc >= 0: + rc = _add_to_frontier(&split_node_left, frontier) + + if rc == -1: + with gil: + raise MemoryError() + + while not frontier.is_empty(): + frontier.pop(&record) + + node = &tree.nodes[record.node_id] + is_leaf = (record.is_leaf or max_split_nodes <= 0) + + if is_leaf: + # Node is not expandable; set node as leaf + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + + else: + # Node is expandable + + # Decrement number of split nodes available + max_split_nodes -= 1 + + # Compute left split node + rc = self._add_split_node(splitter, tree, + record.start, record.pos, + record.impurity_left, + IS_NOT_FIRST, IS_LEFT, node, + record.depth + 1, + &split_node_left) + if rc == -1: + break + + # tree.nodes may have changed + node = &tree.nodes[record.node_id] + + # Compute right split node + rc = self._add_split_node(splitter, tree, record.pos, + record.end, + record.impurity_right, + IS_NOT_FIRST, IS_NOT_LEFT, node, + record.depth + 1, + &split_node_right) + if rc == -1: + break + + # Add nodes to queue + rc = _add_to_frontier(&split_node_left, frontier) + if rc == -1: + break + + rc = _add_to_frontier(&split_node_right, frontier) + if rc == -1: + break + + if record.depth > max_depth_seen: + max_depth_seen = record.depth + + if rc >= 0: + rc = tree._resize_c(tree.node_count) + + if rc >= 0: + tree.max_depth = max_depth_seen + + if rc == -1: + raise MemoryError() + + cdef inline int _add_split_node(self, Splitter splitter, Tree tree, + SIZE_t start, SIZE_t 
end, double impurity, + bint is_first, bint is_left, Node* parent, + SIZE_t depth, + PriorityHeapRecord* res) nogil except -1: + """Adds node w/ partition ``[start, end)`` to the frontier. """ + cdef SplitRecord split + cdef SIZE_t node_id + cdef SIZE_t n_node_samples + cdef SIZE_t n_constant_features = 0 + cdef double weighted_n_samples = splitter.weighted_n_samples + cdef double min_impurity_decrease = self.min_impurity_decrease + cdef double weighted_n_node_samples + cdef bint is_leaf + cdef SIZE_t n_left, n_right + cdef double imp_diff + + splitter.node_reset(start, end, &weighted_n_node_samples) + + if is_first: + impurity = splitter.node_impurity() + + n_node_samples = end - start + is_leaf = (depth >= self.max_depth or + n_node_samples < self.min_samples_split or + n_node_samples < 2 * self.min_samples_leaf or + weighted_n_node_samples < 2 * self.min_weight_leaf or + impurity <= EPSILON # impurity == 0 with tolerance + ) + + if not is_leaf: + splitter.node_split(impurity, &split, &n_constant_features) + # If EPSILON=0 in the below comparison, float precision issues stop + # splitting early, producing trees that are dissimilar to v0.18 + is_leaf = (is_leaf or split.pos >= end or + split.improvement + EPSILON < min_impurity_decrease) + + node_id = tree._add_node(parent - tree.nodes + if parent != NULL + else _TREE_UNDEFINED, + is_left, is_leaf, + split.feature, split.threshold, impurity, n_node_samples, + weighted_n_node_samples) + if node_id == SIZE_MAX: + return -1 + + # compute values also for split nodes (might become leafs later). + splitter.node_value(tree.value + node_id * tree.value_stride) + + res.node_id = node_id + res.start = start + res.end = end + res.depth = depth + res.impurity = impurity + + if not is_leaf: + # is split node + res.pos = split.pos + res.is_leaf = 0 + res.improvement = split.improvement + res.impurity_left = split.impurity_left + res.impurity_right = split.impurity_right + + else: + # is leaf => 0 improvement + res.pos = end + res.is_leaf = 1 + res.improvement = 0.0 + res.impurity_left = impurity + res.impurity_right = impurity + + return 0 + + +# ============================================================================= +# Tree +# ============================================================================= + +cdef class Tree: + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. 
This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. + + weighted_n_node_samples : array of int, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + property n_classes: + def __get__(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + property children_left: + def __get__(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + property children_right: + def __get__(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + property n_leaves: + def __get__(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + property feature: + def __get__(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + property threshold: + def __get__(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + property impurity: + def __get__(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + property n_node_samples: + def __get__(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + property weighted_n_node_samples: + def __get__(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + property value: + def __get__(self): + return self._get_value_ndarray()[:self.node_count] + + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + return d + + def 
__setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + cdef int _resize(self, SIZE_t capacity) nogil except -1: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() + + cdef int _resize_c(self, SIZE_t capacity=SIZE_MAX) nogil except -1: + """Guts of _resize + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if capacity == self.capacity and self.nodes != NULL: + return 0 + + if capacity == SIZE_MAX: + if self.capacity == 0: + capacity = 3 # default initial value + else: + capacity = 2 * self.capacity + + safe_realloc(&self.nodes, capacity) + safe_realloc(&self.value, capacity * self.value_stride) + + # value memory is initialised to 0 to enable classifier argmax + if capacity > self.capacity: + memset((self.value + self.capacity * self.value_stride), 0, + (capacity - self.capacity) * self.value_stride * + sizeof(double)) + + # if capacity smaller than node_count, adjust the counter + if capacity < self.node_count: + self.node_count = capacity + + self.capacity = capacity + return 0 + + cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, + SIZE_t feature, double threshold, double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples) nogil except -1: + """Add a node to the tree. + + The new node registers itself as the child of its parent. + + Returns (size_t)(-1) on error. 
+ """ + cdef SIZE_t node_id = self.node_count + + if node_id >= self.capacity: + if self._resize_c() != 0: + return SIZE_MAX + + cdef Node* node = &self.nodes[node_id] + node.impurity = impurity + node.n_node_samples = n_node_samples + node.weighted_n_node_samples = weighted_n_node_samples + + if parent != _TREE_UNDEFINED: + if is_left: + self.nodes[parent].left_child = node_id + else: + self.nodes[parent].right_child = node_id + + if is_leaf: + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + + else: + # left_child and right_child will be set later + node.feature = feature + node.threshold = threshold + + self.node_count += 1 + + return node_id + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + cpdef cnp.ndarray apply(self, object X): + """Finds the terminal region (=leaf node) for each sample in X.""" + if issparse(X): + return self._apply_sparse_csr(X) + else: + return self._apply_dense(X) + + cdef inline cnp.ndarray _apply_dense(self, object X): + """Finds the terminal region (=leaf node) for each sample in X.""" + + # Check input + if not isinstance(X, np.ndarray): + raise ValueError("X should be in np.ndarray format, got %s" + % type(X)) + + if X.dtype != DTYPE: + raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) + + # Extract input + cdef const DTYPE_t[:, :] X_ndarray = X + cdef SIZE_t n_samples = X.shape[0] + + # Initialize output + cdef cnp.ndarray[SIZE_t] out = np.zeros((n_samples,), dtype=np.intp) + cdef SIZE_t* out_ptr = out.data + + # Initialize auxiliary data-structure + cdef Node* node = NULL + cdef SIZE_t i = 0 + + with nogil: + for i in range(n_samples): + node = self.nodes + # While node not a leaf + while node.left_child != _TREE_LEAF: + # ... and node.right_child != _TREE_LEAF: + if X_ndarray[i, node.feature] <= node.threshold: + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + + out_ptr[i] = (node - self.nodes) # node offset + + return out + + cdef inline cnp.ndarray _apply_sparse_csr(self, object X): + """Finds the terminal region (=leaf node) for each sample in sparse X. + """ + # Check input + if not isinstance(X, csr_matrix): + raise ValueError("X should be in csr_matrix format, got %s" + % type(X)) + + if X.dtype != DTYPE: + raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) + + # Extract input + cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data + cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices + cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr + + cdef DTYPE_t* X_data = X_data_ndarray.data + cdef INT32_t* X_indices = X_indices_ndarray.data + cdef INT32_t* X_indptr = X_indptr_ndarray.data + + cdef SIZE_t n_samples = X.shape[0] + cdef SIZE_t n_features = X.shape[1] + + # Initialize output + cdef cnp.ndarray[SIZE_t, ndim=1] out = np.zeros((n_samples,), + dtype=np.intp) + cdef SIZE_t* out_ptr = out.data + + # Initialize auxiliary data-structure + cdef DTYPE_t feature_value = 0. + cdef Node* node = NULL + cdef DTYPE_t* X_sample = NULL + cdef SIZE_t i = 0 + cdef INT32_t k = 0 + + # feature_to_sample as a data structure records the last seen sample + # for each feature; functionally, it is an efficient way to identify + # which features are nonzero in the present sample. 
+ cdef SIZE_t* feature_to_sample = NULL + + safe_realloc(&X_sample, n_features) + safe_realloc(&feature_to_sample, n_features) + + with nogil: + memset(feature_to_sample, -1, n_features * sizeof(SIZE_t)) + + for i in range(n_samples): + node = self.nodes + + for k in range(X_indptr[i], X_indptr[i + 1]): + feature_to_sample[X_indices[k]] = i + X_sample[X_indices[k]] = X_data[k] + + # While node not a leaf + while node.left_child != _TREE_LEAF: + # ... and node.right_child != _TREE_LEAF: + if feature_to_sample[node.feature] == i: + feature_value = X_sample[node.feature] + + else: + feature_value = 0. + + if feature_value <= node.threshold: + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + + out_ptr[i] = (node - self.nodes) # node offset + + # Free auxiliary arrays + free(X_sample) + free(feature_to_sample) + + return out + + cpdef object decision_path(self, object X): + """Finds the decision path (=node) for each sample in X.""" + if issparse(X): + return self._decision_path_sparse_csr(X) + else: + return self._decision_path_dense(X) + + cdef inline object _decision_path_dense(self, object X): + """Finds the decision path (=node) for each sample in X.""" + + # Check input + if not isinstance(X, np.ndarray): + raise ValueError("X should be in np.ndarray format, got %s" + % type(X)) + + if X.dtype != DTYPE: + raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) + + # Extract input + cdef const DTYPE_t[:, :] X_ndarray = X + cdef SIZE_t n_samples = X.shape[0] + + # Initialize output + cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp) + cdef SIZE_t* indptr_ptr = indptr.data + + cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples * + (1 + self.max_depth), + dtype=np.intp) + cdef SIZE_t* indices_ptr = indices.data + + # Initialize auxiliary data-structure + cdef Node* node = NULL + cdef SIZE_t i = 0 + + with nogil: + for i in range(n_samples): + node = self.nodes + indptr_ptr[i + 1] = indptr_ptr[i] + + # Add all external nodes + while node.left_child != _TREE_LEAF: + # ... 
and node.right_child != _TREE_LEAF: + indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) + indptr_ptr[i + 1] += 1 + + if X_ndarray[i, node.feature] <= node.threshold: + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + + # Add the leave node + indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) + indptr_ptr[i + 1] += 1 + + indices = indices[:indptr[n_samples]] + cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices), + dtype=np.intp) + out = csr_matrix((data, indices, indptr), + shape=(n_samples, self.node_count)) + + return out + + cdef inline object _decision_path_sparse_csr(self, object X): + """Finds the decision path (=node) for each sample in X.""" + + # Check input + if not isinstance(X, csr_matrix): + raise ValueError("X should be in csr_matrix format, got %s" + % type(X)) + + if X.dtype != DTYPE: + raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) + + # Extract input + cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data + cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices + cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr + + cdef DTYPE_t* X_data = X_data_ndarray.data + cdef INT32_t* X_indices = X_indices_ndarray.data + cdef INT32_t* X_indptr = X_indptr_ndarray.data + + cdef SIZE_t n_samples = X.shape[0] + cdef SIZE_t n_features = X.shape[1] + + # Initialize output + cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp) + cdef SIZE_t* indptr_ptr = indptr.data + + cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples * + (1 + self.max_depth), + dtype=np.intp) + cdef SIZE_t* indices_ptr = indices.data + + # Initialize auxiliary data-structure + cdef DTYPE_t feature_value = 0. + cdef Node* node = NULL + cdef DTYPE_t* X_sample = NULL + cdef SIZE_t i = 0 + cdef INT32_t k = 0 + + # feature_to_sample as a data structure records the last seen sample + # for each feature; functionally, it is an efficient way to identify + # which features are nonzero in the present sample. + cdef SIZE_t* feature_to_sample = NULL + + safe_realloc(&X_sample, n_features) + safe_realloc(&feature_to_sample, n_features) + + with nogil: + memset(feature_to_sample, -1, n_features * sizeof(SIZE_t)) + + for i in range(n_samples): + node = self.nodes + indptr_ptr[i + 1] = indptr_ptr[i] + + for k in range(X_indptr[i], X_indptr[i + 1]): + feature_to_sample[X_indices[k]] = i + X_sample[X_indices[k]] = X_data[k] + + # While node not a leaf + while node.left_child != _TREE_LEAF: + # ... and node.right_child != _TREE_LEAF: + + indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) + indptr_ptr[i + 1] += 1 + + if feature_to_sample[node.feature] == i: + feature_value = X_sample[node.feature] + + else: + feature_value = 0. 
+ + if feature_value <= node.threshold: + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + + # Add the leave node + indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) + indptr_ptr[i + 1] += 1 + + # Free auxiliary arrays + free(X_sample) + free(feature_to_sample) + + indices = indices[:indptr[n_samples]] + cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices), + dtype=np.intp) + out = csr_matrix((data, indices, indptr), + shape=(n_samples, self.node_count)) + + return out + + + cpdef compute_feature_importances(self, normalize=True): + """Computes the importance of each feature (aka variable).""" + cdef Node* left + cdef Node* right + cdef Node* nodes = self.nodes + cdef Node* node = nodes + cdef Node* end_node = node + self.node_count + + cdef double normalizer = 0. + + cdef cnp.ndarray[cnp.float64_t, ndim=1] importances + importances = np.zeros((self.n_features,)) + cdef DOUBLE_t* importance_data = importances.data + + with nogil: + while node != end_node: + if node.left_child != _TREE_LEAF: + # ... and node.right_child != _TREE_LEAF: + left = &nodes[node.left_child] + right = &nodes[node.right_child] + + importance_data[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) + node += 1 + + importances /= nodes[0].weighted_n_node_samples + + if normalize: + normalizer = np.sum(importances) + + if normalizer > 0.0: + # Avoid dividing by zero (e.g., when root is pure) + importances /= normalizer + + return importances + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + def compute_partial_dependence(self, DTYPE_t[:, ::1] X, + int[::1] target_features, + double[::1] out): + """Partial dependence of the response on the ``target_feature`` set. + + For each sample in ``X`` a tree traversal is performed. + Each traversal starts from the root with weight 1.0. + + At each non-leaf node that splits on a target feature, either + the left child or the right child is visited based on the feature + value of the current sample, and the weight is not modified. + At each non-leaf node that splits on a complementary feature, + both children are visited and the weight is multiplied by the fraction + of training samples which went to each child. 
+ + At each leaf, the value of the node is multiplied by the current + weight (weights sum to 1 for all visited terminal nodes). + + Parameters + ---------- + X : view on 2d ndarray, shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : view on 1d ndarray, shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + out : view on 1d ndarray, shape (n_samples) + The value of the partial dependence function on each grid + point. + """ + cdef: + double[::1] weight_stack = np.zeros(self.node_count, + dtype=np.float64) + SIZE_t[::1] node_idx_stack = np.zeros(self.node_count, + dtype=np.intp) + SIZE_t sample_idx + SIZE_t feature_idx + int stack_size + double left_sample_frac + double current_weight + double total_weight # used for sanity check only + Node *current_node # use a pointer to avoid copying attributes + SIZE_t current_node_idx + bint is_target_feature + SIZE_t _TREE_LEAF = TREE_LEAF # to avoid python interactions + + for sample_idx in range(X.shape[0]): + # init stacks for current sample + stack_size = 1 + node_idx_stack[0] = 0 # root node + weight_stack[0] = 1 # all the samples are in the root node + total_weight = 0 + + while stack_size > 0: + # pop the stack + stack_size -= 1 + current_node_idx = node_idx_stack[stack_size] + current_node = &self.nodes[current_node_idx] + + if current_node.left_child == _TREE_LEAF: + # leaf node + out[sample_idx] += (weight_stack[stack_size] * + self.value[current_node_idx]) + total_weight += weight_stack[stack_size] + else: + # non-leaf node + + # determine if the split feature is a target feature + is_target_feature = False + for feature_idx in range(target_features.shape[0]): + if target_features[feature_idx] == current_node.feature: + is_target_feature = True + break + + if is_target_feature: + # In this case, we push left or right child on stack + if X[sample_idx, feature_idx] <= current_node.threshold: + node_idx_stack[stack_size] = current_node.left_child + else: + node_idx_stack[stack_size] = current_node.right_child + stack_size += 1 + else: + # In this case, we push both children onto the stack, + # and give a weight proportional to the number of + # samples going through each branch. + + # push left child + node_idx_stack[stack_size] = current_node.left_child + left_sample_frac = ( + self.nodes[current_node.left_child].weighted_n_node_samples / + current_node.weighted_n_node_samples) + current_weight = weight_stack[stack_size] + weight_stack[stack_size] = current_weight * left_sample_frac + stack_size += 1 + + # push right child + node_idx_stack[stack_size] = current_node.right_child + weight_stack[stack_size] = ( + current_weight * (1 - left_sample_frac)) + stack_size += 1 + + # Sanity check. Should never happen. 
+ if not (0.999 < total_weight < 1.001): + raise ValueError("Total weight should be 1.0 but was %.9f" % + total_weight) + + +def _check_n_classes(n_classes, expected_dtype): + if n_classes.ndim != 1: + raise ValueError( + f"Wrong dimensions for n_classes from the pickle: " + f"expected 1, got {n_classes.ndim}" + ) + + if n_classes.dtype == expected_dtype: + return n_classes + + # Handles both different endianness and different bitness + if n_classes.dtype.kind == "i" and n_classes.dtype.itemsize in [4, 8]: + return n_classes.astype(expected_dtype, casting="same_kind") + + raise ValueError( + "n_classes from the pickle has an incompatible dtype:\n" + f"- expected: {expected_dtype}\n" + f"- got: {n_classes.dtype}" + ) + + +def _check_value_ndarray(value_ndarray, expected_dtype, expected_shape): + if value_ndarray.shape != expected_shape: + raise ValueError( + "Wrong shape for value array from the pickle: " + f"expected {expected_shape}, got {value_ndarray.shape}" + ) + + if not value_ndarray.flags.c_contiguous: + raise ValueError( + "value array from the pickle should be a C-contiguous array" + ) + + if value_ndarray.dtype == expected_dtype: + return value_ndarray + + # Handles different endianness + if value_ndarray.dtype.str.endswith('f8'): + return value_ndarray.astype(expected_dtype, casting='equiv') + + raise ValueError( + "value array from the pickle has an incompatible dtype:\n" + f"- expected: {expected_dtype}\n" + f"- got: {value_ndarray.dtype}" + ) + + +def _dtype_to_dict(dtype): + return {name: dt.str for name, (dt, *rest) in dtype.fields.items()} + + +def _dtype_dict_with_modified_bitness(dtype_dict): + # field names in Node struct with SIZE_t types (see sklearn/tree/_tree.pxd) + indexing_field_names = ["left_child", "right_child", "feature", "n_node_samples"] + + expected_dtype_size = str(struct.calcsize("P")) + allowed_dtype_size = "8" if expected_dtype_size == "4" else "4" + + allowed_dtype_dict = dtype_dict.copy() + for name in indexing_field_names: + allowed_dtype_dict[name] = allowed_dtype_dict[name].replace( + expected_dtype_size, allowed_dtype_size + ) + + return allowed_dtype_dict + + +def _all_compatible_dtype_dicts(dtype): + # The Cython code for decision trees uses platform-specific SIZE_t + # typed indexing fields that correspond to either i4 or i8 dtypes for + # the matching fields in the numpy array depending on the bitness of + # the platform (32 bit or 64 bit respectively). + # + # We need to cast the indexing fields of the NODE_DTYPE-dtyped array at + # pickle load time to enable cross-bitness deployment scenarios. We + # typically want to make it possible to run the expensive fit method of + # a tree estimator on a 64 bit server platform, pickle the estimator + # for deployment and run the predict method of a low power 32 bit edge + # platform. 
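As a concrete illustration of the bitness handling described here, the sketch below mimics what `_dtype_to_dict` and `_dtype_dict_with_modified_bitness` do, but on a simplified stand-in dtype rather than the real `NODE_DTYPE`; the field names are only meant to mirror the indexing fields listed above.

```python
import struct
import numpy as np

# Simplified stand-in for NODE_DTYPE; the real struct has more fields.
node_dtype = np.dtype([
    ("left_child", np.intp), ("right_child", np.intp),
    ("feature", np.intp), ("n_node_samples", np.intp),
    ("threshold", np.float64), ("impurity", np.float64),
])

def dtype_to_dict(dtype):
    # Same idea as _dtype_to_dict: field name -> dtype string such as "<i8".
    return {name: dt.str for name, (dt, *rest) in dtype.fields.items()}

indexing_fields = ["left_child", "right_child", "feature", "n_node_samples"]
native = str(struct.calcsize("P"))       # "8" on a 64-bit build, "4" on a 32-bit one
other = "8" if native == "4" else "4"

cross_bitness = {
    name: (s.replace(native, other) if name in indexing_fields else s)
    for name, s in dtype_to_dict(node_dtype).items()
}
print(dtype_to_dict(node_dtype)["left_child"], "->", cross_bitness["left_child"])
# e.g. "<i8 -> <i4": a node array pickled on the other bitness is still accepted.
```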
+ # + # A similar thing happens for endianness, the machine where the pickle was + # saved can have a different endianness than the machine where the pickle + # is loaded + + dtype_dict = _dtype_to_dict(dtype) + dtype_dict_with_modified_bitness = _dtype_dict_with_modified_bitness(dtype_dict) + dtype_dict_with_modified_endianness = _dtype_to_dict(dtype.newbyteorder()) + dtype_dict_with_modified_bitness_and_endianness = _dtype_dict_with_modified_bitness( + dtype_dict_with_modified_endianness + ) + + return [ + dtype_dict, + dtype_dict_with_modified_bitness, + dtype_dict_with_modified_endianness, + dtype_dict_with_modified_bitness_and_endianness, + ] + + +def _check_node_ndarray(node_ndarray, expected_dtype): + if node_ndarray.ndim != 1: + raise ValueError( + "Wrong dimensions for node array from the pickle: " + f"expected 1, got {node_ndarray.ndim}" + ) + + if not node_ndarray.flags.c_contiguous: + raise ValueError( + "node array from the pickle should be a C-contiguous array" + ) + + node_ndarray_dtype = node_ndarray.dtype + if node_ndarray_dtype == expected_dtype: + return node_ndarray + + node_ndarray_dtype_dict = _dtype_to_dict(node_ndarray_dtype) + all_compatible_dtype_dicts = _all_compatible_dtype_dicts(expected_dtype) + + if node_ndarray_dtype_dict not in all_compatible_dtype_dicts: + raise ValueError( + "node array from the pickle has an incompatible dtype:\n" + f"- expected: {expected_dtype}\n" + f"- got : {node_ndarray_dtype}" + ) + + return node_ndarray.astype(expected_dtype, casting="same_kind") + + +# ============================================================================= +# Build Pruned Tree +# ============================================================================= + + +cdef class _CCPPruneController: + """Base class used by build_pruned_tree_ccp and ccp_pruning_path + to control pruning. 
+ """ + cdef bint stop_pruning(self, DOUBLE_t effective_alpha) nogil: + """Return 1 to stop pruning and 0 to continue pruning""" + return 0 + + cdef void save_metrics(self, DOUBLE_t effective_alpha, + DOUBLE_t subtree_impurities) nogil: + """Save metrics when pruning""" + pass + + cdef void after_pruning(self, unsigned char[:] in_subtree) nogil: + """Called after pruning""" + pass + + +cdef class _AlphaPruner(_CCPPruneController): + """Use alpha to control when to stop pruning.""" + cdef DOUBLE_t ccp_alpha + cdef SIZE_t capacity + + def __cinit__(self, DOUBLE_t ccp_alpha): + self.ccp_alpha = ccp_alpha + self.capacity = 0 + + cdef bint stop_pruning(self, DOUBLE_t effective_alpha) nogil: + # The subtree on the previous iteration has the greatest ccp_alpha + # less than or equal to self.ccp_alpha + return self.ccp_alpha < effective_alpha + + cdef void after_pruning(self, unsigned char[:] in_subtree) nogil: + """Updates the number of leaves in subtree""" + for i in range(in_subtree.shape[0]): + if in_subtree[i]: + self.capacity += 1 + + +cdef class _PathFinder(_CCPPruneController): + """Record metrics used to return the cost complexity path.""" + cdef DOUBLE_t[:] ccp_alphas + cdef DOUBLE_t[:] impurities + cdef UINT32_t count + + def __cinit__(self, int node_count): + self.ccp_alphas = np.zeros(shape=(node_count), dtype=np.float64) + self.impurities = np.zeros(shape=(node_count), dtype=np.float64) + self.count = 0 + + cdef void save_metrics(self, + DOUBLE_t effective_alpha, + DOUBLE_t subtree_impurities) nogil: + self.ccp_alphas[self.count] = effective_alpha + self.impurities[self.count] = subtree_impurities + self.count += 1 + + +cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT + Tree orig_tree, + _CCPPruneController controller): + """Perform cost complexity pruning. + + This function takes an already grown tree, `orig_tree` and outputs a + boolean mask `leaves_in_subtree` to are the leaves in the pruned tree. The + controller signals when the pruning should stop and is passed the + metrics of the subtrees during the pruning process. 
+ + Parameters + ---------- + leaves_in_subtree : unsigned char[:] + Output for leaves of subtree + orig_tree : Tree + Original tree + ccp_controller : _CCPPruneController + Cost complexity controller + """ + + cdef: + SIZE_t i + SIZE_t n_nodes = orig_tree.node_count + # prior probability using weighted samples + DOUBLE_t[:] weighted_n_node_samples = orig_tree.weighted_n_node_samples + DOUBLE_t total_sum_weights = weighted_n_node_samples[0] + DOUBLE_t[:] impurity = orig_tree.impurity + # weighted impurity of each node + DOUBLE_t[:] r_node = np.empty(shape=n_nodes, dtype=np.float64) + + SIZE_t[:] child_l = orig_tree.children_left + SIZE_t[:] child_r = orig_tree.children_right + SIZE_t[:] parent = np.zeros(shape=n_nodes, dtype=np.intp) + + # Only uses the start and parent variables + Stack stack = Stack(INITIAL_STACK_SIZE) + StackRecord stack_record + int rc = 0 + SIZE_t node_idx + + SIZE_t[:] n_leaves = np.zeros(shape=n_nodes, dtype=np.intp) + DOUBLE_t[:] r_branch = np.zeros(shape=n_nodes, dtype=np.float64) + DOUBLE_t current_r + SIZE_t leaf_idx + SIZE_t parent_idx + + # candidate nodes that can be pruned + unsigned char[:] candidate_nodes = np.zeros(shape=n_nodes, + dtype=np.uint8) + # nodes in subtree + unsigned char[:] in_subtree = np.ones(shape=n_nodes, dtype=np.uint8) + DOUBLE_t[:] g_node = np.zeros(shape=n_nodes, dtype=np.float64) + SIZE_t pruned_branch_node_idx + DOUBLE_t subtree_alpha + DOUBLE_t effective_alpha + SIZE_t child_l_idx + SIZE_t child_r_idx + SIZE_t n_pruned_leaves + DOUBLE_t r_diff + DOUBLE_t max_float64 = np.finfo(np.float64).max + + # find parent node ids and leaves + with nogil: + + for i in range(r_node.shape[0]): + r_node[i] = ( + weighted_n_node_samples[i] * impurity[i] / total_sum_weights) + + # Push root node, using StackRecord.start as node id + rc = stack.push(0, 0, 0, -1, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + while not stack.is_empty(): + stack.pop(&stack_record) + node_idx = stack_record.start + parent[node_idx] = stack_record.parent + if child_l[node_idx] == _TREE_LEAF: + # ... and child_r[node_idx] == _TREE_LEAF: + leaves_in_subtree[node_idx] = 1 + else: + rc = stack.push(child_l[node_idx], 0, 0, node_idx, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + rc = stack.push(child_r[node_idx], 0, 0, node_idx, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + # computes number of leaves in all branches and the overall impurity of + # the branch. The overall impurity is the sum of r_node in its leaves. 
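Further down, the pruning loop repeatedly selects the candidate node with the smallest effective alpha, `(r_node[i] - r_branch[i]) / (n_leaves[i] - 1)`. A toy calculation (numbers invented purely for illustration) shows the quantity being minimized:

```python
# Toy numbers, not taken from a real tree: one internal node whose branch currently
# holds two leaves.
r_node_i   = 0.48   # weighted_n_node_samples[i] * impurity[i] / total_sum_weights
r_branch_i = 0.22   # sum of r_node over the leaves below node i (0.10 + 0.12)
n_leaves_i = 2

effective_alpha = (r_node_i - r_branch_i) / (n_leaves_i - 1)
print(effective_alpha)  # ~0.26 -> the branch collapses into a leaf once ccp_alpha >= 0.26
```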
+ for leaf_idx in range(leaves_in_subtree.shape[0]): + if not leaves_in_subtree[leaf_idx]: + continue + r_branch[leaf_idx] = r_node[leaf_idx] + + # bubble up values to ancestor nodes + current_r = r_node[leaf_idx] + while leaf_idx != 0: + parent_idx = parent[leaf_idx] + r_branch[parent_idx] += current_r + n_leaves[parent_idx] += 1 + leaf_idx = parent_idx + + for i in range(leaves_in_subtree.shape[0]): + candidate_nodes[i] = not leaves_in_subtree[i] + + # save metrics before pruning + controller.save_metrics(0.0, r_branch[0]) + + # while root node is not a leaf + while candidate_nodes[0]: + + # computes ccp_alpha for subtrees and finds the minimal alpha + effective_alpha = max_float64 + for i in range(n_nodes): + if not candidate_nodes[i]: + continue + subtree_alpha = (r_node[i] - r_branch[i]) / (n_leaves[i] - 1) + if subtree_alpha < effective_alpha: + effective_alpha = subtree_alpha + pruned_branch_node_idx = i + + if controller.stop_pruning(effective_alpha): + break + + # stack uses only the start variable + rc = stack.push(pruned_branch_node_idx, 0, 0, 0, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + # descendants of branch are not in subtree + while not stack.is_empty(): + stack.pop(&stack_record) + node_idx = stack_record.start + + if not in_subtree[node_idx]: + continue # branch has already been marked for pruning + candidate_nodes[node_idx] = 0 + leaves_in_subtree[node_idx] = 0 + in_subtree[node_idx] = 0 + + if child_l[node_idx] != _TREE_LEAF: + # ... and child_r[node_idx] != _TREE_LEAF: + rc = stack.push(child_l[node_idx], 0, 0, 0, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + rc = stack.push(child_r[node_idx], 0, 0, 0, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + leaves_in_subtree[pruned_branch_node_idx] = 1 + in_subtree[pruned_branch_node_idx] = 1 + + # updates number of leaves + n_pruned_leaves = n_leaves[pruned_branch_node_idx] - 1 + n_leaves[pruned_branch_node_idx] = 0 + + # computes the increase in r_branch to bubble up + r_diff = r_node[pruned_branch_node_idx] - r_branch[pruned_branch_node_idx] + r_branch[pruned_branch_node_idx] = r_node[pruned_branch_node_idx] + + # bubble up values to ancestors + node_idx = parent[pruned_branch_node_idx] + while node_idx != -1: + n_leaves[node_idx] -= n_pruned_leaves + r_branch[node_idx] += r_diff + node_idx = parent[node_idx] + + controller.save_metrics(effective_alpha, r_branch[0]) + + controller.after_pruning(in_subtree) + + +def _build_pruned_tree_ccp( + Tree tree, # OUT + Tree orig_tree, + DOUBLE_t ccp_alpha): + """Build a pruned tree from the original tree using cost complexity + pruning. + + The values and nodes from the original tree are copied into the pruned + tree. + + Parameters + ---------- + tree : Tree + Location to place the pruned tree + orig_tree : Tree + Original tree + ccp_alpha : positive double + Complexity parameter. The subtree with the largest cost complexity + that is smaller than ``ccp_alpha`` will be chosen. By default, + no pruning is performed. + """ + + cdef: + SIZE_t n_nodes = orig_tree.node_count + unsigned char[:] leaves_in_subtree = np.zeros( + shape=n_nodes, dtype=np.uint8) + + pruning_controller = _AlphaPruner(ccp_alpha=ccp_alpha) + + _cost_complexity_prune(leaves_in_subtree, orig_tree, pruning_controller) + + _build_pruned_tree(tree, orig_tree, leaves_in_subtree, + pruning_controller.capacity) + + +def ccp_pruning_path(Tree orig_tree): + """Computes the cost complexity pruning path. 
+ + Parameters + ---------- + tree : Tree + Original tree. + + Returns + ------- + path_info : dict + Information about pruning path with attributes: + + ccp_alphas : ndarray + Effective alphas of subtree during pruning. + + impurities : ndarray + Sum of the impurities of the subtree leaves for the + corresponding alpha value in ``ccp_alphas``. + """ + cdef: + unsigned char[:] leaves_in_subtree = np.zeros( + shape=orig_tree.node_count, dtype=np.uint8) + + path_finder = _PathFinder(orig_tree.node_count) + + _cost_complexity_prune(leaves_in_subtree, orig_tree, path_finder) + + cdef: + UINT32_t total_items = path_finder.count + cnp.ndarray ccp_alphas = np.empty(shape=total_items, + dtype=np.float64) + cnp.ndarray impurities = np.empty(shape=total_items, + dtype=np.float64) + UINT32_t count = 0 + + while count < total_items: + ccp_alphas[count] = path_finder.ccp_alphas[count] + impurities[count] = path_finder.impurities[count] + count += 1 + + return {'ccp_alphas': ccp_alphas, 'impurities': impurities} + + +cdef _build_pruned_tree( + Tree tree, # OUT + Tree orig_tree, + const unsigned char[:] leaves_in_subtree, + SIZE_t capacity): + """Build a pruned tree. + + Build a pruned tree from the original tree by transforming the nodes in + ``leaves_in_subtree`` into leaves. + + Parameters + ---------- + tree : Tree + Location to place the pruned tree + orig_tree : Tree + Original tree + leaves_in_subtree : unsigned char memoryview, shape=(node_count, ) + Boolean mask for leaves to include in subtree + capacity : SIZE_t + Number of nodes to initially allocate in pruned tree + """ + tree._resize(capacity) + + cdef: + SIZE_t orig_node_id + SIZE_t new_node_id + SIZE_t depth + SIZE_t parent + bint is_left + bint is_leaf + + # value_stride for original tree and new tree are the same + SIZE_t value_stride = orig_tree.value_stride + SIZE_t max_depth_seen = -1 + int rc = 0 + Node* node + double* orig_value_ptr + double* new_value_ptr + + # Only uses the start, depth, parent, and is_left variables + Stack stack = Stack(INITIAL_STACK_SIZE) + StackRecord stack_record + + with nogil: + # push root node onto stack + rc = stack.push(0, 0, 0, _TREE_UNDEFINED, 0, 0.0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + while not stack.is_empty(): + stack.pop(&stack_record) + + orig_node_id = stack_record.start + depth = stack_record.depth + parent = stack_record.parent + is_left = stack_record.is_left + + is_leaf = leaves_in_subtree[orig_node_id] + node = &orig_tree.nodes[orig_node_id] + + new_node_id = tree._add_node( + parent, is_left, is_leaf, node.feature, node.threshold, + node.impurity, node.n_node_samples, + node.weighted_n_node_samples) + + if new_node_id == SIZE_MAX: + rc = -1 + break + + # copy value from original tree to new tree + orig_value_ptr = orig_tree.value + value_stride * orig_node_id + new_value_ptr = tree.value + value_stride * new_node_id + memcpy(new_value_ptr, orig_value_ptr, sizeof(double) * value_stride) + + if not is_leaf: + # Push right child on stack + rc = stack.push( + node.right_child, 0, depth + 1, new_node_id, 0, 0.0, 0) + if rc == -1: + break + + # push left child on stack + rc = stack.push( + node.left_child, 0, depth + 1, new_node_id, 1, 0.0, 0) + if rc == -1: + break + + if depth > max_depth_seen: + max_depth_seen = depth + + if rc >= 0: + tree.max_depth = max_depth_seen + if rc == -1: + raise MemoryError("pruning tree") diff --git a/causalml/inference/tree/_tree/_utils.pxd b/causalml/inference/tree/_tree/_utils.pxd new file mode 100644 index 00000000..098a0028 
--- /dev/null +++ b/causalml/inference/tree/_tree/_utils.pxd @@ -0,0 +1,121 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Arnaud Joly +# Jacob Schreiber +# Nelson Liu +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +import numpy as np +cimport numpy as cnp +from ._tree cimport Node + +ctypedef cnp.npy_float32 DTYPE_t # Type of X +ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight +ctypedef cnp.npy_intp SIZE_t # Type for indices and counters +ctypedef cnp.npy_int32 INT32_t # Signed 32 bit integer +ctypedef cnp.npy_uint32 UINT32_t # Unsigned 32 bit integer + + +cdef enum: + # Max value for our rand_r replacement (near the bottom). + # We don't use RAND_MAX because it's different across platforms and + # particularly tiny on Windows/MSVC. + RAND_R_MAX = 0x7FFFFFFF + + +# safe_realloc(&p, n) resizes the allocation of p to n * sizeof(*p) bytes or +# raises a MemoryError. It never calls free, since that's __dealloc__'s job. +# cdef DTYPE_t *p = NULL +# safe_realloc(&p, n) +# is equivalent to p = malloc(n * sizeof(*p)) with error checking. +ctypedef fused realloc_ptr: + # Add pointer types here as needed. + (DTYPE_t*) + (SIZE_t*) + (unsigned char*) + (DOUBLE_t*) + (DOUBLE_t**) + (Node*) + (Node**) + (StackRecord*) + (PriorityHeapRecord*) + +cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) nogil except * + + +cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, + UINT32_t* random_state) nogil + + +cdef double rand_uniform(double low, double high, + UINT32_t* random_state) nogil + + +cdef cnp.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) + + +cdef double log(double x) nogil + +# ============================================================================= +# Stack data structure +# ============================================================================= + +# A record on the stack for depth-first tree growing +cdef struct StackRecord: + SIZE_t start + SIZE_t end + SIZE_t depth + SIZE_t parent + bint is_left + double impurity + SIZE_t n_constant_features + +cdef class Stack: + cdef SIZE_t capacity + cdef SIZE_t top + cdef StackRecord* stack_ + + cdef bint is_empty(self) nogil + cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent, + bint is_left, double impurity, + SIZE_t n_constant_features) nogil except -1 + cdef int pop(self, StackRecord* res) nogil + + +# ============================================================================= +# PriorityHeap data structure +# ============================================================================= + +# A record on the frontier for best-first tree growing +cdef struct PriorityHeapRecord: + SIZE_t node_id + SIZE_t start + SIZE_t end + SIZE_t pos + SIZE_t depth + bint is_leaf + double impurity + double impurity_left + double impurity_right + double improvement + +cdef class PriorityHeap: + cdef SIZE_t capacity + cdef SIZE_t heap_ptr + cdef PriorityHeapRecord* heap_ + + cdef bint is_empty(self) nogil + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, SIZE_t heap_length) nogil + cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, + SIZE_t depth, bint is_leaf, double improvement, + double impurity, double impurity_left, + double impurity_right) nogil except -1 + cdef int pop(self, PriorityHeapRecord* res) nogil diff --git a/causalml/inference/tree/_tree/_utils.pyx 
b/causalml/inference/tree/_tree/_utils.pyx new file mode 100755 index 00000000..4b69bc23 --- /dev/null +++ b/causalml/inference/tree/_tree/_utils.pyx @@ -0,0 +1,311 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Arnaud Joly +# Jacob Schreiber +# Nelson Liu +# +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +from libc.stdlib cimport free +from libc.stdlib cimport malloc +from libc.stdlib cimport realloc +from libc.math cimport log as ln + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +cdef inline UINT32_t DEFAULT_SEED = 1 + +# ============================================================================= +# Helper functions +# ============================================================================= + +# rand_r replacement using a 32bit XorShift generator +# See http://www.jstatsoft.org/v08/i14/paper for details +cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: + """Generate a pseudo-random np.uint32 from a np.uint32 seed""" + # seed shouldn't ever be 0. + if (seed[0] == 0): seed[0] = DEFAULT_SEED + + seed[0] ^= (seed[0] << 13) + seed[0] ^= (seed[0] >> 17) + seed[0] ^= (seed[0] << 5) + + # Note: we must be careful with the final line cast to np.uint32 so that + # the function behaves consistently across platforms. + # + # The following cast might yield different results on different platforms: + # wrong_cast = RAND_R_MAX + 1 + # + # We can use: + # good_cast = (RAND_R_MAX + 1) + # or: + # cdef np.uint32_t another_good_cast = RAND_R_MAX + 1 + return seed[0] % (RAND_R_MAX + 1) + + +cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) nogil except *: + # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython + # 0.20.1 to crash. + cdef size_t nbytes = nelems * sizeof(p[0][0]) + if nbytes / sizeof(p[0][0]) != nelems: + # Overflow in the multiplication + with gil: + raise MemoryError("could not allocate (%d * %d) bytes" + % (nelems, sizeof(p[0][0]))) + cdef realloc_ptr tmp = realloc(p[0], nbytes) + if tmp == NULL: + with gil: + raise MemoryError("could not allocate %d bytes" % nbytes) + + p[0] = tmp + return tmp # for convenience + + +def _realloc_test(): + # Helper for tests. Tries to allocate (-1) / 2 * sizeof(size_t) + # bytes, which will always overflow. + cdef SIZE_t* p = NULL + safe_realloc(&p, (-1) / 2) + if p != NULL: + free(p) + assert False + + +cdef inline cnp.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size): + """Return copied data as 1D numpy array of intp's.""" + cdef cnp.npy_intp shape[1] + shape[0] = size + return cnp.PyArray_SimpleNewFromData(1, shape, cnp.NPY_INTP, data).copy() + + +cdef inline SIZE_t rand_int(SIZE_t low, SIZE_t high, + UINT32_t* random_state) nogil: + """Generate a random integer in [low; end).""" + return low + our_rand_r(random_state) % (high - low) + + +cdef inline double rand_uniform(double low, double high, + UINT32_t* random_state) nogil: + """Generate a random double in [low; high).""" + return ((high - low) * our_rand_r(random_state) / + RAND_R_MAX) + low + + +cdef inline double log(double x) nogil: + return ln(x) / ln(2.0) + + +# ============================================================================= +# Stack data structure +# ============================================================================= + +cdef class Stack: + """A LIFO data structure. 
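The `our_rand_r` helper defined above is a 32-bit xorshift generator. A pure-Python rendition (with explicit 32-bit masking, since Python integers are unbounded; function names here are illustrative) behaves the same way:

```python
RAND_R_MAX = 0x7FFFFFFF

def our_rand_r_py(state):
    # `state` is a one-element list standing in for the UINT32_t* seed pointer.
    if state[0] == 0:
        state[0] = 1  # DEFAULT_SEED: the xorshift state must never be zero
    state[0] = (state[0] ^ (state[0] << 13)) & 0xFFFFFFFF
    state[0] = (state[0] ^ (state[0] >> 17)) & 0xFFFFFFFF
    state[0] = (state[0] ^ (state[0] << 5)) & 0xFFFFFFFF
    return state[0] % (RAND_R_MAX + 1)

def rand_int_py(low, high, state):
    # Mirrors rand_int: a pseudo-random integer drawn from [low, high).
    return low + our_rand_r_py(state) % (high - low)

state = [2023]
print([rand_int_py(0, 10, state) for _ in range(5)])  # deterministic for a given seed
```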
+ + Attributes + ---------- + capacity : SIZE_t + The elements the stack can hold; if more added then ``self.stack_`` + needs to be resized. + + top : SIZE_t + The number of elements currently on the stack. + + stack : StackRecord pointer + The stack of records (upward in the stack corresponds to the right). + """ + + def __cinit__(self, SIZE_t capacity): + self.capacity = capacity + self.top = 0 + self.stack_ = malloc(capacity * sizeof(StackRecord)) + + def __dealloc__(self): + free(self.stack_) + + cdef bint is_empty(self) nogil: + return self.top <= 0 + + cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent, + bint is_left, double impurity, + SIZE_t n_constant_features) nogil except -1: + """Push a new element onto the stack. + + Return -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + cdef SIZE_t top = self.top + cdef StackRecord* stack = NULL + + # Resize if capacity not sufficient + if top >= self.capacity: + self.capacity *= 2 + # Since safe_realloc can raise MemoryError, use `except -1` + safe_realloc(&self.stack_, self.capacity) + + stack = self.stack_ + stack[top].start = start + stack[top].end = end + stack[top].depth = depth + stack[top].parent = parent + stack[top].is_left = is_left + stack[top].impurity = impurity + stack[top].n_constant_features = n_constant_features + + # Increment stack pointer + self.top = top + 1 + return 0 + + cdef int pop(self, StackRecord* res) nogil: + """Remove the top element from the stack and copy to ``res``. + + Returns 0 if pop was successful (and ``res`` is set); -1 + otherwise. + """ + cdef SIZE_t top = self.top + cdef StackRecord* stack = self.stack_ + + if top <= 0: + return -1 + + res[0] = stack[top - 1] + self.top = top - 1 + + return 0 + + +# ============================================================================= +# PriorityHeap data structure +# ============================================================================= + +cdef class PriorityHeap: + """A priority queue implemented as a binary heap. + + The heap invariant is that the impurity improvement of the parent record + is larger then the impurity improvement of the children. + + Attributes + ---------- + capacity : SIZE_t + The capacity of the heap + + heap_ptr : SIZE_t + The water mark of the heap; the heap grows from left to right in the + array ``heap_``. The following invariant holds ``heap_ptr < capacity``. + + heap_ : PriorityHeapRecord* + The array of heap records. The maximum element is on the left; + the heap grows from left to right + """ + + def __cinit__(self, SIZE_t capacity): + self.capacity = capacity + self.heap_ptr = 0 + safe_realloc(&self.heap_, capacity) + + def __dealloc__(self): + free(self.heap_) + + cdef bint is_empty(self) nogil: + return self.heap_ptr <= 0 + + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant parent.improvement > child.improvement from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if heap[parent_pos].improvement < heap[pos].improvement: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant parent.improvement > children.improvement from + ``pos`` downwards. 
""" + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t largest = pos + + if (left_pos < heap_length and + heap[left_pos].improvement > heap[largest].improvement): + largest = left_pos + + if (right_pos < heap_length and + heap[right_pos].improvement > heap[largest].improvement): + largest = right_pos + + if largest != pos: + heap[pos], heap[largest] = heap[largest], heap[pos] + self.heapify_down(heap, largest, heap_length) + + cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, + SIZE_t depth, bint is_leaf, double improvement, + double impurity, double impurity_left, + double impurity_right) nogil except -1: + """Push record on the priority heap. + + Return -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + cdef SIZE_t heap_ptr = self.heap_ptr + cdef PriorityHeapRecord* heap = NULL + + # Resize if capacity not sufficient + if heap_ptr >= self.capacity: + self.capacity *= 2 + # Since safe_realloc can raise MemoryError, use `except -1` + safe_realloc(&self.heap_, self.capacity) + + # Put element as last element of heap + heap = self.heap_ + heap[heap_ptr].node_id = node_id + heap[heap_ptr].start = start + heap[heap_ptr].end = end + heap[heap_ptr].pos = pos + heap[heap_ptr].depth = depth + heap[heap_ptr].is_leaf = is_leaf + heap[heap_ptr].impurity = impurity + heap[heap_ptr].impurity_left = impurity_left + heap[heap_ptr].impurity_right = impurity_right + heap[heap_ptr].improvement = improvement + + # Heapify up + self.heapify_up(heap, heap_ptr) + + # Increase element count + self.heap_ptr = heap_ptr + 1 + return 0 + + cdef int pop(self, PriorityHeapRecord* res) nogil: + """Remove max element from the heap. """ + cdef SIZE_t heap_ptr = self.heap_ptr + cdef PriorityHeapRecord* heap = self.heap_ + + if heap_ptr <= 0: + return -1 + + # Take first element + res[0] = heap[0] + + # Put last element to the front + heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0] + + # Restore heap invariant + if heap_ptr > 1: + self.heapify_down(heap, 0, heap_ptr - 1) + + self.heap_ptr = heap_ptr - 1 + + return 0 diff --git a/causalml/inference/tree/causal/__init__.py b/causalml/inference/tree/causal/__init__.py old mode 100644 new mode 100755 diff --git a/causalml/inference/tree/causal/_builder.pxd b/causalml/inference/tree/causal/_builder.pxd old mode 100644 new mode 100755 index f3b8bef0..12bf7e58 --- a/causalml/inference/tree/causal/_builder.pxd +++ b/causalml/inference/tree/causal/_builder.pxd @@ -4,11 +4,11 @@ # cython: language_level=3 # cython: linetrace=True -from sklearn.tree._tree cimport Node, Tree, TreeBuilder -from sklearn.tree._tree cimport Splitter, SplitRecord -from sklearn.tree._utils cimport StackRecord, Stack -from sklearn.tree._utils cimport PriorityHeapRecord, PriorityHeap -from sklearn.tree._tree cimport SIZE_t, DOUBLE_t +from .._tree._tree cimport Node, Tree, TreeBuilder +from .._tree._splitter cimport Splitter, SplitRecord +from .._tree._utils cimport StackRecord, Stack +from .._tree._utils cimport PriorityHeapRecord, PriorityHeap +from .._tree._tree cimport SIZE_t, DOUBLE_t cdef struct FrontierRecord: @@ -24,4 +24,4 @@ cdef struct FrontierRecord: double impurity double impurity_left double impurity_right - double improvement \ No newline at end of file + double improvement diff --git a/causalml/inference/tree/causal/_builder.pyx b/causalml/inference/tree/causal/_builder.pyx old mode 100644 new mode 100755 index 5ef59873..3c9d32ef --- 
a/causalml/inference/tree/causal/_builder.pyx +++ b/causalml/inference/tree/causal/_builder.pyx @@ -58,7 +58,7 @@ cdef class DepthFirstCausalTreeBuilder(TreeBuilder): cdef int init_capacity if tree.max_depth <= 10: - init_capacity = int(2 ** (tree.max_depth + 1)) - 1 + init_capacity = (2 ** (tree.max_depth + 1)) - 1 else: init_capacity = 2047 diff --git a/causalml/inference/tree/causal/_criterion.pxd b/causalml/inference/tree/causal/_criterion.pxd old mode 100644 new mode 100755 index 4a1c5764..ebc05055 --- a/causalml/inference/tree/causal/_criterion.pxd +++ b/causalml/inference/tree/causal/_criterion.pxd @@ -5,8 +5,8 @@ # cython: linetrace=True -from sklearn.tree._criterion cimport RegressionCriterion -from sklearn.tree._criterion cimport SIZE_t, DOUBLE_t +from .._tree._criterion cimport RegressionCriterion +from .._tree._criterion cimport SIZE_t, DOUBLE_t cdef struct NodeInfo: @@ -23,4 +23,4 @@ cdef struct NodeInfo: cdef struct SplitState: NodeInfo node # current node state NodeInfo right # right split state - NodeInfo left # left split state \ No newline at end of file + NodeInfo left # left split state diff --git a/causalml/inference/tree/causal/_criterion.pyx b/causalml/inference/tree/causal/_criterion.pyx old mode 100644 new mode 100755 index 3a5a413b..4ab9366f --- a/causalml/inference/tree/causal/_criterion.pyx +++ b/causalml/inference/tree/causal/_criterion.pyx @@ -213,6 +213,7 @@ cdef class CausalRegressionCriterion(RegressionCriterion): """Compute penalty for the sample size difference between groups""" return self.groups_penalty * fabs(tr_count- ct_count) + cdef class StandardMSE(CausalRegressionCriterion): """ Standard MSE with treatment effect estimates diff --git a/causalml/inference/tree/causal/_tree.py b/causalml/inference/tree/causal/_tree.py old mode 100644 new mode 100755 index bc319590..542f8bae --- a/causalml/inference/tree/causal/_tree.py +++ b/causalml/inference/tree/causal/_tree.py @@ -4,15 +4,16 @@ from math import ceil import numpy as np -from sklearn.tree._classes import CRITERIA_REG -from sklearn.tree._classes import DTYPE, DOUBLE -from sklearn.tree._classes import SPARSE_SPLITTERS, DENSE_SPLITTERS -from sklearn.tree._classes import Tree, BaseDecisionTree -from sklearn.tree._classes import issparse, check_random_state -from sklearn.tree._criterion import Criterion -from sklearn.tree._splitter import Splitter +from scipy.sparse import issparse +from sklearn.utils import check_random_state from sklearn.utils.validation import _check_sample_weight +from .._tree._classes import DTYPE, DOUBLE +from .._tree._classes import SPARSE_SPLITTERS, DENSE_SPLITTERS +from .._tree._classes import Tree, BaseDecisionTree +from .._tree._criterion import Criterion +from .._tree._splitter import Splitter + from ._builder import DepthFirstCausalTreeBuilder, BestFirstCausalTreeBuilder from ._criterion import StandardMSE, CausalMSE, TTest @@ -21,7 +22,6 @@ "standard_mse": StandardMSE, "t_test": TTest, } -CRITERIA_REG.update(CAUSAL_TREES_CRITERIA) class BaseCausalDecisionTree(BaseDecisionTree): @@ -193,7 +193,9 @@ def fit( # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): - criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + criterion = CAUSAL_TREES_CRITERIA[self.criterion]( + self.n_outputs_, n_samples + ) criterion.eps = self.eps criterion.groups_penalty = self.groups_penalty else: diff --git a/causalml/inference/tree/causal/causalforest.py b/causalml/inference/tree/causal/causalforest.py old mode 100644 new mode 100755 index 
527762db..77992ced --- a/causalml/inference/tree/causal/causalforest.py +++ b/causalml/inference/tree/causal/causalforest.py @@ -1,23 +1,33 @@ from typing import Union + import numpy as np import forestci as fci from joblib import Parallel, delayed from warnings import catch_warnings, simplefilter, warn -from sklearn.ensemble._forest import DOUBLE, DTYPE, MAX_INT + from sklearn.exceptions import DataConversionWarning -from sklearn.ensemble._forest import compute_sample_weight, issparse -from sklearn.utils.fixes import _joblib_parallel_args from sklearn.utils.validation import check_random_state, _check_sample_weight from sklearn.utils.multiclass import type_of_target -from sklearn.ensemble._forest import ( - ForestRegressor, - RandomForestRegressor, - ExtraTreesRegressor, -) +from sklearn import __version__ as sklearn_version +from sklearn.ensemble._forest import DOUBLE, DTYPE, MAX_INT +from sklearn.ensemble._forest import ForestRegressor +from sklearn.ensemble._forest import compute_sample_weight, issparse from sklearn.ensemble._forest import _generate_sample_indices, _get_n_samples_bootstrap from .causaltree import CausalTreeRegressor +try: + from packaging.version import parse as Version +except ModuleNotFoundError: + from distutils.version import LooseVersion as Version + +if Version(sklearn_version) >= Version("1.1.0"): + _joblib_parallel_args = dict(prefer="threads") +else: + from sklearn.utils.fixes import _joblib_parallel_args + + _joblib_parallel_args = _joblib_parallel_args(prefer="threads") + def _parallel_build_trees( tree, @@ -139,12 +149,18 @@ def __init__( to train each base estimator. groups_cnt: (bool), count treatment and control groups for each node/leaf """ - super().__init__( - base_estimator=CausalTreeRegressor( - control_name=control_name, criterion=criterion, groups_cnt=groups_cnt - ), - n_estimators=n_estimators, - estimator_params=( + self._estimator = CausalTreeRegressor( + control_name=control_name, criterion=criterion, groups_cnt=groups_cnt + ) + _estimator_key = ( + "estimator" + if Version(sklearn_version) >= Version("1.2.0") + else "base_estimator" + ) + _parent_args = { + _estimator_key: self._estimator, + "n_estimators": n_estimators, + "estimator_params": ( "criterion", "control_name", "max_depth", @@ -158,14 +174,16 @@ def __init__( "min_samples_leaf", "random_state", ), - bootstrap=bootstrap, - oob_score=oob_score, - n_jobs=n_jobs, - random_state=random_state, - verbose=verbose, - warm_start=warm_start, - max_samples=max_samples, - ) + "bootstrap": bootstrap, + "oob_score": oob_score, + "n_jobs": n_jobs, + "random_state": random_state, + "verbose": verbose, + "warm_start": warm_start, + "max_samples": max_samples, + } + + super().__init__(**_parent_args) self.criterion = criterion self.control_name = control_name @@ -277,22 +295,6 @@ def _fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None): # Check parameters self._validate_estimator() - # TODO: Remove in v1.2 - if isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)): - if self.criterion == "mse": - warn( - "Criterion 'mse' was deprecated in v1.0 and will be " - "removed in version 1.2. Use `criterion='squared_error'` " - "which is equivalent.", - FutureWarning, - ) - elif self.criterion == "mae": - warn( - "Criterion 'mae' was deprecated in v1.0 and will be " - "removed in version 1.2. 
Use `criterion='absolute_error'` " - "which is equivalent.", - FutureWarning, - ) if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available if bootstrap=True") @@ -327,11 +329,10 @@ def _fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None): self._make_estimator(append=False, random_state=random_state) for i in range(n_more_estimators) ] - trees = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, - **_joblib_parallel_args(prefer="threads"), + **_joblib_parallel_args, )( delayed(_parallel_build_trees)( t, @@ -377,7 +378,7 @@ def fit(self, X: np.ndarray, treatment: np.ndarray, y: np.ndarray): Returns: self """ - X, y, w = self.base_estimator._prepare_data(X=X, y=y, treatment=treatment) + X, y, w = self._estimator._prepare_data(X=X, y=y, treatment=treatment) return self._fit(X=X, y=y, sample_weight=w) def predict(self, X: np.ndarray, with_outcomes: bool = False) -> np.ndarray: diff --git a/causalml/inference/tree/causal/causaltree.py b/causalml/inference/tree/causal/causaltree.py old mode 100644 new mode 100755 index bcf0b697..5e6c8180 --- a/causalml/inference/tree/causal/causaltree.py +++ b/causalml/inference/tree/causal/causaltree.py @@ -2,16 +2,18 @@ import sys from typing import Union -import numpy as np import tqdm +import numpy as np +from numpy import float32 as DTYPE + from pathos.pools import ProcessPool as PPool from scipy.stats import norm from sklearn.base import RegressorMixin -from sklearn.tree._tree import DTYPE from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted from causalml.inference.meta.utils import check_treatment_vector + from ._tree import BaseCausalDecisionTree from ..utils import get_tree_leaves_mask, timeit diff --git a/pyproject.toml b/pyproject.toml old mode 100644 new mode 100755 index 98f2542d..c4c0586c --- a/pyproject.toml +++ b/pyproject.toml @@ -26,11 +26,11 @@ dependencies = [ "forestci==0.6", "pathos==0.2.9", "pip>=10.0", - "numpy", + "numpy>=1.18.5", "scipy>=1.4.1", "matplotlib", - "pandas>=0.24.1", - "scikit-learn<=1.0.2", + "pandas>=0.24.1,<1.4.0", + "scikit-learn>=0.22.0", "statsmodels>=0.9.0", "Cython<=0.29.34", "seaborn", @@ -62,8 +62,8 @@ requires = [ "setuptools>=18.0", "wheel", "Cython<=0.29.34", - "numpy", - "scikit-learn<=1.0.2", + "numpy>=1.18.5", + "scikit-learn>=0.22.0", ] [project.urls] diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index bf67e9ac..6fcfb629 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +import multiprocessing as mp from setuptools import dist, setup, find_packages from setuptools.extension import Extension @@ -15,34 +16,33 @@ dist.Distribution().fetch_build_eggs(["numpy"]) from numpy import get_include as np_get_include +# fmt: off +cython_modules = [ + ("causalml.inference.tree._tree._tree", "causalml/inference/tree/_tree/_tree.pyx"), + ("causalml.inference.tree._tree._criterion", "causalml/inference/tree/_tree/_criterion.pyx"), + ("causalml.inference.tree._tree._splitter", "causalml/inference/tree/_tree/_splitter.pyx"), + ("causalml.inference.tree._tree._utils", "causalml/inference/tree/_tree/_utils.pyx"), + ("causalml.inference.tree.causal._criterion", "causalml/inference/tree/causal/_criterion.pyx"), + ("causalml.inference.tree.causal._builder", "causalml/inference/tree/causal/_builder.pyx"), + ("causalml.inference.tree.uplift", "causalml/inference/tree/uplift.pyx"), +] +# fmt: on + extensions = [ Extension( - "causalml.inference.tree.causal._criterion", - ["causalml/inference/tree/causal/_criterion.pyx"], 
- libraries=[], - include_dirs=[np_get_include()], - extra_compile_args=["-O3"], - ), - Extension( - "causalml.inference.tree.causal._builder", - ["causalml/inference/tree/causal/_builder.pyx"], - libraries=[], - include_dirs=[np_get_include()], - extra_compile_args=["-O3"], - ), - Extension( - "causalml.inference.tree.uplift", - ["causalml/inference/tree/uplift.pyx"], + name, + [source], libraries=[], include_dirs=[np_get_include()], extra_compile_args=["-O3"], - ), + ) + for name, source in cython_modules ] packages = find_packages(exclude=["tests", "tests.*"]) setup( packages=packages, - ext_modules=cythonize(extensions, annotate=True), + ext_modules=cythonize(extensions, annotate=True, nthreads=mp.cpu_count()), include_dirs=[np_get_include()], )
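The `setup.py` changes above replace the repeated `Extension(...)` blocks with a single table of `(module name, source path)` pairs and let Cython translate the modules in parallel. A condensed sketch of that pattern (module list shortened; adding a new Cython module means appending one tuple):

```python
import multiprocessing as mp
from setuptools.extension import Extension
from Cython.Build import cythonize
from numpy import get_include as np_get_include

# (importable module name, path to .pyx source); append a tuple here to add a module.
cython_modules = [
    ("causalml.inference.tree._tree._tree", "causalml/inference/tree/_tree/_tree.pyx"),
    ("causalml.inference.tree.uplift", "causalml/inference/tree/uplift.pyx"),
]

extensions = [
    Extension(name, [source], include_dirs=[np_get_include()], extra_compile_args=["-O3"])
    for name, source in cython_modules
]

# nthreads lets Cython process the .pyx files in parallel across CPU cores.
ext_modules = cythonize(extensions, annotate=True, nthreads=mp.cpu_count())
```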