From 89b674cf5033ceef9f25b252ea81adb06c72d31f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 3 Dec 2018 17:42:01 -0500 Subject: [PATCH 1/6] Added docstrings and sphinx support --- .gitignore | 3 + doc/Makefile | 20 +++++ doc/make.bat | 36 ++++++++ doc/source/conf.py | 168 +++++++++++++++++++++++++++++++++++++ doc/source/index.rst | 24 ++++++ pygbm/gradient_boosting.py | 124 +++++++++++++++++++++++++++ 6 files changed, 375 insertions(+) create mode 100644 doc/Makefile create mode 100644 doc/make.bat create mode 100644 doc/source/conf.py create mode 100644 doc/source/index.rst diff --git a/.gitignore b/.gitignore index 7b815f5..e9ee57e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,9 @@ perf.data* # Benchmark data *.gz +# doc +doc/build + # Extra files /build /dist diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..174aae6 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = pygbm +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..5e12d19 --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build +set SPHINXPROJ=pygbm + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 0000000..da1f00a --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + + +# -- Project information ----------------------------------------------------- + +project = 'pygbm' +# TODO: change authors and copyright to something like "pygbm authors"? 
+copyright = '2018, Olivier Grisel, Nicolas Hug' +author = 'Olivier Grisel, Nicolas Hug' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '0.1.0.dev0' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', # looks better than with numpydoc with RTD theme +] + +# this is needed for some reason... +# see https://github.com/numpy/numpydoc/issues/69 +numpydoc_class_members_toctree = False + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" # Needs `pip install sphinx_rtd_theme` + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pygbmdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. 
List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pygbm.tex', 'pygbm Documentation', + 'Olivier Grisel, Nicolas Hug', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pygbm', 'pygbm Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pygbm', 'pygbm Documentation', + author, 'pygbm', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Extension configuration ------------------------------------------------- diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 0000000..fd81432 --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,24 @@ +.. pygbm documentation master file, created by + sphinx-quickstart on Mon Dec 3 16:25:20 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to pygbm's documentation! +================================= + +.. .. toctree:: +.. :maxdepth: 2 +.. :caption: API Reference +.. :hidden: + +.. automodule:: pygbm.gradient_boosting + :members: + :exclude-members: BaseGradientBoostingMachine + + +.. Indices and tables +.. ================== + +.. * :ref:`genindex` +.. * :ref:`modindex` +.. * :ref:`search` diff --git a/pygbm/gradient_boosting.py b/pygbm/gradient_boosting.py index 76ab098..ddfe2fa 100644 --- a/pygbm/gradient_boosting.py +++ b/pygbm/gradient_boosting.py @@ -1,3 +1,6 @@ +""" +Gradient Boosting decision trees for classification and regression. +""" from abc import ABC, abstractmethod import numpy as np @@ -64,6 +67,20 @@ def _validate_parameters(self): f'must be strictly positive.') def fit(self, X, y): + """Fit the gradient boosting model. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + + y : array-like, shape=(n_samples,) + Target values. + + Returns + ------- + self : object + """ fit_start_time = time() acc_find_split_time = 0. # time spent finding the best splits @@ -244,6 +261,46 @@ def _should_stop(self, scores): class GradientBoostingRegressor(BaseGradientBoostingMachine, RegressorMixin): + """Scikit-learn compatible Gradient Boosting Tree for regression. + + Parameters + ---------- + loss : {'least_squares'}, optional(default='least_squares') + The loss function to use in the boosting process. + learning_rate : float, optional(default=TODO) + The learning rate, also known as *shrinkage*. This is used as a + multiplicative factor for the leaves values. + max_iter : int, optional(default=TODO) + The maximum number of iterations of the boosting process, i.e. the + maximum number of trees. + max_leaf_nodes : int, optional(default=TODO) + The maximum number of leaves for each tree. + max_depth : int, optional(default=TODO) + The maximum depth of each tree. The depth of a tree is the number of + nodes to go from the root to the deepest leaf. + min_samples_leaf : int, optional(default=TODO) + The minimum number of samples per leaf. + l2_regularization : float, optional(default=TODO) + The L2 regularization parameter. 
+    max_bins : int, optional(default=256)
+        The maximum number of bins to use. Before training, each feature of
+        the input array `X` is binned into at most `max_bins` bins, which
+        allows for a much faster training stage. Features with a small
+        number of unique values may use less than `max_bins` bins. Must be no
+        larger than 256.
+    max_no_improvement : int, optional(default=TODO)
+        TODO
+    validation_split : int or float, optional(default=TODO)
+        TODO
+    scoring : str, optional(default=TODO)
+        TODO
+    verbose : int, optional(default=0)
+        The verbosity level. If not zero, print some information about the
+        fitting process.
+    random_state : int, np.random.RandomState instance or None, \
+        optional(default=None)
+        TODO: any chance we can link to sklearn glossary?
+    """
 
     _VALID_LOSSES = ('least_squares',)
 
@@ -262,10 +319,63 @@ def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100,
                          verbose=verbose, random_state=random_state)
 
     def predict(self, X):
+        """Predict values for X.
+
+        Parameters
+        ----------
+        X : array-like, shape=(n_samples, n_features)
+            The input samples.
+        """
         return self._raw_predict(X)
 
 
 class GradientBoostingClassifier(BaseGradientBoostingMachine, ClassifierMixin):
+    """Scikit-learn compatible Gradient Boosting Tree for classification.
+
+    Parameters
+    ----------
+    loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \
+        optional(default='auto')
+        The loss function to use in the boosting process. 'binary_crossentropy'
+        (also known as logistic loss) is used for binary classification and
+        generalizes to 'categorical_crossentropy' for multiclass
+        classification. 'auto' will automatically choose either loss depending
+        on the nature of the problem.
+    learning_rate : float, optional(default=TODO)
+        The learning rate, also known as *shrinkage*. This is used as a
+        multiplicative factor for the leaves values.
+    max_iter : int, optional(default=TODO)
+        The maximum number of iterations of the boosting process, i.e. the
+        maximum number of trees for binary classification. For multiclass
+        classification, `n_classes` trees per iteration are built.
+    max_leaf_nodes : int, optional(default=TODO)
+        The maximum number of leaves for each tree.
+    max_depth : int, optional(default=TODO)
+        The maximum depth of each tree. The depth of a tree is the number of
+        nodes to go from the root to the deepest leaf.
+    min_samples_leaf : int, optional(default=TODO)
+        The minimum number of samples per leaf.
+    l2_regularization : float, optional(default=TODO)
+        The L2 regularization parameter.
+    max_bins : int, optional(default=256)
+        The maximum number of bins to use. Before training, each feature of
+        the input array `X` is binned into at most `max_bins` bins, which
+        allows for a much faster training stage. Features with a small
+        number of unique values may use less than `max_bins` bins. Must be no
+        larger than 256.
+    max_no_improvement : int, optional(default=TODO)
+        TODO
+    validation_split : int or float, optional(default=TODO)
+        TODO
+    scoring : str, optional(default=TODO)
+        TODO
+    verbose : int, optional(default=0)
+        The verbosity level. If not zero, print some information about the
+        fitting process.
+    random_state : int, np.random.RandomState instance or None, \
+        optional(default=None)
+        TODO: any chance we can link to sklearn glossary?
+    """
 
     _VALID_LOSSES = ('binary_crossentropy',)
 
@@ -285,9 +395,23 @@ def __init__(self, loss='binary_crossentropy', learning_rate=0.1,
                          verbose=verbose, random_state=random_state)
 
     def predict(self, X):
+        """Predict classes for X.
+ + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + """ return np.argmax(self.predict_proba(X), axis=1) def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + """ raw_predictions = self._raw_predict(X) return self.loss_.predict_proba(raw_predictions) From e5bfdc4400398c9ddeb79665495618a209568cfa Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 3 Dec 2018 21:30:54 -0500 Subject: [PATCH 2/6] Some more docstrings --- pygbm/gradient_boosting.py | 63 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/pygbm/gradient_boosting.py b/pygbm/gradient_boosting.py index ddfe2fa..d6f4cf3 100644 --- a/pygbm/gradient_boosting.py +++ b/pygbm/gradient_boosting.py @@ -17,6 +17,7 @@ class BaseGradientBoostingMachine(BaseEstimator, ABC): + """Base class for gradient boosting estimators.""" @abstractmethod def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, @@ -39,6 +40,10 @@ def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, self.random_state = random_state def _validate_parameters(self): + """Validate parameters passed to __init__. + + The parameters that are directly passed to the grower are checked in + TreeGrower.""" if self.loss not in _LOSSES: raise ValueError("Invalid loss {}. Accepted losses are {}.".format( @@ -198,7 +203,18 @@ def fit(self, X, y): return self def _raw_predict(self, X): - """Return the sum of the leaves values""" + """Return the sum of the leaves values over all predictors. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + y : array, shape (n_samples * n_trees_per_iteration,) + The raw predicted values. + """ # TODO: check input / check_fitted # TODO: make predictor behave correctly on pre-binned data raw_predictions = np.zeros(X.shape[0], dtype=np.float32) @@ -208,6 +224,21 @@ def _raw_predict(self, X): return raw_predictions def _predict_binned(self, X_binned): + """Predict values or classes for binned data X. + + TODO: This is incorrect now that we support classification right? This + should return classes, not the raw values from the leaves. + + Parameters + ---------- + X_binned : array-like, shape=(n_samples, n_features) + The binned input samples. Entries should be integers. + + Returns + ------- + y : array, shape (n_samples,) + The predicted values or classes. + """ predicted = np.zeros(X_binned.shape[0], dtype=np.float32) for predictor in self.predictors_: predicted += predictor.predict_binned(X_binned) @@ -325,6 +356,11 @@ def predict(self, X): ---------- X : array-like, shape=(n_samples, n_features) The input samples. + + Returns + ------- + y : array, shape (n_samples,) + The predicted values. """ return self._raw_predict(X) @@ -401,6 +437,11 @@ def predict(self, X): ---------- X : array-like, shape=(n_samples, n_features) The input samples. + + Returns + ------- + y : array, shape (n_samples,) + The predicted classes. """ return np.argmax(self.predict_proba(X), axis=1) @@ -411,6 +452,11 @@ def predict_proba(self, X): ---------- X : array-like, shape=(n_samples, n_features) The input samples. + + Returns + ------- + p : array, shape (n_samples, n_classes) + The class probabilities of the input samples. 
""" raw_predictions = self._raw_predict(X) return self.loss_.predict_proba(raw_predictions) @@ -418,7 +464,20 @@ def predict_proba(self, X): @njit(parallel=True) def _update_y_pred(leaves_data, y_pred): - """Read prediction data on the training set from the grower leaves""" + """Update y_pred by reading the predictions of the ith tree directly + form the leaves. + + Can only be used for predicting the training data. y_pred contains the + sum of the tree values from iteration 0 to i - 1. This adds the + predictions of the ith tree to y_pred. + + Parameters + ---------- + leaves_data: list of tuples (leaf.value, leaf.sample_indices) + The leaves data used to update y_pred. + y_pred : array-like, shape=(n_samples,) + The raw predictions for the training data. + """ for leaf_idx in prange(len(leaves_data)): leaf_value, sample_indices = leaves_data[leaf_idx] for sample_idx in sample_indices: From b0c8e4a8ff1ef4a1fa90da899fd1abb826be6cc7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 3 Dec 2018 22:12:16 -0500 Subject: [PATCH 3/6] Added docstrings in grower.py --- pygbm/grower.py | 194 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 171 insertions(+), 23 deletions(-) diff --git a/pygbm/grower.py b/pygbm/grower.py index a49d029..054d059 100644 --- a/pygbm/grower.py +++ b/pygbm/grower.py @@ -1,3 +1,8 @@ +""" +This module contains the TreeGrower class which builds a regression tree +fitting a Newton-Raphson step, based on the gradients and hessians of the +training data. +""" from heapq import heappush, heappop import numpy as np from time import time @@ -8,17 +13,67 @@ class TreeNode: - split_info = None # Result of the split evaluation - left_child = None # Link to left node (only for non-leaf nodes) - right_child = None # Link to right node (only for non-leaf nodes) - value = None # Prediction value (only for leaf nodes) - histograms = None # array of histogram shape = (n_features, n_bins) - sibling = None # Link to sibling node, None for root - parent = None # Link to parent node, None for root - find_split_time = 0. # time spent finding the best split - construction_speed = 0. # number of samples / find_split_time - apply_split_time = 0. # time spent splitting the node - # wheter the subtraction method was used for histogram computation + """Tree Node class used in TreeGrower. + + This isn't used for prediction purposes, only for training (see + TreePredictor). + + Parameters + ---------- + depth : int + The depth of the node, i.e. its distance from the root + samples_indices : array of int + The indices of the samples at the node + sum_gradients : float + The sum of the gradients of the samples at the nodes + sum_hessians : float + The sum of the hessians of the samples at the nodes + parent : TreeNode or None, optional(default=None) + The parent of the node. None for root. + + Attributes + ---------- + depth : int + The depth of the node, i.e. its distance from the root + samples_indices : array of int + The indices of the samples at the node + sum_gradients : float + The sum of the gradients of the samples at the nodes + sum_hessians : float + The sum of the hessians of the samples at the nodes + parent : TreeNode or None, optional(default=None) + The parent of the node. None for root. + split_info : SplitInfo or None + The result of the split evaluation + left_child : TreeNode or None + The left child of the node. None for leaves. + right_child : TreeNode or None + The right child of the node. None for leaves. 
+    value : float or None
+        The value of the leaf, as computed in finalize_leaf(). None for
+        non-leaf nodes.
+    find_split_time : float
+        The total time spent computing the histogram and finding the best
+        split at the node.
+    construction_speed : float
+        The number of samples at the node divided by find_split_time.
+    apply_split_time : float
+        The total time spent actually splitting the node, e.g. splitting
+        samples_indices into left and right child.
+    hist_subtraction : bool
+        Whether the subtraction method was used for computing the histograms.
+    """
+
+    split_info = None
+    left_child = None
+    right_child = None
+    value = None
+    histograms = None
+    sibling = None
+    parent = None
+    find_split_time = 0.
+    construction_speed = 0.
+    apply_split_time = 0.
     hist_subtraction = False
 
     def __init__(self, depth, sample_indices, sum_gradients,
@@ -40,12 +95,18 @@ def __repr__(self):
         return out
 
     def __lt__(self, other_node):
-        """Comparison for priority queue
+        """Comparison for priority queue.
 
-        Nodes with high gain are higher priority than nodes with node gain.
+        Nodes with high gain are higher priority than nodes with low gain.
 
         heapq.heappush only need the '<' operator.
-        heapq.heappop take the smallest item first (smaller ishigher priority).
+        heapq.heappop takes the smallest item first (smaller is higher
+        priority).
+
+        Parameters
+        ----------
+        other_node : TreeNode
+            The node to compare with.
         """
         if self.split_info is None or other_node.split_info is None:
             raise ValueError("Cannot compare nodes with split_info")
@@ -53,6 +114,50 @@ def __lt__(self, other_node):
 
 
 class TreeGrower:
+    """Tree grower class used to build a tree.
+
+    The tree is fitted to predict the values of a Newton-Raphson step. The
+    splits are considered in a best-first fashion, and the quality of a
+    split is defined in splitting._split_gain.
+
+    Parameters
+    ----------
+    features_data : array-like of int, shape=(n_samples, n_features)
+        The binned input samples. Must be Fortran-aligned.
+    gradients : array-like, shape=(n_samples,)
+        The gradients of each training sample. Those are the gradients of the
+        loss w.r.t the predictions, evaluated at iteration i - 1.
+    hessians : array-like, shape=(n_samples,)
+        The hessians of each training sample. Those are the hessians of the
+        loss w.r.t the predictions, evaluated at iteration i - 1.
+    max_leaf_nodes : int, optional(default=TODO)
+        The maximum number of leaves for each tree.
+    max_depth : int, optional(default=TODO)
+        The maximum depth of each tree. The depth of a tree is the number of
+        nodes to go from the root to the deepest leaf.
+    min_samples_leaf : int, optional(default=TODO)
+        The minimum number of samples per leaf.
+    min_gain_to_split : float, optional(default=0.)
+        The minimum gain needed to split a node. Splits with lower gain will
+        be ignored.
+    max_bins : int, optional(default=256)
+        The maximum number of bins. Used to define the shape of the
+        histograms.
+    n_bins_per_feature : array-like of int or int, optional(default=None)
+        The actual number of bins needed for each feature, which is lower or
+        equal to max_bins. If it's an int, all features are considered to
+        have the same number of bins. If None, all features are considered to
+        have `max_bins` bins.
+    l2_regularization : float, optional(default=TODO)
+        The L2 regularization parameter.
+    min_hessian_to_split : float, optional(default=TODO)
+        The minimum sum of hessians needed in each node. Splits that result in
+        at least one child having a sum of hessians less than
+        min_hessian_to_split are discarded.
+ shrinkage : float, optional(default=TODO) + The shrinkage parameter to apply to the leaves values, also known as + learning rate. + """ def __init__(self, features_data, all_gradients, all_hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., max_bins=256, n_bins_per_feature=None, @@ -92,6 +197,11 @@ def __init__(self, features_data, all_gradients, all_hessians, def _validate_parameters(self, features_data, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, l2_regularization, min_hessian_to_split): + """Validate parameters passed to __init__. + + Also validate parameters passed to SplittingContext because we cannot + raise exceptions in a jitclass. + """ if features_data.dtype != np.uint8: raise NotImplementedError( "Explicit feature binning required for now") @@ -119,10 +229,12 @@ def _validate_parameters(self, features_data, max_leaf_nodes, max_depth, f'must be positive.') def grow(self): + """Grow the tree, from root to leaves.""" while self.can_split_further(): self.split_next() def _intilialize_root(self): + """Initialize root node and finalize it if needed.""" n_samples = self.features_data.shape[0] depth = 0 if self.splitting_context.constant_hessian: @@ -146,13 +258,24 @@ def _intilialize_root(self): self._compute_spittability(self.root) def _compute_spittability(self, node, only_hist=False): - """Compute histograms and split_info of a node and either make it a - leaf or push it on the splittable node heap. - - only_hist is used when _compute_spittability was called by a sibling - node: we only want to compute the histograms, not finalize or push - the node. If _compute_spittability is called again by the grower on - this same node, the histograms won't be computed again. + """Compute histograms and best possible split of a node. + + If the best possible gain is 0 of if the constraints aren't met + (min_samples_leaf, min_hessian_to_split, min_gain_to_split) then the + node is finalized (transformed into a leaf), else it is pushed on + the splittable node heap. + + Parameters + ---------- + node : TreeNode + The node to evaluate. + only_hist : bool, optional (default=False) + Whether to only compute the histograms and the SplitInfo. It is + set to `True` when `_compute_spittability` was called by a + sibling node: we only want to compute the histograms (which also + computes the SplitInfo), not finalize or push the node. If + `_compute_spittability` is called again by the grower on this + same node, the histograms won't be computed again. """ # Compute split_info and histograms if not already done if node.split_info is None and node.histograms is None: @@ -199,7 +322,12 @@ def _compute_spittability(self, node, only_hist=False): def split_next(self): """Split the node with highest potential gain. - Return the two resulting nodes created by the split. + Returns + ------- + left : TreeNode + The resulting left child. + right : TreeNode + The resulting right child. """ if len(self.splittable_nodes) == 0: raise StopIteration("No more splittable nodes") @@ -259,10 +387,14 @@ def split_next(self): return left_child_node, right_child_node def can_split_further(self): + """Return True if there are still nodes to split.""" return len(self.splittable_nodes) >= 1 def _finalize_leaf(self, node): - """Compute the prediction value that minimizes the objective function + """Compute the prediction value that minimizes the objective function. + + This sets the node.value attribute (node is a leaf iff node.value is + not None). 
See Equation 5 of: XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016 @@ -273,11 +405,26 @@ def _finalize_leaf(self, node): self.finalized_leaves.append(node) def _finalize_splittable_nodes(self): + """Transform all splittable nodes into leaves. + + Used when some constraint is met e.g. maximum number of leaves or + maximum depth.""" while len(self.splittable_nodes) > 0: node = self.splittable_nodes.pop() self._finalize_leaf(node) def make_predictor(self, bin_thresholds=None): + """Make a TreePredictor object out of the current tree. + + Parameters + ---------- + bin_thresholds : array-like of floats, optional (default=None) + The actual thresholds values of each bin. + + Returns + ------- + A TreePredictor object. + """ predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE) self._fill_predictor_node_array(predictor_nodes, self.root, bin_thresholds=bin_thresholds) @@ -285,6 +432,7 @@ def make_predictor(self, bin_thresholds=None): def _fill_predictor_node_array(self, predictor_nodes, grower_node, bin_thresholds=None, next_free_idx=0): + """Helper used in make_predictor to set the TreePredictor fields.""" node = predictor_nodes[next_free_idx] node['count'] = grower_node.n_samples node['depth'] = grower_node.depth From 6d416f3716f36005bffcd66b0cafc600faf6b337 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 4 Dec 2018 14:40:40 -0500 Subject: [PATCH 4/6] Added docstrings to splitting.py --- pygbm/splitting.py | 164 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 148 insertions(+), 16 deletions(-) diff --git a/pygbm/splitting.py b/pygbm/splitting.py index 72baab1..f3e04e6 100644 --- a/pygbm/splitting.py +++ b/pygbm/splitting.py @@ -22,6 +22,29 @@ ('n_samples_right', uint32), ]) class SplitInfo: + """Pure data class to store information about a potential split. + + Parameters + ---------- + gain : float32 + The gain of the split + feature_idx : int + The index of the feature to be split + bin_idx : int + The index of the bin on which the split is made + gradient_left : float32 + The sum of the gradients of all the samples in the left child + hessian_left : float32 + The sum of the hessians of all the samples in the left child + gradient_right : float32 + The sum of the gradients of all the samples in the right child + hessian_right : float32 + The sum of the hessians of all the samples in the right child + n_samples_left : int + The number of samples in the left child + n_samples_right : int + The number of samples in the right child + """ def __init__(self, gain=-1., feature_idx=0, bin_idx=0, gradient_left=0., hessian_left=0., gradient_right=0., hessian_right=0., @@ -59,6 +82,43 @@ def __init__(self, gain=-1., feature_idx=0, bin_idx=0, ('right_indices_buffer', uint32[::1]), ]) class SplittingContext: + """Pure data class defining a splitting context. + + Ideally it would also have methods but numba does not support annotating + jitclasses (so we can't use parallel=True). This structure is + instanciated in the grower and stores all the required information to + compute the SplitInfo and histograms of each node. + + Parameters + ---------- + n_features : int + The number of features. + binned_features : array of int + The binned input samples. Must be Fortran-aligned. + max_bins : int, optional(default=256) + The maximum number of bins. Used to define the shape of the + histograms. + n_bins_per_feature : array-like of int + The actual number of bins needed for each feature, which is lower or + equal to max_bins. 
+ gradients : array-like, shape=(n_samples,) + The gradients of each training sample. Those are the gradients of the + loss w.r.t the predictions, evaluated at iteration i - 1. + hessians : array-like, shape=(n_samples,) + The hessians of each training sample. Those are the hessians of the + loss w.r.t the predictions, evaluated at iteration i - 1. + l2_regularization : float + The L2 regularization parameter. + min_hessian_to_split : float + The minimum sum of hessians needed in each node. Splits that result in + at least one child having a sum of hessians less than + min_hessian_to_split are discarded. + min_samples_leaf : int + The minimum number of samples per leaf. + min_gain_to_split : float, optional(default=0.) + The minimum gain needed to split a node. Splits with lower gain will + be ignored. + """ def __init__(self, n_features, binned_features, max_bins, n_bins_per_feature, all_gradients, all_hessians, l2_regularization, min_hessian_to_split=1e-3, @@ -109,6 +169,29 @@ def __init__(self, n_features, binned_features, max_bins, def split_indices(context, split_info, sample_indices): """Split samples into left and right arrays. + Parameters + ---------- + context : SplittingContext + The splitting context + split_ingo : SplitInfo + The SplitInfo of the node to split + sample_indices : array of int + The indices of the samples at the node to split. This is a view on + context.partition, and it is modified inplace by placing the indices + of the left child at the beginning, and the indices of the right child + at the end. + + Returns + ------- + left_indices : array of int + The indices of the samples in the left child. This is a view on + context.partition. + right_indices : array of int + The indices of the samples in the right child. This is a view on + context.partition. + + Details + ------- This is a multi-threaded implementation inspired by lightgbm. Here is a quick break down. Let's suppose we want to split a node with 24 samples named from a to x. context.partition looks like this (the * are @@ -220,14 +303,27 @@ def split_indices(context, split_info, sample_indices): @njit(parallel=True) def find_node_split(context, sample_indices): - """For each feature, find the best bin to split on by scanning data. - - This is done by calling _find_histogram_split that compute histograms - for the samples that reached this node. - - Returns the best SplitInfo among all features, along with all the feature - histograms that can be latter used to compute the sibling or children - histograms by substraction. + """For each feature, find the best bin to split on at a given node. + + Returns the best split info among all features, and the histograms of + all the features. The histograms are computed by scanning the whole + data. + + Parameters + ---------- + context : SplittingContext + The splitting context + sample_indices : array of int + The indices of the samples at the node to split. + + Returns + ------- + best_split_info : SplitInfo + The info about the best possible split among all features. + histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) + The histograms of each feature. A histogram is an array of + HISTOGRAM_DTYPE of size `max_bins` (only + `n_bins_per_features[feature]` entries are relevant). 
""" ctx = context # shorter name to avoid various line breaks @@ -242,6 +338,7 @@ def find_node_split(context, sample_indices): # This is a parallelized version of the following vanilla code: # for i range(n_samples): # ctx.ordered_gradients[i] = ctx.all_gradients[samples_indices[i]] + # Ordering the gradients and hessians helps to improve cache hit. if sample_indices.shape[0] != ctx.all_gradients.shape[0]: n_threads = numba.config.NUMBA_DEFAULT_NUM_THREADS # Each threads writes data in ordered_xx from starts[thread_idx] to @@ -292,15 +389,40 @@ def find_node_split(context, sample_indices): @njit(parallel=True) def find_node_split_subtraction(context, sample_indices, parent_histograms, sibling_histograms): - """For each feature, find the best bin to split by histogram substraction + """For each feature, find the best bin to split on at a given node. - This in turn calls _find_histogram_split_subtraction that does not need - to scan the samples from this node and can therefore be significantly - faster than computing the histograms from data. + Returns the best split info among all features, and the histograms of + all the features. + + This does the same job as find_node_split() but uses the histograms of the + parent and sibling of the node to split. This allows to use the + identity: histogram(parent) = histogram(node) - histogram(sibling), + which is significantly faster than computing the histograms from data. Returns the best SplitInfo among all features, along with all the feature histograms that can be latter used to compute the sibling or children histograms by substraction. + + Parameters + ---------- + context : SplittingContext + The splitting context + sample_indices : array of int + The indices of the samples at the node to split. + parent_histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins) + The histograms of the parent + sibling_histograms : array of HISTOGRAM_DTYPE of \ + shape(n_features, max_bins) + The histograms of the sibling + + Returns + ------- + best_split_info : SplitInfo + The info about the best possible split among all features. + histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) + The histograms of each feature. A histogram is an array of + HISTOGRAM_DTYPE of size `max_bins` (only + `n_bins_per_features[feature]` entries are relevant). """ # We can pick any feature (here the first) in the histograms to @@ -349,7 +471,10 @@ def _find_best_feature_to_split_helper(split_infos): @njit(fastmath=True) def _find_histogram_split(context, feature_idx, sample_indices): - """Compute the histogram for a given feature and return the best bin.""" + """Compute the histogram for a given feature + + Returns the best SplitInfo among all the possible bins of the feature. + """ n_samples = sample_indices.shape[0] binned_feature = context.binned_features.T[feature_idx] @@ -385,7 +510,8 @@ def _find_histogram_split_subtraction(context, feature_idx, n_samples): """Compute the histogram by substraction of parent and sibling - Uses the identity: hist(parent) = hist(left) + hist(right) + Uses the identity: hist(parent) = hist(left) + hist(right). + Returns the best SplitInfo among all the possible bins of the feature. 
""" histogram = _subtract_histograms( context.max_bins, @@ -399,7 +525,13 @@ def _find_histogram_split_subtraction(context, feature_idx, 'n_samples_left': uint32}, fastmath=True) def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples): - """Find best bin to split on and return the corresponding SplitInfo""" + """Find best bin to split on, and return the corresponding SplitInfo. + + Splits that do not satisfy the splitting constraints (min_gain_to_split, + etc.) are discarded here. If no split can satisfy the constraints, a + SplitInfo with a gain of -1 is returned. If for a given node the best + SplitInfo has a gain of -1, it is finalized into a leaf. + """ # Allocate the structure for the best split information. It can be # returned as such (with a negative gain) if the min_hessian_to_split # condition is not satisfied. Such invalid splits are later discarded by @@ -458,7 +590,7 @@ def _split_gain(gradient_left, hessian_left, gradient_right, hessian_right, sum_gradients, sum_hessians, l2_regularization): """Loss reduction - Compute the reduction in loss after taking a split compared to keeping + Compute the reduction in loss after taking a split, compared to keeping the node a leaf of the tree. See Equation 7 of: From 3721c225acb6eef09702cd68c0f5028bce8f9c11 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 4 Dec 2018 15:07:17 -0500 Subject: [PATCH 5/6] Added docstrings for binning --- pygbm/binning.py | 78 +++++++++++++++++++++++++++++++++++++------ tests/test_binning.py | 4 +-- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/pygbm/binning.py b/pygbm/binning.py index 54ae328..1242fb5 100644 --- a/pygbm/binning.py +++ b/pygbm/binning.py @@ -11,28 +11,32 @@ def find_binning_thresholds(data, max_bins=256, subsample=int(2e5), Subsample the dataset if too large as the feature-wise quantiles should be stable. + If the number of unique values for a given feature is less than + `max_bins`, then the unique values are used instead of the quantiles. + Parameters ---------- - data: array-like (n_samples, n_features) + data: array-like, shape=(n_samples, n_features) The numerical dataset to analyse. - max_bins: int + max_bins: int, optional (default=256) The number of bins to extract for each feature. As we code the binned values as 8-bit integers, max_bins should be no larger than 256. - subsample: int + subsample: int, optional (default=2e5) Number of random subsamples to consider to compute the quantiles. - random_state: int or numpy.random.RandomState or None + random_state: int or numpy.random.RandomState or None, \ + optional (default=None) Pseudo-random number generator to control the random sub-sampling. Return ------ binning_thresholds: tuple of arrays - For each feature, store the increasing numeric values that can + For each feature, stores the increasing numeric values that can be used to separate the bins. - len(binning_thresholds) == n_features - Each array has size (n_bins - 1) where: + len(binning_thresholds) == n_features. + Each array has size `(n_bins - 1)` where: n_bins == min(max_bins, len(np.unique(data[:, feature_idx]))) """ if not (2 <= max_bins <= 256): @@ -66,10 +70,23 @@ def find_binning_thresholds(data, max_bins=256, subsample=int(2e5), return tuple(binning_thresholds) -def map_to_bins(data, binning_thresholds=None, out=None): +def _map_to_bins(data, binning_thresholds=None, out=None): """Bin numerical values to discrete integer-coded levels. - # TODO: write doc for params and returned value. 
+ Parameters + ---------- + data : array-like, shape=(n_samples, n_features) + The numerical data to bin. + binning_thresholds : tuple of arrays + For each feature, stores the increasing numeric values that are + used to separate the bins. + out : array-like + If not None, write result inplace in out. + + Returns + ------- + binned_data : array of int, shape=data.shape + The binned data. """ # TODO: add support for categorical data encoded as integers # TODO: add support for sparse data (numerical or categorical) @@ -107,14 +124,42 @@ def _map_num_col_to_bins(data, binning_thresholds, binned): class BinMapper(BaseEstimator, TransformerMixin): - # TODO: write docstrings + """Transformer that maps a dataset into integer-valued bins + The bins are created in a feature-wise fashion, with equally-spaced + quantiles. + + Parameters + ---------- + max_bins : int, optional (default=256) + The maximum number of bins to use. If for a given feature the number of + unique values is less than `max_bins`, then those unique values will be + used instead of the quantiles. + subsample : int, optional (default=1e5) + If `n_samples > subsample`, then `sub_samples` samples will be randomly + choosen to compute the quantiles. + TODO: accept None? + random_state: int or numpy.random.RandomState or None, \ + optional (default=None) + Pseudo-random number generator to control the random sub-sampling. + """ def __init__(self, max_bins=256, subsample=int(1e5), random_state=None): self.max_bins = max_bins self.subsample = subsample self.random_state = random_state def fit(self, X, y=None): + """Fit data X by computing the binning thresholds. + + Parameters + ---------- + X: array-like + The data to bin + + Returns + ------- + self : object + """ X = check_array(X) self.bin_thresholds_ = find_binning_thresholds( X, self.max_bins, subsample=self.subsample, @@ -127,4 +172,15 @@ def fit(self, X, y=None): return self def transform(self, X): - return map_to_bins(X, binning_thresholds=self.bin_thresholds_) + """Bin data X. 
+ + Parameters + ---------- + X: array-like + The data to bin + + Returns + ------- + X_binned : array-like + The binned data""" + return _map_to_bins(X, binning_thresholds=self.bin_thresholds_) diff --git a/tests/test_binning.py b/tests/test_binning.py index f605645..07ec656 100644 --- a/tests/test_binning.py +++ b/tests/test_binning.py @@ -2,7 +2,7 @@ from numpy.testing import assert_array_equal, assert_allclose import pytest -from pygbm.binning import BinMapper, find_binning_thresholds, map_to_bins +from pygbm.binning import BinMapper, find_binning_thresholds, _map_to_bins DATA = np.random.RandomState(42).normal( @@ -67,7 +67,7 @@ def test_find_binning_thresholds_invalid_n_bins(): def test_map_to_bins(n_bins): bin_thresholds = find_binning_thresholds(DATA, max_bins=n_bins, random_state=0) - binned = map_to_bins(DATA, bin_thresholds) + binned = _map_to_bins(DATA, bin_thresholds) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous From f88ed96d04be8044e6206562d6c6d39201f95357 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 4 Dec 2018 15:27:51 -0500 Subject: [PATCH 6/6] Some formatting --- doc/source/index.rst | 22 +++++++++ pygbm/binning.py | 10 ++-- pygbm/gradient_boosting.py | 8 ++-- pygbm/grower.py | 14 +++--- pygbm/splitting.py | 98 +++++++++++++++++++------------------- 5 files changed, 86 insertions(+), 66 deletions(-) diff --git a/doc/source/index.rst b/doc/source/index.rst index fd81432..e98992c 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -11,10 +11,32 @@ Welcome to pygbm's documentation! .. :caption: API Reference .. :hidden: +Gradient Boosting Estimators +============================ + .. automodule:: pygbm.gradient_boosting :members: :exclude-members: BaseGradientBoostingMachine +Grower +====== + +.. automodule:: pygbm.grower + :members: + +Splitting +========= + +.. automodule:: pygbm.splitting + :members: + +Binning +======= + +.. automodule:: pygbm.binning + :members: + + .. Indices and tables .. ================== diff --git a/pygbm/binning.py b/pygbm/binning.py index 1242fb5..22d95cc 100644 --- a/pygbm/binning.py +++ b/pygbm/binning.py @@ -12,7 +12,7 @@ def find_binning_thresholds(data, max_bins=256, subsample=int(2e5), should be stable. If the number of unique values for a given feature is less than - `max_bins`, then the unique values are used instead of the quantiles. + ``max_bins``, then the unique values are used instead of the quantiles. Parameters ---------- @@ -36,8 +36,8 @@ def find_binning_thresholds(data, max_bins=256, subsample=int(2e5), For each feature, stores the increasing numeric values that can be used to separate the bins. len(binning_thresholds) == n_features. - Each array has size `(n_bins - 1)` where: - n_bins == min(max_bins, len(np.unique(data[:, feature_idx]))) + Each array has size ``(n_bins - 1)`` where: + ``n_bins == min(max_bins, len(np.unique(data[:, feature_idx])))`` """ if not (2 <= max_bins <= 256): raise ValueError(f'max_bins={max_bins} should be no smaller than 2 ' @@ -133,10 +133,10 @@ class BinMapper(BaseEstimator, TransformerMixin): ---------- max_bins : int, optional (default=256) The maximum number of bins to use. If for a given feature the number of - unique values is less than `max_bins`, then those unique values will be + unique values is less than ``max_bins``, then those unique values will be used instead of the quantiles. 
subsample : int, optional (default=1e5) - If `n_samples > subsample`, then `sub_samples` samples will be randomly + If ``n_samples > subsample``, then ``sub_samples`` samples will be randomly choosen to compute the quantiles. TODO: accept None? random_state: int or numpy.random.RandomState or None, \ diff --git a/pygbm/gradient_boosting.py b/pygbm/gradient_boosting.py index d6f4cf3..5e51e58 100644 --- a/pygbm/gradient_boosting.py +++ b/pygbm/gradient_boosting.py @@ -315,9 +315,9 @@ class GradientBoostingRegressor(BaseGradientBoostingMachine, RegressorMixin): The L2 regularization parameter. max_bins : int, optional(default=256) The maximum number of bins to use. Before training, each feature of - the input array `X` is binned into at most `max_bins` bins, which + the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small - number of unique values may use less than `max_bins` bins. Must be no + number of unique values may use less than ``max_bins`` bins. Must be no larger than 256. max_no_improvement : int, optional(default=TODO) TODO @@ -395,9 +395,9 @@ class GradientBoostingClassifier(BaseGradientBoostingMachine, ClassifierMixin): The L2 regularization parameter. max_bins : int, optional(default=256) The maximum number of bins to use. Before training, each feature of - the input array `X` is binned into at most `max_bins` bins, which + the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small - number of unique values may use less than `max_bins` bins. Must be no + number of unique values may use less than ``max_bins`` bins. Must be no larger than 256. max_no_improvement : int, optional(default=TODO) TODO diff --git a/pygbm/grower.py b/pygbm/grower.py index 054d059..0f85119 100644 --- a/pygbm/grower.py +++ b/pygbm/grower.py @@ -126,10 +126,10 @@ class TreeGrower: The binned input samples. Must be Fortran-aligned. gradients : array-like, shape=(n_samples,) The gradients of each training sample. Those are the gradients of the - loss w.r.t the predictions, evaluated at iteration i - 1. + loss w.r.t the predictions, evaluated at iteration ``i - 1``. hessians : array-like, shape=(n_samples,) The hessians of each training sample. Those are the hessians of the - loss w.r.t the predictions, evaluated at iteration i - 1. + loss w.r.t the predictions, evaluated at iteration ``i - 1``. max_leaf_nodes : int, optional(default=TODO) The maximum number of leaves for each tree. max_depth : int, optional(default=TODO) @@ -145,9 +145,9 @@ class TreeGrower: histograms. n_bins_per_feature : array-like of int or int, optional(default=None) The actual number of bins needed for each feature, which is lower or - equal to max_bins. If it's an int, all features are considered to + equal to ``max_bins``. If it's an int, all features are considered to have the same number of bins. If None, all features are considered to - have `max_bins` bins. + have ``max_bins`` bins. l2_regularization : float, optional(default=TODO) The L2 regularization parameter. min_hessian_to_split : float, optional(default=TODO) @@ -271,10 +271,10 @@ def _compute_spittability(self, node, only_hist=False): The node to evaluate. only_hist : bool, optional (default=False) Whether to only compute the histograms and the SplitInfo. 
It is - set to `True` when `_compute_spittability` was called by a + set to ``True`` when ``_compute_spittability`` was called by a sibling node: we only want to compute the histograms (which also - computes the SplitInfo), not finalize or push the node. If - `_compute_spittability` is called again by the grower on this + computes the ``SplitInfo``), not finalize or push the node. If + ``_compute_spittability`` is called again by the grower on this same node, the histograms won't be computed again. """ # Compute split_info and histograms if not already done diff --git a/pygbm/splitting.py b/pygbm/splitting.py index f3e04e6..d3ca18f 100644 --- a/pygbm/splitting.py +++ b/pygbm/splitting.py @@ -189,50 +189,48 @@ def split_indices(context, split_info, sample_indices): right_indices : array of int The indices of the samples in the right child. This is a view on context.partition. - - Details - ------- - This is a multi-threaded implementation inspired by lightgbm. - Here is a quick break down. Let's suppose we want to split a node with - 24 samples named from a to x. context.partition looks like this (the * are - indices in other leaves that we don't care about): - partition = [*************abcdefghijklmnopqrstuvwx****************] - ^ ^ - node_position node_position + node.n_samples - - Ultimately, we want to reorder the samples inside the boundaries of the - leaf (which becomes a node) to now represent the samples in its left and - right child. For example: - partition = [*************abefilmnopqrtuxcdghjksvw*****************] - ^ ^ - left_child_pos right_child_pos - Note that left_child_pos always takes the value of node_position, and - right_child_pos = left_child_pos + left_child.n_samples. The order of the - samples inside a leaf is irrelevant. - - 1. samples_indices is a view on this region a..x. We conceptually - divide it into n_threads regions. Each thread will be responsible for - its own region. Here is an example with 4 threads: - samples_indices = [abcdef|ghijkl|mnopqr|stuvwx] - 2. Each thread processes 6 = 24 // 4 entries and maps them into - left_indices_buffer or right_indices_buffer. For example, we could - have the following mapping ('.' denotes an undefined entry): - - left_indices_buffer = [abef..|il....|mnopqr|tux...] - - right_indices_buffer = [cd....|ghjk..|......|svw...] - 3. We keep track of the start positions of the regions (the '|') in - `offset_in_buffers` as well as the size of each region. We also keep - track of the number of samples put into the left/right child by each - thread. Concretely: - - left_counts = [4, 2, 6, 3] - - right_counts = [2, 4, 0, 3] - 4. Finally, we put left/right_indices_buffer back into the samples_indices, - without any undefined entries and the partition looks as expected - partition = [*************abefilmnopqrtuxcdghjksvw*****************] - - Note: We here show left/right_indices_buffer as being the same size as - sample_indices for simplicity, but in reality they are of the same size as - partition. """ + # This is a multi-threaded implementation inspired by lightgbm. + # Here is a quick break down. Let's suppose we want to split a node with + # 24 samples named from a to x. 
context.partition looks like this (the * + # are indices in other leaves that we don't care about): + # partition = [*************abcdefghijklmnopqrstuvwx****************] + # ^ ^ + # node_position node_position + node.n_samples + + # Ultimately, we want to reorder the samples inside the boundaries of the + # leaf (which becomes a node) to now represent the samples in its left and + # right child. For example: + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + # ^ ^ + # left_child_pos right_child_pos + # Note that left_child_pos always takes the value of node_position, and + # right_child_pos = left_child_pos + left_child.n_samples. The order of + # the samples inside a leaf is irrelevant. + + # 1. samples_indices is a view on this region a..x. We conceptually + # divide it into n_threads regions. Each thread will be responsible for + # its own region. Here is an example with 4 threads: + # samples_indices = [abcdef|ghijkl|mnopqr|stuvwx] + # 2. Each thread processes 6 = 24 // 4 entries and maps them into + # left_indices_buffer or right_indices_buffer. For example, we could + # have the following mapping ('.' denotes an undefined entry): + # - left_indices_buffer = [abef..|il....|mnopqr|tux...] + # - right_indices_buffer = [cd....|ghjk..|......|svw...] + # 3. We keep track of the start positions of the regions (the '|') in + # ``offset_in_buffers`` as well as the size of each region. We also keep + # track of the number of samples put into the left/right child by each + # thread. Concretely: + # - left_counts = [4, 2, 6, 3] + # - right_counts = [2, 4, 0, 3] + # 4. Finally, we put left/right_indices_buffer back into the + # samples_indices, without any undefined entries and the partition looks + # as expected + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + + # Note: We here show left/right_indices_buffer as being the same size as + # sample_indices for simplicity, but in reality they are of the same size + # as partition. binned_feature = context.binned_features.T[split_info.feature_idx] @@ -322,8 +320,8 @@ def find_node_split(context, sample_indices): The info about the best possible split among all features. histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) The histograms of each feature. A histogram is an array of - HISTOGRAM_DTYPE of size `max_bins` (only - `n_bins_per_features[feature]` entries are relevant). + HISTOGRAM_DTYPE of size ``max_bins`` (only + ``n_bins_per_features[feature]`` entries are relevant). """ ctx = context # shorter name to avoid various line breaks @@ -394,9 +392,9 @@ def find_node_split_subtraction(context, sample_indices, parent_histograms, Returns the best split info among all features, and the histograms of all the features. - This does the same job as find_node_split() but uses the histograms of the - parent and sibling of the node to split. This allows to use the - identity: histogram(parent) = histogram(node) - histogram(sibling), + This does the same job as ``find_node_split()`` but uses the histograms + of the parent and sibling of the node to split. This allows to use the + identity: ``histogram(parent) = histogram(node) - histogram(sibling)``, which is significantly faster than computing the histograms from data. Returns the best SplitInfo among all features, along with all the feature @@ -421,8 +419,8 @@ def find_node_split_subtraction(context, sample_indices, parent_histograms, The info about the best possible split among all features. 
histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins) The histograms of each feature. A histogram is an array of - HISTOGRAM_DTYPE of size `max_bins` (only - `n_bins_per_features[feature]` entries are relevant). + HISTOGRAM_DTYPE of size ``max_bins`` (only + ``n_bins_per_features[feature]`` entries are relevant). """ # We can pick any feature (here the first) in the histograms to
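
To make the documented API concrete, here is a minimal usage sketch of the estimators and the BinMapper transformer described in the docstrings above. It assumes pygbm is importable from this repository and behaves as the docstrings describe; the parameter values (max_iter=50, max_leaf_nodes=31) are illustrative choices, not defaults mandated by the patches.

# Illustrative sketch only (not part of the patch series): assumes pygbm is
# installed from this repository and that the estimators behave as the
# docstrings above describe.
import numpy as np
from pygbm.gradient_boosting import GradientBoostingRegressor
from pygbm.binning import BinMapper

rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 5)).astype(np.float32)
y = (X[:, 0] - 2 * X[:, 1] + rng.normal(scale=0.1, size=1000)).astype(np.float32)

# The estimator bins X internally (at most max_bins bins per feature) and
# then grows `max_iter` trees in a best-first fashion.
est = GradientBoostingRegressor(learning_rate=0.1, max_iter=50,
                                max_leaf_nodes=31, random_state=0)
est.fit(X, y)
print(est.predict(X[:5]))

# BinMapper can also be used on its own to inspect the binned representation:
# each value is mapped to a uint8 bin index, using at most 256 bins.
X_binned = BinMapper(max_bins=256).fit_transform(X)
print(X_binned.dtype, X_binned.shape)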