[doc] Mention data consistency for categorical features. (#9678)

dmlc · Oct 24, 2023 · 3ca06ac · 3ca06ac
1 parent 5e6cb63
commit 3ca06ac
Show file tree

Hide file tree

Showing 8 changed files with 292 additions and 95 deletions.
diff --git a/demo/guide-python/cat_in_the_dat.py b/demo/guide-python/cat_in_the_dat.py
@@ -11,10 +11,13 @@
 And the data can be found at:
 https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data
 
-Also, see the tutorial for using XGBoost with categorical data:
-:doc:`/tutorials/categorical`.
+  .. versionadded:: 1.6.0
 
-    .. versionadded 1.6.0
+See Also
+--------
+- :doc:`Tutorial </tutorials/categorical>`
+- :ref:`sphx_glr_python_examples_categorical.py`
+- :ref:`sphx_glr_python_examples_cat_pipeline.py`
 
 """
 

diff --git a/demo/guide-python/cat_pipeline.py b/demo/guide-python/cat_pipeline.py
@@ -0,0 +1,145 @@
+"""
+Feature engineering pipeline for categorical data
+=================================================
+
+The script showcases how to keep the categorical data encoding consistent across
+training and inference. There are many ways to attain the same goal, this script can be
+used as a starting point.
+
+See Also
+--------
+- :doc:`Tutorial </tutorials/categorical>`
+- :ref:`sphx_glr_python_examples_categorical.py`
+- :ref:`sphx_glr_python_examples_cat_in_the_dat.py`
+
+"""
+from typing import List, Tuple
+
+import numpy as np
+import pandas as pd
+from sklearn.compose import make_column_selector, make_column_transformer
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OrdinalEncoder
+
+import xgboost as xgb
+
+
+def make_example_data() -> Tuple[pd.DataFrame, pd.Series, List[str]]:
+    """Generate data for demo."""
+    n_samples = 2048
+    rng = np.random.default_rng(1994)
+
+    # We have three categorical features, while the rest are numerical.
+    categorical_features = ["brand_id", "retailer_id", "category_id"]
+
+    df = pd.DataFrame(
+        np.random.randint(32, 96, size=(n_samples, 3)),
+        columns=categorical_features,
+    )
+
+    df["price"] = rng.integers(100, 200, size=(n_samples,))
+    df["stock_status"] = rng.choice([True, False], n_samples)
+    df["on_sale"] = rng.choice([True, False], n_samples)
+    df["label"] = rng.normal(loc=0.0, scale=1.0, size=n_samples)
+
+    X = df.drop(["label"], axis=1)
+    y = df["label"]
+
+    return X, y, categorical_features
+
+
+def native() -> None:
+    """Using the native XGBoost interface."""
+    X, y, cat_feats = make_example_data()
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, random_state=1994, test_size=0.2
+    )
+
+    # Create an encoder based on training data.
+    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
+    enc.set_output(transform="pandas")
+    enc = enc.fit(X_train[cat_feats])
+
+    def enc_transform(X: pd.DataFrame) -> pd.DataFrame:
+        # don't make change inplace so that we can have demonstrations for encoding
+        X = X.copy()
+        cat_cols = enc.transform(X[cat_feats])
+        for i, name in enumerate(cat_feats):
+            # create pd.Series based on the encoder
+            cat_cols[name] = pd.Categorical.from_codes(
+                codes=cat_cols[name].astype(np.int32), categories=enc.categories_[i]
+            )
+        X[cat_feats] = cat_cols
+        return X
+
+    # Encode the data based on fitted encoder.
+    X_train_enc = enc_transform(X_train)
+    X_test_enc = enc_transform(X_test)
+    # Train XGBoost model using the native interface.
+    Xy_train = xgb.QuantileDMatrix(X_train_enc, y_train, enable_categorical=True)
+    Xy_test = xgb.QuantileDMatrix(
+        X_test_enc, y_test, enable_categorical=True, ref=Xy_train
+    )
+    booster = xgb.train({}, Xy_train)
+    booster.predict(Xy_test)
+
+    # Following shows that data are encoded consistently.
+
+    # We first obtain result from newly encoded data
+    predt0 = booster.inplace_predict(enc_transform(X_train.head(16)))
+    # then we obtain result from already encoded data from training.
+    predt1 = booster.inplace_predict(X_train_enc.head(16))
+
+    np.testing.assert_allclose(predt0, predt1)
+
+
+def pipeline() -> None:
+    """Using the sklearn pipeline.
+
+    The ordinal encoder and the XGBoost regressor are chained into a single sklearn
+    pipeline, and predictions on a small slice of the training data are compared with
+    predictions on the full training data.
+
+    """
+    X, y, cat_feats = make_example_data()
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, random_state=3, test_size=0.2
+    )
+
+    enc = make_column_transformer(
+        (
+            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
+            # all categorical feature names end with "_id"
+            make_column_selector(pattern=".*_id"),
+        ),
+        remainder="passthrough",
+        verbose_feature_names_out=False,
+    )
+    # No need to set pandas output, we use `feature_types` to indicate the type of
+    # features.
+
+    # enc.set_output(transform="pandas")
+
+    # NOTE(review): `feature_types` follows the column order of `X_train`, while the
+    # regressor sees the output of the column transformer. This presumably lines up
+    # only because the categorical columns come first in the example data -- confirm
+    # before reusing with a different column layout.
+    feature_types = ["c" if fn in cat_feats else "q" for fn in X_train.columns]
+    reg = xgb.XGBRegressor(
+        feature_types=feature_types, enable_categorical=True, n_estimators=10
+    )
+    p = make_pipeline(enc, reg)
+    p.fit(X_train, y_train)
+    # check XGBoost is using the feature type correctly.
+    model_types = reg.get_booster().feature_types
+    assert model_types is not None
+    # `zip` stops at the shorter sequence, so only the common prefix is compared.
+    for a, b in zip(model_types, feature_types):
+        assert a == b
+
+    # Following shows that data are encoded consistently.
+
+    # We first create a slice of data that doesn't contain all the categories
+    predt0 = p.predict(X_train.iloc[:16, :])
+    # Then we use the dataframe that contains all the categories
+    predt1 = p.predict(X_train)[:16]
+
+    # The resulting encoding is the same
+    np.testing.assert_allclose(predt0, predt1)
+
+
+if __name__ == "__main__":
+    # Run both demonstrations: the sklearn pipeline first, then the native interface.
+    pipeline()
+    native()
diff --git a/demo/guide-python/categorical.py b/demo/guide-python/categorical.py
@@ -8,10 +8,13 @@
 which creates a sparse matrix and potentially increase memory usage.  This demo
 showcases the experimental categorical data support, more advanced features are planned.
 
-Also, see :doc:`the tutorial </tutorials/categorical>` for using XGBoost with
-categorical data.
+  .. versionadded:: 1.5.0
 
-    .. versionadded:: 1.5.0
+See Also
+--------
+- :doc:`Tutorial </tutorials/categorical>`
+- :ref:`sphx_glr_python_examples_cat_in_the_dat.py`
+- :ref:`sphx_glr_python_examples_cat_pipeline.py`
 
 """
 from typing import Tuple
@@ -52,11 +55,13 @@ def make_categorical(
 
 def main() -> None:
     # Use builtin categorical data support
-    # For scikit-learn interface, the input data must be pandas DataFrame or cudf
-    # DataFrame with categorical features
+
+    # For scikit-learn interface, the input data should be pandas DataFrame or cudf
+    # DataFrame with categorical features. If an numpy/cupy array is used instead, the
+    # `feature_types` for `XGBRegressor` should be set accordingly.
     X, y = make_categorical(100, 10, 4, False)
-    # Specify `enable_categorical` to True, also we use onehot encoding based split
-    # here for demonstration. For details see the document of `max_cat_to_onehot`.
+    # Specify `enable_categorical` to True, also we use onehot-encoding-based split here
+    # for demonstration. For details see the document of `max_cat_to_onehot`.
     reg = xgb.XGBRegressor(
         tree_method="hist", enable_categorical=True, max_cat_to_onehot=5, device="cuda"
     )

diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst
@@ -137,7 +137,7 @@ To build and run C++ unit tests enable tests while running CMake:
   ./testxgboost
 
 Flags like ``USE_CUDA``, ``USE_DMLC_GTEST`` are optional. For more info about how to build
-XGBoost from source, see :doc:`</build>`. One can also run all unit test using ctest tool
+XGBoost from source, see :doc:`/build`. One can also run all unit tests using ctest tool
 which provides higher flexibility. For example:
 
 .. code-block:: bash

diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst
@@ -94,11 +94,11 @@ Using native interface
 **********************
 
 The ``scikit-learn`` interface is user friendly, but lacks some features that are only
-available in native interface.  For instance users cannot compute SHAP value directly or
-use quantized :class:`DMatrix <xgboost.DMatrix>`.  Also native interface supports data
-types other than dataframe, like ``numpy/cupy array``. To use the native interface with
-categorical data, we need to pass the similar parameter to :class:`DMatrix
-<xgboost.DMatrix>` and the :func:`train <xgboost.train>` function.  For dataframe input:
+available in native interface.  For instance users cannot compute SHAP value directly.
+Also native interface supports more data types. To use the native interface with
+categorical data, we need to pass the similar parameter to :class:`~xgboost.DMatrix` or
+:py:class:`~xgboost.QuantileDMatrix` and the :func:`train <xgboost.train>` function.  For
+dataframe input:
 
 .. code:: python
 
@@ -117,7 +117,6 @@ SHAP value computation:
   # categorical features are listed as "c"
   print(booster.feature_types)
 
-
 For other types of input, like ``numpy array``, we can tell XGBoost about the feature
 types by using the ``feature_types`` parameter in :class:`DMatrix <xgboost.DMatrix>`:
 
@@ -131,7 +130,31 @@ types by using the ``feature_types`` parameter in :class:`DMatrix <xgboost.DMatr
 
 For numerical data, the feature type can be ``"q"`` or ``"float"``, while for categorical
 feature it's specified as ``"c"``.  The Dask module in XGBoost has the same interface so
-:class:`dask.Array <dask.Array>` can also be used for categorical data.
+:class:`dask.Array <dask.Array>` can also be used for categorical data. Lastly, the
+sklearn interface :py:class:`~xgboost.XGBRegressor` has the same parameter.
+
+****************
+Data Consistency
+****************
+
+XGBoost accepts parameters to indicate which feature is considered categorical, either through the ``dtypes`` of a dataframe or through the ``feature_types`` parameter. However, XGBoost by itself doesn't store information on how categories are encoded in the first place. For instance, given an encoding schema that maps music genres to integer codes:
+
+.. code-block:: python
+
+  {"acoustic": 0, "indie": 1, "blues": 2, "country": 3}
+
+XGBoost doesn't know this mapping from the input and hence cannot store it in the model. The mapping usually happens in the users' data engineering pipeline with column transformers like :py:class:`sklearn.preprocessing.OrdinalEncoder`. To make sure correct result from XGBoost, users need to keep the pipeline for transforming data consistent across training and testing data. One should watch out for errors like:
+
+.. code-block:: python
+
+  X_train["genre"] = X_train["genre"].astype("category")
+  reg = xgb.XGBRegressor(enable_categorical=True).fit(X_train, y_train)
+
+  # invalid encoding
+  X_test["genre"] = X_test["genre"].astype("category")
+  reg.predict(X_test)
+
+In the above snippet, training data and test data are encoded separately, resulting in two different encoding schemas and invalid prediction result. See :ref:`sphx_glr_python_examples_cat_pipeline.py` for a worked example using ordinal encoder.
 
 *************
 Miscellaneous

diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
@@ -821,7 +821,7 @@ class DirectoryExcursion:
 
     """
 
-    def __init__(self, path: os.PathLike, cleanup: bool = False):
+    def __init__(self, path: Union[os.PathLike, str], cleanup: bool = False):
         self.path = path
         self.curdir = os.path.normpath(os.path.abspath(os.path.curdir))
         self.cleanup = cleanup

diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
@@ -21,6 +21,7 @@ class LintersPaths:
         "tests/python/test_data_iterator.py",
         "tests/python/test_dmatrix.py",
         "tests/python/test_dt.py",
+        "tests/python/test_demos.py",
         "tests/python/test_predict.py",
         "tests/python/test_quantile_dmatrix.py",
         "tests/python/test_tree_regularization.py",
@@ -41,6 +42,7 @@ class LintersPaths:
         "demo/guide-python/cat_in_the_dat.py",
         "demo/guide-python/callbacks.py",
         "demo/guide-python/categorical.py",
+        "demo/guide-python/cat_pipeline.py",
         "demo/guide-python/feature_weights.py",
         "demo/guide-python/sklearn_parallel.py",
         "demo/guide-python/spark_estimator_examples.py",
@@ -79,6 +81,7 @@ class LintersPaths:
         "python-package/",
         # tests
         "tests/python/test_dt.py",
+        "tests/python/test_demos.py",
         "tests/python/test_data_iterator.py",
         "tests/python-gpu/test_gpu_data_iterator.py",
         "tests/python-gpu/load_pickle.py",
@@ -89,6 +92,8 @@ class LintersPaths:
         "demo/json-model/json_parser.py",
         "demo/guide-python/external_memory.py",
         "demo/guide-python/cat_in_the_dat.py",
+        "demo/guide-python/categorical.py",
+        "demo/guide-python/cat_pipeline.py",
         "demo/guide-python/feature_weights.py",
         "demo/guide-python/individual_trees.py",
         "demo/guide-python/quantile_regression.py",