[python-package] limit when num_boost_round warnings are emitted (fixes #6324) #6579

Merged: 17 commits, Sep 3, 2024
Changes from 7 commits
55 changes: 50 additions & 5 deletions docs/Parameters.rst
@@ -17,18 +17,61 @@ This page contains descriptions of all parameters in LightGBM.
Parameters Format
-----------------

Parameters are merged together in the following order, with later items overwriting earlier ones (see the sketch after this list):

1. LightGBM's default values
2. (CLI only) configuration in a file passed like ``config=train.conf``
3. (CLI only) configuration passed via the command line
4. (Python, R) ``params`` function argument
5. (C API) ``parameters`` or ``params`` function argument
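
For example, in the Python package, a value passed through the ``params`` function
argument (step 4) overwrites LightGBM's default (step 1). A minimal sketch: the value
63 is illustrative, and 31 is LightGBM's default for ``num_leaves``:

.. code-block:: python

# LightGBM's default is num_leaves=31; the value from
# ``params`` overwrites it, so training uses 63 leaves
lgb.train(
    params={"num_leaves": 63},
    train_set=dtrain
)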

Many parameters have "aliases", alternative names which refer to the same configuration.

Where a mix of the primary parameter name and aliases is given, the primary parameter name is always preferred over any aliases.

For example, in Python:

.. code-block:: python

# use learning rate of 0.07, because 'learning_rate'
# is the primary parameter name
lgb.train(
params={
"learning_rate": 0.07,
"shrinkage_rate": 0.12
},
train_set=dtrain
)

Where multiple aliases are given, and the primary parameter name is not, the first alias
appearing in the lists returned by ``Config::parameter2aliases()`` in the C++ library is used.
Those lists are hard-coded in a fairly arbitrary way... wherever possible, avoid relying on this behavior.

For example, in Python:

.. code-block:: python

# use learning rate of 0.12, because LightGBM has a hard-coded preference for 'shrinkage_rate'
# over any other aliases, and 'learning_rate' is not provided
lgb.train(
params={
"eta": 0.19,
"shrinkage_rate": 0.12
},
train_set=dtrain
)

**CLI**

The parameters format is ``key1=value1 key2=value2 ...``.
Parameters can be set both in a config file and on the command line.
On the command line, parameters should not have spaces before or after ``=``.
In config files, each line can contain only one parameter, and ``#`` starts a comment.

If one parameter appears in both command line and config file, LightGBM will use the parameter from the command line.
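
For example, a config file and an overriding command-line value might look like the
following. This is an illustrative sketch: the file name ``train.conf`` echoes the
example above, and the parameter values are made up.

.. code-block:: text

# contents of train.conf: one parameter per line, '#' starts a comment
task = train
num_iterations = 10
learning_rate = 0.1

.. code-block:: shell

./lightgbm config=train.conf learning_rate=0.05

Here training uses ``learning_rate=0.05``, because the command-line value overrides
the one in the config file.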

For the Python and R packages, any parameters that accept a list of values (usually they have ``multi-xxx`` type, e.g. ``multi-int`` or ``multi-double``) can be specified in those languages' default array types.
For example, ``monotone_constraints`` can be specified as follows.

**Python**

Any parameters that accept multiple values should be passed as a Python list.

.. code-block:: python

params = {
@@ -38,6 +81,8 @@

**R**

Any parameters that accept multiple values should be passed as an R list.

.. code-block:: r

params <- list(
74 changes: 64 additions & 10 deletions python-package/lightgbm/engine.py
@@ -63,6 +63,62 @@ def _emit_dataset_kwarg_warning(calling_function: str, argname: str) -> None:
warnings.warn(msg, category=LGBMDeprecationWarning, stacklevel=2)


def _choose_num_iterations(num_boost_round_kwarg: int, params: Dict[str, Any]) -> Dict[str, Any]:
"""Choose number of boosting rounds.

In ``train()`` and ``cv()``, there are multiple ways to provide configuration for
the number of boosting rounds to perform:

* the ``num_boost_round`` keyword argument
* the ``num_iterations`` parameter or any of its aliases via the ``params`` dictionary

These should be preferred in the following order (first one found wins):

1. ``num_iterations`` provided via ``params`` (because it's the main parameter name)
2. any other aliases of ``num_iterations`` provided via ``params``
3. the ``num_boost_round`` keyword argument

This function handles that choice and issues helpful warnings in cases where the
result might be surprising.

Returns
-------
params : dict
Parameters, with ``"num_iterations"`` set to the preferred value and all other
aliases of ``num_iterations`` removed.
"""
num_iteration_configs_provided = {
alias: params[alias] for alias in _ConfigAliases.get("num_iterations") if alias in params
}

# now that the relevant information has been pulled out of params, it's safe to overwrite it
# with the content that should be used for training (i.e. with aliases resolved)
params = _choose_param_value(
main_param_name="num_iterations",
params=params,
default_value=num_boost_round_kwarg,
)

# if there were not multiple boosting rounds configurations provided in params,
# then by definition they cannot have conflicting values... no need to warn
if len(num_iteration_configs_provided) <= 1:
return params

# if all the aliases have the same value, no need to warn
if len(set(num_iteration_configs_provided.values())) <= 1:
return params

# if this line is reached, lightgbm should warn
value_string = ", ".join(f"{alias}={val}" for alias, val in num_iteration_configs_provided.items())
_log_warning(
f"Found conflicting values for num_iterations provided via 'params': {value_string}. "
f"LightGBM will perform up to {params['num_iterations']} boosting rounds. "
"To be confident in the maximum number of boosting rounds LightGBM will perform and to "
"suppress this warning, modify 'params' so that only one of those is present."
)
return params
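
A minimal sketch of the resolution behavior described in the docstring (the values
here are illustrative):

# conflicting values: the main parameter name wins over the alias
params = _choose_num_iterations(
    num_boost_round_kwarg=100,
    params={"num_iterations": 5, "n_iter": 6},
)
assert params["num_iterations"] == 5
# "n_iter" has been removed from params, and a warning
# about the conflicting values has been logged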


def train(
params: Dict[str, Any],
train_set: Dataset,
@@ -198,11 +254,10 @@ def train(
if callable(params["objective"]):
fobj = params["objective"]
params["objective"] = "none"
for alias in _ConfigAliases.get("num_iterations"):
if alias in params:
num_boost_round = params.pop(alias)
_log_warning(f"Found `{alias}` in params. Will use it instead of argument")
params["num_iterations"] = num_boost_round

params = _choose_num_iterations(num_boost_round_kwarg=num_boost_round, params=params)
num_boost_round = params["num_iterations"]

# setting early stopping via global params should be possible
params = _choose_param_value(
main_param_name="early_stopping_round",
@@ -733,11 +788,10 @@ def cv(
if callable(params["objective"]):
fobj = params["objective"]
params["objective"] = "none"
for alias in _ConfigAliases.get("num_iterations"):
if alias in params:
_log_warning(f"Found '{alias}' in params. Will use it instead of 'num_boost_round' argument")
num_boost_round = params.pop(alias)
params["num_iterations"] = num_boost_round

params = _choose_num_iterations(num_boost_round_kwarg=num_boost_round, params=params)
num_boost_round = params["num_iterations"]

# setting early stopping via global params should be possible
params = _choose_param_value(
main_param_name="early_stopping_round",
112 changes: 111 additions & 1 deletion tests/python_package_test/test_engine.py
@@ -24,6 +24,7 @@

from .utils import (
SERIALIZERS,
assert_silent,
dummy_obj,
load_breast_cancer,
load_digits,
@@ -4289,7 +4290,7 @@ def test_verbosity_is_respected_when_using_custom_objective(capsys):
"num_leaves": 3,
}
lgb.train({**params, "verbosity": -1}, ds, num_boost_round=1)
assert capsys.readouterr().out == ""
assert_silent(capsys)
lgb.train({**params, "verbosity": 0}, ds, num_boost_round=1)
assert "[LightGBM] [Warning] Unknown parameter: nonsense" in capsys.readouterr().out

@@ -4318,6 +4319,115 @@ def test_verbosity_can_suppress_alias_warnings(capsys, verbosity_param, verbosit
assert re.search(r"\[LightGBM\]", stdout) is None


def test_cv_only_raises_num_rounds_warning_when_expected(capsys):
X, y = make_synthetic_regression()
ds = lgb.Dataset(X, y)
base_params = {
"num_leaves": 5,
"objective": "regression",
"verbosity": -1,
}
additional_kwargs = {"return_cvbooster": True, "stratified": False}

# no warning: no aliases, all defaults
cv_bst = lgb.cv({**base_params}, ds, **additional_kwargs)
assert all(t == 100 for t in cv_bst["cvbooster"].num_trees())
assert_silent(capsys)

# no warning: no aliases, just num_boost_round
cv_bst = lgb.cv({**base_params}, ds, num_boost_round=2, **additional_kwargs)
assert all(t == 2 for t in cv_bst["cvbooster"].num_trees())
assert_silent(capsys)

# no warning: 1 alias + num_boost_round (both same value)
cv_bst = lgb.cv({**base_params, "n_iter": 3}, ds, num_boost_round=3, **additional_kwargs)
assert all(t == 3 for t in cv_bst["cvbooster"].num_trees())
assert_silent(capsys)

# no warning: 1 alias + num_boost_round (different values... value from params should win)
cv_bst = lgb.cv({**base_params, "n_iter": 4}, ds, num_boost_round=3, **additional_kwargs)
assert all(t == 4 for t in cv_bst["cvbooster"].num_trees())
assert_silent(capsys)

# no warning: 2 aliases (both same value)
cv_bst = lgb.cv({**base_params, "n_iter": 3, "num_iterations": 3}, ds, **additional_kwargs)
assert all(t == 3 for t in cv_bst["cvbooster"].num_trees())
assert_silent(capsys)

# no warning: 4 aliases (all same value)
cv_bst = lgb.cv({**base_params, "n_iter": 3, "num_trees": 3, "nrounds": 3, "max_iter": 3}, ds, **additional_kwargs)
assert all(t == 3 for t in cv_bst["cvbooster"].num_trees())
assert_silent(capsys)

# warning: 2 aliases (different values... "num_iterations" wins because it's the main param name)
with pytest.warns(UserWarning, match="LightGBM will perform up to 5 boosting rounds"):
cv_bst = lgb.cv({**base_params, "n_iter": 6, "num_iterations": 5}, ds, **additional_kwargs)
assert all(t == 5 for t in cv_bst["cvbooster"].num_trees())
# should not be any other logs (except the warning, intercepted by pytest)
assert_silent(capsys)

# warning: 2 aliases (different values... first one in the order from Config::parameter2aliases() wins)
with pytest.warns(UserWarning, match="LightGBM will perform up to 4 boosting rounds"):
cv_bst = lgb.cv({**base_params, "n_iter": 4, "max_iter": 5}, ds, **additional_kwargs)["cvbooster"]
assert all(t == 4 for t in cv_bst.num_trees())
# should not be any other logs (except the warning, intercepted by pytest)
assert_silent(capsys)


def test_train_only_raises_num_rounds_warning_when_expected(capsys):
X, y = make_synthetic_regression()
ds = lgb.Dataset(X, y)
base_params = {
"num_leaves": 5,
"objective": "regression",
"verbosity": -1,
}

# no warning: no aliases, all defaults
bst = lgb.train({**base_params}, ds)
assert bst.num_trees() == 100
assert_silent(capsys)

# no warning: no aliases, just num_boost_round
bst = lgb.train({**base_params}, ds, num_boost_round=2)
assert bst.num_trees() == 2
assert_silent(capsys)

# no warning: 1 alias + num_boost_round (both same value)
bst = lgb.train({**base_params, "n_iter": 3}, ds, num_boost_round=3)
assert bst.num_trees() == 3
assert_silent(capsys)

# no warning: 1 alias + num_boost_round (different values... value from params should win)
bst = lgb.train({**base_params, "n_iter": 4}, ds, num_boost_round=3)
assert bst.num_trees() == 4
assert_silent(capsys)

# no warning: 2 aliases (both same value)
bst = lgb.train({**base_params, "n_iter": 3, "num_iterations": 3}, ds)
assert bst.num_trees() == 3
assert_silent(capsys)

# no warning: 4 aliases (all same value)
bst = lgb.train({**base_params, "n_iter": 3, "num_trees": 3, "nrounds": 3, "max_iter": 3}, ds)
assert bst.num_trees() == 3
assert_silent(capsys)

# warning: 2 aliases (different values... "num_iterations" wins because it's the main param name)
with pytest.warns(UserWarning, match="LightGBM will perform up to 5 boosting rounds"):
bst = lgb.train({**base_params, "n_iter": 6, "num_iterations": 5}, ds)
assert bst.num_trees() == 5
# should not be any other logs (except the warning, intercepted by pytest)
assert_silent(capsys)

# warning: 2 aliases (different values... first one in the order from Config::parameter2aliases() wins)
with pytest.warns(UserWarning, match="LightGBM will perform up to 4 boosting rounds"):
bst = lgb.train({**base_params, "n_iter": 4, "max_iter": 5}, ds)
assert bst.num_trees() == 4
# should not be any other logs (except the warning, intercepted by pytest)
assert_silent(capsys)


@pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed")
def test_validate_features():
X, y = make_synthetic_regression()
46 changes: 46 additions & 0 deletions tests/python_package_test/test_sklearn.py
@@ -24,6 +24,7 @@
from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series

from .utils import (
assert_silent,
load_breast_cancer,
load_digits,
load_iris,
@@ -1336,6 +1337,51 @@ def test_verbosity_is_respected_when_using_custom_objective(capsys):
assert "[LightGBM] [Warning] Unknown parameter: nonsense" in capsys.readouterr().out


def test_fit_only_raises_num_rounds_warning_when_expected(capsys):
X, y = make_synthetic_regression()
base_kwargs = {
"num_leaves": 5,
"verbosity": -1,
}

# no warning: no aliases, all defaults
reg = lgb.LGBMRegressor(**base_kwargs).fit(X, y)
assert reg.n_estimators_ == 100
assert_silent(capsys)

# no warning: no aliases, just n_estimators
reg = lgb.LGBMRegressor(**base_kwargs, n_estimators=2).fit(X, y)
assert reg.n_estimators_ == 2
assert_silent(capsys)

# no warning: 1 alias + n_estimators (both same value)
reg = lgb.LGBMRegressor(**base_kwargs, n_estimators=3, n_iter=3).fit(X, y)
assert reg.n_estimators_ == 3
assert_silent(capsys)

# no warning: 1 alias + n_estimators (different values... value from params should win)
reg = lgb.LGBMRegressor(**base_kwargs, n_estimators=3, n_iter=4).fit(X, y)
assert reg.n_estimators_ == 4
assert_silent(capsys)

# no warning: 2 aliases (both same value)
reg = lgb.LGBMRegressor(**base_kwargs, n_iter=3, num_iterations=3).fit(X, y)
assert reg.n_estimators_ == 3
assert_silent(capsys)

# no warning: 4 aliases (all same value)
reg = lgb.LGBMRegressor(**base_kwargs, n_iter=3, num_trees=3, nrounds=3, max_iter=3).fit(X, y)
assert reg.n_estimators_ == 3
assert_silent(capsys)

# warning: 2 aliases (different values... first one in the order from Config::parameter2aliases() wins)
with pytest.warns(UserWarning, match="LightGBM will perform up to 4 boosting rounds"):
reg = lgb.LGBMRegressor(**base_kwargs, n_iter=4, max_iter=5).fit(X, y)
assert reg.n_estimators_ == 4
# should not be any other logs (except the warning, intercepted by pytest)
assert_silent(capsys)


@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
def test_getting_feature_names_in_np_input(estimator_class):
# input is a numpy array, which doesn't have feature names. LightGBM adds
19 changes: 19 additions & 0 deletions tests/python_package_test/utils.py
@@ -191,6 +191,25 @@ def pickle_and_unpickle_object(obj, serializer):
return obj_from_disk # noqa: RET504


def assert_silent(capsys) -> None:
"""
Given a ``CaptureFixture`` instance (from the ``pytest`` built-in ``capsys`` fixture),
read the recently-captured data into a variable and assert that nothing was written
to stdout or stderr.

This is just here to turn 3 lines of boilerplate into 1.

Note that this does have a side effect... ``capsys.readouterr()`` copies
from a buffer then frees it. So it will only store into ``.out`` and ``.err`` the
captured output since the last time that ``.readouterr()`` was called.

ref: https://docs.pytest.org/en/stable/how-to/capture-stdout-stderr.html
"""
captured = capsys.readouterr()
assert captured.out == ""
assert captured.err == ""
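
A hypothetical usage sketch (``quiet_step`` and ``noisy_step`` are illustrative
stand-ins, not functions from this test suite):

def test_output_is_suppressed(capsys):
    quiet_step()           # expected to write nothing to stdout/stderr
    assert_silent(capsys)  # drains the capture buffer and passes
    noisy_step()           # anything printed here is captured fresh...
    assert_silent(capsys)  # ...so this second check would fail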


# doing this here, at import time, to ensure it only runs once per import
# instead of once per assertion
_numpy_testing_supports_strict_kwarg = "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs