[tests][dask] Add voting_parallel algorithm in tests (fixes #3834) (#4088)

* include voting_parallel tree_learner in test_regressor, test_classifier and test_ranker

* remove the warnings test and add a test for the error raised when using feature_parallel

* use real tree_learner names in tests and include a test for aliases; use the actual error message in the feature_parallel error test

* split all tests with rf in test_classifier

* remove task parametrization from the tree_learner aliases test; use smaller input data in the feature_parallel error test

* define task for tree_learner aliases
jmoralez committed Apr 1, 2021
1 parent 46a20ab commit d517ba1
Showing 2 changed files with 40 additions and 27 deletions.
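
As context for the diff below, here is a minimal sketch (not part of this commit) of training through lightgbm.dask with the voting_parallel algorithm that this PR adds test coverage for. It assumes lightgbm >= 3.2 with Dask support and a local dask.distributed cluster; the cluster setup is illustrative.

import dask.array as da
from distributed import Client, LocalCluster

import lightgbm as lgb

cluster = LocalCluster(n_workers=2)  # illustrative two-worker local cluster
client = Client(cluster)

X = da.random.random((1_000, 10), chunks=(500, 10))
y = da.random.random(1_000, chunks=500)

# 'voting' (alias: 'voting_parallel') selects voting-based distributed training
dask_regressor = lgb.DaskLGBMRegressor(client=client, tree_learner='voting', n_estimators=10)
dask_regressor.fit(X, y)
assert dask_regressor.fitted_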
6 changes: 0 additions & 6 deletions python-package/lightgbm/dask.py
@@ -309,12 +309,6 @@ def _train(
         _log_warning('Parameter tree_learner set to %s, which is not allowed. Using "data" as default' % params['tree_learner'])
         params['tree_learner'] = 'data'

-    if params['tree_learner'] not in {'data', 'data_parallel'}:
-        _log_warning(
-            'Support for tree_learner %s in lightgbm.dask is experimental and may break in a future release. \n'
-            'Use "data" for a stable, well-tested interface.' % params['tree_learner']
-        )
-
     # Some passed-in parameters can be removed:
     # * 'num_machines': set automatically from Dask worker list
     # * 'num_threads': overridden to match nthreads on each Dask process
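
With the experimental-support warning removed, the only coercion left in lightgbm.dask is the fallback shown in the context lines above. A small sketch of the observable behavior (assuming a running dask.distributed Client; the parameter value is deliberately bogus):

import dask.array as da
import lightgbm as lgb

X = da.random.random((200, 10), chunks=(100, 10))
y = da.random.random(200, chunks=100)

# An unrecognized tree_learner warns and falls back to 'data';
# 'voting' and 'feature_parallel' now pass through to LightGBM unchanged.
dask_regressor = lgb.DaskLGBMRegressor(tree_learner='not-a-real-learner', n_estimators=5)
dask_regressor.fit(X, y)  # warns: 'Parameter tree_learner set to ... Using "data" as default'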
61 changes: 40 additions & 21 deletions tests/python_package_test/test_dask.py
@@ -44,6 +44,7 @@
 CLIENT_CLOSE_TIMEOUT = 120

 tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking']
+distributed_training_algorithms = ['data', 'voting']
 data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical']
 boosting_types = ['gbdt', 'dart', 'goss', 'rf']
 group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
@@ -235,14 +236,16 @@ def _unpickle(filepath, serializer):
 @pytest.mark.parametrize('output', data_output)
 @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
 @pytest.mark.parametrize('boosting_type', boosting_types)
-def test_classifier(output, task, boosting_type, client):
+@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
+def test_classifier(output, task, boosting_type, tree_learner, client):
     X, y, w, _, dX, dy, dw, _ = _create_data(
         objective=task,
         output=output
     )

     params = {
         "boosting_type": boosting_type,
+        "tree_learner": tree_learner,
         "n_estimators": 50,
         "num_leaves": 31
     }
@@ -273,7 +276,7 @@ def test_classifier(output, task, boosting_type, client):
     p2_proba = local_classifier.predict_proba(X)
     s2 = local_classifier.score(X, y)

-    if boosting_type == 'rf' and output == 'dataframe-with-categorical':
+    if boosting_type == 'rf':
         # https://github.com/microsoft/LightGBM/issues/4118
         assert_eq(s1, s2, atol=0.01)
         assert_eq(p1_proba, p2_proba, atol=0.8)
@@ -448,7 +451,8 @@ def test_training_does_not_fail_on_port_conflicts(client):

 @pytest.mark.parametrize('output', data_output)
 @pytest.mark.parametrize('boosting_type', boosting_types)
-def test_regressor(output, boosting_type, client):
+@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
+def test_regressor(output, boosting_type, tree_learner, client):
     X, y, w, _, dX, dy, dw, _ = _create_data(
         objective='regression',
         output=output
@@ -469,7 +473,7 @@ def test_regressor(output, boosting_type, client):
     dask_regressor = lgb.DaskLGBMRegressor(
         client=client,
         time_out=5,
-        tree='data',
+        tree=tree_learner,
         **params
     )
     dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
@@ -623,7 +627,8 @@ def test_regressor_quantile(output, client, alpha):
 @pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
 @pytest.mark.parametrize('group', [None, group_sizes])
 @pytest.mark.parametrize('boosting_type', boosting_types)
-def test_ranker(output, group, boosting_type, client):
+@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
+def test_ranker(output, group, boosting_type, tree_learner, client):
     if output == 'dataframe-with-categorical':
         X, y, w, g, dX, dy, dw, dg = _create_data(
             objective='ranking',
@@ -666,7 +671,7 @@ def test_ranker(output, group, boosting_type, client):
     dask_ranker = lgb.DaskLGBMRanker(
         client=client,
         time_out=5,
-        tree_learner_type='data_parallel',
+        tree_learner_type=tree_learner,
         **params
     )
     dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
@@ -961,22 +966,36 @@ def test_warns_and_continues_on_unrecognized_tree_learner(client):
     client.close(timeout=CLIENT_CLOSE_TIMEOUT)


-def test_warns_but_makes_no_changes_for_feature_or_voting_tree_learner(client):
-    X = da.random.random((1e3, 10))
-    y = da.random.random((1e3, 1))
-    for tree_learner in ['feature_parallel', 'voting']:
-        dask_regressor = lgb.DaskLGBMRegressor(
-            client=client,
-            time_out=5,
-            tree_learner=tree_learner,
-            n_estimators=1,
-            num_leaves=2
-        )
-        with pytest.warns(UserWarning, match='Support for tree_learner %s in lightgbm' % tree_learner):
-            dask_regressor = dask_regressor.fit(X, y)
+@pytest.mark.parametrize('tree_learner', ['data_parallel', 'voting_parallel'])
+def test_training_respects_tree_learner_aliases(tree_learner, client):
+    task = 'regression'
+    _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output='array')
+    dask_factory = task_to_dask_factory[task]
+    dask_model = dask_factory(
+        client=client,
+        tree_learner=tree_learner,
+        time_out=5,
+        n_estimators=10,
+        num_leaves=15
+    )
+    dask_model.fit(dX, dy, sample_weight=dw, group=dg)

-        assert dask_regressor.fitted_
-        assert dask_regressor.get_params()['tree_learner'] == tree_learner
+    assert dask_model.fitted_
+    assert dask_model.get_params()['tree_learner'] == tree_learner

+
+def test_error_on_feature_parallel_tree_learner(client):
+    X = da.random.random((100, 10), chunks=(50, 10))
+    y = da.random.random(100, chunks=50)
+    dask_regressor = lgb.DaskLGBMRegressor(
+        client=client,
+        time_out=5,
+        tree_learner='feature_parallel',
+        n_estimators=1,
+        num_leaves=2
+    )
+    with pytest.raises(lgb.basic.LightGBMError, match='Do not support feature parallel in c api'):
+        dask_regressor = dask_regressor.fit(X, y)
+
     client.close(timeout=CLIENT_CLOSE_TIMEOUT)
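
The aliases test above relies on a task_to_dask_factory mapping defined elsewhere in the test module; a plausible sketch of it, inferred from usage here rather than copied from the source, would be:

import lightgbm as lgb

# Inferred from usage in test_training_respects_tree_learner_aliases;
# the real mapping lives elsewhere in tests/python_package_test/test_dask.py.
task_to_dask_factory = {
    'regression': lgb.DaskLGBMRegressor,
    'binary-classification': lgb.DaskLGBMClassifier,
    'multiclass-classification': lgb.DaskLGBMClassifier,
    'ranking': lgb.DaskLGBMRanker,
}

The assertion dask_model.get_params()['tree_learner'] == tree_learner then checks that the estimator echoes back whichever alias spelling ('data_parallel', 'voting_parallel') the caller passed, rather than normalizing it.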
