From 52a981e7a67c3d6fab9ba918e8efbfc8d7a1b5ef Mon Sep 17 00:00:00 2001
From: Guolin Ke
Date: Tue, 29 Sep 2020 18:10:07 +0800
Subject: [PATCH 01/67] fix int64 write error

---
 R-package/src/lightgbm_R.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp
index f6dc82e9bd04..27854b606251 100644
--- a/R-package/src/lightgbm_R.cpp
+++ b/R-package/src/lightgbm_R.cpp
@@ -506,7 +506,8 @@ LGBM_SE LGBM_BoosterGetNumPredict_R(LGBM_SE handle,
   R_API_BEGIN();
   int64_t len;
   CHECK_CALL(LGBM_BoosterGetNumPredict(R_GET_PTR(handle), R_AS_INT(data_idx), &len));
-  R_INT64_PTR(out)[0] = len;
+  // R_INT64_PTR(out)[0] = len;
+  R_INT_PTR(out)[0] = static_cast<int>(len);
   R_API_END();
 }

From d8dff3689ca0f99f71b1fef08171ebdb1c8f0779 Mon Sep 17 00:00:00 2001
From: Guolin Ke
Date: Tue, 29 Sep 2020 19:59:37 +0800
Subject: [PATCH 02/67] attempt

---
 R-package/src/lightgbm_R.cpp     |  2 ++
 src/boosting/gbdt_model_text.cpp | 11 +++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp
index 27854b606251..8aacf2825c52 100644
--- a/R-package/src/lightgbm_R.cpp
+++ b/R-package/src/lightgbm_R.cpp
@@ -672,6 +672,8 @@ LGBM_SE LGBM_BoosterDumpModel_R(LGBM_SE handle,
   std::vector<char> inner_char_buf(R_AS_INT(buffer_len));
   CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, R_AS_INT(num_iteration), R_AS_INT(feature_importance_type), R_AS_INT(buffer_len), &out_len, inner_char_buf.data()));
   EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast<size_t>(out_len));
+  inner_char_buf.clear();
+  inner_char_buf.shrink_to_fit();
   R_API_END();
 }

diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp
index 4eeb731f587f..0ca7d947387b 100644
--- a/src/boosting/gbdt_model_text.cpp
+++ b/src/boosting/gbdt_model_text.cpp
@@ -358,11 +358,16 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int
   ss << "tree_sizes=" << Common::Join(tree_sizes, " ") << '\n';
   ss << '\n';
+  tree_sizes.clear();
+  tree_sizes.shrink_to_fit();

   for (int i = 0; i < num_used_model - start_model; ++i) {
     ss << tree_strs[i];
     tree_strs[i].clear();
+    tree_strs[i].shrink_to_fit();
   }
+  tree_strs.clear();
+  tree_strs.shrink_to_fit();

   ss << "end of trees" << "\n";
   std::vector<double> feature_importances = FeatureImportance(
       num_iteration, feature_importance_type);
@@ -374,6 +379,8 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int
       pairs.emplace_back(feature_importances_int, feature_names_[i]);
     }
   }
+  feature_importances.clear();
+  feature_importances.shrink_to_fit();
   // sort the importance
   std::stable_sort(pairs.begin(), pairs.end(),
                    [](const std::pair<size_t, std::string>& lhs,
@@ -393,7 +400,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int
     ss << loaded_parameter_ << "\n";
     ss << "end of parameters" << '\n';
   }
-  return ss.str();
+  return std::move(ss.str());
 }

 bool GBDT::SaveModelToFile(int start_iteration, int num_iteration, int feature_importance_type, const char* filename) const {
@@ -618,7 +625,7 @@ std::vector<double> GBDT::FeatureImportance(int num_iteration, int importance_ty
   } else {
     Log::Fatal("Unknown importance type: only support split=0 and gain=1");
   }
-  return feature_importances;
+  return std::move(feature_importances);
 }

 }  // namespace LightGBM

From 77183398b4fa58f05ef3f7fb84768a764d951572 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 7 Oct 2020 22:49:29 -0500
Subject: [PATCH 03/67] [WIP] [ci] [R-package] Add CI job that runs
 valgrind tests

---
 .ci/test_r_package_valgrind.sh  | 47 +++++++++++++++++++++++++++++++++
 .github/workflows/r_package.yml | 22 +++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 .ci/test_r_package_valgrind.sh

diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh
new file mode 100644
index 000000000000..e88fae2dab6d
--- /dev/null
+++ b/.ci/test_r_package_valgrind.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+cd R-package/tests
+
+RDvalgrind \
+    --no-readline \
+    --vanilla \
+    -d valgrind \
+    -f testthat.R \
+    2>&1 > out.log
+
+cat out.log | grep -E "^\=" > valgrind-logs.log
+
+bytes_definitely_lost=$(
+    cat valgrind-logs.log \
+    | grep -E "definitely lost\: .*" \
+    | sed 's/^.*definitely lost\: \(.*\) bytes.*$/\1/' \
+    | tr -d ","
+)
+if [[ ${bytes_definitely_lost} -gt 0 ]]; then
+    echo "valgrind found ${bytes_definitely_lost} bytes definitely lost"
+    exit -1
+fi
+
+bytes_indirectly_lost=$(
+    cat valgrind-logs.log \
+    | grep -E "indirectly lost\: .*" \
+    | sed 's/^.*indirectly lost\: \(.*\) bytes.*$/\1/' \
+    | tr -d ","
+)
+if [[ ${bytes_indirectly_lost} -gt 0 ]]; then
+    echo "valgrind found ${bytes_indirectly_lost} bytes indirectly lost"
+    exit -1
+fi
+
+bytes_possibly_lost=$(
+    cat valgrind-logs.log \
+    | grep -E "possibly lost\: .*" \
+    | sed 's/^.*possibly lost\: \(.*\) bytes.*$/\1/' \
+    | tr -d ","
+)
+if [[ ${bytes_possibly_lost} -gt 0 ]]; then
+    echo "valgrind found ${bytes_possibly_lost} bytes possibly lost"
+    exit -1
+fi
+
+exit 0
diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml
index 1bd68de2a7d3..3f84c0165197 100644
--- a/.github/workflows/r_package.yml
+++ b/.github/workflows/r_package.yml
@@ -159,6 +159,28 @@ jobs:
           $env:TASK = "${{ matrix.task }}"
           conda init powershell
           & "$env:GITHUB_WORKSPACE/.ci/test_windows.ps1"
+  test-r-valgrind:
+    name: r-package (ubuntu-latest, R-devel, valgrind)
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+    container:
+      image: wch1/r-debug
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v1
+        with:
+          fetch-depth: 5
+          submodules: true
+      - name: install
+        shell: bash
+        run: |
+          Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'testthat'), repos = 'http://cran.r-project.org')"
+          sh build-cran-package.sh
+          RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz
+      - name: test
+        shell: bash
+        run: |
+          ./.ci/test_r_package_valgrind.sh
   all-successful:
     # https://github.community/t/is-it-possible-to-require-all-github-actions-tasks-to-pass-without-enumerating-them/117957/4?u=graingert
     runs-on: ubuntu-latest

From 5fcd2fec040b025573f8c0a7f3d04cc4675c233b Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 7 Oct 2020 22:56:37 -0500
Subject: [PATCH 04/67] update all-successful

---
 .github/workflows/r_package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml
index 3f84c0165197..3660f282e583 100644
--- a/.github/workflows/r_package.yml
+++ b/.github/workflows/r_package.yml
@@ -184,7 +184,7 @@ jobs:
   all-successful:
     # https://github.community/t/is-it-possible-to-require-all-github-actions-tasks-to-pass-without-enumerating-them/117957/4?u=graingert
     runs-on: ubuntu-latest
-    needs: [test]
+    needs: [test, test-r-valgrind]
     steps:
       - name: Note that all tests succeeded
        run: echo "🎉"

From 1d3762ee841478ade0a67c3eec1e64b48d2127a6 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 7 Oct 2020 23:01:50 -0500
Subject: [PATCH 05/67] install

---
 .github/workflows/r_package.yml | 2 +-
 1 file
changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index 3660f282e583..27936d8be6af 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -174,7 +174,7 @@ jobs: - name: install shell: bash run: | - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'testthat'), repos = 'http://cran.r-project.org')" + RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'testthat'), repos = 'http://cran.r-project.org')" sh build-cran-package.sh RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz - name: test From 3f217a34ffdb8845b3ede0716fe24340608044be Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 7 Oct 2020 23:10:48 -0500 Subject: [PATCH 06/67] executable --- .ci/test_r_package_valgrind.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 .ci/test_r_package_valgrind.sh diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh old mode 100644 new mode 100755 From e80b68d8d9ba503eb23dd4306479ace793301266 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 10 Oct 2020 18:44:57 -0500 Subject: [PATCH 07/67] fix redirect stuff --- .ci/test_r_package_valgrind.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh index e88fae2dab6d..30280f446369 100755 --- a/.ci/test_r_package_valgrind.sh +++ b/.ci/test_r_package_valgrind.sh @@ -7,7 +7,9 @@ RDvalgrind \ --vanilla \ -d valgrind \ -f testthat.R \ - 2>&1 > out.log + &> out.log || exit -1 + +cat out.log cat out.log | grep -E "^\=" > valgrind-logs.log From 1c65707cb690ec523e52d48d095d1387c0a8f5d7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 11 Oct 2020 02:56:13 +0100 Subject: [PATCH 08/67] Apply suggestions from code review Co-authored-by: Guolin Ke --- R-package/src/lightgbm_R.cpp | 1 - src/boosting/gbdt_model_text.cpp | 4 ---- 2 files changed, 5 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 8aacf2825c52..2c5bbcf0dc56 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -673,7 +673,6 @@ LGBM_SE LGBM_BoosterDumpModel_R(LGBM_SE handle, CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, R_AS_INT(num_iteration), R_AS_INT(feature_importance_type), R_AS_INT(buffer_len), &out_len, inner_char_buf.data())); EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); inner_char_buf.clear(); - inner_char_buf.shrink_to_fit(); R_API_END(); } diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 0ca7d947387b..22eac4aaa7e1 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -359,15 +359,12 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int ss << "tree_sizes=" << Common::Join(tree_sizes, " ") << '\n'; ss << '\n'; tree_sizes.clear(); - tree_sizes.shrink_to_fit(); for (int i = 0; i < num_used_model - start_model; ++i) { ss << tree_strs[i]; tree_strs[i].clear(); - tree_strs[i].shrink_to_fit(); } tree_strs.clear(); - tree_strs.shrink_to_fit(); ss << "end of trees" << "\n"; std::vector feature_importances = FeatureImportance( num_iteration, feature_importance_type); @@ -380,7 +377,6 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int } } feature_importances.clear(); - feature_importances.shrink_to_fit(); // sort the importance std::stable_sort(pairs.begin(), 
pairs.end(), [](const std::pair& lhs, From 53f82f52500bf92eaed88b8a8a0c3496cb0c9050 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 10 Oct 2020 21:46:01 -0500 Subject: [PATCH 09/67] more flags --- .ci/test_r_package_valgrind.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh index 30280f446369..d3d1822b4e1a 100755 --- a/.ci/test_r_package_valgrind.sh +++ b/.ci/test_r_package_valgrind.sh @@ -5,7 +5,7 @@ cd R-package/tests RDvalgrind \ --no-readline \ --vanilla \ - -d valgrind \ + -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes" \ -f testthat.R \ &> out.log || exit -1 From 74676c327cfde9a2a55590200cd7264e5b0a3168 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Sun, 11 Oct 2020 18:13:46 +0800 Subject: [PATCH 10/67] add mc to msvc proj --- windows/LightGBM.vcxproj | 3 ++- windows/LightGBM.vcxproj.filters | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 9dd319527229..beb7ae7f081e 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -287,6 +287,7 @@ + @@ -326,4 +327,4 @@ - + \ No newline at end of file diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 8540b2d5f297..9490e3655387 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -222,6 +222,9 @@ include\LightGBM\utils\yamc + + src\treelearner + From 7a91f40aff21c0790165ba6b1fedb96d71d44fa3 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Sun, 11 Oct 2020 18:18:29 +0800 Subject: [PATCH 11/67] fix memory leak in mc --- src/treelearner/monotone_constraints.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index fde4bfbf033d..807d7010947c 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -37,6 +37,7 @@ struct FeatureConstraint { }; struct ConstraintEntry { + virtual ~ConstraintEntry() {} virtual void Reset() = 0; virtual void UpdateMin(double new_min) = 0; virtual void UpdateMax(double new_max) = 0; @@ -462,12 +463,12 @@ class BasicLeafConstraints : public LeafConstraintsBase { public: explicit BasicLeafConstraints(int num_leaves) : num_leaves_(num_leaves) { for (int i = 0; i < num_leaves; ++i) { - entries_.push_back(new BasicConstraintEntry()); + entries_.emplace_back(new BasicConstraintEntry()); } } void Reset() override { - for (auto entry : entries_) { + for (auto& entry : entries_) { entry->Reset(); } } @@ -484,7 +485,7 @@ class BasicLeafConstraints : public LeafConstraintsBase { int8_t monotone_type, double right_output, double left_output, int, const SplitInfo& , const std::vector&) override { - entries_[new_leaf] = entries_[leaf]->clone(); + entries_[new_leaf].reset(entries_[leaf]->clone()); if (is_numerical_split) { double mid = (left_output + right_output) / 2.0f; if (monotone_type < 0) { @@ -498,7 +499,7 @@ class BasicLeafConstraints : public LeafConstraintsBase { return std::vector(); } - const ConstraintEntry* Get(int leaf_idx) override { return entries_[leaf_idx]; } + const ConstraintEntry* Get(int leaf_idx) override { return entries_[leaf_idx].get(); } FeatureConstraint* GetFeatureConstraint(int leaf_idx, int feature_index) final { return entries_[leaf_idx]->GetFeatureConstraint(feature_index); @@ -506,7 +507,7 @@ class BasicLeafConstraints : public LeafConstraintsBase { protected: int 
num_leaves_; - std::vector entries_; + std::vector> entries_; }; class IntermediateLeafConstraints : public BasicLeafConstraints { @@ -541,7 +542,7 @@ class IntermediateLeafConstraints : public BasicLeafConstraints { void UpdateConstraintsWithOutputs(bool is_numerical_split, int leaf, int new_leaf, int8_t monotone_type, double right_output, double left_output) { - entries_[new_leaf] = entries_[leaf]->clone(); + entries_[new_leaf].reset(entries_[leaf]->clone()); if (is_numerical_split) { if (monotone_type < 0) { entries_[leaf]->UpdateMin(right_output); @@ -857,7 +858,7 @@ class AdvancedLeafConstraints : public IntermediateLeafConstraints { int num_features) : IntermediateLeafConstraints(config, num_leaves) { for (int i = 0; i < num_leaves; ++i) { - entries_[i] = new AdvancedConstraintEntry(num_features); + entries_[i].reset(new AdvancedConstraintEntry(num_features)); } } From e64183c7cca129a6257ed6e2751318529ef29f11 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Sun, 11 Oct 2020 18:50:58 +0800 Subject: [PATCH 12/67] Update monotone_constraints.hpp --- src/treelearner/monotone_constraints.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 807d7010947c..6c86e61d1c2d 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include From 23290db6546f07c7036c606355ff6c3fe783de1c Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Sun, 11 Oct 2020 22:25:54 +0800 Subject: [PATCH 13/67] Update r_package.yml --- .github/workflows/r_package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index bd4b321c9353..0424ac967363 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -186,7 +186,7 @@ jobs: exit $(cat ubsan-tests.log | grep "runtime error" | wc -l) test-r-valgrind: name: r-package (ubuntu-latest, R-devel, valgrind) - timeout-minutes: 60 + timeout-minutes: 120 runs-on: ubuntu-latest container: image: wch1/r-debug From 4355c7574b77549588503a42b5187c37457923c0 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 11 Oct 2020 15:08:58 -0500 Subject: [PATCH 14/67] remove R_INT64_PTR --- R-package/src/R_object_helper.h | 2 -- R-package/src/lightgbm_R.cpp | 1 - 2 files changed, 3 deletions(-) diff --git a/R-package/src/R_object_helper.h b/R-package/src/R_object_helper.h index db75c5792520..e14a2ac6697b 100644 --- a/R-package/src/R_object_helper.h +++ b/R-package/src/R_object_helper.h @@ -104,8 +104,6 @@ typedef union { VECTOR_SER s; double align; } SEXPREC_ALIGN; #define R_INT_PTR(x) (reinterpret_cast DATAPTR(x)) -#define R_INT64_PTR(x) (reinterpret_cast DATAPTR(x)) - #define R_REAL_PTR(x) (reinterpret_cast DATAPTR(x)) #define R_AS_INT(x) (*(reinterpret_cast DATAPTR(x))) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 2c5bbcf0dc56..469123b18e1d 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -506,7 +506,6 @@ LGBM_SE LGBM_BoosterGetNumPredict_R(LGBM_SE handle, R_API_BEGIN(); int64_t len; CHECK_CALL(LGBM_BoosterGetNumPredict(R_GET_PTR(handle), R_AS_INT(data_idx), &len)); - // R_INT64_PTR(out)[0] = len; R_INT_PTR(out)[0] = static_cast(len); R_API_END(); } From bd53bfeeec2e76cd977ba000a847da813f4b7846 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 11 Oct 2020 15:13:54 -0500 Subject: [PATCH 15/67] disable openmp --- 
R-package/configure | 3 ++- R-package/configure.ac | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R-package/configure b/R-package/configure index 02fcfa028b0d..f062349541e5 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1783,7 +1783,8 @@ OPENMP_CXXFLAGS="" if test `uname -s` = "Linux" then - OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" + #OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" + OPENMP_CXXFLAGS="" fi if test `uname -s` = "Darwin" diff --git a/R-package/configure.ac b/R-package/configure.ac index 20182666b502..904ee7ad963c 100644 --- a/R-package/configure.ac +++ b/R-package/configure.ac @@ -92,7 +92,8 @@ OPENMP_CXXFLAGS="" if test `uname -s` = "Linux" then - OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" + #OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" + OPENMP_CXXFLAGS="" fi if test `uname -s` = "Darwin" From d53c9eb2e3cab569cb68aa60a3fea8189dc595c3 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 09:52:34 +0800 Subject: [PATCH 16/67] Update gbdt_model_text.cpp --- src/boosting/gbdt_model_text.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 22eac4aaa7e1..8a894897d541 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -348,7 +348,6 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int std::vector tree_strs(num_used_model - start_model); std::vector tree_sizes(num_used_model - start_model); // output tree models - #pragma omp parallel for schedule(static) for (int i = start_model; i < num_used_model; ++i) { const int idx = i - start_model; tree_strs[idx] = "Tree=" + std::to_string(idx) + '\n'; From e4fff308d4cafbb3392bf7965e3d740f0dffcbe3 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 10:54:32 +0800 Subject: [PATCH 17/67] Update gbdt_model_text.cpp --- src/boosting/gbdt_model_text.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 8a894897d541..7f2beb8c11ad 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -357,7 +357,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int ss << "tree_sizes=" << Common::Join(tree_sizes, " ") << '\n'; ss << '\n'; - tree_sizes.clear(); + std::vector.swap(tree_sizes); for (int i = 0; i < num_used_model - start_model; ++i) { ss << tree_strs[i]; From b7b0bf319190395066cf1b6837d49ecf67f3677b Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 11:03:21 +0800 Subject: [PATCH 18/67] Apply suggestions from code review --- src/boosting/gbdt_model_text.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 7f2beb8c11ad..63c928680655 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -357,7 +357,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int ss << "tree_sizes=" << Common::Join(tree_sizes, " ") << '\n'; ss << '\n'; - std::vector.swap(tree_sizes); + std::vector().swap(tree_sizes); for (int i = 0; i < num_used_model - start_model; ++i) { ss << tree_strs[i]; From ec77314e5bd1c08edccb811fa0dfe93c14fcb760 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 11:57:32 +0800 Subject: [PATCH 19/67] try to free vector --- include/LightGBM/utils/common.h | 6 ++++++ src/boosting/gbdt_model_text.cpp | 6 +++--- 2 files changed, 9 
insertions(+), 3 deletions(-) diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 07b8484b5577..1f2b097da035 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -634,6 +634,12 @@ inline static std::string Join(const std::vector& strs, size_t start, size_t return str_buf.str(); } +template +inline static void VectorFree(std::vector* vec) { + auto& ref = *vec; + std::vector().swap(ref); +} + inline static int64_t Pow2RoundUp(int64_t x) { int64_t t = 1; for (int i = 0; i < 64; ++i) { diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 63c928680655..3bb88d7df908 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -357,13 +357,13 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int ss << "tree_sizes=" << Common::Join(tree_sizes, " ") << '\n'; ss << '\n'; - std::vector().swap(tree_sizes); + Common::VectorFree(&tree_sizes); for (int i = 0; i < num_used_model - start_model; ++i) { ss << tree_strs[i]; tree_strs[i].clear(); } - tree_strs.clear(); + Common::VectorFree(&tree_strs); ss << "end of trees" << "\n"; std::vector feature_importances = FeatureImportance( num_iteration, feature_importance_type); @@ -375,7 +375,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int pairs.emplace_back(feature_importances_int, feature_names_[i]); } } - feature_importances.clear(); + Common::VectorFree(&feature_importances); // sort the importance std::stable_sort(pairs.begin(), pairs.end(), [](const std::pair& lhs, From 78a66c8a398fc9ed1bba17c8be11e7b91cd614f8 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 12:55:39 +0800 Subject: [PATCH 20/67] free more memories. 
--- R-package/src/lightgbm_R.cpp | 4 +++- src/boosting/gbdt_model_text.cpp | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 469123b18e1d..4b39ea879897 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -223,6 +223,7 @@ LGBM_SE LGBM_DatasetSetField_R(LGBM_SE handle, vec[i] = static_cast(R_REAL_PTR(field_data)[i]); } CHECK_CALL(LGBM_DatasetSetField(R_GET_PTR(handle), name, vec.data(), len, C_API_DTYPE_FLOAT32)); + Common::VectorFree(&vec); } R_API_END(); } @@ -656,6 +657,7 @@ LGBM_SE LGBM_BoosterSaveModelToString_R(LGBM_SE handle, std::vector inner_char_buf(R_AS_INT(buffer_len)); CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, R_AS_INT(num_iteration), R_AS_INT(feature_importance_type), R_AS_INT(buffer_len), &out_len, inner_char_buf.data())); EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); + Common::VectorFree(&inner_char_buf); R_API_END(); } @@ -671,7 +673,7 @@ LGBM_SE LGBM_BoosterDumpModel_R(LGBM_SE handle, std::vector inner_char_buf(R_AS_INT(buffer_len)); CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, R_AS_INT(num_iteration), R_AS_INT(feature_importance_type), R_AS_INT(buffer_len), &out_len, inner_char_buf.data())); EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); - inner_char_buf.clear(); + Common::VectorFree(&inner_char_buf); R_API_END(); } diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 3bb88d7df908..6781e2a5a445 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -115,7 +115,8 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ str_buf << "}" << '\n'; str_buf << "}" << '\n'; - + Common::VectorFree(&feature_importances); + Common::VectorFree(&pairs); return str_buf.str(); } From 70caf8eb6b85f9d4eb699e15d2f90ab27869ef55 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 12:56:52 +0800 Subject: [PATCH 21/67] Update src/boosting/gbdt_model_text.cpp --- src/boosting/gbdt_model_text.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 6781e2a5a445..b33200033b6d 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -349,6 +349,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int std::vector tree_strs(num_used_model - start_model); std::vector tree_sizes(num_used_model - start_model); // output tree models + #pragma omp parallel for schedule(static) for (int i = start_model; i < num_used_model; ++i) { const int idx = i - start_model; tree_strs[idx] = "Tree=" + std::to_string(idx) + '\n'; From ec6ee58870644a360ebd7440a2a88472762a05c1 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 13:02:44 +0800 Subject: [PATCH 22/67] fix using --- R-package/src/lightgbm_R.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 4b39ea879897..a079530e71bd 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -37,6 +37,7 @@ using LightGBM::Common::Join; using LightGBM::Common::Split; +using LightGBM::Common::VectorFree; using LightGBM::Log; LGBM_SE EncodeChar(LGBM_SE dest, const char* src, LGBM_SE buf_len, LGBM_SE actual_len, size_t str_len) { From e93db3a233dc25d9f1af461b7911a49ee544c00b Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: 
Mon, 12 Oct 2020 13:11:33 +0800 Subject: [PATCH 23/67] try the UNPROTECT(1); --- R-package/src/lightgbm_R.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index a079530e71bd..5c4d039f1182 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -24,14 +24,16 @@ #define R_API_BEGIN() \ try { #define R_API_END() } \ - catch(std::exception& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.what()); return call_state;} \ - catch(std::string& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.c_str()); return call_state; } \ - catch(...) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError("unknown exception"); return call_state;} \ + catch(std::exception& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.what()); UNPROTECT(1); return call_state;} \ + catch(std::string& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.c_str()); UNPROTECT(1); return call_state; } \ + catch(...) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError("unknown exception"); UNPROTECT(1); return call_state;} \ + UNPROTECT(1); \ return call_state; #define CHECK_CALL(x) \ if ((x) != 0) { \ R_INT_PTR(call_state)[0] = -1;\ + UNPROTECT(1); \ return call_state;\ } From d621d5f1f595cc3880259d805716349911dadc1d Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 13:14:59 +0800 Subject: [PATCH 24/67] fix a const pointer --- R-package/src/lightgbm_R.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 5c4d039f1182..875e168f9d4b 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -628,7 +628,7 @@ LGBM_SE LGBM_BoosterPredictForMat_R(LGBM_SE handle, int32_t nrow = R_AS_INT(num_row); int32_t ncol = R_AS_INT(num_col); - double* p_mat = R_REAL_PTR(data); + const double* p_mat = R_REAL_PTR(data); double* ptr_ret = R_REAL_PTR(out_result); int64_t out_len; CHECK_CALL(LGBM_BoosterPredictForMat(R_GET_PTR(handle), From c0e92190f0f5b8c8b9d105b1b0a3c7ba97b1cef6 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 13:21:47 +0800 Subject: [PATCH 25/67] fix Common --- R-package/src/lightgbm_R.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 875e168f9d4b..5d1eefbb1615 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -226,7 +226,7 @@ LGBM_SE LGBM_DatasetSetField_R(LGBM_SE handle, vec[i] = static_cast(R_REAL_PTR(field_data)[i]); } CHECK_CALL(LGBM_DatasetSetField(R_GET_PTR(handle), name, vec.data(), len, C_API_DTYPE_FLOAT32)); - Common::VectorFree(&vec); + VectorFree(&vec); } R_API_END(); } @@ -660,7 +660,7 @@ LGBM_SE LGBM_BoosterSaveModelToString_R(LGBM_SE handle, std::vector inner_char_buf(R_AS_INT(buffer_len)); CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, R_AS_INT(num_iteration), R_AS_INT(feature_importance_type), R_AS_INT(buffer_len), &out_len, inner_char_buf.data())); EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); - Common::VectorFree(&inner_char_buf); + VectorFree(&inner_char_buf); R_API_END(); } @@ -676,7 +676,7 @@ LGBM_SE LGBM_BoosterDumpModel_R(LGBM_SE handle, std::vector inner_char_buf(R_AS_INT(buffer_len)); CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, R_AS_INT(num_iteration), R_AS_INT(feature_importance_type), R_AS_INT(buffer_len), &out_len, inner_char_buf.data())); 
EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); - Common::VectorFree(&inner_char_buf); + VectorFree(&inner_char_buf); R_API_END(); } From a45ec0a00df76de8cc41a3c10b773aa458ee8b9e Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 13:36:39 +0800 Subject: [PATCH 26/67] reduce UNPROTECT --- R-package/src/lightgbm_R.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 5d1eefbb1615..28137ca7badd 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -24,16 +24,15 @@ #define R_API_BEGIN() \ try { #define R_API_END() } \ - catch(std::exception& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.what()); UNPROTECT(1); return call_state;} \ - catch(std::string& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.c_str()); UNPROTECT(1); return call_state; } \ - catch(...) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError("unknown exception"); UNPROTECT(1); return call_state;} \ + catch(std::exception& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.what()); return call_state;} \ + catch(std::string& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.c_str()); return call_state;} \ + catch(...) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError("unknown exception"); return call_state;} \ UNPROTECT(1); \ return call_state; #define CHECK_CALL(x) \ if ((x) != 0) { \ R_INT_PTR(call_state)[0] = -1;\ - UNPROTECT(1); \ return call_state;\ } From 650cb32b7af4c9139dc47d3d39c73cc91a23abb4 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 13:42:34 +0800 Subject: [PATCH 27/67] remove UNPROTECT(1); --- R-package/src/lightgbm_R.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 28137ca7badd..17c529bdccb4 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -27,7 +27,6 @@ catch(std::exception& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.what()); return call_state;} \ catch(std::string& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.c_str()); return call_state;} \ catch(...) 
{ R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError("unknown exception"); return call_state;} \ - UNPROTECT(1); \ return call_state; #define CHECK_CALL(x) \ From ba3ee79647cf10d20d5c33e1c089b016e0760fff Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 13:52:15 +0800 Subject: [PATCH 28/67] fix null handle --- R-package/R/lgb.Booster.R | 2 +- R-package/R/lgb.Dataset.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 61784c283c63..1843c31bd363 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -16,7 +16,7 @@ Booster <- R6::R6Class( # Freeing up handle lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle) - private$handle <- NULL + private$handle <- lgb.null.handle() } diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 9bbe83340a6a..fac25d95a4c4 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -14,7 +14,7 @@ Dataset <- R6::R6Class( # Freeing up handle lgb.call("LGBM_DatasetFree_R", ret = NULL, private$handle) - private$handle <- NULL + private$handle <- lgb.null.handle() } From 1daf3c077a6795408f507e123f5900bb3c196228 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 14:08:15 +0800 Subject: [PATCH 29/67] fix predictor --- R-package/R/lgb.Predictor.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index c996bb1731ae..a21b24d366f0 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -19,7 +19,7 @@ Predictor <- R6::R6Class( , ret = NULL , private$handle ) - private$handle <- NULL + private$handle <- lgb.null.handle() } @@ -30,7 +30,7 @@ Predictor <- R6::R6Class( params <- list(...) 
private$params <- lgb.params2str(params) # Create new lgb handle - handle <- 0.0 + handle <- lgb.null.handle() # Check if handle is a character if (is.character(modelfile)) { From e5c6bf1143b7bb174d2d88e270ff674704f22443 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 14:23:16 +0800 Subject: [PATCH 30/67] use NULL after free --- R-package/R/lgb.Booster.R | 2 +- R-package/R/lgb.Dataset.R | 2 +- R-package/R/lgb.Predictor.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 1843c31bd363..61784c283c63 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -16,7 +16,7 @@ Booster <- R6::R6Class( # Freeing up handle lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle) - private$handle <- lgb.null.handle() + private$handle <- NULL } diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index fac25d95a4c4..9bbe83340a6a 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -14,7 +14,7 @@ Dataset <- R6::R6Class( # Freeing up handle lgb.call("LGBM_DatasetFree_R", ret = NULL, private$handle) - private$handle <- lgb.null.handle() + private$handle <- NULL } diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index a21b24d366f0..ab0bea7fc8e6 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -19,7 +19,7 @@ Predictor <- R6::R6Class( , ret = NULL , private$handle ) - private$handle <- lgb.null.handle() + private$handle <- NULL } From d91c9ff5ab2587bf1de436593a0cdf9430579fca Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 14:35:24 +0800 Subject: [PATCH 31/67] fix a leaking in test --- R-package/tests/testthat/test_dataset.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index 9431cb32f646..d0ac9c0627d2 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -85,6 +85,8 @@ test_that("lgb.Dataset: Dataset should be able to construct from matrix and retu , ref_handle ) expect_false(is.na(handle)) + lgb.call("LGBM_DatasetFree_R", ret = NULL, handle) + handle <- NULL }) test_that("lgb.Dataset$setinfo() should convert 'group' to integer", { From a55801cbf85d251d0cfe9835e601789e69c943f1 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 15:27:07 +0800 Subject: [PATCH 32/67] try more fixes --- R-package/src/lightgbm_R.cpp | 1 + src/boosting/gbdt_model_text.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 17c529bdccb4..ac1ec0e9acc1 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -215,6 +215,7 @@ LGBM_SE LGBM_DatasetSetField_R(LGBM_SE handle, vec[i] = static_cast(R_INT_PTR(field_data)[i]); } CHECK_CALL(LGBM_DatasetSetField(R_GET_PTR(handle), name, vec.data(), len, C_API_DTYPE_INT32)); + VectorFree(&vec); } else if (!strcmp("init_score", name)) { CHECK_CALL(LGBM_DatasetSetField(R_GET_PTR(handle), name, R_REAL_PTR(field_data), len, C_API_DTYPE_FLOAT64)); } else { diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index b33200033b6d..258b656cb0f6 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -363,7 +363,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int for (int i = 0; i < num_used_model - start_model; ++i) { ss << tree_strs[i]; - 
tree_strs[i].clear(); + tree_strs[i] = ""; } Common::VectorFree(&tree_strs); ss << "end of trees" << "\n"; @@ -388,6 +388,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int for (size_t i = 0; i < pairs.size(); ++i) { ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << '\n'; } + Common::VectorFree(&pairs); if (config_ != nullptr) { ss << "\nparameters:" << '\n'; ss << config_->ToString() << "\n"; From ef74a8f8c0a6cb3bf4eaba883d372a132d455a1d Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 15:32:06 +0800 Subject: [PATCH 33/67] test the effect of tests --- R-package/tests/testthat/test_dataset.R | 2 +- R-package/tests/testthat/test_lgb.Booster.R | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index d0ac9c0627d2..d5c83de286ec 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -36,7 +36,7 @@ test_that("lgb.Dataset: getinfo & setinfo", { expect_true(length(getinfo(dtest, "init_score")) == 0L) # any other label should error - expect_error(setinfo(dtest, "asdf", test_label)) + # expect_error(setinfo(dtest, "asdf", test_label)) }) test_that("lgb.Dataset: slice, dim", { diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 1db70dc2608b..e76f030c8022 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -440,10 +440,10 @@ test_that("Saving a model with different feature importance types works", { ) ) - UNSUPPORTED_IMPORTANCE <- 2L - expect_error({ - model_string <- bst$save_model_to_string(feature_importance_type = UNSUPPORTED_IMPORTANCE) - }, "Unknown importance type") + # UNSUPPORTED_IMPORTANCE <- 2L + # expect_error({ + # model_string <- bst$save_model_to_string(feature_importance_type = UNSUPPORTED_IMPORTANCE) + # }, "Unknown importance type") }) .params_from_model_string <- function(model_str) { From 432bf22d4d171d683352413bd75e609deb49d25c Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 15:46:46 +0800 Subject: [PATCH 34/67] throw exception in Fatal --- include/LightGBM/utils/log.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/utils/log.h b/include/LightGBM/utils/log.h index 7d795ff755a1..131d1f6a0353 100644 --- a/include/LightGBM/utils/log.h +++ b/include/LightGBM/utils/log.h @@ -123,10 +123,10 @@ class Log { #ifndef LGB_R_BUILD fprintf(stderr, "[LightGBM] [Fatal] %s\n", str_buf); fflush(stderr); - throw std::runtime_error(std::string(str_buf)); #else Rf_error("[LightGBM] [Fatal] %s\n", str_buf); #endif + throw std::runtime_error(std::string(str_buf)); } private: From 3a24bb64bbf43ced932966a529d4396f867518bb Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 15:47:56 +0800 Subject: [PATCH 35/67] add test back --- R-package/tests/testthat/test_dataset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index d5c83de286ec..d0ac9c0627d2 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -36,7 +36,7 @@ test_that("lgb.Dataset: getinfo & setinfo", { expect_true(length(getinfo(dtest, "init_score")) == 0L) # any other label should error - # expect_error(setinfo(dtest, "asdf", test_label)) + expect_error(setinfo(dtest, "asdf", test_label)) }) 
test_that("lgb.Dataset: slice, dim", { From 5bcf90bb8d6bb0705fa429c19c4cdbb42e959ae0 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 16:17:35 +0800 Subject: [PATCH 36/67] Apply suggestions from code review --- R-package/src/lightgbm_R.cpp | 5 ----- include/LightGBM/utils/common.h | 6 ------ src/boosting/gbdt_model_text.cpp | 8 +------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index ac1ec0e9acc1..958fd4780218 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -37,7 +37,6 @@ using LightGBM::Common::Join; using LightGBM::Common::Split; -using LightGBM::Common::VectorFree; using LightGBM::Log; LGBM_SE EncodeChar(LGBM_SE dest, const char* src, LGBM_SE buf_len, LGBM_SE actual_len, size_t str_len) { @@ -215,7 +214,6 @@ LGBM_SE LGBM_DatasetSetField_R(LGBM_SE handle, vec[i] = static_cast(R_INT_PTR(field_data)[i]); } CHECK_CALL(LGBM_DatasetSetField(R_GET_PTR(handle), name, vec.data(), len, C_API_DTYPE_INT32)); - VectorFree(&vec); } else if (!strcmp("init_score", name)) { CHECK_CALL(LGBM_DatasetSetField(R_GET_PTR(handle), name, R_REAL_PTR(field_data), len, C_API_DTYPE_FLOAT64)); } else { @@ -225,7 +223,6 @@ LGBM_SE LGBM_DatasetSetField_R(LGBM_SE handle, vec[i] = static_cast(R_REAL_PTR(field_data)[i]); } CHECK_CALL(LGBM_DatasetSetField(R_GET_PTR(handle), name, vec.data(), len, C_API_DTYPE_FLOAT32)); - VectorFree(&vec); } R_API_END(); } @@ -659,7 +656,6 @@ LGBM_SE LGBM_BoosterSaveModelToString_R(LGBM_SE handle, std::vector inner_char_buf(R_AS_INT(buffer_len)); CHECK_CALL(LGBM_BoosterSaveModelToString(R_GET_PTR(handle), 0, R_AS_INT(num_iteration), R_AS_INT(feature_importance_type), R_AS_INT(buffer_len), &out_len, inner_char_buf.data())); EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); - VectorFree(&inner_char_buf); R_API_END(); } @@ -675,7 +671,6 @@ LGBM_SE LGBM_BoosterDumpModel_R(LGBM_SE handle, std::vector inner_char_buf(R_AS_INT(buffer_len)); CHECK_CALL(LGBM_BoosterDumpModel(R_GET_PTR(handle), 0, R_AS_INT(num_iteration), R_AS_INT(feature_importance_type), R_AS_INT(buffer_len), &out_len, inner_char_buf.data())); EncodeChar(out_str, inner_char_buf.data(), buffer_len, actual_len, static_cast(out_len)); - VectorFree(&inner_char_buf); R_API_END(); } diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 1f2b097da035..07b8484b5577 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -634,12 +634,6 @@ inline static std::string Join(const std::vector& strs, size_t start, size_t return str_buf.str(); } -template -inline static void VectorFree(std::vector* vec) { - auto& ref = *vec; - std::vector().swap(ref); -} - inline static int64_t Pow2RoundUp(int64_t x) { int64_t t = 1; for (int i = 0; i < 64; ++i) { diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 258b656cb0f6..6a082d59be38 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -115,8 +115,6 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ str_buf << "}" << '\n'; str_buf << "}" << '\n'; - Common::VectorFree(&feature_importances); - Common::VectorFree(&pairs); return str_buf.str(); } @@ -359,13 +357,11 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int ss << "tree_sizes=" << Common::Join(tree_sizes, " ") << '\n'; ss << '\n'; - Common::VectorFree(&tree_sizes); for (int i = 0; i < num_used_model 
- start_model; ++i) { ss << tree_strs[i]; - tree_strs[i] = ""; + tree_strs[i].clear(); } - Common::VectorFree(&tree_strs); ss << "end of trees" << "\n"; std::vector feature_importances = FeatureImportance( num_iteration, feature_importance_type); @@ -377,7 +373,6 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int pairs.emplace_back(feature_importances_int, feature_names_[i]); } } - Common::VectorFree(&feature_importances); // sort the importance std::stable_sort(pairs.begin(), pairs.end(), [](const std::pair& lhs, @@ -388,7 +383,6 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int for (size_t i = 0; i < pairs.size(); ++i) { ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << '\n'; } - Common::VectorFree(&pairs); if (config_ != nullptr) { ss << "\nparameters:" << '\n'; ss << config_->ToString() << "\n"; From 7221adbf719e09e7e6a343547cdb89b0c53670a6 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 16:32:34 +0800 Subject: [PATCH 37/67] commet some tests --- R-package/tests/testthat/test_dataset.R | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index d0ac9c0627d2..dfb421e43f71 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -36,7 +36,7 @@ test_that("lgb.Dataset: getinfo & setinfo", { expect_true(length(getinfo(dtest, "init_score")) == 0L) # any other label should error - expect_error(setinfo(dtest, "asdf", test_label)) + # expect_error(setinfo(dtest, "asdf", test_label)) }) test_that("lgb.Dataset: slice, dim", { @@ -54,9 +54,9 @@ test_that("lgb.Dataset: colnames", { expect_equal(colnames(dtest), colnames(test_data)) lgb.Dataset.construct(dtest) expect_equal(colnames(dtest), colnames(test_data)) - expect_error({ - colnames(dtest) <- "asdf" - }) + # expect_error({ + # colnames(dtest) <- "asdf" + # }) new_names <- make.names(seq_len(ncol(test_data))) expect_silent(colnames(dtest) <- new_names) expect_equal(colnames(dtest), new_names) @@ -107,26 +107,26 @@ test_that("lgb.Dataset should throw an error if 'reference' is provided but of t test_data <- agaricus.test$data[1L:100L, ] test_label <- agaricus.test$label[1L:100L] # Try to trick lgb.Dataset() into accepting bad input - expect_error({ - dtest <- lgb.Dataset( - data = test_data - , label = test_label - , reference = data.frame(x = seq_len(10L), y = seq_len(10L)) - ) - }, regexp = "reference must be a") + # expect_error({ + # dtest <- lgb.Dataset( + # data = test_data + # , label = test_label + # , reference = data.frame(x = seq_len(10L), y = seq_len(10L)) + # ) + # }, regexp = "reference must be a") }) test_that("Dataset$new() should throw an error if 'predictor' is provided but of the wrong format", { data(agaricus.test, package = "lightgbm") test_data <- agaricus.test$data[1L:100L, ] test_label <- agaricus.test$label[1L:100L] - expect_error({ - dtest <- Dataset$new( - data = test_data - , label = test_label - , predictor = data.frame(x = seq_len(10L), y = seq_len(10L)) - ) - }, regexp = "predictor must be a", fixed = TRUE) + # expect_error({ + # dtest <- Dataset$new( + # data = test_data + # , label = test_label + # , predictor = data.frame(x = seq_len(10L), y = seq_len(10L)) + # ) + # }, regexp = "predictor must be a", fixed = TRUE) }) test_that("Dataset$get_params() successfully returns parameters if you passed them", { From a024e5c04db93579cacd44fa95252285f275c8dc Mon 
Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 16:35:15 +0800 Subject: [PATCH 38/67] Apply suggestions from code review --- R-package/configure | 4 +--- R-package/configure.ac | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/R-package/configure b/R-package/configure index f062349541e5..cc9152a29c43 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1783,8 +1783,7 @@ OPENMP_CXXFLAGS="" if test `uname -s` = "Linux" then - #OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" - OPENMP_CXXFLAGS="" + OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" fi if test `uname -s` = "Darwin" @@ -2990,4 +2989,3 @@ if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} fi - diff --git a/R-package/configure.ac b/R-package/configure.ac index 904ee7ad963c..20182666b502 100644 --- a/R-package/configure.ac +++ b/R-package/configure.ac @@ -92,8 +92,7 @@ OPENMP_CXXFLAGS="" if test `uname -s` = "Linux" then - #OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" - OPENMP_CXXFLAGS="" + OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" fi if test `uname -s` = "Darwin" From a1e2b5935df06df600736ac0a330fe4a89186c51 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 12 Oct 2020 16:35:53 +0800 Subject: [PATCH 39/67] Apply suggestions from code review --- src/boosting/gbdt_model_text.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 6a082d59be38..4d9174313d38 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -115,6 +115,7 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ str_buf << "}" << '\n'; str_buf << "}" << '\n'; + return str_buf.str(); } From 9e3d97aa71ca8c103b10efb5bd2c566415568914 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 12 Oct 2020 22:16:01 -0500 Subject: [PATCH 40/67] trying to comment out tests --- R-package/tests/testthat/test_basic.R | 170 +++++++++--------- R-package/tests/testthat/test_lgb.Booster.R | 82 ++++----- .../testthat/test_lgb.convert_with_rules.R | 6 +- .../tests/testthat/test_lgb.importance.R | 6 +- R-package/tests/testthat/test_utils.R | 18 +- 5 files changed, 141 insertions(+), 141 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index ba6fc864fcfe..b94e91c9897c 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -170,14 +170,14 @@ test_that("lightgbm() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", metric = "l2,l1") for (nround_value in c(-10L, 0L)) { - expect_error({ - bst <- lightgbm( - data = dtrain - , params = params - , nrounds = nround_value - , save_name = tempfile(fileext = ".model") - ) - }, "nrounds should be greater than zero") + # expect_error({ + # bst <- lightgbm( + # data = dtrain + # , params = params + # , nrounds = nround_value + # , save_name = tempfile(fileext = ".model") + # ) + # }, "nrounds should be greater than zero") } }) @@ -278,15 +278,15 @@ test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", metric = "l2,l1") for (nround_value in c(-10L, 0L)) { - expect_error({ - bst <- lgb.cv( - 
params - , dtrain - , nround_value - , nfold = 5L - , min_data = 1L - ) - }, "nrounds should be greater than zero") + # expect_error({ + # bst <- lgb.cv( + # params + # , dtrain + # , nround_value + # , nfold = 5L + # , min_data = 1L + # ) + # }, "nrounds should be greater than zero") } }) @@ -300,15 +300,15 @@ test_that("lgb.cv() throws an informative error is 'data' is not an lgb.Dataset , matrix(data = seq_len(10L), 2L, 5L) ) for (val in bad_values) { - expect_error({ - bst <- lgb.cv( - params = list(objective = "regression", metric = "l2,l1") - , data = val - , 10L - , nfold = 5L - , min_data = 1L - ) - }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) + # expect_error({ + # bst <- lgb.cv( + # params = list(objective = "regression", metric = "l2,l1") + # , data = val + # , 10L + # , nfold = 5L + # , min_data = 1L + # ) + # }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) } }) @@ -378,13 +378,13 @@ test_that("lgb.train() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", metric = "l2,l1") for (nround_value in c(-10L, 0L)) { - expect_error({ - bst <- lgb.train( - params - , dtrain - , nround_value - ) - }, "nrounds should be greater than zero") + # expect_error({ + # bst <- lgb.train( + # params + # , dtrain + # , nround_value + # ) + # }, "nrounds should be greater than zero") } }) @@ -398,13 +398,13 @@ test_that("lgb.train() throws an informative error if 'data' is not an lgb.Datas , matrix(data = seq_len(10L), 2L, 5L) ) for (val in bad_values) { - expect_error({ - bst <- lgb.train( - params = list(objective = "regression", metric = "l2,l1") - , data = val - , 10L - ) - }, regexp = "data must be an lgb.Dataset instance", fixed = TRUE) + # expect_error({ + # bst <- lgb.train( + # params = list(objective = "regression", metric = "l2,l1") + # , data = val + # , 10L + # ) + # }, regexp = "data must be an lgb.Dataset instance", fixed = TRUE) } }) @@ -413,14 +413,14 @@ test_that("lgb.train() throws an informative error if 'valids' is not a list of "valid1" = data.frame(x = rnorm(5L), y = rnorm(5L)) , "valid2" = data.frame(x = rnorm(5L), y = rnorm(5L)) ) - expect_error({ - bst <- lgb.train( - params = list(objective = "regression", metric = "l2,l1") - , data = lgb.Dataset(train$data, label = train$label) - , 10L - , valids = valids - ) - }, regexp = "valids must be a list of lgb.Dataset elements") + # expect_error({ + # bst <- lgb.train( + # params = list(objective = "regression", metric = "l2,l1") + # , data = lgb.Dataset(train$data, label = train$label) + # , 10L + # , valids = valids + # ) + # }, regexp = "valids must be a list of lgb.Dataset elements") }) test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but some do not have names", { @@ -428,14 +428,14 @@ test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but s "valid1" = lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) ) - expect_error({ - bst <- lgb.train( - params = list(objective = "regression", metric = "l2,l1") - , data = lgb.Dataset(train$data, label = train$label) - , 10L - , valids = valids - ) - }, regexp = "each element of valids must have a name") + # expect_error({ + # bst <- lgb.train( + # params = list(objective = "regression", metric = "l2,l1") + # , data = lgb.Dataset(train$data, label = train$label) + # , 10L + # , valids = valids + # ) + # }, 
regexp = "each element of valids must have a name") }) test_that("lgb.train() throws an informative error if 'valids' contains lgb.Dataset objects but none have names", { @@ -443,14 +443,14 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) ) - expect_error({ - bst <- lgb.train( - params = list(objective = "regression", metric = "l2,l1") - , data = lgb.Dataset(train$data, label = train$label) - , 10L - , valids = valids - ) - }, regexp = "each element of valids must have a name") + # expect_error({ + # bst <- lgb.train( + # params = list(objective = "regression", metric = "l2,l1") + # , data = lgb.Dataset(train$data, label = train$label) + # , 10L + # , valids = valids + # ) + # }, regexp = "each element of valids must have a name") }) test_that("lgb.train() works with force_col_wise and force_row_wise", { @@ -1624,39 +1624,39 @@ context("interaction constraints") test_that("lgb.train() throws an informative error if interaction_constraints is not a list", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = "[1,2],[3]") - expect_error({ - bst <- lightgbm( - data = dtrain - , params = params - , nrounds = 2L - ) - }, "interaction_constraints must be a list") + # expect_error({ + # bst <- lightgbm( + # data = dtrain + # , params = params + # , nrounds = 2L + # ) + # }, "interaction_constraints must be a list") }) test_that(paste0("lgb.train() throws an informative error if the members of interaction_constraints ", "are not character or numeric vectors"), { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = list(list(1L, 2L), list(3L))) - expect_error({ - bst <- lightgbm( - data = dtrain - , params = params - , nrounds = 2L - ) - }, "every element in interaction_constraints must be a character vector or numeric vector") + # expect_error({ + # bst <- lightgbm( + # data = dtrain + # , params = params + # , nrounds = 2L + # ) + # }, "every element in interaction_constraints must be a character vector or numeric vector") }) test_that("lgb.train() throws an informative error if interaction_constraints contains a too large index", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = list(c(1L, length(colnames(train$data)) + 1L), 3L)) - expect_error({ - bst <- lightgbm( - data = dtrain - , params = params - , nrounds = 2L - ) - }, "supplied a too large value in interaction_constraints") + # expect_error({ + # bst <- lightgbm( + # data = dtrain + # , params = params + # , nrounds = 2L + # ) + # }, "supplied a too large value in interaction_constraints") }) test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ", diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index e76f030c8022..ec3e308e6dd9 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -13,13 +13,13 @@ test_that("lgb.get.eval.result() should throw an informative error if booster is ) ) for (bad_input in bad_inputs) { - expect_error({ - lgb.get.eval.result( - booster = bad_input - , data_name = "test" - , eval_name = "l2" - ) - }, regexp = "Can only use", fixed = TRUE) + # expect_error({ + # lgb.get.eval.result( + # booster = bad_input + # , data_name = "test" + 
# , eval_name = "l2" + # ) + # }, regexp = "Can only use", fixed = TRUE) } }) @@ -47,13 +47,13 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , min_data = 1L , learning_rate = 1.0 ) - expect_error({ - eval_results <- lgb.get.eval.result( - booster = model - , data_name = "testing" - , eval_name = "l2" - ) - }, regexp = "Only the following datasets exist in record evals: [test]", fixed = TRUE) + # expect_error({ + # eval_results <- lgb.get.eval.result( + # booster = model + # , data_name = "testing" + # , eval_name = "l2" + # ) + # }, regexp = "Only the following datasets exist in record evals: [test]", fixed = TRUE) }) test_that("lgb.get.eval.result() should throw an informative error for incorrect eval_name", { @@ -80,13 +80,13 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , min_data = 1L , learning_rate = 1.0 ) - expect_error({ - eval_results <- lgb.get.eval.result( - booster = model - , data_name = "test" - , eval_name = "l1" - ) - }, regexp = "Only the following eval_names exist for dataset.*\\: \\[l2\\]", fixed = FALSE) + # expect_error({ + # eval_results <- lgb.get.eval.result( + # booster = model + # , data_name = "test" + # , eval_name = "l1" + # ) + # }, regexp = "Only the following eval_names exist for dataset.*\\: \\[l2\\]", fixed = FALSE) }) context("lgb.load()") @@ -108,30 +108,30 @@ test_that("lgb.load() gives the expected error messages given different incorrec ) # you have to give model_str or filename - expect_error({ - lgb.load() - }, regexp = "either filename or model_str must be given") - expect_error({ - lgb.load(filename = NULL, model_str = NULL) - }, regexp = "either filename or model_str must be given") + # expect_error({ + # lgb.load() + # }, regexp = "either filename or model_str must be given") + # expect_error({ + # lgb.load(filename = NULL, model_str = NULL) + # }, regexp = "either filename or model_str must be given") # if given, filename should be a string that points to an existing file model_file <- tempfile(fileext = ".model") - expect_error({ - lgb.load(filename = list(model_file)) - }, regexp = "filename should be character") + # expect_error({ + # lgb.load(filename = list(model_file)) + # }, regexp = "filename should be character") file_to_check <- paste0("a.model") while (file.exists(file_to_check)) { file_to_check <- paste0("a", file_to_check) } - expect_error({ - lgb.load(filename = file_to_check) - }, regexp = "passed to filename does not exist") + # expect_error({ + # lgb.load(filename = file_to_check) + # }, regexp = "passed to filename does not exist") # if given, model_str should be a string - expect_error({ - lgb.load(model_str = c(4.0, 5.0, 6.0)) - }, regexp = "model_str should be character") + # expect_error({ + # lgb.load(model_str = c(4.0, 5.0, 6.0)) + # }, regexp = "model_str should be character") }) @@ -379,11 +379,11 @@ test_that("Booster$update() throws an informative error if you provide a non-Dat , objective = "binary" , save_name = tempfile(fileext = ".model") ) - expect_error({ - bst$update( - train_set = data.frame(x = rnorm(10L)) - ) - }, regexp = "lgb.Booster.update: Only can use lgb.Dataset", fixed = TRUE) + # expect_error({ + # bst$update( + # train_set = data.frame(x = rnorm(10L)) + # ) + # }, regexp = "lgb.Booster.update: Only can use lgb.Dataset", fixed = TRUE) }) context("save_model") diff --git a/R-package/tests/testthat/test_lgb.convert_with_rules.R b/R-package/tests/testthat/test_lgb.convert_with_rules.R index 546ab9663f4f..b75e6c98a891 100644 --- 
a/R-package/tests/testthat/test_lgb.convert_with_rules.R +++ b/R-package/tests/testthat/test_lgb.convert_with_rules.R @@ -13,9 +13,9 @@ test_that("lgb.convert_with_rules() rejects inputs that are not a data.table or ) ) for (bad_input in bad_inputs) { - expect_error({ - conversion_result <- lgb.convert_with_rules(bad_input) - }, regexp = "lgb.convert_with_rules: you provided", fixed = TRUE) + # expect_error({ + # conversion_result <- lgb.convert_with_rules(bad_input) + # }, regexp = "lgb.convert_with_rules: you provided", fixed = TRUE) } }) diff --git a/R-package/tests/testthat/test_lgb.importance.R b/R-package/tests/testthat/test_lgb.importance.R index c0e1d6e8ca82..4dfbddc964ba 100644 --- a/R-package/tests/testthat/test_lgb.importance.R +++ b/R-package/tests/testthat/test_lgb.importance.R @@ -32,8 +32,8 @@ test_that("lgb.importance() should reject bad inputs", { , "lightgbm.model" ) for (input in bad_inputs) { - expect_error({ - lgb.importance(input) - }, regexp = "'model' has to be an object of class lgb\\.Booster") + # expect_error({ + # lgb.importance(input) + # }, regexp = "'model' has to be an object of class lgb\\.Booster") } }) diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 5a9cfb641d61..2b181796810c 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -2,9 +2,9 @@ context("lgb.encode.char") test_that("lgb.encode.char throws an informative error if it is passed a non-raw input", { x <- "some-string" - expect_error({ - lgb.encode.char(x) - }, regexp = "Can only encode from raw type") + # expect_error({ + # lgb.encode.char(x) + # }, regexp = "Can only encode from raw type") }) context("lgb.check.r6.class") @@ -61,9 +61,9 @@ test_that("lgb.params2str() works as expected for a key in params with multiple context("lgb.last_error") test_that("lgb.last_error() throws an error if there are no errors", { - expect_error({ - lgb.last_error() - }, regexp = "Everything is fine") + # expect_error({ + # lgb.last_error() + # }, regexp = "Everything is fine") }) test_that("lgb.last_error() correctly returns errors from the C++ side", { @@ -73,9 +73,9 @@ test_that("lgb.last_error() correctly returns errors from the C++ side", { data = train$data , label = as.matrix(rnorm(5L)) ) - expect_error({ - dvalid1$construct() - }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) + # expect_error({ + # dvalid1$construct() + # }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) }) context("lgb.check.eval") From 767938bbb35cfbfecf5e5759a7b8f5209207599f Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Tue, 13 Oct 2020 14:14:10 +0800 Subject: [PATCH 41/67] Update openmp_wrapper.h --- include/LightGBM/utils/openmp_wrapper.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h index 71574ff894b6..7cee026c4495 100644 --- a/include/LightGBM/utils/openmp_wrapper.h +++ b/include/LightGBM/utils/openmp_wrapper.h @@ -4,6 +4,7 @@ */ #ifndef LIGHTGBM_OPENMP_WRAPPER_H_ #define LIGHTGBM_OPENMP_WRAPPER_H_ +#undef _OPENMP #ifdef _OPENMP #include From 11874c70665077109d1b39d1e4adfe9fa7db5f6d Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Tue, 13 Oct 2020 14:34:19 +0800 Subject: [PATCH 42/67] Apply suggestions from code review --- include/LightGBM/utils/openmp_wrapper.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/LightGBM/utils/openmp_wrapper.h 
b/include/LightGBM/utils/openmp_wrapper.h index 7cee026c4495..71574ff894b6 100644 --- a/include/LightGBM/utils/openmp_wrapper.h +++ b/include/LightGBM/utils/openmp_wrapper.h @@ -4,7 +4,6 @@ */ #ifndef LIGHTGBM_OPENMP_WRAPPER_H_ #define LIGHTGBM_OPENMP_WRAPPER_H_ -#undef _OPENMP #ifdef _OPENMP #include From 4e30578642a51417ddfd46b733e4026dfe28a508 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Tue, 13 Oct 2020 14:36:55 +0800 Subject: [PATCH 43/67] Update configure --- R-package/configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/configure b/R-package/configure index cc9152a29c43..280f13ab3170 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1783,7 +1783,7 @@ OPENMP_CXXFLAGS="" if test `uname -s` = "Linux" then - OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" + OPENMP_CXXFLAGS="" fi if test `uname -s` = "Darwin" From 5f888a9f6190dcb57340841e7e1422e1dbe4dc4a Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Tue, 13 Oct 2020 14:37:19 +0800 Subject: [PATCH 44/67] Update configure.ac --- R-package/configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/configure.ac b/R-package/configure.ac index 20182666b502..c835c06539eb 100644 --- a/R-package/configure.ac +++ b/R-package/configure.ac @@ -92,7 +92,7 @@ OPENMP_CXXFLAGS="" if test `uname -s` = "Linux" then - OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" + OPENMP_CXXFLAGS="" fi if test `uname -s` = "Darwin" From 66aeb47bcd8dd84db9ce6670fc8514514350c640 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 13 Oct 2020 23:07:55 -0500 Subject: [PATCH 45/67] trying to uncomment --- R-package/README.md | 2 +- R-package/tests/testthat/test_Predictor.R | 150 +- R-package/tests/testthat/test_basic.R | 3444 ++++++++--------- .../tests/testthat/test_learning_to_rank.R | 282 +- .../testthat/test_lgb.convert_with_rules.R | 6 +- .../tests/testthat/test_lgb.importance.R | 6 +- R-package/tests/testthat/test_utils.R | 18 +- 7 files changed, 1954 insertions(+), 1954 deletions(-) diff --git a/R-package/README.md b/R-package/README.md index f0199cb6ba73..95d34b12ba5a 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -454,7 +454,7 @@ cd R-package/tests RDvalgrind \ --no-readline \ --vanilla \ - -d valgrind \ + -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes" \ -f testthat.R \ 2>&1 \ | tee out.log \ diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 77719f2367a4..c564b712e409 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -1,78 +1,78 @@ -context("Predictor") +# context("Predictor") -test_that("predictions do not fail for integer input", { - X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) - y <- iris[["Sepal.Length"]] - dtrain <- lgb.Dataset(X, label = y) - fit <- lgb.train( - data = dtrain - , objective = "regression" - , verbose = -1L - , nrounds = 3L - ) - X_double <- X[c(1L, 51L, 101L), , drop = FALSE] - X_integer <- X_double - storage.mode(X_double) <- "double" - pred_integer <- predict(fit, X_integer) - pred_double <- predict(fit, X_double) - expect_equal(pred_integer, pred_double) -}) +# test_that("predictions do not fail for integer input", { +# X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) +# y <- iris[["Sepal.Length"]] +# dtrain <- lgb.Dataset(X, label = y) +# fit <- lgb.train( +# data = dtrain +# , objective = "regression" +# , verbose = -1L +# , nrounds = 3L +# ) +# X_double <- X[c(1L, 51L, 101L), , drop = FALSE] +# X_integer 
<- X_double +# storage.mode(X_double) <- "double" +# pred_integer <- predict(fit, X_integer) +# pred_double <- predict(fit, X_double) +# expect_equal(pred_integer, pred_double) +# }) -test_that("start_iteration works correctly", { - set.seed(708L) - data(agaricus.train, package = "lightgbm") - data(agaricus.test, package = "lightgbm") - train <- agaricus.train - test <- agaricus.test - dtrain <- lgb.Dataset( - agaricus.train$data - , label = agaricus.train$label - ) - dtest <- lgb.Dataset.create.valid( - dtrain - , agaricus.test$data - , label = agaricus.test$label - ) - bst <- lightgbm( - data = as.matrix(train$data) - , label = train$label - , num_leaves = 4L - , learning_rate = 0.6 - , nrounds = 50L - , objective = "binary" - , valids = list("test" = dtest) - , early_stopping_rounds = 2L - ) - expect_true(lgb.is.Booster(bst)) - pred1 <- predict(bst, data = test$data, rawscore = TRUE) - pred_contrib1 <- predict(bst, test$data, predcontrib = TRUE) - pred2 <- rep(0.0, length(pred1)) - pred_contrib2 <- rep(0.0, length(pred2)) - step <- 11L - end_iter <- 49L - if (bst$best_iter != -1L) { - end_iter <- bst$best_iter - 1L - } - start_iters <- seq(0L, end_iter, by = step) - for (start_iter in start_iters) { - n_iter <- min(c(end_iter - start_iter + 1L, step)) - inc_pred <- predict(bst, test$data - , start_iteration = start_iter - , num_iteration = n_iter - , rawscore = TRUE - ) - inc_pred_contrib <- bst$predict(test$data - , start_iteration = start_iter - , num_iteration = n_iter - , predcontrib = TRUE - ) - pred2 <- pred2 + inc_pred - pred_contrib2 <- pred_contrib2 + inc_pred_contrib - } - expect_equal(pred2, pred1) - expect_equal(pred_contrib2, pred_contrib1) +# test_that("start_iteration works correctly", { +# set.seed(708L) +# data(agaricus.train, package = "lightgbm") +# data(agaricus.test, package = "lightgbm") +# train <- agaricus.train +# test <- agaricus.test +# dtrain <- lgb.Dataset( +# agaricus.train$data +# , label = agaricus.train$label +# ) +# dtest <- lgb.Dataset.create.valid( +# dtrain +# , agaricus.test$data +# , label = agaricus.test$label +# ) +# bst <- lightgbm( +# data = as.matrix(train$data) +# , label = train$label +# , num_leaves = 4L +# , learning_rate = 0.6 +# , nrounds = 50L +# , objective = "binary" +# , valids = list("test" = dtest) +# , early_stopping_rounds = 2L +# ) +# expect_true(lgb.is.Booster(bst)) +# pred1 <- predict(bst, data = test$data, rawscore = TRUE) +# pred_contrib1 <- predict(bst, test$data, predcontrib = TRUE) +# pred2 <- rep(0.0, length(pred1)) +# pred_contrib2 <- rep(0.0, length(pred2)) +# step <- 11L +# end_iter <- 49L +# if (bst$best_iter != -1L) { +# end_iter <- bst$best_iter - 1L +# } +# start_iters <- seq(0L, end_iter, by = step) +# for (start_iter in start_iters) { +# n_iter <- min(c(end_iter - start_iter + 1L, step)) +# inc_pred <- predict(bst, test$data +# , start_iteration = start_iter +# , num_iteration = n_iter +# , rawscore = TRUE +# ) +# inc_pred_contrib <- bst$predict(test$data +# , start_iteration = start_iter +# , num_iteration = n_iter +# , predcontrib = TRUE +# ) +# pred2 <- pred2 + inc_pred +# pred_contrib2 <- pred_contrib2 + inc_pred_contrib +# } +# expect_equal(pred2, pred1) +# expect_equal(pred_contrib2, pred_contrib1) - pred_leaf1 <- predict(bst, test$data, predleaf = TRUE) - pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE) - expect_equal(pred_leaf1, pred_leaf2) -}) +# pred_leaf1 <- predict(bst, test$data, predleaf = TRUE) +# pred_leaf2 <- predict(bst, test$data, 
start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE) +# expect_equal(pred_leaf1, pred_leaf2) +# }) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index b94e91c9897c..5bea558e9c92 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1,1722 +1,1722 @@ -context("lightgbm()") - -data(agaricus.train, package = "lightgbm") -data(agaricus.test, package = "lightgbm") -train <- agaricus.train -test <- agaricus.test - -TOLERANCE <- 1e-6 -set.seed(708L) - -# [description] Every time this function is called, it adds 0.1 -# to an accumulator then returns the current value. -# This is used to mock the situation where an evaluation -# metric increases every iteration -ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR" -assign(x = ACCUMULATOR_NAME, value = 0.0, envir = .GlobalEnv) - -.increasing_metric <- function(preds, dtrain) { - if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)) { - assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) - } - assign( - x = ACCUMULATOR_NAME - , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) + 0.1 - , envir = .GlobalEnv - ) - return(list( - name = "increasing_metric" - , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) - , higher_better = TRUE - )) -} - -# [description] Evaluation function that always returns the -# same value -CONSTANT_METRIC_VALUE <- 0.2 -.constant_metric <- function(preds, dtrain) { - return(list( - name = "constant_metric" - , value = CONSTANT_METRIC_VALUE - , higher_better = FALSE - )) -} - -# sample datasets to test early stopping -DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( - data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) - , label = rnorm(100L) -) -DVALID_RANDOM_REGRESSION <- lgb.Dataset( - data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) - , label = rnorm(50L) -) -DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( - data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) - , label = sample(c(0L, 1L), size = 120L, replace = TRUE) -) -DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( - data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) - , label = sample(c(0L, 1L), size = 37L, replace = TRUE) -) - -test_that("train and predict binary classification", { - nrounds <- 10L - bst <- lightgbm( - data = train$data - , label = train$label - , num_leaves = 5L - , nrounds = nrounds - , objective = "binary" - , metric = "binary_error" - , save_name = tempfile(fileext = ".model") - ) - expect_false(is.null(bst$record_evals)) - record_results <- lgb.get.eval.result(bst, "train", "binary_error") - expect_lt(min(record_results), 0.02) - - pred <- predict(bst, test$data) - expect_equal(length(pred), 1611L) - - pred1 <- predict(bst, train$data, num_iteration = 1L) - expect_equal(length(pred1), 6513L) - err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) - err_log <- record_results[1L] - expect_lt(abs(err_pred1 - err_log), TOLERANCE) -}) - - -test_that("train and predict softmax", { - set.seed(708L) - lb <- as.numeric(iris$Species) - 1L - - bst <- lightgbm( - data = as.matrix(iris[, -5L]) - , label = lb - , num_leaves = 4L - , learning_rate = 0.05 - , nrounds = 20L - , min_data = 20L - , min_hessian = 10.0 - , objective = "multiclass" - , metric = "multi_error" - , num_class = 3L - , save_name = tempfile(fileext = ".model") - ) - - expect_false(is.null(bst$record_evals)) - record_results <- lgb.get.eval.result(bst, "train", "multi_error") - expect_lt(min(record_results), 0.06) - - pred <- predict(bst, as.matrix(iris[, -5L])) - 
expect_equal(length(pred), nrow(iris) * 3L) -}) - - -test_that("use of multiple eval metrics works", { - metrics <- list("binary_error", "auc", "binary_logloss") - bst <- lightgbm( - data = train$data - , label = train$label - , num_leaves = 4L - , learning_rate = 1.0 - , nrounds = 10L - , objective = "binary" - , metric = metrics - , save_name = tempfile(fileext = ".model") - ) - expect_false(is.null(bst$record_evals)) - expect_named( - bst$record_evals[["train"]] - , unlist(metrics) - , ignore.order = FALSE - , ignore.case = FALSE - ) -}) - -test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for binary classification", { - set.seed(708L) - nrounds <- 10L - bst <- lightgbm( - data = train$data - , label = train$label - , num_leaves = 5L - , nrounds = nrounds - , objective = "binary" - , metric = "binary_error" - , save_name = tempfile(fileext = ".model") - ) - expect_true(abs(bst$lower_bound() - -1.590853) < TOLERANCE) - expect_true(abs(bst$upper_bound() - 1.871015) < TOLERANCE) -}) - -test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for regression", { - set.seed(708L) - nrounds <- 10L - bst <- lightgbm( - data = train$data - , label = train$label - , num_leaves = 5L - , nrounds = nrounds - , objective = "regression" - , metric = "l2" - , save_name = tempfile(fileext = ".model") - ) - expect_true(abs(bst$lower_bound() - 0.1513859) < TOLERANCE) - expect_true(abs(bst$upper_bound() - 0.9080349) < TOLERANCE) -}) - -test_that("lightgbm() rejects negative or 0 value passed to nrounds", { - dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", metric = "l2,l1") - for (nround_value in c(-10L, 0L)) { - # expect_error({ - # bst <- lightgbm( - # data = dtrain - # , params = params - # , nrounds = nround_value - # , save_name = tempfile(fileext = ".model") - # ) - # }, "nrounds should be greater than zero") - } -}) - -test_that("lightgbm() performs evaluation on validation sets if they are provided", { - set.seed(708L) - dvalid1 <- lgb.Dataset( - data = train$data - , label = train$label - ) - dvalid2 <- lgb.Dataset( - data = train$data - , label = train$label - ) - nrounds <- 10L - bst <- lightgbm( - data = train$data - , label = train$label - , num_leaves = 5L - , nrounds = nrounds - , objective = "binary" - , metric = c( - "binary_error" - , "auc" - ) - , valids = list( - "valid1" = dvalid1 - , "valid2" = dvalid2 - ) - , save_name = tempfile(fileext = ".model") - ) - - expect_named( - bst$record_evals - , c("train", "valid1", "valid2", "start_iter") - , ignore.order = TRUE - , ignore.case = FALSE - ) - for (valid_name in c("train", "valid1", "valid2")) { - eval_results <- bst$record_evals[[valid_name]][["binary_error"]] - expect_length(eval_results[["eval"]], nrounds) - } - expect_true(abs(bst$record_evals[["train"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) - expect_true(abs(bst$record_evals[["valid1"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) - expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) -}) - - -context("training continuation") - -test_that("training continuation works", { - dtrain <- lgb.Dataset( - train$data - , label = train$label - , free_raw_data = FALSE - ) - watchlist <- list(train = dtrain) - param <- list( - objective = "binary" - , metric = "binary_logloss" - , num_leaves = 5L - , learning_rate = 1.0 - ) - - # train for 10 consecutive iterations - bst <- lgb.train(param, 
dtrain, nrounds = 10L, watchlist) - err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10L) - - # train for 5 iterations, save, load, train for 5 more - bst1 <- lgb.train(param, dtrain, nrounds = 5L, watchlist) - model_file <- tempfile(fileext = ".model") - lgb.save(bst1, model_file) - bst2 <- lgb.train(param, dtrain, nrounds = 5L, watchlist, init_model = bst1) - err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10L) - - # evaluation metrics should be nearly identical for the model trained in 10 consecutive - # iterations and the one trained in 5-then-5. - expect_lt(abs(err_bst - err_bst2), 0.01) -}) - -context("lgb.cv()") - -test_that("cv works", { - dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", metric = "l2,l1") - bst <- lgb.cv( - params - , dtrain - , 10L - , nfold = 5L - , min_data = 1L - , learning_rate = 1.0 - , early_stopping_rounds = 10L - ) - expect_false(is.null(bst$record_evals)) -}) - -test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { - dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", metric = "l2,l1") - for (nround_value in c(-10L, 0L)) { - # expect_error({ - # bst <- lgb.cv( - # params - # , dtrain - # , nround_value - # , nfold = 5L - # , min_data = 1L - # ) - # }, "nrounds should be greater than zero") - } -}) - -test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset and labels are not given", { - bad_values <- list( - 4L - , "hello" - , list(a = TRUE, b = seq_len(10L)) - , data.frame(x = seq_len(5L), y = seq_len(5L)) - , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) - , matrix(data = seq_len(10L), 2L, 5L) - ) - for (val in bad_values) { - # expect_error({ - # bst <- lgb.cv( - # params = list(objective = "regression", metric = "l2,l1") - # , data = val - # , 10L - # , nfold = 5L - # , min_data = 1L - # ) - # }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) - } -}) - -test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric where higher values are better", { - set.seed(708L) - dtrain <- lgb.Dataset( - data = as.matrix(runif(n = 500L, min = 0.0, max = 15.0), drop = FALSE) - , label = rep(c(0L, 1L), 250L) - ) - nrounds <- 10L - cv_bst <- lgb.cv( - data = dtrain - , nfold = 5L - , nrounds = nrounds - , num_leaves = 5L - , params = list( - objective = "binary" - , metric = "auc,binary_error" - , learning_rate = 1.5 - ) - ) - expect_is(cv_bst, "lgb.CVBooster") - expect_named( - cv_bst$record_evals - , c("start_iter", "valid") - , ignore.order = FALSE - , ignore.case = FALSE - ) - auc_scores <- unlist(cv_bst$record_evals[["valid"]][["auc"]][["eval"]]) - expect_length(auc_scores, nrounds) - expect_identical(cv_bst$best_iter, which.max(auc_scores)) - expect_identical(cv_bst$best_score, auc_scores[which.max(auc_scores)]) -}) - -context("lgb.train()") - -test_that("lgb.train() works as expected with multiple eval metrics", { - metrics <- c("binary_error", "auc", "binary_logloss") - bst <- lgb.train( - data = lgb.Dataset( - train$data - , label = train$label - ) - , learning_rate = 1.0 - , nrounds = 10L - , params = list( - objective = "binary" - , metric = metrics - ) - , valids = list( - "train" = lgb.Dataset( - train$data - , label = train$label - ) - ) - ) - expect_false(is.null(bst$record_evals)) - expect_named( - bst$record_evals[["train"]] - , unlist(metrics) - , ignore.order = FALSE - , ignore.case = FALSE - ) -}) - 
-test_that("lgb.train() rejects negative or 0 value passed to nrounds", { - dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", metric = "l2,l1") - for (nround_value in c(-10L, 0L)) { - # expect_error({ - # bst <- lgb.train( - # params - # , dtrain - # , nround_value - # ) - # }, "nrounds should be greater than zero") - } -}) - -test_that("lgb.train() throws an informative error if 'data' is not an lgb.Dataset", { - bad_values <- list( - 4L - , "hello" - , list(a = TRUE, b = seq_len(10L)) - , data.frame(x = seq_len(5L), y = seq_len(5L)) - , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) - , matrix(data = seq_len(10L), 2L, 5L) - ) - for (val in bad_values) { - # expect_error({ - # bst <- lgb.train( - # params = list(objective = "regression", metric = "l2,l1") - # , data = val - # , 10L - # ) - # }, regexp = "data must be an lgb.Dataset instance", fixed = TRUE) - } -}) - -test_that("lgb.train() throws an informative error if 'valids' is not a list of lgb.Dataset objects", { - valids <- list( - "valid1" = data.frame(x = rnorm(5L), y = rnorm(5L)) - , "valid2" = data.frame(x = rnorm(5L), y = rnorm(5L)) - ) - # expect_error({ - # bst <- lgb.train( - # params = list(objective = "regression", metric = "l2,l1") - # , data = lgb.Dataset(train$data, label = train$label) - # , 10L - # , valids = valids - # ) - # }, regexp = "valids must be a list of lgb.Dataset elements") -}) - -test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but some do not have names", { - valids <- list( - "valid1" = lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) - , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) - ) - # expect_error({ - # bst <- lgb.train( - # params = list(objective = "regression", metric = "l2,l1") - # , data = lgb.Dataset(train$data, label = train$label) - # , 10L - # , valids = valids - # ) - # }, regexp = "each element of valids must have a name") -}) - -test_that("lgb.train() throws an informative error if 'valids' contains lgb.Dataset objects but none have names", { - valids <- list( - lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) - , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) - ) - # expect_error({ - # bst <- lgb.train( - # params = list(objective = "regression", metric = "l2,l1") - # , data = lgb.Dataset(train$data, label = train$label) - # , 10L - # , valids = valids - # ) - # }, regexp = "each element of valids must have a name") -}) - -test_that("lgb.train() works with force_col_wise and force_row_wise", { - set.seed(1234L) - nrounds <- 10L - dtrain <- lgb.Dataset( - train$data - , label = train$label - ) - params <- list( - objective = "binary" - , metric = "binary_error" - , force_col_wise = TRUE - ) - bst_col_wise <- lgb.train( - params = params - , data = dtrain - , nrounds = nrounds - ) - - params <- list( - objective = "binary" - , metric = "binary_error" - , force_row_wise = TRUE - ) - bst_row_wise <- lgb.train( - params = params - , data = dtrain - , nrounds = nrounds - ) - - expected_error <- 0.003070782 - expect_equal(bst_col_wise$eval_train()[[1L]][["value"]], expected_error) - expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error) - - # check some basic details of the boosters just to be sure force_col_wise - # and force_row_wise are not causing any weird side effects - for (bst in list(bst_row_wise, bst_col_wise)) { - expect_equal(bst$current_iter(), nrounds) - parsed_model <- jsonlite::fromJSON(bst$dump_model()) - expect_equal(parsed_model$objective, "binary sigmoid:1") - 
expect_false(parsed_model$average_output) - } -}) - -test_that("lgb.train() works as expected with sparse features", { - set.seed(708L) - num_obs <- 70000L - trainDF <- data.frame( - y = sample(c(0L, 1L), size = num_obs, replace = TRUE) - , x = sample(c(1.0:10.0, rep(NA_real_, 50L)), size = num_obs, replace = TRUE) - ) - dtrain <- lgb.Dataset( - data = as.matrix(trainDF[["x"]], drop = FALSE) - , label = trainDF[["y"]] - ) - nrounds <- 1L - bst <- lgb.train( - params = list( - objective = "binary" - , min_data = 1L - , min_data_in_bin = 1L - ) - , data = dtrain - , nrounds = nrounds - ) - - expect_true(lgb.is.Booster(bst)) - expect_equal(bst$current_iter(), nrounds) - parsed_model <- jsonlite::fromJSON(bst$dump_model()) - expect_equal(parsed_model$objective, "binary sigmoid:1") - expect_false(parsed_model$average_output) - expected_error <- 0.6931268 - expect_true(abs(bst$eval_train()[[1L]][["value"]] - expected_error) < TOLERANCE) -}) - -test_that("lgb.train() works with early stopping for classification", { - trainDF <- data.frame( - "feat1" = rep(c(5.0, 10.0), 500L) - , "target" = rep(c(0L, 1L), 500L) - ) - validDF <- data.frame( - "feat1" = rep(c(5.0, 10.0), 50L) - , "target" = rep(c(0L, 1L), 50L) - ) - dtrain <- lgb.Dataset( - data = as.matrix(trainDF[["feat1"]], drop = FALSE) - , label = trainDF[["target"]] - ) - dvalid <- lgb.Dataset( - data = as.matrix(validDF[["feat1"]], drop = FALSE) - , label = validDF[["target"]] - ) - nrounds <- 10L - - ################################ - # train with no early stopping # - ################################ - bst <- lgb.train( - params = list( - objective = "binary" - , metric = "binary_error" - ) - , data = dtrain - , nrounds = nrounds - , valids = list( - "valid1" = dvalid - ) - ) - - # a perfect model should be trivial to obtain, but all 10 rounds - # should happen - expect_equal(bst$best_score, 0.0) - expect_equal(bst$best_iter, 1L) - expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) - - ############################# - # train with early stopping # - ############################# - early_stopping_rounds <- 5L - bst <- lgb.train( - params = list( - objective = "binary" - , metric = "binary_error" - , early_stopping_rounds = early_stopping_rounds - ) - , data = dtrain - , nrounds = nrounds - , valids = list( - "valid1" = dvalid - ) - ) - - # a perfect model should be trivial to obtain, and only 6 rounds - # should have happen (1 with improvement, 5 consecutive with no improvement) - expect_equal(bst$best_score, 0.0) - expect_equal(bst$best_iter, 1L) - expect_equal( - length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]) - , early_stopping_rounds + 1L - ) - -}) - -test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stopping", { - set.seed(708L) - trainDF <- data.frame( - "feat1" = rep(c(5.0, 10.0), 500L) - , "target" = rep(c(0L, 1L), 500L) - ) - validDF <- data.frame( - "feat1" = rep(c(5.0, 10.0), 50L) - , "target" = rep(c(0L, 1L), 50L) - ) - dtrain <- lgb.Dataset( - data = as.matrix(trainDF[["feat1"]], drop = FALSE) - , label = trainDF[["target"]] - ) - dvalid <- lgb.Dataset( - data = as.matrix(validDF[["feat1"]], drop = FALSE) - , label = validDF[["target"]] - ) - nrounds <- 5L - - for (value in c(-5L, 0L)) { - - #----------------------------# - # passed as keyword argument # - #----------------------------# - bst <- lgb.train( - params = list( - objective = "binary" - , metric = "binary_error" - ) - , data = dtrain - , nrounds = nrounds - , valids = list( - "valid1" = 
dvalid - ) - , early_stopping_rounds = value - ) - - # a perfect model should be trivial to obtain, but all 10 rounds - # should happen - expect_equal(bst$best_score, 0.0) - expect_equal(bst$best_iter, 1L) - expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) - - #---------------------------# - # passed as parameter alias # - #---------------------------# - bst <- lgb.train( - params = list( - objective = "binary" - , metric = "binary_error" - , n_iter_no_change = value - ) - , data = dtrain - , nrounds = nrounds - , valids = list( - "valid1" = dvalid - ) - ) - - # a perfect model should be trivial to obtain, but all 10 rounds - # should happen - expect_equal(bst$best_score, 0.0) - expect_equal(bst$best_iter, 1L) - expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) - } -}) - -test_that("lgb.train() works with early stopping for classification with a metric that should be maximized", { - set.seed(708L) - dtrain <- lgb.Dataset( - data = train$data - , label = train$label - ) - dvalid <- lgb.Dataset( - data = test$data - , label = test$label - ) - nrounds <- 10L - - ############################# - # train with early stopping # - ############################# - early_stopping_rounds <- 5L - # the harsh max_depth guarantees that AUC improves over at least the first few iterations - bst_auc <- lgb.train( - params = list( - objective = "binary" - , metric = "auc" - , max_depth = 3L - , early_stopping_rounds = early_stopping_rounds - ) - , data = dtrain - , nrounds = nrounds - , valids = list( - "valid1" = dvalid - ) - ) - bst_binary_error <- lgb.train( - params = list( - objective = "binary" - , metric = "binary_error" - , max_depth = 3L - , early_stopping_rounds = early_stopping_rounds - ) - , data = dtrain - , nrounds = nrounds - , valids = list( - "valid1" = dvalid - ) - ) - - # early stopping should have been hit for binary_error (higher_better = FALSE) - eval_info <- bst_binary_error$.__enclos_env__$private$get_eval_info() - expect_identical(eval_info, "binary_error") - expect_identical( - unname(bst_binary_error$.__enclos_env__$private$higher_better_inner_eval) - , FALSE - ) - expect_identical(bst_binary_error$best_iter, 1L) - expect_identical(bst_binary_error$current_iter(), early_stopping_rounds + 1L) - expect_true(abs(bst_binary_error$best_score - 0.01613904) < TOLERANCE) - - # early stopping should not have been hit for AUC (higher_better = TRUE) - eval_info <- bst_auc$.__enclos_env__$private$get_eval_info() - expect_identical(eval_info, "auc") - expect_identical( - unname(bst_auc$.__enclos_env__$private$higher_better_inner_eval) - , TRUE - ) - expect_identical(bst_auc$best_iter, 9L) - expect_identical(bst_auc$current_iter(), nrounds) - expect_true(abs(bst_auc$best_score - 0.9999969) < TOLERANCE) -}) - -test_that("lgb.train() works with early stopping for regression", { - set.seed(708L) - trainDF <- data.frame( - "feat1" = rep(c(10.0, 100.0), 500L) - , "target" = rep(c(-50.0, 50.0), 500L) - ) - validDF <- data.frame( - "feat1" = rep(50.0, 4L) - , "target" = rep(50.0, 4L) - ) - dtrain <- lgb.Dataset( - data = as.matrix(trainDF[["feat1"]], drop = FALSE) - , label = trainDF[["target"]] - ) - dvalid <- lgb.Dataset( - data = as.matrix(validDF[["feat1"]], drop = FALSE) - , label = validDF[["target"]] - ) - nrounds <- 10L - - ################################ - # train with no early stopping # - ################################ - bst <- lgb.train( - params = list( - objective = "regression" - , metric = "rmse" - ) - , data 
= dtrain - , nrounds = nrounds - , valids = list( - "valid1" = dvalid - ) - ) - - # the best possible model should come from the first iteration, but - # all 10 training iterations should happen - expect_equal(bst$best_score, 55.0) - expect_equal(bst$best_iter, 1L) - expect_equal(length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]), nrounds) - - ############################# - # train with early stopping # - ############################# - early_stopping_rounds <- 5L - bst <- lgb.train( - params = list( - objective = "regression" - , metric = "rmse" - , early_stopping_rounds = early_stopping_rounds - ) - , data = dtrain - , nrounds = nrounds - , valids = list( - "valid1" = dvalid - ) - ) - - # the best model should be from the first iteration, and only 6 rounds - # should have happen (1 with improvement, 5 consecutive with no improvement) - expect_equal(bst$best_score, 55.0) - expect_equal(bst$best_iter, 1L) - expect_equal( - length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) - , early_stopping_rounds + 1L - ) -}) - -test_that("lgb.train() does not stop early if early_stopping_rounds is not given", { - set.seed(708L) - - increasing_metric_starting_value <- get( - ACCUMULATOR_NAME - , envir = .GlobalEnv - ) - nrounds <- 10L - metrics <- list( - .constant_metric - , .increasing_metric - ) - bst <- lgb.train( - params = list( - objective = "regression" - , metric = "None" - ) - , data = DTRAIN_RANDOM_REGRESSION - , nrounds = nrounds - , valids = list("valid1" = DVALID_RANDOM_REGRESSION) - , eval = metrics - ) - - # Only the two functions provided to "eval" should have been evaluated - expect_equal(length(bst$record_evals[["valid1"]]), 2L) - - # all 10 iterations should have happen, and the best_iter should be - # the first one (based on constant_metric) - best_iter <- 1L - expect_equal(bst$best_iter, best_iter) - - # best_score should be taken from the first metric - expect_equal( - bst$best_score - , bst$record_evals[["valid1"]][["constant_metric"]][["eval"]][[best_iter]] - ) - - # early stopping should not have happened. 
Even though constant_metric - # had 9 consecutive iterations with no improvement, it is ignored because of - # first_metric_only = TRUE - expect_equal( - length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) - , nrounds - ) - expect_equal( - length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) - , nrounds - ) -}) - -test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to stop early based on all metrics", { - set.seed(708L) - - early_stopping_rounds <- 3L - param_variations <- list( - list( - objective = "regression" - , metric = "None" - , early_stopping_rounds = early_stopping_rounds - ) - , list( - objective = "regression" - , metric = "None" - , early_stopping_rounds = early_stopping_rounds - , first_metric_only = FALSE - ) - ) - - for (params in param_variations) { - - nrounds <- 10L - bst <- lgb.train( - params = params - , data = DTRAIN_RANDOM_REGRESSION - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_REGRESSION - ) - , eval = list( - .increasing_metric - , .constant_metric - ) - ) - - # Only the two functions provided to "eval" should have been evaluated - expect_equal(length(bst$record_evals[["valid1"]]), 2L) - - # early stopping should have happened, and should have stopped early_stopping_rounds + 1 rounds in - # because constant_metric never improves - # - # the best iteration should be the last one, because increasing_metric was first - # and gets better every iteration - best_iter <- early_stopping_rounds + 1L - expect_equal(bst$best_iter, best_iter) - - # best_score should be taken from "increasing_metric" because it was first - expect_equal( - bst$best_score - , bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]][[best_iter]] - ) - - # early stopping should not have happened. even though increasing_metric kept - # getting better, early stopping should have happened because "constant_metric" - # did not improve - expect_equal( - length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) - , early_stopping_rounds + 1L - ) - expect_equal( - length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) - , early_stopping_rounds + 1L - ) - } - -}) - -test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based on only the first metric", { - set.seed(708L) - nrounds <- 10L - early_stopping_rounds <- 3L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.train( - params = list( - objective = "regression" - , metric = "None" - , early_stopping_rounds = early_stopping_rounds - , first_metric_only = TRUE - ) - , data = DTRAIN_RANDOM_REGRESSION - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_REGRESSION - ) - , eval = list( - .increasing_metric - , .constant_metric - ) - ) - - # Only the two functions provided to "eval" should have been evaluated - expect_equal(length(bst$record_evals[["valid1"]]), 2L) - - # all 10 iterations should happen, and the best_iter should be the final one - expect_equal(bst$best_iter, nrounds) - - # best_score should be taken from "increasing_metric" - expect_equal( - bst$best_score - , increasing_metric_starting_value + 0.1 * nrounds - ) - - # early stopping should not have happened. 
Even though constant_metric - # had 9 consecutive iterations with no improvement, it is ignored because of - # first_metric_only = TRUE - expect_equal( - length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) - , nrounds - ) - expect_equal( - length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) - , nrounds - ) -}) - -test_that("lgb.train() works when a mixture of functions and strings are passed to eval", { - set.seed(708L) - nrounds <- 10L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.train( - params = list( - objective = "regression" - , metric = "None" - ) - , data = DTRAIN_RANDOM_REGRESSION - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_REGRESSION - ) - , eval = list( - .increasing_metric - , "rmse" - , .constant_metric - , "l2" - ) - ) - - # all 4 metrics should have been used - expect_named( - bst$record_evals[["valid1"]] - , expected = c("rmse", "l2", "increasing_metric", "constant_metric") - , ignore.order = TRUE - , ignore.case = FALSE - ) - - # the difference metrics shouldn't have been mixed up with each other - results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE) - expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE) - expected_increasing_metric <- increasing_metric_starting_value + 0.1 - expect_true( - abs( - results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric - ) < TOLERANCE - ) - expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) - -}) - -test_that("lgb.train() works when a list of strings or a character vector is passed to eval", { - - # testing list and character vector, as well as length-1 and length-2 - eval_variations <- list( - c("binary_error", "binary_logloss") - , "binary_logloss" - , list("binary_error", "binary_logloss") - , list("binary_logloss") - ) - - for (eval_variation in eval_variations) { - - set.seed(708L) - nrounds <- 10L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.train( - params = list( - objective = "binary" - , metric = "None" - ) - , data = DTRAIN_RANDOM_CLASSIFICATION - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_CLASSIFICATION - ) - , eval = eval_variation - ) - - # both metrics should have been used - expect_named( - bst$record_evals[["valid1"]] - , expected = unlist(eval_variation) - , ignore.order = TRUE - , ignore.case = FALSE - ) - - # the difference metrics shouldn't have been mixed up with each other - results <- bst$record_evals[["valid1"]] - if ("binary_error" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) - } - if ("binary_logloss" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) - } - } -}) - -test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { - set.seed(708L) - nrounds <- 10L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.train( - params = list( - objective = "binary" - , metric = "binary_error" - ) - , data = DTRAIN_RANDOM_CLASSIFICATION - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_CLASSIFICATION - ) - , eval = "binary_logloss" - ) - - # both metrics should have been used - expect_named( - bst$record_evals[["valid1"]] - , expected = c("binary_error", "binary_logloss") - , 
ignore.order = TRUE - , ignore.case = FALSE - ) - - # the different metrics shouldn't have been mixed up with each other - results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) -}) - -test_that("lgb.train() works when you give a function for eval", { - set.seed(708L) - nrounds <- 10L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.train( - params = list( - objective = "binary" - , metric = "None" - ) - , data = DTRAIN_RANDOM_CLASSIFICATION - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_CLASSIFICATION - ) - , eval = .constant_metric - ) - - # the different metrics shouldn't have been mixed up with each other - results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) -}) - -test_that("lgb.train() works with early stopping for regression with a metric that should be minimized", { - set.seed(708L) - trainDF <- data.frame( - "feat1" = rep(c(10.0, 100.0), 500L) - , "target" = rep(c(-50.0, 50.0), 500L) - ) - validDF <- data.frame( - "feat1" = rep(50.0, 4L) - , "target" = rep(50.0, 4L) - ) - dtrain <- lgb.Dataset( - data = as.matrix(trainDF[["feat1"]], drop = FALSE) - , label = trainDF[["target"]] - ) - dvalid <- lgb.Dataset( - data = as.matrix(validDF[["feat1"]], drop = FALSE) - , label = validDF[["target"]] - ) - nrounds <- 10L - - ############################# - # train with early stopping # - ############################# - early_stopping_rounds <- 5L - bst <- lgb.train( - params = list( - objective = "regression" - , metric = c( - "mape" - , "rmse" - , "mae" - ) - , min_data_in_bin = 5L - , early_stopping_rounds = early_stopping_rounds - ) - , data = dtrain - , nrounds = nrounds - , valids = list( - "valid1" = dvalid - ) - ) - - # the best model should be from the first iteration, and only 6 rounds - # should have happened (1 with improvement, 5 consecutive with no improvement) - expect_equal(bst$best_score, 1.1) - expect_equal(bst$best_iter, 1L) - expect_equal( - length(bst$record_evals[["valid1"]][["mape"]][["eval"]]) - , early_stopping_rounds + 1L - ) - - # Booster should understand that all three of these metrics should be minimized - eval_info <- bst$.__enclos_env__$private$get_eval_info() - expect_identical(eval_info, c("mape", "rmse", "l1")) - expect_identical( - unname(bst$.__enclos_env__$private$higher_better_inner_eval) - , rep(FALSE, 3L) - ) -}) - - -test_that("lgb.train() supports non-ASCII feature names", { - testthat::skip("UTF-8 feature names are not fully supported in the R package") - dtrain <- lgb.Dataset( - data = matrix(rnorm(400L), ncol = 4L) - , label = rnorm(100L) - ) - feature_names <- c("F_零", "F_一", "F_二", "F_三") - bst <- lgb.train( - data = dtrain - , nrounds = 5L - , obj = "regression" - , params = list( - metric = "rmse" - ) - , colnames = feature_names - ) - expect_true(lgb.is.Booster(bst)) - dumped_model <- jsonlite::fromJSON(bst$dump_model()) - expect_identical( - dumped_model[["feature_names"]] - , feature_names - ) -}) - -test_that("when early stopping is not activated, best_iter and best_score come from valids and not training data", { - set.seed(708L) - trainDF <- data.frame( - "feat1" = rep(c(10.0, 100.0), 500L) - , "target" = rep(c(-50.0, 50.0), 500L) - ) - validDF <- data.frame( - "feat1" = rep(50.0, 4L) - , "target" = rep(50.0, 4L) - ) - dtrain 
<- lgb.Dataset( - data = as.matrix(trainDF[["feat1"]], drop = FALSE) - , label = trainDF[["target"]] - ) - dvalid1 <- lgb.Dataset( - data = as.matrix(validDF[["feat1"]], drop = FALSE) - , label = validDF[["target"]] - ) - dvalid2 <- lgb.Dataset( - data = as.matrix(validDF[1L:10L, "feat1"], drop = FALSE) - , label = validDF[1L:10L, "target"] - ) - nrounds <- 10L - train_params <- list( - objective = "regression" - , metric = "rmse" - , learning_rate = 1.5 - ) - - # example 1: two valids, neither are the training data - bst <- lgb.train( - data = dtrain - , nrounds = nrounds - , num_leaves = 5L - , valids = list( - "valid1" = dvalid1 - , "valid2" = dvalid2 - ) - , params = train_params - ) - expect_named( - bst$record_evals - , c("start_iter", "valid1", "valid2") - , ignore.order = FALSE - , ignore.case = FALSE - ) - rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) - expect_length(rmse_scores, nrounds) - expect_identical(bst$best_iter, which.min(rmse_scores)) - expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - - # example 2: train first (called "train") and two valids - bst <- lgb.train( - data = dtrain - , nrounds = nrounds - , num_leaves = 5L - , valids = list( - "train" = dtrain - , "valid1" = dvalid1 - , "valid2" = dvalid2 - ) - , params = train_params - ) - expect_named( - bst$record_evals - , c("start_iter", "train", "valid1", "valid2") - , ignore.order = FALSE - , ignore.case = FALSE - ) - rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) - expect_length(rmse_scores, nrounds) - expect_identical(bst$best_iter, which.min(rmse_scores)) - expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - - # example 3: train second (called "train") and two valids - bst <- lgb.train( - data = dtrain - , nrounds = nrounds - , num_leaves = 5L - , valids = list( - "valid1" = dvalid1 - , "train" = dtrain - , "valid2" = dvalid2 - ) - , params = train_params - ) - # note that "train" still ends up as the first one - expect_named( - bst$record_evals - , c("start_iter", "train", "valid1", "valid2") - , ignore.order = FALSE - , ignore.case = FALSE - ) - rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) - expect_length(rmse_scores, nrounds) - expect_identical(bst$best_iter, which.min(rmse_scores)) - expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - - # example 4: train third (called "train") and two valids - bst <- lgb.train( - data = dtrain - , nrounds = nrounds - , num_leaves = 5L - , valids = list( - "valid1" = dvalid1 - , "valid2" = dvalid2 - , "train" = dtrain - ) - , params = train_params - ) - # note that "train" still ends up as the first one - expect_named( - bst$record_evals - , c("start_iter", "train", "valid1", "valid2") - , ignore.order = FALSE - , ignore.case = FALSE - ) - rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) - expect_length(rmse_scores, nrounds) - expect_identical(bst$best_iter, which.min(rmse_scores)) - expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - - # example 5: train second (called "something-random-we-would-not-hardcode") and two valids - bst <- lgb.train( - data = dtrain - , nrounds = nrounds - , num_leaves = 5L - , valids = list( - "valid1" = dvalid1 - , "something-random-we-would-not-hardcode" = dtrain - , "valid2" = dvalid2 - ) - , params = train_params - ) - # note that "something-random-we-would-not-hardcode" was recognized as the training - # data even though it isn't named "train" - expect_named( - 
bst$record_evals - , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") - , ignore.order = FALSE - , ignore.case = FALSE - ) - rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) - expect_length(rmse_scores, nrounds) - expect_identical(bst$best_iter, which.min(rmse_scores)) - expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - - # example 6: the only valid supplied is the training data - bst <- lgb.train( - data = dtrain - , nrounds = nrounds - , num_leaves = 5L - , valids = list( - "train" = dtrain - ) - , params = train_params - ) - expect_identical(bst$best_iter, -1L) - expect_identical(bst$best_score, NA_real_) -}) - -test_that("lightgbm.train() gives the correct best_score and best_iter for a metric where higher values are better", { - set.seed(708L) - trainDF <- data.frame( - "feat1" = runif(n = 500L, min = 0.0, max = 15.0) - , "target" = rep(c(0L, 1L), 500L) - ) - validDF <- data.frame( - "feat1" = runif(n = 50L, min = 0.0, max = 15.0) - , "target" = rep(c(0L, 1L), 50L) - ) - dtrain <- lgb.Dataset( - data = as.matrix(trainDF[["feat1"]], drop = FALSE) - , label = trainDF[["target"]] - ) - dvalid1 <- lgb.Dataset( - data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) - , label = validDF[1L:25L, "target"] - ) - nrounds <- 10L - bst <- lgb.train( - data = dtrain - , nrounds = nrounds - , num_leaves = 5L - , valids = list( - "valid1" = dvalid1 - , "something-random-we-would-not-hardcode" = dtrain - ) - , params = list( - objective = "binary" - , metric = "auc" - , learning_rate = 1.5 - ) - ) - # note that "something-random-we-would-not-hardcode" was recognized as the training - # data even though it isn't named "train" - expect_named( - bst$record_evals - , c("start_iter", "something-random-we-would-not-hardcode", "valid1") - , ignore.order = FALSE - , ignore.case = FALSE - ) - auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) - expect_length(auc_scores, nrounds) - expect_identical(bst$best_iter, which.max(auc_scores)) - expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) -}) - -test_that("using lightgbm() without early stopping, best_iter and best_score come from valids and not training data", { - set.seed(708L) - # example: train second (called "something-random-we-would-not-hardcode"), two valids, - # and a metric where higher values are better ("auc") - trainDF <- data.frame( - "feat1" = runif(n = 500L, min = 0.0, max = 15.0) - , "target" = rep(c(0L, 1L), 500L) - ) - validDF <- data.frame( - "feat1" = runif(n = 50L, min = 0.0, max = 15.0) - , "target" = rep(c(0L, 1L), 50L) - ) - dtrain <- lgb.Dataset( - data = as.matrix(trainDF[["feat1"]], drop = FALSE) - , label = trainDF[["target"]] - ) - dvalid1 <- lgb.Dataset( - data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) - , label = validDF[1L:25L, "target"] - ) - dvalid2 <- lgb.Dataset( - data = as.matrix(validDF[26L:50L, "feat1"], drop = FALSE) - , label = validDF[26L:50L, "target"] - ) - nrounds <- 10L - bst <- lightgbm( - data = dtrain - , nrounds = nrounds - , num_leaves = 5L - , valids = list( - "valid1" = dvalid1 - , "something-random-we-would-not-hardcode" = dtrain - , "valid2" = dvalid2 - ) - , params = list( - objective = "binary" - , metric = "auc" - , learning_rate = 1.5 - ) - , verbose = -7L - , save_name = tempfile(fileext = ".model") - ) - # when verbose <= 0 is passed to lightgbm(), 'valids' is passed through to lgb.train() - # untouched. 
If you set verbose to > 0, the training data will still be first but called "train" - expect_named( - bst$record_evals - , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") - , ignore.order = FALSE - , ignore.case = FALSE - ) - auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) - expect_length(auc_scores, nrounds) - expect_identical(bst$best_iter, which.max(auc_scores)) - expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) -}) - -test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings", { - set.seed(708L) - nrounds <- 10L - nfolds <- 4L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.cv( - params = list( - objective = "binary" - , metric = "binary_error" - ) - , data = DTRAIN_RANDOM_CLASSIFICATION - , nrounds = nrounds - , nfold = nfolds - , eval = "binary_logloss" - ) - - # both metrics should have been used - expect_named( - bst$record_evals[["valid"]] - , expected = c("binary_error", "binary_logloss") - , ignore.order = TRUE - , ignore.case = FALSE - ) - - # the difference metrics shouldn't have been mixed up with each other - results <- bst$record_evals[["valid"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE) - - # all boosters should have been created - expect_length(bst$boosters, nfolds) -}) - -test_that("lgb.cv() works when you give a function for eval", { - set.seed(708L) - nrounds <- 10L - nfolds <- 3L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.cv( - params = list( - objective = "binary" - , metric = "None" - ) - , data = DTRAIN_RANDOM_CLASSIFICATION - , nfold = nfolds - , nrounds = nrounds - , eval = .constant_metric - ) - - # the difference metrics shouldn't have been mixed up with each other - results <- bst$record_evals[["valid"]] - expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) - expect_named(results, "constant_metric") -}) - -test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on only the first metric", { - set.seed(708L) - nrounds <- 10L - nfolds <- 5L - early_stopping_rounds <- 3L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.cv( - params = list( - objective = "regression" - , metric = "None" - , early_stopping_rounds = early_stopping_rounds - , first_metric_only = TRUE - ) - , data = DTRAIN_RANDOM_REGRESSION - , nfold = nfolds - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_REGRESSION - ) - , eval = list( - .increasing_metric - , .constant_metric - ) - ) - - # Only the two functions provided to "eval" should have been evaluated - expect_named(bst$record_evals[["valid"]], c("increasing_metric", "constant_metric")) - - # all 10 iterations should happen, and the best_iter should be the final one - expect_equal(bst$best_iter, nrounds) - - # best_score should be taken from "increasing_metric" - # - # this expected value looks magical and confusing, but it's because - # evaluation metrics are averaged over all folds. 
- # - # consider 5-fold CV with a metric that adds 0.1 to a global accumulator - # each time it's called - # - # * iter 1: [0.1, 0.2, 0.3, 0.4, 0.5] (mean = 0.3) - # * iter 2: [0.6, 0.7, 0.8, 0.9, 1.0] (mean = 1.3) - # * iter 3: [1.1, 1.2, 1.3, 1.4, 1.5] (mean = 1.8) - # - cv_value <- increasing_metric_starting_value + mean(seq_len(nfolds) / 10.0) + (nrounds - 1L) * 0.1 * nfolds - expect_equal(bst$best_score, cv_value) - - # early stopping should not have happened. Even though constant_metric - # had 9 consecutive iterations with no improvement, it is ignored because of - # first_metric_only = TRUE - expect_equal( - length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]]) - , nrounds - ) - expect_equal( - length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) - , nrounds - ) -}) - -test_that("early stopping works with lgb.cv()", { - set.seed(708L) - nrounds <- 10L - nfolds <- 5L - early_stopping_rounds <- 3L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.cv( - params = list( - objective = "regression" - , metric = "None" - , early_stopping_rounds = early_stopping_rounds - , first_metric_only = TRUE - ) - , data = DTRAIN_RANDOM_REGRESSION - , nfold = nfolds - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_REGRESSION - ) - , eval = list( - .constant_metric - , .increasing_metric - ) - ) - - # only the two functions provided to "eval" should have been evaluated - expect_named(bst$record_evals[["valid"]], c("constant_metric", "increasing_metric")) - - # best_iter should be based on the first metric. Since constant_metric - # never changes, its first iteration was the best oone - expect_equal(bst$best_iter, 1L) - - # best_score should be taken from the first metri - expect_equal(bst$best_score, 0.2) - - # early stopping should have happened, since constant_metric was the first - # one passed to eval and it will not improve over consecutive iterations - # - # note that this test is identical to the previous one, but with the - # order of the eval metrics switched - expect_equal( - length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]]) - , early_stopping_rounds + 1L - ) - expect_equal( - length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) - , early_stopping_rounds + 1L - ) -}) - -context("interaction constraints") - -test_that("lgb.train() throws an informative error if interaction_constraints is not a list", { - dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", interaction_constraints = "[1,2],[3]") - # expect_error({ - # bst <- lightgbm( - # data = dtrain - # , params = params - # , nrounds = 2L - # ) - # }, "interaction_constraints must be a list") -}) - -test_that(paste0("lgb.train() throws an informative error if the members of interaction_constraints ", - "are not character or numeric vectors"), { - dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", interaction_constraints = list(list(1L, 2L), list(3L))) - # expect_error({ - # bst <- lightgbm( - # data = dtrain - # , params = params - # , nrounds = 2L - # ) - # }, "every element in interaction_constraints must be a character vector or numeric vector") -}) - -test_that("lgb.train() throws an informative error if interaction_constraints contains a too large index", { - dtrain <- lgb.Dataset(train$data, label = train$label) - params <- list(objective = "regression", - interaction_constraints = list(c(1L, length(colnames(train$data)) + 
1L), 3L)) - # expect_error({ - # bst <- lightgbm( - # data = dtrain - # , params = params - # , nrounds = 2L - # ) - # }, "supplied a too large value in interaction_constraints") -}) - -test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ", - "character vectors, numeric vectors, or a combination"), { - set.seed(1L) - dtrain <- lgb.Dataset(train$data, label = train$label) - - params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) - bst <- lightgbm( - data = dtrain - , params = params - , nrounds = 2L - ) - pred1 <- bst$predict(test$data) - - cnames <- colnames(train$data) - params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]])) - bst <- lightgbm( - data = dtrain - , params = params - , nrounds = 2L - ) - pred2 <- bst$predict(test$data) - - params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L)) - bst <- lightgbm( - data = dtrain - , params = params - , nrounds = 2L - ) - pred3 <- bst$predict(test$data) - - expect_equal(pred1, pred2) - expect_equal(pred2, pred3) - -}) - -test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), { - set.seed(1L) - dtrain <- lgb.Dataset(train$data, label = train$label) - - params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) - bst <- lightgbm( - data = dtrain - , params = params - , nrounds = 2L - ) - pred1 <- bst$predict(test$data) - - new_colnames <- paste0(colnames(train$data), "_x") - params <- list(objective = "regression" - , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])) - bst <- lightgbm( - data = dtrain - , params = params - , nrounds = 2L - , colnames = new_colnames - ) - pred2 <- bst$predict(test$data) - - expect_equal(pred1, pred2) - -}) +# context("lightgbm()") + +# data(agaricus.train, package = "lightgbm") +# data(agaricus.test, package = "lightgbm") +# train <- agaricus.train +# test <- agaricus.test + +# TOLERANCE <- 1e-6 +# set.seed(708L) + +# # [description] Every time this function is called, it adds 0.1 +# # to an accumulator then returns the current value. 
+# # This is used to mock the situation where an evaluation +# # metric increases every iteration +# ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR" +# assign(x = ACCUMULATOR_NAME, value = 0.0, envir = .GlobalEnv) + +# .increasing_metric <- function(preds, dtrain) { +# if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)) { +# assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) +# } +# assign( +# x = ACCUMULATOR_NAME +# , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) + 0.1 +# , envir = .GlobalEnv +# ) +# return(list( +# name = "increasing_metric" +# , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# , higher_better = TRUE +# )) +# } + +# # [description] Evaluation function that always returns the +# # same value +# CONSTANT_METRIC_VALUE <- 0.2 +# .constant_metric <- function(preds, dtrain) { +# return(list( +# name = "constant_metric" +# , value = CONSTANT_METRIC_VALUE +# , higher_better = FALSE +# )) +# } + +# # sample datasets to test early stopping +# DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( +# data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) +# , label = rnorm(100L) +# ) +# DVALID_RANDOM_REGRESSION <- lgb.Dataset( +# data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) +# , label = rnorm(50L) +# ) +# DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( +# data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) +# , label = sample(c(0L, 1L), size = 120L, replace = TRUE) +# ) +# DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( +# data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) +# , label = sample(c(0L, 1L), size = 37L, replace = TRUE) +# ) + +# test_that("train and predict binary classification", { +# nrounds <- 10L +# bst <- lightgbm( +# data = train$data +# , label = train$label +# , num_leaves = 5L +# , nrounds = nrounds +# , objective = "binary" +# , metric = "binary_error" +# , save_name = tempfile(fileext = ".model") +# ) +# expect_false(is.null(bst$record_evals)) +# record_results <- lgb.get.eval.result(bst, "train", "binary_error") +# expect_lt(min(record_results), 0.02) + +# pred <- predict(bst, test$data) +# expect_equal(length(pred), 1611L) + +# pred1 <- predict(bst, train$data, num_iteration = 1L) +# expect_equal(length(pred1), 6513L) +# err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) +# err_log <- record_results[1L] +# expect_lt(abs(err_pred1 - err_log), TOLERANCE) +# }) + + +# test_that("train and predict softmax", { +# set.seed(708L) +# lb <- as.numeric(iris$Species) - 1L + +# bst <- lightgbm( +# data = as.matrix(iris[, -5L]) +# , label = lb +# , num_leaves = 4L +# , learning_rate = 0.05 +# , nrounds = 20L +# , min_data = 20L +# , min_hessian = 10.0 +# , objective = "multiclass" +# , metric = "multi_error" +# , num_class = 3L +# , save_name = tempfile(fileext = ".model") +# ) + +# expect_false(is.null(bst$record_evals)) +# record_results <- lgb.get.eval.result(bst, "train", "multi_error") +# expect_lt(min(record_results), 0.06) + +# pred <- predict(bst, as.matrix(iris[, -5L])) +# expect_equal(length(pred), nrow(iris) * 3L) +# }) + + +# test_that("use of multiple eval metrics works", { +# metrics <- list("binary_error", "auc", "binary_logloss") +# bst <- lightgbm( +# data = train$data +# , label = train$label +# , num_leaves = 4L +# , learning_rate = 1.0 +# , nrounds = 10L +# , objective = "binary" +# , metric = metrics +# , save_name = tempfile(fileext = ".model") +# ) +# expect_false(is.null(bst$record_evals)) +# expect_named( +# bst$record_evals[["train"]] +# , unlist(metrics) +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# }) + +# 
test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for binary classification", { +# set.seed(708L) +# nrounds <- 10L +# bst <- lightgbm( +# data = train$data +# , label = train$label +# , num_leaves = 5L +# , nrounds = nrounds +# , objective = "binary" +# , metric = "binary_error" +# , save_name = tempfile(fileext = ".model") +# ) +# expect_true(abs(bst$lower_bound() - -1.590853) < TOLERANCE) +# expect_true(abs(bst$upper_bound() - 1.871015) < TOLERANCE) +# }) + +# test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for regression", { +# set.seed(708L) +# nrounds <- 10L +# bst <- lightgbm( +# data = train$data +# , label = train$label +# , num_leaves = 5L +# , nrounds = nrounds +# , objective = "regression" +# , metric = "l2" +# , save_name = tempfile(fileext = ".model") +# ) +# expect_true(abs(bst$lower_bound() - 0.1513859) < TOLERANCE) +# expect_true(abs(bst$upper_bound() - 0.9080349) < TOLERANCE) +# }) + +# test_that("lightgbm() rejects negative or 0 value passed to nrounds", { +# dtrain <- lgb.Dataset(train$data, label = train$label) +# params <- list(objective = "regression", metric = "l2,l1") +# for (nround_value in c(-10L, 0L)) { +# # expect_error({ +# # bst <- lightgbm( +# # data = dtrain +# # , params = params +# # , nrounds = nround_value +# # , save_name = tempfile(fileext = ".model") +# # ) +# # }, "nrounds should be greater than zero") +# } +# }) + +# test_that("lightgbm() performs evaluation on validation sets if they are provided", { +# set.seed(708L) +# dvalid1 <- lgb.Dataset( +# data = train$data +# , label = train$label +# ) +# dvalid2 <- lgb.Dataset( +# data = train$data +# , label = train$label +# ) +# nrounds <- 10L +# bst <- lightgbm( +# data = train$data +# , label = train$label +# , num_leaves = 5L +# , nrounds = nrounds +# , objective = "binary" +# , metric = c( +# "binary_error" +# , "auc" +# ) +# , valids = list( +# "valid1" = dvalid1 +# , "valid2" = dvalid2 +# ) +# , save_name = tempfile(fileext = ".model") +# ) + +# expect_named( +# bst$record_evals +# , c("train", "valid1", "valid2", "start_iter") +# , ignore.order = TRUE +# , ignore.case = FALSE +# ) +# for (valid_name in c("train", "valid1", "valid2")) { +# eval_results <- bst$record_evals[[valid_name]][["binary_error"]] +# expect_length(eval_results[["eval"]], nrounds) +# } +# expect_true(abs(bst$record_evals[["train"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) +# expect_true(abs(bst$record_evals[["valid1"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) +# expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) +# }) + + +# context("training continuation") + +# test_that("training continuation works", { +# dtrain <- lgb.Dataset( +# train$data +# , label = train$label +# , free_raw_data = FALSE +# ) +# watchlist <- list(train = dtrain) +# param <- list( +# objective = "binary" +# , metric = "binary_logloss" +# , num_leaves = 5L +# , learning_rate = 1.0 +# ) + +# # train for 10 consecutive iterations +# bst <- lgb.train(param, dtrain, nrounds = 10L, watchlist) +# err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10L) + +# # train for 5 iterations, save, load, train for 5 more +# bst1 <- lgb.train(param, dtrain, nrounds = 5L, watchlist) +# model_file <- tempfile(fileext = ".model") +# lgb.save(bst1, model_file) +# bst2 <- lgb.train(param, dtrain, nrounds = 5L, watchlist, init_model = bst1) +# err_bst2 <- lgb.get.eval.result(bst2, "train", 
"binary_logloss", 10L) + +# # evaluation metrics should be nearly identical for the model trained in 10 coonsecutive +# # iterations and the one trained in 5-then-5. +# expect_lt(abs(err_bst - err_bst2), 0.01) +# }) + +# context("lgb.cv()") + +# test_that("cv works", { +# dtrain <- lgb.Dataset(train$data, label = train$label) +# params <- list(objective = "regression", metric = "l2,l1") +# bst <- lgb.cv( +# params +# , dtrain +# , 10L +# , nfold = 5L +# , min_data = 1L +# , learning_rate = 1.0 +# , early_stopping_rounds = 10L +# ) +# expect_false(is.null(bst$record_evals)) +# }) + +# test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { +# dtrain <- lgb.Dataset(train$data, label = train$label) +# params <- list(objective = "regression", metric = "l2,l1") +# for (nround_value in c(-10L, 0L)) { +# # expect_error({ +# # bst <- lgb.cv( +# # params +# # , dtrain +# # , nround_value +# # , nfold = 5L +# # , min_data = 1L +# # ) +# # }, "nrounds should be greater than zero") +# } +# }) + +# test_that("lgb.cv() throws an informative error is 'data' is not an lgb.Dataset and labels are not given", { +# bad_values <- list( +# 4L +# , "hello" +# , list(a = TRUE, b = seq_len(10L)) +# , data.frame(x = seq_len(5L), y = seq_len(5L)) +# , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) +# , matrix(data = seq_len(10L), 2L, 5L) +# ) +# for (val in bad_values) { +# # expect_error({ +# # bst <- lgb.cv( +# # params = list(objective = "regression", metric = "l2,l1") +# # , data = val +# # , 10L +# # , nfold = 5L +# # , min_data = 1L +# # ) +# # }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) +# } +# }) + +# test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric where higher values are better", { +# set.seed(708L) +# dtrain <- lgb.Dataset( +# data = as.matrix(runif(n = 500L, min = 0.0, max = 15.0), drop = FALSE) +# , label = rep(c(0L, 1L), 250L) +# ) +# nrounds <- 10L +# cv_bst <- lgb.cv( +# data = dtrain +# , nfold = 5L +# , nrounds = nrounds +# , num_leaves = 5L +# , params = list( +# objective = "binary" +# , metric = "auc,binary_error" +# , learning_rate = 1.5 +# ) +# ) +# expect_is(cv_bst, "lgb.CVBooster") +# expect_named( +# cv_bst$record_evals +# , c("start_iter", "valid") +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# auc_scores <- unlist(cv_bst$record_evals[["valid"]][["auc"]][["eval"]]) +# expect_length(auc_scores, nrounds) +# expect_identical(cv_bst$best_iter, which.max(auc_scores)) +# expect_identical(cv_bst$best_score, auc_scores[which.max(auc_scores)]) +# }) + +# context("lgb.train()") + +# test_that("lgb.train() works as expected with multiple eval metrics", { +# metrics <- c("binary_error", "auc", "binary_logloss") +# bst <- lgb.train( +# data = lgb.Dataset( +# train$data +# , label = train$label +# ) +# , learning_rate = 1.0 +# , nrounds = 10L +# , params = list( +# objective = "binary" +# , metric = metrics +# ) +# , valids = list( +# "train" = lgb.Dataset( +# train$data +# , label = train$label +# ) +# ) +# ) +# expect_false(is.null(bst$record_evals)) +# expect_named( +# bst$record_evals[["train"]] +# , unlist(metrics) +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# }) + +# test_that("lgb.train() rejects negative or 0 value passed to nrounds", { +# dtrain <- lgb.Dataset(train$data, label = train$label) +# params <- list(objective = "regression", metric = "l2,l1") +# for (nround_value in c(-10L, 0L)) { +# # expect_error({ +# # bst <- lgb.train( +# # params +# # , dtrain 
+# # , nround_value +# # ) +# # }, "nrounds should be greater than zero") +# } +# }) + +# test_that("lgb.train() throws an informative error if 'data' is not an lgb.Dataset", { +# bad_values <- list( +# 4L +# , "hello" +# , list(a = TRUE, b = seq_len(10L)) +# , data.frame(x = seq_len(5L), y = seq_len(5L)) +# , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) +# , matrix(data = seq_len(10L), 2L, 5L) +# ) +# for (val in bad_values) { +# # expect_error({ +# # bst <- lgb.train( +# # params = list(objective = "regression", metric = "l2,l1") +# # , data = val +# # , 10L +# # ) +# # }, regexp = "data must be an lgb.Dataset instance", fixed = TRUE) +# } +# }) + +# test_that("lgb.train() throws an informative error if 'valids' is not a list of lgb.Dataset objects", { +# valids <- list( +# "valid1" = data.frame(x = rnorm(5L), y = rnorm(5L)) +# , "valid2" = data.frame(x = rnorm(5L), y = rnorm(5L)) +# ) +# # expect_error({ +# # bst <- lgb.train( +# # params = list(objective = "regression", metric = "l2,l1") +# # , data = lgb.Dataset(train$data, label = train$label) +# # , 10L +# # , valids = valids +# # ) +# # }, regexp = "valids must be a list of lgb.Dataset elements") +# }) + +# test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but some do not have names", { +# valids <- list( +# "valid1" = lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) +# , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) +# ) +# # expect_error({ +# # bst <- lgb.train( +# # params = list(objective = "regression", metric = "l2,l1") +# # , data = lgb.Dataset(train$data, label = train$label) +# # , 10L +# # , valids = valids +# # ) +# # }, regexp = "each element of valids must have a name") +# }) + +# test_that("lgb.train() throws an informative error if 'valids' contains lgb.Dataset objects but none have names", { +# valids <- list( +# lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) +# , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) +# ) +# # expect_error({ +# # bst <- lgb.train( +# # params = list(objective = "regression", metric = "l2,l1") +# # , data = lgb.Dataset(train$data, label = train$label) +# # , 10L +# # , valids = valids +# # ) +# # }, regexp = "each element of valids must have a name") +# }) + +# test_that("lgb.train() works with force_col_wise and force_row_wise", { +# set.seed(1234L) +# nrounds <- 10L +# dtrain <- lgb.Dataset( +# train$data +# , label = train$label +# ) +# params <- list( +# objective = "binary" +# , metric = "binary_error" +# , force_col_wise = TRUE +# ) +# bst_col_wise <- lgb.train( +# params = params +# , data = dtrain +# , nrounds = nrounds +# ) + +# params <- list( +# objective = "binary" +# , metric = "binary_error" +# , force_row_wise = TRUE +# ) +# bst_row_wise <- lgb.train( +# params = params +# , data = dtrain +# , nrounds = nrounds +# ) + +# expected_error <- 0.003070782 +# expect_equal(bst_col_wise$eval_train()[[1L]][["value"]], expected_error) +# expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error) + +# # check some basic details of the boosters just to be sure force_col_wise +# # and force_row_wise are not causing any weird side effects +# for (bst in list(bst_row_wise, bst_col_wise)) { +# expect_equal(bst$current_iter(), nrounds) +# parsed_model <- jsonlite::fromJSON(bst$dump_model()) +# expect_equal(parsed_model$objective, "binary sigmoid:1") +# expect_false(parsed_model$average_output) +# } +# }) + +# test_that("lgb.train() works as expected with sparse features", { +# set.seed(708L) +# num_obs <- 70000L +# trainDF <- data.frame( +# y = sample(c(0L, 1L), size = 
num_obs, replace = TRUE) +# , x = sample(c(1.0:10.0, rep(NA_real_, 50L)), size = num_obs, replace = TRUE) +# ) +# dtrain <- lgb.Dataset( +# data = as.matrix(trainDF[["x"]], drop = FALSE) +# , label = trainDF[["y"]] +# ) +# nrounds <- 1L +# bst <- lgb.train( +# params = list( +# objective = "binary" +# , min_data = 1L +# , min_data_in_bin = 1L +# ) +# , data = dtrain +# , nrounds = nrounds +# ) + +# expect_true(lgb.is.Booster(bst)) +# expect_equal(bst$current_iter(), nrounds) +# parsed_model <- jsonlite::fromJSON(bst$dump_model()) +# expect_equal(parsed_model$objective, "binary sigmoid:1") +# expect_false(parsed_model$average_output) +# expected_error <- 0.6931268 +# expect_true(abs(bst$eval_train()[[1L]][["value"]] - expected_error) < TOLERANCE) +# }) + +# test_that("lgb.train() works with early stopping for classification", { +# trainDF <- data.frame( +# "feat1" = rep(c(5.0, 10.0), 500L) +# , "target" = rep(c(0L, 1L), 500L) +# ) +# validDF <- data.frame( +# "feat1" = rep(c(5.0, 10.0), 50L) +# , "target" = rep(c(0L, 1L), 50L) +# ) +# dtrain <- lgb.Dataset( +# data = as.matrix(trainDF[["feat1"]], drop = FALSE) +# , label = trainDF[["target"]] +# ) +# dvalid <- lgb.Dataset( +# data = as.matrix(validDF[["feat1"]], drop = FALSE) +# , label = validDF[["target"]] +# ) +# nrounds <- 10L + +# ################################ +# # train with no early stopping # +# ################################ +# bst <- lgb.train( +# params = list( +# objective = "binary" +# , metric = "binary_error" +# ) +# , data = dtrain +# , nrounds = nrounds +# , valids = list( +# "valid1" = dvalid +# ) +# ) + +# # a perfect model should be trivial to obtain, but all 10 rounds +# # should happen +# expect_equal(bst$best_score, 0.0) +# expect_equal(bst$best_iter, 1L) +# expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) + +# ############################# +# # train with early stopping # +# ############################# +# early_stopping_rounds <- 5L +# bst <- lgb.train( +# params = list( +# objective = "binary" +# , metric = "binary_error" +# , early_stopping_rounds = early_stopping_rounds +# ) +# , data = dtrain +# , nrounds = nrounds +# , valids = list( +# "valid1" = dvalid +# ) +# ) + +# # a perfect model should be trivial to obtain, and only 6 rounds +# # should have happen (1 with improvement, 5 consecutive with no improvement) +# expect_equal(bst$best_score, 0.0) +# expect_equal(bst$best_iter, 1L) +# expect_equal( +# length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]) +# , early_stopping_rounds + 1L +# ) + +# }) + +# test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stopping", { +# set.seed(708L) +# trainDF <- data.frame( +# "feat1" = rep(c(5.0, 10.0), 500L) +# , "target" = rep(c(0L, 1L), 500L) +# ) +# validDF <- data.frame( +# "feat1" = rep(c(5.0, 10.0), 50L) +# , "target" = rep(c(0L, 1L), 50L) +# ) +# dtrain <- lgb.Dataset( +# data = as.matrix(trainDF[["feat1"]], drop = FALSE) +# , label = trainDF[["target"]] +# ) +# dvalid <- lgb.Dataset( +# data = as.matrix(validDF[["feat1"]], drop = FALSE) +# , label = validDF[["target"]] +# ) +# nrounds <- 5L + +# for (value in c(-5L, 0L)) { + +# #----------------------------# +# # passed as keyword argument # +# #----------------------------# +# bst <- lgb.train( +# params = list( +# objective = "binary" +# , metric = "binary_error" +# ) +# , data = dtrain +# , nrounds = nrounds +# , valids = list( +# "valid1" = dvalid +# ) +# , early_stopping_rounds = value +# ) + +# # a perfect model should be trivial to 
obtain, but all 10 rounds +# # should happen +# expect_equal(bst$best_score, 0.0) +# expect_equal(bst$best_iter, 1L) +# expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) + +# #---------------------------# +# # passed as parameter alias # +# #---------------------------# +# bst <- lgb.train( +# params = list( +# objective = "binary" +# , metric = "binary_error" +# , n_iter_no_change = value +# ) +# , data = dtrain +# , nrounds = nrounds +# , valids = list( +# "valid1" = dvalid +# ) +# ) + +# # a perfect model should be trivial to obtain, but all 10 rounds +# # should happen +# expect_equal(bst$best_score, 0.0) +# expect_equal(bst$best_iter, 1L) +# expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) +# } +# }) + +# test_that("lgb.train() works with early stopping for classification with a metric that should be maximized", { +# set.seed(708L) +# dtrain <- lgb.Dataset( +# data = train$data +# , label = train$label +# ) +# dvalid <- lgb.Dataset( +# data = test$data +# , label = test$label +# ) +# nrounds <- 10L + +# ############################# +# # train with early stopping # +# ############################# +# early_stopping_rounds <- 5L +# # the harsh max_depth guarantees that AUC improves over at least the first few iterations +# bst_auc <- lgb.train( +# params = list( +# objective = "binary" +# , metric = "auc" +# , max_depth = 3L +# , early_stopping_rounds = early_stopping_rounds +# ) +# , data = dtrain +# , nrounds = nrounds +# , valids = list( +# "valid1" = dvalid +# ) +# ) +# bst_binary_error <- lgb.train( +# params = list( +# objective = "binary" +# , metric = "binary_error" +# , max_depth = 3L +# , early_stopping_rounds = early_stopping_rounds +# ) +# , data = dtrain +# , nrounds = nrounds +# , valids = list( +# "valid1" = dvalid +# ) +# ) + +# # early stopping should have been hit for binary_error (higher_better = FALSE) +# eval_info <- bst_binary_error$.__enclos_env__$private$get_eval_info() +# expect_identical(eval_info, "binary_error") +# expect_identical( +# unname(bst_binary_error$.__enclos_env__$private$higher_better_inner_eval) +# , FALSE +# ) +# expect_identical(bst_binary_error$best_iter, 1L) +# expect_identical(bst_binary_error$current_iter(), early_stopping_rounds + 1L) +# expect_true(abs(bst_binary_error$best_score - 0.01613904) < TOLERANCE) + +# # early stopping should not have been hit for AUC (higher_better = TRUE) +# eval_info <- bst_auc$.__enclos_env__$private$get_eval_info() +# expect_identical(eval_info, "auc") +# expect_identical( +# unname(bst_auc$.__enclos_env__$private$higher_better_inner_eval) +# , TRUE +# ) +# expect_identical(bst_auc$best_iter, 9L) +# expect_identical(bst_auc$current_iter(), nrounds) +# expect_true(abs(bst_auc$best_score - 0.9999969) < TOLERANCE) +# }) + +# test_that("lgb.train() works with early stopping for regression", { +# set.seed(708L) +# trainDF <- data.frame( +# "feat1" = rep(c(10.0, 100.0), 500L) +# , "target" = rep(c(-50.0, 50.0), 500L) +# ) +# validDF <- data.frame( +# "feat1" = rep(50.0, 4L) +# , "target" = rep(50.0, 4L) +# ) +# dtrain <- lgb.Dataset( +# data = as.matrix(trainDF[["feat1"]], drop = FALSE) +# , label = trainDF[["target"]] +# ) +# dvalid <- lgb.Dataset( +# data = as.matrix(validDF[["feat1"]], drop = FALSE) +# , label = validDF[["target"]] +# ) +# nrounds <- 10L + +# ################################ +# # train with no early stopping # +# ################################ +# bst <- lgb.train( +# params = list( +# objective = "regression" +# , 
metric = "rmse" +# ) +# , data = dtrain +# , nrounds = nrounds +# , valids = list( +# "valid1" = dvalid +# ) +# ) + +# # the best possible model should come from the first iteration, but +# # all 10 training iterations should happen +# expect_equal(bst$best_score, 55.0) +# expect_equal(bst$best_iter, 1L) +# expect_equal(length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]), nrounds) + +# ############################# +# # train with early stopping # +# ############################# +# early_stopping_rounds <- 5L +# bst <- lgb.train( +# params = list( +# objective = "regression" +# , metric = "rmse" +# , early_stopping_rounds = early_stopping_rounds +# ) +# , data = dtrain +# , nrounds = nrounds +# , valids = list( +# "valid1" = dvalid +# ) +# ) + +# # the best model should be from the first iteration, and only 6 rounds +# # should have happen (1 with improvement, 5 consecutive with no improvement) +# expect_equal(bst$best_score, 55.0) +# expect_equal(bst$best_iter, 1L) +# expect_equal( +# length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) +# , early_stopping_rounds + 1L +# ) +# }) + +# test_that("lgb.train() does not stop early if early_stopping_rounds is not given", { +# set.seed(708L) + +# increasing_metric_starting_value <- get( +# ACCUMULATOR_NAME +# , envir = .GlobalEnv +# ) +# nrounds <- 10L +# metrics <- list( +# .constant_metric +# , .increasing_metric +# ) +# bst <- lgb.train( +# params = list( +# objective = "regression" +# , metric = "None" +# ) +# , data = DTRAIN_RANDOM_REGRESSION +# , nrounds = nrounds +# , valids = list("valid1" = DVALID_RANDOM_REGRESSION) +# , eval = metrics +# ) + +# # Only the two functions provided to "eval" should have been evaluated +# expect_equal(length(bst$record_evals[["valid1"]]), 2L) + +# # all 10 iterations should have happen, and the best_iter should be +# # the first one (based on constant_metric) +# best_iter <- 1L +# expect_equal(bst$best_iter, best_iter) + +# # best_score should be taken from the first metric +# expect_equal( +# bst$best_score +# , bst$record_evals[["valid1"]][["constant_metric"]][["eval"]][[best_iter]] +# ) + +# # early stopping should not have happened. 
Even though constant_metric +# # had 9 consecutive iterations with no improvement, it is ignored because of +# # first_metric_only = TRUE +# expect_equal( +# length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) +# , nrounds +# ) +# expect_equal( +# length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) +# , nrounds +# ) +# }) + +# test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to stop early based on all metrics", { +# set.seed(708L) + +# early_stopping_rounds <- 3L +# param_variations <- list( +# list( +# objective = "regression" +# , metric = "None" +# , early_stopping_rounds = early_stopping_rounds +# ) +# , list( +# objective = "regression" +# , metric = "None" +# , early_stopping_rounds = early_stopping_rounds +# , first_metric_only = FALSE +# ) +# ) + +# for (params in param_variations) { + +# nrounds <- 10L +# bst <- lgb.train( +# params = params +# , data = DTRAIN_RANDOM_REGRESSION +# , nrounds = nrounds +# , valids = list( +# "valid1" = DVALID_RANDOM_REGRESSION +# ) +# , eval = list( +# .increasing_metric +# , .constant_metric +# ) +# ) + +# # Only the two functions provided to "eval" should have been evaluated +# expect_equal(length(bst$record_evals[["valid1"]]), 2L) + +# # early stopping should have happened, and should have stopped early_stopping_rounds + 1 rounds in +# # because constant_metric never improves +# # +# # the best iteration should be the last one, because increasing_metric was first +# # and gets better every iteration +# best_iter <- early_stopping_rounds + 1L +# expect_equal(bst$best_iter, best_iter) + +# # best_score should be taken from "increasing_metric" because it was first +# expect_equal( +# bst$best_score +# , bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]][[best_iter]] +# ) + +# # early stopping should not have happened. even though increasing_metric kept +# # getting better, early stopping should have happened because "constant_metric" +# # did not improve +# expect_equal( +# length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) +# , early_stopping_rounds + 1L +# ) +# expect_equal( +# length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) +# , early_stopping_rounds + 1L +# ) +# } + +# }) + +# test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based on only the first metric", { +# set.seed(708L) +# nrounds <- 10L +# early_stopping_rounds <- 3L +# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# bst <- lgb.train( +# params = list( +# objective = "regression" +# , metric = "None" +# , early_stopping_rounds = early_stopping_rounds +# , first_metric_only = TRUE +# ) +# , data = DTRAIN_RANDOM_REGRESSION +# , nrounds = nrounds +# , valids = list( +# "valid1" = DVALID_RANDOM_REGRESSION +# ) +# , eval = list( +# .increasing_metric +# , .constant_metric +# ) +# ) + +# # Only the two functions provided to "eval" should have been evaluated +# expect_equal(length(bst$record_evals[["valid1"]]), 2L) + +# # all 10 iterations should happen, and the best_iter should be the final one +# expect_equal(bst$best_iter, nrounds) + +# # best_score should be taken from "increasing_metric" +# expect_equal( +# bst$best_score +# , increasing_metric_starting_value + 0.1 * nrounds +# ) + +# # early stopping should not have happened. 
Even though constant_metric +# # had 9 consecutive iterations with no improvement, it is ignored because of +# # first_metric_only = TRUE +# expect_equal( +# length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) +# , nrounds +# ) +# expect_equal( +# length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) +# , nrounds +# ) +# }) + +# test_that("lgb.train() works when a mixture of functions and strings are passed to eval", { +# set.seed(708L) +# nrounds <- 10L +# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# bst <- lgb.train( +# params = list( +# objective = "regression" +# , metric = "None" +# ) +# , data = DTRAIN_RANDOM_REGRESSION +# , nrounds = nrounds +# , valids = list( +# "valid1" = DVALID_RANDOM_REGRESSION +# ) +# , eval = list( +# .increasing_metric +# , "rmse" +# , .constant_metric +# , "l2" +# ) +# ) + +# # all 4 metrics should have been used +# expect_named( +# bst$record_evals[["valid1"]] +# , expected = c("rmse", "l2", "increasing_metric", "constant_metric") +# , ignore.order = TRUE +# , ignore.case = FALSE +# ) + +# # the difference metrics shouldn't have been mixed up with each other +# results <- bst$record_evals[["valid1"]] +# expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE) +# expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE) +# expected_increasing_metric <- increasing_metric_starting_value + 0.1 +# expect_true( +# abs( +# results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric +# ) < TOLERANCE +# ) +# expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + +# }) + +# test_that("lgb.train() works when a list of strings or a character vector is passed to eval", { + +# # testing list and character vector, as well as length-1 and length-2 +# eval_variations <- list( +# c("binary_error", "binary_logloss") +# , "binary_logloss" +# , list("binary_error", "binary_logloss") +# , list("binary_logloss") +# ) + +# for (eval_variation in eval_variations) { + +# set.seed(708L) +# nrounds <- 10L +# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# bst <- lgb.train( +# params = list( +# objective = "binary" +# , metric = "None" +# ) +# , data = DTRAIN_RANDOM_CLASSIFICATION +# , nrounds = nrounds +# , valids = list( +# "valid1" = DVALID_RANDOM_CLASSIFICATION +# ) +# , eval = eval_variation +# ) + +# # both metrics should have been used +# expect_named( +# bst$record_evals[["valid1"]] +# , expected = unlist(eval_variation) +# , ignore.order = TRUE +# , ignore.case = FALSE +# ) + +# # the difference metrics shouldn't have been mixed up with each other +# results <- bst$record_evals[["valid1"]] +# if ("binary_error" %in% unlist(eval_variation)) { +# expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) +# } +# if ("binary_logloss" %in% unlist(eval_variation)) { +# expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) +# } +# } +# }) + +# test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { +# set.seed(708L) +# nrounds <- 10L +# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# bst <- lgb.train( +# params = list( +# objective = "binary" +# , metric = "binary_error" +# ) +# , data = DTRAIN_RANDOM_CLASSIFICATION +# , nrounds = nrounds +# , valids = list( +# "valid1" = DVALID_RANDOM_CLASSIFICATION +# ) +# , eval = "binary_logloss" +# ) + +# # both metrics should have 
been used +# expect_named( +# bst$record_evals[["valid1"]] +# , expected = c("binary_error", "binary_logloss") +# , ignore.order = TRUE +# , ignore.case = FALSE +# ) + +# # the difference metrics shouldn't have been mixed up with each other +# results <- bst$record_evals[["valid1"]] +# expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) +# expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) +# }) + +# test_that("lgb.train() works when you give a function for eval", { +# set.seed(708L) +# nrounds <- 10L +# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# bst <- lgb.train( +# params = list( +# objective = "binary" +# , metric = "None" +# ) +# , data = DTRAIN_RANDOM_CLASSIFICATION +# , nrounds = nrounds +# , valids = list( +# "valid1" = DVALID_RANDOM_CLASSIFICATION +# ) +# , eval = .constant_metric +# ) + +# # the difference metrics shouldn't have been mixed up with each other +# results <- bst$record_evals[["valid1"]] +# expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) +# }) + +# test_that("lgb.train() works with early stopping for regression with a metric that should be minimized", { +# set.seed(708L) +# trainDF <- data.frame( +# "feat1" = rep(c(10.0, 100.0), 500L) +# , "target" = rep(c(-50.0, 50.0), 500L) +# ) +# validDF <- data.frame( +# "feat1" = rep(50.0, 4L) +# , "target" = rep(50.0, 4L) +# ) +# dtrain <- lgb.Dataset( +# data = as.matrix(trainDF[["feat1"]], drop = FALSE) +# , label = trainDF[["target"]] +# ) +# dvalid <- lgb.Dataset( +# data = as.matrix(validDF[["feat1"]], drop = FALSE) +# , label = validDF[["target"]] +# ) +# nrounds <- 10L + +# ############################# +# # train with early stopping # +# ############################# +# early_stopping_rounds <- 5L +# bst <- lgb.train( +# params = list( +# objective = "regression" +# , metric = c( +# "mape" +# , "rmse" +# , "mae" +# ) +# , min_data_in_bin = 5L +# , early_stopping_rounds = early_stopping_rounds +# ) +# , data = dtrain +# , nrounds = nrounds +# , valids = list( +# "valid1" = dvalid +# ) +# ) + +# # the best model should be from the first iteration, and only 6 rounds +# # should have happened (1 with improvement, 5 consecutive with no improvement) +# expect_equal(bst$best_score, 1.1) +# expect_equal(bst$best_iter, 1L) +# expect_equal( +# length(bst$record_evals[["valid1"]][["mape"]][["eval"]]) +# , early_stopping_rounds + 1L +# ) + +# # Booster should understand thatt all three of these metrics should be minimized +# eval_info <- bst$.__enclos_env__$private$get_eval_info() +# expect_identical(eval_info, c("mape", "rmse", "l1")) +# expect_identical( +# unname(bst$.__enclos_env__$private$higher_better_inner_eval) +# , rep(FALSE, 3L) +# ) +# }) + + +# test_that("lgb.train() supports non-ASCII feature names", { +# testthat::skip("UTF-8 feature names are not fully supported in the R package") +# dtrain <- lgb.Dataset( +# data = matrix(rnorm(400L), ncol = 4L) +# , label = rnorm(100L) +# ) +# feature_names <- c("F_零", "F_一", "F_二", "F_三") +# bst <- lgb.train( +# data = dtrain +# , nrounds = 5L +# , obj = "regression" +# , params = list( +# metric = "rmse" +# ) +# , colnames = feature_names +# ) +# expect_true(lgb.is.Booster(bst)) +# dumped_model <- jsonlite::fromJSON(bst$dump_model()) +# expect_identical( +# dumped_model[["feature_names"]] +# , feature_names +# ) +# }) + +# test_that("when early stopping is not activated, best_iter and best_score come from valids and not 
training data", { +# set.seed(708L) +# trainDF <- data.frame( +# "feat1" = rep(c(10.0, 100.0), 500L) +# , "target" = rep(c(-50.0, 50.0), 500L) +# ) +# validDF <- data.frame( +# "feat1" = rep(50.0, 4L) +# , "target" = rep(50.0, 4L) +# ) +# dtrain <- lgb.Dataset( +# data = as.matrix(trainDF[["feat1"]], drop = FALSE) +# , label = trainDF[["target"]] +# ) +# dvalid1 <- lgb.Dataset( +# data = as.matrix(validDF[["feat1"]], drop = FALSE) +# , label = validDF[["target"]] +# ) +# dvalid2 <- lgb.Dataset( +# data = as.matrix(validDF[1L:10L, "feat1"], drop = FALSE) +# , label = validDF[1L:10L, "target"] +# ) +# nrounds <- 10L +# train_params <- list( +# objective = "regression" +# , metric = "rmse" +# , learning_rate = 1.5 +# ) + +# # example 1: two valids, neither are the training data +# bst <- lgb.train( +# data = dtrain +# , nrounds = nrounds +# , num_leaves = 5L +# , valids = list( +# "valid1" = dvalid1 +# , "valid2" = dvalid2 +# ) +# , params = train_params +# ) +# expect_named( +# bst$record_evals +# , c("start_iter", "valid1", "valid2") +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) +# expect_length(rmse_scores, nrounds) +# expect_identical(bst$best_iter, which.min(rmse_scores)) +# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + +# # example 2: train first (called "train") and two valids +# bst <- lgb.train( +# data = dtrain +# , nrounds = nrounds +# , num_leaves = 5L +# , valids = list( +# "train" = dtrain +# , "valid1" = dvalid1 +# , "valid2" = dvalid2 +# ) +# , params = train_params +# ) +# expect_named( +# bst$record_evals +# , c("start_iter", "train", "valid1", "valid2") +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) +# expect_length(rmse_scores, nrounds) +# expect_identical(bst$best_iter, which.min(rmse_scores)) +# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + +# # example 3: train second (called "train") and two valids +# bst <- lgb.train( +# data = dtrain +# , nrounds = nrounds +# , num_leaves = 5L +# , valids = list( +# "valid1" = dvalid1 +# , "train" = dtrain +# , "valid2" = dvalid2 +# ) +# , params = train_params +# ) +# # note that "train" still ends up as the first one +# expect_named( +# bst$record_evals +# , c("start_iter", "train", "valid1", "valid2") +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) +# expect_length(rmse_scores, nrounds) +# expect_identical(bst$best_iter, which.min(rmse_scores)) +# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + +# # example 4: train third (called "train") and two valids +# bst <- lgb.train( +# data = dtrain +# , nrounds = nrounds +# , num_leaves = 5L +# , valids = list( +# "valid1" = dvalid1 +# , "valid2" = dvalid2 +# , "train" = dtrain +# ) +# , params = train_params +# ) +# # note that "train" still ends up as the first one +# expect_named( +# bst$record_evals +# , c("start_iter", "train", "valid1", "valid2") +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) +# expect_length(rmse_scores, nrounds) +# expect_identical(bst$best_iter, which.min(rmse_scores)) +# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + +# # example 5: train second (called "something-random-we-would-not-hardcode") and two valids +# bst <- lgb.train( +# 
data = dtrain +# , nrounds = nrounds +# , num_leaves = 5L +# , valids = list( +# "valid1" = dvalid1 +# , "something-random-we-would-not-hardcode" = dtrain +# , "valid2" = dvalid2 +# ) +# , params = train_params +# ) +# # note that "something-random-we-would-not-hardcode" was recognized as the training +# # data even though it isn't named "train" +# expect_named( +# bst$record_evals +# , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) +# expect_length(rmse_scores, nrounds) +# expect_identical(bst$best_iter, which.min(rmse_scores)) +# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + +# # example 6: the only valid supplied is the training data +# bst <- lgb.train( +# data = dtrain +# , nrounds = nrounds +# , num_leaves = 5L +# , valids = list( +# "train" = dtrain +# ) +# , params = train_params +# ) +# expect_identical(bst$best_iter, -1L) +# expect_identical(bst$best_score, NA_real_) +# }) + +# test_that("lightgbm.train() gives the correct best_score and best_iter for a metric where higher values are better", { +# set.seed(708L) +# trainDF <- data.frame( +# "feat1" = runif(n = 500L, min = 0.0, max = 15.0) +# , "target" = rep(c(0L, 1L), 500L) +# ) +# validDF <- data.frame( +# "feat1" = runif(n = 50L, min = 0.0, max = 15.0) +# , "target" = rep(c(0L, 1L), 50L) +# ) +# dtrain <- lgb.Dataset( +# data = as.matrix(trainDF[["feat1"]], drop = FALSE) +# , label = trainDF[["target"]] +# ) +# dvalid1 <- lgb.Dataset( +# data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) +# , label = validDF[1L:25L, "target"] +# ) +# nrounds <- 10L +# bst <- lgb.train( +# data = dtrain +# , nrounds = nrounds +# , num_leaves = 5L +# , valids = list( +# "valid1" = dvalid1 +# , "something-random-we-would-not-hardcode" = dtrain +# ) +# , params = list( +# objective = "binary" +# , metric = "auc" +# , learning_rate = 1.5 +# ) +# ) +# # note that "something-random-we-would-not-hardcode" was recognized as the training +# # data even though it isn't named "train" +# expect_named( +# bst$record_evals +# , c("start_iter", "something-random-we-would-not-hardcode", "valid1") +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) +# expect_length(auc_scores, nrounds) +# expect_identical(bst$best_iter, which.max(auc_scores)) +# expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) +# }) + +# test_that("using lightgbm() without early stopping, best_iter and best_score come from valids and not training data", { +# set.seed(708L) +# # example: train second (called "something-random-we-would-not-hardcode"), two valids, +# # and a metric where higher values are better ("auc") +# trainDF <- data.frame( +# "feat1" = runif(n = 500L, min = 0.0, max = 15.0) +# , "target" = rep(c(0L, 1L), 500L) +# ) +# validDF <- data.frame( +# "feat1" = runif(n = 50L, min = 0.0, max = 15.0) +# , "target" = rep(c(0L, 1L), 50L) +# ) +# dtrain <- lgb.Dataset( +# data = as.matrix(trainDF[["feat1"]], drop = FALSE) +# , label = trainDF[["target"]] +# ) +# dvalid1 <- lgb.Dataset( +# data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) +# , label = validDF[1L:25L, "target"] +# ) +# dvalid2 <- lgb.Dataset( +# data = as.matrix(validDF[26L:50L, "feat1"], drop = FALSE) +# , label = validDF[26L:50L, "target"] +# ) +# nrounds <- 10L +# bst <- lightgbm( +# data = dtrain +# , nrounds = nrounds +# , 
num_leaves = 5L +# , valids = list( +# "valid1" = dvalid1 +# , "something-random-we-would-not-hardcode" = dtrain +# , "valid2" = dvalid2 +# ) +# , params = list( +# objective = "binary" +# , metric = "auc" +# , learning_rate = 1.5 +# ) +# , verbose = -7L +# , save_name = tempfile(fileext = ".model") +# ) +# # when verbose <= 0 is passed to lightgbm(), 'valids' is passed through to lgb.train() +# # untouched. If you set verbose to > 0, the training data will still be first but called "train" +# expect_named( +# bst$record_evals +# , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") +# , ignore.order = FALSE +# , ignore.case = FALSE +# ) +# auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) +# expect_length(auc_scores, nrounds) +# expect_identical(bst$best_iter, which.max(auc_scores)) +# expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) +# }) + +# test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings", { +# set.seed(708L) +# nrounds <- 10L +# nfolds <- 4L +# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# bst <- lgb.cv( +# params = list( +# objective = "binary" +# , metric = "binary_error" +# ) +# , data = DTRAIN_RANDOM_CLASSIFICATION +# , nrounds = nrounds +# , nfold = nfolds +# , eval = "binary_logloss" +# ) + +# # both metrics should have been used +# expect_named( +# bst$record_evals[["valid"]] +# , expected = c("binary_error", "binary_logloss") +# , ignore.order = TRUE +# , ignore.case = FALSE +# ) + +# # the difference metrics shouldn't have been mixed up with each other +# results <- bst$record_evals[["valid"]] +# expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE) +# expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE) + +# # all boosters should have been created +# expect_length(bst$boosters, nfolds) +# }) + +# test_that("lgb.cv() works when you give a function for eval", { +# set.seed(708L) +# nrounds <- 10L +# nfolds <- 3L +# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# bst <- lgb.cv( +# params = list( +# objective = "binary" +# , metric = "None" +# ) +# , data = DTRAIN_RANDOM_CLASSIFICATION +# , nfold = nfolds +# , nrounds = nrounds +# , eval = .constant_metric +# ) + +# # the difference metrics shouldn't have been mixed up with each other +# results <- bst$record_evals[["valid"]] +# expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) +# expect_named(results, "constant_metric") +# }) + +# test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on only the first metric", { +# set.seed(708L) +# nrounds <- 10L +# nfolds <- 5L +# early_stopping_rounds <- 3L +# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# bst <- lgb.cv( +# params = list( +# objective = "regression" +# , metric = "None" +# , early_stopping_rounds = early_stopping_rounds +# , first_metric_only = TRUE +# ) +# , data = DTRAIN_RANDOM_REGRESSION +# , nfold = nfolds +# , nrounds = nrounds +# , valids = list( +# "valid1" = DVALID_RANDOM_REGRESSION +# ) +# , eval = list( +# .increasing_metric +# , .constant_metric +# ) +# ) + +# # Only the two functions provided to "eval" should have been evaluated +# expect_named(bst$record_evals[["valid"]], c("increasing_metric", "constant_metric")) + +# # all 10 iterations should happen, and the best_iter should be the final one +# expect_equal(bst$best_iter, 
nrounds) + +# # best_score should be taken from "increasing_metric" +# # +# # this expected value looks magical and confusing, but it's because +# # evaluation metrics are averaged over all folds. +# # +# # consider 5-fold CV with a metric that adds 0.1 to a global accumulator +# # each time it's called +# # +# # * iter 1: [0.1, 0.2, 0.3, 0.4, 0.5] (mean = 0.3) +# # * iter 2: [0.6, 0.7, 0.8, 0.9, 1.0] (mean = 1.3) +# # * iter 3: [1.1, 1.2, 1.3, 1.4, 1.5] (mean = 1.8) +# # +# cv_value <- increasing_metric_starting_value + mean(seq_len(nfolds) / 10.0) + (nrounds - 1L) * 0.1 * nfolds +# expect_equal(bst$best_score, cv_value) + +# # early stopping should not have happened. Even though constant_metric +# # had 9 consecutive iterations with no improvement, it is ignored because of +# # first_metric_only = TRUE +# expect_equal( +# length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]]) +# , nrounds +# ) +# expect_equal( +# length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) +# , nrounds +# ) +# }) + +# test_that("early stopping works with lgb.cv()", { +# set.seed(708L) +# nrounds <- 10L +# nfolds <- 5L +# early_stopping_rounds <- 3L +# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) +# bst <- lgb.cv( +# params = list( +# objective = "regression" +# , metric = "None" +# , early_stopping_rounds = early_stopping_rounds +# , first_metric_only = TRUE +# ) +# , data = DTRAIN_RANDOM_REGRESSION +# , nfold = nfolds +# , nrounds = nrounds +# , valids = list( +# "valid1" = DVALID_RANDOM_REGRESSION +# ) +# , eval = list( +# .constant_metric +# , .increasing_metric +# ) +# ) + +# # only the two functions provided to "eval" should have been evaluated +# expect_named(bst$record_evals[["valid"]], c("constant_metric", "increasing_metric")) + +# # best_iter should be based on the first metric. 
Since constant_metric +# # never changes, its first iteration was the best oone +# expect_equal(bst$best_iter, 1L) + +# # best_score should be taken from the first metri +# expect_equal(bst$best_score, 0.2) + +# # early stopping should have happened, since constant_metric was the first +# # one passed to eval and it will not improve over consecutive iterations +# # +# # note that this test is identical to the previous one, but with the +# # order of the eval metrics switched +# expect_equal( +# length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]]) +# , early_stopping_rounds + 1L +# ) +# expect_equal( +# length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) +# , early_stopping_rounds + 1L +# ) +# }) + +# context("interaction constraints") + +# test_that("lgb.train() throws an informative error if interaction_constraints is not a list", { +# dtrain <- lgb.Dataset(train$data, label = train$label) +# params <- list(objective = "regression", interaction_constraints = "[1,2],[3]") +# # expect_error({ +# # bst <- lightgbm( +# # data = dtrain +# # , params = params +# # , nrounds = 2L +# # ) +# # }, "interaction_constraints must be a list") +# }) + +# test_that(paste0("lgb.train() throws an informative error if the members of interaction_constraints ", +# "are not character or numeric vectors"), { +# dtrain <- lgb.Dataset(train$data, label = train$label) +# params <- list(objective = "regression", interaction_constraints = list(list(1L, 2L), list(3L))) +# # expect_error({ +# # bst <- lightgbm( +# # data = dtrain +# # , params = params +# # , nrounds = 2L +# # ) +# # }, "every element in interaction_constraints must be a character vector or numeric vector") +# }) + +# test_that("lgb.train() throws an informative error if interaction_constraints contains a too large index", { +# dtrain <- lgb.Dataset(train$data, label = train$label) +# params <- list(objective = "regression", +# interaction_constraints = list(c(1L, length(colnames(train$data)) + 1L), 3L)) +# # expect_error({ +# # bst <- lightgbm( +# # data = dtrain +# # , params = params +# # , nrounds = 2L +# # ) +# # }, "supplied a too large value in interaction_constraints") +# }) + +# test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ", +# "character vectors, numeric vectors, or a combination"), { +# set.seed(1L) +# dtrain <- lgb.Dataset(train$data, label = train$label) + +# params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) +# bst <- lightgbm( +# data = dtrain +# , params = params +# , nrounds = 2L +# ) +# pred1 <- bst$predict(test$data) + +# cnames <- colnames(train$data) +# params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]])) +# bst <- lightgbm( +# data = dtrain +# , params = params +# , nrounds = 2L +# ) +# pred2 <- bst$predict(test$data) + +# params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L)) +# bst <- lightgbm( +# data = dtrain +# , params = params +# , nrounds = 2L +# ) +# pred3 <- bst$predict(test$data) + +# expect_equal(pred1, pred2) +# expect_equal(pred2, pred3) + +# }) + +# test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), { +# set.seed(1L) +# dtrain <- lgb.Dataset(train$data, label = train$label) + +# params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) +# bst <- lightgbm( +# data = dtrain +# , params = 
params +# , nrounds = 2L +# ) +# pred1 <- bst$predict(test$data) + +# new_colnames <- paste0(colnames(train$data), "_x") +# params <- list(objective = "regression" +# , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])) +# bst <- lightgbm( +# data = dtrain +# , params = params +# , nrounds = 2L +# , colnames = new_colnames +# ) +# pred2 <- bst$predict(test$data) + +# expect_equal(pred1, pred2) + +# }) diff --git a/R-package/tests/testthat/test_learning_to_rank.R b/R-package/tests/testthat/test_learning_to_rank.R index 3cab22304e6b..e6ab2a987d25 100644 --- a/R-package/tests/testthat/test_learning_to_rank.R +++ b/R-package/tests/testthat/test_learning_to_rank.R @@ -1,141 +1,141 @@ -context("Learning to rank") - -# numerical tolerance to use when checking metric values -TOLERANCE <- 1e-06 - -test_that("learning-to-rank with lgb.train() works as expected", { - set.seed(708L) - data(agaricus.train, package = "lightgbm") - # just keep a few features,to generate an model with imperfect fit - train <- agaricus.train - train_data <- train$data[1L:6000L, 1L:20L] - dtrain <- lgb.Dataset( - train_data - , label = train$label[1L:6000L] - , group = rep(150L, 40L) - ) - ndcg_at <- "1,2,3" - eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) - params <- list( - objective = "lambdarank" - , metric = "ndcg" - , ndcg_at = ndcg_at - , lambdarank_truncation_level = 3L - , learning_rate = 0.001 - ) - model <- lgb.train( - params = params - , data = dtrain - , nrounds = 10L - ) - expect_true(lgb.is.Booster(model)) - - dumped_model <- jsonlite::fromJSON( - model$dump_model() - ) - expect_equal(dumped_model[["objective"]], "lambdarank") - expect_equal(dumped_model[["max_feature_idx"]], ncol(train_data) - 1L) - - # check that evaluation results make sense (0.0 < nDCG < 1.0) - eval_results <- model$eval_train() - expect_equal(length(eval_results), length(eval_names)) - for (result in eval_results) { - expect_true(result[["value"]] > 0.0 && result[["value"]] < 1.0) - expect_true(result[["higher_better"]]) - expect_identical(result[["data_name"]], "training") - } - expect_identical(sapply(eval_results, function(x) {x$name}), eval_names) - expect_equal(eval_results[[1L]][["value"]], 0.825) - expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE) - expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE) -}) - -test_that("learning-to-rank with lgb.cv() works as expected", { - set.seed(708L) - data(agaricus.train, package = "lightgbm") - # just keep a few features,to generate an model with imperfect fit - train <- agaricus.train - train_data <- train$data[1L:6000L, 1L:20L] - dtrain <- lgb.Dataset( - train_data - , label = train$label[1L:6000L] - , group = rep(150L, 40L) - ) - ndcg_at <- "1,2,3" - eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) - params <- list( - objective = "lambdarank" - , metric = "ndcg" - , ndcg_at = ndcg_at - , lambdarank_truncation_level = 3L - , label_gain = "0,1,3" - ) - nfold <- 4L - nrounds <- 10L - cv_bst <- lgb.cv( - params = params - , data = dtrain - , nrounds = nrounds - , nfold = nfold - , min_data = 1L - , learning_rate = 0.01 - ) - expect_is(cv_bst, "lgb.CVBooster") - expect_equal(length(cv_bst$boosters), nfold) - - # "valid" should contain results for each metric - eval_results <- cv_bst$record_evals[["valid"]] - eval_names <- c("ndcg@1", "ndcg@2", "ndcg@3") - expect_identical(names(eval_results), eval_names) - - # check that best score and iter make sense (0.0 < nDCG < 1.0) - best_iter <- 
cv_bst$best_iter - best_score <- cv_bst$best_score - expect_true(best_iter > 0L && best_iter <= nrounds) - expect_true(best_score > 0.0 && best_score < 1.0) - expect_true(abs(best_score - 0.775) < TOLERANCE) - - # best_score should be set for the first metric - first_metric <- eval_names[[1L]] - expect_equal(best_score, eval_results[[first_metric]][["eval"]][[best_iter]]) - - for (eval_name in eval_names) { - results_for_this_metric <- eval_results[[eval_name]] - - # each set of metrics should have eval and eval_err - expect_identical(names(results_for_this_metric), c("eval", "eval_err")) - - # there should be one "eval" and "eval_err" per round - expect_equal(length(results_for_this_metric[["eval"]]), nrounds) - expect_equal(length(results_for_this_metric[["eval_err"]]), nrounds) - - # check that evaluation results make sense (0.0 < nDCG < 1.0) - all_evals <- unlist(results_for_this_metric[["eval"]]) - expect_true(all(all_evals > 0.0 & all_evals < 1.0)) - } - - # first and last value of each metric should be as expected - ndcg1_values <- c(0.725, 0.75, 0.75, 0.775, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75) - expect_true(all(abs(unlist(eval_results[["ndcg@1"]][["eval"]]) - ndcg1_values) < TOLERANCE)) - - ndcg2_values <- c( - 0.6863147, 0.720986, 0.7306574, 0.745986, 0.7306574, - 0.720986, 0.7403287, 0.7403287, 0.7403287, 0.7306574 - ) - expect_true(all(abs(unlist(eval_results[["ndcg@2"]][["eval"]]) - ndcg2_values) < TOLERANCE)) - - ndcg3_values <- c( - 0.6777939, 0.6984639, 0.711732, 0.7234639, 0.711732, - 0.7101959, 0.719134, 0.719134, 0.725, 0.711732 - ) - expect_true(all(abs(unlist(eval_results[["ndcg@3"]][["eval"]]) - ndcg3_values) < TOLERANCE)) - - # check details of each booster - for (bst in cv_bst$boosters) { - dumped_model <- jsonlite::fromJSON( - bst$booster$dump_model() - ) - expect_equal(dumped_model[["objective"]], "lambdarank") - expect_equal(dumped_model[["max_feature_idx"]], ncol(train_data) - 1L) - } -}) +# context("Learning to rank") + +# # numerical tolerance to use when checking metric values +# TOLERANCE <- 1e-06 + +# test_that("learning-to-rank with lgb.train() works as expected", { +# set.seed(708L) +# data(agaricus.train, package = "lightgbm") +# # just keep a few features,to generate an model with imperfect fit +# train <- agaricus.train +# train_data <- train$data[1L:6000L, 1L:20L] +# dtrain <- lgb.Dataset( +# train_data +# , label = train$label[1L:6000L] +# , group = rep(150L, 40L) +# ) +# ndcg_at <- "1,2,3" +# eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) +# params <- list( +# objective = "lambdarank" +# , metric = "ndcg" +# , ndcg_at = ndcg_at +# , lambdarank_truncation_level = 3L +# , learning_rate = 0.001 +# ) +# model <- lgb.train( +# params = params +# , data = dtrain +# , nrounds = 10L +# ) +# expect_true(lgb.is.Booster(model)) + +# dumped_model <- jsonlite::fromJSON( +# model$dump_model() +# ) +# expect_equal(dumped_model[["objective"]], "lambdarank") +# expect_equal(dumped_model[["max_feature_idx"]], ncol(train_data) - 1L) + +# # check that evaluation results make sense (0.0 < nDCG < 1.0) +# eval_results <- model$eval_train() +# expect_equal(length(eval_results), length(eval_names)) +# for (result in eval_results) { +# expect_true(result[["value"]] > 0.0 && result[["value"]] < 1.0) +# expect_true(result[["higher_better"]]) +# expect_identical(result[["data_name"]], "training") +# } +# expect_identical(sapply(eval_results, function(x) {x$name}), eval_names) +# expect_equal(eval_results[[1L]][["value"]], 0.825) +# 
expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE) +# expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE) +# }) + +# test_that("learning-to-rank with lgb.cv() works as expected", { +# set.seed(708L) +# data(agaricus.train, package = "lightgbm") +# # just keep a few features,to generate an model with imperfect fit +# train <- agaricus.train +# train_data <- train$data[1L:6000L, 1L:20L] +# dtrain <- lgb.Dataset( +# train_data +# , label = train$label[1L:6000L] +# , group = rep(150L, 40L) +# ) +# ndcg_at <- "1,2,3" +# eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) +# params <- list( +# objective = "lambdarank" +# , metric = "ndcg" +# , ndcg_at = ndcg_at +# , lambdarank_truncation_level = 3L +# , label_gain = "0,1,3" +# ) +# nfold <- 4L +# nrounds <- 10L +# cv_bst <- lgb.cv( +# params = params +# , data = dtrain +# , nrounds = nrounds +# , nfold = nfold +# , min_data = 1L +# , learning_rate = 0.01 +# ) +# expect_is(cv_bst, "lgb.CVBooster") +# expect_equal(length(cv_bst$boosters), nfold) + +# # "valid" should contain results for each metric +# eval_results <- cv_bst$record_evals[["valid"]] +# eval_names <- c("ndcg@1", "ndcg@2", "ndcg@3") +# expect_identical(names(eval_results), eval_names) + +# # check that best score and iter make sense (0.0 < nDCG < 1.0) +# best_iter <- cv_bst$best_iter +# best_score <- cv_bst$best_score +# expect_true(best_iter > 0L && best_iter <= nrounds) +# expect_true(best_score > 0.0 && best_score < 1.0) +# expect_true(abs(best_score - 0.775) < TOLERANCE) + +# # best_score should be set for the first metric +# first_metric <- eval_names[[1L]] +# expect_equal(best_score, eval_results[[first_metric]][["eval"]][[best_iter]]) + +# for (eval_name in eval_names) { +# results_for_this_metric <- eval_results[[eval_name]] + +# # each set of metrics should have eval and eval_err +# expect_identical(names(results_for_this_metric), c("eval", "eval_err")) + +# # there should be one "eval" and "eval_err" per round +# expect_equal(length(results_for_this_metric[["eval"]]), nrounds) +# expect_equal(length(results_for_this_metric[["eval_err"]]), nrounds) + +# # check that evaluation results make sense (0.0 < nDCG < 1.0) +# all_evals <- unlist(results_for_this_metric[["eval"]]) +# expect_true(all(all_evals > 0.0 & all_evals < 1.0)) +# } + +# # first and last value of each metric should be as expected +# ndcg1_values <- c(0.725, 0.75, 0.75, 0.775, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75) +# expect_true(all(abs(unlist(eval_results[["ndcg@1"]][["eval"]]) - ndcg1_values) < TOLERANCE)) + +# ndcg2_values <- c( +# 0.6863147, 0.720986, 0.7306574, 0.745986, 0.7306574, +# 0.720986, 0.7403287, 0.7403287, 0.7403287, 0.7306574 +# ) +# expect_true(all(abs(unlist(eval_results[["ndcg@2"]][["eval"]]) - ndcg2_values) < TOLERANCE)) + +# ndcg3_values <- c( +# 0.6777939, 0.6984639, 0.711732, 0.7234639, 0.711732, +# 0.7101959, 0.719134, 0.719134, 0.725, 0.711732 +# ) +# expect_true(all(abs(unlist(eval_results[["ndcg@3"]][["eval"]]) - ndcg3_values) < TOLERANCE)) + +# # check details of each booster +# for (bst in cv_bst$boosters) { +# dumped_model <- jsonlite::fromJSON( +# bst$booster$dump_model() +# ) +# expect_equal(dumped_model[["objective"]], "lambdarank") +# expect_equal(dumped_model[["max_feature_idx"]], ncol(train_data) - 1L) +# } +# }) diff --git a/R-package/tests/testthat/test_lgb.convert_with_rules.R b/R-package/tests/testthat/test_lgb.convert_with_rules.R index b75e6c98a891..546ab9663f4f 100644 --- 
a/R-package/tests/testthat/test_lgb.convert_with_rules.R +++ b/R-package/tests/testthat/test_lgb.convert_with_rules.R @@ -13,9 +13,9 @@ test_that("lgb.convert_with_rules() rejects inputs that are not a data.table or ) ) for (bad_input in bad_inputs) { - # expect_error({ - # conversion_result <- lgb.convert_with_rules(bad_input) - # }, regexp = "lgb.convert_with_rules: you provided", fixed = TRUE) + expect_error({ + conversion_result <- lgb.convert_with_rules(bad_input) + }, regexp = "lgb.convert_with_rules: you provided", fixed = TRUE) } }) diff --git a/R-package/tests/testthat/test_lgb.importance.R b/R-package/tests/testthat/test_lgb.importance.R index 4dfbddc964ba..c0e1d6e8ca82 100644 --- a/R-package/tests/testthat/test_lgb.importance.R +++ b/R-package/tests/testthat/test_lgb.importance.R @@ -32,8 +32,8 @@ test_that("lgb.importance() should reject bad inputs", { , "lightgbm.model" ) for (input in bad_inputs) { - # expect_error({ - # lgb.importance(input) - # }, regexp = "'model' has to be an object of class lgb\\.Booster") + expect_error({ + lgb.importance(input) + }, regexp = "'model' has to be an object of class lgb\\.Booster") } }) diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 2b181796810c..5a9cfb641d61 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -2,9 +2,9 @@ context("lgb.encode.char") test_that("lgb.encode.char throws an informative error if it is passed a non-raw input", { x <- "some-string" - # expect_error({ - # lgb.encode.char(x) - # }, regexp = "Can only encode from raw type") + expect_error({ + lgb.encode.char(x) + }, regexp = "Can only encode from raw type") }) context("lgb.check.r6.class") @@ -61,9 +61,9 @@ test_that("lgb.params2str() works as expected for a key in params with multiple context("lgb.last_error") test_that("lgb.last_error() throws an error if there are no errors", { - # expect_error({ - # lgb.last_error() - # }, regexp = "Everything is fine") + expect_error({ + lgb.last_error() + }, regexp = "Everything is fine") }) test_that("lgb.last_error() correctly returns errors from the C++ side", { @@ -73,9 +73,9 @@ test_that("lgb.last_error() correctly returns errors from the C++ side", { data = train$data , label = as.matrix(rnorm(5L)) ) - # expect_error({ - # dvalid1$construct() - # }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) + expect_error({ + dvalid1$construct() + }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) }) context("lgb.check.eval") From c1eeb58aaf6bd2b5a263d6eb9c437e7f91efd710 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 14 Oct 2020 00:12:57 -0500 Subject: [PATCH 46/67] more comments --- R-package/tests/testthat/test_utils.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 5a9cfb641d61..edcc09091236 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -73,9 +73,9 @@ test_that("lgb.last_error() correctly returns errors from the C++ side", { data = train$data , label = as.matrix(rnorm(5L)) ) - expect_error({ - dvalid1$construct() - }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) + # expect_error({ + # dvalid1$construct() + # }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) }) context("lgb.check.eval") From 
77f2c81776190caa95c8976b77d04a07810a41cb Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 14 Oct 2020 00:47:21 -0500 Subject: [PATCH 47/67] more uncommenting --- R-package/tests/testthat/test_lgb.Booster.R | 52 ++++++++++----------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index ec3e308e6dd9..05bf571e9018 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -7,19 +7,15 @@ test_that("lgb.get.eval.result() should throw an informative error if booster is , c("a", "b") , NA , 10L - , lgb.Dataset( - data = matrix(1.0:10.0, 2L, 5L) - , params = list() - ) ) for (bad_input in bad_inputs) { - # expect_error({ - # lgb.get.eval.result( - # booster = bad_input - # , data_name = "test" - # , eval_name = "l2" - # ) - # }, regexp = "Can only use", fixed = TRUE) + expect_error({ + lgb.get.eval.result( + booster = bad_input + , data_name = "test" + , eval_name = "l2" + ) + }, regexp = "Can only use", fixed = TRUE) } }) @@ -108,30 +104,30 @@ test_that("lgb.load() gives the expected error messages given different incorrec ) # you have to give model_str or filename - # expect_error({ - # lgb.load() - # }, regexp = "either filename or model_str must be given") - # expect_error({ - # lgb.load(filename = NULL, model_str = NULL) - # }, regexp = "either filename or model_str must be given") + expect_error({ + lgb.load() + }, regexp = "either filename or model_str must be given") + expect_error({ + lgb.load(filename = NULL, model_str = NULL) + }, regexp = "either filename or model_str must be given") # if given, filename should be a string that points to an existing file model_file <- tempfile(fileext = ".model") - # expect_error({ - # lgb.load(filename = list(model_file)) - # }, regexp = "filename should be character") + expect_error({ + lgb.load(filename = list(model_file)) + }, regexp = "filename should be character") file_to_check <- paste0("a.model") while (file.exists(file_to_check)) { file_to_check <- paste0("a", file_to_check) } - # expect_error({ - # lgb.load(filename = file_to_check) - # }, regexp = "passed to filename does not exist") - - # if given, model_str should be a string - # expect_error({ - # lgb.load(model_str = c(4.0, 5.0, 6.0)) - # }, regexp = "model_str should be character") + expect_error({ + lgb.load(filename = file_to_check) + }, regexp = "passed to filename does not exist") + + if given, model_str should be a string + expect_error({ + lgb.load(model_str = c(4.0, 5.0, 6.0)) + }, regexp = "model_str should be character") }) From e6dfaca0203d3f5e9766c520a55b4310499f9949 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 14 Oct 2020 00:47:21 -0500 Subject: [PATCH 48/67] more uncommenting --- R-package/tests/testthat/test_lgb.Booster.R | 52 ++++++++++----------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index ec3e308e6dd9..05bf571e9018 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -7,19 +7,15 @@ test_that("lgb.get.eval.result() should throw an informative error if booster is , c("a", "b") , NA , 10L - , lgb.Dataset( - data = matrix(1.0:10.0, 2L, 5L) - , params = list() - ) ) for (bad_input in bad_inputs) { - # expect_error({ - # lgb.get.eval.result( - # booster = bad_input - # , data_name = "test" - # , eval_name = "l2" - # ) - # }, regexp = "Can 
only use", fixed = TRUE) + expect_error({ + lgb.get.eval.result( + booster = bad_input + , data_name = "test" + , eval_name = "l2" + ) + }, regexp = "Can only use", fixed = TRUE) } }) @@ -108,30 +104,30 @@ test_that("lgb.load() gives the expected error messages given different incorrec ) # you have to give model_str or filename - # expect_error({ - # lgb.load() - # }, regexp = "either filename or model_str must be given") - # expect_error({ - # lgb.load(filename = NULL, model_str = NULL) - # }, regexp = "either filename or model_str must be given") + expect_error({ + lgb.load() + }, regexp = "either filename or model_str must be given") + expect_error({ + lgb.load(filename = NULL, model_str = NULL) + }, regexp = "either filename or model_str must be given") # if given, filename should be a string that points to an existing file model_file <- tempfile(fileext = ".model") - # expect_error({ - # lgb.load(filename = list(model_file)) - # }, regexp = "filename should be character") + expect_error({ + lgb.load(filename = list(model_file)) + }, regexp = "filename should be character") file_to_check <- paste0("a.model") while (file.exists(file_to_check)) { file_to_check <- paste0("a", file_to_check) } - # expect_error({ - # lgb.load(filename = file_to_check) - # }, regexp = "passed to filename does not exist") - - # if given, model_str should be a string - # expect_error({ - # lgb.load(model_str = c(4.0, 5.0, 6.0)) - # }, regexp = "model_str should be character") + expect_error({ + lgb.load(filename = file_to_check) + }, regexp = "passed to filename does not exist") + + if given, model_str should be a string + expect_error({ + lgb.load(model_str = c(4.0, 5.0, 6.0)) + }, regexp = "model_str should be character") }) From 6fb20eb426929f0140fb08d6d2fccd2e55d65edc Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 14 Oct 2020 09:31:03 -0500 Subject: [PATCH 49/67] fix comment --- R-package/tests/testthat/test_lgb.Booster.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 05bf571e9018..df0a535d305d 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -124,7 +124,7 @@ test_that("lgb.load() gives the expected error messages given different incorrec lgb.load(filename = file_to_check) }, regexp = "passed to filename does not exist") - if given, model_str should be a string + # if given, model_str should be a string expect_error({ lgb.load(model_str = c(4.0, 5.0, 6.0)) }, regexp = "model_str should be character") From 06f783ef95fd5471993b95f04bc94a663035422f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 14 Oct 2020 10:37:55 -0500 Subject: [PATCH 50/67] more uncommenting --- R-package/tests/testthat/test_lgb.Booster.R | 38 ++++++++++----------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index df0a535d305d..dae3ebc98989 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -43,13 +43,13 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , min_data = 1L , learning_rate = 1.0 ) - # expect_error({ - # eval_results <- lgb.get.eval.result( - # booster = model - # , data_name = "testing" - # , eval_name = "l2" - # ) - # }, regexp = "Only the following datasets exist in record evals: [test]", fixed = TRUE) + expect_error({ + eval_results <- 
lgb.get.eval.result( + booster = model + , data_name = "testing" + , eval_name = "l2" + ) + }, regexp = "Only the following datasets exist in record evals: [test]", fixed = TRUE) }) test_that("lgb.get.eval.result() should throw an informative error for incorrect eval_name", { @@ -76,13 +76,13 @@ test_that("lgb.get.eval.result() should throw an informative error for incorrect , min_data = 1L , learning_rate = 1.0 ) - # expect_error({ - # eval_results <- lgb.get.eval.result( - # booster = model - # , data_name = "test" - # , eval_name = "l1" - # ) - # }, regexp = "Only the following eval_names exist for dataset.*\\: \\[l2\\]", fixed = FALSE) + expect_error({ + eval_results <- lgb.get.eval.result( + booster = model + , data_name = "test" + , eval_name = "l1" + ) + }, regexp = "Only the following eval_names exist for dataset.*\\: \\[l2\\]", fixed = FALSE) }) context("lgb.load()") @@ -375,11 +375,11 @@ test_that("Booster$update() throws an informative error if you provide a non-Dat , objective = "binary" , save_name = tempfile(fileext = ".model") ) - # expect_error({ - # bst$update( - # train_set = data.frame(x = rnorm(10L)) - # ) - # }, regexp = "lgb.Booster.update: Only can use lgb.Dataset", fixed = TRUE) + expect_error({ + bst$update( + train_set = data.frame(x = rnorm(10L)) + ) + }, regexp = "lgb.Booster.update: Only can use lgb.Dataset", fixed = TRUE) }) context("save_model") From 350e3307400392fa353f382c2f60dff01131c950 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 14 Oct 2020 14:18:36 -0500 Subject: [PATCH 51/67] uncomment fully-commented out stuff --- R-package/tests/testthat/test_Predictor.R | 150 +- R-package/tests/testthat/test_basic.R | 3444 ++++++++--------- .../tests/testthat/test_learning_to_rank.R | 282 +- 3 files changed, 1938 insertions(+), 1938 deletions(-) diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index c564b712e409..77719f2367a4 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -1,78 +1,78 @@ -# context("Predictor") +context("Predictor") -# test_that("predictions do not fail for integer input", { -# X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) -# y <- iris[["Sepal.Length"]] -# dtrain <- lgb.Dataset(X, label = y) -# fit <- lgb.train( -# data = dtrain -# , objective = "regression" -# , verbose = -1L -# , nrounds = 3L -# ) -# X_double <- X[c(1L, 51L, 101L), , drop = FALSE] -# X_integer <- X_double -# storage.mode(X_double) <- "double" -# pred_integer <- predict(fit, X_integer) -# pred_double <- predict(fit, X_double) -# expect_equal(pred_integer, pred_double) -# }) +test_that("predictions do not fail for integer input", { + X <- as.matrix(as.integer(iris[, "Species"]), ncol = 1L) + y <- iris[["Sepal.Length"]] + dtrain <- lgb.Dataset(X, label = y) + fit <- lgb.train( + data = dtrain + , objective = "regression" + , verbose = -1L + , nrounds = 3L + ) + X_double <- X[c(1L, 51L, 101L), , drop = FALSE] + X_integer <- X_double + storage.mode(X_double) <- "double" + pred_integer <- predict(fit, X_integer) + pred_double <- predict(fit, X_double) + expect_equal(pred_integer, pred_double) +}) -# test_that("start_iteration works correctly", { -# set.seed(708L) -# data(agaricus.train, package = "lightgbm") -# data(agaricus.test, package = "lightgbm") -# train <- agaricus.train -# test <- agaricus.test -# dtrain <- lgb.Dataset( -# agaricus.train$data -# , label = agaricus.train$label -# ) -# dtest <- lgb.Dataset.create.valid( -# dtrain -# , agaricus.test$data 
-# , label = agaricus.test$label -# ) -# bst <- lightgbm( -# data = as.matrix(train$data) -# , label = train$label -# , num_leaves = 4L -# , learning_rate = 0.6 -# , nrounds = 50L -# , objective = "binary" -# , valids = list("test" = dtest) -# , early_stopping_rounds = 2L -# ) -# expect_true(lgb.is.Booster(bst)) -# pred1 <- predict(bst, data = test$data, rawscore = TRUE) -# pred_contrib1 <- predict(bst, test$data, predcontrib = TRUE) -# pred2 <- rep(0.0, length(pred1)) -# pred_contrib2 <- rep(0.0, length(pred2)) -# step <- 11L -# end_iter <- 49L -# if (bst$best_iter != -1L) { -# end_iter <- bst$best_iter - 1L -# } -# start_iters <- seq(0L, end_iter, by = step) -# for (start_iter in start_iters) { -# n_iter <- min(c(end_iter - start_iter + 1L, step)) -# inc_pred <- predict(bst, test$data -# , start_iteration = start_iter -# , num_iteration = n_iter -# , rawscore = TRUE -# ) -# inc_pred_contrib <- bst$predict(test$data -# , start_iteration = start_iter -# , num_iteration = n_iter -# , predcontrib = TRUE -# ) -# pred2 <- pred2 + inc_pred -# pred_contrib2 <- pred_contrib2 + inc_pred_contrib -# } -# expect_equal(pred2, pred1) -# expect_equal(pred_contrib2, pred_contrib1) +test_that("start_iteration works correctly", { + set.seed(708L) + data(agaricus.train, package = "lightgbm") + data(agaricus.test, package = "lightgbm") + train <- agaricus.train + test <- agaricus.test + dtrain <- lgb.Dataset( + agaricus.train$data + , label = agaricus.train$label + ) + dtest <- lgb.Dataset.create.valid( + dtrain + , agaricus.test$data + , label = agaricus.test$label + ) + bst <- lightgbm( + data = as.matrix(train$data) + , label = train$label + , num_leaves = 4L + , learning_rate = 0.6 + , nrounds = 50L + , objective = "binary" + , valids = list("test" = dtest) + , early_stopping_rounds = 2L + ) + expect_true(lgb.is.Booster(bst)) + pred1 <- predict(bst, data = test$data, rawscore = TRUE) + pred_contrib1 <- predict(bst, test$data, predcontrib = TRUE) + pred2 <- rep(0.0, length(pred1)) + pred_contrib2 <- rep(0.0, length(pred2)) + step <- 11L + end_iter <- 49L + if (bst$best_iter != -1L) { + end_iter <- bst$best_iter - 1L + } + start_iters <- seq(0L, end_iter, by = step) + for (start_iter in start_iters) { + n_iter <- min(c(end_iter - start_iter + 1L, step)) + inc_pred <- predict(bst, test$data + , start_iteration = start_iter + , num_iteration = n_iter + , rawscore = TRUE + ) + inc_pred_contrib <- bst$predict(test$data + , start_iteration = start_iter + , num_iteration = n_iter + , predcontrib = TRUE + ) + pred2 <- pred2 + inc_pred + pred_contrib2 <- pred_contrib2 + inc_pred_contrib + } + expect_equal(pred2, pred1) + expect_equal(pred_contrib2, pred_contrib1) -# pred_leaf1 <- predict(bst, test$data, predleaf = TRUE) -# pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE) -# expect_equal(pred_leaf1, pred_leaf2) -# }) + pred_leaf1 <- predict(bst, test$data, predleaf = TRUE) + pred_leaf2 <- predict(bst, test$data, start_iteration = 0L, num_iteration = end_iter + 1L, predleaf = TRUE) + expect_equal(pred_leaf1, pred_leaf2) +}) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 5bea558e9c92..b94e91c9897c 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1,1722 +1,1722 @@ -# context("lightgbm()") - -# data(agaricus.train, package = "lightgbm") -# data(agaricus.test, package = "lightgbm") -# train <- agaricus.train -# test <- agaricus.test - -# TOLERANCE <- 1e-6 -# 
set.seed(708L) - -# # [description] Every time this function is called, it adds 0.1 -# # to an accumulator then returns the current value. -# # This is used to mock the situation where an evaluation -# # metric increases every iteration -# ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR" -# assign(x = ACCUMULATOR_NAME, value = 0.0, envir = .GlobalEnv) - -# .increasing_metric <- function(preds, dtrain) { -# if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)) { -# assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) -# } -# assign( -# x = ACCUMULATOR_NAME -# , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) + 0.1 -# , envir = .GlobalEnv -# ) -# return(list( -# name = "increasing_metric" -# , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) -# , higher_better = TRUE -# )) -# } - -# # [description] Evaluation function that always returns the -# # same value -# CONSTANT_METRIC_VALUE <- 0.2 -# .constant_metric <- function(preds, dtrain) { -# return(list( -# name = "constant_metric" -# , value = CONSTANT_METRIC_VALUE -# , higher_better = FALSE -# )) -# } - -# # sample datasets to test early stopping -# DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( -# data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) -# , label = rnorm(100L) -# ) -# DVALID_RANDOM_REGRESSION <- lgb.Dataset( -# data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) -# , label = rnorm(50L) -# ) -# DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( -# data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) -# , label = sample(c(0L, 1L), size = 120L, replace = TRUE) -# ) -# DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( -# data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) -# , label = sample(c(0L, 1L), size = 37L, replace = TRUE) -# ) - -# test_that("train and predict binary classification", { -# nrounds <- 10L -# bst <- lightgbm( -# data = train$data -# , label = train$label -# , num_leaves = 5L -# , nrounds = nrounds -# , objective = "binary" -# , metric = "binary_error" -# , save_name = tempfile(fileext = ".model") -# ) -# expect_false(is.null(bst$record_evals)) -# record_results <- lgb.get.eval.result(bst, "train", "binary_error") -# expect_lt(min(record_results), 0.02) - -# pred <- predict(bst, test$data) -# expect_equal(length(pred), 1611L) - -# pred1 <- predict(bst, train$data, num_iteration = 1L) -# expect_equal(length(pred1), 6513L) -# err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) -# err_log <- record_results[1L] -# expect_lt(abs(err_pred1 - err_log), TOLERANCE) -# }) - - -# test_that("train and predict softmax", { -# set.seed(708L) -# lb <- as.numeric(iris$Species) - 1L - -# bst <- lightgbm( -# data = as.matrix(iris[, -5L]) -# , label = lb -# , num_leaves = 4L -# , learning_rate = 0.05 -# , nrounds = 20L -# , min_data = 20L -# , min_hessian = 10.0 -# , objective = "multiclass" -# , metric = "multi_error" -# , num_class = 3L -# , save_name = tempfile(fileext = ".model") -# ) - -# expect_false(is.null(bst$record_evals)) -# record_results <- lgb.get.eval.result(bst, "train", "multi_error") -# expect_lt(min(record_results), 0.06) - -# pred <- predict(bst, as.matrix(iris[, -5L])) -# expect_equal(length(pred), nrow(iris) * 3L) -# }) - - -# test_that("use of multiple eval metrics works", { -# metrics <- list("binary_error", "auc", "binary_logloss") -# bst <- lightgbm( -# data = train$data -# , label = train$label -# , num_leaves = 4L -# , learning_rate = 1.0 -# , nrounds = 10L -# , objective = "binary" -# , metric = metrics -# , save_name = tempfile(fileext = ".model") -# ) -# 
expect_false(is.null(bst$record_evals)) -# expect_named( -# bst$record_evals[["train"]] -# , unlist(metrics) -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# }) - -# test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for binary classification", { -# set.seed(708L) -# nrounds <- 10L -# bst <- lightgbm( -# data = train$data -# , label = train$label -# , num_leaves = 5L -# , nrounds = nrounds -# , objective = "binary" -# , metric = "binary_error" -# , save_name = tempfile(fileext = ".model") -# ) -# expect_true(abs(bst$lower_bound() - -1.590853) < TOLERANCE) -# expect_true(abs(bst$upper_bound() - 1.871015) < TOLERANCE) -# }) - -# test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for regression", { -# set.seed(708L) -# nrounds <- 10L -# bst <- lightgbm( -# data = train$data -# , label = train$label -# , num_leaves = 5L -# , nrounds = nrounds -# , objective = "regression" -# , metric = "l2" -# , save_name = tempfile(fileext = ".model") -# ) -# expect_true(abs(bst$lower_bound() - 0.1513859) < TOLERANCE) -# expect_true(abs(bst$upper_bound() - 0.9080349) < TOLERANCE) -# }) - -# test_that("lightgbm() rejects negative or 0 value passed to nrounds", { -# dtrain <- lgb.Dataset(train$data, label = train$label) -# params <- list(objective = "regression", metric = "l2,l1") -# for (nround_value in c(-10L, 0L)) { -# # expect_error({ -# # bst <- lightgbm( -# # data = dtrain -# # , params = params -# # , nrounds = nround_value -# # , save_name = tempfile(fileext = ".model") -# # ) -# # }, "nrounds should be greater than zero") -# } -# }) - -# test_that("lightgbm() performs evaluation on validation sets if they are provided", { -# set.seed(708L) -# dvalid1 <- lgb.Dataset( -# data = train$data -# , label = train$label -# ) -# dvalid2 <- lgb.Dataset( -# data = train$data -# , label = train$label -# ) -# nrounds <- 10L -# bst <- lightgbm( -# data = train$data -# , label = train$label -# , num_leaves = 5L -# , nrounds = nrounds -# , objective = "binary" -# , metric = c( -# "binary_error" -# , "auc" -# ) -# , valids = list( -# "valid1" = dvalid1 -# , "valid2" = dvalid2 -# ) -# , save_name = tempfile(fileext = ".model") -# ) - -# expect_named( -# bst$record_evals -# , c("train", "valid1", "valid2", "start_iter") -# , ignore.order = TRUE -# , ignore.case = FALSE -# ) -# for (valid_name in c("train", "valid1", "valid2")) { -# eval_results <- bst$record_evals[[valid_name]][["binary_error"]] -# expect_length(eval_results[["eval"]], nrounds) -# } -# expect_true(abs(bst$record_evals[["train"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) -# expect_true(abs(bst$record_evals[["valid1"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) -# expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) -# }) - - -# context("training continuation") - -# test_that("training continuation works", { -# dtrain <- lgb.Dataset( -# train$data -# , label = train$label -# , free_raw_data = FALSE -# ) -# watchlist <- list(train = dtrain) -# param <- list( -# objective = "binary" -# , metric = "binary_logloss" -# , num_leaves = 5L -# , learning_rate = 1.0 -# ) - -# # train for 10 consecutive iterations -# bst <- lgb.train(param, dtrain, nrounds = 10L, watchlist) -# err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10L) - -# # train for 5 iterations, save, load, train for 5 more -# bst1 <- lgb.train(param, dtrain, nrounds = 5L, watchlist) -# model_file <- tempfile(fileext = 
".model") -# lgb.save(bst1, model_file) -# bst2 <- lgb.train(param, dtrain, nrounds = 5L, watchlist, init_model = bst1) -# err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10L) - -# # evaluation metrics should be nearly identical for the model trained in 10 coonsecutive -# # iterations and the one trained in 5-then-5. -# expect_lt(abs(err_bst - err_bst2), 0.01) -# }) - -# context("lgb.cv()") - -# test_that("cv works", { -# dtrain <- lgb.Dataset(train$data, label = train$label) -# params <- list(objective = "regression", metric = "l2,l1") -# bst <- lgb.cv( -# params -# , dtrain -# , 10L -# , nfold = 5L -# , min_data = 1L -# , learning_rate = 1.0 -# , early_stopping_rounds = 10L -# ) -# expect_false(is.null(bst$record_evals)) -# }) - -# test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { -# dtrain <- lgb.Dataset(train$data, label = train$label) -# params <- list(objective = "regression", metric = "l2,l1") -# for (nround_value in c(-10L, 0L)) { -# # expect_error({ -# # bst <- lgb.cv( -# # params -# # , dtrain -# # , nround_value -# # , nfold = 5L -# # , min_data = 1L -# # ) -# # }, "nrounds should be greater than zero") -# } -# }) - -# test_that("lgb.cv() throws an informative error is 'data' is not an lgb.Dataset and labels are not given", { -# bad_values <- list( -# 4L -# , "hello" -# , list(a = TRUE, b = seq_len(10L)) -# , data.frame(x = seq_len(5L), y = seq_len(5L)) -# , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) -# , matrix(data = seq_len(10L), 2L, 5L) -# ) -# for (val in bad_values) { -# # expect_error({ -# # bst <- lgb.cv( -# # params = list(objective = "regression", metric = "l2,l1") -# # , data = val -# # , 10L -# # , nfold = 5L -# # , min_data = 1L -# # ) -# # }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) -# } -# }) - -# test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric where higher values are better", { -# set.seed(708L) -# dtrain <- lgb.Dataset( -# data = as.matrix(runif(n = 500L, min = 0.0, max = 15.0), drop = FALSE) -# , label = rep(c(0L, 1L), 250L) -# ) -# nrounds <- 10L -# cv_bst <- lgb.cv( -# data = dtrain -# , nfold = 5L -# , nrounds = nrounds -# , num_leaves = 5L -# , params = list( -# objective = "binary" -# , metric = "auc,binary_error" -# , learning_rate = 1.5 -# ) -# ) -# expect_is(cv_bst, "lgb.CVBooster") -# expect_named( -# cv_bst$record_evals -# , c("start_iter", "valid") -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# auc_scores <- unlist(cv_bst$record_evals[["valid"]][["auc"]][["eval"]]) -# expect_length(auc_scores, nrounds) -# expect_identical(cv_bst$best_iter, which.max(auc_scores)) -# expect_identical(cv_bst$best_score, auc_scores[which.max(auc_scores)]) -# }) - -# context("lgb.train()") - -# test_that("lgb.train() works as expected with multiple eval metrics", { -# metrics <- c("binary_error", "auc", "binary_logloss") -# bst <- lgb.train( -# data = lgb.Dataset( -# train$data -# , label = train$label -# ) -# , learning_rate = 1.0 -# , nrounds = 10L -# , params = list( -# objective = "binary" -# , metric = metrics -# ) -# , valids = list( -# "train" = lgb.Dataset( -# train$data -# , label = train$label -# ) -# ) -# ) -# expect_false(is.null(bst$record_evals)) -# expect_named( -# bst$record_evals[["train"]] -# , unlist(metrics) -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# }) - -# test_that("lgb.train() rejects negative or 0 value passed to nrounds", { -# dtrain <- lgb.Dataset(train$data, label = train$label) 
-# params <- list(objective = "regression", metric = "l2,l1") -# for (nround_value in c(-10L, 0L)) { -# # expect_error({ -# # bst <- lgb.train( -# # params -# # , dtrain -# # , nround_value -# # ) -# # }, "nrounds should be greater than zero") -# } -# }) - -# test_that("lgb.train() throws an informative error if 'data' is not an lgb.Dataset", { -# bad_values <- list( -# 4L -# , "hello" -# , list(a = TRUE, b = seq_len(10L)) -# , data.frame(x = seq_len(5L), y = seq_len(5L)) -# , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) -# , matrix(data = seq_len(10L), 2L, 5L) -# ) -# for (val in bad_values) { -# # expect_error({ -# # bst <- lgb.train( -# # params = list(objective = "regression", metric = "l2,l1") -# # , data = val -# # , 10L -# # ) -# # }, regexp = "data must be an lgb.Dataset instance", fixed = TRUE) -# } -# }) - -# test_that("lgb.train() throws an informative error if 'valids' is not a list of lgb.Dataset objects", { -# valids <- list( -# "valid1" = data.frame(x = rnorm(5L), y = rnorm(5L)) -# , "valid2" = data.frame(x = rnorm(5L), y = rnorm(5L)) -# ) -# # expect_error({ -# # bst <- lgb.train( -# # params = list(objective = "regression", metric = "l2,l1") -# # , data = lgb.Dataset(train$data, label = train$label) -# # , 10L -# # , valids = valids -# # ) -# # }, regexp = "valids must be a list of lgb.Dataset elements") -# }) - -# test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but some do not have names", { -# valids <- list( -# "valid1" = lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) -# , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) -# ) -# # expect_error({ -# # bst <- lgb.train( -# # params = list(objective = "regression", metric = "l2,l1") -# # , data = lgb.Dataset(train$data, label = train$label) -# # , 10L -# # , valids = valids -# # ) -# # }, regexp = "each element of valids must have a name") -# }) - -# test_that("lgb.train() throws an informative error if 'valids' contains lgb.Dataset objects but none have names", { -# valids <- list( -# lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) -# , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) -# ) -# # expect_error({ -# # bst <- lgb.train( -# # params = list(objective = "regression", metric = "l2,l1") -# # , data = lgb.Dataset(train$data, label = train$label) -# # , 10L -# # , valids = valids -# # ) -# # }, regexp = "each element of valids must have a name") -# }) - -# test_that("lgb.train() works with force_col_wise and force_row_wise", { -# set.seed(1234L) -# nrounds <- 10L -# dtrain <- lgb.Dataset( -# train$data -# , label = train$label -# ) -# params <- list( -# objective = "binary" -# , metric = "binary_error" -# , force_col_wise = TRUE -# ) -# bst_col_wise <- lgb.train( -# params = params -# , data = dtrain -# , nrounds = nrounds -# ) - -# params <- list( -# objective = "binary" -# , metric = "binary_error" -# , force_row_wise = TRUE -# ) -# bst_row_wise <- lgb.train( -# params = params -# , data = dtrain -# , nrounds = nrounds -# ) - -# expected_error <- 0.003070782 -# expect_equal(bst_col_wise$eval_train()[[1L]][["value"]], expected_error) -# expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error) - -# # check some basic details of the boosters just to be sure force_col_wise -# # and force_row_wise are not causing any weird side effects -# for (bst in list(bst_row_wise, bst_col_wise)) { -# expect_equal(bst$current_iter(), nrounds) -# parsed_model <- jsonlite::fromJSON(bst$dump_model()) -# expect_equal(parsed_model$objective, "binary sigmoid:1") -# expect_false(parsed_model$average_output) -# } -# }) 
- -# test_that("lgb.train() works as expected with sparse features", { -# set.seed(708L) -# num_obs <- 70000L -# trainDF <- data.frame( -# y = sample(c(0L, 1L), size = num_obs, replace = TRUE) -# , x = sample(c(1.0:10.0, rep(NA_real_, 50L)), size = num_obs, replace = TRUE) -# ) -# dtrain <- lgb.Dataset( -# data = as.matrix(trainDF[["x"]], drop = FALSE) -# , label = trainDF[["y"]] -# ) -# nrounds <- 1L -# bst <- lgb.train( -# params = list( -# objective = "binary" -# , min_data = 1L -# , min_data_in_bin = 1L -# ) -# , data = dtrain -# , nrounds = nrounds -# ) - -# expect_true(lgb.is.Booster(bst)) -# expect_equal(bst$current_iter(), nrounds) -# parsed_model <- jsonlite::fromJSON(bst$dump_model()) -# expect_equal(parsed_model$objective, "binary sigmoid:1") -# expect_false(parsed_model$average_output) -# expected_error <- 0.6931268 -# expect_true(abs(bst$eval_train()[[1L]][["value"]] - expected_error) < TOLERANCE) -# }) - -# test_that("lgb.train() works with early stopping for classification", { -# trainDF <- data.frame( -# "feat1" = rep(c(5.0, 10.0), 500L) -# , "target" = rep(c(0L, 1L), 500L) -# ) -# validDF <- data.frame( -# "feat1" = rep(c(5.0, 10.0), 50L) -# , "target" = rep(c(0L, 1L), 50L) -# ) -# dtrain <- lgb.Dataset( -# data = as.matrix(trainDF[["feat1"]], drop = FALSE) -# , label = trainDF[["target"]] -# ) -# dvalid <- lgb.Dataset( -# data = as.matrix(validDF[["feat1"]], drop = FALSE) -# , label = validDF[["target"]] -# ) -# nrounds <- 10L - -# ################################ -# # train with no early stopping # -# ################################ -# bst <- lgb.train( -# params = list( -# objective = "binary" -# , metric = "binary_error" -# ) -# , data = dtrain -# , nrounds = nrounds -# , valids = list( -# "valid1" = dvalid -# ) -# ) - -# # a perfect model should be trivial to obtain, but all 10 rounds -# # should happen -# expect_equal(bst$best_score, 0.0) -# expect_equal(bst$best_iter, 1L) -# expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) - -# ############################# -# # train with early stopping # -# ############################# -# early_stopping_rounds <- 5L -# bst <- lgb.train( -# params = list( -# objective = "binary" -# , metric = "binary_error" -# , early_stopping_rounds = early_stopping_rounds -# ) -# , data = dtrain -# , nrounds = nrounds -# , valids = list( -# "valid1" = dvalid -# ) -# ) - -# # a perfect model should be trivial to obtain, and only 6 rounds -# # should have happen (1 with improvement, 5 consecutive with no improvement) -# expect_equal(bst$best_score, 0.0) -# expect_equal(bst$best_iter, 1L) -# expect_equal( -# length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]) -# , early_stopping_rounds + 1L -# ) - -# }) - -# test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stopping", { -# set.seed(708L) -# trainDF <- data.frame( -# "feat1" = rep(c(5.0, 10.0), 500L) -# , "target" = rep(c(0L, 1L), 500L) -# ) -# validDF <- data.frame( -# "feat1" = rep(c(5.0, 10.0), 50L) -# , "target" = rep(c(0L, 1L), 50L) -# ) -# dtrain <- lgb.Dataset( -# data = as.matrix(trainDF[["feat1"]], drop = FALSE) -# , label = trainDF[["target"]] -# ) -# dvalid <- lgb.Dataset( -# data = as.matrix(validDF[["feat1"]], drop = FALSE) -# , label = validDF[["target"]] -# ) -# nrounds <- 5L - -# for (value in c(-5L, 0L)) { - -# #----------------------------# -# # passed as keyword argument # -# #----------------------------# -# bst <- lgb.train( -# params = list( -# objective = "binary" -# , metric = "binary_error" -# ) -# 
, data = dtrain -# , nrounds = nrounds -# , valids = list( -# "valid1" = dvalid -# ) -# , early_stopping_rounds = value -# ) - -# # a perfect model should be trivial to obtain, but all 10 rounds -# # should happen -# expect_equal(bst$best_score, 0.0) -# expect_equal(bst$best_iter, 1L) -# expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) - -# #---------------------------# -# # passed as parameter alias # -# #---------------------------# -# bst <- lgb.train( -# params = list( -# objective = "binary" -# , metric = "binary_error" -# , n_iter_no_change = value -# ) -# , data = dtrain -# , nrounds = nrounds -# , valids = list( -# "valid1" = dvalid -# ) -# ) - -# # a perfect model should be trivial to obtain, but all 10 rounds -# # should happen -# expect_equal(bst$best_score, 0.0) -# expect_equal(bst$best_iter, 1L) -# expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) -# } -# }) - -# test_that("lgb.train() works with early stopping for classification with a metric that should be maximized", { -# set.seed(708L) -# dtrain <- lgb.Dataset( -# data = train$data -# , label = train$label -# ) -# dvalid <- lgb.Dataset( -# data = test$data -# , label = test$label -# ) -# nrounds <- 10L - -# ############################# -# # train with early stopping # -# ############################# -# early_stopping_rounds <- 5L -# # the harsh max_depth guarantees that AUC improves over at least the first few iterations -# bst_auc <- lgb.train( -# params = list( -# objective = "binary" -# , metric = "auc" -# , max_depth = 3L -# , early_stopping_rounds = early_stopping_rounds -# ) -# , data = dtrain -# , nrounds = nrounds -# , valids = list( -# "valid1" = dvalid -# ) -# ) -# bst_binary_error <- lgb.train( -# params = list( -# objective = "binary" -# , metric = "binary_error" -# , max_depth = 3L -# , early_stopping_rounds = early_stopping_rounds -# ) -# , data = dtrain -# , nrounds = nrounds -# , valids = list( -# "valid1" = dvalid -# ) -# ) - -# # early stopping should have been hit for binary_error (higher_better = FALSE) -# eval_info <- bst_binary_error$.__enclos_env__$private$get_eval_info() -# expect_identical(eval_info, "binary_error") -# expect_identical( -# unname(bst_binary_error$.__enclos_env__$private$higher_better_inner_eval) -# , FALSE -# ) -# expect_identical(bst_binary_error$best_iter, 1L) -# expect_identical(bst_binary_error$current_iter(), early_stopping_rounds + 1L) -# expect_true(abs(bst_binary_error$best_score - 0.01613904) < TOLERANCE) - -# # early stopping should not have been hit for AUC (higher_better = TRUE) -# eval_info <- bst_auc$.__enclos_env__$private$get_eval_info() -# expect_identical(eval_info, "auc") -# expect_identical( -# unname(bst_auc$.__enclos_env__$private$higher_better_inner_eval) -# , TRUE -# ) -# expect_identical(bst_auc$best_iter, 9L) -# expect_identical(bst_auc$current_iter(), nrounds) -# expect_true(abs(bst_auc$best_score - 0.9999969) < TOLERANCE) -# }) - -# test_that("lgb.train() works with early stopping for regression", { -# set.seed(708L) -# trainDF <- data.frame( -# "feat1" = rep(c(10.0, 100.0), 500L) -# , "target" = rep(c(-50.0, 50.0), 500L) -# ) -# validDF <- data.frame( -# "feat1" = rep(50.0, 4L) -# , "target" = rep(50.0, 4L) -# ) -# dtrain <- lgb.Dataset( -# data = as.matrix(trainDF[["feat1"]], drop = FALSE) -# , label = trainDF[["target"]] -# ) -# dvalid <- lgb.Dataset( -# data = as.matrix(validDF[["feat1"]], drop = FALSE) -# , label = validDF[["target"]] -# ) -# nrounds <- 10L - -# 
################################ -# # train with no early stopping # -# ################################ -# bst <- lgb.train( -# params = list( -# objective = "regression" -# , metric = "rmse" -# ) -# , data = dtrain -# , nrounds = nrounds -# , valids = list( -# "valid1" = dvalid -# ) -# ) - -# # the best possible model should come from the first iteration, but -# # all 10 training iterations should happen -# expect_equal(bst$best_score, 55.0) -# expect_equal(bst$best_iter, 1L) -# expect_equal(length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]), nrounds) - -# ############################# -# # train with early stopping # -# ############################# -# early_stopping_rounds <- 5L -# bst <- lgb.train( -# params = list( -# objective = "regression" -# , metric = "rmse" -# , early_stopping_rounds = early_stopping_rounds -# ) -# , data = dtrain -# , nrounds = nrounds -# , valids = list( -# "valid1" = dvalid -# ) -# ) - -# # the best model should be from the first iteration, and only 6 rounds -# # should have happen (1 with improvement, 5 consecutive with no improvement) -# expect_equal(bst$best_score, 55.0) -# expect_equal(bst$best_iter, 1L) -# expect_equal( -# length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) -# , early_stopping_rounds + 1L -# ) -# }) - -# test_that("lgb.train() does not stop early if early_stopping_rounds is not given", { -# set.seed(708L) - -# increasing_metric_starting_value <- get( -# ACCUMULATOR_NAME -# , envir = .GlobalEnv -# ) -# nrounds <- 10L -# metrics <- list( -# .constant_metric -# , .increasing_metric -# ) -# bst <- lgb.train( -# params = list( -# objective = "regression" -# , metric = "None" -# ) -# , data = DTRAIN_RANDOM_REGRESSION -# , nrounds = nrounds -# , valids = list("valid1" = DVALID_RANDOM_REGRESSION) -# , eval = metrics -# ) - -# # Only the two functions provided to "eval" should have been evaluated -# expect_equal(length(bst$record_evals[["valid1"]]), 2L) - -# # all 10 iterations should have happen, and the best_iter should be -# # the first one (based on constant_metric) -# best_iter <- 1L -# expect_equal(bst$best_iter, best_iter) - -# # best_score should be taken from the first metric -# expect_equal( -# bst$best_score -# , bst$record_evals[["valid1"]][["constant_metric"]][["eval"]][[best_iter]] -# ) - -# # early stopping should not have happened. 
Even though constant_metric -# # had 9 consecutive iterations with no improvement, it is ignored because of -# # first_metric_only = TRUE -# expect_equal( -# length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) -# , nrounds -# ) -# expect_equal( -# length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) -# , nrounds -# ) -# }) - -# test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to stop early based on all metrics", { -# set.seed(708L) - -# early_stopping_rounds <- 3L -# param_variations <- list( -# list( -# objective = "regression" -# , metric = "None" -# , early_stopping_rounds = early_stopping_rounds -# ) -# , list( -# objective = "regression" -# , metric = "None" -# , early_stopping_rounds = early_stopping_rounds -# , first_metric_only = FALSE -# ) -# ) - -# for (params in param_variations) { - -# nrounds <- 10L -# bst <- lgb.train( -# params = params -# , data = DTRAIN_RANDOM_REGRESSION -# , nrounds = nrounds -# , valids = list( -# "valid1" = DVALID_RANDOM_REGRESSION -# ) -# , eval = list( -# .increasing_metric -# , .constant_metric -# ) -# ) - -# # Only the two functions provided to "eval" should have been evaluated -# expect_equal(length(bst$record_evals[["valid1"]]), 2L) - -# # early stopping should have happened, and should have stopped early_stopping_rounds + 1 rounds in -# # because constant_metric never improves -# # -# # the best iteration should be the last one, because increasing_metric was first -# # and gets better every iteration -# best_iter <- early_stopping_rounds + 1L -# expect_equal(bst$best_iter, best_iter) - -# # best_score should be taken from "increasing_metric" because it was first -# expect_equal( -# bst$best_score -# , bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]][[best_iter]] -# ) - -# # early stopping should not have happened. even though increasing_metric kept -# # getting better, early stopping should have happened because "constant_metric" -# # did not improve -# expect_equal( -# length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) -# , early_stopping_rounds + 1L -# ) -# expect_equal( -# length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) -# , early_stopping_rounds + 1L -# ) -# } - -# }) - -# test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based on only the first metric", { -# set.seed(708L) -# nrounds <- 10L -# early_stopping_rounds <- 3L -# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) -# bst <- lgb.train( -# params = list( -# objective = "regression" -# , metric = "None" -# , early_stopping_rounds = early_stopping_rounds -# , first_metric_only = TRUE -# ) -# , data = DTRAIN_RANDOM_REGRESSION -# , nrounds = nrounds -# , valids = list( -# "valid1" = DVALID_RANDOM_REGRESSION -# ) -# , eval = list( -# .increasing_metric -# , .constant_metric -# ) -# ) - -# # Only the two functions provided to "eval" should have been evaluated -# expect_equal(length(bst$record_evals[["valid1"]]), 2L) - -# # all 10 iterations should happen, and the best_iter should be the final one -# expect_equal(bst$best_iter, nrounds) - -# # best_score should be taken from "increasing_metric" -# expect_equal( -# bst$best_score -# , increasing_metric_starting_value + 0.1 * nrounds -# ) - -# # early stopping should not have happened. 
Even though constant_metric -# # had 9 consecutive iterations with no improvement, it is ignored because of -# # first_metric_only = TRUE -# expect_equal( -# length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) -# , nrounds -# ) -# expect_equal( -# length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) -# , nrounds -# ) -# }) - -# test_that("lgb.train() works when a mixture of functions and strings are passed to eval", { -# set.seed(708L) -# nrounds <- 10L -# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) -# bst <- lgb.train( -# params = list( -# objective = "regression" -# , metric = "None" -# ) -# , data = DTRAIN_RANDOM_REGRESSION -# , nrounds = nrounds -# , valids = list( -# "valid1" = DVALID_RANDOM_REGRESSION -# ) -# , eval = list( -# .increasing_metric -# , "rmse" -# , .constant_metric -# , "l2" -# ) -# ) - -# # all 4 metrics should have been used -# expect_named( -# bst$record_evals[["valid1"]] -# , expected = c("rmse", "l2", "increasing_metric", "constant_metric") -# , ignore.order = TRUE -# , ignore.case = FALSE -# ) - -# # the difference metrics shouldn't have been mixed up with each other -# results <- bst$record_evals[["valid1"]] -# expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE) -# expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE) -# expected_increasing_metric <- increasing_metric_starting_value + 0.1 -# expect_true( -# abs( -# results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric -# ) < TOLERANCE -# ) -# expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) - -# }) - -# test_that("lgb.train() works when a list of strings or a character vector is passed to eval", { - -# # testing list and character vector, as well as length-1 and length-2 -# eval_variations <- list( -# c("binary_error", "binary_logloss") -# , "binary_logloss" -# , list("binary_error", "binary_logloss") -# , list("binary_logloss") -# ) - -# for (eval_variation in eval_variations) { - -# set.seed(708L) -# nrounds <- 10L -# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) -# bst <- lgb.train( -# params = list( -# objective = "binary" -# , metric = "None" -# ) -# , data = DTRAIN_RANDOM_CLASSIFICATION -# , nrounds = nrounds -# , valids = list( -# "valid1" = DVALID_RANDOM_CLASSIFICATION -# ) -# , eval = eval_variation -# ) - -# # both metrics should have been used -# expect_named( -# bst$record_evals[["valid1"]] -# , expected = unlist(eval_variation) -# , ignore.order = TRUE -# , ignore.case = FALSE -# ) - -# # the difference metrics shouldn't have been mixed up with each other -# results <- bst$record_evals[["valid1"]] -# if ("binary_error" %in% unlist(eval_variation)) { -# expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) -# } -# if ("binary_logloss" %in% unlist(eval_variation)) { -# expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) -# } -# } -# }) - -# test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { -# set.seed(708L) -# nrounds <- 10L -# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) -# bst <- lgb.train( -# params = list( -# objective = "binary" -# , metric = "binary_error" -# ) -# , data = DTRAIN_RANDOM_CLASSIFICATION -# , nrounds = nrounds -# , valids = list( -# "valid1" = DVALID_RANDOM_CLASSIFICATION -# ) -# , eval = "binary_logloss" -# ) - -# # both metrics should have 
been used
-# expect_named(
-# bst$record_evals[["valid1"]]
-# , expected = c("binary_error", "binary_logloss")
-# , ignore.order = TRUE
-# , ignore.case = FALSE
-# )
-
-# # the difference metrics shouldn't have been mixed up with each other
-# results <- bst$record_evals[["valid1"]]
-# expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE)
-# expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE)
-# })
-
-# test_that("lgb.train() works when you give a function for eval", {
-# set.seed(708L)
-# nrounds <- 10L
-# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
-# bst <- lgb.train(
-# params = list(
-# objective = "binary"
-# , metric = "None"
-# )
-# , data = DTRAIN_RANDOM_CLASSIFICATION
-# , nrounds = nrounds
-# , valids = list(
-# "valid1" = DVALID_RANDOM_CLASSIFICATION
-# )
-# , eval = .constant_metric
-# )
-
-# # the difference metrics shouldn't have been mixed up with each other
-# results <- bst$record_evals[["valid1"]]
-# expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE)
-# })
-
-# test_that("lgb.train() works with early stopping for regression with a metric that should be minimized", {
-# set.seed(708L)
-# trainDF <- data.frame(
-# "feat1" = rep(c(10.0, 100.0), 500L)
-# , "target" = rep(c(-50.0, 50.0), 500L)
-# )
-# validDF <- data.frame(
-# "feat1" = rep(50.0, 4L)
-# , "target" = rep(50.0, 4L)
-# )
-# dtrain <- lgb.Dataset(
-# data = as.matrix(trainDF[["feat1"]], drop = FALSE)
-# , label = trainDF[["target"]]
-# )
-# dvalid <- lgb.Dataset(
-# data = as.matrix(validDF[["feat1"]], drop = FALSE)
-# , label = validDF[["target"]]
-# )
-# nrounds <- 10L
-
-# #############################
-# # train with early stopping #
-# #############################
-# early_stopping_rounds <- 5L
-# bst <- lgb.train(
-# params = list(
-# objective = "regression"
-# , metric = c(
-# "mape"
-# , "rmse"
-# , "mae"
-# )
-# , min_data_in_bin = 5L
-# , early_stopping_rounds = early_stopping_rounds
-# )
-# , data = dtrain
-# , nrounds = nrounds
-# , valids = list(
-# "valid1" = dvalid
-# )
-# )
-
-# # the best model should be from the first iteration, and only 6 rounds
-# # should have happened (1 with improvement, 5 consecutive with no improvement)
-# expect_equal(bst$best_score, 1.1)
-# expect_equal(bst$best_iter, 1L)
-# expect_equal(
-# length(bst$record_evals[["valid1"]][["mape"]][["eval"]])
-# , early_stopping_rounds + 1L
-# )
-
-# # Booster should understand that all three of these metrics should be minimized
-# eval_info <- bst$.__enclos_env__$private$get_eval_info()
-# expect_identical(eval_info, c("mape", "rmse", "l1"))
-# expect_identical(
-# unname(bst$.__enclos_env__$private$higher_better_inner_eval)
-# , rep(FALSE, 3L)
-# )
-# })
-
-
-# test_that("lgb.train() supports non-ASCII feature names", {
-# testthat::skip("UTF-8 feature names are not fully supported in the R package")
-# dtrain <- lgb.Dataset(
-# data = matrix(rnorm(400L), ncol = 4L)
-# , label = rnorm(100L)
-# )
-# feature_names <- c("F_零", "F_一", "F_二", "F_三")
-# bst <- lgb.train(
-# data = dtrain
-# , nrounds = 5L
-# , obj = "regression"
-# , params = list(
-# metric = "rmse"
-# )
-# , colnames = feature_names
-# )
-# expect_true(lgb.is.Booster(bst))
-# dumped_model <- jsonlite::fromJSON(bst$dump_model())
-# expect_identical(
-# dumped_model[["feature_names"]]
-# , feature_names
-# )
-# })
-
-# test_that("when early stopping is not activated, best_iter and best_score come from valids and not 
training data", { -# set.seed(708L) -# trainDF <- data.frame( -# "feat1" = rep(c(10.0, 100.0), 500L) -# , "target" = rep(c(-50.0, 50.0), 500L) -# ) -# validDF <- data.frame( -# "feat1" = rep(50.0, 4L) -# , "target" = rep(50.0, 4L) -# ) -# dtrain <- lgb.Dataset( -# data = as.matrix(trainDF[["feat1"]], drop = FALSE) -# , label = trainDF[["target"]] -# ) -# dvalid1 <- lgb.Dataset( -# data = as.matrix(validDF[["feat1"]], drop = FALSE) -# , label = validDF[["target"]] -# ) -# dvalid2 <- lgb.Dataset( -# data = as.matrix(validDF[1L:10L, "feat1"], drop = FALSE) -# , label = validDF[1L:10L, "target"] -# ) -# nrounds <- 10L -# train_params <- list( -# objective = "regression" -# , metric = "rmse" -# , learning_rate = 1.5 -# ) - -# # example 1: two valids, neither are the training data -# bst <- lgb.train( -# data = dtrain -# , nrounds = nrounds -# , num_leaves = 5L -# , valids = list( -# "valid1" = dvalid1 -# , "valid2" = dvalid2 -# ) -# , params = train_params -# ) -# expect_named( -# bst$record_evals -# , c("start_iter", "valid1", "valid2") -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) -# expect_length(rmse_scores, nrounds) -# expect_identical(bst$best_iter, which.min(rmse_scores)) -# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - -# # example 2: train first (called "train") and two valids -# bst <- lgb.train( -# data = dtrain -# , nrounds = nrounds -# , num_leaves = 5L -# , valids = list( -# "train" = dtrain -# , "valid1" = dvalid1 -# , "valid2" = dvalid2 -# ) -# , params = train_params -# ) -# expect_named( -# bst$record_evals -# , c("start_iter", "train", "valid1", "valid2") -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) -# expect_length(rmse_scores, nrounds) -# expect_identical(bst$best_iter, which.min(rmse_scores)) -# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - -# # example 3: train second (called "train") and two valids -# bst <- lgb.train( -# data = dtrain -# , nrounds = nrounds -# , num_leaves = 5L -# , valids = list( -# "valid1" = dvalid1 -# , "train" = dtrain -# , "valid2" = dvalid2 -# ) -# , params = train_params -# ) -# # note that "train" still ends up as the first one -# expect_named( -# bst$record_evals -# , c("start_iter", "train", "valid1", "valid2") -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) -# expect_length(rmse_scores, nrounds) -# expect_identical(bst$best_iter, which.min(rmse_scores)) -# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - -# # example 4: train third (called "train") and two valids -# bst <- lgb.train( -# data = dtrain -# , nrounds = nrounds -# , num_leaves = 5L -# , valids = list( -# "valid1" = dvalid1 -# , "valid2" = dvalid2 -# , "train" = dtrain -# ) -# , params = train_params -# ) -# # note that "train" still ends up as the first one -# expect_named( -# bst$record_evals -# , c("start_iter", "train", "valid1", "valid2") -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) -# expect_length(rmse_scores, nrounds) -# expect_identical(bst$best_iter, which.min(rmse_scores)) -# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - -# # example 5: train second (called "something-random-we-would-not-hardcode") and two valids -# bst <- lgb.train( -# 
data = dtrain -# , nrounds = nrounds -# , num_leaves = 5L -# , valids = list( -# "valid1" = dvalid1 -# , "something-random-we-would-not-hardcode" = dtrain -# , "valid2" = dvalid2 -# ) -# , params = train_params -# ) -# # note that "something-random-we-would-not-hardcode" was recognized as the training -# # data even though it isn't named "train" -# expect_named( -# bst$record_evals -# , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) -# expect_length(rmse_scores, nrounds) -# expect_identical(bst$best_iter, which.min(rmse_scores)) -# expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) - -# # example 6: the only valid supplied is the training data -# bst <- lgb.train( -# data = dtrain -# , nrounds = nrounds -# , num_leaves = 5L -# , valids = list( -# "train" = dtrain -# ) -# , params = train_params -# ) -# expect_identical(bst$best_iter, -1L) -# expect_identical(bst$best_score, NA_real_) -# }) - -# test_that("lightgbm.train() gives the correct best_score and best_iter for a metric where higher values are better", { -# set.seed(708L) -# trainDF <- data.frame( -# "feat1" = runif(n = 500L, min = 0.0, max = 15.0) -# , "target" = rep(c(0L, 1L), 500L) -# ) -# validDF <- data.frame( -# "feat1" = runif(n = 50L, min = 0.0, max = 15.0) -# , "target" = rep(c(0L, 1L), 50L) -# ) -# dtrain <- lgb.Dataset( -# data = as.matrix(trainDF[["feat1"]], drop = FALSE) -# , label = trainDF[["target"]] -# ) -# dvalid1 <- lgb.Dataset( -# data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) -# , label = validDF[1L:25L, "target"] -# ) -# nrounds <- 10L -# bst <- lgb.train( -# data = dtrain -# , nrounds = nrounds -# , num_leaves = 5L -# , valids = list( -# "valid1" = dvalid1 -# , "something-random-we-would-not-hardcode" = dtrain -# ) -# , params = list( -# objective = "binary" -# , metric = "auc" -# , learning_rate = 1.5 -# ) -# ) -# # note that "something-random-we-would-not-hardcode" was recognized as the training -# # data even though it isn't named "train" -# expect_named( -# bst$record_evals -# , c("start_iter", "something-random-we-would-not-hardcode", "valid1") -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) -# expect_length(auc_scores, nrounds) -# expect_identical(bst$best_iter, which.max(auc_scores)) -# expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) -# }) - -# test_that("using lightgbm() without early stopping, best_iter and best_score come from valids and not training data", { -# set.seed(708L) -# # example: train second (called "something-random-we-would-not-hardcode"), two valids, -# # and a metric where higher values are better ("auc") -# trainDF <- data.frame( -# "feat1" = runif(n = 500L, min = 0.0, max = 15.0) -# , "target" = rep(c(0L, 1L), 500L) -# ) -# validDF <- data.frame( -# "feat1" = runif(n = 50L, min = 0.0, max = 15.0) -# , "target" = rep(c(0L, 1L), 50L) -# ) -# dtrain <- lgb.Dataset( -# data = as.matrix(trainDF[["feat1"]], drop = FALSE) -# , label = trainDF[["target"]] -# ) -# dvalid1 <- lgb.Dataset( -# data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) -# , label = validDF[1L:25L, "target"] -# ) -# dvalid2 <- lgb.Dataset( -# data = as.matrix(validDF[26L:50L, "feat1"], drop = FALSE) -# , label = validDF[26L:50L, "target"] -# ) -# nrounds <- 10L -# bst <- lightgbm( -# data = dtrain -# , nrounds = nrounds -# , 
num_leaves = 5L -# , valids = list( -# "valid1" = dvalid1 -# , "something-random-we-would-not-hardcode" = dtrain -# , "valid2" = dvalid2 -# ) -# , params = list( -# objective = "binary" -# , metric = "auc" -# , learning_rate = 1.5 -# ) -# , verbose = -7L -# , save_name = tempfile(fileext = ".model") -# ) -# # when verbose <= 0 is passed to lightgbm(), 'valids' is passed through to lgb.train() -# # untouched. If you set verbose to > 0, the training data will still be first but called "train" -# expect_named( -# bst$record_evals -# , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") -# , ignore.order = FALSE -# , ignore.case = FALSE -# ) -# auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) -# expect_length(auc_scores, nrounds) -# expect_identical(bst$best_iter, which.max(auc_scores)) -# expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) -# }) - -# test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings", { -# set.seed(708L) -# nrounds <- 10L -# nfolds <- 4L -# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) -# bst <- lgb.cv( -# params = list( -# objective = "binary" -# , metric = "binary_error" -# ) -# , data = DTRAIN_RANDOM_CLASSIFICATION -# , nrounds = nrounds -# , nfold = nfolds -# , eval = "binary_logloss" -# ) - -# # both metrics should have been used -# expect_named( -# bst$record_evals[["valid"]] -# , expected = c("binary_error", "binary_logloss") -# , ignore.order = TRUE -# , ignore.case = FALSE -# ) - -# # the difference metrics shouldn't have been mixed up with each other -# results <- bst$record_evals[["valid"]] -# expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE) -# expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE) - -# # all boosters should have been created -# expect_length(bst$boosters, nfolds) -# }) - -# test_that("lgb.cv() works when you give a function for eval", { -# set.seed(708L) -# nrounds <- 10L -# nfolds <- 3L -# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) -# bst <- lgb.cv( -# params = list( -# objective = "binary" -# , metric = "None" -# ) -# , data = DTRAIN_RANDOM_CLASSIFICATION -# , nfold = nfolds -# , nrounds = nrounds -# , eval = .constant_metric -# ) - -# # the difference metrics shouldn't have been mixed up with each other -# results <- bst$record_evals[["valid"]] -# expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) -# expect_named(results, "constant_metric") -# }) - -# test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on only the first metric", { -# set.seed(708L) -# nrounds <- 10L -# nfolds <- 5L -# early_stopping_rounds <- 3L -# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) -# bst <- lgb.cv( -# params = list( -# objective = "regression" -# , metric = "None" -# , early_stopping_rounds = early_stopping_rounds -# , first_metric_only = TRUE -# ) -# , data = DTRAIN_RANDOM_REGRESSION -# , nfold = nfolds -# , nrounds = nrounds -# , valids = list( -# "valid1" = DVALID_RANDOM_REGRESSION -# ) -# , eval = list( -# .increasing_metric -# , .constant_metric -# ) -# ) - -# # Only the two functions provided to "eval" should have been evaluated -# expect_named(bst$record_evals[["valid"]], c("increasing_metric", "constant_metric")) - -# # all 10 iterations should happen, and the best_iter should be the final one -# expect_equal(bst$best_iter, 
nrounds) - -# # best_score should be taken from "increasing_metric" -# # -# # this expected value looks magical and confusing, but it's because -# # evaluation metrics are averaged over all folds. -# # -# # consider 5-fold CV with a metric that adds 0.1 to a global accumulator -# # each time it's called -# # -# # * iter 1: [0.1, 0.2, 0.3, 0.4, 0.5] (mean = 0.3) -# # * iter 2: [0.6, 0.7, 0.8, 0.9, 1.0] (mean = 1.3) -# # * iter 3: [1.1, 1.2, 1.3, 1.4, 1.5] (mean = 1.8) -# # -# cv_value <- increasing_metric_starting_value + mean(seq_len(nfolds) / 10.0) + (nrounds - 1L) * 0.1 * nfolds -# expect_equal(bst$best_score, cv_value) - -# # early stopping should not have happened. Even though constant_metric -# # had 9 consecutive iterations with no improvement, it is ignored because of -# # first_metric_only = TRUE -# expect_equal( -# length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]]) -# , nrounds -# ) -# expect_equal( -# length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) -# , nrounds -# ) -# }) - -# test_that("early stopping works with lgb.cv()", { -# set.seed(708L) -# nrounds <- 10L -# nfolds <- 5L -# early_stopping_rounds <- 3L -# increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) -# bst <- lgb.cv( -# params = list( -# objective = "regression" -# , metric = "None" -# , early_stopping_rounds = early_stopping_rounds -# , first_metric_only = TRUE -# ) -# , data = DTRAIN_RANDOM_REGRESSION -# , nfold = nfolds -# , nrounds = nrounds -# , valids = list( -# "valid1" = DVALID_RANDOM_REGRESSION -# ) -# , eval = list( -# .constant_metric -# , .increasing_metric -# ) -# ) - -# # only the two functions provided to "eval" should have been evaluated -# expect_named(bst$record_evals[["valid"]], c("constant_metric", "increasing_metric")) - -# # best_iter should be based on the first metric. 
Since constant_metric -# # never changes, its first iteration was the best oone -# expect_equal(bst$best_iter, 1L) - -# # best_score should be taken from the first metri -# expect_equal(bst$best_score, 0.2) - -# # early stopping should have happened, since constant_metric was the first -# # one passed to eval and it will not improve over consecutive iterations -# # -# # note that this test is identical to the previous one, but with the -# # order of the eval metrics switched -# expect_equal( -# length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]]) -# , early_stopping_rounds + 1L -# ) -# expect_equal( -# length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) -# , early_stopping_rounds + 1L -# ) -# }) - -# context("interaction constraints") - -# test_that("lgb.train() throws an informative error if interaction_constraints is not a list", { -# dtrain <- lgb.Dataset(train$data, label = train$label) -# params <- list(objective = "regression", interaction_constraints = "[1,2],[3]") -# # expect_error({ -# # bst <- lightgbm( -# # data = dtrain -# # , params = params -# # , nrounds = 2L -# # ) -# # }, "interaction_constraints must be a list") -# }) - -# test_that(paste0("lgb.train() throws an informative error if the members of interaction_constraints ", -# "are not character or numeric vectors"), { -# dtrain <- lgb.Dataset(train$data, label = train$label) -# params <- list(objective = "regression", interaction_constraints = list(list(1L, 2L), list(3L))) -# # expect_error({ -# # bst <- lightgbm( -# # data = dtrain -# # , params = params -# # , nrounds = 2L -# # ) -# # }, "every element in interaction_constraints must be a character vector or numeric vector") -# }) - -# test_that("lgb.train() throws an informative error if interaction_constraints contains a too large index", { -# dtrain <- lgb.Dataset(train$data, label = train$label) -# params <- list(objective = "regression", -# interaction_constraints = list(c(1L, length(colnames(train$data)) + 1L), 3L)) -# # expect_error({ -# # bst <- lightgbm( -# # data = dtrain -# # , params = params -# # , nrounds = 2L -# # ) -# # }, "supplied a too large value in interaction_constraints") -# }) - -# test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ", -# "character vectors, numeric vectors, or a combination"), { -# set.seed(1L) -# dtrain <- lgb.Dataset(train$data, label = train$label) - -# params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) -# bst <- lightgbm( -# data = dtrain -# , params = params -# , nrounds = 2L -# ) -# pred1 <- bst$predict(test$data) - -# cnames <- colnames(train$data) -# params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]])) -# bst <- lightgbm( -# data = dtrain -# , params = params -# , nrounds = 2L -# ) -# pred2 <- bst$predict(test$data) - -# params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L)) -# bst <- lightgbm( -# data = dtrain -# , params = params -# , nrounds = 2L -# ) -# pred3 <- bst$predict(test$data) - -# expect_equal(pred1, pred2) -# expect_equal(pred2, pred3) - -# }) - -# test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), { -# set.seed(1L) -# dtrain <- lgb.Dataset(train$data, label = train$label) - -# params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) -# bst <- lightgbm( -# data = dtrain -# , params = 
params -# , nrounds = 2L -# ) -# pred1 <- bst$predict(test$data) - -# new_colnames <- paste0(colnames(train$data), "_x") -# params <- list(objective = "regression" -# , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])) -# bst <- lightgbm( -# data = dtrain -# , params = params -# , nrounds = 2L -# , colnames = new_colnames -# ) -# pred2 <- bst$predict(test$data) - -# expect_equal(pred1, pred2) - -# }) +context("lightgbm()") + +data(agaricus.train, package = "lightgbm") +data(agaricus.test, package = "lightgbm") +train <- agaricus.train +test <- agaricus.test + +TOLERANCE <- 1e-6 +set.seed(708L) + +# [description] Every time this function is called, it adds 0.1 +# to an accumulator then returns the current value. +# This is used to mock the situation where an evaluation +# metric increases every iteration +ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR" +assign(x = ACCUMULATOR_NAME, value = 0.0, envir = .GlobalEnv) + +.increasing_metric <- function(preds, dtrain) { + if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)) { + assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) + } + assign( + x = ACCUMULATOR_NAME + , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) + 0.1 + , envir = .GlobalEnv + ) + return(list( + name = "increasing_metric" + , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) + , higher_better = TRUE + )) +} + +# [description] Evaluation function that always returns the +# same value +CONSTANT_METRIC_VALUE <- 0.2 +.constant_metric <- function(preds, dtrain) { + return(list( + name = "constant_metric" + , value = CONSTANT_METRIC_VALUE + , higher_better = FALSE + )) +} + +# sample datasets to test early stopping +DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( + data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) + , label = rnorm(100L) +) +DVALID_RANDOM_REGRESSION <- lgb.Dataset( + data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) + , label = rnorm(50L) +) +DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( + data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) + , label = sample(c(0L, 1L), size = 120L, replace = TRUE) +) +DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( + data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) + , label = sample(c(0L, 1L), size = 37L, replace = TRUE) +) + +test_that("train and predict binary classification", { + nrounds <- 10L + bst <- lightgbm( + data = train$data + , label = train$label + , num_leaves = 5L + , nrounds = nrounds + , objective = "binary" + , metric = "binary_error" + , save_name = tempfile(fileext = ".model") + ) + expect_false(is.null(bst$record_evals)) + record_results <- lgb.get.eval.result(bst, "train", "binary_error") + expect_lt(min(record_results), 0.02) + + pred <- predict(bst, test$data) + expect_equal(length(pred), 1611L) + + pred1 <- predict(bst, train$data, num_iteration = 1L) + expect_equal(length(pred1), 6513L) + err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) + err_log <- record_results[1L] + expect_lt(abs(err_pred1 - err_log), TOLERANCE) +}) + + +test_that("train and predict softmax", { + set.seed(708L) + lb <- as.numeric(iris$Species) - 1L + + bst <- lightgbm( + data = as.matrix(iris[, -5L]) + , label = lb + , num_leaves = 4L + , learning_rate = 0.05 + , nrounds = 20L + , min_data = 20L + , min_hessian = 10.0 + , objective = "multiclass" + , metric = "multi_error" + , num_class = 3L + , save_name = tempfile(fileext = ".model") + ) + + expect_false(is.null(bst$record_evals)) + record_results <- lgb.get.eval.result(bst, "train", "multi_error") + 
expect_lt(min(record_results), 0.06) + + pred <- predict(bst, as.matrix(iris[, -5L])) + expect_equal(length(pred), nrow(iris) * 3L) +}) + + +test_that("use of multiple eval metrics works", { + metrics <- list("binary_error", "auc", "binary_logloss") + bst <- lightgbm( + data = train$data + , label = train$label + , num_leaves = 4L + , learning_rate = 1.0 + , nrounds = 10L + , objective = "binary" + , metric = metrics + , save_name = tempfile(fileext = ".model") + ) + expect_false(is.null(bst$record_evals)) + expect_named( + bst$record_evals[["train"]] + , unlist(metrics) + , ignore.order = FALSE + , ignore.case = FALSE + ) +}) + +test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for binary classification", { + set.seed(708L) + nrounds <- 10L + bst <- lightgbm( + data = train$data + , label = train$label + , num_leaves = 5L + , nrounds = nrounds + , objective = "binary" + , metric = "binary_error" + , save_name = tempfile(fileext = ".model") + ) + expect_true(abs(bst$lower_bound() - -1.590853) < TOLERANCE) + expect_true(abs(bst$upper_bound() - 1.871015) < TOLERANCE) +}) + +test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for regression", { + set.seed(708L) + nrounds <- 10L + bst <- lightgbm( + data = train$data + , label = train$label + , num_leaves = 5L + , nrounds = nrounds + , objective = "regression" + , metric = "l2" + , save_name = tempfile(fileext = ".model") + ) + expect_true(abs(bst$lower_bound() - 0.1513859) < TOLERANCE) + expect_true(abs(bst$upper_bound() - 0.9080349) < TOLERANCE) +}) + +test_that("lightgbm() rejects negative or 0 value passed to nrounds", { + dtrain <- lgb.Dataset(train$data, label = train$label) + params <- list(objective = "regression", metric = "l2,l1") + for (nround_value in c(-10L, 0L)) { + # expect_error({ + # bst <- lightgbm( + # data = dtrain + # , params = params + # , nrounds = nround_value + # , save_name = tempfile(fileext = ".model") + # ) + # }, "nrounds should be greater than zero") + } +}) + +test_that("lightgbm() performs evaluation on validation sets if they are provided", { + set.seed(708L) + dvalid1 <- lgb.Dataset( + data = train$data + , label = train$label + ) + dvalid2 <- lgb.Dataset( + data = train$data + , label = train$label + ) + nrounds <- 10L + bst <- lightgbm( + data = train$data + , label = train$label + , num_leaves = 5L + , nrounds = nrounds + , objective = "binary" + , metric = c( + "binary_error" + , "auc" + ) + , valids = list( + "valid1" = dvalid1 + , "valid2" = dvalid2 + ) + , save_name = tempfile(fileext = ".model") + ) + + expect_named( + bst$record_evals + , c("train", "valid1", "valid2", "start_iter") + , ignore.order = TRUE + , ignore.case = FALSE + ) + for (valid_name in c("train", "valid1", "valid2")) { + eval_results <- bst$record_evals[[valid_name]][["binary_error"]] + expect_length(eval_results[["eval"]], nrounds) + } + expect_true(abs(bst$record_evals[["train"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) + expect_true(abs(bst$record_evals[["valid1"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) + expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < TOLERANCE) +}) + + +context("training continuation") + +test_that("training continuation works", { + dtrain <- lgb.Dataset( + train$data + , label = train$label + , free_raw_data = FALSE + ) + watchlist <- list(train = dtrain) + param <- list( + objective = "binary" + , metric = "binary_logloss" + , num_leaves = 5L + , 
learning_rate = 1.0 + ) + + # train for 10 consecutive iterations + bst <- lgb.train(param, dtrain, nrounds = 10L, watchlist) + err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10L) + + # train for 5 iterations, save, load, train for 5 more + bst1 <- lgb.train(param, dtrain, nrounds = 5L, watchlist) + model_file <- tempfile(fileext = ".model") + lgb.save(bst1, model_file) + bst2 <- lgb.train(param, dtrain, nrounds = 5L, watchlist, init_model = bst1) + err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10L) + + # evaluation metrics should be nearly identical for the model trained in 10 consecutive + # iterations and the one trained in 5-then-5. + expect_lt(abs(err_bst - err_bst2), 0.01) +}) + +context("lgb.cv()") + +test_that("cv works", { + dtrain <- lgb.Dataset(train$data, label = train$label) + params <- list(objective = "regression", metric = "l2,l1") + bst <- lgb.cv( + params + , dtrain + , 10L + , nfold = 5L + , min_data = 1L + , learning_rate = 1.0 + , early_stopping_rounds = 10L + ) + expect_false(is.null(bst$record_evals)) +}) + +test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { + dtrain <- lgb.Dataset(train$data, label = train$label) + params <- list(objective = "regression", metric = "l2,l1") + for (nround_value in c(-10L, 0L)) { + # expect_error({ + # bst <- lgb.cv( + # params + # , dtrain + # , nround_value + # , nfold = 5L + # , min_data = 1L + # ) + # }, "nrounds should be greater than zero") + } +}) + +test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset and labels are not given", { + bad_values <- list( + 4L + , "hello" + , list(a = TRUE, b = seq_len(10L)) + , data.frame(x = seq_len(5L), y = seq_len(5L)) + , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) + , matrix(data = seq_len(10L), 2L, 5L) + ) + for (val in bad_values) { + # expect_error({ + # bst <- lgb.cv( + # params = list(objective = "regression", metric = "l2,l1") + # , data = val + # , 10L + # , nfold = 5L + # , min_data = 1L + # ) + # }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) + } +}) + +test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric where higher values are better", { + set.seed(708L) + dtrain <- lgb.Dataset( + data = as.matrix(runif(n = 500L, min = 0.0, max = 15.0), drop = FALSE) + , label = rep(c(0L, 1L), 250L) + ) + nrounds <- 10L + cv_bst <- lgb.cv( + data = dtrain + , nfold = 5L + , nrounds = nrounds + , num_leaves = 5L + , params = list( + objective = "binary" + , metric = "auc,binary_error" + , learning_rate = 1.5 + ) + ) + expect_is(cv_bst, "lgb.CVBooster") + expect_named( + cv_bst$record_evals + , c("start_iter", "valid") + , ignore.order = FALSE + , ignore.case = FALSE + ) + auc_scores <- unlist(cv_bst$record_evals[["valid"]][["auc"]][["eval"]]) + expect_length(auc_scores, nrounds) + expect_identical(cv_bst$best_iter, which.max(auc_scores)) + expect_identical(cv_bst$best_score, auc_scores[which.max(auc_scores)]) +}) + +context("lgb.train()") + +test_that("lgb.train() works as expected with multiple eval metrics", { + metrics <- c("binary_error", "auc", "binary_logloss") + bst <- lgb.train( + data = lgb.Dataset( + train$data + , label = train$label + ) + , learning_rate = 1.0 + , nrounds = 10L + , params = list( + objective = "binary" + , metric = metrics + ) + , valids = list( + "train" = lgb.Dataset( + train$data + , label = train$label + ) + ) + ) + expect_false(is.null(bst$record_evals)) + expect_named( +
bst$record_evals[["train"]] + , unlist(metrics) + , ignore.order = FALSE + , ignore.case = FALSE + ) +}) + +test_that("lgb.train() rejects negative or 0 value passed to nrounds", { + dtrain <- lgb.Dataset(train$data, label = train$label) + params <- list(objective = "regression", metric = "l2,l1") + for (nround_value in c(-10L, 0L)) { + # expect_error({ + # bst <- lgb.train( + # params + # , dtrain + # , nround_value + # ) + # }, "nrounds should be greater than zero") + } +}) + +test_that("lgb.train() throws an informative error if 'data' is not an lgb.Dataset", { + bad_values <- list( + 4L + , "hello" + , list(a = TRUE, b = seq_len(10L)) + , data.frame(x = seq_len(5L), y = seq_len(5L)) + , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) + , matrix(data = seq_len(10L), 2L, 5L) + ) + for (val in bad_values) { + # expect_error({ + # bst <- lgb.train( + # params = list(objective = "regression", metric = "l2,l1") + # , data = val + # , 10L + # ) + # }, regexp = "data must be an lgb.Dataset instance", fixed = TRUE) + } +}) + +test_that("lgb.train() throws an informative error if 'valids' is not a list of lgb.Dataset objects", { + valids <- list( + "valid1" = data.frame(x = rnorm(5L), y = rnorm(5L)) + , "valid2" = data.frame(x = rnorm(5L), y = rnorm(5L)) + ) + # expect_error({ + # bst <- lgb.train( + # params = list(objective = "regression", metric = "l2,l1") + # , data = lgb.Dataset(train$data, label = train$label) + # , 10L + # , valids = valids + # ) + # }, regexp = "valids must be a list of lgb.Dataset elements") +}) + +test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but some do not have names", { + valids <- list( + "valid1" = lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) + , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) + ) + # expect_error({ + # bst <- lgb.train( + # params = list(objective = "regression", metric = "l2,l1") + # , data = lgb.Dataset(train$data, label = train$label) + # , 10L + # , valids = valids + # ) + # }, regexp = "each element of valids must have a name") +}) + +test_that("lgb.train() throws an informative error if 'valids' contains lgb.Dataset objects but none have names", { + valids <- list( + lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) + , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) + ) + # expect_error({ + # bst <- lgb.train( + # params = list(objective = "regression", metric = "l2,l1") + # , data = lgb.Dataset(train$data, label = train$label) + # , 10L + # , valids = valids + # ) + # }, regexp = "each element of valids must have a name") +}) + +test_that("lgb.train() works with force_col_wise and force_row_wise", { + set.seed(1234L) + nrounds <- 10L + dtrain <- lgb.Dataset( + train$data + , label = train$label + ) + params <- list( + objective = "binary" + , metric = "binary_error" + , force_col_wise = TRUE + ) + bst_col_wise <- lgb.train( + params = params + , data = dtrain + , nrounds = nrounds + ) + + params <- list( + objective = "binary" + , metric = "binary_error" + , force_row_wise = TRUE + ) + bst_row_wise <- lgb.train( + params = params + , data = dtrain + , nrounds = nrounds + ) + + expected_error <- 0.003070782 + expect_equal(bst_col_wise$eval_train()[[1L]][["value"]], expected_error) + expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error) + + # check some basic details of the boosters just to be sure force_col_wise + # and force_row_wise are not causing any weird side effects + for (bst in list(bst_row_wise, bst_col_wise)) { + expect_equal(bst$current_iter(), nrounds) + parsed_model <- 
jsonlite::fromJSON(bst$dump_model()) + expect_equal(parsed_model$objective, "binary sigmoid:1") + expect_false(parsed_model$average_output) + } +}) + +test_that("lgb.train() works as expected with sparse features", { + set.seed(708L) + num_obs <- 70000L + trainDF <- data.frame( + y = sample(c(0L, 1L), size = num_obs, replace = TRUE) + , x = sample(c(1.0:10.0, rep(NA_real_, 50L)), size = num_obs, replace = TRUE) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["x"]], drop = FALSE) + , label = trainDF[["y"]] + ) + nrounds <- 1L + bst <- lgb.train( + params = list( + objective = "binary" + , min_data = 1L + , min_data_in_bin = 1L + ) + , data = dtrain + , nrounds = nrounds + ) + + expect_true(lgb.is.Booster(bst)) + expect_equal(bst$current_iter(), nrounds) + parsed_model <- jsonlite::fromJSON(bst$dump_model()) + expect_equal(parsed_model$objective, "binary sigmoid:1") + expect_false(parsed_model$average_output) + expected_error <- 0.6931268 + expect_true(abs(bst$eval_train()[[1L]][["value"]] - expected_error) < TOLERANCE) +}) + +test_that("lgb.train() works with early stopping for classification", { + trainDF <- data.frame( + "feat1" = rep(c(5.0, 10.0), 500L) + , "target" = rep(c(0L, 1L), 500L) + ) + validDF <- data.frame( + "feat1" = rep(c(5.0, 10.0), 50L) + , "target" = rep(c(0L, 1L), 50L) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid <- lgb.Dataset( + data = as.matrix(validDF[["feat1"]], drop = FALSE) + , label = validDF[["target"]] + ) + nrounds <- 10L + + ################################ + # train with no early stopping # + ################################ + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "binary_error" + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + ) + + # a perfect model should be trivial to obtain, but all 10 rounds + # should happen + expect_equal(bst$best_score, 0.0) + expect_equal(bst$best_iter, 1L) + expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) + + ############################# + # train with early stopping # + ############################# + early_stopping_rounds <- 5L + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "binary_error" + , early_stopping_rounds = early_stopping_rounds + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + ) + + # a perfect model should be trivial to obtain, and only 6 rounds + # should have happened (1 with improvement, 5 consecutive with no improvement) + expect_equal(bst$best_score, 0.0) + expect_equal(bst$best_iter, 1L) + expect_equal( + length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]) + , early_stopping_rounds + 1L + ) + +}) + +test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stopping", { + set.seed(708L) + trainDF <- data.frame( + "feat1" = rep(c(5.0, 10.0), 500L) + , "target" = rep(c(0L, 1L), 500L) + ) + validDF <- data.frame( + "feat1" = rep(c(5.0, 10.0), 50L) + , "target" = rep(c(0L, 1L), 50L) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid <- lgb.Dataset( + data = as.matrix(validDF[["feat1"]], drop = FALSE) + , label = validDF[["target"]] + ) + nrounds <- 5L + + for (value in c(-5L, 0L)) { + + #----------------------------# + # passed as keyword argument # + #----------------------------# + bst <- lgb.train( + params = list( + objective = "binary" + , metric
= "binary_error" + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + , early_stopping_rounds = value + ) + + # a perfect model should be trivial to obtain, but all 10 rounds + # should happen + expect_equal(bst$best_score, 0.0) + expect_equal(bst$best_iter, 1L) + expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) + + #---------------------------# + # passed as parameter alias # + #---------------------------# + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "binary_error" + , n_iter_no_change = value + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + ) + + # a perfect model should be trivial to obtain, but all 10 rounds + # should happen + expect_equal(bst$best_score, 0.0) + expect_equal(bst$best_iter, 1L) + expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) + } +}) + +test_that("lgb.train() works with early stopping for classification with a metric that should be maximized", { + set.seed(708L) + dtrain <- lgb.Dataset( + data = train$data + , label = train$label + ) + dvalid <- lgb.Dataset( + data = test$data + , label = test$label + ) + nrounds <- 10L + + ############################# + # train with early stopping # + ############################# + early_stopping_rounds <- 5L + # the harsh max_depth guarantees that AUC improves over at least the first few iterations + bst_auc <- lgb.train( + params = list( + objective = "binary" + , metric = "auc" + , max_depth = 3L + , early_stopping_rounds = early_stopping_rounds + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + ) + bst_binary_error <- lgb.train( + params = list( + objective = "binary" + , metric = "binary_error" + , max_depth = 3L + , early_stopping_rounds = early_stopping_rounds + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + ) + + # early stopping should have been hit for binary_error (higher_better = FALSE) + eval_info <- bst_binary_error$.__enclos_env__$private$get_eval_info() + expect_identical(eval_info, "binary_error") + expect_identical( + unname(bst_binary_error$.__enclos_env__$private$higher_better_inner_eval) + , FALSE + ) + expect_identical(bst_binary_error$best_iter, 1L) + expect_identical(bst_binary_error$current_iter(), early_stopping_rounds + 1L) + expect_true(abs(bst_binary_error$best_score - 0.01613904) < TOLERANCE) + + # early stopping should not have been hit for AUC (higher_better = TRUE) + eval_info <- bst_auc$.__enclos_env__$private$get_eval_info() + expect_identical(eval_info, "auc") + expect_identical( + unname(bst_auc$.__enclos_env__$private$higher_better_inner_eval) + , TRUE + ) + expect_identical(bst_auc$best_iter, 9L) + expect_identical(bst_auc$current_iter(), nrounds) + expect_true(abs(bst_auc$best_score - 0.9999969) < TOLERANCE) +}) + +test_that("lgb.train() works with early stopping for regression", { + set.seed(708L) + trainDF <- data.frame( + "feat1" = rep(c(10.0, 100.0), 500L) + , "target" = rep(c(-50.0, 50.0), 500L) + ) + validDF <- data.frame( + "feat1" = rep(50.0, 4L) + , "target" = rep(50.0, 4L) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid <- lgb.Dataset( + data = as.matrix(validDF[["feat1"]], drop = FALSE) + , label = validDF[["target"]] + ) + nrounds <- 10L + + ################################ + # train with no early stopping # + ################################ + 
bst <- lgb.train( + params = list( + objective = "regression" + , metric = "rmse" + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + ) + + # the best possible model should come from the first iteration, but + # all 10 training iterations should happen + expect_equal(bst$best_score, 55.0) + expect_equal(bst$best_iter, 1L) + expect_equal(length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]), nrounds) + + ############################# + # train with early stopping # + ############################# + early_stopping_rounds <- 5L + bst <- lgb.train( + params = list( + objective = "regression" + , metric = "rmse" + , early_stopping_rounds = early_stopping_rounds + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + ) + + # the best model should be from the first iteration, and only 6 rounds + # should have happened (1 with improvement, 5 consecutive with no improvement) + expect_equal(bst$best_score, 55.0) + expect_equal(bst$best_iter, 1L) + expect_equal( + length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + , early_stopping_rounds + 1L + ) +}) + +test_that("lgb.train() does not stop early if early_stopping_rounds is not given", { + set.seed(708L) + + increasing_metric_starting_value <- get( + ACCUMULATOR_NAME + , envir = .GlobalEnv + ) + nrounds <- 10L + metrics <- list( + .constant_metric + , .increasing_metric + ) + bst <- lgb.train( + params = list( + objective = "regression" + , metric = "None" + ) + , data = DTRAIN_RANDOM_REGRESSION + , nrounds = nrounds + , valids = list("valid1" = DVALID_RANDOM_REGRESSION) + , eval = metrics + ) + + # Only the two functions provided to "eval" should have been evaluated + expect_equal(length(bst$record_evals[["valid1"]]), 2L) + + # all 10 iterations should have happened, and the best_iter should be + # the first one (based on constant_metric) + best_iter <- 1L + expect_equal(bst$best_iter, best_iter) + + # best_score should be taken from the first metric + expect_equal( + bst$best_score + , bst$record_evals[["valid1"]][["constant_metric"]][["eval"]][[best_iter]] + ) + + # early stopping should not have happened.
Even though constant_metric + # had 9 consecutive iterations with no improvement, it is ignored because of + # first_metric_only = TRUE + expect_equal( + length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) + , nrounds + ) + expect_equal( + length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) + , nrounds + ) +}) + +test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to stop early based on all metrics", { + set.seed(708L) + + early_stopping_rounds <- 3L + param_variations <- list( + list( + objective = "regression" + , metric = "None" + , early_stopping_rounds = early_stopping_rounds + ) + , list( + objective = "regression" + , metric = "None" + , early_stopping_rounds = early_stopping_rounds + , first_metric_only = FALSE + ) + ) + + for (params in param_variations) { + + nrounds <- 10L + bst <- lgb.train( + params = params + , data = DTRAIN_RANDOM_REGRESSION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_REGRESSION + ) + , eval = list( + .increasing_metric + , .constant_metric + ) + ) + + # Only the two functions provided to "eval" should have been evaluated + expect_equal(length(bst$record_evals[["valid1"]]), 2L) + + # early stopping should have happened, and should have stopped early_stopping_rounds + 1 rounds in + # because constant_metric never improves + # + # the best iteration should be the last one, because increasing_metric was first + # and gets better every iteration + best_iter <- early_stopping_rounds + 1L + expect_equal(bst$best_iter, best_iter) + + # best_score should be taken from "increasing_metric" because it was first + expect_equal( + bst$best_score + , bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]][[best_iter]] + ) + + # early stopping should not have happened. even though increasing_metric kept + # getting better, early stopping should have happened because "constant_metric" + # did not improve + expect_equal( + length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) + , early_stopping_rounds + 1L + ) + expect_equal( + length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) + , early_stopping_rounds + 1L + ) + } + +}) + +test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based on only the first metric", { + set.seed(708L) + nrounds <- 10L + early_stopping_rounds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "regression" + , metric = "None" + , early_stopping_rounds = early_stopping_rounds + , first_metric_only = TRUE + ) + , data = DTRAIN_RANDOM_REGRESSION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_REGRESSION + ) + , eval = list( + .increasing_metric + , .constant_metric + ) + ) + + # Only the two functions provided to "eval" should have been evaluated + expect_equal(length(bst$record_evals[["valid1"]]), 2L) + + # all 10 iterations should happen, and the best_iter should be the final one + expect_equal(bst$best_iter, nrounds) + + # best_score should be taken from "increasing_metric" + expect_equal( + bst$best_score + , increasing_metric_starting_value + 0.1 * nrounds + ) + + # early stopping should not have happened. 
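# [editor's illustration -- not part of the patch above] A minimal sketch of the
# arithmetic behind the expect_equal() on best_score just above: .increasing_metric()
# (defined near the top of this file) adds 0.1 to a global accumulator on every
# evaluation, and with a single validation set it is evaluated exactly once per
# boosting round, so after `nrounds` rounds the recorded value is the starting
# value plus 0.1 * nrounds. The names below are assumptions for illustration only,
# not LightGBM API.
acc <- 0.0
evaluate_once <- function() {
  acc <<- acc + 0.1  # one evaluation per iteration on the single validation set
  acc
}
scores <- vapply(seq_len(10L), function(i) evaluate_once(), numeric(1L))
# with a starting value of 0.0 and nrounds = 10, the last (and largest) recorded
# value is 0.0 + 0.1 * 10 = 1.0
stopifnot(abs(scores[10L] - 1.0) < 1e-12)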
Even though constant_metric + # had 9 consecutive iterations with no improvement, it is ignored because of + # first_metric_only = TRUE + expect_equal( + length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) + , nrounds + ) + expect_equal( + length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) + , nrounds + ) +}) + +test_that("lgb.train() works when a mixture of functions and strings are passed to eval", { + set.seed(708L) + nrounds <- 10L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "regression" + , metric = "None" + ) + , data = DTRAIN_RANDOM_REGRESSION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_REGRESSION + ) + , eval = list( + .increasing_metric + , "rmse" + , .constant_metric + , "l2" + ) + ) + + # all 4 metrics should have been used + expect_named( + bst$record_evals[["valid1"]] + , expected = c("rmse", "l2", "increasing_metric", "constant_metric") + , ignore.order = TRUE + , ignore.case = FALSE + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE) + expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE) + expected_increasing_metric <- increasing_metric_starting_value + 0.1 + expect_true( + abs( + results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric + ) < TOLERANCE + ) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + +}) + +test_that("lgb.train() works when a list of strings or a character vector is passed to eval", { + + # testing list and character vector, as well as length-1 and length-2 + eval_variations <- list( + c("binary_error", "binary_logloss") + , "binary_logloss" + , list("binary_error", "binary_logloss") + , list("binary_logloss") + ) + + for (eval_variation in eval_variations) { + + set.seed(708L) + nrounds <- 10L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "None" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_CLASSIFICATION + ) + , eval = eval_variation + ) + + # both metrics should have been used + expect_named( + bst$record_evals[["valid1"]] + , expected = unlist(eval_variation) + , ignore.order = TRUE + , ignore.case = FALSE + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + if ("binary_error" %in% unlist(eval_variation)) { + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) + } + if ("binary_logloss" %in% unlist(eval_variation)) { + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) + } + } +}) + +test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { + set.seed(708L) + nrounds <- 10L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "binary_error" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_CLASSIFICATION + ) + , eval = "binary_logloss" + ) + + # both metrics should have been used + expect_named( + bst$record_evals[["valid1"]] + , expected = c("binary_error", "binary_logloss") + , 
ignore.order = TRUE + , ignore.case = FALSE + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) +}) + +test_that("lgb.train() works when you give a function for eval", { + set.seed(708L) + nrounds <- 10L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "None" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_CLASSIFICATION + ) + , eval = .constant_metric + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) +}) + +test_that("lgb.train() works with early stopping for regression with a metric that should be minimized", { + set.seed(708L) + trainDF <- data.frame( + "feat1" = rep(c(10.0, 100.0), 500L) + , "target" = rep(c(-50.0, 50.0), 500L) + ) + validDF <- data.frame( + "feat1" = rep(50.0, 4L) + , "target" = rep(50.0, 4L) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid <- lgb.Dataset( + data = as.matrix(validDF[["feat1"]], drop = FALSE) + , label = validDF[["target"]] + ) + nrounds <- 10L + + ############################# + # train with early stopping # + ############################# + early_stopping_rounds <- 5L + bst <- lgb.train( + params = list( + objective = "regression" + , metric = c( + "mape" + , "rmse" + , "mae" + ) + , min_data_in_bin = 5L + , early_stopping_rounds = early_stopping_rounds + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + ) + + # the best model should be from the first iteration, and only 6 rounds + # should have happened (1 with improvement, 5 consecutive with no improvement) + expect_equal(bst$best_score, 1.1) + expect_equal(bst$best_iter, 1L) + expect_equal( + length(bst$record_evals[["valid1"]][["mape"]][["eval"]]) + , early_stopping_rounds + 1L + ) + + # Booster should understand that all three of these metrics should be minimized + eval_info <- bst$.__enclos_env__$private$get_eval_info() + expect_identical(eval_info, c("mape", "rmse", "l1")) + expect_identical( + unname(bst$.__enclos_env__$private$higher_better_inner_eval) + , rep(FALSE, 3L) + ) +}) + + +test_that("lgb.train() supports non-ASCII feature names", { + testthat::skip("UTF-8 feature names are not fully supported in the R package") + dtrain <- lgb.Dataset( + data = matrix(rnorm(400L), ncol = 4L) + , label = rnorm(100L) + ) + feature_names <- c("F_零", "F_一", "F_二", "F_三") + bst <- lgb.train( + data = dtrain + , nrounds = 5L + , obj = "regression" + , params = list( + metric = "rmse" + ) + , colnames = feature_names + ) + expect_true(lgb.is.Booster(bst)) + dumped_model <- jsonlite::fromJSON(bst$dump_model()) + expect_identical( + dumped_model[["feature_names"]] + , feature_names + ) +}) + +test_that("when early stopping is not activated, best_iter and best_score come from valids and not training data", { + set.seed(708L) + trainDF <- data.frame( + "feat1" = rep(c(10.0, 100.0), 500L) + , "target" = rep(c(-50.0, 50.0), 500L) + ) + validDF <- data.frame( + "feat1" = rep(50.0, 4L) + , "target" = rep(50.0, 4L) + ) + dtrain
<- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid1 <- lgb.Dataset( + data = as.matrix(validDF[["feat1"]], drop = FALSE) + , label = validDF[["target"]] + ) + dvalid2 <- lgb.Dataset( + data = as.matrix(validDF[1L:10L, "feat1"], drop = FALSE) + , label = validDF[1L:10L, "target"] + ) + nrounds <- 10L + train_params <- list( + objective = "regression" + , metric = "rmse" + , learning_rate = 1.5 + ) + + # example 1: two valids, neither are the training data + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "valid2" = dvalid2 + ) + , params = train_params + ) + expect_named( + bst$record_evals + , c("start_iter", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + expect_length(rmse_scores, nrounds) + expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 2: train first (called "train") and two valids + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "train" = dtrain + , "valid1" = dvalid1 + , "valid2" = dvalid2 + ) + , params = train_params + ) + expect_named( + bst$record_evals + , c("start_iter", "train", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + expect_length(rmse_scores, nrounds) + expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 3: train second (called "train") and two valids + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "train" = dtrain + , "valid2" = dvalid2 + ) + , params = train_params + ) + # note that "train" still ends up as the first one + expect_named( + bst$record_evals + , c("start_iter", "train", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + expect_length(rmse_scores, nrounds) + expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 4: train third (called "train") and two valids + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "valid2" = dvalid2 + , "train" = dtrain + ) + , params = train_params + ) + # note that "train" still ends up as the first one + expect_named( + bst$record_evals + , c("start_iter", "train", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + expect_length(rmse_scores, nrounds) + expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 5: train second (called "something-random-we-would-not-hardcode") and two valids + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "something-random-we-would-not-hardcode" = dtrain + , "valid2" = dvalid2 + ) + , params = train_params + ) + # note that "something-random-we-would-not-hardcode" was recognized as the training + # data even though it isn't named "train" + expect_named( + 
bst$record_evals + , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + expect_length(rmse_scores, nrounds) + expect_identical(bst$best_iter, which.min(rmse_scores)) + expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) + + # example 6: the only valid supplied is the training data + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "train" = dtrain + ) + , params = train_params + ) + expect_identical(bst$best_iter, -1L) + expect_identical(bst$best_score, NA_real_) +}) + +test_that("lightgbm.train() gives the correct best_score and best_iter for a metric where higher values are better", { + set.seed(708L) + trainDF <- data.frame( + "feat1" = runif(n = 500L, min = 0.0, max = 15.0) + , "target" = rep(c(0L, 1L), 500L) + ) + validDF <- data.frame( + "feat1" = runif(n = 50L, min = 0.0, max = 15.0) + , "target" = rep(c(0L, 1L), 50L) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid1 <- lgb.Dataset( + data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) + , label = validDF[1L:25L, "target"] + ) + nrounds <- 10L + bst <- lgb.train( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "something-random-we-would-not-hardcode" = dtrain + ) + , params = list( + objective = "binary" + , metric = "auc" + , learning_rate = 1.5 + ) + ) + # note that "something-random-we-would-not-hardcode" was recognized as the training + # data even though it isn't named "train" + expect_named( + bst$record_evals + , c("start_iter", "something-random-we-would-not-hardcode", "valid1") + , ignore.order = FALSE + , ignore.case = FALSE + ) + auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) + expect_length(auc_scores, nrounds) + expect_identical(bst$best_iter, which.max(auc_scores)) + expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) +}) + +test_that("using lightgbm() without early stopping, best_iter and best_score come from valids and not training data", { + set.seed(708L) + # example: train second (called "something-random-we-would-not-hardcode"), two valids, + # and a metric where higher values are better ("auc") + trainDF <- data.frame( + "feat1" = runif(n = 500L, min = 0.0, max = 15.0) + , "target" = rep(c(0L, 1L), 500L) + ) + validDF <- data.frame( + "feat1" = runif(n = 50L, min = 0.0, max = 15.0) + , "target" = rep(c(0L, 1L), 50L) + ) + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid1 <- lgb.Dataset( + data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) + , label = validDF[1L:25L, "target"] + ) + dvalid2 <- lgb.Dataset( + data = as.matrix(validDF[26L:50L, "feat1"], drop = FALSE) + , label = validDF[26L:50L, "target"] + ) + nrounds <- 10L + bst <- lightgbm( + data = dtrain + , nrounds = nrounds + , num_leaves = 5L + , valids = list( + "valid1" = dvalid1 + , "something-random-we-would-not-hardcode" = dtrain + , "valid2" = dvalid2 + ) + , params = list( + objective = "binary" + , metric = "auc" + , learning_rate = 1.5 + ) + , verbose = -7L + , save_name = tempfile(fileext = ".model") + ) + # when verbose <= 0 is passed to lightgbm(), 'valids' is passed through to lgb.train() + # untouched. 
If you set verbose to > 0, the training data will still be first but called "train" + expect_named( + bst$record_evals + , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") + , ignore.order = FALSE + , ignore.case = FALSE + ) + auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) + expect_length(auc_scores, nrounds) + expect_identical(bst$best_iter, which.max(auc_scores)) + expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) +}) + +test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings", { + set.seed(708L) + nrounds <- 10L + nfolds <- 4L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.cv( + params = list( + objective = "binary" + , metric = "binary_error" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , nfold = nfolds + , eval = "binary_logloss" + ) + + # both metrics should have been used + expect_named( + bst$record_evals[["valid"]] + , expected = c("binary_error", "binary_logloss") + , ignore.order = TRUE + , ignore.case = FALSE + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid"]] + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE) + + # all boosters should have been created + expect_length(bst$boosters, nfolds) +}) + +test_that("lgb.cv() works when you give a function for eval", { + set.seed(708L) + nrounds <- 10L + nfolds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.cv( + params = list( + objective = "binary" + , metric = "None" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nfold = nfolds + , nrounds = nrounds + , eval = .constant_metric + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid"]] + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + expect_named(results, "constant_metric") +}) + +test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on only the first metric", { + set.seed(708L) + nrounds <- 10L + nfolds <- 5L + early_stopping_rounds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.cv( + params = list( + objective = "regression" + , metric = "None" + , early_stopping_rounds = early_stopping_rounds + , first_metric_only = TRUE + ) + , data = DTRAIN_RANDOM_REGRESSION + , nfold = nfolds + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_REGRESSION + ) + , eval = list( + .increasing_metric + , .constant_metric + ) + ) + + # Only the two functions provided to "eval" should have been evaluated + expect_named(bst$record_evals[["valid"]], c("increasing_metric", "constant_metric")) + + # all 10 iterations should happen, and the best_iter should be the final one + expect_equal(bst$best_iter, nrounds) + + # best_score should be taken from "increasing_metric" + # + # this expected value looks magical and confusing, but it's because + # evaluation metrics are averaged over all folds. 
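# [editor's illustration -- not part of the patch above] A sketch of where the
# cv_value formula below comes from: with `nfolds` folds, .increasing_metric()
# is evaluated nfolds times per iteration (once per fold booster), so at iteration k
# the per-fold accumulator readings are start + 0.1 * ((k - 1) * nfolds + 1:nfolds).
# Averaging them gives start + mean(seq_len(nfolds) / 10) + (k - 1) * 0.1 * nfolds,
# which at k = nrounds is exactly the cv_value expression used in the test.
# Quick numeric check under the assumed values start = 0, nfolds = 5, nrounds = 10:
start <- 0.0
nfolds <- 5L
nrounds <- 10L
iter_mean <- function(k) {
  mean(start + 0.1 * ((k - 1L) * nfolds + seq_len(nfolds)))
}
closed_form <- start + mean(seq_len(nfolds) / 10.0) + (nrounds - 1L) * 0.1 * nfolds
stopifnot(abs(iter_mean(nrounds) - closed_form) < 1e-12)  # both equal 4.8 here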
+  #
+  # consider 5-fold CV with a metric that adds 0.1 to a global accumulator
+  # each time it's called
+  #
+  # * iter 1: [0.1, 0.2, 0.3, 0.4, 0.5] (mean = 0.3)
+  # * iter 2: [0.6, 0.7, 0.8, 0.9, 1.0] (mean = 0.8)
+  # * iter 3: [1.1, 1.2, 1.3, 1.4, 1.5] (mean = 1.3)
+  #
+  cv_value <- increasing_metric_starting_value + mean(seq_len(nfolds) / 10.0) + (nrounds - 1L) * 0.1 * nfolds
+  expect_equal(bst$best_score, cv_value)
+
+  # early stopping should not have happened. Even though constant_metric
+  # had 9 consecutive iterations with no improvement, it is ignored because of
+  # first_metric_only = TRUE
+  expect_equal(
+    length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]])
+    , nrounds
+  )
+  expect_equal(
+    length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]])
+    , nrounds
+  )
+})
+
+test_that("early stopping works with lgb.cv()", {
+  set.seed(708L)
+  nrounds <- 10L
+  nfolds <- 5L
+  early_stopping_rounds <- 3L
+  increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
+  bst <- lgb.cv(
+    params = list(
+      objective = "regression"
+      , metric = "None"
+      , early_stopping_rounds = early_stopping_rounds
+      , first_metric_only = TRUE
+    )
+    , data = DTRAIN_RANDOM_REGRESSION
+    , nfold = nfolds
+    , nrounds = nrounds
+    , valids = list(
+      "valid1" = DVALID_RANDOM_REGRESSION
+    )
+    , eval = list(
+      .constant_metric
+      , .increasing_metric
+    )
+  )
+
+  # only the two functions provided to "eval" should have been evaluated
+  expect_named(bst$record_evals[["valid"]], c("constant_metric", "increasing_metric"))
+
+  # best_iter should be based on the first metric. Since constant_metric
+  # never changes, its first iteration was the best one
+  expect_equal(bst$best_iter, 1L)
+
+  # best_score should be taken from the first metric
+  expect_equal(bst$best_score, 0.2)
+
+  # early stopping should have happened, since constant_metric was the first
+  # one passed to eval and it will not improve over consecutive iterations
+  #
+  # note that this test is identical to the previous one, but with the
+  # order of the eval metrics switched
+  expect_equal(
+    length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]])
+    , early_stopping_rounds + 1L
+  )
+  expect_equal(
+    length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]])
+    , early_stopping_rounds + 1L
+  )
+})
+
+context("interaction constraints")
+
+test_that("lgb.train() throws an informative error if interaction_constraints is not a list", {
+  dtrain <- lgb.Dataset(train$data, label = train$label)
+  params <- list(objective = "regression", interaction_constraints = "[1,2],[3]")
+  # expect_error({
+  #   bst <- lightgbm(
+  #     data = dtrain
+  #     , params = params
+  #     , nrounds = 2L
+  #   )
+  # }, "interaction_constraints must be a list")
+})
+
+test_that(paste0("lgb.train() throws an informative error if the members of interaction_constraints ",
+                 "are not character or numeric vectors"), {
+  dtrain <- lgb.Dataset(train$data, label = train$label)
+  params <- list(objective = "regression", interaction_constraints = list(list(1L, 2L), list(3L)))
+  # expect_error({
+  #   bst <- lightgbm(
+  #     data = dtrain
+  #     , params = params
+  #     , nrounds = 2L
+  #   )
+  # }, "every element in interaction_constraints must be a character vector or numeric vector")
+})
+
+test_that("lgb.train() throws an informative error if interaction_constraints contains a too large index", {
+  dtrain <- lgb.Dataset(train$data, label = train$label)
+  params <- list(objective = "regression",
+                 interaction_constraints = list(c(1L, length(colnames(train$data)) + 
1L), 3L)) + # expect_error({ + # bst <- lightgbm( + # data = dtrain + # , params = params + # , nrounds = 2L + # ) + # }, "supplied a too large value in interaction_constraints") +}) + +test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ", + "character vectors, numeric vectors, or a combination"), { + set.seed(1L) + dtrain <- lgb.Dataset(train$data, label = train$label) + + params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) + bst <- lightgbm( + data = dtrain + , params = params + , nrounds = 2L + ) + pred1 <- bst$predict(test$data) + + cnames <- colnames(train$data) + params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]])) + bst <- lightgbm( + data = dtrain + , params = params + , nrounds = 2L + ) + pred2 <- bst$predict(test$data) + + params <- list(objective = "regression", interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L)) + bst <- lightgbm( + data = dtrain + , params = params + , nrounds = 2L + ) + pred3 <- bst$predict(test$data) + + expect_equal(pred1, pred2) + expect_equal(pred2, pred3) + +}) + +test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), { + set.seed(1L) + dtrain <- lgb.Dataset(train$data, label = train$label) + + params <- list(objective = "regression", interaction_constraints = list(c(1L, 2L), 3L)) + bst <- lightgbm( + data = dtrain + , params = params + , nrounds = 2L + ) + pred1 <- bst$predict(test$data) + + new_colnames <- paste0(colnames(train$data), "_x") + params <- list(objective = "regression" + , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])) + bst <- lightgbm( + data = dtrain + , params = params + , nrounds = 2L + , colnames = new_colnames + ) + pred2 <- bst$predict(test$data) + + expect_equal(pred1, pred2) + +}) diff --git a/R-package/tests/testthat/test_learning_to_rank.R b/R-package/tests/testthat/test_learning_to_rank.R index e6ab2a987d25..3cab22304e6b 100644 --- a/R-package/tests/testthat/test_learning_to_rank.R +++ b/R-package/tests/testthat/test_learning_to_rank.R @@ -1,141 +1,141 @@ -# context("Learning to rank") - -# # numerical tolerance to use when checking metric values -# TOLERANCE <- 1e-06 - -# test_that("learning-to-rank with lgb.train() works as expected", { -# set.seed(708L) -# data(agaricus.train, package = "lightgbm") -# # just keep a few features,to generate an model with imperfect fit -# train <- agaricus.train -# train_data <- train$data[1L:6000L, 1L:20L] -# dtrain <- lgb.Dataset( -# train_data -# , label = train$label[1L:6000L] -# , group = rep(150L, 40L) -# ) -# ndcg_at <- "1,2,3" -# eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) -# params <- list( -# objective = "lambdarank" -# , metric = "ndcg" -# , ndcg_at = ndcg_at -# , lambdarank_truncation_level = 3L -# , learning_rate = 0.001 -# ) -# model <- lgb.train( -# params = params -# , data = dtrain -# , nrounds = 10L -# ) -# expect_true(lgb.is.Booster(model)) - -# dumped_model <- jsonlite::fromJSON( -# model$dump_model() -# ) -# expect_equal(dumped_model[["objective"]], "lambdarank") -# expect_equal(dumped_model[["max_feature_idx"]], ncol(train_data) - 1L) - -# # check that evaluation results make sense (0.0 < nDCG < 1.0) -# eval_results <- model$eval_train() -# expect_equal(length(eval_results), length(eval_names)) -# for (result in eval_results) { -# expect_true(result[["value"]] > 0.0 && result[["value"]] < 1.0) 
-# expect_true(result[["higher_better"]]) -# expect_identical(result[["data_name"]], "training") -# } -# expect_identical(sapply(eval_results, function(x) {x$name}), eval_names) -# expect_equal(eval_results[[1L]][["value"]], 0.825) -# expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE) -# expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE) -# }) - -# test_that("learning-to-rank with lgb.cv() works as expected", { -# set.seed(708L) -# data(agaricus.train, package = "lightgbm") -# # just keep a few features,to generate an model with imperfect fit -# train <- agaricus.train -# train_data <- train$data[1L:6000L, 1L:20L] -# dtrain <- lgb.Dataset( -# train_data -# , label = train$label[1L:6000L] -# , group = rep(150L, 40L) -# ) -# ndcg_at <- "1,2,3" -# eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) -# params <- list( -# objective = "lambdarank" -# , metric = "ndcg" -# , ndcg_at = ndcg_at -# , lambdarank_truncation_level = 3L -# , label_gain = "0,1,3" -# ) -# nfold <- 4L -# nrounds <- 10L -# cv_bst <- lgb.cv( -# params = params -# , data = dtrain -# , nrounds = nrounds -# , nfold = nfold -# , min_data = 1L -# , learning_rate = 0.01 -# ) -# expect_is(cv_bst, "lgb.CVBooster") -# expect_equal(length(cv_bst$boosters), nfold) - -# # "valid" should contain results for each metric -# eval_results <- cv_bst$record_evals[["valid"]] -# eval_names <- c("ndcg@1", "ndcg@2", "ndcg@3") -# expect_identical(names(eval_results), eval_names) - -# # check that best score and iter make sense (0.0 < nDCG < 1.0) -# best_iter <- cv_bst$best_iter -# best_score <- cv_bst$best_score -# expect_true(best_iter > 0L && best_iter <= nrounds) -# expect_true(best_score > 0.0 && best_score < 1.0) -# expect_true(abs(best_score - 0.775) < TOLERANCE) - -# # best_score should be set for the first metric -# first_metric <- eval_names[[1L]] -# expect_equal(best_score, eval_results[[first_metric]][["eval"]][[best_iter]]) - -# for (eval_name in eval_names) { -# results_for_this_metric <- eval_results[[eval_name]] - -# # each set of metrics should have eval and eval_err -# expect_identical(names(results_for_this_metric), c("eval", "eval_err")) - -# # there should be one "eval" and "eval_err" per round -# expect_equal(length(results_for_this_metric[["eval"]]), nrounds) -# expect_equal(length(results_for_this_metric[["eval_err"]]), nrounds) - -# # check that evaluation results make sense (0.0 < nDCG < 1.0) -# all_evals <- unlist(results_for_this_metric[["eval"]]) -# expect_true(all(all_evals > 0.0 & all_evals < 1.0)) -# } - -# # first and last value of each metric should be as expected -# ndcg1_values <- c(0.725, 0.75, 0.75, 0.775, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75) -# expect_true(all(abs(unlist(eval_results[["ndcg@1"]][["eval"]]) - ndcg1_values) < TOLERANCE)) - -# ndcg2_values <- c( -# 0.6863147, 0.720986, 0.7306574, 0.745986, 0.7306574, -# 0.720986, 0.7403287, 0.7403287, 0.7403287, 0.7306574 -# ) -# expect_true(all(abs(unlist(eval_results[["ndcg@2"]][["eval"]]) - ndcg2_values) < TOLERANCE)) - -# ndcg3_values <- c( -# 0.6777939, 0.6984639, 0.711732, 0.7234639, 0.711732, -# 0.7101959, 0.719134, 0.719134, 0.725, 0.711732 -# ) -# expect_true(all(abs(unlist(eval_results[["ndcg@3"]][["eval"]]) - ndcg3_values) < TOLERANCE)) - -# # check details of each booster -# for (bst in cv_bst$boosters) { -# dumped_model <- jsonlite::fromJSON( -# bst$booster$dump_model() -# ) -# expect_equal(dumped_model[["objective"]], "lambdarank") -# expect_equal(dumped_model[["max_feature_idx"]], ncol(train_data) - 
1L) -# } -# }) +context("Learning to rank") + +# numerical tolerance to use when checking metric values +TOLERANCE <- 1e-06 + +test_that("learning-to-rank with lgb.train() works as expected", { + set.seed(708L) + data(agaricus.train, package = "lightgbm") + # just keep a few features,to generate an model with imperfect fit + train <- agaricus.train + train_data <- train$data[1L:6000L, 1L:20L] + dtrain <- lgb.Dataset( + train_data + , label = train$label[1L:6000L] + , group = rep(150L, 40L) + ) + ndcg_at <- "1,2,3" + eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) + params <- list( + objective = "lambdarank" + , metric = "ndcg" + , ndcg_at = ndcg_at + , lambdarank_truncation_level = 3L + , learning_rate = 0.001 + ) + model <- lgb.train( + params = params + , data = dtrain + , nrounds = 10L + ) + expect_true(lgb.is.Booster(model)) + + dumped_model <- jsonlite::fromJSON( + model$dump_model() + ) + expect_equal(dumped_model[["objective"]], "lambdarank") + expect_equal(dumped_model[["max_feature_idx"]], ncol(train_data) - 1L) + + # check that evaluation results make sense (0.0 < nDCG < 1.0) + eval_results <- model$eval_train() + expect_equal(length(eval_results), length(eval_names)) + for (result in eval_results) { + expect_true(result[["value"]] > 0.0 && result[["value"]] < 1.0) + expect_true(result[["higher_better"]]) + expect_identical(result[["data_name"]], "training") + } + expect_identical(sapply(eval_results, function(x) {x$name}), eval_names) + expect_equal(eval_results[[1L]][["value"]], 0.825) + expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE) + expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE) +}) + +test_that("learning-to-rank with lgb.cv() works as expected", { + set.seed(708L) + data(agaricus.train, package = "lightgbm") + # just keep a few features,to generate an model with imperfect fit + train <- agaricus.train + train_data <- train$data[1L:6000L, 1L:20L] + dtrain <- lgb.Dataset( + train_data + , label = train$label[1L:6000L] + , group = rep(150L, 40L) + ) + ndcg_at <- "1,2,3" + eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]]) + params <- list( + objective = "lambdarank" + , metric = "ndcg" + , ndcg_at = ndcg_at + , lambdarank_truncation_level = 3L + , label_gain = "0,1,3" + ) + nfold <- 4L + nrounds <- 10L + cv_bst <- lgb.cv( + params = params + , data = dtrain + , nrounds = nrounds + , nfold = nfold + , min_data = 1L + , learning_rate = 0.01 + ) + expect_is(cv_bst, "lgb.CVBooster") + expect_equal(length(cv_bst$boosters), nfold) + + # "valid" should contain results for each metric + eval_results <- cv_bst$record_evals[["valid"]] + eval_names <- c("ndcg@1", "ndcg@2", "ndcg@3") + expect_identical(names(eval_results), eval_names) + + # check that best score and iter make sense (0.0 < nDCG < 1.0) + best_iter <- cv_bst$best_iter + best_score <- cv_bst$best_score + expect_true(best_iter > 0L && best_iter <= nrounds) + expect_true(best_score > 0.0 && best_score < 1.0) + expect_true(abs(best_score - 0.775) < TOLERANCE) + + # best_score should be set for the first metric + first_metric <- eval_names[[1L]] + expect_equal(best_score, eval_results[[first_metric]][["eval"]][[best_iter]]) + + for (eval_name in eval_names) { + results_for_this_metric <- eval_results[[eval_name]] + + # each set of metrics should have eval and eval_err + expect_identical(names(results_for_this_metric), c("eval", "eval_err")) + + # there should be one "eval" and "eval_err" per round + expect_equal(length(results_for_this_metric[["eval"]]), 
nrounds) + expect_equal(length(results_for_this_metric[["eval_err"]]), nrounds) + + # check that evaluation results make sense (0.0 < nDCG < 1.0) + all_evals <- unlist(results_for_this_metric[["eval"]]) + expect_true(all(all_evals > 0.0 & all_evals < 1.0)) + } + + # first and last value of each metric should be as expected + ndcg1_values <- c(0.725, 0.75, 0.75, 0.775, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75) + expect_true(all(abs(unlist(eval_results[["ndcg@1"]][["eval"]]) - ndcg1_values) < TOLERANCE)) + + ndcg2_values <- c( + 0.6863147, 0.720986, 0.7306574, 0.745986, 0.7306574, + 0.720986, 0.7403287, 0.7403287, 0.7403287, 0.7306574 + ) + expect_true(all(abs(unlist(eval_results[["ndcg@2"]][["eval"]]) - ndcg2_values) < TOLERANCE)) + + ndcg3_values <- c( + 0.6777939, 0.6984639, 0.711732, 0.7234639, 0.711732, + 0.7101959, 0.719134, 0.719134, 0.725, 0.711732 + ) + expect_true(all(abs(unlist(eval_results[["ndcg@3"]][["eval"]]) - ndcg3_values) < TOLERANCE)) + + # check details of each booster + for (bst in cv_bst$boosters) { + dumped_model <- jsonlite::fromJSON( + bst$booster$dump_model() + ) + expect_equal(dumped_model[["objective"]], "lambdarank") + expect_equal(dumped_model[["max_feature_idx"]], ncol(train_data) - 1L) + } +}) From 5dd74aa2d4697f893329fa7bd08f1cc9aa211a86 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 14 Oct 2020 14:18:52 -0500 Subject: [PATCH 52/67] try uncommenting more dataset tests --- R-package/tests/testthat/test_dataset.R | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index dfb421e43f71..d0ac9c0627d2 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -36,7 +36,7 @@ test_that("lgb.Dataset: getinfo & setinfo", { expect_true(length(getinfo(dtest, "init_score")) == 0L) # any other label should error - # expect_error(setinfo(dtest, "asdf", test_label)) + expect_error(setinfo(dtest, "asdf", test_label)) }) test_that("lgb.Dataset: slice, dim", { @@ -54,9 +54,9 @@ test_that("lgb.Dataset: colnames", { expect_equal(colnames(dtest), colnames(test_data)) lgb.Dataset.construct(dtest) expect_equal(colnames(dtest), colnames(test_data)) - # expect_error({ - # colnames(dtest) <- "asdf" - # }) + expect_error({ + colnames(dtest) <- "asdf" + }) new_names <- make.names(seq_len(ncol(test_data))) expect_silent(colnames(dtest) <- new_names) expect_equal(colnames(dtest), new_names) @@ -107,26 +107,26 @@ test_that("lgb.Dataset should throw an error if 'reference' is provided but of t test_data <- agaricus.test$data[1L:100L, ] test_label <- agaricus.test$label[1L:100L] # Try to trick lgb.Dataset() into accepting bad input - # expect_error({ - # dtest <- lgb.Dataset( - # data = test_data - # , label = test_label - # , reference = data.frame(x = seq_len(10L), y = seq_len(10L)) - # ) - # }, regexp = "reference must be a") + expect_error({ + dtest <- lgb.Dataset( + data = test_data + , label = test_label + , reference = data.frame(x = seq_len(10L), y = seq_len(10L)) + ) + }, regexp = "reference must be a") }) test_that("Dataset$new() should throw an error if 'predictor' is provided but of the wrong format", { data(agaricus.test, package = "lightgbm") test_data <- agaricus.test$data[1L:100L, ] test_label <- agaricus.test$label[1L:100L] - # expect_error({ - # dtest <- Dataset$new( - # data = test_data - # , label = test_label - # , predictor = data.frame(x = seq_len(10L), y = seq_len(10L)) - # ) - # }, regexp = "predictor 
must be a", fixed = TRUE) + expect_error({ + dtest <- Dataset$new( + data = test_data + , label = test_label + , predictor = data.frame(x = seq_len(10L), y = seq_len(10L)) + ) + }, regexp = "predictor must be a", fixed = TRUE) }) test_that("Dataset$get_params() successfully returns parameters if you passed them", { From 2164ba85749af06950323a521a83ca0244110c7c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 14 Oct 2020 15:52:17 -0500 Subject: [PATCH 53/67] uncommenting more tests --- R-package/tests/testthat/test_basic.R | 124 +++++++++++++------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index b94e91c9897c..de4cc233055a 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -170,14 +170,14 @@ test_that("lightgbm() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", metric = "l2,l1") for (nround_value in c(-10L, 0L)) { - # expect_error({ - # bst <- lightgbm( - # data = dtrain - # , params = params - # , nrounds = nround_value - # , save_name = tempfile(fileext = ".model") - # ) - # }, "nrounds should be greater than zero") + expect_error({ + bst <- lightgbm( + data = dtrain + , params = params + , nrounds = nround_value + , save_name = tempfile(fileext = ".model") + ) + }, "nrounds should be greater than zero") } }) @@ -278,15 +278,15 @@ test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", metric = "l2,l1") for (nround_value in c(-10L, 0L)) { - # expect_error({ - # bst <- lgb.cv( - # params - # , dtrain - # , nround_value - # , nfold = 5L - # , min_data = 1L - # ) - # }, "nrounds should be greater than zero") + expect_error({ + bst <- lgb.cv( + params + , dtrain + , nround_value + , nfold = 5L + , min_data = 1L + ) + }, "nrounds should be greater than zero") } }) @@ -300,15 +300,15 @@ test_that("lgb.cv() throws an informative error is 'data' is not an lgb.Dataset , matrix(data = seq_len(10L), 2L, 5L) ) for (val in bad_values) { - # expect_error({ - # bst <- lgb.cv( - # params = list(objective = "regression", metric = "l2,l1") - # , data = val - # , 10L - # , nfold = 5L - # , min_data = 1L - # ) - # }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) + expect_error({ + bst <- lgb.cv( + params = list(objective = "regression", metric = "l2,l1") + , data = val + , 10L + , nfold = 5L + , min_data = 1L + ) + }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) } }) @@ -398,13 +398,13 @@ test_that("lgb.train() throws an informative error if 'data' is not an lgb.Datas , matrix(data = seq_len(10L), 2L, 5L) ) for (val in bad_values) { - # expect_error({ - # bst <- lgb.train( - # params = list(objective = "regression", metric = "l2,l1") - # , data = val - # , 10L - # ) - # }, regexp = "data must be an lgb.Dataset instance", fixed = TRUE) + expect_error({ + bst <- lgb.train( + params = list(objective = "regression", metric = "l2,l1") + , data = val + , 10L + ) + }, regexp = "data must be an lgb.Dataset instance", fixed = TRUE) } }) @@ -413,14 +413,14 @@ test_that("lgb.train() throws an informative error if 'valids' is not a list of "valid1" = data.frame(x = rnorm(5L), y = rnorm(5L)) , "valid2" = data.frame(x = rnorm(5L), y = rnorm(5L)) ) - # expect_error({ 
- # bst <- lgb.train( - # params = list(objective = "regression", metric = "l2,l1") - # , data = lgb.Dataset(train$data, label = train$label) - # , 10L - # , valids = valids - # ) - # }, regexp = "valids must be a list of lgb.Dataset elements") + expect_error({ + bst <- lgb.train( + params = list(objective = "regression", metric = "l2,l1") + , data = lgb.Dataset(train$data, label = train$label) + , 10L + , valids = valids + ) + }, regexp = "valids must be a list of lgb.Dataset elements") }) test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but some do not have names", { @@ -1624,39 +1624,39 @@ context("interaction constraints") test_that("lgb.train() throws an informative error if interaction_constraints is not a list", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = "[1,2],[3]") - # expect_error({ - # bst <- lightgbm( - # data = dtrain - # , params = params - # , nrounds = 2L - # ) - # }, "interaction_constraints must be a list") + expect_error({ + bst <- lightgbm( + data = dtrain + , params = params + , nrounds = 2L + ) + }, "interaction_constraints must be a list") }) test_that(paste0("lgb.train() throws an informative error if the members of interaction_constraints ", "are not character or numeric vectors"), { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = list(list(1L, 2L), list(3L))) - # expect_error({ - # bst <- lightgbm( - # data = dtrain - # , params = params - # , nrounds = 2L - # ) - # }, "every element in interaction_constraints must be a character vector or numeric vector") + expect_error({ + bst <- lightgbm( + data = dtrain + , params = params + , nrounds = 2L + ) + }, "every element in interaction_constraints must be a character vector or numeric vector") }) test_that("lgb.train() throws an informative error if interaction_constraints contains a too large index", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = list(c(1L, length(colnames(train$data)) + 1L), 3L)) - # expect_error({ - # bst <- lightgbm( - # data = dtrain - # , params = params - # , nrounds = 2L - # ) - # }, "supplied a too large value in interaction_constraints") + expect_error({ + bst <- lightgbm( + data = dtrain + , params = params + , nrounds = 2L + ) + }, "supplied a too large value in interaction_constraints") }) test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ", From 14cb1841433a929d78228c9519d425982e1e5081 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 14 Oct 2020 18:12:04 -0500 Subject: [PATCH 54/67] ok getting closer --- R-package/src/lightgbm_R.cpp | 2 +- R-package/tests/testthat/test_basic.R | 46 +++++++++++++-------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 958fd4780218..a5e94f011ac5 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -25,7 +25,7 @@ try { #define R_API_END() } \ catch(std::exception& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.what()); return call_state;} \ - catch(std::string& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.c_str()); return call_state;} \ + catch(std::string& ex) { R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError(ex.c_str()); return call_state; } \ catch(...) 
{ R_INT_PTR(call_state)[0] = -1; LGBM_SetLastError("unknown exception"); return call_state;} \ return call_state; diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index de4cc233055a..ba6fc864fcfe 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -378,13 +378,13 @@ test_that("lgb.train() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", metric = "l2,l1") for (nround_value in c(-10L, 0L)) { - # expect_error({ - # bst <- lgb.train( - # params - # , dtrain - # , nround_value - # ) - # }, "nrounds should be greater than zero") + expect_error({ + bst <- lgb.train( + params + , dtrain + , nround_value + ) + }, "nrounds should be greater than zero") } }) @@ -428,14 +428,14 @@ test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but s "valid1" = lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) ) - # expect_error({ - # bst <- lgb.train( - # params = list(objective = "regression", metric = "l2,l1") - # , data = lgb.Dataset(train$data, label = train$label) - # , 10L - # , valids = valids - # ) - # }, regexp = "each element of valids must have a name") + expect_error({ + bst <- lgb.train( + params = list(objective = "regression", metric = "l2,l1") + , data = lgb.Dataset(train$data, label = train$label) + , 10L + , valids = valids + ) + }, regexp = "each element of valids must have a name") }) test_that("lgb.train() throws an informative error if 'valids' contains lgb.Dataset objects but none have names", { @@ -443,14 +443,14 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) ) - # expect_error({ - # bst <- lgb.train( - # params = list(objective = "regression", metric = "l2,l1") - # , data = lgb.Dataset(train$data, label = train$label) - # , 10L - # , valids = valids - # ) - # }, regexp = "each element of valids must have a name") + expect_error({ + bst <- lgb.train( + params = list(objective = "regression", metric = "l2,l1") + , data = lgb.Dataset(train$data, label = train$label) + , 10L + , valids = valids + ) + }, regexp = "each element of valids must have a name") }) test_that("lgb.train() works with force_col_wise and force_row_wise", { From a35ffb379aca6f6a2cbcd76731e19dd81f2ae1e2 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 15 Oct 2020 00:23:47 -0500 Subject: [PATCH 55/67] more uncommenting --- R-package/tests/testthat/test_utils.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index edcc09091236..5a9cfb641d61 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -73,9 +73,9 @@ test_that("lgb.last_error() correctly returns errors from the C++ side", { data = train$data , label = as.matrix(rnorm(5L)) ) - # expect_error({ - # dvalid1$construct() - # }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) + expect_error({ + dvalid1$construct() + }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) }) context("lgb.check.eval") From af1bbf71feb6ed44e1499713f6593713734f98cb Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 15 Oct 2020 01:11:17 -0500 Subject: [PATCH 56/67] free dataset --- R-package/tests/testthat/test_utils.R | 2 ++ 1 
file changed, 2 insertions(+) diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 5a9cfb641d61..133420750496 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -76,6 +76,8 @@ test_that("lgb.last_error() correctly returns errors from the C++ side", { expect_error({ dvalid1$construct() }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) + lgb.call("LGBM_DatasetFree_R", ret = NULL, dvalid1$.__enclos_env__$private$handle) + dvalid1$.__enclos_env__$private$handle <- NULL }) context("lgb.check.eval") From a2faca1b72bb12000fb944b8763544e3041939c7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 15 Oct 2020 10:12:00 -0500 Subject: [PATCH 57/67] skipping a test, more uncommenting --- R-package/tests/testthat/test_lgb.Booster.R | 8 ++++---- R-package/tests/testthat/test_utils.R | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index dae3ebc98989..a506d8246010 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -436,10 +436,10 @@ test_that("Saving a model with different feature importance types works", { ) ) - # UNSUPPORTED_IMPORTANCE <- 2L - # expect_error({ - # model_string <- bst$save_model_to_string(feature_importance_type = UNSUPPORTED_IMPORTANCE) - # }, "Unknown importance type") + UNSUPPORTED_IMPORTANCE <- 2L + expect_error({ + model_string <- bst$save_model_to_string(feature_importance_type = UNSUPPORTED_IMPORTANCE) + }, "Unknown importance type") }) .params_from_model_string <- function(model_str) { diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 133420750496..02c97e070369 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -67,6 +67,7 @@ test_that("lgb.last_error() throws an error if there are no errors", { }) test_that("lgb.last_error() correctly returns errors from the C++ side", { + testthat::skip("this test causes valgrind to think there is a memory leak, and needs to be rethought") data(agaricus.train, package = "lightgbm") train <- agaricus.train dvalid1 <- lgb.Dataset( @@ -76,8 +77,6 @@ test_that("lgb.last_error() correctly returns errors from the C++ side", { expect_error({ dvalid1$construct() }, regexp = "[LightGBM] [Fatal] Length of label is not same with #data", fixed = TRUE) - lgb.call("LGBM_DatasetFree_R", ret = NULL, dvalid1$.__enclos_env__$private$handle) - dvalid1$.__enclos_env__$private$handle <- NULL }) context("lgb.check.eval") From d8651827432dc2d585dff44d8c063b9ee9e67fe2 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 15 Oct 2020 11:42:39 -0500 Subject: [PATCH 58/67] more skipping --- R-package/tests/testthat/test_lgb.Booster.R | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index a506d8246010..250d38ebcf40 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -7,6 +7,10 @@ test_that("lgb.get.eval.result() should throw an informative error if booster is , c("a", "b") , NA , 10L + , lgb.Dataset( + data = matrix(1.0:10.0, 2L, 5L) + , params = list() + ) ) for (bad_input in bad_inputs) { expect_error({ @@ -435,6 +439,23 @@ test_that("Saving a model with different feature importance types works", { , 
"spore-print-color=green=1" ) ) +}) + +test_that("Saving a model with unkown importance type fails", { + testthat::skip("Skipping this test because it causes issues for valgrind") + set.seed(708L) + data(agaricus.train, package = "lightgbm") + train <- agaricus.train + bst <- lightgbm( + data = as.matrix(train$data) + , label = train$label + , num_leaves = 4L + , learning_rate = 1.0 + , nrounds = 2L + , objective = "binary" + , save_name = tempfile(fileext = ".model") + ) + expect_true(lgb.is.Booster(bst)) UNSUPPORTED_IMPORTANCE <- 2L expect_error({ @@ -442,6 +463,7 @@ test_that("Saving a model with different feature importance types works", { }, "Unknown importance type") }) + .params_from_model_string <- function(model_str) { file_lines <- strsplit(model_str, "\n")[[1L]] start_indx <- which(grepl("^parameters\\:$", file_lines)) + 1L From d7e7a0cd6db2e79c6d72f2cf7780a0e61fdbdd30 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 15 Oct 2020 14:55:21 -0500 Subject: [PATCH 59/67] re-enable OpenMP --- R-package/configure | 3 ++- R-package/configure.ac | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R-package/configure b/R-package/configure index 280f13ab3170..02fcfa028b0d 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1783,7 +1783,7 @@ OPENMP_CXXFLAGS="" if test `uname -s` = "Linux" then - OPENMP_CXXFLAGS="" + OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" fi if test `uname -s` = "Darwin" @@ -2989,3 +2989,4 @@ if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} fi + diff --git a/R-package/configure.ac b/R-package/configure.ac index c835c06539eb..20182666b502 100644 --- a/R-package/configure.ac +++ b/R-package/configure.ac @@ -92,7 +92,7 @@ OPENMP_CXXFLAGS="" if test `uname -s` = "Linux" then - OPENMP_CXXFLAGS="" + OPENMP_CXXFLAGS="\$(SHLIB_OPENMP_CXXFLAGS)" fi if test `uname -s` = "Darwin" From 95267623dc922f52d4aff08cd170dca7439bc392 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 15 Oct 2020 17:50:27 -0500 Subject: [PATCH 60/67] allow on OpenMP thing --- .ci/test_r_package_valgrind.sh | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh index d3d1822b4e1a..5580fd64ef02 100755 --- a/.ci/test_r_package_valgrind.sh +++ b/.ci/test_r_package_valgrind.sh @@ -35,13 +35,29 @@ if [[ ${bytes_indirectly_lost} -gt 0 ]]; then exit -1 fi -bytes_possibly_lost=$( +# one error caused by a false positive between valgrind and openmp is allowed +# ==1312== 352 bytes in 1 blocks are possibly lost in loss record 146 of 2,458 +# ==1312== at 0x483DD99: calloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so) +# ==1312== by 0x40149CA: allocate_dtv (dl-tls.c:286) +# ==1312== by 0x40149CA: _dl_allocate_tls (dl-tls.c:532) +# ==1312== by 0x5702322: allocate_stack (allocatestack.c:622) +# ==1312== by 0x5702322: pthread_create@@GLIBC_2.2.5 (pthread_create.c:660) +# ==1312== by 0x56D0DDA: ??? 
(in /usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0) +# ==1312== by 0x56C88E0: GOMP_parallel (in /usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0) +# ==1312== by 0x154351B8: LGBM_DatasetCreateFromCSC (c_api.cpp:1286) +# ==1312== by 0x1545789C: LGBM_DatasetCreateFromCSC_R (lightgbm_R.cpp:91) +# ==1312== by 0x4941E2F: R_doDotCall (dotcode.c:634) +# ==1312== by 0x494CCC6: do_dotcall (dotcode.c:1281) +# ==1312== by 0x499FB01: bcEval (eval.c:7078) +# ==1312== by 0x498B67F: Rf_eval (eval.c:727) +# ==1312== by 0x498E414: R_execClosure (eval.c:1895) +# bytes_possibly_lost=$( cat valgrind-logs.log \ | grep -E "possibly lost\: .*" \ | sed 's/^.*possibly lost\: \(.*\) bytes.*$/\1/' \ | tr -d "," ) -if [[ ${bytes_possibly_lost} -gt 0 ]]; then +if [[ ${bytes_possibly_lost} -gt 352 ]]; then echo "valgrind found ${bytes_possibly_lost} bytes possibly lost" exit -1 fi From cbbbd4d4f71627799822aae5462b34a28f23aaef Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 15 Oct 2020 23:19:51 -0500 Subject: [PATCH 61/67] move valgrind to comment-only job --- .ci/test_r_package_valgrind.sh | 2 +- .github/workflows/r_package.yml | 24 +----------------------- .github/workflows/r_valgrind.yml | 30 ++++++++++++++++++++++++++++++ R-package/README.md | 4 ++++ 4 files changed, 36 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/r_valgrind.yml diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh index 5580fd64ef02..2afaa80ef0f8 100755 --- a/.ci/test_r_package_valgrind.sh +++ b/.ci/test_r_package_valgrind.sh @@ -51,7 +51,7 @@ fi # ==1312== by 0x499FB01: bcEval (eval.c:7078) # ==1312== by 0x498B67F: Rf_eval (eval.c:727) # ==1312== by 0x498E414: R_execClosure (eval.c:1895) -# bytes_possibly_lost=$( +bytes_possibly_lost=$( cat valgrind-logs.log \ | grep -E "possibly lost\: .*" \ | sed 's/^.*possibly lost\: \(.*\) bytes.*$/\1/' \ diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index 0424ac967363..e26bd865bbbe 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -184,32 +184,10 @@ jobs: Rscriptdevel testthat.R 2>&1 > ubsan-tests.log cat ubsan-tests.log exit $(cat ubsan-tests.log | grep "runtime error" | wc -l) - test-r-valgrind: - name: r-package (ubuntu-latest, R-devel, valgrind) - timeout-minutes: 120 - runs-on: ubuntu-latest - container: - image: wch1/r-debug - steps: - - name: Checkout repository - uses: actions/checkout@v1 - with: - fetch-depth: 5 - submodules: true - - name: install - shell: bash - run: | - RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'testthat'), repos = 'http://cran.r-project.org')" - sh build-cran-package.sh - RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz - - name: test - shell: bash - run: | - ./.ci/test_r_package_valgrind.sh all-successful: # https://github.community/t/is-it-possible-to-require-all-github-actions-tasks-to-pass-without-enumerating-them/117957/4?u=graingert runs-on: ubuntu-latest - needs: [test, test-r-sanitizers, test-r-valgrind] + needs: [test, test-r-sanitizers] steps: - name: Note that all tests succeeded run: echo "🎉" diff --git a/.github/workflows/r_valgrind.yml b/.github/workflows/r_valgrind.yml new file mode 100644 index 000000000000..20415070e6b2 --- /dev/null +++ b/.github/workflows/r_valgrind.yml @@ -0,0 +1,30 @@ +name: R valgrind tests + +on: + pull_request_review_comment: + types: [created] + +jobs: + test-r-valgrind: + name: r-package (ubuntu-latest, R-devel, valgrind) + if: github.event.comment.body == '/gha run 
r-valgrind-tests' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) + timeout-minutes: 120 + runs-on: ubuntu-latest + container: + image: wch1/r-debug + steps: + - name: Checkout repository + uses: actions/checkout@v1 + with: + fetch-depth: 5 + submodules: true + - name: install + shell: bash + run: | + RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'testthat'), repos = 'http://cran.r-project.org')" + sh build-cran-package.sh + RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz + - name: test + shell: bash + run: | + ./.ci/test_r_package_valgrind.sh diff --git a/R-package/README.md b/R-package/README.md index 95d34b12ba5a..cd13d69ff253 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -461,6 +461,10 @@ RDvalgrind \ | cat ``` +These tests can also be triggered on any pull request by leaving a "Comment" review with the following comment: + +> /gha run r-valgrind-tests + External (Unofficial) Repositories ---------------------------------- From b946388b3b8337a2f3caf52fd81c31c9dfc48db2 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 17 Oct 2020 06:06:28 +0100 Subject: [PATCH 62/67] Apply suggestions from code review Co-authored-by: Nikita Titov --- .github/workflows/r_valgrind.yml | 9 ++++----- R-package/tests/testthat/test_lgb.Booster.R | 2 +- R-package/tests/testthat/test_utils.R | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/r_valgrind.yml b/.github/workflows/r_valgrind.yml index 20415070e6b2..dba10a5e0eac 100644 --- a/.github/workflows/r_valgrind.yml +++ b/.github/workflows/r_valgrind.yml @@ -18,13 +18,12 @@ jobs: with: fetch-depth: 5 submodules: true - - name: install + - name: Install packages shell: bash run: | RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'testthat'), repos = 'http://cran.r-project.org')" sh build-cran-package.sh - RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz - - name: test + RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz || exit -1 + - name: Run tests with valgrind shell: bash - run: | - ./.ci/test_r_package_valgrind.sh + run: ./.ci/test_r_package_valgrind.sh diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 250d38ebcf40..e4d75661755e 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -441,7 +441,7 @@ test_that("Saving a model with different feature importance types works", { ) }) -test_that("Saving a model with unkown importance type fails", { +test_that("Saving a model with unknown importance type fails", { testthat::skip("Skipping this test because it causes issues for valgrind") set.seed(708L) data(agaricus.train, package = "lightgbm") diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 02c97e070369..0ac03d99edba 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -67,7 +67,7 @@ test_that("lgb.last_error() throws an error if there are no errors", { }) test_that("lgb.last_error() correctly returns errors from the C++ side", { - testthat::skip("this test causes valgrind to think there is a memory leak, and needs to be rethought") + testthat::skip("Skipping this test because it causes valgrind to think there is a memory leak, and needs to be rethought") data(agaricus.train, package = "lightgbm") train <- agaricus.train dvalid1 <- lgb.Dataset( From 
b03d8633ed919e919da5f4c526bfabd0c7e85d0e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 17 Oct 2020 00:08:46 -0500 Subject: [PATCH 63/67] changes from code review --- .ci/test_r_package_valgrind.sh | 17 +++++++++-------- .github/workflows/r_valgrind.yml | 2 +- R-package/README.md | 2 +- windows/LightGBM.vcxproj | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh index 2afaa80ef0f8..1cc06e0660a2 100755 --- a/.ci/test_r_package_valgrind.sh +++ b/.ci/test_r_package_valgrind.sh @@ -2,19 +2,22 @@ cd R-package/tests +ALL_LOGS_FILE="out.log" +VALGRIND_LOGS_FILE="valgrind-logs.log" + RDvalgrind \ --no-readline \ --vanilla \ -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes" \ -f testthat.R \ - &> out.log || exit -1 + 2>&1 > ${ALL_LOGS_FILE} || exit -1 -cat out.log +cat ${ALL_LOGS_FILE} -cat out.log | grep -E "^\=" > valgrind-logs.log +cat ${ALL_LOGS_FILE} | grep -E "^\=" > ${VALGRIND_LOGS_FILE} bytes_definitely_lost=$( - cat valgrind-logs.log \ + cat ${VALGRIND_LOGS_FILE} \ | grep -E "definitely lost\: .*" \ | sed 's/^.*definitely lost\: \(.*\) bytes.*$/\1/' \ | tr -d "," @@ -25,7 +28,7 @@ if [[ ${bytes_definitely_lost} -gt 0 ]]; then fi bytes_indirectly_lost=$( - cat valgrind-logs.log \ + cat ${VALGRIND_LOGS_FILE} \ | grep -E "indirectly lost\: .*" \ | sed 's/^.*indirectly lost\: \(.*\) bytes.*$/\1/' \ | tr -d "," @@ -52,7 +55,7 @@ fi # ==1312== by 0x498B67F: Rf_eval (eval.c:727) # ==1312== by 0x498E414: R_execClosure (eval.c:1895) bytes_possibly_lost=$( - cat valgrind-logs.log \ + cat ${VALGRIND_LOGS_FILE} \ | grep -E "possibly lost\: .*" \ | sed 's/^.*possibly lost\: \(.*\) bytes.*$/\1/' \ | tr -d "," @@ -61,5 +64,3 @@ if [[ ${bytes_possibly_lost} -gt 352 ]]; then echo "valgrind found ${bytes_possibly_lost} bytes possibly lost" exit -1 fi - -exit 0 diff --git a/.github/workflows/r_valgrind.yml b/.github/workflows/r_valgrind.yml index 20415070e6b2..35325b6d96e1 100644 --- a/.github/workflows/r_valgrind.yml +++ b/.github/workflows/r_valgrind.yml @@ -7,7 +7,7 @@ on: jobs: test-r-valgrind: name: r-package (ubuntu-latest, R-devel, valgrind) - if: github.event.comment.body == '/gha run r-valgrind-tests' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) + if: github.event.comment.body == '/gha run r-valgrind' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) timeout-minutes: 120 runs-on: ubuntu-latest container: diff --git a/R-package/README.md b/R-package/README.md index cd13d69ff253..8ef5c7818148 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -463,7 +463,7 @@ RDvalgrind \ These tests can also be triggered on any pull request by leaving a "Comment" review with the following comment: -> /gha run r-valgrind-tests +> /gha run r-valgrind External (Unofficial) Repositories ---------------------------------- diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index beb7ae7f081e..4c95af83b03c 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -327,4 +327,4 @@ - \ No newline at end of file + From cefb3dd4d4485525ab5797004ff1250d729c76ce Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 17 Oct 2020 23:30:39 +0100 Subject: [PATCH 64/67] Apply suggestions from code review Co-authored-by: Nikita Titov --- .github/workflows/r_valgrind.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/r_valgrind.yml b/.github/workflows/r_valgrind.yml index 
778ff5af6ffe..b94ab532c251 100644 --- a/.github/workflows/r_valgrind.yml +++ b/.github/workflows/r_valgrind.yml @@ -10,8 +10,7 @@ jobs: if: github.event.comment.body == '/gha run r-valgrind' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) timeout-minutes: 120 runs-on: ubuntu-latest - container: - image: wch1/r-debug + container: wch1/r-debug steps: - name: Checkout repository uses: actions/checkout@v1 @@ -21,9 +20,9 @@ jobs: - name: Install packages shell: bash run: | - RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'testthat'), repos = 'http://cran.r-project.org')" + RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'testthat'), repos = 'https://cran.r-project.org')" sh build-cran-package.sh - RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz || exit -1 + RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz || exit -1 - name: Run tests with valgrind shell: bash run: ./.ci/test_r_package_valgrind.sh From 0374e879821da2f2f85d72874092b28003632fc9 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 17 Oct 2020 17:37:36 -0500 Subject: [PATCH 65/67] linting --- R-package/tests/testthat/test_utils.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 0ac03d99edba..5ac43cc50b9a 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -67,7 +67,10 @@ test_that("lgb.last_error() throws an error if there are no errors", { }) test_that("lgb.last_error() correctly returns errors from the C++ side", { - testthat::skip("Skipping this test because it causes valgrind to think there is a memory leak, and needs to be rethought") + testthat::skip(paste0( + "Skipping this test because it causes valgrind to think " + , "there is a memory leak, and needs to be rethought" + )) data(agaricus.train, package = "lightgbm") train <- agaricus.train dvalid1 <- lgb.Dataset( From e9c46aab5fba09b605c376e396f9397246509de1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 17 Oct 2020 17:52:06 -0500 Subject: [PATCH 66/67] issue comments too --- .github/workflows/r_valgrind.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/r_valgrind.yml b/.github/workflows/r_valgrind.yml index b94ab532c251..ee062377cdd3 100644 --- a/.github/workflows/r_valgrind.yml +++ b/.github/workflows/r_valgrind.yml @@ -3,6 +3,8 @@ name: R valgrind tests on: pull_request_review_comment: types: [created] + issue_comment: + types: [created] jobs: test-r-valgrind: From 5d435f659725a0af73e63de3378a660b375bc55f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 17 Oct 2020 18:16:21 -0500 Subject: [PATCH 67/67] remove issue_comment --- .github/workflows/r_valgrind.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/r_valgrind.yml b/.github/workflows/r_valgrind.yml index ee062377cdd3..b94ab532c251 100644 --- a/.github/workflows/r_valgrind.yml +++ b/.github/workflows/r_valgrind.yml @@ -3,8 +3,6 @@ name: R valgrind tests on: pull_request_review_comment: types: [created] - issue_comment: - types: [created] jobs: test-r-valgrind: