feat: Voting methods for feature ranking in efs (#112)
* add stability selection article

* add Rcpp code for approval voting feature ranking method

* add citation

* extra check during init()

* update doc + use the Rcpp interface for approval voting

* add templates for params in ArchiveBatchFSelect + updocs

* use testthat expectations (not checkmate ones!)

* add test for newly implemented voting methods

* update test for av

* fix note

* refactor AV_rcpp, add SAV_rcpp

* add norm_score, and SAV R function

* add sav, improve doc

* fix efs test

* update and improve test for AV

* add sav test

* add borda score

* update tests

* add seq and revseq PAV Rcpp methods

* add R functions for the PAV methods

* comment printing

* add tests for PAV methods

* add PAV methods to efs

* refactor: do not use C++ RNGs

* fix startsWith

* updocs

* fix data.table note

* add committee_size parameter, refactor borda score

* add large data test for seq pav

* refactor C++ code, add optimized PAV

* remove revseq-PAV method, use optimized seqPAV

* update tests

* remove suboptimal seqPAV function

* shuffle candidates outside Rcpp functions (same tie-breaking)

* optimize Phragmen a bit => do not randomly select the candidate with min load

* add phragmen's rule in efs

* correct borda score + use phragmens rule

* add tests for Phragmen's rule

* correct weighted Phragmen's rule

* add specific test for phragmen's rule

* run document()

* show data.table result after using ':='

* add n_resamples field + nicer obj print

* cover edge case (eg lasso resulted in no features getting selected)

* updocs

* small styling fix

* add Stabl ref

* more descriptive name

* add embedded ensemble feature selection

* remove print()

* add TOCHECK comment on benchmark design

* use internal valid task

* simplify

* ...

* store_models = FALSE

* ...

* separate the use of inner_measure and measure used in the test sets

* updocs

* update tests

* refactor: expect_vector => expect_numeric

* fix partial arg match

* fix example

* use fastVoteR for feature ranking

* pass named list to callback parameter

* skip test if fastVoteR is not available

* refactor: better handling of inner measure

* add tests for embedded_ensemble_fselect()

* update NEWs

* add active_measure field

* remove Remotes as fastVoteR is now on CRAN :)

* refine doc

---------

Co-authored-by: be-marc <marcbecker@posteo.de>
bblodfon and be-marc authored Nov 30, 2024
1 parent 39d0f6a commit 003f6e9
Showing 12 changed files with 804 additions and 224 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
@@ -41,6 +41,7 @@ Suggests:
mlr3learners,
mlr3pipelines,
rpart,
fastVoteR,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Config/testthat/parallel: true
@@ -74,6 +75,7 @@ Collate:
'assertions.R'
'auto_fselector.R'
'bibentries.R'
'embedded_ensemble_fselect.R'
'ensemble_fselect.R'
'extract_inner_fselect_archives.R'
'extract_inner_fselect_results.R'
1 change: 1 addition & 0 deletions NAMESPACE
@@ -36,6 +36,7 @@ export(auto_fselector)
export(callback_batch_fselect)
export(clbk)
export(clbks)
export(embedded_ensemble_fselect)
export(ensemble_fselect)
export(extract_inner_fselect_archives)
export(extract_inner_fselect_results)
4 changes: 4 additions & 0 deletions NEWS.md
@@ -1,5 +1,9 @@
# mlr3fselect (development version)

* Use [fastVoteR](https://github.com/bblodfon/fastVoteR) for feature ranking in `EnsembleFSResult()` objects
* Add embedded ensemble feature selection `embedded_ensemble_fselect()`
* Refactor `ensemble_fselect()` and `EnsembleFSResult()`

# mlr3fselect 1.2.1

* compatibility: mlr3 0.22.0
238 changes: 184 additions & 54 deletions R/EnsembleFSResult.R

Large diffs are not rendered by default.

41 changes: 29 additions & 12 deletions R/bibentries.R
@@ -9,7 +9,6 @@ bibentries = c(
title = "ecr 2.0",
booktitle = "Proceedings of the Genetic and Evolutionary Computation Conference Companion"
),

bergstra_2012 = bibentry("article",
title = "Random Search for Hyper-Parameter Optimization",
author = "James Bergstra and Yoshua Bengio",
@@ -20,8 +19,7 @@
pages = "281--305",
url = "https://jmlr.csail.mit.edu/papers/v13/bergstra12a.html"
),

thomas2017 = bibentry("article",
thomas2017 = bibentry("article",
doi = "10.1155/2017/1421409",
year = "2017",
publisher = "Hindawi Limited",
@@ -31,8 +29,7 @@
title = "Probing for Sparse and Fast Variable Selection with Model-Based Boosting",
journal = "Computational and Mathematical Methods in Medicine"
),

wu2007 = bibentry("article",
wu2007 = bibentry("article",
doi = "10.1198/016214506000000843",
year = "2007",
month = "3",
@@ -44,8 +41,7 @@
title = "Controlling Variable Selection by the Addition of Pseudovariables",
journal = "Journal of the American Statistical Association"
),

guyon2002 = bibentry("article",
guyon2002 = bibentry("article",
title = "Gene Selection for Cancer Classification using Support Vector Machines",
volume = "46",
issn = "1573-0565",
@@ -56,7 +52,6 @@
author = "Isabelle Guyon and Jason Weston and Stephen Barnhill and Vladimir Vapnik",
year = "2002"
),

kuhn2013 = bibentry("Inbook",
author = "Kuhn, Max and Johnson, Kjell",
chapter = "Over-Fitting and Model Tuning",
@@ -67,7 +62,6 @@
pages = "61--92",
isbn = "978-1-4614-6849-3"
),

saeys2008 = bibentry("article",
author = "Saeys, Yvan and Abeel, Thomas and Van De Peer, Yves",
doi = "10.1007/978-3-540-87481-2_21",
@@ -79,7 +73,6 @@
volume = "5212 LNAI",
year = "2008"
),

abeel2010 = bibentry("article",
author = "Abeel, Thomas and Helleputte, Thibault and Van de Peer, Yves and Dupont, Pierre and Saeys, Yvan",
doi = "10.1093/BIOINFORMATICS/BTP630",
@@ -92,7 +85,6 @@
volume = "26",
year = "2010"
),

pes2020 = bibentry("article",
author = "Pes, Barbara",
doi = "10.1007/s00521-019-04082-3",
@@ -106,7 +98,6 @@
volume = "32",
year = "2020"
),

das1999 = bibentry("article",
author = "Das, I",
issn = "09344373",
@@ -118,5 +109,31 @@
title = "On characterizing the 'knee' of the Pareto curve based on normal-boundary intersection",
volume = "18",
year = "1999"
),
meinshausen2010 = bibentry("article",
author = "Meinshausen, Nicolai and Buhlmann, Peter",
doi = "10.1111/J.1467-9868.2010.00740.X",
eprint = "0809.2932",
issn = "1369-7412",
journal = "Journal of the Royal Statistical Society Series B: Statistical Methodology",
month = "sep",
number = "4",
pages = "417--473",
publisher = "Oxford Academic",
title = "Stability Selection",
volume = "72",
year = "2010"
),
hedou2024 = bibentry("article",
author = "Hedou, Julien and Maric, Ivana and Bellan, Gregoire and Einhaus, Jakob and Gaudilliere, Dyani K. and Ladant, Francois Xavier and Verdonk, Franck and Stelzer, Ina A. and Feyaerts, Dorien and Tsai, Amy S. and Ganio, Edward A. and Sabayev, Maximilian and Gillard, Joshua and Amar, Jonas and Cambriel, Amelie and Oskotsky, Tomiko T. and Roldan, Alennie and Golob, Jonathan L. and Sirota, Marina and Bonham, Thomas A. and Sato, Masaki and Diop, Maigane and Durand, Xavier and Angst, Martin S. and Stevenson, David K. and Aghaeepour, Nima and Montanari, Andrea and Gaudilliere, Brice", #nolint
doi = "10.1038/s41587-023-02033-x",
issn = "1546-1696",
journal = "Nature Biotechnology 2024",
month = "jan",
pages = "1--13",
publisher = "Nature Publishing Group",
title = "Discovery of sparse, reliable omic biomarkers with Stabl",
url = "https://www.nature.com/articles/s41587-023-02033-x",
year = "2024"
)
)
112 changes: 112 additions & 0 deletions R/embedded_ensemble_fselect.R
@@ -0,0 +1,112 @@
#' @title Embedded Ensemble Feature Selection
#'
#' @include CallbackBatchFSelect.R
#'
#' @description
#' Ensemble feature selection using multiple learners.
#' The ensemble feature selection method is designed to identify the most predictive features from a given dataset by leveraging multiple machine learning models and resampling techniques.
#' Returns an [EnsembleFSResult].
#'
#' @details
#' The method begins by applying an initial resampling technique, specified by the user, to create **multiple subsamples** from the original dataset (train/test splits).
#' This resampling process helps generate diverse subsets of data for robust feature selection.
#'
#' For each subsample (train set) generated in the previous step, the method applies learners
#' that support **embedded feature selection**.
#' Each learner is trained on the train set and scored on its ability to
#' predict on the corresponding test set; the features selected during
#' training are stored for each combination of subsample and learner.
#'
#' Results are stored in an [EnsembleFSResult].
#'
#' @param learners (list of [mlr3::Learner])\cr
#' The learners to be used for feature selection.
#' All learners must have the `selected_features` property, i.e. implement
#' embedded feature selection (e.g. regularized models).
#' @param init_resampling ([mlr3::Resampling])\cr
#' The initial resampling strategy of the data, from which each train set
#' will be passed on to the learners and each test set will be used for
#' prediction.
#' Can only be [mlr3::ResamplingSubsampling] or [mlr3::ResamplingBootstrap].
#' @param measure ([mlr3::Measure])\cr
#' The measure used to score each learner on the test sets generated by
#' `init_resampling`.
#' If `NULL`, the task's default measure is used.
#' @param store_benchmark_result (`logical(1)`)\cr
#' Whether to store the benchmark result in [EnsembleFSResult] or not.
#'
#' @template param_task
#'
#' @returns an [EnsembleFSResult] object.
#'
#' @source
#' `r format_bib("meinshausen2010", "hedou2024")`
#' @export
#' @examples
#' \donttest{
#' eefsr = embedded_ensemble_fselect(
#' task = tsk("sonar"),
#' learners = lrns(c("classif.rpart", "classif.featureless")),
#' init_resampling = rsmp("subsampling", repeats = 5),
#' measure = msr("classif.ce")
#' )
#' eefsr
#' }
embedded_ensemble_fselect = function(
task,
learners,
init_resampling,
measure,
store_benchmark_result = TRUE
) {
assert_task(task)
assert_learners(as_learners(learners), task = task, properties = "selected_features")
assert_resampling(init_resampling)
assert_choice(class(init_resampling)[1], choices = c("ResamplingBootstrap", "ResamplingSubsampling"))
assert_measure(measure, task = task)
assert_flag(store_benchmark_result)

init_resampling$instantiate(task)

design = benchmark_grid(
tasks = task,
learners = learners,
resamplings = init_resampling
)

bmr = benchmark(design, store_models = TRUE)

trained_learners = bmr$score()$learner

# extract selected features
features = map(trained_learners, function(learner) {
learner$selected_features()
})

# extract n_features
n_features = map_int(features, length)

# extract scores on the test sets
scores = bmr$score(measure)

set(scores, j = "features", value = features)
set(scores, j = "n_features", value = n_features)
setnames(scores, "iteration", "resampling_iteration")

# remove R6 objects
set(scores, j = "learner", value = NULL)
set(scores, j = "task", value = NULL)
set(scores, j = "resampling", value = NULL)
set(scores, j = "prediction_test", value = NULL)
set(scores, j = "task_id", value = NULL)
set(scores, j = "nr", value = NULL)
set(scores, j = "resampling_id", value = NULL)
set(scores, j = "uhash", value = NULL)

EnsembleFSResult$new(
result = scores,
features = task$feature_names,
benchmark_result = if (store_benchmark_result) bmr,
measure = measure
)
}
