Skip to content

Commit

Permalink
efs updates (#105)
Browse files Browse the repository at this point in the history
* export class

* correct doc

* add args minimize, measure_var and active field nlearners

* add pareto_front method

* add export in NAMESPACE

* improve doc a bit

* update docs

* add more tests

* Update R/EnsembleFSResult.R

Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com>

* Update R/EnsembleFSResult.R

Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com>

* Update R/EnsembleFSResult.R

Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com>

* fix variable names

* order also by measure (pareto points may have the same num of features)

* more realistic pareto front in the test

* add Das 'knee' paper

* add "knee_points" method

* fix method name

* change stability args to a list

* check stability_args

* style change

* fix test

* suppress warnings

* 'featureless' learner produces same number of features as best in the RFE

* update docs

* correct method name and update docs

* add tests for knee_points

* fix bug in "knee_points" (didn't work properly for measures like accuracy)

---------

Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com>
  • Loading branch information
bblodfon and be-marc authored Jun 21, 2024
1 parent d37184a commit 6d032dc
Show file tree
Hide file tree
Showing 5 changed files with 356 additions and 24 deletions.
179 changes: 169 additions & 10 deletions R/EnsembleFSResult.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#'
#' @description
#' The `EnsembleFSResult` stores the results of ensemble feature selection.
#' It includes methods for evaluating the stability of the feature selection process and for ranking the selected features.
#' Among others, it includes methods for evaluating the stability of the feature selection process and for ranking the selected features.
#' The function [ensemble_fselect()] returns an object of this class.
#'
#' @section S3 Methods:
Expand All @@ -15,6 +15,9 @@
#' * `benchmark_result` (`logical(1)`)\cr
#' Whether to add the learner, task and resampling information from the benchmark result.
#'
#' @references
#' `r format_bib("das1999")`
#'
#' @export
#' @examples
#' \donttest{
Expand All @@ -39,6 +42,9 @@
#'
#' # returns a ranking of all features
#' head(efsr$feature_ranking())
#'
#' # returns the empirical pareto front (nfeatures vs error)
#' efsr$pareto_front()
#' }
EnsembleFSResult = R6Class("EnsembleFSResult",
public = list(
Expand All @@ -56,18 +62,26 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
#'
#' @param result ([data.table::data.table])\cr
#' The result of the ensemble feature selection.
#' Column names should include `"resampling_id"`, `"learner_id"`, `"features"`
#' Column names should include `"resampling_iteration"`, `"learner_id"`, `"features"`
#' and `"n_features"`.
#' @param features ([character()])\cr
#' The vector of features of the task that was used in the ensemble feature
#' selection.
#' @param benchmark_result ([mlr3::BenchmarkResult])\cr
#' The benchmark result object.
initialize = function(result, features, benchmark_result = NULL) {
#' @param measure_id (`character(1)`)\cr
#' Column name of `"result"` that corresponds to the measure used.
#' @param minimize (`logical(1)`)\cr
#' If `TRUE` (default), lower values of the measure correspond to higher performance.
initialize = function(result, features, benchmark_result = NULL, measure_id,
minimize = TRUE) {
assert_data_table(result)
assert_names(names(result), must.include = c("resampling_iteration", "learner_id", "features", "n_features"))
private$.measure_id = assert_string(measure_id, null.ok = FALSE)
mandatory_columns = c("resampling_iteration", "learner_id", "features", "n_features")
assert_names(names(result), must.include = c(mandatory_columns, measure_id))
private$.result = result
private$.features = assert_character(features, any.missing = FALSE, null.ok = FALSE)
private$.minimize = assert_logical(minimize, null.ok = FALSE)
self$benchmark_result = if (!is.null(benchmark_result)) assert_benchmark_result(benchmark_result)

self$man = "mlr3fselect::ensemble_fs_result"
Expand Down Expand Up @@ -144,7 +158,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
#' The stability measure to be used.
#' One of the measures returned by [stabm::listStabilityMeasures()] in lower case.
#' Default is `"jaccard"`.
#' @param ... (`any`)\cr
#' @param stability_args (`list`)\cr
#' Additional arguments passed to the stability measure function.
#' @param global (`logical(1)`)\cr
#' Whether to calculate the stability globally or for each learner.
Expand All @@ -153,10 +167,16 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
#'
#' @return A `numeric()` value representing the stability of the selected features.
#' Or a `numeric()` vector with the stability of the selected features for each learner.
stability = function(stability_measure = "jaccard", ..., global = TRUE, reset_cache = FALSE) {
stability = function(
stability_measure = "jaccard",
stability_args = NULL,
global = TRUE,
reset_cache = FALSE
) {
funs = stabm::listStabilityMeasures()$Name
keys = tolower(gsub("stability", "", funs))
assert_choice(stability_measure, choices = keys)
assert_list(stability_args, null.ok = TRUE, names = "named")

if (global) {
# cached results
Expand All @@ -165,7 +185,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
}

fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm"))
private$.stability_global[[stability_measure]] = fun(private$.result$features, ...)
private$.stability_global[[stability_measure]] = invoke(fun, features = private$.result$features, .args = stability_args)
private$.stability_global[[stability_measure]]
} else {
# cached results
Expand All @@ -175,10 +195,133 @@ EnsembleFSResult = R6Class("EnsembleFSResult",

fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm"))

tab = private$.result[, list(score = fun(.SD$features, ...)), by = learner_id]
tab = private$.result[, list(score = invoke(fun, features = .SD$features, .args = stability_args)), by = learner_id]
private$.stability_learner[[stability_measure]] = set_names(tab$score, tab$learner_id)
private$.stability_learner[[stability_measure]]
}
},

#' @description
#'
#' This function identifies the **Pareto front** of the ensemble feature
#' selection process, i.e., the set of points that represent the trade-off
#' between the number of features and performance (e.g. classification error).
#'
#' @param type (`character(1)`)\cr
#' Specifies the type of Pareto front to return. See details.
#'
#' @details
#' Two options are available for the Pareto front:
#' - `"empirical"` (default): returns the empirical Pareto front.
#' - `"estimated"`: the Pareto front points are estimated by fitting a linear model with the inverse of the number of features (\eqn{1/x}) as input and the associated performance scores as output.
#' This method is useful when the Pareto points are sparse and the front assumes a convex shape if better performance corresponds to lower measure values (e.g. classification error), or a concave shape otherwise (e.g. classification accuracy).
#' The `estimated` Pareto front will include points for a number of features ranging from 1 up to the maximum number found in the empirical Pareto front.
#'
#' @return A [data.table::data.table] with columns the number of features and the performance that together form the Pareto front.
pareto_front = function(type = "empirical") {
  assert_choice(type, choices = c("empirical", "estimated"))
  result = private$.result
  measure_id = private$.measure_id
  minimize = private$.minimize

  # Keep only n_features and performance scores
  cols_to_keep = c("n_features", measure_id)
  data = result[, ..cols_to_keep]

  # Order by number of features; within ties, put the best score last so
  # that every successive strict improvement becomes its own Pareto point
  # (Pareto points may share the same number of features)
  data = if (minimize)
    data[order(n_features, -get(measure_id))]
  else
    data[order(n_features, get(measure_id))]

  # A point belongs to the front iff it strictly improves on the best
  # score seen so far. Vectorized via a shifted cumulative min/max
  # instead of growing the table with rbind() in a loop, which is O(n^2).
  scores = data[[measure_id]]
  if (minimize) {
    # best score among all preceding rows (Inf before the first row,
    # so the front always contains at least one point)
    best_so_far = c(Inf, cummin(scores)[-length(scores)])
    keep = scores < best_so_far
  } else {
    best_so_far = c(-Inf, cummax(scores)[-length(scores)])
    keep = scores > best_so_far
  }
  pf = data[keep]

  if (type == "estimated") {
    # Transform the data (x => 1/x) for the linear fit
    pf[, n_features_inv := 1 / n_features]

    # Fit: performance ~ 1 / n_features
    form = mlr3misc::formulate(lhs = measure_id, rhs = "n_features_inv")
    model = stats::lm(formula = form, data = pf)

    # Predict values using the model to create a smooth curve
    # over 1 .. max(n_features) of the empirical data
    pf_pred = data.table(n_features = seq(1, max(data$n_features)))
    pf_pred[, n_features_inv := 1 / n_features]
    pf_pred[, (measure_id) := predict(model, newdata = pf_pred)]
    pf_pred$n_features_inv = NULL
    pf = pf_pred
  }

  pf
},

#' @description
#'
#' This function implements various *knee* point identification (KPI) methods, which select points in the Pareto front, such that an optimal trade-off between performance and number of features is achieved.
#' In most cases, only one such point is returned.
#'
#' @details
#' The available KPI methods are:
#'
#' - `"NBI"` (default): The **Normal-Boundary Intersection** method is a geometry-based method which calculates the perpendicular distance of each point from the line connecting the first and last points of the Pareto front.
#' The knee point is determined as the Pareto point with the maximum distance from this line, see Das (1999).
#'
#' @param method (`character(1)`)\cr
#' Type of method to use to identify the knee point. See details.
#' @param type (`character(1)`)\cr
#' Specifies the type of Pareto front to use for the identification of the knee point.
#' See `pareto_front()` method for more details.
#'
#' @return A [data.table::data.table] with the knee point(s) of the Pareto front.
knee_points = function(method = "NBI", type = "empirical") {
  assert_choice(method, choices = c("NBI"))
  assert_choice(type, choices = c("empirical", "estimated"))
  measure_id = private$.measure_id
  minimize = private$.minimize

  pf = self$pareto_front(type = type)

  # A front with fewer than two points defines no trade-off line and the
  # min-max normalization below would divide by zero (NaN distances), so
  # return the single point (or the empty front) as-is.
  if (nrow(pf) < 2) return(pf)

  # Scale the Pareto front data to (0-1) range so that both axes
  # contribute equally to the distance
  pf_norm = pf[, .(
    nfeats_norm = (n_features - min(n_features)) / (max(n_features) - min(n_features)),
    perf_norm = (get(measure_id) - min(get(measure_id))) / (max(get(measure_id)) - min(get(measure_id)))
  )]

  if (minimize) {
    # The two edge points in the Pareto front are: (0,1) and (1,0)
    # They define the line (x + y - 1 = 0) and their distance is sqrt(2)
    pf_norm[, dist_to_line := abs(nfeats_norm + perf_norm - 1) / sqrt(2)]
  } else {
    # The two edge points in the Pareto front are: (0,0) and (1,1)
    # They define the line (y - x = 0) and their distance is sqrt(2)
    pf_norm[, dist_to_line := abs(nfeats_norm - perf_norm) / sqrt(2)]
  }

  # knee point is the one with the maximum distance from the line
  knee_index = which_max(pf_norm[, dist_to_line], ties_method = "first")
  pf[knee_index]
}
),

Expand All @@ -191,15 +334,31 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
if (is.null(self$benchmark_result)) return(private$.result)
tab = as.data.table(self$benchmark_result)[, c("task", "learner", "resampling"), with = FALSE]
cbind(private$.result, tab)
},

#' @field n_learners (`numeric(1)`)\cr
#' Returns the number of learners used in the ensemble feature selection.
n_learners = function(rhs) {
assert_ro_binding(rhs)
uniqueN(private$.result$learner_id)
},

#' @field measure (`character(1)`)\cr
#' Returns the measure id used in the ensemble feature selection.
measure = function(rhs) {
assert_ro_binding(rhs)
private$.measure_id
}
),

private = list(
.result = NULL,
.result = NULL, # with no R6 classes
.stability_global = NULL,
.stability_learner = NULL,
.feature_ranking = NULL,
.features = NULL
.features = NULL,
.measure_id = NULL,
.minimize = NULL
)
)

Expand Down
14 changes: 14 additions & 0 deletions R/bibentries.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,5 +105,19 @@ bibentries = c(
title = "Ensemble feature selection for high-dimensional data: a stability analysis across multiple domains",
volume = "32",
year = "2020"
),

das1999 = bibentry("article",
author = "Das, I",
doi = "10.1007/BF01195985/METRICS",
issn = "09344373",
journal = "Structural Optimization",
month = "may",
number = "1-2",
pages = "107--115",
publisher = "Springer",
title = "On characterizing the 'knee' of the Pareto curve based on normal-boundary intersection",
volume = "18",
year = "1999"
)
)
4 changes: 3 additions & 1 deletion R/ensemble_fselect.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ ensemble_fselect = function(
EnsembleFSResult$new(
result = grid,
features = task$feature_names,
benchmark_result = if (store_benchmark_result) bmr
benchmark_result = if (store_benchmark_result) bmr,
measure_id = measure$id,
minimize = measure$minimize
)
}
Loading

0 comments on commit 6d032dc

Please sign in to comment.