Skip to content

Commit

Permalink
efs updates (#105)
Browse files Browse the repository at this point in the history
* export class

* correct doc

* add args minimize, measure_var and active field nlearners

* add pareto_front method

* add export in NAMESPACE

* improve doc a bit

* update docs

* add more tests

* Update R/EnsembleFSResult.R

Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com>

* Update R/EnsembleFSResult.R

Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com>

* Update R/EnsembleFSResult.R

Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com>

* fix variable names

* order also by measure (pareto points may have the same num of features)

* more realistic pareto front in the test

* add Das 'knee' paper

* add "knee_points" method

* fix method name

* change stability args to a list

* check stability_args

* style change

* fix test

* suppress warnings

* 'featureless' learner produces same number of features as best in the RFE

* update docs

* correct method name and update docs

* add tests for knee_points

* fix bug in "knee_points" (didn't work properly for measures like accuracy)

---------

Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com>
  • Loading branch information
bblodfon and be-marc authored Jun 21, 2024
1 parent d37184a commit 6d032dc
Show file tree
Hide file tree
Showing 5 changed files with 356 additions and 24 deletions.
179 changes: 169 additions & 10 deletions R/EnsembleFSResult.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#'
#' @description
#' The `EnsembleFSResult` stores the results of ensemble feature selection.
#' It includes methods for evaluating the stability of the feature selection process and for ranking the selected features.
#' Among others, it includes methods for evaluating the stability of the feature selection process and for ranking the selected features.
#' The function [ensemble_fselect()] returns an object of this class.
#'
#' @section S3 Methods:
Expand All @@ -15,6 +15,9 @@
#' * `benchmark_result` (`logical(1)`)\cr
#' Whether to add the learner, task and resampling information from the benchmark result.
#'
#' @references
#' `r format_bib("das1999")`
#'
#' @export
#' @examples
#' \donttest{
Expand All @@ -39,6 +42,9 @@
#'
#' # returns a ranking of all features
#' head(efsr$feature_ranking())
#'
#' # returns the empirical pareto front (nfeatures vs error)
#' efsr$pareto_front()
#' }
EnsembleFSResult = R6Class("EnsembleFSResult",
public = list(
Expand All @@ -56,18 +62,26 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
#'
#' @param result ([data.table::data.table])\cr
#' The result of the ensemble feature selection.
#' Column names should include `"resampling_id"`, `"learner_id"`, `"features"`
#' Column names should include `"resampling_iteration"`, `"learner_id"`, `"features"`
#' and `"n_features"`.
#' @param features ([character()])\cr
#' The vector of features of the task that was used in the ensemble feature
#' selection.
#' @param benchmark_result ([mlr3::BenchmarkResult])\cr
#' The benchmark result object.
initialize = function(result, features, benchmark_result = NULL) {
#' @param measure_id (`character(1)`)\cr
#' Column name of `"result"` that corresponds to the measure used.
#' @param minimize (`logical(1)`)\cr
#' If `TRUE` (default), lower values of the measure correspond to higher performance.
initialize = function(result, features, benchmark_result = NULL, measure_id,
minimize = TRUE) {
assert_data_table(result)
assert_names(names(result), must.include = c("resampling_iteration", "learner_id", "features", "n_features"))
private$.measure_id = assert_string(measure_id, null.ok = FALSE)
mandatory_columns = c("resampling_iteration", "learner_id", "features", "n_features")
assert_names(names(result), must.include = c(mandatory_columns, measure_id))
private$.result = result
private$.features = assert_character(features, any.missing = FALSE, null.ok = FALSE)
private$.minimize = assert_logical(minimize, null.ok = FALSE)
self$benchmark_result = if (!is.null(benchmark_result)) assert_benchmark_result(benchmark_result)

self$man = "mlr3fselect::ensemble_fs_result"
Expand Down Expand Up @@ -144,7 +158,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
#' The stability measure to be used.
#' One of the measures returned by [stabm::listStabilityMeasures()] in lower case.
#' Default is `"jaccard"`.
#' @param ... (`any`)\cr
#' @param stability_args (`list`)\cr
#' Additional arguments passed to the stability measure function.
#' @param global (`logical(1)`)\cr
#' Whether to calculate the stability globally or for each learner.
Expand All @@ -153,10 +167,16 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
#'
#' @return A `numeric()` value representing the stability of the selected features.
#' Or a `numeric()` vector with the stability of the selected features for each learner.
stability = function(stability_measure = "jaccard", ..., global = TRUE, reset_cache = FALSE) {
stability = function(
stability_measure = "jaccard",
stability_args = NULL,
global = TRUE,
reset_cache = FALSE
) {
funs = stabm::listStabilityMeasures()$Name
keys = tolower(gsub("stability", "", funs))
assert_choice(stability_measure, choices = keys)
assert_list(stability_args, null.ok = TRUE, names = "named")

if (global) {
# cached results
Expand All @@ -165,7 +185,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
}

fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm"))
private$.stability_global[[stability_measure]] = fun(private$.result$features, ...)
private$.stability_global[[stability_measure]] = invoke(fun, features = private$.result$features, .args = stability_args)
private$.stability_global[[stability_measure]]
} else {
# cached results
Expand All @@ -175,10 +195,133 @@ EnsembleFSResult = R6Class("EnsembleFSResult",

fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm"))

tab = private$.result[, list(score = fun(.SD$features, ...)), by = learner_id]
tab = private$.result[, list(score = invoke(fun, features = .SD$features, .args = stability_args)), by = learner_id]
private$.stability_learner[[stability_measure]] = set_names(tab$score, tab$learner_id)
private$.stability_learner[[stability_measure]]
}
},

#' @description
#'
#' This function identifies the **Pareto front** of the ensemble feature
#' selection process, i.e., the set of points that represent the trade-off
#' between the number of features and performance (e.g. classification error).
#'
#' @param type (`character(1)`)\cr
#' Specifies the type of Pareto front to return. See details.
#'
#' @details
#' Two options are available for the Pareto front:
#' - `"empirical"` (default): returns the empirical Pareto front.
#' - `"estimated"`: the Pareto front points are estimated by fitting a linear model with the inverse of the number of features (\eqn{1/x}) as input and the associated performance scores as output.
#' This method is useful when the Pareto points are sparse and the front assumes a convex shape if better performance corresponds to lower measure values (e.g. classification error), or a concave shape otherwise (e.g. classification accuracy).
#' The `estimated` Pareto front will include points for a number of features ranging from 1 up to the maximum number found in the empirical Pareto front.
#'
#' @return A [data.table::data.table] with columns the number of features and the performance that together form the Pareto front.
pareto_front = function(type = "empirical") {
  assert_choice(type, choices = c("empirical", "estimated"))
  result = private$.result
  measure_id = private$.measure_id
  minimize = private$.minimize

  # Keep only n_features and performance scores
  cols_to_keep = c("n_features", measure_id)
  data = result[, ..cols_to_keep]

  # Order by number of features; within ties, put the best score last so
  # that every successive strict improvement becomes its own Pareto point
  # (Pareto points may share the same number of features)
  data = if (minimize)
    data[order(n_features, -get(measure_id))]
  else
    data[order(n_features, get(measure_id))]

  # A point belongs to the front iff it strictly improves on the best
  # score seen so far. Vectorized via a shifted cumulative min/max
  # instead of growing the table with rbind() in a loop, which is O(n^2).
  scores = data[[measure_id]]
  if (minimize) {
    # best score among all preceding rows (Inf before the first row,
    # so the front always contains at least one point)
    best_so_far = c(Inf, cummin(scores)[-length(scores)])
    keep = scores < best_so_far
  } else {
    best_so_far = c(-Inf, cummax(scores)[-length(scores)])
    keep = scores > best_so_far
  }
  pf = data[keep]

  if (type == "estimated") {
    # Transform the data (x => 1/x) for the linear fit
    pf[, n_features_inv := 1 / n_features]

    # Fit: performance ~ 1 / n_features
    form = mlr3misc::formulate(lhs = measure_id, rhs = "n_features_inv")
    model = stats::lm(formula = form, data = pf)

    # Predict values using the model to create a smooth curve
    # over 1 .. max(n_features) of the empirical data
    pf_pred = data.table(n_features = seq(1, max(data$n_features)))
    pf_pred[, n_features_inv := 1 / n_features]
    pf_pred[, (measure_id) := predict(model, newdata = pf_pred)]
    pf_pred$n_features_inv = NULL
    pf = pf_pred
  }

  pf
},

#' @description
#'
#' This function implements various *knee* point identification (KPI) methods, which select points in the Pareto front, such that an optimal trade-off between performance and number of features is achieved.
#' In most cases, only one such point is returned.
#'
#' @details
#' The available KPI methods are:
#'
#' - `"NBI"` (default): The **Normal-Boundary Intersection** method is a geometry-based method which calculates the perpendicular distance of each point from the line connecting the first and last points of the Pareto front.
#' The knee point is determined as the Pareto point with the maximum distance from this line, see Das (1999).
#'
#' @param method (`character(1)`)\cr
#' Type of method to use to identify the knee point. See details.
#' @param type (`character(1)`)\cr
#' Specifies the type of Pareto front to use for the identification of the knee point.
#' See `pareto_front()` method for more details.
#'
#' @return A [data.table::data.table] with the knee point(s) of the Pareto front.
knee_points = function(method = "NBI", type = "empirical") {
  assert_choice(method, choices = c("NBI"))
  assert_choice(type, choices = c("empirical", "estimated"))
  measure_id = private$.measure_id
  minimize = private$.minimize

  pf = self$pareto_front(type = type)

  # A front with fewer than two points defines no trade-off line and the
  # min-max normalization below would divide by zero (NaN distances), so
  # return the single point (or the empty front) as-is.
  if (nrow(pf) < 2) return(pf)

  # Scale the Pareto front data to (0-1) range so that both axes
  # contribute equally to the distance
  pf_norm = pf[, .(
    nfeats_norm = (n_features - min(n_features)) / (max(n_features) - min(n_features)),
    perf_norm = (get(measure_id) - min(get(measure_id))) / (max(get(measure_id)) - min(get(measure_id)))
  )]

  if (minimize) {
    # The two edge points in the Pareto front are: (0,1) and (1,0)
    # They define the line (x + y - 1 = 0) and their distance is sqrt(2)
    pf_norm[, dist_to_line := abs(nfeats_norm + perf_norm - 1) / sqrt(2)]
  } else {
    # The two edge points in the Pareto front are: (0,0) and (1,1)
    # They define the line (y - x = 0) and their distance is sqrt(2)
    pf_norm[, dist_to_line := abs(nfeats_norm - perf_norm) / sqrt(2)]
  }

  # knee point is the one with the maximum distance from the line
  knee_index = which_max(pf_norm[, dist_to_line], ties_method = "first")
  pf[knee_index]
}
),

Expand All @@ -191,15 +334,31 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
if (is.null(self$benchmark_result)) return(private$.result)
tab = as.data.table(self$benchmark_result)[, c("task", "learner", "resampling"), with = FALSE]
cbind(private$.result, tab)
},

#' @field n_learners (`numeric(1)`)\cr
#' Returns the number of learners used in the ensemble feature selection.
n_learners = function(rhs) {
assert_ro_binding(rhs)
uniqueN(private$.result$learner_id)
},

#' @field measure (`character(1)`)\cr
#' Returns the measure id used in the ensemble feature selection.
measure = function(rhs) {
assert_ro_binding(rhs)
private$.measure_id
}
),

private = list(
.result = NULL,
.result = NULL, # with no R6 classes
.stability_global = NULL,
.stability_learner = NULL,
.feature_ranking = NULL,
.features = NULL
.features = NULL,
.measure_id = NULL,
.minimize = NULL
)
)

Expand Down
14 changes: 14 additions & 0 deletions R/bibentries.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,5 +105,19 @@ bibentries = c(
title = "Ensemble feature selection for high-dimensional data: a stability analysis across multiple domains",
volume = "32",
year = "2020"
),

das1999 = bibentry("article",
author = "Das, I",
doi = "10.1007/BF01195985/METRICS",
issn = "09344373",
journal = "Structural Optimization",
month = "may",
number = "1-2",
pages = "107--115",
publisher = "Springer",
title = "On characterizing the 'knee' of the Pareto curve based on normal-boundary intersection",
volume = "18",
year = "1999"
)
)
4 changes: 3 additions & 1 deletion R/ensemble_fselect.R
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ ensemble_fselect = function(
EnsembleFSResult$new(
result = grid,
features = task$feature_names,
benchmark_result = if (store_benchmark_result) bmr
benchmark_result = if (store_benchmark_result) bmr,
measure_id = measure$id,
minimize = measure$minimize
)
}
Loading

0 comments on commit 6d032dc

Please sign in to comment.