From 6d032dcc0454a641f223c9993b015f710db35769 Mon Sep 17 00:00:00 2001 From: John Zobolas Date: Fri, 21 Jun 2024 19:43:53 +0200 Subject: [PATCH] efs updates (#105) * export class * correct doc * add args minimize, measure_var and active field nlearners * add pareto_front method * add export in NAMESPACE * improve doc a bit * update docs * add more tests * Update R/EnsembleFSResult.R Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com> * Update R/EnsembleFSResult.R Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com> * Update R/EnsembleFSResult.R Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com> * fix variable names * order also by measure (pareto points may have the same num of features) * more realistic pareto front in the test * add Das 'knee' paper * add "knee_points" method * fix method name * change stability args to a list * check stability_args * style change * fix test * supress warnings * 'featureless' learner produces same number of features as best in the RFE * update docs * correct method name and update docs * add tests for knee_points * fix bug in "knee_points" (didn't work properly for measures like accuracy) --------- Co-authored-by: Marc Becker <33069354+be-marc@users.noreply.github.com> --- R/EnsembleFSResult.R | 179 +++++++++++++++++++++++-- R/bibentries.R | 14 ++ R/ensemble_fselect.R | 4 +- man/ensemble_fs_result.Rd | 106 ++++++++++++++- tests/testthat/test_ensemble_fselect.R | 77 +++++++++-- 5 files changed, 356 insertions(+), 24 deletions(-) diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R index da9a6d3c..f689232b 100644 --- a/R/EnsembleFSResult.R +++ b/R/EnsembleFSResult.R @@ -4,7 +4,7 @@ #' #' @description #' The `EnsembleFSResult` stores the results of ensemble feature selection. -#' It includes methods for evaluating the stability of the feature selection process and for ranking the selected features. 
+#' It includes methods for evaluating the stability of the feature selection process and for ranking the selected features among others. #' The function [ensemble_fselect()] returns an object of this class. #' #' @section S3 Methods: @@ -15,6 +15,9 @@ #' * `benchmark_result` (`logical(1)`)\cr #' Whether to add the learner, task and resampling information from the benchmark result. #' +#' @references +#' `r format_bib("das1999")` +#' #' @export #' @examples #' \donttest{ @@ -39,6 +42,9 @@ #' #' # returns a ranking of all features #' head(efsr$feature_ranking()) +#' +#' # returns the empirical pareto front (nfeatures vs error) +#' efsr$pareto_front() #' } EnsembleFSResult = R6Class("EnsembleFSResult", public = list( @@ -56,18 +62,26 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' #' @param result ([data.table::data.table])\cr #' The result of the ensemble feature selection. - #' Column names should include `"resampling_id"`, `"learner_id"`, `"features"` + #' Column names should include `"resampling_iteration"`, `"learner_id"`, `"features"` #' and `"n_features"`. #' @param features ([character()])\cr #' The vector of features of the task that was used in the ensemble feature #' selection. #' @param benchmark_result ([mlr3::BenchmarkResult])\cr #' The benchmark result object. - initialize = function(result, features, benchmark_result = NULL) { + #' @param measure_id (`character(1)`)\cr + #' Column name of `"result"` that corresponds to the measure used. + #' @param minimize (`logical(1)`)\cr + #' If `TRUE` (default), lower values of the measure correspond to higher performance. 
+ initialize = function(result, features, benchmark_result = NULL, measure_id, + minimize = TRUE) { assert_data_table(result) - assert_names(names(result), must.include = c("resampling_iteration", "learner_id", "features", "n_features")) + private$.measure_id = assert_string(measure_id, null.ok = FALSE) + mandatory_columns = c("resampling_iteration", "learner_id", "features", "n_features") + assert_names(names(result), must.include = c(mandatory_columns, measure_id)) private$.result = result private$.features = assert_character(features, any.missing = FALSE, null.ok = FALSE) + private$.minimize = assert_logical(minimize, null.ok = FALSE) self$benchmark_result = if (!is.null(benchmark_result)) assert_benchmark_result(benchmark_result) self$man = "mlr3fselect::ensemble_fs_result" @@ -144,7 +158,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' The stability measure to be used. #' One of the measures returned by [stabm::listStabilityMeasures()] in lower case. #' Default is `"jaccard"`. - #' @param ... (`any`)\cr + #' @param stability_args (`list`)\cr #' Additional arguments passed to the stability measure function. #' @param global (`logical(1)`)\cr #' Whether to calculate the stability globally or for each learner. @@ -153,10 +167,16 @@ EnsembleFSResult = R6Class("EnsembleFSResult", #' #' @return A `numeric()` value representing the stability of the selected features. #' Or a `numeric()` vector with the stability of the selected features for each learner. 
- stability = function(stability_measure = "jaccard", ..., global = TRUE, reset_cache = FALSE) { + stability = function( + stability_measure = "jaccard", + stability_args = NULL, + global = TRUE, + reset_cache = FALSE + ) { funs = stabm::listStabilityMeasures()$Name keys = tolower(gsub("stability", "", funs)) assert_choice(stability_measure, choices = keys) + assert_list(stability_args, null.ok = TRUE, names = "named") if (global) { # cached results @@ -165,7 +185,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult", } fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm")) - private$.stability_global[[stability_measure]] = fun(private$.result$features, ...) + private$.stability_global[[stability_measure]] = invoke(fun, features = private$.result$features, .args = stability_args) private$.stability_global[[stability_measure]] } else { # cached results @@ -175,10 +195,133 @@ EnsembleFSResult = R6Class("EnsembleFSResult", fun = get(funs[which(stability_measure == keys)], envir = asNamespace("stabm")) - tab = private$.result[, list(score = fun(.SD$features, ...)), by = learner_id] + tab = private$.result[, list(score = invoke(fun, features = .SD$features, .args = stability_args)), by = learner_id] private$.stability_learner[[stability_measure]] = set_names(tab$score, tab$learner_id) private$.stability_learner[[stability_measure]] } + }, + + #' @description + #' + #' This function identifies the **Pareto front** of the ensemble feature + #' selection process, i.e., the set of points that represent the trade-off + #' between the number of features and performance (e.g. classification error). + #' + #' @param type (`character(1)`)\cr + #' Specifies the type of Pareto front to return. See details. + #' + #' @details + #' Two options are available for the Pareto front: + #' - `"empirical"` (default): returns the empirical Pareto front. 
+ #' - `"estimated"`: the Pareto front points are estimated by fitting a linear model with the inverse of the number of features (\eqn{1/x}) as input and the associated performance scores as output. + #' This method is useful when the Pareto points are sparse and the front assumes a convex shape if better performance corresponds to lower measure values (e.g. classification error), or a concave shape otherwise (e.g. classification accuracy). + #' The `estimated` Pareto front will include points for a number of features ranging from 1 up to the maximum number of features found in the result. + #' + #' @return A [data.table::data.table] with columns the number of features and the performance that together form the Pareto front. + pareto_front = function(type = "empirical") { + assert_choice(type, choices = c("empirical", "estimated")) + result = private$.result + measure_id = private$.measure_id + minimize = private$.minimize + + # Keep only n_features and performance scores + cols_to_keep = c("n_features", measure_id) + data = result[, ..cols_to_keep] + + # Order data according to the measure + data = if (minimize) + data[order(n_features, -get(measure_id))] + else + data[order(n_features, get(measure_id))] + + # Initialize the Pareto front + pf = data.table(n_features = numeric(0)) + pf[, (measure_id) := numeric(0)] + + # Initialize the best performance to a large number so + # that the Pareto front has at least one point + best_score = if (minimize) Inf else -Inf + + for (i in seq_row(data)) { + # Determine the condition based on minimize + if (minimize) { + condition = data[[measure_id]][i] < best_score + } else { + condition = data[[measure_id]][i] > best_score + } + + if (condition) { + pf = rbind(pf, data[i]) + best_score = data[[measure_id]][i] + } + } + + if (type == "estimated") { + # Transform the data (x => 1/x) + pf[, n_features_inv := 1 / n_features] + + # Fit the linear model + form = mlr3misc::formulate(lhs = measure_id, rhs = "n_features_inv") +
model = stats::lm(formula = form, data = pf) + + # Predict values using the model to create a smooth curve + pf_pred = data.table(n_features = seq(1, max(data$n_features))) + pf_pred[, n_features_inv := 1 / n_features] + pf_pred[, (measure_id) := predict(model, newdata = pf_pred)] + pf_pred$n_features_inv = NULL + pf = pf_pred + } + + pf + }, + + #' @description + #' + #' This function implements various *knee* point identification (KPI) methods, which select points in the Pareto front, such that an optimal trade-off between performance and number of features is achieved. + #' In most cases, only one such point is returned. + #' + #' @details + #' The available KPI methods are: + #' + #' - `"NBI"` (default): The **Normal-Boundary Intersection** method is a geometry-based method which calculates the perpendicular distance of each point from the line connecting the first and last points of the Pareto front. + #' The knee point is determined as the Pareto point with the maximum distance from this line, see Das (1999). + #' + #' @param method (`character(1)`)\cr + #' Type of method to use to identify the knee point. See details. + #' @param type (`character(1)`)\cr + #' Specifies the type of Pareto front to use for the identification of the knee point. + #' See `pareto_front()` method for more details. + #' + #' @return A [data.table::data.table] with the knee point(s) of the Pareto front. 
+ knee_points = function(method = "NBI", type = "empirical") { + assert_choice(method, choices = c("NBI")) + assert_choice(type, choices = c("empirical", "estimated")) + measure_id = private$.measure_id + minimize = private$.minimize + + pf = if (type == "empirical") self$pareto_front() else self$pareto_front(type = "estimated") + + # Scale the Pareto front data to (0-1) range + pf_norm = pf[, .( + nfeats_norm = (n_features - min(n_features)) /(max(n_features) - min(n_features)), + perf_norm = (get(measure_id) - min(get(measure_id))) / (max(get(measure_id)) - min(get(measure_id))) + )] + + if (minimize) { + # The two edge points in the Pareto front are: (0,1) and (1,0) + # They define the line (x + y - 1 = 0) and their distance is sqrt(2) + pf_norm[, dist_to_line := abs(nfeats_norm + perf_norm - 1)/sqrt(2)] + } else { + # The two edge points in the Pareto front are: (0,0) and (1,1) + # They define the line (y - x = 0) and their distance is sqrt(2) + pf_norm[, dist_to_line := abs(nfeats_norm - perf_norm)/sqrt(2)] + } + + # knee point is the one with the maximum distance + knee_index = which_max(pf_norm[, dist_to_line], ties_method = "first") + knee_point = pf[knee_index] + + knee_point } ), @@ -191,15 +334,31 @@ EnsembleFSResult = R6Class("EnsembleFSResult", if (is.null(self$benchmark_result)) return(private$.result) tab = as.data.table(self$benchmark_result)[, c("task", "learner", "resampling"), with = FALSE] cbind(private$.result, tab) + }, + + #' @field n_learners (`numeric(1)`)\cr + #' Returns the number of learners used in the ensemble feature selection. + n_learners = function(rhs) { + assert_ro_binding(rhs) + uniqueN(private$.result$learner_id) + }, + + #' @field measure (`character(1)`)\cr + #' Returns the measure id used in the ensemble feature selection. 
+ measure = function(rhs) { + assert_ro_binding(rhs) + private$.measure_id } ), private = list( - .result = NULL, + .result = NULL, # with no R6 classes .stability_global = NULL, .stability_learner = NULL, .feature_ranking = NULL, - .features = NULL + .features = NULL, + .measure_id = NULL, + .minimize = NULL ) ) diff --git a/R/bibentries.R b/R/bibentries.R index a7ecb1df..71e4d111 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -105,5 +105,19 @@ bibentries = c( title = "Ensemble feature selection for high-dimensional data: a stability analysis across multiple domains", volume = "32", year = "2020" + ), + + das1999 = bibentry("article", + author = "Das, I", + doi = "10.1007/BF01195985", + issn = "09344373", + journal = "Structural Optimization", + month = "may", + number = "1-2", + pages = "107--115", + publisher = "Springer", + title = "On characterizing the 'knee' of the Pareto curve based on normal-boundary intersection", + volume = "18", + year = "1999" ) ) diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R index afde7803..940dc2f5 100644 --- a/R/ensemble_fselect.R +++ b/R/ensemble_fselect.R @@ -141,6 +141,8 @@ ensemble_fselect = function( EnsembleFSResult$new( result = grid, features = task$feature_names, - benchmark_result = if (store_benchmark_result) bmr + benchmark_result = if (store_benchmark_result) bmr, + measure_id = measure$id, + minimize = measure$minimize ) } diff --git a/man/ensemble_fs_result.Rd b/man/ensemble_fs_result.Rd index d5720cfb..63a0cbc3 100644 --- a/man/ensemble_fs_result.Rd +++ b/man/ensemble_fs_result.Rd @@ -6,7 +6,7 @@ \title{Ensemble Feature Selection Result} \description{ The \code{EnsembleFSResult} stores the results of ensemble feature selection. -It includes methods for evaluating the stability of the feature selection process and for ranking the selected features. +It includes methods for evaluating the stability of the feature selection process and for ranking the selected features among others.
The function \code{\link[=ensemble_fselect]{ensemble_fselect()}} returns an object of this class. } \section{S3 Methods}{ @@ -46,7 +46,16 @@ Whether to add the learner, task and resampling information from the benchmark r # returns a ranking of all features head(efsr$feature_ranking()) + + # returns the empirical pareto front (nfeatures vs error) + efsr$pareto_front() +} } +\references{ +Das, I (1999). +\dQuote{On characterizing the 'knee' of the Pareto curve based on normal-boundary intersection.} +\emph{Structural Optimization}, \bold{18}(1-2), 107--115. +ISSN 09344373, \doi{10.1007/BF01195985}. } \section{Public fields}{ \if{html}{\out{
}} @@ -64,6 +73,12 @@ Manual page for this object.} \describe{ \item{\code{result}}{(\link[data.table:data.table]{data.table::data.table})\cr Returns the result of the ensemble feature selection.} + +\item{\code{n_learners}}{(\code{numeric(1)})\cr +Returns the number of learners used in the ensemble feature selection.} + +\item{\code{measure}}{(\code{character(1)})\cr +Returns the measure id used in the ensemble feature selection.} } \if{html}{\out{
}} } @@ -76,6 +91,8 @@ Returns the result of the ensemble feature selection.} \item \href{#method-EnsembleFSResult-help}{\code{EnsembleFSResult$help()}} \item \href{#method-EnsembleFSResult-feature_ranking}{\code{EnsembleFSResult$feature_ranking()}} \item \href{#method-EnsembleFSResult-stability}{\code{EnsembleFSResult$stability()}} +\item \href{#method-EnsembleFSResult-pareto_front}{\code{EnsembleFSResult$pareto_front()}} +\item \href{#method-EnsembleFSResult-knee_points}{\code{EnsembleFSResult$knee_points()}} \item \href{#method-EnsembleFSResult-clone}{\code{EnsembleFSResult$clone()}} } } @@ -85,7 +102,13 @@ Returns the result of the ensemble feature selection.} \subsection{Method \code{new()}}{ Creates a new instance of this \link[R6:R6Class]{R6} class. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{EnsembleFSResult$new(result, features, benchmark_result = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{EnsembleFSResult$new( + result, + features, + benchmark_result = NULL, + measure_id, + minimize = TRUE +)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -93,7 +116,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. \describe{ \item{\code{result}}{(\link[data.table:data.table]{data.table::data.table})\cr The result of the ensemble feature selection. -Column names should include \code{"resampling_id"}, \code{"learner_id"}, \code{"features"} +Column names should include \code{"resampling_iteration"}, \code{"learner_id"}, \code{"features"} and \code{"n_features"}.} \item{\code{features}}{(\code{\link[=character]{character()}})\cr @@ -102,6 +125,12 @@ selection.} \item{\code{benchmark_result}}{(\link[mlr3:BenchmarkResult]{mlr3::BenchmarkResult})\cr The benchmark result object.} + +\item{\code{measure_id}}{(\code{character(1)})\cr +Column name of \code{"result"} that corresponds to the measure used.} + +\item{\code{minimize}}{(\code{logical(1)})\cr +If \code{TRUE} (default), lower values of the measure correspond to higher performance.} } \if{html}{\out{}} } @@ -187,7 +216,7 @@ When the same stability measure is requested again with different arguments, the \subsection{Usage}{ \if{html}{\out{
}}\preformatted{EnsembleFSResult$stability( stability_measure = "jaccard", - ..., + stability_args = NULL, global = TRUE, reset_cache = FALSE )}\if{html}{\out{
}} @@ -201,7 +230,7 @@ The stability measure to be used. One of the measures returned by \code{\link[stabm:listStabilityMeasures]{stabm::listStabilityMeasures()}} in lower case. Default is \code{"jaccard"}.} -\item{\code{...}}{(\code{any})\cr +\item{\code{stability_args}}{(\code{list})\cr Additional arguments passed to the stability measure function.} \item{\code{global}}{(\code{logical(1)})\cr @@ -218,6 +247,73 @@ Or a \code{numeric()} vector with the stability of the selected features for eac } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EnsembleFSResult-pareto_front}{}}} +\subsection{Method \code{pareto_front()}}{ +This function identifies the \strong{Pareto front} of the ensemble feature +selection process, i.e., the set of points that represent the trade-off +between the number of features and performance (e.g. classification error). +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{EnsembleFSResult$pareto_front(type = "empirical")}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{type}}{(\code{character(1)})\cr +Specifies the type of Pareto front to return. See details.} +} +\if{html}{\out{
}} +} +\subsection{Details}{ +Two options are available for the Pareto front: +\itemize{ +\item \code{"empirical"} (default): returns the empirical Pareto front. +\item \code{"estimated"}: the Pareto front points are estimated by fitting a linear model with the inverse of the number of features (\eqn{1/x}) as input and the associated performance scores as output. +This method is useful when the Pareto points are sparse and the front assumes a convex shape if better performance corresponds to lower measure values (e.g. classification error), or a concave shape otherwise (e.g. classification accuracy). +The \code{estimated} Pareto front will include points for a number of features ranging from 1 up to the maximum number of features found in the result. +} +} + +\subsection{Returns}{ +A \link[data.table:data.table]{data.table::data.table} with columns the number of features and the performance that together form the Pareto front. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EnsembleFSResult-knee_points}{}}} +\subsection{Method \code{knee_points()}}{ +This function implements various \emph{knee} point identification (KPI) methods, which select points in the Pareto front, such that an optimal trade-off between performance and number of features is achieved. +In most cases, only one such point is returned. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{EnsembleFSResult$knee_points(method = "NBI", type = "empirical")}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{method}}{(\code{character(1)})\cr +Type of method to use to identify the knee point. See details.} + +\item{\code{type}}{(\code{character(1)})\cr +Specifies the type of Pareto front to use for the identification of the knee point. +See \code{pareto_front()} method for more details.} +} +\if{html}{\out{
}} +} +\subsection{Details}{ +The available KPI methods are: +\itemize{ +\item \code{"NBI"} (default): The \strong{Normal-Boundary Intersection} method is a geometry-based method which calculates the perpendicular distance of each point from the line connecting the first and last points of the Pareto front. +The knee point is determined as the Pareto point with the maximum distance from this line, see Das (1999). +} +} + +\subsection{Returns}{ +A \link[data.table:data.table]{data.table::data.table} with the knee point(s) of the Pareto front. +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-EnsembleFSResult-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R index d6a03a0a..c0c68f16 100644 --- a/tests/testthat/test_ensemble_fselect.R +++ b/tests/testthat/test_ensemble_fselect.R @@ -16,9 +16,12 @@ test_that("ensemble feature selection works", { expect_vector(efsr$result$n_features, size = 4) expect_vector(efsr$result$classif.ce, size = 4) expect_benchmark_result(efsr$benchmark_result) + expect_equal(efsr$measure, "classif.ce") + expect_equal(efsr$n_learners, 2) # stability expect_number(efsr$stability(stability_measure = "jaccard")) + expect_error(efsr$stability(stability_args = list(20)), "have names") stability = efsr$stability(stability_measure = "jaccard", global = FALSE) expect_numeric(stability, len = 2) expect_names(names(stability), identical.to = c("classif.rpart", "classif.featureless")) @@ -28,6 +31,19 @@ test_that("ensemble feature selection works", { expect_data_table(feature_ranking, nrows = length(task$feature_names)) expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) + # pareto_front + pf = efsr$pareto_front() + expect_data_table(pf) + expect_equal(names(pf), c("n_features", "classif.ce")) + pf_pred = suppressWarnings(efsr$pareto_front(type = "estimated")) + expect_data_table(pf_pred, nrows = max(efsr$result$n_features)) + expect_equal(names(pf_pred), c("n_features", "classif.ce")) + + # knee_points + kps = efsr$knee_points() + expect_data_table(kps, min.rows = 1) + expect_equal(names(kps), c("n_features", "classif.ce")) + # data.table conversion tab = as.data.table(efsr) expect_names(names(tab), identical.to = c("resampling_iteration", "learner_id", "features", "n_features", "classif.ce", "task", "learner", "resampling")) @@ -52,6 +68,8 @@ test_that("ensemble feature selection works without benchmark result", { expect_vector(efsr$result$n_features, size 
= 4) expect_vector(efsr$result$classif.ce, size = 4) expect_null(efsr$benchmark_result) + expect_equal(efsr$measure, "classif.ce") + expect_equal(efsr$n_learners, 2) # stability expect_number(efsr$stability(stability_measure = "jaccard")) @@ -64,6 +82,19 @@ test_that("ensemble feature selection works without benchmark result", { expect_data_table(feature_ranking, nrows = length(task$feature_names)) expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) + # pareto_front + pf = efsr$pareto_front() + expect_data_table(pf) + expect_equal(names(pf), c("n_features", "classif.ce")) + pf_pred = suppressWarnings(efsr$pareto_front(type = "estimated")) + expect_data_table(pf_pred, nrows = max(efsr$result$n_features)) + expect_equal(names(pf_pred), c("n_features", "classif.ce")) + + # knee_points + kps = efsr$knee_points(type = "estimated") + expect_data_table(kps, min.rows = 1) + expect_equal(names(kps), c("n_features", "classif.ce")) + # data.table conversion tab = as.data.table(efsr) expect_names(names(tab), identical.to = c("resampling_iteration", "learner_id", "features", "n_features", "classif.ce")) @@ -88,6 +119,8 @@ test_that("ensemble feature selection works with rfe", { expect_vector(efsr$result$classif.ce, size = 4) expect_list(efsr$result$importance, any.missing = FALSE, len = 4) expect_benchmark_result(efsr$benchmark_result) + expect_equal(efsr$measure, "classif.ce") + expect_equal(efsr$n_learners, 2) # stability expect_number(efsr$stability(stability_measure = "jaccard")) @@ -100,25 +133,53 @@ test_that("ensemble feature selection works with rfe", { expect_data_table(feature_ranking, nrows = length(task$feature_names)) expect_names(names(feature_ranking), identical.to = c("feature", "inclusion_probability")) + # pareto_front + pf = efsr$pareto_front() + expect_data_table(pf) + expect_equal(names(pf), c("n_features", "classif.ce")) + pf_pred = suppressWarnings(efsr$pareto_front(type = "estimated")) + expect_data_table(pf_pred, 
nrows = max(efsr$result$n_features)) + expect_equal(names(pf_pred), c("n_features", "classif.ce")) + + # knee_points + kps = efsr$knee_points(type = "estimated") + expect_data_table(kps, min.rows = 1) + expect_equal(names(kps), c("n_features", "classif.ce")) + # data.table conversion tab = as.data.table(efsr) expect_names(names(tab), identical.to = c("resampling_iteration", "learner_id", "features", "n_features", "classif.ce", "importance", "task", "learner", "resampling")) }) test_that("EnsembleFSResult initialization", { - features = LETTERS - result = data.table(a = 1) # not proper column name - expect_error(EnsembleFSResult$new(result = result, features = features)) + result = data.table(a = 1, b = 3) + expect_error(EnsembleFSResult$new(result = result, features = LETTERS, measure_id = "a"), "missing elements") + + result = data.table( + resampling_iteration = c(1, 1, 1, 2, 2, 2, 3, 3, 3), + learner_id = rep(c("classif.xgboost", "classif.rpart", "classif.ranger"), 3), + n_features = c(2, 4, 4, 1, 5, 4, 1, 2, 4), + features = list( + c("V3", "V20"), + c("V3", "V5", "V19", "V15"), + c("V11", "V7", "V6", "V8"), + c("V11"), + c("V17", "V2", "V12", "V9", "V1"), + c("V11", "V18", "V9", "V2"), + c("V2"), + c("V4", "V12"), + c("V6", "V15", "V19", "V7")), + classif.ce = c(0.13, 0.24, 0.16, 0.11, 0.25, 0.18, 0.15, 0.1, 0.16) + ) - result = data.table(resampling_iteration = 1:2, learner_id = list("l1", "l2"), - features = list(LETTERS[1], LETTERS[1:3]), - n_features = c(1,3)) # works without benchmark result object - efsr = EnsembleFSResult$new(result = result, features = features) + efsr = EnsembleFSResult$new(result = result, features = paste0("V", 1:20), measure_id = "classif.ce") expect_class(efsr, "EnsembleFSResult") + expect_equal(efsr$n_learners, 3) tab = as.data.table(efsr) expect_data_table(tab) - expect_names(names(tab), identical.to = c("resampling_iteration", "learner_id", "features", "n_features")) + expect_names(names(tab), identical.to = 
c("resampling_iteration", "learner_id", + "n_features", "features", "classif.ce")) }) test_that("different callbacks can be set", {