Skip to content

Commit

Permalink
Merge pull request #22 from rpln/level_ordering_fix
Browse files Browse the repository at this point in the history
Retain original factor level ordering
  • Loading branch information
EmilHvitfeldt authored Aug 12, 2020
2 parents 7638ae4 + 20e758d commit d2c2b29
Show file tree
Hide file tree
Showing 12 changed files with 51 additions and 3 deletions.
2 changes: 2 additions & 0 deletions R/nearmiss.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,11 @@ bake.step_nearmiss <- function(object, new_data, ...) {
with_seed(
seed = object$seed,
code = {
original_levels <- levels(new_data[[object$column]])
new_data <- nearmiss(new_data, object$column,
k = object$neighbors,
under_ratio = object$under_ratio)
new_data[[object$column]] <- factor(new_data[[object$column]], levels = original_levels)
}
)

Expand Down
2 changes: 2 additions & 0 deletions R/rose.R
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,13 @@ bake.step_rose <- function(object, new_data, ...) {
with_seed(
seed = object$seed,
code = {
original_levels <- levels(new_data[[object$column]])
new_data <- ROSE(string2formula(object$column), new_data,
N = majority_size * object$over_ratio,
p = object$minority_prop,
hmult.majo = object$majority_smoothness,
hmult.mino = object$minority_smoothness)$data
new_data[[object$column]] <- factor(new_data[[object$column]], levels = original_levels)
}
)

Expand Down
7 changes: 4 additions & 3 deletions R/tomek.R
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,10 @@ response_0_1 <- function(x) {
ifelse(x == names(sort(table(x)))[1], 1, 0)
}
# Turns 0-1 coded variable back into factor variable
response_0_1_to_org <- function(old, new) {
response_0_1_to_org <- function(old, new, levels) {
ref <- names(sort(table(old)))
names(ref) <- c("1", "0")
factor(unname(ref[as.character(new)]))
factor(unname(ref[as.character(new)]), levels = levels)
}

#' @export
Expand All @@ -155,6 +155,7 @@ bake.step_tomek <- function(object, new_data, ...) {
with_seed(
seed = object$seed,
code = {
original_levels <- levels(new_data[[object$column]])
tomek_data <- ubTomek(X = select(new_data, -!!object$column),
Y = response_0_1(new_data[[object$column]]),
verbose = FALSE)
Expand All @@ -164,7 +165,7 @@ bake.step_tomek <- function(object, new_data, ...) {
new_data0 <- mutate(
tomek_data$X,
!!object$column := response_0_1_to_org(new_data[[object$column]],
tomek_data$Y)
tomek_data$Y, levels = original_levels)
)

as_tibble(new_data0[names(new_data)])
Expand Down
35 changes: 35 additions & 0 deletions tests/testthat/helper-test-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,38 @@ test_multi_minority <- function(step, ...) {
expect_true(all(max(table(rec1_p2$Species)) == 25))
})
}

#' Check that a sampling step retains the original factor level ordering
#'
#' Builds four variants of `circle_example` whose `class` levels are
#' relabelled and/or reordered, runs `step` on each of them inside a recipe,
#' and expects both the levels recorded by the prepped recipe and the levels
#' of the juiced `class` column to equal the levels of the input data.
#'
#' Only checks for two level case.
#' NOTE(review): assumes `circle_example$class` is a two-level factor with
#' unequal class sizes -- confirm against the test data.
#'
#' @param step A recipe step function (e.g. `step_smote`) applied to `class`.
#' @param ... Additional arguments forwarded to `step`. Previously these were
#'   accepted but silently dropped.
test_factor_level_memory <- function(step, ...) {
  # Swap the level labels: the underlying codes are untouched, so the
  # observations trade class names (majority/minority switching).
  relabel_levels <- function(data) {
    levels(data$class) <- rev(levels(data$class))
    data
  }

  # Reverse the level order while keeping each observation's label
  # (alphabetical switching).
  reorder_levels <- function(data) {
    data$class <- factor(data$class, levels = rev(levels(data$class)))
    data
  }

  # 1: untouched, 2: relabelled, 3: reordered, 4: relabelled then reordered.
  circle_example_alt_levels <- list(
    circle_example,
    relabel_levels(circle_example),
    reorder_levels(circle_example),
    reorder_levels(relabel_levels(circle_example))
  )

  test_that(
    "factor levels are not affected by alphabet ordering or class sizes",
    {
      for (example in circle_example_alt_levels) {
        original_levels <- levels(example$class)

        rec_p <- recipe(~ ., data = example) %>%
          step(class, ...) %>%
          prep(training = example)

        # Levels stored by the prepped recipe
        expect_equal(original_levels, rec_p$levels$class$values)
        # Levels of the processed training data
        expect_equal(original_levels, levels(juice(rec_p)$class))
      }
    }
  )
}
1 change: 1 addition & 0 deletions tests/testthat/test-adasyn.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ test_tidy(step_adasyn)
test_over_ratio(step_adasyn)
test_multiclass(step_adasyn)
test_multi_majority(step_adasyn)
test_factor_level_memory(step_adasyn)
1 change: 1 addition & 0 deletions tests/testthat/test-bsmote.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,4 @@ test_over_ratio(step_bsmote, all_neighbors = TRUE)
test_multiclass(step_bsmote)
test_multi_majority(step_bsmote)
test_multi_majority(step_bsmote, all_neighbors = TRUE)
test_factor_level_memory(step_bsmote)
1 change: 1 addition & 0 deletions tests/testthat/test-downsample.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ test_tidy(step_downsample)
test_under_ratio(step_downsample)
test_multiclass(step_downsample)
test_multi_minority(step_downsample)
test_factor_level_memory(step_downsample)
1 change: 1 addition & 0 deletions tests/testthat/test-nearmiss.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ test_tidy(step_nearmiss)
test_under_ratio(step_nearmiss)
test_multiclass(step_nearmiss, rename(iris[-c(1:25, 51:75), ], class = Species))
test_multi_minority(step_nearmiss)
test_factor_level_memory(step_nearmiss)
1 change: 1 addition & 0 deletions tests/testthat/test-rose.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,4 @@ test_na_response(step_rose)
test_seed(step_rose)
test_tidy(step_rose)
test_2_class_only(step_rose)
test_factor_level_memory(step_rose)
1 change: 1 addition & 0 deletions tests/testthat/test-smote.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ test_tidy(step_smote)
test_over_ratio(step_smote)
test_multiclass(step_smote)
test_multi_majority(step_smote)
test_factor_level_memory(step_smote)
1 change: 1 addition & 0 deletions tests/testthat/test-tomek.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ test_character_error(step_tomek)
test_na_response(step_tomek)
test_tidy(step_tomek)
test_2_class_only(step_tomek)
test_factor_level_memory(step_tomek)
1 change: 1 addition & 0 deletions tests/testthat/test-upsample.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,4 @@ test_tidy(step_upsample)
test_over_ratio(step_upsample)
test_multiclass(step_upsample)
test_multi_majority(step_upsample)
test_factor_level_memory(step_upsample)

0 comments on commit d2c2b29

Please sign in to comment.