apache · thisisnic · Jul 15, 2021 · Jul 15, 2021 · Jul 15, 2021 · Jul 15, 2021
diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
@@ -294,8 +294,8 @@ nse_funcs$substr <- function(x, start, stop) {
     msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
   )
 
-  # substr treats values as if they're on a continous number line, so values 
-  # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics 
+  # substr treats values as if they're on a continuous number line, so values
+  # <= 0 are effectively blank characters - set `start` to 1 here so Arrow mimics
   # this behavior
   if (start <= 0) {
     start <- 1
@@ -310,7 +310,7 @@ nse_funcs$substr <- function(x, start, stop) {
   Expression$create(
     "utf8_slice_codeunits",
     x,
-    # we don't need to subtract 1 from `stop` as C++ counts exclusively 
+    # we don't need to subtract 1 from `stop` as C++ counts exclusively
     # which effectively cancels out the difference in indexing between R & C++
     options = list(start = start - 1L, stop = stop)
   )
@@ -336,14 +336,14 @@ nse_funcs$str_sub <- function(string, start = 1L, end = -1L) {
     end <- .Machine$integer.max
   }
 
-  # An end value lower than a start value returns an empty string in 
+  # An end value lower than a start value returns an empty string in
   # stringr::str_sub so set end to 0 here to match this behavior
   if (end < start) {
     end <- 0
   }
 
   # subtract 1 from `start` because C++ is 0-based and R is 1-based
-  # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0 
+  # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0
   # when `start` < 0, both str_sub and utf8_slice_codeunits count backwards from the end
   if (start > 0) {
     start <- start - 1L
@@ -634,20 +634,49 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption
 }
 
 nse_funcs$log <- function(x, base = exp(1)) {
-  
+
   if (base == exp(1)) {
     return(Expression$create("ln_checked", x))
   }
-  
+
   if (base == 2) {
     return(Expression$create("log2_checked", x))
   }
-  
+
   if (base == 10) {
     return(Expression$create("log10_checked", x))
-  } 
+  }
   # ARROW-13345
   stop("`base` values other than exp(1), 2 and 10 not supported in Arrow", call. = FALSE)
 }
 
 nse_funcs$logb <- nse_funcs$log
+
+# Arrow translation of if_else(): builds an "if_else" compute expression.
+# When `missing` is supplied, it is used wherever `condition` is null.
+nse_funcs$if_else <- function(condition, true, false, missing = NULL) {
+  if (!is.null(missing)) {
+    # Nest two calls: null `condition` -> `missing`, else the ordinary if_else
+    return(nse_funcs$if_else(
+      Expression$create("is_null", condition),
+      missing,
+      nse_funcs$if_else(condition, true, false)
+    ))
+  }
+
+  # TODO: remove this warning once if_else supports factors/dictionaries (ARROW-13358)
+  warn_r_types <- is.factor(true) || is.factor(false)
+  # Expression inputs have to be checked with the nse_funcs version of is.factor
+  warn_expr_true <- inherits(true, "Expression") && nse_funcs$is.factor(true)
+  warn_expr_false <- inherits(false, "Expression") && nse_funcs$is.factor(false)
+  if (warn_r_types || warn_expr_true || warn_expr_false) {
+    warning("Factors are currently converted to characters in if_else and ifelse", call. = FALSE)
+  }
+  build_expr("if_else", condition, true, false)
+}
+
+# Although base R ifelse allows `yes` and `no` to be different classes, this
+# simply delegates to if_else, so no autocasting between classes is attempted.
+nse_funcs$ifelse <- function(test, yes, no) {
+  nse_funcs$if_else(condition = test, true = yes, false = no)
+}
diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R
@@ -91,7 +91,8 @@ expect_dplyr_equal <- function(expr,
 
   if (isTRUE(warning)) {
     # Special-case the simple warning:
-    warning <- "not supported in Arrow; pulling data into R"
+    # TODO: ARROW-13362 pick one of in or by and use it everywhere
+    warning <- "not supported (in|by) Arrow; pulling data into R"
   }
 
   skip_msg <- NULL

diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R
@@ -67,7 +67,7 @@ chr: string
 See $.data for the source Arrow object',
   fixed = TRUE
   )
-  
+
 })
 
 test_that("summarize", {
@@ -821,7 +821,7 @@ test_that("type checks on expressions", {
       collect(),
     tbl
   )
-  
+
   # the code in the expectation below depends on RE2
   skip_if_not_available("re2")
 
@@ -947,64 +947,64 @@ test_that("abs()", {
 })
 
 test_that("log functions", {
-  
+
   df <- tibble(x = c(1:10, NA, NA))
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = log(x)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = log(x, base = exp(1))) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = log(x, base = 2)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = log(x, base = 10)) %>%
       collect(),
     df
   )
-  
+
   expect_error(
     nse_funcs$log(Expression$scalar(x), base = 5),
     "`base` values other than exp(1), 2 and 10 not supported in Arrow",
     fixed = TRUE
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = logb(x)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = log1p(x)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = log2(x)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = log10(x)) %>%
@@ -1013,44 +1013,158 @@ test_that("log functions", {
   )
 
 })
-  
+
 test_that("trig functions", {
-  
+
   df <- tibble(x = c(seq(from = 0, to = 1, by = 0.1), NA))
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = sin(x)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = cos(x)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = tan(x)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = asin(x)) %>%
       collect(),
     df
   )
-  
+
   expect_dplyr_equal(
     input %>%
       mutate(y = acos(x)) %>%
       collect(),
     df
   )
 
-})
+})
+
+test_that("if_else and ifelse", {
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(int > 5, 1, 0)
+      ) %>% collect(),
+    example_data
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(int > 5, int, 0L)
+      ) %>% collect(),
+    example_data
+  )
+
+  expect_error(
+    Table$create(example_data) %>%
+      mutate(
+        y = if_else(int > 5, 1, FALSE)
+      ) %>% collect(),
+    'NotImplemented: Function if_else has no kernel matching input types'
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(int > 5, 1, NA_real_)
+      ) %>% collect(),
+    example_data
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = ifelse(int > 5, 1, 0)
+      ) %>% collect(),
+    example_data
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(dbl > 5, TRUE, FALSE)
+      ) %>% collect(),
+    example_data
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(chr %in% letters[1:3], 1L, 3L)
+      ) %>% collect(),
+    example_data
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(int > 5, "one", "zero")
+      ) %>% collect(),
+    example_data
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(int > 5, chr, chr)
+      ) %>% collect(),
+    example_data
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(int > 5, chr, chr, missing = "MISSING")
+      ) %>% collect(),
+    example_data
+  )
+
+  # TODO: remove the mutate + warning after ARROW-13358 is merged and Arrow
+  # supports factors in if(_)else
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(int > 5, fct, factor("a"))
+      ) %>% collect() %>%
+      # This is a no-op on the Arrow side, but necessary to make the results equal
+      mutate(y = as.character(y)),
+    example_data,
+    warning = "Factors are currently converted to characters in if_else and ifelse"
+  )
+
+  skip("ARROW-12055 for better NaN support")
+  # Currently NaNs are not NAs, so the `missing` argument is not correctly
+  # applied. NOTE: skip() ends the test here, so nothing below this point runs.
+  expect_dplyr_equal(
+    input %>%
+      mutate(
+        y = if_else(dbl > 5, chr, chr, missing = "MISSING")
+      ) %>% collect(),
+    example_data_for_sorting
+  )
+
+  skip("TODO: could? should? we support the autocasting in ifelse")
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = ifelse(int > 5, 1, FALSE)) %>%
+      collect(),
+    example_data
+  )
+})