diff --git a/r/NEWS.md b/r/NEWS.md index 63be8b9df9bf9..792b2f4081c9c 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -26,6 +26,8 @@ * Added bindings for the remainder of C data interface: Type, Field, and RecordBatchReader (from the experimental C stream interface). These also have `reticulate::py_to_r()` and `r_to_py()` methods. Along with the addition of the `Scanner$ToRecordBatchReader()` method, you can now build up a Dataset query in R and pass the resulting stream of batches to another tool in process. * `match_arrow()` now converts `x` into an `Array` if it is not a `Scalar`, `Array` or `ChunkedArray` and no longer dispatches `base::match()`. * `transmute()` now errors if passed arguments `.keep`, `.before`, or `.after`, for consistency with the behavior of `dplyr` on `data.frame`s. +* `is.na()` now evaluates to `TRUE` on `NaN` values in floating point number fields, for consistency with base R. +* `is.nan()` now evaluates to `FALSE` on `NA` values in floating point number fields and `FALSE` on all values in non-floating point fields, for consistency with base R. # arrow 4.0.1 diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index 8becc37daf2d4..4734d44c7ea33 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -47,10 +47,26 @@ is.infinite.ArrowDatum <- function(x) { } #' @export -is.na.ArrowDatum <- function(x) call_function("is_null", x) +is.na.ArrowDatum <- function(x) { + # TODO: if an option is added to the is_null kernel to treat NaN as NA, + # use that to simplify the code here (ARROW-13367) + if (x$type_id() %in% TYPES_WITH_NAN) { + call_function("is_nan", x) | call_function("is_null", x) + } else { + call_function("is_null", x) + } +} #' @export -is.nan.ArrowDatum <- function(x) call_function("is_nan", x) +is.nan.ArrowDatum <- function(x) { + if (x$type_id() %in% TYPES_WITH_NAN) { + # TODO: if an option is added to the is_nan kernel to treat NA as NaN, + # use that to simplify the code here (ARROW-13366) + call_function("is_nan", x) & call_function("is_valid", x) + } else { + Scalar$create(FALSE)$as_array(length(x)) + } +} #' @export as.vector.ArrowDatum <- function(x, mode) { diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 35db573550d79..d118eefaa859b 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -57,6 +57,28 @@ nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { Expression$create("cast", x, options = opts) } +nse_funcs$is.na <- function(x) { + # TODO: if an option is added to the is_null kernel to treat NaN as NA, + # use that to simplify the code here (ARROW-13367) + if (is.double(x) || (inherits(x, "Expression") && + x$type_id() %in% TYPES_WITH_NAN)) { + build_expr("is_nan", x) | build_expr("is_null", x) + } else { + build_expr("is_null", x) + } +} + +nse_funcs$is.nan <- function(x) { + if (is.double(x) || (inherits(x, "Expression") && + x$type_id() %in% TYPES_WITH_NAN)) { + # TODO: if an option is added to the is_nan kernel to treat NA as NaN, + # use that to simplify the code here (ARROW-13366) + build_expr("is_nan", x) & build_expr("is_valid", x) + } else { + Expression$scalar(FALSE) + } +} + nse_funcs$is <- function(object, class2) { if (is.string(class2)) { switch(class2, @@ -147,35 +169,38 @@ nse_funcs$as.numeric <- function(x) { # is.* type functions nse_funcs$is.character <- function(x) { - x$type_id() %in% Type[c("STRING", "LARGE_STRING")] + is.character(x) || (inherits(x, "Expression") && + x$type_id() %in% Type[c("STRING", "LARGE_STRING")]) } nse_funcs$is.numeric <- function(x) { - x$type_id() %in% Type[c( + is.numeric(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", "UINT64", "INT64", "HALF_FLOAT", "FLOAT", "DOUBLE", "DECIMAL", "DECIMAL256" - )] + )]) } nse_funcs$is.double <- function(x) { - x$type_id() == Type["DOUBLE"] + is.double(x) || (inherits(x, "Expression") && x$type_id() == Type["DOUBLE"]) } nse_funcs$is.integer <- function(x) { - x$type_id() %in% Type[c( + is.integer(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", "UINT64", "INT64" - )] + )]) } nse_funcs$is.integer64 <- function(x) { - x$type_id() == Type["INT64"] + is.integer64(x) || (inherits(x, "Expression") && x$type_id() == Type["INT64"]) } nse_funcs$is.logical <- function(x) { - x$type_id() == Type["BOOL"] + is.logical(x) || (inherits(x, "Expression") && x$type_id() == Type["BOOL"]) } nse_funcs$is.factor <- function(x) { - x$type_id() == Type["DICTIONARY"] + is.factor(x) || (inherits(x, "Expression") && x$type_id() == Type["DICTIONARY"]) } nse_funcs$is.list <- function(x) { - x$type_id() %in% Type[c("LIST", "FIXED_SIZE_LIST", "LARGE_LIST")] + is.list(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( + "LIST", "FIXED_SIZE_LIST", "LARGE_LIST" + )]) } # rlang::is_* type functions @@ -294,8 +319,8 @@ nse_funcs$substr <- function(x, start, stop) { msg = "`stop` must be length 1 - other lengths are not supported in Arrow" ) - # substr treats values as if they're on a continous number line, so values - # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics + # substr treats values as if they're on a continous number line, so values + # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics # this behavior if (start <= 0) { start <- 1 @@ -310,7 +335,7 @@ nse_funcs$substr <- function(x, start, stop) { Expression$create( "utf8_slice_codeunits", x, - # we don't need to subtract 1 from `stop` as C++ counts exclusively + # we don't need to subtract 1 from `stop` as C++ counts exclusively # which effectively cancels out the difference in indexing between R & C++ options = list(start = start - 1L, stop = stop) ) @@ -336,14 +361,14 @@ nse_funcs$str_sub <- function(string, start = 1L, end = -1L) { end <- .Machine$integer.max } - # An end value lower than a start value returns an empty string in + # An end value lower than a start value returns an empty string in # stringr::str_sub so set end to 0 here to match this behavior if (end < start) { end <- 0 } # subtract 1 from `start` because C++ is 0-based and R is 1-based - # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0 + # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0 # when `start` < 0, both str_sub and utf8_slice_codeunits count backwards from the end if (start > 0) { start <- start - 1L @@ -634,20 +659,45 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE, week_start = getOption } nse_funcs$log <- function(x, base = exp(1)) { - + if (base == exp(1)) { return(Expression$create("ln_checked", x)) } - + if (base == 2) { return(Expression$create("log2_checked", x)) } - + if (base == 10) { return(Expression$create("log10_checked", x)) - } + } # ARROW-13345 stop("`base` values other than exp(1), 2 and 10 not supported in Arrow", call. = FALSE) } nse_funcs$logb <- nse_funcs$log + +nse_funcs$if_else <- function(condition, true, false, missing = NULL){ + if (!is.null(missing)) { + return(nse_funcs$if_else( + nse_funcs$is.na(condition), + missing, + nse_funcs$if_else(condition, true, false) + )) + } + + # if_else doesn't yet support factors/dictionaries + # TODO: remove this after ARROW-13358 is merged + warn_types <- nse_funcs$is.factor(true) | nse_funcs$is.factor(false) + if (warn_types) { + warning("Dictionaries (in R: factors) are currently converted to strings (characters) in if_else and ifelse", call. = FALSE) + } + + build_expr("if_else", condition, true, false) +} + +# Although base R ifelse allows `yes` and `no` to be different classes +# +nse_funcs$ifelse <- function(test, yes, no){ + nse_funcs$if_else(condition = test, true = yes, false = no) +} diff --git a/r/R/enums.R b/r/R/enums.R index 8a5bf7366a9fe..019ebc7a3377b 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -81,6 +81,8 @@ Type <- enum("Type::type", LARGE_LIST = 36L ) +TYPES_WITH_NAN <- Type[c("HALF_FLOAT", "FLOAT", "DOUBLE")] + #' @rdname enums #' @export StatusCode <- enum("StatusCode", diff --git a/r/R/expression.R b/r/R/expression.R index b3fc9fe20c756..064f88ec0c31c 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -20,8 +20,8 @@ .unary_function_map <- list( "!" = "invert", "as.factor" = "dictionary_encode", - "is.na" = "is_null", - "is.nan" = "is_nan", + # is.na is defined in dplyr-functions.R + # is.nan is defined in dplyr-functions.R "abs" = "abs_checked", # nchar is defined in dplyr-functions.R "tolower" = "utf8_lower", @@ -206,4 +206,12 @@ Ops.Expression <- function(e1, e2) { } #' @export -is.na.Expression <- function(x) Expression$create("is_null", x) +is.na.Expression <- function(x) { + if (!is.null(x$schema) && x$type_id() %in% TYPES_WITH_NAN) { + # TODO: if an option is added to the is_null kernel to treat NaN as NA, + # use that to simplify the code here (ARROW-13367) + Expression$create("is_nan", x) | build_expr("is_null", x) + } else { + Expression$create("is_null", x) + } +} diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 359e31ef57d87..c4dab9ace45b4 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -91,7 +91,8 @@ expect_dplyr_equal <- function(expr, if (isTRUE(warning)) { # Special-case the simple warning: - warning <- "not supported in Arrow; pulling data into R" + # TODO: ARROW-13362 pick one of in or by and use it everywhere + warning <- "not supported (in|by) Arrow; pulling data into R" } skip_msg <- NULL diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 63ac64eee5fda..305f5a3463453 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -317,6 +317,23 @@ test_that("support for NaN (ARROW-3615)", { expect_equal(y$null_count, 1L) }) +test_that("is.nan() evalutes to FALSE on NA (for consistency with base R)", { + x <- c(1.0, NA, NaN, -1.0) + expect_vector_equal(is.nan(input), x) +}) + +test_that("is.nan() evalutes to FALSE on non-floats (for consistency with base R)", { + x <- c(1L, 2L, 3L) + y <- c("foo", "bar") + expect_vector_equal(is.nan(input), x) + expect_vector_equal(is.nan(input), y) +}) + +test_that("is.na() evalutes to TRUE on NaN (for consistency with base R)", { + x <- c(1, NA, NaN, -1) + expect_vector_equal(is.na(input), x) +}) + test_that("integer types casts (ARROW-3741)", { # Defining some type groups for use here and in the following tests int_types <- c(int8(), int16(), int32(), int64()) diff --git a/r/tests/testthat/test-compute-sort.R b/r/tests/testthat/test-compute-sort.R index 63977b554143c..373237ff9a1a5 100644 --- a/r/tests/testthat/test-compute-sort.R +++ b/r/tests/testthat/test-compute-sort.R @@ -118,7 +118,6 @@ test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results sort(input, decreasing = FALSE, na.last = TRUE), tbl$dbl ) - skip("is.na() evaluates to FALSE on Arrow NaN values (ARROW-12055)") expect_vector_equal( sort(input, decreasing = TRUE, na.last = NA), tbl$dbl diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index 63d0433fc2336..67baab3ede0ff 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -67,7 +67,7 @@ chr: string See $.data for the source Arrow object', fixed = TRUE ) - + }) test_that("summarize", { @@ -524,7 +524,7 @@ test_that("is.finite(), is.infinite(), is.nan()", { ) %>% collect(), df ) - skip("is.nan() evaluates to NA on NA values (ARROW-12850)") + # is.nan() evaluates to FALSE on NA_real_ (ARROW-12850) expect_dplyr_equal( input %>% transmute( @@ -534,6 +534,17 @@ test_that("is.finite(), is.infinite(), is.nan()", { ) }) +test_that("is.na() evaluates to TRUE on NaN (ARROW-12055)", { + df <- tibble(x = c(1.1, 2.2, NA_real_, 4.4, NaN, 6.6, 7.7)) + expect_dplyr_equal( + input %>% + transmute( + is_na = is.na(x) + ) %>% collect(), + df + ) +}) + test_that("type checks with is() giving Arrow types", { # with class2=DataType expect_equal( @@ -821,7 +832,7 @@ test_that("type checks on expressions", { collect(), tbl ) - + # the code in the expectation below depends on RE2 skip_if_not_available("re2") @@ -835,6 +846,31 @@ test_that("type checks on expressions", { ) }) +test_that("type checks on R scalar literals", { + expect_dplyr_equal( + input %>% + transmute( + chr_is_chr = is.character("foo"), + int_is_chr = is.character(42L), + int_is_int = is.integer(42L), + chr_is_int = is.integer("foo"), + dbl_is_num = is.numeric(3.14159), + int_is_num = is.numeric(42L), + chr_is_num = is.numeric("foo"), + dbl_is_dbl = is.double(3.14159), + chr_is_dbl = is.double("foo"), + lgl_is_lgl = is.logical(TRUE), + chr_is_lgl = is.logical("foo"), + fct_is_fct = is.factor(factor("foo", levels = c("foo", "bar", "baz"))), + chr_is_fct = is.factor("foo"), + lst_is_lst = is.list(list(c(a = "foo", b = "bar"))), + chr_is_lst = is.list("foo") + ) %>% + collect(), + tbl + ) +}) + test_that("as.factor()/dictionary_encode()", { skip("ARROW-12632: ExecuteScalarExpression cannot Execute non-scalar expression {x=dictionary_encode(x, {NON-REPRESENTABLE OPTIONS})}") df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B")) @@ -947,64 +983,64 @@ test_that("abs()", { }) test_that("log functions", { - + df <- tibble(x = c(1:10, NA, NA)) - + expect_dplyr_equal( input %>% mutate(y = log(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log(x, base = exp(1))) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log(x, base = 2)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log(x, base = 10)) %>% collect(), df ) - + expect_error( nse_funcs$log(Expression$scalar(x), base = 5), "`base` values other than exp(1), 2 and 10 not supported in Arrow", fixed = TRUE ) - + expect_dplyr_equal( input %>% mutate(y = logb(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log1p(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log2(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = log10(x)) %>% @@ -1013,39 +1049,39 @@ test_that("log functions", { ) }) - + test_that("trig functions", { - + df <- tibble(x = c(seq(from = 0, to = 1, by = 0.1), NA)) - + expect_dplyr_equal( input %>% mutate(y = sin(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = cos(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = tan(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = asin(x)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(y = acos(x)) %>% @@ -1053,4 +1089,129 @@ test_that("trig functions", { df ) -}) \ No newline at end of file +}) + +test_that("if_else and ifelse", { + tbl <- example_data + tbl$another_chr <- tail(letters, 10) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, 1, 0) + ) %>% collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, int, 0L) + ) %>% collect(), + tbl + ) + + expect_error( + Table$create(tbl) %>% + mutate( + y = if_else(int > 5, 1, FALSE) + ) %>% collect(), + 'NotImplemented: Function if_else has no kernel matching input types' + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, 1, NA_real_) + ) %>% collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate( + y = ifelse(int > 5, 1, 0) + ) %>% collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(dbl > 5, TRUE, FALSE) + ) %>% collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(chr %in% letters[1:3], 1L, 3L) + ) %>% collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, "one", "zero") + ) %>% collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, chr, another_chr) + ) %>% collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, "true", chr, missing = "MISSING") + ) %>% collect(), + tbl + ) + + # TODO: remove the mutate + warning after ARROW-13358 is merged and Arrow + # supports factors in if(_)else + expect_dplyr_equal( + input %>% + mutate( + y = if_else(int > 5, fct, factor("a")) + ) %>% collect() %>% + # This is a no-op on the Arrow side, but necesary to make the results equal + mutate(y = as.character(y)), + tbl, + warning = "Dictionaries .* are currently converted to strings .* in if_else and ifelse" + ) + + # detecting NA and NaN works just fine + expect_dplyr_equal( + input %>% + mutate( + y = if_else(is.na(dbl), chr, "false", missing = "MISSING") + ) %>% collect(), + example_data_for_sorting + ) + + # However, currently comparisons with NaNs return false and not NaNs or NAs + skip("ARROW-13364") + expect_dplyr_equal( + input %>% + mutate( + y = if_else(dbl > 5, chr, another_chr, missing = "MISSING") + ) %>% collect(), + example_data_for_sorting + ) + + skip("TODO: could? should? we support the autocasting in ifelse") + expect_dplyr_equal( + input %>% + mutate(y = ifelse(int > 5, 1, FALSE)) %>% + collect(), + tbl + ) +})