Skip to content

Commit

Permalink
Implement the arr subnamespace (#790)
Browse files Browse the repository at this point in the history
Co-authored-by: eitsupi <ts1s1andn@gmail.com>
  • Loading branch information
etiennebacher and eitsupi authored Feb 11, 2024
1 parent 6bfba9e commit 53d4aba
Show file tree
Hide file tree
Showing 32 changed files with 1,286 additions and 245 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ Collate:
'error__string.R'
'error__trait.R'
'error_conversion.R'
'expr__array.R'
'expr__binary.R'
'expr__categorical.R'
'expr__datetime.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ S3method("$",RPolarsDataTypeVector)
S3method("$",RPolarsDynamicGroupBy)
S3method("$",RPolarsErr)
S3method("$",RPolarsExpr)
S3method("$",RPolarsExprArrNameSpace)
S3method("$",RPolarsExprBinNameSpace)
S3method("$",RPolarsExprCatNameSpace)
S3method("$",RPolarsExprDTNameSpace)
Expand Down
30 changes: 30 additions & 0 deletions R/datatype.R
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ DataType_new = function(str) {
DataType_constructors = function() {
list(
Datetime = DataType_Datetime,
Array = DataType_Array,
List = DataType_List,
Struct = DataType_Struct

Expand Down Expand Up @@ -230,6 +231,35 @@ DataType_Struct = function(...) {
unwrap("in pl$Struct:")
}

#' Create Array DataType
#'
#' An Array is the fixed-width counterpart of a List: every sub-array has the
#' same length, whereas sublists can have different lengths. Methods working on
#' Array columns are reachable through the `$arr` subnamespace.
#'
#' @param datatype The inner DataType, given either as a DataType or as a single
#' string naming one. The default is `"unknown"` and is only a placeholder for
#' when the inner DataType does not matter.
#' @param width The length of the arrays.
#' @return An array DataType with an inner DataType
#' @examples
#' # basic Array
#' pl$Array(pl$Int32, 4)
#' # some nested Array
#' pl$Array(pl$Array(pl$Boolean, 3), 2)
DataType_Array = function(datatype = "unknown", width) {
  # A single string is treated as the name of a DataType and resolved first.
  is_type_name = is.character(datatype) && length(datatype) == 1
  if (is_type_name) {
    datatype = .pr$DataType$new(datatype)
  }

  # Anything that is still not a DataType at this point cannot be used.
  is_datatype = inherits(datatype, "RPolarsDataType")
  if (!is_datatype) {
    stop(paste(
      "input for generating a array DataType must be another DataType",
      "or an interpretable name thereof.",
      sep = " "
    ))
  }

  unwrap(.pr$DataType$new_array(datatype, width), "in pl$Array():")
}

#' Create List DataType
#' @keywords pl
#' @param datatype an inner DataType, default is "Unknown" (placeholder for when inner DataType
Expand Down
218 changes: 218 additions & 0 deletions R/expr__array.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
#' Sum all elements in an array
#'
#' Add up the values of each sub-array.
#'
#' @return Expr
#' @aliases arr_sum
#' @examples
#' df = pl$DataFrame(
#'   values = list(c(1, 2), c(3, 4), c(NA_real_, 6)),
#'   schema = list(values = pl$Array(pl$Float64, 2))
#' )
#' df$with_columns(sum = pl$col("values")$arr$sum())
ExprArr_sum = function() {
  .pr$Expr$arr_sum(self)
}

# TODO: add example with NA when this is fixed:
# https://github.com/pola-rs/polars/issues/14359

#' Find the maximum value in an array
#'
#' Return the largest value of each sub-array.
#'
#' @return Expr
#' @details
#' This method is only available with the "simd" feature.
#' See [polars_info] for more details.
#' @aliases arr_max
#' @examplesIf polars_info()$features$simd
#' df = pl$DataFrame(
#'   values = list(c(1, 2), c(3, 4), c(5, 6)),
#'   schema = list(values = pl$Array(pl$Float64, 2))
#' )
#' df$with_columns(max = pl$col("values")$arr$max())
ExprArr_max = function() {
# TODO: not to check simd here
# The feature is checked before delegating to the native method; presumably
# check_feature() raises an error mentioning "in $arr$max():" when "simd" is
# not available -- TODO confirm against check_feature().
check_feature("simd", "in $arr$max():")

.pr$Expr$arr_max(self)
}

# TODO: add example with NA when this is fixed:
# https://github.com/pola-rs/polars/issues/14359

#' Find the minimum value in an array
#'
#' Return the smallest value of each sub-array.
#'
#' @inherit ExprArr_max details
#' @return Expr
#' @aliases arr_min
#' @examplesIf polars_info()$features$simd
#' df = pl$DataFrame(
#'   values = list(c(1, 2), c(3, 4), c(5, 6)),
#'   schema = list(values = pl$Array(pl$Float64, 2))
#' )
#' df$with_columns(min = pl$col("values")$arr$min())
ExprArr_min = function() {
# TODO: not to check simd here
# The feature is checked before delegating to the native method; presumably
# check_feature() raises an error mentioning "in $arr$min():" when "simd" is
# not available -- TODO confirm against check_feature().
check_feature("simd", "in $arr$min():")

.pr$Expr$arr_min(self)
}

#' Sort values in an array
#'
#' Order the values inside each sub-array.
#'
#' @inheritParams Expr_sort
#' @return Expr
#' @aliases arr_sort
#' @examples
#' df = pl$DataFrame(
#'   values = list(c(2, 1), c(3, 4), c(NA_real_, 6)),
#'   schema = list(values = pl$Array(pl$Float64, 2))
#' )
#' df$with_columns(sort = pl$col("values")$arr$sort(nulls_last = TRUE))
ExprArr_sort = function(descending = FALSE, nulls_last = FALSE) {
  .pr$Expr$arr_sort(self, descending, nulls_last)
}

#' Reverse values in an array
#'
#' Flip the order of the values inside each sub-array.
#'
#' @return Expr
#' @aliases arr_reverse
#' @examples
#' df = pl$DataFrame(
#'   values = list(c(1, 2), c(3, 4), c(NA_real_, 6)),
#'   schema = list(values = pl$Array(pl$Float64, 2))
#' )
#' df$with_columns(reverse = pl$col("values")$arr$reverse())
ExprArr_reverse = function() {
  .pr$Expr$arr_reverse(self)
}

#' Get unique values in an array
#'
#' Keep only the distinct values of each sub-array.
#'
#' @inheritParams Expr_unique
#' @return Expr
#' @aliases arr_unique
#' @examples
#' df = pl$DataFrame(
#'   values = list(c(1, 1, 2), c(4, 4, 4), c(NA_real_, 6, 7)),
#'   schema = list(values = pl$Array(pl$Float64, 3))
#' )
#' df$with_columns(unique = pl$col("values")$arr$unique())
ExprArr_unique = function(maintain_order = FALSE) {
  .pr$Expr$arr_unique(self, maintain_order)
}


#' Get the value by index in an array
#'
#' Only one value can be extracted per sub-array.
#'
#' @param index An Expr or something coercible to an Expr, that must return a
#' single index. Values are 0-indexed (so index 0 would return the first item
#' of every sub-array) and negative values start from the end (index `-1`
#' returns the last item). If the index is out of bounds, it will return a
#' `null`. Strings are parsed as column names.
#'
#' @return Expr
#' @aliases arr_get
#' @examples
#' df = pl$DataFrame(
#'   values = list(c(1, 2), c(3, 4), c(NA_real_, 6)),
#'   idx = c(1, NA, 3),
#'   schema = list(values = pl$Array(pl$Float64, 2))
#' )
#' df$with_columns(
#'   using_expr = pl$col("values")$arr$get("idx"),
#'   val_0 = pl$col("values")$arr$get(0),
#'   val_minus_1 = pl$col("values")$arr$get(-1),
#'   val_oob = pl$col("values")$arr$get(10)
#' )
ExprArr_get = function(index) {
  unwrap(.pr$Expr$arr_get(self, index), "in $arr$get():")
}

#' Check if array contains a given value
#'
#' Test, for each sub-array, whether `item` is one of its values.
#'
#' @param item Expr or something coercible to an Expr. Strings are *not* parsed
#' as columns.
#'
#' @return Expr
#' @aliases arr_contains
#' @examples
#' df = pl$DataFrame(
#'   values = list(0:2, 4:6, c(NA_integer_, NA_integer_, NA_integer_)),
#'   item = c(0L, 4L, 2L),
#'   schema = list(values = pl$Array(pl$Float64, 3))
#' )
#' df$with_columns(
#'   with_expr = pl$col("values")$arr$contains(pl$col("item")),
#'   with_lit = pl$col("values")$arr$contains(1)
#' )
ExprArr_contains = function(item) {
  .pr$Expr$arr_contains(self, item)
}

#' Join elements of an array
#'
#' Join all string items in a sub-array and place a separator between them. This
#' only works on Array columns whose inner type is String (as in the example
#' below).
#'
#' @param separator String to separate the items with. Can be an Expr. Strings
#' are *not* parsed as columns.
#' @inheritParams pl_concat_str
#'
#' @return Expr
#' @aliases arr_join
#' @examples
#' df = pl$DataFrame(
#'   values = list(c("a", "b", "c"), c("x", "y", "z"), c("e", NA, NA)),
#'   separator = c("-", "+", "/"),
#'   schema = list(values = pl$Array(pl$String, 3))
#' )
#' df$with_columns(
#'   join_with_expr = pl$col("values")$arr$join(pl$col("separator")),
#'   join_with_lit = pl$col("values")$arr$join(" "),
#'   join_ignore_null = pl$col("values")$arr$join(" ", ignore_nulls = TRUE)
#' )
ExprArr_join = function(separator, ignore_nulls = FALSE) {
  unwrap(
    .pr$Expr$arr_join(self, separator, ignore_nulls),
    "in $arr$join():"
  )
}

#' Get the index of the minimal value in an array
#'
#' Return, for each sub-array, the position of its smallest value.
#'
#' @return Expr
#' @aliases arr_arg_min
#' @examples
#' df = pl$DataFrame(
#'   values = list(1:2, 2:1),
#'   schema = list(values = pl$Array(pl$Int32, 2))
#' )
#' df$with_columns(
#'   arg_min = pl$col("values")$arr$arg_min()
#' )
ExprArr_arg_min = function() {
  .pr$Expr$arr_arg_min(self)
}

#' Get the index of the maximal value in an array
#'
#' Return, for each sub-array, the position of its largest value.
#'
#' @return Expr
#' @aliases arr_arg_max
#' @examples
#' df = pl$DataFrame(
#'   values = list(1:2, 2:1),
#'   schema = list(values = pl$Array(pl$Int32, 2))
#' )
#' df$with_columns(
#'   arg_max = pl$col("values")$arr$arg_max()
#' )
ExprArr_arg_max = function() {
  .pr$Expr$arr_arg_max(self)
}

#' Evaluate whether all boolean values in an array are true
#'
#' @return Expr
#' @aliases arr_all
#' @examples
#' df = pl$DataFrame(
#'   values = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), c(NA, NA)),
#'   schema = list(values = pl$Array(pl$Boolean, 2))
#' )
#' df$with_columns(all = pl$col("values")$arr$all())
ExprArr_all = function() {
  .pr$Expr$arr_all(self)
}

#' Evaluate whether any boolean values in an array are true
#'
#' @return Expr
#' @aliases arr_any
#' @examples
#' df = pl$DataFrame(
#'   values = list(c(TRUE, TRUE), c(FALSE, TRUE), c(FALSE, FALSE), c(NA, NA)),
#'   schema = list(values = pl$Array(pl$Boolean, 2))
#' )
#' df$with_columns(any = pl$col("values")$arr$any())
ExprArr_any = function() {
  .pr$Expr$arr_any(self)
}
10 changes: 10 additions & 0 deletions R/expr__expr.R
Original file line number Diff line number Diff line change
Expand Up @@ -3316,6 +3316,16 @@ Expr_list = method_as_property(function() {
expr_list_make_sub_ns(self)
})

#' Array related methods
#'
#' Create an object namespace of all array related methods. See the individual
#' method pages for full details.
#' @return The value returned by `expr_arr_make_sub_ns()` for this Expr, i.e.
#' the object behind `$arr`. NOTE(review): presumably of class
#' `RPolarsExprArrNameSpace` rather than an Expr -- confirm.
#' @noRd
Expr_arr = method_as_property(function() {
# Build the array subnamespace around the current expression.
expr_arr_make_sub_ns(self)
})

#' String related methods
#'
#' Create an object namespace of all string related methods. See the individual
Expand Down
2 changes: 1 addition & 1 deletion R/expr__list.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Get the length of each list
#'
#' Return the number of elements in each list. Null values are counted in the
#' total. `$list$lengths()` is deprecated.
#' total.
#'
#' @return Expr
#' @aliases list_len
Expand Down
32 changes: 32 additions & 0 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,8 @@ RPolarsDataType$new_duration <- function() .Call(wrap__RPolarsDataType__new_dura

RPolarsDataType$new_list <- function(inner) .Call(wrap__RPolarsDataType__new_list, inner)

# Binding that forwards `inner` and `width` unchanged to the native
# `wrap__RPolarsDataType__new_array` routine.
RPolarsDataType$new_array <- function(inner, width) .Call(wrap__RPolarsDataType__new_array, inner, width)

RPolarsDataType$new_object <- function() .Call(wrap__RPolarsDataType__new_object)

RPolarsDataType$new_struct <- function(l) .Call(wrap__RPolarsDataType__new_struct, l)
Expand Down Expand Up @@ -674,6 +676,36 @@ RPolarsExpr$list_any <- function() .Call(wrap__RPolarsExpr__list_any, self)

RPolarsExpr$list_set_operation <- function(other, operation) .Call(wrap__RPolarsExpr__list_set_operation, self, other, operation)

# Bindings for the `arr_*` methods of RPolarsExpr. Each one forwards `self` and
# its arguments unchanged to the native routine of the same name via .Call().
# NOTE(review): this file looks auto-generated (extendr wrappers); manual edits
# are presumably overwritten on regeneration -- confirm before changing by hand.
RPolarsExpr$arr_max <- function() .Call(wrap__RPolarsExpr__arr_max, self)

RPolarsExpr$arr_min <- function() .Call(wrap__RPolarsExpr__arr_min, self)

RPolarsExpr$arr_sum <- function() .Call(wrap__RPolarsExpr__arr_sum, self)

RPolarsExpr$arr_unique <- function(maintain_order) .Call(wrap__RPolarsExpr__arr_unique, self, maintain_order)

RPolarsExpr$arr_to_list <- function() .Call(wrap__RPolarsExpr__arr_to_list, self)

RPolarsExpr$arr_all <- function() .Call(wrap__RPolarsExpr__arr_all, self)

RPolarsExpr$arr_any <- function() .Call(wrap__RPolarsExpr__arr_any, self)

RPolarsExpr$arr_sort <- function(descending, nulls_last) .Call(wrap__RPolarsExpr__arr_sort, self, descending, nulls_last)

RPolarsExpr$arr_reverse <- function() .Call(wrap__RPolarsExpr__arr_reverse, self)

RPolarsExpr$arr_arg_min <- function() .Call(wrap__RPolarsExpr__arr_arg_min, self)

RPolarsExpr$arr_arg_max <- function() .Call(wrap__RPolarsExpr__arr_arg_max, self)

RPolarsExpr$arr_get <- function(index) .Call(wrap__RPolarsExpr__arr_get, self, index)

RPolarsExpr$arr_join <- function(separator, ignore_nulls) .Call(wrap__RPolarsExpr__arr_join, self, separator, ignore_nulls)

RPolarsExpr$arr_contains <- function(other) .Call(wrap__RPolarsExpr__arr_contains, self, other)

RPolarsExpr$arr_count_matches <- function(expr) .Call(wrap__RPolarsExpr__arr_count_matches, self, expr)

RPolarsExpr$dt_truncate <- function(every, offset) .Call(wrap__RPolarsExpr__dt_truncate, self, every, offset)

RPolarsExpr$dt_round <- function(every, offset) .Call(wrap__RPolarsExpr__dt_round, self, every, offset)
Expand Down
4 changes: 4 additions & 0 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ replace_private_with_pub_methods(RPolarsExpr, "^Expr_")
`$.RPolarsExprListNameSpace` = sub_name_space_accessor_function
expr_list_make_sub_ns = macro_new_subnamespace("^ExprList_", "RPolarsExprListNameSpace")

# Array (`$arr`) subnamespace wiring: register the `$` S3 method for the
# RPolarsExprArrNameSpace class and create the constructor that Expr_arr calls.
# Presumably the constructor gathers every function whose name matches
# "^ExprArr_" -- TODO confirm against macro_new_subnamespace().
#' @export
`$.RPolarsExprArrNameSpace` = sub_name_space_accessor_function
expr_arr_make_sub_ns = macro_new_subnamespace("^ExprArr_", "RPolarsExprArrNameSpace")

#' @export
`$.RPolarsExprStrNameSpace` = sub_name_space_accessor_function
expr_str_make_sub_ns = macro_new_subnamespace("^ExprStr_", "RPolarsExprStrNameSpace")
Expand Down
2 changes: 1 addition & 1 deletion altdoc/altdoc_postprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ for (i in to_modify) {
orig
)
} else if (which_class %in% c(
"ExprBin", "ExprCat", "ExprDT", "ExprList",
"ExprArr", "ExprBin", "ExprCat", "ExprDT", "ExprList",
"ExprMeta", "ExprName", "ExprStr", "ExprStruct"
)) {
subns = tolower(gsub("Expr", "", which_class))
Expand Down
3 changes: 2 additions & 1 deletion altdoc/altdoc_preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ classes = c(
"pl", "Series", "DataFrame", "LazyFrame", "GroupBy",
"LazyGroupBy", "RollingGroupBy", "DynamicGroupBy", "ExprList", "ExprBin",
"ExprCat", "ExprDT", "ExprMeta", "ExprName", "ExprStr", "ExprStruct",
"Expr", "IO", "RField", "RThreadHandle", "SQLContext", "S3"
"ExprArr", "Expr", "IO", "RField", "RThreadHandle", "SQLContext", "S3"
)
for (cl in classes) {
files = grep(paste0("^", cl, "_"), other, value = TRUE)
Expand All @@ -52,6 +52,7 @@ for (cl in classes) {
# expr: nested
nam = c(
"Expr" = "All others",
"ExprArr" = "Array",
"ExprList" = "List",
"ExprBin" = "Binary",
"ExprCat" = "Categorical",
Expand Down
Loading

0 comments on commit 53d4aba

Please sign in to comment.