From d9b5dad75b443cab6be3662e06f79e225a0ff6bf Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 19 Oct 2024 18:02:11 +0200 Subject: [PATCH] docs: add some docs for DataFrame --- R/dataframe-frame.R | 249 ++++++++++++++++++++++++++++++++- R/lazyframe-frame.R | 26 +++- man/dataframe__cast.Rd | 27 ++++ man/dataframe__clone.Rd | 54 +++++++ man/dataframe__drop.Rd | 27 ++++ man/dataframe__equals.Rd | 24 ++++ man/dataframe__filter.Rd | 30 ++++ man/dataframe__get_columns.Rd | 6 +- man/dataframe__group_by.Rd | 50 +++++++ man/dataframe__lazy.Rd | 17 +++ man/dataframe__select.Rd | 33 +++++ man/dataframe__slice.Rd | 28 ++++ man/dataframe__sort.Rd | 32 +++++ man/dataframe__to_series.Rd | 30 ++++ man/dataframe__with_columns.Rd | 46 ++++++ man/lazyframe__select.Rd | 10 +- man/lazyframe__with_columns.Rd | 14 +- man/pl__LazyFrame.Rd | 2 +- 18 files changed, 685 insertions(+), 20 deletions(-) create mode 100644 man/dataframe__cast.Rd create mode 100644 man/dataframe__clone.Rd create mode 100644 man/dataframe__drop.Rd create mode 100644 man/dataframe__equals.Rd create mode 100644 man/dataframe__filter.Rd create mode 100644 man/dataframe__group_by.Rd create mode 100644 man/dataframe__lazy.Rd create mode 100644 man/dataframe__select.Rd create mode 100644 man/dataframe__slice.Rd create mode 100644 man/dataframe__sort.Rd create mode 100644 man/dataframe__to_series.Rd create mode 100644 man/dataframe__with_columns.Rd diff --git a/R/dataframe-frame.R b/R/dataframe-frame.R index 2a0057a0..d839b458 100644 --- a/R/dataframe-frame.R +++ b/R/dataframe-frame.R @@ -12,6 +12,7 @@ #' so each argument in `...` is converted to a Polars Series by [as_polars_series()] #' and then passed to [as_polars_df()]. #' @aliases polars_data_frame DataFrame +#' #' @section Active bindings: #' - `columns`: `$columns` returns a character vector with the names of the columns. #' - `dtypes`: `$dtypes` returns a nameless list of the data type of each column. 
@@ -127,19 +128,69 @@ dataframe__to_struct <- function(name = "") { wrap() } +#' Convert an existing DataFrame to a LazyFrame +#' @description Start a new lazy query from a DataFrame. +#' +#' @inherit as_polars_lf return +#' @examples +#' pl$DataFrame(a = 1:2, b = c(NA, "a"))$lazy() dataframe__lazy <- function() { self$`_df`$lazy() |> wrap() } +#' Clone a DataFrame +#' +#' This is a cheap operation that does not copy data. Assigning does not copy +#' the DataFrame (environment object). This is because environment objects have +#' reference semantics. Calling $clone() creates a new environment, which can +#' be useful when dealing with attributes (see examples). +#' +#' @inherit as_polars_df return +#' @examples +#' df1 <- as_polars_df(iris) +#' +#' # Assigning does not copy the DataFrame (environment object), calling +#' # $clone() creates a new environment. +#' df2 <- df1 +#' df3 <- df1$clone() +#' rlang::env_label(df1) +#' rlang::env_label(df2) +#' rlang::env_label(df3) +#' +#' # Cloning can be useful to add attributes to data used in a function without +#' # adding those attributes to the original object. 
+#' +#' # Make a function to take a DataFrame, add an attribute, and return a +#' # DataFrame: +#' give_attr <- function(data) { +#' attr(data, "created_on") <- "2024-01-29" +#' data +#' } +#' df2 <- give_attr(df1) +#' +#' # Problem: the original DataFrame also gets the attribute while it shouldn't +#' attributes(df1) +#' +#' # Use $clone() inside the function to avoid that +#' give_attr <- function(data) { +#' data <- data$clone() +#' attr(data, "created_on") <- "2024-01-29" +#' data +#' } +#' df1 <- as_polars_df(iris) +#' df2 <- give_attr(df1) +#' +#' # now, the original DataFrame doesn't get this attribute +#' attributes(df1) dataframe__clone <- function() { self$`_df`$clone() |> wrap() } -#' Get the DataFrame as a List of Series +#' Get the DataFrame as a list of Series #' -#' @return A [list] of [Series] +#' @return A list of [Series] #' @seealso #' - [`as.list()`][as.list.polars_data_frame] #' @examples @@ -160,25 +211,119 @@ dataframe__get_columns <- function() { }) } +#' Group a DataFrame +#' +#' @inherit LazyFrame_group_by description params +#' @details Within each group, the order of the rows is always preserved, +#' regardless of the `maintain_order` argument. +#' @return [GroupBy][GroupBy_class] (a DataFrame with special groupby methods like `$agg()`) +#' @seealso +#' - [`$partition_by()`][DataFrame_partition_by] +#' @examples +#' df <- pl$DataFrame( +#' a = c("a", "b", "a", "b", "c"), +#' b = c(1, 2, 1, 3, 3), +#' c = c(5, 4, 3, 2, 1) +#' ) +#' +#' df$group_by("a")$agg(pl$col("b")$sum()) +#' +#' # Set `maintain_order = TRUE` to ensure the order of the groups is +#' # consistent with the input. +#' df$group_by("a", maintain_order = TRUE)$agg(pl$col("c")) +#' +#' # Group by multiple columns by passing a list of column names. +#' df$group_by(c("a", "b"))$agg(pl$max("c")) +#' +#' # Or pass some arguments to group by multiple columns in the same way. +#' # Expressions are also accepted. 
+#' df$group_by("a", pl$col("b") %/% 2)$agg( +#' pl$col("c")$mean() +#' ) +#' +#' # The columns will be renamed to the argument names. +#' df$group_by(d = "a", e = pl$col("b") %/% 2)$agg( +#' pl$col("c")$mean() +#' ) dataframe__group_by <- function(..., maintain_order = FALSE) { wrap_to_group_by(self, list2(...), maintain_order) } +#' Select and modify columns of a DataFrame +#' +#' @inherit lazyframe__select description params +#' @inherit as_polars_df return +#' @examples +#' as_polars_df(iris)$select( +#' abs_SL = pl$col("Sepal.Length")$abs(), +#' add_2_SL = pl$col("Sepal.Length") + 2 +#' ) dataframe__select <- function(...) { self$lazy()$select(...)$collect(`_eager` = TRUE) |> wrap() } +#' Modify/append column(s) of a DataFrame +#' +#' @inherit lazyframe__with_columns description params +#' @inherit as_polars_df return +#' @examples +#' as_polars_df(iris)$with_columns( +#' abs_SL = pl$col("Sepal.Length")$abs(), +#' add_2_SL = pl$col("Sepal.Length") + 2 +#' ) +#' +#' # same query +#' l_expr <- list( +#' pl$col("Sepal.Length")$abs()$alias("abs_SL"), +#' (pl$col("Sepal.Length") + 2)$alias("add_2_SL") +#' ) +#' as_polars_df(iris)$with_columns(l_expr) +#' +#' as_polars_df(iris)$with_columns( +#' SW_add_2 = (pl$col("Sepal.Width") + 2), +#' # unnamed expr will keep name "Sepal.Length" +#' pl$col("Sepal.Length")$abs() +#' ) dataframe__with_columns <- function(...) { self$lazy()$with_columns(...)$collect(`_eager` = TRUE) |> wrap() } +# TODO-REWRITE: before release, add in news that param idx was renamed "index" +# and mention that it errors if out of bounds +#' Select column as Series at index location +#' +#' @param index Index of the column to return as Series. Defaults to 0, which is +#' the first column. 
+#' +#' @return Series or NULL +#' @examples +#' df <- as_polars_df(iris[1:10, ]) +#' +#' # default is to extract the first column +#' df$to_series() +#' +#' # Polars is 0-indexed, so we use index = 1 to extract the *2nd* column +#' df$to_series(index = 1) +#' +#' # doesn't error if the column isn't there +#' df$to_series(index = 8) dataframe__to_series <- function(index = 0) { self$`_df`$to_series(index) |> wrap() } +#' Check whether the DataFrame is equal to another DataFrame +#' +#' @param other DataFrame to compare with. +#' @return A logical value +#' @examples +#' dat1 <- as_polars_df(iris) +#' dat2 <- as_polars_df(iris) +#' dat3 <- as_polars_df(mtcars) +#' dat1$equals(dat2) +#' dat1$equals(dat3) dataframe__equals <- function(other, ..., null_equal = TRUE) { wrap({ check_dots_empty0(...) @@ -188,6 +333,19 @@ dataframe__equals <- function(other, ..., null_equal = TRUE) { }) } +#' Get a slice of the DataFrame. +#' +#' @inherit as_polars_df return +#' @param offset Start index, can be a negative value. This is 0-indexed, so +#' `offset = 1` skips the first row. +#' @param length Length of the slice. If `NULL` (default), all rows starting at +#' the offset will be selected. +#' @examples +#' # skip the first 2 rows and take the 4 following rows +#' as_polars_df(mtcars)$slice(2, 4) +#' +#' # this is equivalent to: +#' mtcars[3:6, ] dataframe__slice <- function(offset, length = NULL) { wrap({ check_number_decimal(offset, allow_infinite = FALSE) @@ -198,6 +356,17 @@ dataframe__slice <- function(offset, length = NULL) { }) } +#' @inherit LazyFrame_head title details +#' @param n Number of rows to return. If a negative value is passed, +#' return all rows except the last [`abs(n)`][abs]. +#' @return A [DataFrame][DataFrame_class] +#' @examples +#' df <- pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5]) +#' +#' df$head(3) +#' +#' # Pass a negative value to get all rows except the last `abs(n)`. 
+#' df$head(-3) dataframe__head <- function(n = 5) { wrap({ if (isTRUE(n < 0)) { @@ -207,6 +376,17 @@ dataframe__head <- function(n = 5) { }) } +#' @inherit LazyFrame_tail title +#' @param n Number of rows to return. If a negative value is passed, +#' return all rows except the first [`abs(n)`][abs]. +#' @inherit DataFrame_head return +#' @examples +#' df <- pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5]) +#' +#' df$tail(3) +#' +#' # Pass a negative value to get all rows except the first `abs(n)`. +#' df$tail(-3) dataframe__tail <- function(n = 5) { wrap({ if (isTRUE(n < 0)) { @@ -216,22 +396,87 @@ dataframe__tail <- function(n = 5) { }) } +#' Drop columns of a DataFrame +#' +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Characters of column names to +#' drop. Passed to [`pl$col()`][pl__col]. +#' @param strict Validate that all column names exist in the schema and throw an +#' exception if a column name does not exist in the schema. +#' +#' @inherit as_polars_df return +#' @examples +#' as_polars_df(mtcars)$drop(c("mpg", "hp")) +#' +#' # equivalent +#' as_polars_df(mtcars)$drop("mpg", "hp") dataframe__drop <- function(..., strict = TRUE) { self$lazy()$drop(..., strict = strict)$collect(`_eager` = TRUE) |> wrap() } # TODO: accept formulas for type mapping +#' Cast DataFrame column(s) to the specified dtype +#' +#' @inherit LazyFrame_cast description params +#' +#' @inherit as_polars_df return +#' @examples +#' df <- pl$DataFrame( +#' foo = 1:3, +#' bar = c(6, 7, 8), +#' ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")) +#' ) +#' +#' # Cast only some columns +#' df$cast(list(foo = pl$Float32, bar = pl$UInt8)) +#' +#' # Cast all columns to the same type +#' df$cast(pl$String) dataframe__cast <- function(..., strict = TRUE) { self$lazy()$cast(..., strict = strict)$collect(`_eager` = TRUE) |> wrap() } +#' Filter rows of a DataFrame +#' +#' @inherit LazyFrame_filter description params details +#' +#' @inherit as_polars_df return +#' @examples +#' df 
<- as_polars_df(iris) +#' +#' df$filter(pl$col("Sepal.Length") > 5) +#' +#' # This is equivalent to +#' # df$filter(pl$col("Sepal.Length") > 5 & pl$col("Petal.Width") < 1) +#' df$filter(pl$col("Sepal.Length") > 5, pl$col("Petal.Width") < 1) +#' +#' # rows where condition is NA are dropped +#' iris2 <- iris +#' iris2[c(1, 3, 5), "Species"] <- NA +#' df <- as_polars_df(iris2) +#' +#' df$filter(pl$col("Species") == "setosa") dataframe__filter <- function(...) { self$lazy()$filter(...)$collect(`_eager` = TRUE) |> wrap() } +#' Sort a DataFrame +#' @inherit LazyFrame_sort details description params +#' @inheritParams DataFrame_unique +#' @inherit as_polars_df return +#' @examples +#' df <- mtcars +#' df$mpg[1] <- NA +#' df <- as_polars_df(df) +#' df$sort("mpg") +#' df$sort("mpg", nulls_last = TRUE) +#' df$sort("cyl", "mpg") +#' df$sort(c("cyl", "mpg")) +#' df$sort(c("cyl", "mpg"), descending = TRUE) +#' df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE)) +#' df$sort(pl$col("cyl"), pl$col("mpg")) dataframe__sort <- function( ..., descending = FALSE, diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R index 231742a2..89b890d9 100644 --- a/R/lazyframe-frame.R +++ b/R/lazyframe-frame.R @@ -6,7 +6,7 @@ #' and is the preferred (and highest-performance) mode of operation for polars. #' #' The `pl$LazyFrame(...)` function is a shortcut for `pl$DataFrame(...)$lazy()`. -#' @aliases plars_lazy_frame LazyFrame +#' @aliases polars_lazy_frame LazyFrame #' @inheritParams pl__DataFrame #' @return A polars [LazyFrame] #' @seealso @@ -46,7 +46,16 @@ wrap.PlRLazyFrame <- function(x, ...) { } # TODO: link to pl__select -#' Select columns from this LazyFrame +#' Select and modify columns of a LazyFrame +#' +#' @description +#' Select and perform operations on a subset of columns only. This discards +#' unmentioned columns (like `.()` in `data.table` and contrarily to +#' `dplyr::mutate()`). +#' +#' One cannot use new variables in subsequent expressions in the same +#' `$select()` call. 
For instance, if you create a variable `x`, you will only +#' be able to use it in another `$select()` or `$with_columns()` call. #' #' @inherit pl__LazyFrame return #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> @@ -264,12 +273,17 @@ lazyframe__sort <- function( }) } -#' Add columns to this LazyFrame +#' Modify/append column(s) of a LazyFrame +#' +#' @description +#' Add columns or modify existing ones with expressions. This is similar to +#' `dplyr::mutate()` as it keeps unmentioned columns (unlike `$select()`). #' -#' Added columns will replace existing columns with the same name. +#' However, unlike `dplyr::mutate()`, one cannot use new variables in subsequent +#' expressions in the same `$with_columns()` call. For instance, if you create a +#' variable `x`, you will only be able to use it in another `$with_columns()` +#' or `$select()` call. #' -#' Creating a new LazyFrame using this method does not create a new copy of -#' existing data. #' @inherit pl__LazyFrame return #' @inheritParams lazyframe__select #' @examples diff --git a/man/dataframe__cast.Rd b/man/dataframe__cast.Rd new file mode 100644 index 00000000..afbc236d --- /dev/null +++ b/man/dataframe__cast.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__cast} +\alias{dataframe__cast} +\title{Cast DataFrame column(s) to the specified dtype} +\usage{ +dataframe__cast(..., strict = TRUE) +} +\value{ +A polars \link{DataFrame} +} +\description{ +Cast DataFrame column(s) to the specified dtype +} +\examples{ +df <- pl$DataFrame( + foo = 1:3, + bar = c(6, 7, 8), + ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")) +) + +# Cast only some columns +df$cast(list(foo = pl$Float32, bar = pl$UInt8)) + +# Cast all columns to the same type +df$cast(pl$String) +} diff --git a/man/dataframe__clone.Rd b/man/dataframe__clone.Rd new file mode 100644 index 00000000..f6a70533 --- /dev/null +++ b/man/dataframe__clone.Rd @@ -0,0
+1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__clone} +\alias{dataframe__clone} +\title{Clone a DataFrame} +\usage{ +dataframe__clone() +} +\value{ +A polars \link{DataFrame} +} +\description{ +This is a cheap operation that does not copy data. Assigning does not copy +the DataFrame (environment object). This is because environment objects have +reference semantics. Calling $clone() creates a new environment, which can +be useful when dealing with attributes (see examples). +} +\examples{ +df1 <- as_polars_df(iris) + +# Assigning does not copy the DataFrame (environment object), calling +# $clone() creates a new environment. +df2 <- df1 +df3 <- df1$clone() +rlang::env_label(df1) +rlang::env_label(df2) +rlang::env_label(df3) + +# Cloning can be useful to add attributes to data used in a function without +# adding those attributes to the original object. + +# Make a function to take a DataFrame, add an attribute, and return a +# DataFrame: +give_attr <- function(data) { + attr(data, "created_on") <- "2024-01-29" + data +} +df2 <- give_attr(df1) + +# Problem: the original DataFrame also gets the attribute while it shouldn't +attributes(df1) + +# Use $clone() inside the function to avoid that +give_attr <- function(data) { + data <- data$clone() + attr(data, "created_on") <- "2024-01-29" + data +} +df1 <- as_polars_df(iris) +df2 <- give_attr(df1) + +# now, the original DataFrame doesn't get this attribute +attributes(df1) +} diff --git a/man/dataframe__drop.Rd b/man/dataframe__drop.Rd new file mode 100644 index 00000000..b979acc5 --- /dev/null +++ b/man/dataframe__drop.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__drop} +\alias{dataframe__drop} +\title{Drop columns of a DataFrame} +\usage{ +dataframe__drop(..., strict = TRUE) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> 
Characters of column names to +drop. Passed to \code{\link[=pl__col]{pl$col()}}.} + +\item{strict}{Validate that all column names exist in the schema and throw an +exception if a column name does not exist in the schema.} +} +\value{ +A polars \link{DataFrame} +} +\description{ +Drop columns of a DataFrame +} +\examples{ +as_polars_df(mtcars)$drop(c("mpg", "hp")) + +# equivalent +as_polars_df(mtcars)$drop("mpg", "hp") +} diff --git a/man/dataframe__equals.Rd b/man/dataframe__equals.Rd new file mode 100644 index 00000000..8fd8e193 --- /dev/null +++ b/man/dataframe__equals.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__equals} +\alias{dataframe__equals} +\title{Check whether the DataFrame is equal to another DataFrame} +\usage{ +dataframe__equals(other, ..., null_equal = TRUE) +} +\arguments{ +\item{other}{DataFrame to compare with.} +} +\value{ +A logical value +} +\description{ +Check whether the DataFrame is equal to another DataFrame +} +\examples{ +dat1 <- as_polars_df(iris) +dat2 <- as_polars_df(iris) +dat3 <- as_polars_df(mtcars) +dat1$equals(dat2) +dat1$equals(dat3) +} diff --git a/man/dataframe__filter.Rd b/man/dataframe__filter.Rd new file mode 100644 index 00000000..71eb3993 --- /dev/null +++ b/man/dataframe__filter.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__filter} +\alias{dataframe__filter} +\title{Filter rows of a DataFrame} +\usage{ +dataframe__filter(...) 
+} +\value{ +A polars \link{DataFrame} +} +\description{ +Filter rows of a DataFrame +} +\examples{ +df <- as_polars_df(iris) + +df$filter(pl$col("Sepal.Length") > 5) + +# This is equivalent to +# df$filter(pl$col("Sepal.Length") > 5 & pl$col("Petal.Width") < 1) +df$filter(pl$col("Sepal.Length") > 5, pl$col("Petal.Width") < 1) + +# rows where condition is NA are dropped +iris2 <- iris +iris2[c(1, 3, 5), "Species"] <- NA +df <- as_polars_df(iris2) + +df$filter(pl$col("Species") == "setosa") +} diff --git a/man/dataframe__get_columns.Rd b/man/dataframe__get_columns.Rd index fc93d599..820a1ded 100644 --- a/man/dataframe__get_columns.Rd +++ b/man/dataframe__get_columns.Rd @@ -2,15 +2,15 @@ % Please edit documentation in R/dataframe-frame.R \name{dataframe__get_columns} \alias{dataframe__get_columns} -\title{Get the DataFrame as a List of Series} +\title{Get the DataFrame as a list of Series} \usage{ dataframe__get_columns() } \value{ -A \link{list} of \link{Series} +A list of \link{Series} } \description{ -Get the DataFrame as a List of Series +Get the DataFrame as a list of Series } \examples{ df <- pl$DataFrame(foo = c(1, 2, 3), bar = c(4, 5, 6)) diff --git a/man/dataframe__group_by.Rd b/man/dataframe__group_by.Rd new file mode 100644 index 00000000..bcbcd3b9 --- /dev/null +++ b/man/dataframe__group_by.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__group_by} +\alias{dataframe__group_by} +\title{Group a DataFrame} +\usage{ +dataframe__group_by(..., maintain_order = FALSE) +} +\value{ +\link[=GroupBy_class]{GroupBy} (a DataFrame with special groupby methods like \verb{$agg()}) +} +\description{ +Group a DataFrame +} +\details{ +Within each group, the order of the rows is always preserved, +regardless of the \code{maintain_order} argument. 
+} +\examples{ +df <- pl$DataFrame( + a = c("a", "b", "a", "b", "c"), + b = c(1, 2, 1, 3, 3), + c = c(5, 4, 3, 2, 1) +) + +df$group_by("a")$agg(pl$col("b")$sum()) + +# Set `maintain_order = TRUE` to ensure the order of the groups is +# consistent with the input. +df$group_by("a", maintain_order = TRUE)$agg(pl$col("c")) + +# Group by multiple columns by passing a list of column names. +df$group_by(c("a", "b"))$agg(pl$max("c")) + +# Or pass some arguments to group by multiple columns in the same way. +# Expressions are also accepted. +df$group_by("a", pl$col("b") \%/\% 2)$agg( + pl$col("c")$mean() +) + +# The columns will be renamed to the argument names. +df$group_by(d = "a", e = pl$col("b") \%/\% 2)$agg( + pl$col("c")$mean() +) +} +\seealso{ +\itemize{ +\item \code{\link[=DataFrame_partition_by]{$partition_by()}} +} +} diff --git a/man/dataframe__lazy.Rd b/man/dataframe__lazy.Rd new file mode 100644 index 00000000..a2a8d719 --- /dev/null +++ b/man/dataframe__lazy.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__lazy} +\alias{dataframe__lazy} +\title{Convert an existing DataFrame to a LazyFrame} +\usage{ +dataframe__lazy() +} +\value{ +A polars \link{LazyFrame} +} +\description{ +Start a new lazy query from a DataFrame. +} +\examples{ +pl$DataFrame(a = 1:2, b = c(NA, "a"))$lazy() +} diff --git a/man/dataframe__select.Rd b/man/dataframe__select.Rd new file mode 100644 index 00000000..9d570adc --- /dev/null +++ b/man/dataframe__select.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__select} +\alias{dataframe__select} +\title{Select and modify columns of a DataFrame} +\usage{ +dataframe__select(...) 
+} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> +Name-value pairs of objects to be converted to polars \link[=Expr]{expressions} +by the \code{\link[=as_polars_expr]{as_polars_expr()}} function. +Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}. +Each name will be used as the expression name.} +} +\value{ +A polars \link{DataFrame} +} +\description{ +Select and perform operations on a subset of columns only. This discards +unmentioned columns (like \code{.()} in \code{data.table} and contrarily to +\code{dplyr::mutate()}). + +One cannot use new variables in subsequent expressions in the same +\verb{$select()} call. For instance, if you create a variable \code{x}, you will only +be able to use it in another \verb{$select()} or \verb{$with_columns()} call. +} +\examples{ +as_polars_df(iris)$select( + abs_SL = pl$col("Sepal.Length")$abs(), + add_2_SL = pl$col("Sepal.Length") + 2 +) +} diff --git a/man/dataframe__slice.Rd b/man/dataframe__slice.Rd new file mode 100644 index 00000000..56ffdcab --- /dev/null +++ b/man/dataframe__slice.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__slice} +\alias{dataframe__slice} +\title{Get a slice of the DataFrame.} +\usage{ +dataframe__slice(offset, length = NULL) +} +\arguments{ +\item{offset}{Start index, can be a negative value. This is 0-indexed, so +\code{offset = 1} skips the first row.} + +\item{length}{Length of the slice. If \code{NULL} (default), all rows starting at +the offset will be selected.} +} +\value{ +A polars \link{DataFrame} +} +\description{ +Get a slice of the DataFrame. 
+} +\examples{ +# skip the first 2 rows and take the 4 following rows +as_polars_df(mtcars)$slice(2, 4) + +# this is equivalent to: +mtcars[3:6, ] +} diff --git a/man/dataframe__sort.Rd b/man/dataframe__sort.Rd new file mode 100644 index 00000000..26fff1e3 --- /dev/null +++ b/man/dataframe__sort.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__sort} +\alias{dataframe__sort} +\title{Sort a DataFrame} +\usage{ +dataframe__sort( + ..., + descending = FALSE, + nulls_last = FALSE, + multithreaded = TRUE, + maintain_order = FALSE +) +} +\value{ +A polars \link{DataFrame} +} +\description{ +Sort a DataFrame +} +\examples{ +df <- mtcars +df$mpg[1] <- NA +df <- as_polars_df(df) +df$sort("mpg") +df$sort("mpg", nulls_last = TRUE) +df$sort("cyl", "mpg") +df$sort(c("cyl", "mpg")) +df$sort(c("cyl", "mpg"), descending = TRUE) +df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE)) +df$sort(pl$col("cyl"), pl$col("mpg")) +} diff --git a/man/dataframe__to_series.Rd b/man/dataframe__to_series.Rd new file mode 100644 index 00000000..10f80651 --- /dev/null +++ b/man/dataframe__to_series.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__to_series} +\alias{dataframe__to_series} +\title{Select column as Series at index location} +\usage{ +dataframe__to_series(index = 0) +} +\arguments{ +\item{index}{Index of the column to return as Series. 
Defaults to 0, which is +the first column.} +} +\value{ +Series or NULL +} +\description{ +Select column as Series at index location +} +\examples{ +df <- as_polars_df(iris[1:10, ]) + +# default is to extract the first column +df$to_series() + +# Polars is 0-indexed, so we use index = 1 to extract the *2nd* column +df$to_series(index = 1) + +# doesn't error if the column isn't there +df$to_series(index = 8) +} diff --git a/man/dataframe__with_columns.Rd b/man/dataframe__with_columns.Rd new file mode 100644 index 00000000..677dc3c0 --- /dev/null +++ b/man/dataframe__with_columns.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe-frame.R +\name{dataframe__with_columns} +\alias{dataframe__with_columns} +\title{Modify/append column(s) of a DataFrame} +\usage{ +dataframe__with_columns(...) +} +\arguments{ +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> +Name-value pairs of objects to be converted to polars \link[=Expr]{expressions} +by the \code{\link[=as_polars_expr]{as_polars_expr()}} function. +Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}. +Each name will be used as the expression name.} +} +\value{ +A polars \link{DataFrame} +} +\description{ +Add columns or modify existing ones with expressions. This is similar to +\code{dplyr::mutate()} as it keeps unmentioned columns (unlike \verb{$select()}). + +However, unlike \code{dplyr::mutate()}, one cannot use new variables in subsequent +expressions in the same \verb{$with_columns()} call. For instance, if you create a +variable \code{x}, you will only be able to use it in another \verb{$with_columns()} +or \verb{$select()} call.
+} +\examples{ +as_polars_df(iris)$with_columns( + abs_SL = pl$col("Sepal.Length")$abs(), + add_2_SL = pl$col("Sepal.Length") + 2 +) + +# same query +l_expr <- list( + pl$col("Sepal.Length")$abs()$alias("abs_SL"), + (pl$col("Sepal.Length") + 2)$alias("add_2_SL") +) +as_polars_df(iris)$with_columns(l_expr) + +as_polars_df(iris)$with_columns( + SW_add_2 = (pl$col("Sepal.Width") + 2), + # unnamed expr will keep name "Sepal.Length" + pl$col("Sepal.Length")$abs() +) +} diff --git a/man/lazyframe__select.Rd b/man/lazyframe__select.Rd index 2fcdb0bb..3ef41b08 100644 --- a/man/lazyframe__select.Rd +++ b/man/lazyframe__select.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/lazyframe-frame.R \name{lazyframe__select} \alias{lazyframe__select} -\title{Select columns from this LazyFrame} +\title{Select and modify columns of a LazyFrame} \usage{ lazyframe__select(...) } @@ -17,7 +17,13 @@ Each name will be used as the expression name.} A polars \link{LazyFrame} } \description{ -Select columns from this LazyFrame +Select and perform operations on a subset of columns only. This discards +unmentioned columns (like \code{.()} in \code{data.table} and contrarily to +\code{dplyr::mutate()}). + +One cannot use new variables in subsequent expressions in the same +\verb{$select()} call. For instance, if you create a variable \code{x}, you will only +be able to use it in another \verb{$select()} or \verb{$with_columns()} call. } \examples{ # Pass the name of a column to select that column. diff --git a/man/lazyframe__with_columns.Rd b/man/lazyframe__with_columns.Rd index 641f6f19..60262f97 100644 --- a/man/lazyframe__with_columns.Rd +++ b/man/lazyframe__with_columns.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/lazyframe-frame.R \name{lazyframe__with_columns} \alias{lazyframe__with_columns} -\title{Add columns to this LazyFrame} +\title{Modify/append column(s) of a LazyFrame} \usage{ lazyframe__with_columns(...) 
} @@ -17,11 +17,13 @@ Each name will be used as the expression name.} A polars \link{LazyFrame} } \description{ -Added columns will replace existing columns with the same name. -} -\details{ -Creating a new LazyFrame using this method does not create a new copy of -existing data. +Add columns or modify existing ones with expressions. This is similar to +\code{dplyr::mutate()} as it keeps unmentioned columns (unlike \verb{$select()}). + +However, unlike \code{dplyr::mutate()}, one cannot use new variables in subsequent +expressions in the same \verb{$with_columns()} call. For instance, if you create a +variable \code{x}, you will only be able to use it in another \verb{$with_columns()} +or \verb{$select()} call. } \examples{ # Pass an expression to add it as a new column. diff --git a/man/pl__LazyFrame.Rd b/man/pl__LazyFrame.Rd index 95d5dec0..1be5042f 100644 --- a/man/pl__LazyFrame.Rd +++ b/man/pl__LazyFrame.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/lazyframe-frame.R \name{pl__LazyFrame} \alias{pl__LazyFrame} -\alias{plars_lazy_frame} +\alias{polars_lazy_frame} \alias{LazyFrame} \title{Polars LazyFrame class (\code{polars_lazy_frame})} \usage{