Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
Merge branch 'main' into shift-refactor

# Conflicts:
#	tests/testthat/_snaps/after-wrappers.md
  • Loading branch information
etiennebacher committed Aug 23, 2024
2 parents d7a2ae4 + 1d49add commit 08f0820
Show file tree
Hide file tree
Showing 10 changed files with 260 additions and 122 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@
`hive_partitioning`, `hive_schema`, and `try_parse_hive_dates` (#1183).
- `$scan_parquet()` and `$read_parquet()` gain two new arguments for more control
on importing hive partitions: `hive_schema` and `try_parse_hive_dates` (#1189).
- New method `$gather_every()` for `LazyFrame` and `DataFrame` (#1199).
- `$glimpse()` for `DataFrame` has two new arguments `max_items_per_column` and
`max_colname_length` (#1200).

### Other changes

Expand Down
53 changes: 37 additions & 16 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1729,35 +1729,41 @@ DataFrame_describe = function(percentiles = c(0.25, 0.75), interpolation = "near
uw()
}

#' @title Glimpse values in a DataFrame
#' @keywords DataFrame
#' @param ... not used
#' Show a dense preview of the DataFrame
#'
#' The formatting shows one line per column so that wide DataFrames display
#' cleanly. Each line shows the column name, the data type, and the first few
#' values.
#'
#' @param ... Ignored.
#' @param max_items_per_column Maximum number of items to show per column.
#' @param max_colname_length Maximum length of the displayed column names. Names
#' longer than this are truncated with a trailing ellipsis.
#' @param return_as_string Logical (default `FALSE`). If `TRUE`, return the
#' output as a string.
#'
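#' @return If `return_as_string` is `TRUE`, the output as a string. Otherwise
#' the output is printed and `NULL` is returned invisibly.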
#' @examples
#' pl$DataFrame(iris)$glimpse()
DataFrame_glimpse = function(..., return_as_string = FALSE) {
# guard input
DataFrame_glimpse = function(
...,
max_items_per_column = 10,
max_colname_length = 50,
return_as_string = FALSE) {
if (!is_scalar_bool(return_as_string)) {
RPolarsErr$new()$
bad_robj(return_as_string)$
mistyped("bool")$
bad_arg("return_as_string") |>
Err() |>
Err_plain("`return_as_string` must be `TRUE` or `FALSE`.") |>
unwrap("in $glimpse() :")
}

# closure to extract col info from a column in <self>
max_num_value = min(10, self$height)
max_col_name_trunc = 50
max_num_value = min(max_items_per_column, self$height)

parse_column_ = \(col_name, dtype) {
dtype_str = dtype_str_repr(dtype) |> unwrap_or(paste0("??", str_string(dtype)))
if (inherits(dtype, "RPolarsDataType")) dtype_str = paste0(" <", dtype_str, ">")
val = self$select(pl$col(col_name)$slice(0, max_num_value))$to_list()[[1]]
val_str = paste(val, collapse = ", ")
if (nchar(col_name) > max_col_name_trunc) {
col_name = paste0(substr(col_name, 1, max_col_name_trunc - 3), "...")
if (nchar(col_name) > max_colname_length) {
col_name = paste0(substr(col_name, 1, max_colname_length - 3), "...")
}
list(
col_name = col_name,
Expand Down Expand Up @@ -1790,7 +1796,6 @@ DataFrame_glimpse = function(..., return_as_string = FALSE) {
) |>
unwrap("in $glimpse() :")

# chose return type
if (return_as_string) output else invisible(cat(output))
}

Expand Down Expand Up @@ -2488,3 +2493,19 @@ DataFrame_sql = function(query, ..., table_name = NULL, envir = parent.frame())
result() |>
unwrap("in $sql():")
}


#' Gather every nth row of a DataFrame
#'
#' Applies `$gather_every()` to all columns of the DataFrame through
#' `$select()`.
#'
#' @inheritParams LazyFrame_gather_every
#'
#' @return A DataFrame
#'
#' @examples
#' df = pl$DataFrame(a = 1:4, b = 5:8)
#' df$gather_every(2)
#'
#' df$gather_every(2, offset = 1)
DataFrame_gather_every = function(n, offset = 0) {
  # Build the row-picking expression over all columns, then apply it.
  every_nth = pl$col("*")$gather_every(n, offset)
  self$select(every_nth)
}
17 changes: 17 additions & 0 deletions R/lazyframe__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -2235,3 +2235,20 @@ LazyFrame_sql = function(query, ..., table_name = NULL, envir = parent.frame())
}) |>
unwrap("in $sql():")
}


#' Gather every nth row of a LazyFrame
#'
#' Applies `$gather_every()` to all columns of the LazyFrame through
#' `$select()`.
#'
#' @param n Gather every `n`-th row.
#' @param offset Starting index.
#'
#' @return A LazyFrame
#'
#' @examples
#' lf = pl$LazyFrame(a = 1:4, b = 5:8)
#' lf$gather_every(2)$collect()
#'
#' lf$gather_every(2, offset = 1)$collect()
LazyFrame_gather_every = function(n, offset = 0) {
  # Expression selecting every n-th value of each column, from `offset` on.
  strided_cols = pl$col("*")$gather_every(n, offset)
  self$select(strided_cols)
}
25 changes: 25 additions & 0 deletions man/DataFrame_gather_every.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 16 additions & 5 deletions man/DataFrame_glimpse.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions man/LazyFrame_gather_every.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

124 changes: 30 additions & 94 deletions tests/testthat/_snaps/after-wrappers.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,52 +81,21 @@
[5] "drop" "drop_in_place" "drop_nulls" "dtype_strings"
[9] "dtypes" "equals" "estimated_size" "explode"
[13] "fill_nan" "fill_null" "filter" "first"
[17] "flags" "get_column" "get_columns" "glimpse"
[21] "group_by" "group_by_dynamic" "head" "height"
[25] "item" "join" "join_asof" "last"
[29] "lazy" "limit" "max" "mean"
[33] "median" "min" "n_chunks" "null_count"
[37] "partition_by" "pivot" "print" "quantile"
[41] "rechunk" "rename" "reverse" "rolling"
[45] "sample" "schema" "select" "select_seq"
[49] "shape" "shift" "slice" "sort"
[53] "sql" "std" "sum" "tail"
[57] "to_data_frame" "to_list" "to_raw_ipc" "to_series"
[61] "to_struct" "transpose" "unique" "unnest"
[65] "unpivot" "var" "width" "with_columns"
[69] "with_columns_seq" "with_row_index" "write_csv" "write_ipc"
[73] "write_json" "write_ndjson" "write_parquet"

---

Code
ls(.pr[[private_key]])
Output
[1] "clear" "clone_in_rust"
[3] "columns" "default"
[5] "drop_all_in_place" "drop_in_place"
[7] "dtype_strings" "dtypes"
[9] "equals" "estimated_size"
[11] "export_stream" "from_arrow_record_batches"
[13] "from_raw_ipc" "get_column"
[15] "get_columns" "lazy"
[17] "n_chunks" "new_with_capacity"
[19] "null_count" "partition_by"
[21] "pivot_expr" "print"
[23] "rechunk" "sample_frac"
[25] "sample_n" "schema"
[27] "select" "select_at_idx"
[29] "select_seq" "set_column_from_robj"
[31] "set_column_from_series" "set_column_names_mut"
[33] "shape" "to_list"
[35] "to_list_tag_structs" "to_list_unwind"
[37] "to_raw_ipc" "to_struct"
[39] "transpose" "unnest"
[41] "unpivot" "with_columns"
[43] "with_columns_seq" "with_row_index"
[45] "write_csv" "write_ipc"
[47] "write_json" "write_ndjson"
[49] "write_parquet"
[17] "flags" "gather_every" "get_column" "get_columns"
[21] "glimpse" "group_by" "group_by_dynamic" "head"
[25] "height" "item" "join" "join_asof"
[29] "last" "lazy" "limit" "max"
[33] "mean" "median" "min" "n_chunks"
[37] "null_count" "partition_by" "pivot" "print"
[41] "quantile" "rechunk" "rename" "reverse"
[45] "rolling" "sample" "schema" "select"
[49] "select_seq" "shape" "shift" "slice"
[53] "sort" "sql" "std" "sum"
[57] "tail" "to_data_frame" "to_list" "to_raw_ipc"
[61] "to_series" "to_struct" "transpose" "unique"
[65] "unnest" "unpivot" "var" "width"
[69] "with_columns" "with_columns_seq" "with_row_index" "write_csv"
[73] "write_ipc" "write_json" "write_ndjson" "write_parquet"

# public and private methods of each class GroupBy

Expand All @@ -147,54 +116,21 @@
[7] "drop_nulls" "dtypes" "explain"
[10] "explode" "fetch" "fill_nan"
[13] "fill_null" "filter" "first"
[16] "group_by" "group_by_dynamic" "head"
[19] "join" "join_asof" "last"
[22] "limit" "max" "mean"
[25] "median" "min" "print"
[28] "profile" "quantile" "rename"
[31] "reverse" "rolling" "schema"
[34] "select" "select_seq" "serialize"
[37] "shift" "sink_csv" "sink_ipc"
[40] "sink_ndjson" "sink_parquet" "slice"
[43] "sort" "sql" "std"
[46] "sum" "tail" "to_dot"
[49] "unique" "unnest" "unpivot"
[52] "var" "width" "with_columns"
[55] "with_columns_seq" "with_context" "with_row_index"

---

Code
ls(.pr[[private_key]])
Output
[1] "clone_in_rust" "collect"
[3] "collect_in_background" "debug_plan"
[5] "describe_optimized_plan" "describe_optimized_plan_tree"
[7] "describe_plan" "describe_plan_tree"
[9] "deserialize" "drop"
[11] "drop_nulls" "explode"
[13] "fetch" "fill_nan"
[15] "fill_null" "filter"
[17] "first" "group_by"
[19] "group_by_dynamic" "join"
[21] "join_asof" "last"
[23] "max" "mean"
[25] "median" "min"
[27] "optimization_toggle" "print"
[29] "profile" "quantile"
[31] "rename" "reverse"
[33] "rolling" "schema"
[35] "select" "select_seq"
[37] "serialize" "shift"
[39] "sink_csv" "sink_ipc"
[41] "sink_json" "sink_parquet"
[43] "slice" "sort_by_exprs"
[45] "std" "sum"
[47] "tail" "to_dot"
[49] "unique" "unnest"
[51] "unpivot" "var"
[53] "with_columns" "with_columns_seq"
[55] "with_context" "with_row_index"
[16] "gather_every" "group_by" "group_by_dynamic"
[19] "head" "join" "join_asof"
[22] "last" "limit" "max"
[25] "mean" "median" "min"
[28] "print" "profile" "quantile"
[31] "rename" "reverse" "rolling"
[34] "schema" "select" "select_seq"
[37] "serialize" "shift" "sink_csv"
[40] "sink_ipc" "sink_ndjson" "sink_parquet"
[43] "slice" "sort" "sql"
[46] "std" "sum" "tail"
[49] "to_dot" "unique" "unnest"
[52] "unpivot" "var" "width"
[55] "with_columns" "with_columns_seq" "with_context"
[58] "with_row_index"

# public and private methods of each class Expr

Expand Down
40 changes: 38 additions & 2 deletions tests/testthat/_snaps/dataframe.md
Original file line number Diff line number Diff line change
Expand Up @@ -475,10 +475,10 @@
│ max ┆ zz │
└────────────┴──────┘

# glimpse
# $glimpse() works

Code
pl$DataFrame(mtcars)$with_columns(pl$lit(42)$cast(pl$Int8))$glimpse()
df$glimpse()
Output
& mpg <f64> 21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2
& cyl <f64> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6
Expand All @@ -493,3 +493,39 @@
& carb <f64> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4
& literal <i8> 42, 42, 42, 42, 42, 42, 42, 42, 42, 42

---

Code
df$glimpse(max_items_per_column = 2)
Output
& mpg <f64> 21, 21
& cyl <f64> 6, 6
& disp <f64> 160, 160
& hp <f64> 110, 110
& drat <f64> 3.9, 3.9
& wt <f64> 2.62, 2.875
& qsec <f64> 16.46, 17.02
& vs <f64> 0, 0
& am <f64> 1, 1
& gear <f64> 4, 4
& carb <f64> 4, 4
& literal <i8> 42, 42

---

Code
df$glimpse(max_colname_length = 2)
Output
& ... <f64> 21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2
& ... <f64> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6
& ... <f64> 160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 167.6
& hp <f64> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123
& ... <f64> 3.9, 3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92
& wt <f64> 2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 3.15, 3.44
& ... <f64> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3
& vs <f64> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1
& am <f64> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
& ... <f64> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4
& ... <f64> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4
& ... <i8> 42, 42, 42, 42, 42, 42, 42, 42, 42, 42

Loading

0 comments on commit 08f0820

Please sign in to comment.