merge

Merge branch 'main' into shift-refactor # Conflicts: # tests/testthat/_snaps/after-wrappers.md
pola-rs · Aug 23, 2024 · 08f0820 · 08f0820
2 parents d7a2ae4 + 1d49add
commit 08f0820
Show file tree

Hide file tree

Showing 10 changed files with 260 additions and 122 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -48,6 +48,9 @@
   `hive_partitioning`, `hive_schema`, and `try_parse_hive_dates` (#1183).
 - `$scan_parquet()` and `$read_parquet()` gain two new arguments for more control
   on importing hive partitions: `hive_schema` and `try_parse_hive_dates` (#1189).
+- New method `$gather_every()` for `LazyFrame` and `DataFrame` (#1199).
+- `$glimpse()` for `DataFrame` has two new arguments `max_items_per_column` and
+  `max_colname_length` (#1200).
 
 ### Other changes
 

diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R
@@ -1729,35 +1729,41 @@ DataFrame_describe = function(percentiles = c(0.25, 0.75), interpolation = "near
     uw()
 }
 
-#' @title Glimpse values in a DataFrame
-#' @keywords DataFrame
-#' @param ... not used
+#' Show a dense preview of the DataFrame
+#'
+#' The formatting shows one line per column so that wide DataFrames display
+#' cleanly. Each line shows the column name, the data type, and the first few
+#' values.
+#'
+#' @param ... Ignored.
+#' @param max_items_per_column Maximum number of items to show per column.
+#' @param max_colname_length Maximum length of the displayed column names. Names
+#' longer than this are truncated with a trailing ellipsis.
 #' @param return_as_string Logical (default `FALSE`). If `TRUE`, return the
 #' output as a string.
+#'
 #' @return A string if `return_as_string` is `TRUE`; otherwise `NULL`, invisibly.
 #' @examples
 #' pl$DataFrame(iris)$glimpse()
-DataFrame_glimpse = function(..., return_as_string = FALSE) {
-  # guard input
+DataFrame_glimpse = function(
+    ...,
+    max_items_per_column = 10,
+    max_colname_length = 50,
+    return_as_string = FALSE) {
   if (!is_scalar_bool(return_as_string)) {
-    RPolarsErr$new()$
-      bad_robj(return_as_string)$
-      mistyped("bool")$
-      bad_arg("return_as_string") |>
-      Err() |>
+    Err_plain("`return_as_string` must be `TRUE` or `FALSE`.") |>
       unwrap("in $glimpse() :")
   }
 
-  # closure to extract col info from a column in <self>
-  max_num_value = min(10, self$height)
-  max_col_name_trunc = 50
+  max_num_value = min(max_items_per_column, self$height)
+
   parse_column_ = \(col_name, dtype) {
     dtype_str = dtype_str_repr(dtype) |> unwrap_or(paste0("??", str_string(dtype)))
     if (inherits(dtype, "RPolarsDataType")) dtype_str = paste0(" <", dtype_str, ">")
     val = self$select(pl$col(col_name)$slice(0, max_num_value))$to_list()[[1]]
     val_str = paste(val, collapse = ", ")
-    if (nchar(col_name) > max_col_name_trunc) {
-      col_name = paste0(substr(col_name, 1, max_col_name_trunc - 3), "...")
+    if (nchar(col_name) > max_colname_length) {
+      col_name = paste0(substr(col_name, 1, max_colname_length - 3), "...")
     }
     list(
       col_name = col_name,
@@ -1790,7 +1796,6 @@ DataFrame_glimpse = function(..., return_as_string = FALSE) {
   ) |>
     unwrap("in $glimpse() :")
 
-  # chose return type
   if (return_as_string) output else invisible(cat(output))
 }
 
@@ -2488,3 +2493,19 @@ DataFrame_sql = function(query, ..., table_name = NULL, envir = parent.frame())
     result() |>
     unwrap("in $sql():")
 }
+
+
+#' Take every nth row in the DataFrame
+#'
+#' @inheritParams LazyFrame_gather_every
+#'
+#' @return A DataFrame
+#'
+#' @examples
+#' df = pl$DataFrame(a = 1:4, b = 5:8)
+#' df$gather_every(2)
+#'
+#' df$gather_every(2, offset = 1)
+DataFrame_gather_every = function(n, offset = 0) {
+  self$select(pl$col("*")$gather_every(n, offset))
+}
diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R
@@ -2235,3 +2235,20 @@ LazyFrame_sql = function(query, ..., table_name = NULL, envir = parent.frame())
   }) |>
     unwrap("in $sql():")
 }
+
+
+#' Take every nth row in the LazyFrame
+#'
+#' @param n Gather every `n`-th row.
+#' @param offset Index of the first row to take, counting from 0.
+#'
+#' @return A LazyFrame
+#'
+#' @examples
+#' lf = pl$LazyFrame(a = 1:4, b = 5:8)
+#' lf$gather_every(2)$collect()
+#'
+#' lf$gather_every(2, offset = 1)$collect()
+LazyFrame_gather_every = function(n, offset = 0) {
+  self$select(pl$col("*")$gather_every(n, offset))
+}
diff --git a/man/DataFrame_gather_every.Rd b/man/DataFrame_gather_every.Rd
diff --git a/man/DataFrame_glimpse.Rd b/man/DataFrame_glimpse.Rd
diff --git a/man/LazyFrame_gather_every.Rd b/man/LazyFrame_gather_every.Rd
diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md
@@ -81,52 +81,21 @@
        [5] "drop"             "drop_in_place"    "drop_nulls"       "dtype_strings"   
        [9] "dtypes"           "equals"           "estimated_size"   "explode"         
       [13] "fill_nan"         "fill_null"        "filter"           "first"           
-      [17] "flags"            "get_column"       "get_columns"      "glimpse"         
-      [21] "group_by"         "group_by_dynamic" "head"             "height"          
-      [25] "item"             "join"             "join_asof"        "last"            
-      [29] "lazy"             "limit"            "max"              "mean"            
-      [33] "median"           "min"              "n_chunks"         "null_count"      
-      [37] "partition_by"     "pivot"            "print"            "quantile"        
-      [41] "rechunk"          "rename"           "reverse"          "rolling"         
-      [45] "sample"           "schema"           "select"           "select_seq"      
-      [49] "shape"            "shift"            "slice"            "sort"            
-      [53] "sql"              "std"              "sum"              "tail"            
-      [57] "to_data_frame"    "to_list"          "to_raw_ipc"       "to_series"       
-      [61] "to_struct"        "transpose"        "unique"           "unnest"          
-      [65] "unpivot"          "var"              "width"            "with_columns"    
-      [69] "with_columns_seq" "with_row_index"   "write_csv"        "write_ipc"       
-      [73] "write_json"       "write_ndjson"     "write_parquet"   
-
----
-
-    Code
-      ls(.pr[[private_key]])
-    Output
-       [1] "clear"                     "clone_in_rust"            
-       [3] "columns"                   "default"                  
-       [5] "drop_all_in_place"         "drop_in_place"            
-       [7] "dtype_strings"             "dtypes"                   
-       [9] "equals"                    "estimated_size"           
-      [11] "export_stream"             "from_arrow_record_batches"
-      [13] "from_raw_ipc"              "get_column"               
-      [15] "get_columns"               "lazy"                     
-      [17] "n_chunks"                  "new_with_capacity"        
-      [19] "null_count"                "partition_by"             
-      [21] "pivot_expr"                "print"                    
-      [23] "rechunk"                   "sample_frac"              
-      [25] "sample_n"                  "schema"                   
-      [27] "select"                    "select_at_idx"            
-      [29] "select_seq"                "set_column_from_robj"     
-      [31] "set_column_from_series"    "set_column_names_mut"     
-      [33] "shape"                     "to_list"                  
-      [35] "to_list_tag_structs"       "to_list_unwind"           
-      [37] "to_raw_ipc"                "to_struct"                
-      [39] "transpose"                 "unnest"                   
-      [41] "unpivot"                   "with_columns"             
-      [43] "with_columns_seq"          "with_row_index"           
-      [45] "write_csv"                 "write_ipc"                
-      [47] "write_json"                "write_ndjson"             
-      [49] "write_parquet"            
+      [17] "flags"            "gather_every"     "get_column"       "get_columns"     
+      [21] "glimpse"          "group_by"         "group_by_dynamic" "head"            
+      [25] "height"           "item"             "join"             "join_asof"       
+      [29] "last"             "lazy"             "limit"            "max"             
+      [33] "mean"             "median"           "min"              "n_chunks"        
+      [37] "null_count"       "partition_by"     "pivot"            "print"           
+      [41] "quantile"         "rechunk"          "rename"           "reverse"         
+      [45] "rolling"          "sample"           "schema"           "select"          
+      [49] "select_seq"       "shape"            "shift"            "slice"           
+      [53] "sort"             "sql"              "std"              "sum"             
+      [57] "tail"             "to_data_frame"    "to_list"          "to_raw_ipc"      
+      [61] "to_series"        "to_struct"        "transpose"        "unique"          
+      [65] "unnest"           "unpivot"          "var"              "width"           
+      [69] "with_columns"     "with_columns_seq" "with_row_index"   "write_csv"       
+      [73] "write_ipc"        "write_json"       "write_ndjson"     "write_parquet"   
 
 # public and private methods of each class GroupBy
 
@@ -147,54 +116,21 @@
        [7] "drop_nulls"            "dtypes"                "explain"              
       [10] "explode"               "fetch"                 "fill_nan"             
       [13] "fill_null"             "filter"                "first"                
-      [16] "group_by"              "group_by_dynamic"      "head"                 
-      [19] "join"                  "join_asof"             "last"                 
-      [22] "limit"                 "max"                   "mean"                 
-      [25] "median"                "min"                   "print"                
-      [28] "profile"               "quantile"              "rename"               
-      [31] "reverse"               "rolling"               "schema"               
-      [34] "select"                "select_seq"            "serialize"            
-      [37] "shift"                 "sink_csv"              "sink_ipc"             
-      [40] "sink_ndjson"           "sink_parquet"          "slice"                
-      [43] "sort"                  "sql"                   "std"                  
-      [46] "sum"                   "tail"                  "to_dot"               
-      [49] "unique"                "unnest"                "unpivot"              
-      [52] "var"                   "width"                 "with_columns"         
-      [55] "with_columns_seq"      "with_context"          "with_row_index"       
-
----
-
-    Code
-      ls(.pr[[private_key]])
-    Output
-       [1] "clone_in_rust"                "collect"                     
-       [3] "collect_in_background"        "debug_plan"                  
-       [5] "describe_optimized_plan"      "describe_optimized_plan_tree"
-       [7] "describe_plan"                "describe_plan_tree"          
-       [9] "deserialize"                  "drop"                        
-      [11] "drop_nulls"                   "explode"                     
-      [13] "fetch"                        "fill_nan"                    
-      [15] "fill_null"                    "filter"                      
-      [17] "first"                        "group_by"                    
-      [19] "group_by_dynamic"             "join"                        
-      [21] "join_asof"                    "last"                        
-      [23] "max"                          "mean"                        
-      [25] "median"                       "min"                         
-      [27] "optimization_toggle"          "print"                       
-      [29] "profile"                      "quantile"                    
-      [31] "rename"                       "reverse"                     
-      [33] "rolling"                      "schema"                      
-      [35] "select"                       "select_seq"                  
-      [37] "serialize"                    "shift"                       
-      [39] "sink_csv"                     "sink_ipc"                    
-      [41] "sink_json"                    "sink_parquet"                
-      [43] "slice"                        "sort_by_exprs"               
-      [45] "std"                          "sum"                         
-      [47] "tail"                         "to_dot"                      
-      [49] "unique"                       "unnest"                      
-      [51] "unpivot"                      "var"                         
-      [53] "with_columns"                 "with_columns_seq"            
-      [55] "with_context"                 "with_row_index"              
+      [16] "gather_every"          "group_by"              "group_by_dynamic"     
+      [19] "head"                  "join"                  "join_asof"            
+      [22] "last"                  "limit"                 "max"                  
+      [25] "mean"                  "median"                "min"                  
+      [28] "print"                 "profile"               "quantile"             
+      [31] "rename"                "reverse"               "rolling"              
+      [34] "schema"                "select"                "select_seq"           
+      [37] "serialize"             "shift"                 "sink_csv"             
+      [40] "sink_ipc"              "sink_ndjson"           "sink_parquet"         
+      [43] "slice"                 "sort"                  "sql"                  
+      [46] "std"                   "sum"                   "tail"                 
+      [49] "to_dot"                "unique"                "unnest"               
+      [52] "unpivot"               "var"                   "width"                
+      [55] "with_columns"          "with_columns_seq"      "with_context"         
+      [58] "with_row_index"       
 
 # public and private methods of each class Expr
 

diff --git a/tests/testthat/_snaps/dataframe.md b/tests/testthat/_snaps/dataframe.md
@@ -475,10 +475,10 @@
       │ max        ┆ zz   │
       └────────────┴──────┘
 
-# glimpse
+# $glimpse() works
 
     Code
-      pl$DataFrame(mtcars)$with_columns(pl$lit(42)$cast(pl$Int8))$glimpse()
+      df$glimpse()
     Output
       & mpg     <f64> 21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2
       & cyl     <f64> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6
@@ -493,3 +493,39 @@
       & carb    <f64> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4
       & literal <i8>  42, 42, 42, 42, 42, 42, 42, 42, 42, 42
 
+---
+
+    Code
+      df$glimpse(max_items_per_column = 2)
+    Output
+      & mpg     <f64> 21, 21
+      & cyl     <f64> 6, 6
+      & disp    <f64> 160, 160
+      & hp      <f64> 110, 110
+      & drat    <f64> 3.9, 3.9
+      & wt      <f64> 2.62, 2.875
+      & qsec    <f64> 16.46, 17.02
+      & vs      <f64> 0, 0
+      & am      <f64> 1, 1
+      & gear    <f64> 4, 4
+      & carb    <f64> 4, 4
+      & literal <i8>  42, 42
+
+---
+
+    Code
+      df$glimpse(max_colname_length = 2)
+    Output
+      & ... <f64> 21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2
+      & ... <f64> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6
+      & ... <f64> 160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 167.6
+      & hp  <f64> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123
+      & ... <f64> 3.9, 3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92
+      & wt  <f64> 2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 3.15, 3.44
+      & ... <f64> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3
+      & vs  <f64> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1
+      & am  <f64> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
+      & ... <f64> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4
+      & ... <f64> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4
+      & ... <i8>  42, 42, 42, 42, 42, 42, 42, 42, 42, 42
+