pola-rs · etiennebacher · Jan 11, 2025 · Jan 12, 2025 · Jan 12, 2025 · Jan 12, 2025
@@ -276,6 +276,7 @@ list_any_all = ["polars-ops/list_any_all", "polars-plan/list_any_all"]
 array_any_all = ["polars-ops/array_any_all", "polars-plan/array_any_all", "dtype-array"]
 list_drop_nulls = ["polars-ops/list_drop_nulls", "polars-plan/list_drop_nulls"]
 list_sample = ["polars-ops/list_sample", "polars-plan/list_sample"]
+list_pad = ["polars-ops/list_pad", "polars-plan/list_pad"]
 cutqcut = ["polars-plan/cutqcut", "polars-ops/cutqcut"]
 rle = ["polars-plan/rle", "polars-ops/rle"]
 extract_groups = ["polars-plan/extract_groups"]
@@ -376,6 +377,7 @@ features = [
   "list_drop_nulls",
   "list_eval",
   "list_gather",
+  "list_pad",
   "list_sample",
   "list_sets",
   "list_to_struct",

@@ -134,6 +134,7 @@ list_sets = []
 list_any_all = []
 list_drop_nulls = []
 list_sample = ["polars-core/random"]
+list_pad = []
 extract_groups = ["dtype-struct", "polars-core/regex"]
 is_in = ["polars-core/reinterpret"]
 hist = ["dtype-categorical", "dtype-struct"]

@@ -1,3 +1,4 @@
+use std::cmp::Ordering;
 use std::fmt::Write;
 
 use arrow::array::ValueSize;
@@ -513,6 +514,134 @@ pub trait ListNameSpaceImpl: AsList {
         list_ca.apply_amortized(|s| s.as_ref().drop_nulls())
     }
 
+    /// Pad the start of every sub-list with `fill_value` until it holds `length`
+    /// elements.
+    ///
+    /// The list's inner dtype and the dtype of `fill_value` are unified to their
+    /// supertype first; only `Int64`, `Float64`, `String` and `Boolean` supertypes
+    /// are supported. `fill_value` and `length` may each hold a single value
+    /// (broadcast to every row) or one value per row.
+    ///
+    /// Per row:
+    /// - a sub-list that already has at least `length` elements is returned
+    ///   unchanged (after the cast to the supertype);
+    /// - a null sub-list or a null `length` yields a null sub-list;
+    /// - a null `fill_value` pads with nulls.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `fill_value` or `length` has neither one value nor one
+    /// value per row, if no supertype exists, if `length` cannot be strictly cast
+    /// to `UInt64`, if the supertype is not supported, or if appending the
+    /// original values to the padding fails.
+    #[cfg(feature = "list_pad")]
+    fn lst_pad_start(&self, fill_value: &Column, length: &Column) -> PolarsResult<ListChunked> {
+        let ca = self.as_list();
+        let n_rows = ca.len();
+
+        // Rows are zipped pairwise further down, so a mismatched argument would
+        // otherwise silently truncate the output or exhaust the length iterator.
+        for (arg_name, arg) in [("fill_value", fill_value), ("length", length)] {
+            if arg.len() != 1 && arg.len() != n_rows {
+                polars_bail!(
+                    ShapeMismatch:
+                    "list.pad_start(): `{}` has {} values, expected 1 or {}",
+                    arg_name, arg.len(), n_rows
+                );
+            }
+        }
+
+        let super_type = try_get_supertype(ca.inner_dtype(), fill_value.dtype())?;
+        let ca = ca.cast(&DataType::List(Box::new(super_type.clone())))?;
+        let ca = ca.list()?;
+
+        // Cast before broadcasting so only a single value is converted when the
+        // argument is a scalar.
+        let length = length.strict_cast(&DataType::UInt64)?;
+        let length = if length.len() == 1 {
+            length.new_from_index(0, n_rows)
+        } else {
+            length
+        };
+        let mut lengths = length.u64()?.into_iter();
+
+        let fill_value = fill_value.cast(&super_type)?;
+        let fill_value = if fill_value.len() == 1 {
+            fill_value.new_from_index(0, n_rows)
+        } else {
+            fill_value
+        };
+
+        // The per-row closure can only return `Option<Series>`, so the first
+        // `append` failure is parked here and surfaced once the pass is done.
+        let mut append_err: Option<PolarsError> = None;
+
+        // One expansion per supported dtype: `$accessor` downcasts both the fill
+        // column and each sub-list, `$chunked` is the matching chunked array type.
+        macro_rules! pad_rows {
+            ($accessor:ident, $chunked:ty) => {{
+                let fill_value = fill_value.$accessor()?;
+                ca.zip_and_apply_amortized(fill_value, |sublist, fill| {
+                    // Advance the length iterator first so it stays aligned with
+                    // the rows even when this sub-list is null.
+                    let target = lengths.next().flatten();
+                    let sublist = sublist?;
+                    let target = target? as usize;
+
+                    let series: &Series = sublist.as_ref();
+                    let values = series
+                        .$accessor()
+                        .expect("sub-lists were cast to the supertype above");
+
+                    match target.cmp(&values.len()) {
+                        Ordering::Less | Ordering::Equal => Some(values.clone().into()),
+                        Ordering::Greater => {
+                            let n_missing = target - values.len();
+                            let mut padded = match fill {
+                                Some(v) => <$chunked>::full(PlSmallStr::EMPTY, v, n_missing),
+                                None => <$chunked>::full_null(PlSmallStr::EMPTY, n_missing),
+                            };
+                            if let Err(err) = padded.append(values) {
+                                append_err.get_or_insert(err);
+                                return None;
+                            }
+                            Some(padded.into())
+                        },
+                    }
+                })
+            }};
+        }
+
+        let out: ListChunked = match super_type {
+            DataType::Int64 => pad_rows!(i64, Int64Chunked),
+            DataType::Float64 => pad_rows!(f64, Float64Chunked),
+            DataType::String => pad_rows!(str, StringChunked),
+            DataType::Boolean => pad_rows!(bool, BooleanChunked),
+            dt => {
+                polars_bail!(InvalidOperation: "list.pad_start() doesn't work on data type {}", dt)
+            },
+        };
+
+        match append_err {
+            Some(err) => Err(err),
+            None => Ok(out),
+        }
+    }
+
     #[cfg(feature = "list_sample")]
     fn lst_sample_n(
         &self,

@@ -176,6 +176,7 @@ list_sets = ["polars-ops/list_sets"]
 list_any_all = ["polars-ops/list_any_all"]
 list_drop_nulls = ["polars-ops/list_drop_nulls"]
 list_sample = ["polars-ops/list_sample"]
+list_pad = ["polars-ops/list_pad"]
 cutqcut = ["polars-ops/cutqcut"]
 rle = ["polars-ops/rle"]
 extract_groups = ["regex", "dtype-struct", "polars-ops/extract_groups"]
@@ -293,6 +294,7 @@ features = [
   "streaming",
   "true_div",
   "sign",
+  "list_pad",
 ]
 # defines the configuration attribute `docsrs`
 rustdoc-args = ["--cfg", "docsrs"]
@@ -58,6 +58,8 @@ pub enum ListFunction {
     ToArray(usize),
     #[cfg(feature = "list_to_struct")]
     ToStruct(ListToStructArgs),
+    #[cfg(feature = "list_pad")]
+    PadStart,
 }
 
 impl ListFunction {
@@ -107,6 +109,8 @@ impl ListFunction {
             NUnique => mapper.with_dtype(IDX_DTYPE),
             #[cfg(feature = "list_to_struct")]
             ToStruct(args) => mapper.try_map_dtype(|x| args.get_output_dtype(x)),
+            #[cfg(feature = "list_pad")]
+            PadStart => mapper.with_same_dtype(),
         }
     }
 }
@@ -180,6 +184,8 @@ impl Display for ListFunction {
             ToArray(_) => "to_array",
             #[cfg(feature = "list_to_struct")]
             ToStruct(_) => "to_struct",
+            #[cfg(feature = "list_pad")]
+            PadStart => "pad_start",
         };
         write!(f, "list.{name}")
     }
@@ -243,6 +249,8 @@ impl From<ListFunction> for SpecialEq<Arc<dyn ColumnsUdf>> {
             NUnique => map!(n_unique),
             #[cfg(feature = "list_to_struct")]
             ToStruct(args) => map!(to_struct, &args),
+            #[cfg(feature = "list_pad")]
+            PadStart => map_as_slice!(pad_start),
         }
     }
 }
@@ -666,3 +674,11 @@ pub(super) fn to_struct(s: &Column, args: &ListToStructArgs) -> PolarsResult<Col
 pub(super) fn n_unique(s: &Column) -> PolarsResult<Column> {
     Ok(s.list()?.lst_n_unique()?.into_column())
 }
+
+/// Expression-level entry point for `list.pad_start`.
+///
+/// `args` is read positionally: the list column, the fill value, and the
+/// target length. Indexing panics if fewer than three columns are supplied.
+#[cfg(feature = "list_pad")]
+pub(super) fn pad_start(args: &[Column]) -> PolarsResult<Column> {
+    let (lists, fill_value, length) = (&args[0], &args[1], &args[2]);
+    let padded = lists.list()?.lst_pad_start(fill_value, length)?;
+    Ok(padded.into_column())
+}
@@ -366,4 +366,16 @@ impl ListNameSpace {
         let other = other.into();
         self.set_operation(other, SetOperation::SymmetricDifference)
     }
+
+    /// Pad the start of each sub-list with `fill_value` until it holds `length`
+    /// elements.
+    ///
+    /// Sub-lists that already have at least `length` elements are left as they
+    /// are. `fill_value` and `length` are passed on as the additional inputs of
+    /// [`ListFunction::PadStart`].
+    #[cfg(feature = "list_pad")]
+    pub fn pad_start(self, fill_value: Expr, length: Expr) -> Expr {
+        let function = FunctionExpr::ListExpr(ListFunction::PadStart);
+        self.0
+            .map_many_private(function, &[fill_value, length], false, None)
+    }
 }
@@ -84,6 +84,7 @@ features = [
   "list_eval",
   "list_to_struct",
   "list_arithmetic",
+  "list_pad",
   "array_arithmetic",
   "array_to_struct",
   "log",
@@ -164,6 +165,7 @@ list_any_all = ["polars/list_any_all"]
 array_any_all = ["polars/array_any_all", "polars/dtype-array"]
 list_drop_nulls = ["polars/list_drop_nulls"]
 list_sample = ["polars/list_sample"]
+list_pad = ["polars/list_pad"]
 cutqcut = ["polars/cutqcut"]
 rle = ["polars/rle"]
 extract_groups = ["polars/extract_groups"]
@@ -211,6 +213,7 @@ operations = [
   "list_any_all",
   "list_drop_nulls",
   "list_sample",
+  "list_pad",
   "cutqcut",
   "rle",
   "extract_groups",

@@ -276,4 +276,13 @@ impl PyExpr {
         }
         .into()
     }
+
+    /// Build the `list.pad_start` expression from this expression and the
+    /// given fill-value and length expressions.
+    #[cfg(feature = "list_pad")]
+    fn list_pad_start(&self, fill_value: PyExpr, length: PyExpr) -> Self {
+        let list_ns = self.inner.clone().list();
+        let padded = list_ns.pad_start(fill_value.inner, length.inner);
+        padded.into()
+    }
 }
@@ -189,6 +189,7 @@ list_sample = ["polars-lazy?/list_sample"]
 list_sets = ["polars-lazy?/list_sets"]
 list_to_struct = ["polars-ops/list_to_struct", "polars-lazy?/list_to_struct"]
 list_arithmetic = ["polars-core/list_arithmetic"]
+list_pad = ["polars-ops/list_pad", "polars-lazy?/list_pad"]
 array_arithmetic = ["polars-core/array_arithmetic", "dtype-array"]
 array_to_struct = ["polars-ops/array_to_struct", "polars-lazy?/array_to_struct"]
 log = ["polars-ops/log", "polars-lazy?/log"]

@@ -262,6 +262,7 @@
 //!     - `list_count` - Count elements in lists.
 //!     - `list_eval` - Apply expressions over list elements.
 //!     - `list_sets` - Compute UNION, INTERSECTION, and DIFFERENCE on list types.
+//!     - `list_pad` - Pad the start of sub-lists with a fill value up to a given length.
 //!     - `cumulative_eval` - Apply expressions over cumulatively increasing windows.
 //!     - `arg_where` - Get indices where condition holds.
 //!     - `search_sorted` - Find indices where elements should be inserted to maintain order.

@@ -71,6 +71,7 @@ list_any_all = ["polars-python/list_any_all"]
 array_any_all = ["polars-python/array_any_all"]
 list_drop_nulls = ["polars-python/list_drop_nulls"]
 list_sample = ["polars-python/list_sample"]
+list_pad = ["polars-python/list_pad"]
 cutqcut = ["polars-python/cutqcut"]
 rle = ["polars-python/rle"]
 extract_groups = ["polars-python/extract_groups"]

@@ -33,6 +33,7 @@ The following methods are available under the `expr.list` attribute.
     Expr.list.median
     Expr.list.min
     Expr.list.n_unique
+    Expr.list.pad_start
     Expr.list.reverse
     Expr.list.sample
     Expr.list.set_difference

@@ -33,6 +33,7 @@ The following methods are available under the `Series.list` attribute.
     Series.list.median
     Series.list.min
     Series.list.n_unique
+    Series.list.pad_start
     Series.list.reverse
     Series.list.sample
     Series.list.set_difference

@@ -1372,3 +1372,69 @@ def set_symmetric_difference(self, other: IntoExpr) -> Expr:
         """  # noqa: W505.
         other = parse_into_expression(other, str_as_lit=False)
         return wrap_expr(self._pyexpr.list_set_operation(other, "symmetric_difference"))
+
+    def pad_start(self, fill_value: IntoExpr, *, length: IntoExpr) -> Expr:
+        """
+        Pad the start of a sub-list until it reaches the given length.
+
+        Parameters
+        ----------
+        fill_value
+            Add this value at the left of the sub-list.
+        length
+            length to which sub-lists will be padded to. If a sub-list has more
+            than `length` elements, then it is not modified. If it has less than
+            `length` elements, `fill_value` is added on the left until `length`
+            is reached.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame(
+        ...     {"a": [[1], [], [1, 2, 3]], "int": [0, 999, 2], "float": [0.0, 999, 2]}
+        ... )
+        >>> with pl.Config(fmt_table_cell_list_len=4):
+        ...     df.select(
+        ...         filled_int=pl.col("a").list.pad_start(pl.col("int"), length=4),
+        ...         filled_float=pl.col("a").list.pad_start(pl.col("float"), length=1),
+        ...     )
+        shape: (3, 2)
+        ┌──────────────────────┬─────────────────┐
+        │ filled_int           ┆ filled_float    │
+        │ ---                  ┆ ---             │
+        │ list[i64]            ┆ list[f64]       │
+        ╞══════════════════════╪═════════════════╡
+        │ [0, 0, 0, 1]         ┆ [1.0]           │
+        │ [999, 999, 999, 999] ┆ [999.0]         │
+        │ [2, 1, 2, 3]         ┆ [1.0, 2.0, 3.0] │
+        └──────────────────────┴─────────────────┘
+        >>> df = pl.DataFrame({"a": [["a"], [], ["b", "c", "d"]]})
+        >>> df.select(pl.col("a").list.pad_start("foo", length=2))
+        shape: (3, 1)
+        ┌─────────────────┐
+        │ a               │
+        │ ---             │
+        │ list[str]       │
+        ╞═════════════════╡
+        │ ["foo", "a"]    │
+        │ ["foo", "foo"]  │
+        │ ["b", "c", "d"] │
+        └─────────────────┘
+        >>> # The `length` argument also accepts expressions, for instance to
+        >>> # pad sub-lists to the longest sub-list:
+        >>> df.select(
+        ...     pl.col("a").list.pad_start("foo", length=pl.col("a").list.len().max())
+        ... )
+        shape: (3, 1)
+        ┌───────────────────────┐
+        │ a                     │
+        │ ---                   │
+        │ list[str]             │
+        ╞═══════════════════════╡
+        │ ["foo", "foo", "a"]   │
+        │ ["foo", "foo", "foo"] │
+        │ ["b", "c", "d"]       │
+        └───────────────────────┘
+        """
+        fill_value = parse_into_expression(fill_value, str_as_lit=True)
+        length = parse_into_expression(length, str_as_lit=True)
+        return wrap_expr(self._pyexpr.list_pad_start(fill_value, length))