Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add list.pad_start() #20674

Open
wants to merge 31 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
3e036c7
init
etiennebacher Jan 11, 2025
34486da
allow expr
etiennebacher Jan 12, 2025
6b00202
minor [skip ci]
etiennebacher Jan 12, 2025
aa9a5c2
minor [skip ci]
etiennebacher Jan 12, 2025
73966bf
fix broadcasting
etiennebacher Jan 12, 2025
aa4fa4e
minor [skip ci]
etiennebacher Jan 12, 2025
2f5e959
docs
etiennebacher Jan 14, 2025
1181d73
remove some unwrap [skip ci]
etiennebacher Jan 14, 2025
64dd07f
add feature gate [skip ci]
etiennebacher Jan 14, 2025
e3c71bd
start tests [skip ci]
etiennebacher Jan 15, 2025
b36433a
more tests [skip ci]
etiennebacher Jan 15, 2025
b736ba6
typo
etiennebacher Jan 15, 2025
1d022ab
do not use named arg in docstrings
etiennebacher Jan 15, 2025
4b897e5
fmt
etiennebacher Jan 15, 2025
823f5db
mypy
etiennebacher Jan 15, 2025
4de5381
clippy
etiennebacher Jan 15, 2025
46d5a75
fmt again
etiennebacher Jan 15, 2025
7ed3aa4
docs for series
etiennebacher Jan 15, 2025
92f55ab
some tests fail in new-streaming
etiennebacher Jan 15, 2025
352fbf6
fmt
etiennebacher Jan 15, 2025
f0e9150
add for Boolean
etiennebacher Jan 15, 2025
3f7ca46
add arg `width`
etiennebacher Jan 24, 2025
bad748f
clippy
etiennebacher Jan 24, 2025
b87cd9a
ruff
etiennebacher Jan 24, 2025
28b295d
enable tests with new streaming
etiennebacher Jan 24, 2025
3cb1871
do not slice sublists larger than width
etiennebacher Feb 1, 2025
56a6865
accept expression in `width`
etiennebacher Feb 1, 2025
6b8e919
rename `width` to `length`
etiennebacher Feb 1, 2025
658347a
forgot to rename in tests
etiennebacher Feb 1, 2025
5d7fe4d
mistake in docs
etiennebacher Feb 1, 2025
dbf9035
typo [skip ci]
etiennebacher Feb 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions crates/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ list_any_all = ["polars-ops/list_any_all", "polars-plan/list_any_all"]
array_any_all = ["polars-ops/array_any_all", "polars-plan/array_any_all", "dtype-array"]
list_drop_nulls = ["polars-ops/list_drop_nulls", "polars-plan/list_drop_nulls"]
list_sample = ["polars-ops/list_sample", "polars-plan/list_sample"]
list_pad = ["polars-ops/list_pad", "polars-plan/list_pad"]
cutqcut = ["polars-plan/cutqcut", "polars-ops/cutqcut"]
rle = ["polars-plan/rle", "polars-ops/rle"]
extract_groups = ["polars-plan/extract_groups"]
Expand Down Expand Up @@ -376,6 +377,7 @@ features = [
"list_drop_nulls",
"list_eval",
"list_gather",
"list_pad",
"list_sample",
"list_sets",
"list_to_struct",
Expand Down
1 change: 1 addition & 0 deletions crates/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ list_sets = []
list_any_all = []
list_drop_nulls = []
list_sample = ["polars-core/random"]
list_pad = []
extract_groups = ["dtype-struct", "polars-core/regex"]
is_in = ["polars-core/reinterpret"]
hist = ["dtype-categorical", "dtype-struct"]
Expand Down
129 changes: 129 additions & 0 deletions crates/polars-ops/src/chunked_array/list/namespace.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::cmp::Ordering;
use std::fmt::Write;

use arrow::array::ValueSize;
Expand Down Expand Up @@ -513,6 +514,134 @@ pub trait ListNameSpaceImpl: AsList {
list_ca.apply_amortized(|s| s.as_ref().drop_nulls())
}

/// Pad the start of every sub-list with `fill_value` until it holds `length` elements.
///
/// The inner dtype of the list and the dtype of `fill_value` are cast to their common
/// supertype before padding. `fill_value` and `length` may each hold a single value, which
/// is broadcast to every row, or one value per row.
///
/// Sub-lists that already hold `length` or more elements are returned unchanged.
/// A null sub-list or a null `length` produces a null row; a null `fill_value` pads with
/// nulls.
///
/// # Errors
///
/// Returns an error if no supertype exists, if `length` cannot be strictly cast to
/// `UInt64`, if `fill_value` or `length` is neither unit-length nor as long as the list
/// column, or if the supertype is not one of `Int64`, `Float64`, `String` or `Boolean`.
#[cfg(feature = "list_pad")]
fn lst_pad_start(&self, fill_value: &Column, length: &Column) -> PolarsResult<ListChunked> {
let ca = self.as_list();
let inner_dtype = ca.inner_dtype();
let fill_dtype = fill_value.dtype();
let super_type = try_get_supertype(inner_dtype, fill_dtype)?;

let dtype = &DataType::List(Box::new(super_type.clone()));
let ca = ca.cast(dtype)?;
let ca = ca.list().unwrap();

// Reject shapes that can neither be broadcast nor zipped row by row; without this the
// per-row `length` iterator below would run dry part-way through the column.
if length.len() != 1 && length.len() != ca.len() {
polars_bail!(ShapeMismatch: "list.pad_start(): `length` has {} values but the list column has {} rows", length.len(), ca.len());
}
if fill_value.len() != 1 && fill_value.len() != ca.len() {
polars_bail!(ShapeMismatch: "list.pad_start(): `fill_value` has {} values but the list column has {} rows", fill_value.len(), ca.len());
}

let length = if length.len() == 1 {
&length.new_from_index(0, ca.len())
} else {
length
};
let length = length.strict_cast(&DataType::UInt64)?;
// Consumed one item per row from inside the closures below.
let mut length = length.u64()?.into_iter();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't find a way to pass this in zip_and_apply_amortized(), which is why I make it an iterator here and call next() inside zip_and_apply_amortized(). It works but feels clunky.


let fill_value = if fill_value.len() == 1 {
&fill_value.new_from_index(0, ca.len())
} else {
fill_value
};
let fill_value = fill_value.cast(&super_type)?;

let out: ListChunked = match super_type {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This entire match should probably be reduced by calling one function per datatype, which will be also used for list.pad_end() (in another PR).

DataType::Int64 => {
let fill_value = fill_value.i64()?;
ca.zip_and_apply_amortized(fill_value, |s, fill_value| {
// Advance the per-row length first so the iterator stays aligned with the rows
// even when this row turns out to be null.
let length = length.next().flatten();
// A null sub-list stays null.
let binding = s?;
let s: &Series = binding.as_ref();
let ca = s.i64().unwrap();
// A null target length yields a null row.
let length = length? as usize;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as before, works but feels clunky.

let mut fill_values;
match length.cmp(&ca.len()) {
Ordering::Equal | Ordering::Less => {
fill_values = ca.clone();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the clone() here ok or a no-go? If the latter, what is the alternative?

},
Ordering::Greater => {
// `fill_value` is an `Option`, so a null fill value pads with nulls.
fill_values = std::iter::repeat(fill_value)
.take(length - ca.len())
.collect::<Int64Chunked>();
// NOTE(review): the result of `append` is discarded — confirm it cannot fail here.
let _ = fill_values.append(ca);
},
};
Some(fill_values.into())
})
},

DataType::Float64 => {
let fill_value = fill_value.f64()?;
ca.zip_and_apply_amortized(fill_value, |s, fill_value| {
// Advance the per-row length first so it stays aligned with the rows.
let length = length.next().flatten();
let binding = s?;
let s: &Series = binding.as_ref();
let ca = s.f64().unwrap();
let length = length? as usize;
let mut fill_values;
match length.cmp(&ca.len()) {
Ordering::Equal | Ordering::Less => {
fill_values = ca.clone();
},
Ordering::Greater => {
fill_values = std::iter::repeat(fill_value)
.take(length - ca.len())
.collect::<Float64Chunked>();
let _ = fill_values.append(ca);
},
};
Some(fill_values.into())
})
},
DataType::String => {
let fill_value = fill_value.str()?;
ca.zip_and_apply_amortized(fill_value, |s, fill_value| {
// Advance the per-row length first so it stays aligned with the rows.
let length = length.next().flatten();
let binding = s?;
let s: &Series = binding.as_ref();
let ca = s.str().unwrap();
let length = length? as usize;
let mut fill_values;
match length.cmp(&ca.len()) {
Ordering::Equal | Ordering::Less => {
fill_values = ca.clone();
},
Ordering::Greater => {
fill_values = std::iter::repeat(fill_value)
.take(length - ca.len())
.collect::<StringChunked>();
let _ = fill_values.append(ca);
},
};
Some(fill_values.into())
})
},
DataType::Boolean => {
let fill_value = fill_value.bool()?;
ca.zip_and_apply_amortized(fill_value, |s, fill_value| {
// Advance the per-row length first so it stays aligned with the rows.
let length = length.next().flatten();
let binding = s?;
let s: &Series = binding.as_ref();
let ca = s.bool().unwrap();
let length = length? as usize;
let mut fill_values;
match length.cmp(&ca.len()) {
Ordering::Equal | Ordering::Less => {
fill_values = ca.clone();
},
Ordering::Greater => {
fill_values = std::iter::repeat(fill_value)
.take(length - ca.len())
.collect::<BooleanChunked>();
let _ = fill_values.append(ca);
},
};
Some(fill_values.into())
})
},
dt => {
polars_bail!(InvalidOperation: "list.pad_start() doesn't work on data type {}", dt)
},
};

Ok(out)
}

#[cfg(feature = "list_sample")]
fn lst_sample_n(
&self,
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ list_sets = ["polars-ops/list_sets"]
list_any_all = ["polars-ops/list_any_all"]
list_drop_nulls = ["polars-ops/list_drop_nulls"]
list_sample = ["polars-ops/list_sample"]
list_pad = ["polars-ops/list_pad"]
cutqcut = ["polars-ops/cutqcut"]
rle = ["polars-ops/rle"]
extract_groups = ["regex", "dtype-struct", "polars-ops/extract_groups"]
Expand Down Expand Up @@ -293,6 +294,7 @@ features = [
"streaming",
"true_div",
"sign",
"list_pad",
]
# defines the configuration attribute `docsrs`
rustdoc-args = ["--cfg", "docsrs"]
16 changes: 16 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ pub enum ListFunction {
ToArray(usize),
#[cfg(feature = "list_to_struct")]
ToStruct(ListToStructArgs),
#[cfg(feature = "list_pad")]
PadStart,
}

impl ListFunction {
Expand Down Expand Up @@ -107,6 +109,8 @@ impl ListFunction {
NUnique => mapper.with_dtype(IDX_DTYPE),
#[cfg(feature = "list_to_struct")]
ToStruct(args) => mapper.try_map_dtype(|x| args.get_output_dtype(x)),
#[cfg(feature = "list_pad")]
PadStart => mapper.with_same_dtype(),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is wrong because the output dtype can change because the inner dtype and the padding type are cast to the supertype.

However, it does pass the tests. Why is that? Is there a test I can add for this?

}
}
}
Expand Down Expand Up @@ -180,6 +184,8 @@ impl Display for ListFunction {
ToArray(_) => "to_array",
#[cfg(feature = "list_to_struct")]
ToStruct(_) => "to_struct",
#[cfg(feature = "list_pad")]
PadStart => "pad_start",
};
write!(f, "list.{name}")
}
Expand Down Expand Up @@ -243,6 +249,8 @@ impl From<ListFunction> for SpecialEq<Arc<dyn ColumnsUdf>> {
NUnique => map!(n_unique),
#[cfg(feature = "list_to_struct")]
ToStruct(args) => map!(to_struct, &args),
#[cfg(feature = "list_pad")]
PadStart => map_as_slice!(pad_start),
}
}
}
Expand Down Expand Up @@ -666,3 +674,11 @@ pub(super) fn to_struct(s: &Column, args: &ListToStructArgs) -> PolarsResult<Col
pub(super) fn n_unique(s: &Column) -> PolarsResult<Column> {
Ok(s.list()?.lst_n_unique()?.into_column())
}

/// Dispatch `list.pad_start` for the plan layer.
///
/// `args` is read positionally: the list column, then the fill value, then the target
/// length. The padded list is returned as a `Column`.
#[cfg(feature = "list_pad")]
pub(super) fn pad_start(args: &[Column]) -> PolarsResult<Column> {
let (list_col, fill_value, length) = (&args[0], &args[1], &args[2]);
let padded = list_col.list()?.lst_pad_start(fill_value, length)?;
Ok(padded.into_column())
}
12 changes: 12 additions & 0 deletions crates/polars-plan/src/dsl/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -366,4 +366,16 @@ impl ListNameSpace {
let other = other.into();
self.set_operation(other, SetOperation::SymmetricDifference)
}

/// Pad the start of each sub-list with `fill_value` until it reaches `length`.
///
/// Both `fill_value` and `length` are expressions and are forwarded as the extra inputs
/// of the `ListFunction::PadStart` function expression.
#[cfg(feature = "list_pad")]
pub fn pad_start(self, fill_value: Expr, length: Expr) -> Expr {
let function = FunctionExpr::ListExpr(ListFunction::PadStart);
let extra_inputs = [fill_value, length];
self.0
.map_many_private(function, &extra_inputs, false, None)
}
}
3 changes: 3 additions & 0 deletions crates/polars-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ features = [
"list_eval",
"list_to_struct",
"list_arithmetic",
"list_pad",
"array_arithmetic",
"array_to_struct",
"log",
Expand Down Expand Up @@ -164,6 +165,7 @@ list_any_all = ["polars/list_any_all"]
array_any_all = ["polars/array_any_all", "polars/dtype-array"]
list_drop_nulls = ["polars/list_drop_nulls"]
list_sample = ["polars/list_sample"]
list_pad = ["polars/list_pad"]
cutqcut = ["polars/cutqcut"]
rle = ["polars/rle"]
extract_groups = ["polars/extract_groups"]
Expand Down Expand Up @@ -211,6 +213,7 @@ operations = [
"list_any_all",
"list_drop_nulls",
"list_sample",
"list_pad",
"cutqcut",
"rle",
"extract_groups",
Expand Down
9 changes: 9 additions & 0 deletions crates/polars-python/src/expr/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -276,4 +276,13 @@ impl PyExpr {
}
.into()
}

// Python binding for `list.pad_start`: unwraps the `PyExpr` arguments and forwards them
// to the list namespace of the wrapped expression.
#[cfg(feature = "list_pad")]
fn list_pad_start(&self, fill_value: PyExpr, length: PyExpr) -> Self {
let expr = self.inner.clone();
let padded = expr.list().pad_start(fill_value.inner, length.inner);
padded.into()
}
}
1 change: 1 addition & 0 deletions crates/polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ list_sample = ["polars-lazy?/list_sample"]
list_sets = ["polars-lazy?/list_sets"]
list_to_struct = ["polars-ops/list_to_struct", "polars-lazy?/list_to_struct"]
list_arithmetic = ["polars-core/list_arithmetic"]
list_pad = ["polars-ops/list_pad", "polars-lazy?/list_pad"]
array_arithmetic = ["polars-core/array_arithmetic", "dtype-array"]
array_to_struct = ["polars-ops/array_to_struct", "polars-lazy?/array_to_struct"]
log = ["polars-ops/log", "polars-lazy?/log"]
Expand Down
1 change: 1 addition & 0 deletions crates/polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@
//! - `list_count` - Count elements in lists.
//! - `list_eval` - Apply expressions over list elements.
//! - `list_sets` - Compute UNION, INTERSECTION, and DIFFERENCE on list types.
//! - `list_pad` - Pad sub-lists with a fill value until they reach a given length.
//! - `cumulative_eval` - Apply expressions over cumulatively increasing windows.
//! - `arg_where` - Get indices where condition holds.
//! - `search_sorted` - Find indices where elements should be inserted to maintain order.
Expand Down
1 change: 1 addition & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ list_any_all = ["polars-python/list_any_all"]
array_any_all = ["polars-python/array_any_all"]
list_drop_nulls = ["polars-python/list_drop_nulls"]
list_sample = ["polars-python/list_sample"]
list_pad = ["polars-python/list_pad"]
cutqcut = ["polars-python/cutqcut"]
rle = ["polars-python/rle"]
extract_groups = ["polars-python/extract_groups"]
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/list.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ The following methods are available under the `expr.list` attribute.
Expr.list.median
Expr.list.min
Expr.list.n_unique
Expr.list.pad_start
Expr.list.reverse
Expr.list.sample
Expr.list.set_difference
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/list.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ The following methods are available under the `Series.list` attribute.
Series.list.median
Series.list.min
Series.list.n_unique
Series.list.pad_start
Series.list.reverse
Series.list.sample
Series.list.set_difference
Expand Down
66 changes: 66 additions & 0 deletions py-polars/polars/expr/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -1372,3 +1372,69 @@ def set_symmetric_difference(self, other: IntoExpr) -> Expr:
""" # noqa: W505.
other = parse_into_expression(other, str_as_lit=False)
return wrap_expr(self._pyexpr.list_set_operation(other, "symmetric_difference"))

def pad_start(self, fill_value: IntoExpr, *, length: IntoExpr) -> Expr:
    """
    Pad the start of each sub-list with `fill_value` until it reaches `length`.

    Parameters
    ----------
    fill_value
        Value added at the left of the sub-list. Strings are parsed as
        literal values, not as column names.
    length
        Length to which sub-lists are padded. If a sub-list has more than
        `length` elements, it is not modified. If it has fewer than
        `length` elements, `fill_value` is added on the left until `length`
        is reached. Accepts expression input; strings are parsed as column
        names.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"a": [[1], [], [1, 2, 3]], "int": [0, 999, 2], "float": [0.0, 999, 2]}
    ... )
    >>> with pl.Config(fmt_table_cell_list_len=4):
    ...     df.select(
    ...         filled_int=pl.col("a").list.pad_start(pl.col("int"), length=4),
    ...         filled_float=pl.col("a").list.pad_start(pl.col("float"), length=1),
    ...     )
    shape: (3, 2)
    ┌──────────────────────┬─────────────────┐
    │ filled_int           ┆ filled_float    │
    │ ---                  ┆ ---             │
    │ list[i64]            ┆ list[f64]       │
    ╞══════════════════════╪═════════════════╡
    │ [0, 0, 0, 1]         ┆ [1.0]           │
    │ [999, 999, 999, 999] ┆ [999.0]         │
    │ [2, 1, 2, 3]         ┆ [1.0, 2.0, 3.0] │
    └──────────────────────┴─────────────────┘
    >>> df = pl.DataFrame({"a": [["a"], [], ["b", "c", "d"]]})
    >>> df.select(pl.col("a").list.pad_start("foo", length=2))
    shape: (3, 1)
    ┌─────────────────┐
    │ a               │
    │ ---             │
    │ list[str]       │
    ╞═════════════════╡
    │ ["foo", "a"]    │
    │ ["foo", "foo"]  │
    │ ["b", "c", "d"] │
    └─────────────────┘
    >>> # The `length` argument also accepts expressions, for instance to
    >>> # pad sub-lists to the longest sub-list:
    >>> df.select(
    ...     pl.col("a").list.pad_start("foo", length=pl.col("a").list.len().max())
    ... )
    shape: (3, 1)
    ┌───────────────────────┐
    │ a                     │
    │ ---                   │
    │ list[str]             │
    ╞═══════════════════════╡
    │ ["foo", "foo", "a"]   │
    │ ["foo", "foo", "foo"] │
    │ ["b", "c", "d"]       │
    └───────────────────────┘
    """
    # A string fill value is data (e.g. "foo"), so it is parsed as a literal.
    fill_value = parse_into_expression(fill_value, str_as_lit=True)
    # A string length can only sensibly name a column; parsing it as a literal
    # would hand a string to the strict UInt64 cast on the Rust side.
    length = parse_into_expression(length, str_as_lit=False)
    return wrap_expr(self._pyexpr.list_pad_start(fill_value, length))
Loading
Loading