Skip to content

Commit

Permalink
feat: Add drop_nans method to DataFrame and LazyFrame (#20029)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored Nov 27, 2024
1 parent 74b9925 commit 0b218d8
Show file tree
Hide file tree
Showing 9 changed files with 249 additions and 39 deletions.
2 changes: 1 addition & 1 deletion crates/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ impl Series {
}
}

/// Check if float value is NaN (note this is different than missing/ null)
/// Check if float value is NaN (note this is different than missing/null)
pub fn is_not_nan(&self) -> PolarsResult<BooleanChunked> {
match self.dtype() {
DataType::Float32 => Ok(self.f32().unwrap().is_not_nan()),
Expand Down
12 changes: 11 additions & 1 deletion crates/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1663,7 +1663,17 @@ impl LazyFrame {
Self::from_logical_plan(lp, opt_state)
}

/// Drop rows containing None.
/// Drop rows containing one or more NaN values.
///
/// `subset` is an optional `Vec` of column names to consider for NaNs; if None, all
/// floating point columns are considered.
pub fn drop_nans(self, subset: Option<Vec<Expr>>) -> LazyFrame {
let opt_state = self.get_opt_state();
let lp = self.get_plan_builder().drop_nans(subset).build();
Self::from_logical_plan(lp, opt_state)
}

/// Drop rows containing one or more None values.
///
/// `subset` is an optional `Vec` of column names to consider for nulls; if None, all
/// columns are considered.
Expand Down
20 changes: 20 additions & 0 deletions crates/polars-plan/src/plans/builder_dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,26 @@ impl DslBuilder {
)
}

/// Filter out rows containing NaN values.
///
/// When `subset` is `Some`, only the given column expressions are checked;
/// otherwise every Float32/Float64 column participates in the check.
pub fn drop_nans(self, subset: Option<Vec<Expr>>) -> Self {
    // Build a single boolean predicate that is true only for rows where
    // every considered column is not NaN, then filter on it.
    let keep_row = match subset {
        Some(cols) => {
            let not_nan_checks = cols
                .into_iter()
                .map(|col| col.is_not_nan())
                .collect::<Vec<_>>();
            all_horizontal(not_nan_checks).unwrap()
        },
        None => {
            // TODO: when Decimal supports NaN, include here
            all_horizontal([dtype_cols([DataType::Float32, DataType::Float64]).is_not_nan()])
                .unwrap()
        },
    };
    self.filter(keep_row)
}

pub fn drop_nulls(self, subset: Option<Vec<Expr>>) -> Self {
if let Some(subset) = subset {
self.filter(
Expand Down
7 changes: 7 additions & 0 deletions crates/polars-python/src/lazyframe/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1114,6 +1114,13 @@ impl PyLazyFrame {
.into()
}

#[pyo3(signature = (subset=None))]
/// Python binding: delegate NaN-row dropping to the underlying LazyFrame.
fn drop_nans(&self, subset: Option<Vec<PyExpr>>) -> Self {
    // Convert the optional PyExpr list into native Exprs before delegating.
    let exprs = subset.map(|cols| cols.to_exprs());
    self.ldf.clone().drop_nans(exprs).into()
}

#[pyo3(signature = (subset=None))]
fn drop_nulls(&self, subset: Option<Vec<PyExpr>>) -> Self {
let ldf = self.ldf.clone();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Manipulation/selection
DataFrame.clone
DataFrame.drop
DataFrame.drop_in_place
DataFrame.drop_nans
DataFrame.drop_nulls
DataFrame.explode
DataFrame.extend
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Manipulation/selection
LazyFrame.clear
LazyFrame.clone
LazyFrame.drop
LazyFrame.drop_nans
LazyFrame.drop_nulls
LazyFrame.explode
LazyFrame.fill_nan
Expand Down
77 changes: 77 additions & 0 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5642,6 +5642,83 @@ def limit(self, n: int = 5) -> DataFrame:
"""
return self.head(n)

def drop_nans(
    self,
    subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
) -> DataFrame:
    """
    Drop all rows that contain one or more NaN values.

    The original order of the remaining rows is preserved.

    Parameters
    ----------
    subset
        Column name(s) for which NaN values are considered; if set to `None`
        (default), use all columns (note that only floating-point columns
        can contain NaNs).

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [-20.5, float("nan"), 80.0],
    ...         "bar": [float("nan"), 110.0, 25.5],
    ...         "ham": ["xxx", "yyy", None],
    ...     }
    ... )

    The default behavior of this method is to drop rows where any single
    value in the row is NaN:

    >>> df.drop_nans()
    shape: (1, 3)
    ┌──────┬──────┬──────┐
    │ foo  ┆ bar  ┆ ham  │
    │ ---  ┆ ---  ┆ ---  │
    │ f64  ┆ f64  ┆ str  │
    ╞══════╪══════╪══════╡
    │ 80.0 ┆ 25.5 ┆ null │
    └──────┴──────┴──────┘

    This behaviour can be constrained to consider only a subset of columns, as
    defined by name, or with a selector. For example, dropping rows only if
    there is a NaN in the "bar" column:

    >>> df.drop_nans(subset=["bar"])
    shape: (2, 3)
    ┌──────┬───────┬──────┐
    │ foo  ┆ bar   ┆ ham  │
    │ ---  ┆ ---   ┆ ---  │
    │ f64  ┆ f64   ┆ str  │
    ╞══════╪═══════╪══════╡
    │ NaN  ┆ 110.0 ┆ yyy  │
    │ 80.0 ┆ 25.5  ┆ null │
    └──────┴───────┴──────┘

    Dropping a row only if *all* values are NaN requires a different formulation:

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [float("nan"), float("nan"), float("nan"), float("nan")],
    ...         "b": [10.0, 2.5, float("nan"), 5.25],
    ...         "c": [65.75, float("nan"), float("nan"), 10.5],
    ...     }
    ... )
    >>> df.filter(~pl.all_horizontal(pl.all().is_nan()))
    shape: (3, 3)
    ┌─────┬──────┬───────┐
    │ a   ┆ b    ┆ c     │
    │ --- ┆ ---  ┆ ---   │
    │ f64 ┆ f64  ┆ f64   │
    ╞═════╪══════╪═══════╡
    │ NaN ┆ 10.0 ┆ 65.75 │
    │ NaN ┆ 2.5  ┆ NaN   │
    │ NaN ┆ 5.25 ┆ 10.5  │
    └─────┴──────┴───────┘
    """
    # Eager wrapper: route through the lazy engine so the filter logic lives
    # in one place (LazyFrame.drop_nans), then collect immediately.
    return self.lazy().drop_nans(subset).collect(_eager=True)

def drop_nulls(
self,
subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
Expand Down
103 changes: 82 additions & 21 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6169,12 +6169,91 @@ def unique(
subset = parse_into_list_of_expressions(subset)
return self._from_pyldf(self._ldf.unique(maintain_order, subset, keep))

def drop_nans(
    self,
    subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
) -> LazyFrame:
    """
    Drop all rows that contain one or more NaN values.

    The original order of the remaining rows is preserved.

    Parameters
    ----------
    subset
        Column name(s) for which NaN values are considered; if set to `None`
        (default), use all columns (note that only floating-point columns
        can contain NaNs).

    Examples
    --------
    >>> lf = pl.LazyFrame(
    ...     {
    ...         "foo": [-20.5, float("nan"), 80.0],
    ...         "bar": [float("nan"), 110.0, 25.5],
    ...         "ham": ["xxx", "yyy", None],
    ...     }
    ... )

    The default behavior of this method is to drop rows where any single
    value in the row is NaN:

    >>> lf.drop_nans().collect()
    shape: (1, 3)
    ┌──────┬──────┬──────┐
    │ foo  ┆ bar  ┆ ham  │
    │ ---  ┆ ---  ┆ ---  │
    │ f64  ┆ f64  ┆ str  │
    ╞══════╪══════╪══════╡
    │ 80.0 ┆ 25.5 ┆ null │
    └──────┴──────┴──────┘

    This behaviour can be constrained to consider only a subset of columns, as
    defined by name, or with a selector. For example, dropping rows only if
    there is a NaN in the "bar" column:

    >>> lf.drop_nans(subset=["bar"]).collect()
    shape: (2, 3)
    ┌──────┬───────┬──────┐
    │ foo  ┆ bar   ┆ ham  │
    │ ---  ┆ ---   ┆ ---  │
    │ f64  ┆ f64   ┆ str  │
    ╞══════╪═══════╪══════╡
    │ NaN  ┆ 110.0 ┆ yyy  │
    │ 80.0 ┆ 25.5  ┆ null │
    └──────┴───────┴──────┘

    Dropping a row only if *all* values are NaN requires a different formulation:

    >>> lf = pl.LazyFrame(
    ...     {
    ...         "a": [float("nan"), float("nan"), float("nan"), float("nan")],
    ...         "b": [10.0, 2.5, float("nan"), 5.25],
    ...         "c": [65.75, float("nan"), float("nan"), 10.5],
    ...     }
    ... )
    >>> lf.filter(~pl.all_horizontal(pl.all().is_nan())).collect()
    shape: (3, 3)
    ┌─────┬──────┬───────┐
    │ a   ┆ b    ┆ c     │
    │ --- ┆ ---  ┆ ---   │
    │ f64 ┆ f64  ┆ f64   │
    ╞═════╪══════╪═══════╡
    │ NaN ┆ 10.0 ┆ 65.75 │
    │ NaN ┆ 2.5  ┆ NaN   │
    │ NaN ┆ 5.25 ┆ 10.5  │
    └─────┴──────┴───────┘
    """
    # Normalize names/selectors to expressions only when a subset was given;
    # None is passed straight through so the Rust side checks all float columns.
    if subset is not None:
        subset = parse_into_list_of_expressions(subset)
    return self._from_pyldf(self._ldf.drop_nans(subset))

def drop_nulls(
self,
subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
) -> LazyFrame:
"""
Drop all rows that contain null values.
Drop all rows that contain one or more null values.
The original order of the remaining rows is preserved.
Expand All @@ -6195,7 +6274,7 @@ def drop_nulls(
... )
The default behavior of this method is to drop rows where any single
value of the row is null.
value in the row is null:
>>> lf.drop_nulls().collect()
shape: (1, 3)
Expand Down Expand Up @@ -6223,10 +6302,7 @@ def drop_nulls(
│ 3 ┆ 8 ┆ null │
└─────┴─────┴──────┘
This method drops a row if any single value of the row is null.
Below are some example snippets that show how you could drop null
values based on other conditions:
Dropping a row only if *all* values are null requires a different formulation:
>>> lf = pl.LazyFrame(
... {
Expand All @@ -6235,21 +6311,6 @@ def drop_nulls(
... "c": [1, None, None, 1],
... }
... )
>>> lf.collect()
shape: (4, 3)
┌──────┬──────┬──────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ null ┆ i64 ┆ i64 │
╞══════╪══════╪══════╡
│ null ┆ 1 ┆ 1 │
│ null ┆ 2 ┆ null │
│ null ┆ null ┆ null │
│ null ┆ 1 ┆ 1 │
└──────┴──────┴──────┘
Drop a row only if all values are null:
>>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect()
shape: (3, 3)
┌──────┬─────┬──────┐
Expand Down
65 changes: 49 additions & 16 deletions py-polars/tests/unit/operations/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@
from polars.testing import assert_frame_equal


def test_drop() -> None:
    # Dropping a column removes it from the frame shape.
    frame = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
    frame = frame.drop("a")
    assert frame.shape == (3, 2)

    # drop_in_place returns the removed column as a named Series.
    frame = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
    removed = frame.drop_in_place("a")
    assert removed.name == "a"


def test_drop_explode_6641() -> None:
df = pl.DataFrame(
{
Expand Down Expand Up @@ -68,30 +78,23 @@ def test_drop_nulls(subset: Any) -> None:
assert_frame_equal(result, df)


def test_drop() -> None:
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
df = df.drop("a")
assert df.shape == (3, 2)
def test_drop_nulls_lazy() -> None:
lf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [6, None, 8], "ham": ["a", "b", "c"]})
expected = pl.LazyFrame({"foo": [1, 3], "bar": [6, 8], "ham": ["a", "c"]})

df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
s = df.drop_in_place("a")
assert s.name == "a"
result = lf.drop_nulls()
assert_frame_equal(result, expected)

result = lf.drop_nulls(cs.contains("a"))
assert_frame_equal(result, expected)

def test_drop_nulls_lazy() -> None:

def test_drop_nulls_misc() -> None:
df = pl.DataFrame({"nrs": [None, 1, 2, 3, None, 4, 5, None]})
assert df.select(pl.col("nrs").drop_nulls()).to_dict(as_series=False) == {
"nrs": [1, 2, 3, 4, 5]
}

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8], "ham": ["a", "b", "c"]})
expected = pl.DataFrame({"foo": [1, 3], "bar": [6, 8], "ham": ["a", "c"]})
result = df.lazy().drop_nulls().collect()
assert_frame_equal(result, expected)

result = df.drop_nulls(cs.contains("a"))
assert_frame_equal(result, expected)


def test_drop_columns() -> None:
out = pl.LazyFrame({"a": [1], "b": [2], "c": [3]}).drop(["a", "b"])
Expand All @@ -110,6 +113,36 @@ def test_drop_columns() -> None:
assert out2.collect_schema().names() == []


@pytest.mark.parametrize("lazy", [True, False])
def test_drop_nans(lazy: bool) -> None:
    # Exercise both the eager and lazy implementations with the same data.
    frame_cls = pl.LazyFrame if lazy else pl.DataFrame
    df = frame_cls(
        {
            "a": [1.0, float("nan"), 3.0, 4.0],
            "b": [10000, 20000, 30000, 40000],
            "c": [-90.5, 25.0, 0.0, float("nan")],
        }
    )

    # Default: drop every row with a NaN in any float column.
    expected = frame_cls(
        {
            "a": [1.0, 3.0],
            "b": [10000, 30000],
            "c": [-90.5, 0.0],
        }
    )
    assert_frame_equal(expected, df.drop_nans())

    # Restricting to a subset only checks those columns; both a plain
    # column name and an equivalent selector must behave identically.
    expected = frame_cls(
        {
            "a": [1.0, float("nan"), 3.0],
            "b": [10000, 20000, 30000],
            "c": [-90.5, 25.0, 0.0],
        }
    )
    assert_frame_equal(expected, df.drop_nans(subset=["c"]))
    assert_frame_equal(expected, df.drop_nans(subset=cs.ends_with("c")))


def test_drop_nan_ignore_null_3525() -> None:
df = pl.DataFrame({"a": [1.0, float("nan"), 2.0, None, 3.0, 4.0]})
assert df.select(pl.col("a").drop_nans()).to_series().to_list() == [
Expand Down

0 comments on commit 0b218d8

Please sign in to comment.