From 0b218d8e2c449457284acbee30b0e5c10e7ec9c0 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 27 Nov 2024 23:11:31 +0400 Subject: [PATCH] feat: Add `drop_nans` method to DataFrame and LazyFrame (#20029) --- crates/polars-core/src/series/mod.rs | 2 +- crates/polars-lazy/src/frame/mod.rs | 12 +- crates/polars-plan/src/plans/builder_dsl.rs | 20 ++++ crates/polars-python/src/lazyframe/general.rs | 7 ++ .../reference/dataframe/modify_select.rst | 1 + .../reference/lazyframe/modify_select.rst | 1 + py-polars/polars/dataframe/frame.py | 77 +++++++++++++ py-polars/polars/lazyframe/frame.py | 103 ++++++++++++++---- py-polars/tests/unit/operations/test_drop.py | 65 ++++++++--- 9 files changed, 249 insertions(+), 39 deletions(-) diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 81754abafa19..a98018146d86 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -553,7 +553,7 @@ impl Series { } } - /// Check if float value is NaN (note this is different than missing/ null) + /// Check if float value is NaN (note this is different than missing/null) pub fn is_not_nan(&self) -> PolarsResult { match self.dtype() { DataType::Float32 => Ok(self.f32().unwrap().is_not_nan()), diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 9759988bb5cf..617bea135acb 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -1663,7 +1663,17 @@ impl LazyFrame { Self::from_logical_plan(lp, opt_state) } - /// Drop rows containing None. + /// Drop rows containing one or more NaN values. + /// + /// `subset` is an optional `Vec` of column names to consider for NaNs; if None, all + /// floating point columns are considered. + pub fn drop_nans(self, subset: Option>) -> LazyFrame { + let opt_state = self.get_opt_state(); + let lp = self.get_plan_builder().drop_nans(subset).build(); + Self::from_logical_plan(lp, opt_state) + } + + /// Drop rows containing one or more None values. /// /// `subset` is an optional `Vec` of column names to consider for nulls; if None, all /// columns are considered. diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index 258ad2d4e83f..ec8e7c4ceebe 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -231,6 +231,26 @@ impl DslBuilder { ) } + pub fn drop_nans(self, subset: Option>) -> Self { + if let Some(subset) = subset { + self.filter( + all_horizontal( + subset + .into_iter() + .map(|v| v.is_not_nan()) + .collect::>(), + ) + .unwrap(), + ) + } else { + self.filter( + // TODO: when Decimal supports NaN, include here + all_horizontal([dtype_cols([DataType::Float32, DataType::Float64]).is_not_nan()]) + .unwrap(), + ) + } + } + pub fn drop_nulls(self, subset: Option>) -> Self { if let Some(subset) = subset { self.filter( diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index f9fb740d4cae..6a47a0e191a8 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -1114,6 +1114,13 @@ impl PyLazyFrame { .into() } + #[pyo3(signature = (subset=None))] + fn drop_nans(&self, subset: Option>) -> Self { + let ldf = self.ldf.clone(); + let subset = subset.map(|e| e.to_exprs()); + ldf.drop_nans(subset).into() + } + #[pyo3(signature = (subset=None))] fn drop_nulls(&self, subset: Option>) -> Self { let ldf = self.ldf.clone(); diff --git a/py-polars/docs/source/reference/dataframe/modify_select.rst b/py-polars/docs/source/reference/dataframe/modify_select.rst index b3a3d024ebd2..4837e641c8e5 100644 --- a/py-polars/docs/source/reference/dataframe/modify_select.rst +++ b/py-polars/docs/source/reference/dataframe/modify_select.rst @@ -13,6 +13,7 @@ Manipulation/selection DataFrame.clone DataFrame.drop DataFrame.drop_in_place + DataFrame.drop_nans DataFrame.drop_nulls DataFrame.explode DataFrame.extend diff --git a/py-polars/docs/source/reference/lazyframe/modify_select.rst b/py-polars/docs/source/reference/lazyframe/modify_select.rst index f26a600966d2..aa5a0f009a81 100644 --- a/py-polars/docs/source/reference/lazyframe/modify_select.rst +++ b/py-polars/docs/source/reference/lazyframe/modify_select.rst @@ -12,6 +12,7 @@ Manipulation/selection LazyFrame.clear LazyFrame.clone LazyFrame.drop + LazyFrame.drop_nans LazyFrame.drop_nulls LazyFrame.explode LazyFrame.fill_nan diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 5d61d80455f6..26ead13b2b33 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5642,6 +5642,83 @@ def limit(self, n: int = 5) -> DataFrame: """ return self.head(n) + def drop_nans( + self, + subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None, + ) -> DataFrame: + """ + Drop all rows that contain one or more NaN values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which NaN values are considered; if set to `None` + (default), use all columns (note that only floating-point columns + can contain NaNs). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [-20.5, float("nan"), 80.0], + ... "bar": [float("nan"), 110.0, 25.5], + ... "ham": ["xxx", "yyy", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value in the row is NaN: + + >>> df.drop_nans() + shape: (1, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 80.0 ┆ 25.5 ┆ null │ + └──────┴──────┴──────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name, or with a selector. For example, dropping rows only if + there is a NaN in the "bar" column: + + >>> df.drop_nans(subset=["bar"]) + shape: (2, 3) + ┌──────┬───────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════╪═══════╪══════╡ + │ NaN ┆ 110.0 ┆ yyy │ + │ 80.0 ┆ 25.5 ┆ null │ + └──────┴───────┴──────┘ + + Dropping a row only if *all* values are NaN requires a different formulation: + + >>> df = pl.DataFrame( + ... { + ... "a": [float("nan"), float("nan"), float("nan"), float("nan")], + ... "b": [10.0, 2.5, float("nan"), 5.25], + ... "c": [65.75, float("nan"), float("nan"), 10.5], + ... } + ... ) + >>> df.filter(~pl.all_horizontal(pl.all().is_nan())) + shape: (3, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪═══════╡ + │ NaN ┆ 10.0 ┆ 65.75 │ + │ NaN ┆ 2.5 ┆ NaN │ + │ NaN ┆ 5.25 ┆ 10.5 │ + └─────┴──────┴───────┘ + """ + return self.lazy().drop_nans(subset).collect(_eager=True) + def drop_nulls( self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None, diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 062cbf36ac5a..4a9591ac89ed 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -6169,12 +6169,91 @@ def unique( subset = parse_into_list_of_expressions(subset) return self._from_pyldf(self._ldf.unique(maintain_order, subset, keep)) + def drop_nans( + self, + subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None, + ) -> LazyFrame: + """ + Drop all rows that contain one or more NaN values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which NaN values are considered; if set to `None` + (default), use all columns (note that only floating-point columns + can contain NaNs). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [-20.5, float("nan"), 80.0], + ... "bar": [float("nan"), 110.0, 25.5], + ... "ham": ["xxx", "yyy", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value in the row is NaN: + + >>> lf.drop_nans().collect() + shape: (1, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 80.0 ┆ 25.5 ┆ null │ + └──────┴──────┴──────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name, or with a selector. For example, dropping rows only if + there is a NaN in the "bar" column: + + >>> lf.drop_nans(subset=["bar"]).collect() + shape: (2, 3) + ┌──────┬───────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════╪═══════╪══════╡ + │ NaN ┆ 110.0 ┆ yyy │ + │ 80.0 ┆ 25.5 ┆ null │ + └──────┴───────┴──────┘ + + Dropping a row only if *all* values are NaN requires a different formulation: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [float("nan"), float("nan"), float("nan"), float("nan")], + ... "b": [10.0, 2.5, float("nan"), 5.25], + ... "c": [65.75, float("nan"), float("nan"), 10.5], + ... } + ... ) + >>> lf.filter(~pl.all_horizontal(pl.all().is_nan())).collect() + shape: (3, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪═══════╡ + │ NaN ┆ 10.0 ┆ 65.75 │ + │ NaN ┆ 2.5 ┆ NaN │ + │ NaN ┆ 5.25 ┆ 10.5 │ + └─────┴──────┴───────┘ + """ + if subset is not None: + subset = parse_into_list_of_expressions(subset) + return self._from_pyldf(self._ldf.drop_nans(subset)) + def drop_nulls( self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None, ) -> LazyFrame: """ - Drop all rows that contain null values. + Drop all rows that contain one or more null values. The original order of the remaining rows is preserved. @@ -6195,7 +6274,7 @@ def drop_nulls( ... ) The default behavior of this method is to drop rows where any single - value of the row is null. + value in the row is null: >>> lf.drop_nulls().collect() shape: (1, 3) @@ -6223,10 +6302,7 @@ def drop_nulls( │ 3 ┆ 8 ┆ null │ └─────┴─────┴──────┘ - This method drops a row if any single value of the row is null. - - Below are some example snippets that show how you could drop null - values based on other conditions: + Dropping a row only if *all* values are null requires a different formulation: >>> lf = pl.LazyFrame( ... { @@ -6235,21 +6311,6 @@ def drop_nulls( ... "c": [1, None, None, 1], ... } ... ) - >>> lf.collect() - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ null ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() shape: (3, 3) ┌──────┬─────┬──────┐ diff --git a/py-polars/tests/unit/operations/test_drop.py b/py-polars/tests/unit/operations/test_drop.py index c7b340abace6..fec5c16c2bf0 100644 --- a/py-polars/tests/unit/operations/test_drop.py +++ b/py-polars/tests/unit/operations/test_drop.py @@ -7,6 +7,16 @@ from polars.testing import assert_frame_equal +def test_drop() -> None: + df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]}) + df = df.drop("a") + assert df.shape == (3, 2) + + df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]}) + s = df.drop_in_place("a") + assert s.name == "a" + + def test_drop_explode_6641() -> None: df = pl.DataFrame( { @@ -68,30 +78,23 @@ def test_drop_nulls(subset: Any) -> None: assert_frame_equal(result, df) -def test_drop() -> None: - df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]}) - df = df.drop("a") - assert df.shape == (3, 2) +def test_drop_nulls_lazy() -> None: + lf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [6, None, 8], "ham": ["a", "b", "c"]}) + expected = pl.LazyFrame({"foo": [1, 3], "bar": [6, 8], "ham": ["a", "c"]}) - df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]}) - s = df.drop_in_place("a") - assert s.name == "a" + result = lf.drop_nulls() + assert_frame_equal(result, expected) + result = lf.drop_nulls(cs.contains("a")) + assert_frame_equal(result, expected) -def test_drop_nulls_lazy() -> None: + +def test_drop_nulls_misc() -> None: df = pl.DataFrame({"nrs": [None, 1, 2, 3, None, 4, 5, None]}) assert df.select(pl.col("nrs").drop_nulls()).to_dict(as_series=False) == { "nrs": [1, 2, 3, 4, 5] } - df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8], "ham": ["a", "b", "c"]}) - expected = pl.DataFrame({"foo": [1, 3], "bar": [6, 8], "ham": ["a", "c"]}) - result = df.lazy().drop_nulls().collect() - assert_frame_equal(result, expected) - - result = df.drop_nulls(cs.contains("a")) - assert_frame_equal(result, expected) - def test_drop_columns() -> None: out = pl.LazyFrame({"a": [1], "b": [2], "c": [3]}).drop(["a", "b"]) @@ -110,6 +113,36 @@ def test_drop_columns() -> None: assert out2.collect_schema().names() == [] +@pytest.mark.parametrize("lazy", [True, False]) +def test_drop_nans(lazy: bool) -> None: + DataFrame = pl.LazyFrame if lazy else pl.DataFrame + df = DataFrame( + { + "a": [1.0, float("nan"), 3.0, 4.0], + "b": [10000, 20000, 30000, 40000], + "c": [-90.5, 25.0, 0.0, float("nan")], + } + ) + expected = DataFrame( + { + "a": [1.0, 3.0], + "b": [10000, 30000], + "c": [-90.5, 0.0], + } + ) + assert_frame_equal(expected, df.drop_nans()) + + expected = DataFrame( + { + "a": [1.0, float("nan"), 3.0], + "b": [10000, 20000, 30000], + "c": [-90.5, 25.0, 0.0], + } + ) + assert_frame_equal(expected, df.drop_nans(subset=["c"])) + assert_frame_equal(expected, df.drop_nans(subset=cs.ends_with("c"))) + + def test_drop_nan_ignore_null_3525() -> None: df = pl.DataFrame({"a": [1.0, float("nan"), 2.0, None, 3.0, 4.0]}) assert df.select(pl.col("a").drop_nans()).to_series().to_list() == [