Skip to content

Commit

Permalink
feat: Add drop_nans method to DataFrame and LazyFrame (#20029)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored Nov 27, 2024
1 parent 74b9925 commit 0b218d8
Show file tree
Hide file tree
Showing 9 changed files with 249 additions and 39 deletions.
2 changes: 1 addition & 1 deletion crates/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ impl Series {
}
}

/// Check if float value is NaN (note this is different than missing/ null)
/// Check if float value is NaN (note this is different than missing/null)
pub fn is_not_nan(&self) -> PolarsResult<BooleanChunked> {
match self.dtype() {
DataType::Float32 => Ok(self.f32().unwrap().is_not_nan()),
Expand Down
12 changes: 11 additions & 1 deletion crates/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1663,7 +1663,17 @@ impl LazyFrame {
Self::from_logical_plan(lp, opt_state)
}

/// Drop rows containing None.
/// Drop rows containing one or more NaN values.
///
/// `subset` is an optional `Vec` of column names to consider for NaNs; if None, all
/// floating point columns are considered.
pub fn drop_nans(self, subset: Option<Vec<Expr>>) -> LazyFrame {
let opt_state = self.get_opt_state();
let lp = self.get_plan_builder().drop_nans(subset).build();
Self::from_logical_plan(lp, opt_state)
}

/// Drop rows containing one or more None values.
///
/// `subset` is an optional `Vec` of column names to consider for nulls; if None, all
/// columns are considered.
Expand Down
20 changes: 20 additions & 0 deletions crates/polars-plan/src/plans/builder_dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,26 @@ impl DslBuilder {
)
}

/// Filter out rows containing NaN values.
///
/// When `subset` is `Some`, only the given column expressions are checked;
/// otherwise every Float32/Float64 column participates in the check.
pub fn drop_nans(self, subset: Option<Vec<Expr>>) -> Self {
    // Build a single boolean predicate that is true only for rows where
    // every considered column is not NaN, then filter on it.
    let keep_row = match subset {
        Some(cols) => {
            let not_nan_checks = cols
                .into_iter()
                .map(|col| col.is_not_nan())
                .collect::<Vec<_>>();
            all_horizontal(not_nan_checks).unwrap()
        },
        None => {
            // TODO: when Decimal supports NaN, include here
            all_horizontal([dtype_cols([DataType::Float32, DataType::Float64]).is_not_nan()])
                .unwrap()
        },
    };
    self.filter(keep_row)
}

pub fn drop_nulls(self, subset: Option<Vec<Expr>>) -> Self {
if let Some(subset) = subset {
self.filter(
Expand Down
7 changes: 7 additions & 0 deletions crates/polars-python/src/lazyframe/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1114,6 +1114,13 @@ impl PyLazyFrame {
.into()
}

#[pyo3(signature = (subset=None))]
/// Python binding: delegate NaN-row dropping to the underlying LazyFrame.
fn drop_nans(&self, subset: Option<Vec<PyExpr>>) -> Self {
    // Convert the optional PyExpr list into native Exprs before delegating.
    let exprs = subset.map(|cols| cols.to_exprs());
    self.ldf.clone().drop_nans(exprs).into()
}

#[pyo3(signature = (subset=None))]
fn drop_nulls(&self, subset: Option<Vec<PyExpr>>) -> Self {
let ldf = self.ldf.clone();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Manipulation/selection
DataFrame.clone
DataFrame.drop
DataFrame.drop_in_place
DataFrame.drop_nans
DataFrame.drop_nulls
DataFrame.explode
DataFrame.extend
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Manipulation/selection
LazyFrame.clear
LazyFrame.clone
LazyFrame.drop
LazyFrame.drop_nans
LazyFrame.drop_nulls
LazyFrame.explode
LazyFrame.fill_nan
Expand Down
77 changes: 77 additions & 0 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5642,6 +5642,83 @@ def limit(self, n: int = 5) -> DataFrame:
"""
return self.head(n)

def drop_nans(
    self,
    subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
) -> DataFrame:
    """
    Drop all rows that contain one or more NaN values.

    The original order of the remaining rows is preserved.

    Parameters
    ----------
    subset
        Column name(s) for which NaN values are considered; if set to `None`
        (default), use all columns (note that only floating-point columns
        can contain NaNs).

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [-20.5, float("nan"), 80.0],
    ...         "bar": [float("nan"), 110.0, 25.5],
    ...         "ham": ["xxx", "yyy", None],
    ...     }
    ... )

    The default behavior of this method is to drop rows where any single
    value in the row is NaN:

    >>> df.drop_nans()
    shape: (1, 3)
    ┌──────┬──────┬──────┐
    │ foo  ┆ bar  ┆ ham  │
    │ ---  ┆ ---  ┆ ---  │
    │ f64  ┆ f64  ┆ str  │
    ╞══════╪══════╪══════╡
    │ 80.0 ┆ 25.5 ┆ null │
    └──────┴──────┴──────┘

    This behaviour can be constrained to consider only a subset of columns, as
    defined by name, or with a selector. For example, dropping rows only if
    there is a NaN in the "bar" column:

    >>> df.drop_nans(subset=["bar"])
    shape: (2, 3)
    ┌──────┬───────┬──────┐
    │ foo  ┆ bar   ┆ ham  │
    │ ---  ┆ ---   ┆ ---  │
    │ f64  ┆ f64   ┆ str  │
    ╞══════╪═══════╪══════╡
    │ NaN  ┆ 110.0 ┆ yyy  │
    │ 80.0 ┆ 25.5  ┆ null │
    └──────┴───────┴──────┘

    Dropping a row only if *all* values are NaN requires a different formulation:

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [float("nan"), float("nan"), float("nan"), float("nan")],
    ...         "b": [10.0, 2.5, float("nan"), 5.25],
    ...         "c": [65.75, float("nan"), float("nan"), 10.5],
    ...     }
    ... )
    >>> df.filter(~pl.all_horizontal(pl.all().is_nan()))
    shape: (3, 3)
    ┌─────┬──────┬───────┐
    │ a   ┆ b    ┆ c     │
    │ --- ┆ ---  ┆ ---   │
    │ f64 ┆ f64  ┆ f64   │
    ╞═════╪══════╪═══════╡
    │ NaN ┆ 10.0 ┆ 65.75 │
    │ NaN ┆ 2.5  ┆ NaN   │
    │ NaN ┆ 5.25 ┆ 10.5  │
    └─────┴──────┴───────┘
    """
    # Eager wrapper: route through the lazy engine so the filter logic lives
    # in one place (LazyFrame.drop_nans), then collect immediately.
    return self.lazy().drop_nans(subset).collect(_eager=True)

def drop_nulls(
self,
subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
Expand Down
103 changes: 82 additions & 21 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6169,12 +6169,91 @@ def unique(
subset = parse_into_list_of_expressions(subset)
return self._from_pyldf(self._ldf.unique(maintain_order, subset, keep))

def drop_nans(
    self,
    subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
) -> LazyFrame:
    """
    Drop all rows that contain one or more NaN values.

    The original order of the remaining rows is preserved.

    Parameters
    ----------
    subset
        Column name(s) for which NaN values are considered; if set to `None`
        (default), use all columns (note that only floating-point columns
        can contain NaNs).

    Examples
    --------
    >>> lf = pl.LazyFrame(
    ...     {
    ...         "foo": [-20.5, float("nan"), 80.0],
    ...         "bar": [float("nan"), 110.0, 25.5],
    ...         "ham": ["xxx", "yyy", None],
    ...     }
    ... )

    The default behavior of this method is to drop rows where any single
    value in the row is NaN:

    >>> lf.drop_nans().collect()
    shape: (1, 3)
    ┌──────┬──────┬──────┐
    │ foo  ┆ bar  ┆ ham  │
    │ ---  ┆ ---  ┆ ---  │
    │ f64  ┆ f64  ┆ str  │
    ╞══════╪══════╪══════╡
    │ 80.0 ┆ 25.5 ┆ null │
    └──────┴──────┴──────┘

    This behaviour can be constrained to consider only a subset of columns, as
    defined by name, or with a selector. For example, dropping rows only if
    there is a NaN in the "bar" column:

    >>> lf.drop_nans(subset=["bar"]).collect()
    shape: (2, 3)
    ┌──────┬───────┬──────┐
    │ foo  ┆ bar   ┆ ham  │
    │ ---  ┆ ---   ┆ ---  │
    │ f64  ┆ f64   ┆ str  │
    ╞══════╪═══════╪══════╡
    │ NaN  ┆ 110.0 ┆ yyy  │
    │ 80.0 ┆ 25.5  ┆ null │
    └──────┴───────┴──────┘

    Dropping a row only if *all* values are NaN requires a different formulation:

    >>> lf = pl.LazyFrame(
    ...     {
    ...         "a": [float("nan"), float("nan"), float("nan"), float("nan")],
    ...         "b": [10.0, 2.5, float("nan"), 5.25],
    ...         "c": [65.75, float("nan"), float("nan"), 10.5],
    ...     }
    ... )
    >>> lf.filter(~pl.all_horizontal(pl.all().is_nan())).collect()
    shape: (3, 3)
    ┌─────┬──────┬───────┐
    │ a   ┆ b    ┆ c     │
    │ --- ┆ ---  ┆ ---   │
    │ f64 ┆ f64  ┆ f64   │
    ╞═════╪══════╪═══════╡
    │ NaN ┆ 10.0 ┆ 65.75 │
    │ NaN ┆ 2.5  ┆ NaN   │
    │ NaN ┆ 5.25 ┆ 10.5  │
    └─────┴──────┴───────┘
    """
    # Normalize names/selectors to expressions only when a subset was given;
    # None is passed straight through so the Rust side checks all float columns.
    if subset is not None:
        subset = parse_into_list_of_expressions(subset)
    return self._from_pyldf(self._ldf.drop_nans(subset))

def drop_nulls(
self,
subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
) -> LazyFrame:
"""
Drop all rows that contain null values.
Drop all rows that contain one or more null values.
The original order of the remaining rows is preserved.
Expand All @@ -6195,7 +6274,7 @@ def drop_nulls(
... )
The default behavior of this method is to drop rows where any single
value of the row is null.
value in the row is null:
>>> lf.drop_nulls().collect()
shape: (1, 3)
Expand Down Expand Up @@ -6223,10 +6302,7 @@ def drop_nulls(
│ 3 ┆ 8 ┆ null │
└─────┴─────┴──────┘
This method drops a row if any single value of the row is null.
Below are some example snippets that show how you could drop null
values based on other conditions:
Dropping a row only if *all* values are null requires a different formulation:
>>> lf = pl.LazyFrame(
... {
Expand All @@ -6235,21 +6311,6 @@ def drop_nulls(
... "c": [1, None, None, 1],
... }
... )
>>> lf.collect()
shape: (4, 3)
┌──────┬──────┬──────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ null ┆ i64 ┆ i64 │
╞══════╪══════╪══════╡
│ null ┆ 1 ┆ 1 │
│ null ┆ 2 ┆ null │
│ null ┆ null ┆ null │
│ null ┆ 1 ┆ 1 │
└──────┴──────┴──────┘
Drop a row only if all values are null:
>>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect()
shape: (3, 3)
┌──────┬─────┬──────┐
Expand Down
65 changes: 49 additions & 16 deletions py-polars/tests/unit/operations/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@
from polars.testing import assert_frame_equal


def test_drop() -> None:
    # Dropping a column removes it from the frame shape.
    frame = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
    frame = frame.drop("a")
    assert frame.shape == (3, 2)

    # drop_in_place returns the removed column as a named Series.
    frame = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
    removed = frame.drop_in_place("a")
    assert removed.name == "a"


def test_drop_explode_6641() -> None:
df = pl.DataFrame(
{
Expand Down Expand Up @@ -68,30 +78,23 @@ def test_drop_nulls(subset: Any) -> None:
assert_frame_equal(result, df)


def test_drop() -> None:
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
df = df.drop("a")
assert df.shape == (3, 2)
def test_drop_nulls_lazy() -> None:
lf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [6, None, 8], "ham": ["a", "b", "c"]})
expected = pl.LazyFrame({"foo": [1, 3], "bar": [6, 8], "ham": ["a", "c"]})

df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
s = df.drop_in_place("a")
assert s.name == "a"
result = lf.drop_nulls()
assert_frame_equal(result, expected)

result = lf.drop_nulls(cs.contains("a"))
assert_frame_equal(result, expected)

def test_drop_nulls_lazy() -> None:

def test_drop_nulls_misc() -> None:
df = pl.DataFrame({"nrs": [None, 1, 2, 3, None, 4, 5, None]})
assert df.select(pl.col("nrs").drop_nulls()).to_dict(as_series=False) == {
"nrs": [1, 2, 3, 4, 5]
}

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8], "ham": ["a", "b", "c"]})
expected = pl.DataFrame({"foo": [1, 3], "bar": [6, 8], "ham": ["a", "c"]})
result = df.lazy().drop_nulls().collect()
assert_frame_equal(result, expected)

result = df.drop_nulls(cs.contains("a"))
assert_frame_equal(result, expected)


def test_drop_columns() -> None:
out = pl.LazyFrame({"a": [1], "b": [2], "c": [3]}).drop(["a", "b"])
Expand All @@ -110,6 +113,36 @@ def test_drop_columns() -> None:
assert out2.collect_schema().names() == []


@pytest.mark.parametrize("lazy", [True, False])
def test_drop_nans(lazy: bool) -> None:
    # Exercise both the eager and lazy implementations with the same data.
    frame_cls = pl.LazyFrame if lazy else pl.DataFrame
    df = frame_cls(
        {
            "a": [1.0, float("nan"), 3.0, 4.0],
            "b": [10000, 20000, 30000, 40000],
            "c": [-90.5, 25.0, 0.0, float("nan")],
        }
    )

    # Default: drop every row with a NaN in any float column.
    expected = frame_cls(
        {
            "a": [1.0, 3.0],
            "b": [10000, 30000],
            "c": [-90.5, 0.0],
        }
    )
    assert_frame_equal(expected, df.drop_nans())

    # Restricting to a subset only checks those columns; both a plain
    # column name and an equivalent selector must behave identically.
    expected = frame_cls(
        {
            "a": [1.0, float("nan"), 3.0],
            "b": [10000, 20000, 30000],
            "c": [-90.5, 25.0, 0.0],
        }
    )
    assert_frame_equal(expected, df.drop_nans(subset=["c"]))
    assert_frame_equal(expected, df.drop_nans(subset=cs.ends_with("c")))


def test_drop_nan_ignore_null_3525() -> None:
df = pl.DataFrame({"a": [1.0, float("nan"), 2.0, None, 3.0, 4.0]})
assert df.select(pl.col("a").drop_nans()).to_series().to_list() == [
Expand Down

0 comments on commit 0b218d8

Please sign in to comment.