ENH: Add numeric_only to frame methods (#46708)

pandas-dev · Apr 9, 2022 · 860797b · 860797b
1 parent 8172879
commit 860797b
Show file tree

Hide file tree

Showing 3 changed files with 96 additions and 21 deletions.
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -94,7 +94,7 @@ Other enhancements
 - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
 - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
 - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
--
+- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -9764,6 +9764,7 @@ def corr(
         self,
         method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
         min_periods: int = 1,
+        numeric_only: bool = True,
     ) -> DataFrame:
         """
         Compute pairwise correlation of columns, excluding NA/null values.
@@ -9784,6 +9785,10 @@ def corr(
             Minimum number of observations required per pair of columns
             to have a valid result. Currently only available for Pearson
             and Spearman correlation.
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
 
         Returns
         -------
@@ -9823,10 +9828,13 @@ def corr(
         dogs   1.0   NaN
         cats   NaN   1.0
         """  # noqa:E501
-        numeric_df = self._get_numeric_data()
-        cols = numeric_df.columns
+        if numeric_only:
+            data = self._get_numeric_data()
+        else:
+            data = self
+        cols = data.columns
         idx = cols.copy()
-        mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)
+        mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
 
         if method == "pearson":
             correl = libalgos.nancorr(mat, minp=min_periods)
@@ -9865,7 +9873,12 @@ def corr(
 
         return self._constructor(correl, index=idx, columns=cols)
 
-    def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame:
+    def cov(
+        self,
+        min_periods: int | None = None,
+        ddof: int | None = 1,
+        numeric_only: bool = True,
+    ) -> DataFrame:
         """
         Compute pairwise covariance of columns, excluding NA/null values.
 
@@ -9896,6 +9909,11 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame
 
             .. versionadded:: 1.1.0
 
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         DataFrame
@@ -9964,10 +9982,13 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame
         b       NaN  1.248003  0.191417
         c -0.150812  0.191417  0.895202
         """
-        numeric_df = self._get_numeric_data()
-        cols = numeric_df.columns
+        if numeric_only:
+            data = self._get_numeric_data()
+        else:
+            data = self
+        cols = data.columns
         idx = cols.copy()
-        mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)
+        mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
 
         if notna(mat).all():
             if min_periods is not None and min_periods > len(mat):
@@ -9981,7 +10002,14 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame
 
         return self._constructor(base_cov, index=idx, columns=cols)
 
-    def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Series:
+    def corrwith(
+        self,
+        other,
+        axis: Axis = 0,
+        drop=False,
+        method="pearson",
+        numeric_only: bool = True,
+    ) -> Series:
         """
         Compute pairwise correlation.
 
@@ -10008,6 +10036,11 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
             * callable: callable with input two 1d ndarrays
                 and returning a float.
 
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         Series
@@ -10039,7 +10072,10 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
         dtype: float64
         """  # noqa:E501
         axis = self._get_axis_number(axis)
-        this = self._get_numeric_data()
+        if numeric_only:
+            this = self._get_numeric_data()
+        else:
+            this = self
 
         # GH46174: when other is a Series object and axis=0, we achieve a speedup over
         # passing .corr() to .apply() by taking the columns as ndarrays and iterating
@@ -10052,19 +10088,23 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
         if isinstance(other, Series):
             if axis == 0 and method in ["pearson", "spearman"]:
                 corrs = {}
-                numeric_cols = self.select_dtypes(include=np.number).columns
-                ndf = self[numeric_cols].values.transpose()
+                if numeric_only:
+                    cols = self.select_dtypes(include=np.number).columns
+                    ndf = self[cols].values.transpose()
+                else:
+                    cols = self.columns
+                    ndf = self.values.transpose()
                 k = other.values
                 if method == "pearson":
                     for i, r in enumerate(ndf):
                         nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
-                        corrs[numeric_cols[i]] = np.corrcoef(
-                            r[nonnull_mask], k[nonnull_mask]
-                        )[0, 1]
+                        corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[
+                            0, 1
+                        ]
                 else:
                     for i, r in enumerate(ndf):
                         nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
-                        corrs[numeric_cols[i]] = np.corrcoef(
+                        corrs[cols[i]] = np.corrcoef(
                             r[nonnull_mask].argsort().argsort(),
                             k[nonnull_mask].argsort().argsort(),
                         )[0, 1]

diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -83,6 +83,20 @@ def test_cov_nullable_integer(self, other_column):
         expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("numeric_only", [True, False])
+    def test_cov_numeric_only(self, numeric_only):
+        # when dtypes of pandas series are different
+        # then ndarray will have dtype=object,
+        # so it need to be properly handled
+        df = DataFrame({"a": [1, 0], "c": ["x", "y"]})
+        expected = DataFrame(0.5, index=["a"], columns=["a"])
+        if numeric_only:
+            result = df.cov(numeric_only=numeric_only)
+            tm.assert_frame_equal(result, expected)
+        else:
+            with pytest.raises(ValueError, match="could not convert string to float"):
+                df.cov(numeric_only=numeric_only)
+
 
 class TestDataFrameCorr:
     # DataFrame.corr(), as opposed to DataFrame.corrwith
@@ -235,6 +249,22 @@ def test_corr_min_periods_greater_than_length(self, method):
         )
         tm.assert_frame_equal(result, expected)
 
+    @td.skip_if_no_scipy
+    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
+    @pytest.mark.parametrize("numeric_only", [True, False])
+    def test_corr_numeric_only(self, meth, numeric_only):
+        # when dtypes of pandas series are different
+        # then ndarray will have dtype=object,
+        # so it need to be properly handled
+        df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]})
+        expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
+        if numeric_only:
+            result = df.corr(meth, numeric_only=numeric_only)
+            tm.assert_frame_equal(result, expected)
+        else:
+            with pytest.raises(ValueError, match="could not convert string to float"):
+                df.corr(meth, numeric_only=numeric_only)
+
 
 class TestDataFrameCorrWith:
     def test_corrwith(self, datetime_frame):
@@ -300,16 +330,21 @@ def test_corrwith_matches_corrcoef(self):
         tm.assert_almost_equal(c1, c2)
         assert c1 < 1
 
-    def test_corrwith_mixed_dtypes(self):
+    @pytest.mark.parametrize("numeric_only", [True, False])
+    def test_corrwith_mixed_dtypes(self, numeric_only):
         # GH#18570
         df = DataFrame(
             {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
         )
         s = Series([0, 6, 7, 3])
-        result = df.corrwith(s)
-        corrs = [df["a"].corr(s), df["b"].corr(s)]
-        expected = Series(data=corrs, index=["a", "b"])
-        tm.assert_series_equal(result, expected)
+        if numeric_only:
+            result = df.corrwith(s, numeric_only=numeric_only)
+            corrs = [df["a"].corr(s), df["b"].corr(s)]
+            expected = Series(data=corrs, index=["a", "b"])
+            tm.assert_series_equal(result, expected)
+        else:
+            with pytest.raises(TypeError, match="not supported for the input types"):
+                df.corrwith(s, numeric_only=numeric_only)
 
     def test_corrwith_index_intersection(self):
         df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])