Skip to content

Commit

Permalink
ENH: Add numeric_only to frame methods (#46708)
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshadrach authored Apr 9, 2022
1 parent 8172879 commit 860797b
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 21 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ Other enhancements
- :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
- :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
-
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.notable_bug_fixes:
Expand Down
70 changes: 55 additions & 15 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9764,6 +9764,7 @@ def corr(
self,
method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
min_periods: int = 1,
numeric_only: bool = True,
) -> DataFrame:
"""
Compute pairwise correlation of columns, excluding NA/null values.
Expand All @@ -9784,6 +9785,10 @@ def corr(
Minimum number of observations required per pair of columns
to have a valid result. Currently only available for Pearson
and Spearman correlation.
numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.
.. versionadded:: 1.5.0
Returns
-------
Expand Down Expand Up @@ -9823,10 +9828,13 @@ def corr(
dogs 1.0 NaN
cats NaN 1.0
""" # noqa:E501
numeric_df = self._get_numeric_data()
cols = numeric_df.columns
if numeric_only:
data = self._get_numeric_data()
else:
data = self
cols = data.columns
idx = cols.copy()
mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if method == "pearson":
correl = libalgos.nancorr(mat, minp=min_periods)
Expand Down Expand Up @@ -9865,7 +9873,12 @@ def corr(

return self._constructor(correl, index=idx, columns=cols)

def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame:
def cov(
self,
min_periods: int | None = None,
ddof: int | None = 1,
numeric_only: bool = True,
) -> DataFrame:
"""
Compute pairwise covariance of columns, excluding NA/null values.
Expand Down Expand Up @@ -9896,6 +9909,11 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame
.. versionadded:: 1.1.0
numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.
.. versionadded:: 1.5.0
Returns
-------
DataFrame
Expand Down Expand Up @@ -9964,10 +9982,13 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame
b NaN 1.248003 0.191417
c -0.150812 0.191417 0.895202
"""
numeric_df = self._get_numeric_data()
cols = numeric_df.columns
if numeric_only:
data = self._get_numeric_data()
else:
data = self
cols = data.columns
idx = cols.copy()
mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if notna(mat).all():
if min_periods is not None and min_periods > len(mat):
Expand All @@ -9981,7 +10002,14 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame

return self._constructor(base_cov, index=idx, columns=cols)

def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Series:
def corrwith(
self,
other,
axis: Axis = 0,
drop=False,
method="pearson",
numeric_only: bool = True,
) -> Series:
"""
Compute pairwise correlation.
Expand All @@ -10008,6 +10036,11 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
* callable: callable with input two 1d ndarrays
and returning a float.
numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.
.. versionadded:: 1.5.0
Returns
-------
Series
Expand Down Expand Up @@ -10039,7 +10072,10 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
dtype: float64
""" # noqa:E501
axis = self._get_axis_number(axis)
this = self._get_numeric_data()
if numeric_only:
this = self._get_numeric_data()
else:
this = self

# GH46174: when other is a Series object and axis=0, we achieve a speedup over
# passing .corr() to .apply() by taking the columns as ndarrays and iterating
Expand All @@ -10052,19 +10088,23 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
if isinstance(other, Series):
if axis == 0 and method in ["pearson", "spearman"]:
corrs = {}
numeric_cols = self.select_dtypes(include=np.number).columns
ndf = self[numeric_cols].values.transpose()
if numeric_only:
cols = self.select_dtypes(include=np.number).columns
ndf = self[cols].values.transpose()
else:
cols = self.columns
ndf = self.values.transpose()
k = other.values
if method == "pearson":
for i, r in enumerate(ndf):
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
corrs[numeric_cols[i]] = np.corrcoef(
r[nonnull_mask], k[nonnull_mask]
)[0, 1]
corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[
0, 1
]
else:
for i, r in enumerate(ndf):
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
corrs[numeric_cols[i]] = np.corrcoef(
corrs[cols[i]] = np.corrcoef(
r[nonnull_mask].argsort().argsort(),
k[nonnull_mask].argsort().argsort(),
)[0, 1]
Expand Down
45 changes: 40 additions & 5 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,20 @@ def test_cov_nullable_integer(self, other_column):
expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("numeric_only", [True, False])
def test_cov_numeric_only(self, numeric_only):
# when the dtypes of the pandas Series differ,
# the resulting ndarray has dtype=object,
# so it needs to be handled properly
df = DataFrame({"a": [1, 0], "c": ["x", "y"]})
expected = DataFrame(0.5, index=["a"], columns=["a"])
if numeric_only:
result = df.cov(numeric_only=numeric_only)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(ValueError, match="could not convert string to float"):
df.cov(numeric_only=numeric_only)


class TestDataFrameCorr:
# DataFrame.corr(), as opposed to DataFrame.corrwith
Expand Down Expand Up @@ -235,6 +249,22 @@ def test_corr_min_periods_greater_than_length(self, method):
)
tm.assert_frame_equal(result, expected)

@td.skip_if_no_scipy
@pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_corr_numeric_only(self, meth, numeric_only):
# when the dtypes of the pandas Series differ,
# the resulting ndarray has dtype=object,
# so it needs to be handled properly
df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]})
expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
if numeric_only:
result = df.corr(meth, numeric_only=numeric_only)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(ValueError, match="could not convert string to float"):
df.corr(meth, numeric_only=numeric_only)


class TestDataFrameCorrWith:
def test_corrwith(self, datetime_frame):
Expand Down Expand Up @@ -300,16 +330,21 @@ def test_corrwith_matches_corrcoef(self):
tm.assert_almost_equal(c1, c2)
assert c1 < 1

def test_corrwith_mixed_dtypes(self):
@pytest.mark.parametrize("numeric_only", [True, False])
def test_corrwith_mixed_dtypes(self, numeric_only):
# GH#18570
df = DataFrame(
{"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
)
s = Series([0, 6, 7, 3])
result = df.corrwith(s)
corrs = [df["a"].corr(s), df["b"].corr(s)]
expected = Series(data=corrs, index=["a", "b"])
tm.assert_series_equal(result, expected)
if numeric_only:
result = df.corrwith(s, numeric_only=numeric_only)
corrs = [df["a"].corr(s), df["b"].corr(s)]
expected = Series(data=corrs, index=["a", "b"])
tm.assert_series_equal(result, expected)
else:
with pytest.raises(TypeError, match="not supported for the input types"):
df.corrwith(s, numeric_only=numeric_only)

def test_corrwith_index_intersection(self):
df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])
Expand Down

0 comments on commit 860797b

Please sign in to comment.