Skip to content

Commit

Permalink
ERR: Shorten traceback in groupby _cython_agg_general (#52992)
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshadrach authored Jun 19, 2023
1 parent 25c579a commit d06f2d3
Show file tree
Hide file tree
Showing 11 changed files with 91 additions and 53 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ Other enhancements
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`).
- Groupby aggregations (such as :meth:`DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`)
- Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`)
- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
- Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
Expand Down
13 changes: 10 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1723,7 +1723,7 @@ def _agg_general(
return result.__finalize__(self.obj, method="groupby")

def _agg_py_fallback(
self, values: ArrayLike, ndim: int, alt: Callable
self, how: str, values: ArrayLike, ndim: int, alt: Callable
) -> ArrayLike:
"""
Fallback to pure-python aggregation if _cython_operation raises
Expand All @@ -1749,7 +1749,12 @@ def _agg_py_fallback(
# We do not get here with UDFs, so we know that our dtype
# should always be preserved by the implemented aggregations
# TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
try:
res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
except Exception as err:
msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"
# preserve the kind of exception that raised
raise type(err)(msg) from err

if ser.dtype == object:
res_values = res_values.astype(object, copy=False)
Expand Down Expand Up @@ -1791,8 +1796,10 @@ def array_func(values: ArrayLike) -> ArrayLike:
# TODO: shouldn't min_count matter?
if how in ["any", "all", "std", "sem"]:
raise # TODO: re-raise as TypeError? should not be reached
result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
else:
return result

result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
return result

new_mgr = data.grouped_reduce(array_func)
Expand Down
15 changes: 13 additions & 2 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

import pytest

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -141,7 +143,16 @@ def test_in_numeric_groupby(self, data_for_grouping):
result = df.groupby("A").sum().columns
else:
expected = pd.Index(["C"])
with pytest.raises(TypeError, match="does not support"):
df.groupby("A").sum().columns

msg = "|".join(
[
# period/datetime
"does not support sum operations",
# all others
re.escape(f"agg function failed [how->sum,dtype->{dtype}"),
]
)
with pytest.raises(TypeError, match=msg):
df.groupby("A").sum()
result = df.groupby("A").sum(numeric_only=True).columns
tm.assert_index_equal(result, expected)
8 changes: 5 additions & 3 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping, request):
super().test_groupby_extension_agg(as_index, data_for_grouping)

def test_in_numeric_groupby(self, data_for_grouping):
if is_string_dtype(data_for_grouping.dtype):
dtype = data_for_grouping.dtype
if is_string_dtype(dtype):
df = pd.DataFrame(
{
"A": [1, 1, 2, 2, 3, 3, 1, 4],
Expand All @@ -595,8 +596,9 @@ def test_in_numeric_groupby(self, data_for_grouping):
)

expected = pd.Index(["C"])
with pytest.raises(TypeError, match="does not support"):
df.groupby("A").sum().columns
msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}")
with pytest.raises(TypeError, match=msg):
df.groupby("A").sum()
result = df.groupby("A").sum(numeric_only=True).columns
tm.assert_index_equal(result, expected)
else:
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datetime import datetime
from io import StringIO
import itertools
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -1897,7 +1898,8 @@ def test_stack_multiple_bug(self):
multi = df.set_index(["DATE", "ID"])
multi.columns.name = "Params"
unst = multi.unstack("ID")
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
unst.resample("W-THU").mean()
down = unst.resample("W-THU").mean(numeric_only=True)
rs = down.stack("ID")
Expand Down
23 changes: 9 additions & 14 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import builtins
from io import StringIO
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -249,8 +250,10 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
msg = "|".join(
[
"Categorical is not ordered",
"function is not implemented for this dtype",
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
# cumsum/cummin/cummax/cumprod
"function is not implemented for this dtype",
]
)
with pytest.raises(exception, match=msg):
Expand All @@ -259,12 +262,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
msg = "|".join(
[
"category type does not support sum operations",
"[Cc]ould not convert",
"can't multiply sequence by non-int of type 'str'",
re.escape(f"agg function failed [how->{method},dtype->object]"),
]
)
if method == "median":
msg = r"Cannot convert \['a' 'b'\] to numeric"
with pytest.raises(exception, match=msg):
getattr(gb, method)()
else:
Expand All @@ -274,16 +274,13 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
if method not in ("first", "last"):
msg = "|".join(
[
"[Cc]ould not convert",
"Categorical is not ordered",
"category type does not support",
"can't multiply sequence",
"function is not implemented for this dtype",
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
]
)
if method == "median":
msg = r"Cannot convert \['a' 'b'\] to numeric"
with pytest.raises(exception, match=msg):
getattr(gb, method)(numeric_only=False)
else:
Expand Down Expand Up @@ -1464,16 +1461,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
msg = "|".join(
[
"not allowed for this dtype",
"must be a string or a number",
"cannot be performed against 'object' dtypes",
"must be a string or a real number",
# On PY39 message is "a number"; on PY310 and after is "a real number"
"must be a string or a.* number",
"unsupported operand type",
"not supported between instances of",
"function is not implemented for this dtype",
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
]
)
if kernel == "median":
msg = r"Cannot convert \[<class 'object'> <class 'object'>\] to numeric"
with pytest.raises(exception, match=msg):
method(*args, **kwargs)
elif not has_arg and numeric_only is not lib.no_default:
Expand Down
26 changes: 16 additions & 10 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datetime import datetime
from decimal import Decimal
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -641,7 +642,7 @@ def test_frame_multi_key_function_list_partial_failure():

grouped = data.groupby(["A", "B"])
funcs = [np.mean, np.std]
msg = "Could not convert string 'dullshinyshiny' to numeric"
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
grouped.agg(funcs)

Expand Down Expand Up @@ -925,9 +926,10 @@ def test_groupby_multi_corner(df):

def test_raises_on_nuisance(df):
grouped = df.groupby("A")
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
grouped.agg(np.mean)
with pytest.raises(TypeError, match="Could not convert"):
with pytest.raises(TypeError, match=msg):
grouped.mean()

df = df.loc[:, ["A", "C", "D"]]
Expand Down Expand Up @@ -975,10 +977,12 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
if agg_function in no_drop_nuisance and not numeric_only:
# Added numeric_only as part of GH#46560; these do not drop nuisance
# columns when numeric_only is False
klass = ValueError if agg_function in ("std", "sem") else TypeError
msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
if agg_function == "median":
msg = r"Cannot convert \['one' 'three' 'two'\] to numeric"
if agg_function in ("std", "sem"):
klass = ValueError
msg = "could not convert string to float: 'one'"
else:
klass = TypeError
msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]")
with pytest.raises(klass, match=msg):
getattr(grouped, agg_function)(numeric_only=numeric_only)
else:
Expand All @@ -1003,9 +1007,10 @@ def test_raise_on_nuisance_python_single(df):

def test_raise_on_nuisance_python_multiple(three_group):
grouped = three_group.groupby(["A", "B"])
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
grouped.agg(np.mean)
with pytest.raises(TypeError, match="Could not convert"):
with pytest.raises(TypeError, match=msg):
grouped.mean()


Expand Down Expand Up @@ -1045,7 +1050,8 @@ def test_wrap_aggregated_output_multindex(mframe):
df["baz", "two"] = "peekaboo"

keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
df.groupby(keys).agg(np.mean)
agged = df.drop(columns=("baz", "two")).groupby(keys).agg(np.mean)
assert isinstance(agged.columns, MultiIndex)
Expand Down
24 changes: 12 additions & 12 deletions pandas/tests/groupby/test_raises.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# test file.

import datetime
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -162,24 +163,20 @@ def test_groupby_raises_string(
"max": (None, ""),
"mean": (
TypeError,
"Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric",
re.escape("agg function failed [how->mean,dtype->object]"),
),
"median": (
TypeError,
"|".join(
[
r"Cannot convert \['x' 'y' 'z'\] to numeric",
r"Cannot convert \['x' 'y'\] to numeric",
r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric",
r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric",
]
),
re.escape("agg function failed [how->median,dtype->object]"),
),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (TypeError, "unsupported operand type"),
"prod": (TypeError, "can't multiply sequence by non-int of type 'str'"),
"prod": (
TypeError,
re.escape("agg function failed [how->prod,dtype->object]"),
),
"quantile": (TypeError, "cannot be performed against 'object' dtypes!"),
"rank": (None, ""),
"sem": (ValueError, "could not convert string to float"),
Expand All @@ -188,7 +185,10 @@ def test_groupby_raises_string(
"skew": (ValueError, "could not convert string to float"),
"std": (ValueError, "could not convert string to float"),
"sum": (None, ""),
"var": (TypeError, "could not convert string to float"),
"var": (
TypeError,
re.escape("agg function failed [how->var,dtype->object]"),
),
}[groupby_func]

_call_and_check(klass, msg, how, gb, groupby_func, args)
Expand Down Expand Up @@ -225,7 +225,7 @@ def test_groupby_raises_string_np(
np.sum: (None, ""),
np.mean: (
TypeError,
"Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric",
re.escape("agg function failed [how->mean,dtype->object]"),
),
}[groupby_func_np]

Expand Down
16 changes: 12 additions & 4 deletions pandas/tests/resample/test_resample_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import datetime
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -186,7 +187,8 @@ def tests_raises_on_nuisance(test_frame):
tm.assert_frame_equal(result, expected)

expected = r[["A", "B", "C"]].mean()
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
r.mean()
result = r.mean(numeric_only=True)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -886,8 +888,13 @@ def test_frame_downsample_method(method, numeric_only, expected_data):

func = getattr(resampled, method)
if isinstance(expected_data, str):
klass = TypeError if method in ("var", "mean", "median", "prod") else ValueError
with pytest.raises(klass, match=expected_data):
if method in ("var", "mean", "median", "prod"):
klass = TypeError
msg = re.escape(f"agg function failed [how->{method},dtype->object]")
else:
klass = ValueError
msg = expected_data
with pytest.raises(klass, match=msg):
_ = func(**kwargs)
else:
result = func(**kwargs)
Expand Down Expand Up @@ -933,7 +940,8 @@ def test_series_downsample_method(method, numeric_only, expected_data):
with pytest.raises(TypeError, match=msg):
func(**kwargs)
elif method == "prod":
with pytest.raises(TypeError, match="can't multiply sequence by non-int"):
msg = re.escape("agg function failed [how->prod,dtype->object]")
with pytest.raises(TypeError, match=msg):
func(**kwargs)
else:
result = func(**kwargs)
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

import numpy as np
import pytest

Expand Down Expand Up @@ -567,7 +569,8 @@ def test_mixed_type_join_with_suffix(self):
df.insert(5, "dt", "foo")

grouped = df.groupby("id")
with pytest.raises(TypeError, match="Could not convert"):
msg = re.escape("agg function failed [how->mean,dtype->object]")
with pytest.raises(TypeError, match=msg):
grouped.mean()
mn = grouped.mean(numeric_only=True)
cn = grouped.count()
Expand Down
Loading

0 comments on commit d06f2d3

Please sign in to comment.