diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index af406f816e6148..ec46e043c82948 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -440,7 +440,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.errors.UnsortedIndexError SA01" \
         -i "pandas.errors.UnsupportedFunctionCall SA01" \
         -i "pandas.errors.ValueLabelTypeMismatch SA01" \
-        -i "pandas.get_option SA01" \
         -i "pandas.infer_freq SA01" \
         -i "pandas.interval_range RT03" \
         -i "pandas.io.formats.style.Styler.apply RT03" \
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index 53fc9245aa9720..f3b849dc6de453 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -763,7 +763,7 @@ Joining a single Index to a MultiIndex
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 You can join a :class:`DataFrame` with a :class:`Index` to a :class:`DataFrame` with a :class:`MultiIndex` on a level.
-The ``name`` of the :class:`Index` with match the level name of the :class:`MultiIndex`.
+The ``name`` of the :class:`Index` will match the level name of the :class:`MultiIndex`.
 
 .. ipython:: python
 
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index 8921e1b6863033..1b91a7c3ee6367 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -157,6 +157,12 @@ def get_option(pat: str) -> Any:
     ------
     OptionError : if no such option exists
 
+    See Also
+    --------
+    set_option : Set the value of the specified option or options.
+    reset_option : Reset one or more options to their default value.
+    describe_option : Print the description for one or more registered options.
+
     Notes
     -----
     For all available options, please view the :ref:`User Guide <options.available>`
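For context on the new ``See Also`` entries above: the four option helpers form one small API surface. A minimal usage sketch with standard pandas calls (illustration only, not part of this patch):

    import pandas as pd

    pd.set_option("display.max_rows", 50)            # set an option by full name
    assert pd.get_option("display.max_rows") == 50   # read it back
    pd.describe_option("display.max_rows")           # print description and values
    pd.reset_option("display.max_rows")              # restore the default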
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 01cc85ceff1819..87b2f97503b6fc 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -925,13 +925,26 @@ def _reorder_for_extension_array_stack(
 
 def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
     if frame.columns.nunique() != len(frame.columns):
         raise ValueError("Columns with duplicate values are not supported in stack")
-
-    # If we need to drop `level` from columns, it needs to be in descending order
     set_levels = set(level)
-    drop_levnums = sorted(level, reverse=True)
     stack_cols = frame.columns._drop_level_numbers(
         [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels]
     )
+
+    result = stack_reshape(frame, level, set_levels, stack_cols)
+
+    # Construct the correct MultiIndex by combining the frame's index and
+    # stacked columns.
+    ratio = 0 if frame.empty else len(result) // len(frame)
+
+    index_levels: list | FrozenList
+    if isinstance(frame.index, MultiIndex):
+        index_levels = frame.index.levels
+        index_codes = list(np.tile(frame.index.codes, (1, ratio)))
+    else:
+        codes, uniques = factorize(frame.index, use_na_sentinel=False)
+        index_levels = [uniques]
+        index_codes = list(np.tile(codes, (1, ratio)))
+
     if len(level) > 1:
         # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
         sorter = np.argsort(level)
@@ -939,13 +952,72 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
         ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
     else:
         ordered_stack_cols = stack_cols
-
-    stack_cols_unique = stack_cols.unique()
     ordered_stack_cols_unique = ordered_stack_cols.unique()
+    if isinstance(ordered_stack_cols, MultiIndex):
+        column_levels = ordered_stack_cols.levels
+        column_codes = ordered_stack_cols.drop_duplicates().codes
+    else:
+        column_levels = [ordered_stack_cols_unique]
+        column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
+
+    # error: Incompatible types in assignment (expression has type "list[ndarray[Any,
+    # dtype[Any]]]", variable has type "FrozenList")
+    column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]  # type: ignore[assignment]
+    result.index = MultiIndex(
+        levels=index_levels + column_levels,
+        codes=index_codes + column_codes,
+        names=frame.index.names + list(ordered_stack_cols.names),
+        verify_integrity=False,
+    )
+
+    # sort result, but faster than calling sort_index since we know the order we need
+    len_df = len(frame)
+    n_uniques = len(ordered_stack_cols_unique)
+    indexer = np.arange(n_uniques)
+    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
+    result = result.take(idxs)
+
+    # Reshape/rename if needed and dropna
+    if result.ndim == 2 and frame.columns.nlevels == len(level):
+        if len(result.columns) == 0:
+            result = Series(index=result.index)
+        else:
+            result = result.iloc[:, 0]
+    if result.ndim == 1:
+        result.name = None
+
+    return result
+
+
+def stack_reshape(
+    frame: DataFrame, level: list[int], set_levels: set[int], stack_cols: Index
+) -> Series | DataFrame:
+    """Reshape the data of a frame for stack.
+
+    This function takes care of most of the work that stack needs to do. The
+    caller will sort the result once the appropriate index is set.
+
+    Parameters
+    ----------
+    frame : DataFrame
+        DataFrame that is to be stacked.
+    level : list of int
+        Levels of the columns to stack.
+    set_levels : set of int
+        Same as level, but as a set.
+    stack_cols : Index
+        Columns of the result when the DataFrame is stacked.
+
+    Returns
+    -------
+    The data behind the stacked DataFrame.
+    """
+    # If we need to drop `level` from columns, it needs to be in descending order
+    drop_levnums = sorted(level, reverse=True)
 
     # Grab data for each unique index to be stacked
     buf = []
-    for idx in stack_cols_unique:
+    for idx in stack_cols.unique():
         if len(frame.columns) == 1:
             data = frame.copy()
         else:
@@ -972,10 +1044,8 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
             data.columns = RangeIndex(len(data.columns))
         buf.append(data)
 
-    result: Series | DataFrame
     if len(buf) > 0 and not frame.empty:
         result = concat(buf, ignore_index=True)
-        ratio = len(result) // len(frame)
     else:
         # input is empty
         if len(level) < frame.columns.nlevels:
@@ -984,7 +1054,6 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
         else:
             new_columns = [0]
         result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
-        ratio = 0
 
     if len(level) < frame.columns.nlevels:
         # concat column order may be different from dropping the levels
@@ -992,46 +1061,4 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
         if not result.columns.equals(desired_columns):
             result = result[desired_columns]
 
-    # Construct the correct MultiIndex by combining the frame's index and
-    # stacked columns.
-    index_levels: list | FrozenList
-    if isinstance(frame.index, MultiIndex):
-        index_levels = frame.index.levels
-        index_codes = list(np.tile(frame.index.codes, (1, ratio)))
-    else:
-        codes, uniques = factorize(frame.index, use_na_sentinel=False)
-        index_levels = [uniques]
-        index_codes = list(np.tile(codes, (1, ratio)))
-    if isinstance(ordered_stack_cols, MultiIndex):
-        column_levels = ordered_stack_cols.levels
-        column_codes = ordered_stack_cols.drop_duplicates().codes
-    else:
-        column_levels = [ordered_stack_cols.unique()]
-        column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
-    # error: Incompatible types in assignment (expression has type "list[ndarray[Any,
-    # dtype[Any]]]", variable has type "FrozenList")
-    column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]  # type: ignore[assignment]
-    result.index = MultiIndex(
-        levels=index_levels + column_levels,
-        codes=index_codes + column_codes,
-        names=frame.index.names + list(ordered_stack_cols.names),
-        verify_integrity=False,
-    )
-
-    # sort result, but faster than calling sort_index since we know the order we need
-    len_df = len(frame)
-    n_uniques = len(ordered_stack_cols_unique)
-    indexer = np.arange(n_uniques)
-    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
-    result = result.take(idxs)
-
-    # Reshape/rename if needed and dropna
-    if result.ndim == 2 and frame.columns.nlevels == len(level):
-        if len(result.columns) == 0:
-            result = Series(index=result.index)
-        else:
-            result = result.iloc[:, 0]
-    if result.ndim == 1:
-        result.name = None
-
     return result
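For context on the reshape refactor above: ``stack_reshape`` concatenates one block per unique stacked column label, so the concatenated rows come out label-major, and the ``np.tile``/``np.repeat`` indexer in ``stack_v3`` then interleaves them back into row-major order, which is why no full ``sort_index`` is needed. A standalone NumPy sketch of that arithmetic (illustration only, not pandas internals):

    import numpy as np

    len_df, n_uniques = 3, 2  # 3 input rows, 2 stacked column labels ("A", "B")
    indexer = np.arange(n_uniques)
    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
    # concat order is [A0, A1, A2, B0, B1, B2]; idxs == [0, 3, 1, 4, 2, 5],
    # i.e. row 0's A and B values first, then row 1's, then row 2's.
    print(np.arange(len_df * n_uniques).take(idxs))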
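A note on the test_apply.py changes that follow: the DeprecationWarning the tests match ("DataFrameGroupBy.apply operated on the grouping columns") is the pandas 2.2 deprecation of passing grouping columns to the UDF. A minimal sketch of the opt-in future behavior (not part of this patch):

    import pandas as pd

    df = pd.DataFrame({"groups": [1, 1, 2], "vars": [0, 1, 2]})
    # include_groups=False excludes "groups" from what the UDF sees and
    # silences the warning these tests assert against.
    result = df.groupby("groups").apply(lambda g: g.sum(), include_groups=False)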
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index e27c782c1bdcf4..6f30dcfaaba7e9 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -322,6 +322,8 @@ def test_groupby_as_index_apply():
     tm.assert_index_equal(res_as_apply, exp_as_apply)
     tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
 
+
+def test_groupby_as_index_apply_str():
     ind = Index(list("abcde"))
     df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
     msg = "DataFrameGroupBy.apply operated on the grouping columns"
@@ -379,8 +381,8 @@ def f(piece):
             {"value": piece, "demeaned": piece - piece.mean(), "logged": logged}
         )
 
-    dr = bdate_range("1/1/2000", periods=100)
-    ts = Series(np.random.default_rng(2).standard_normal(100), index=dr)
+    dr = bdate_range("1/1/2000", periods=10)
+    ts = Series(np.random.default_rng(2).standard_normal(10), index=dr)
 
     grouped = ts.groupby(lambda x: x.month, group_keys=False)
     result = grouped.apply(f)
@@ -639,13 +641,13 @@ def reindex_helper(x):
 
 def test_apply_corner_cases():
     # #535, can't use sliding iterator
 
-    N = 1000
+    N = 10
     labels = np.random.default_rng(2).integers(0, 100, size=N)
     df = DataFrame(
         {
             "key": labels,
             "value1": np.random.default_rng(2).standard_normal(N),
-            "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
+            "value2": ["foo", "bar", "baz", "qux", "a"] * (N // 5),
         }
     )
 
@@ -680,6 +682,8 @@ def test_apply_numeric_coercion_when_datetime():
     result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
     tm.assert_series_equal(result["Str"], expected["Str"])
 
+
+def test_apply_numeric_coercion_when_datetime_getitem():
     # GH 15421
     df = DataFrame(
         {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
     )
@@ -695,6 +699,8 @@ def get_B(g):
     expected.index = df.A
     tm.assert_series_equal(result, expected)
 
+
+def test_apply_numeric_coercion_when_datetime_with_nat():
     # GH 14423
     def predictions(tool):
         out = Series(index=["p1", "p2", "useTime"], dtype=object)
@@ -843,10 +849,24 @@ def test_func(x):
     tm.assert_frame_equal(result, expected)
 
 
-def test_groupby_apply_none_first():
+@pytest.mark.parametrize(
+    "in_data, out_idx, out_data",
+    [
+        [
+            {"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]},
+            [[1, 1], [0, 2]],
+            {"groups": [1, 1], "vars": [0, 2]},
+        ],
+        [
+            {"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]},
+            [[2, 2], [1, 3]],
+            {"groups": [2, 2], "vars": [1, 3]},
+        ],
+    ],
+)
+def test_groupby_apply_none_first(in_data, out_idx, out_data):
     # GH 12824. Tests if apply returns None first.
-    test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
-    test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})
+    test_df1 = DataFrame(in_data)
 
     def test_func(x):
         if x.shape[0] < 2:
@@ -856,14 +876,9 @@ def test_func(x):
     msg = "DataFrameGroupBy.apply operated on the grouping columns"
     with tm.assert_produces_warning(DeprecationWarning, match=msg):
         result1 = test_df1.groupby("groups").apply(test_func)
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result2 = test_df2.groupby("groups").apply(test_func)
-    index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
-    index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
-    expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
-    expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)
+    index1 = MultiIndex.from_arrays(out_idx, names=["groups", None])
+    expected1 = DataFrame(out_data, index=index1)
     tm.assert_frame_equal(result1, expected1)
-    tm.assert_frame_equal(result2, expected2)
 
 
 def test_groupby_apply_return_empty_chunk():
@@ -883,18 +898,16 @@ def test_groupby_apply_return_empty_chunk():
     tm.assert_series_equal(result, expected)
 
 
-def test_apply_with_mixed_types():
+@pytest.mark.parametrize("meth", ["apply", "transform"])
+def test_apply_with_mixed_types(meth):
     # gh-20949
     df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
     g = df.groupby("A", group_keys=False)
 
-    result = g.transform(lambda x: x / x.sum())
+    result = getattr(g, meth)(lambda x: x / x.sum())
     expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]})
     tm.assert_frame_equal(result, expected)
 
-    result = g.apply(lambda x: x / x.sum())
-    tm.assert_frame_equal(result, expected)
-
 
 def test_func_returns_object():
     # GH 28652
@@ -1106,7 +1119,7 @@ def test_apply_function_with_indexing_return_column():
 
 @pytest.mark.parametrize(
     "udf",
-    [(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))],
+    [lambda x: x.copy(), lambda x: x.copy().rename(lambda y: y + 1)],
 )
 @pytest.mark.parametrize("group_keys", [True, False])
 def test_apply_result_type(group_keys, udf):
@@ -1214,7 +1227,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
     expected = df.iloc[[0, 2, 3]]
     expected = expected.reset_index()
     expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]])
-    expected = expected.drop(columns="idx")
+    expected = expected.drop(columns=["idx"])
 
     tm.assert_frame_equal(result, expected)
     for val in result.index.levels[1]:
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 5a43a42aa936f8..2194e5692aa0e5 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -82,7 +82,7 @@ def get_stats(group):
     assert result.index.names[0] == "C"
 
 
-def test_basic(using_infer_string):  # TODO: split this test
+def test_basic():
     cats = Categorical(
         ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
         categories=["a", "b", "c", "d"],
@@ -95,17 +95,20 @@ def test_basic(using_infer_string):  # TODO: split this test
     result = data.groupby("b", observed=False).mean()
     tm.assert_frame_equal(result, expected)
 
+
+def test_basic_single_grouper():
     cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
     cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
     df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
 
-    # single grouper
     gb = df.groupby("A", observed=False)
     exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
     expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
     result = gb.sum(numeric_only=True)
     tm.assert_frame_equal(result, expected)
 
+
+def test_basic_string(using_infer_string):
     # GH 8623
     x = DataFrame(
         [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
@@ -133,8 +136,9 @@ def f(x):
         expected["person_name"] = expected["person_name"].astype(dtype)
     tm.assert_frame_equal(result, expected)
 
+
+def test_basic_monotonic():
     # GH 9921
-    # Monotonic
     df = DataFrame({"a": [5, 15, 25]})
     c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
 
@@ -165,7 +169,8 @@ def f(x):
     tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"])
     tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)
 
-    # Non-monotonic
+
+def test_basic_non_monotonic():
     df = DataFrame({"a": [5, 15, 25, -5]})
     c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
 
@@ -183,6 +188,8 @@ def f(x):
         df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]
     )
 
+
+def test_basic_cut_grouping():
     # GH 9603
     df = DataFrame({"a": [1, 0, 0, 0]})
     c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
@@ -193,13 +200,14 @@ def f(x):
     expected.index.name = "a"
     tm.assert_series_equal(result, expected)
 
-    # more basic
+
+def test_more_basic():
     levels = ["foo", "bar", "baz", "qux"]
-    codes = np.random.default_rng(2).integers(0, 4, size=100)
+    codes = np.random.default_rng(2).integers(0, 4, size=10)
 
     cats = Categorical.from_codes(codes, levels, ordered=True)
 
-    data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))
+    data = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
 
     result = data.groupby(cats, observed=False).mean()
 
@@ -225,9 +233,9 @@ def f(x):
     # GH 10460
     expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
     exp = CategoricalIndex(expc)
-    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
+    tm.assert_index_equal(desc_result.stack().index.get_level_values(0), exp)
     exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
-    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
+    tm.assert_index_equal(desc_result.stack().index.get_level_values(1), exp)
 
 
 def test_level_get_group(observed):
@@ -352,6 +360,8 @@ def test_observed(observed):
 
     tm.assert_frame_equal(result, expected)
 
+
+def test_observed_single_column(observed):
     # https://github.com/pandas-dev/pandas/issues/8138
     d = {
         "cat": Categorical(
             ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
         ),
         "ints": [1, 1, 2, 2],
         "val": [10, 20, 30, 40],
     }
     df = DataFrame(d)
 
-    # Grouping on a single column
     groups_single_key = df.groupby("cat", observed=observed)
     result = groups_single_key.mean()
@@ -378,7 +387,17 @@ def test_observed(observed):
 
     tm.assert_frame_equal(result, expected)
 
-    # Grouping on two columns
+
+def test_observed_two_columns(observed):
+    # https://github.com/pandas-dev/pandas/issues/8138
+    d = {
+        "cat": Categorical(
+            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
+        ),
+        "ints": [1, 1, 2, 2],
+        "val": [10, 20, 30, 40],
+    }
+    df = DataFrame(d)
     groups_double_key = df.groupby(["cat", "ints"], observed=observed)
     result = groups_double_key.agg("mean")
     expected = DataFrame(
@@ -404,6 +423,8 @@ def test_observed(observed):
         expected = df[(df.cat == c) & (df.ints == i)]
         tm.assert_frame_equal(result, expected)
 
+
+def test_observed_with_as_index(observed):
     # gh-8869
     # with as_index
     d = {
@@ -591,7 +612,6 @@ def test_dataframe_categorical_with_nan(observed):
 
 @pytest.mark.parametrize("ordered", [True, False])
-@pytest.mark.parametrize("observed", [True, False])
 def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
     # GH 25871: Fix groupby sorting on ordered Categoricals
     # GH 25167: Groupby with observed=True doesn't sort
@@ -627,11 +647,11 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
 def test_datetime():
     # GH9049: ensure backward compatibility
     levels = pd.date_range("2014-01-01", periods=4)
-    codes = np.random.default_rng(2).integers(0, 4, size=100)
+    codes = np.random.default_rng(2).integers(0, 4, size=10)
 
     cats = Categorical.from_codes(codes, levels, ordered=True)
 
-    data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))
+    data = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
     result = data.groupby(cats, observed=False).mean()
 
     expected = data.groupby(np.asarray(cats), observed=False).mean()
@@ -832,7 +852,10 @@ def test_preserve_categories():
         df.groupby("A", sort=False, observed=False).first().index, nosort_index
     )
 
-    # ordered=False
+
+def test_preserve_categories_ordered_false():
+    # GH-13179
+    categories = list("abc")
     df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
     sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
     # GH#48749 - don't change order of categories
@@ -846,7 +869,8 @@ def test_preserve_categories():
     )
 
 
-def test_preserve_categorical_dtype():
+@pytest.mark.parametrize("col", ["C1", "C2"])
+def test_preserve_categorical_dtype(col):
     # GH13743, GH13854
     df = DataFrame(
         {
@@ -865,18 +889,15 @@ def test_preserve_categorical_dtype():
             "C2": Categorical(list("bac"), categories=list("bac"), ordered=True),
         }
     )
-    for col in ["C1", "C2"]:
-        result1 = df.groupby(by=col, as_index=False, observed=False).mean(
-            numeric_only=True
-        )
-        result2 = (
-            df.groupby(by=col, as_index=True, observed=False)
-            .mean(numeric_only=True)
-            .reset_index()
-        )
-        expected = exp_full.reindex(columns=result1.columns)
-        tm.assert_frame_equal(result1, expected)
-        tm.assert_frame_equal(result2, expected)
+    result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True)
+    result2 = (
+        df.groupby(by=col, as_index=True, observed=False)
+        .mean(numeric_only=True)
+        .reset_index()
+    )
+    expected = exp_full.reindex(columns=result1.columns)
+    tm.assert_frame_equal(result1, expected)
+    tm.assert_frame_equal(result2, expected)
 
 
 @pytest.mark.parametrize(
@@ -931,6 +952,8 @@ def test_categorical_no_compress():
     )
     tm.assert_series_equal(result, exp)
 
+
+def test_categorical_no_compress_string():
     cats = Categorical(
         ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
         categories=["a", "b", "c", "d"],
@@ -965,7 +988,7 @@ def test_sort():
     # has a sorted x axis
     # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
 
-    df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
+    df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)})
     labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)]
     cat_labels = Categorical(labels, labels)
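For context on the ``observed`` fixture threaded through the split tests above: with ``observed=False`` the result is reindexed over all categories, while ``observed=True`` keeps only categories present in the data. A minimal sketch (not part of this patch):

    import pandas as pd

    cat = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])
    df = pd.DataFrame({"cat": cat, "val": [1, 2, 3]})
    print(df.groupby("cat", observed=False)["val"].sum())  # "c" appears with sum 0
    print(df.groupby("cat", observed=True)["val"].sum())   # only "a" and "b"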
diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py
index 2622895f9f8d21..47ad18c9ad2c83 100644
--- a/pandas/tests/groupby/test_counting.py
+++ b/pandas/tests/groupby/test_counting.py
@@ -321,19 +321,22 @@ def test_count_object():
     expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
     tm.assert_series_equal(result, expected)
 
+
+def test_count_object_nan():
     df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
     result = df.groupby("c").a.count()
     expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
     tm.assert_series_equal(result, expected)
 
 
-def test_count_cross_type():
+@pytest.mark.parametrize("typ", ["object", "float32"])
+def test_count_cross_type(typ):
     # GH8169
     # Set float64 dtype to avoid upcast when setting nan below
     vals = np.hstack(
         (
-            np.random.default_rng(2).integers(0, 5, (100, 2)),
-            np.random.default_rng(2).integers(0, 2, (100, 2)),
+            np.random.default_rng(2).integers(0, 5, (10, 2)),
+            np.random.default_rng(2).integers(0, 2, (10, 2)),
         )
     ).astype("float64")
 
@@ -341,11 +344,10 @@ def test_count_cross_type():
     df[df == 2] = np.nan
     expected = df.groupby(["c", "d"]).count()
 
-    for t in ["float32", "object"]:
-        df["a"] = df["a"].astype(t)
-        df["b"] = df["b"].astype(t)
-        result = df.groupby(["c", "d"]).count()
-        tm.assert_frame_equal(result, expected)
+    df["a"] = df["a"].astype(typ)
+    df["b"] = df["b"].astype(typ)
+    result = df.groupby(["c", "d"]).count()
+    tm.assert_frame_equal(result, expected)
 
 
 def test_lower_int_prec_count():
diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py
index 28dcb38d173f2e..b0a0414c1feb25 100644
--- a/pandas/tests/groupby/test_cumulative.py
+++ b/pandas/tests/groupby/test_cumulative.py
@@ -94,21 +94,28 @@ def test_groupby_cumprod_nan_influences_other_columns():
 
 def test_cummin(dtypes_for_minmax):
     dtype = dtypes_for_minmax[0]
-    min_val = dtypes_for_minmax[1]
 
     # GH 15048
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
     expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
 
     df = base_df.astype(dtype)
-
     expected = DataFrame({"B": expected_mins}).astype(dtype)
     result = df.groupby("A").cummin()
     tm.assert_frame_equal(result, expected)
     result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
     tm.assert_frame_equal(result, expected)
 
-    # Test w/ min value for dtype
+
+def test_cummin_min_value_for_dtype(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    min_val = dtypes_for_minmax[1]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
+    expected = DataFrame({"B": expected_mins}).astype(dtype)
+    df = base_df.astype(dtype)
     df.loc[[2, 6], "B"] = min_val
     df.loc[[1, 5], "B"] = min_val + 1
     expected.loc[[2, 3, 6, 7], "B"] = min_val
@@ -120,8 +127,10 @@ def test_cummin(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected, check_exact=True)
 
-    # Test nan in some values
+
+def test_cummin_nan_in_some_values(dtypes_for_minmax):
     # Explicit cast to float to avoid implicit cast when setting nan
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
     base_df = base_df.astype({"B": "float"})
     base_df.loc[[0, 2, 4, 6], "B"] = np.nan
     expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
@@ -132,6 +141,8 @@ def test_cummin(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected)
 
+
+def test_cummin_datetime():
     # GH 15561
     df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
     expected = Series(pd.to_datetime("2001"), index=[0], name="b")
 
     result = df.groupby("a")["b"].cummin()
     tm.assert_series_equal(expected, result)
 
+
+def test_cummin_getattr_series():
     # GH 15635
     df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
     result = df.groupby("a").b.cummin()
@@ -163,7 +176,6 @@ def test_cummin_max_all_nan_column(method, dtype):
 
 def test_cummax(dtypes_for_minmax):
     dtype = dtypes_for_minmax[0]
-    max_val = dtypes_for_minmax[2]
 
     # GH 15048
     base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
@@ -177,8 +189,18 @@ def test_cummax(dtypes_for_minmax):
     result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
     tm.assert_frame_equal(result, expected)
 
-    # Test w/ max value for dtype
+
+def test_cummax_max_value_for_dtype(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    max_val = dtypes_for_minmax[2]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
+
+    df = base_df.astype(dtype)
     df.loc[[2, 6], "B"] = max_val
+    expected = DataFrame({"B": expected_maxs}).astype(dtype)
     expected.loc[[2, 3, 6, 7], "B"] = max_val
     result = df.groupby("A").cummax()
     tm.assert_frame_equal(result, expected)
@@ -187,8 +209,11 @@ def test_cummax(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected)
 
+
+def test_cummax_nan_in_some_values(dtypes_for_minmax):
     # Test nan in some values
     # Explicit cast to float to avoid implicit cast when setting nan
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
     base_df = base_df.astype({"B": "float"})
     base_df.loc[[0, 2, 4, 6], "B"] = np.nan
     expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
@@ -199,6 +224,8 @@ def test_cummax(dtypes_for_minmax):
     )
     tm.assert_frame_equal(result, expected)
 
+
+def test_cummax_datetime():
     # GH 15561
     df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
     expected = Series(pd.to_datetime("2001"), index=[0], name="b")
 
     result = df.groupby("a")["b"].cummax()
     tm.assert_series_equal(expected, result)
 
+
+def test_cummax_getattr_series():
     # GH 15635
     df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
     result = df.groupby("a").b.cummax()
@@ -292,15 +321,12 @@ def test_nullable_int_not_cast_as_float(method, dtype, val):
     tm.assert_frame_equal(result, expected)
 
 
-def test_cython_api2():
+def test_cython_api2(as_index):
     # this takes the fast apply path
     # cumsum (GH5614)
+    # GH 5755 - cumsum is a transformer and should ignore as_index
     df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
     expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
-    result = df.groupby("A").cumsum()
-    tm.assert_frame_equal(result, expected)
-
-    # GH 5755 - cumsum is a transformer and should ignore as_index
-    result = df.groupby("A", as_index=False).cumsum()
+    result = df.groupby("A", as_index=as_index).cumsum()
     tm.assert_frame_equal(result, expected)
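Why ``test_cython_api2`` can absorb the ``as_index`` fixture: ``cumsum`` is a transformation, so its result is aligned to the input index and ``as_index`` has no effect on it. A minimal sketch (not part of this patch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
    # identical output regardless of as_index
    pd.testing.assert_frame_equal(
        df.groupby("A", as_index=True).cumsum(),
        df.groupby("A", as_index=False).cumsum(),
    )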
diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py
index a34170e9b55dbf..04883b3ef6b786 100644
--- a/pandas/tests/groupby/test_filters.py
+++ b/pandas/tests/groupby/test_filters.py
@@ -85,6 +85,9 @@ def test_filter_out_no_groups():
     grouped = s.groupby(grouper)
     filtered = grouped.filter(lambda x: x.mean() > 0)
     tm.assert_series_equal(filtered, s)
+
+
+def test_filter_out_no_groups_dataframe():
     df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
     grouper = df["A"].apply(lambda x: x % 2)
     grouped = df.groupby(grouper)
@@ -100,6 +103,9 @@ def test_filter_out_all_groups_in_df():
     expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
     tm.assert_frame_equal(expected, res)
 
+
+def test_filter_out_all_groups_in_df_dropna_true():
+    # GH12768
     df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
     res = df.groupby("a")
     res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
@@ -179,7 +185,7 @@ def test_filter_pdna_is_false():
 
 def test_filter_against_workaround_ints():
     # Series of ints
-    s = Series(np.random.default_rng(2).integers(0, 100, 100))
+    s = Series(np.random.default_rng(2).integers(0, 100, 10))
     grouper = s.apply(lambda x: np.round(x, -1))
     grouped = s.groupby(grouper)
     f = lambda x: x.mean() > 10
@@ -191,7 +197,7 @@ def test_filter_against_workaround_ints():
 
 def test_filter_against_workaround_floats():
     # Series of floats
-    s = 100 * Series(np.random.default_rng(2).random(100))
+    s = 100 * Series(np.random.default_rng(2).random(10))
     grouper = s.apply(lambda x: np.round(x, -1))
     grouped = s.groupby(grouper)
     f = lambda x: x.mean() > 10
@@ -203,13 +209,13 @@ def test_filter_against_workaround_floats():
 def test_filter_against_workaround_dataframe():
     # Set up DataFrame of ints, floats, strings.
     letters = np.array(list(ascii_lowercase))
-    N = 100
+    N = 10
     random_letters = letters.take(
         np.random.default_rng(2).integers(0, 26, N, dtype=int)
     )
     df = DataFrame(
         {
-            "ints": Series(np.random.default_rng(2).integers(0, 100, N)),
+            "ints": Series(np.random.default_rng(2).integers(0, 10, N)),
             "floats": N / 10 * Series(np.random.default_rng(2).random(N)),
             "letters": Series(random_letters),
         }
     )
@@ -217,26 +223,26 @@ def test_filter_against_workaround_dataframe():
 
     # Group by ints; filter on floats.
     grouped = df.groupby("ints")
-    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")]
-    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
+    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2)
     tm.assert_frame_equal(new_way, old_way)
 
     # Group by floats (rounded); filter on strings.
     grouper = df.floats.apply(lambda x: np.round(x, -1))
     grouped = df.groupby(grouper)
-    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")]
-    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
+    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: len(x.letters) < N / 2)
     tm.assert_frame_equal(new_way, old_way)
 
     # Group by strings; filter on ints.
     grouped = df.groupby("letters")
-    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")]
-    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
+    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2)
     tm.assert_frame_equal(new_way, old_way)
 
 
 def test_filter_using_len():
-    # BUG GH4447
+    # GH 4447
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
     grouped = df.groupby("B")
     actual = grouped.filter(lambda x: len(x) > 2)
@@ -250,8 +256,10 @@ def test_filter_using_len():
     expected = df.loc[[]]
     tm.assert_frame_equal(actual, expected)
 
-    # Series have always worked properly, but we'll test anyway.
-    s = df["B"]
+
+def test_filter_using_len_series():
+    # GH 4447
+    s = Series(list("aabbbbcc"), name="B")
     grouped = s.groupby(s)
     actual = grouped.filter(lambda x: len(x) > 2)
     expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B")
@@ -262,10 +270,14 @@ def test_filter_using_len():
     tm.assert_series_equal(actual, expected)
 
 
-def test_filter_maintains_ordering():
-    # Simple case: index is sequential. #4621
+@pytest.mark.parametrize(
+    "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]]
+)
+def test_filter_maintains_ordering(index):
+    # GH 4621
     df = DataFrame(
-        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
+        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
+        index=index,
     )
     s = df["pid"]
     grouped = df.groupby("tag")
@@ -278,33 +290,6 @@ def test_filter_maintains_ordering(index):
     expected = s.iloc[[1, 2, 4, 7]]
     tm.assert_series_equal(actual, expected)
 
-    # Now index is sequentially decreasing.
-    df.index = np.arange(len(df) - 1, -1, -1)
-    s = df["pid"]
-    grouped = df.groupby("tag")
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = df.iloc[[1, 2, 4, 7]]
-    tm.assert_frame_equal(actual, expected)
-
-    grouped = s.groupby(df["tag"])
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = s.iloc[[1, 2, 4, 7]]
-    tm.assert_series_equal(actual, expected)
-
-    # Index is shuffled.
-    SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
-    df.index = df.index[SHUFFLED]
-    s = df["pid"]
-    grouped = df.groupby("tag")
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = df.iloc[[1, 2, 4, 7]]
-    tm.assert_frame_equal(actual, expected)
-
-    grouped = s.groupby(df["tag"])
-    actual = grouped.filter(lambda x: len(x) > 1)
-    expected = s.iloc[[1, 2, 4, 7]]
-    tm.assert_series_equal(actual, expected)
-
 
 def test_filter_multiple_timestamp():
     # GH 10114
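The "workaround" tests above assert an equivalence worth stating directly: ``filter`` keeps or drops whole groups, which matches masking the frame with a groupwise ``transform``. A minimal sketch (not part of this patch):

    import pandas as pd

    df = pd.DataFrame({"key": list("aabbb"), "x": [1, 2, 3, 4, 5]})
    grouped = df.groupby("key")
    new_way = grouped.filter(lambda g: len(g) > 2)  # keeps group "b" intact
    old_way = df[grouped["x"].transform(lambda g: len(g) > 2).astype("bool")]
    pd.testing.assert_frame_equal(new_way, old_way)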