From d0fe636b7357bf16d17e33dc1dd09ad25fd63a16 Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Wed, 30 Oct 2019 11:29:07 -0700
Subject: [PATCH] Fixed segfaults and incorrect results in GroupBy.quantile with NA Values in Grouping (#29173)

---
 doc/source/whatsnew/v1.0.0.rst        |  1 +
 pandas/_libs/groupby.pyx              |  3 +++
 pandas/tests/groupby/test_function.py | 23 +++++++++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index fa1669b1f3343..a9218650d4fe7 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -411,6 +411,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
 - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
 - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`)
+- Bug in :meth:`DataFrameGroupBy.quantile` where NA values in the grouping could cause segfaults or incorrect results (:issue:`28882`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 71de80da699e9..b4300c162156f 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -766,6 +766,9 @@ def group_quantile(ndarray[float64_t] out,
     with nogil:
         for i in range(N):
             lab = labels[i]
+            if lab == -1:  # NA group label
+                continue
+
             counts[lab] += 1
             if not mask[i]:
                 non_na_counts[lab] += 1
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 571e710ba8928..2d7dfe49dc038 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -1373,6 +1373,29 @@ def test_quantile_out_of_bounds_q_raises():
         g.quantile(-1)
 
 
+def test_quantile_missing_group_values_no_segfaults():
+    # GH 28662
+    data = np.array([1.0, np.nan, 1.0])
+    df = pd.DataFrame(dict(key=data, val=range(3)))
+
+    # Random segfaults; would have been guaranteed in loop
+    grp = df.groupby("key")
+    for _ in range(100):
+        grp.quantile()
+
+
+def test_quantile_missing_group_values_correct_results():
+    # GH 28662
+    data = np.array([1.0, np.nan, 3.0, np.nan])
+    df = pd.DataFrame(dict(key=data, val=range(4)))
+
+    result = df.groupby("key").quantile()
+    expected = pd.DataFrame(
+        [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"]
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 # pipe
 # --------------------------------
 