From b8a19bf12387c756ed3572f7b22d8692550a79eb Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Sun, 26 Apr 2020 18:33:09 +0200 Subject: [PATCH] BUG: Fix quantile calculation. Only move -1 labels if there are any labels #33200 --- pandas/_libs/groupby.pyx | 76 +++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 46db384591bd57..278f7b72291350 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -779,47 +779,49 @@ def group_quantile(ndarray[float64_t] out, non_na_counts[lab] += 1 if labels.any(): - # Get an index of values sorted by labels and then values + # Put '-1' (NaN) labels as the last group so it does not interfere + # with the calculations. labels[labels==-1] = np.max(labels) + 1 - order = (values, labels) - sort_arr= np.lexsort(order).astype(np.int64, copy=False) - with nogil: - for i in range(ngroups): - # Figure out how many group elements there are - grp_sz = counts[i] - non_na_sz = non_na_counts[i] + # Get an index of values sorted by labels and then values + order = (values, labels) + sort_arr = np.lexsort(order).astype(np.int64, copy=False) + with nogil: + for i in range(ngroups): + # Figure out how many group elements there are + grp_sz = counts[i] + non_na_sz = non_na_counts[i] - if non_na_sz == 0: - out[i] = NaN + if non_na_sz == 0: + out[i] = NaN + else: + # Calculate where to retrieve the desired value + # Casting to int will intentionally truncate result + idx = grp_start + (q * (non_na_sz - 1)) + + val = values[sort_arr[idx]] + # If requested quantile falls evenly on a particular index + # then write that index's value out. Otherwise interpolate + q_idx = q * (non_na_sz - 1) + frac = q_idx % 1 + + if frac == 0.0 or interp == INTERPOLATION_LOWER: + out[i] = val else: - # Calculate where to retrieve the desired value - # Casting to int will intentionally truncate result - idx = grp_start + (q * (non_na_sz - 1)) - - val = values[sort_arr[idx]] - # If requested quantile falls evenly on a particular index - # then write that index's value out. Otherwise interpolate - q_idx = q * (non_na_sz - 1) - frac = q_idx % 1 - - if frac == 0.0 or interp == INTERPOLATION_LOWER: - out[i] = val - else: - next_val = values[sort_arr[idx + 1]] - if interp == INTERPOLATION_LINEAR: - out[i] = val + (next_val - val) * frac - elif interp == INTERPOLATION_HIGHER: + next_val = values[sort_arr[idx + 1]] + if interp == INTERPOLATION_LINEAR: + out[i] = val + (next_val - val) * frac + elif interp == INTERPOLATION_HIGHER: + out[i] = next_val + elif interp == INTERPOLATION_MIDPOINT: + out[i] = (val + next_val) / 2.0 + elif interp == INTERPOLATION_NEAREST: + if frac > .5 or (frac == .5 and q > .5): # Always OK? out[i] = next_val - elif interp == INTERPOLATION_MIDPOINT: - out[i] = (val + next_val) / 2.0 - elif interp == INTERPOLATION_NEAREST: - if frac > .5 or (frac == .5 and q > .5): # Always OK? - out[i] = next_val - else: - out[i] = val - - # Increment the index reference in sorted_arr for the next group - grp_start += grp_sz + else: + out[i] = val + + # Increment the index reference in sorted_arr for the next group + grp_start += grp_sz # ----------------------------------------------------------------------