Fix float grouping (#2791)

JuliaData · Jun 28, 2021 · f0b5a57 · f0b5a57
1 parent d701c09
commit f0b5a57
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 12 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -14,6 +14,10 @@
 
 ## Bug fixes
 
+* fix bug in how `groupby` handles grouping of float columns;
+  now `-0.0` is treated as *not integer* when deciding on which
+  grouping algorithm should be used
+  ([#2791](https://github.com/JuliaData/DataFrames.jl/pull/2791))
 * fix bug in how `issorted` handles custom orderings and improve performance
   of sorting when complex custom orderings are passed
   ([#2746](https://github.com/JuliaData/DataFrames.jl/pull/2746))

diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
@@ -45,14 +45,8 @@ into row groups.
   ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
 - `sort` : whether to sort groups according to the values of the grouping columns
   `cols`; if `sort=false` then the order of groups in the result is undefined
-  and may change in future releases. In the current implementation
-  groups are ordered following the order of appearance of values in the grouping
-  columns, except when all grouping columns provide non-`nothing`
-  `DataAPI.refpool` in which case the order of groups follows the order of
-  values returned by `DataAPI.refpool`. As a particular application of this rule
-  if all `cols` are `CategoricalVector`s then groups are always sorted
-  Integer columns with a narrow range also use this this optimization, so
-  to the order of groups when grouping on integer columns is undefined.
+  and may change in future releases; below a description of the current
+  implementation is provided.
 - `skipmissing` : whether to skip groups with `missing` values in one of the
   grouping columns `cols`
 
@@ -79,6 +73,18 @@ an `AbstractDict` can be used to index into a grouped data frame where
 the keys are column names of the data frame. The order of the keys does
 not matter in this case.
 
+In the current implementation if `sort=false` groups are ordered following the
+order of appearance of values in the grouping columns, except when all grouping
+columns provide non-`nothing` `DataAPI.refpool`, in which case the order of groups
+follows the order of values returned by `DataAPI.refpool`. As a particular application
+of this rule if all `cols` are `CategoricalVector`s then groups are always sorted.
+Integer columns with a narrow range also use this this optimization, so to the
+order of groups when grouping on integer columns is undefined.
+A column is considered to be an integer column when deciding on the grouping
+algorithm choice if its `eltype` is a subtype of `Union{Missing, Real}`,
+all its elements are either `missing` or pass `isinteger` test,
+and none of them is equal to `-0.0`.
+
 # See also
 
 [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref)

diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
@@ -143,10 +143,19 @@ function refpool_and_array(x::AbstractArray)
         else
             return nothing, nothing
         end
-    elseif x isa AbstractArray{<:Union{Real, Missing}} &&
-        all(v -> ismissing(v) | isinteger(v), x) &&
-        !isempty(skipmissing(x))
-        minval, maxval = extrema(skipmissing(x))
+    elseif x isa AbstractArray{<:Union{Real, Missing}} && length(x) > 0 &&
+        (x isa AbstractArray{<:Union{Integer, Missing}} ||
+         all(v -> (ismissing(v) | isinteger(v)) & !isequal(v, -0.0), x))
+        if Missing <: eltype(x)
+            smx = skipmissing(x)
+            if isempty(smx)
+                return nothing, nothing
+            else
+                minval, maxval = extrema(smx)
+            end
+        else
+            minval, maxval = extrema(x)
+        end
         ngroups = big(maxval) - big(minval) + 1
         # Threshold chosen with the same rationale as the row_group_slots refpool method:
         # refpool approach is faster but we should not allocate too much memory either

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -3710,6 +3710,14 @@ end
     end
 end
 
+@testset "grouping floats" begin
+    @test length(groupby_checked(DataFrame(a=[0.0, -0.0]), :a)) == 2
+    @test getindex.(keys(groupby_checked(DataFrame(a=[3.0, 2.0, 0.0]), :a)), 1) ==
+          [0, 2, 3]
+    @test getindex.(keys(groupby_checked(DataFrame(a=[3.0, 2.0, -0.0]), :a)), 1) ==
+          [3, 2, 0]
+end
+
 @testset "aggregation with matrix of Pair" begin
     df = DataFrame(a=["a", "b","a", "b"], x=1:4, y=11:14)
     gdf = groupby_checked(df, :a)