Use CategoricalArray instead of NominalArray

CategoricalArray is the new type in CategoricalArrays 0.0.5 that merges NominalArray and OrdinalArray.
JuliaData · Sep 22, 2016 · 2ec131e · kmsquire · Nov 1, 2016 · nalimilan
1 parent 63c1d96
commit 2ec131e
Show file tree

Hide file tree

Showing 20 changed files with 93 additions and 93 deletions.
diff --git a/REQUIRE b/REQUIRE
@@ -1,6 +1,6 @@
 julia 0.4
 NullableArrays 0.0.8
-CategoricalArrays 0.0.4
+CategoricalArrays 0.0.5
 StatsBase 0.8.3
 GZip
 SortingAlgorithms

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -411,7 +411,7 @@ function StatsBase.describe{T<:Number}(io, nv::AbstractArray{T})
     return
 end
 function StatsBase.describe{T}(io, nv::AbstractArray{T})
-    ispooled = isa(nv, NominalVector) ? "Pooled " : ""
+    ispooled = isa(nv, CategoricalVector) ? "Pooled " : ""
     nulls = countnull(nv)
     # if nothing else, just give the length and element type and NA count
     println(io, "Length    $(length(nv))")
@@ -650,7 +650,7 @@ unique!(df)  # modifies df
 function nonuniquekey(df::AbstractDataFrame)
     # Here's another (probably a lot faster) way to do `nonunique`
     # by grouping on all columns. It will fail if columns cannot be
-    # made into NominalVector's.
+    # made into CategoricalVector's.
     gd = groupby(df, _names(df))
     idx = [1:length(gd.idx)][gd.idx][gd.starts]
     res = fill(true, nrow(df))

diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
@@ -9,11 +9,8 @@ similar_nullable{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{
 similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
     NullableArray(eltype(T), dims)
 
-similar_nullable{T,R}(dv::NominalArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
-    NullableNominalArray(T, dims)
+similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
+    NullableCategoricalArray(T, dims)
 
-similar_nullable{T,R}(dv::OrdinalArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
-    NullableOrdinalArray(T, dims)
-
 similar_nullable(df::AbstractDataFrame, dims::Int) =
     DataFrame(Any[similar_nullable(x, dims) for x in columns(df)], copy(index(df)))
@@ -106,9 +106,9 @@ function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArr
             refs2[i] = tidx2[v2.refs[i]]
         end
     end
-    pool = NominalPool{S, R}(index)
-    return (NominalArray(refs1, pool),
-            NominalArray(refs2, pool))
+    pool = CategoricalPool{S, R}(index)
+    return (CategoricalArray(refs1, pool),
+            CategoricalArray(refs2, pool))
 end
 
 function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}},
@@ -181,19 +181,19 @@ function sharepools(v1::AbstractArray,
         end
     end
 
-    pool = NominalPool(pool)
-    return (NominalArray(refs1, pool),
-            NominalArray(refs2, pool))
+    pool = CategoricalPool(pool)
+    return (CategoricalArray(refs1, pool),
+            CategoricalArray(refs2, pool))
 end
 
 sharepools(v1::NullableArray, v2::NullableArray) =
-    sharepools(NullableNominalArray(v1), NullableNominalArray(v2))
+    sharepools(NullableCategoricalArray(v1), NullableCategoricalArray(v2))
 
 sharepools(v1::AbstractArray, v2::NullableArray) =
-    sharepools(v1, NullableNominalArray(v2))
+    sharepools(v1, NullableCategoricalArray(v2))
 
 sharepools(v1::NullableArray, v2::AbstractArray) =
-    sharepools(NullableNominalArray(v2), v1)
+    sharepools(NullableCategoricalArray(v1), v2)
 
 function sharepools(df1::AbstractDataFrame, df2::AbstractDataFrame)
     # This method exists to allow merge to work with multiple columns.

diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -164,9 +164,9 @@ function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
     # `rowkey` integer indicating which column to place along rows
     # `colkey` integer indicating which column to place along column headers
     # `value` integer indicating which column has values
-    refkeycol = NullableNominalArray(df[rowkey])
+    refkeycol = NullableCategoricalArray(df[rowkey])
     valuecol = df[value]
-    keycol = NullableNominalArray(df[colkey])
+    keycol = NullableCategoricalArray(df[colkey])
     Nrow = length(refkeycol.pool)
     Ncol = length(keycol.pool)
     T = eltype(valuecol)
@@ -204,7 +204,7 @@ function unstack(df::AbstractDataFrame, colkey::Int, value::Int)
     for i in 1:length(groupidxs)
         rowkey[groupidxs[i]] = i
     end
-    keycol = NullableNominalArray(df[colkey])
+    keycol = NullableCategoricalArray(df[colkey])
     valuecol = df[value]
     df1 = df[g.idx[g.starts], g.cols]
     Nrow = length(g)
@@ -297,7 +297,7 @@ Base.ndims(v::StackedVector) = 1
 Base.eltype(v::StackedVector) = promote_type(map(eltype, v.components)...)
 Base.similar(v::StackedVector, T, dims::Dims) = similar(v.components[1], T, dims)
 
-CategoricalArrays.NominalArray(v::StackedVector) = NominalArray(v[:]) # could be more efficient
+CategoricalArrays.CategoricalArray(v::StackedVector) = CategoricalArray(v[:]) # could be more efficient
 
 
 """
@@ -357,8 +357,8 @@ Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(v.parent), v.inner, v.o
 Base.similar(v::RepeatedVector, T, dims::Dims) = similar(v.parent, T, dims)
 Base.unique(v::RepeatedVector) = unique(v.parent)
 
-function CategoricalArrays.NominalArray(v::RepeatedVector)
-    res = CategoricalArrays.NominalArray(v.parent)
+function CategoricalArrays.CategoricalArray(v::RepeatedVector)
+    res = CategoricalArrays.CategoricalArray(v.parent)
     res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer])
     res
 end

diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -2,7 +2,7 @@
 An AbstractDataFrame that stores a set of named columns
 
 The columns are normally AbstractVectors stored in memory,
-particularly a Vector, NullableVector, or NominalVector.
+particularly a Vector, NullableVector, or CategoricalVector.
 
 **Constructors**
 
@@ -135,7 +135,7 @@ function DataFrame(column_eltypes::Vector{DataType}, cnames::Vector{Symbol},
     columns = Array(Any, p)
     for j in 1:p
       if nominal[j]
-        columns[j] = NullableNominalArray(column_eltypes[j], nrows)
+        columns[j] = NullableCategoricalArray(column_eltypes[j], nrows)
       else
         columns[j] = NullableArray(column_eltypes[j], nrows)
       end
@@ -364,8 +364,7 @@ function insert_multiple_entries!{T <: Real}(df::DataFrame,
 end
 
 upgrade_vector{T<:Nullable}(v::AbstractArray{T}) = v
-upgrade_vector(v::NominalArray) = NullableNominalArray(v)
-upgrade_vector(v::OrdinalArray) = NullableOrdinalArray(v)
+upgrade_vector(v::CategoricalArray) = NullableCategoricalArray(v)
 upgrade_vector(v::AbstractArray) = NullableArray(v)
 
 function upgrade_scalar(df::DataFrame, v::AbstractArray)
@@ -789,8 +789,8 @@ end
 ##
 ##############################################################################
 
-pool(a::AbstractVector) = compact(NominalArray(a))
-pool{T<:Nullable}(a::AbstractVector{T}) = compact(NullableNominalArray(a))
+pool(a::AbstractVector) = compact(CategoricalArray(a))
+pool{T<:Nullable}(a::AbstractVector{T}) = compact(NullableCategoricalArray(a))
 
 function pool!(df::DataFrame, cname::@compat(Union{Integer, Symbol}))
     df[cname] = pool(df[cname])

diff --git a/src/dataframe/io.jl b/src/dataframe/io.jl
@@ -640,7 +640,7 @@ function builddf(rows::Integer,
         end
 
         if o.makefactors && !(is_int || is_float || is_bool)
-            columns[j] = NullableNominalArray(values, missing)
+            columns[j] = NullableCategoricalArray(values, missing)
         else
             columns[j] = NullableArray(values, missing)
         end
@@ -877,7 +877,7 @@ readtable(filename, [keyword options])
 *   `nastrings::Vector{String}` -- Translate any of the strings into this vector into a NULL value. Defaults to `["", "NULL", "NA"]`.
 *   `truestrings::Vector{String}` -- Translate any of the strings into this vector into a Boolean `true`. Defaults to `["T", "t", "TRUE", "true"]`.
 *   `falsestrings::Vector{String}` -- Translate any of the strings into this vector into a Boolean `false`. Defaults to `["F", "f", "FALSE", "false"]`.
-*   `makefactors::Bool` -- Convert string columns into `NominalVector`'s for use as factors. Defaults to `false`.
+*   `makefactors::Bool` -- Convert string columns into `CategoricalVector`'s for use as factors. Defaults to `false`.
 *   `nrows::Int` -- Read only `nrows` from the file. Defaults to `-1`, which indicates that the entire file should be read.
 *   `names::Vector{Symbol}` -- Use the values in this array as the names for all columns instead of or in lieu of the names in the file's header. Defaults to `[]`, which indicates that the header should be used if present or that numeric names should be invented if there is no header.
 *   `eltypes::Vector` -- Specify the types of all columns. Defaults to `[]`.
@@ -975,7 +975,7 @@ literals. Parses the string `s` containing delimiter-separated tabular data
 argument contains a list of flag characters, which, if present, are equivalent
 to supplying named arguments to `readtable` as follows:
 
-- `f`: `makefactors=true`, convert string columns to `NominalArray` columns
+- `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
 - `c`: `allowcomments=true`, ignore lines beginning with `#`
 - `H`: `header=false`, do not interpret the first line as column names
 """
@@ -1004,7 +1004,7 @@ separated values (CSV) using `readtable`, just as if it were being loaded from
 an external file. The suffix flags `f`, `c`, and `H` are optional. If present,
 they are equivalent to supplying named arguments to `readtable` as follows:
 
-* `f`: `makefactors=true`, convert string columns to `NominalArray` columns
+* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
 * `c`: `allowcomments=true`, ignore lines beginning with `#`
 * `H`: `header=false`, do not interpret the first line as column names
 
@@ -1038,7 +1038,7 @@ character, just as if it were being loaded from an external file. The suffix
 flags `f`, `c`, and `H` are optional. If present, they are equivalent to
 supplying named arguments to `readtable` as follows:
 
-* `f`: `makefactors=true`, convert string columns to `NominalArray` columns
+* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
 * `c`: `allowcomments=true`, ignore lines beginning with `#`
 * `H`: `header=false`, do not interpret the first line as column names
 
@@ -1074,7 +1074,7 @@ loaded from an external file. The suffix flags `f`, `c`, and `H` are optional.
 If present, they are equivalent to supplying named arguments to `readtable` as
 follows:
 
-* `f`: `makefactors=true`, convert string columns to `NominalArray` columns
+* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
 * `c`: `allowcomments=true`, ignore lines beginning with `#`
 * `H`: `header=false`, do not interpret the first line as column names
 
@@ -1107,7 +1107,7 @@ separated values (TSV) using `readtable`, just as if it were being loaded from
 an external file. The suffix flags `f`, `c`, and `H` are optional. If present,
 they are equivalent to supplying named arguments to `readtable` as follows:
 
-* `f`: `makefactors=true`, convert string columns to `NominalArray` columns
+* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
 * `c`: `allowcomments=true`, ignore lines beginning with `#`
 * `H`: `header=false`, do not interpret the first line as column names
 

diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
@@ -123,8 +123,8 @@ function groupby{T}(d::AbstractDataFrame, cols::Vector{T})
     ##     http://wesmckinney.com/blog/?p=489
 
     ncols = length(cols)
-    # use NominalArray to get a set of integer references for each unique item
-    nv = NullableNominalArray(d[cols[ncols]])
+    # use CategoricalArray to get a set of integer references for each unique item
+    nv = NullableCategoricalArray(d[cols[ncols]])
     # if there are NULLs, add 1 to the refs to avoid underflows in x later
     anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0)
     # use UInt32 instead of the original array's integer size since the number of levels can be high
@@ -140,7 +140,7 @@ function groupby{T}(d::AbstractDataFrame, cols::Vector{T})
     ngroups = length(levels(nv)) + anynulls
     # if there's more than 1 column, do roughly the same thing repeatedly
     for j = (ncols - 1):-1:1
-        nv = NullableNominalArray(d[cols[j]])
+        nv = NullableCategoricalArray(d[cols[j]])
         anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0)
         for i = 1:nrow(d)
             if nv.refs[i] != 0

diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -136,15 +136,15 @@ countnull(a::NullableArray) = sum(a.isnull)
 #'
 #' Count the number of missing values in a NullableCategoricalArray.
 #'
-#' @field na::NominalArray The NominalArray whose missing values
+#' @field na::CategoricalArray The CategoricalArray whose missing values
 #'        are to be counted.
 #'
 #' @returns count::Int The number of null values in `a`.
 #'
 #' @examples
 #'
-#' DataFrames.countnull(NominalArray([1, 2, 3]))
-function countnull(a::NominalArray)
+#' DataFrames.countnull(CategoricalArray([1, 2, 3]))
+function countnull(a::CategoricalArray)
     res = 0
     for x in a.refs
         res += x == 0

diff --git a/test/cat.jl b/test/cat.jl
@@ -91,20 +91,20 @@ module TestCat
     end
 
     # Minimal container type promotion
-    dfa = DataFrame(a = NominalArray([1, 2, 2]))
-    dfb = DataFrame(a = NominalArray([2, 3, 4]))
+    dfa = DataFrame(a = CategoricalArray([1, 2, 2]))
+    dfb = DataFrame(a = CategoricalArray([2, 3, 4]))
     dfc = DataFrame(a = NullableArray([2, 3, 4]))
     dfd = DataFrame(Any[2:4], [:a])
     dfab = vcat(dfa, dfb)
     dfac = vcat(dfa, dfc)
     @test isequal(dfab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
     @test isequal(dfac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
-    @test isa(dfab[:a], NullableNominalVector{Int})
+    @test isa(dfab[:a], NullableCategoricalVector{Int})
     # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
     if VERSION >= v"0.5.0-dev"
-        @test isa(dfac[:a], NullableNominalVector{Int})
+        @test isa(dfac[:a], NullableCategoricalVector{Int})
     else
-        @test isa(dfac[:a], NullableNominalVector{Any})
+        @test isa(dfac[:a], NullableCategoricalVector{Any})
     end
     # ^^ container may flip if container promotion happens in Base/DataArrays
     dc = vcat(dfd, dfc)
@@ -118,7 +118,7 @@ module TestCat
     # Missing columns
     rename!(dfd, :a, :b)
     dfda = DataFrame(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]),
-                     a = NullableNominalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2]))
+                     a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2]))
     @test isequal(vcat(dfd, dfa), dfda)
 
     # Alignment

diff --git a/test/constructors.jl b/test/constructors.jl
@@ -10,14 +10,14 @@ module TestConstructors
     @test isequal(df.columns, Any[])
     @test isequal(df.colindex, Index())
 
-    df = DataFrame(Any[NullableNominalVector(zeros(3)),
-                       NullableNominalVector(ones(3))],
+    df = DataFrame(Any[NullableCategoricalVector(zeros(3)),
+                       NullableCategoricalVector(ones(3))],
                    Index([:x1, :x2]))
     @test size(df, 1) == 3
     @test size(df, 2) == 2
 
-    @test isequal(df, DataFrame(Any[NullableNominalVector(zeros(3)),
-                                    NullableNominalVector(ones(3))]))
+    @test isequal(df, DataFrame(Any[NullableCategoricalVector(zeros(3)),
+                                    NullableCategoricalVector(ones(3))]))
     @test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0],
                                 x2 = [1.0, 1.0, 1.0]))
 

diff --git a/test/contrasts.jl b/test/contrasts.jl
@@ -4,7 +4,7 @@ using Base.Test
 using DataFrames
 
 
-d = DataFrame(x = NominalVector([:a, :b, :c, :a, :a, :b]))
+d = DataFrame(x = CategoricalVector([:a, :b, :c, :a, :a, :b]))
 
 mf = ModelFrame(Formula(nothing, :x), d)
 

diff --git a/test/data.jl b/test/data.jl
@@ -78,15 +78,15 @@ module TestData
     N = 20
     #Cast to Int64 as rand() behavior differs between Int32/64
     d1 = NullableArray(rand(map(Int64, 1:2), N))
-    d2 = NullableNominalArray(Nullable{String}["A", "B", Nullable()])[rand(map(Int64, 1:3), N)]
+    d2 = NullableCategoricalArray(Nullable{String}["A", "B", Nullable()])[rand(map(Int64, 1:3), N)]
     d3 = NullableArray(randn(N))
     d4 = NullableArray(randn(N))
     df7 = DataFrame(Any[d1, d2, d3], [:d1, :d2, :d3])
 
     #test_group("groupby")
     gd = groupby(df7, :d1)
     @test length(gd) == 2
-    # @test isequal(gd[2]["d2"], NominalVector["A", "B", Nullable(), "A", Nullable(), Nullable(), Nullable(), Nullable()])
+    # @test isequal(gd[2]["d2"], CategoricalVector["A", "B", Nullable(), "A", Nullable(), Nullable(), Nullable(), Nullable()])
     @test isequal(sum(gd[2][:d3]), sum(df7[:d3][Vector(df7[:d1]) .== 2]))
 
     g1 = groupby(df7, [:d1, :d2])

diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -75,10 +75,10 @@ module TestDataFrame
     @test x0[:d] == Int[]
 
     # similar / nulls
-    df = DataFrame(a = 1, b = "b", c = NominalArray([3.3]))
+    df = DataFrame(a = 1, b = "b", c = CategoricalArray([3.3]))
     nulldf = DataFrame(a = NullableArray(Int, 2),
                        b = NullableArray(String, 2),
-                       c = NullableNominalArray(Float64, 2))
+                       c = NullableCategoricalArray(Float64, 2))
     @test isequal(nulldf, similar(df, 2))
     @test isequal(nulldf, DataFrames.similar_nullable(df, 2))
 
@@ -146,7 +146,7 @@ module TestDataFrame
     @test size(df, 2) == 3
     @test typeof(df[:, 1]) == NullableVector{Int}
     @test typeof(df[:, 2]) == NullableVector{Float64}
-    @test typeof(df[:, 3]) == NullableNominalVector{Compat.UTF8String,UInt32}
+    @test typeof(df[:, 3]) == NullableCategoricalVector{Compat.UTF8String,UInt32}
     @test allnull(df[:, 1])
     @test allnull(df[:, 2])
     @test allnull(df[:, 3])
@@ -297,18 +297,18 @@ module TestDataFrame
               describe(f, DataFrame(a=NullableArray([1, 2]),
                                     b=NullableArray(Nullable{String}["3", Nullable()])))
         @test nothing ==
-              describe(f, DataFrame(a=NominalArray([1, 2]),
-                                    b=NullableNominalArray(Nullable{String}["3", Nullable()])))
+              describe(f, DataFrame(a=CategoricalArray([1, 2]),
+                                    b=NullableCategoricalArray(Nullable{String}["3", Nullable()])))
         @test nothing == describe(f, [1, 2, 3])
         @test nothing == describe(f, NullableArray([1, 2, 3]))
-        @test nothing == describe(f, NominalArray([1, 2, 3]))
+        @test nothing == describe(f, CategoricalArray([1, 2, 3]))
         @test nothing == describe(f, Any["1", "2", Nullable()])
         @test nothing == describe(f, NullableArray(Nullable{String}["1", "2", Nullable()]))
-        @test nothing == describe(f, NullableNominalArray(Nullable{String}["1", "2", Nullable()]))
+        @test nothing == describe(f, NullableCategoricalArray(Nullable{String}["1", "2", Nullable()]))
     end
 
     #Check the output of unstack
-    df = DataFrame(Fish = NominalArray(["Bob", "Bob", "Batman", "Batman"]),
+    df = DataFrame(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]),
                    Key = ["Mass", "Color", "Mass", "Color"],
                    Value = ["12 g", "Red", "18 g", "Grey"])
     # Check that reordering levels does not confuse unstack

diff --git a/test/dataframerow.jl b/test/dataframerow.jl
@@ -7,7 +7,7 @@ module TestDataFrameRow
                                                      1.2, 2.0,
                                                      Nullable(), Nullable()]),
                    c=NullableArray(Nullable{String}["A", "B", "C", "A", "B", Nullable()]),
-                   d=NullableNominalArray(Nullable{Symbol}[:A,  Nullable(),  :C,  :A,
+                   d=NullableCategoricalArray(Nullable{Symbol}[:A,  Nullable(),  :C,  :A,
                                                            Nullable(),  :C]))
     df2 = DataFrame(a = NullableArray([1, 2, 3]))