Skip to content

Commit

Permalink
Use CategoricalArray instead of NominalArray
Browse files Browse the repository at this point in the history
New type merging NominalArray and OrdinalArray in 0.0.5.
  • Loading branch information
nalimilan committed Sep 22, 2016
1 parent 63c1d96 commit 2ec131e
Show file tree
Hide file tree
Showing 20 changed files with 93 additions and 93 deletions.
2 changes: 1 addition & 1 deletion REQUIRE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
julia 0.4
NullableArrays 0.0.8
CategoricalArrays 0.0.4
CategoricalArrays 0.0.5
StatsBase 0.8.3
GZip
SortingAlgorithms
Expand Down
4 changes: 2 additions & 2 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ function StatsBase.describe{T<:Number}(io, nv::AbstractArray{T})
return
end
function StatsBase.describe{T}(io, nv::AbstractArray{T})
ispooled = isa(nv, NominalVector) ? "Pooled " : ""
ispooled = isa(nv, CategoricalVector) ? "Pooled " : ""
nulls = countnull(nv)
# if nothing else, just give the length and element type and NA count
println(io, "Length $(length(nv))")
Expand Down Expand Up @@ -650,7 +650,7 @@ unique!(df) # modifies df
function nonuniquekey(df::AbstractDataFrame)
# Here's another (probably a lot faster) way to do `nonunique`
# by grouping on all columns. It will fail if columns cannot be
# made into NominalVector's.
# made into CategoricalVector's.
gd = groupby(df, _names(df))
idx = [1:length(gd.idx)][gd.idx][gd.starts]
res = fill(true, nrow(df))
Expand Down
26 changes: 13 additions & 13 deletions src/abstractdataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ similar_nullable{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{
# Construct a NullableArray with the requested `dims` for an array whose
# elements are already `Nullable`; the element type comes from `eltype(T)`.
function similar_nullable{T<:Nullable}(dv::AbstractArray{T},
                                       dims::@compat(Union{Int, Tuple{Vararg{Int}}}))
    return NullableArray(eltype(T), dims)
end

# Construct a NullableCategoricalArray with element type `T` and the requested
# `dims` for a CategoricalArray input. Nominal and ordinal arrays are now the
# same CategoricalArray type, so one method is enough; repeating the identical
# signature would only overwrite this definition.
similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
    NullableCategoricalArray(T, dims)

# Build a DataFrame whose columns are `similar_nullable(col, dims)` for every
# column of `df`, paired with a copy of `df`'s index.
function similar_nullable(df::AbstractDataFrame, dims::Int)
    newcols = Any[similar_nullable(col, dims) for col in columns(df)]
    return DataFrame(newcols, copy(index(df)))
end
Expand Down Expand Up @@ -106,9 +106,9 @@ function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArr
refs2[i] = tidx2[v2.refs[i]]
end
end
pool = NominalPool{S, R}(index)
return (NominalArray(refs1, pool),
NominalArray(refs2, pool))
pool = CategoricalPool{S, R}(index)
return (CategoricalArray(refs1, pool),
CategoricalArray(refs2, pool))
end

function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}},
Expand Down Expand Up @@ -181,19 +181,19 @@ function sharepools(v1::AbstractArray,
end
end

pool = NominalPool(pool)
return (NominalArray(refs1, pool),
NominalArray(refs2, pool))
pool = CategoricalPool(pool)
return (CategoricalArray(refs1, pool),
CategoricalArray(refs2, pool))
end

sharepools(v1::NullableArray, v2::NullableArray) =
sharepools(NullableNominalArray(v1), NullableNominalArray(v2))
sharepools(NullableCategoricalArray(v1), NullableCategoricalArray(v2))

sharepools(v1::AbstractArray, v2::NullableArray) =
sharepools(v1, NullableNominalArray(v2))
sharepools(v1, NullableCategoricalArray(v2))

# Mixed case with a NullableArray first and any other array second: convert
# the NullableArray (`v1`) to a NullableCategoricalArray and keep the argument
# order, so the pair returned by the inner call corresponds to (v1, v2).
# Converting `v2` and passing it first would hand the results back swapped.
sharepools(v1::NullableArray, v2::AbstractArray) =
    sharepools(NullableCategoricalArray(v1), v2)

function sharepools(df1::AbstractDataFrame, df2::AbstractDataFrame)
# This method exists to allow merge to work with multiple columns.
Expand Down
12 changes: 6 additions & 6 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,9 @@ function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
# `rowkey` integer indicating which column to place along rows
# `colkey` integer indicating which column to place along column headers
# `value` integer indicating which column has values
refkeycol = NullableNominalArray(df[rowkey])
refkeycol = NullableCategoricalArray(df[rowkey])
valuecol = df[value]
keycol = NullableNominalArray(df[colkey])
keycol = NullableCategoricalArray(df[colkey])
Nrow = length(refkeycol.pool)
Ncol = length(keycol.pool)
T = eltype(valuecol)
Expand Down Expand Up @@ -204,7 +204,7 @@ function unstack(df::AbstractDataFrame, colkey::Int, value::Int)
for i in 1:length(groupidxs)
rowkey[groupidxs[i]] = i
end
keycol = NullableNominalArray(df[colkey])
keycol = NullableCategoricalArray(df[colkey])
valuecol = df[value]
df1 = df[g.idx[g.starts], g.cols]
Nrow = length(g)
Expand Down Expand Up @@ -297,7 +297,7 @@ Base.ndims(v::StackedVector) = 1
# Element type of a StackedVector: `promote_type` applied to the element
# types of all of its components.
function Base.eltype(v::StackedVector)
    component_eltypes = map(eltype, v.components)
    return promote_type(component_eltypes...)
end
# `similar` for a StackedVector delegates to its first component.
function Base.similar(v::StackedVector, T, dims::Dims)
    first_component = v.components[1]
    return similar(first_component, T, dims)
end

CategoricalArrays.NominalArray(v::StackedVector) = NominalArray(v[:]) # could be more efficient
CategoricalArrays.CategoricalArray(v::StackedVector) = CategoricalArray(v[:]) # could be more efficient


"""
Expand Down Expand Up @@ -357,8 +357,8 @@ Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(v.parent), v.inner, v.o
# `similar` for a RepeatedVector delegates to the wrapped parent vector.
function Base.similar(v::RepeatedVector, T, dims::Dims)
    return similar(v.parent, T, dims)
end
# Return `unique` applied to the parent vector of a RepeatedVector.
function Base.unique(v::RepeatedVector)
    return unique(v.parent)
end

function CategoricalArrays.NominalArray(v::RepeatedVector)
res = CategoricalArrays.NominalArray(v.parent)
function CategoricalArrays.CategoricalArray(v::RepeatedVector)
res = CategoricalArrays.CategoricalArray(v.parent)
res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer])
res
end
Expand Down
12 changes: 6 additions & 6 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
An AbstractDataFrame that stores a set of named columns
The columns are normally AbstractVectors stored in memory,
particularly a Vector, NullableVector, or NominalVector.
particularly a Vector, NullableVector, or CategoricalVector.
**Constructors**
Expand Down Expand Up @@ -135,7 +135,7 @@ function DataFrame(column_eltypes::Vector{DataType}, cnames::Vector{Symbol},
columns = Array(Any, p)
for j in 1:p
if nominal[j]
columns[j] = NullableNominalArray(column_eltypes[j], nrows)
columns[j] = NullableCategoricalArray(column_eltypes[j], nrows)
else
columns[j] = NullableArray(column_eltypes[j], nrows)
end

This comment has been minimized.

Copy link
@kmsquire

kmsquire Nov 1, 2016

Contributor

Indentation incorrect in this function.

This comment has been minimized.

Copy link
@nalimilan

nalimilan Nov 1, 2016

Author Member

Funny you're reading this commit. Care to make a PR?

Expand Down Expand Up @@ -364,8 +364,8 @@ function insert_multiple_entries!{T <: Real}(df::DataFrame,
end

# Arrays whose elements are already `Nullable` are returned unchanged.
function upgrade_vector{T<:Nullable}(v::AbstractArray{T})
    return v
end
# A CategoricalArray is upgraded by wrapping it in a NullableCategoricalArray.
# Nominal and ordinal arrays share this one type, so a single method replaces
# the two former ones; defining the same signature twice would only overwrite
# the first definition.
upgrade_vector(v::CategoricalArray) = NullableCategoricalArray(v)
# Most general method: wrap the array in a NullableArray.
function upgrade_vector(v::AbstractArray)
    return NullableArray(v)
end

function upgrade_scalar(df::DataFrame, v::AbstractArray)
Expand Down Expand Up @@ -789,8 +789,8 @@ end
##
##############################################################################

pool(a::AbstractVector) = compact(NominalArray(a))
pool{T<:Nullable}(a::AbstractVector{T}) = compact(NullableNominalArray(a))
pool(a::AbstractVector) = compact(CategoricalArray(a))
pool{T<:Nullable}(a::AbstractVector{T}) = compact(NullableCategoricalArray(a))

function pool!(df::DataFrame, cname::@compat(Union{Integer, Symbol}))
df[cname] = pool(df[cname])
Expand Down
14 changes: 7 additions & 7 deletions src/dataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,7 @@ function builddf(rows::Integer,
end

if o.makefactors && !(is_int || is_float || is_bool)
columns[j] = NullableNominalArray(values, missing)
columns[j] = NullableCategoricalArray(values, missing)
else
columns[j] = NullableArray(values, missing)
end
Expand Down Expand Up @@ -877,7 +877,7 @@ readtable(filename, [keyword options])
* `nastrings::Vector{String}` -- Translate any of the strings in this vector into a NULL value. Defaults to `["", "NULL", "NA"]`.
* `truestrings::Vector{String}` -- Translate any of the strings in this vector into a Boolean `true`. Defaults to `["T", "t", "TRUE", "true"]`.
* `falsestrings::Vector{String}` -- Translate any of the strings in this vector into a Boolean `false`. Defaults to `["F", "f", "FALSE", "false"]`.
* `makefactors::Bool` -- Convert string columns into `NominalVector`'s for use as factors. Defaults to `false`.
* `makefactors::Bool` -- Convert string columns into `CategoricalVector`'s for use as factors. Defaults to `false`.
* `nrows::Int` -- Read only `nrows` from the file. Defaults to `-1`, which indicates that the entire file should be read.
* `names::Vector{Symbol}` -- Use the values in this array as the names for all columns instead of the names in the file's header. Defaults to `[]`, which indicates that the header should be used if present or that numeric names should be invented if there is no header.
* `eltypes::Vector` -- Specify the types of all columns. Defaults to `[]`.
Expand Down Expand Up @@ -975,7 +975,7 @@ literals. Parses the string `s` containing delimiter-separated tabular data
argument contains a list of flag characters, which, if present, are equivalent
to supplying named arguments to `readtable` as follows:
- `f`: `makefactors=true`, convert string columns to `NominalArray` columns
- `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
- `c`: `allowcomments=true`, ignore lines beginning with `#`
- `H`: `header=false`, do not interpret the first line as column names
"""
Expand Down Expand Up @@ -1004,7 +1004,7 @@ separated values (CSV) using `readtable`, just as if it were being loaded from
an external file. The suffix flags `f`, `c`, and `H` are optional. If present,
they are equivalent to supplying named arguments to `readtable` as follows:
* `f`: `makefactors=true`, convert string columns to `NominalArray` columns
* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
* `c`: `allowcomments=true`, ignore lines beginning with `#`
* `H`: `header=false`, do not interpret the first line as column names
Expand Down Expand Up @@ -1038,7 +1038,7 @@ character, just as if it were being loaded from an external file. The suffix
flags `f`, `c`, and `H` are optional. If present, they are equivalent to
supplying named arguments to `readtable` as follows:
* `f`: `makefactors=true`, convert string columns to `NominalArray` columns
* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
* `c`: `allowcomments=true`, ignore lines beginning with `#`
* `H`: `header=false`, do not interpret the first line as column names
Expand Down Expand Up @@ -1074,7 +1074,7 @@ loaded from an external file. The suffix flags `f`, `c`, and `H` are optional.
If present, they are equivalent to supplying named arguments to `readtable` as
follows:
* `f`: `makefactors=true`, convert string columns to `NominalArray` columns
* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
* `c`: `allowcomments=true`, ignore lines beginning with `#`
* `H`: `header=false`, do not interpret the first line as column names
Expand Down Expand Up @@ -1107,7 +1107,7 @@ separated values (TSV) using `readtable`, just as if it were being loaded from
an external file. The suffix flags `f`, `c`, and `H` are optional. If present,
they are equivalent to supplying named arguments to `readtable` as follows:
* `f`: `makefactors=true`, convert string columns to `NominalArray` columns
* `f`: `makefactors=true`, convert string columns to `CategoricalArray` columns
* `c`: `allowcomments=true`, ignore lines beginning with `#`
* `H`: `header=false`, do not interpret the first line as column names
Expand Down
6 changes: 3 additions & 3 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ function groupby{T}(d::AbstractDataFrame, cols::Vector{T})
## http://wesmckinney.com/blog/?p=489

ncols = length(cols)
# use NominalArray to get a set of integer references for each unique item
nv = NullableNominalArray(d[cols[ncols]])
# use CategoricalArray to get a set of integer references for each unique item
nv = NullableCategoricalArray(d[cols[ncols]])
# if there are NULLs, add 1 to the refs to avoid underflows in x later
anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0)
# use UInt32 instead of the original array's integer size since the number of levels can be high
Expand All @@ -140,7 +140,7 @@ function groupby{T}(d::AbstractDataFrame, cols::Vector{T})
ngroups = length(levels(nv)) + anynulls
# if there's more than 1 column, do roughly the same thing repeatedly
for j = (ncols - 1):-1:1
nv = NullableNominalArray(d[cols[j]])
nv = NullableCategoricalArray(d[cols[j]])
anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0)
for i = 1:nrow(d)
if nv.refs[i] != 0
Expand Down
6 changes: 3 additions & 3 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -136,15 +136,15 @@ countnull(a::NullableArray) = sum(a.isnull)
#'
#' Count the number of missing values in a NullableCategoricalArray.
#'
#' @field na::NominalArray The NominalArray whose missing values
#' @field na::CategoricalArray The CategoricalArray whose missing values
#' are to be counted.
#'
#' @returns count::Int The number of null values in `a`.
#'
#' @examples
#'
#' DataFrames.countnull(NominalArray([1, 2, 3]))
function countnull(a::NominalArray)
#' DataFrames.countnull(CategoricalArray([1, 2, 3]))
function countnull(a::CategoricalArray)
res = 0
for x in a.refs
res += x == 0
Expand Down
12 changes: 6 additions & 6 deletions test/cat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,20 +91,20 @@ module TestCat
end

# Minimal container type promotion
dfa = DataFrame(a = NominalArray([1, 2, 2]))
dfb = DataFrame(a = NominalArray([2, 3, 4]))
dfa = DataFrame(a = CategoricalArray([1, 2, 2]))
dfb = DataFrame(a = CategoricalArray([2, 3, 4]))
dfc = DataFrame(a = NullableArray([2, 3, 4]))
dfd = DataFrame(Any[2:4], [:a])
dfab = vcat(dfa, dfb)
dfac = vcat(dfa, dfc)
@test isequal(dfab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
@test isequal(dfac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
@test isa(dfab[:a], NullableNominalVector{Int})
@test isa(dfab[:a], NullableCategoricalVector{Int})
# Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
if VERSION >= v"0.5.0-dev"
@test isa(dfac[:a], NullableNominalVector{Int})
@test isa(dfac[:a], NullableCategoricalVector{Int})
else
@test isa(dfac[:a], NullableNominalVector{Any})
@test isa(dfac[:a], NullableCategoricalVector{Any})
end
# ^^ container may flip if container promotion happens in Base/DataArrays
dc = vcat(dfd, dfc)
Expand All @@ -118,7 +118,7 @@ module TestCat
# Missing columns
rename!(dfd, :a, :b)
dfda = DataFrame(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]),
a = NullableNominalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2]))
a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2]))
@test isequal(vcat(dfd, dfa), dfda)

# Alignment
Expand Down
8 changes: 4 additions & 4 deletions test/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ module TestConstructors
@test isequal(df.columns, Any[])
@test isequal(df.colindex, Index())

df = DataFrame(Any[NullableNominalVector(zeros(3)),
NullableNominalVector(ones(3))],
df = DataFrame(Any[NullableCategoricalVector(zeros(3)),
NullableCategoricalVector(ones(3))],
Index([:x1, :x2]))
@test size(df, 1) == 3
@test size(df, 2) == 2

@test isequal(df, DataFrame(Any[NullableNominalVector(zeros(3)),
NullableNominalVector(ones(3))]))
@test isequal(df, DataFrame(Any[NullableCategoricalVector(zeros(3)),
NullableCategoricalVector(ones(3))]))
@test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0],
x2 = [1.0, 1.0, 1.0]))

Expand Down
2 changes: 1 addition & 1 deletion test/contrasts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ using Base.Test
using DataFrames


d = DataFrame(x = NominalVector([:a, :b, :c, :a, :a, :b]))
d = DataFrame(x = CategoricalVector([:a, :b, :c, :a, :a, :b]))

mf = ModelFrame(Formula(nothing, :x), d)

Expand Down
4 changes: 2 additions & 2 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,15 @@ module TestData
N = 20
#Cast to Int64 as rand() behavior differs between Int32/64
d1 = NullableArray(rand(map(Int64, 1:2), N))
d2 = NullableNominalArray(Nullable{String}["A", "B", Nullable()])[rand(map(Int64, 1:3), N)]
d2 = NullableCategoricalArray(Nullable{String}["A", "B", Nullable()])[rand(map(Int64, 1:3), N)]
d3 = NullableArray(randn(N))
d4 = NullableArray(randn(N))
df7 = DataFrame(Any[d1, d2, d3], [:d1, :d2, :d3])

#test_group("groupby")
gd = groupby(df7, :d1)
@test length(gd) == 2
# @test isequal(gd[2]["d2"], NominalVector["A", "B", Nullable(), "A", Nullable(), Nullable(), Nullable(), Nullable()])
# @test isequal(gd[2]["d2"], CategoricalVector["A", "B", Nullable(), "A", Nullable(), Nullable(), Nullable(), Nullable()])
@test isequal(sum(gd[2][:d3]), sum(df7[:d3][Vector(df7[:d1]) .== 2]))

g1 = groupby(df7, [:d1, :d2])
Expand Down
16 changes: 8 additions & 8 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ module TestDataFrame
@test x0[:d] == Int[]

# similar / nulls
df = DataFrame(a = 1, b = "b", c = NominalArray([3.3]))
df = DataFrame(a = 1, b = "b", c = CategoricalArray([3.3]))
nulldf = DataFrame(a = NullableArray(Int, 2),
b = NullableArray(String, 2),
c = NullableNominalArray(Float64, 2))
c = NullableCategoricalArray(Float64, 2))
@test isequal(nulldf, similar(df, 2))
@test isequal(nulldf, DataFrames.similar_nullable(df, 2))

Expand Down Expand Up @@ -146,7 +146,7 @@ module TestDataFrame
@test size(df, 2) == 3
@test typeof(df[:, 1]) == NullableVector{Int}
@test typeof(df[:, 2]) == NullableVector{Float64}
@test typeof(df[:, 3]) == NullableNominalVector{Compat.UTF8String,UInt32}
@test typeof(df[:, 3]) == NullableCategoricalVector{Compat.UTF8String,UInt32}
@test allnull(df[:, 1])
@test allnull(df[:, 2])
@test allnull(df[:, 3])
Expand Down Expand Up @@ -297,18 +297,18 @@ module TestDataFrame
describe(f, DataFrame(a=NullableArray([1, 2]),
b=NullableArray(Nullable{String}["3", Nullable()])))
@test nothing ==
describe(f, DataFrame(a=NominalArray([1, 2]),
b=NullableNominalArray(Nullable{String}["3", Nullable()])))
describe(f, DataFrame(a=CategoricalArray([1, 2]),
b=NullableCategoricalArray(Nullable{String}["3", Nullable()])))
@test nothing == describe(f, [1, 2, 3])
@test nothing == describe(f, NullableArray([1, 2, 3]))
@test nothing == describe(f, NominalArray([1, 2, 3]))
@test nothing == describe(f, CategoricalArray([1, 2, 3]))
@test nothing == describe(f, Any["1", "2", Nullable()])
@test nothing == describe(f, NullableArray(Nullable{String}["1", "2", Nullable()]))
@test nothing == describe(f, NullableNominalArray(Nullable{String}["1", "2", Nullable()]))
@test nothing == describe(f, NullableCategoricalArray(Nullable{String}["1", "2", Nullable()]))
end

#Check the output of unstack
df = DataFrame(Fish = NominalArray(["Bob", "Bob", "Batman", "Batman"]),
df = DataFrame(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]),
Key = ["Mass", "Color", "Mass", "Color"],
Value = ["12 g", "Red", "18 g", "Grey"])
# Check that reordering levels does not confuse unstack
Expand Down
2 changes: 1 addition & 1 deletion test/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ module TestDataFrameRow
1.2, 2.0,
Nullable(), Nullable()]),
c=NullableArray(Nullable{String}["A", "B", "C", "A", "B", Nullable()]),
d=NullableNominalArray(Nullable{Symbol}[:A, Nullable(), :C, :A,
d=NullableCategoricalArray(Nullable{Symbol}[:A, Nullable(), :C, :A,
Nullable(), :C]))
df2 = DataFrame(a = NullableArray([1, 2, 3]))

Expand Down
Loading

0 comments on commit 2ec131e

Please sign in to comment.