Skip to content

Commit

Permalink
[BREAKING] Make DataFrameColumns stop being an AbstractVector
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Jun 24, 2020
1 parent 45b31a2 commit 0412291
Show file tree
Hide file tree
Showing 12 changed files with 151 additions and 34 deletions.
1 change: 1 addition & 0 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,5 +120,6 @@ disallowmissing!
```@docs
eachcol
eachrow
values
pairs
```
8 changes: 5 additions & 3 deletions docs/src/lib/types.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@ or when accessing a single row of a `DataFrame` or `SubDataFrame` via `getindex`

The `eachrow` function returns a value of the `DataFrameRows` type, which
serves as an iterator over rows of an `AbstractDataFrame`, returning `DataFrameRow` objects.
`DataFrameRows` is a subtype of `AbstractVector` and supports its interface,
with the exception that it is read-only.

Similarly, the `eachcol` function returns a value of the `DataFrameColumns` type, which
serves as an iterator over columns of an `AbstractDataFrame`.
is not an `AbstractVector`, but supports most of its API. The key differences are that it is read-only and
that the `keys` function returns a vector of `Symbol`s (and not integers as for normal vectors).

The `DataFrameRows` and `DataFrameColumns` types are subtypes of `AbstractVector` and support its interface
with the exception that they are read only. Note that they are not exported and should not be constructed directly,
Note that `DataFrameRows` and `DataFrameColumns` are not exported and should not be constructed directly,
but using the `eachrow` and `eachcol` functions.

The `RepeatedVector` and `StackedVector` types are subtypes of `AbstractVector` and support its interface
Expand Down
83 changes: 67 additions & 16 deletions src/abstractdataframe/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ Base.iterate(::AbstractDataFrame) =
Return a `DataFrameRows` that iterates a data frame row by row,
with each row represented as a `DataFrameRow`.
Because `DataFrameRow`s have an `eltype` of `Any`, use `copy(dfr::DataFrameRow)` to obtain
a named tuple, which supports iteration and property access like a `DataFrameRow`,
but also passes information on the `eltypes` of the columns of `df`.
Because `DataFrameRow`s have an `eltype` of `Any`, use `copy(dfr::DataFrameRow)` to obtain
a named tuple, which supports iteration and property access like a `DataFrameRow`,
but also passes information on the `eltypes` of the columns of `df`.
# Examples
```jldoctest
Expand Down Expand Up @@ -106,14 +106,30 @@ Base.propertynames(itr::DataFrameRows, private::Bool=false) = propertynames(pare

# Iteration by columns

const DATAFRAMECOLUMNS_DOCSTR = """
Indexing into `DataFrameColumns` objects using integer, `Symbol` or string
returns the corresponding column (without copying).
Indexing into `DataFrameColumns` objects using a multiple column selector
returns a subsetted `DataFrameColumns` object with a new parent containing
only the selected columns (without copying).
`DataFrameColumns` supports most of the `AbstractVector` API. The key
differences are that it is read-only and that the `keys` function returns a
vector of `Symbol`s (and not integers as for normal vectors).
In particular `findnext`, `findprev`, `findfirst`, `findlast`, and `findall`
functions are supported, and in `findnext` and `findprev` functions it is allowed
to pass an integer, string, or `Symbol` as a reference index.
"""
DataFrameColumns{<:AbstractDataFrame} <: AbstractVector{AbstractVector}

An `AbstractVector` that allows iteration over columns of an `AbstractDataFrame`.
Indexing into `DataFrameColumns` objects using integer or symbol indices
returns the corresponding column (without copying).
"""
struct DataFrameColumns{T<:AbstractDataFrame} <: AbstractVector{AbstractVector}
DataFrameColumns{<:AbstractDataFrame}
A vector-like object that allows iteration over columns of an `AbstractDataFrame`.
$DATAFRAMECOLUMNS_DOCSTR
"""
struct DataFrameColumns{T<:AbstractDataFrame}
df::T
end

Expand All @@ -123,9 +139,10 @@ Base.summary(io::IO, dfcs::DataFrameColumns) = print(io, summary(dfcs))
"""
eachcol(df::AbstractDataFrame)
Return a `DataFrameColumns` that is an `AbstractVector`
that allows iterating an `AbstractDataFrame` column by column.
Additionally it is allowed to index `DataFrameColumns` using column names.
Return a `DataFrameColumns` object, a vector-like object that allows iterating over
an `AbstractDataFrame` column by column.
$DATAFRAMECOLUMNS_DOCSTR
# Examples
```jldoctest
Expand Down Expand Up @@ -159,15 +176,28 @@ julia> sum.(eachcol(df))
"""
function eachcol(df::AbstractDataFrame)
    # Only wraps `df`; the wrapper stores the data frame itself as its parent.
    return DataFrameColumns(df)
end

# Traits and shape of `DataFrameColumns`: a one-dimensional, linearly indexed
# collection with one element per column of the parent data frame.
Base.IteratorSize(::Type{T}) where {T<:DataFrameColumns} = Base.HasShape{1}()

function Base.size(itr::DataFrameColumns)
    ncols = size(parent(itr), 2)
    return (ncols,)
end

Base.IndexStyle(::Type{T}) where {T<:DataFrameColumns} = Base.IndexLinear()

@inline function Base.getindex(itr::DataFrameColumns, j::Int)
@boundscheck checkbounds(itr, j)
@inbounds parent(itr)[!, j]
function Base.size(itr::DataFrameColumns, d::Integer)
    # Dimension 1 is the number of columns; every higher dimension has size 1.
    # Dimensions below 1 are rejected with an `ArgumentError`.
    if d < 1
        throw(ArgumentError("dimension out of range"))
    elseif d == 1
        return size(itr)[1]
    else
        return 1
    end
end

Base.getindex(itr::DataFrameColumns, j::Symbol) = parent(itr)[!, j]
# Collection interface for `DataFrameColumns`: indices run from 1 to the number
# of columns and every element is reported as an `AbstractVector`.
function Base.length(itr::DataFrameColumns)
    dims = size(itr)
    return dims[1]
end

Base.eltype(::Type{T}) where {T<:DataFrameColumns} = AbstractVector

function Base.firstindex(itr::DataFrameColumns)
    return 1
end

function Base.lastindex(itr::DataFrameColumns)
    return length(itr)
end

# The iteration state is the index of the next column to return.
function Base.iterate(itr::DataFrameColumns, i::Integer=1)
    i > length(itr) && return nothing
    return (itr[i], i + 1)
end

# A single column selector returns the column stored in the parent (`!` indexing).
Base.@propagate_inbounds function Base.getindex(itr::DataFrameColumns,
                                                idx::ColumnIndex)
    return parent(itr)[!, idx]
end

# A multiple column selector returns a new `DataFrameColumns` whose parent holds
# only the selected columns.
Base.@propagate_inbounds function Base.getindex(itr::DataFrameColumns,
                                                idx::MultiColumnIndex)
    selected = parent(itr)[!, idx]
    return eachcol(selected)
end

# Comparisons are delegated to the parent data frames.
function Base.:(==)(itr1::DataFrameColumns, itr2::DataFrameColumns)
    return parent(itr1) == parent(itr2)
end

function Base.isequal(itr1::DataFrameColumns, itr2::DataFrameColumns)
    return isequal(parent(itr1), parent(itr2))
end

# separate methods are needed due to dispatch ambiguity
Base.getproperty(itr::DataFrameColumns, col_ind::Symbol) =
Expand All @@ -190,6 +220,13 @@ Get a vector of column names of `dfc` as `Symbol`s.
"""
Base.keys(itr::DataFrameColumns) = propertynames(itr)

"""
    values(dfc::DataFrameColumns)

Get a vector of the columns of `dfc`, in iteration order.
"""
function Base.values(itr::DataFrameColumns)
    return collect(itr)
end

"""
pairs(dfc::DataFrameColumns)
Expand All @@ -198,6 +235,20 @@ with the corresponding column vector, i.e. `name => col`
where `name` is the column name of the column `col`.
"""
function Base.pairs(itr::DataFrameColumns)
    return Base.Iterators.Pairs(itr, keys(itr))
end

# The `find*` family delegates to the vector of columns produced by `values`.
# For `findnext`/`findprev` a `Symbol` or string reference index is first
# translated to its integer position through the index of the parent.
function Base.findnext(f::Function, itr::DataFrameColumns, i::Integer)
    cols = values(itr)
    return findnext(f, cols, i)
end

function Base.findnext(f::Function, itr::DataFrameColumns,
                       i::Union{Symbol, AbstractString})
    cols = values(itr)
    start = index(parent(itr))[i]
    return findnext(f, cols, start)
end

function Base.findprev(f::Function, itr::DataFrameColumns, i::Integer)
    cols = values(itr)
    return findprev(f, cols, i)
end

function Base.findprev(f::Function, itr::DataFrameColumns,
                       i::Union{Symbol, AbstractString})
    cols = values(itr)
    start = index(parent(itr))[i]
    return findprev(f, cols, start)
end

function Base.findfirst(f::Function, itr::DataFrameColumns)
    return findfirst(f, values(itr))
end

function Base.findlast(f::Function, itr::DataFrameColumns)
    return findlast(f, values(itr))
end

function Base.findall(f::Function, itr::DataFrameColumns)
    return findall(f, values(itr))
end

# Accessors shared by the row and column iterators.
function Base.parent(itr::Union{DataFrameRows, DataFrameColumns})
    # Read the field directly with `getfield`, bypassing `getproperty`.
    return getfield(itr, :df)
end

function Base.names(itr::Union{DataFrameRows, DataFrameColumns})
    return names(parent(itr))
end
Expand Down
2 changes: 1 addition & 1 deletion src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ function Base.copy(df::DataFrame; copycols::Bool=true)
if copycols
df[:, :]
else
DataFrame(eachcol(df), _names(df), copycols=false)
DataFrame(_columns(df), _names(df), copycols=false)
end
end

Expand Down
2 changes: 1 addition & 1 deletion src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ function Base.join(df1::AbstractDataFrame, df2::AbstractDataFrame,
end
end

@deprecate eachcol(df::AbstractDataFrame, names::Bool) names ? collect(pairs(eachcol(df))) : eachcol(df)
@deprecate eachcol(df::AbstractDataFrame, names::Bool) names ? collect(pairs(eachcol(df))) : collect(eachcol(df))

@deprecate groupvars(gd::GroupedDataFrame) groupcols(gd)

Expand Down
2 changes: 1 addition & 1 deletion src/subdataframe/subdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ function DataFrame(sdf::SubDataFrame; copycols::Bool=true)
if copycols
sdf[:, :]
else
DataFrame(eachcol(sdf), _names(sdf), copycols=false)
DataFrame(collect(eachcol(sdf)), _names(sdf), copycols=false)
end
end

Expand Down
17 changes: 17 additions & 0 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,25 @@ end
df = DataFrame(a=Union{Int, Missing}[2, 3],
b=Union{DataFrame, Missing}[DataFrame(c = 1), DataFrame(d = 2)])
dfc = copy(df)
dfcc = copy(df, copycols=false)
dfdc = deepcopy(df)

@test dfc == df
@test dfc.a !== df.a
@test dfc.b !== df.b
@test DataFrames._columns(dfc) == DataFrames._columns(df)
@test DataFrames._columns(dfc) !== DataFrames._columns(df)
@test dfcc == df
@test dfcc.a === df.a
@test dfcc.b === df.b
@test DataFrames._columns(dfcc) == DataFrames._columns(df)
@test DataFrames._columns(dfcc) !== DataFrames._columns(df)
@test dfdc == df
@test dfdc.a !== df.a
@test dfdc.b !== df.b
@test DataFrames._columns(dfdc) == DataFrames._columns(df)
@test DataFrames._columns(dfdc) !== DataFrames._columns(df)

df[1, :a] = 4
df[1, :b][!, :e] .= 5

Expand Down
3 changes: 3 additions & 0 deletions test/indexing_begin_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,7 @@
@test df[[begin, end], [begin, end]] == df[[1,3], [1,4]]
df[[begin, end], [begin, end]] .= 1000
@test df.x1 == df.x4 == [1000, 222, 1000]

@test eachcol(df)[begin] == df[!, begin]
@test eachcol(df)[end] == df[!, end]
end
53 changes: 48 additions & 5 deletions test/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,26 @@ using Test, DataFrames
@test collect(pairs(row)) isa Vector{Pair{Symbol, Int}}
end

@test size(eachcol(df)) == (size(df, 2),)
@test Base.IteratorSize(eachcol(df)) == Base.HasShape{1}()
@test parent(eachcol(df)) === df
@test names(eachcol(df)) == names(df)
@test IndexStyle(eachcol(df)) == IndexLinear()
@test Base.IndexStyle(eachcol(df)) == IndexLinear()
@test length(eachcol(df)) == size(df, 2)
@test size(eachcol(df)) == (size(df, 2),)
@test size(eachcol(df), 1) == size(df, 2)
@test size(eachcol(df), 2) == 1
@test_throws ArgumentError size(eachcol(df), 0)
@test eachcol(df)[1] == df[:, 1]
@test eachcol(df)[:A] === df[!, :A]
@test eachcol(df)[All()] == eachcol(df)
@test isequal(eachcol(df)[[1]], eachcol(df[!, [1]]))
@test eachcol(df).A === df[!, :A]
@test eachcol(df)["A"] === df[!, "A"]
@test eachcol(df)."A" === df[!, "A"]
@test collect(eachcol(df)) isa Vector{AbstractVector}
@test collect(eachcol(df)) == [[1, 2], [2, 3]]
@test eltype(eachcol(df)) == AbstractVector
@test_throws ArgumentError eachcol(df)[[1,1]]
@test eachcol(df)[[1]][1] === df.A
for col in eachcol(df)
@test isa(col, AbstractVector)
end
Expand Down Expand Up @@ -90,7 +100,7 @@ end
@test eachrow(sdf) == eachrow(df[[3,1,4], [3,1,4]])
@test size(eachrow(sdf)) == (3,)
@test eachcol(sdf) == eachcol(df[[3,1,4], [3,1,4]])
@test size(eachcol(sdf)) == (3,)
@test length(eachcol(sdf)) == 3
end

@testset "parent mutation" begin
Expand Down Expand Up @@ -127,7 +137,7 @@ end
end
end

@testset "keys and pairs for eachcol" begin
@testset "keys, values and pairs for eachcol" begin
df = DataFrame([11:16 21:26 31:36 41:46])

cols = eachcol(df)
Expand All @@ -141,6 +151,39 @@ end
@test cols[i] === cols[n]
end
@test_throws ArgumentError cols[:non_existent]

@test values(cols) == collect(cols)
end

@testset "findfirst, findnext, findlast, findprev, findall" begin
    df = DataFrame(a=[1, 2, 1, 2], b=["1", "2", "1", "2"],
                   c=[1, 2, 1, 2], d=["1", "2", "1", "2"])

    # Row-wise search: rows 1 and 3 have `a == 1`.
    a_is_one = row -> row.a == 1
    rows = eachrow(df)
    @test findfirst(a_is_one, rows) == 1
    @test findnext(a_is_one, rows, 2) == 3
    @test findlast(a_is_one, rows) == 3
    @test findprev(a_is_one, rows, 2) == 1
    @test findall(a_is_one, rows) == [1, 3]

    # Column-wise search: columns 1 (`a`) and 3 (`c`) hold integers.
    is_int_col = col -> eltype(col) <: Int
    cols = eachcol(df)
    @test findfirst(is_int_col, cols) == 1
    @test findnext(is_int_col, cols, 2) == 3
    @test findnext(is_int_col, cols, 10) === nothing
    @test_throws BoundsError findnext(is_int_col, cols, -1)
    @test_throws ArgumentError findnext(is_int_col, cols, :x1)
    @test_throws ArgumentError findnext(is_int_col, cols, "x1")
    @test findnext(is_int_col, cols, :b) == 3
    @test findnext(is_int_col, cols, "b") == 3
    @test findlast(is_int_col, cols) == 3
    @test findprev(is_int_col, cols, 2) == 1
    @test findprev(is_int_col, cols, :b) == 1
    @test findprev(is_int_col, cols, "b") == 1
    @test findprev(is_int_col, cols, -1) === nothing
    @test_throws BoundsError findprev(is_int_col, cols, 10)
    @test_throws ArgumentError findprev(is_int_col, cols, :x1)
    @test_throws ArgumentError findprev(is_int_col, cols, "x1")
    @test findall(is_int_col, cols) == [1, 3]
end

end # module
4 changes: 2 additions & 2 deletions test/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ end
Union{Int, Missing}[2, 6], Union{Int, Missing}[3, 7],
Union{Int, Missing}[4, 8]], [:id, :a, :b, :c, :d])
@test isa(udf[!, 1], Vector{Int})
@test all(isa.(eachcol(udf)[2:end], Vector{Union{Int, Missing}}))
@test all(i -> isa(eachcol(udf)[i], Vector{Union{Int, Missing}}), 2:5)
df = DataFrame([categorical(repeat(1:2, inner=4)),
categorical(repeat('a':'d', outer=2)), categorical(1:8)],
[:id, :variable, :value])
Expand All @@ -136,7 +136,7 @@ end
Union{Int, Missing}[2, 6], Union{Int, Missing}[3, 7],
Union{Int, Missing}[4, 8]], [:id, :a, :b, :c, :d])
@test isa(udf[!, 1], CategoricalVector{Int})
@test all(isa.(eachcol(udf)[2:end], CategoricalVector{Union{Int, Missing}}))
@test all(i -> isa(eachcol(udf)[i], CategoricalVector{Union{Int, Missing}}), 2:5)
end

@testset "duplicate entries in unstack warnings" begin
Expand Down
6 changes: 3 additions & 3 deletions test/select.jl
Original file line number Diff line number Diff line change
Expand Up @@ -619,12 +619,12 @@ end

df = DataFrame(rand(10, 4))
df2 = select(df, :, :x1 => :x3)
@test df2 == DataFrame(eachcol(df)[[1,2,1,4]])
@test df2 == DataFrame(collect(eachcol(df))[[1,2,1,4]])
@test df2.x1 !== df2.x3
df2 = select(df, :, :x1 => :x3, copycols=false)
@test df2 == DataFrame(eachcol(df)[[1,2,1,4]])
@test df2 == DataFrame(collect(eachcol(df))[[1,2,1,4]])
@test df2.x1 === df2.x3
@test select(df, :x1 => :x3, :) == DataFrame(eachcol(df)[[1,1,2,4]],
@test select(df, :x1 => :x3, :) == DataFrame(collect(eachcol(df))[[1,1,2,4]],
[:x3, :x1, :x2, :x4])
select!(df, :, :x1 => :x3)
@test df2 == df
Expand Down
4 changes: 2 additions & 2 deletions test/tables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -209,12 +209,12 @@ end
@test all(((a,b),) -> a === b, zip(eachcol(df), eachcol(df2)))

df2 = DataFrame(eachcol(df))
@test propertynames(df2) == [:x1, :x2, :x3, :x4]
@test df == df2
@test all(((a,b),) -> a == b, zip(eachcol(df), eachcol(df2)))
@test !any(((a,b),) -> a === b, zip(eachcol(df), eachcol(df2)))

df2 = DataFrame(eachcol(df))
@test propertynames(df2) == [:x1, :x2, :x3, :x4]
@test df == df2
@test !any(((a,b),) -> a === b, zip(eachcol(df), eachcol(df2)))

@test Tables.rowtable(df) == Tables.rowtable(eachrow(df))
Expand Down

0 comments on commit 0412291

Please sign in to comment.