[BREAKING] Make DataFrameColumns stop being an AbstractVector

JuliaData · Jun 24, 2020 · 0412291 · 0412291
1 parent 45b31a2
commit 0412291
Show file tree

Hide file tree

Showing 12 changed files with 151 additions and 34 deletions.
diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
@@ -120,5 +120,6 @@ disallowmissing!
 ```@docs
 eachcol
 eachrow
+values
 pairs
 ```
diff --git a/docs/src/lib/types.md b/docs/src/lib/types.md
@@ -37,12 +37,14 @@ or when accessing a single row of a `DataFrame` or `SubDataFrame` via `getindex`
 
 The `eachrow` function returns a value of the `DataFrameRows` type, which
 serves as an iterator over rows of an `AbstractDataFrame`, returning `DataFrameRow` objects.
+The `DataFrameRows` type is a subtype of `AbstractVector` and supports its interface
+with the exception that it is read-only.
 
 Similarly, the `eachcol` function returns a value of the `DataFrameColumns` type, which
-serves as an iterator over columns of an `AbstractDataFrame`.
+is not an `AbstractVector`, but supports most of its API. The key differences are that it is read-only and
+that the `keys` function returns a vector of `Symbol`s (and not integers as for normal vectors).
 
-The `DataFrameRows` and `DataFrameColumns` types are subtypes of `AbstractVector` and support its interface
-with the exception that they are read only. Note that they are not exported and should not be constructed directly,
+Note that `DataFrameRows` and `DataFrameColumns` are not exported and should not be constructed directly,
 but using the `eachrow` and `eachcol` functions.
 
 The `RepeatedVector` and `StackedVector` types are subtypes of `AbstractVector` and support its interface

diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl
@@ -30,9 +30,9 @@ Base.iterate(::AbstractDataFrame) =
 Return a `DataFrameRows` that iterates a data frame row by row,
 with each row represented as a `DataFrameRow`.
 
-Because `DataFrameRow`s have an `eltype` of `Any`, use `copy(dfr::DataFrameRow)` to obtain 
-a named tuple, which supports iteration and property access like a `DataFrameRow`, 
-but also passes information on the `eltypes` of the columns of `df`. 
+Because `DataFrameRow`s have an `eltype` of `Any`, use `copy(dfr::DataFrameRow)` to obtain
+a named tuple, which supports iteration and property access like a `DataFrameRow`,
+but also passes information on the `eltypes` of the columns of `df`.
 
 # Examples
 ```jldoctest
@@ -106,14 +106,30 @@ Base.propertynames(itr::DataFrameRows, private::Bool=false) = propertynames(pare
 
 # Iteration by columns
 
+const DATAFRAMECOLUMNS_DOCSTR = """
+Indexing into `DataFrameColumns` objects using an integer, `Symbol`, or string
+returns the corresponding column (without copying).
+Indexing into `DataFrameColumns` objects using a multiple column selector
+returns a subsetted `DataFrameColumns` object with a new parent containing
+only the selected columns (without copying).
+
+`DataFrameColumns` supports most of the `AbstractVector` API. The key
+differences are that it is read-only and that the `keys` function returns a
+vector of `Symbol`s (and not integers as for normal vectors).
+
+In particular, the `findnext`, `findprev`, `findfirst`, `findlast`, and `findall`
+functions are supported, and `findnext` and `findprev` also accept an integer,
+string, or `Symbol` as the reference index.
 """
-    DataFrameColumns{<:AbstractDataFrame} <: AbstractVector{AbstractVector}
 
-An `AbstractVector` that allows iteration over columns of an `AbstractDataFrame`.
-Indexing into `DataFrameColumns` objects using integer or symbol indices
-returns the corresponding column (without copying).
 """
-struct DataFrameColumns{T<:AbstractDataFrame} <: AbstractVector{AbstractVector}
+    DataFrameColumns{<:AbstractDataFrame}
+
+A vector-like object that allows iteration over columns of an `AbstractDataFrame`.
+
+$DATAFRAMECOLUMNS_DOCSTR
+"""
+struct DataFrameColumns{T<:AbstractDataFrame}
     df::T
 end
 
@@ -123,9 +139,10 @@ Base.summary(io::IO, dfcs::DataFrameColumns) = print(io, summary(dfcs))
 """
     eachcol(df::AbstractDataFrame)
 
-Return a `DataFrameColumns` that is an `AbstractVector`
-that allows iterating an `AbstractDataFrame` column by column.
-Additionally it is allowed to index `DataFrameColumns` using column names.
+Return a `DataFrameColumns` object, a vector-like object that allows iterating
+an `AbstractDataFrame` column by column.
+
+$DATAFRAMECOLUMNS_DOCSTR
 
 # Examples
 ```jldoctest
@@ -159,15 +176,28 @@ julia> sum.(eachcol(df))
 """
 eachcol(df::AbstractDataFrame) = DataFrameColumns(df)
 
+Base.IteratorSize(::Type{<:DataFrameColumns}) = Base.HasShape{1}()
 Base.size(itr::DataFrameColumns) = (size(parent(itr), 2),)
-Base.IndexStyle(::Type{<:DataFrameColumns}) = Base.IndexLinear()
 
-@inline function Base.getindex(itr::DataFrameColumns, j::Int)
-    @boundscheck checkbounds(itr, j)
-    @inbounds parent(itr)[!, j]
+function Base.size(itr::DataFrameColumns, d::Integer)
+    d < 1 && throw(ArgumentError("dimension out of range"))
+    return d == 1 ? size(itr)[1] : 1
 end
 
-Base.getindex(itr::DataFrameColumns, j::Symbol) = parent(itr)[!, j]
+Base.length(itr::DataFrameColumns) = size(itr)[1]
+Base.eltype(::Type{<:DataFrameColumns}) = AbstractVector
+Base.firstindex(itr::DataFrameColumns) = 1
+Base.lastindex(itr::DataFrameColumns) = length(itr)
+Base.iterate(itr::DataFrameColumns, i::Integer=1) =
+    i <= length(itr) ? (itr[i], i + 1) : nothing
+Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::ColumnIndex) =
+    parent(itr)[!, idx]
+Base.@propagate_inbounds Base.getindex(itr::DataFrameColumns, idx::MultiColumnIndex) =
+    eachcol(parent(itr)[!, idx])
+Base.:(==)(itr1::DataFrameColumns, itr2::DataFrameColumns) =
+    parent(itr1) == parent(itr2)
+Base.isequal(itr1::DataFrameColumns, itr2::DataFrameColumns) =
+    isequal(parent(itr1), parent(itr2))
 
 # separate methods are needed due to dispatch ambiguity
 Base.getproperty(itr::DataFrameColumns, col_ind::Symbol) =
@@ -190,6 +220,13 @@ Get a vector of column names of `dfc` as `Symbol`s.
 """
 Base.keys(itr::DataFrameColumns) = propertynames(itr)
 
+"""
+    values(dfc::DataFrameColumns)
+
+Get a vector of columns from `dfc`.
+"""
+Base.values(itr::DataFrameColumns) = collect(itr)
+
 """
     pairs(dfc::DataFrameColumns)
 
@@ -198,6 +235,20 @@ with the corresponding column vector, i.e. `name => col`
 where `name` is the column name of the column `col`.
 """
 Base.pairs(itr::DataFrameColumns) = Base.Iterators.Pairs(itr, keys(itr))
+Base.findnext(f::Function, itr::DataFrameColumns, i::Integer) =
+    findnext(f, values(itr), i)
+Base.findnext(f::Function, itr::DataFrameColumns, i::Union{Symbol, AbstractString}) =
+    findnext(f, values(itr), index(parent(itr))[i])
+Base.findprev(f::Function, itr::DataFrameColumns, i::Integer) =
+    findprev(f, values(itr), i)
+Base.findprev(f::Function, itr::DataFrameColumns, i::Union{Symbol, AbstractString}) =
+    findprev(f, values(itr), index(parent(itr))[i])
+Base.findfirst(f::Function, itr::DataFrameColumns) =
+    findfirst(f, values(itr))
+Base.findlast(f::Function, itr::DataFrameColumns) =
+    findlast(f, values(itr))
+Base.findall(f::Function, itr::DataFrameColumns) =
+    findall(f, values(itr))
 
 Base.parent(itr::Union{DataFrameRows, DataFrameColumns}) = getfield(itr, :df)
 Base.names(itr::Union{DataFrameRows, DataFrameColumns}) = names(parent(itr))

diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -831,7 +831,7 @@ function Base.copy(df::DataFrame; copycols::Bool=true)
     if copycols
         df[:, :]
     else
-        DataFrame(eachcol(df), _names(df), copycols=false)
+        DataFrame(_columns(df), _names(df), copycols=false)
     end
 end
 

diff --git a/src/deprecated.jl b/src/deprecated.jl
@@ -313,7 +313,7 @@ function Base.join(df1::AbstractDataFrame, df2::AbstractDataFrame,
     end
 end
 
-@deprecate eachcol(df::AbstractDataFrame, names::Bool) names ? collect(pairs(eachcol(df))) : eachcol(df)
+@deprecate eachcol(df::AbstractDataFrame, names::Bool) names ? collect(pairs(eachcol(df))) : collect(eachcol(df))
 
 @deprecate groupvars(gd::GroupedDataFrame) groupcols(gd)
 

diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl
@@ -168,7 +168,7 @@ function DataFrame(sdf::SubDataFrame; copycols::Bool=true)
     if copycols
         sdf[:, :]
     else
-        DataFrame(eachcol(sdf), _names(sdf), copycols=false)
+        DataFrame(collect(eachcol(sdf)), _names(sdf), copycols=false)
     end
 end
 

diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -123,8 +123,25 @@ end
     df = DataFrame(a=Union{Int, Missing}[2, 3],
                    b=Union{DataFrame, Missing}[DataFrame(c = 1), DataFrame(d = 2)])
     dfc = copy(df)
+    dfcc = copy(df, copycols=false)
     dfdc = deepcopy(df)
 
+    @test dfc == df
+    @test dfc.a !== df.a
+    @test dfc.b !== df.b
+    @test DataFrames._columns(dfc) == DataFrames._columns(df)
+    @test DataFrames._columns(dfc) !== DataFrames._columns(df)
+    @test dfcc == df
+    @test dfcc.a === df.a
+    @test dfcc.b === df.b
+    @test DataFrames._columns(dfcc) == DataFrames._columns(df)
+    @test DataFrames._columns(dfcc) !== DataFrames._columns(df)
+    @test dfdc == df
+    @test dfdc.a !== df.a
+    @test dfdc.b !== df.b
+    @test DataFrames._columns(dfdc) == DataFrames._columns(df)
+    @test DataFrames._columns(dfdc) !== DataFrames._columns(df)
+
     df[1, :a] = 4
     df[1, :b][!, :e] .= 5
 

diff --git a/test/indexing_begin_tests.jl b/test/indexing_begin_tests.jl
@@ -30,4 +30,7 @@
     @test df[[begin, end], [begin, end]] == df[[1,3], [1,4]]
     df[[begin, end], [begin, end]] .= 1000
     @test df.x1 == df.x4 == [1000, 222, 1000]
+
+    @test eachcol(df)[begin] == df[!, begin]
+    @test eachcol(df)[end] == df[!, end]
 end
diff --git a/test/iteration.jl b/test/iteration.jl
@@ -22,16 +22,26 @@ using Test, DataFrames
         @test collect(pairs(row)) isa Vector{Pair{Symbol, Int}}
     end
 
-    @test size(eachcol(df)) == (size(df, 2),)
+    @test Base.IteratorSize(eachcol(df)) == Base.HasShape{1}()
     @test parent(eachcol(df)) === df
     @test names(eachcol(df)) == names(df)
-    @test IndexStyle(eachcol(df)) == IndexLinear()
-    @test Base.IndexStyle(eachcol(df)) == IndexLinear()
     @test length(eachcol(df)) == size(df, 2)
+    @test size(eachcol(df)) == (size(df, 2),)
+    @test size(eachcol(df), 1) == size(df, 2)
+    @test size(eachcol(df), 2) == 1
+    @test_throws ArgumentError size(eachcol(df), 0)
     @test eachcol(df)[1] == df[:, 1]
+    @test eachcol(df)[:A] === df[!, :A]
+    @test eachcol(df)[All()] == eachcol(df)
+    @test isequal(eachcol(df)[[1]], eachcol(df[!, [1]]))
+    @test eachcol(df).A === df[!, :A]
+    @test eachcol(df)["A"] === df[!, "A"]
+    @test eachcol(df)."A" === df[!, "A"]
     @test collect(eachcol(df)) isa Vector{AbstractVector}
     @test collect(eachcol(df)) == [[1, 2], [2, 3]]
     @test eltype(eachcol(df)) == AbstractVector
+    @test_throws ArgumentError eachcol(df)[[1,1]]
+    @test eachcol(df)[[1]][1] === df.A
     for col in eachcol(df)
         @test isa(col, AbstractVector)
     end
@@ -90,7 +100,7 @@ end
     @test eachrow(sdf) == eachrow(df[[3,1,4], [3,1,4]])
     @test size(eachrow(sdf)) == (3,)
     @test eachcol(sdf) == eachcol(df[[3,1,4], [3,1,4]])
-    @test size(eachcol(sdf)) == (3,)
+    @test length(eachcol(sdf)) == 3
 end
 
 @testset "parent mutation" begin
@@ -127,7 +137,7 @@ end
     end
 end
 
-@testset "keys and pairs for eachcol" begin
+@testset "keys, values and pairs for eachcol" begin
     df = DataFrame([11:16 21:26 31:36 41:46])
 
     cols = eachcol(df)
@@ -141,6 +151,39 @@ end
         @test cols[i] === cols[n]
     end
     @test_throws ArgumentError cols[:non_existent]
+
+    @test values(cols) == collect(cols)
+end
+
+@testset "findfirst, findnext, findlast, findprev, findall" begin
+    df = DataFrame(a=[1, 2, 1, 2], b=["1", "2", "1", "2"],
+                   c=[1, 2, 1, 2], d=["1", "2", "1", "2"])
+
+    rows = eachrow(df)
+    @test findfirst(row -> row.a == 1, rows) == 1
+    @test findnext(row -> row.a == 1, rows, 2) == 3
+    @test findlast(row -> row.a == 1, rows) == 3
+    @test findprev(row -> row.a == 1, rows, 2) == 1
+    @test findall(row -> row.a == 1, rows) == [1, 3]
+
+    cols = eachcol(df)
+    @test findfirst(col -> eltype(col) <: Int, cols) == 1
+    @test findnext(col -> eltype(col) <: Int, cols, 2) == 3
+    @test findnext(col -> eltype(col) <: Int, cols, 10) === nothing
+    @test_throws BoundsError findnext(col -> eltype(col) <: Int, cols, -1)
+    @test_throws ArgumentError findnext(col -> eltype(col) <: Int, cols, :x1)
+    @test_throws ArgumentError findnext(col -> eltype(col) <: Int, cols, "x1")
+    @test findnext(col -> eltype(col) <: Int, cols, :b) == 3
+    @test findnext(col -> eltype(col) <: Int, cols, "b") == 3
+    @test findlast(col -> eltype(col) <: Int, cols) == 3
+    @test findprev(col -> eltype(col) <: Int, cols, 2) == 1
+    @test findprev(col -> eltype(col) <: Int, cols, :b) == 1
+    @test findprev(col -> eltype(col) <: Int, cols, "b") == 1
+    @test findprev(col -> eltype(col) <: Int, cols, -1) === nothing
+    @test_throws BoundsError findprev(col -> eltype(col) <: Int, cols, 10)
+    @test_throws ArgumentError findprev(col -> eltype(col) <: Int, cols, :x1)
+    @test_throws ArgumentError findprev(col -> eltype(col) <: Int, cols, "x1")
+    @test findall(col -> eltype(col) <: Int, cols) == [1, 3]
 end
 
 end # module
diff --git a/test/reshape.jl b/test/reshape.jl
@@ -126,7 +126,7 @@ end
                             Union{Int, Missing}[2, 6], Union{Int, Missing}[3, 7],
                             Union{Int, Missing}[4, 8]], [:id, :a, :b, :c, :d])
     @test isa(udf[!, 1], Vector{Int})
-    @test all(isa.(eachcol(udf)[2:end], Vector{Union{Int, Missing}}))
+    @test all(i -> isa(eachcol(udf)[i], Vector{Union{Int, Missing}}), 2:5)
     df = DataFrame([categorical(repeat(1:2, inner=4)),
                        categorical(repeat('a':'d', outer=2)), categorical(1:8)],
                    [:id, :variable, :value])
@@ -136,7 +136,7 @@ end
                             Union{Int, Missing}[2, 6], Union{Int, Missing}[3, 7],
                             Union{Int, Missing}[4, 8]], [:id, :a, :b, :c, :d])
     @test isa(udf[!, 1], CategoricalVector{Int})
-    @test all(isa.(eachcol(udf)[2:end], CategoricalVector{Union{Int, Missing}}))
+    @test all(i -> isa(eachcol(udf)[i], CategoricalVector{Union{Int, Missing}}), 2:5)
 end
 
 @testset "duplicate entries in unstack warnings" begin

diff --git a/test/select.jl b/test/select.jl
@@ -619,12 +619,12 @@ end
 
     df = DataFrame(rand(10, 4))
     df2 = select(df, :, :x1 => :x3)
-    @test df2 == DataFrame(eachcol(df)[[1,2,1,4]])
+    @test df2 == DataFrame(collect(eachcol(df))[[1,2,1,4]])
     @test df2.x1 !== df2.x3
     df2 = select(df, :, :x1 => :x3, copycols=false)
-    @test df2 == DataFrame(eachcol(df)[[1,2,1,4]])
+    @test df2 == DataFrame(collect(eachcol(df))[[1,2,1,4]])
     @test df2.x1 === df2.x3
-    @test select(df, :x1 => :x3, :) == DataFrame(eachcol(df)[[1,1,2,4]],
+    @test select(df, :x1 => :x3, :) == DataFrame(collect(eachcol(df))[[1,1,2,4]],
                                                  [:x3, :x1, :x2, :x4])
     select!(df, :, :x1 => :x3)
     @test df2 == df

diff --git a/test/tables.jl b/test/tables.jl
@@ -209,12 +209,12 @@ end
     @test all(((a,b),) -> a === b, zip(eachcol(df), eachcol(df2)))
 
     df2 = DataFrame(eachcol(df))
-    @test propertynames(df2) == [:x1, :x2, :x3, :x4]
+    @test df == df2
     @test all(((a,b),) -> a == b, zip(eachcol(df), eachcol(df2)))
     @test !any(((a,b),) -> a === b, zip(eachcol(df), eachcol(df2)))
 
     df2 = DataFrame(eachcol(df))
-    @test propertynames(df2) == [:x1, :x2, :x3, :x4]
+    @test df == df2
     @test !any(((a,b),) -> a === b, zip(eachcol(df), eachcol(df2)))
 
     @test Tables.rowtable(df) == Tables.rowtable(eachrow(df))
-Original file line number
+Diff line change
@@ Expand Up / @@ -120,5 +120,6 @@ disallowmissing! @@
     ```@docs
     eachcol
     eachrow
+    values
     pairs
     ```