From a0ccd1511e9d889a4f7c23fe0444c513ead5ab11 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 30 Mar 2017 18:49:58 -0700 Subject: [PATCH] adjust outer join behavior (types and right outer join bug) outer joins need to return nullable tables as they may introduce missing data. similar_nullable on DataTables has been removed (unused) and replaced with a similar_nullable that works on NullableCategoricalArrays, and this change is made to support the new changes to join. The 3 outer joins share a function with inner joins, and this shared function (compose_joined_table) now performs a check to see if the join type is :inner, and if so, it will return the same column type as the parent table rather than promoting to a nullable column. A bug was found in right-outer join behavior where the values unique to the right table were added to the table in the incorrect locations, overwriting data and leaving nulls where they shouldn't be. This bug, due to incorrect values in rightonly_ixs.join, was fixed by filling the last n-rows of the datatable where n = length(rightonly_ixs.join). Tests were checked for accuracy against pandas. --- src/abstractdatatable/join.jl | 90 ++++++++------- test/join.jl | 202 +++++++++++++++++++++++++++++++++- 2 files changed, 254 insertions(+), 38 deletions(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 0bc4656..d52e602 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -9,13 +9,10 @@ similar_nullable{T}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableArray{eltype(T)}(dims) -similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = +similar_nullable{T}(dv::CategoricalArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableCategoricalArray{T}(dims) -similar_nullable(dt::AbstractDataTable, dims::Int) = - DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt))) - -similar_nullable{T,R}(dv::NullableCategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = +similar_nullable{T}(dv::NullableCategoricalArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableCategoricalArray{T}(dims) # helper structure for DataTables joining @@ -47,31 +44,28 @@ Base.length(x::RowIndexMap) = length(x.orig) # composes the joined data table using the maps between the left and right # table rows and the indices of rows in the result -function compose_joined_table(joiner::DataTableJoiner, +function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap) @assert length(left_ixs) == length(right_ixs) # compose left half of the result taking all left columns all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig) - if length(leftonly_ixs) > 0 + + ril = length(right_ixs) + lil = length(left_ixs) + loil = length(leftonly_ixs) + roil = length(rightonly_ixs) + + if loil > 0 # combine the matched (left_ixs.orig) and non-matched (leftonly_ixs.orig) indices of the left table rows # preserving the original rows order - all_orig_left_ixs = similar(left_ixs.orig, length(left_ixs)+length(leftonly_ixs)) + all_orig_left_ixs = similar(left_ixs.orig, lil + loil) @inbounds all_orig_left_ixs[left_ixs.join] = left_ixs.orig @inbounds all_orig_left_ixs[leftonly_ixs.join] = leftonly_ixs.orig else # the result contains only the left rows that are matched to right rows (left_ixs) all_orig_left_ixs = left_ixs.orig # no need to copy left_ixs.orig as it's not used elsewhere end - ril = length(right_ixs) - loil = length(leftonly_ixs) - roil = length(rightonly_ixs) - left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+roil) - for col in columns(joiner.dtl)], - names(joiner.dtl)) - - # compose right half of the result taking all right columns excluding on - dtr_noon = without(joiner.dtr, joiner.on_cols) # permutation to swap rightonly and leftonly rows right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil) if length(leftonly_ixs) > 0 @@ -79,23 +73,45 @@ function compose_joined_table(joiner::DataTableJoiner, right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil] end all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) - right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm] - for col in columns(dtr_noon)], - names(dtr_noon)) - # merge left and right parts of the joined table - res = hcat!(left_dt, right_dt) + + # compose right half of the result taking all right columns excluding on + dtr_noon = without(joiner.dtr, joiner.on_cols) + + nrow = length(all_orig_left_ixs) + roil + @assert nrow == length(all_orig_right_ixs) + loil + ncleft = ncol(joiner.dtl) + cols = Vector{Any}(ncleft + ncol(dtr_noon)) + _similar = kind == :inner ? similar : similar_nullable + for (i, col) in enumerate(columns(joiner.dtl)) + cols[i] = _similar(col, nrow) + fillcolumn!(cols[i], col, all_orig_left_ixs) + end + for (i, col) in enumerate(columns(dtr_noon)) + cols[i+ncleft] = _similar(col, nrow) + fillcolumn!(cols[i+ncleft], col, all_orig_right_ixs) + permute!(cols[i+ncleft], right_perm) + end + res = DataTable(cols, vcat(names(joiner.dtl), names(dtr_noon))) if length(rightonly_ixs.join) > 0 # some left rows are nulls, so the values of the "on" columns # need to be taken from the right for (on_col_ix, on_col) in enumerate(joiner.on_cols) # fix the result of the rightjoin by taking the nonnull values from the right table - res[on_col][rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix] + offset = nrow - length(rightonly_ixs.orig) + fillcolumn!(res[on_col], joiner.dtr_on[on_col_ix], rightonly_ixs.orig, offset) end end return res end +function fillcolumn!{T1, T2}(dtcol::AbstractVector{T1}, refcol::AbstractVector{T2}, + indices::Vector{Int}, offset::Int=0) + @inbounds for (j, k) in enumerate(indices) + dtcol[j+offset] = refcol[k] + end +end + # map the indices of the left and right joined tables # to the indices of the rows in the resulting table # if `nothing` is given, the corresponding map is not built @@ -210,7 +226,8 @@ join(dt1::AbstractDataTable, - `:cross` : a full Cartesian product of the key combinations; every row of `dt1` is matched with every row of `dt2` -Null values are filled in where needed to complete joins. +For the three join operations that may introduce missing values (`:outer`, `:left`, +and `:right`), all columns of the returned data table will be nullable. ### Result @@ -246,22 +263,21 @@ function Base.join(dt1::AbstractDataTable, joiner = DataTableJoiner(dt1, dt2, on) if kind == :inner - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, false, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, false, true, false)...) elseif kind == :left - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, false)...) elseif kind == :right - right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on, - group_rows(joiner.dtl_on), - true, true, true, false) - compose_joined_table(joiner, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtr_on, joiner.dtl_on, + group_rows(joiner.dtl_on), + true, true, true, false)[[3, 4, 1, 2]]...) elseif kind == :outer - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, true)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, true)...) elseif kind == :semi # hash the right rows dtr_on_grp = group_rows(joiner.dtr_on) diff --git a/test/join.jl b/test/join.jl index 0ac3fe6..a656089 100644 --- a/test/join.jl +++ b/test/join.jl @@ -116,5 +116,205 @@ module TestJoin @test join(dt, dtnull, on = :x) == DataTable([collect(1:10), collect(2:11), NullableArray(3:12)], [:x, :y, :z]) @test join(dtnull, dt, on = :x) == - DataTable([NullableArray(1:10), NullableArray(3:12), NullableArray(2:11)], [:x, :z, :y]) + DataTable([NullableArray(1:10), NullableArray(3:12), collect(2:11)], [:x, :z, :y]) + + @testset "all joins" begin + dt1 = DataTable(Any[[1, 3, 5], [1.0, 3.0, 5.0]], [:id, :fid]) + dt2 = DataTable(Any[[0, 1, 2, 3, 4], [0.0, 1.0, 2.0, 3.0, 4.0]], [:id, :fid]) + N = Nullable() + + @test join(dt1, dt2, kind=:cross) == + DataTable(Any[repeat([1, 3, 5], inner = 5), + repeat([1, 3, 5], inner = 5), + repeat([0, 1, 2, 3, 4], outer = 3), + repeat([0, 1, 2, 3, 4], outer = 3)], + [:id, :fid, :id_1, :fid_1]) + @test typeof.(join(dt1, dt2, kind=:cross).columns) == + [Vector{Int}, Vector{Float64}, Vector{Int}, Vector{Float64}] + + i(on) = join(dt1, dt2, on = on, kind = :inner) + l(on) = join(dt1, dt2, on = on, kind = :left) + r(on) = join(dt1, dt2, on = on, kind = :right) + o(on) = join(dt1, dt2, on = on, kind = :outer) + s(on) = join(dt1, dt2, on = on, kind = :semi) + a(on) = join(dt1, dt2, on = on, kind = :anti) + + @test s(:id) == + s(:fid) == + s([:id, :fid]) == DataTable(Any[[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(s(:id).columns) == + typeof.(s(:fid).columns) == + typeof.(s([:id, :fid]).columns) == [Vector{Int}, Vector{Float64}] + @test a(:id) == + a(:fid) == + a([:id, :fid]) == DataTable(Any[[5], [5]], [:id, :fid]) + @test typeof.(a(:id).columns) == + typeof.(a(:fid).columns) == + typeof.(a([:id, :fid]).columns) == [Vector{Int}, Vector{Float64}] + + on = :id + @test i(on) == DataTable(Any[[1, 3], [1, 3], [1, 3]], [:id, :fid, :fid_1]) + @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Float64}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5]), + fid_1 = NullableArray([1, 3, N])) + @test typeof.(l(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Float64}] + @test r(on) == DataTable(id = NullableArray([1, 3, 0, 2, 4]), + fid = NullableArray([1, 3, N, N, N]), + fid_1 = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Float64}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, 0, 2, 4]), + fid = NullableArray([1, 3, 5, N, N, N]), + fid_1 = NullableArray([1, 3, N, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Float64}] + + on = :fid + @test i(on) == DataTable(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1]) + @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Int}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5]), + id_1 = NullableArray([1, 3, N])) + @test typeof.(l(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Int}] + @test r(on) == DataTable(id = NullableArray([1, 3, N, N, N]), + fid = NullableArray([1, 3, 0, 2, 4]), + id_1 = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Int}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, N, N, N]), + fid = NullableArray([1, 3, 5, 0, 2, 4]), + id_1 = NullableArray([1, 3, N, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableVector{Int}, + NullableVector{Float64}, + NullableVector{Int}] + + on = [:id, :fid] + @test i(on) == DataTable(Any[[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5])) + @test typeof.(l(on).columns) == [NullableVector{Int}, + NullableVector{Float64}] + @test r(on) == DataTable(id = NullableArray([1, 3, 0, 2, 4]), + fid = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableVector{Int}, + NullableVector{Float64}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, 0, 2, 4]), + fid = NullableArray([1, 3, 5, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableVector{Int}, + NullableVector{Float64}] + end + + @testset "all joins with CategoricalArrays" begin + dt1 = DataTable(Any[CategoricalArray([1, 3, 5]), + CategoricalArray([1.0, 3.0, 5.0])], [:id, :fid]) + dt2 = DataTable(Any[CategoricalArray([0, 1, 2, 3, 4]), + CategoricalArray([0.0, 1.0, 2.0, 3.0, 4.0])], [:id, :fid]) + N = Nullable() + DRT = CategoricalArrays.DefaultRefType + + @test join(dt1, dt2, kind=:cross) == + DataTable(Any[repeat([1, 3, 5], inner = 5), + repeat([1, 3, 5], inner = 5), + repeat([0, 1, 2, 3, 4], outer = 3), + repeat([0, 1, 2, 3, 4], outer = 3)], + [:id, :fid, :id_1, :fid_1]) + @test typeof.(join(dt1, dt2, kind=:cross).columns) == + [CategoricalVector{i, DRT} for i in [Int, Float64, Int, Float64]] + + i(on) = join(dt1, dt2, on = on, kind = :inner) + l(on) = join(dt1, dt2, on = on, kind = :left) + r(on) = join(dt1, dt2, on = on, kind = :right) + o(on) = join(dt1, dt2, on = on, kind = :outer) + s(on) = join(dt1, dt2, on = on, kind = :semi) + a(on) = join(dt1, dt2, on = on, kind = :anti) + + @test s(:id) == + s(:fid) == + s([:id, :fid]) == DataTable(Any[[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(s(:id).columns) == + typeof.(s(:fid).columns) == + typeof.(s([:id, :fid]).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}] + @test a(:id) == + a(:fid) == + a([:id, :fid]) == DataTable(Any[[5], [5]], [:id, :fid]) + @test typeof.(a(:id).columns) == + typeof.(a(:fid).columns) == + typeof.(a([:id, :fid]).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}] + + on = :id + @test i(on) == DataTable(Any[[1, 3], [1, 3], [1, 3]], [:id, :fid, :fid_1]) + @test typeof.(i(on).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}, + CategoricalVector{Float64, DRT}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5]), + fid_1 = NullableArray([1, 3, N])) + @test typeof.(l(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Float64, DRT}] + @test r(on) == DataTable(id = NullableArray([1, 3, 0, 2, 4]), + fid = NullableArray([1, 3, N, N, N]), + fid_1 = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Float64, DRT}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, 0, 2, 4]), + fid = NullableArray([1, 3, 5, N, N, N]), + fid_1 = NullableArray([1, 3, N, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Float64, DRT}] + + on = :fid + @test i(on) == DataTable(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1]) + @test typeof.(i(on).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}, + CategoricalVector{Int, DRT}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5]), + id_1 = NullableArray([1, 3, N])) + @test typeof.(l(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Int, DRT}] + @test r(on) == DataTable(id = NullableArray([1, 3, N, N, N]), + fid = NullableArray([1, 3, 0, 2, 4]), + id_1 = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Int, DRT}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, N, N, N]), + fid = NullableArray([1, 3, 5, 0, 2, 4]), + id_1 = NullableArray([1, 3, N, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}, + NullableCategoricalVector{Int, DRT}] + + on = [:id, :fid] + @test i(on) == DataTable(Any[[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(i(on).columns) == [CategoricalVector{Int, DRT}, + CategoricalVector{Float64, DRT}] + @test l(on) == DataTable(id = NullableArray([1, 3, 5]), + fid = NullableArray([1, 3, 5])) + @test typeof.(l(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}] + @test r(on) == DataTable(id = NullableArray([1, 3, 0, 2, 4]), + fid = NullableArray([1, 3, 0, 2, 4])) + @test typeof.(r(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}] + @test o(on) == DataTable(id = NullableArray([1, 3, 5, 0, 2, 4]), + fid = NullableArray([1, 3, 5, 0, 2, 4])) + @test typeof.(o(on).columns) == [NullableCategoricalVector{Int, DRT}, + NullableCategoricalVector{Float64, DRT}] + end end