Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run findall(rows) only if rows are not all true #2727

Merged
merged 17 commits into from
May 16, 2021
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,155 @@ function _nt_like_hash(v, h::UInt)

return xor(objectid(Tuple(propertynames(v))), h)
end

_findall(B) = findall(B)

@inline _blsr(x)= x & (x-1)
const _msk64 = ~UInt64(0)

# findall optimized for B::BitVector
# the main idea from Base.findall(B::BitArray)
function _findall(B::BitVector)::Union{UnitRange{Int}, Vector{Int}}
nnzB = count(B)
nnzB == 0 && return Int[]
nnzB == length(B) && return 1:length(B)
local I
Bc = B.chunks
Bi = 1 # block index
i1 = 1 # index of current block beginng in B
i = 1 # index of the _next_ one in I
c = Bc[1] # current block

start = -1 # the begining of ones block
stop = -1 # the end of ones block

@inbounds while true # I not materialized
if i > nnzB # all ones in B found
Ir = start:start + i - 2
@assert length(Ir) == nnzB
return Ir
end

if c == 0
if start != -1 && stop == -1
stop = i1 - 1
end
while c == 0 # no need to return here as we returned above
i1 += 64
Bi += 1
c = Bc[Bi]
end
end
if c == _msk64
if stop != -1
I = Vector{Int}(undef,nnzB)
I[1:i-1] .= start:stop
break
end
if start == -1
start = i1
end
while c == _msk64
if Bi == length(Bc)
Ir = start:length(B)
@assert length(Ir) == nnzB
return Ir
end

i += 64
i1 += 64
Bi += 1
c = Bc[Bi]
end
end
if c != 0 # mixed ones and zeros in block
tz = trailing_zeros(c)
lz = leading_zeros(c)
co = c >> tz == (one(UInt64) << (64 - lz - tz)) - 1 # block of countinous ones in c
if stop != -1 # already found block of ones and zeros, just not materialized
I = Vector{Int}(undef,nnzB)
I[1:i-1] .= start:stop
break
elseif !co # not countinous ones
I = Vector{Int}(undef,nnzB)
if start != -1
I[1:i-1] .= start:start + i - 2
end
break
else # countinous block of ones
if start != -1
if tz > 0 # like __1111__ or 111111__
I = Vector{Int}(undef,nnzB)
I[1:i-1] .= start:start + i - 2
break
else # lz > 0, like __111111
stop = i1 + (64 - lz) - 1
i += 64 - lz

# return if last block
if Bi == length(Bc)
Ir = start:stop
@assert length(Ir) == nnzB
return Ir
end

i1 += 64
Bi += 1
c = Bc[Bi]
end
else # start == -1
start = i1 + tz

if lz > 0 # like __111111 or like __1111__
stop = i1 + (64 - lz) - 1
i += stop - start + 1
else # like 111111__
i += 64 - tz
end

# return if last block
if Bi == length(Bc)
Ir = start:start + i - 2
@assert length(Ir) == nnzB
return Ir
end

i1 += 64
Bi += 1
c = Bc[Bi]
end
end
end
end
@inbounds while true # I materialized, process like in Base.findall
if i > nnzB # all ones in B found
@assert nnzB == i - 1
return I
end

while c == 0 # no need to return here as we returned above
i1 += 64
Bi += 1
c = Bc[Bi]
end

while c == _msk64
I[i:i+64-1] .= i1:(i1+64-1)
i += 64
if Bi == length(Bc)
@assert nnzB == i - 1
return I
end
i1 += 64
Bi += 1
c = Bc[Bi]
end

while c != 0
tz = trailing_zeros(c)
c = _blsr(c) # zeros last nonzero bit
I[i] = i1 + tz
i += 1
end
end
end
2 changes: 1 addition & 1 deletion src/subdataframe/subdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::Abstract
throw(ArgumentError("invalid length of `AbstractVector{Bool}` row index " *
"(got $(length(rows)), expected $(nrow(parent)))"))
end
return SubDataFrame(parent, findall(rows), cols)
return SubDataFrame(parent, _findall(rows), cols)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also test performance and corrected of view (it should be OK, but just to be sure as this is the point of the change we make so I prefer to double check). Thank you!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I'll check it with some more scenario than in the beginning of this PR

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fortunately it is faster :)

df = DataFrame(
    a = rand(1000000),
    b = allowmissing(rand(1000000)),
    c = [allowmissing(rand(1000000 - 1)); missing],
    d = repeat([1:9; missing], 100000),
    e = repeat([rand(50000); missings(50000)], 10),
    f = [missings(300000); ones(400000); missings(300000)],
    g = missings(1000000)
)

Main:

julia> @btime dropmissing(df, :a, view=true);
  1.872 ms (9 allocations: 7.75 MiB)

julia> @btime dropmissing(df, :b, view=true);
  2.075 ms (14 allocations: 7.75 MiB)

julia> @btime dropmissing(df, :c, view=true);
  2.111 ms (14 allocations: 7.75 MiB)

julia> @btime dropmissing(df, :d, view=true);
  1.913 ms (14 allocations: 6.99 MiB)

julia> @btime dropmissing(df, :e, view=true);
  1.117 ms (14 allocations: 3.94 MiB)

julia> @btime dropmissing(df, :f, view=true);
  909.713 μs (14 allocations: 3.18 MiB)
 
julia> @btime dropmissing(df, :g, view=true);
  152.216 μs (13 allocations: 126.67 KiB)

julia> @btime dropmissing(df, view=true);
  1.432 ms (38 allocations: 1.63 MiB)

Main + this PR:

julia> @btime dropmissing(df, :a, view=true);
  17.093 μs (7 allocations: 122.34 KiB)

julia> @btime dropmissing(df, :b, view=true);
  162.656 μs (12 allocations: 126.59 KiB)

julia> @btime dropmissing(df, :c, view=true);
  168.988 μs (12 allocations: 126.59 KiB)

julia> @btime dropmissing(df, :d, view=true);
  1.890 ms (14 allocations: 6.99 MiB)

julia> @btime dropmissing(df, :e, view=true);
  1.100 ms (14 allocations: 3.94 MiB)

julia> @btime dropmissing(df, :f, view=true);
  166.613 μs (12 allocations: 126.59 KiB)

julia> @btime dropmissing(df, :g, view=true);
  188.653 μs (13 allocations: 126.67 KiB)
  
julia> @btime dropmissing(df, view=true);
  1.410 ms (38 allocations: 1.63 MiB)

end

Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::AbstractVector, cols)
Expand Down
66 changes: 65 additions & 1 deletion test/utils.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module TestUtils

using Test, DataFrames
using Test, Random, DataFrames

@testset "make_unique" begin
@test DataFrames.make_unique([:x, :x, :x_1, :x2], makeunique=true) == [:x, :x_2, :x_1, :x2]
Expand Down Expand Up @@ -149,4 +149,68 @@ end
@test_throws AssertionError DataFrames.split_to_chunks(10, 11)
end

@testset "_findall(B::BitVector)" begin
Random.seed!(1234)
BD = Dict(
"Empty" => (BitVector([]), Vector{Int}),
"T Big" => (trues(100000), UnitRange{Int}),
"F Big" => (falses(100000), Vector{Int}),
"T64 F64" => ([trues(64); falses(64)], UnitRange{Int}),
"F64 T64" => ([falses(64); trues(64)], UnitRange{Int}),
"F80 T100" => ([falses(85); trues(100)], UnitRange{Int}),
"F256 T32" => ([falses(256); trues(32)], UnitRange{Int}),
"F260 T32" => ([falses(260); trues(32)], UnitRange{Int}),
"TF Big" => ([trues(100000); falses(100000)], UnitRange{Int}),
"FT Big" => ([falses(100000); trues(100000)], UnitRange{Int}),

# some edge cases
"TFT small" => ([trues(85); falses(100); trues(85)], Vector{Int}),
"FTFFT small" => ([falses(64 + 32); trues(32); falses(128); trues(32)], Vector{Int}),
"TFTF small" => ([falses(64); trues(64); falses(64); trues(64)], Vector{Int}),
"TFT small" => ([trues(64); falses(10); trues(100)], Vector{Int}),

"FTF Big" => ([falses(8500); trues(100000); falses(65000)], UnitRange{Int}),
"TFT Big" => ([trues(8500); falses(100000); trues(65000)], Vector{Int}),
"FTFTFTF Big" => ([falses(65000); trues(65000); falses(65000); trues(65000); falses(65000); trues(65000); falses(65000)], Vector{Int}),

"FTFR small" => ([falses(85); trues(100); falses(65); rand([true, false], 2000)], Vector{Int}),
"R Big" => (BitVector(rand([true, false], 2000000)), Vector{Int}),
"RF Big" => ([BitVector(rand([true, false], 1000000)); falses(1000000)], Vector{Int}),
"RT Big" => ([BitVector(rand([true, false], 1000000)); trues(1000000)], Vector{Int}),
"FR Big" => ([falses(1000000); BitVector(rand([true, false], 1000000))], Vector{Int}),
"TR Big" => ([trues(1000000); BitVector(rand([true, false], 1000000))], Vector{Int}),
"FRT Big" => ([falses(1000000); BitVector(rand([true, false], 1000000)); trues(1000000)], Vector{Int}),
"TRF Big" => ([trues(1000000); BitVector(rand([true, false], 1000000)); falses(1000000)], Vector{Int}),
"FRF Big" => ([falses(1000000); BitVector(rand([true, false], 1000000)); falses(1000000)], Vector{Int}),
"TRT Big" => ([trues(1000000); BitVector(rand([true, false], 1000000)); trues(1000000)], Vector{Int}),
"RFR Big" => ([BitVector(rand([true, false], 1000000)); falses(1000000); BitVector(rand([true, false], 1000000))], Vector{Int}),
"FTFR Big" => ([falses(65000); trues(65000); falses(65000); rand([true, false], 20000)], Vector{Int}),
"T256 R100" => ([trues(256); rand([true, false], 2000)], Vector{Int}),
"F256 R100" => ([falses(256); rand([true, false], 2000)], Vector{Int}),
)
for (_, (B, T)) in BD
res = DataFrames._findall(B)
@test Base.findall(B) == res
@test res isa T
end

# 1:200 is to test all small cases
# 1000 is to test skipping multiple 64-bit blocks of 0 and 1
for n in [1:200; 1000], i in 1:n, j in i:n
x = falses(n)
x[i:j] .= true
res = DataFrames._findall(x)
@test res == i:j
@test res isa UnitRange{Int}
if j + 1 < n
x[j + 2] = true
# add one false and then true
@test DataFrames._findall(x) == [i:j; j+2]
# sprinkle trues randomly after one false
rand!(view(x, j + 2:n), Bool)
@test DataFrames._findall(x) == findall(x)
end
end
end

end # module