Skip to content

Commit

Permalink
Run findall(rows) only if rows are not all true (#2727)
Browse files Browse the repository at this point in the history
  • Loading branch information
pstorozenko authored May 16, 2021
1 parent db8b3a9 commit ed00241
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 2 deletions.
152 changes: 152 additions & 0 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,155 @@ function _nt_like_hash(v, h::UInt)

return xor(objectid(Tuple(propertynames(v))), h)
end

_findall(B) = findall(B)

@inline _blsr(x) = x & (x-1)

# findall returning a range when possible (all true indices are contiguous), and optimized for B::BitVector
# the main idea is taken from Base.findall(B::BitArray)
function _findall(B::BitVector)::Union{UnitRange{Int}, Vector{Int}}
nnzB = count(B)
nnzB == 0 && return 1:0
nnzB == length(B) && return 1:length(B)
local I
Bc = B.chunks
Bi = 1 # block index
i1 = 1 # index of current block beginng in B
i = 1 # index of the _next_ one in I
c = Bc[1] # current block

start = -1 # the begining of ones block
stop = -1 # the end of ones block

@inbounds while true # I not materialized
if i > nnzB # all ones in B found
Ir = start:start + i - 2
@assert length(Ir) == nnzB
return Ir
end

if c == 0
if start != -1 && stop == -1
stop = i1 - 1
end
while c == 0 # no need to return here as we returned above
i1 += 64
Bi += 1
c = Bc[Bi]
end
end
if c == ~UInt64(0)
if stop != -1
I = Vector{Int}(undef,nnzB)
I[1:i-1] .= start:stop
break
end
if start == -1
start = i1
end
while c == ~UInt64(0)
if Bi == length(Bc)
Ir = start:length(B)
@assert length(Ir) == nnzB
return Ir
end

i += 64
i1 += 64
Bi += 1
c = Bc[Bi]
end
end
if c != 0 # mixed ones and zeros in block
tz = trailing_zeros(c)
lz = leading_zeros(c)
co = c >> tz == (one(UInt64) << (64 - lz - tz)) - 1 # block of countinous ones in c
if stop != -1 # already found block of ones and zeros, just not materialized
I = Vector{Int}(undef,nnzB)
I[1:i-1] .= start:stop
break
elseif !co # not countinous ones
I = Vector{Int}(undef,nnzB)
if start != -1
I[1:i-1] .= start:start + i - 2
end
break
else # countinous block of ones
if start != -1
if tz > 0 # like __1111__ or 111111__
I = Vector{Int}(undef,nnzB)
I[1:i-1] .= start:start + i - 2
break
else # lz > 0, like __111111
stop = i1 + (64 - lz) - 1
i += 64 - lz

# return if last block
if Bi == length(Bc)
Ir = start:stop
@assert length(Ir) == nnzB
return Ir
end

i1 += 64
Bi += 1
c = Bc[Bi]
end
else # start == -1
start = i1 + tz

if lz > 0 # like __111111 or like __1111__
stop = i1 + (64 - lz) - 1
i += stop - start + 1
else # like 111111__
i += 64 - tz
end

# return if last block
if Bi == length(Bc)
Ir = start:start + i - 2
@assert length(Ir) == nnzB
return Ir
end

i1 += 64
Bi += 1
c = Bc[Bi]
end
end
end
end
@inbounds while true # I materialized, process like in Base.findall
if i > nnzB # all ones in B found
@assert nnzB == i - 1
return I
end

while c == 0 # no need to return here as we returned above
i1 += 64
Bi += 1
c = Bc[Bi]
end

while c == ~UInt64(0)
I[i:i+64-1] .= i1:(i1+64-1)
i += 64
if Bi == length(Bc)
@assert nnzB == i - 1
return I
end
i1 += 64
Bi += 1
c = Bc[Bi]
end

while c != 0
tz = trailing_zeros(c)
c = _blsr(c) # zeros last nonzero bit
I[i] = i1 + tz
i += 1
end
end
@assert false "should not be reached"
end
2 changes: 1 addition & 1 deletion src/subdataframe/subdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::Abstract
throw(ArgumentError("invalid length of `AbstractVector{Bool}` row index " *
"(got $(length(rows)), expected $(nrow(parent)))"))
end
return SubDataFrame(parent, findall(rows), cols)
return SubDataFrame(parent, _findall(rows), cols)
end

Base.@propagate_inbounds function SubDataFrame(parent::DataFrame, rows::AbstractVector, cols)
Expand Down
66 changes: 65 additions & 1 deletion test/utils.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module TestUtils

using Test, DataFrames
using Test, Random, DataFrames

@testset "make_unique" begin
@test DataFrames.make_unique([:x, :x, :x_1, :x2], makeunique=true) == [:x, :x_2, :x_1, :x2]
Expand Down Expand Up @@ -149,4 +149,68 @@ end
@test_throws AssertionError DataFrames.split_to_chunks(10, 11)
end

@testset "_findall(B::BitVector)" begin
Random.seed!(1234)
BD = Dict(
"Empty" => (BitVector([]), UnitRange{Int}),
"T Big" => (trues(100000), UnitRange{Int}),
"F Big" => (falses(100000), UnitRange{Int}),
"T64 F64" => ([trues(64); falses(64)], UnitRange{Int}),
"F64 T64" => ([falses(64); trues(64)], UnitRange{Int}),
"F80 T100" => ([falses(85); trues(100)], UnitRange{Int}),
"F256 T32" => ([falses(256); trues(32)], UnitRange{Int}),
"F260 T32" => ([falses(260); trues(32)], UnitRange{Int}),
"TF Big" => ([trues(100000); falses(100000)], UnitRange{Int}),
"FT Big" => ([falses(100000); trues(100000)], UnitRange{Int}),

# some edge cases
"TFT small" => ([trues(85); falses(100); trues(85)], Vector{Int}),
"FTFFT small" => ([falses(64 + 32); trues(32); falses(128); trues(32)], Vector{Int}),
"TFTF small" => ([falses(64); trues(64); falses(64); trues(64)], Vector{Int}),
"TFT small" => ([trues(64); falses(10); trues(100)], Vector{Int}),

"FTF Big" => ([falses(8500); trues(100000); falses(65000)], UnitRange{Int}),
"TFT Big" => ([trues(8500); falses(100000); trues(65000)], Vector{Int}),
"FTFTFTF Big" => ([falses(65000); trues(65000); falses(65000); trues(65000); falses(65000); trues(65000); falses(65000)], Vector{Int}),

"FTFR small" => ([falses(85); trues(100); falses(65); rand([true, false], 2000)], Vector{Int}),
"R Big" => (BitVector(rand([true, false], 2000000)), Vector{Int}),
"RF Big" => ([BitVector(rand([true, false], 1000000)); falses(1000000)], Vector{Int}),
"RT Big" => ([BitVector(rand([true, false], 1000000)); trues(1000000)], Vector{Int}),
"FR Big" => ([falses(1000000); BitVector(rand([true, false], 1000000))], Vector{Int}),
"TR Big" => ([trues(1000000); BitVector(rand([true, false], 1000000))], Vector{Int}),
"FRT Big" => ([falses(1000000); BitVector(rand([true, false], 1000000)); trues(1000000)], Vector{Int}),
"TRF Big" => ([trues(1000000); BitVector(rand([true, false], 1000000)); falses(1000000)], Vector{Int}),
"FRF Big" => ([falses(1000000); BitVector(rand([true, false], 1000000)); falses(1000000)], Vector{Int}),
"TRT Big" => ([trues(1000000); BitVector(rand([true, false], 1000000)); trues(1000000)], Vector{Int}),
"RFR Big" => ([BitVector(rand([true, false], 1000000)); falses(1000000); BitVector(rand([true, false], 1000000))], Vector{Int}),
"FTFR Big" => ([falses(65000); trues(65000); falses(65000); rand([true, false], 20000)], Vector{Int}),
"T256 R100" => ([trues(256); rand([true, false], 2000)], Vector{Int}),
"F256 R100" => ([falses(256); rand([true, false], 2000)], Vector{Int}),
)
for (_, (B, T)) in BD
res = DataFrames._findall(B)
@test Base.findall(B) == res
@test res isa T
end

# 1:200 is to test all small cases
# 1000 is to test skipping multiple 64-bit blocks of 0 and 1
for n in [1:200; 1000], i in 1:n, j in i:n
x = falses(n)
x[i:j] .= true
res = DataFrames._findall(x)
@test res == i:j
@test res isa UnitRange{Int}
if j + 1 < n
x[j + 2] = true
# add one false and then true
@test DataFrames._findall(x) == [i:j; j+2]
# sprinkle trues randomly after one false
rand!(view(x, j + 2:n), Bool)
@test DataFrames._findall(x) == findall(x)
end
end
end

end # module

0 comments on commit ed00241

Please sign in to comment.