From 8f923a6d6385f07cfe4d6f27ee1fe80e2b58ad30 Mon Sep 17 00:00:00 2001 From: chethega Date: Thu, 16 May 2019 12:57:36 +0200 Subject: [PATCH 1/8] changed broadcast! into bitarray algorithm --- base/broadcast.jl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index 4d5ca6b92fe19..ce16c9edb38b3 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -895,23 +895,23 @@ end @inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing}) axes(dest) == axes(bc) || throwdm(axes(dest), axes(bc)) ischunkedbroadcast(dest, bc) && return chunkedcopyto!(dest, bc) - tmp = Vector{Bool}(undef, bitcache_size) destc = dest.chunks ind = cind = 1 - bc′ = preprocess(dest, bc) - @simd for I in eachindex(bc′) - @inbounds tmp[ind] = bc′[I] - ind += 1 - if ind > bitcache_size - dumpbitcache(destc, cind, tmp) - cind += bitcache_chunks - ind = 1 + bcp = preprocess(dest, bc) + length(bcp)<=0 && return dest + @inbounds for i = 0:Base.num_bit_chunks(length(bcp))-2 + z = UInt64(0) + for j=0:63 + z |= (bcp[i*64 + j + 1]::Bool) << (j&63) end + destc[i+1] = z end - if ind > 1 - @inbounds tmp[ind:bitcache_size] .= false - dumpbitcache(destc, cind, tmp) + i = Base.num_bit_chunks(length(bcp))-1 + z = UInt64(0) + @inbounds for j=0:(length(bcp)-1)&63 + z |= (bcp[i*64 + j + 1]::Bool) << (j&63) end + @inbounds destc[i+1] = z return dest end From b9c69425839b7ebe4bc4414b329930c7822ea890 Mon Sep 17 00:00:00 2001 From: chethega Date: Thu, 16 May 2019 14:48:15 +0200 Subject: [PATCH 2/8] restrict new alg to bitvector --- base/broadcast.jl | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index ce16c9edb38b3..e15debbcb9174 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -890,15 +890,14 @@ preprocess_args(dest, args::Tuple{}) = () return dest end -# Performance optimization: for BitArray outputs, we cache the result -# in a "small" Vector{Bool}, and then copy in chunks into the output -@inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing}) +# Performance optimization: for BitVector outputs, we cache the result +# in a 64-bit register before writing into memory (to bypass LSQ) +@inline function copyto!(dest::BitVector, bc::Broadcasted{Nothing}) axes(dest) == axes(bc) || throwdm(axes(dest), axes(bc)) ischunkedbroadcast(dest, bc) && return chunkedcopyto!(dest, bc) destc = dest.chunks - ind = cind = 1 + length(destc)<=0 && return dest bcp = preprocess(dest, bc) - length(bcp)<=0 && return dest @inbounds for i = 0:Base.num_bit_chunks(length(bcp))-2 z = UInt64(0) for j=0:63 @@ -915,6 +914,31 @@ end return dest end +# Performance optimization: for BitArray outputs, we cache the result +# in a "small" Vector{Bool}, and then copy in chunks into the output +@inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing}) + axes(dest) == axes(bc) || throwdm(axes(dest), axes(bc)) + ischunkedbroadcast(dest, bc) && return chunkedcopyto!(dest, bc) + tmp = Vector{Bool}(undef, bitcache_size) + destc = dest.chunks + ind = cind = 1 + bc′ = preprocess(dest, bc) + @simd for I in eachindex(bc′) + @inbounds tmp[ind] = bc′[I] + ind += 1 + if ind > bitcache_size + dumpbitcache(destc, cind, tmp) + cind += bitcache_chunks + ind = 1 + end + end + if ind > 1 + @inbounds tmp[ind:bitcache_size] .= false + dumpbitcache(destc, cind, tmp) + end + return dest +end + # For some BitArray operations, we can work at the level of chunks. The trivial # implementation just walks over the UInt64 chunks in a linear fashion. # This requires three things: From 19ba49d2a187ab7033599b122d5fe59d33fc98e6 Mon Sep 17 00:00:00 2001 From: chethega Date: Thu, 16 May 2019 16:28:26 +0200 Subject: [PATCH 3/8] fix length checks --- base/broadcast.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index e15debbcb9174..3528144009bb1 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -898,14 +898,14 @@ end destc = dest.chunks length(destc)<=0 && return dest bcp = preprocess(dest, bc) - @inbounds for i = 0:Base.num_bit_chunks(length(bcp))-2 + @inbounds for i = 0:length(destc)-2 z = UInt64(0) for j=0:63 z |= (bcp[i*64 + j + 1]::Bool) << (j&63) end destc[i+1] = z end - i = Base.num_bit_chunks(length(bcp))-1 + i = length(destc)-1 z = UInt64(0) @inbounds for j=0:(length(bcp)-1)&63 z |= (bcp[i*64 + j + 1]::Bool) << (j&63) From 95a593aaaf1c15cfacae7c2d9ba660b69798a08c Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Mon, 30 Oct 2023 20:31:55 -0400 Subject: [PATCH 4/8] Update broadcast.jl --- base/broadcast.jl | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index 3528144009bb1..82f4dafe212ef 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -898,19 +898,21 @@ end destc = dest.chunks length(destc)<=0 && return dest bcp = preprocess(dest, bc) - @inbounds for i = 0:length(destc)-2 + len = num_bit_chunks(length(bcp)) + @inbounds for i = 0:(len - 2) z = UInt64(0) - for j=0:63 - z |= (bcp[i*64 + j + 1]::Bool) << (j&63) + for j = 0:63 + z |= (bcp[i*64 + j + 1]::Bool) << (j & 63) end - destc[i+1] = z + destc[i + 1] = z end - i = length(destc)-1 - z = UInt64(0) - @inbounds for j=0:(length(bcp)-1)&63 - z |= (bcp[i*64 + j + 1]::Bool) << (j&63) + @inbounds let i = len - 1 + z = UInt64(0) + for j = 0:((length(bcp) - 1) & 63) + z |= (bcp[i*64 + j + 1]::Bool) << (j & 63) + end + destc[i + 1] = z end - @inbounds destc[i+1] = z return dest end From 1c361c5d12e15ba47b22716c49aec305fe6e2ea6 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Mon, 30 Oct 2023 20:32:45 -0400 Subject: [PATCH 5/8] Update broadcast.jl --- base/broadcast.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index 82f4dafe212ef..61e346ad81e5f 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -896,8 +896,8 @@ end axes(dest) == axes(bc) || throwdm(axes(dest), axes(bc)) ischunkedbroadcast(dest, bc) && return chunkedcopyto!(dest, bc) destc = dest.chunks - length(destc)<=0 && return dest bcp = preprocess(dest, bc) + length(bcp) <= 0 && return dest len = num_bit_chunks(length(bcp)) @inbounds for i = 0:(len - 2) z = UInt64(0) From 4e9feeb86e33c6e8f182285d82f927560c84b3fc Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Mon, 30 Oct 2023 20:52:25 -0400 Subject: [PATCH 6/8] Update broadcast.jl --- base/broadcast.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index 77d9dd4dda094..36fcc2b7eeed4 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -982,7 +982,7 @@ end destc = dest.chunks bcp = preprocess(dest, bc) length(bcp) <= 0 && return dest - len = num_bit_chunks(length(bcp)) + len = Base.num_bit_chunks(length(bcp)) @inbounds for i = 0:(len - 2) z = UInt64(0) for j = 0:63 From 1f7855318ab1e31737f27ed8ba008f1432bf3ca2 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 1 Nov 2023 10:16:04 -0400 Subject: [PATCH 7/8] Update base/broadcast.jl --- base/broadcast.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index 36fcc2b7eeed4..4cf647bdbc0ed 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -982,7 +982,7 @@ end destc = dest.chunks bcp = preprocess(dest, bc) length(bcp) <= 0 && return dest - len = Base.num_bit_chunks(length(bcp)) + len = Base.num_bit_chunks(Int(length(bcp))) @inbounds for i = 0:(len - 2) z = UInt64(0) for j = 0:63 From b72db5cf4fc968c7b43d575962378c737eb5fe5c Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 2 Nov 2023 10:11:24 -0400 Subject: [PATCH 8/8] Update broadcast.jl --- base/broadcast.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index 4cf647bdbc0ed..d6e5513889cee 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -986,14 +986,14 @@ end @inbounds for i = 0:(len - 2) z = UInt64(0) for j = 0:63 - z |= (bcp[i*64 + j + 1]::Bool) << (j & 63) + z |= UInt64(bcp[i*64 + j + 1]::Bool) << (j & 63) end destc[i + 1] = z end @inbounds let i = len - 1 z = UInt64(0) for j = 0:((length(bcp) - 1) & 63) - z |= (bcp[i*64 + j + 1]::Bool) << (j & 63) + z |= UInt64(bcp[i*64 + j + 1]::Bool) << (j & 63) end destc[i + 1] = z end