# Provenance (transcribed from the git patch this snippet was extracted from):
#   commit   f3ae44c3f680312ae75a5e7363de3c70bedeaf0c
#   author   chethega (Co-authored-by: Jameson Nash)
#   date     Sat, 4 Nov 2023 14:20:35 +0100
#   subject  changed broadcast! into bitarray algorithm (#32048)
#   target   base/broadcast.jl, inserted immediately before the generic
#            `copyto!(dest::BitArray, bc::Broadcasted{Nothing})` method
#
# Discussion referenced by the commit message:
#   https://discourse.julialang.org/t/broadcast-vs-slow-performance-allocations/24259/6
# Question of validity in view of exceptions:
#   https://github.com/JuliaLang/julia/issues/32047
#
# Benchmarks quoted in the commit message
# (y=1; xsmall=[1]; Random.seed!(42); xlarge=rand(1:4, 100_003)):
#   broadcast(==, xsmall, y)   before: 860.500 ns (3 allocations: 4.31 KiB)
#                              after:   65.466 ns (2 allocations: 128 bytes)
#   broadcast(==, xlarge, y)   before: 152.971 μs (3 allocations: 16.59 KiB)
#                              after:   42.927 μs (2 allocations: 12.41 KiB)
#   hash(broadcast(==, xlarge, y).chunks) = 0xaa3b5a692968e128 both before and after.
#
# NOTE: the method below differs from the committed version in one respect: elements are
# passed through `convert(Bool, x)` instead of the type assertion `x::Bool` (see below).
# The benchmark figures above were measured on the committed version.

# Performance optimization: for BitVector outputs, we cache the result
# in a 64-bit register before writing into memory (to bypass LSQ)
#
# Each 64-bit chunk of the result is accumulated in a local `UInt64` and written to
# `dest.chunks` with a single store.
#
# Arguments:
#   dest  destination `BitVector`; its chunks are overwritten in place.
#   bc    the broadcast to evaluate; `axes(bc)` must equal `axes(dest)`.
#
# Returns `dest` (or the result of `chunkedcopyto!(dest, bc)` when that path is taken).
#
# Errors:
#   * an axes mismatch is reported through `throwdm`;
#   * an element that cannot be converted exactly to `Bool` raises whatever
#     `convert(Bool, x)` raises.
#   Chunks are stored as they are completed, so if evaluating an element throws, the
#   chunks stored before that point stay modified in `dest`.
#
# Elements are converted with `convert(Bool, x)` rather than asserted with `x::Bool`, so
# that broadcasting exactly-convertible non-Bool values (e.g. `falses(3) .= [1, 0, 1]`)
# stores the converted bits, as ordinary BitArray element assignment does, instead of
# throwing a `TypeError`. For `Bool` elements the conversion is a no-op.
@inline function copyto!(dest::BitVector, bc::Broadcasted{Nothing})
    axes(dest) == axes(bc) || throwdm(axes(dest), axes(bc))
    # When `ischunkedbroadcast` accepts this broadcast, defer to `chunkedcopyto!`.
    ischunkedbroadcast(dest, bc) && return chunkedcopyto!(dest, bc)
    destc = dest.chunks
    bcp = preprocess(dest, bc)
    nbits = length(bcp)
    nbits <= 0 && return dest
    len = Base.num_bit_chunks(Int(nbits))
    # Every chunk but the last is full: gather 64 results into `z`, then store it once.
    @inbounds for i = 0:(len - 2)
        z = UInt64(0)
        for j = 0:63
            # `j & 63` is a no-op for j in 0:63; presumably it lets the compiler see
            # that the shift count is always in range.
            z |= UInt64(convert(Bool, bcp[i*64 + j + 1])) << (j & 63)
        end
        destc[i + 1] = z
    end
    # The last chunk may be partial: only bits 0 through (nbits - 1) & 63 are computed;
    # the remaining high bits of the stored word are zero because `z` starts at zero.
    @inbounds let i = len - 1
        z = UInt64(0)
        for j = 0:((nbits - 1) & 63)
            z |= UInt64(convert(Bool, bcp[i*64 + j + 1])) << (j & 63)
        end
        destc[i + 1] = z
    end
    return dest
end