From c54b40b1d3e9ee3d5400fe74f86d2bc7dd7fe19e Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Wed, 6 Feb 2019 02:21:41 -0500
Subject: [PATCH 1/5] Workaround #28126, support SIMDing broadcast in more
 cases

This is an ugly performance hack around issue #28126 in some limited (but common) cases. The problem in short: when given many arrays of the same size, LLVM has difficulty hoisting the decision of whether a given dimension should be "extruded" out of the loop. This extra indirection in the index computation seems to foil the array bounds aliasing checks, which stymies SIMDification. The solution: check to see if _Julia_ can statically decide whether or not to extrude any dimensions in a given broadcast expression -- and if so, use a special array wrapper that flags that none of the dimensions in that array need to be extruded out in order to perform the broadcast.
---
 base/broadcast.jl | 51 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index d284cc52081ae..3b74e59e6b0a8 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -864,13 +864,14 @@ broadcast_unalias(::Nothing, src) = src
 
 # Preprocessing a `Broadcasted` does two things:
 # * unaliases any arguments from `dest`
-# * "extrudes" the arguments where it is advantageous to pre-compute the broadcasted indices
-@inline preprocess(dest, bc::Broadcasted{Style}) where {Style} = Broadcasted{Style}(bc.f, preprocess_args(dest, bc.args), bc.axes)
-preprocess(dest, x) = extrude(broadcast_unalias(dest, x))
+# * calls `f` on the arguments (typically `extrude`, which pre-computes the broadcasted indices where advantageous)
+@inline preprocess(dest, bc) = preprocess(extrude, dest, bc)
+@inline preprocess(f, dest, bc::Broadcasted{Style}) where {Style} = Broadcasted{Style}(bc.f, preprocess_args(f, dest, bc.args), bc.axes)
+preprocess(f, dest, x) = f(broadcast_unalias(dest, x))
 
-@inline preprocess_args(dest, args::Tuple) = (preprocess(dest, args[1]), preprocess_args(dest, tail(args))...)
-preprocess_args(dest, args::Tuple{Any}) = (preprocess(dest, args[1]),)
-preprocess_args(dest, args::Tuple{}) = ()
+@inline preprocess_args(f, dest, args::Tuple) = (preprocess(f, dest, args[1]), preprocess_args(f, dest, tail(args))...)
+preprocess_args(f, dest, args::Tuple{Any}) = (preprocess(f, dest, args[1]),)
+preprocess_args(f, dest, args::Tuple{}) = ()
 
 # Specialize this method if all you want to do is specialize on typeof(dest)
 @inline function copyto!(dest::AbstractArray, bc::Broadcasted{Nothing})
@@ -882,13 +883,45 @@ preprocess_args(dest, args::Tuple{}) = ()
             return copyto!(dest, A)
         end
     end
-    bc′ = preprocess(dest, bc)
-    @simd for I in eachindex(bc′)
-        @inbounds dest[I] = bc′[I]
+    # Ugly performance hack around issue #28126: determine if all arguments to the
+    # broadcast are sized such that the broadcasting core can statically determine
+    # whether a given dimension is "extruded" or not. If so, we don't need to check
+    # any array sizes within the inner loop. Ideally this really should be something
+    # that Julia and/or LLVM could figure out and eliminate... and indeed they can
+    # for limited numbers of arguments.
+    if _is_static_broadcast_28126(dest, bc)
+        bcs′ = preprocess(_nonextrude_28126, dest, bc)
+        @simd for I in eachindex(bcs′)
+            @inbounds dest[I] = bcs′[I]
+        end
+    else
+        bc′ = preprocess(extrude, dest, bc)
+        @simd for I in eachindex(bc′)
+            @inbounds dest[I] = bc′[I]
+        end
     end
     return dest
 end
 
+@inline _is_static_broadcast_28126(dest, bc::Broadcasted{Style}) where {Style} = _is_static_broadcast_28126_args(dest, bc.args)
+_is_static_broadcast_28126(dest, x) = false
+_is_static_broadcast_28126(dest, x::Union{Ref, Tuple, Type, Number, AbstractArray{<:Any,0}}) = true
+_is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray{<:Any,0}) = true
+_is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray{<:Any,1}) = axes(dest, 1) == axes(x, 1)
+_is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray) = axes(dest) == axes(x) # This can be better with other missing dimensions
+
+@inline _is_static_broadcast_28126_args(dest, args::Tuple) = _is_static_broadcast_28126(dest, args[1]) && _is_static_broadcast_28126_args(dest, tail(args))
+_is_static_broadcast_28126_args(dest, args::Tuple{Any}) = _is_static_broadcast_28126(dest, args[1])
+_is_static_broadcast_28126_args(dest, args::Tuple{}) = true
+
+struct _NonExtruded28126{T}
+    x::T
+end
+@inline axes(b::_NonExtruded28126) = axes(b.x)
+Base.@propagate_inbounds _broadcast_getindex(b::_NonExtruded28126, i) = b.x[i]
+_nonextrude_28126(x::AbstractArray) = _NonExtruded28126(x)
+_nonextrude_28126(x) = x
+
 # Performance optimization: for BitArray outputs, we cache the result
 # in a "small" Vector{Bool}, and then copy in chunks into the output
 @inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing})

From 31207246c9b7f7a306f1a43859cce5d39d0cf764 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Wed, 6 Feb 2019 03:29:05 -0500
Subject: [PATCH 2/5] Fixup NonExtruded indexing

---
 base/broadcast.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index 3b74e59e6b0a8..89e0db7c5c17e 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -918,7 +918,10 @@ struct _NonExtruded28126{T}
     x::T
 end
 @inline axes(b::_NonExtruded28126) = axes(b.x)
-Base.@propagate_inbounds _broadcast_getindex(b::_NonExtruded28126, i) = b.x[i]
+Base.@propagate_inbounds _broadcast_getindex(b::_NonExtruded28126, i) = _broadcast_getindex(b, i)
+Base.@propagate_inbounds _broadcast_getindex(b::_NonExtruded28126{<:AbstractArray{<:Any,0}}, i) = b.x[]
+Base.@propagate_inbounds _broadcast_getindex(b::_NonExtruded28126{<:AbstractVector}, i) = b.x[i[1]]
+Base.@propagate_inbounds _broadcast_getindex(b::_NonExtruded28126{<:AbstractArray}, i) = b.x[i]
 _nonextrude_28126(x::AbstractArray) = _NonExtruded28126(x)
 _nonextrude_28126(x) = x
 

From 4261ac8c0d14f61f8a32126d69e15fa640245286 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Wed, 6 Feb 2019 10:30:05 -0500
Subject: [PATCH 3/5] inline and add test case

---
 base/broadcast.jl |  2 +-
 test/broadcast.jl | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index 89e0db7c5c17e..7885351135d58 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -870,7 +870,7 @@ broadcast_unalias(::Nothing, src) = src
 preprocess(f, dest, x) = f(broadcast_unalias(dest, x))
 
 @inline preprocess_args(f, dest, args::Tuple) = (preprocess(f, dest, args[1]), preprocess_args(f, dest, tail(args))...)
-preprocess_args(f, dest, args::Tuple{Any}) = (preprocess(f, dest, args[1]),)
+@inline preprocess_args(f, dest, args::Tuple{Any}) = (preprocess(f, dest, args[1]),)
 preprocess_args(f, dest, args::Tuple{}) = ()
 
 # Specialize this method if all you want to do is specialize on typeof(dest)
diff --git a/test/broadcast.jl b/test/broadcast.jl
index a8de5518f3724..a18c5fabca662 100644
--- a/test/broadcast.jl
+++ b/test/broadcast.jl
@@ -790,6 +790,17 @@ let
     @test Dict(c .=> d) == Dict("foo" => 1, "bar" => 2)
 end
 
+@testset "large fusions vectorize and don't allocate (#28126)" begin
+    u, k1, k2, k3, k4, k5, k6, k7 = (ones(1000) for i in 1:8)
+    function goo(u, k1, k2, k3, k4, k5, k6, k7)
+        @. u = 0.1*(0.1*k1 + 0.2*k2 + 0.3*k3 + 0.4*k4 + 0.5*k5 + 0.6*k6 + 0.7*k7)
+        nothing
+    end
+    @allocated goo(u, k1, k2, k3, k4, k5, k6, k7)
+    @test @allocated(goo(u, k1, k2, k3, k4, k5, k6, k7)) == 0
+    @test occursin("vector.body", sprint(code_llvm, goo, NTuple{8, Vector{Float32}}))
+end
+
 # Broadcasted iterable/indexable APIs
 let
     bc = Broadcast.instantiate(Broadcast.broadcasted(+, zeros(5), 5))

From 49446f46d552a4217276941a7da1a75cfb54bfaf Mon Sep 17 00:00:00 2001
From: Yingbo Ma <mayingbo5@gmail.com>
Date: Thu, 7 Feb 2019 00:33:01 -0500
Subject: [PATCH 4/5] Inline the whole broadcast expression to avoid allocation
 (#30986)

* Inline the whole broadcast expression to avoid allocation

`map` has an inline limit of 16. To make sure that the whole broadcast
tree gets inlined properly, I added the `_inlined_map` function. I am not
sure if it is a good idea, but worth trying.

This PR solves the issue which I have mentioned in
https://github.com/JuliaLang/julia/pull/30973/commits/2693778d0fc8696280b540e9135ea0220cf6d6a7#issuecomment-461248258

```julia
julia> @allocated foo(tmp, uprev, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13,
                    k14, k15, k16, k17, k18, k19, k20, k21, k22, k23, k24, k25,
                    k26, k27, k28, k29, k30, k31, k32, k33, k34)
0
```

* Fix CI failure

* Stricter test (vectorization & no allocation for a 9-array bc)

* rm `_inlined_map`
---
 base/broadcast.jl |  7 ++++---
 test/broadcast.jl | 13 +++++++------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index 7885351135d58..b6280745be288 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -464,7 +464,8 @@ julia> Broadcast.combine_axes(1, 1, 1)
 ()
 ```
 """
-@inline combine_axes(A, B...) = broadcast_shape(axes(A), combine_axes(B...))
+@inline combine_axes(A, B, C...) = broadcast_shape(axes(A), combine_axes(B, C...))
+@inline combine_axes(A, B) = broadcast_shape(axes(A), axes(B))
 combine_axes(A) = axes(A)
 
 # shape (i.e., tuple-of-indices) inputs
@@ -502,7 +503,7 @@ function check_broadcast_shape(shp, Ashp::Tuple)
     _bcsm(shp[1], Ashp[1]) || throw(DimensionMismatch("array could not be broadcast to match destination"))
     check_broadcast_shape(tail(shp), tail(Ashp))
 end
-check_broadcast_axes(shp, A) = check_broadcast_shape(shp, axes(A))
+@inline check_broadcast_axes(shp, A) = check_broadcast_shape(shp, axes(A))
 # comparing many inputs
 @inline function check_broadcast_axes(shp, A, As...)
     check_broadcast_axes(shp, A)
@@ -911,7 +912,7 @@ _is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray{<:Any,1}) = axe
 _is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray) = axes(dest) == axes(x) # This can be better with other missing dimensions
 
 @inline _is_static_broadcast_28126_args(dest, args::Tuple) = _is_static_broadcast_28126(dest, args[1]) && _is_static_broadcast_28126_args(dest, tail(args))
-_is_static_broadcast_28126_args(dest, args::Tuple{Any}) = _is_static_broadcast_28126(dest, args[1])
+@inline _is_static_broadcast_28126_args(dest, args::Tuple{Any}) = _is_static_broadcast_28126(dest, args[1])
 _is_static_broadcast_28126_args(dest, args::Tuple{}) = true
 
 struct _NonExtruded28126{T}
diff --git a/test/broadcast.jl b/test/broadcast.jl
index a18c5fabca662..483e3ebe01ee5 100644
--- a/test/broadcast.jl
+++ b/test/broadcast.jl
@@ -791,14 +791,15 @@ let
 end
 
 @testset "large fusions vectorize and don't allocate (#28126)" begin
-    u, k1, k2, k3, k4, k5, k6, k7 = (ones(1000) for i in 1:8)
-    function goo(u, k1, k2, k3, k4, k5, k6, k7)
-        @. u = 0.1*(0.1*k1 + 0.2*k2 + 0.3*k3 + 0.4*k4 + 0.5*k5 + 0.6*k6 + 0.7*k7)
+    using InteractiveUtils: code_llvm
+    u, uprev, k1, k2, k3, k4, k5, k6, k7 = (ones(1000) for i in 1:9)
+    function goo(u, uprev, k1, k2, k3, k4, k5, k6, k7)
+        @. u = uprev + 0.1*(0.1*k1 + 0.2*k2 + 0.3*k3 + 0.4*k4 + 0.5*k5 + 0.6*k6 + 0.7*k7)
         nothing
     end
-    @allocated goo(u, k1, k2, k3, k4, k5, k6, k7)
-    @test @allocated(goo(u, k1, k2, k3, k4, k5, k6, k7)) == 0
-    @test occursin("vector.body", sprint(code_llvm, goo, NTuple{8, Vector{Float32}}))
+    @allocated goo(u, uprev, k1, k2, k3, k4, k5, k6, k7)
+    @test @allocated(goo(u, uprev, k1, k2, k3, k4, k5, k6, k7)) == 0
+    @test occursin("vector.body", sprint(code_llvm, goo, NTuple{9, Vector{Float32}}))
 end
 
 # Broadcasted iterable/indexable APIs

From 8aa59077daa13657f627b3f795fbe9bf1fc1e093 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Thu, 14 Feb 2019 14:44:35 -0500
Subject: [PATCH 5/5] Move vectorization test to boundscheck

---
 test/boundscheck_exec.jl | 8 ++++++++
 test/broadcast.jl        | 2 --
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/test/boundscheck_exec.jl b/test/boundscheck_exec.jl
index 62a20921bd44e..f86baca82bbef 100644
--- a/test/boundscheck_exec.jl
+++ b/test/boundscheck_exec.jl
@@ -251,5 +251,13 @@ if bc_opt == bc_default || bc_opt == bc_off
     @test occursin("vector.body", sprint(code_llvm, g27079, Tuple{Vector{Int}}))
 end
 
+# Ensure broadcasting can vectorize when bounds checks are off
+if bc_opt != bc_on
+    function goo28126(u, uprev, k1, k2, k3, k4, k5, k6, k7)
+        @. u = uprev + 0.1*(0.1*k1 + 0.2*k2 + 0.3*k3 + 0.4*k4 + 0.5*k5 + 0.6*k6 + 0.7*k7)
+        nothing
+    end
+    @test occursin("vector.body", sprint(code_llvm, goo28126, NTuple{9, Vector{Float32}}))
+end
 
 end
diff --git a/test/broadcast.jl b/test/broadcast.jl
index 483e3ebe01ee5..0430dd2eb4bd6 100644
--- a/test/broadcast.jl
+++ b/test/broadcast.jl
@@ -791,7 +791,6 @@ let
 end
 
 @testset "large fusions vectorize and don't allocate (#28126)" begin
-    using InteractiveUtils: code_llvm
     u, uprev, k1, k2, k3, k4, k5, k6, k7 = (ones(1000) for i in 1:9)
     function goo(u, uprev, k1, k2, k3, k4, k5, k6, k7)
         @. u = uprev + 0.1*(0.1*k1 + 0.2*k2 + 0.3*k3 + 0.4*k4 + 0.5*k5 + 0.6*k6 + 0.7*k7)
@@ -799,7 +798,6 @@ end
     end
     @allocated goo(u, uprev, k1, k2, k3, k4, k5, k6, k7)
     @test @allocated(goo(u, uprev, k1, k2, k3, k4, k5, k6, k7)) == 0
-    @test occursin("vector.body", sprint(code_llvm, goo, NTuple{9, Vector{Float32}}))
 end
 
 # Broadcasted iterable/indexable APIs