From e9b8765406fa1e4087b90dff1c518ce30d2b4b03 Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Sun, 13 Apr 2014 01:06:51 +0200 Subject: [PATCH 01/12] permutedims2 --- base/multidimensional.jl | 125 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 0b78db5b085d0..43c831ef3b3d3 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -441,6 +441,8 @@ end ## permutedims + + for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm) dimsB = size(B) @@ -478,6 +480,129 @@ for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} end end + +@ngenerate N typeof(P) function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm) + length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") + isperm(perm) || error("input is not a permutation") + dims = size(P) + for i = 1:N + dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) + end + stridesP=strides(P) + stridesB=strides(B)[perm] + + if isa(P,BitArray) + elszP=1 + else + elszP=sizeof(T1) + end + if isa(B,BitArray) + elszB=1 + else + elszB=sizeof(T2) + end + bdims=blockdims(dims,elszP,stridesP,elszB,stridesB) + # bdims=blockdims(dims,sizeof(T1),stridesP,sizeof(T2),stridesB) + + #calculates all the strides and dims as variables + @nexprs N d->(stridesB_{d} = stride(B, perm[d])) + @nexprs N d->(stridesP_{d} = stride(P, d)) + @nexprs N d->(dims_{d} = dims[d]) + @nexprs N d->(bdims_{d} = bdims[d]) + + if isa(B, SubArray) + offsetB = B.first_index + B = B.parent + else + offsetB = 1 + end + if isa(P, SubArray) + offsetP = P.first_index + P = P.parent + else + offsetP = 1 + end + + @nexprs 1 d->(indB_{N} = offsetB) + @nexprs 1 d->(indP_{N} = offsetP) + @nloops(N, outer, d->1:bdims_{d}:dims_{d}, + d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE + d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST + begin # BODY + @nexprs 1 e->(ind2B_{N} = indB_0) + @nexprs 1 e->(ind2P_{N} = indP_0) + @nloops(N, inner, e->outer_{e}:min(outer_{e}+bdims_{e}-1,dims_{e}), + e->(ind2B_{e-1} = ind2B_{e};ind2P_{e-1}=ind2P_{e}), # PRE + e->(ind2B_{e} += stridesB_{e};ind2P_{e} += stridesP_{e}), # POST + @inbounds P[ind2P_0]=B[ind2B_0]) #BODY + end) + return P +end + +function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},elszB::Int,stridesB::NTuple{N,Int}) + # blocking strategy for permutedims + if N==0 + return () + else + pA=sortperm(collect(stridesA)) + pB=sortperm(collect(stridesB)) + + cacheline=64 + # determine cache + effectivecachesize=25600 # 64*400 = ifloor(cachesize/1.28) with cachesize=32k and 1.28 safety margin to prevent complete cachefill + + # if smallest stride of A or B is not 1, then the effect size a subblock of A + # or B will take in the cache depends not only on the element size but also on + # the number of unused data that will be copied together with every element + cachesizeA=min(elszA*stridesA[pA[1]],cacheline) + cachesizeB=min(elszB*stridesB[pB[1]],cacheline) + + # check if complete data fits into cache: + if (cachesizeA+cachesizeB)*prod(dims)<=effectivecachesize + return dims + end + + # cache-friendly blocking strategy: + bstep=ones(Int,N) + for i=1:N + bstep[i]=max(1,div(cacheline,elszA*stridesA[i]),div(cacheline,elszB*stridesB[i])) + # bstep is the number of elements along that dimension that can be expected to be + # within a single cacheline for either array A or B; it would be suboptimal not to + # use all of them immediately + end + + bdims=copy(bstep) + i=1 + j=1 + # loop will try to make blocks maximal along dimensions of minimal strides + # for both A and B, until the blockdim equals the full dim along those + # dimensions, and then continue with the next dimensions + while true + bdims[pA[i]]+=bstep[pA[i]] + if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point + bdims[pA[i]]-=bstep[pA[i]] + break + end + if bdims[pA[i]]>=dims[pA[i]] + bdims[pA[i]]=dims[pA[i]] + i+=1 + end + + bdims[pB[j]]+=bstep[pB[j]] + if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point + bdims[pB[j]]-=bstep[pB[j]] + break + end + if bdims[pB[j]]>=dims[pB[j]] + bdims[pB[j]]=dims[pB[j]] + j+=1 + end + end + return tuple(bdims...) + end +end + + ## unique across dim immutable Prehashed From 359402b766bcf842c8ec30672fa4cc5954db0c6d Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Sun, 13 Apr 2014 10:45:35 +0200 Subject: [PATCH 02/12] permutedims2! fix --- base/multidimensional.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 43c831ef3b3d3..12f20afc337a7 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -494,12 +494,12 @@ end if isa(P,BitArray) elszP=1 else - elszP=sizeof(T1) + elszP=isbits(T1) ? sizeof(T1) : sizeof(Ptr) end if isa(B,BitArray) elszB=1 else - elszB=sizeof(T2) + elszB=isbits(T2) ? sizeof(T2) : sizeof(Ptr) end bdims=blockdims(dims,elszP,stridesP,elszB,stridesB) # bdims=blockdims(dims,sizeof(T1),stridesP,sizeof(T2),stridesB) From 38c17a5d5566e50014a6930dbcbc948b6735dbb8 Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Sun, 13 Apr 2014 21:05:22 +0200 Subject: [PATCH 03/12] replace permutedims --- base/multidimensional.jl | 84 ++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 12f20afc337a7..b55a6e077db80 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -442,54 +442,54 @@ end ## permutedims - -for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} - @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm) - dimsB = size(B) - length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") - isperm(perm) || error("input is not a permutation") - dimsP = size(P) - for i = 1:length(perm) - dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size")) - end - - #calculates all the strides - strides_1 = 0 - @nexprs N d->(strides_{d+1} = stride(B, perm[d])) - - #Creates offset, because indexing starts at 1 - offset = 1 - sum(@ntuple N d->strides_{d+1}) - - if isa(B, SubArray) - offset += B.first_index - 1 - B = B.parent - end - - ind = 1 - @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1) - @nloops(N, i, P, - d->(counts_d = strides_d), # PRE - d->(counts_{d+1} += strides_{d+1}), # POST - begin # BODY - sumc = sum(@ntuple N d->counts_{d+1}) - @inbounds P[ind] = B[sumc+offset] - ind += 1 - end) - - return P - end -end - - -@ngenerate N typeof(P) function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm) +# +# for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} +# @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm) +# dimsB = size(B) +# length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") +# isperm(perm) || error("input is not a permutation") +# dimsP = size(P) +# for i = 1:length(perm) +# dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size")) +# end +# +# #calculates all the strides +# strides_1 = 0 +# @nexprs N d->(strides_{d+1} = stride(B, perm[d])) +# +# #Creates offset, because indexing starts at 1 +# offset = 1 - sum(@ntuple N d->strides_{d+1}) +# +# if isa(B, SubArray) +# offset += B.first_index - 1 +# B = B.parent +# end +# +# ind = 1 +# @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1) +# @nloops(N, i, P, +# d->(counts_d = strides_d), # PRE +# d->(counts_{d+1} += strides_{d+1}), # POST +# begin # BODY +# sumc = sum(@ntuple N d->counts_{d+1}) +# @inbounds P[ind] = B[sumc+offset] +# ind += 1 +# end) +# +# return P +# end +# end + + +@ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") dims = size(P) for i = 1:N dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) end - stridesP=strides(P) - stridesB=strides(B)[perm] + stridesP=ntuple(d->stride(P,d),N) + stridesB=ntuple(d->stride(B,perm[d]),N) if isa(P,BitArray) elszP=1 From bffcfe4188fb2253137934f7dc39b4bfe7b7d934 Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Mon, 14 Apr 2014 06:24:03 +0200 Subject: [PATCH 04/12] final fixes to permutedims --- base/multidimensional.jl | 56 +++++----------------------------------- 1 file changed, 7 insertions(+), 49 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index b55a6e077db80..5562058c30609 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -441,46 +441,6 @@ end ## permutedims - -# -# for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} -# @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm) -# dimsB = size(B) -# length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") -# isperm(perm) || error("input is not a permutation") -# dimsP = size(P) -# for i = 1:length(perm) -# dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size")) -# end -# -# #calculates all the strides -# strides_1 = 0 -# @nexprs N d->(strides_{d+1} = stride(B, perm[d])) -# -# #Creates offset, because indexing starts at 1 -# offset = 1 - sum(@ntuple N d->strides_{d+1}) -# -# if isa(B, SubArray) -# offset += B.first_index - 1 -# B = B.parent -# end -# -# ind = 1 -# @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1) -# @nloops(N, i, P, -# d->(counts_d = strides_d), # PRE -# d->(counts_{d+1} += strides_{d+1}), # POST -# begin # BODY -# sumc = sum(@ntuple N d->counts_{d+1}) -# @inbounds P[ind] = B[sumc+offset] -# ind += 1 -# end) -# -# return P -# end -# end - - @ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") @@ -488,9 +448,13 @@ end for i = 1:N dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) end - stridesP=ntuple(d->stride(P,d),N) - stridesB=ntuple(d->stride(B,perm[d]),N) + + #calculates all the strides and dims as variables + @nexprs N d->(stridesB_{d} = stride(B, perm[d])) + @nexprs N d->(stridesP_{d} = stride(P, d)) + @nexprs N d->(dims_{d} = dims[d]) + # calculate blocking strategy if isa(P,BitArray) elszP=1 else @@ -501,13 +465,7 @@ end else elszB=isbits(T2) ? sizeof(T2) : sizeof(Ptr) end - bdims=blockdims(dims,elszP,stridesP,elszB,stridesB) - # bdims=blockdims(dims,sizeof(T1),stridesP,sizeof(T2),stridesB) - - #calculates all the strides and dims as variables - @nexprs N d->(stridesB_{d} = stride(B, perm[d])) - @nexprs N d->(stridesP_{d} = stride(P, d)) - @nexprs N d->(dims_{d} = dims[d]) + bdims=blockdims(dims,elszP,(@ntuple N d->stridesP_{d}),elszB,(@ntuple N d->stridesB_{d})) @nexprs N d->(bdims_{d} = bdims[d]) if isa(B, SubArray) From f69ffbde018058c7d0433881308e0c0442475e07 Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Tue, 15 Apr 2014 10:12:09 +0200 Subject: [PATCH 05/12] permutedims benchmarked --- base/multidimensional.jl | 82 +++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 22 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 5562058c30609..70bf4977fbec4 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -441,6 +441,43 @@ end ## permutedims +for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} + @eval @ngenerate N typeof(P) function permutedimsold!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm) + dimsB = size(B) + length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") + isperm(perm) || error("input is not a permutation") + dimsP = size(P) + for i = 1:length(perm) + dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size")) + end + + #calculates all the strides + strides_1 = 0 + @nexprs N d->(strides_{d+1} = stride(B, perm[d])) + + #Creates offset, because indexing starts at 1 + offset = 1 - sum(@ntuple N d->strides_{d+1}) + + if isa(B, SubArray) + offset += B.first_index - 1 + B = B.parent + end + + ind = 1 + @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1) + @nloops(N, i, P, + d->(counts_d = strides_d), # PRE + d->(counts_{d+1} += strides_{d+1}), # POST + begin # BODY + sumc = sum(@ntuple N d->counts_{d+1}) + @inbounds P[ind] = B[sumc+offset] + ind += 1 + end) + + return P + end +end + @ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") @@ -465,7 +502,11 @@ end else elszB=isbits(T2) ? sizeof(T2) : sizeof(Ptr) end - bdims=blockdims(dims,elszP,(@ntuple N d->stridesP_{d}),elszB,(@ntuple N d->stridesB_{d})) + if (elszB+elszP)*length(P)<=1<<15 + bdims=dims + else + bdims=blockdims(dims,elszP,(@ntuple N d->stridesP_{d}),elszB,(@ntuple N d->stridesB_{d})) + end @nexprs N d->(bdims_{d} = bdims[d]) if isa(B, SubArray) @@ -521,46 +562,43 @@ function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},els end # cache-friendly blocking strategy: - bstep=ones(Int,N) - for i=1:N - bstep[i]=max(1,div(cacheline,elszA*stridesA[i]),div(cacheline,elszB*stridesB[i])) - # bstep is the number of elements along that dimension that can be expected to be - # within a single cacheline for either array A or B; it would be suboptimal not to - # use all of them immediately - end + # bstep=ones(Int,N) + # for i=1:N + # bstep[i]=max(1,div(cacheline,elszA*stridesA[i]),div(cacheline,elszB*stridesB[i])) + # # bstep is the number of elements along that dimension that can be expected to be + # # within a single cacheline for either array A or B; it would be suboptimal not to + # # use all of them immediately + # end - bdims=copy(bstep) + bdims=ones(Int,N) i=1 j=1 # loop will try to make blocks maximal along dimensions of minimal strides # for both A and B, until the blockdim equals the full dim along those # dimensions, and then continue with the next dimensions while true - bdims[pA[i]]+=bstep[pA[i]] - if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point - bdims[pA[i]]-=bstep[pA[i]] - break - end - if bdims[pA[i]]>=dims[pA[i]] - bdims[pA[i]]=dims[pA[i]] + while bdims[pA[i]]==dims[pA[i]] i+=1 end - - bdims[pB[j]]+=bstep[pB[j]] + bdims[pA[i]]+=1#bstep[pA[i]] if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point - bdims[pB[j]]-=bstep[pB[j]] + bdims[pA[i]]-=1#bstep[pA[i]] break end - if bdims[pB[j]]>=dims[pB[j]] - bdims[pB[j]]=dims[pB[j]] + + while bdims[pB[j]]==dims[pB[j]] j+=1 end + bdims[pB[j]]+=1#bstep[pB[j]] + if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point + bdims[pB[j]]-=1#bstep[pB[j]] + break + end end return tuple(bdims...) end end - ## unique across dim immutable Prehashed From 35dd3532480c166c09eb3bd49940bac6b22a748d Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Tue, 15 Apr 2014 10:17:39 +0200 Subject: [PATCH 06/12] remove permutedimsold --- base/multidimensional.jl | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 70bf4977fbec4..5535a7db7f275 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -441,43 +441,6 @@ end ## permutedims -for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} - @eval @ngenerate N typeof(P) function permutedimsold!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm) - dimsB = size(B) - length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") - isperm(perm) || error("input is not a permutation") - dimsP = size(P) - for i = 1:length(perm) - dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size")) - end - - #calculates all the strides - strides_1 = 0 - @nexprs N d->(strides_{d+1} = stride(B, perm[d])) - - #Creates offset, because indexing starts at 1 - offset = 1 - sum(@ntuple N d->strides_{d+1}) - - if isa(B, SubArray) - offset += B.first_index - 1 - B = B.parent - end - - ind = 1 - @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1) - @nloops(N, i, P, - d->(counts_d = strides_d), # PRE - d->(counts_{d+1} += strides_{d+1}), # POST - begin # BODY - sumc = sum(@ntuple N d->counts_{d+1}) - @inbounds P[ind] = B[sumc+offset] - ind += 1 - end) - - return P - end -end - @ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") From b6ba27a748724d5d678b26b275d703ba507f6b32 Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Tue, 15 Apr 2014 10:20:03 +0200 Subject: [PATCH 07/12] remove mutating permutedims! from exports --- base/exports.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/base/exports.jl b/base/exports.jl index c862ebfda3323..2419f66bf5e72 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -536,7 +536,6 @@ export permutations, permute!, permutedims, - permutedims!, prod!, prod, promote_shape, From eb7ebb0f85cb702dff3aba7dd04e639fc9ce131c Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Sun, 27 Apr 2014 12:20:58 -0700 Subject: [PATCH 08/12] cleanup of permutedims --- base/multidimensional.jl | 202 +++++++++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 63 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 5535a7db7f275..fbee72e87cf00 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -441,67 +441,161 @@ end ## permutedims -@ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm) +for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} + @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm) + dimsB = size(B) + length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") + isperm(perm) || error("input is not a permutation") + dimsP = size(P) + for i = 1:length(perm) + dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size")) + end + + #calculates all the strides + strides_1 = 0 + @nexprs N d->(strides_{d+1} = stride(B, perm[d])) + + #Creates offset, because indexing starts at 1 + offset = 1 - sum(@ntuple N d->strides_{d+1}) + + if isa(B, SubArray) + offset += B.first_index - 1 + B = B.parent + end + + ind = 1 + @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1) + @nloops(N, i, P, + d->(counts_d = strides_d), # PRE + d->(counts_{d+1} += strides_{d+1}), # POST + begin # BODY + sumc = sum(@ntuple N d->counts_{d+1}) + @inbounds P[ind] = B[sumc+offset] + ind += 1 + end) + + return P + end +end + +function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize=1024) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") dims = size(P) for i = 1:N dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) end - - #calculates all the strides and dims as variables - @nexprs N d->(stridesB_{d} = stride(B, perm[d])) - @nexprs N d->(stridesP_{d} = stride(P, d)) - @nexprs N d->(dims_{d} = dims[d]) - # calculate blocking strategy - if isa(P,BitArray) - elszP=1 + if collect(perm)==[1:N] + copy!(P,B) + elseif prod(dims)<=basesize + stridesP=ntuple(N,d->stride(P,d)) + stridesB=ntuple(N,d->stride(B,perm[d])) + basepermutedims!(P,B,stridesP,stridesB,dims,ntuple(N,d->1)) else - elszP=isbits(T1) ? sizeof(T1) : sizeof(Ptr) + # apply blocked permutation + stridesP=ntuple(N,d->stride(P,d)) + stridesB=ntuple(N,d->stride(B,perm[d])) + bdims=blockdims(dims,stridesP,stridesB,basesize) + blockedpermutedims!(P,B,stridesP,stridesB,dims,bdims) end - if isa(B,BitArray) - elszB=1 - else - elszB=isbits(T2) ? sizeof(T2) : sizeof(Ptr) + return P +end +function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize=1024) + length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") + isperm(perm) || error("input is not a permutation") + dims = size(P) + for i = 1:N + dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) end - if (elszB+elszP)*length(P)<=1<<15 - bdims=dims + + if collect(perm)==[1:N] + copy!(P,B) + elseif prod(dims)<=basesize + stridesP=ntuple(N,d->stride(P,d)) + stridesB=ntuple(N,d->stride(B,perm[d])) + basepermutedims!(P,B,stridesP,stridesB,dims,ntuple(N,d->1)) else - bdims=blockdims(dims,elszP,(@ntuple N d->stridesP_{d}),elszB,(@ntuple N d->stridesB_{d})) + # apply recursive permutation + stridesP=ntuple(N,d->stride(P,d)) + stridesB=ntuple(N,d->stride(B,perm[d])) + minstrides=ntuple(N,d->min(stridesP[d],stridesB[d])) + recursivepermutedims!(P,B,minstrides,stridesP,stridesB,dims,0,0,basesize) end + return P +end + +@ngenerate N typeof(P) function blockedpermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},bdims::NTuple{N,Int}) + @nexprs N d->(dims_{d} = dims[d]) @nexprs N d->(bdims_{d} = bdims[d]) + + # use blocked algorithms + @nexprs 1 d->(indB_{N} = 1) + @nexprs 1 d->(indP_{N} = 1) + @nloops(N, i, d->1:bdims_{d}:dims_{d}, + d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE + d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST + begin # BODY + offsetB=indB_0 + offsetP=indP_0 + blockdims=@ntuple N d->min(bdims_{d},dims_{d}-i_{d}+1) + basepermutedims!(P,B,stridesP,stridesB,blockdims,offsetP,offsetB) # base algorithm in block + end) + return P +end + +function recursivepermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},minstrides::NTuple{N,Int},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetP::Int,offsetB::Int,basesize::Int) + if prod(dims)<=basesize + basepermutedims!(P,B,stridesP,stridesB,dims,offsetP,offsetB) # fall back to base algorithm for sufficiently small sizes + else + dmax=1 + max=dims[dmax]*minstrides[dmax] + for d=2:N + newmax=dims[d]*minstrides[d] + if dims[d]>1 && newmax>max + dmax=d + max=newmax + end + end + newdim=dims[dmax]>>1 + recursivepermutedims!(P,B,minstrides,stridesP,stridesB,ntuple(N,d->(d==dmax ? newdim : dims[d])),offsetP,offsetB,basesize) + recursivepermutedims!(P,B,minstrides,stridesP,stridesB,ntuple(N,d->(d==dmax ? dims[d]-newdim : dims[d])),offsetP+stridesP[dmax]*newdim,offsetB+stridesB[dmax]*newdim,basesize) + end + return P +end +@ngenerate N typeof(P) function basepermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetP::Int=0,offsetB::Int=0) + #calculates strides, dims and offset + @nexprs N d->(stridesB_{d} = stridesB[d]) + @nexprs N d->(stridesP_{d} = stridesP[d]) + @nexprs N d->(dims_{d} = dims[d]) if isa(B, SubArray) - offsetB = B.first_index + startB = B.first_index B = B.parent else - offsetB = 1 + startB = 1 end if isa(P, SubArray) - offsetP = P.first_index + startP = P.first_index P = P.parent else - offsetP = 1 + startP = 1 end - - @nexprs 1 d->(indB_{N} = offsetB) - @nexprs 1 d->(indP_{N} = offsetP) - @nloops(N, outer, d->1:bdims_{d}:dims_{d}, + startP+=offsetP + startB+=offsetB + + # copy data + @nexprs 1 d->(indB_{N} = startB) + @nexprs 1 d->(indP_{N} = startP) + @nloops(N, i, d->1:dims_{d}, d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE - d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST - begin # BODY - @nexprs 1 e->(ind2B_{N} = indB_0) - @nexprs 1 e->(ind2P_{N} = indP_0) - @nloops(N, inner, e->outer_{e}:min(outer_{e}+bdims_{e}-1,dims_{e}), - e->(ind2B_{e-1} = ind2B_{e};ind2P_{e-1}=ind2P_{e}), # PRE - e->(ind2B_{e} += stridesB_{e};ind2P_{e} += stridesP_{e}), # POST - @inbounds P[ind2P_0]=B[ind2B_0]) #BODY - end) + d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST + @inbounds P[indP_0]=B[indB_0]) + return P end -function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},elszB::Int,stridesB::NTuple{N,Int}) +function blockdims{N}(dims::NTuple{N,Int},stridesA::NTuple{N,Int},stridesB::NTuple{N,Int},blocksize::Int) # blocking strategy for permutedims if N==0 return () @@ -509,30 +603,12 @@ function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},els pA=sortperm(collect(stridesA)) pB=sortperm(collect(stridesB)) - cacheline=64 - # determine cache - effectivecachesize=25600 # 64*400 = ifloor(cachesize/1.28) with cachesize=32k and 1.28 safety margin to prevent complete cachefill - - # if smallest stride of A or B is not 1, then the effect size a subblock of A - # or B will take in the cache depends not only on the element size but also on - # the number of unused data that will be copied together with every element - cachesizeA=min(elszA*stridesA[pA[1]],cacheline) - cachesizeB=min(elszB*stridesB[pB[1]],cacheline) - - # check if complete data fits into cache: - if (cachesizeA+cachesizeB)*prod(dims)<=effectivecachesize + # check if complete data fits into block: + if prod(dims)<=blocksize return dims end - # cache-friendly blocking strategy: - # bstep=ones(Int,N) - # for i=1:N - # bstep[i]=max(1,div(cacheline,elszA*stridesA[i]),div(cacheline,elszB*stridesB[i])) - # # bstep is the number of elements along that dimension that can be expected to be - # # within a single cacheline for either array A or B; it would be suboptimal not to - # # use all of them immediately - # end - + # blocking strategy: bdims=ones(Int,N) i=1 j=1 @@ -543,22 +619,22 @@ function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},els while bdims[pA[i]]==dims[pA[i]] i+=1 end - bdims[pA[i]]+=1#bstep[pA[i]] - if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point - bdims[pA[i]]-=1#bstep[pA[i]] + bdims[pA[i]]+=1 + if prod(bdims)>blocksize # this must become true at some point + bdims[pA[i]]-=1 break end while bdims[pB[j]]==dims[pB[j]] j+=1 end - bdims[pB[j]]+=1#bstep[pB[j]] - if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point - bdims[pB[j]]-=1#bstep[pB[j]] + bdims[pB[j]]+=1 + if prod(bdims)>blocksize # this must become true at some point + bdims[pB[j]]-=1 break end end - return tuple(bdims...) + return tuple(bdims...)::NTuple{N,Int} end end From e2cfbf1e0e2b7039939421da84655ab6c76d7c67 Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Sun, 27 Apr 2014 18:39:40 -0700 Subject: [PATCH 09/12] further updates --- base/multidimensional.jl | 52 +++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index fbee72e87cf00..7e849c62da88b 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -478,7 +478,7 @@ for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} end end -function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize=1024) +function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize::Int=1024) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") dims = size(P) @@ -489,9 +489,13 @@ function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm if collect(perm)==[1:N] copy!(P,B) elseif prod(dims)<=basesize - stridesP=ntuple(N,d->stride(P,d)) stridesB=ntuple(N,d->stride(B,perm[d])) - basepermutedims!(P,B,stridesP,stridesB,dims,ntuple(N,d->1)) + if isa(P,Array) || isa(P,BitArray) + simplepermutedims!(P,B,stridesB,dims) + else + stridesP=ntuple(N,d->stride(P,d)) + basepermutedims!(P,B,stridesP,stridesB,dims) + end else # apply blocked permutation stridesP=ntuple(N,d->stride(P,d)) @@ -501,7 +505,7 @@ function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm end return P end -function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize=1024) +function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize::Int=1024) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") dims = size(P) @@ -512,9 +516,13 @@ function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm if collect(perm)==[1:N] copy!(P,B) elseif prod(dims)<=basesize - stridesP=ntuple(N,d->stride(P,d)) stridesB=ntuple(N,d->stride(B,perm[d])) - basepermutedims!(P,B,stridesP,stridesB,dims,ntuple(N,d->1)) + if isa(P,Array) || isa(P,BitArray) + simplepermutedims!(P,B,stridesB,dims) + else + stridesP=ntuple(N,d->stride(P,d)) + basepermutedims!(P,B,stridesP,stridesB,dims) + end else # apply recursive permutation stridesP=ntuple(N,d->stride(P,d)) @@ -528,10 +536,12 @@ end @ngenerate N typeof(P) function blockedpermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},bdims::NTuple{N,Int}) @nexprs N d->(dims_{d} = dims[d]) @nexprs N d->(bdims_{d} = bdims[d]) + @nexprs N d->(stridesP_{d} = stridesP[d]) + @nexprs N d->(stridesB_{d} = stridesB[d]) # use blocked algorithms - @nexprs 1 d->(indB_{N} = 1) - @nexprs 1 d->(indP_{N} = 1) + @nexprs 1 d->(indB_{N} = 0) + @nexprs 1 d->(indP_{N} = 0) @nloops(N, i, d->1:bdims_{d}:dims_{d}, d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST @@ -594,6 +604,32 @@ end return P end +@ngenerate N typeof(P) function simplepermutedims!{T1,T2,N}(P::Array{T1,N},B::StridedArray{T2,N},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetB::Int=0) + #calculates strides, dims and offset + @nexprs N d->(stridesB_{d} = stridesB[d]) + @nexprs N d->(dims_{d} = dims[d]) + if isa(B, SubArray) + startB = B.first_index + B = B.parent + else + startB = 1 + end + startB+=offsetB + + # copy data + @nexprs 1 d->(indB_{N} = startB) + indP=1 + @nloops(N, i, d->1:dims_{d}, + d->(indB_{d-1} = indB_{d}), # PRE + d->(indB_{d} += stridesB_{d}), # POST + begin + @inbounds P[indP]=B[indB_0] + indP+=1 + end) + + return P +end + function blockdims{N}(dims::NTuple{N,Int},stridesA::NTuple{N,Int},stridesB::NTuple{N,Int},blocksize::Int) # blocking strategy for permutedims From 29a572ef682bd23deb7fc2c584bb7ca84408ace7 Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Mon, 28 Apr 2014 02:40:13 -0700 Subject: [PATCH 10/12] all methods --- base/multidimensional.jl | 233 ++++++++++++++++++++++++++++++++------- 1 file changed, 191 insertions(+), 42 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 7e849c62da88b..cee51c584d15a 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -478,7 +478,74 @@ for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} end end -function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize::Int=1024) +@ngenerate N typeof(P) function permutedims0!{T1,T2,N}(P::Array{T1,N},B::StridedArray{T2,N},perm) + length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") + isperm(perm) || error("input is not a permutation") + dims = size(P) + for i = 1:N + dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) + end + + #calculates strides, dims and offset + @nexprs N d->(stridesB_{d} = stride(B,perm[d])) + @nexprs N d->(dims_{d} = dims[d]) + if isa(B, SubArray) + startB = B.first_index + B = B.parent + else + startB = 1 + end + + # copy data + @nexprs 1 d->(indB_{N} = startB) + indP=1 + @nloops(N, i, d->1:dims_{d}, + d->(indB_{d-1} = indB_{d}), # PRE + d->(indB_{d} += stridesB_{d}), # POST + begin + @inbounds P[indP]=B[indB_0] + indP+=1 + end) + + return P +end +@ngenerate N typeof(P) function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm) + length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") + isperm(perm) || error("input is not a permutation") + dims = size(P) + for i = 1:N + dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) + end + + #calculates strides, dims and offset + @nexprs N d->(stridesB_{d} = stride(B,perm[d])) + @nexprs N d->(stridesP_{d} = stride(P,d)) + @nexprs N d->(dims_{d} = dims[d]) + if isa(B, SubArray) + startB = B.first_index + B = B.parent + else + startB = 1 + end + if isa(P, SubArray) + startP = P.first_index + P = P.parent + else + startP = 1 + end + + # copy data + @nexprs 1 d->(indB_{N} = startB) + @nexprs 1 d->(indP_{N} = startP) + @nloops(N, i, d->1:dims_{d}, + d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE + d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST + @inbounds P[indP_0]=B[indB_0]) + + return P +end + +function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") dims = size(P) @@ -488,14 +555,10 @@ function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm if collect(perm)==[1:N] copy!(P,B) - elseif prod(dims)<=basesize - stridesB=ntuple(N,d->stride(B,perm[d])) - if isa(P,Array) || isa(P,BitArray) - simplepermutedims!(P,B,stridesB,dims) - else - stridesP=ntuple(N,d->stride(P,d)) - basepermutedims!(P,B,stridesP,stridesB,dims) - end + elseif prod(dims)<=4*basesize + stridesB=strides(B)[perm] + stridesP=strides(P) + basepermutedims!(P,B,stridesP,stridesB,dims) else # apply blocked permutation stridesP=ntuple(N,d->stride(P,d)) @@ -505,7 +568,7 @@ function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm end return P end -function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize::Int=1024) +@ngenerate N typeof(P) function permutedims3!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") dims = size(P) @@ -515,14 +578,32 @@ function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm if collect(perm)==[1:N] copy!(P,B) - elseif prod(dims)<=basesize - stridesB=ntuple(N,d->stride(B,perm[d])) - if isa(P,Array) || isa(P,BitArray) - simplepermutedims!(P,B,stridesB,dims) + elseif prod(dims)<=4*basesize + @nexprs N d->(stridesB_{d} = stride(B,perm[d])) + @nexprs N d->(stridesP_{d} = stride(P,d)) + @nexprs N d->(dims_{d} = dims[d]) + if isa(B, SubArray) + startB = B.first_index + B = B.parent else - stridesP=ntuple(N,d->stride(P,d)) - basepermutedims!(P,B,stridesP,stridesB,dims) + startB = 1 end + if isa(P, SubArray) + startP = P.first_index + P = P.parent + else + startP = 1 + end + + # copy data + @nexprs 1 d->(indB_{N} = startB) + @nexprs 1 d->(indP_{N} = startP) + @nloops(N, i, d->1:dims_{d}, + d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE + d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST + @inbounds P[indP_0]=B[indB_0]) + + return P else # apply recursive permutation stridesP=ntuple(N,d->stride(P,d)) @@ -533,6 +614,100 @@ function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm return P end +@ngenerate N typeof(P) function permutedimsnew!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024) + length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") + isperm(perm) || error("input is not a permutation") + dims = size(P) + for i = 1:N + dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) + end + @nexprs N d->(stridesB_{d} = stride(B,perm[d])) + @nexprs N d->(stridesP_{d} = stride(P,d)) + @nexprs N d->(dims_{d} = dims[d]) + + if isa(B, SubArray) + startB = B.first_index + B = B.parent + else + startB = 1 + end + if isa(P, SubArray) + startP = P.first_index + P = P.parent + else + startP = 1 + end + + if prod(dims)<=4*basesize + # copy data + @nexprs 1 d->(indB_{N} = startB) + @nexprs 1 d->(indP_{N} = startP) + @nloops(N, i, d->1:dims_{d}, + d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE + d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST + @inbounds P[indP_0]=B[indB_0]) + else + @nexprs N d->(minstrides_{d} = min(stridesB_{d},stridesP_{d})) + + M=iceil(log2(prod(dims)/basesize)) + step=zeros(Int,M) + level=1 + @nexprs N d->(vecbdims_{d} = zeros(Int,M)) + @nexprs N d->(vecbdims_{d}[level] = dims_{d}) + vecoffsetB=zeros(Int,M) + vecoffsetP=zeros(Int,M) + vecdP=zeros(Int,M) + vecdB=zeros(Int,M) + vecdmax=zeros(Int,M) + vecnewdim=zeros(Int,M) + while level>0 + if level==M + @nexprs N d->(bdims_{d} = vecbdims_{d}[M]) + @nexprs 1 d->(indP_{N} = startP+vecoffsetP[M]) + @nexprs 1 d->(indB_{N} = startB+vecoffsetB[M]) + @nloops(N, i, d->1:bdims_{d}, + d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE + d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST + @inbounds P[indP_0]=B[indB_0]) + level-=1 + elseif step[level]==0 + @nexprs N d->(bdims_{d} = vecbdims_{d}[level]) + dmax=1 + maxval=minstrides_1*bdims_1 + newdim=bdims_1>>1 + dP=stridesP_1 + dB=stridesB_1 + @nexprs N d->(newmax=minstrides_{d}*bdims_{d};if bdims_{d}>1 && newmax>maxval;dmax=d;newdim=bdims_{d}>>1;dP=stridesP_{d};dB=stridesB_{d};maxval=newmax;end) + vecnewdim[level]=newdim + vecdmax[level]=dmax + vecdP[level]=dP + vecdB[level]=dB + + @nexprs N d->(vecbdims_{d}[level+1] = (d==dmax ? newdim : bdims_{d})) + vecoffsetP[level+1]=vecoffsetP[level] + vecoffsetB[level+1]=vecoffsetB[level] + step[level+1]=0 + + step[level]+=1 + level+=1 + elseif step[level]==1 + @nexprs N d->(bdims_{d} = vecbdims_{d}[level]) + + @nexprs N d->(vecbdims_{d}[level+1] = (d==vecdmax[level] ? bdims_{d}-vecnewdim[level] : bdims_{d})) + vecoffsetP[level+1]=vecoffsetP[level]+vecdP[level]*vecnewdim[level] + vecoffsetB[level+1]=vecoffsetB[level]+vecdB[level]*vecnewdim[level] + step[level+1]=0 + + step[level]+=1 + level+=1 + else + level-=1 + end + end + end + return P +end + @ngenerate N typeof(P) function blockedpermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},bdims::NTuple{N,Int}) @nexprs N d->(dims_{d} = dims[d]) @nexprs N d->(bdims_{d} = bdims[d]) @@ -604,32 +779,6 @@ end return P end -@ngenerate N typeof(P) function simplepermutedims!{T1,T2,N}(P::Array{T1,N},B::StridedArray{T2,N},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetB::Int=0) - #calculates strides, dims and offset - @nexprs N d->(stridesB_{d} = stridesB[d]) - @nexprs N d->(dims_{d} = dims[d]) - if isa(B, SubArray) - startB = B.first_index - B = B.parent - else - startB = 1 - end - startB+=offsetB - - # copy data - @nexprs 1 d->(indB_{N} = startB) - indP=1 - @nloops(N, i, d->1:dims_{d}, - d->(indB_{d-1} = indB_{d}), # PRE - d->(indB_{d} += stridesB_{d}), # POST - begin - @inbounds P[indP]=B[indB_0] - indP+=1 - end) - - return P -end - function blockdims{N}(dims::NTuple{N,Int},stridesA::NTuple{N,Int},stridesB::NTuple{N,Int},blocksize::Int) # blocking strategy for permutedims From 2729e9b5e9967e6c34702137096a49ca80978868 Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Mon, 28 Apr 2014 09:40:17 -0700 Subject: [PATCH 11/12] recursive implementation --- base/multidimensional.jl | 251 --------------------------------------- 1 file changed, 251 deletions(-) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index cee51c584d15a..b07942464e50a 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -478,142 +478,6 @@ for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)} end end -@ngenerate N typeof(P) function permutedims0!{T1,T2,N}(P::Array{T1,N},B::StridedArray{T2,N},perm) - length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") - isperm(perm) || error("input is not a permutation") - dims = size(P) - for i = 1:N - dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) - end - - #calculates strides, dims and offset - @nexprs N d->(stridesB_{d} = stride(B,perm[d])) - @nexprs N d->(dims_{d} = dims[d]) - if isa(B, SubArray) - startB = B.first_index - B = B.parent - else - startB = 1 - end - - # copy data - @nexprs 1 d->(indB_{N} = startB) - indP=1 - @nloops(N, i, d->1:dims_{d}, - d->(indB_{d-1} = indB_{d}), # PRE - d->(indB_{d} += stridesB_{d}), # POST - begin - @inbounds P[indP]=B[indB_0] - indP+=1 - end) - - return P -end -@ngenerate N typeof(P) function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm) - length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") - isperm(perm) || error("input is not a permutation") - dims = size(P) - for i = 1:N - dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) - end - - #calculates strides, dims and offset - @nexprs N d->(stridesB_{d} = stride(B,perm[d])) - @nexprs N d->(stridesP_{d} = stride(P,d)) - @nexprs N d->(dims_{d} = dims[d]) - if isa(B, SubArray) - startB = B.first_index - B = B.parent - else - startB = 1 - end - if isa(P, SubArray) - startP = P.first_index - P = P.parent - else - startP = 1 - end - - # copy data - @nexprs 1 d->(indB_{N} = startB) - @nexprs 1 d->(indP_{N} = startP) - @nloops(N, i, d->1:dims_{d}, - d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE - d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST - @inbounds P[indP_0]=B[indB_0]) - - return P -end - -function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024) - length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") - isperm(perm) || error("input is not a permutation") - dims = size(P) - for i = 1:N - dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) - end - - if collect(perm)==[1:N] - copy!(P,B) - elseif prod(dims)<=4*basesize - stridesB=strides(B)[perm] - stridesP=strides(P) - basepermutedims!(P,B,stridesP,stridesB,dims) - else - # apply blocked permutation - stridesP=ntuple(N,d->stride(P,d)) - stridesB=ntuple(N,d->stride(B,perm[d])) - bdims=blockdims(dims,stridesP,stridesB,basesize) - blockedpermutedims!(P,B,stridesP,stridesB,dims,bdims) - end - return P -end -@ngenerate N typeof(P) function permutedims3!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024) - length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") - isperm(perm) || error("input is not a permutation") - dims = size(P) - for i = 1:N - dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) - end - - if collect(perm)==[1:N] - copy!(P,B) - elseif prod(dims)<=4*basesize - @nexprs N d->(stridesB_{d} = stride(B,perm[d])) - @nexprs N d->(stridesP_{d} = stride(P,d)) - @nexprs N d->(dims_{d} = dims[d]) - if isa(B, SubArray) - startB = B.first_index - B = B.parent - else - startB = 1 - end - if isa(P, SubArray) - startP = P.first_index - P = P.parent - else - startP = 1 - end - - # copy data - @nexprs 1 d->(indB_{N} = startB) - @nexprs 1 d->(indP_{N} = startP) - @nloops(N, i, d->1:dims_{d}, - d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE - d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST - @inbounds P[indP_0]=B[indB_0]) - - return P - else - # apply recursive permutation - stridesP=ntuple(N,d->stride(P,d)) - stridesB=ntuple(N,d->stride(B,perm[d])) - minstrides=ntuple(N,d->min(stridesP[d],stridesB[d])) - recursivepermutedims!(P,B,minstrides,stridesP,stridesB,dims,0,0,basesize) - end - return P -end - @ngenerate N typeof(P) function permutedimsnew!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024) length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") isperm(perm) || error("input is not a permutation") @@ -708,121 +572,6 @@ end return P end -@ngenerate N typeof(P) function blockedpermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},bdims::NTuple{N,Int}) - @nexprs N d->(dims_{d} = dims[d]) - @nexprs N d->(bdims_{d} = bdims[d]) - @nexprs N d->(stridesP_{d} = stridesP[d]) - @nexprs N d->(stridesB_{d} = stridesB[d]) - - # use blocked algorithms - @nexprs 1 d->(indB_{N} = 0) - @nexprs 1 d->(indP_{N} = 0) - @nloops(N, i, d->1:bdims_{d}:dims_{d}, - d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE - d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST - begin # BODY - offsetB=indB_0 - offsetP=indP_0 - blockdims=@ntuple N d->min(bdims_{d},dims_{d}-i_{d}+1) - basepermutedims!(P,B,stridesP,stridesB,blockdims,offsetP,offsetB) # base algorithm in block - end) - return P -end - -function recursivepermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},minstrides::NTuple{N,Int},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetP::Int,offsetB::Int,basesize::Int) - if prod(dims)<=basesize - basepermutedims!(P,B,stridesP,stridesB,dims,offsetP,offsetB) # fall back to base algorithm for sufficiently small sizes - else - dmax=1 - max=dims[dmax]*minstrides[dmax] - for d=2:N - newmax=dims[d]*minstrides[d] - if dims[d]>1 && newmax>max - dmax=d - max=newmax - end - end - newdim=dims[dmax]>>1 - recursivepermutedims!(P,B,minstrides,stridesP,stridesB,ntuple(N,d->(d==dmax ? newdim : dims[d])),offsetP,offsetB,basesize) - recursivepermutedims!(P,B,minstrides,stridesP,stridesB,ntuple(N,d->(d==dmax ? dims[d]-newdim : dims[d])),offsetP+stridesP[dmax]*newdim,offsetB+stridesB[dmax]*newdim,basesize) - end - return P -end - -@ngenerate N typeof(P) function basepermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetP::Int=0,offsetB::Int=0) - #calculates strides, dims and offset - @nexprs N d->(stridesB_{d} = stridesB[d]) - @nexprs N d->(stridesP_{d} = stridesP[d]) - @nexprs N d->(dims_{d} = dims[d]) - if isa(B, SubArray) - startB = B.first_index - B = B.parent - else - startB = 1 - end - if isa(P, SubArray) - startP = P.first_index - P = P.parent - else - startP = 1 - end - startP+=offsetP - startB+=offsetB - - # copy data - @nexprs 1 d->(indB_{N} = startB) - @nexprs 1 d->(indP_{N} = startP) - @nloops(N, i, d->1:dims_{d}, - d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE - d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST - @inbounds P[indP_0]=B[indB_0]) - - return P -end - -function blockdims{N}(dims::NTuple{N,Int},stridesA::NTuple{N,Int},stridesB::NTuple{N,Int},blocksize::Int) - # blocking strategy for permutedims - if N==0 - return () - else - pA=sortperm(collect(stridesA)) - pB=sortperm(collect(stridesB)) - - # check if complete data fits into block: - if prod(dims)<=blocksize - return dims - end - - # blocking strategy: - bdims=ones(Int,N) - i=1 - j=1 - # loop will try to make blocks maximal along dimensions of minimal strides - # for both A and B, until the blockdim equals the full dim along those - # dimensions, and then continue with the next dimensions - while true - while bdims[pA[i]]==dims[pA[i]] - i+=1 - end - bdims[pA[i]]+=1 - if prod(bdims)>blocksize # this must become true at some point - bdims[pA[i]]-=1 - break - end - - while bdims[pB[j]]==dims[pB[j]] - j+=1 - end - bdims[pB[j]]+=1 - if prod(bdims)>blocksize # this must become true at some point - bdims[pB[j]]-=1 - break - end - end - return tuple(bdims...)::NTuple{N,Int} - end -end - ## unique across dim immutable Prehashed From f221d2c675cd9d3455c66b61f88f72a46a779b0b Mon Sep 17 00:00:00 2001 From: Jutho Haegeman Date: Mon, 28 Apr 2014 14:58:15 -0700 Subject: [PATCH 12/12] explicit recursive attempt --- base/cartesian.jl | 30 ++++++++++++++++- base/multidimensional.jl | 72 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/base/cartesian.jl b/base/cartesian.jl index 7bbf6e52ccfca..a892241f34e48 100644 --- a/base/cartesian.jl +++ b/base/cartesian.jl @@ -1,6 +1,6 @@ module Cartesian -export @ngenerate, @nsplat, @nloops, @nref, @ncall, @nexprs, @nextract, @nall, @ntuple, @nif, ngenerate +export @ngenerate, @nsplat, @nloops, @nfunction, @nref, @ncall, @nexprs, @nextract, @nall, @ntuple, @nif, ngenerate const CARTESIAN_DIMS = 4 @@ -299,6 +299,34 @@ function _nloops(N::Int, itersym::Symbol, rangeexpr::Expr, args::Expr...) ex end +# Generate function f(pre,i_1::T,i_2::T,..) from @nfunction N f pre i::T body +macro nfunction(N, fname, args...) + _nfunction(N, fname, args...) +end + +function _nfunction(N::Int, fname::Symbol, args...) + if length(args) < 2 + error("argument missing") + end + + prearg = args[1:end-2] + for k=1:length(prearg) + if !(isa(prearg[k],Symbol) || (isa(prearg[k],Expr) && prearg[k].head==:(::) && isa(prearg[k].args[1],Symbol) && isa(prearg[k].args[2],Symbol))) + error("invalid argument type for pre arguments") + end + end + iterarg = args[end-1] + if !(isa(iterarg,Symbol) || (isa(iterarg,Expr) && iterarg.head==:(::) && isa(iterarg.args[1],Symbol) && isa(iterarg.args[2],Symbol))) + error("invalid argument type for argument that will be iterated ") + end + iterarglist=(isa(iterarg,Symbol) ? [inlineanonymous(iterarg,i) for i=1:N] : [Expr(:(::),inlineanonymous(iterarg.args[1],i),iterarg.args[2]) for i=1:N]) + fcall=Expr(:call,fname,prearg...,iterarglist...) + + body = args[end] + + ex=Expr(:escape,Expr(:function,fcall,body)) +end + # Generate expression A[i1, i2, ...] macro nref(N, A, sym) _nref(N, A, sym) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index b07942464e50a..2f67de77b2f57 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -572,6 +572,78 @@ end return P end +@ngenerate N typeof(P) function permutedimsnew2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024) + length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))") + isperm(perm) || error("input is not a permutation") + dims = size(P) + for i = 1:N + dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size")) + end + @nexprs N d->(stridesB_{d} = stride(B,perm[d])) + @nexprs N d->(stridesP_{d} = stride(P,d)) + @nexprs N d->(dims_{d} = dims[d]) + + if isa(B, SubArray) + startB = B.first_index + B = B.parent + else + startB = 1 + end + if isa(P, SubArray) + startP = P.first_index + P = P.parent + else + startP = 1 + end + + @nfunction(N,innerbase,offsetP::Int,offsetB::Int,bdims::Int,begin + @nexprs 1 d->(indB_{N} = startB+offsetP) + @nexprs 1 d->(indP_{N} = startP+offsetB) + @nloops(N, i, d->1:bdims_{d}, + d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE + d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST + @inbounds P[indP_0]=B[indB_0]) + end) + + if prod(dims)<=4*basesize + @ncall N innerbase 0 0 dims + else + @nexprs N d->(minstrides_{d} = min(stridesB_{d},stridesP_{d})) + + @nfunction(N,innerrec,offsetP::Int,offsetB::Int,bdims::Int,begin + currentsize=1 + @nexprs N d->(currentsize *=bdims_{d}) + if currentsize<=basesize + @ncall N innerbase offsetP offsetB bdims + else + dmax=1 + maxval=minstrides_1*bdims_1 + @nexprs N d->(begin + newmax=minstrides_{d}*bdims_{d} + if bdims_{d}>1 && newmax>maxval + dmax=d + maxval=newmax + end + end) + @nexprs N d->(begin + if d==dmax + olddim=bdims_{d} + newdim=olddim>>1 + bdims_{d}=newdim + @ncall N innerrec offsetP offsetB bdims + bdims_{d}=olddim-newdim + offsetP+=stridesP_{d}*newdim + offsetB+=stridesB_{d}*newdim + @ncall N innerrec offsetP offsetB bdims + end + end) + end + end) + @ncall N innerrec 0 0 dims + end + return P +end + ## unique across dim immutable Prehashed