From e9b8765406fa1e4087b90dff1c518ce30d2b4b03 Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Sun, 13 Apr 2014 01:06:51 +0200
Subject: [PATCH 01/12] permutedims2

---
 base/multidimensional.jl | 125 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 0b78db5b085d0..43c831ef3b3d3 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -441,6 +441,8 @@ end
 
 ## permutedims
 
+
+
 for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
     @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm)
         dimsB = size(B)
@@ -478,6 +480,129 @@ for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
     end
 end
 
+
+@ngenerate N typeof(P) function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm)
+    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
+    isperm(perm) || error("input is not a permutation")
+    dims = size(P)
+    for i = 1:N
+        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
+    end
+    stridesP=strides(P)
+    stridesB=strides(B)[perm]
+    
+    if isa(P,BitArray)
+        elszP=1
+    else
+        elszP=sizeof(T1)
+    end
+    if isa(B,BitArray)
+        elszB=1
+    else
+        elszB=sizeof(T2)
+    end
+    bdims=blockdims(dims,elszP,stridesP,elszB,stridesB)
+    # bdims=blockdims(dims,sizeof(T1),stridesP,sizeof(T2),stridesB)
+
+    #calculates all the strides and dims as variables
+    @nexprs N d->(stridesB_{d} = stride(B, perm[d]))
+    @nexprs N d->(stridesP_{d} = stride(P, d))
+    @nexprs N d->(dims_{d} = dims[d])
+    @nexprs N d->(bdims_{d} = bdims[d])
+
+    if isa(B, SubArray)
+        offsetB = B.first_index
+        B = B.parent
+    else
+        offsetB = 1
+    end
+    if isa(P, SubArray)
+        offsetP = P.first_index
+        P = P.parent
+    else
+        offsetP = 1
+    end
+
+    @nexprs 1 d->(indB_{N} = offsetB)
+    @nexprs 1 d->(indP_{N} = offsetP)
+    @nloops(N, outer, d->1:bdims_{d}:dims_{d},
+        d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
+        d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST
+        begin # BODY
+            @nexprs 1 e->(ind2B_{N} = indB_0)
+            @nexprs 1 e->(ind2P_{N} = indP_0)
+            @nloops(N, inner, e->outer_{e}:min(outer_{e}+bdims_{e}-1,dims_{e}),
+                e->(ind2B_{e-1} = ind2B_{e};ind2P_{e-1}=ind2P_{e}), # PRE
+                e->(ind2B_{e} += stridesB_{e};ind2P_{e} += stridesP_{e}), # POST
+                @inbounds P[ind2P_0]=B[ind2B_0]) #BODY
+        end)
+    return P
+end
+
+function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},elszB::Int,stridesB::NTuple{N,Int})
+    # blocking strategy for permutedims
+    if N==0
+        return ()
+    else
+        pA=sortperm(collect(stridesA))
+        pB=sortperm(collect(stridesB))
+        
+        cacheline=64
+        # determine cache
+        effectivecachesize=25600 # 64*400 = ifloor(cachesize/1.28) with cachesize=32k and 1.28 safety margin to prevent complete cachefill
+        
+        # if smallest stride of A or B is not 1, then the effect size a subblock of A
+        # or B will take in the cache depends not only on the element size but also on
+        # the number of unused data that will be copied together with every element
+        cachesizeA=min(elszA*stridesA[pA[1]],cacheline)
+        cachesizeB=min(elszB*stridesB[pB[1]],cacheline)
+    
+        # check if complete data fits into cache:
+        if (cachesizeA+cachesizeB)*prod(dims)<=effectivecachesize
+            return dims
+        end
+    
+        # cache-friendly blocking strategy:
+        bstep=ones(Int,N)
+        for i=1:N
+            bstep[i]=max(1,div(cacheline,elszA*stridesA[i]),div(cacheline,elszB*stridesB[i]))
+            # bstep is the number of elements along that dimension that can be expected to be
+            # within a single cacheline for either array A or B; it would be suboptimal not to
+            # use all of them immediately
+        end
+        
+        bdims=copy(bstep)
+        i=1
+        j=1
+        # loop will try to make blocks maximal along dimensions of minimal strides
+        # for both A and B, until the blockdim equals the full dim along those
+        # dimensions, and then continue with the next dimensions
+        while true
+            bdims[pA[i]]+=bstep[pA[i]]
+            if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point
+                bdims[pA[i]]-=bstep[pA[i]]
+                break
+            end
+            if bdims[pA[i]]>=dims[pA[i]]
+                bdims[pA[i]]=dims[pA[i]]
+                i+=1
+            end
+            
+            bdims[pB[j]]+=bstep[pB[j]]
+            if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point
+                bdims[pB[j]]-=bstep[pB[j]]
+                break
+            end
+            if bdims[pB[j]]>=dims[pB[j]]
+                bdims[pB[j]]=dims[pB[j]]
+                j+=1
+            end
+        end
+        return tuple(bdims...)
+    end
+end
+
+
 ## unique across dim
 
 immutable Prehashed

From 359402b766bcf842c8ec30672fa4cc5954db0c6d Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Sun, 13 Apr 2014 10:45:35 +0200
Subject: [PATCH 02/12] permutedims2! fix

---
 base/multidimensional.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 43c831ef3b3d3..12f20afc337a7 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -494,12 +494,12 @@ end
     if isa(P,BitArray)
         elszP=1
     else
-        elszP=sizeof(T1)
+        elszP=isbits(T1) ? sizeof(T1) : sizeof(Ptr)
     end
     if isa(B,BitArray)
         elszB=1
     else
-        elszB=sizeof(T2)
+        elszB=isbits(T2) ? sizeof(T2) : sizeof(Ptr)
     end
     bdims=blockdims(dims,elszP,stridesP,elszB,stridesB)
     # bdims=blockdims(dims,sizeof(T1),stridesP,sizeof(T2),stridesB)

From 38c17a5d5566e50014a6930dbcbc948b6735dbb8 Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Sun, 13 Apr 2014 21:05:22 +0200
Subject: [PATCH 03/12] replace permutedims

---
 base/multidimensional.jl | 84 ++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 12f20afc337a7..b55a6e077db80 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -442,54 +442,54 @@ end
 ## permutedims
 
 
-
-for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
-    @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm)
-        dimsB = size(B)
-        length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
-        isperm(perm) || error("input is not a permutation")
-        dimsP = size(P)
-        for i = 1:length(perm)
-            dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size"))
-        end
-
-        #calculates all the strides
-        strides_1 = 0
-        @nexprs N d->(strides_{d+1} = stride(B, perm[d]))
-
-        #Creates offset, because indexing starts at 1
-        offset = 1 - sum(@ntuple N d->strides_{d+1})
-
-        if isa(B, SubArray)
-            offset += B.first_index - 1
-            B = B.parent
-        end
-
-        ind = 1
-        @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1)
-        @nloops(N, i, P,
-            d->(counts_d = strides_d), # PRE
-            d->(counts_{d+1} += strides_{d+1}), # POST
-            begin # BODY
-                sumc = sum(@ntuple N d->counts_{d+1})
-                @inbounds P[ind] = B[sumc+offset]
-                ind += 1
-            end)
-
-        return P
-    end
-end
-
-
-@ngenerate N typeof(P) function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm)
+# 
+# for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
+#     @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm)
+#         dimsB = size(B)
+#         length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
+#         isperm(perm) || error("input is not a permutation")
+#         dimsP = size(P)
+#         for i = 1:length(perm)
+#             dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size"))
+#         end
+# 
+#         #calculates all the strides
+#         strides_1 = 0
+#         @nexprs N d->(strides_{d+1} = stride(B, perm[d]))
+# 
+#         #Creates offset, because indexing starts at 1
+#         offset = 1 - sum(@ntuple N d->strides_{d+1})
+# 
+#         if isa(B, SubArray)
+#             offset += B.first_index - 1
+#             B = B.parent
+#         end
+# 
+#         ind = 1
+#         @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1)
+#         @nloops(N, i, P,
+#             d->(counts_d = strides_d), # PRE
+#             d->(counts_{d+1} += strides_{d+1}), # POST
+#             begin # BODY
+#                 sumc = sum(@ntuple N d->counts_{d+1})
+#                 @inbounds P[ind] = B[sumc+offset]
+#                 ind += 1
+#             end)
+# 
+#         return P
+#     end
+# end
+
+
+@ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")
     dims = size(P)
     for i = 1:N
         dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
     end
-    stridesP=strides(P)
-    stridesB=strides(B)[perm]
+    stridesP=ntuple(d->stride(P,d),N)
+    stridesB=ntuple(d->stride(B,perm[d]),N)
     
     if isa(P,BitArray)
         elszP=1

From bffcfe4188fb2253137934f7dc39b4bfe7b7d934 Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Mon, 14 Apr 2014 06:24:03 +0200
Subject: [PATCH 04/12] final fixes to permutedims

---
 base/multidimensional.jl | 56 +++++-----------------------------------
 1 file changed, 7 insertions(+), 49 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index b55a6e077db80..5562058c30609 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -441,46 +441,6 @@ end
 
 ## permutedims
 
-
-# 
-# for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
-#     @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm)
-#         dimsB = size(B)
-#         length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
-#         isperm(perm) || error("input is not a permutation")
-#         dimsP = size(P)
-#         for i = 1:length(perm)
-#             dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size"))
-#         end
-# 
-#         #calculates all the strides
-#         strides_1 = 0
-#         @nexprs N d->(strides_{d+1} = stride(B, perm[d]))
-# 
-#         #Creates offset, because indexing starts at 1
-#         offset = 1 - sum(@ntuple N d->strides_{d+1})
-# 
-#         if isa(B, SubArray)
-#             offset += B.first_index - 1
-#             B = B.parent
-#         end
-# 
-#         ind = 1
-#         @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1)
-#         @nloops(N, i, P,
-#             d->(counts_d = strides_d), # PRE
-#             d->(counts_{d+1} += strides_{d+1}), # POST
-#             begin # BODY
-#                 sumc = sum(@ntuple N d->counts_{d+1})
-#                 @inbounds P[ind] = B[sumc+offset]
-#                 ind += 1
-#             end)
-# 
-#         return P
-#     end
-# end
-
-
 @ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")
@@ -488,9 +448,13 @@ end
     for i = 1:N
         dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
     end
-    stridesP=ntuple(d->stride(P,d),N)
-    stridesB=ntuple(d->stride(B,perm[d]),N)
+
+    #calculates all the strides and dims as variables
+    @nexprs N d->(stridesB_{d} = stride(B, perm[d]))
+    @nexprs N d->(stridesP_{d} = stride(P, d))
+    @nexprs N d->(dims_{d} = dims[d])
     
+    # calculate blocking strategy
     if isa(P,BitArray)
         elszP=1
     else
@@ -501,13 +465,7 @@ end
     else
         elszB=isbits(T2) ? sizeof(T2) : sizeof(Ptr)
     end
-    bdims=blockdims(dims,elszP,stridesP,elszB,stridesB)
-    # bdims=blockdims(dims,sizeof(T1),stridesP,sizeof(T2),stridesB)
-
-    #calculates all the strides and dims as variables
-    @nexprs N d->(stridesB_{d} = stride(B, perm[d]))
-    @nexprs N d->(stridesP_{d} = stride(P, d))
-    @nexprs N d->(dims_{d} = dims[d])
+    bdims=blockdims(dims,elszP,(@ntuple N d->stridesP_{d}),elszB,(@ntuple N d->stridesB_{d}))
     @nexprs N d->(bdims_{d} = bdims[d])
 
     if isa(B, SubArray)

From f69ffbde018058c7d0433881308e0c0442475e07 Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Tue, 15 Apr 2014 10:12:09 +0200
Subject: [PATCH 05/12] permutedims benchmarked

---
 base/multidimensional.jl | 82 +++++++++++++++++++++++++++++-----------
 1 file changed, 60 insertions(+), 22 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 5562058c30609..70bf4977fbec4 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -441,6 +441,43 @@ end
 
 ## permutedims
 
+for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
+    @eval @ngenerate N typeof(P) function permutedimsold!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm)
+        dimsB = size(B)
+        length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
+        isperm(perm) || error("input is not a permutation")
+        dimsP = size(P)
+        for i = 1:length(perm)
+            dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size"))
+        end
+
+        #calculates all the strides
+        strides_1 = 0
+        @nexprs N d->(strides_{d+1} = stride(B, perm[d]))
+
+        #Creates offset, because indexing starts at 1
+        offset = 1 - sum(@ntuple N d->strides_{d+1})
+
+        if isa(B, SubArray)
+            offset += B.first_index - 1
+            B = B.parent
+        end
+
+        ind = 1
+        @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1)
+        @nloops(N, i, P,
+            d->(counts_d = strides_d), # PRE
+            d->(counts_{d+1} += strides_{d+1}), # POST
+            begin # BODY
+                sumc = sum(@ntuple N d->counts_{d+1})
+                @inbounds P[ind] = B[sumc+offset]
+                ind += 1
+            end)
+
+        return P
+    end
+end
+
 @ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")
@@ -465,7 +502,11 @@ end
     else
         elszB=isbits(T2) ? sizeof(T2) : sizeof(Ptr)
     end
-    bdims=blockdims(dims,elszP,(@ntuple N d->stridesP_{d}),elszB,(@ntuple N d->stridesB_{d}))
+    if (elszB+elszP)*length(P)<=1<<15
+        bdims=dims
+    else
+        bdims=blockdims(dims,elszP,(@ntuple N d->stridesP_{d}),elszB,(@ntuple N d->stridesB_{d}))
+    end
     @nexprs N d->(bdims_{d} = bdims[d])
 
     if isa(B, SubArray)
@@ -521,46 +562,43 @@ function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},els
         end
     
         # cache-friendly blocking strategy:
-        bstep=ones(Int,N)
-        for i=1:N
-            bstep[i]=max(1,div(cacheline,elszA*stridesA[i]),div(cacheline,elszB*stridesB[i]))
-            # bstep is the number of elements along that dimension that can be expected to be
-            # within a single cacheline for either array A or B; it would be suboptimal not to
-            # use all of them immediately
-        end
+        # bstep=ones(Int,N)
+        # for i=1:N
+        #     bstep[i]=max(1,div(cacheline,elszA*stridesA[i]),div(cacheline,elszB*stridesB[i]))
+        #     # bstep is the number of elements along that dimension that can be expected to be
+        #     # within a single cacheline for either array A or B; it would be suboptimal not to
+        #     # use all of them immediately
+        # end
         
-        bdims=copy(bstep)
+        bdims=ones(Int,N)
         i=1
         j=1
         # loop will try to make blocks maximal along dimensions of minimal strides
         # for both A and B, until the blockdim equals the full dim along those
         # dimensions, and then continue with the next dimensions
         while true
-            bdims[pA[i]]+=bstep[pA[i]]
-            if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point
-                bdims[pA[i]]-=bstep[pA[i]]
-                break
-            end
-            if bdims[pA[i]]>=dims[pA[i]]
-                bdims[pA[i]]=dims[pA[i]]
+            while bdims[pA[i]]==dims[pA[i]]
                 i+=1
             end
-            
-            bdims[pB[j]]+=bstep[pB[j]]
+            bdims[pA[i]]+=1#bstep[pA[i]]
             if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point
-                bdims[pB[j]]-=bstep[pB[j]]
+                bdims[pA[i]]-=1#bstep[pA[i]]
                 break
             end
-            if bdims[pB[j]]>=dims[pB[j]]
-                bdims[pB[j]]=dims[pB[j]]
+            
+            while bdims[pB[j]]==dims[pB[j]]
                 j+=1
             end
+            bdims[pB[j]]+=1#bstep[pB[j]]
+            if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point
+                bdims[pB[j]]-=1#bstep[pB[j]]
+                break
+            end
         end
         return tuple(bdims...)
     end
 end
 
-
 ## unique across dim
 
 immutable Prehashed

From 35dd3532480c166c09eb3bd49940bac6b22a748d Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Tue, 15 Apr 2014 10:17:39 +0200
Subject: [PATCH 06/12] remove permutedimsold

---
 base/multidimensional.jl | 37 -------------------------------------
 1 file changed, 37 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 70bf4977fbec4..5535a7db7f275 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -441,43 +441,6 @@ end
 
 ## permutedims
 
-for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
-    @eval @ngenerate N typeof(P) function permutedimsold!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm)
-        dimsB = size(B)
-        length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
-        isperm(perm) || error("input is not a permutation")
-        dimsP = size(P)
-        for i = 1:length(perm)
-            dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size"))
-        end
-
-        #calculates all the strides
-        strides_1 = 0
-        @nexprs N d->(strides_{d+1} = stride(B, perm[d]))
-
-        #Creates offset, because indexing starts at 1
-        offset = 1 - sum(@ntuple N d->strides_{d+1})
-
-        if isa(B, SubArray)
-            offset += B.first_index - 1
-            B = B.parent
-        end
-
-        ind = 1
-        @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1)
-        @nloops(N, i, P,
-            d->(counts_d = strides_d), # PRE
-            d->(counts_{d+1} += strides_{d+1}), # POST
-            begin # BODY
-                sumc = sum(@ntuple N d->counts_{d+1})
-                @inbounds P[ind] = B[sumc+offset]
-                ind += 1
-            end)
-
-        return P
-    end
-end
-
 @ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")

From b6ba27a748724d5d678b26b275d703ba507f6b32 Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Tue, 15 Apr 2014 10:20:03 +0200
Subject: [PATCH 07/12] remove mutating permutedims! from exports

---
 base/exports.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/base/exports.jl b/base/exports.jl
index c862ebfda3323..2419f66bf5e72 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -536,7 +536,6 @@ export
     permutations,
     permute!,
     permutedims,
-    permutedims!,
     prod!,
     prod,
     promote_shape,

From eb7ebb0f85cb702dff3aba7dd04e639fc9ce131c Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Sun, 27 Apr 2014 12:20:58 -0700
Subject: [PATCH 08/12] cleanup of permutedims

---
 base/multidimensional.jl | 202 +++++++++++++++++++++++++++------------
 1 file changed, 139 insertions(+), 63 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 5535a7db7f275..fbee72e87cf00 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -441,67 +441,161 @@ end
 
 ## permutedims
 
-@ngenerate N typeof(P) function permutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N}, perm)
+for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
+    @eval @ngenerate N typeof(P) function permutedims!{$(V...)}(P::$PT{$(V...)}, B::$BT{$(V...)}, perm)
+        dimsB = size(B)
+        length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
+        isperm(perm) || error("input is not a permutation")
+        dimsP = size(P)
+        for i = 1:length(perm)
+            dimsP[i] == dimsB[perm[i]] || throw(DimensionMismatch("destination tensor of incorrect size"))
+        end
+
+        #calculates all the strides
+        strides_1 = 0
+        @nexprs N d->(strides_{d+1} = stride(B, perm[d]))
+
+        #Creates offset, because indexing starts at 1
+        offset = 1 - sum(@ntuple N d->strides_{d+1})
+
+        if isa(B, SubArray)
+            offset += B.first_index - 1
+            B = B.parent
+        end
+
+        ind = 1
+        @nexprs 1 d->(counts_{N+1} = strides_{N+1}) # a trick to set counts_($N+1)
+        @nloops(N, i, P,
+            d->(counts_d = strides_d), # PRE
+            d->(counts_{d+1} += strides_{d+1}), # POST
+            begin # BODY
+                sumc = sum(@ntuple N d->counts_{d+1})
+                @inbounds P[ind] = B[sumc+offset]
+                ind += 1
+            end)
+
+        return P
+    end
+end
+
+function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize=1024)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")
     dims = size(P)
     for i = 1:N
         dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
     end
-
-    #calculates all the strides and dims as variables
-    @nexprs N d->(stridesB_{d} = stride(B, perm[d]))
-    @nexprs N d->(stridesP_{d} = stride(P, d))
-    @nexprs N d->(dims_{d} = dims[d])
     
-    # calculate blocking strategy
-    if isa(P,BitArray)
-        elszP=1
+    if collect(perm)==[1:N]
+        copy!(P,B)
+    elseif prod(dims)<=basesize
+        stridesP=ntuple(N,d->stride(P,d))
+        stridesB=ntuple(N,d->stride(B,perm[d]))
+        basepermutedims!(P,B,stridesP,stridesB,dims,ntuple(N,d->1))
     else
-        elszP=isbits(T1) ? sizeof(T1) : sizeof(Ptr)
+        # apply blocked permutation
+        stridesP=ntuple(N,d->stride(P,d))
+        stridesB=ntuple(N,d->stride(B,perm[d]))
+        bdims=blockdims(dims,stridesP,stridesB,basesize)
+        blockedpermutedims!(P,B,stridesP,stridesB,dims,bdims)
     end
-    if isa(B,BitArray)
-        elszB=1
-    else
-        elszB=isbits(T2) ? sizeof(T2) : sizeof(Ptr)
+    return P
+end
+function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize=1024)
+    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
+    isperm(perm) || error("input is not a permutation")
+    dims = size(P)
+    for i = 1:N
+        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
     end
-    if (elszB+elszP)*length(P)<=1<<15
-        bdims=dims
+    
+    if collect(perm)==[1:N]
+        copy!(P,B)
+    elseif prod(dims)<=basesize
+        stridesP=ntuple(N,d->stride(P,d))
+        stridesB=ntuple(N,d->stride(B,perm[d]))
+        basepermutedims!(P,B,stridesP,stridesB,dims,ntuple(N,d->1))
     else
-        bdims=blockdims(dims,elszP,(@ntuple N d->stridesP_{d}),elszB,(@ntuple N d->stridesB_{d}))
+        # apply recursive permutation
+        stridesP=ntuple(N,d->stride(P,d))
+        stridesB=ntuple(N,d->stride(B,perm[d]))
+        minstrides=ntuple(N,d->min(stridesP[d],stridesB[d]))
+        recursivepermutedims!(P,B,minstrides,stridesP,stridesB,dims,0,0,basesize)
     end
+    return P
+end
+
+@ngenerate N typeof(P) function blockedpermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},bdims::NTuple{N,Int})
+    @nexprs N d->(dims_{d} = dims[d])
     @nexprs N d->(bdims_{d} = bdims[d])
+    
+    # use blocked algorithms
+    @nexprs 1 d->(indB_{N} = 1)
+    @nexprs 1 d->(indP_{N} = 1)
+    @nloops(N, i, d->1:bdims_{d}:dims_{d},
+        d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
+        d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST
+        begin # BODY
+            offsetB=indB_0
+            offsetP=indP_0
+            blockdims=@ntuple N d->min(bdims_{d},dims_{d}-i_{d}+1)
+            basepermutedims!(P,B,stridesP,stridesB,blockdims,offsetP,offsetB) # base algorithm in block
+        end)
+    return P
+end
+
+function recursivepermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},minstrides::NTuple{N,Int},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetP::Int,offsetB::Int,basesize::Int)
+    if prod(dims)<=basesize
+        basepermutedims!(P,B,stridesP,stridesB,dims,offsetP,offsetB) # fall back to base algorithm for sufficiently small sizes
+    else
+        dmax=1
+        max=dims[dmax]*minstrides[dmax]
+        for d=2:N
+            newmax=dims[d]*minstrides[d]
+            if dims[d]>1 && newmax>max
+                dmax=d
+                max=newmax
+            end
+        end
+        newdim=dims[dmax]>>1
+        recursivepermutedims!(P,B,minstrides,stridesP,stridesB,ntuple(N,d->(d==dmax ? newdim : dims[d])),offsetP,offsetB,basesize)
+        recursivepermutedims!(P,B,minstrides,stridesP,stridesB,ntuple(N,d->(d==dmax ? dims[d]-newdim : dims[d])),offsetP+stridesP[dmax]*newdim,offsetB+stridesB[dmax]*newdim,basesize)
+    end
+    return P
+end
 
+@ngenerate N typeof(P) function basepermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetP::Int=0,offsetB::Int=0)
+    #calculates strides, dims and offset
+    @nexprs N d->(stridesB_{d} = stridesB[d])
+    @nexprs N d->(stridesP_{d} = stridesP[d])
+    @nexprs N d->(dims_{d} = dims[d])
     if isa(B, SubArray)
-        offsetB = B.first_index
+        startB = B.first_index
         B = B.parent
     else
-        offsetB = 1
+        startB = 1
     end
     if isa(P, SubArray)
-        offsetP = P.first_index
+        startP = P.first_index
         P = P.parent
     else
-        offsetP = 1
+        startP = 1
     end
-
-    @nexprs 1 d->(indB_{N} = offsetB)
-    @nexprs 1 d->(indP_{N} = offsetP)
-    @nloops(N, outer, d->1:bdims_{d}:dims_{d},
+    startP+=offsetP
+    startB+=offsetB
+    
+    # copy data
+    @nexprs 1 d->(indB_{N} = startB)
+    @nexprs 1 d->(indP_{N} = startP)
+    @nloops(N, i, d->1:dims_{d},
         d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
-        d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST
-        begin # BODY
-            @nexprs 1 e->(ind2B_{N} = indB_0)
-            @nexprs 1 e->(ind2P_{N} = indP_0)
-            @nloops(N, inner, e->outer_{e}:min(outer_{e}+bdims_{e}-1,dims_{e}),
-                e->(ind2B_{e-1} = ind2B_{e};ind2P_{e-1}=ind2P_{e}), # PRE
-                e->(ind2B_{e} += stridesB_{e};ind2P_{e} += stridesP_{e}), # POST
-                @inbounds P[ind2P_0]=B[ind2B_0]) #BODY
-        end)
+        d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST
+        @inbounds P[indP_0]=B[indB_0])
+        
     return P
 end
 
-function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},elszB::Int,stridesB::NTuple{N,Int})
+function blockdims{N}(dims::NTuple{N,Int},stridesA::NTuple{N,Int},stridesB::NTuple{N,Int},blocksize::Int)
     # blocking strategy for permutedims
     if N==0
         return ()
@@ -509,30 +603,12 @@ function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},els
         pA=sortperm(collect(stridesA))
         pB=sortperm(collect(stridesB))
         
-        cacheline=64
-        # determine cache
-        effectivecachesize=25600 # 64*400 = ifloor(cachesize/1.28) with cachesize=32k and 1.28 safety margin to prevent complete cachefill
-        
-        # if smallest stride of A or B is not 1, then the effect size a subblock of A
-        # or B will take in the cache depends not only on the element size but also on
-        # the number of unused data that will be copied together with every element
-        cachesizeA=min(elszA*stridesA[pA[1]],cacheline)
-        cachesizeB=min(elszB*stridesB[pB[1]],cacheline)
-    
-        # check if complete data fits into cache:
-        if (cachesizeA+cachesizeB)*prod(dims)<=effectivecachesize
+        # check if complete data fits into block:
+        if prod(dims)<=blocksize
             return dims
         end
     
-        # cache-friendly blocking strategy:
-        # bstep=ones(Int,N)
-        # for i=1:N
-        #     bstep[i]=max(1,div(cacheline,elszA*stridesA[i]),div(cacheline,elszB*stridesB[i]))
-        #     # bstep is the number of elements along that dimension that can be expected to be
-        #     # within a single cacheline for either array A or B; it would be suboptimal not to
-        #     # use all of them immediately
-        # end
-        
+        # blocking strategy:        
         bdims=ones(Int,N)
         i=1
         j=1
@@ -543,22 +619,22 @@ function blockdims{N}(dims::NTuple{N,Int},elszA::Int,stridesA::NTuple{N,Int},els
             while bdims[pA[i]]==dims[pA[i]]
                 i+=1
             end
-            bdims[pA[i]]+=1#bstep[pA[i]]
-            if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point
-                bdims[pA[i]]-=1#bstep[pA[i]]
+            bdims[pA[i]]+=1
+            if prod(bdims)>blocksize # this must become true at some point
+                bdims[pA[i]]-=1
                 break
             end
             
             while bdims[pB[j]]==dims[pB[j]]
                 j+=1
             end
-            bdims[pB[j]]+=1#bstep[pB[j]]
-            if (cachesizeA+cachesizeB)*prod(bdims)>effectivecachesize # this must become true at some point
-                bdims[pB[j]]-=1#bstep[pB[j]]
+            bdims[pB[j]]+=1
+            if prod(bdims)>blocksize # this must become true at some point
+                bdims[pB[j]]-=1
                 break
             end
         end
-        return tuple(bdims...)
+        return tuple(bdims...)::NTuple{N,Int}
     end
 end
 

From e2cfbf1e0e2b7039939421da84655ab6c76d7c67 Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Sun, 27 Apr 2014 18:39:40 -0700
Subject: [PATCH 09/12] further updates

---
 base/multidimensional.jl | 52 +++++++++++++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index fbee72e87cf00..7e849c62da88b 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -478,7 +478,7 @@ for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
     end
 end
 
-function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize=1024)
+function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize::Int=1024)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")
     dims = size(P)
@@ -489,9 +489,13 @@ function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm
     if collect(perm)==[1:N]
         copy!(P,B)
     elseif prod(dims)<=basesize
-        stridesP=ntuple(N,d->stride(P,d))
         stridesB=ntuple(N,d->stride(B,perm[d]))
-        basepermutedims!(P,B,stridesP,stridesB,dims,ntuple(N,d->1))
+        if isa(P,Array) || isa(P,BitArray)
+            simplepermutedims!(P,B,stridesB,dims)
+        else
+            stridesP=ntuple(N,d->stride(P,d))
+            basepermutedims!(P,B,stridesP,stridesB,dims)
+        end
     else
         # apply blocked permutation
         stridesP=ntuple(N,d->stride(P,d))
@@ -501,7 +505,7 @@ function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm
     end
     return P
 end
-function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize=1024)
+function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize::Int=1024)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")
     dims = size(P)
@@ -512,9 +516,13 @@ function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm
     if collect(perm)==[1:N]
         copy!(P,B)
     elseif prod(dims)<=basesize
-        stridesP=ntuple(N,d->stride(P,d))
         stridesB=ntuple(N,d->stride(B,perm[d]))
-        basepermutedims!(P,B,stridesP,stridesB,dims,ntuple(N,d->1))
+        if isa(P,Array) || isa(P,BitArray)
+            simplepermutedims!(P,B,stridesB,dims)
+        else
+            stridesP=ntuple(N,d->stride(P,d))
+            basepermutedims!(P,B,stridesP,stridesB,dims)
+        end
     else
         # apply recursive permutation
         stridesP=ntuple(N,d->stride(P,d))
@@ -528,10 +536,12 @@ end
 @ngenerate N typeof(P) function blockedpermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},bdims::NTuple{N,Int})
     @nexprs N d->(dims_{d} = dims[d])
     @nexprs N d->(bdims_{d} = bdims[d])
+    @nexprs N d->(stridesP_{d} = stridesP[d])
+    @nexprs N d->(stridesB_{d} = stridesB[d])
     
     # use blocked algorithms
-    @nexprs 1 d->(indB_{N} = 1)
-    @nexprs 1 d->(indP_{N} = 1)
+    @nexprs 1 d->(indB_{N} = 0)
+    @nexprs 1 d->(indP_{N} = 0)
     @nloops(N, i, d->1:bdims_{d}:dims_{d},
         d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
         d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST
@@ -594,6 +604,32 @@ end
         
     return P
 end
+@ngenerate N typeof(P) function simplepermutedims!{T1,T2,N}(P::Array{T1,N},B::StridedArray{T2,N},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetB::Int=0)
+    #calculates strides, dims and offset
+    @nexprs N d->(stridesB_{d} = stridesB[d])
+    @nexprs N d->(dims_{d} = dims[d])
+    if isa(B, SubArray)
+        startB = B.first_index
+        B = B.parent
+    else
+        startB = 1
+    end
+    startB+=offsetB
+    
+    # copy data
+    @nexprs 1 d->(indB_{N} = startB)
+    indP=1
+    @nloops(N, i, d->1:dims_{d},
+        d->(indB_{d-1} = indB_{d}), # PRE
+        d->(indB_{d} += stridesB_{d}), # POST
+        begin
+            @inbounds P[indP]=B[indB_0]
+            indP+=1
+        end)
+        
+    return P
+end
+
 
 function blockdims{N}(dims::NTuple{N,Int},stridesA::NTuple{N,Int},stridesB::NTuple{N,Int},blocksize::Int)
     # blocking strategy for permutedims

From 29a572ef682bd23deb7fc2c584bb7ca84408ace7 Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Mon, 28 Apr 2014 02:40:13 -0700
Subject: [PATCH 10/12] all methods

---
 base/multidimensional.jl | 233 ++++++++++++++++++++++++++++++++-------
 1 file changed, 191 insertions(+), 42 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 7e849c62da88b..cee51c584d15a 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -478,7 +478,74 @@ for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
     end
 end
 
-function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize::Int=1024)
+@ngenerate N typeof(P) function permutedims0!{T1,T2,N}(P::Array{T1,N},B::StridedArray{T2,N},perm)
+    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
+    isperm(perm) || error("input is not a permutation")
+    dims = size(P)
+    for i = 1:N
+        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
+    end
+    
+    #calculates strides, dims and offset
+    @nexprs N d->(stridesB_{d} = stride(B,perm[d]))
+    @nexprs N d->(dims_{d} = dims[d])
+    if isa(B, SubArray)
+        startB = B.first_index
+        B = B.parent
+    else
+        startB = 1
+    end
+    
+    # copy data
+    @nexprs 1 d->(indB_{N} = startB)
+    indP=1
+    @nloops(N, i, d->1:dims_{d},
+        d->(indB_{d-1} = indB_{d}), # PRE
+        d->(indB_{d} += stridesB_{d}), # POST
+        begin
+            @inbounds P[indP]=B[indB_0]
+            indP+=1
+        end)
+        
+    return P
+end
+@ngenerate N typeof(P) function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm)
+    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
+    isperm(perm) || error("input is not a permutation")
+    dims = size(P)
+    for i = 1:N
+        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
+    end
+
+    #calculates strides, dims and offset
+    @nexprs N d->(stridesB_{d} = stride(B,perm[d]))
+    @nexprs N d->(stridesP_{d} = stride(P,d))
+    @nexprs N d->(dims_{d} = dims[d])
+    if isa(B, SubArray)
+        startB = B.first_index
+        B = B.parent
+    else
+        startB = 1
+    end
+    if isa(P, SubArray)
+        startP = P.first_index
+        P = P.parent
+    else
+        startP = 1
+    end
+    
+    # copy data
+    @nexprs 1 d->(indB_{N} = startB)
+    @nexprs 1 d->(indP_{N} = startP)
+    @nloops(N, i, d->1:dims_{d},
+        d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
+        d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST
+        @inbounds P[indP_0]=B[indB_0])
+        
+    return P
+end
+
+function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")
     dims = size(P)
@@ -488,14 +555,10 @@ function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm
     
     if collect(perm)==[1:N]
         copy!(P,B)
-    elseif prod(dims)<=basesize
-        stridesB=ntuple(N,d->stride(B,perm[d]))
-        if isa(P,Array) || isa(P,BitArray)
-            simplepermutedims!(P,B,stridesB,dims)
-        else
-            stridesP=ntuple(N,d->stride(P,d))
-            basepermutedims!(P,B,stridesP,stridesB,dims)
-        end
+    elseif prod(dims)<=4*basesize
+        stridesB=strides(B)[perm]
+        stridesP=strides(P)
+        basepermutedims!(P,B,stridesP,stridesB,dims)
     else
         # apply blocked permutation
         stridesP=ntuple(N,d->stride(P,d))
@@ -505,7 +568,7 @@ function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm
     end
     return P
 end
-function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm;basesize::Int=1024)
+@ngenerate N typeof(P) function permutedims3!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")
     dims = size(P)
@@ -515,14 +578,32 @@ function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm
     
     if collect(perm)==[1:N]
         copy!(P,B)
-    elseif prod(dims)<=basesize
-        stridesB=ntuple(N,d->stride(B,perm[d]))
-        if isa(P,Array) || isa(P,BitArray)
-            simplepermutedims!(P,B,stridesB,dims)
+    elseif prod(dims)<=4*basesize
+        @nexprs N d->(stridesB_{d} = stride(B,perm[d]))
+        @nexprs N d->(stridesP_{d} = stride(P,d))
+        @nexprs N d->(dims_{d} = dims[d])
+        if isa(B, SubArray)
+            startB = B.first_index
+            B = B.parent
         else
-            stridesP=ntuple(N,d->stride(P,d))
-            basepermutedims!(P,B,stridesP,stridesB,dims)
+            startB = 1
         end
+        if isa(P, SubArray)
+            startP = P.first_index
+            P = P.parent
+        else
+            startP = 1
+        end
+    
+        # copy data
+        @nexprs 1 d->(indB_{N} = startB)
+        @nexprs 1 d->(indP_{N} = startP)
+        @nloops(N, i, d->1:dims_{d},
+            d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
+            d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST
+            @inbounds P[indP_0]=B[indB_0])
+        
+        return P
     else
         # apply recursive permutation
         stridesP=ntuple(N,d->stride(P,d))
@@ -533,6 +614,100 @@ function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm
     return P
 end
 
+@ngenerate N typeof(P) function permutedimsnew!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024)
+    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
+    isperm(perm) || error("input is not a permutation")
+    # Copy B into P with permuted dimensions (dimension d of P is dimension perm[d] of B) and
+    # return P. Large arrays are bisected using an explicit stack instead of recursive calls.
+    dims = size(P)
+    for i = 1:N
+        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
+    end
+    @nexprs N d->(stridesB_{d} = stride(B,perm[d]))
+    @nexprs N d->(stridesP_{d} = stride(P,d))
+    @nexprs N d->(dims_{d} = dims[d])
+
+    # Index the underlying storage directly; P is not rebound, so `return P` hands back the
+    # destination that was passed in, also when it is a SubArray.
+    startB = isa(B, SubArray) ? B.first_index : 1
+    dataB = isa(B, SubArray) ? B.parent : B
+    startP = isa(P, SubArray) ? P.first_index : 1
+    dataP = isa(P, SubArray) ? P.parent : P
+
+    if prod(dims)<=4*basesize
+        # at most 4*basesize elements: copy everything in a single nested loop
+        @nexprs 1 d->(indB_{N} = startB)
+        @nexprs 1 d->(indP_{N} = startP)
+        @nloops(N, i, d->1:dims_{d},
+            d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
+            d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST
+            @inbounds dataP[indP_0]=dataB[indB_0])
+    else
+        @nexprs N d->(minstrides_{d} = min(stridesB_{d},stridesP_{d}))
+
+        # M bisection levels; each array below stores one value per level (the explicit stack)
+        M=iceil(log2(prod(dims)/basesize))
+        step=zeros(Int,M) # number of halves of this level's block that were already dispatched
+        level=1
+        @nexprs N d->(vecbdims_{d} = zeros(Int,M)) # extent of the block along dimension d
+        @nexprs N d->(vecbdims_{d}[level] = dims_{d})
+        vecoffsetB=zeros(Int,M) # linear offset of the block relative to startB / startP
+        vecoffsetP=zeros(Int,M)
+        vecdP=zeros(Int,M) # strides of P and B along the dimension that was split
+        vecdB=zeros(Int,M)
+        vecdmax=zeros(Int,M) # dimension that was split
+        vecnewdim=zeros(Int,M) # extent of the first half along that dimension
+        while level>0
+            if level==M
+                # deepest level: copy the block elementwise, then go back up
+                @nexprs N d->(bdims_{d} = vecbdims_{d}[M])
+                @nexprs 1 d->(indP_{N} = startP+vecoffsetP[M])
+                @nexprs 1 d->(indB_{N} = startB+vecoffsetB[M])
+                @nloops(N, i, d->1:bdims_{d},
+                    d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
+                    d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST
+                    @inbounds dataP[indP_0]=dataB[indB_0])
+                level-=1
+            elseif step[level]==0
+                # choose the split dimension (largest minstride*extent) and descend into the first half
+                @nexprs N d->(bdims_{d} = vecbdims_{d}[level])
+                dmax=1
+                maxval=minstrides_1*bdims_1
+                newdim=bdims_1>>1
+                dP=stridesP_1
+                dB=stridesB_1
+                @nexprs N d->(newmax=minstrides_{d}*bdims_{d};if bdims_{d}>1 && newmax>maxval;dmax=d;newdim=bdims_{d}>>1;dP=stridesP_{d};dB=stridesB_{d};maxval=newmax;end)
+                vecnewdim[level]=newdim
+                vecdmax[level]=dmax
+                vecdP[level]=dP
+                vecdB[level]=dB
+
+                @nexprs N d->(vecbdims_{d}[level+1] = (d==dmax ? newdim : bdims_{d}))
+                vecoffsetP[level+1]=vecoffsetP[level]
+                vecoffsetB[level+1]=vecoffsetB[level]
+                step[level+1]=0
+
+                step[level]+=1
+                level+=1
+            elseif step[level]==1
+                # descend into the second half: remaining extent, shifted by the first half's size
+                @nexprs N d->(bdims_{d} = vecbdims_{d}[level])
+
+                @nexprs N d->(vecbdims_{d}[level+1] = (d==vecdmax[level] ? bdims_{d}-vecnewdim[level] : bdims_{d}))
+                vecoffsetP[level+1]=vecoffsetP[level]+vecdP[level]*vecnewdim[level]
+                vecoffsetB[level+1]=vecoffsetB[level]+vecdB[level]*vecnewdim[level]
+                step[level+1]=0
+
+                step[level]+=1
+                level+=1
+            else
+                level-=1
+            end
+        end
+    end
+    return P
+end
+
 @ngenerate N typeof(P) function blockedpermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},bdims::NTuple{N,Int})
     @nexprs N d->(dims_{d} = dims[d])
     @nexprs N d->(bdims_{d} = bdims[d])
@@ -604,32 +779,6 @@ end
         
     return P
 end
-@ngenerate N typeof(P) function simplepermutedims!{T1,T2,N}(P::Array{T1,N},B::StridedArray{T2,N},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetB::Int=0)
-    #calculates strides, dims and offset
-    @nexprs N d->(stridesB_{d} = stridesB[d])
-    @nexprs N d->(dims_{d} = dims[d])
-    if isa(B, SubArray)
-        startB = B.first_index
-        B = B.parent
-    else
-        startB = 1
-    end
-    startB+=offsetB
-    
-    # copy data
-    @nexprs 1 d->(indB_{N} = startB)
-    indP=1
-    @nloops(N, i, d->1:dims_{d},
-        d->(indB_{d-1} = indB_{d}), # PRE
-        d->(indB_{d} += stridesB_{d}), # POST
-        begin
-            @inbounds P[indP]=B[indB_0]
-            indP+=1
-        end)
-        
-    return P
-end
-
 
 function blockdims{N}(dims::NTuple{N,Int},stridesA::NTuple{N,Int},stridesB::NTuple{N,Int},blocksize::Int)
     # blocking strategy for permutedims

From 2729e9b5e9967e6c34702137096a49ca80978868 Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Mon, 28 Apr 2014 09:40:17 -0700
Subject: [PATCH 11/12] recursive implementation

---
 base/multidimensional.jl | 251 ---------------------------------------
 1 file changed, 251 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index cee51c584d15a..b07942464e50a 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -478,142 +478,6 @@ for (V, PT, BT) in {((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)}
     end
 end
 
-@ngenerate N typeof(P) function permutedims0!{T1,T2,N}(P::Array{T1,N},B::StridedArray{T2,N},perm)
-    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
-    isperm(perm) || error("input is not a permutation")
-    dims = size(P)
-    for i = 1:N
-        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
-    end
-    
-    #calculates strides, dims and offset
-    @nexprs N d->(stridesB_{d} = stride(B,perm[d]))
-    @nexprs N d->(dims_{d} = dims[d])
-    if isa(B, SubArray)
-        startB = B.first_index
-        B = B.parent
-    else
-        startB = 1
-    end
-    
-    # copy data
-    @nexprs 1 d->(indB_{N} = startB)
-    indP=1
-    @nloops(N, i, d->1:dims_{d},
-        d->(indB_{d-1} = indB_{d}), # PRE
-        d->(indB_{d} += stridesB_{d}), # POST
-        begin
-            @inbounds P[indP]=B[indB_0]
-            indP+=1
-        end)
-        
-    return P
-end
-@ngenerate N typeof(P) function permutedims1!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm)
-    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
-    isperm(perm) || error("input is not a permutation")
-    dims = size(P)
-    for i = 1:N
-        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
-    end
-
-    #calculates strides, dims and offset
-    @nexprs N d->(stridesB_{d} = stride(B,perm[d]))
-    @nexprs N d->(stridesP_{d} = stride(P,d))
-    @nexprs N d->(dims_{d} = dims[d])
-    if isa(B, SubArray)
-        startB = B.first_index
-        B = B.parent
-    else
-        startB = 1
-    end
-    if isa(P, SubArray)
-        startP = P.first_index
-        P = P.parent
-    else
-        startP = 1
-    end
-    
-    # copy data
-    @nexprs 1 d->(indB_{N} = startB)
-    @nexprs 1 d->(indP_{N} = startP)
-    @nloops(N, i, d->1:dims_{d},
-        d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
-        d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST
-        @inbounds P[indP_0]=B[indB_0])
-        
-    return P
-end
-
-function permutedims2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024)
-    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
-    isperm(perm) || error("input is not a permutation")
-    dims = size(P)
-    for i = 1:N
-        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
-    end
-    
-    if collect(perm)==[1:N]
-        copy!(P,B)
-    elseif prod(dims)<=4*basesize
-        stridesB=strides(B)[perm]
-        stridesP=strides(P)
-        basepermutedims!(P,B,stridesP,stridesB,dims)
-    else
-        # apply blocked permutation
-        stridesP=ntuple(N,d->stride(P,d))
-        stridesB=ntuple(N,d->stride(B,perm[d]))
-        bdims=blockdims(dims,stridesP,stridesB,basesize)
-        blockedpermutedims!(P,B,stridesP,stridesB,dims,bdims)
-    end
-    return P
-end
-@ngenerate N typeof(P) function permutedims3!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024)
-    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
-    isperm(perm) || error("input is not a permutation")
-    dims = size(P)
-    for i = 1:N
-        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
-    end
-    
-    if collect(perm)==[1:N]
-        copy!(P,B)
-    elseif prod(dims)<=4*basesize
-        @nexprs N d->(stridesB_{d} = stride(B,perm[d]))
-        @nexprs N d->(stridesP_{d} = stride(P,d))
-        @nexprs N d->(dims_{d} = dims[d])
-        if isa(B, SubArray)
-            startB = B.first_index
-            B = B.parent
-        else
-            startB = 1
-        end
-        if isa(P, SubArray)
-            startP = P.first_index
-            P = P.parent
-        else
-            startP = 1
-        end
-    
-        # copy data
-        @nexprs 1 d->(indB_{N} = startB)
-        @nexprs 1 d->(indP_{N} = startP)
-        @nloops(N, i, d->1:dims_{d},
-            d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
-            d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST
-            @inbounds P[indP_0]=B[indB_0])
-        
-        return P
-    else
-        # apply recursive permutation
-        stridesP=ntuple(N,d->stride(P,d))
-        stridesB=ntuple(N,d->stride(B,perm[d]))
-        minstrides=ntuple(N,d->min(stridesP[d],stridesB[d]))
-        recursivepermutedims!(P,B,minstrides,stridesP,stridesB,dims,0,0,basesize)
-    end
-    return P
-end
-
 @ngenerate N typeof(P) function permutedimsnew!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024)
     length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
     isperm(perm) || error("input is not a permutation")
@@ -708,121 +572,6 @@ end
     return P
 end
 
-@ngenerate N typeof(P) function blockedpermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},bdims::NTuple{N,Int})
-    @nexprs N d->(dims_{d} = dims[d])
-    @nexprs N d->(bdims_{d} = bdims[d])
-    @nexprs N d->(stridesP_{d} = stridesP[d])
-    @nexprs N d->(stridesB_{d} = stridesB[d])
-    
-    # use blocked algorithms
-    @nexprs 1 d->(indB_{N} = 0)
-    @nexprs 1 d->(indP_{N} = 0)
-    @nloops(N, i, d->1:bdims_{d}:dims_{d},
-        d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
-        d->(indB_{d} += bdims_{d}*stridesB_{d};indP_{d} += bdims_{d}*stridesP_{d}), # POST
-        begin # BODY
-            offsetB=indB_0
-            offsetP=indP_0
-            blockdims=@ntuple N d->min(bdims_{d},dims_{d}-i_{d}+1)
-            basepermutedims!(P,B,stridesP,stridesB,blockdims,offsetP,offsetB) # base algorithm in block
-        end)
-    return P
-end
-
-function recursivepermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},minstrides::NTuple{N,Int},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetP::Int,offsetB::Int,basesize::Int)
-    if prod(dims)<=basesize
-        basepermutedims!(P,B,stridesP,stridesB,dims,offsetP,offsetB) # fall back to base algorithm for sufficiently small sizes
-    else
-        dmax=1
-        max=dims[dmax]*minstrides[dmax]
-        for d=2:N
-            newmax=dims[d]*minstrides[d]
-            if dims[d]>1 && newmax>max
-                dmax=d
-                max=newmax
-            end
-        end
-        newdim=dims[dmax]>>1
-        recursivepermutedims!(P,B,minstrides,stridesP,stridesB,ntuple(N,d->(d==dmax ? newdim : dims[d])),offsetP,offsetB,basesize)
-        recursivepermutedims!(P,B,minstrides,stridesP,stridesB,ntuple(N,d->(d==dmax ? dims[d]-newdim : dims[d])),offsetP+stridesP[dmax]*newdim,offsetB+stridesB[dmax]*newdim,basesize)
-    end
-    return P
-end
-
-@ngenerate N typeof(P) function basepermutedims!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},stridesP::NTuple{N,Int},stridesB::NTuple{N,Int},dims::NTuple{N,Int},offsetP::Int=0,offsetB::Int=0)
-    #calculates strides, dims and offset
-    @nexprs N d->(stridesB_{d} = stridesB[d])
-    @nexprs N d->(stridesP_{d} = stridesP[d])
-    @nexprs N d->(dims_{d} = dims[d])
-    if isa(B, SubArray)
-        startB = B.first_index
-        B = B.parent
-    else
-        startB = 1
-    end
-    if isa(P, SubArray)
-        startP = P.first_index
-        P = P.parent
-    else
-        startP = 1
-    end
-    startP+=offsetP
-    startB+=offsetB
-    
-    # copy data
-    @nexprs 1 d->(indB_{N} = startB)
-    @nexprs 1 d->(indP_{N} = startP)
-    @nloops(N, i, d->1:dims_{d},
-        d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
-        d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST
-        @inbounds P[indP_0]=B[indB_0])
-        
-    return P
-end
-
-function blockdims{N}(dims::NTuple{N,Int},stridesA::NTuple{N,Int},stridesB::NTuple{N,Int},blocksize::Int)
-    # blocking strategy for permutedims
-    if N==0
-        return ()
-    else
-        pA=sortperm(collect(stridesA))
-        pB=sortperm(collect(stridesB))
-        
-        # check if complete data fits into block:
-        if prod(dims)<=blocksize
-            return dims
-        end
-    
-        # blocking strategy:        
-        bdims=ones(Int,N)
-        i=1
-        j=1
-        # loop will try to make blocks maximal along dimensions of minimal strides
-        # for both A and B, until the blockdim equals the full dim along those
-        # dimensions, and then continue with the next dimensions
-        while true
-            while bdims[pA[i]]==dims[pA[i]]
-                i+=1
-            end
-            bdims[pA[i]]+=1
-            if prod(bdims)>blocksize # this must become true at some point
-                bdims[pA[i]]-=1
-                break
-            end
-            
-            while bdims[pB[j]]==dims[pB[j]]
-                j+=1
-            end
-            bdims[pB[j]]+=1
-            if prod(bdims)>blocksize # this must become true at some point
-                bdims[pB[j]]-=1
-                break
-            end
-        end
-        return tuple(bdims...)::NTuple{N,Int}
-    end
-end
-
 ## unique across dim
 
 immutable Prehashed

From f221d2c675cd9d3455c66b61f88f72a46a779b0b Mon Sep 17 00:00:00 2001
From: Jutho Haegeman <jutho.haegeman@ugent.be>
Date: Mon, 28 Apr 2014 14:58:15 -0700
Subject: [PATCH 12/12] explicit recursive attempt

---
 base/cartesian.jl        | 30 ++++++++++++++++-
 base/multidimensional.jl | 72 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/base/cartesian.jl b/base/cartesian.jl
index 7bbf6e52ccfca..a892241f34e48 100644
--- a/base/cartesian.jl
+++ b/base/cartesian.jl
@@ -1,6 +1,6 @@
 module Cartesian
 
-export @ngenerate, @nsplat, @nloops, @nref, @ncall, @nexprs, @nextract, @nall, @ntuple, @nif, ngenerate
+export @ngenerate, @nsplat, @nloops, @nfunction, @nref, @ncall, @nexprs, @nextract, @nall, @ntuple, @nif, ngenerate
 
 const CARTESIAN_DIMS = 4
 
@@ -299,6 +299,34 @@ function _nloops(N::Int, itersym::Symbol, rangeexpr::Expr, args::Expr...)
     ex
 end
 
+# Generate function f(pre..., i_1::T, ..., i_N::T) with the given body from @nfunction N f pre... i::T body (::T is optional)
+macro nfunction(N, fname, args...)
+    _nfunction(N, fname, args...)
+end
+
+function _nfunction(N::Int, fname::Symbol, args...)
+    # args holds (pre arguments..., iterated argument, body); the last two are mandatory
+    length(args) >= 2 || error("argument missing")
+    # accepted argument forms: a bare symbol `x`, or a symbol annotated with a symbol, `x::T`
+    accepted = a -> isa(a,Symbol) || (isa(a,Expr) && a.head==:(::) && isa(a.args[1],Symbol) && isa(a.args[2],Symbol))
+
+    leading = args[1:end-2]
+    for a in leading
+        accepted(a) || error("invalid argument type for pre arguments")
+    end
+    iterated = args[end-1]
+    accepted(iterated) || error("invalid argument type for argument that will be iterated ")
+    # replace the iterated argument by N arguments x_1,...,x_N (each keeps the annotation, if any)
+    if isa(iterated,Symbol)
+        expanded = [inlineanonymous(iterated,i) for i=1:N]
+    else
+        expanded = [Expr(:(::),inlineanonymous(iterated.args[1],i),iterated.args[2]) for i=1:N]
+    end
+    signature = Expr(:call,fname,leading...,expanded...)
+    # the whole definition is escaped, i.e. emitted without macro-hygiene renaming
+    return Expr(:escape,Expr(:function,signature,args[end]))
+end
+
 # Generate expression A[i1, i2, ...]
 macro nref(N, A, sym)
     _nref(N, A, sym)
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index b07942464e50a..2f67de77b2f57 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -572,6 +572,78 @@ end
     return P
 end
 
+# Copy B into P with permuted dimensions (dimension d of P is dimension perm[d] of B) and return P.
+# Arrays of at most 4*basesize elements are copied in one nested loop; larger ones are bisected
+# recursively along the dimension with the largest extent*min(stride) down to <=basesize elements.
+@ngenerate N typeof(P) function permutedimsnew2!{T1,T2,N}(P::StridedArray{T1,N},B::StridedArray{T2,N},perm,basesize::Int=1024)
+    length(perm) == N || error("expected permutation of size $N, but length(perm)=$(length(perm))")
+    isperm(perm) || error("input is not a permutation")
+    dims = size(P)
+    for i = 1:N
+        dims[i] == size(B,perm[i]) || throw(DimensionMismatch("destination tensor of incorrect size"))
+    end
+    @nexprs N d->(stridesB_{d} = stride(B,perm[d]))
+    @nexprs N d->(stridesP_{d} = stride(P,d))
+    @nexprs N d->(dims_{d} = dims[d])
+
+    # Index the underlying storage directly; P is not rebound, so `return P` hands back the
+    # destination that was passed in, also when it is a SubArray.
+    startB = isa(B, SubArray) ? B.first_index : 1
+    dataB = isa(B, SubArray) ? B.parent : B
+    startP = isa(P, SubArray) ? P.first_index : 1
+    dataP = isa(P, SubArray) ? P.parent : P
+
+    # innerbase(offsetP, offsetB, bdims_1, ..., bdims_N): copy one block of extents bdims_d whose
+    # first element lies offsetP (resp. offsetB) linear positions past startP (resp. startB).
+    @nfunction(N,innerbase,offsetP::Int,offsetB::Int,bdims::Int,begin
+        @nexprs 1 d->(indB_{N} = startB+offsetB)
+        @nexprs 1 d->(indP_{N} = startP+offsetP)
+        @nloops(N, i, d->1:bdims_{d},
+            d->(indB_{d-1} = indB_{d};indP_{d-1}=indP_{d}), # PRE
+            d->(indB_{d} += stridesB_{d};indP_{d} += stridesP_{d}), # POST
+            @inbounds dataP[indP_0]=dataB[indB_0])
+        end)
+
+    if prod(dims)<=4*basesize
+        @ncall N innerbase 0 0 dims
+    else
+        @nexprs N d->(minstrides_{d} = min(stridesB_{d},stridesP_{d}))
+        # innerrec takes the same arguments as innerbase and halves the block until it is small
+        @nfunction(N,innerrec,offsetP::Int,offsetB::Int,bdims::Int,begin
+                currentsize=1
+                @nexprs N d->(currentsize *=bdims_{d})
+                if currentsize<=max(basesize,1) # a block of <=1 element cannot be split further
+                    @ncall N innerbase offsetP offsetB bdims
+                else
+                    # only dimensions of extent >1 are candidates, so both halves are non-empty
+                    dmax=0
+                    maxval=typemin(Int)
+                    @nexprs N d->(begin
+                            newmax=minstrides_{d}*bdims_{d}
+                            if bdims_{d}>1 && newmax>maxval
+                                dmax=d
+                                maxval=newmax
+                            end
+                        end)
+                    @nexprs N d->(begin
+                            if d==dmax
+                                olddim=bdims_{d}
+                                newdim=olddim>>1
+                                bdims_{d}=newdim # first half
+                                @ncall N innerrec offsetP offsetB bdims
+                                bdims_{d}=olddim-newdim # second half, shifted by newdim along d
+                                offsetP+=stridesP_{d}*newdim
+                                offsetB+=stridesB_{d}*newdim
+                                @ncall N innerrec offsetP offsetB bdims
+                            end
+                        end)
+                end
+            end)
+        @ncall N innerrec 0 0 dims
+    end
+    return P
+end
+
 ## unique across dim
 
 immutable Prehashed