-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
33% performance degradation in .NET 5 #44457
Comments
I couldn't figure out the best area label to add to this issue. If you have write-permissions please help me learn by adding exactly one area label. |
I have run the benchmarks on my machine (Ivy Bridge) using your test from the repository, and did not see a regression from
I lost the ability to run your tests after that for some reason and now have replicated your setup with Benchmark.NET. [SimpleJob(RuntimeMoniker.NetCoreApp31)]
[SimpleJob(RuntimeMoniker.NetCoreApp30)]
[SimpleJob(RuntimeMoniker.NetCoreApp50)]
public class Benchmarks
{
public const int Width = 320;
public const int Height = 200;
[Benchmark]
public void AssignArray2DMD()
{
var array2d = new Array2D<int>(Height, Width);
int i = 0;
for (int y = 0; y < Height; y++)
{
for (int x = 0; x < Width; x++)
array2d[y, x] = ++i;
}
}
} Using this setup I too was not able to replicate the regression:
|
I'm not sure how you tested it but the self allocating constructor uses Using Benchmark.NET Unfortunately the overhead of Benchmark.NET is a bit too large for very quick test cases so you need to "magnify" the actual payload of the test. If you modify the test method like this: [Benchmark]
public void AssignArray2DMD()
{
var array2d = new Array2D<int>(Height, Width);
for (int iter = 0; iter < 10_000; iter++)
{
int i = 0;
for (int y = 0; y < Height; y++)
{
for (int x = 0; x < Width; x++)
array2d[y, x] = ++i;
}
}
array2d.Dispose();
} then the difference will be clear (at least on my machine):
|
I see, I did not think that would matter because I assumed the test was about the quality of indexer codegen. I have now run the corrected test you provided and there is still no difference: [Benchmark]
public void AssignArray2D()
{
var array2d = new Array2D<int>(Height, Width);
for (int iter = 0; iter < 10_000; iter++)
{
int i = 0;
for (int y = 0; y < Height; y++)
{
for (int x = 0; x < Width; x++)
array2d[y, x] = ++i;
}
}
array2d.Dispose();
}
BTW, it could be a good idea to get the disassembly for these methods with |
Hmm, it must be a platform-dependent thing, then. :( Thanks for the tip, I will provide a disassembly when I have time again. |
Could be you have an alignment-sensitive loop? cc @kunalspathak |
I was playing with
And the assembly dumps. As the relevant codes are all inlined I copy-pasted the disassembled .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT; ConsoleApp1.Benchmarks.AssignArray2DMD()
push rdi
push rsi
sub rsp,58
vzeroupper
mov rsi,rcx
lea rdi,[rsp+20]
mov ecx,0E
xor eax,eax
rep stosd
mov rcx,rsi
mov dword ptr [rsp+3C],0C8
mov dword ptr [rsp+38],140
xor ecx,ecx
lea rdx,[rsp+20]
vxorps xmm0,xmm0,xmm0
vmovdqu xmmword ptr [rdx],xmm0
mov [rdx+10],rcx
lea rcx,[rsp+20]
mov edx,0FA00
mov r8d,1
call KGySoft.Collections.ArraySection`1[[System.Int32, System.Private.CoreLib]]..ctor(Int32, Boolean)
vmovdqu xmm0,xmmword ptr [rsp+20]
vmovdqu xmmword ptr [rsp+40],xmm0
mov rcx,[rsp+30]
mov [rsp+50],rcx
xor ecx,ecx
M00_L00:
xor eax,eax
xor edx,edx
M00_L01:
xor r8d,r8d
M00_L02:
inc eax
mov r9d,edx
imul r9d,[rsp+38]
add r9d,r8d
cmp qword ptr [rsp+40],0
je short M00_L03
mov r10,[rsp+40]
add r9d,[rsp+48]
cmp r9d,[r10+8]
jae short M00_L04
movsxd r9,r9d
mov [r10+r9*4+10],eax
inc r8d
cmp r8d,140
jl short M00_L02
inc edx
cmp edx,0C8
jl short M00_L01
inc ecx
cmp ecx,2710
jl short M00_L00
lea rcx,[rsp+40]
call KGySoft.Collections.ArraySection`1[[System.Int32, System.Private.CoreLib]].Release()
lea rax,[rsp+38]
vxorps xmm0,xmm0,xmm0
vmovdqu xmmword ptr [rax],xmm0
vmovdqu xmmword ptr [rax+10],xmm0
add rsp,58
pop rsi
pop rdi
ret
M00_L03:
call KGySoft.Throw.IndexOutOfRangeException()
int 3
M00_L04:
call CORINFO_HELP_RNGCHKFAIL
int 3
; Total bytes of code 241 .NET Core 5.0.0 (CoreCLR 5.0.20.47505, CoreFX 5.0.20.47505), X64 RyuJIT; ConsoleApp1.Benchmarks.AssignArray2DMD()
sub rsp,58
vzeroupper
vxorps xmm4,xmm4,xmm4
vmovdqa xmmword ptr [rsp+20],xmm4
vmovdqa xmmword ptr [rsp+30],xmm4
vmovdqa xmmword ptr [rsp+40],xmm4
xor eax,eax
mov [rsp+50],rax
mov dword ptr [rsp+3C],0C8
mov dword ptr [rsp+38],140
lea rcx,[rsp+20]
mov edx,0FA00
mov r8d,1
call KGySoft.Collections.ArraySection`1[[System.Int32, System.Private.CoreLib]]..ctor(Int32, Boolean)
vmovdqu xmm0,xmmword ptr [rsp+20]
vmovdqu xmmword ptr [rsp+40],xmm0
mov rcx,[rsp+30]
mov [rsp+50],rcx
xor ecx,ecx
M00_L00:
xor eax,eax
xor edx,edx
M00_L01:
xor r8d,r8d
M00_L02:
inc eax
mov r9d,edx
imul r9d,[rsp+38]
add r9d,r8d
cmp qword ptr [rsp+40],0
je short M00_L03
mov r10,[rsp+40]
add r9d,[rsp+48]
cmp r9d,[r10+8]
jae short M00_L04
movsxd r9,r9d
mov [r10+r9*4+10],eax
inc r8d
cmp r8d,140
jl short M00_L02
inc edx
cmp edx,0C8
jl short M00_L01
inc ecx
cmp ecx,2710
jl short M00_L00
lea rcx,[rsp+40]
call KGySoft.Collections.ArraySection`1[[System.Int32, System.Private.CoreLib]].Release()
vxorps xmm0,xmm0,xmm0
vmovdqu xmmword ptr [rsp+38],xmm0
vmovdqu xmmword ptr [rsp+48],xmm0
add rsp,58
ret
M00_L03:
call KGySoft.Throw.IndexOutOfRangeException()
int 3
M00_L04:
call CORINFO_HELP_RNGCHKFAIL
int 3
; Total bytes of code 225 Bonus content: .NET Core 5.0.0 (CoreCLR 5.0.20.47505, CoreFX 5.0.20.47505), X64 RyuJIT - when it is unable to dump the results:; BenchmarkDotNet.Autogenerated.Runnable_0.__ForDisassemblyDiagnoser__()
push rbp
sub rsp,20
lea rbp,[rsp+20]
mov [rbp+10],rcx
mov rcx,[rbp+10]
cmp dword ptr [rcx+38],0B
jne short M00_L00
mov rcx,[rbp+10]
call 00007FF8797F2578
M00_L00:
nop
lea rsp,[rbp]
pop rbp
ret
; Total bytes of code 40 |
@kunalspathak thoughts about where investigation should start ? |
Below is the assembly code for .NET 3.1 and .NET 5
.NET 3.1; Assembly listing for method <>c__DisplayClass1_0:<AccessTest>b__0():this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 this [V00,T05] ( 3, 18 ) ref -> rcx this class-hnd
; V01 loc0 [V01,T03] ( 4, 49 ) int -> rax
; V02 loc1 [V02,T04] ( 5, 29 ) int -> rdx
; V03 loc2 [V03,T01] ( 5, 68 ) int -> r8
; V04 OutArgs [V04 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V05 tmp1 [V05,T02] ( 2, 64 ) int -> rax "dup spill"
; V06 rat0 [V06,T00] ( 6,192 ) ref -> r9 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 32
G_M55748_IG01:
56 push rsi
4883EC20 sub rsp, 32
G_M55748_IG02:
33C0 xor eax, eax
33D2 xor edx, edx
G_M55748_IG03:
4533C0 xor r8d, r8d
G_M55748_IG04:
FFC0 inc eax
4C8B4908 mov r9, gword ptr [rcx+8]
448BD2 mov r10d, edx
452B5118 sub r10d, dword ptr [r9+24]
453B5110 cmp r10d, dword ptr [r9+16]
733C jae SHORT G_M55748_IG07
458BD8 mov r11d, r8d
452B591C sub r11d, dword ptr [r9+28]
453B5914 cmp r11d, dword ptr [r9+20]
732F jae SHORT G_M55748_IG07
418B7114 mov esi, dword ptr [r9+20]
490FAFF2 imul rsi, r10
4D8BD3 mov r10, r11
4C03D6 add r10, rsi
4389449120 mov dword ptr [r9+4*r10+32], eax
41FFC0 inc r8d
4181F840010000 cmp r8d, 320
7CC1 jl SHORT G_M55748_IG04
G_M55748_IG05:
FFC2 inc edx
81FAC8000000 cmp edx, 200
7CB4 jl SHORT G_M55748_IG03
G_M55748_IG06:
4883C420 add rsp, 32
5E pop rsi
C3 ret
G_M55748_IG07:
E8E0D1DE5D call CORINFO_HELP_RNGCHKFAIL
CC int3
; Total bytes of code 97, prolog size 5 for method <>c__DisplayClass1_0:<AccessTest>b__0():this
; ============================================================
; Assembly listing for method <>c__DisplayClass1_0:<AccessTest>b__1():this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 this [V00,T08] ( 3, 6 ) ref -> rcx this class-hnd
; V01 loc0 [V01,T04] ( 4, 49 ) int -> rax
; V02 loc1 [V02,T05] ( 6, 33 ) int -> rdx
; V03 loc2 [V03,T02] ( 6, 84 ) int -> r8
; V04 OutArgs [V04 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V05 tmp1 [V05,T03] ( 2, 64 ) int -> rax "dup spill"
; V06 tmp2 [V06,T00] ( 3, 96 ) ref -> r11 "arr expr"
; V07 tmp3 [V07,T01] ( 3, 96 ) ref -> r11 "arr expr"
; V08 cse0 [V08,T06] ( 2, 20 ) ref -> r9 "ValNumCSE"
; V09 cse1 [V09,T07] ( 2, 20 ) long -> r10 "ValNumCSE"
;
; Lcl frame size = 32
G_M55749_IG01:
56 push rsi
4883EC20 sub rsp, 32
G_M55749_IG02:
33C0 xor eax, eax
33D2 xor edx, edx
G_M55749_IG03:
4533C0 xor r8d, r8d
4C8B4910 mov r9, gword ptr [rcx+16]
4C63D2 movsxd r10, edx
G_M55749_IG04:
FFC0 inc eax
4D8BD9 mov r11, r9
413B5308 cmp edx, dword ptr [r11+8]
732F jae SHORT G_M55749_IG07
4F8B5CD310 mov r11, gword ptr [r11+8*r10+16]
453B4308 cmp r8d, dword ptr [r11+8]
7324 jae SHORT G_M55749_IG07
4963F0 movsxd rsi, r8d
418944B310 mov dword ptr [r11+4*rsi+16], eax
41FFC0 inc r8d
4181F840010000 cmp r8d, 320
7CD6 jl SHORT G_M55749_IG04
G_M55749_IG05:
FFC2 inc edx
81FAC8000000 cmp edx, 200
7CC2 jl SHORT G_M55749_IG03
G_M55749_IG06:
4883C420 add rsp, 32
5E pop rsi
C3 ret
G_M55749_IG07:
E85ECCDE5D call CORINFO_HELP_RNGCHKFAIL
CC int3
; Total bytes of code 83, prolog size 5 for method <>c__DisplayClass1_0:<AccessTest>b__1():this
; ============================================================
; Assembly listing for method <>c__DisplayClass1_0:<AccessTest>b__2():this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 this [V00,T11] ( 3, 6 ) ref -> rcx this class-hnd
; V01 loc0 [V01,T07] ( 4, 49 ) int -> rax
; V02 loc1 [V02,T09] ( 5, 29 ) int -> rdx
; V03 loc2 [V03,T04] ( 5, 68 ) int -> r8
; V04 OutArgs [V04 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V05 tmp1 [V05,T05] ( 2, 64 ) int -> rax "dup spill"
; V06 tmp2 [V06,T00] ( 4,128 ) byref -> r10 "Inlining Arg"
; V07 tmp3 [V07,T01] ( 3, 96 ) byref -> r11 "Inlining Arg"
; V08 tmp4 [V08,T06] ( 2, 64 ) int -> rsi "Inlining Arg"
; V09 tmp5 [V09,T02] ( 3, 96 ) ref -> r10 "arr expr"
; V10 tmp6 [V10,T03] ( 3, 96 ) int -> rsi "arr expr"
; V11 cse0 [V11,T10] ( 2, 20 ) byref -> r9 "ValNumCSE"
; V12 cse1 [V12,T08] ( 3, 48 ) ref -> r10 "ValNumCSE"
;
; Lcl frame size = 32
G_M55753_IG01:
56 push rsi
4883EC20 sub rsp, 32
G_M55753_IG02:
33C0 xor eax, eax
33D2 xor edx, edx
G_M55753_IG03:
4533C0 xor r8d, r8d
4C8D4918 lea r9, bword ptr [rcx+24]
G_M55753_IG04:
FFC0 inc eax
4D8BD1 mov r10, r9
453912 cmp dword ptr [r10], r10d
4D8D5A08 lea r11, bword ptr [r10+8]
8BF2 mov esi, edx
410FAF32 imul esi, dword ptr [r10]
4103F0 add esi, r8d
4D8B13 mov r10, gword ptr [r11]
4D85D2 test r10, r10
742E je SHORT G_M55753_IG08
G_M55753_IG05:
41037308 add esi, dword ptr [r11+8]
413B7208 cmp esi, dword ptr [r10+8]
732A jae SHORT G_M55753_IG09
4C63DE movsxd r11, esi
4389449A10 mov dword ptr [r10+4*r11+16], eax
41FFC0 inc r8d
4181F840010000 cmp r8d, 320
7CC5 jl SHORT G_M55753_IG04
G_M55753_IG06:
FFC2 inc edx
81FAC8000000 cmp edx, 200
7CB4 jl SHORT G_M55753_IG03
G_M55753_IG07:
4883C420 add rsp, 32
5E pop rsi
C3 ret
G_M55753_IG08:
E8B897A6FF call KGySoft.Throw:IndexOutOfRangeException()
CC int3
G_M55753_IG09:
E8DACBDE5D call CORINFO_HELP_RNGCHKFAIL
CC int3
; Total bytes of code 103, prolog size 5 for method <>c__DisplayClass1_0:<AccessTest>b__2():this
; ============================================================ .NET 5.0; Assembly listing for method <>c__DisplayClass1_0:<AccessTest>b__0():this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 this [V00,T05] ( 3, 18 ) ref -> rcx this class-hnd
; V01 loc0 [V01,T03] ( 4, 49 ) int -> rax
; V02 loc1 [V02,T04] ( 5, 29 ) int -> rdx
; V03 loc2 [V03,T01] ( 5, 68 ) int -> r8
; V04 OutArgs [V04 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V05 tmp1 [V05,T02] ( 2, 64 ) int -> rax "dup spill"
; V06 rat0 [V06,T00] ( 6,192 ) ref -> r9 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 32
G_M34868_IG01: ;; offset=0000H
56 push rsi
4883EC20 sub rsp, 32
;; bbWeight=1 PerfScore 1.25
G_M34868_IG02: ;; offset=0005H
33C0 xor eax, eax
33D2 xor edx, edx
;; bbWeight=1 PerfScore 0.50
G_M34868_IG03: ;; offset=0009H
4533C0 xor r8d, r8d
;; bbWeight=4 PerfScore 1.00
G_M34868_IG04: ;; offset=000CH
FFC0 inc eax
4C8B4908 mov r9, gword ptr [rcx+8]
448BD2 mov r10d, edx
452B5118 sub r10d, dword ptr [r9+24]
453B5110 cmp r10d, dword ptr [r9+16]
733C jae SHORT G_M34868_IG07
458BD8 mov r11d, r8d
452B591C sub r11d, dword ptr [r9+28]
453B5914 cmp r11d, dword ptr [r9+20]
732F jae SHORT G_M34868_IG07
418B7114 mov esi, dword ptr [r9+20]
490FAFF2 imul rsi, r10
4D8BD3 mov r10, r11
4C03D6 add r10, rsi
4389449120 mov dword ptr [r9+4*r10+32], eax
41FFC0 inc r8d
4181F840010000 cmp r8d, 320
7CC1 jl SHORT G_M34868_IG04
;; bbWeight=16 PerfScore 316.00
G_M34868_IG05: ;; offset=004BH
FFC2 inc edx
81FAC8000000 cmp edx, 200
7CB4 jl SHORT G_M34868_IG03
;; bbWeight=4 PerfScore 6.00
G_M34868_IG06: ;; offset=0055H
4883C420 add rsp, 32
5E pop rsi
C3 ret
;; bbWeight=1 PerfScore 1.75
G_M34868_IG07: ;; offset=005BH
E840F7B05E call CORINFO_HELP_RNGCHKFAIL
CC int3
;; bbWeight=0 PerfScore 0.00
; Total bytes of code 97, prolog size 5, PerfScore 336.20, instruction count 31, allocated bytes for code 97 (MethodHash=e4a377cb) for method <>c__DisplayClass1_0:<AccessTest>b__0():this
; ============================================================
; Assembly listing for method <>c__DisplayClass1_0:<AccessTest>b__1():this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 this [V00,T08] ( 3, 6 ) ref -> rcx this class-hnd
; V01 loc0 [V01,T04] ( 4, 49 ) int -> rax
; V02 loc1 [V02,T05] ( 6, 33 ) int -> rdx
; V03 loc2 [V03,T02] ( 6, 84 ) int -> r8
; V04 OutArgs [V04 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V05 tmp1 [V05,T03] ( 2, 64 ) int -> rax "dup spill"
; V06 tmp2 [V06,T00] ( 3, 96 ) ref -> r11 "arr expr"
; V07 tmp3 [V07,T01] ( 3, 96 ) ref -> r11 "arr expr"
; V08 cse0 [V08,T06] ( 2, 20 ) ref -> r9 "CSE - aggressive"
; V09 cse1 [V09,T07] ( 2, 20 ) long -> r10 "CSE - aggressive"
;
; Lcl frame size = 32
G_M58453_IG01: ;; offset=0000H
56 push rsi
4883EC20 sub rsp, 32
;; bbWeight=1 PerfScore 1.25
G_M58453_IG02: ;; offset=0005H
33C0 xor eax, eax
33D2 xor edx, edx
;; bbWeight=1 PerfScore 0.50
G_M58453_IG03: ;; offset=0009H
4533C0 xor r8d, r8d
4C8B4910 mov r9, gword ptr [rcx+16]
4C63D2 movsxd r10, edx
;; bbWeight=4 PerfScore 10.00
G_M58453_IG04: ;; offset=0013H
FFC0 inc eax
4D8BD9 mov r11, r9
413B5308 cmp edx, dword ptr [r11+8]
732F jae SHORT G_M58453_IG07
4F8B5CD310 mov r11, gword ptr [r11+8*r10+16]
453B4308 cmp r8d, dword ptr [r11+8]
7324 jae SHORT G_M58453_IG07
4963F0 movsxd rsi, r8d
418944B310 mov dword ptr [r11+4*rsi+16], eax
41FFC0 inc r8d
4181F840010000 cmp r8d, 320
7CD6 jl SHORT G_M58453_IG04
;; bbWeight=16 PerfScore 180.00
G_M58453_IG05: ;; offset=003DH
FFC2 inc edx
81FAC8000000 cmp edx, 200
7CC2 jl SHORT G_M58453_IG03
;; bbWeight=4 PerfScore 6.00
G_M58453_IG06: ;; offset=0047H
4883C420 add rsp, 32
5E pop rsi
C3 ret
;; bbWeight=1 PerfScore 1.75
G_M58453_IG07: ;; offset=004DH
E8AEF4B05E call CORINFO_HELP_RNGCHKFAIL
CC int3
;; bbWeight=0 PerfScore 0.00
; Total bytes of code 83, prolog size 5, PerfScore 207.80, instruction count 27, allocated bytes for code 83 (MethodHash=06451baa) for method <>c__DisplayClass1_0:<AccessTest>b__1():this
; ============================================================
; Assembly listing for method <>c__DisplayClass1_0:<AccessTest>b__2():this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; fully interruptible
; Final local variable assignments
;
; V00 this [V00,T11] ( 3, 6 ) ref -> rcx this class-hnd
; V01 loc0 [V01,T07] ( 4, 49 ) int -> rax
; V02 loc1 [V02,T09] ( 5, 29 ) int -> rdx
; V03 loc2 [V03,T04] ( 5, 68 ) int -> r8
; V04 OutArgs [V04 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V05 tmp1 [V05,T05] ( 2, 64 ) int -> rax "dup spill"
; V06 tmp2 [V06,T00] ( 3, 96 ) byref -> r10 "Inlining Arg"
; V07 tmp3 [V07,T01] ( 3, 96 ) byref -> r11 "Inlining Arg"
; V08 tmp4 [V08,T06] ( 2, 64 ) int -> rsi "Inlining Arg"
; V09 tmp5 [V09,T02] ( 3, 96 ) ref -> r10 "arr expr"
; V10 tmp6 [V10,T03] ( 3, 96 ) int -> rsi "index expr"
; V11 cse0 [V11,T10] ( 2, 20 ) byref -> r9 "CSE - aggressive"
; V12 cse1 [V12,T08] ( 3, 48 ) ref -> r10 "CSE - aggressive"
;
; Lcl frame size = 32
G_M45046_IG01: ;; offset=0000H
56 push rsi
4883EC20 sub rsp, 32
;; bbWeight=1 PerfScore 1.25
G_M45046_IG02: ;; offset=0005H
33C0 xor eax, eax
33D2 xor edx, edx
;; bbWeight=1 PerfScore 0.50
G_M45046_IG03: ;; offset=0009H
4533C0 xor r8d, r8d
4C8D4918 lea r9, bword ptr [rcx+24]
;; bbWeight=4 PerfScore 3.00
G_M45046_IG04: ;; offset=0010H
FFC0 inc eax
4D8BD1 mov r10, r9
4D8D5A08 lea r11, bword ptr [r10+8]
8BF2 mov esi, edx
410FAF32 imul esi, dword ptr [r10]
4103F0 add esi, r8d
4D8B13 mov r10, gword ptr [r11]
4D85D2 test r10, r10
742E je SHORT G_M45046_IG08
;; bbWeight=16 PerfScore 108.00
G_M45046_IG05: ;; offset=002AH
41037308 add esi, dword ptr [r11+8]
413B7208 cmp esi, dword ptr [r10+8]
732A jae SHORT G_M45046_IG09
4C63DE movsxd r11, esi
4389449A10 mov dword ptr [r10+4*r11+16], eax
41FFC0 inc r8d
4181F840010000 cmp r8d, 320
7CC8 jl SHORT G_M45046_IG04
;; bbWeight=16 PerfScore 124.00
G_M45046_IG06: ;; offset=0048H
FFC2 inc edx
81FAC8000000 cmp edx, 200
7CB7 jl SHORT G_M45046_IG03
;; bbWeight=4 PerfScore 6.00
G_M45046_IG07: ;; offset=0052H
4883C420 add rsp, 32
5E pop rsi
C3 ret
;; bbWeight=1 PerfScore 1.75
G_M45046_IG08: ;; offset=0058H
E81BA3A4FF call KGySoft.Throw:IndexOutOfRangeException()
CC int3
;; bbWeight=0 PerfScore 0.00
G_M45046_IG09: ;; offset=005EH
E81DF4B05E call CORINFO_HELP_RNGCHKFAIL
CC int3
;; bbWeight=0 PerfScore 0.00
; Total bytes of code 100, prolog size 5, PerfScore 254.50, instruction count 33, allocated bytes for code 100 (MethodHash=0c6a5009) for method <>c__DisplayClass1_0:<AccessTest>b__2():this
; ============================================================ Here are my observations:
public static void RangeCheck()
{
int i = 0;
const int width = 320;
const int height = 200;
var array = new int[height, width];
for (int y = 0; y < height; y++)
{
for (int x = 0; x < width; x++)
array[y, x] = ++i;
}
} And here is the loop code: G_M23248_IG03: ;; offset=0039H
33C9 xor ecx, ecx
G_M23248_IG04: ;; offset=003BH
FFC6 inc esi
448BC2 mov r8d, edx
442B4018 sub r8d, dword ptr [rax+24]
443B4010 cmp r8d, dword ptr [rax+16]
733A jae SHORT G_M23248_IG07
448BC9 mov r9d, ecx
442B481C sub r9d, dword ptr [rax+28]
443B4814 cmp r9d, dword ptr [rax+20]
732D jae SHORT G_M23248_IG07
448B5014 mov r10d, dword ptr [rax+20]
4D0FAFD0 imul r10, r8
4D8BC1 mov r8, r9
4D03C2 add r8, r10
4289748020 mov dword ptr [rax+4*r8+32], esi
FFC1 inc ecx
81F940010000 cmp ecx, 320
7CC7 jl SHORT G_M23248_IG04
G_M23248_IG05: ;; offset=0074H
FFC2 inc edx
81FAC8000000 cmp edx, 200
7CBB jl SHORT G_M23248_IG03
; ..
; ..
G_M23248_IG07: ;; offset=0084H
E84709775F call CORINFO_HELP_RNGCHKFAIL
CC int3 cc: @briansull , @AndyAyersMS That leads to two suspicions: code alignment or memory alignment. Code alignment: I verified the code alignment part and here is my observation:
Before loop alignment: G_M34868_IG04: ;; offset=0010H
00007ffb`9cb6a710 FFC0 inc eax
00007ffb`9cb6a712 4C8B4908 mov r9, gword ptr [rcx+8]
00007ffb`9cb6a716 448BD2 mov r10d, edx
00007ffb`9cb6a719 452B5118 sub r10d, dword ptr [r9+24]
00007ffb`9cb6a71d 453B5110 cmp r10d, dword ptr [r9+16]
; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (cmp: 1) 32B boundary ...............................
00007ffb`9cb6a721 733C jae SHORT G_M34868_IG07
00007ffb`9cb6a723 458BD8 mov r11d, r8d
00007ffb`9cb6a726 452B591C sub r11d, dword ptr [r9+28]
00007ffb`9cb6a72a 453B5914 cmp r11d, dword ptr [r9+20]
00007ffb`9cb6a72e 732F jae SHORT G_M34868_IG07
00007ffb`9cb6a730 418B7114 mov esi, dword ptr [r9+20]
00007ffb`9cb6a734 490FAFF2 imul rsi, r10
00007ffb`9cb6a738 4D8BD3 mov r10, r11
00007ffb`9cb6a73b 4C03D6 add r10, rsi
00007ffb`9cb6a73e 4389449120 mov dword ptr [r9+4*r10+32], eax
; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (mov: 3) 32B boundary ...............................
00007ffb`9cb6a743 41FFC0 inc r8d
00007ffb`9cb6a746 4181F840010000 cmp r8d, 320
00007ffb`9cb6a74d 7CC1 jl SHORT G_M34868_IG04 After loop alignment: ;; Add alignment: 'Padding= 4, AlignmentBoundary= 16B.' in (<>c__DisplayClass1_0:<AccessTest>b__0():this)
00007ffb`9cb6a70c 0F1F4000 align
;; bbWeight=4 PerfScore 2.00
G_M34868_IG04: ;; offset=0010H
00007ffb`9cb6a710 FFC0 inc eax
00007ffb`9cb6a712 4C8B4908 mov r9, gword ptr [rcx+8]
00007ffb`9cb6a716 448BD2 mov r10d, edx
00007ffb`9cb6a719 452B5118 sub r10d, dword ptr [r9+24]
00007ffb`9cb6a71d 453B5110 cmp r10d, dword ptr [r9+16]
; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (cmp: 1) 32B boundary ...............................
00007ffb`9cb6a721 733C jae SHORT G_M34868_IG07
00007ffb`9cb6a723 458BD8 mov r11d, r8d
00007ffb`9cb6a726 452B591C sub r11d, dword ptr [r9+28]
00007ffb`9cb6a72a 453B5914 cmp r11d, dword ptr [r9+20]
00007ffb`9cb6a72e 732F jae SHORT G_M34868_IG07
00007ffb`9cb6a730 418B7114 mov esi, dword ptr [r9+20]
00007ffb`9cb6a734 490FAFF2 imul rsi, r10
00007ffb`9cb6a738 4D8BD3 mov r10, r11
00007ffb`9cb6a73b 4C03D6 add r10, rsi
00007ffb`9cb6a73e 4389449120 mov dword ptr [r9+4*r10+32], eax
; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (mov: 3) 32B boundary ...............................
00007ffb`9cb6a743 41FFC0 inc r8d
00007ffb`9cb6a746 4181F840010000 cmp r8d, 320
00007ffb`9cb6a74d 7CC1 jl SHORT G_M34868_IG04
;; bbWeight=16 PerfScore 316.00
G_M34868_IG05: ;; offset=004FH
00007ffb`9cb6a74f FFC2 inc edx
00007ffb`9cb6a751 81FAC8000000 cmp edx, 200
00007ffb`9cb6a757 7CB0 jl SHORT G_M34868_IG03
00007ffb`9cb6a9d0 4C63D2 movsxd r10, edx
;; Skip alignment: 'PaddingNeeded= 13, MaxPadding= 8, LoopSize= 42, AlignmentBoundary= 16B.' in (<>c__DisplayClass1_0:<AccessTest>b__1():this)
00007ffb`9cb6a9d3 align
;; bbWeight=4 PerfScore 11.00
G_M58453_IG04: ;; offset=0013H
00007ffb`9cb6a9d3 FFC0 inc eax
00007ffb`9cb6a9d5 4D8BD9 mov r11, r9
00007ffb`9cb6a9d8 413B5308 cmp edx, dword ptr [r11+8]
00007ffb`9cb6a9dc 732F jae SHORT G_M58453_IG07
00007ffb`9cb6a9de 4F8B5CD310 mov r11, gword ptr [r11+8*r10+16]
; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (mov: 3) 32B boundary ...............................
00007ffb`9cb6a9e3 453B4308 cmp r8d, dword ptr [r11+8]
00007ffb`9cb6a9e7 7324 jae SHORT G_M58453_IG07
00007ffb`9cb6a9e9 4963F0 movsxd rsi, r8d
00007ffb`9cb6a9ec 418944B310 mov dword ptr [r11+4*rsi+16], eax
00007ffb`9cb6a9f1 41FFC0 inc r8d
00007ffb`9cb6a9f4 4181F840010000 cmp r8d, 320
00007ffb`9cb6a9fb 7CD6 jl SHORT G_M58453_IG04
;; bbWeight=16 PerfScore 180.00
Memory alignment: Given the fact that no code changes happened between .NET 3.1 and .NET 5 and code alignment doesn't play much of a role because of the loop size, I think memory alignment could be the reason for the inconsistent behavior, although I am not sure why it would regress for .NET 5 in particular. I am not sure if any GC heuristics have changed that would align memory differently for allocations. To conclude, if we can eliminate the range check, we should be able to fit such loops in a single cache line and hence get better performance. We should also investigate if there is anything around memory alignment that has changed between .NET 3.1 and .NET 5. I also noticed some Hope that helps! |
Setting area to codegen since it seems next action is there. |
I spent some more time today to compare the performance of .NET3.1 vs. .NET6 for Here are the fresh numbers on my Windows x64 machine:
I then went ahead and measured individual array access numbers and you can see them here: Individual array access benchmark numbers
As you can see, the only benchmark that is slow is "int[y, x] = value" and the reason is loop alignment padding + the way the benchmark is run. The 4-byte padding is before the innermost loop and it gets executed
We already have issue #43227, which captures the work item to place padding at an appropriate location that would not affect performance adversely. At this point, I don't see any other actionable items for this issue, so I will go ahead and close it. Feel free to comment / reopen if you have any other questions. Thanks for reporting! |
Description
Note: I don't know whether the root cause of my issue is related with #36907 so I report an easily reproducible scenario here.
I have a high-performance core library, which has some multidimensional span-like types such as Array2D and Array3D structs, which are affected by the performance degradation: accessing elements of these types is faster on .NET Core 3 than accessing elements of a regular multidimensional array, but not when executing on .NET 5.
Reproduction:
KGySoft.CoreLibraries.PerformanceTest
project execute the Array2DPerformanceTest.AccessTest against both .NET Core 3 and .NET 5. Online live version: I also created an online example. As of 11/10/2020 this executes the performance test on .NET Core 3.1 (this is a somewhat shortened test in order not to time out). Targeting .NET 5 is not possible on .NET Fiddle yet.
Configuration
Regression?
The regression can be observed between .NET Core 3.0/3.1 and .NET 5.0
Data
The Array2D case has a 33% performance degradation (860 ms vs. 642 ms) while the regular 2D array and jagged array performance essentially did not change.
Note: This test is reduced (both in time and cases) in order not to time out on .NET Fiddle. As it is not possible to run .NET 5 code online yet, it is only good for demonstrating that Array2D access is faster than regular 2D array access.
Analysis
I'm not sure whether I could identify the hot-spot correctly but since Array2D uses ArraySection internally, and I could not observe any significant performance degradation in the ArraySection performance test (feel free to set Repeat = 5 to get more reliable results just like above) I suspect that the issue lies in accessing the wrapped ArraySection struct inside the Array2D struct here. However, I could not find anything suspicious in the IL code, and I could not check the JITted machine code of the .NET 5 version.
The text was updated successfully, but these errors were encountered: