Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure Vector<T>.op_Multiply is handled as an intrinsic in appropriate cases #49503

Merged
merged 9 commits into from
Mar 25, 2021

Conversation

tannergooding
Copy link
Member

@tannergooding tannergooding commented Mar 11, 2021

This resolves #49071 and resolves #30923 by ensuring Vector<T> * T is treated as an intrinsic so it generates the ideal codegen and not a for loop:

BenchmarkDotNet=v0.12.1.1466-nightly, OS=Windows 10.0.19042
AMD Ryzen 9 5950X, 1 CPU, 32 logical and 16 physical cores
.NET SDK=6.0.100-preview.3.21161.17
  [Host]     : .NET 6.0.0 (6.0.21.15916), X64 RyuJIT
  Job-KMJTTY : .NET 6.0.0 (42.42.42.42424), X64 RyuJIT

PowerPlanMode=00000000-0000-0000-0000-000000000000  Arguments=/p:DebugType=portable  Toolchain=CoreRun
IterationTime=250.0000 ms  MaxIterationCount=20  MinIterationCount=15
WarmupCount=1

.NET 5

Method Mean Error StdDev Median Min Max Gen 0 Gen 1 Gen 2 Allocated
Burgers_3 66.19 ms 0.324 ms 0.287 ms 66.16 ms 65.52 ms 66.53 ms - - - 156 KB
Disassembly .NET 5.0.4 (5.0.421.11614), X64 RyuJIT

; Burgers.GetCalculated3(Int32, Int32, Double, Double, Double, Double[])
       push      r15
       push      r14
       push      r12
       push      rdi
       push      rsi
       push      rbp
       push      rbx
       sub       rsp,50
       vzeroupper
       vmovaps   [rsp+40],xmm6
       vmovaps   [rsp+30],xmm7
       vmovaps   [rsp+20],xmm8
       mov       edi,ecx
       mov       esi,edx
       vmovaps   xmm6,xmm2
       vmovaps   xmm7,xmm3
       mov       rbx,[rsp+0B8]
       mov       edx,esi
       sar       edx,1F
       and       edx,3
       add       edx,esi
       and       edx,0FFFFFFFC
       mov       ecx,esi
       sub       ecx,edx
       mov       edx,ecx
       neg       edx
       lea       ebp,[rdx+rsi+4]
       movsxd    rdx,ebp
       mov       rcx,offset MT_System.Double[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       r14,rax
       movsxd    rdx,ebp
       mov       rcx,offset MT_System.Double[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       r15,rax
       mov       r8d,[rbx+8]
       mov       rcx,rbx
       mov       rdx,r15
       call      System.Array.Copy(System.Array, System.Array, Int32)
       vmulsd    xmm0,xmm7,qword ptr [rsp+0B0]
       vdivsd    xmm0,xmm0,xmm6
       vmovsd    xmm1,qword ptr [7FF99D1F1DE8]
       call      System.Math.Pow(Double, Double)
       xor       edx,edx
       test      edi,edi
       jle       near ptr M01_L03
       add       ebp,0FFFFFFFD
       lea       eax,[rsi+0FFFF]
       movsxd    rcx,eax
       add       esi,0FFFFFFFE
       movsxd    r8,esi
M01_L00:
       mov       r9d,1
       cmp       ebp,1
       jle       near ptr M01_L02
       mov       r10d,[r15+8]
       vdivsd    xmm1,xmm7,xmm6
M01_L01:
       cmp       r9d,r10d
       jae       near ptr M01_L05
       lea       ebx,[r9+3]
       cmp       ebx,r10d
       jae       near ptr M01_L05
       vmovupd   ymm2,[r15+r9*8+10]
       lea       r11d,[r9+0FFFF]
       cmp       r11d,r10d
       jae       near ptr M01_L05
       lea       r12d,[r9+2]
       cmp       r12d,r10d
       jae       near ptr M01_L05
       vmovupd   ymm3,[r15+r11*8+10]
       lea       r11d,[r9+1]
       cmp       r11d,r10d
       jae       near ptr M01_L05
       lea       r12d,[r9+4]
       cmp       r12d,r10d
       jae       near ptr M01_L05
       vmovupd   ymm4,[r15+r11*8+10]
       vbroadcastsd ymm5,xmm1
       vmulpd    ymm5,ymm5,ymm2
       vsubpd    ymm8,ymm2,ymm3
       vmulpd    ymm5,ymm5,ymm8
       vsubpd    ymm5,ymm2,ymm5
       vbroadcastsd ymm8,qword ptr [7FF99D1F1DF8]
       vmulpd    ymm2,ymm8,ymm2
       vsubpd    ymm2,ymm4,ymm2
       vaddpd    ymm2,ymm2,ymm3
       vbroadcastsd ymm3,xmm0
       vmulpd    ymm2,ymm3,ymm2
       vaddpd    ymm2,ymm5,ymm2
       mov       r11d,[r14+8]
       cmp       r9d,r11d
       jae       near ptr M01_L06
       cmp       ebx,r11d
       jae       near ptr M01_L07
       vmovupd   [r14+r9*8+10],ymm2
       mov       r9d,r12d
       cmp       r9d,ebp
       jl        near ptr M01_L01
M01_L02:
       mov       r10d,[r15+8]
       cmp       r10d,0
       jbe       near ptr M01_L05
       vmovsd    xmm1,qword ptr [r15+10]
       vmulsd    xmm2,xmm1,xmm7
       vdivsd    xmm2,xmm2,xmm6
       cmp       eax,r10d
       jae       near ptr M01_L05
       vmovsd    xmm3,qword ptr [r15+rcx*8+10]
       vsubsd    xmm4,xmm1,xmm3
       vmulsd    xmm2,xmm2,xmm4
       vsubsd    xmm2,xmm1,xmm2
       cmp       r10d,1
       jbe       near ptr M01_L05
       vmovsd    xmm4,qword ptr [r15+18]
       vmulsd    xmm1,xmm1,qword ptr [7FF99D1F1E08]
       vsubsd    xmm1,xmm4,xmm1
       vaddsd    xmm1,xmm1,xmm3
       vmulsd    xmm1,xmm1,xmm0
       vaddsd    xmm1,xmm2,xmm1
       mov       r11d,[r14+8]
       cmp       r11d,0
       jbe       near ptr M01_L05
       vmovsd    qword ptr [r14+10],xmm1
       vmovsd    xmm1,qword ptr [r15+rcx*8+10]
       vmulsd    xmm2,xmm1,xmm7
       vdivsd    xmm2,xmm2,xmm6
       cmp       esi,r10d
       jae       near ptr M01_L05
       vmovsd    xmm3,qword ptr [r15+r8*8+10]
       vsubsd    xmm4,xmm1,xmm3
       vmulsd    xmm2,xmm2,xmm4
       vsubsd    xmm2,xmm1,xmm2
       vaddsd    xmm1,xmm1,xmm1
       vmovsd    xmm4,qword ptr [r15+10]
       vsubsd    xmm1,xmm4,xmm1
       vaddsd    xmm1,xmm1,xmm3
       vmulsd    xmm1,xmm1,xmm0
       vaddsd    xmm1,xmm2,xmm1
       cmp       eax,r11d
       jae       short M01_L05
       vmovsd    qword ptr [r14+rcx*8+10],xmm1
       inc       edx
       cmp       edx,edi
       jl        short M01_L04
       mov       r15,r14
M01_L03:
       mov       rax,r15
       vmovaps   xmm6,[rsp+40]
       vmovaps   xmm7,[rsp+30]
       vmovaps   xmm8,[rsp+20]
       vzeroupper
       add       rsp,50
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r12
       pop       r14
       pop       r15
       ret
M01_L04:
       xchg      r14,r15
       jmp       near ptr M01_L00
M01_L05:
       call      CORINFO_HELP_RNGCHKFAIL
M01_L06:
       call      CORINFO_HELP_THROW_ARGUMENTOUTOFRANGEEXCEPTION
M01_L07:
       call      CORINFO_HELP_THROW_ARGUMENTEXCEPTION
       int       3
; Total bytes of code 672

.NET 6

Method Mean Error StdDev Median Min Max Gen 0 Gen 1 Gen 2 Allocated
Burgers_3 76.40 ms 0.266 ms 0.249 ms 76.45 ms 75.93 ms 76.72 ms - - - 158 KB
Disassembly .NET 6.0.0 (42.42.42.42424), X64 RyuJIT

; Burgers.GetCalculated3(Int32, Int32, Double, Double, Double, Double[])
       push      r15
       push      r14
       push      r12
       push      rdi
       push      rsi
       push      rbp
       push      rbx
       sub       rsp,60
       vzeroupper
       vmovaps   [rsp+50],xmm6
       vmovaps   [rsp+40],xmm7
       vmovaps   [rsp+30],xmm8
       vmovaps   [rsp+20],xmm9
       mov       edi,ecx
       mov       esi,edx
       vmovaps   xmm6,xmm2
       vmovaps   xmm7,xmm3
       mov       rbx,[rsp+0C8]
       mov       edx,esi
       sar       edx,1F
       and       edx,3
       add       edx,esi
       and       edx,0FFFFFFFC
       mov       ecx,esi
       sub       ecx,edx
       mov       edx,ecx
       neg       edx
       lea       ebp,[rdx+rsi+4]
       movsxd    rdx,ebp
       mov       rcx,offset MT_System.Double[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       r14,rax
       movsxd    rdx,ebp
       mov       rcx,offset MT_System.Double[]
       call      CORINFO_HELP_NEWARR_1_VC
       mov       r15,rax
       mov       r8d,[rbx+8]
       mov       rcx,rbx
       mov       rdx,r15
       call      System.Array.Copy(System.Array, System.Array, Int32)
       vmulsd    xmm0,xmm7,qword ptr [rsp+0C0]
       vdivsd    xmm0,xmm0,xmm6
       vmovsd    xmm8,qword ptr [7FF99709A800]
       vmovaps   xmm1,xmm8
       call      System.Math.Pow(Double, Double)
       xor       edx,edx
       test      edi,edi
       jle       near ptr M01_L03
       add       ebp,0FFFFFFFD
       lea       eax,[rsi+0FFFF]
       movsxd    rcx,eax
       add       esi,0FFFFFFFE
       movsxd    r8,esi
M01_L00:
       mov       r9d,1
       cmp       ebp,1
       jle       near ptr M01_L02
       mov       r10d,[r15+8]
       vdivsd    xmm1,xmm7,xmm6
M01_L01:
       cmp       r9d,r10d
       jae       near ptr M01_L05
       lea       ebx,[r9+3]
       cmp       ebx,r10d
       jae       near ptr M01_L05
       vmovupd   ymm2,[r15+r9*8+10]
       lea       r11d,[r9+0FFFF]
       cmp       r11d,r10d
       jae       near ptr M01_L05
       lea       r12d,[r9+2]
       cmp       r12d,r10d
       jae       near ptr M01_L05
       vmovupd   ymm3,[r15+r11*8+10]
       lea       r11d,[r9+1]
       cmp       r11d,r10d
       jae       near ptr M01_L05
       lea       r12d,[r9+4]
       cmp       r12d,r10d
       jae       near ptr M01_L05
       vmovupd   ymm4,[r15+r11*8+10]
       vmovaps   xmm5,xmm1
       vbroadcastsd ymm5,xmm5
       vmulpd    ymm5,ymm2,ymm5
       vsubpd    ymm9,ymm2,ymm3
       vmulpd    ymm5,ymm5,ymm9
       vsubpd    ymm5,ymm2,ymm5
       vmovaps   xmm9,xmm8
       vbroadcastsd ymm9,xmm9
       vmulpd    ymm2,ymm9,ymm2
       vsubpd    ymm2,ymm4,ymm2
       vaddpd    ymm2,ymm2,ymm3
       vmovaps   xmm3,xmm0
       vbroadcastsd ymm3,xmm3
       vmulpd    ymm2,ymm3,ymm2
       vaddpd    ymm2,ymm5,ymm2
       mov       r11d,[r14+8]
       cmp       r9d,r11d
       jae       near ptr M01_L06
       cmp       ebx,r11d
       jae       near ptr M01_L07
       vmovupd   [r14+r9*8+10],ymm2
       mov       r9d,r12d
       cmp       r9d,ebp
       jl        near ptr M01_L01
M01_L02:
       mov       r10d,[r15+8]
       test      r10d,r10d
       je        near ptr M01_L05
       vmovsd    xmm1,qword ptr [r15+10]
       vmulsd    xmm2,xmm1,xmm7
       vdivsd    xmm2,xmm2,xmm6
       cmp       eax,r10d
       jae       near ptr M01_L05
       vmovsd    xmm3,qword ptr [r15+rcx*8+10]
       vsubsd    xmm4,xmm1,xmm3
       vmulsd    xmm2,xmm2,xmm4
       vsubsd    xmm2,xmm1,xmm2
       cmp       r10d,1
       jbe       near ptr M01_L05
       vmovsd    xmm4,qword ptr [r15+18]
       vmulsd    xmm1,xmm1,xmm8
       vsubsd    xmm1,xmm4,xmm1
       vaddsd    xmm1,xmm1,xmm3
       vmulsd    xmm1,xmm1,xmm0
       vaddsd    xmm1,xmm2,xmm1
       mov       r11d,[r14+8]
       test      r11d,r11d
       je        near ptr M01_L05
       vmovsd    qword ptr [r14+10],xmm1
       vmovsd    xmm1,qword ptr [r15+rcx*8+10]
       vmulsd    xmm2,xmm1,xmm7
       vdivsd    xmm2,xmm2,xmm6
       cmp       esi,r10d
       jae       near ptr M01_L05
       vmovsd    xmm3,qword ptr [r15+r8*8+10]
       vsubsd    xmm4,xmm1,xmm3
       vmulsd    xmm2,xmm2,xmm4
       vsubsd    xmm2,xmm1,xmm2
       vmovsd    xmm4,qword ptr [r15+10]
       vmulsd    xmm1,xmm1,xmm8
       vsubsd    xmm1,xmm4,xmm1
       vaddsd    xmm1,xmm1,xmm3
       vmulsd    xmm1,xmm1,xmm0
       vaddsd    xmm1,xmm2,xmm1
       cmp       eax,r11d
       jae       short M01_L05
       vmovsd    qword ptr [r14+rcx*8+10],xmm1
       inc       edx
       cmp       edx,edi
       jl        short M01_L04
       mov       r15,r14
M01_L03:
       mov       rax,r15
       vmovaps   xmm6,[rsp+50]
       vmovaps   xmm7,[rsp+40]
       vmovaps   xmm8,[rsp+30]
       vmovaps   xmm9,[rsp+20]
       vzeroupper
       add       rsp,60
       pop       rbx
       pop       rbp
       pop       rsi
       pop       rdi
       pop       r12
       pop       r14
       pop       r15
       ret
M01_L04:
       xchg      r14,r15
       jmp       near ptr M01_L00
M01_L05:
       call      CORINFO_HELP_RNGCHKFAIL
M01_L06:
       call      CORINFO_HELP_THROW_ARGUMENTOUTOFRANGEEXCEPTION
M01_L07:
       call      CORINFO_HELP_THROW_ARGUMENTEXCEPTION
       int       3
; Total bytes of code 694

Notably it is still slightly slower. This appears to be largely due to additional movaps inserted before the broadcast instructions:

       vmovaps   xmm9,xmm8
       vbroadcastsd ymm9,xmm9

This appears to be because we insert Avx2.BroadcastScalarToVector256(Vector128.CreateScalarUnsafe(value)) during lowering. The register allocator then assigns a different register for CreateScalarUnsafe even though its marked as tgtPrefUse (this appears to be related to this happening inside a loop).

I'm investigating further to see if there is a trivial fix here (other than expanding Vector256_Create earlier, such as in the importer).

@dotnet-issue-labeler dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Mar 11, 2021
@tannergooding
Copy link
Member Author

tannergooding commented Mar 11, 2021

CC. @echesakovMSFT

@tannergooding tannergooding requested a review from echesakov March 12, 2021 20:51
@tannergooding
Copy link
Member Author

CC. @dotnet/jit-contrib

GenTree** broadcastOp = nullptr;

if (varTypeIsArithmetic(op1->TypeGet()))
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For Arm64 you don't neet broadcast operation when either op1 or op2 are floating-point.
Since the operands will be in SIMD registers already you should use MultiplyByElement (where elementIndex is 0) instead

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also don't think you need to broadcast an integer value to the whole SIMD register.
You can use ins Vd.T[0], Xd followed by mul Vd, Vn, Vd.T[0] instead.

In this case, you can share some of the implementation for floating-point and integer types.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should need a CreateScalarUnsafe in this case instead then. I'll update to do the right thing.

  • Notably this wasn't possible before the ARM HWIntrinsics were brought online, which is likely why Vector wasn't doing that already.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Notably, MultiplyByScalar and MultiplyBySelectedScalar don't work for byte/sbyte and so those still need a broadcast.

case TYP_INT:
case TYP_UINT:
{
hwIntrinsic = NI_AVX2_MultiplyLow;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouildn't this be guarded with compOpportunisticallyDependsOn(InstructionSet_AVX2)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, its already asserted at the top of the method (https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/simdashwintrinsic.cpp#L392) as well as properly guarded further up in the stack.

We should never encounter SimdAsHWIntrinsicClassId::VectorT256 if AVX2 is not supported.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why at least AVX2 is required for VectorT256? Shouldn't AVX be sufficient?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Vector<T> supports all 10 primitive types and can only properly accelerate the integer types with AVX2 so it was decided it is only 32-bytes if AVX2 is supported and is 16-bytes otherwise, for the best overall perf/experience.

@tannergooding
Copy link
Member Author

@echesakovMSFT, any other feedback here or is it good to merge?

Copy link
Contributor

@echesakov echesakov left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks Good.
Thank you

@tannergooding tannergooding merged commit 88c0c48 into dotnet:main Mar 25, 2021
@ghost ghost locked as resolved and limited conversation to collaborators Apr 24, 2021
@karelz karelz added this to the 6.0.0 milestone May 20, 2021
@tannergooding tannergooding deleted the fix-49071 branch November 11, 2022 15:26
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Projects
None yet
Development

Successfully merging this pull request may close these issues.

[Perf] Burgers.Test3 Regression on x64 Improve Vector<uint> and Vector<ushort> multiply
3 participants