Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimizes vector conversions with AVX512 #87878

Closed

Conversation

khushal1996
Copy link
Contributor

@khushal1996 khushal1996 commented Jun 21, 2023

NO NEED FOR REVIEW AT THIS TIME.

The following new APIs have been added as part of this PR:

1. Vector.ConvertToUInt64

public static Vector256<ulong> ConvDoubleToUlong256(Vector256<double> val)
{
    return Vector256.ConvertToUInt64(val);
}

Assembly Before Optimization

G_M9746_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M9746_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ffb63022898)], 0
       je       SHORT G_M9746_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToUInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[ulong]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M9746_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M9746_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M9746_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ffb61d22898)], 0
       je       SHORT G_M9746_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvttpd2uqq ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M9746_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75	

2. Vector.ConvertToDouble

public static Vector256<double> ConvUlongToDouble256(Vector256<ulong> val)
{
   return Vector256.ConvertToDouble(val);
}

Assembly Before Optimization

G_M14098_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M14098_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ffb61ac2898)], 0
       je       SHORT G_M14098_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToDouble(System.Runtime.Intrinsics.Vector256`1[ulong]):System.Runtime.Intrinsics.Vector256`1[double]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M14098_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M14098_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M14098_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ffb61ae2898)], 0
       je       SHORT G_M14098_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvtuqq2pd ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M14098_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75

3. Vector.ConvertToInt64

public static Vector256<long> ConvDoubleToLong256(Vector256<double> val)
{
    return Vector256.ConvertToInt64(val);
}

Assembly Before Optimization

G_M10770_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M10770_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ff8bed1c5f8)], 0
       je       SHORT G_M10770_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[long]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M10770_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M10770_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M10770_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ff8babbc5f8)], 0
       je       SHORT G_M10770_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvttpd2qq ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M10770_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75

khushal1996 and others added 30 commits June 18, 2023 13:57
…vtsd2usi uses ulong.max_value to show FPE for negative, NAN and ulong_max + 1 values.
…architecture. This is because we have changed the JITDbl2Ulng helper function to mimic the new IEEE-compliant AVX512 instruction vcvtsd2usi. In the process, we needed to update the library test case because the default Floating Point Error (FPE) value for the new instruction is different from the default MSVC FPE value, i.e. 0.
…not changing the library test case but the API to make sure NaN cases are handled.
…id handling edge cases (-1,0) separately inside the helper.
trying to return EA_4BYTE for INS_vcvttss2usi to make sure that we read dword and not qword for float to ulong
… a special handling for vcvttss2usi64 to make sure we read only dword instead of qword for float to ulong conversion
…r nowayasserts and also checking for both float and double in lowercast for overflow and conversion to ulong
…) into a single node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT)
…ideration 32bit and 64 bit version of vcvttss2usi.
…ertToUInt64 for double, ConvertToInt64 for double, ConvertToDouble for ulong/long for vector
…ort those conversions due to issues related to mismatch between non AVX512 and AVX512 machine
…X512 and also extending NI_VectorT512_ConvertToInt64 and NI_VectorT512_ConvertToUInt64 support for Float.
@dotnet-issue-labeler dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jun 21, 2023
@ghost ghost added the community-contribution Indicates that the PR has been added by a community member label Jun 21, 2023
@ghost
Copy link

ghost commented Jun 21, 2023

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

The following new APIs have been added as part of this PR:

1. Vector.ConvertToUInt64

public static Vector256<ulong> ConvDoubleToUlong256(Vector256<double> val)
{
    return Vector256.ConvertToUInt64(val);
}

Assembly Before Optimization

G_M9746_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M9746_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ffb63022898)], 0
       je       SHORT G_M9746_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToUInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[ulong]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M9746_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M9746_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M9746_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ffb61d22898)], 0
       je       SHORT G_M9746_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M9746_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M9746_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvttpd2uqq ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M9746_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M9746_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75	

2. Vector.ConvertToDouble

public static Vector256<double> ConvUlongToDouble256(Vector256<ulong> val)
{
   return Vector256.ConvertToDouble(val);
}

Assembly Before Optimization

G_M14098_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M14098_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ffb61ac2898)], 0
       je       SHORT G_M14098_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToDouble(System.Runtime.Intrinsics.Vector256`1[ulong]):System.Runtime.Intrinsics.Vector256`1[double]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M14098_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M14098_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M14098_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ffb61ae2898)], 0
       je       SHORT G_M14098_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M14098_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M14098_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvtuqq2pd ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M14098_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M14098_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75

3. Vector.ConvertToInt64

public static Vector256<long> ConvDoubleToLong256(Vector256<double> val)
{
    return Vector256.ConvertToInt64(val);
}

Assembly Before Optimization

G_M10770_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 144
       vzeroupper 
       lea      rbp, [rsp+90H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=47 bbWeight=1 PerfScore 9.08
G_M10770_IG02:  ;; offset=002FH
       cmp      dword ptr [(reloc 0x7ff8bed1c5f8)], 0
       je       SHORT G_M10770_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03:  ;; offset=0038H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04:  ;; offset=003DH
       nop      
       mov      rdx, bword ptr [rbp+18H]
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rbp-70H], ymm0
       lea      rdx, [rbp-70H]
       lea      rcx, [rbp-50H]
       call     [System.Runtime.Intrinsics.Vector256:ConvertToInt64(System.Runtime.Intrinsics.Vector256`1[double]):System.Runtime.Intrinsics.Vector256`1[long]]
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=56 bbWeight=1 PerfScore 16.50
G_M10770_IG05:  ;; offset=0075H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06:  ;; offset=008DH
       vzeroupper 
       add      rsp, 144
       pop      rbp
       ret      
						;; size=12 bbWeight=1 PerfScore 2.75

Assembly After Optimization

G_M10770_IG01:  ;; offset=0000H
       push     rbp
       sub      rsp, 112
       vzeroupper 
       lea      rbp, [rsp+70H]
       vxorps   xmm4, xmm4, xmm4
       vmovdqa32 xmmword ptr [rbp-30H], xmm4
       vmovdqa32 xmmword ptr [rbp-20H], xmm4
       mov      bword ptr [rbp+10H], rcx
       mov      bword ptr [rbp+18H], rdx
						;; size=41 bbWeight=1 PerfScore 9.08
G_M10770_IG02:  ;; offset=0029H
       cmp      dword ptr [(reloc 0x7ff8babbc5f8)], 0
       je       SHORT G_M10770_IG04
						;; size=9 bbWeight=1 PerfScore 4.00
G_M10770_IG03:  ;; offset=0032H
       call     CORINFO_HELP_DBG_IS_JUST_MY_CODE
						;; size=5 bbWeight=0.50 PerfScore 0.50
G_M10770_IG04:  ;; offset=0037H
       nop      
       mov      rax, bword ptr [rbp+18H]
       vcvttpd2qq ymm0, ymmword ptr [rax]
       vmovups  ymmword ptr [rbp-50H], ymm0
       vmovups  ymm0, ymmword ptr [rbp-50H]
       vmovups  ymmword ptr [rbp-30H], ymm0
       nop      
						;; size=42 bbWeight=1 PerfScore 13.50
G_M10770_IG05:  ;; offset=0061H
       mov      rax, bword ptr [rbp+10H]
       vmovups  ymm0, ymmword ptr [rbp-30H]
       vmovups  ymmword ptr [rax], ymm0
       mov      rax, bword ptr [rbp+10H]
						;; size=24 bbWeight=1 PerfScore 8.00
G_M10770_IG06:  ;; offset=0079H
       vzeroupper 
       add      rsp, 112
       pop      rbp
       ret      
						;; size=9 bbWeight=1 PerfScore 2.75
Author: khushal1996
Assignees: -
Labels:

area-CodeGen-coreclr

Milestone: -

…endsOn checks to make sure they are run only if we need AVX512. Because these checks are costly, they are moved to the innermost level of the nested if checks.
@tannergooding tannergooding added the avx512 Related to the AVX-512 architecture label Jun 27, 2023
@ghost ghost closed this Aug 5, 2023
@ghost
Copy link

ghost commented Aug 5, 2023

Draft Pull Request was automatically closed after 30 days of inactivity. Please let us know if you'd like to reopen it.

@ghost ghost locked as resolved and limited conversation to collaborators Sep 4, 2023
This pull request was closed.
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI avx512 Related to the AVX-512 architecture community-contribution Indicates that the PR has been added by a community member
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants