-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JIT: Avoid creating stores to dead fields in block morphing #81095
Conversation
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak |
/azp run runtime-coreclr jitstress, runtime-coreclr libraries-jitstress, Fuzzlyn |
Azure Pipelines successfully started running 3 pipeline(s). |
cc @dotnet/jit-contrib PTAL @SingleAccretion @EgorBo Diffs. Avoiding creating these stores frequently allows us to avoid spilling the address, and we also frequently avoid unnecessary null checks. For example: @@ -1,45 +1,40 @@
; Assembly listing for method Microsoft.CodeAnalysis.CSharp.BoundConversion:get_ConversionKind():ubyte:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; No matching PGO data
; 0 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
-; V00 this [V00,T01] ( 3, 3 ) ref -> rcx this class-hnd single-def
+; V00 this [V00,T00] ( 3, 3 ) ref -> rcx this class-hnd single-def
;* V01 loc0 [V01 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op
;# V02 OutArgs [V02 ] ( 1, 1 ) lclBlk ( 0) [rsp+00H] "OutgoingArgSpace"
-;* V03 tmp1 [V03 ] ( 0, 0 ) ref -> zero-ref single-def V01._uncommonData(offs=0x00) P-INDEP "field V01._uncommonData (fldOffset=0x0)"
-; V04 tmp2 [V04,T02] ( 2, 2 ) ubyte -> rax single-def V01._kind(offs=0x08) P-INDEP "field V01._kind (fldOffset=0x8)"
-; V05 tmp3 [V05,T00] ( 3, 6 ) byref -> rcx single-def "BlockOp address local"
+;* V03 tmp1 [V03 ] ( 0, 0 ) ref -> zero-ref V01._uncommonData(offs=0x00) P-INDEP "field V01._uncommonData (fldOffset=0x0)"
+; V04 tmp2 [V04,T01] ( 2, 2 ) ubyte -> rax single-def V01._kind(offs=0x08) P-INDEP "field V01._kind (fldOffset=0x8)"
;
; Lcl frame size = 0
G_M21365_IG01: ; bbWeight=1, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref, nogc <-- Prolog IG
;; size=0 bbWeight=1 PerfScore 0.00
G_M21365_IG02: ; bbWeight=1, gcrefRegs=00000002 {rcx}, byrefRegs=00000000 {}, byref
; gcrRegs +[rcx]
- add rcx, 72
- ; gcrRegs -[rcx]
- ; byrRegs +[rcx]
- cmp dword ptr [rcx], ecx
- movzx rax, byte ptr [rcx+08H]
- ;; size=10 bbWeight=1 PerfScore 5.25
+ movzx rax, byte ptr [rcx+50H]
+ ;; size=4 bbWeight=1 PerfScore 2.00
G_M21365_IG03: ; bbWeight=1, epilog, nogc, extend
ret
;; size=1 bbWeight=1 PerfScore 1.00
-; Total bytes of code 11, prolog size 0, PerfScore 7.35, instruction count 4, allocated bytes for code 11 (MethodHash=53ceac8a) for method Microsoft.CodeAnalysis.CSharp.BoundConversion:get_ConversionKind():ubyte:this
+; Total bytes of code 5, prolog size 0, PerfScore 3.50, instruction count 2, allocated bytes for code 5 (MethodHash=53ceac8a) for method Microsoft.CodeAnalysis.CSharp.BoundConversion:get_ConversionKind():ubyte:this This also fixes #67739. Codegen diff for the example there: @@ -8,16 +8,15 @@
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) ref -> zero-ref this class-hnd single-def
-; V01 arg1 [V01,T06] ( 3, 3 ) ref -> rdx class-hnd single-def
-; V02 loc0 [V02,T02] ( 4, 10 ) long -> rax
-; V03 loc1 [V03,T04] ( 3, 6 ) ref -> rdx class-hnd single-def
-; V04 loc2 [V04,T01] ( 5, 17 ) int -> rcx
+; V01 arg1 [V01,T05] ( 3, 3 ) ref -> rdx class-hnd single-def
+; V02 loc0 [V02,T01] ( 4, 10 ) long -> rax
+; V03 loc1 [V03,T03] ( 3, 6 ) ref -> rdx class-hnd single-def
+; V04 loc2 [V04,T00] ( 5, 17 ) int -> rcx
;* V05 loc3 [V05 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op
;# V06 OutArgs [V06 ] ( 1, 1 ) lclBlk ( 0) [rsp+00H] "OutgoingArgSpace"
;* V07 tmp1 [V07 ] ( 0, 0 ) ref -> zero-ref V05.key(offs=0x00) P-INDEP "field V05.key (fldOffset=0x0)"
-; V08 tmp2 [V08,T03] ( 2, 8 ) int -> r9 V05.value(offs=0x08) P-INDEP "field V05.value (fldOffset=0x8)"
-; V09 tmp3 [V09,T00] ( 3, 24 ) byref -> r9 "BlockOp address local"
-; V10 cse0 [V10,T05] ( 3, 6 ) int -> r8 "CSE - aggressive"
+; V08 tmp2 [V08,T02] ( 2, 8 ) int -> r9 V05.value(offs=0x08) P-INDEP "field V05.value (fldOffset=0x8)"
+; V09 cse0 [V09,T04] ( 3, 6 ) int -> r8 "CSE - aggressive"
;
; Lcl frame size = 0
@@ -34,19 +33,17 @@ G_M51593_IG02:
G_M51593_IG03:
mov r9d, ecx
shl r9, 4
- lea r9, bword ptr [rdx+r9+10H]
- cmp dword ptr [r9], r9d
- mov r9d, dword ptr [r9+08H]
+ mov r9d, dword ptr [rdx+r9+18H]
movsxd r9, r9d
add rax, r9
inc ecx
cmp r8d, ecx
jg SHORT G_M51593_IG03
- ;; size=32 bbWeight=4 PerfScore 35.00
+ ;; size=25 bbWeight=4 PerfScore 19.00
G_M51593_IG04:
ret
;; size=1 bbWeight=1 PerfScore 1.00
-; Total bytes of code 49, prolog size 0, PerfScore 44.90, instruction count 17, allocated bytes for code 49 (MethodHash=b5703676) for method Program:EnumerateKvps(System.Collections.Generic.KeyValuePair`2[System.String,int][]):long:this
+; Total bytes of code 42, prolog size 0, PerfScore 28.20, instruction count 15, allocated bytes for code 42 (MethodHash=b5703676) for method Program:EnumerateKvps(System.Collections.Generic.KeyValuePair`2[System.String,int][]):long:this
; ============================================================
There is the question of whether this can end up eliding a necessary null check if we eliminate the "first" indirections. I do not think so as the source address should not be an "almost null" byref by JIT IR invariants, and in the cases we use field-by-field copies latter fields should not be able to have "large" offsets. The regressions seem to be due to eliminating the stores leading to different ref counts and different CSE weights. Additionally, eliminating these stores this early means we sometime realize earlier that a BB is empty, and that leads to new flowgraph opportunities. E.g.: ------------ BB12 [013..014) -> BB14 (always), preds={BB11} succs={BB14}
-
-***** BB12
-STMT00058 ( INL16 @ 0x000[E-] ... ??? ) <- INL15 @ 0x000[E-] <- INL10 @ 0x008[E-] <- INLRT @ 0x013[E-]
-N003 ( 0, 0) [------] ----------- ▌ COMMA void $485
-N001 ( 0, 0) [------] ----------- ├──▌ NOP void $484
-N002 ( 0, 0) [------] ----------- └──▌ NOP void $485
-
------------ BB13 [013..014), preds={} succs={BB14}
...
*************** Starting PHASE Update flow graph opt pass
+
+ Moving BB16 after BB12 to enable reversal
+ New Basic Block BB27 [0082] created.
+ Setting edge weights for BB16 -> BB27 to [0 .. 3.402823e+38]
+ Setting edge weights for BB27 -> BB17 to [0 .. 3.402823e+38]
+
+ Reversing a conditional jump around an unconditional jump (BB11 -> BB16, BB12 -> BB14)
+ Setting edge weights for BB11 -> BB14 to [0 .. 3.402823e+38]
+
+ After reversing the jump:
+
+ -----------------------------------------------------------------------------------------------------------------------------------------
+ BBnum BBid ref try hnd preds weight lp [IL range] [jump] [EH region] [flags]
+ -----------------------------------------------------------------------------------------------------------------------------------------
+ BB01 [0000] 1 1 [000..001)-> BB05 ( cond ) i
+ BB02 [0010] 1 BB01 0.50 [000..001)-> BB06 ( cond ) i
+ BB03 [0011] 1 BB02 0.50 [000..001)-> BB06 ( cond ) i
+ BB04 [0013] 1 BB03 0.50 [000..001)-> BB10 (always) i
+ BB05 [0014] 1 BB01 0.50 [000..001)-> BB07 ( cond ) i idxlen
+ BB06 [0015] 3 BB02,BB03,BB05 0 [000..001) (throw ) i rare hascall gcsafe
+ BB07 [0016] 1 BB05 0.50 [000..001)-> BB09 ( cond ) i nullcheck
+ BB08 [0022] 1 BB07 0.50 [000..001) i hascall gcsafe
+ BB09 [0023] 2 BB07,BB08 0.50 [000..001) i nullcheck
+ BB10 [0017] 2 BB04,BB09 1 [000..013)-> BB18 ( cond ) i
+ BB11 [0001] 2 BB10,BB17 4 0 [013..014)-> BB14 ( cond ) i Loop Loop0 bwd bwd-target
+ BB16 [0027] 1 BB11 2 0 [013..023)-> BB18 ( cond ) i hascall gcsafe bwd
+ BB27 [0082] 1 BB16 4 [???..???)-> BB17 (always) internal
+ BB13 [0050] 0 2 0 [013..014) i hascall gcsafe bwd
+ BB14 [0051] 2 BB11,BB13 2 0 [013..014)-> BB18 ( cond ) i bwd
+ BB15 [0081] 1 BB14 2 0 [???..???)-> BB17 (always) internal
+ BB17 [0002] 2 BB15,BB27 4 0 [023..02B)-> BB11 ( cond ) i bwd
+ BB18 [0004] 4 BB10,BB14,BB16,BB17 1 [02B..02C)-> BB21 ( cond ) i
+ BB19 [0062] 1 BB18 1 [02B..02C)-> BB21 ( cond ) i
+ BB20 [0064] 1 BB19 1 [02B..02C)-> BB22 ( cond ) i
+ BB21 [0065] 3 BB18,BB19,BB20 0 [02B..02C) (throw ) i rare hascall gcsafe
+ BB22 [0066] 1 BB20 1 [02B..02C) i
+ BB23 [0074] 1 BB22 1 [02B..02C)-> BB25 ( cond ) i idxlen
+ BB24 [0075] 1 BB23 1 [02B..02C)-> BB26 ( cond ) i idxlen
+ BB25 [0076] 2 BB23,BB24 0 [02B..02C) (throw ) i rare hascall gcsafe
+ BB26 [0077] 1 BB24 1 [02B..036) (return) i
+ -----------------------------------------------------------------------------------------------------------------------------------------
The Fuzzlyn failure looks like #75442 and the other failures look known. |
src/coreclr/jit/morphblock.cpp
Outdated
@@ -1395,7 +1415,7 @@ GenTree* MorphCopyBlockHelper::CopyFieldByField() | |||
} | |||
else | |||
{ | |||
if (i == (fieldCnt - 1)) | |||
if (numCreated == fieldCnt - dyingFieldCnt) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like numCreated
is only used here for the "last field" check.
This check can now actually be simplified to (once again) use the first field instead (result == nullptr
), it was originally altered to protect against bad zero-offset field sequences.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed this and the other check to result == nullptr
src/coreclr/jit/morphblock.cpp
Outdated
} | ||
} | ||
|
||
if (dyingFieldCnt == fieldCnt) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is a little non-obvious why does this check reside here instead of before the loop.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved it up.
Ping @EgorBo |
More improvements dotnet/perf-autofiling-issues#12666 |
Fix #80498
Fix #67739