From b40a8520a27ae79319764a3dddff70c3bd02d75e Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Fri, 1 Apr 2022 00:08:10 +0300 Subject: [PATCH 1/2] Dissolve structs into primitives for PUTARG_STK The primary benefit of this change is "free" support for complex addressing modes, which is always desirable when we are loading a primitive (as opposed to the large struct case, where we would not want to use the 3-operand LEA in a loop, but instead cache the address in a register). The additional (future) benefit is that we will no longer need to mark the source local as DNER, once LCL_VAR sources for struct PUTARG_STKs are supported. --- src/coreclr/jit/codegenxarch.cpp | 43 +++----- src/coreclr/jit/lowerxarch.cpp | 183 +++++++++++++++++-------------- src/coreclr/jit/lsraxarch.cpp | 24 ++-- 3 files changed, 128 insertions(+), 122 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 66bf55d2b5e2da..4a4cdbc7caf2dc 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -7908,43 +7908,36 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk) #ifdef TARGET_X86 + // On a 32-bit target, all of the long arguments are handled with GT_FIELD_LISTs of TYP_INT. + assert(targetType != TYP_LONG); + assert((putArgStk->GetStackByteSize() % TARGET_POINTER_SIZE) == 0); + genAlignStackBeforeCall(putArgStk); - if ((data->OperGet() != GT_FIELD_LIST) && varTypeIsStruct(targetType)) + if (data->OperIs(GT_FIELD_LIST)) { - (void)genAdjustStackForPutArgStk(putArgStk); - genPutStructArgStk(putArgStk); + genPutArgStkFieldList(putArgStk); return; } - // On a 32-bit target, all of the long arguments are handled with GT_FIELD_LISTs of TYP_INT. - assert(targetType != TYP_LONG); - - const unsigned argSize = putArgStk->GetStackByteSize(); - assert((argSize % TARGET_POINTER_SIZE) == 0); - - if (data->isContainedIntOrIImmed()) + if (varTypeIsStruct(targetType)) { - if (data->IsIconHandle()) - { - inst_IV_handle(INS_push, data->AsIntCon()->gtIconVal); - } - else - { - inst_IV(INS_push, data->AsIntCon()->gtIconVal); - } - AddStackLevel(argSize); + genAdjustStackForPutArgStk(putArgStk); + genPutStructArgStk(putArgStk); + return; } - else if (data->OperGet() == GT_FIELD_LIST) + + genConsumeRegs(data); + + if (data->isUsedFromReg()) { - genPutArgStkFieldList(putArgStk); + genPushReg(targetType, data->GetRegNum()); } else { - // We should not see any contained nodes that are not immediates. - assert(data->isUsedFromReg()); - genConsumeReg(data); - genPushReg(targetType, data->GetRegNum()); + assert(genTypeSize(data) == TARGET_POINTER_SIZE); + inst_TT(INS_push, emitTypeSize(data), data); + AddStackLevel(TARGET_POINTER_SIZE); } #else // !TARGET_X86 { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 82d488fe1e00f3..7e5371fb6d4209 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -462,7 +462,8 @@ void Lowering::ContainBlockStoreAddress(GenTreeBlk* blkNode, unsigned size, GenT // void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) { - GenTree* src = putArgStk->gtGetOp1(); + GenTree* src = putArgStk->gtGetOp1(); + bool srcIsLocal = src->OperIsLocalRead(); if (src->OperIs(GT_FIELD_LIST)) { @@ -530,109 +531,121 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) } #ifdef FEATURE_PUT_STRUCT_ARG_STK - if (src->TypeGet() != TYP_STRUCT) -#endif // FEATURE_PUT_STRUCT_ARG_STK + if (src->TypeIs(TYP_STRUCT)) { - // If the child of GT_PUTARG_STK is a constant, we don't need a register to - // move it to memory (stack location). - // - // On AMD64, we don't want to make 0 contained, because we can generate smaller code - // by zeroing a register and then storing it. E.g.: - // xor rdx, rdx - // mov gword ptr [rsp+28H], rdx - // is 2 bytes smaller than: - // mov gword ptr [rsp+28H], 0 - // - // On x86, we push stack arguments; we don't use 'mov'. So: - // push 0 - // is 1 byte smaller than: - // xor rdx, rdx - // push rdx + ClassLayout* layout = src->AsObj()->GetLayout(); + var_types regType = layout->GetRegisterType(); + srcIsLocal |= src->AsObj()->Addr()->OperIsLocalAddr(); - if (IsContainableImmed(putArgStk, src) -#if defined(TARGET_AMD64) - && !src->IsIntegralConst(0) -#endif // TARGET_AMD64 - ) + if (regType == TYP_UNDEF) { - MakeSrcContained(putArgStk, src); - } - return; - } - -#ifdef FEATURE_PUT_STRUCT_ARG_STK - GenTree* srcAddr = nullptr; - - bool haveLocalAddr = false; - if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND)) - { - srcAddr = src->AsOp()->gtOp1; - assert(srcAddr != nullptr); - haveLocalAddr = srcAddr->OperIsLocalAddr(); - } - else - { - assert(varTypeIsSIMD(putArgStk)); - } + // In case of a CpBlk we could use a helper call. In case of putarg_stk we + // can't do that since the helper call could kill some already set up outgoing args. + // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj. + // The cpyXXXX code is rather complex and this could cause it to be more complex, but + // it might be the right thing to do. - ClassLayout* layout = src->AsObj()->GetLayout(); + unsigned size = putArgStk->GetStackByteSize(); + unsigned loadSize = layout->GetSize(); - // In case of a CpBlk we could use a helper call. In case of putarg_stk we - // can't do that since the helper call could kill some already set up outgoing args. - // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj. - // The cpyXXXX code is rather complex and this could cause it to be more complex, but - // it might be the right thing to do. + assert(loadSize <= size); - unsigned size = putArgStk->GetStackByteSize(); + // TODO-X86-CQ: The helper call either is not supported on x86 or required more work + // (I don't know which). - // TODO-X86-CQ: The helper call either is not supported on x86 or required more work - // (I don't know which). - - if (!layout->HasGCPtr()) - { + if (!layout->HasGCPtr()) + { #ifdef TARGET_X86 - if (size < XMM_REGSIZE_BYTES) - { - // Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE - // chunks. As such, the correctness of this code depends on the fact that - // morph will copy any "mis-sized" (too small) non-local OBJs into a temp, - // thus preventing any possible out-of-bounds memory reads. - assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() || - (src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr())); - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; - } - else + // Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE + // chunks. As such, the correctness of this code depends on the fact that + // morph will copy any "mis-sized" (too small) non-local OBJs into a temp, + // thus preventing any possible out-of-bounds memory reads. + assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() || + (src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr())); + if (size < XMM_REGSIZE_BYTES) + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; + } + else #endif // TARGET_X86 - if (size <= CPBLK_UNROLL_LIMIT) - { - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll; + if (size <= CPBLK_UNROLL_LIMIT) + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll; + } + else + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr; + } + } + else // There are GC pointers. + { +#ifdef TARGET_X86 + // On x86, we must use `push` to store GC references to the stack in order for the emitter to + // properly update the function's GC info. These `putargstk` nodes will generate a sequence of + // `push` instructions. + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; +#else // !TARGET_X86 + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr; +#endif // !TARGET_X86 + } + + // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. + MakeSrcContained(putArgStk, src); + if (src->OperIs(GT_OBJ) && src->AsObj()->Addr()->OperIsLocalAddr()) + { + // If the source address is the address of a lclVar, make the source address contained to avoid + // unnecessary copies. + MakeSrcContained(putArgStk, src->AsObj()->Addr()); + } } else { - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr; + // The ABI allows upper bits of small struct args to remain undefined, + // so if possible, widen the load to avoid the sign/zero-extension. + if (varTypeIsSmall(regType) && srcIsLocal) + { + assert(putArgStk->GetStackByteSize() <= genTypeSize(TYP_INT)); + regType = TYP_INT; + } + + src->SetOper(GT_IND); + src->ChangeType(regType); + LowerIndir(src->AsIndir()); } } - else // There are GC pointers. + + if (src->TypeIs(TYP_STRUCT)) { -#ifdef TARGET_X86 - // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update - // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions. - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; -#else // !TARGET_X86 - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr; -#endif // !TARGET_X86 + return; } +#endif // FEATURE_PUT_STRUCT_ARG_STK + + assert(!src->TypeIs(TYP_STRUCT)); - // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. - MakeSrcContained(putArgStk, src); - if (haveLocalAddr) + // If the child of GT_PUTARG_STK is a constant, we don't need a register to + // move it to memory (stack location). + // + // On AMD64, we don't want to make 0 contained, because we can generate smaller code + // by zeroing a register and then storing it. E.g.: + // xor rdx, rdx + // mov gword ptr [rsp+28H], rdx + // is 2 bytes smaller than: + // mov gword ptr [rsp+28H], 0 + // + // On x86, we push stack arguments; we don't use 'mov'. So: + // push 0 + // is 1 byte smaller than: + // xor rdx, rdx + // push rdx + + if (IsContainableImmed(putArgStk, src) +#if defined(TARGET_AMD64) + && !src->IsIntegralConst(0) +#endif // TARGET_AMD64 + ) { - // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary - // copies. - // - MakeSrcContained(putArgStk, srcAddr); + MakeSrcContained(putArgStk, src); } -#endif // FEATURE_PUT_STRUCT_ARG_STK } /* Lower GT_CAST(srcType, DstType) nodes. diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index fb2a3c8a16aedb..54546e7b6f2856 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1561,21 +1561,21 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk) GenTree* src = putArgStk->gtOp1; var_types type = src->TypeGet(); -#if defined(FEATURE_SIMD) && defined(TARGET_X86) - // For PutArgStk of a TYP_SIMD12, we need an extra register. - if (putArgStk->isSIMD12()) + if (type != TYP_STRUCT) { - buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates()); - BuildUse(putArgStk->gtOp1); - srcCount = 1; - buildInternalRegisterUses(); - return srcCount; - } +#if defined(FEATURE_SIMD) && defined(TARGET_X86) + // For PutArgStk of a TYP_SIMD12, we need an extra register. + if (putArgStk->isSIMD12()) + { + buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates()); + BuildUse(src); + srcCount = 1; + buildInternalRegisterUses(); + return srcCount; + } #endif // defined(FEATURE_SIMD) && defined(TARGET_X86) - if (type != TYP_STRUCT) - { - return BuildSimple(putArgStk); + return BuildOperandUses(src); } ssize_t size = putArgStk->GetStackByteSize(); From db08d4dd846ef8d7c5a0ab154b3a95611a6f77b5 Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Fri, 1 Apr 2022 00:09:10 +0300 Subject: [PATCH 2/2] Contain PUTARG_STK sources for "push [mem]" --- src/coreclr/jit/lowerxarch.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 7e5371fb6d4209..a2e68b1c85aee3 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -646,6 +646,14 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) { MakeSrcContained(putArgStk, src); } +#ifdef TARGET_X86 + else if ((genTypeSize(src) == TARGET_POINTER_SIZE) && IsContainableMemoryOp(src) && + IsSafeToContainMem(putArgStk, src)) + { + // Contain for "push [mem]". + MakeSrcContained(putArgStk, src); + } +#endif // TARGET_X86 } /* Lower GT_CAST(srcType, DstType) nodes.