Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Fast Tail Call Optimization for ARM32 #66282

Merged
merged 13 commits into from
Mar 13, 2022
4 changes: 4 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -10536,6 +10536,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

unsigned compArgSize; // total size of arguments in bytes (including register args (lvIsRegArg))

#ifdef TARGET_ARM
// ARM32 only. Copied from InitVarDscInfo::hasSplitParam while the method's
// locals are initialized; fgCanFastTailCall refuses a fast tail call when
// this is true ("Splitted argument in caller is not supported on ARM32").
bool compHasSplitParam;
#endif

unsigned compMapILargNum(unsigned ILargNum); // map accounting for hidden args
unsigned compMapILvarNum(unsigned ILvarNum); // map accounting for hidden args
unsigned compMap2ILvarNum(unsigned varNum) const; // map accounting for hidden args
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ inline bool isRegParamType(var_types type)
#endif // !TARGET_X86
}

#if defined(TARGET_AMD64) || defined(TARGET_ARM64)
#if defined(TARGET_AMD64) || defined(TARGET_ARMARCH)
/*****************************************************************************/
// Returns true if 'type' is a struct that can be enregistered for call args
// or can be returned by value in multiple registers.
Expand Down Expand Up @@ -660,7 +660,7 @@ inline bool Compiler::VarTypeIsMultiByteAndCanEnreg(var_types typ

return result;
}
#endif // TARGET_AMD64 || TARGET_ARM64
#endif // TARGET_AMD64 || TARGET_ARMARCH

/*****************************************************************************/

Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8541,7 +8541,7 @@ bool Compiler::impTailCallRetTypeCompatible(bool allowWideni
return true;
}

#if defined(TARGET_AMD64) || defined(TARGET_ARM64)
#if defined(TARGET_AMD64) || defined(TARGET_ARMARCH)
// Jit64 compat:
if (callerRetType == TYP_VOID)
{
Expand Down Expand Up @@ -8571,7 +8571,7 @@ bool Compiler::impTailCallRetTypeCompatible(bool allowWideni
{
return (varTypeIsIntegral(calleeRetType) || isCalleeRetTypMBEnreg) && (callerRetTypeSize == calleeRetTypeSize);
}
#endif // TARGET_AMD64 || TARGET_ARM64
#endif // TARGET_AMD64 || TARGET_ARMARCH

return false;
}
Expand Down
5 changes: 5 additions & 0 deletions src/coreclr/jit/lclvars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,10 @@ void Compiler::lvaInitTypeRef()
LclVarDsc* varDsc = varDscInfo.varDsc;
CORINFO_ARG_LIST_HANDLE localsSig = info.compMethodInfo->locals.args;

#ifdef TARGET_ARM
compHasSplitParam = varDscInfo.hasSplitParam;
#endif

for (unsigned i = 0; i < info.compMethodInfo->locals.numArgs;
i++, varNum++, varDsc++, localsSig = info.compCompHnd->getArgNext(localsSig))
{
Expand Down Expand Up @@ -968,6 +972,7 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo, unsigned skipArgs, un
unsigned numEnregistered = varDscInfo->maxIntRegArgNum - firstRegArgNum;
varDsc->SetStackOffset(-(int)numEnregistered * REGSIZE_BYTES);
varDscInfo->stackArgSize += (cSlots - numEnregistered) * REGSIZE_BYTES;
varDscInfo->hasSplitParam = true;
JITDUMP("set user arg V%02u offset to %d\n", varDscInfo->varNum, varDsc->GetStackOffset());
}
}
Expand Down
15 changes: 14 additions & 1 deletion src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2129,7 +2129,20 @@ void Lowering::RehomeArgForFastTailCall(unsigned int lclNum,
#ifdef DEBUG
comp->lvaTable[tmpLclNum].SetDoNotEnregReason(callerArgDsc->GetDoNotEnregReason());
#endif // DEBUG
GenTree* value = comp->gtNewLclvNode(lclNum, tmpTyp);

GenTree* value;
#ifdef TARGET_ARM
if (tmpTyp == TYP_LONG)
{
GenTree* loResult = comp->gtNewLclFldNode(lclNum, TYP_INT, 0);
GenTree* hiResult = comp->gtNewLclFldNode(lclNum, TYP_INT, 4);
value = new (comp, GT_LONG) GenTreeOp(GT_LONG, TYP_LONG, loResult, hiResult);
}
else
#endif // TARGET_ARM
{
value = comp->gtNewLclvNode(lclNum, tmpTyp);
}

if (tmpTyp == TYP_STRUCT)
{
Expand Down
6 changes: 5 additions & 1 deletion src/coreclr/jit/lsraarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,11 @@ int LinearScan::BuildCall(GenTreeCall* call)
{
// Fast tail call - make sure that call target is always computed in volatile registers
// that will not be overridden by epilog sequence.
ctrlExprCandidates = allRegs(TYP_INT) & RBM_INT_CALLEE_TRASH;
ctrlExprCandidates = allRegs(TYP_INT) & RBM_INT_CALLEE_TRASH & ~RBM_LR;
if (compiler->getNeedsGSSecurityCookie())
{
ctrlExprCandidates &= ~(genRegMask(REG_GSCOOKIE_TMP_0) | genRegMask(REG_GSCOOKIE_TMP_1));
}
assert(ctrlExprCandidates != RBM_NONE);
}
}
Expand Down
66 changes: 57 additions & 9 deletions src/coreclr/jit/morph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6773,15 +6773,6 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
unsigned calleeArgStackSize = 0;
unsigned callerArgStackSize = info.compArgStackSize;

for (unsigned index = 0; index < argInfo->ArgCount(); ++index)
{
fgArgTabEntry* arg = argInfo->GetArgEntry(index, false);

calleeArgStackSize = roundUp(calleeArgStackSize, arg->GetByteAlignment());
calleeArgStackSize += arg->GetStackByteSize();
}
calleeArgStackSize = GetOutgoingArgByteSize(calleeArgStackSize);

auto reportFastTailCallDecision = [&](const char* thisFailReason) {
if (failReason != nullptr)
{
Expand Down Expand Up @@ -6832,6 +6823,46 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
#endif // DEBUG
};

for (unsigned index = 0; index < argInfo->ArgCount(); ++index)
{
fgArgTabEntry* arg = argInfo->GetArgEntry(index, false);

calleeArgStackSize = roundUp(calleeArgStackSize, arg->GetByteAlignment());
calleeArgStackSize += arg->GetStackByteSize();
#ifdef TARGET_ARM
if (arg->IsSplit())
{
reportFastTailCallDecision("Splitted argument in callee is not supported on ARM32");
return false;
}
#endif // TARGET_ARM
}
calleeArgStackSize = GetOutgoingArgByteSize(calleeArgStackSize);

#ifdef TARGET_ARM
if (compHasSplitParam)
{
reportFastTailCallDecision("Splitted argument in caller is not supported on ARM32");
return false;
}

if (compIsProfilerHookNeeded())
{
reportFastTailCallDecision("Profiler is not supported on ARM32");
return false;
}

// On ARM32 we have only one non-parameter volatile register and we need it
// for the GS security cookie check. We could technically still tailcall
// when the callee does not use all argument registers, but we keep the
// code simple here.
if (getNeedsGSSecurityCookie())
{
reportFastTailCallDecision("Not enough registers available due to the GS security cookie check");
return false;
}
#endif

if (!opts.compFastTailCalls)
{
reportFastTailCallDecision("Configuration doesn't allow fast tail calls");
Expand All @@ -6844,6 +6875,15 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee, const char** failReason)
return false;
}

#ifdef TARGET_ARM
if (callee->IsR2RRelativeIndir() || callee->HasNonStandardAddedArgs(this))
{
reportFastTailCallDecision(
"Method with non-standard args passed in callee saved register cannot be tail called");
return false;
}
#endif

// Note on vararg methods:
// If the caller is vararg method, we don't know the number of arguments passed by caller's caller.
// But we can be sure that in-coming arg area of vararg caller would be sufficient to hold its
Expand Down Expand Up @@ -7253,6 +7293,14 @@ GenTree* Compiler::fgMorphPotentialTailCall(GenTreeCall* call)
return nullptr;
}

#ifdef TARGET_ARM
if (call->gtCallMoreFlags & GTF_CALL_M_WRAPPER_DELEGATE_INV)
{
failTailCall("Non-standard calling convention");
return nullptr;
}
#endif

if (call->IsNoReturn() && !call->IsTailPrefixedCall())
{
// Such tail calls always throw an exception and we won't be able to see current
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/register_arg_convention.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ struct InitVarDscInfo
// handles arguments.
regMaskTP fltArgSkippedRegMask;
bool anyFloatStackArgs;
// Starts out false; lvaInitUserArgs sets it to true on the path where only
// some of an argument's slots are enregistered and the remaining slots are
// added to stackArgSize.
bool hasSplitParam;
#endif // TARGET_ARM

#if FEATURE_FASTTAILCALL
Expand All @@ -45,6 +46,7 @@ struct InitVarDscInfo
#ifdef TARGET_ARM
fltArgSkippedRegMask = RBM_NONE;
anyFloatStackArgs = false;
hasSplitParam = false;
#endif // TARGET_ARM

#if FEATURE_FASTTAILCALL
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/targetarm.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
#define FEATURE_FIXED_OUT_ARGS 1 // Preallocate the outgoing arg area in the prolog
#define FEATURE_STRUCTPROMOTE 1 // JIT Optimization to promote fields of structs into registers
#define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers
#define FEATURE_FASTTAILCALL 0 // Tail calls made as epilog+jmp
#define FEATURE_TAILCALL_OPT 0 // opportunistic Tail calls (i.e. without ".tail" prefix) made as fast tail calls.
#define FEATURE_FASTTAILCALL 1 // Tail calls made as epilog+jmp
#define FEATURE_TAILCALL_OPT 1 // opportunistic Tail calls (i.e. without ".tail" prefix) made as fast tail calls.
#define FEATURE_SET_FLAGS 1 // Set to true to force the JIT to mark the trees with GTF_SET_FLAGS when the flags need to be set
#define FEATURE_MULTIREG_ARGS_OR_RET 1 // Support for passing and/or returning single values in more than one register (including HFA support)
#define FEATURE_MULTIREG_ARGS 1 // Support for passing a single argument in more than one register (including passing HFAs)
Expand Down