diff --git a/src/inc/jithelpers.h b/src/inc/jithelpers.h index 02a358beea02..fc64a3a4c6c7 100644 --- a/src/inc/jithelpers.h +++ b/src/inc/jithelpers.h @@ -368,11 +368,11 @@ JITHELPER(CORINFO_HELP_GVMLOOKUP_FOR_SLOT, NULL, CORINFO_HELP_SIG_NO_ALIGN_STUB) -#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_) +#ifndef _TARGET_ARM64_ JITHELPER(CORINFO_HELP_STACK_PROBE, JIT_StackProbe, CORINFO_HELP_SIG_REG_ONLY) -#else // !_TARGET_X86_ && !_TARGET_AMD64_ +#else JITHELPER(CORINFO_HELP_STACK_PROBE, NULL, CORINFO_HELP_SIG_UNDEF) -#endif // !_TARGET_X86_ && !_TARGET_AMD64_ +#endif #undef JITHELPER #undef DYNAMICJITHELPER diff --git a/src/jit/codegenarm.cpp b/src/jit/codegenarm.cpp index b1db79d59f56..677b71256d49 100644 --- a/src/jit/codegenarm.cpp +++ b/src/jit/codegenarm.cpp @@ -1764,4 +1764,88 @@ void CodeGen::genProfilingLeaveCallback(unsigned helper) #endif // PROFILING_SUPPORTED +//------------------------------------------------------------------------ +// genAllocLclFrame: Probe the stack and allocate the local stack frame - subtract from SP. +// +// Notes: +// The first instruction of the prolog is always a push (which touches the lowest address +// of the stack), either of the LR register or of some argument registers, e.g., in the case of +// pre-spilling. The LR register is always pushed because we require it to allow for GC return +// address hijacking (see the comment in CodeGen::genPushCalleeSavedRegisters()). These pushes +// happen immediately before calling this function, so the SP at the current location has already +// been touched. +// +// Arguments: +// frameSize - the size of the stack frame being allocated. +// initReg - register to use as a scratch register. +// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if +// this call sets 'initReg' to a non-zero value. +// maskArgRegsLiveIn - incoming argument registers that are currently live. 
+// +// Return value: +// None +// +void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) +{ + assert(compiler->compGeneratingProlog); + + if (frameSize == 0) + { + return; + } + + const target_size_t pageSize = compiler->eeGetPageSize(); + + assert(!compiler->info.compPublishStubParam || (REG_SECRET_STUB_PARAM != initReg)); + + if (frameSize < pageSize) + { + GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, frameSize); + } + else if (frameSize < compiler->getVeryLargeFrameSize()) + { + for (target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize) + { + // Generate: + // movw initReg, -probeOffset + // ldr initReg, [SP + initReg] + + instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)probeOffset); + GetEmitter()->emitIns_R_R_R(INS_ldr, EA_PTRSIZE, initReg, REG_SPBASE, initReg); + } + + regSet.verifyRegUsed(initReg); + *pInitRegZeroed = false; // The initReg does not contain zero + + instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, frameSize); + compiler->unwindPadding(); + GetEmitter()->emitIns_R_R_R(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, initReg); + } + else + { + assert(frameSize >= compiler->getVeryLargeFrameSize()); + + genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, frameSize, + INS_FLAGS_DONT_CARE, REG_STACK_PROBE_HELPER_ARG); + regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG); + genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN, REG_STACK_PROBE_HELPER_CALL_TARGET); + compiler->unwindPadding(); + GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG); + + if ((genRegMask(initReg) & (RBM_STACK_PROBE_HELPER_ARG | RBM_STACK_PROBE_HELPER_CALL_TARGET | + RBM_STACK_PROBE_HELPER_TRASH)) != RBM_NONE) + { + *pInitRegZeroed = false; + } + } + + compiler->unwindAllocStack(frameSize); +#ifdef USING_SCOPE_INFO + if (!doubleAlignOrFramePointerUsed()) + { + psiAdjustStackLevel(frameSize); + 
} +#endif // USING_SCOPE_INFO +} + #endif // _TARGET_ARM_ diff --git a/src/jit/codegenarm64.cpp b/src/jit/codegenarm64.cpp index e651f8b9e9ad..9b983cf7a03f 100644 --- a/src/jit/codegenarm64.cpp +++ b/src/jit/codegenarm64.cpp @@ -7787,4 +7787,143 @@ void CodeGen::genArm64EmitterUnitTests() } #endif // defined(DEBUG) +//------------------------------------------------------------------------ +// genAllocLclFrame: Probe the stack. +// +// Notes: +// This only does the probing; allocating the frame is done when callee-saved registers are saved. +// This is done before anything has been pushed. The previous frame might have a large outgoing argument +// space that has been allocated, but the lowest addresses have not been touched. Our frame setup might +// not touch up to the first 504 bytes. This means we could miss a guard page. On Windows, however, +// there are always three guard pages, so we will not miss them all. On Linux, there is only one guard +// page by default, so we need to be more careful. We do an extra probe if we might not have probed +// recently enough. That is, if a call and prolog establishment might lead to missing a page. We do this +// on Windows as well just to be consistent, even though it should not be necessary. +// +// Arguments: +// frameSize - the size of the stack frame being allocated. +// initReg - register to use as a scratch register. +// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if +// this call sets 'initReg' to a non-zero value. +// maskArgRegsLiveIn - incoming argument registers that are currently live. +// +// Return value: +// None +// +void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) +{ + assert(compiler->compGeneratingProlog); + + if (frameSize == 0) + { + return; + } + + const target_size_t pageSize = compiler->eeGetPageSize(); + + // What offset from the final SP was the last probe? 
If we haven't probed almost a complete page, and + // if the next action on the stack might subtract from SP first, before touching the current SP, then + // we do one more probe at the very bottom. This can happen if we call a function on arm64 that does + // a "STP fp, lr, [sp-504]!", that is, pre-decrement SP then store. Note that we probe here for arm64, + // but we don't alter SP. + target_size_t lastTouchDelta = 0; + + assert(!compiler->info.compPublishStubParam || (REG_SECRET_STUB_PARAM != initReg)); + + if (frameSize < pageSize) + { + lastTouchDelta = frameSize; + } + else if (frameSize < compiler->getVeryLargeFrameSize()) + { + lastTouchDelta = frameSize; + + for (target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize) + { + // Generate: + // movw initReg, -probeOffset + // ldr wzr, [sp + initReg] + + instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)probeOffset); + GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, initReg); + regSet.verifyRegUsed(initReg); + *pInitRegZeroed = false; // The initReg does not contain zero + + lastTouchDelta -= pageSize; + } + + assert(lastTouchDelta == frameSize % pageSize); + compiler->unwindPadding(); + } + else + { + assert(frameSize >= compiler->getVeryLargeFrameSize()); + + // Emit the following sequence to 'tickle' the pages. Note it is important that stack pointer not change + // until this is complete since the tickles could cause a stack overflow, and we need to be able to crawl + // the stack afterward (which means the stack pointer needs to be known). 
+ + regMaskTP availMask = RBM_ALLINT & (regSet.rsGetModifiedRegsMask() | ~RBM_INT_CALLEE_SAVED); + availMask &= ~maskArgRegsLiveIn; // Remove all of the incoming argument registers as they are currently live + availMask &= ~genRegMask(initReg); // Remove the pre-calculated initReg + + regNumber rOffset = initReg; + regNumber rLimit; + regMaskTP tempMask; + + // We pick the lowest-numbered available register for rLimit + noway_assert(availMask != RBM_NONE); + tempMask = genFindLowestBit(availMask); + rLimit = genRegNumFromMask(tempMask); + + // Generate: + // + // mov rOffset, -pageSize // (Note carried over from the code formerly shared with ARM32:) On ARM32, this turns + // // out to be "movw r1, 0xf000; sxth r1, r1". We could save 4 bytes in the prolog by using + // // "movs r1, 0" at the runtime expense of running a useless first loop iteration. + // mov rLimit, -frameSize + // loop: + // ldr wzr, [sp + rOffset] + // sub rOffset, pageSize + // cmp rLimit, rOffset + // b.ls loop // If rLimit is lower or same, we need to probe this rOffset. Note + // // especially that if it is the same, we haven't probed this page. + + noway_assert((ssize_t)(int)frameSize == (ssize_t)frameSize); // make sure framesize safely fits within an int + + instGen_Set_Reg_To_Imm(EA_PTRSIZE, rOffset, -(ssize_t)pageSize); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, rLimit, -(ssize_t)frameSize); + + // + // Can't have a label inside the ReJIT padding area + // + genPrologPadForReJit(); + + // There's a "virtual" label here. But we can't create a label in the prolog, so we use the magic + // `emitIns_J` with a negative `instrCount` to branch back a specific number of instructions. 
+ + GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, rOffset); + GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, rOffset, rOffset, pageSize); + GetEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, rLimit, rOffset); // If equal, we need to probe again + GetEmitter()->emitIns_J(INS_bls, NULL, -4); + + *pInitRegZeroed = false; // The initReg does not contain zero + + compiler->unwindPadding(); + + lastTouchDelta = frameSize % pageSize; + } + + if (lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize) + { + assert(lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES < 2 * pageSize); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)frameSize); + GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, initReg); + compiler->unwindPadding(); + + regSet.verifyRegUsed(initReg); + *pInitRegZeroed = false; // The initReg does not contain zero + } +} + #endif // _TARGET_ARM64_ diff --git a/src/jit/codegenarmarch.cpp b/src/jit/codegenarmarch.cpp index 7659ab34483e..fe43e7021cc9 100644 --- a/src/jit/codegenarmarch.cpp +++ b/src/jit/codegenarmarch.cpp @@ -3899,205 +3899,4 @@ void CodeGen::genStructReturn(GenTree* treeNode) } // op1 must be multi-reg GT_CALL } -//------------------------------------------------------------------------ -// genAllocLclFrame: Probe the stack and allocate the local stack frame: subtract from SP. -// -// Notes: -// On ARM64, this only does the probing; allocating the frame is done when callee-saved registers are saved. -// This is done before anything has been pushed. The previous frame might have a large outgoing argument -// space that has been allocated, but the lowest addresses have not been touched. Our frame setup might -// not touch up to the first 504 bytes. This means we could miss a guard page. On Windows, however, -// there are always three guard pages, so we will not miss them all. On Linux, there is only one guard -// page by default, so we need to be more careful. 
We do an extra probe if we might not have probed -// recently enough. That is, if a call and prolog establishment might lead to missing a page. We do this -// on Windows as well just to be consistent, even though it should not be necessary. -// -// On ARM32, the first instruction of the prolog is always a push (which touches the lowest address -// of the stack), either of the LR register or of some argument registers, e.g., in the case of -// pre-spilling. The LR register is always pushed because we require it to allow for GC return -// address hijacking (see the comment in CodeGen::genPushCalleeSavedRegisters()). These pushes -// happen immediately before calling this function, so the SP at the current location has already -// been touched. -// -void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) -{ - assert(compiler->compGeneratingProlog); - - if (frameSize == 0) - { - return; - } - - const target_size_t pageSize = compiler->eeGetPageSize(); - -#ifdef _TARGET_ARM64_ - // What offset from the final SP was the last probe? If we haven't probed almost a complete page, and - // if the next action on the stack might subtract from SP first, before touching the current SP, then - // we do one more probe at the very bottom. This can happen if we call a function on arm64 that does - // a "STP fp, lr, [sp-504]!", that is, pre-decrement SP then store. Note that we probe here for arm64, - // but we don't alter SP. 
- target_size_t lastTouchDelta = 0; -#endif // _TARGET_ARM64_ - - assert(!compiler->info.compPublishStubParam || (REG_SECRET_STUB_PARAM != initReg)); - - if (frameSize < pageSize) - { -#ifdef _TARGET_ARM_ - inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE); -#endif // _TARGET_ARM_ - -#ifdef _TARGET_ARM64_ - lastTouchDelta = frameSize; -#endif // _TARGET_ARM64_ - } - else if (frameSize < compiler->getVeryLargeFrameSize()) - { -#if defined(_TARGET_ARM64_) - regNumber rTemp = REG_ZR; // We don't need a register for the target of the dummy load - lastTouchDelta = frameSize; -#else - regNumber rTemp = initReg; -#endif - - for (target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize) - { - // Generate: - // movw initReg, -probeOffset - // ldr rTemp, [SP + initReg] // load into initReg on arm32, wzr on ARM64 - - instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)probeOffset); - GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, rTemp, REG_SPBASE, initReg); - regSet.verifyRegUsed(initReg); - *pInitRegZeroed = false; // The initReg does not contain zero - -#ifdef _TARGET_ARM64_ - lastTouchDelta -= pageSize; -#endif // _TARGET_ARM64_ - } - -#ifdef _TARGET_ARM64_ - assert(lastTouchDelta == frameSize % pageSize); - compiler->unwindPadding(); -#else // !_TARGET_ARM64_ - instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, frameSize); - compiler->unwindPadding(); - GetEmitter()->emitIns_R_R_R(INS_sub, EA_4BYTE, REG_SPBASE, REG_SPBASE, initReg); -#endif // !_TARGET_ARM64_ - } - else - { - assert(frameSize >= compiler->getVeryLargeFrameSize()); - - // Emit the following sequence to 'tickle' the pages. Note it is important that stack pointer not change - // until this is complete since the tickles could cause a stack overflow, and we need to be able to crawl - // the stack afterward (which means the stack pointer needs to be known). - // - // ARM64 needs 2 registers. ARM32 needs 3 registers. 
See VERY_LARGE_FRAME_SIZE_REG_MASK for how these - // are reserved. - - regMaskTP availMask = RBM_ALLINT & (regSet.rsGetModifiedRegsMask() | ~RBM_INT_CALLEE_SAVED); - availMask &= ~maskArgRegsLiveIn; // Remove all of the incoming argument registers as they are currently live - availMask &= ~genRegMask(initReg); // Remove the pre-calculated initReg - - regNumber rOffset = initReg; - regNumber rLimit; - regMaskTP tempMask; - -#if defined(_TARGET_ARM64_) - - regNumber rTemp = REG_ZR; // We don't need a register for the target of the dummy load - -#else // _TARGET_ARM_ - - regNumber rTemp; - - // We pick the next lowest register number for rTemp - noway_assert(availMask != RBM_NONE); - tempMask = genFindLowestBit(availMask); - rTemp = genRegNumFromMask(tempMask); - availMask &= ~tempMask; - -#endif // _TARGET_ARM_ - - // We pick the next lowest register number for rLimit - noway_assert(availMask != RBM_NONE); - tempMask = genFindLowestBit(availMask); - rLimit = genRegNumFromMask(tempMask); - availMask &= ~tempMask; - - // Generate: - // - // mov rOffset, -pageSize // On arm, this turns out to be "movw r1, 0xf000; sxth r1, r1". - // // We could save 4 bytes in the prolog by using "movs r1, 0" at the - // // runtime expense of running a useless first loop iteration. - // mov rLimit, -frameSize - // loop: - // ldr rTemp, [sp + rOffset] // rTemp = wzr on ARM64 - // sub rOffset, pageSize // Note that 0x1000 (normal ARM32 pageSize) on ARM32 uses the funky - // // Thumb immediate encoding - // cmp rLimit, rOffset - // b.ls loop // If rLimit is lower or same, we need to probe this rOffset. Note - // // especially that if it is the same, we haven't probed this page. 
- - noway_assert((ssize_t)(int)frameSize == (ssize_t)frameSize); // make sure framesize safely fits within an int - - instGen_Set_Reg_To_Imm(EA_PTRSIZE, rOffset, -(ssize_t)pageSize); - instGen_Set_Reg_To_Imm(EA_PTRSIZE, rLimit, -(ssize_t)frameSize); - - // - // Can't have a label inside the ReJIT padding area - // - genPrologPadForReJit(); - - // There's a "virtual" label here. But we can't create a label in the prolog, so we use the magic - // `emitIns_J` with a negative `instrCount` to branch back a specific number of instructions. - - GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, rTemp, REG_SPBASE, rOffset); -#if defined(_TARGET_ARM_) - regSet.verifyRegUsed(rTemp); - GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, rOffset, pageSize); -#elif defined(_TARGET_ARM64_) - GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, rOffset, rOffset, pageSize); -#endif - GetEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, rLimit, rOffset); // If equal, we need to probe again - GetEmitter()->emitIns_J(INS_bls, NULL, -4); - - *pInitRegZeroed = false; // The initReg does not contain zero - - compiler->unwindPadding(); - -#ifdef _TARGET_ARM64_ - lastTouchDelta = frameSize % pageSize; -#endif // _TARGET_ARM64_ - -#ifdef _TARGET_ARM_ - inst_RV_RV(INS_add, REG_SPBASE, rLimit, TYP_I_IMPL); -#endif // _TARGET_ARM_ - } - -#ifdef _TARGET_ARM64_ - if (lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize) - { - assert(lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES < 2 * pageSize); - instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)frameSize); - GetEmitter()->emitIns_R_R_R(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, initReg); - compiler->unwindPadding(); - - regSet.verifyRegUsed(initReg); - *pInitRegZeroed = false; // The initReg does not contain zero - } -#endif // _TARGET_ARM64_ - -#ifdef _TARGET_ARM_ - compiler->unwindAllocStack(frameSize); -#ifdef USING_SCOPE_INFO - if (!doubleAlignOrFramePointerUsed()) - { - psiAdjustStackLevel(frameSize); - } -#endif // USING_SCOPE_INFO 
-#endif // _TARGET_ARM_ -} - #endif // _TARGET_ARMARCH_ diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 034eedc3d969..e23bb8951fd5 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -6823,12 +6823,14 @@ void CodeGen::genFinalizeFrame() } #endif // _TARGET_X86_ -#if defined(_TARGET_ARM_) - // We need to determine if we will change SP larger than a specific amount to determine if we want to use a loop - // to touch stack pages, that will require multiple registers. See genAllocLclFrame() for details. +#ifdef _TARGET_ARM_ + // Make sure that the callee-saved registers used by the call to the stack probing helper, which is + // generated for very large stack frames (see `getVeryLargeFrameSize`), are marked as modified so + // that they are pushed on the stack. if (compiler->compLclFrameSize >= compiler->getVeryLargeFrameSize()) { - regSet.rsSetRegsModified(VERY_LARGE_FRAME_SIZE_REG_MASK); + regSet.rsSetRegsModified(RBM_STACK_PROBE_HELPER_ARG | RBM_STACK_PROBE_HELPER_CALL_TARGET | + RBM_STACK_PROBE_HELPER_TRASH); } // If there are any reserved registers, add them to the modified set. diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index aa2010a9789b..d3893f025b42 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -2215,7 +2215,17 @@ void CodeGen::genMultiRegCallStoreToLocal(GenTree* treeNode) } //------------------------------------------------------------------------ -// genAllocLclFrame: Probe the stack and allocate the local stack frame: subtract from SP. +// genAllocLclFrame: Probe the stack and allocate the local stack frame - subtract from SP. +// +// Arguments: +// frameSize - the size of the stack frame being allocated. +// initReg - register to use as a scratch register. +// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if +// this call sets 'initReg' to a non-zero value. +// maskArgRegsLiveIn - incoming argument registers that are currently live. 
+// +// Return value: +// None // void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) { diff --git a/src/jit/compiler.h b/src/jit/compiler.h index 2008116d3db7..5addf6dfb29d 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -10526,20 +10526,6 @@ extern const BYTE genActualTypes[]; /*****************************************************************************/ -// VERY_LARGE_FRAME_SIZE_REG_MASK is the set of registers we need to use for -// the probing loop generated for very large stack frames (see `getVeryLargeFrameSize`). -// We only use this to ensure that if we need to reserve a callee-saved register, -// it will be reserved. For ARM32, only R12 and LR are non-callee-saved, non-argument -// registers, so we save at least one more callee-saved register. For ARM64, however, -// we already know we have at least three non-callee-saved, non-argument integer registers, -// so we don't need to save any more. - -#ifdef _TARGET_ARM_ -#define VERY_LARGE_FRAME_SIZE_REG_MASK (RBM_R4) -#endif - -/*****************************************************************************/ - extern BasicBlock dummyBB; /*****************************************************************************/ diff --git a/src/jit/target.h b/src/jit/target.h index 0d15cf81e6c5..216eab803e1a 100644 --- a/src/jit/target.h +++ b/src/jit/target.h @@ -1187,6 +1187,12 @@ typedef unsigned char regNumberSmall; // The first thing in an ARM32 prolog pushes LR to the stack, so this can be 0. 
#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 0 + #define REG_STACK_PROBE_HELPER_ARG REG_R4 + #define RBM_STACK_PROBE_HELPER_ARG RBM_R4 + #define REG_STACK_PROBE_HELPER_CALL_TARGET REG_R5 + #define RBM_STACK_PROBE_HELPER_CALL_TARGET RBM_R5 + #define RBM_STACK_PROBE_HELPER_TRASH (RBM_R5 | RBM_LR) + #elif defined(_TARGET_ARM64_) #define CPU_LOAD_STORE_ARCH 1 diff --git a/src/vm/arm/asmhelpers.S b/src/vm/arm/asmhelpers.S index 1234813850ae..8d403867d3ae 100644 --- a/src/vm/arm/asmhelpers.S +++ b/src/vm/arm/asmhelpers.S @@ -1450,3 +1450,38 @@ DelayLoad_Helper\suffix: NESTED_END OnHijackTripThread, _TEXT #endif +// ------------------------------------------------------------------ +// The following helper will access ("probe") a word on each page of the stack +// starting with the page right beneath sp down to the one pointed to by r4. +// The procedure is needed to make sure that the "guard" page is pushed down below the allocated stack frame. +// The call to the helper will be emitted by JIT in the function/funclet prolog when large (larger than 0x3000 bytes) stack frame is required. +// On entry: +// r4 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize]) +// sp - points to some byte on the last probed page +// On exit: +// r4 - is preserved +// r5 - is not preserved +// +// NOTE: this helper will probe at least one page below the one pointed to by sp. +#define PAGE_SIZE 0x1000 +#define PAGE_SIZE_LOG2 12 + + NESTED_ENTRY JIT_StackProbe, _TEXT, NoHandler + PROLOG_PUSH "{r7}" + PROLOG_STACK_SAVE r7 + + mov r5, sp // r5 points to some byte on the last probed page + bfc r5, #0, #PAGE_SIZE_LOG2 // r5 points to the **lowest address** on the last probed page + mov sp, r5 + +ProbeLoop: + // Immediate operand for the following instruction can not be greater than 4095. + sub sp, #(PAGE_SIZE - 4) // sp points to the **fourth** byte on the **next page** to probe + ldr r5, [sp, #-4]! 
// sp points to the lowest address on the **last probed** page + cmp sp, r4 + bhi ProbeLoop // If (sp > r4), then we need to probe at least one more page. + + EPILOG_STACK_RESTORE r7 + EPILOG_POP "{r7}" + EPILOG_BRANCH_REG lr + NESTED_END JIT_StackProbe, _TEXT diff --git a/src/vm/arm/asmhelpers.asm b/src/vm/arm/asmhelpers.asm index 21e0f6532f56..81eb1f71b004 100644 --- a/src/vm/arm/asmhelpers.asm +++ b/src/vm/arm/asmhelpers.asm @@ -2145,5 +2145,40 @@ $__RealName #endif // FEATURE_READYTORUN +;;----------------------------------------------------------------------------- +;; The following helper will access ("probe") a word on each page of the stack +;; starting with the page right beneath sp down to the one pointed to by r4. +;; The procedure is needed to make sure that the "guard" page is pushed down below the allocated stack frame. +;; The call to the helper will be emitted by JIT in the function/funclet prolog when large (larger than 0x3000 bytes) stack frame is required. +;;----------------------------------------------------------------------------- +; On entry: +; r4 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize]) +; sp - points to some byte on the last probed page +; On exit: +; r4 - is preserved +; r5 - is not preserved +; +; NOTE: this helper will probe at least one page below the one pointed to by sp. +#define PAGE_SIZE_LOG2 12 + NESTED_ENTRY JIT_StackProbe + PROLOG_PUSH {r7} + PROLOG_STACK_SAVE r7 + + mov r5, sp ; r5 points to some byte on the last probed page + bfc r5, #0, #PAGE_SIZE_LOG2 ; r5 points to the **lowest address** on the last probed page + mov sp, r5 + +ProbeLoop + ; Immediate operand for the following instruction can not be greater than 4095. + sub sp, #(PAGE_SIZE - 4) ; sp points to the **fourth** byte on the **next page** to probe + ldr r5, [sp, #-4]! 
; sp points to the lowest address on the **last probed** page + cmp sp, r4 + bhi ProbeLoop ; if (sp > r4), then we need to probe at least one more page. + + EPILOG_STACK_RESTORE r7 + EPILOG_POP {r7} + EPILOG_BRANCH_REG lr + NESTED_END + ; Must be at very end of file END diff --git a/src/vm/jitinterface.h b/src/vm/jitinterface.h index 41c3f367b4aa..3760e9cd6e48 100644 --- a/src/vm/jitinterface.h +++ b/src/vm/jitinterface.h @@ -436,10 +436,9 @@ extern "C" void STDCALL JIT_MemCpy(void *dest, const void *src, SIZE_T count); void STDMETHODCALLTYPE JIT_ProfilerEnterLeaveTailcallStub(UINT_PTR ProfilerHandle); - -#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_) +#ifndef _TARGET_ARM64_ void STDCALL JIT_StackProbe(); -#endif // _TARGET_X86_ || _TARGET_AMD64_ +#endif // !_TARGET_ARM64_ };