Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

Implement stack probing using helpers #26807

Merged
merged 9 commits into from
Oct 7, 2019
12 changes: 7 additions & 5 deletions src/inc/corinfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,11 +217,11 @@ TODO: Talk about initializing strutures before use
#endif
#endif

SELECTANY const GUID JITEEVersionIdentifier = { /* e2ae5b32-a9ab-426e-bc2a-ae1a883e0367 */
0xe2ae5b32,
0xa9ab,
0x426e,
{0xbc, 0x2a, 0xae, 0x1a, 0x88, 0x3e, 0x03, 0x67}
SELECTANY const GUID JITEEVersionIdentifier = { /* 1ce51eeb-dfd0-4450-ba2c-ea0d2d863df5 */
0x1ce51eeb,
0xdfd0,
0x4450,
{0xba, 0x2c, 0xea, 0x0d, 0x2d, 0x86, 0x3d, 0xf5}
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -660,6 +660,8 @@ enum CorInfoHelpFunc

CORINFO_HELP_GVMLOOKUP_FOR_SLOT, // Resolve a generic virtual method target from this pointer and runtime method handle

CORINFO_HELP_STACK_PROBE, // Probes each page of the allocated stack frame

CORINFO_HELP_COUNT,
};

Expand Down
6 changes: 6 additions & 0 deletions src/inc/jithelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,12 @@

JITHELPER(CORINFO_HELP_GVMLOOKUP_FOR_SLOT, NULL, CORINFO_HELP_SIG_NO_ALIGN_STUB)

#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
JITHELPER(CORINFO_HELP_STACK_PROBE, JIT_StackProbe, CORINFO_HELP_SIG_REG_ONLY)
#else // !_TARGET_X86_ && !_TARGET_AMD64_
JITHELPER(CORINFO_HELP_STACK_PROBE, NULL, CORINFO_HELP_SIG_UNDEF)
#endif // !_TARGET_X86_ && !_TARGET_AMD64_

#undef JITHELPER
#undef DYNAMICJITHELPER
#undef JITHELPER
Expand Down
3 changes: 3 additions & 0 deletions src/inc/readytorun.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,9 @@ enum ReadyToRunHelper

// JIT32 x86-specific exception handling
READYTORUN_HELPER_EndCatch = 0x110,

// Stack probing helper
READYTORUN_HELPER_StackProbe = 0x111,
echesakov marked this conversation as resolved.
Show resolved Hide resolved
};

//
Expand Down
4 changes: 4 additions & 0 deletions src/inc/readytorunhelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,5 +117,9 @@ HELPER(READYTORUN_HELPER_PInvokeEnd, CORINFO_HELP_JIT_PINVOKE_END
HELPER(READYTORUN_HELPER_MonitorEnter, CORINFO_HELP_MON_ENTER, )
HELPER(READYTORUN_HELPER_MonitorExit, CORINFO_HELP_MON_EXIT, )

#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
HELPER(READYTORUN_HELPER_StackProbe, CORINFO_HELP_STACK_PROBE, )
#endif

#undef HELPER
#undef OPTIMIZEFORSPEED
159 changes: 36 additions & 123 deletions src/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2263,143 +2263,56 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni
// Frame size >= 0x3000
assert(frameSize >= compiler->getVeryLargeFrameSize());

// Emit the following sequence to 'tickle' the pages.
// Note it is important that stack pointer not change until this is
// complete since the tickles could cause a stack overflow, and we
// need to be able to crawl the stack afterward (which means the
// stack pointer needs to be known).
#ifdef _TARGET_X86_
int spOffset = -(int)frameSize;

bool pushedStubParam = false;
if (compiler->info.compPublishStubParam && (REG_SECRET_STUB_PARAM == initReg))
if (compiler->info.compPublishStubParam)
{
// push register containing the StubParam
inst_RV(INS_push, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
pushedStubParam = true;
GetEmitter()->emitIns_R(INS_push, EA_PTRSIZE, REG_SECRET_STUB_PARAM);
spOffset += REGSIZE_BYTES;
}

#ifndef _TARGET_UNIX_
instGen_Set_Reg_To_Zero(EA_PTRSIZE, initReg);
#endif
GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, spOffset);
regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG);

//
// Can't have a label inside the ReJIT padding area
//
// Can't have a call until we have enough padding for ReJit.
genPrologPadForReJit();
genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN);

#ifndef _TARGET_UNIX_
// Code size for each instruction. We need this because the
// backward branch is hard-coded with the number of bytes to branch.
// The encoding differs based on the architecture and what register is
// used (namely, using RAX has a smaller encoding).
//
// xor eax,eax
// loop:
// For x86
// test [esp + eax], eax 3
// sub eax, 0x1000 5
// cmp EAX, -frameSize 5
// jge loop 2
//
// For AMD64 using RAX
// test [rsp + rax], rax 4
// sub rax, 0x1000 6
// cmp rax, -frameSize 6
// jge loop 2
//
// For AMD64 using RBP
// test [rsp + rbp], rbp 4
// sub rbp, 0x1000 7
// cmp rbp, -frameSize 7
// jge loop 2

GetEmitter()->emitIns_R_ARR(INS_test, EA_PTRSIZE, initReg, REG_SPBASE, initReg, 0);
inst_RV_IV(INS_sub, initReg, pageSize, EA_PTRSIZE);
inst_RV_IV(INS_cmp, initReg, -((ssize_t)frameSize), EA_PTRSIZE);

int bytesForBackwardJump;
#ifdef _TARGET_AMD64_
assert((initReg == REG_EAX) || (initReg == REG_EBP)); // We use RBP as initReg for EH funclets.
bytesForBackwardJump = ((initReg == REG_EAX) ? -18 : -20);
#else // !_TARGET_AMD64_
assert(initReg == REG_EAX);
bytesForBackwardJump = -15;
#endif // !_TARGET_AMD64_

// Branch backwards to start of loop
inst_IV(INS_jge, bytesForBackwardJump);

lastTouchDelta = frameSize % pageSize;

#else // _TARGET_UNIX_

// Code size for each instruction. We need this because the
// backward branch is hard-coded with the number of bytes to branch.
// The encoding differs based on the architecture and what register is
// used (namely, using RAX has a smaller encoding).
//
// For x86
// lea eax, [esp - frameSize]
// loop:
// lea esp, [esp - pageSize] 7
// test [esp], eax 3
// cmp esp, eax 2
// jge loop 2
// lea rsp, [rbp + frameSize]
//
// For AMD64 using RAX
// lea rax, [rsp - frameSize]
// loop:
// lea rsp, [rsp - pageSize] 8
// test [rsp], rax 4
// cmp rsp, rax 3
// jge loop 2
// lea rsp, [rax + frameSize]
//
// For AMD64 using RBP
// lea rbp, [rsp - frameSize]
// loop:
// lea rsp, [rsp - pageSize] 8
// test [rsp], rbp 4
// cmp rsp, rbp 3
// jge loop 2
// lea rsp, [rbp + frameSize]

int sPageSize = (int)pageSize;

GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, initReg, REG_SPBASE, -((ssize_t)frameSize)); // get frame border

GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -sPageSize);
GetEmitter()->emitIns_R_AR(INS_test, EA_PTRSIZE, initReg, REG_SPBASE, 0);
inst_RV_RV(INS_cmp, REG_SPBASE, initReg);

int bytesForBackwardJump;
#ifdef _TARGET_AMD64_
assert((initReg == REG_EAX) || (initReg == REG_EBP)); // We use RBP as initReg for EH funclets.
bytesForBackwardJump = -17;
#else // !_TARGET_AMD64_
assert(initReg == REG_EAX);
bytesForBackwardJump = -14;
#endif // !_TARGET_AMD64_
if (compiler->info.compPublishStubParam)
{
GetEmitter()->emitIns_R(INS_pop, EA_PTRSIZE, REG_SECRET_STUB_PARAM);
GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, frameSize);
}
else
{
GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG);
}
#else // !_TARGET_X86_
static_assert_no_msg((RBM_STACK_PROBE_HELPER_ARG & (RBM_SECRET_STUB_PARAM | RBM_DEFAULT_HELPER_CALL_TARGET)) ==
RBM_NONE);

inst_IV(INS_jge, bytesForBackwardJump); // Branch backwards to start of loop
GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, -(int)frameSize);
regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG);

GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_SPBASE, initReg, frameSize); // restore stack pointer
// Can't have a call until we have enough padding for ReJit.
genPrologPadForReJit();
genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN);

lastTouchDelta = 0; // The loop code above actually over-probes: it always probes beyond the final SP we need.
if (initReg == REG_DEFAULT_HELPER_CALL_TARGET)
{
*pInitRegZeroed = false;
}

#endif // _TARGET_UNIX_
static_assert_no_msg((RBM_STACK_PROBE_HELPER_TRASH & RBM_STACK_PROBE_HELPER_ARG) == RBM_NONE);

*pInitRegZeroed = false; // The initReg does not contain zero
GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG);
#endif // !_TARGET_X86_

if (pushedStubParam)
if (initReg == REG_STACK_PROBE_HELPER_ARG)
{
// pop eax
inst_RV(INS_pop, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
regSet.verifyRegUsed(REG_SECRET_STUB_PARAM);
*pInitRegZeroed = false;
}

// sub esp, frameSize 6
inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
}

if (lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize)
Expand Down Expand Up @@ -3333,7 +3246,7 @@ unsigned CodeGen::genMove8IfNeeded(unsigned size, regNumber longTmpReg, GenTree*
#ifdef _TARGET_X86_
instruction longMovIns = INS_movq;
#else // !_TARGET_X86_
instruction longMovIns = INS_mov;
instruction longMovIns = INS_mov;
#endif // !_TARGET_X86_
if ((size & 8) != 0)
{
Expand Down
14 changes: 14 additions & 0 deletions src/jit/target.h
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,11 @@ typedef unsigned char regNumberSmall;
// on the stack guard page, and must be touched before any further "SUB SP".
#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES ARG_STACK_PROBE_THRESHOLD_BYTES

#define REG_STACK_PROBE_HELPER_ARG REG_EAX
#define RBM_STACK_PROBE_HELPER_ARG RBM_EAX

#define RBM_STACK_PROBE_HELPER_TRASH RBM_NONE

#elif defined(_TARGET_AMD64_)
// TODO-AMD64-CQ: Fine tune the following xxBlk threshold values:

Expand Down Expand Up @@ -896,6 +901,15 @@ typedef unsigned char regNumberSmall;
// AMD64 uses FEATURE_FIXED_OUT_ARGS so this can be zero.
#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 0

#define REG_STACK_PROBE_HELPER_ARG REG_R11
#define RBM_STACK_PROBE_HELPER_ARG RBM_R11

#ifdef _TARGET_UNIX_
#define RBM_STACK_PROBE_HELPER_TRASH RBM_NONE
#else // !_TARGET_UNIX_
#define RBM_STACK_PROBE_HELPER_TRASH RBM_RAX
#endif // !_TARGET_UNIX_

#elif defined(_TARGET_ARM_)

// TODO-ARM-CQ: Use shift for division by power of 2
Expand Down
2 changes: 2 additions & 0 deletions src/tools/crossgen2/Common/JitInterface/CorInfoHelpFunc.cs
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,8 @@ which is the right helper to use to allocate an object of a given type. */

CORINFO_HELP_GVMLOOKUP_FOR_SLOT, // Resolve a generic virtual method target from this pointer and runtime method handle

CORINFO_HELP_STACK_PROBE, // Probes each page of the allocated stack frame

CORINFO_HELP_COUNT,
}
}
10 changes: 5 additions & 5 deletions src/tools/crossgen2/jitinterface/jitwrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ class CORJIT_FLAGS
unsigned __int64 corJitFlags;
};

static const GUID JITEEVersionIdentifier = { /* d609bed1-7831-49fc-bd49-b6f054dd4d46 */
0xe2ae5b32,
0xa9ab,
0x426e,
{0xbc, 0x2a, 0xae, 0x1a, 0x88, 0x3e, 0x03, 0x67}
static const GUID JITEEVersionIdentifier = { /* 1ce51eeb-dfd0-4450-ba2c-ea0d2d863df5 */
echesakov marked this conversation as resolved.
Show resolved Hide resolved
0x1ce51eeb,
0xdfd0,
0x4450,
{0xba, 0x2c, 0xea, 0x0d, 0x2d, 0x86, 0x3d, 0xf5}
};

class Jit
Expand Down
34 changes: 33 additions & 1 deletion src/vm/amd64/JitHelpers_Fast.asm
Original file line number Diff line number Diff line change
Expand Up @@ -955,5 +955,37 @@ endif ; _DEBUG

NESTED_END TailCallHelperStub, _TEXT

end
; The following helper will access ("probe") a word on each page of the stack
; starting with the page right beneath rsp down to the one pointed to by r11.
; The procedure is needed to make sure that the "guard" page is pushed down below the allocated stack frame.
; The call to the helper will be emitted by JIT in the function/funclet prolog when a large (at least 0x3000 bytes) stack frame is required.
;
; NOTE: this helper will NOT modify the value of rsp and can be defined as a leaf function.

; Distance in bytes between two consecutive probes: 1000h = 4096.
; The value is a hard-coded constant in this file; it is not passed in by the caller.
PAGE_SIZE equ 1000h
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An interesting side-effect here is that the page size (that we probe) is hard-coded to 0x1000, whereas in PAL builds, the page size is currently dynamic. For >4K pages, we might over-probe. But I suppose that is ok -- better perhaps than burning a register to pass in the page size, or creating extra page size specific helpers.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about several options here:

  1. Have several versions of the helpers (e.g. JIT_StackProbe_0x1000 and JIT_StackProbe_0x10000) on platforms that can have different page sizes (e.g. arm64) and do selection when emitting a call to the helper from JIT.
  2. Burn a register and pass a page size from the JIT side (which feels weird to me since JIT asks EE for a page size). It's also going to be tough to find a spare register on x86.
  3. Use a stack for passing parameters and basically do the same as in 2).
  4. Hard-code the page size as I did

I chose 4; as a contingency plan, if there is a strong requirement to use the "true" page size, we can add logic that patches the helper during process startup and adjusts the page size.


LEAF_ENTRY JIT_StackProbe, _TEXT
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to see a line or two of comments in these asm helpers describing the purpose and function of the helper (what it does), in addition to the register "on entry" / "on exit" documentation (which is super useful).

It would also be useful to indicate what all the requirements are around each helper (as the requirements differ per platform). E.g., on Linux you can't probe beyond ESP/RSP.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@BruceForstall I believe I have addressed all your suggestions - please take a look and let me know if I need to clarify anything else

; Purpose:
; Reads one location on every PAGE_SIZE-sized page, from the page just below
; the one rsp points into down to the page that contains the address in r11.
; Only rax and the flags are written; rsp itself is never changed.
;
; On entry:
; r11 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize])
; rsp - points to some byte on the last probed page
; On exit:
; rax - is not preserved
; r11 - is preserved
; flags - are not preserved (or/test/sub/cmp below all write them)
;
; NOTE: this helper will probe at least one page below the one pointed by rsp.

lea rax, [rsp - PAGE_SIZE] ; rax points to some byte on the first unprobed page
or rax, (PAGE_SIZE - 1) ; rax points to the **highest address** on the first unprobed page
; This is done to make the following loop end condition simpler.

ProbeLoop:
; Touch the page. Only the memory read matters: test writes nothing but the
; flags, and those are overwritten by the sub/cmp that follow.
; NOTE(review): the low 12 bits of rax are all ones here, so this 4-byte read
; covers the last byte of the page being probed plus the first 3 bytes of the
; page above it (which has already been probed) - confirm this is intended.
test dword ptr [rax], eax
sub rax, PAGE_SIZE ; rax points to the highest address of the **next page** to probe
cmp rax, r11
; NOTE(review): jge is a signed comparison of two stack addresses - presumably
; both are always in the non-negative range on this target; confirm.
jge ProbeLoop ; if (rax >= r11), then we need to probe the page pointed to by rax.

; Every page down to and including the one containing [r11] has been touched.
ret

LEAF_END JIT_StackProbe, _TEXT

end
42 changes: 42 additions & 0 deletions src/vm/amd64/jithelpers_fast.S
Original file line number Diff line number Diff line change
Expand Up @@ -537,3 +537,45 @@ LEAF_ENTRY JIT_Stelem_Ref__ArrayStoreCheck_Helper, _TEXT
jmp C_FUNC(JIT_WriteBarrier)
#endif
LEAF_END JIT_Stelem_Ref__ArrayStoreCheck_Helper, _TEXT

// The following helper will access ("probe") a word on each page of the stack
// starting with the page right beneath rsp down to the one pointed to by r11.
// The procedure is needed to make sure that the "guard" page is pushed down below the allocated stack frame.
// The call to the helper will be emitted by JIT in the function/funclet prolog when a large (at least 0x3000 bytes) stack frame is required.
//
// NOTE: On Linux we must advance the stack pointer as we probe - it is not allowed to access memory more than 65535 bytes below rsp.
// Since this helper modifies the value of rsp, it must establish a frame pointer.
//
// See also https://github.com/dotnet/coreclr/issues/16827#issue-303331518 for more information.

// Distance in bytes between two consecutive probes: 0x1000 = 4096.
// The value is a hard-coded constant in this file; it is not passed in by the caller.
#define PAGE_SIZE 0x1000

NESTED_ENTRY JIT_StackProbe, _TEXT, NoHandler
// On entry:
// r11 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize])
// rsp - points to some byte on the last probed page
// On exit:
// r11 - is preserved
// rax - is only read by the probe instruction, never written
// rsp/rbp - presumably restored by RESET_FRAME_WITH_RBP (macro defined elsewhere) - TODO confirm
// flags - are not preserved (sub/or/test/cmp below all write them)
//
// NOTE: this helper will probe at least one page below the one pointed by rsp.

// Save the caller's rbp and remember the entry rsp in rbp, because rsp is
// moved down by the probing loop below.
push_nonvol_reg rbp
mov rbp, rsp
// NOTE(review): presumably records that the CFA is now rbp + 16 for the
// unwinder; the macro is defined elsewhere - confirm.
set_cfa_register rbp, 16

END_PROLOGUE

sub rsp, PAGE_SIZE // rsp points to some byte on the first unprobed page
or rsp, (PAGE_SIZE - 1) // rsp points to the **highest address** on the first unprobed page
// This is done to make the following loop end condition simpler.

LOCAL_LABEL(ProbeLoop):
// Touch the page. Only the memory read matters: test writes nothing but the
// flags, and those are overwritten by the sub/cmp that follow.
// The read starts exactly at rsp, so no address below rsp is accessed.
// NOTE(review): the low 12 bits of rsp are all ones here, so this 4-byte read
// covers the last byte of the page being probed plus the first 3 bytes of the
// page above it (which has already been probed) - confirm this is intended.
test dword ptr [rsp], eax
sub rsp, PAGE_SIZE // rsp points to the highest address of the **next page** to probe
cmp rsp, r11
// NOTE(review): jge is a signed comparison of two stack addresses - presumably
// both are always in the non-negative range on this target; confirm.
jge LOCAL_LABEL(ProbeLoop) // if (rsp >= r11), then we need to probe the page pointed to by rsp.

// Every page down to and including the one containing [r11] has been touched.
RESET_FRAME_WITH_RBP
ret

NESTED_END JIT_StackProbe, _TEXT
Loading