Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

Implement stack probing using helpers #26807

Merged
merged 9 commits into from
Oct 7, 2019
2 changes: 2 additions & 0 deletions src/inc/corinfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,8 @@ enum CorInfoHelpFunc

CORINFO_HELP_GVMLOOKUP_FOR_SLOT, // Resolve a generic virtual method target from this pointer and runtime method handle

CORINFO_HELP_STACK_PROBE,
echesakov marked this conversation as resolved.
Show resolved Hide resolved

CORINFO_HELP_COUNT,
};

Expand Down
6 changes: 6 additions & 0 deletions src/inc/jithelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,12 @@

JITHELPER(CORINFO_HELP_GVMLOOKUP_FOR_SLOT, NULL, CORINFO_HELP_SIG_NO_ALIGN_STUB)

#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
JITHELPER(CORINFO_HELP_STACK_PROBE, JIT_StackProbe, CORINFO_HELP_SIG_REG_ONLY)
#else // !_TARGET_X86_ && !_TARGET_AMD64_
JITHELPER(CORINFO_HELP_STACK_PROBE, NULL, CORINFO_HELP_SIG_UNDEF)
#endif // !_TARGET_X86_ && !_TARGET_AMD64_

#undef JITHELPER
#undef DYNAMICJITHELPER
#undef JITHELPER
Expand Down
3 changes: 3 additions & 0 deletions src/inc/readytorun.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,9 @@ enum ReadyToRunHelper

// JIT32 x86-specific exception handling
READYTORUN_HELPER_EndCatch = 0x110,

// Stack probing helper
READYTORUN_HELPER_StackProbe = 0x111,
echesakov marked this conversation as resolved.
Show resolved Hide resolved
};

//
Expand Down
4 changes: 4 additions & 0 deletions src/inc/readytorunhelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,5 +117,9 @@ HELPER(READYTORUN_HELPER_PInvokeEnd, CORINFO_HELP_JIT_PINVOKE_END
HELPER(READYTORUN_HELPER_MonitorEnter, CORINFO_HELP_MON_ENTER, )
HELPER(READYTORUN_HELPER_MonitorExit, CORINFO_HELP_MON_EXIT, )

#if defined(_TARGET_X86_) || defined(_TARGET_AMD64_)
HELPER(READYTORUN_HELPER_StackProbe, CORINFO_HELP_STACK_PROBE, )
#endif

#undef HELPER
#undef OPTIMIZEFORSPEED
159 changes: 36 additions & 123 deletions src/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2263,143 +2263,56 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni
// Frame size >= 0x3000
assert(frameSize >= compiler->getVeryLargeFrameSize());

// Emit the following sequence to 'tickle' the pages.
// Note it is important that stack pointer not change until this is
// complete since the tickles could cause a stack overflow, and we
// need to be able to crawl the stack afterward (which means the
// stack pointer needs to be known).
#ifdef _TARGET_X86_
int spOffset = -(int)frameSize;

bool pushedStubParam = false;
if (compiler->info.compPublishStubParam && (REG_SECRET_STUB_PARAM == initReg))
if (compiler->info.compPublishStubParam)
{
// push register containing the StubParam
inst_RV(INS_push, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
pushedStubParam = true;
GetEmitter()->emitIns_R(INS_push, EA_PTRSIZE, REG_SECRET_STUB_PARAM);
spOffset += REGSIZE_BYTES;
}

#ifndef _TARGET_UNIX_
instGen_Set_Reg_To_Zero(EA_PTRSIZE, initReg);
#endif
GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, spOffset);
regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG);

//
// Can't have a label inside the ReJIT padding area
//
// Can't have a call until we have enough padding for ReJit.
genPrologPadForReJit();
genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN);

#ifndef _TARGET_UNIX_
// Code size for each instruction. We need this because the
// backward branch is hard-coded with the number of bytes to branch.
// The encoding differs based on the architecture and what register is
// used (namely, using RAX has a smaller encoding).
//
// xor eax,eax
// loop:
// For x86
// test [esp + eax], eax 3
// sub eax, 0x1000 5
// cmp EAX, -frameSize 5
// jge loop 2
//
// For AMD64 using RAX
// test [rsp + rax], rax 4
// sub rax, 0x1000 6
// cmp rax, -frameSize 6
// jge loop 2
//
// For AMD64 using RBP
// test [rsp + rbp], rbp 4
// sub rbp, 0x1000 7
// cmp rbp, -frameSize 7
// jge loop 2

GetEmitter()->emitIns_R_ARR(INS_test, EA_PTRSIZE, initReg, REG_SPBASE, initReg, 0);
inst_RV_IV(INS_sub, initReg, pageSize, EA_PTRSIZE);
inst_RV_IV(INS_cmp, initReg, -((ssize_t)frameSize), EA_PTRSIZE);

int bytesForBackwardJump;
#ifdef _TARGET_AMD64_
assert((initReg == REG_EAX) || (initReg == REG_EBP)); // We use RBP as initReg for EH funclets.
bytesForBackwardJump = ((initReg == REG_EAX) ? -18 : -20);
#else // !_TARGET_AMD64_
assert(initReg == REG_EAX);
bytesForBackwardJump = -15;
#endif // !_TARGET_AMD64_

// Branch backwards to start of loop
inst_IV(INS_jge, bytesForBackwardJump);

lastTouchDelta = frameSize % pageSize;

#else // _TARGET_UNIX_

// Code size for each instruction. We need this because the
// backward branch is hard-coded with the number of bytes to branch.
// The encoding differs based on the architecture and what register is
// used (namely, using RAX has a smaller encoding).
//
// For x86
// lea eax, [esp - frameSize]
// loop:
// lea esp, [esp - pageSize] 7
// test [esp], eax 3
// cmp esp, eax 2
// jge loop 2
// lea rsp, [rbp + frameSize]
//
// For AMD64 using RAX
// lea rax, [rsp - frameSize]
// loop:
// lea rsp, [rsp - pageSize] 8
// test [rsp], rax 4
// cmp rsp, rax 3
// jge loop 2
// lea rsp, [rax + frameSize]
//
// For AMD64 using RBP
// lea rbp, [rsp - frameSize]
// loop:
// lea rsp, [rsp - pageSize] 8
// test [rsp], rbp 4
// cmp rsp, rbp 3
// jge loop 2
// lea rsp, [rbp + frameSize]

int sPageSize = (int)pageSize;

GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, initReg, REG_SPBASE, -((ssize_t)frameSize)); // get frame border

GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -sPageSize);
GetEmitter()->emitIns_R_AR(INS_test, EA_PTRSIZE, initReg, REG_SPBASE, 0);
inst_RV_RV(INS_cmp, REG_SPBASE, initReg);

int bytesForBackwardJump;
#ifdef _TARGET_AMD64_
assert((initReg == REG_EAX) || (initReg == REG_EBP)); // We use RBP as initReg for EH funclets.
bytesForBackwardJump = -17;
#else // !_TARGET_AMD64_
assert(initReg == REG_EAX);
bytesForBackwardJump = -14;
#endif // !_TARGET_AMD64_
if (compiler->info.compPublishStubParam)
{
GetEmitter()->emitIns_R(INS_pop, EA_PTRSIZE, REG_SECRET_STUB_PARAM);
GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, frameSize);
}
else
{
GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG);
}
#else // !_TARGET_X86_
static_assert_no_msg((RBM_STACK_PROBE_HELPER_ARG & (RBM_SECRET_STUB_PARAM | RBM_DEFAULT_HELPER_CALL_TARGET)) ==
RBM_NONE);

inst_IV(INS_jge, bytesForBackwardJump); // Branch backwards to start of loop
GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, -(int)frameSize);
regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG);

GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_SPBASE, initReg, frameSize); // restore stack pointer
// Can't have a call until we have enough padding for ReJit.
genPrologPadForReJit();
genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN);

lastTouchDelta = 0; // The loop code above actually over-probes: it always probes beyond the final SP we need.
if (initReg == REG_DEFAULT_HELPER_CALL_TARGET)
{
*pInitRegZeroed = false;
}

#endif // _TARGET_UNIX_
static_assert_no_msg((RBM_STACK_PROBE_HELPER_TRASH & RBM_STACK_PROBE_HELPER_ARG) == RBM_NONE);

*pInitRegZeroed = false; // The initReg does not contain zero
GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG);
#endif // !_TARGET_X86_

if (pushedStubParam)
if (initReg == REG_STACK_PROBE_HELPER_ARG)
{
// pop eax
inst_RV(INS_pop, REG_SECRET_STUB_PARAM, TYP_I_IMPL);
regSet.verifyRegUsed(REG_SECRET_STUB_PARAM);
*pInitRegZeroed = false;
}

// sub esp, frameSize 6
inst_RV_IV(INS_sub, REG_SPBASE, frameSize, EA_PTRSIZE);
}

if (lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize)
Expand Down Expand Up @@ -3333,7 +3246,7 @@ unsigned CodeGen::genMove8IfNeeded(unsigned size, regNumber longTmpReg, GenTree*
#ifdef _TARGET_X86_
instruction longMovIns = INS_movq;
#else // !_TARGET_X86_
instruction longMovIns = INS_mov;
instruction longMovIns = INS_mov;
#endif // !_TARGET_X86_
if ((size & 8) != 0)
{
Expand Down
14 changes: 14 additions & 0 deletions src/jit/target.h
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,11 @@ typedef unsigned char regNumberSmall;
// on the stack guard page, and must be touched before any further "SUB SP".
#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES ARG_STACK_PROBE_THRESHOLD_BYTES

#define REG_STACK_PROBE_HELPER_ARG REG_EAX
#define RBM_STACK_PROBE_HELPER_ARG RBM_EAX

#define RBM_STACK_PROBE_HELPER_TRASH RBM_NONE

#elif defined(_TARGET_AMD64_)
// TODO-AMD64-CQ: Fine tune the following xxBlk threshold values:

Expand Down Expand Up @@ -896,6 +901,15 @@ typedef unsigned char regNumberSmall;
// AMD64 uses FEATURE_FIXED_OUT_ARGS so this can be zero.
#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 0

#define REG_STACK_PROBE_HELPER_ARG REG_R11
#define RBM_STACK_PROBE_HELPER_ARG RBM_R11

#ifdef _TARGET_UNIX_
#define RBM_STACK_PROBE_HELPER_TRASH RBM_NONE
#else // !_TARGET_UNIX_
#define RBM_STACK_PROBE_HELPER_TRASH RBM_RAX
#endif // !_TARGET_UNIX_

#elif defined(_TARGET_ARM_)

// TODO-ARM-CQ: Use shift for division by power of 2
Expand Down
26 changes: 25 additions & 1 deletion src/vm/amd64/JitHelpers_Fast.asm
Original file line number Diff line number Diff line change
Expand Up @@ -955,5 +955,29 @@ endif ; _DEBUG

NESTED_END TailCallHelperStub, _TEXT

end
PAGE_SIZE equ 1000h
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An interesting side-effect here is that the page size (that we probe) is hard-coded to 0x1000, whereas in PAL builds, the page size is currently dynamic. For >4K pages, we might over-probe. But I suppose that is ok -- better perhaps than burning a register to pass in the page size, or creating extra page size specific helpers.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about several options here:

  1. Have several versions of the helpers (e.g. JIT_StackProbe_0x1000 and JIT_StackProbe_0x10000) on platforms that can have different page sizes (e.g. arm64) and do selection when emitting a call to the helper from JIT.
  2. Burn a register and pass the page size from the JIT side (which feels weird to me since the JIT asks the EE for the page size). It's also going to be tough to find a spare register on x86.
  3. Use a stack for passing parameters and basically do the same as in 2).
  4. Hard-code the page size as I did

I chose 4. As a contingency plan, if there is a strong requirement to use the "true" page size, we can add logic that patches the helper during process startup to adjust the page size.


LEAF_ENTRY JIT_StackProbe, _TEXT
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to see a line or two of comments in these asm helpers describing the purpose and function of the helper (what it does), in addition to the register "on entry" / "on exit" documentation (which is super useful).

It would also be useful to indicate what all the requirements are around each helper (as the requirements differ per platform). E.g., on Linux you can't probe beyond ESP/RSP.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@BruceForstall I believe I have addressed all your suggestions - please take a look and let me know if I need to clarify anything else

; Purpose:
; Reads one dword from every page, at PAGE_SIZE intervals, starting with the
; page below the one rsp points into and ending with the page that contains
; the address passed in r11. rsp itself is never modified; rax is the cursor.
;
; On entry:
; r11 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize])
; rsp - points to some byte on the last probed page
; On exit:
; rax - is not preserved
; r11 - is preserved
;
; NOTE: this helper will probe at least one page below the one pointed by rsp.

lea rax, [rsp - PAGE_SIZE] ; rax points to some byte on the first unprobed page
or rax, (PAGE_SIZE - 1) ; rax points to the last byte on the first unprobed page
                        ; (keeping rax on a page's highest byte lets the loop end with one compare against r11)

ProbeLoop:
test dword ptr [rax], eax ; touch the page: test only reads memory and sets flags, nothing is written
sub rax, PAGE_SIZE ; rax now points to the highest address of the next lower page
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The probing math is a little tricky, to ensure exactly the right number of pages are probed. I would suggest a couple comments, e.g.:

or      rax, (PAGE_SIZE - 1)   ; rax points to the **highest address** on the first unprobed page
                       ; This is done to make the loop end condition simpler.
...
sub     rax, PAGE_SIZE  ; rax points to the highest address of the next page to probe
...
cmp rax, r11 ; if rax >= r11, then we need to probe the page pointed to by rax.

cmp rax, r11 ; does the page rax points into reach r11 or lie above it?
jge ProbeLoop ; yes (signed rax >= r11) - that page still has to be probed

ret

LEAF_END JIT_StackProbe, _TEXT

end
31 changes: 31 additions & 0 deletions src/vm/amd64/jithelpers_fast.S
Original file line number Diff line number Diff line change
Expand Up @@ -537,3 +537,34 @@ LEAF_ENTRY JIT_Stelem_Ref__ArrayStoreCheck_Helper, _TEXT
jmp C_FUNC(JIT_WriteBarrier)
#endif
LEAF_END JIT_Stelem_Ref__ArrayStoreCheck_Helper, _TEXT

#define PAGE_SIZE 0x1000

NESTED_ENTRY JIT_StackProbe, _TEXT, NoHandler
        // Purpose:
        //   Walks rsp down the stack in PAGE_SIZE steps and reads one dword on
        //   each page, from the page below the one rsp enters on down to the
        //   page that contains the address held in r11.
        //
        // Inputs:
        //   r11 - lowest address of the stack frame being allocated
        //         (i.e. [InitialSp - FrameSize])
        //   rsp - some byte on the most recently probed page
        // Outputs:
        //   r11 - unchanged (never written here)
        //
        // NOTE: at least one page below the one rsp points into is always probed.

        push_nonvol_reg rbp
        mov     rbp, rsp                    // rbp keeps the entry rsp; the loop moves rsp itself
        set_cfa_register rbp, 16

        END_PROLOGUE

        sub     rsp, PAGE_SIZE              // rsp: some byte on the first unprobed page
        or      rsp, (PAGE_SIZE - 1)        // rsp: highest byte of that page, so one
                                            // compare against r11 can end the loop

LOCAL_LABEL(ProbeNextPage):
        test    dword ptr [rsp], eax        // touch the page (read only; just sets flags)
        sub     rsp, PAGE_SIZE              // rsp: highest byte of the next lower page
        cmp     rsp, r11
        jge     LOCAL_LABEL(ProbeNextPage)  // signed rsp >= r11: that page still needs a probe

        RESET_FRAME_WITH_RBP
        ret

NESTED_END JIT_StackProbe, _TEXT
31 changes: 31 additions & 0 deletions src/vm/i386/jithelp.S
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,37 @@ LEAF_ENTRY JIT_Dbl2IntSSE2, _TEXT
ret
LEAF_END JIT_Dbl2IntSSE2, _TEXT

// *********************************************************************/
// JIT_StackProbe
//
// Purpose:
// Does stack probing at one-page intervals.
//
#define PAGE_SIZE 0x1000
NESTED_ENTRY JIT_StackProbe, _TEXT, NoHandler
        // Inputs:
        //   eax - lowest address of the stack frame being allocated
        //         (i.e. [InitialSp - FrameSize])
        //
        // esp is used as the probe cursor: it is stepped down one page at a
        // time and a dword is read at each step.
        //
        // NOTE: at least one page below the one esp points into is always probed.
        PROLOG_BEG
        PROLOG_END

        sub     esp, PAGE_SIZE              // esp: some byte on the first unprobed page
        or      esp, (PAGE_SIZE - 1)        // esp: highest byte of that page

LOCAL_LABEL(ProbeNextPage):
        test    [esp], eax                  // touch the page (read only; just sets flags)
        sub     esp, PAGE_SIZE              // esp: highest byte of the next lower page
        cmp     esp, eax
        jge     LOCAL_LABEL(ProbeNextPage)  // signed esp >= eax: that page still needs a probe

        EPILOG_BEG
        mov     esp, ebp                    // undo the probing moves of esp
                                            // NOTE(review): presumably PROLOG_BEG left the entry
                                            // esp in ebp - confirm against the macro definition
        EPILOG_END
        ret

NESTED_END JIT_StackProbe, _TEXT

// *********************************************************************/
// This is the small write barrier thunk we use when we know the
// ephemeral generation is higher in memory than older generations.
Expand Down
25 changes: 25 additions & 0 deletions src/vm/i386/jithelp.asm
Original file line number Diff line number Diff line change
Expand Up @@ -1464,4 +1464,29 @@ JIT_EndCatch PROC stdcall public

JIT_EndCatch ENDP

PAGE_SIZE equ 1000h

_JIT_StackProbe@0 PROC public
        ; Purpose:
        ;   Walks esp down the stack in PAGE_SIZE steps and reads one dword on
        ;   each page, from the page below the one esp enters on down to the
        ;   page that contains the address held in eax.
        ;
        ; Inputs:
        ;   eax - lowest address of the stack frame being allocated
        ;         (i.e. [InitialSp - FrameSize])
        ;
        ; NOTE: at least one page below the one esp points into is always probed.
        push    ebp
        mov     ebp, esp                ; keep the entry esp; the loop moves esp itself

        sub     esp, PAGE_SIZE          ; esp: some byte on the first unprobed page
        or      esp, (PAGE_SIZE - 1)    ; esp: highest byte of that page

ProbeNextPage:
        test    [esp], eax              ; touch the page (read only; just sets flags)
        sub     esp, PAGE_SIZE          ; esp: highest byte of the next lower page
        cmp     esp, eax
        jge     ProbeNextPage           ; signed esp >= eax: that page still needs a probe

        mov     esp, ebp                ; back to the esp saved on entry
        pop     ebp
        ret

_JIT_StackProbe@0 ENDP

end
Loading