Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Local heap optimizations on Arm64 #64481

Merged
merged 5 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 58 additions & 16 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2311,34 +2311,76 @@ void CodeGen::genLclHeap(GenTree* tree)
// We should reach here only for non-zero, constant size allocations.
assert(amount > 0);

const int storePairRegsWritesBytes = 2 * REGSIZE_BYTES;

// For small allocations we will generate up to four stp instructions, to zero 16 to 64 bytes.
static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time
size_t stpCount = amount / (REGSIZE_BYTES * 2);
if (stpCount <= 4)
static_assert_no_msg(STACK_ALIGN == storePairRegsWritesBytes);
assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time

if (compiler->info.compInitMem)
{
while (stpCount != 0)
if (amount <= LCLHEAP_UNROLL_LIMIT)
{
// We can use pre-indexed addressing.
// stp ZR, ZR, [SP, #-16]! // STACK_ALIGN is 16
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
stpCount -= 1;
}
// The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
// stp xzr, xzr, [sp, #-16]!
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -storePairRegsWritesBytes,
INS_OPTS_PRE_INDEX);

lastTouchDelta = 0;
if (amount > storePairRegsWritesBytes)
{
// The following sets SP to its final value and zeroes the first 16 bytes of the allocated space.
// stp xzr, xzr, [sp, #-amount+16]!
const ssize_t finalSpDelta = (ssize_t)amount - storePairRegsWritesBytes;
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -finalSpDelta,
INS_OPTS_PRE_INDEX);

// The following zeroes the remaining space in [finalSp+16, initialSp-16) interval
// using a sequence of stp instruction with unsigned offset.
for (ssize_t offset = storePairRegsWritesBytes; offset < finalSpDelta;
offset += storePairRegsWritesBytes)
{
// stp xzr, xzr, [sp, #offset]
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, offset);
}
}

goto ALLOC_DONE;
lastTouchDelta = 0;

goto ALLOC_DONE;
}
}
else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <=
else if (amount < compiler->eeGetPageSize()) // must be < not <=
{
// Since the size is less than a page, simply adjust the SP value.
// The SP might already be in the guard page, so we must touch it BEFORE
// the alloc, not after.

// ldr wz, [SP, #0]
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SP, 0);
// Note that we check against the lower boundary of the post-index immediate range [-256, 256)
// since the offset is -amount.
const bool canEncodeLoadRegPostIndexOffset = amount <= 256;

genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
if (canEncodeLoadRegPostIndexOffset)
{
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, -(ssize_t)amount,
INS_OPTS_POST_INDEX);
}
else if (emitter::canEncodeLoadOrStorePairOffset(-(ssize_t)amount, EA_8BYTE))
{
// The following probes the page and allocates the local heap.
// ldp tmpReg, xzr, [sp], #-amount
// Note that we cannot use ldp xzr, xzr since
// the behaviour of ldp where two source registers are the same is unpredictable.
const regNumber tmpReg = targetReg;
GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, REG_ZR, REG_SPBASE, -(ssize_t)amount,
INS_OPTS_POST_INDEX);
}
else
{
// ldr wzr, [sp]
// sub sp, sp, #amount
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, amount);
genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
}

lastTouchDelta = amount;

Expand Down
13 changes: 5 additions & 8 deletions src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -543,14 +543,14 @@ int LinearScan::BuildNode(GenTree* tree)
{
assert(dstCount == 1);

// Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
// Need a variable number of temp regs (see genLclHeap() in codegenarm64.cpp):
// Here '-' means don't care.
//
// Size? Init Memory? # temp regs
// 0 - 0
// const and <=6 ptr words - 0
// const and <=UnrollLimit - 0
// const and <PageSize No 0
// >6 ptr words Yes 0
// >UnrollLimit Yes 0
// Non-const Yes 0
// Non-const No 2
//
Expand All @@ -569,12 +569,9 @@ int LinearScan::BuildNode(GenTree* tree)
// Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
// This should also help in debugging as we can examine the original size specified with
// localloc.
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
size_t stpCount = sizeVal / (REGSIZE_BYTES * 2);
sizeVal = AlignUp(sizeVal, STACK_ALIGN);

// For small allocations up to 4 'stp' instructions (i.e. 16 to 64 bytes of localloc)
//
if (stpCount <= 4)
if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
{
// Need no internal registers
}
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/targetarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#define CPBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
#define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk
#define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
#define LCLHEAP_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)

#ifdef FEATURE_SIMD
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
Expand Down