Skip to content

Commit

Permalink
Couple small optimizations for genLclHeap:
Browse files Browse the repository at this point in the history
1. When zeroing the allocated local heap space is not required, do not zero it (previously, allocations of up to 64 bytes were always zeroed).

2. For sizes less than one PAGE_SIZE, and when the size is an encodable offset,
   use `ldp tmpReg, xzr, [sp], #-amount`, which probes at [sp] and allocates the space at the same time.

3. Allow non-loop zeroing (i.e. unrolled sequence) for sizes up to 128 bytes (i.e. up to LCLHEAP_UNROLL_LIMIT)

4. Do such zeroing in ascending order of effective address.
  • Loading branch information
echesakov committed Jan 29, 2022
1 parent 21ab197 commit b4ce794
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 25 deletions.
66 changes: 49 additions & 17 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2311,34 +2311,66 @@ void CodeGen::genLclHeap(GenTree* tree)
// We should reach here only for non-zero, constant size allocations.
assert(amount > 0);

const int storePairRegsWritesBytes = 2 * REGSIZE_BYTES;

// For small allocations we will generate up to four stp instructions, to zero 16 to 64 bytes.
static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time
size_t stpCount = amount / (REGSIZE_BYTES * 2);
if (stpCount <= 4)
static_assert_no_msg(STACK_ALIGN == storePairRegsWritesBytes);
assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time

if (compiler->info.compInitMem)
{
while (stpCount != 0)
if (amount <= LCLHEAP_UNROLL_LIMIT)
{
// We can use pre-indexed addressing.
// stp ZR, ZR, [SP, #-16]! // STACK_ALIGN is 16
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
stpCount -= 1;
}
// The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
// stp xzr, xzr, [sp, #-16]!
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -storePairRegsWritesBytes,
INS_OPTS_PRE_INDEX);

lastTouchDelta = 0;
if (amount > storePairRegsWritesBytes)
{
// The following sets SP to its final value and zeroes the first 16 bytes of the allocated space.
// stp xzr, xzr, [sp, #-amount+16]!
const ssize_t finalSpDelta = (ssize_t)amount - storePairRegsWritesBytes;
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -finalSpDelta,
INS_OPTS_PRE_INDEX);

// The following zeroes the remaining space in [finalSp+16, initialSp-16) interval
// using a sequence of stp instruction with unsigned offset.
for (ssize_t offset = storePairRegsWritesBytes; offset < (ssize_t)amount - storePairRegsWritesBytes;
offset += storePairRegsWritesBytes)
{
// stp xzr, xzr, [sp, #offset]
GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, offset);
}
}

goto ALLOC_DONE;
lastTouchDelta = 0;

goto ALLOC_DONE;
}
}
else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <=
else if (amount < compiler->eeGetPageSize()) // must be < not <=
{
// Since the size is less than a page, simply adjust the SP value.
// The SP might already be in the guard page, so we must touch it BEFORE
// the alloc, not after.

// ldr wzr, [SP, #0]
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SP, 0);

genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
if (emitter::canEncodeLoadOrStorePairOffset(amount, EA_8BYTE))
{
// The following probes the page and allocates the local heap.
// ldp tmpReg, xzr, [sp], #-amount
// Note that the behaviour of ldp when its two destination registers are the same is unpredictable.
const regNumber tmpReg = targetReg;
GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, REG_ZR, REG_SPBASE, -(ssize_t)amount,
INS_OPTS_POST_INDEX);
}
else
{
// ldr wzr, [sp]
// sub sp, sp, #amount
GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, amount);
genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
}

lastTouchDelta = amount;

Expand Down
13 changes: 5 additions & 8 deletions src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -543,14 +543,14 @@ int LinearScan::BuildNode(GenTree* tree)
{
assert(dstCount == 1);

// Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
// Need a variable number of temp regs (see genLclHeap() in codegenarm64.cpp):
// Here '-' means don't care.
//
// Size? Init Memory? # temp regs
// 0 - 0
// const and <=6 ptr words - 0
// const and <=UnrollLimit - 0
// const and <PageSize No 0
// >6 ptr words Yes 0
// >UnrollLimit Yes 0
// Non-const Yes 0
// Non-const No 2
//
Expand All @@ -569,12 +569,9 @@ int LinearScan::BuildNode(GenTree* tree)
// Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
// This should also help in debugging as we can examine the original size specified with
// localloc.
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
size_t stpCount = sizeVal / (REGSIZE_BYTES * 2);
sizeVal = AlignUp(sizeVal, STACK_ALIGN);

// For small allocations up to 4 'stp' instructions (i.e. 16 to 64 bytes of localloc)
//
if (stpCount <= 4)
if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
{
// Need no internal registers
}
Expand Down

0 comments on commit b4ce794

Please sign in to comment.