diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 8a0c30eb2c4e..b7b2e940d8de 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -14,6 +14,7 @@
 #include "ARMFrameLowering.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
 #include "ARMInstrInfo.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMTargetMachine.h"
@@ -1639,10 +1640,20 @@ static uint32_t AlignToARMConstant(uint32_t Value) {
 // stack limit.
 static const uint64_t kSplitStackAvailable = 256;
 
+void
+ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+  const ARMSubtarget *ST = &MF.getTarget().getSubtarget<ARMSubtarget>();
+  if (ST->isThumb()) {
+    adjustForSegmentedStacksThumb(MF);
+  } else {
+    adjustForSegmentedStacksARM(MF);
+  }
+}
+
 // Adjust function prologue to enable split stack.
 // Only support android and linux.
 void
-ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+ARMFrameLowering::adjustForSegmentedStacksARM(MachineFunction &MF) const {
   const ARMSubtarget *ST = &MF.getTarget().getSubtarget<ARMSubtarget>();
 
   // Doesn't support vararg function.
@@ -1855,3 +1866,174 @@ ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   MF.verify();
 #endif
 }
+
+void
+ARMFrameLowering::adjustForSegmentedStacksThumb(MachineFunction &MF) const {
+//  const ARMSubtarget *ST = &MF.getTarget().getSubtarget<ARMSubtarget>();
+
+  // Doesn't support vararg functions.
+  if (MF.getFunction()->isVarArg())
+    report_fatal_error("Segmented stacks do not support vararg functions.");
+
+  MachineBasicBlock &prologueMBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const ARMBaseInstrInfo &TII = *TM.getInstrInfo();
+  ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL;
+
+  // Use R4 and R5 as scratch registers.
+  // They must be saved before use and restored before
+  // leaving the function.
+  unsigned ScratchReg0 = ARM::R4;
+  unsigned ScratchReg1 = ARM::R5;
+  uint64_t AlignedStackSize;
+
+  MachineBasicBlock *prevStackMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *postStackMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *getMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *mcrMBB = MF.CreateMachineBasicBlock();
+
+  for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
+       e = prologueMBB.livein_end(); i != e; ++i) {
+    allocMBB->addLiveIn(*i);
+    getMBB->addLiveIn(*i);
+    mcrMBB->addLiveIn(*i);
+    prevStackMBB->addLiveIn(*i);
+    postStackMBB->addLiveIn(*i);
+  }
+
+  MF.push_front(postStackMBB);
+  MF.push_front(allocMBB);
+  MF.push_front(getMBB);
+  MF.push_front(mcrMBB);
+  MF.push_front(prevStackMBB);
+
+  // The required stack size, aligned so it can be encoded as an ARM constant.
+  uint64_t StackSize = MFI->getStackSize();
+
+  AlignedStackSize = AlignToARMConstant(StackSize);
+
+  // When the frame size is less than 256 we just compare the stack
+  // boundary directly to the value of the stack pointer, per gcc.
+  bool CompareStackPointer = AlignedStackSize < kSplitStackAvailable;
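+  // For example, the 16- and 48-byte frames in the tests below stay under
+  // this limit, so the subtraction of the frame size from sp is skipped
+  // and the stack limit is compared against sp directly.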
+
+  // We use two callee-saved registers as scratch registers, so we
+  // must save them to the stack frame before using them.
+  // SR0 holds the stack limit and SR1 the requested stack size; they
+  // also carry the arguments for __morestack().
+  // SR0: Scratch Register #0
+  // SR1: Scratch Register #1
+  // push {SR0, SR1}
+  AddDefaultPred(BuildMI(prevStackMBB, DL, TII.get(ARM::tPUSH)))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+
+  // mov SR1, sp
+  AddDefaultPred(BuildMI(mcrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
+                     .addReg(ARM::SP));
+
+  if (!CompareStackPointer) {
+    // sub SR1, #StackSize
+    AddDefaultPred(AddDefaultCC(BuildMI(mcrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1))
+        .addReg(ScratchReg1).addImm(AlignedStackSize));
+  }
+
+  unsigned PCLabelId = ARMFI->createPICLabelUId();
+  ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::
+      Create(MF.getFunction()->getContext(), "STACK_LIMIT", PCLabelId, 0);
+  MachineConstantPool *MCP = MF.getConstantPool();
+  unsigned CPI = MCP->getConstantPoolIndex(NewCPV, MF.getAlignment());
+
+  // ldr SR0, [pc, offset(STACK_LIMIT)]
+  AddDefaultPred(BuildMI(getMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
+                     .addConstantPoolIndex(CPI));
+
+  // ldr SR0, [SR0]
+  AddDefaultPred(BuildMI(getMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
+                     .addReg(ScratchReg0)
+                     .addImm(0));
+
+  // Compare the stack limit with the stack size requested.
+  // cmp SR0, SR1
+  AddDefaultPred(BuildMI(getMBB, DL, TII.get(ARM::tCMPr))
+                     .addReg(ScratchReg0)
+                     .addReg(ScratchReg1));
+
+  // This jump is taken if StackLimit < SP - stack required.
+  BuildMI(getMBB, DL, TII.get(ARM::tBcc))
+      .addMBB(postStackMBB)
+      .addImm(ARMCC::LO)
+      .addReg(ARM::CPSR);
+
+
+  // Call __morestack(StackSize, Size of stack arguments).
+  // __morestack knows that the requested stack size is in SR0 (r4)
+  // and that the size of the stack arguments is in SR1 (r5).
+
+  // Pass the first argument to __morestack in Scratch Register #0:
+  // the amount of stack required.
+  AddDefaultPred(AddDefaultCC(BuildMI(allocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg0))
+      .addImm(AlignedStackSize));
+  // Pass the second argument to __morestack in Scratch Register #1:
+  // the amount of stack used to hold the function arguments.
+  AddDefaultPred(AddDefaultCC(BuildMI(allocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1))
+      .addImm(AlignToARMConstant(ARMFI->getArgumentStackSize())));
+
+  // push {lr} - Save the return address of this function.
+  AddDefaultPred(BuildMI(allocMBB, DL, TII.get(ARM::tPUSH)))
+      .addReg(ARM::LR);
+
+  // Call __morestack().
+  AddDefaultPred(BuildMI(allocMBB, DL, TII.get(ARM::tBL)))
+      .addExternalSymbol("__morestack");
+
+  // Restore the return address of the original function.
+  // pop {SR0}
+  AddDefaultPred(BuildMI(allocMBB, DL, TII.get(ARM::tPOP)))
+      .addReg(ScratchReg0);
+
+  // mov lr, SR0
+  AddDefaultPred(BuildMI(allocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
+                     .addReg(ScratchReg0));
+
+  // Restore SR0 and SR1 for the case where __morestack() was called:
+  // __morestack() skips the postStackMBB block, so the scratch
+  // registers must be restored here.
+  // pop {SR0, SR1}
+  AddDefaultPred(BuildMI(allocMBB, DL, TII.get(ARM::tPOP)))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+
+  // Return from this function.
+  AddDefaultPred(BuildMI(allocMBB, DL, TII.get(ARM::tMOVr), ARM::PC)
+                     .addReg(ARM::LR));
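+  // Note that once __morestack returns here, the only remaining work is
+  // to restore lr and the scratch registers: __morestack is expected to
+  // have run the function on the newly allocated stack segment.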
+
+  // Restore SR0 and SR1 for the case where __morestack() was not called.
+  // pop {SR0, SR1}
+  AddDefaultPred(BuildMI(postStackMBB, DL, TII.get(ARM::tPOP)))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+
+  // Organize the MBB successor lists.
+  postStackMBB->addSuccessor(&prologueMBB);
+
+  allocMBB->addSuccessor(postStackMBB);
+
+  getMBB->addSuccessor(postStackMBB);
+  getMBB->addSuccessor(allocMBB);
+
+  mcrMBB->addSuccessor(getMBB);
+
+  prevStackMBB->addSuccessor(mcrMBB);
+
+#ifdef XDEBUG
+  MF.verify();
+#endif
+}
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 16b477ae3cd2..0cb8e5af5402 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -62,6 +62,8 @@ class ARMFrameLowering : public TargetFrameLowering {
                                             RegScavenger *RS) const;
 
   void adjustForSegmentedStacks(MachineFunction &MF) const;
+  void adjustForSegmentedStacksThumb(MachineFunction &MF) const;
+  void adjustForSegmentedStacksARM(MachineFunction &MF) const;
 
 private:
   void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
diff --git a/test/CodeGen/Thumb/segmented-stacks-dynamic.ll b/test/CodeGen/Thumb/segmented-stacks-dynamic.ll
new file mode 100644
index 000000000000..bc698b50b8d7
--- /dev/null
+++ b/test/CodeGen/Thumb/segmented-stacks-dynamic.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mcpu=generic -mtriple=thumb-linux-android -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-Linux-Android
+; RUN: llc < %s -mcpu=generic -mtriple=thumb-linux-android -segmented-stacks -filetype=obj
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
+
+define i32 @test_basic(i32 %l) {
+  %mem = alloca i32, i32 %l
+  call void @dummy_use (i32* %mem, i32 %l)
+  %terminate = icmp eq i32 %l, 0
+  br i1 %terminate, label %true, label %false
+
+true:
+  ret i32 0
+
+false:
+  %newlen = sub i32 %l, 1
+  %retvalue = call i32 @test_basic(i32 %newlen)
+  ret i32 %retvalue
+
+; Thumb-Linux-Android: test_basic:
+
+; Thumb-Linux-Android: push {r4, r5}
+; Thumb-Linux-Android-NEXT: mov r5, sp
+; Thumb-Linux-Android-NEXT: ldr r4, .LCPI0_0
+; Thumb-Linux-Android-NEXT: ldr r4, [r4]
+; Thumb-Linux-Android-NEXT: cmp r4, r5
+; Thumb-Linux-Android-NEXT: blo .LBB0_2
+
+; Thumb-Linux-Android: mov r4, #16
+; Thumb-Linux-Android-NEXT: mov r5, #0
+; Thumb-Linux-Android-NEXT: push {lr}
+; Thumb-Linux-Android-NEXT: bl __morestack
+; Thumb-Linux-Android-NEXT: pop {r4}
+; Thumb-Linux-Android-NEXT: mov lr, r4
+; Thumb-Linux-Android-NEXT: pop {r4, r5}
+; Thumb-Linux-Android-NEXT: mov pc, lr
+
+; Thumb-Linux-Android: pop {r4, r5}
+
+}
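+
+; r5, the second __morestack argument (the size of the incoming stack
+; arguments), is 0 above because test_basic receives all of its
+; arguments in registers.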
diff --git a/test/CodeGen/Thumb/segmented-stacks.ll b/test/CodeGen/Thumb/segmented-stacks.ll
new file mode 100644
index 000000000000..1c6250f7036d
--- /dev/null
+++ b/test/CodeGen/Thumb/segmented-stacks.ll
@@ -0,0 +1,149 @@
+; RUN: llc < %s -mcpu=generic -mtriple=thumb-linux-android -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-Linux-Android
+
+; We used to crash with filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=thumb-linux-android -segmented-stacks -filetype=obj
+
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
+
+define void @test_basic() {
+  %mem = alloca i32, i32 10
+  call void @dummy_use (i32* %mem, i32 10)
+  ret void
+
+; Thumb-Linux-Android: test_basic:
+
+; Thumb-Linux-Android: push {r4, r5}
+; Thumb-Linux-Android-NEXT: mov r5, sp
+; Thumb-Linux-Android-NEXT: ldr r4, .LCPI0_0
+; Thumb-Linux-Android-NEXT: ldr r4, [r4]
+; Thumb-Linux-Android-NEXT: cmp r4, r5
+; Thumb-Linux-Android-NEXT: blo .LBB0_2
+
+; Thumb-Linux-Android: mov r4, #48
+; Thumb-Linux-Android-NEXT: mov r5, #0
+; Thumb-Linux-Android-NEXT: push {lr}
+; Thumb-Linux-Android-NEXT: bl __morestack
+; Thumb-Linux-Android-NEXT: pop {r4}
+; Thumb-Linux-Android-NEXT: mov lr, r4
+; Thumb-Linux-Android-NEXT: pop {r4, r5}
+; Thumb-Linux-Android-NEXT: mov pc, lr
+
+; Thumb-Linux-Android: pop {r4, r5}
+
+}
+
+define i32 @test_nested(i32 * nest %closure, i32 %other) {
+  %addend = load i32 * %closure
+  %result = add i32 %other, %addend
+  ret i32 %result
+
+; Thumb-Linux-Android: test_nested:
+
+; Thumb-Linux-Android: push {r4, r5}
+; Thumb-Linux-Android-NEXT: mov r5, sp
+; Thumb-Linux-Android-NEXT: ldr r4, .LCPI1_0
+; Thumb-Linux-Android-NEXT: ldr r4, [r4]
+; Thumb-Linux-Android-NEXT: cmp r4, r5
+; Thumb-Linux-Android-NEXT: blo .LBB1_2
+
+; Thumb-Linux-Android: mov r4, #0
+; Thumb-Linux-Android-NEXT: mov r5, #0
+; Thumb-Linux-Android-NEXT: push {lr}
+; Thumb-Linux-Android-NEXT: bl __morestack
+; Thumb-Linux-Android-NEXT: pop {r4}
+; Thumb-Linux-Android-NEXT: mov lr, r4
+; Thumb-Linux-Android-NEXT: pop {r4, r5}
+; Thumb-Linux-Android-NEXT: mov pc, lr
+
+; Thumb-Linux-Android: pop {r4, r5}
+
+}
+
+define void @test_large() {
+  %mem = alloca i32, i32 10000
+  call void @dummy_use (i32* %mem, i32 0)
+  ret void
+
+; Thumb-Linux-Android: test_large:
+
+; Thumb-Linux-Android: push {r4, r5}
+; Thumb-Linux-Android-NEXT: mov r5, sp
+; Thumb-Linux-Android-NEXT: sub r5, #40192
+; Thumb-Linux-Android-NEXT: ldr r4, .LCPI2_2
+; Thumb-Linux-Android-NEXT: ldr r4, [r4]
+; Thumb-Linux-Android-NEXT: cmp r4, r5
+; Thumb-Linux-Android-NEXT: blo .LBB2_2
+
+; Thumb-Linux-Android: mov r4, #40192
+; Thumb-Linux-Android-NEXT: mov r5, #0
+; Thumb-Linux-Android-NEXT: push {lr}
+; Thumb-Linux-Android-NEXT: bl __morestack
+; Thumb-Linux-Android-NEXT: pop {r4}
+; Thumb-Linux-Android-NEXT: mov lr, r4
+; Thumb-Linux-Android-NEXT: pop {r4, r5}
+; Thumb-Linux-Android-NEXT: mov pc, lr
+
+; Thumb-Linux-Android: pop {r4, r5}
+
+}
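+
+; The extra "sub r5, #40192" above appears only for large frames: once the
+; frame is at least kSplitStackAvailable (256) bytes, the stack limit is
+; compared against sp minus the frame size rather than against sp itself.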
+
+define fastcc void @test_fastcc() {
+  %mem = alloca i32, i32 10
+  call void @dummy_use (i32* %mem, i32 10)
+  ret void
+
+; Thumb-Linux-Android: test_fastcc:
+
+; Thumb-Linux-Android: push {r4, r5}
+; Thumb-Linux-Android-NEXT: mov r5, sp
+; Thumb-Linux-Android-NEXT: ldr r4, .LCPI3_0
+; Thumb-Linux-Android-NEXT: ldr r4, [r4]
+; Thumb-Linux-Android-NEXT: cmp r4, r5
+; Thumb-Linux-Android-NEXT: blo .LBB3_2
+
+; Thumb-Linux-Android: mov r4, #48
+; Thumb-Linux-Android-NEXT: mov r5, #0
+; Thumb-Linux-Android-NEXT: push {lr}
+; Thumb-Linux-Android-NEXT: bl __morestack
+; Thumb-Linux-Android-NEXT: pop {r4}
+; Thumb-Linux-Android-NEXT: mov lr, r4
+; Thumb-Linux-Android-NEXT: pop {r4, r5}
+; Thumb-Linux-Android-NEXT: mov pc, lr
+
+; Thumb-Linux-Android: pop {r4, r5}
+
+}
+
+define fastcc void @test_fastcc_large() {
+  %mem = alloca i32, i32 10000
+  call void @dummy_use (i32* %mem, i32 0)
+  ret void
+
+; Thumb-Linux-Android: test_fastcc_large:
+
+; Thumb-Linux-Android: push {r4, r5}
+; Thumb-Linux-Android-NEXT: mov r5, sp
+; Thumb-Linux-Android-NEXT: sub r5, #40192
+; Thumb-Linux-Android-NEXT: ldr r4, .LCPI4_2
+; Thumb-Linux-Android-NEXT: ldr r4, [r4]
+; Thumb-Linux-Android-NEXT: cmp r4, r5
+; Thumb-Linux-Android-NEXT: blo .LBB4_2
+
+; Thumb-Linux-Android: mov r4, #40192
+; Thumb-Linux-Android-NEXT: mov r5, #0
+; Thumb-Linux-Android-NEXT: push {lr}
+; Thumb-Linux-Android-NEXT: bl __morestack
+; Thumb-Linux-Android-NEXT: pop {r4}
+; Thumb-Linux-Android-NEXT: mov lr, r4
+; Thumb-Linux-Android-NEXT: pop {r4, r5}
+; Thumb-Linux-Android-NEXT: mov pc, lr
+
+; Thumb-Linux-Android: pop {r4, r5}
+
+}