Skip to content

Commit

Permalink
Cranelift AArch64: Improve the handling of callee-saved registers
Browse files Browse the repository at this point in the history
SIMD & FP registers are now saved and restored in pairs, similarly
to general-purpose registers. Also, only the bottom 64 bits of the
registers are saved and restored (in case of non-Baldrdash ABIs),
which is the requirement from the Procedure Call Standard for the
Arm 64-bit Architecture.

As for the callee-saved general-purpose registers, if a procedure
needs to save and restore an odd number of them, it no longer uses
store and load pair instructions for the last register.

Copyright (c) 2021, Arm Limited.
  • Loading branch information
akirilov-arm committed Apr 13, 2021
1 parent 2a32567 commit 1c617dd
Show file tree
Hide file tree
Showing 6 changed files with 747 additions and 74 deletions.
300 changes: 247 additions & 53 deletions cranelift/codegen/src/isa/aarch64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,28 @@ impl Into<AMode> for StackAMode {
// Returns the size of stack space needed to store the
// `int_reg` and `vec_reg`.
fn saved_reg_stack_size(
call_conv: isa::CallConv,
int_reg: &[Writable<RealReg>],
vec_reg: &[Writable<RealReg>],
) -> (usize, usize) {
// Round up to multiple of 2, to keep 16-byte stack alignment.
let int_save_bytes = (int_reg.len() + (int_reg.len() & 1)) * 8;
let vec_save_bytes = vec_reg.len() * 16;
// The Baldrdash ABIs require saving and restoring the whole 16-byte
// SIMD & FP registers, so the necessary stack space is always a
// multiple of the mandatory 16-byte stack alignment. However, the
// Procedure Call Standard for the Arm 64-bit Architecture (AAPCS64,
// including several related ABIs such as the one used by Windows)
// mandates saving only the bottom 8 bytes of the vector registers,
// so in that case we round up the number of registers to ensure proper
// stack alignment (similarly to the situation with `int_reg`).
let vec_reg_size = if call_conv.extends_baldrdash() { 16 } else { 8 };
let vec_save_padding = if call_conv.extends_baldrdash() {
0
} else {
vec_reg.len() & 1
};
let vec_save_bytes = (vec_reg.len() + vec_save_padding) * vec_reg_size;

(int_save_bytes, vec_save_bytes)
}

Expand Down Expand Up @@ -591,7 +607,8 @@ impl ABIMachineSpec for AArch64MachineDeps {
let mut insts = SmallVec::new();
let (clobbered_int, clobbered_vec) = get_regs_saved_in_prologue(call_conv, clobbers);

let (int_save_bytes, vec_save_bytes) = saved_reg_stack_size(&clobbered_int, &clobbered_vec);
let (int_save_bytes, vec_save_bytes) =
saved_reg_stack_size(call_conv, &clobbered_int, &clobbered_vec);
let total_save_bytes = int_save_bytes + vec_save_bytes;
let clobber_size = total_save_bytes as i32;

Expand Down Expand Up @@ -620,59 +637,170 @@ impl ABIMachineSpec for AArch64MachineDeps {
// `frame_offset` tracks offset above start-of-clobbers for unwind-info
// purposes.
let mut clobber_offset = clobber_size as u32;
for reg_pair in clobbered_int.chunks(2) {
let (r1, r2) = if reg_pair.len() == 2 {
// .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg
(reg_pair[0].to_reg().to_reg(), reg_pair[1].to_reg().to_reg())
} else {
(reg_pair[0].to_reg().to_reg(), zero_reg())
};
let clobber_offset_change = 16;
let iter = clobbered_int.chunks_exact(2);

if let [rd] = iter.remainder() {
let rd = rd.to_reg().to_reg();

debug_assert_eq!(rd.get_class(), RegClass::I64);
// str rd, [sp, #-16]!
insts.push(Inst::Store64 {
rd,
mem: AMode::PreIndexed(
writable_stack_reg(),
SImm9::maybe_from_i64(-clobber_offset_change).unwrap(),
),
flags: MemFlags::trusted(),
});

if flags.unwind_info() {
clobber_offset -= clobber_offset_change as u32;
insts.push(Inst::Unwind {
inst: UnwindInst::SaveReg {
clobber_offset,
reg: rd.to_real_reg(),
},
});
}
}

let mut iter = iter.rev();

debug_assert!(r1.get_class() == RegClass::I64);
debug_assert!(r2.get_class() == RegClass::I64);
while let Some([rt, rt2]) = iter.next() {
// .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg
let rt = rt.to_reg().to_reg();
let rt2 = rt2.to_reg().to_reg();

// stp r1, r2, [sp, #-16]!
debug_assert!(rt.get_class() == RegClass::I64);
debug_assert!(rt2.get_class() == RegClass::I64);

// stp rt, rt2, [sp, #-16]!
insts.push(Inst::StoreP64 {
rt: r1,
rt2: r2,
rt,
rt2,
mem: PairAMode::PreIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(),
SImm7Scaled::maybe_from_i64(-clobber_offset_change, types::I64).unwrap(),
),
flags: MemFlags::trusted(),
});

if flags.unwind_info() {
clobber_offset -= 8;
if r2 != zero_reg() {
insts.push(Inst::Unwind {
inst: UnwindInst::SaveReg {
clobber_offset,
reg: r2.to_real_reg(),
},
});
clobber_offset -= clobber_offset_change as u32;
insts.push(Inst::Unwind {
inst: UnwindInst::SaveReg {
clobber_offset,
reg: rt.to_real_reg(),
},
});
insts.push(Inst::Unwind {
inst: UnwindInst::SaveReg {
clobber_offset: clobber_offset + (clobber_offset_change / 2) as u32,
reg: rt2.to_real_reg(),
},
});
}
}

let store_vec_reg = |rd| {
if call_conv.extends_baldrdash() {
Inst::FpuStore128 {
rd,
mem: AMode::PreIndexed(
writable_stack_reg(),
SImm9::maybe_from_i64(-clobber_offset_change).unwrap(),
),
flags: MemFlags::trusted(),
}
} else {
Inst::FpuStore64 {
rd,
mem: AMode::PreIndexed(
writable_stack_reg(),
SImm9::maybe_from_i64(-clobber_offset_change).unwrap(),
),
flags: MemFlags::trusted(),
}
clobber_offset -= 8;
}
};
let iter = clobbered_vec.chunks_exact(2);

if let [rd] = iter.remainder() {
let rd = rd.to_reg().to_reg();

debug_assert_eq!(rd.get_class(), RegClass::V128);
insts.push(store_vec_reg(rd));

if flags.unwind_info() {
clobber_offset -= clobber_offset_change as u32;
insts.push(Inst::Unwind {
inst: UnwindInst::SaveReg {
clobber_offset,
reg: r1.to_real_reg(),
reg: rd.to_real_reg(),
},
});
}
}

for reg in clobbered_vec.iter() {
insts.push(Inst::FpuStore128 {
rd: reg.to_reg().to_reg(),
mem: AMode::PreIndexed(writable_stack_reg(), SImm9::maybe_from_i64(-16).unwrap()),
flags: MemFlags::trusted(),
});
let store_vec_reg_pair = |rt, rt2| {
if call_conv.extends_baldrdash() {
let clobber_offset_change = 32;

(
Inst::FpuStoreP128 {
rt,
rt2,
mem: PairAMode::PreIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(-clobber_offset_change, I8X16).unwrap(),
),
flags: MemFlags::trusted(),
},
clobber_offset_change as u32,
)
} else {
let clobber_offset_change = 16;

(
Inst::FpuStoreP64 {
rt,
rt2,
mem: PairAMode::PreIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(-clobber_offset_change, F64).unwrap(),
),
flags: MemFlags::trusted(),
},
clobber_offset_change as u32,
)
}
};
let mut iter = iter.rev();

while let Some([rt, rt2]) = iter.next() {
let rt = rt.to_reg().to_reg();
let rt2 = rt2.to_reg().to_reg();

debug_assert_eq!(rt.get_class(), RegClass::V128);
debug_assert_eq!(rt2.get_class(), RegClass::V128);

let (inst, clobber_offset_change) = store_vec_reg_pair(rt, rt2);

insts.push(inst);

if flags.unwind_info() {
clobber_offset -= 16;
clobber_offset -= clobber_offset_change;
insts.push(Inst::Unwind {
inst: UnwindInst::SaveReg {
clobber_offset,
reg: reg.to_reg(),
reg: rt.to_real_reg(),
},
});
insts.push(Inst::Unwind {
inst: UnwindInst::SaveReg {
clobber_offset: clobber_offset + clobber_offset_change / 2,
reg: rt2.to_real_reg(),
},
});
}
Expand Down Expand Up @@ -700,31 +828,83 @@ impl ABIMachineSpec for AArch64MachineDeps {
insts.extend(Self::gen_sp_reg_adjust(fixed_frame_storage_size as i32));
}

for reg in clobbered_vec.iter().rev() {
insts.push(Inst::FpuLoad128 {
rd: Writable::from_reg(reg.to_reg().to_reg()),
mem: AMode::PostIndexed(writable_stack_reg(), SImm9::maybe_from_i64(16).unwrap()),
flags: MemFlags::trusted(),
});
let load_vec_reg = |rd| {
if call_conv.extends_baldrdash() {
Inst::FpuLoad128 {
rd,
mem: AMode::PostIndexed(
writable_stack_reg(),
SImm9::maybe_from_i64(16).unwrap(),
),
flags: MemFlags::trusted(),
}
} else {
Inst::FpuLoad64 {
rd,
mem: AMode::PostIndexed(
writable_stack_reg(),
SImm9::maybe_from_i64(16).unwrap(),
),
flags: MemFlags::trusted(),
}
}
};
let load_vec_reg_pair = |rt, rt2| {
if call_conv.extends_baldrdash() {
Inst::FpuLoadP128 {
rt,
rt2,
mem: PairAMode::PostIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(32, I8X16).unwrap(),
),
flags: MemFlags::trusted(),
}
} else {
Inst::FpuLoadP64 {
rt,
rt2,
mem: PairAMode::PostIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(16, F64).unwrap(),
),
flags: MemFlags::trusted(),
}
}
};

let mut iter = clobbered_vec.chunks_exact(2);

while let Some([rt, rt2]) = iter.next() {
let rt = rt.map(|r| r.to_reg());
let rt2 = rt2.map(|r| r.to_reg());

debug_assert_eq!(rt.to_reg().get_class(), RegClass::V128);
debug_assert_eq!(rt2.to_reg().get_class(), RegClass::V128);
insts.push(load_vec_reg_pair(rt, rt2));
}

for reg_pair in clobbered_int.chunks(2).rev() {
let (r1, r2) = if reg_pair.len() == 2 {
(
reg_pair[0].map(|r| r.to_reg()),
reg_pair[1].map(|r| r.to_reg()),
)
} else {
(reg_pair[0].map(|r| r.to_reg()), writable_zero_reg())
};
debug_assert!(iter.remainder().len() <= 1);

if let [rd] = iter.remainder() {
let rd = rd.map(|r| r.to_reg());

debug_assert_eq!(rd.to_reg().get_class(), RegClass::V128);
insts.push(load_vec_reg(rd));
}

let mut iter = clobbered_int.chunks_exact(2);

debug_assert!(r1.to_reg().get_class() == RegClass::I64);
debug_assert!(r2.to_reg().get_class() == RegClass::I64);
while let Some([rt, rt2]) = iter.next() {
let rt = rt.map(|r| r.to_reg());
let rt2 = rt2.map(|r| r.to_reg());

// ldp r1, r2, [sp], #16
debug_assert_eq!(rt.to_reg().get_class(), RegClass::I64);
debug_assert_eq!(rt2.to_reg().get_class(), RegClass::I64);
// ldp rt, rt2, [sp], #16
insts.push(Inst::LoadP64 {
rt: r1,
rt2: r2,
rt,
rt2,
mem: PairAMode::PostIndexed(
writable_stack_reg(),
SImm7Scaled::maybe_from_i64(16, I64).unwrap(),
Expand All @@ -733,6 +913,20 @@ impl ABIMachineSpec for AArch64MachineDeps {
});
}

debug_assert!(iter.remainder().len() <= 1);

if let [rd] = iter.remainder() {
let rd = rd.map(|r| r.to_reg());

debug_assert_eq!(rd.to_reg().get_class(), RegClass::I64);
// ldr rd, [sp], #16
insts.push(Inst::ULoad64 {
rd,
mem: AMode::PostIndexed(writable_stack_reg(), SImm9::maybe_from_i64(16).unwrap()),
flags: MemFlags::trusted(),
});
}

// If this is Baldrdash-2020, restore the callee (i.e., our) TLS
// register. We may have allocated it for something else and clobbered
// it, but the ABI expects us to leave the TLS register unchanged.
Expand Down
Loading

0 comments on commit 1c617dd

Please sign in to comment.