suboptimal codegen for llvm.x86.addcarryx.u32 and u64 #40891

Open
gnzlbg mannequin opened this issue Apr 20, 2019 · 4 comments
Labels
backend:X86, bugzilla (Issues migrated from bugzilla)

Comments

gnzlbg mannequin commented Apr 20, 2019

Bugzilla Link: 41546
Version: trunk
OS: All
CC: @topperc, @RKSimon, @rotateright

Extended Description

Multi-precision (MP) multiplication is documented by Intel as one of the main use cases of the addcarryx intrinsics (https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-large-integer-arithmetic-paper.pdf).

We implement this in Rust as:

use core::arch::x86_64::{_addcarryx_u64, _mulx_u64};
use core::hint::unreachable_unchecked;

#[target_feature(enable = "adx,bmi2")]
pub unsafe fn mp_mul_intrin(a: &[u64], b: &[u64], c: &mut [u64]) {
    let len = a.len();
    if len != b.len() {
        unreachable_unchecked();
    }
    if c.len() != len * 2 {
        unreachable_unchecked();
    }

    for b_i in (0..len).into_iter().rev() {
        let b_elem = *b.get_unchecked(b_i);
        let mut carry_lo = 0;
        let mut carry_hi = 0;
        for a_i in (0..len).into_iter().rev() {
            let mut hi = 0;
            let lo = _mulx_u64(*a.get_unchecked(a_i), b_elem, &mut hi);
            carry_lo = _addcarryx_u64(
                carry_lo,
                lo,
                *c.get_unchecked(b_i + a_i + 1),
                c.get_unchecked_mut(b_i + a_i + 1),
            );
            carry_hi = _addcarryx_u64(
                carry_hi,
                hi,
                *c.get_unchecked(b_i + a_i),
                c.get_unchecked_mut(b_i + a_i),
            );
        }
        *c.get_unchecked_mut(b_i) += carry_lo as u64;
    }
}

which produces the following LLVM IR after optimizations (https://rust.godbolt.org/z/EJFEHB):

source_filename = "example.3a1fbbbh-cgu.0"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_ZN7example13mp_mul_intrin17ha949a97694eafb34E([0 x i64]* noalias nocapture nonnull readonly align 8 %a.0, i64 %a.1, [0 x i64]* noalias nocapture nonnull readonly align 8 %b.0, i64 %b.1, [0 x i64]* nocapture nonnull align 8 %c.0, i64 %c.1) unnamed_addr #0 personality i32 (...)* @rust_eh_personality {
start:
  %0 = icmp eq i64 %a.1, 0
  br i1 %0, label %bb15, label %bb14

bb14: ; preds = %start, %bb23
  %iter.sroa.4.040 = phi i64 [ %1, %bb23 ], [ %a.1, %start ]
  %1 = add i64 %iter.sroa.4.040, -1
  %2 = getelementptr inbounds [0 x i64], [0 x i64]* %b.0, i64 0, i64 %1
  %3 = load i64, i64* %2, align 8
  %4 = zext i64 %3 to i128
  br label %bb22

bb15: ; preds = %bb23, %start
  ret void

bb22: ; preds = %bb14, %bb22
  %carry_hi.039 = phi i8 [ 0, %bb14 ], [ %24, %bb22 ]
  %carry_lo.038 = phi i8 [ 0, %bb14 ], [ %19, %bb22 ]
  %iter2.sroa.4.037 = phi i64 [ %a.1, %bb14 ], [ %5, %bb22 ]
  %5 = add i64 %iter2.sroa.4.037, -1
  %6 = getelementptr inbounds [0 x i64], [0 x i64]* %a.0, i64 0, i64 %5
  %7 = load i64, i64* %6, align 8
  %8 = zext i64 %7 to i128
  %9 = mul nuw i128 %8, %4
  %10 = lshr i128 %9, 64
  %11 = trunc i128 %10 to i64
  %12 = trunc i128 %9 to i64
  %13 = add i64 %5, %1
  %14 = add i64 %iter2.sroa.4.037, %1
  %15 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %14
  %16 = load i64, i64* %15, align 8
  %17 = tail call { i8, i64 } @llvm.x86.addcarry.64(i8 %carry_lo.038, i64 %12, i64 %16) #3
  %18 = extractvalue { i8, i64 } %17, 1
  store i64 %18, i64* %15, align 8
  %19 = extractvalue { i8, i64 } %17, 0
  %20 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %13
  %21 = load i64, i64* %20, align 8
  %22 = tail call { i8, i64 } @llvm.x86.addcarry.64(i8 %carry_hi.039, i64 %11, i64 %21) #3
  %23 = extractvalue { i8, i64 } %22, 1
  store i64 %23, i64* %20, align 8
  %24 = extractvalue { i8, i64 } %22, 0
  %25 = icmp eq i64 %5, 0
  br i1 %25, label %bb23, label %bb22

bb23: ; preds = %bb22
  %26 = zext i8 %19 to i64
  %27 = getelementptr inbounds [0 x i64], [0 x i64]* %c.0, i64 0, i64 %1
  %28 = load i64, i64* %27, align 8
  %29 = add i64 %28, %26
  store i64 %29, i64* %27, align 8
  %30 = icmp eq i64 %1, 0
  br i1 %30, label %bb15, label %bb14
}

declare i32 @rust_eh_personality(...) unnamed_addr #1

declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64) #2

attributes #0 = { nounwind nonlazybind "probe-stack"="__rust_probestack" "target-cpu"="x86-64" "target-features"="+adx,+bmi2" }
attributes #1 = { nonlazybind "target-cpu"="x86-64" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"RtLibUseGOT", i32 1}

which gets compiled down to the following machine code:

example::mp_mul_intrin:
  pushq %r15
  pushq %r14
  pushq %r12
  pushq %rbx
  testq %rsi, %rsi
  je .LBB0_5
  movq %rdx, %r9
  movq %rsi, %r10
  negq %r10
  movq %rsi, %rax
  shlq $4, %rax
  leaq (%r8,%rax), %r12
  addq $-8, %r12
  leaq (%rdi,%rsi,8), %r11
  addq $-8, %r11
.LBB0_2:
  leaq -1(%rsi), %r14
  movq -8(%r9,%rsi,8), %r15
  xorl %ebx, %ebx
  xorl %ecx, %ecx
  xorl %edi, %edi
.LBB0_3:
  movq %r15, %rax
  mulq (%r11,%rbx,8)
  addb $-1, %dil
  adcq %rax, (%r12,%rbx,8)
  setb %dil
  addb $-1, %cl
  adcq %rdx, -8(%r12,%rbx,8)
  setb %cl
  addq $-1, %rbx
  cmpq %rbx, %r10
  jne .LBB0_3
  movzbl %dil, %eax
  addq %rax, -8(%r8,%rsi,8)
  addq $-8, %r12
  movq %r14, %rsi
  testq %r14, %r14
  jne .LBB0_2
.LBB0_5:
  popq %rbx
  popq %r12
  popq %r14
  popq %r15
  retq

Implementing the same operation with inline assembly that produces the expected machine code:

#[inline(never)]
#[no_mangle]
pub fn mp_mul_asm(a: &[u64], b: &[u64]) -> Vec<u64> {
    unsafe {
        let len = a.len();
        assert_eq!(len, b.len());
        let mut c = vec![0; len * 2];
        
        asm!("
    # start iteration of `b_i`: `len`-1 downto 0
    lea -1($3), %rsi
1:  # every iteration of `b_i`
        # load `b_elem`
        mov ($1, %rsi, 8), %rdx
        # clear `carry_lo`, `carry_hi`
        xor %r9, %r9
        # start iteration of `a_i`+1: `len` downto 1
        mov $3, %rcx
        jmp 3f
2:          # the end of every iteration of `a_i`+1, except the last iteration
            # no displacement, RCX has already been decremented
            mov %r10, (%r11, %rcx, 8)
3:      # every iteration of `a_i`+1
            # compute c + b_i
            lea ($2, %rsi, 8), %r11
            # compute a[a_i]*b_elem
            mulx -8($0, %rcx, 8), %r8, %r9
            # add low word
            mov (%r11, %rcx, 8), %r10
            adcx %r8, %r10
            mov %r10, (%r11, %rcx, 8)
            # add high word
            mov -8(%r11, %rcx, 8), %r10
            adox %r9, %r10
            # this is done later to be able to add in last carry in last iteration of `a_i`+1
            # mov %r10, -8(%r11, %rcx, 8)
            # advance `a_i`+1
            lea -1(%rcx), %rcx
            jrcxz 4f
            jmp 2b
4:          # the end of the last iteration of `a_i`+1
            adc $$0, %r10
            mov %r10, (%r11)
        # advance `b_i`
        dec %rsi
        jns 1b
"
            :
            : "r"(a.as_ptr()), "r"(b.as_ptr()), "r"(c.as_mut_ptr()), "r"(len)
            : "rsi", "rcx", "rdx", "r8", "r9", "r10", "r11", "flags"
        );

        c
    }
}

and benchmarking both (rust-lang/stdarch#666 (comment)) shows a significant performance difference: 390 ns/iteration for the inline-assembly version vs. 507 ns/iteration for the version using llvm.x86.addcarryx.u64.

It appears that LLVM always replaces llvm.x86.addcarryx.u64 with a polyfill based on llvm.x86.addcarry.u64 and then fails to emit adcx instructions.
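
As an illustration (not part of the original report, function name is mine), even the smallest wrapper shows this: with the lowering described here, the single call below comes out as a plain adc rather than adcx:

use core::arch::x86_64::_addcarryx_u64;

// Hypothetical minimal reproducer: a single addcarryx call. With the
// lowering described above it goes through llvm.x86.addcarry.64 and is
// emitted as a plain ADC rather than ADCX.
#[target_feature(enable = "adx")]
pub unsafe fn addcarryx_once(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 {
    _addcarryx_u64(c_in, a, b, out)
}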

topperc commented Apr 20, 2019

The X86 backend isn't currently set up to model the C flag and O flag separately. We model all of the flags as one register. Because of this we can't interleave the flag dependencies. We would need to do something about that before it makes sense to implement _addcarryx_u64 as anything other than plain adc.
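
To make "interleave the flag dependencies" concrete, here is a minimal sketch (an illustration, not code from this issue): two independent carry chains that an ADX-aware lowering could keep in CF (adcx) and OF (adox) at the same time, but which today are serialized through plain adc plus setb, as in the machine code above:

use core::arch::x86_64::_addcarryx_u64;

// Illustration only: two carry chains that never feed into each other.
// With CF and OF modeled separately, chain 1 could live in CF (adcx) and
// chain 2 in OF (adox); with a single flags register both are rematerialized
// through add/adc/setb as in the loop body shown earlier.
#[target_feature(enable = "adx")]
pub unsafe fn two_chains(a: &[u64; 4], b: &[u64; 4], lo: &mut [u64; 4], hi: &mut [u64; 4]) -> (u8, u8) {
    let (mut c0, mut c1) = (0u8, 0u8);
    for i in 0..4 {
        c0 = _addcarryx_u64(c0, a[i], lo[i], &mut lo[i]); // chain 1
        c1 = _addcarryx_u64(c1, b[i], hi[i], &mut hi[i]); // chain 2
    }
    (c0, c1)
}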

gnzlbg mannequin commented Apr 22, 2019

Hmm. Indeed. LLVM should also emit a proper mulx here for adcx to make sense.

topperc commented Apr 22, 2019

Even that's not enough. We need to use jrcxz for the loop control and an lea for the index adjustment. And we need to keep flags alive across basic block boundaries which I don't think we usually do.

llvmbot transferred this issue from llvm/llvm-bugzilla-archive on Dec 10, 2021

chfast commented Feb 12, 2024

Is it possible to have custom legalization of e.g. mul i256 that uses mulx, adcx and adox?
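
For concreteness, a sketch (not from this thread) of the computation mul i256 denotes, spelled out with 64-bit limbs and u128 partial products; a custom legalization would have to lower the equivalent chain of 64x64->128 multiplies and carried adds, ideally into mulx/adcx/adox:

// Illustrative sketch: the truncating 256-bit multiply that `mul i256`
// denotes, written with 64-bit limbs and u128 partial products.
pub fn mul_i256_limbs(a: [u64; 4], b: [u64; 4]) -> [u64; 4] {
    let mut r = [0u64; 4];
    for i in 0..4 {
        let mut carry: u128 = 0;
        for j in 0..4 - i {
            // partial product + limb already in place + running carry;
            // the sum never overflows u128
            let t = a[i] as u128 * b[j] as u128 + r[i + j] as u128 + carry;
            r[i + j] = t as u64;
            carry = t >> 64;
        }
        // carries past limb 3 are discarded, matching i256 truncation
    }
    r
}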
